@@ -3079,253 +3079,258 @@ int tcg_can_emit_vec_op(TCGOpcode opc, TCGType type, unsigned vece)
    }
}

void tcg_expand_vec_op(TCGOpcode opc, TCGType type, unsigned vece,
                       TCGArg a0, ...)
{
    va_list va;
    TCGArg a1, a2;
    TCGv_vec v0, t1, t2, t3, t4;
static void expand_vec_shi(TCGType type, unsigned vece, bool shr,
                           TCGv_vec v0, TCGv_vec v1, TCGArg imm)
{
    TCGv_vec t1, t2;

    tcg_debug_assert(vece == MO_8);

    t1 = tcg_temp_new_vec(type);
    t2 = tcg_temp_new_vec(type);

    /* Unpack to W, shift, and repack. Tricky bits:
       (1) Use punpck*bw x,x to produce DDCCBBAA,
           i.e. duplicate in other half of the 16-bit lane.
       (2) For right-shift, add 8 so that the high half of
           the lane becomes zero. For left-shift, we must
           shift up and down again.
       (3) Step 2 leaves high half zero such that PACKUSWB
           (pack with unsigned saturation) does not modify
           the quantity. */
    vec_gen_3(INDEX_op_x86_punpckl_vec, type, MO_8,
              tcgv_vec_arg(t1), tcgv_vec_arg(v1), tcgv_vec_arg(v1));
    vec_gen_3(INDEX_op_x86_punpckh_vec, type, MO_8,
              tcgv_vec_arg(t2), tcgv_vec_arg(v1), tcgv_vec_arg(v1));

    if (shr) {
        tcg_gen_shri_vec(MO_16, t1, t1, imm + 8);
        tcg_gen_shri_vec(MO_16, t2, t2, imm + 8);
    } else {
        tcg_gen_shli_vec(MO_16, t1, t1, imm + 8);
        tcg_gen_shli_vec(MO_16, t2, t2, imm + 8);
        tcg_gen_shri_vec(MO_16, t1, t1, 8);
        tcg_gen_shri_vec(MO_16, t2, t2, 8);
    }

    va_start(va, a0);
    v0 = temp_tcgv_vec(arg_temp(a0));
    vec_gen_3(INDEX_op_x86_packus_vec, type, MO_8,
              tcgv_vec_arg(v0), tcgv_vec_arg(t1), tcgv_vec_arg(t2));
    tcg_temp_free_vec(t1);
    tcg_temp_free_vec(t2);
}
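
Illustration (not part of the patch): the unpack/shift/repack trick that expand_vec_shi relies on can be checked with a scalar model of one 16-bit lane holding the byte duplicated in both halves, which is what punpck*bw x,x produces. The helper names below are invented for the example.

#include <assert.h>
#include <stdint.h>

/* One lane after punpcklbw x,x: the byte b occupies both halves. */
static uint8_t model_shr8(uint8_t b, unsigned imm)
{
    uint16_t lane = (uint16_t)b << 8 | b;
    lane >>= imm + 8;           /* high half of the lane becomes zero */
    return (uint8_t)lane;       /* PACKUSWB keeps values < 256 unchanged */
}

static uint8_t model_shl8(uint8_t b, unsigned imm)
{
    uint16_t lane = (uint16_t)b << 8 | b;
    lane = (uint16_t)(lane << (imm + 8));   /* shift up ... */
    lane >>= 8;                             /* ... and back down, clearing the high half */
    return (uint8_t)lane;
}

int main(void)
{
    for (unsigned b = 0; b < 256; b++) {
        for (unsigned imm = 0; imm < 8; imm++) {
            assert(model_shr8(b, imm) == (uint8_t)(b >> imm));
            assert(model_shl8(b, imm) == (uint8_t)(b << imm));
        }
    }
    return 0;
}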

    switch (opc) {
    case INDEX_op_shli_vec:
    case INDEX_op_shri_vec:
        tcg_debug_assert(vece == MO_8);
        a1 = va_arg(va, TCGArg);
        a2 = va_arg(va, TCGArg);
        /* Unpack to W, shift, and repack. Tricky bits:
           (1) Use punpck*bw x,x to produce DDCCBBAA,
               i.e. duplicate in other half of the 16-bit lane.
           (2) For right-shift, add 8 so that the high half of
               the lane becomes zero. For left-shift, we must
               shift up and down again.
           (3) Step 2 leaves high half zero such that PACKUSWB
               (pack with unsigned saturation) does not modify
               the quantity. */
static void expand_vec_sari(TCGType type, unsigned vece,
                            TCGv_vec v0, TCGv_vec v1, TCGArg imm)
{
    TCGv_vec t1, t2;

    switch (vece) {
    case MO_8:
        /* Unpack to W, shift, and repack, as in expand_vec_shi. */
        t1 = tcg_temp_new_vec(type);
        t2 = tcg_temp_new_vec(type);
        vec_gen_3(INDEX_op_x86_punpckl_vec, type, MO_8,
                  tcgv_vec_arg(t1), a1, a1);
                  tcgv_vec_arg(t1), tcgv_vec_arg(v1), tcgv_vec_arg(v1));
        vec_gen_3(INDEX_op_x86_punpckh_vec, type, MO_8,
                  tcgv_vec_arg(t2), a1, a1);
        if (opc == INDEX_op_shri_vec) {
            vec_gen_3(INDEX_op_shri_vec, type, MO_16,
                      tcgv_vec_arg(t1), tcgv_vec_arg(t1), a2 + 8);
            vec_gen_3(INDEX_op_shri_vec, type, MO_16,
                      tcgv_vec_arg(t2), tcgv_vec_arg(t2), a2 + 8);
        } else {
            vec_gen_3(INDEX_op_shli_vec, type, MO_16,
                      tcgv_vec_arg(t1), tcgv_vec_arg(t1), a2 + 8);
            vec_gen_3(INDEX_op_shli_vec, type, MO_16,
                      tcgv_vec_arg(t2), tcgv_vec_arg(t2), a2 + 8);
            vec_gen_3(INDEX_op_shri_vec, type, MO_16,
                      tcgv_vec_arg(t1), tcgv_vec_arg(t1), 8);
            vec_gen_3(INDEX_op_shri_vec, type, MO_16,
                      tcgv_vec_arg(t2), tcgv_vec_arg(t2), 8);
        }
        vec_gen_3(INDEX_op_x86_packus_vec, type, MO_8,
                  a0, tcgv_vec_arg(t1), tcgv_vec_arg(t2));
                  tcgv_vec_arg(t2), tcgv_vec_arg(v1), tcgv_vec_arg(v1));
        tcg_gen_sari_vec(MO_16, t1, t1, imm + 8);
        tcg_gen_sari_vec(MO_16, t2, t2, imm + 8);
        vec_gen_3(INDEX_op_x86_packss_vec, type, MO_8,
                  tcgv_vec_arg(v0), tcgv_vec_arg(t1), tcgv_vec_arg(t2));
        tcg_temp_free_vec(t1);
        tcg_temp_free_vec(t2);
        break;

    case INDEX_op_sari_vec:
        a1 = va_arg(va, TCGArg);
        a2 = va_arg(va, TCGArg);
        if (vece == MO_8) {
            /* Unpack to W, shift, and repack, as above. */
            t1 = tcg_temp_new_vec(type);
            t2 = tcg_temp_new_vec(type);
            vec_gen_3(INDEX_op_x86_punpckl_vec, type, MO_8,
                      tcgv_vec_arg(t1), a1, a1);
            vec_gen_3(INDEX_op_x86_punpckh_vec, type, MO_8,
                      tcgv_vec_arg(t2), a1, a1);
            vec_gen_3(INDEX_op_sari_vec, type, MO_16,
                      tcgv_vec_arg(t1), tcgv_vec_arg(t1), a2 + 8);
            vec_gen_3(INDEX_op_sari_vec, type, MO_16,
                      tcgv_vec_arg(t2), tcgv_vec_arg(t2), a2 + 8);
            vec_gen_3(INDEX_op_x86_packss_vec, type, MO_8,
                      a0, tcgv_vec_arg(t1), tcgv_vec_arg(t2));
            tcg_temp_free_vec(t1);
            tcg_temp_free_vec(t2);
            break;
        }
        tcg_debug_assert(vece == MO_64);
        /* MO_64: If the shift is <= 32, we can emulate the sign extend by
           performing an arithmetic 32-bit shift and overwriting the high
           half of the result (note that the ISA says shift of 32 is valid). */
        if (a2 <= 32) {
    case MO_64:
        if (imm <= 32) {
            /* We can emulate a small sign extend by performing an arithmetic
             * 32-bit shift and overwriting the high half of a 64-bit logical
             * shift (note that the ISA says shift of 32 is valid).
             */
            t1 = tcg_temp_new_vec(type);
            vec_gen_3(INDEX_op_sari_vec, type, MO_32, tcgv_vec_arg(t1), a1, a2);
            vec_gen_3(INDEX_op_shri_vec, type, MO_64, a0, a1, a2);
            tcg_gen_sari_vec(MO_32, t1, v1, imm);
            tcg_gen_shri_vec(MO_64, v0, v1, imm);
            vec_gen_4(INDEX_op_x86_blend_vec, type, MO_32,
                      a0, a0, tcgv_vec_arg(t1), 0xaa);
                      tcgv_vec_arg(v0), tcgv_vec_arg(v0),
                      tcgv_vec_arg(t1), 0xaa);
            tcg_temp_free_vec(t1);
        } else {
            /* Otherwise we will need to use a compare vs 0 to produce
             * the sign-extend, shift and merge.
             */
            t1 = tcg_const_zeros_vec(type);
            tcg_gen_cmp_vec(TCG_COND_GT, MO_64, t1, t1, v1);
            tcg_gen_shri_vec(MO_64, v0, v1, imm);
            tcg_gen_shli_vec(MO_64, t1, t1, 64 - imm);
            tcg_gen_or_vec(MO_64, v0, v0, t1);
            tcg_temp_free_vec(t1);
            break;
        }
        /* Otherwise we will need to use a compare vs 0 to produce the
           sign-extend, shift and merge. */
        t1 = tcg_temp_new_vec(type);
        t2 = tcg_const_zeros_vec(type);
        vec_gen_4(INDEX_op_cmp_vec, type, MO_64,
                  tcgv_vec_arg(t1), tcgv_vec_arg(t2), a1, TCG_COND_GT);
        tcg_temp_free_vec(t2);
        vec_gen_3(INDEX_op_shri_vec, type, MO_64, a0, a1, a2);
        vec_gen_3(INDEX_op_shli_vec, type, MO_64,
                  tcgv_vec_arg(t1), tcgv_vec_arg(t1), 64 - a2);
        vec_gen_3(INDEX_op_or_vec, type, MO_64, a0, a0, tcgv_vec_arg(t1));
        tcg_temp_free_vec(t1);
        break;

    case INDEX_op_mul_vec:
        tcg_debug_assert(vece == MO_8);
        a1 = va_arg(va, TCGArg);
        a2 = va_arg(va, TCGArg);
        switch (type) {
        case TCG_TYPE_V64:
            t1 = tcg_temp_new_vec(TCG_TYPE_V128);
            t2 = tcg_temp_new_vec(TCG_TYPE_V128);
            tcg_gen_dup16i_vec(t2, 0);
            vec_gen_3(INDEX_op_x86_punpckl_vec, TCG_TYPE_V128, MO_8,
                      tcgv_vec_arg(t1), a1, tcgv_vec_arg(t2));
            vec_gen_3(INDEX_op_x86_punpckl_vec, TCG_TYPE_V128, MO_8,
                      tcgv_vec_arg(t2), tcgv_vec_arg(t2), a2);
            tcg_gen_mul_vec(MO_16, t1, t1, t2);
            tcg_gen_shri_vec(MO_16, t1, t1, 8);
            vec_gen_3(INDEX_op_x86_packus_vec, TCG_TYPE_V128, MO_8,
                      a0, tcgv_vec_arg(t1), tcgv_vec_arg(t1));
            tcg_temp_free_vec(t1);
            tcg_temp_free_vec(t2);
            break;
    default:
        g_assert_not_reached();
    }
}
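
Illustration (not part of the patch): in the MO_64 path, a shift by more than 32 is built from a compare against zero (all-ones for negative elements), a logical right shift, and an OR that re-inserts the sign bits; the imm <= 32 path instead reuses a 32-bit arithmetic shift plus a dword blend (mask 0xaa selects the odd dwords). A scalar sketch of the large-shift case, assuming the host compiler's >> on signed integers is arithmetic; the helper name is invented for the example.

#include <assert.h>
#include <stdint.h>

/* Model the imm > 32 path for one 64-bit lane, 33 <= imm <= 63. */
static int64_t model_sar64(int64_t x, unsigned imm)
{
    uint64_t sign = (0 > x) ? ~UINT64_C(0) : 0;    /* cmpgt: 0 > x */
    uint64_t lo = (uint64_t)x >> imm;              /* logical shift */
    return (int64_t)(lo | (sign << (64 - imm)));   /* merge the sign bits */
}

int main(void)
{
    int64_t samples[] = { 0, 1, -1, 123456789, -123456789, INT64_MAX, INT64_MIN };
    for (unsigned i = 0; i < sizeof(samples) / sizeof(samples[0]); i++) {
        for (unsigned imm = 33; imm < 64; imm++) {
            /* Reference uses the compiler's arithmetic >> on int64_t. */
            assert(model_sar64(samples[i], imm) == samples[i] >> imm);
        }
    }
    return 0;
}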

        case TCG_TYPE_V128:
            t1 = tcg_temp_new_vec(TCG_TYPE_V128);
            t2 = tcg_temp_new_vec(TCG_TYPE_V128);
            t3 = tcg_temp_new_vec(TCG_TYPE_V128);
            t4 = tcg_temp_new_vec(TCG_TYPE_V128);
            tcg_gen_dup16i_vec(t4, 0);
            vec_gen_3(INDEX_op_x86_punpckl_vec, TCG_TYPE_V128, MO_8,
                      tcgv_vec_arg(t1), a1, tcgv_vec_arg(t4));
            vec_gen_3(INDEX_op_x86_punpckl_vec, TCG_TYPE_V128, MO_8,
                      tcgv_vec_arg(t2), tcgv_vec_arg(t4), a2);
            vec_gen_3(INDEX_op_x86_punpckh_vec, TCG_TYPE_V128, MO_8,
                      tcgv_vec_arg(t3), a1, tcgv_vec_arg(t4));
            vec_gen_3(INDEX_op_x86_punpckh_vec, TCG_TYPE_V128, MO_8,
                      tcgv_vec_arg(t4), tcgv_vec_arg(t4), a2);
            tcg_gen_mul_vec(MO_16, t1, t1, t2);
            tcg_gen_mul_vec(MO_16, t3, t3, t4);
            tcg_gen_shri_vec(MO_16, t1, t1, 8);
            tcg_gen_shri_vec(MO_16, t3, t3, 8);
            vec_gen_3(INDEX_op_x86_packus_vec, TCG_TYPE_V128, MO_8,
                      a0, tcgv_vec_arg(t1), tcgv_vec_arg(t3));
            tcg_temp_free_vec(t1);
            tcg_temp_free_vec(t2);
            tcg_temp_free_vec(t3);
            tcg_temp_free_vec(t4);
            break;
static void expand_vec_mul(TCGType type, unsigned vece,
                           TCGv_vec v0, TCGv_vec v1, TCGv_vec v2)
{
    TCGv_vec t1, t2, t3, t4;

        case TCG_TYPE_V256:
            t1 = tcg_temp_new_vec(TCG_TYPE_V256);
            t2 = tcg_temp_new_vec(TCG_TYPE_V256);
            t3 = tcg_temp_new_vec(TCG_TYPE_V256);
            t4 = tcg_temp_new_vec(TCG_TYPE_V256);
            tcg_gen_dup16i_vec(t4, 0);
            /* a1: A[0-7] ... D[0-7]; a2: W[0-7] ... Z[0-7]
               t1: extends of B[0-7], D[0-7]
               t2: extends of X[0-7], Z[0-7]
               t3: extends of A[0-7], C[0-7]
               t4: extends of W[0-7], Y[0-7]. */
            vec_gen_3(INDEX_op_x86_punpckl_vec, TCG_TYPE_V256, MO_8,
                      tcgv_vec_arg(t1), a1, tcgv_vec_arg(t4));
            vec_gen_3(INDEX_op_x86_punpckl_vec, TCG_TYPE_V256, MO_8,
                      tcgv_vec_arg(t2), tcgv_vec_arg(t4), a2);
            vec_gen_3(INDEX_op_x86_punpckh_vec, TCG_TYPE_V256, MO_8,
                      tcgv_vec_arg(t3), a1, tcgv_vec_arg(t4));
            vec_gen_3(INDEX_op_x86_punpckh_vec, TCG_TYPE_V256, MO_8,
                      tcgv_vec_arg(t4), tcgv_vec_arg(t4), a2);
            /* t1: BX DZ; t2: AW CY. */
            tcg_gen_mul_vec(MO_16, t1, t1, t2);
            tcg_gen_mul_vec(MO_16, t3, t3, t4);
            tcg_gen_shri_vec(MO_16, t1, t1, 8);
            tcg_gen_shri_vec(MO_16, t3, t3, 8);
            /* a0: AW BX CY DZ. */
            vec_gen_3(INDEX_op_x86_packus_vec, TCG_TYPE_V256, MO_8,
                      a0, tcgv_vec_arg(t1), tcgv_vec_arg(t3));
            tcg_temp_free_vec(t1);
            tcg_temp_free_vec(t2);
            tcg_temp_free_vec(t3);
            tcg_temp_free_vec(t4);
            break;
    tcg_debug_assert(vece == MO_8);

        default:
            g_assert_not_reached();
        }
    /*
     * Unpack v1 bytes to words, 0 | x.
     * Unpack v2 bytes to words, y | 0.
     * This leaves the 8-bit result, x * y, with 8 bits of right padding.
     * Shift logical right by 8 bits to clear the high 8 bits before
     * using an unsigned saturated pack.
     *
     * The difference between the V64, V128 and V256 cases is merely how
     * we distribute the expansion between temporaries.
     */
    switch (type) {
    case TCG_TYPE_V64:
        t1 = tcg_temp_new_vec(TCG_TYPE_V128);
        t2 = tcg_temp_new_vec(TCG_TYPE_V128);
        tcg_gen_dup16i_vec(t2, 0);
        vec_gen_3(INDEX_op_x86_punpckl_vec, TCG_TYPE_V128, MO_8,
                  tcgv_vec_arg(t1), tcgv_vec_arg(v1), tcgv_vec_arg(t2));
        vec_gen_3(INDEX_op_x86_punpckl_vec, TCG_TYPE_V128, MO_8,
                  tcgv_vec_arg(t2), tcgv_vec_arg(t2), tcgv_vec_arg(v2));
        tcg_gen_mul_vec(MO_16, t1, t1, t2);
        tcg_gen_shri_vec(MO_16, t1, t1, 8);
        vec_gen_3(INDEX_op_x86_packus_vec, TCG_TYPE_V128, MO_8,
                  tcgv_vec_arg(v0), tcgv_vec_arg(t1), tcgv_vec_arg(t1));
        tcg_temp_free_vec(t1);
        tcg_temp_free_vec(t2);
        break;

    case INDEX_op_cmp_vec:
        {
            enum {
                NEED_SWAP = 1,
                NEED_INV = 2,
                NEED_BIAS = 4
            };
            static const uint8_t fixups[16] = {
                [0 ... 15] = -1,
                [TCG_COND_EQ] = 0,
                [TCG_COND_NE] = NEED_INV,
                [TCG_COND_GT] = 0,
                [TCG_COND_LT] = NEED_SWAP,
                [TCG_COND_LE] = NEED_INV,
                [TCG_COND_GE] = NEED_SWAP | NEED_INV,
                [TCG_COND_GTU] = NEED_BIAS,
                [TCG_COND_LTU] = NEED_BIAS | NEED_SWAP,
                [TCG_COND_LEU] = NEED_BIAS | NEED_INV,
                [TCG_COND_GEU] = NEED_BIAS | NEED_SWAP | NEED_INV,
            };
    case TCG_TYPE_V128:
    case TCG_TYPE_V256:
        t1 = tcg_temp_new_vec(type);
        t2 = tcg_temp_new_vec(type);
        t3 = tcg_temp_new_vec(type);
        t4 = tcg_temp_new_vec(type);
        tcg_gen_dup16i_vec(t4, 0);
        vec_gen_3(INDEX_op_x86_punpckl_vec, type, MO_8,
                  tcgv_vec_arg(t1), tcgv_vec_arg(v1), tcgv_vec_arg(t4));
        vec_gen_3(INDEX_op_x86_punpckl_vec, type, MO_8,
                  tcgv_vec_arg(t2), tcgv_vec_arg(t4), tcgv_vec_arg(v2));
        vec_gen_3(INDEX_op_x86_punpckh_vec, type, MO_8,
                  tcgv_vec_arg(t3), tcgv_vec_arg(v1), tcgv_vec_arg(t4));
        vec_gen_3(INDEX_op_x86_punpckh_vec, type, MO_8,
                  tcgv_vec_arg(t4), tcgv_vec_arg(t4), tcgv_vec_arg(v2));
        tcg_gen_mul_vec(MO_16, t1, t1, t2);
        tcg_gen_mul_vec(MO_16, t3, t3, t4);
        tcg_gen_shri_vec(MO_16, t1, t1, 8);
        tcg_gen_shri_vec(MO_16, t3, t3, 8);
        vec_gen_3(INDEX_op_x86_packus_vec, type, MO_8,
                  tcgv_vec_arg(v0), tcgv_vec_arg(t1), tcgv_vec_arg(t3));
        tcg_temp_free_vec(t1);
        tcg_temp_free_vec(t2);
        tcg_temp_free_vec(t3);
        tcg_temp_free_vec(t4);
        break;

        TCGCond cond;
        uint8_t fixup;
    default:
        g_assert_not_reached();
    }
}
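
Illustration (not part of the patch): one byte pair of the widening-multiply scheme described in the comment above. v1's byte is unpacked as 0|x and v2's as y|0, so the 16-bit product carries x*y's low byte with 8 bits of right padding; a logical shift by 8 and an unsigned saturating pack recover the 8-bit result. The helper name is invented for the example.

#include <assert.h>
#include <stdint.h>

/* Model one 16-bit lane of the byte multiply. */
static uint8_t model_mul8(uint8_t x, uint8_t y)
{
    uint16_t wx = x;                      /* unpack as 0 | x */
    uint16_t wy = (uint16_t)y << 8;       /* unpack as y | 0 */
    uint16_t prod = (uint16_t)(wx * wy);  /* ((x * y) & 0xff) << 8 */
    prod >>= 8;                           /* clear the padding */
    return (uint8_t)prod;                 /* PACKUSWB keeps values < 256 unchanged */
}

int main(void)
{
    for (unsigned x = 0; x < 256; x++) {
        for (unsigned y = 0; y < 256; y++) {
            assert(model_mul8(x, y) == (uint8_t)(x * y));
        }
    }
    return 0;
}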

        a1 = va_arg(va, TCGArg);
        a2 = va_arg(va, TCGArg);
        cond = va_arg(va, TCGArg);
        fixup = fixups[cond & 15];
        tcg_debug_assert(fixup != 0xff);
static void expand_vec_cmp(TCGType type, unsigned vece, TCGv_vec v0,
                           TCGv_vec v1, TCGv_vec v2, TCGCond cond)
{
    enum {
        NEED_SWAP = 1,
        NEED_INV = 2,
        NEED_BIAS = 4
    };
    static const uint8_t fixups[16] = {
        [0 ... 15] = -1,
        [TCG_COND_EQ] = 0,
        [TCG_COND_NE] = NEED_INV,
        [TCG_COND_GT] = 0,
        [TCG_COND_LT] = NEED_SWAP,
        [TCG_COND_LE] = NEED_INV,
        [TCG_COND_GE] = NEED_SWAP | NEED_INV,
        [TCG_COND_GTU] = NEED_BIAS,
        [TCG_COND_LTU] = NEED_BIAS | NEED_SWAP,
        [TCG_COND_LEU] = NEED_BIAS | NEED_INV,
        [TCG_COND_GEU] = NEED_BIAS | NEED_SWAP | NEED_INV,
    };
    TCGv_vec t1, t2;
    uint8_t fixup;

        if (fixup & NEED_INV) {
            cond = tcg_invert_cond(cond);
        }
        if (fixup & NEED_SWAP) {
            TCGArg t;
            t = a1, a1 = a2, a2 = t;
            cond = tcg_swap_cond(cond);
        }
    fixup = fixups[cond & 15];
    tcg_debug_assert(fixup != 0xff);

        t1 = t2 = NULL;
        if (fixup & NEED_BIAS) {
            t1 = tcg_temp_new_vec(type);
            t2 = tcg_temp_new_vec(type);
            tcg_gen_dupi_vec(vece, t2, 1ull << ((8 << vece) - 1));
            tcg_gen_sub_vec(vece, t1, temp_tcgv_vec(arg_temp(a1)), t2);
            tcg_gen_sub_vec(vece, t2, temp_tcgv_vec(arg_temp(a2)), t2);
            a1 = tcgv_vec_arg(t1);
            a2 = tcgv_vec_arg(t2);
            cond = tcg_signed_cond(cond);
        }
    if (fixup & NEED_INV) {
        cond = tcg_invert_cond(cond);
    }
    if (fixup & NEED_SWAP) {
        t1 = v1, v1 = v2, v2 = t1;
        cond = tcg_swap_cond(cond);
    }

        tcg_debug_assert(cond == TCG_COND_EQ || cond == TCG_COND_GT);
        vec_gen_4(INDEX_op_cmp_vec, type, vece, a0, a1, a2, cond);
    t1 = t2 = NULL;
    if (fixup & NEED_BIAS) {
        t1 = tcg_temp_new_vec(type);
        t2 = tcg_temp_new_vec(type);
        tcg_gen_dupi_vec(vece, t2, 1ull << ((8 << vece) - 1));
        tcg_gen_sub_vec(vece, t1, v1, t2);
        tcg_gen_sub_vec(vece, t2, v2, t2);
        v1 = t1;
        v2 = t2;
        cond = tcg_signed_cond(cond);
    }

        if (fixup & NEED_BIAS) {
            tcg_temp_free_vec(t1);
            tcg_temp_free_vec(t2);
        }
        if (fixup & NEED_INV) {
            tcg_gen_not_vec(vece, v0, v0);
        }
    tcg_debug_assert(cond == TCG_COND_EQ || cond == TCG_COND_GT);
    /* Expand directly; do not recurse. */
    vec_gen_4(INDEX_op_cmp_vec, type, vece,
              tcgv_vec_arg(v0), tcgv_vec_arg(v1), tcgv_vec_arg(v2), cond);

    if (t1) {
        tcg_temp_free_vec(t1);
        if (t2) {
            tcg_temp_free_vec(t2);
        }
    }
    if (fixup & NEED_INV) {
        tcg_gen_not_vec(vece, v0, v0);
    }
}
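
Illustration (not part of the patch): the NEED_BIAS fixup turns an unsigned lane comparison into a signed one by subtracting the lane's sign bit (the 1ull << ((8 << vece) - 1) constant above) from both operands; combined with NEED_INV and NEED_SWAP, every condition reduces to TCG_COND_EQ or TCG_COND_GT, which the signed compare instructions handle directly. A scalar sketch for 8-bit lanes, with an invented helper name:

#include <assert.h>
#include <stdint.h>

/* GTU on one byte lane, rewritten as a signed GT after biasing by 0x80. */
static int model_gtu8(uint8_t a, uint8_t b)
{
    int8_t sa = (int8_t)(a - 0x80);   /* tcg_gen_sub_vec with the bias */
    int8_t sb = (int8_t)(b - 0x80);
    return sa > sb;                   /* signed compare, as in PCMPGTB */
}

int main(void)
{
    for (unsigned a = 0; a < 256; a++) {
        for (unsigned b = 0; b < 256; b++) {
            assert(model_gtu8(a, b) == (a > b));
        }
    }
    return 0;
}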

void tcg_expand_vec_op(TCGOpcode opc, TCGType type, unsigned vece,
                       TCGArg a0, ...)
{
    va_list va;
    TCGArg a2;
    TCGv_vec v0, v1, v2;

    va_start(va, a0);
    v0 = temp_tcgv_vec(arg_temp(a0));
    v1 = temp_tcgv_vec(arg_temp(va_arg(va, TCGArg)));
    a2 = va_arg(va, TCGArg);

    switch (opc) {
    case INDEX_op_shli_vec:
    case INDEX_op_shri_vec:
        expand_vec_shi(type, vece, opc == INDEX_op_shri_vec, v0, v1, a2);
        break;

    case INDEX_op_sari_vec:
        expand_vec_sari(type, vece, v0, v1, a2);
        break;

    case INDEX_op_mul_vec:
        v2 = temp_tcgv_vec(arg_temp(a2));
        expand_vec_mul(type, vece, v0, v1, v2);
        break;

    case INDEX_op_cmp_vec:
        v2 = temp_tcgv_vec(arg_temp(a2));
        expand_vec_cmp(type, vece, v0, v1, v2, va_arg(va, TCGArg));
        break;

    default: