From 5bd020bddbee667607c4d1d7c152124801c33e36 Mon Sep 17 00:00:00 2001 From: Jason Conway Date: Sat, 15 Oct 2022 20:26:38 -0500 Subject: [PATCH 1/3] legalize instructions Add mov instructions for packed floating-point values; single-precision min, max, and sqrt; movntq and movntdq --- emu/decode.h | 32 ++++++++++++++++++++++++++------ emu/vec.c | 23 ++++++++++++++++++++--- emu/vec.h | 10 +++++++++- 3 files changed, 55 insertions(+), 10 deletions(-) diff --git a/emu/decode.h b/emu/decode.h index 7edf05f217..9cb25f8598 100644 --- a/emu/decode.h +++ b/emu/decode.h @@ -268,11 +268,18 @@ __no_instrument DECODER_RET glue(DECODER_NAME, OP_SIZE)(DECODER_ARGS) { READMODRM; VMOV(xmm_modrm_val, xmm_modrm_reg,128); break; case 0x11: TRACEI("movupd xmm, xmm:modrm"); READMODRM; VMOV(xmm_modrm_reg, xmm_modrm_val,128); break; + case 0x12: TRACEI("movlpd xmm, modrm"); + READMODRM; V_OP(movl_p, modrm_val, xmm_modrm_reg,64); break; + case 0x13: TRACEI("movlpd modrm, xmm"); + READMODRM; V_OP(movl_pm, xmm_modrm_reg, modrm_val,64); break; case 0x14: TRACEI("unpcklpd xmm, xmm:modrm"); READMODRM; V_OP(unpackl_pd, xmm_modrm_val, xmm_modrm_reg,128); break; case 0x15: TRACEI("unpckhpd xmm, xmm:modrm"); READMODRM; V_OP(unpackh_pd, xmm_modrm_val, xmm_modrm_reg,128); break; - + case 0x16: TRACEI("movhpd xmm, modrm"); + READMODRM; V_OP(movh_p, modrm_val, xmm_modrm_reg,64); break; + case 0x17: TRACEI("movhpd modrm, xmm"); + READMODRM; V_OP(movh_pm, xmm_modrm_reg, modrm_val,64); break; case 0x2e: TRACEI("ucomisd xmm, xmm:modrm"); READMODRM; V_OP(single_ucomi, xmm_modrm_val, xmm_modrm_reg,64); break; case 0x2f: TRACEI("comisd xmm, xmm:modrm"); @@ -422,6 +429,8 @@ __no_instrument DECODER_RET glue(DECODER_NAME, OP_SIZE)(DECODER_ARGS) { READMODRM; V_OP(mulu, xmm_modrm_val, xmm_modrm_reg, 128); break; case 0xe6: TRACEI("cvttpd2dq xmm:modrm, xmm"); READMODRM; V_OP(cvttpd2dq, xmm_modrm_val, xmm_modrm_reg,64); break; + case 0xe7: TRACEI("movntdq xmm, xmm:modrm"); + READMODRM; VMOV(xmm_modrm_reg, xmm_modrm_val,128); break; case 0xe8: TRACEI("psubsb xmm:modrm, xmm"); READMODRM; V_OP(subss_b, xmm_modrm_val, xmm_modrm_reg,128); break; case 0xe9: TRACEI("psubsw xmm:modrm, xmm"); @@ -465,14 +474,18 @@ __no_instrument DECODER_RET glue(DECODER_NAME, OP_SIZE)(DECODER_ARGS) { READMODRM; VMOV(xmm_modrm_val, xmm_modrm_reg,128); break; case 0x11: TRACEI("movups xmm, xmm:modrm"); READMODRM; VMOV(xmm_modrm_reg, xmm_modrm_val,128); break; - + case 0x12: TRACEI("movlps xmm, modrm"); + READMODRM; V_OP(movl_p, modrm_val, xmm_modrm_reg,64); break; + case 0x13: TRACEI("movlps modrm, xmm"); + READMODRM; V_OP(movl_pm, xmm_modrm_reg, modrm_val,64); break; case 0x14: TRACEI("unpcklps xmm, xmm:modrm"); READMODRM; V_OP(unpackl_ps, xmm_modrm_val, xmm_modrm_reg,128); break; case 0x15: TRACEI("unpckhps xmm, xmm:modrm"); READMODRM; V_OP(unpackh_ps, xmm_modrm_val, xmm_modrm_reg,128); break; - case 0x16: TRACEI("movlhps xmm, xmm:modrm"); - READMODRM; V_OP(movlh_ps, xmm_modrm_val, xmm_modrm_reg,128); break; - + case 0x16: TRACEI("movhps xmm, modrm"); + READMODRM; V_OP(movh_p, modrm_val, xmm_modrm_reg,64); break; + case 0x17: TRACEI("movhps modrm, xmm"); + READMODRM; V_OP(movh_pm, xmm_modrm_reg, modrm_val,64); break; case 0x2e: TRACEI("ucomiss xmm, xmm:modrm"); READMODRM; V_OP(single_ucomi, xmm_modrm_val, xmm_modrm_reg,32); break; case 0x2f: TRACEI("comiss xmm, xmm:modrm"); @@ -530,7 +543,8 @@ __no_instrument DECODER_RET glue(DECODER_NAME, OP_SIZE)(DECODER_ARGS) { case 0xe5: TRACEI("pmulhw mm:modrm, mm"); READMODRM; V_OP(mulu, mm_modrm_val, mm_modrm_reg,64); break; - + case 0xe7: TRACEI("movntq mm, mm:modrm"); + READMODRM_MEM; VMOV(mm_modrm_reg, mm_modrm_val,64); break; case 0xef: TRACEI("pxor mm:modrm, mm"); READMODRM; V_OP(xor, mm_modrm_val, mm_modrm_reg,64); break; @@ -1186,6 +1200,8 @@ __no_instrument DECODER_RET glue(DECODER_NAME, OP_SIZE)(DECODER_ARGS) { READMODRM; V_OP(cvtsi2ss, modrm_val, xmm_modrm_reg,32); break; case 0x2c: TRACEI("cvttss2si reg, xmm:modrm"); READMODRM; V_OP(cvttss2si, xmm_modrm_val, modrm_reg,32); break; + case 0x51: TRACEI("sqrtss xmm:modrm, xmm"); + READMODRM; V_OP(single_fsqrt, xmm_modrm_val, xmm_modrm_reg,32); break; case 0x5a: TRACEI("cvtss2sd xmm:modrm, xmm"); READMODRM; V_OP(cvtss2sd, xmm_modrm_val, xmm_modrm_reg,32); break; case 0x5b: TRACEI("cvttps2dq xmm:modrm, xmm"); @@ -1197,8 +1213,12 @@ __no_instrument DECODER_RET glue(DECODER_NAME, OP_SIZE)(DECODER_ARGS) { READMODRM; V_OP(single_fmul, xmm_modrm_val, xmm_modrm_reg,32); break; case 0x5c: TRACEI("subss xmm:modrm, xmm"); READMODRM; V_OP(single_fsub, xmm_modrm_val, xmm_modrm_reg,32); break; + case 0x5d: TRACEI("minss xmm:modrm, xmm"); + READMODRM; V_OP(single_fmin, xmm_modrm_val, xmm_modrm_reg,32); break; case 0x5e: TRACEI("divss xmm:modrm, xmm"); READMODRM; V_OP(single_fdiv, xmm_modrm_val, xmm_modrm_reg,32); break; + case 0x5f: TRACEI("maxss xmm:modrm, xmm"); + READMODRM; V_OP(single_fmax, xmm_modrm_val, xmm_modrm_reg,32); break; case 0x6f: TRACEI("movdqu xmm:modrm, xmm"); READMODRM; VMOV(xmm_modrm_val, xmm_modrm_reg,128); break; diff --git a/emu/vec.c b/emu/vec.c index 0fb2e30b34..3b1e002627 100644 --- a/emu/vec.c +++ b/emu/vec.c @@ -381,6 +381,7 @@ void vec_single_fdiv64(NO_CPU, const double *src, double *dst) { *dst /= *src; } void vec_single_fdiv32(NO_CPU, const float *src, float *dst) { *dst /= *src; } void vec_single_fsqrt64(NO_CPU, const double *src, double *dst) { *dst = sqrt(*src); } +void vec_single_fsqrt32(NO_CPU, const float *src, float *dst) { *dst = sqrtf(*src); } void vec_single_fmax64(NO_CPU, const double *src, double *dst) { if (*src > *dst || isnan(*src) || isnan(*dst)) *dst = *src; @@ -388,6 +389,12 @@ void vec_single_fmax64(NO_CPU, const double *src, double *dst) { void vec_single_fmin64(NO_CPU, const double *src, double *dst) { if (*src < *dst || isnan(*src) || isnan(*dst)) *dst = *src; } +void vec_single_fmax32(NO_CPU, const float *src, float *dst) { + if (*src > *dst || isnan(*src) || isnan(*dst)) *dst = *src; +} +void vec_single_fmin32(NO_CPU, const float *src, float *dst) { + if (*src < *dst || isnan(*src) || isnan(*dst)) *dst = *src; +} void vec_single_ucomi32(struct cpu_state *cpu, const float *src, const float *dst) { cpu->zf_res = cpu->pf_res = 0; @@ -531,9 +538,6 @@ void vec_unpackh_pd128(NO_CPU, const union xmm_reg *src, union xmm_reg *dst) { dst->f64[0] = dst->f64[1]; dst->f64[1] = src->f64[1]; } -void vec_movlh_ps128(NO_CPU, union xmm_reg *src, union xmm_reg *dst) { - dst->qw[1] = src->qw[0]; -} void vec_packss_w128(NO_CPU, const union xmm_reg *src, union xmm_reg *dst) { dst->u32[0] = (satsw(dst->u16[0]) << 0x00) | (satsw(dst->u16[1]) << 0x08) | @@ -622,6 +626,19 @@ void vec_fmovmask_d128(NO_CPU, const union xmm_reg *src, uint32_t *dst) { } } +void vec_movl_p64(NO_CPU, const uint64_t *src, union xmm_reg *dst) { + dst->qw[0] = *src; +} +void vec_movl_pm64(NO_CPU, const union xmm_reg *src, uint64_t *dst) { + *dst = src->qw[0]; +} +void vec_movh_p64(NO_CPU, const uint64_t *src, union xmm_reg *dst) { + dst->qw[1] = *src; +} +void vec_movh_pm64(NO_CPU, const union xmm_reg *src, uint64_t *dst) { + *dst = src->qw[1]; +} + void vec_extract_w128(NO_CPU, const union xmm_reg *src, uint32_t *dst, uint8_t index) { *dst = src->u16[index % 8]; } diff --git a/emu/vec.h b/emu/vec.h index 51c3e5ff60..96f42eeda4 100644 --- a/emu/vec.h +++ b/emu/vec.h @@ -87,9 +87,12 @@ void vec_single_fsub32(NO_CPU, const float *src, float *dst); void vec_single_fdiv64(NO_CPU, const double *src, double *dst); void vec_single_fdiv32(NO_CPU, const float *src, float *dst); void vec_single_fsqrt64(NO_CPU, const double *src, double *dst); +void vec_single_fsqrt32(NO_CPU, const float *src, float *dst); void vec_single_fmax64(NO_CPU, const double *src, double *dst); +void vec_single_fmax32(NO_CPU, const float *src, float *dst); void vec_single_fmin64(NO_CPU, const double *src, double *dst); +void vec_single_fmin32(NO_CPU, const float *src, float *dst); void vec_single_ucomi32(struct cpu_state *cpu, const float *src, const float *dst); void vec_single_ucomi64(struct cpu_state *cpu, const double *src, const double *dst); void vec_single_fcmp64(NO_CPU, const double *src, union xmm_reg *dst, uint8_t type); @@ -124,7 +127,7 @@ void vec_unpackh_d128(NO_CPU, const union xmm_reg *src, union xmm_reg *dst); void vec_unpackh_dq128(NO_CPU, const union xmm_reg *src, union xmm_reg *dst); void vec_unpackh_ps128(NO_CPU, const union xmm_reg *src, union xmm_reg *dst); void vec_unpackh_pd128(NO_CPU, const union xmm_reg *src, union xmm_reg *dst); -void vec_movlh_ps128(NO_CPU, union xmm_reg *src, union xmm_reg *dst); + void vec_shuffle_lw128(NO_CPU, const union xmm_reg *src, union xmm_reg *dst, uint8_t encoding); void vec_shuffle_hw128(NO_CPU, const union xmm_reg *src, union xmm_reg *dst, uint8_t encoding); void vec_shuffle_d128(NO_CPU, const union xmm_reg *src, union xmm_reg *dst, uint8_t encoding); @@ -136,6 +139,11 @@ void vec_compares_gtb128(NO_CPU, const union xmm_reg *src, union xmm_reg *dst); void vec_compares_gtw128(NO_CPU, const union xmm_reg *src, union xmm_reg *dst); void vec_compares_gtd128(NO_CPU, const union xmm_reg *src, union xmm_reg *dst); +void vec_movl_p64(NO_CPU, const uint64_t *src, union xmm_reg *dst); +void vec_movl_pm64(NO_CPU, const union xmm_reg *src, uint64_t *dst); +void vec_movh_p64(NO_CPU, const uint64_t *src, union xmm_reg *dst); +void vec_movh_pm64(NO_CPU, const union xmm_reg *src, uint64_t *dst); + void vec_movmask_b128(NO_CPU, const union xmm_reg *src, uint32_t *dst); void vec_fmovmask_d128(NO_CPU, const union xmm_reg *src, uint32_t *dst); void vec_extract_w128(NO_CPU, const union xmm_reg *src, uint32_t *dst, uint8_t index); From 8146bf6989ae17cf7e4dccbda0b23599b67102f6 Mon Sep 17 00:00:00 2001 From: Jason Conway Date: Sat, 15 Oct 2022 20:33:25 -0500 Subject: [PATCH 2/3] Expand qemu-test and add tests for new instructions --- tests/e2e/qemu/qemu-test.c | 86 +++++++++++++++++++++++++++++++++++++- 1 file changed, 84 insertions(+), 2 deletions(-) diff --git a/tests/e2e/qemu/qemu-test.c b/tests/e2e/qemu/qemu-test.c index 76e4393e5b..d8044bf4cb 100644 --- a/tests/e2e/qemu/qemu-test.c +++ b/tests/e2e/qemu/qemu-test.c @@ -2116,8 +2116,8 @@ static void test_enter(void) #endif #ifdef TEST_SSE -typedef int __m64 __attribute__ ((__mode__ (__V2SI__))); -typedef float __m128 __attribute__ ((__mode__(__V4SF__))); +typedef int __m64 __attribute__ ((vector_size (8))); +typedef float __m128 __attribute__ ((vector_size (16))); typedef union { double d[2]; @@ -2134,6 +2134,73 @@ static uint64_t __attribute__((aligned(16))) test_values[4][2] = { { 0x0f76255a085427f8, 0xc233e9e8c4c9439a }, }; +#define MOV_OP(op, hi, rm)\ +{\ + r.q[0] = r.q[1] = 0;\ + if (rm) {\ + uint64_t mem;\ + asm volatile (#op " %1, %0" : "=m" (mem) : "x" (a.dq));\ + printf("%-9s: a=" FMT64X "" FMT64X " r=" FMT64X "\n", #op, a.q[1], a.q[0], mem);\ + } else {\ + uint64_t mem = a.q[hi];\ + asm volatile (#op " %1, %0" : "=x" (r.dq) : "m" (mem));\ + printf("%-9s: a=" FMT64X " r=" FMT64X "" FMT64X "\n", #op, mem, r.q[1], r.q[0]);\ + }\ +} +#define MOV_OP_REGMEM(op, hi, rm)\ +{\ + int i;\ + for(i=0;i<2;i++) {\ + a.q[0] = test_values[2*i][0];\ + a.q[1] = test_values[2*i][1];\ + MOV_OP(op, hi, rm);\ + }\ +} +#define MOVL_OP2(op)\ +{\ + MOV_OP_REGMEM(op, 0, 0);\ + MOV_OP_REGMEM(op, 0, 1);\ +} +#define MOVH_OP2(op)\ +{\ + MOV_OP_REGMEM(op, 1, 0);\ + MOV_OP_REGMEM(op, 1, 1);\ +} +#define MOVNT_OP(op, quad)\ +{\ + r.q[0] = r.q[1] = 0;\ + if (quad) {\ + asm volatile (#op " %1, %0" : "=m" (r.dq) : "x" (a.dq));\ + printf("%-9s: a=" FMT64X "" FMT64X " r=" FMT64X "" FMT64X "\n", #op, a.q[1], a.q[0], r.q[1], r.q[0]);\ + } else {\ + asm volatile (#op " %1, %0" : "=m" (r.q[0]) : "y" (a.q[0]));\ + printf("%-9s: a=" FMT64X " r=" FMT64X "\n", #op, a.q[0], r.q[0]);\ + }\ +} +#define MOVNT_OP2(op,quad)\ +{\ + int i;\ + for(i=0;i<2;i++) {\ + a.q[0] = test_values[2*i][0];\ + a.q[1] = test_values[2*i][1];\ + MOVNT_OP(op, quad);\ + }\ +} +#define MOVU_OP(op)\ +{\ + asm volatile (#op " %1, %0" : "=x" (r.dq) : "x" (a.dq));\ + printf("%-9s: a=" FMT64X "" FMT64X " r=" FMT64X "" FMT64X "\n",#op, a.q[1], a.q[0], r.q[1], r.q[0]);\ +} +#define MOVU_OP2(op)\ +{\ + int i;\ + for(i=0;i<2;i++) {\ + a.q[0] = test_values[2*i][0];\ + a.q[1] = test_values[2*i][1];\ + MOVU_OP(op);\ + }\ +} + #define SSE_OP(op)\ {\ asm volatile (#op " %2, %0" : "=x" (r.dq) : "0" (a.dq), "x" (b.dq));\ @@ -2701,6 +2768,21 @@ void test_sse(void) // CVT_OP_XMM(cvtdq2ps); // CVT_OP_XMM(cvtdq2pd); + /* sse/sse2 moves */ + MOVL_OP2(movlps); + MOVH_OP2(movhps); + MOVL_OP2(movlpd); + MOVH_OP2(movhpd); + MOVNT_OP2(movntq, 0); + MOVNT_OP2(movntdq, 1); + MOVU_OP2(movups); + MOVU_OP2(movupd); + + /* misc sse ops*/ + SSE_OP2(minss); + SSE_OP2(maxss); + SSE_OP2(sqrtss); + /* XXX: test PNI insns */ #if 0 SSE_OP2(movshdup); From f76e46a106eb985864ee2f34faa798d01e9c762a Mon Sep 17 00:00:00 2001 From: Jason Conway Date: Sat, 15 Oct 2022 20:35:03 -0500 Subject: [PATCH 3/3] Update golden file --- tests/e2e/qemu/expected.txt | 30 ++++++++++++++++++++++++++++++ 1 file changed, 30 insertions(+) diff --git a/tests/e2e/qemu/expected.txt b/tests/e2e/qemu/expected.txt index 340d49b1a2..d539307d3c 100644 --- a/tests/e2e/qemu/expected.txt +++ b/tests/e2e/qemu/expected.txt @@ -4696,3 +4696,33 @@ cvttsd2si: a=c00b3333333333334004cccccccccccd r=00000002 cvttpd2dq: a=c00b3333333333334004cccccccccccd r=0000000000000000fffffffd00000002 cvtsi2ss : a=fffffffa r=0000000000000000fffffffdc0c00000 cvtsi2sd : a=fffffffa r=0000000000000000c018000000000000 +movlps : a=456723c698694873 r=0000000000000000456723c698694873 +movlps : a=007c62c2085427f8 r=0000000000000000007c62c2085427f8 +movlps : a=dc515cff944a58ec456723c698694873 r=456723c698694873 +movlps : a=231be9e8cde7438d007c62c2085427f8 r=007c62c2085427f8 +movhps : a=dc515cff944a58ec r=dc515cff944a58ec007c62c2085427f8 +movhps : a=231be9e8cde7438d r=231be9e8cde7438d007c62c2085427f8 +movhps : a=dc515cff944a58ec456723c698694873 r=dc515cff944a58ec +movhps : a=231be9e8cde7438d007c62c2085427f8 r=231be9e8cde7438d +movlpd : a=456723c698694873 r=231be9e8cde7438d456723c698694873 +movlpd : a=007c62c2085427f8 r=231be9e8cde7438d007c62c2085427f8 +movlpd : a=dc515cff944a58ec456723c698694873 r=456723c698694873 +movlpd : a=231be9e8cde7438d007c62c2085427f8 r=007c62c2085427f8 +movhpd : a=dc515cff944a58ec r=dc515cff944a58ec007c62c2085427f8 +movhpd : a=231be9e8cde7438d r=231be9e8cde7438d007c62c2085427f8 +movhpd : a=dc515cff944a58ec456723c698694873 r=dc515cff944a58ec +movhpd : a=231be9e8cde7438d007c62c2085427f8 r=231be9e8cde7438d +movntq : a=456723c698694873 r=456723c698694873 +movntq : a=007c62c2085427f8 r=007c62c2085427f8 +movntdq : a=dc515cff944a58ec456723c698694873 r=dc515cff944a58ec456723c698694873 +movntdq : a=231be9e8cde7438d007c62c2085427f8 r=231be9e8cde7438d007c62c2085427f8 +movups : a=dc515cff944a58ec456723c698694873 r=dc515cff944a58ec456723c698694873 +movups : a=231be9e8cde7438d007c62c2085427f8 r=231be9e8cde7438d007c62c2085427f8 +movupd : a=dc515cff944a58ec456723c698694873 r=dc515cff944a58ec456723c698694873 +movupd : a=231be9e8cde7438d007c62c2085427f8 r=231be9e8cde7438d007c62c2085427f8 +minss : a=dc515cff944a58ec456723c698694873 b=41f21efba9e3e1461f297ccd58bad7ab r=dc515cff944a58ec456723c698694873 +minss : a=231be9e8cde7438d007c62c2085427f8 b=c233e9e8c4c9439a0f76255a085427f8 r=231be9e8cde7438d007c62c2085427f8 +maxss : a=dc515cff944a58ec456723c698694873 b=41f21efba9e3e1461f297ccd58bad7ab r=dc515cff944a58ec456723c658bad7ab +maxss : a=231be9e8cde7438d007c62c2085427f8 b=c233e9e8c4c9439a0f76255a085427f8 r=231be9e8cde7438d007c62c2085427f8 +sqrtss : a=dc515cff944a58ec456723c698694873 b=41f21efba9e3e1461f297ccd58bad7ab r=dc515cff944a58ec456723c64c1aa5bf +sqrtss : a=231be9e8cde7438d007c62c2085427f8 b=c233e9e8c4c9439a0f76255a085427f8 r=231be9e8cde7438d007c62c223e90c9e