The SSE2 instructions cvttps2dq, movdq2q, and movq2dq do not behave correctly, as shown by the attached program. It should print:
cvttps2dq_1 ... ok cvttps2dq_2 ... ok movdq2q_1 ... ok movq2dq_1 ... ok but instead produces cvttps2dq_1 ... ok cvttps2dq_2 ... not ok result0.sd[0] = 12 (expected 12) result0.sd[1] = 3 (expected 56) result0.sd[2] = -2147483648 (expected 43) result0.sd[3] = 3 (expected 87) movdq2q_1 ... not ok result0.uq[0] = 1302123111658042420 (expected 5124095577148911) movq2dq_1 ... not ok result0.uq[0] = 1302123111658042420 (expected 5124095577148911) result0.uq[1] = 6221254864647256184 (expected 0) I looked at QEMU's instruction decoders for these, and compared them to Valgrind's, but could not see what the problem was. The decode logic looks OK. Maybe the problem is elsewhere. J ------------------------------------------------------------------- #include <math.h> #include <setjmp.h> #include <signal.h> #include <stdio.h> #include <stdlib.h> typedef union { char sb[1]; unsigned char ub[1]; } reg8_t; typedef union { char sb[2]; unsigned char ub[2]; short sw[1]; unsigned short uw[1]; } reg16_t; typedef union { char sb[4]; unsigned char ub[4]; short sw[2]; unsigned short uw[2]; long int sd[1]; unsigned long int ud[1]; float ps[1]; } reg32_t; typedef union { char sb[8]; unsigned char ub[8]; short sw[4]; unsigned short uw[4]; long int sd[2]; unsigned long int ud[2]; long long int sq[1]; unsigned long long int uq[1]; float ps[2]; double pd[1]; } reg64_t __attribute__ ((aligned (8))); typedef union { char sb[16]; unsigned char ub[16]; short sw[8]; unsigned short uw[8]; long int sd[4]; unsigned long int ud[4]; long long int sq[2]; unsigned long long int uq[2]; float ps[4]; double pd[2]; } reg128_t __attribute__ ((aligned (16))); static sigjmp_buf catchpoint; static void handle_sigill(int signum) { siglongjmp(catchpoint, 1); } __attribute__((unused)) static int eq_float(float f1, float f2) { return f1 == f2 || fabsf(f1 - f2) < fabsf(f1) * 1.5 * pow(2,-12); } __attribute__((unused)) static int eq_double(double d1, double d2) { return d1 == d2 || fabs(d1 - d2) < fabs(d1) * 1.5 * 
pow(2,-12); } static void cvttps2dq_1(void) { reg128_t arg0 = { .ps = { 12.34F, 56.78F, 43.21F, 87.65F } }; reg128_t arg1 = { .sd = { 1L, 2L, 3L, 4L } }; reg128_t result0; char state[108]; if (sigsetjmp(catchpoint, 1) == 0) { asm( "fsave %3\n" "movlps 0%0, %%xmm4\n" "movhps 8%0, %%xmm4\n" "movlps 0%1, %%xmm5\n" "movhps 8%1, %%xmm5\n" "cvttps2dq %%xmm4, %%xmm5\n" "movlps %%xmm5, 0%2\n" "movhps %%xmm5, 8%2\n" "frstor %3\n" : : "m" (arg0), "m" (arg1), "m" (result0), "m" (state[0]) : "xmm4", "xmm5" ); if (result0.sd[0] == 12L && result0.sd[1] == 56L && result0.sd[2] == 43L && result0.sd[3] == 87L ) { printf("cvttps2dq_1 ... ok\n"); } else { printf("cvttps2dq_1 ... not ok\n"); printf(" result0.sd[0] = %ld (expected %ld)\n", result0.sd[0], 12L); printf(" result0.sd[1] = %ld (expected %ld)\n", result0.sd[1], 56L); printf(" result0.sd[2] = %ld (expected %ld)\n", result0.sd[2], 43L); printf(" result0.sd[3] = %ld (expected %ld)\n", result0.sd[3], 87L); } } else { printf("cvttps2dq_1 ... failed\n"); } return; } static void cvttps2dq_2(void) { reg128_t arg0 = { .ps = { 12.34F, 56.78F, 43.21F, 87.65F } }; reg128_t arg1 = { .sd = { 1L, 2L, 3L, 4L } }; reg128_t result0; char state[108]; if (sigsetjmp(catchpoint, 1) == 0) { asm( "fsave %3\n" "movlps 0%1, %%xmm5\n" "movhps 8%1, %%xmm5\n" "cvttps2dq %0, %%xmm5\n" "movlps %%xmm5, 0%2\n" "movhps %%xmm5, 8%2\n" "frstor %3\n" : : "m" (arg0), "m" (arg1), "m" (result0), "m" (state[0]) : "xmm4", "xmm5" ); if (result0.sd[0] == 12L && result0.sd[1] == 56L && result0.sd[2] == 43L && result0.sd[3] == 87L ) { printf("cvttps2dq_2 ... ok\n"); } else { printf("cvttps2dq_2 ... not ok\n"); printf(" result0.sd[0] = %ld (expected %ld)\n", result0.sd[0], 12L); printf(" result0.sd[1] = %ld (expected %ld)\n", result0.sd[1], 56L); printf(" result0.sd[2] = %ld (expected %ld)\n", result0.sd[2], 43L); printf(" result0.sd[3] = %ld (expected %ld)\n", result0.sd[3], 87L); } } else { printf("cvttps2dq_2 ... 
failed\n"); } return; } static void movdq2q_1(void) { reg128_t arg0 = { .uq = { 0x012345678abcdefULL, 0xfedcba9876543210ULL } }; reg64_t arg1 = { .uq = { 0x1212121234343434ULL } }; reg64_t result0; char state[108]; if (sigsetjmp(catchpoint, 1) == 0) { asm( "fsave %3\n" "movlps 0%0, %%xmm4\n" "movhps 8%0, %%xmm4\n" "movq %1, %%mm6\n" "movdq2q %%xmm4, %%mm6\n" "movq %%mm6, %2\n" "frstor %3\n" : : "m" (arg0), "m" (arg1), "m" (result0), "m" (state[0]) : "xmm4", "mm6" ); if (result0.uq[0] == 0x012345678abcdefULL ) { printf("movdq2q_1 ... ok\n"); } else { printf("movdq2q_1 ... not ok\n"); printf(" result0.uq[0] = %llu (expected %llu)\n", result0.uq[0], 0x012345678abcdefULL); } } else { printf("movdq2q_1 ... failed\n"); } return; } static void movq2dq_1(void) { reg64_t arg0 = { .uq = { 0x012345678abcdefULL } }; reg128_t arg1 = { .uq = { 0x1212121234343434ULL, 0x5656565678787878ULL } }; reg128_t result0; char state[108]; if (sigsetjmp(catchpoint, 1) == 0) { asm( "fsave %3\n" "movq %0, %%mm6\n" "movlps 0%1, %%xmm4\n" "movhps 8%1, %%xmm4\n" "movq2dq %%mm6, %%xmm4\n" "movlps %%xmm4, 0%2\n" "movhps %%xmm4, 8%2\n" "frstor %3\n" : : "m" (arg0), "m" (arg1), "m" (result0), "m" (state[0]) : "mm6", "xmm4" ); if (result0.uq[0] == 0x012345678abcdefULL && result0.uq[1] == 0ULL ) { printf("movq2dq_1 ... ok\n"); } else { printf("movq2dq_1 ... not ok\n"); printf(" result0.uq[0] = %llu (expected %llu)\n", result0.uq[0], 0x012345678abcdefULL); printf(" result0.uq[1] = %llu (expected %llu)\n", result0.uq[1], 0ULL); } } else { printf("movq2dq_1 ... failed\n"); } return; } int main(int argc, char **argv) { signal(SIGILL, handle_sigill); cvttps2dq_1(); cvttps2dq_2(); movdq2q_1(); movq2dq_1(); exit(0); } _______________________________________________ Qemu-devel mailing list Qemu-devel@nongnu.org http://lists.nongnu.org/mailman/listinfo/qemu-devel