From 5e9b6a89ac29be768de14075ebe896f28e944e7a Mon Sep 17 00:00:00 2001 From: animetosho Date: Mon, 12 Oct 2015 22:34:26 +1000 Subject: [PATCH 1/4] Make it work on older compilers --- intrin-nolut.c | 3 +++ intrin-pinsrw.c | 4 ++-- main.c | 24 +++++++++++++++++++----- process-purec.c | 8 ++++---- 4 files changed, 28 insertions(+), 11 deletions(-) diff --git a/intrin-nolut.c b/intrin-nolut.c index 52963a2..b128693 100644 --- a/intrin-nolut.c +++ b/intrin-nolut.c @@ -1,3 +1,4 @@ +#ifdef __AVX__ //#define _GNU_SOURCE #include #include // vzeroupper @@ -240,3 +241,5 @@ void SYSV_ABI rs_process_nolut_intrin(void* dstvoid, const void* srcvoid, size_t * VPSLLVW doesn't exist until AVX512BW. AVX2 only has D and Q sizes. * On Haswell, those take 3 uops anyway (lat=2, recip tput=2). useless without fast vshift */ + +#endif \ No newline at end of file diff --git a/intrin-pinsrw.c b/intrin-pinsrw.c index 97ce5e1..f421249 100644 --- a/intrin-pinsrw.c +++ b/intrin-pinsrw.c @@ -25,8 +25,8 @@ void SYSV_ABI rs_process_pinsrw_intrin(void* dstvoid, const void* srcvoid, size_ const uint64_t *src = srcvoid; __m128i *dst = dstvoid; - const typeof(LH) L = LH; - const typeof(LH) H = LH + 256; + const uint32_t* L = LH; + const uint32_t* H = LH + 256; // _mm256_zeroupper(); for (size_t i = 0; i < size/sizeof(*dst) ; i+=1) { diff --git a/main.c b/main.c index b672235..732faeb 100644 --- a/main.c +++ b/main.c @@ -6,7 +6,10 @@ * compile with: * x86_64-w64-mingw32-gcc to make a.exe * - * gcc -DIACA_MARKS_OFF -o rs-asmbench -g -Wall -march=native -funroll-loops -O3 -std=gnu11 main.c process-purec.c intrin-nolut.c reedsolomon-x86_64-mmx.s reedsolomon-x86_64-mmx-orig.s asm-avx2-vgatherdd.s intrin-pinsrw.c asm-pinsrw*.s + * gcc -DIACA_MARKS_OFF -o rs-asmbench -g -Wall -march=native -funroll-loops -O3 -std=gnu99 main.c process-purec.c intrin-nolut.c reedsolomon-x86_64-mmx.s reedsolomon-x86_64-mmx-orig.s asm-avx2-vgatherdd.s intrin-pinsrw.c asm-pinsrw*.s + * (or, for older compilers) + * gcc -DIACA_MARKS_OFF -o rs-asmbench -g -Wall -march=native -funroll-loops -O3 -std=gnu99 main.c process-purec.c intrin-nolut.c reedsolomon-x86_64-mmx.s reedsolomon-x86_64-mmx-orig.s intrin-pinsrw.c asm-pinsrw*.s + * * some ASM files have IACA marks in them, but the illegal-instruction code is only illegal for 32bit code. * * run with: @@ -54,6 +57,13 @@ #define HAVE_AVX2 0 #endif +#ifdef __AVX__ +#define VZEROUPPER if(HAVE_AVX2) _mm256_zeroupper(); +#else +#define VZEROUPPER +#endif + + static __inline__ uint64_t rdtsc() { uint32_t low, high; /* __asm__ __volatile__ ( @@ -102,14 +112,14 @@ static uint64_t time_rs(rs_procfunc_t *fn, void* dst, const void* src, size_t si { uint64_t starttime, stoptime; - _mm256_zeroupper(); + VZEROUPPER starttime = rdtsc(); const int maxiter = ITERS; for (int c=0 ; c Date: Mon, 12 Oct 2015 22:35:28 +1000 Subject: [PATCH 2/4] Add memcpy test --- main.c | 2 ++ 1 file changed, 2 insertions(+) diff --git a/main.c b/main.c index 732faeb..fa99832 100644 --- a/main.c +++ b/main.c @@ -87,6 +87,7 @@ void SYSV_ABI rs_process_pinsrw_nodep(void* dst, const void* src, size_t size, c void SYSV_ABI rs_process_uoptest(void* dst, const void* src, size_t size, const uint32_t* LH); // rs_process_pinsrw_intrin void SYSV_ABI rs_dummy(void* dst, const void* src, size_t size, const uint32_t* LH) { } +void SYSV_ABI rs_memcpy(void* dst, const void* src, size_t size, const uint32_t* LH) { memcpy(dst, src, size) ;} #ifdef PERF_ONE #define ONE_ALGO_ONLY @@ -187,6 +188,7 @@ int main (int argc, char *argv[]) time_rs_print ("pinsrw128 ", rs_process_pinsrw128, dstbuf, srcbuf, size, LH); time_rs_print ("orig MMX-unpck", rs_process_x86_64_mmx_orig, dstbuf, srcbuf, size, LH); time_rs_print ("dummy ", rs_dummy, dstbuf, srcbuf, size, LH); + time_rs_print ("memcpy ", rs_memcpy, dstbuf, srcbuf, size, LH); time_rs_print ("MMX w/ 64b rdx", rs_process_x86_64_mmx, dstbuf, srcbuf, size, LH); time_rs_print ("pinsrw-intrin ", rs_process_pinsrw_intrin, dstbuf, srcbuf, size, LH); // time_rs_print ("pinsrw-unpipe ", rs_process_pinsrw_unpipelined, dstbuf, srcbuf, size, LH); From 3d0747958987926bcb8bd48eeb1bf7bc64de39a7 Mon Sep 17 00:00:00 2001 From: animetosho Date: Mon, 12 Oct 2015 22:36:36 +1000 Subject: [PATCH 3/4] Add disassembled JIT code from XOR_DEPENDS algorithm for multiply-by 32767 --- main.c | 7 +- xordep_mul32767.s | 204 ++++++++++++++++++++++++++++++++++++++++++++++ 2 files changed, 209 insertions(+), 2 deletions(-) create mode 100644 xordep_mul32767.s diff --git a/main.c b/main.c index fa99832..5c092ac 100644 --- a/main.c +++ b/main.c @@ -6,9 +6,9 @@ * compile with: * x86_64-w64-mingw32-gcc to make a.exe * - * gcc -DIACA_MARKS_OFF -o rs-asmbench -g -Wall -march=native -funroll-loops -O3 -std=gnu99 main.c process-purec.c intrin-nolut.c reedsolomon-x86_64-mmx.s reedsolomon-x86_64-mmx-orig.s asm-avx2-vgatherdd.s intrin-pinsrw.c asm-pinsrw*.s + * gcc -DIACA_MARKS_OFF -o rs-asmbench -g -Wall -march=native -funroll-loops -O3 -std=gnu99 main.c process-purec.c intrin-nolut.c reedsolomon-x86_64-mmx.s reedsolomon-x86_64-mmx-orig.s asm-avx2-vgatherdd.s intrin-pinsrw.c asm-pinsrw*.s xordep*.[cs] * (or, for older compilers) - * gcc -DIACA_MARKS_OFF -o rs-asmbench -g -Wall -march=native -funroll-loops -O3 -std=gnu99 main.c process-purec.c intrin-nolut.c reedsolomon-x86_64-mmx.s reedsolomon-x86_64-mmx-orig.s intrin-pinsrw.c asm-pinsrw*.s + * gcc -DIACA_MARKS_OFF -o rs-asmbench -g -Wall -march=native -funroll-loops -O3 -std=gnu99 main.c process-purec.c intrin-nolut.c reedsolomon-x86_64-mmx.s reedsolomon-x86_64-mmx-orig.s intrin-pinsrw.c asm-pinsrw*.s xordep*.[cs] * * some ASM files have IACA marks in them, but the illegal-instruction code is only illegal for 32bit code. * @@ -85,6 +85,7 @@ void SYSV_ABI rs_process_pinsrw64(void* dst, const void* src, size_t size, const void SYSV_ABI rs_process_pinsrw128(void* dst, const void* src, size_t size, const uint32_t* LH); void SYSV_ABI rs_process_pinsrw_nodep(void* dst, const void* src, size_t size, const uint32_t* LH); void SYSV_ABI rs_process_uoptest(void* dst, const void* src, size_t size, const uint32_t* LH); +void SYSV_ABI rs_process_xordep_mul32767(void* dst, const void* src, size_t size, const uint32_t* LH); // rs_process_pinsrw_intrin void SYSV_ABI rs_dummy(void* dst, const void* src, size_t size, const uint32_t* LH) { } void SYSV_ABI rs_memcpy(void* dst, const void* src, size_t size, const uint32_t* LH) { memcpy(dst, src, size) ;} @@ -193,6 +194,7 @@ int main (int argc, char *argv[]) time_rs_print ("pinsrw-intrin ", rs_process_pinsrw_intrin, dstbuf, srcbuf, size, LH); // time_rs_print ("pinsrw-unpipe ", rs_process_pinsrw_unpipelined, dstbuf, srcbuf, size, LH); time_rs_print ("Pure C ", rs_process_purec, dstbuf, srcbuf, size, LH); + time_rs_print ("xord mul32767 ", rs_process_xordep_mul32767, dstbuf, srcbuf, size, LH); puts ("----------------"); #endif for (int i=0 ; i<3 ; i++) { @@ -213,6 +215,7 @@ int main (int argc, char *argv[]) #else time_rs_print ("pinsrw128 ", rs_process_pinsrw128, dstbuf, srcbuf, size, LH); #endif + time_rs_print ("xord mul32767 ", rs_process_xordep_mul32767, dstbuf, srcbuf, size, LH); // fflush(stdout); #ifdef __AVX__ if (HAVE_AVX2) { diff --git a/xordep_mul32767.s b/xordep_mul32767.s new file mode 100644 index 0000000..0685c62 --- /dev/null +++ b/xordep_mul32767.s @@ -0,0 +1,204 @@ +# disassembled from XOR algo's JIT for multiplying by 32767 +# slightly modified for readability etc +# algorithm doesn't use LH tables, so that arg is ignored +# NOTE: size must be a multiple of 256 + +.text +.intel_syntax noprefix +.globl rs_process_xordep_mul32767 +rs_process_xordep_mul32767: +# rs_process_xordep_mul32767(void* dst (%rdi), const void* src (%rsi), size_t size (%rdx), const u16* LH (%rcx)); + +# save xmm6-15 for Windows' sake, not necessary otherwise + push rbp + mov rbp,rsp + mov rax,rsp + and rax,0xF + sub rbp,rax + movaps xmmword ptr [rbp-0x10],xmm6 + movaps xmmword ptr [rbp-0x20],xmm7 + movaps xmmword ptr [rbp-0x30],xmm8 + movaps xmmword ptr [rbp-0x40],xmm9 + movaps xmmword ptr [rbp-0x50],xmm10 + movaps xmmword ptr [rbp-0x60],xmm11 + movaps xmmword ptr [rbp-0x70],xmm12 + movaps xmmword ptr [rbp-0x80],xmm13 + movaps xmmword ptr [rbp-0x90],xmm14 + movaps xmmword ptr [rbp-0xA0],xmm15 + + mov rax,rsi #src + lea rcx,[rdi+rdx] #dest-end + mov rdx,rdi #dest + +.align 16 +.loop: +# pre-load inputs 3-15 into registers (can't fit all 16 inputs) + movaps xmm3,xmmword ptr [rax+0x30] + movaps xmm4,xmmword ptr [rax+0x40] + movaps xmm5,xmmword ptr [rax+0x50] + movaps xmm6,xmmword ptr [rax+0x60] + movaps xmm7,xmmword ptr [rax+0x70] + movaps xmm8,xmmword ptr [rax+0x80] + movaps xmm9,xmmword ptr [rax+0x90] + movaps xmm10,xmmword ptr [rax+0xA0] + movaps xmm11,xmmword ptr [rax+0xB0] + movaps xmm12,xmmword ptr [rax+0xC0] + movaps xmm13,xmmword ptr [rax+0xD0] + movaps xmm14,xmmword ptr [rax+0xE0] + movaps xmm15,xmmword ptr [rax+0xF0] + +# process 256 bytes + movaps xmm2,xmmword ptr [rax] + xorps xmm2,xmmword ptr [rax+0x10] + movdqa xmm1,xmmword ptr [rax+0x20] + movaps xmm0,xmm3 + xorps xmm2,xmm4 + xorps xmm2,xmm5 + xorps xmm2,xmm6 + pxor xmm1,xmm7 + xorps xmm0,xmm11 + xorps xmm2,xmm12 + xorps xmm2,xmm13 + xorps xmm2,xmm14 + pxor xmm1,xmm15 + xorps xmm0,xmm2 + pxor xmm1,xmm2 + movaps xmmword ptr [rdx],xmm0 + movdqa xmmword ptr [rdx+0x10],xmm1 + movaps xmm2,xmmword ptr [rax] + xorps xmm2,xmmword ptr [rax+0x10] + xorps xmm2,xmmword ptr [rax+0x20] + xorps xmm2,xmm3 + movdqa xmm1,xmm4 + movaps xmm0,xmm5 + xorps xmm2,xmm6 + xorps xmm2,xmm7 + xorps xmm2,xmm8 + pxor xmm1,xmm9 + xorps xmm0,xmm13 + xorps xmm2,xmm14 + xorps xmm2,xmm15 + xorps xmm0,xmm2 + pxor xmm1,xmm2 + movaps xmmword ptr [rdx+0x20],xmm0 + movdqa xmmword ptr [rdx+0x30],xmm1 + movdqa xmm1,xmmword ptr [rax] + movaps xmm0,xmmword ptr [rax+0x20] + pxor xmm1,xmm3 + xorps xmm0,xmm6 + movaps xmm2,xmm7 + xorps xmm2,xmm8 + xorps xmm2,xmm9 + xorps xmm2,xmm10 + xorps xmm2,xmm11 + xorps xmm2,xmm12 + xorps xmm2,xmm13 + xorps xmm2,xmm14 + xorps xmm2,xmm15 + xorps xmm0,xmm2 + pxor xmm1,xmm2 + movaps xmmword ptr [rdx+0x40],xmm0 + movdqa xmmword ptr [rdx+0x50],xmm1 + movdqa xmm1,xmmword ptr [rax] + movaps xmm0,xmmword ptr [rax+0x10] + pxor xmm1,xmmword ptr [rax+0x20] + xorps xmm0,xmm4 + pxor xmm1,xmm5 + xorps xmm0,xmm8 + movaps xmm2,xmm9 + xorps xmm2,xmm10 + xorps xmm2,xmm11 + xorps xmm2,xmm12 + xorps xmm2,xmm13 + xorps xmm2,xmm14 + xorps xmm2,xmm15 + xorps xmm0,xmm2 + pxor xmm1,xmm2 + movaps xmmword ptr [rdx+0x60],xmm0 + movdqa xmmword ptr [rdx+0x70],xmm1 + movaps xmm0,xmmword ptr [rax] + movaps xmm2,xmmword ptr [rax+0x10] + movdqa xmm1,xmmword ptr [rax+0x20] + xorps xmm0,xmm3 + pxor xmm1,xmm4 + xorps xmm0,xmm6 + pxor xmm1,xmm7 + xorps xmm0,xmm10 + xorps xmm2,xmm11 + xorps xmm2,xmm12 + xorps xmm2,xmm13 + xorps xmm2,xmm14 + xorps xmm2,xmm15 + xorps xmm0,xmm2 + pxor xmm1,xmm2 + movaps xmmword ptr [rdx+0x80],xmm0 + movdqa xmmword ptr [rdx+0x90],xmm1 + movaps xmm0,xmmword ptr [rax] + movdqa xmm1,xmmword ptr [rax+0x10] + xorps xmm0,xmmword ptr [rax+0x20] + movaps xmm2,xmm3 + pxor xmm1,xmm4 + xorps xmm0,xmm5 + pxor xmm1,xmm6 + xorps xmm0,xmm8 + pxor xmm1,xmm9 + xorps xmm0,xmm12 + xorps xmm2,xmm13 + xorps xmm2,xmm14 + xorps xmm2,xmm15 + xorps xmm0,xmm2 + pxor xmm1,xmm2 + movaps xmmword ptr [rdx+0xA0],xmm0 + movdqa xmmword ptr [rdx+0xB0],xmm1 + movaps xmm2,xmmword ptr [rax] + movaps xmm0,xmmword ptr [rax+0x20] + xorps xmm2,xmm4 + xorps xmm0,xmm5 + xorps xmm0,xmm7 + movdqa xmm1,xmm8 + xorps xmm0,xmm10 + pxor xmm1,xmm12 + pxor xmm1,xmm13 + xorps xmm2,xmm14 + xorps xmm2,xmm15 + xorps xmm0,xmm2 + pxor xmm1,xmm2 + movaps xmmword ptr [rdx+0xC0],xmm0 + movdqa xmmword ptr [rdx+0xD0],xmm1 + movaps xmm2,xmmword ptr [rax] + movaps xmm0,xmmword ptr [rax+0x10] + movdqa xmm1,xmmword ptr [rax+0x20] + pxor xmm1,xmm3 + pxor xmm1,xmm4 + xorps xmm2,xmm5 + xorps xmm0,xmm9 + pxor xmm1,xmm10 + pxor xmm1,xmm11 + pxor xmm1,xmm12 + xorps xmm2,xmm13 + xorps xmm0,xmm14 + xorps xmm2,xmm15 + xorps xmm0,xmm2 + pxor xmm1,xmm2 + movaps xmmword ptr [rdx+0xE0],xmm0 + movdqa xmmword ptr [rdx+0xF0],xmm1 +#end of main processing + + add rax,0x100 + add rdx,0x100 + cmp rdx,rcx + jl .loop + +# restore xmm6-15 + movaps xmm6,xmmword ptr [rbp-0x10] + movaps xmm7,xmmword ptr [rbp-0x20] + movaps xmm8,xmmword ptr [rbp-0x30] + movaps xmm9,xmmword ptr [rbp-0x40] + movaps xmm10,xmmword ptr [rbp-0x50] + movaps xmm11,xmmword ptr [rbp-0x60] + movaps xmm12,xmmword ptr [rbp-0x70] + movaps xmm13,xmmword ptr [rbp-0x80] + movaps xmm14,xmmword ptr [rbp-0x90] + movaps xmm15,xmmword ptr [rbp-0xA0] + pop rbp + ret From bd7a3b3964b8e02f193d02386fad800fc3b4157a Mon Sep 17 00:00:00 2001 From: animetosho Date: Mon, 12 Oct 2015 23:10:13 +1000 Subject: [PATCH 4/4] Oops, disassembled the non-XOR kernel; manually fix to be the same as the XOR one --- xordep_mul32767.s | 57 +++++++++++++++++++++++++++++++++-------------- 1 file changed, 40 insertions(+), 17 deletions(-) diff --git a/xordep_mul32767.s b/xordep_mul32767.s index 0685c62..1f099f2 100644 --- a/xordep_mul32767.s +++ b/xordep_mul32767.s @@ -47,11 +47,13 @@ rs_process_xordep_mul32767: movaps xmm14,xmmword ptr [rax+0xE0] movaps xmm15,xmmword ptr [rax+0xF0] -# process 256 bytes +# process 256 bytes; algorithm does it in 32 byte 'sub-blocks' + movaps xmm0,xmmword ptr [rdx] + movdqa xmm1,xmmword ptr [rdx+0x10] movaps xmm2,xmmword ptr [rax] xorps xmm2,xmmword ptr [rax+0x10] - movdqa xmm1,xmmword ptr [rax+0x20] - movaps xmm0,xmm3 + pxor xmm1,xmmword ptr [rax+0x20] + xorps xmm0,xmm3 xorps xmm2,xmm4 xorps xmm2,xmm5 xorps xmm2,xmm6 @@ -65,12 +67,15 @@ rs_process_xordep_mul32767: pxor xmm1,xmm2 movaps xmmword ptr [rdx],xmm0 movdqa xmmword ptr [rdx+0x10],xmm1 + + movaps xmm0,xmmword ptr [rdx+0x20] + movdqa xmm1,xmmword ptr [rdx+0x30] movaps xmm2,xmmword ptr [rax] xorps xmm2,xmmword ptr [rax+0x10] xorps xmm2,xmmword ptr [rax+0x20] xorps xmm2,xmm3 - movdqa xmm1,xmm4 - movaps xmm0,xmm5 + pxor xmm1,xmm4 + xorps xmm0,xmm5 xorps xmm2,xmm6 xorps xmm2,xmm7 xorps xmm2,xmm8 @@ -82,8 +87,11 @@ rs_process_xordep_mul32767: pxor xmm1,xmm2 movaps xmmword ptr [rdx+0x20],xmm0 movdqa xmmword ptr [rdx+0x30],xmm1 - movdqa xmm1,xmmword ptr [rax] - movaps xmm0,xmmword ptr [rax+0x20] + + movaps xmm0,xmmword ptr [rdx+0x40] + movdqa xmm1,xmmword ptr [rdx+0x50] + pxor xmm1,xmmword ptr [rax] + xorps xmm0,xmmword ptr [rax+0x20] pxor xmm1,xmm3 xorps xmm0,xmm6 movaps xmm2,xmm7 @@ -99,8 +107,11 @@ rs_process_xordep_mul32767: pxor xmm1,xmm2 movaps xmmword ptr [rdx+0x40],xmm0 movdqa xmmword ptr [rdx+0x50],xmm1 - movdqa xmm1,xmmword ptr [rax] - movaps xmm0,xmmword ptr [rax+0x10] + + movaps xmm0,xmmword ptr [rdx+0x60] + movdqa xmm1,xmmword ptr [rdx+0x70] + pxor xmm1,xmmword ptr [rax] + xorps xmm0,xmmword ptr [rax+0x10] pxor xmm1,xmmword ptr [rax+0x20] xorps xmm0,xmm4 pxor xmm1,xmm5 @@ -116,9 +127,12 @@ rs_process_xordep_mul32767: pxor xmm1,xmm2 movaps xmmword ptr [rdx+0x60],xmm0 movdqa xmmword ptr [rdx+0x70],xmm1 - movaps xmm0,xmmword ptr [rax] + + movaps xmm0,xmmword ptr [rdx+0x80] + movdqa xmm1,xmmword ptr [rdx+0x90] + xorps xmm0,xmmword ptr [rax] movaps xmm2,xmmword ptr [rax+0x10] - movdqa xmm1,xmmword ptr [rax+0x20] + pxor xmm1,xmmword ptr [rax+0x20] xorps xmm0,xmm3 pxor xmm1,xmm4 xorps xmm0,xmm6 @@ -133,8 +147,11 @@ rs_process_xordep_mul32767: pxor xmm1,xmm2 movaps xmmword ptr [rdx+0x80],xmm0 movdqa xmmword ptr [rdx+0x90],xmm1 - movaps xmm0,xmmword ptr [rax] - movdqa xmm1,xmmword ptr [rax+0x10] + + movaps xmm0,xmmword ptr [rdx+0xA0] + movdqa xmm1,xmmword ptr [rdx+0xB0] + xorps xmm0,xmmword ptr [rax] + pxor xmm1,xmmword ptr [rax+0x10] xorps xmm0,xmmword ptr [rax+0x20] movaps xmm2,xmm3 pxor xmm1,xmm4 @@ -150,12 +167,15 @@ rs_process_xordep_mul32767: pxor xmm1,xmm2 movaps xmmword ptr [rdx+0xA0],xmm0 movdqa xmmword ptr [rdx+0xB0],xmm1 + + movaps xmm0,xmmword ptr [rdx+0xC0] + movdqa xmm1,xmmword ptr [rdx+0xD0] movaps xmm2,xmmword ptr [rax] - movaps xmm0,xmmword ptr [rax+0x20] + xorps xmm0,xmmword ptr [rax+0x20] xorps xmm2,xmm4 xorps xmm0,xmm5 xorps xmm0,xmm7 - movdqa xmm1,xmm8 + pxor xmm1,xmm8 xorps xmm0,xmm10 pxor xmm1,xmm12 pxor xmm1,xmm13 @@ -165,9 +185,12 @@ rs_process_xordep_mul32767: pxor xmm1,xmm2 movaps xmmword ptr [rdx+0xC0],xmm0 movdqa xmmword ptr [rdx+0xD0],xmm1 + + movaps xmm0,xmmword ptr [rdx+0xE0] + movdqa xmm1,xmmword ptr [rdx+0xF0] movaps xmm2,xmmword ptr [rax] - movaps xmm0,xmmword ptr [rax+0x10] - movdqa xmm1,xmmword ptr [rax+0x20] + xorps xmm0,xmmword ptr [rax+0x10] + pxor xmm1,xmmword ptr [rax+0x20] pxor xmm1,xmm3 pxor xmm1,xmm4 xorps xmm2,xmm5