diff --git a/intrin-nolut.c b/intrin-nolut.c index 52963a2..b128693 100644 --- a/intrin-nolut.c +++ b/intrin-nolut.c @@ -1,3 +1,4 @@ +#ifdef __AVX__ //#define _GNU_SOURCE #include #include // vzeroupper @@ -240,3 +241,5 @@ void SYSV_ABI rs_process_nolut_intrin(void* dstvoid, const void* srcvoid, size_t * VPSLLVW doesn't exist until AVX512BW. AVX2 only has D and Q sizes. * On Haswell, those take 3 uops anyway (lat=2, recip tput=2). useless without fast vshift */ + +#endif \ No newline at end of file diff --git a/intrin-pinsrw.c b/intrin-pinsrw.c index 97ce5e1..f421249 100644 --- a/intrin-pinsrw.c +++ b/intrin-pinsrw.c @@ -25,8 +25,8 @@ void SYSV_ABI rs_process_pinsrw_intrin(void* dstvoid, const void* srcvoid, size_ const uint64_t *src = srcvoid; __m128i *dst = dstvoid; - const typeof(LH) L = LH; - const typeof(LH) H = LH + 256; + const uint32_t* L = LH; + const uint32_t* H = LH + 256; // _mm256_zeroupper(); for (size_t i = 0; i < size/sizeof(*dst) ; i+=1) { diff --git a/main.c b/main.c index b672235..5c092ac 100644 --- a/main.c +++ b/main.c @@ -6,7 +6,10 @@ * compile with: * x86_64-w64-mingw32-gcc to make a.exe * - * gcc -DIACA_MARKS_OFF -o rs-asmbench -g -Wall -march=native -funroll-loops -O3 -std=gnu11 main.c process-purec.c intrin-nolut.c reedsolomon-x86_64-mmx.s reedsolomon-x86_64-mmx-orig.s asm-avx2-vgatherdd.s intrin-pinsrw.c asm-pinsrw*.s + * gcc -DIACA_MARKS_OFF -o rs-asmbench -g -Wall -march=native -funroll-loops -O3 -std=gnu99 main.c process-purec.c intrin-nolut.c reedsolomon-x86_64-mmx.s reedsolomon-x86_64-mmx-orig.s asm-avx2-vgatherdd.s intrin-pinsrw.c asm-pinsrw*.s xordep*.[cs] + * (or, for older compilers) + * gcc -DIACA_MARKS_OFF -o rs-asmbench -g -Wall -march=native -funroll-loops -O3 -std=gnu99 main.c process-purec.c intrin-nolut.c reedsolomon-x86_64-mmx.s reedsolomon-x86_64-mmx-orig.s intrin-pinsrw.c asm-pinsrw*.s xordep*.[cs] + * * some ASM files have IACA marks in them, but the illegal-instruction code is only illegal for 32bit code. * * run with: @@ -54,6 +57,13 @@ #define HAVE_AVX2 0 #endif +#ifdef __AVX__ +#define VZEROUPPER if(HAVE_AVX2) _mm256_zeroupper(); +#else +#define VZEROUPPER +#endif + + static __inline__ uint64_t rdtsc() { uint32_t low, high; /* __asm__ __volatile__ ( @@ -75,8 +85,10 @@ void SYSV_ABI rs_process_pinsrw64(void* dst, const void* src, size_t size, const void SYSV_ABI rs_process_pinsrw128(void* dst, const void* src, size_t size, const uint32_t* LH); void SYSV_ABI rs_process_pinsrw_nodep(void* dst, const void* src, size_t size, const uint32_t* LH); void SYSV_ABI rs_process_uoptest(void* dst, const void* src, size_t size, const uint32_t* LH); +void SYSV_ABI rs_process_xordep_mul32767(void* dst, const void* src, size_t size, const uint32_t* LH); // rs_process_pinsrw_intrin void SYSV_ABI rs_dummy(void* dst, const void* src, size_t size, const uint32_t* LH) { } +void SYSV_ABI rs_memcpy(void* dst, const void* src, size_t size, const uint32_t* LH) { memcpy(dst, src, size) ;} #ifdef PERF_ONE #define ONE_ALGO_ONLY @@ -102,14 +114,14 @@ static uint64_t time_rs(rs_procfunc_t *fn, void* dst, const void* src, size_t si { uint64_t starttime, stoptime; - _mm256_zeroupper(); + VZEROUPPER starttime = rdtsc(); const int maxiter = ITERS; for (int c=0 ; c