diff --git a/src/ailego/math/matrix_utility.i b/src/ailego/math/matrix_utility.i index 34951478..8ae3e245 100644 --- a/src/ailego/math/matrix_utility.i +++ b/src/ailego/math/matrix_utility.i @@ -12,6 +12,8 @@ // See the License for the specific language governing permissions and // limitations under the License. +#pragma once + #include namespace zvec { @@ -46,7 +48,7 @@ static inline float HorizontalAdd_FP32_V128(__m128 v) { return _mm_cvtss_f32(x4); #endif } -#endif // __SSE__ +#endif // __SSE__ #if defined(__SSE2__) static inline int32_t HorizontalAdd_INT32_V128(__m128i v) { @@ -71,7 +73,7 @@ static inline int64_t HorizontalAdd_INT64_V128(__m128i v) { _mm_add_epi64(_mm_shuffle_epi32(v, _MM_SHUFFLE(0, 0, 3, 2)), v)); #endif } -#endif // __SSE2__ +#endif // __SSE2__ #if defined(__SSSE3__) static const __m128i POPCNT_LOOKUP_SSE = @@ -86,7 +88,7 @@ static inline __m128i VerticalPopCount_INT8_V128(__m128i v) { __m128i hi = _mm_shuffle_epi8(POPCNT_LOOKUP_SSE, _mm_and_si128(_mm_srli_epi32(v, 4), low_mask)); return _mm_add_epi8(lo, hi); -#endif // __AVX512VL__ && __AVX512BITALG__ +#endif // __AVX512VL__ && __AVX512BITALG__ } static inline __m128i VerticalPopCount_INT16_V128(__m128i v) { @@ -96,7 +98,7 @@ static inline __m128i VerticalPopCount_INT16_V128(__m128i v) { __m128i total = VerticalPopCount_INT8_V128(v); return _mm_add_epi16(_mm_srli_epi16(total, 8), _mm_and_si128(total, _mm_set1_epi16(0xff))); -#endif // __AVX512VL__ && __AVX512BITALG__ +#endif // __AVX512VL__ && __AVX512BITALG__ } static inline __m128i VerticalPopCount_INT32_V128(__m128i v) { @@ -107,7 +109,7 @@ static inline __m128i VerticalPopCount_INT32_V128(__m128i v) { _mm_madd_epi16(VerticalPopCount_INT8_V128(v), _mm_set1_epi16(1)); return _mm_add_epi32(_mm_srli_epi32(total, 8), _mm_and_si128(total, _mm_set1_epi32(0xff))); -#endif // __AVX512VL__ && __AVX512VPOPCNTDQ__ +#endif // __AVX512VL__ && __AVX512VPOPCNTDQ__ } static inline __m128i VerticalPopCount_INT64_V128(__m128i v) { @@ -115,9 +117,9 @@ static inline __m128i VerticalPopCount_INT64_V128(__m128i v) { return _mm_popcnt_epi64(v); #else return _mm_sad_epu8(VerticalPopCount_INT8_V128(v), _mm_setzero_si128()); -#endif // __AVX512VL__ && __AVX512VPOPCNTDQ__ +#endif // __AVX512VL__ && __AVX512VPOPCNTDQ__ } -#endif // __SSSE3__ +#endif // __SSSE3__ #if defined(__SSE4_1__) static inline int16_t HorizontalMax_UINT8_V128(__m128i v) { @@ -127,7 +129,7 @@ static inline int16_t HorizontalMax_UINT8_V128(__m128i v) { v = _mm_max_epu8(v, _mm_srli_epi16(v, 8)); return static_cast(_mm_cvtsi128_si32(v)); } -#endif // __SSE4_1__ +#endif // __SSE4_1__ #if defined(__AVX__) static inline float HorizontalMax_FP32_V256(__m256 v) { @@ -147,7 +149,7 @@ static inline float HorizontalAdd_FP32_V256(__m256 v) { __m128 x4 = _mm_add_ss(_mm256_castps256_ps128(x2), x3); return _mm_cvtss_f32(x4); } -#endif // __AVX__ +#endif // __AVX__ #if defined(__AVX2__) static const __m256i POPCNT_MASK1_INT8_AVX = _mm256_set1_epi8(0x0f); @@ -169,7 +171,7 @@ static inline __m256i VerticalPopCount_INT8_V256(__m256i v) { POPCNT_LOOKUP_AVX, _mm256_and_si256(_mm256_srli_epi32(v, 4), POPCNT_MASK1_INT8_AVX)); return _mm256_add_epi8(lo, hi); -#endif // __AVX512VL__ && __AVX512BITALG__ +#endif // __AVX512VL__ && __AVX512BITALG__ } static inline __m256i VerticalPopCount_INT16_V256(__m256i v) { @@ -179,7 +181,7 @@ static inline __m256i VerticalPopCount_INT16_V256(__m256i v) { __m256i total = VerticalPopCount_INT8_V256(v); return _mm256_add_epi16(_mm256_srli_epi16(total, 8), _mm256_and_si256(total, POPCNT_MASK2_INT16_AVX)); -#endif // __AVX512VL__ && __AVX512BITALG__ +#endif // __AVX512VL__ && __AVX512BITALG__ } static inline __m256i VerticalPopCount_INT32_V256(__m256i v) { @@ -190,7 +192,7 @@ static inline __m256i VerticalPopCount_INT32_V256(__m256i v) { _mm256_madd_epi16(VerticalPopCount_INT8_V256(v), POPCNT_MASK1_INT16_AVX); return _mm256_add_epi32(_mm256_srli_epi32(total, 8), _mm256_and_si256(total, POPCNT_MASK1_INT32_AVX)); -#endif // __AVX512VL__ && __AVX512VPOPCNTDQ__ +#endif // __AVX512VL__ && __AVX512VPOPCNTDQ__ } static inline __m256i VerticalPopCount_INT64_V256(__m256i v) { @@ -198,7 +200,7 @@ static inline __m256i VerticalPopCount_INT64_V256(__m256i v) { return _mm256_popcnt_epi64(v); #else return _mm256_sad_epu8(VerticalPopCount_INT8_V256(v), POPCNT_ZERO_AVX); -#endif // __AVX512VL__ && __AVX512VPOPCNTDQ__ +#endif // __AVX512VL__ && __AVX512VPOPCNTDQ__ } static inline int16_t HorizontalMax_UINT8_V256(__m256i v) { @@ -226,7 +228,7 @@ static inline int64_t HorizontalAdd_INT64_V256(__m256i v) { __m128i x4 = _mm_add_epi64(_mm256_extractf128_si256(x2, 0), x3); return _mm_cvtsi128_si64(x4); } -#endif // __AVX2__ +#endif // __AVX2__ #if defined(__AVX512F__) static inline float HorizontalMax_FP32_V512(__m512 v) { @@ -242,7 +244,7 @@ static inline float HorizontalAdd_FP32_V512(__m512 v) { _mm256_castpd_ps(_mm512_extractf64x4_pd(_mm512_castps_pd(v), 1)); return HorizontalAdd_FP32_V256(_mm256_add_ps(low, high)); } -#endif // __AVX512F__ +#endif // __AVX512F__ #if defined(__AVX512FP16__) static inline float HorizontalMax_FP16_V512(__m512h v) { @@ -259,7 +261,7 @@ static inline float HorizontalAdd_FP16_V512(__m512h v) { return HorizontalAdd_FP32_V512(_mm512_add_ps(low, high)); } -#endif // __AVX512FP16__ +#endif // __AVX512FP16__ -} // namespace ailego -} // namespace zvec \ No newline at end of file +} // namespace ailego +} // namespace zvec \ No newline at end of file diff --git a/src/ailego/math_batch/distance_batch.h b/src/ailego/math_batch/distance_batch.h index c762a258..9494be85 100644 --- a/src/ailego/math_batch/distance_batch.h +++ b/src/ailego/math_batch/distance_batch.h @@ -17,12 +17,11 @@ #include #include "ailego/math/distance_matrix.h" #include "cosine_distance_batch.h" +#include "euclidean_distance_batch.h" #include "inner_product_distance_batch.h" - namespace zvec::ailego { - template < template class DistanceType, typename ValueType, size_t BatchSize, size_t PrefetchStep, typename = void> @@ -44,6 +43,21 @@ struct BaseDistance { out); } + if constexpr (std::is_same_v, + EuclideanDistanceMatrix>) { + return DistanceBatch::EuclideanDistanceBatch< + ValueType, BatchSize, PrefetchStep>::ComputeBatch(m, q, num, dim, + out); + } + + if constexpr (std::is_same_v< + DistanceType, + SquaredEuclideanDistanceMatrix>) { + return DistanceBatch::SquaredEuclideanDistanceBatch< + ValueType, BatchSize, PrefetchStep>::ComputeBatch(m, q, num, dim, + out); + } + _ComputeBatch(m, q, num, dim, out); } }; diff --git a/src/ailego/math_batch/distance_batch_math.h b/src/ailego/math_batch/distance_batch_math.h new file mode 100644 index 00000000..a2672cd8 --- /dev/null +++ b/src/ailego/math_batch/distance_batch_math.h @@ -0,0 +1,30 @@ +// Copyright 2025-present the zvec project +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#pragma once + +#if defined(__AVX2__) + +inline float sum4(__m128 v) { + v = _mm_add_ps(v, _mm_castsi128_ps(_mm_srli_si128(_mm_castps_si128(v), 8))); + return v[0] + v[1]; +} + +inline __m128 sum_top_bottom_avx(__m256 v) { + const __m128 high = _mm256_extractf128_ps(v, 1); + const __m128 low = _mm256_castps256_ps128(v); + return _mm_add_ps(high, low); +} + +#endif diff --git a/src/ailego/math_batch/euclidean_distance_batch.h b/src/ailego/math_batch/euclidean_distance_batch.h new file mode 100644 index 00000000..96705d4e --- /dev/null +++ b/src/ailego/math_batch/euclidean_distance_batch.h @@ -0,0 +1,176 @@ +// Copyright 2025-present the zvec project +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#pragma once + +#include +#include +#include +#include +#include +#include +#include "euclidean_distance_batch_impl.h" +#include "euclidean_distance_batch_impl_fp16.h" +#include "euclidean_distance_batch_impl_int8.h" + +namespace zvec::ailego::DistanceBatch { + +// SquaredEuclideanDistanceBatch +template +struct SquaredEuclideanDistanceBatch; + +// Function template partial specialization is not allowed, +// therefore the wrapper struct is required. +template +struct SquaredEuclideanDistanceBatchImpl { + using ValueType = typename std::remove_cv::type; + static void compute_one_to_many( + const ValueType *query, const ValueType **ptrs, + std::array &prefetch_ptrs, size_t dim, + float *sums) { + return compute_one_to_many_squared_euclidean_fallback( + query, ptrs, prefetch_ptrs, dim, sums); + } +}; + +template +struct SquaredEuclideanDistanceBatchImpl { + using ValueType = float; + static void compute_one_to_many( + const ValueType *query, const ValueType **ptrs, + std::array &prefetch_ptrs, size_t dim, + float *sums) { +#if defined(__AVX512F__) + if (zvec::ailego::internal::CpuFeatures::static_flags_.AVX2) { + return compute_one_to_many_squared_euclidean_avx512f_fp32( + query, ptrs, prefetch_ptrs, dim, sums); + } +#endif + +#if defined(__AVX2__) + if (zvec::ailego::internal::CpuFeatures::static_flags_.AVX2) { + return compute_one_to_many_squared_euclidean_avx2_fp32( + query, ptrs, prefetch_ptrs, dim, sums); + } +#endif + return compute_one_to_many_squared_euclidean_fallback( + query, ptrs, prefetch_ptrs, dim, sums); + } +}; + +template +struct SquaredEuclideanDistanceBatchImpl { + using ValueType = int8_t; + static void compute_one_to_many( + const int8_t *query, const int8_t **ptrs, + std::array &prefetch_ptrs, size_t dim, + float *sums) { +#if defined(__AVX2__) + if (zvec::ailego::internal::CpuFeatures::static_flags_.AVX2) { + return compute_one_to_many_squared_euclidean_avx2_int8( + query, ptrs, prefetch_ptrs, dim, sums); + } +#endif + return compute_one_to_many_squared_euclidean_fallback( + query, ptrs, prefetch_ptrs, dim, sums); + } +}; + +template +struct SquaredEuclideanDistanceBatchImpl { + using ValueType = ailego::Float16; + static void compute_one_to_many( + const ailego::Float16 *query, const ailego::Float16 **ptrs, + std::array &prefetch_ptrs, size_t dim, + float *sums) { +#if defined(__AVX512FP16__) + if (zvec::ailego::internal::CpuFeatures::static_flags_.AVX512_FP16) { + return compute_one_to_many_squared_euclidean_avx512fp16_fp16( + query, ptrs, prefetch_ptrs, dim, sums); + } +#endif +#if defined(__AVX512F__) + if (zvec::ailego::internal::CpuFeatures::static_flags_.AVX512F) { + return compute_one_to_many_squared_euclidean_avx512f_fp16( + query, ptrs, prefetch_ptrs, dim, sums); + } +#endif +#if defined(__AVX2__) + if (zvec::ailego::internal::CpuFeatures::static_flags_.AVX2) { + return compute_one_to_many_squared_euclidean_avx2_fp16( + query, ptrs, prefetch_ptrs, dim, sums); + } +#endif + return compute_one_to_many_squared_euclidean_fallback( + query, ptrs, prefetch_ptrs, dim, sums); + } +}; + +template +struct SquaredEuclideanDistanceBatch { + using ValueType = typename std::remove_cv::type; + + static inline void ComputeBatch(const ValueType **vecs, + const ValueType *query, size_t num_vecs, + size_t dim, float *results) { + size_t i = 0; + for (; i + BatchSize <= num_vecs; i += BatchSize) { + std::array prefetch_ptrs; + for (size_t j = 0; j < BatchSize; ++j) { + if (i + j + BatchSize * PrefetchStep < num_vecs) { + prefetch_ptrs[j] = vecs[i + j + BatchSize * PrefetchStep]; + } else { + prefetch_ptrs[j] = nullptr; + } + } + SquaredEuclideanDistanceBatchImpl< + ValueType, BatchSize>::compute_one_to_many(query, &vecs[i], + prefetch_ptrs, dim, + &results[i]); + } + for (; i < num_vecs; ++i) { // TODO: unroll by 1, 2, 4, 8, etc. + std::array prefetch_ptrs{nullptr}; + SquaredEuclideanDistanceBatchImpl::compute_one_to_many( + query, &vecs[i], prefetch_ptrs, dim, &results[i]); + } + } +}; + +// EuclideanDistanceBatch +template +struct EuclideanDistanceBatch; + +template +struct EuclideanDistanceBatch { + using ValueType = typename std::remove_cv::type; + + static inline void ComputeBatch(const ValueType **vecs, + const ValueType *query, size_t num_vecs, + size_t dim, float *results) { + SquaredEuclideanDistanceBatch::ComputeBatch( + vecs, query, num_vecs, dim, results); + + for (size_t i = 0; i < num_vecs; ++i) { + results[i] = std::sqrt(results[i]); + } + } +}; + +} // namespace zvec::ailego::DistanceBatch diff --git a/src/ailego/math_batch/euclidean_distance_batch_impl.h b/src/ailego/math_batch/euclidean_distance_batch_impl.h new file mode 100644 index 00000000..19be4ead --- /dev/null +++ b/src/ailego/math_batch/euclidean_distance_batch_impl.h @@ -0,0 +1,155 @@ +// Copyright 2025-present the zvec project +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#pragma once + +#include +#include +#include +#include +#include +#include "distance_batch_math.h" + +#define SSD_FP32_GENERAL(m, q, sum) \ + { \ + float x = m - q; \ + sum += (x * x); \ + } + +namespace zvec::ailego::DistanceBatch { + +template +static void compute_one_to_many_squared_euclidean_fallback( + const ValueType *query, const ValueType **ptrs, + std::array &prefetch_ptrs, size_t dim, + float *sums) { + for (size_t j = 0; j < BatchSize; ++j) { + sums[j] = 0.0; + SquaredEuclideanDistanceMatrix::Compute(ptrs[j], query, + dim, sums + j); + ailego_prefetch(&prefetch_ptrs[j]); + } +} + +#if defined(__AVX512F__) + +template +static std::enable_if_t, void> +compute_one_to_many_squared_euclidean_avx512f_fp32( + const ValueType *query, const ValueType **ptrs, + std::array &prefetch_ptrs, + size_t dimensionality, float *results) { + __m512 accs[dp_batch]; + for (size_t i = 0; i < dp_batch; ++i) { + accs[i] = _mm512_setzero_ps(); + } + size_t dim = 0; + for (; dim + 16 <= dimensionality; dim += 16) { + __m512 q = _mm512_loadu_ps(query + dim); + __m512 data_regs[dp_batch]; + for (size_t i = 0; i < dp_batch; ++i) { + data_regs[i] = _mm512_loadu_ps(ptrs[i] + dim); + } + if (prefetch_ptrs[0]) { + for (size_t i = 0; i < dp_batch; ++i) { + ailego_prefetch(prefetch_ptrs[i] + dim); + } + } + for (size_t i = 0; i < dp_batch; ++i) { + __m512 diff = _mm512_sub_ps(q, data_regs[i]); + accs[i] = _mm512_fmadd_ps(diff, diff, accs[i]); + } + } + + if (dim < dimensionality) { + __mmask32 mask = (__mmask32)((1 << (dimensionality - dim)) - 1); + + for (size_t i = 0; i < dp_batch; ++i) { + __m512 zmm_undefined = _mm512_undefined_ps(); + + accs[i] = _mm512_mask3_fmadd_ps( + _mm512_mask_loadu_ps(zmm_undefined, mask, query + dim), + _mm512_mask_loadu_ps(zmm_undefined, mask, ptrs[i] + dim), accs[i], + mask); + } + } + + for (size_t i = 0; i < dp_batch; ++i) { + results[i] = HorizontalAdd_FP32_V512(accs[i]); + } +} + +#endif + +#if defined(__AVX2__) + +template +static std::enable_if_t, void> +compute_one_to_many_squared_euclidean_avx2_fp32( + const ValueType *query, const ValueType **ptrs, + std::array &prefetch_ptrs, + size_t dimensionality, float *results) { + __m256 accs[dp_batch]; + for (size_t i = 0; i < dp_batch; ++i) { + accs[i] = _mm256_setzero_ps(); + } + size_t dim = 0; + for (; dim + 8 <= dimensionality; dim += 8) { + __m256 q = _mm256_loadu_ps(query + dim); + __m256 data_regs[dp_batch]; + for (size_t i = 0; i < dp_batch; ++i) { + data_regs[i] = _mm256_loadu_ps(ptrs[i] + dim); + } + if (prefetch_ptrs[0]) { + for (size_t i = 0; i < dp_batch; ++i) { + ailego_prefetch(prefetch_ptrs[i] + dim); + } + } + for (size_t i = 0; i < dp_batch; ++i) { + __m256 diff = _mm256_sub_ps(q, data_regs[i]); + accs[i] = _mm256_fmadd_ps(diff, diff, accs[i]); + } + } + + for (size_t i = 0; i < dp_batch; ++i) { + results[i] = HorizontalAdd_FP32_V256(accs[i]); + + switch (dimensionality - dim) { + case 7: + SSD_FP32_GENERAL(query[6], ptrs[i][6], results[i]); + /* FALLTHRU */ + case 6: + SSD_FP32_GENERAL(query[5], ptrs[i][5], results[i]); + /* FALLTHRU */ + case 5: + SSD_FP32_GENERAL(query[4], ptrs[i][4], results[i]); + /* FALLTHRU */ + case 4: + SSD_FP32_GENERAL(query[3], ptrs[i][3], results[i]); + /* FALLTHRU */ + case 3: + SSD_FP32_GENERAL(query[2], ptrs[i][2], results[i]); + /* FALLTHRU */ + case 2: + SSD_FP32_GENERAL(query[1], ptrs[i][1], results[i]); + /* FALLTHRU */ + case 1: + SSD_FP32_GENERAL(query[0], ptrs[i][0], results[i]); + } + } +} +#endif + + +} // namespace zvec::ailego::DistanceBatch \ No newline at end of file diff --git a/src/ailego/math_batch/euclidean_distance_batch_impl_fp16.h b/src/ailego/math_batch/euclidean_distance_batch_impl_fp16.h new file mode 100644 index 00000000..23196d52 --- /dev/null +++ b/src/ailego/math_batch/euclidean_distance_batch_impl_fp16.h @@ -0,0 +1,258 @@ +// Copyright 2025-present the zvec project +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#pragma once + +#include +#include +#include +#include +#include + +namespace zvec::ailego::DistanceBatch { + +#if defined(__AVX512FP16__) + +template +static std::enable_if_t, void> +compute_one_to_many_squared_euclidean_avx512fp16_fp16( + const ailego::Float16 *query, const ailego::Float16 **ptrs, + std::array &prefetch_ptrs, + size_t dimensionality, float *results) { + __m512h accs[dp_batch]; + + for (size_t i = 0; i < dp_batch; ++i) { + accs[i] = _mm512_setzero_ph(); + } + + size_t dim = 0; + for (; dim + 32 <= dimensionality; dim += 32) { + __m512h q = _mm512_loadu_ph(query + dim); + + __m512h data_regs[dp_batch]; + for (size_t i = 0; i < dp_batch; ++i) { + data_regs[i] = _mm512_loadu_ph(ptrs[i] + dim); + } + + if (prefetch_ptrs[0]) { + for (size_t i = 0; i < dp_batch; ++i) { + ailego_prefetch(prefetch_ptrs[i] + dim); + } + } + + for (size_t i = 0; i < dp_batch; ++i) { + __m512h diff = _mm512_sub_ph(data_regs[i], q); + accs[i] = _mm512_fmadd_ph(diff, diff, accs[i]); + } + } + + if (dim < dimensionality) { + __mmask32 mask = (__mmask32)((1 << (dimensionality - dim)) - 1); + + for (size_t i = 0; i < dp_batch; ++i) { + __m512i zmm_undefined = _mm512_undefined_epi32(); + __m512h zmm_undefined_ph = _mm512_undefined_ph(); + __m512h zmm_d = + _mm512_mask_sub_ph(zmm_undefined_ph, mask, + _mm512_castsi512_ph(_mm512_mask_loadu_epi16( + zmm_undefined, mask, query + dim)), + _mm512_castsi512_ph(_mm512_mask_loadu_epi16( + zmm_undefined, mask, ptrs[i] + dim))); + + accs[i] = _mm512_mask3_fmadd_ph(zmm_d, zmm_d, accs[i], mask); + } + } + + for (size_t i = 0; i < dp_batch; ++i) { + results[i] = HorizontalAdd_FP16_V512(accs[i]); + } +} + +#endif + +#if defined(__AVX512F__) + +template +static std::enable_if_t, void> +compute_one_to_many_squared_euclidean_avx512f_fp16( + const ailego::Float16 *query, const ailego::Float16 **ptrs, + std::array &prefetch_ptrs, + size_t dimensionality, float *results) { + __m512 accs[dp_batch]; + + for (size_t i = 0; i < dp_batch; ++i) { + accs[i] = _mm512_setzero_ps(); + } + + size_t dim = 0; + for (; dim + 32 <= dimensionality; dim += 32) { + __m512i q = + _mm512_loadu_si512(reinterpret_cast(query + dim)); + + __m512 q1 = _mm512_cvtph_ps(_mm512_castsi512_si256(q)); + __m512 q2 = _mm512_cvtph_ps(_mm512_extracti64x4_epi64(q, 1)); + + __m512 data_regs_1[dp_batch]; + __m512 data_regs_2[dp_batch]; + for (size_t i = 0; i < dp_batch; ++i) { + __m512i m = + _mm512_loadu_si512(reinterpret_cast(ptrs[i] + dim)); + + data_regs_1[i] = _mm512_cvtph_ps(_mm512_castsi512_si256(m)); + data_regs_2[i] = _mm512_cvtph_ps(_mm512_extracti64x4_epi64(m, 1)); + } + + if (prefetch_ptrs[0]) { + for (size_t i = 0; i < dp_batch; ++i) { + ailego_prefetch(prefetch_ptrs[i] + dim); + } + } + + for (size_t i = 0; i < dp_batch; ++i) { + __m512 diff1 = _mm512_sub_ps(q1, data_regs_1[i]); + accs[i] = _mm512_fmadd_ps(diff1, diff1, accs[i]); + + __m512 diff2 = _mm512_sub_ps(q2, data_regs_2[i]); + accs[i] = _mm512_fmadd_ps(diff2, diff2, accs[i]); + } + } + + if (dim + 16 < dimensionality) { + __m512 q = _mm512_cvtph_ps( + _mm256_loadu_si256(reinterpret_cast(query + dim))); + + __m512 data_regs[dp_batch]; + for (size_t i = 0; i < dp_batch; ++i) { + data_regs[i] = _mm512_cvtph_ps( + _mm256_loadu_si256(reinterpret_cast(ptrs[i] + dim))); + accs[i] = _mm512_fmadd_ps(q, data_regs[i], accs[i]); + } + + dim += 16; + } + + __m256 acc_new[dp_batch]; + for (size_t i = 0; i < dp_batch; ++i) { + acc_new[i] = _mm256_add_ps( + _mm512_castps512_ps256(accs[i]), + _mm256_castpd_ps(_mm512_extractf64x4_pd(_mm512_castps_pd(accs[i]), 1))); + } + + if (dim + 8 < dimensionality) { + __m256 q = _mm256_cvtph_ps( + _mm_loadu_si128(reinterpret_cast(query + dim))); + + for (size_t i = 0; i < dp_batch; ++i) { + __m256 m = _mm256_cvtph_ps( + _mm_loadu_si128(reinterpret_cast(ptrs[i] + dim))); + + __m256 diff = _mm256_sub_ps(m, q); + acc_new[i] = _mm256_fmadd_ps(diff, diff, acc_new[i]); + } + + dim += 8; + } + + for (size_t i = 0; i < dp_batch; ++i) { + results[i] = HorizontalAdd_FP32_V256(acc_new[i]); + } + + for (; dim < dimensionality; ++dim) { + for (size_t i = 0; i < dp_batch; ++i) { + float diff = (*(query + dim)) - (*(ptrs[i] + dim)); + results[i] += diff * diff; + } + } +} +#endif + +#if defined(__AVX2__) + +template +static std::enable_if_t, void> +compute_one_to_many_squared_euclidean_avx2_fp16( + const ailego::Float16 *query, const ailego::Float16 **ptrs, + std::array &prefetch_ptrs, + size_t dimensionality, float *results) { + __m256 accs[dp_batch]; + + for (size_t i = 0; i < dp_batch; ++i) { + accs[i] = _mm256_setzero_ps(); + } + + size_t dim = 0; + for (; dim + 16 <= dimensionality; dim += 16) { + __m256i q = + _mm256_loadu_si256(reinterpret_cast(query + dim)); + + __m256 q1 = _mm256_cvtph_ps(_mm256_castsi256_si128(q)); + __m256 q2 = _mm256_cvtph_ps(_mm256_extractf128_si256(q, 1)); + + __m256 data_regs_1[dp_batch]; + __m256 data_regs_2[dp_batch]; + for (size_t i = 0; i < dp_batch; ++i) { + __m256i m = + _mm256_loadu_si256(reinterpret_cast(ptrs[i] + dim)); + + data_regs_1[i] = _mm256_cvtph_ps(_mm256_castsi256_si128(m)); + data_regs_2[i] = _mm256_cvtph_ps(_mm256_extractf128_si256(m, 1)); + } + + if (prefetch_ptrs[0]) { + for (size_t i = 0; i < dp_batch; ++i) { + ailego_prefetch(prefetch_ptrs[i] + dim); + } + } + + for (size_t i = 0; i < dp_batch; ++i) { + __m256 diff1 = _mm256_sub_ps(q1, data_regs_1[i]); + accs[i] = _mm256_fmadd_ps(diff1, diff1, accs[i]); + + __m256 diff2 = _mm256_sub_ps(q2, data_regs_2[i]); + accs[i] = _mm256_fmadd_ps(diff2, diff2, accs[i]); + } + } + + if (dim + 8 < dimensionality) { + __m256 q = _mm256_cvtph_ps( + _mm_loadu_si128(reinterpret_cast(query + dim))); + + __m256 data_regs[dp_batch]; + for (size_t i = 0; i < dp_batch; ++i) { + data_regs[i] = _mm256_cvtph_ps( + _mm_loadu_si128(reinterpret_cast(ptrs[i] + dim))); + + __m256 diff = _mm256_sub_ps(q, data_regs[i]); + accs[i] = _mm256_fmadd_ps(diff, diff, accs[i]); + } + + dim += 8; + } + + for (size_t i = 0; i < dp_batch; ++i) { + results[i] = HorizontalAdd_FP32_V256(accs[i]); + } + + for (; dim < dimensionality; ++dim) { + for (size_t i = 0; i < dp_batch; ++i) { + float diff = (*(query + dim)) - (*(ptrs[i] + dim)); + results[i] += diff * diff; + } + } +} + +#endif + + +} // namespace zvec::ailego::DistanceBatch \ No newline at end of file diff --git a/src/ailego/math_batch/euclidean_distance_batch_impl_int8.h b/src/ailego/math_batch/euclidean_distance_batch_impl_int8.h new file mode 100644 index 00000000..e69fafa0 --- /dev/null +++ b/src/ailego/math_batch/euclidean_distance_batch_impl_int8.h @@ -0,0 +1,144 @@ +// Copyright 2025-present the zvec project +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#pragma once + +#include +#include +#include +#include + +#define SSD_INT8_GENERAL(m, q, sum) \ + { \ + int32_t x = m - q; \ + sum += static_cast(x * x); \ + } + +namespace zvec::ailego::DistanceBatch { + +#if defined(__AVX2__) + +template +static std::enable_if_t, void> +compute_one_to_many_squared_euclidean_avx2_int8( + const int8_t *query, const int8_t **ptrs, + std::array &prefetch_ptrs, size_t dimensionality, + float *results) { + __m256i accs[dp_batch]; + for (size_t i = 0; i < dp_batch; ++i) { + accs[i] = _mm256_setzero_si256(); + } + + size_t dim = 0; + for (; dim + 32 <= dimensionality; dim += 32) { + __m256i q = _mm256_loadu_si256((const __m256i *)(query + dim)); + __m256i data_regs[dp_batch]; + for (size_t i = 0; i < dp_batch; ++i) { + data_regs[i] = _mm256_loadu_si256((const __m256i *)(ptrs[i] + dim)); + } + if (prefetch_ptrs[0]) { + for (size_t i = 0; i < dp_batch; ++i) { + ailego_prefetch(prefetch_ptrs[i] + dim); + } + } + + for (size_t i = 0; i < dp_batch; ++i) { + __m256i data_diff = _mm256_sub_epi8(_mm256_max_epi8(q, data_regs[i]), + _mm256_min_epi8(q, data_regs[i])); + + __m256i diff0 = _mm256_cvtepu8_epi16(_mm256_castsi256_si128(data_diff)); + __m256i diff1 = + _mm256_cvtepu8_epi16(_mm256_extractf128_si256(data_diff, 1)); + accs[i] = _mm256_add_epi32(_mm256_madd_epi16(diff0, diff0), accs[i]); + accs[i] = _mm256_add_epi32(_mm256_madd_epi16(diff1, diff1), accs[i]); + } + } + + for (size_t i = 0; i < dp_batch; ++i) { + results[i] = HorizontalAdd_INT32_V256(accs[i]); + } + + if (dimensionality >= dim + 16) { + for (size_t i = 0; i < dp_batch; ++i) { + __m128i q = _mm_loadu_si128((const __m128i *)query + dim); + __m128i data_regs = _mm_loadu_si128((const __m128i *)(ptrs[i] + dim)); + + __m128i diff = + _mm_sub_epi8(_mm_max_epi8(q, data_regs), _mm_min_epi8(q, data_regs)); + + __m128i diff0 = _mm_cvtepu8_epi16(diff); + __m128i diff1 = _mm_cvtepu8_epi16(_mm_unpackhi_epi64(diff, diff)); + __m128i sum = _mm_add_epi32(_mm_madd_epi16(diff0, diff0), + _mm_madd_epi16(diff1, diff1)); + + results[i] += static_cast(HorizontalAdd_INT32_V128(sum)); + } + + dim += 16; + } + + for (size_t i = 0; i < dp_batch; ++i) { + switch (dimensionality - dim) { + case 15: + SSD_INT8_GENERAL(query + dim, ptrs[14] + dim, results[i]); + /* FALLTHRU */ + case 14: + SSD_INT8_GENERAL(query + dim, ptrs[13 + dim], results[i]); + /* FALLTHRU */ + case 13: + SSD_INT8_GENERAL(query + dim, ptrs[12] + dim, results[i]); + /* FALLTHRU */ + case 12: + SSD_INT8_GENERAL(query + dim, ptrs[11] + dim, results[i]); + /* FALLTHRU */ + case 11: + SSD_INT8_GENERAL(query + dim, ptrs[10 + dim], results[i]); + /* FALLTHRU */ + case 10: + SSD_INT8_GENERAL(query + dim, ptrs[9] + dim, results[i]); + /* FALLTHRU */ + case 9: + SSD_INT8_GENERAL(query + dim, ptrs[8] + dim, results[i]); + /* FALLTHRU */ + case 8: + SSD_INT8_GENERAL(query + dim, ptrs[7] + dim, results[i]); + /* FALLTHRU */ + case 7: + SSD_INT8_GENERAL(query + dim, ptrs[6] + dim, results[i]); + /* FALLTHRU */ + case 6: + SSD_INT8_GENERAL(query + dim, ptrs[5] + dim, results[i]); + /* FALLTHRU */ + case 5: + SSD_INT8_GENERAL(query + dim, ptrs[4] + dim, results[i]); + /* FALLTHRU */ + case 4: + SSD_INT8_GENERAL(query + dim, ptrs[3] + dim, results[i]); + /* FALLTHRU */ + case 3: + SSD_INT8_GENERAL(query + dim, ptrs[2] + dim, results[i]); + /* FALLTHRU */ + case 2: + SSD_INT8_GENERAL(query + dim, ptrs[1] + dim, results[i]); + /* FALLTHRU */ + case 1: + SSD_INT8_GENERAL(query + dim, ptrs[0] + dim, results[i]); + } + } +} + +#endif + + +} // namespace zvec::ailego::DistanceBatch \ No newline at end of file diff --git a/src/ailego/math_batch/inner_product_distance_batch.h b/src/ailego/math_batch/inner_product_distance_batch.h index f5799497..e0447f23 100644 --- a/src/ailego/math_batch/inner_product_distance_batch.h +++ b/src/ailego/math_batch/inner_product_distance_batch.h @@ -38,7 +38,8 @@ struct InnerProductDistanceBatchImpl { const ValueType *query, const ValueType **ptrs, std::array &prefetch_ptrs, size_t dim, float *sums) { - return compute_one_to_many_fallback(query, ptrs, prefetch_ptrs, dim, sums); + return compute_one_to_many_inner_product_fallback(query, ptrs, + prefetch_ptrs, dim, sums); } static DistanceBatchQueryPreprocessFunc GetQueryPreprocessFunc() { return nullptr; @@ -54,11 +55,12 @@ struct InnerProductDistanceBatchImpl { float *sums) { #if defined(__AVX2__) if (zvec::ailego::internal::CpuFeatures::static_flags_.AVX2) { - return compute_one_to_many_avx2_fp32( + return compute_one_to_many_inner_product_avx2_fp32( query, ptrs, prefetch_ptrs, dim, sums); } #endif - return compute_one_to_many_fallback(query, ptrs, prefetch_ptrs, dim, sums); + return compute_one_to_many_inner_product_fallback(query, ptrs, + prefetch_ptrs, dim, sums); } static DistanceBatchQueryPreprocessFunc GetQueryPreprocessFunc() { @@ -78,23 +80,24 @@ struct InnerProductDistanceBatchImpl { // query, ptrs, prefetch_ptrs, dim, sums); #if defined(__AVX512VNNI__) if (zvec::ailego::internal::CpuFeatures::static_flags_.AVX512_VNNI) { - return compute_one_to_many_avx512_vnni_int8( + return compute_one_to_many_inner_product_avx512_vnni_int8( query, ptrs, prefetch_ptrs, dim, sums); } #endif #if defined(__AVX2__) if (zvec::ailego::internal::CpuFeatures::static_flags_.AVX2) { - return compute_one_to_many_avx2_int8( + return compute_one_to_many_inner_product_avx2_int8( query, ptrs, prefetch_ptrs, dim, sums); } #endif - return compute_one_to_many_fallback(query, ptrs, prefetch_ptrs, dim, sums); + return compute_one_to_many_inner_product_fallback(query, ptrs, + prefetch_ptrs, dim, sums); } static DistanceBatchQueryPreprocessFunc GetQueryPreprocessFunc() { #if defined(__AVX512VNNI__) if (zvec::ailego::internal::CpuFeatures::static_flags_.AVX512_VNNI) { - return compute_one_to_many_avx512_vnni_int8_query_preprocess; + return compute_one_to_many_inner_product_avx512_vnni_int8_query_preprocess; } #endif return nullptr; @@ -110,23 +113,26 @@ struct InnerProductDistanceBatchImpl { float *sums) { #if defined(__AVX512FP16__) if (zvec::ailego::internal::CpuFeatures::static_flags_.AVX512_FP16) { - return compute_one_to_many_avx512fp16_fp16( + return compute_one_to_many_inner_product_avx512fp16_fp16( query, ptrs, prefetch_ptrs, dim, sums); } #endif #if defined(__AVX512F__) if (zvec::ailego::internal::CpuFeatures::static_flags_.AVX512F) { - return compute_one_to_many_avx512f_fp16( + return compute_one_to_many_inner_product_avx512f_fp16( query, ptrs, prefetch_ptrs, dim, sums); } #endif #if defined(__AVX2__) if (zvec::ailego::internal::CpuFeatures::static_flags_.AVX2) { - return compute_one_to_many_avx2_fp16( + return compute_one_to_many_inner_product_avx2_fp16( query, ptrs, prefetch_ptrs, dim, sums); } #endif - return compute_one_to_many_fallback(query, ptrs, prefetch_ptrs, dim, sums); + return compute_one_to_many_inner_product_fallback(query, ptrs, + prefetch_ptrs, dim, sums); } }; diff --git a/src/ailego/math_batch/inner_product_distance_batch_impl.h b/src/ailego/math_batch/inner_product_distance_batch_impl.h index 1554426f..55e3e6af 100644 --- a/src/ailego/math_batch/inner_product_distance_batch_impl.h +++ b/src/ailego/math_batch/inner_product_distance_batch_impl.h @@ -19,11 +19,12 @@ #include #include #include +#include "distance_batch_math.h" namespace zvec::ailego::DistanceBatch { template -static void compute_one_to_many_fallback( +static void compute_one_to_many_inner_product_fallback( const ValueType *query, const ValueType **ptrs, std::array &prefetch_ptrs, size_t dim, float *sums) { @@ -36,20 +37,9 @@ static void compute_one_to_many_fallback( #if defined(__AVX2__) -inline float sum4(__m128 v) { - v = _mm_add_ps(v, _mm_castsi128_ps(_mm_srli_si128(_mm_castps_si128(v), 8))); - return v[0] + v[1]; -} - -inline __m128 sum_top_bottom_avx(__m256 v) { - const __m128 high = _mm256_extractf128_ps(v, 1); - const __m128 low = _mm256_castps256_ps128(v); - return _mm_add_ps(high, low); -} - template static std::enable_if_t, void> -compute_one_to_many_avx2_fp32( +compute_one_to_many_inner_product_avx2_fp32( const ValueType *query, const ValueType **ptrs, std::array &prefetch_ptrs, size_t dimensionality, float *results) { diff --git a/src/ailego/math_batch/inner_product_distance_batch_impl_fp16.h b/src/ailego/math_batch/inner_product_distance_batch_impl_fp16.h index db9e81e0..30486cea 100644 --- a/src/ailego/math_batch/inner_product_distance_batch_impl_fp16.h +++ b/src/ailego/math_batch/inner_product_distance_batch_impl_fp16.h @@ -26,7 +26,7 @@ namespace zvec::ailego::DistanceBatch { template static std::enable_if_t, void> -compute_one_to_many_avx512fp16_fp16( +compute_one_to_many_inner_product_avx512fp16_fp16( const ailego::Float16 *query, const ailego::Float16 **ptrs, std::array &prefetch_ptrs, size_t dimensionality, float *results) { @@ -82,7 +82,7 @@ compute_one_to_many_avx512fp16_fp16( template static std::enable_if_t, void> -compute_one_to_many_avx512f_fp16( +compute_one_to_many_inner_product_avx512f_fp16( const ailego::Float16 *query, const ailego::Float16 **ptrs, std::array &prefetch_ptrs, size_t dimensionality, float *results) { @@ -172,7 +172,7 @@ compute_one_to_many_avx512f_fp16( template static std::enable_if_t, void> -compute_one_to_many_avx2_fp16( +compute_one_to_many_inner_product_avx2_fp16( const ailego::Float16 *query, const ailego::Float16 **ptrs, std::array &prefetch_ptrs, size_t dimensionality, float *results) { diff --git a/src/ailego/math_batch/inner_product_distance_batch_impl_int8.h b/src/ailego/math_batch/inner_product_distance_batch_impl_int8.h index 9f49effd..976eeb8b 100644 --- a/src/ailego/math_batch/inner_product_distance_batch_impl_int8.h +++ b/src/ailego/math_batch/inner_product_distance_batch_impl_int8.h @@ -23,8 +23,8 @@ namespace zvec::ailego::DistanceBatch { #if defined(__AVX512VNNI__) -static void compute_one_to_many_avx512_vnni_int8_query_preprocess(void *query, - size_t dim) { +static void compute_one_to_many_inner_product_avx512_vnni_int8_query_preprocess( + void *query, size_t dim) { const int8_t *input = reinterpret_cast(query); uint8_t *output = reinterpret_cast(query); @@ -51,7 +51,7 @@ static void compute_one_to_many_avx512_vnni_int8_query_preprocess(void *query, // query is unsigned template -static void compute_one_to_many_avx512_vnni_int8( +static void compute_one_to_many_inner_product_avx512_vnni_int8( const int8_t *query, const int8_t **ptrs, std::array &prefetch_ptrs, size_t dimensionality, float *results) { @@ -159,7 +159,7 @@ static void compute_one_to_many_avx512_vnni_int8( template static std::enable_if_t, void> -compute_one_to_many_avx2_int8( +compute_one_to_many_inner_product_avx2_int8( const int8_t *query, const int8_t **ptrs, std::array &prefetch_ptrs, size_t dimensionality, float *results) { diff --git a/src/core/metric/euclidean_metric.cc b/src/core/metric/euclidean_metric.cc index a1a8d598..dd37315e 100644 --- a/src/core/metric/euclidean_metric.cc +++ b/src/core/metric/euclidean_metric.cc @@ -853,18 +853,6 @@ class SquaredEuclideanMetric : public IndexMetric { //! Retrieve distance function for query MatrixBatchDistance batch_distance(void) const override { switch (data_type_) { - case IndexMeta::DataType::DT_BINARY32: - return reinterpret_cast( - ailego::BaseDistance::ComputeBatch); - -#if defined(AILEGO_M64) - case IndexMeta::DataType::DT_BINARY64: - return reinterpret_cast( - ailego::BaseDistance::ComputeBatch); -#endif // AILEGO_M64 - case IndexMeta::DataType::DT_FP16: return reinterpret_cast( ailego::BaseDistance( + ailego::BaseDistance::ComputeBatch); + + case IndexMeta::DataType::DT_FP32: + return reinterpret_cast( + ailego::BaseDistance::ComputeBatch); + + case IndexMeta::DataType::DT_INT8: + return reinterpret_cast( + ailego::BaseDistance::ComputeBatch); + + case IndexMeta::DataType::DT_INT4: + return reinterpret_cast( + ailego::BaseDistance::ComputeBatch); + + default: + return nullptr; + } + } + //! Retrieve params of Metric const ailego::Params ¶ms(void) const override { return params_; diff --git a/src/core/metric/hamming_metric.cc b/src/core/metric/hamming_metric.cc index ee19c066..95fa67b8 100644 --- a/src/core/metric/hamming_metric.cc +++ b/src/core/metric/hamming_metric.cc @@ -189,22 +189,6 @@ class HammingMetric : public IndexMetric { return nullptr; } - MatrixBatchDistance batch_distance(void) const override { -#if defined(AILEGO_M64) - if (feature_type_ == IndexMeta::DataType::DT_BINARY64) { - return reinterpret_cast( - ailego::BaseDistance::ComputeBatch); - } -#endif - if (feature_type_ == IndexMeta::DataType::DT_BINARY32) { - return reinterpret_cast( - ailego::BaseDistance::ComputeBatch); - } - return nullptr; - } - //! Retrieve distance function for index features MatrixDistance distance_matrix(size_t m, size_t n) const override { #if defined(AILEGO_M64)