From 22485bc7cbdf81abf1442c0a9d83c0a533004aa8 Mon Sep 17 00:00:00 2001 From: Katherine Whitlock Date: Mon, 14 Apr 2025 12:40:00 -0400 Subject: [PATCH 1/2] Fixes for load_interleaved and store --- include/argon/argon_full.hpp | 1 + include/argon/helpers/argon_for.hpp | 2 ++ include/argon/store.hpp | 32 ++++++++++---------- include/argon/vectorize/load.hpp | 3 +- include/argon/vectorize/load_interleaved.hpp | 9 +++--- 5 files changed, 26 insertions(+), 21 deletions(-) diff --git a/include/argon/argon_full.hpp b/include/argon/argon_full.hpp index ff3064d..c4171f2 100644 --- a/include/argon/argon_full.hpp +++ b/include/argon/argon_full.hpp @@ -20,6 +20,7 @@ #endif template + requires std::is_arithmetic_v class Argon : public argon::Vector> { using T = argon::Vector>; diff --git a/include/argon/helpers/argon_for.hpp b/include/argon/helpers/argon_for.hpp index b4e0a26..ba6cacc 100644 --- a/include/argon/helpers/argon_for.hpp +++ b/include/argon/helpers/argon_for.hpp @@ -12,7 +12,9 @@ template class ArgonHalf; + template + requires std::is_arithmetic_v class Argon; namespace argon::helpers { diff --git a/include/argon/store.hpp b/include/argon/store.hpp index c64b5dc..9b6188b 100644 --- a/include/argon/store.hpp +++ b/include/argon/store.hpp @@ -110,7 +110,7 @@ ace void store(scalar_type* ptr, intrinsic_types... vectors) { using intrinsic_type = typename std::tuple_element_t<0, std::tuple>; constexpr size_t size = sizeof...(vectors); - const std::array vec_array = {vectors...}; + constexpr std::array vec_array = {std::move(vectors)...}; // Best case scenerio: we know both length and stride static_assert(0 < stride && stride < 5, "Stores can only be performed with a stride of 1, 2, 3, or 4"); @@ -119,26 +119,26 @@ ace void store(scalar_type* ptr, intrinsic_types... vectors) { if constexpr (stride == 1) { constexpr size_t tail_size = size % 4; -#pragma unroll - for (auto v : vec_array | std::views::chunk(4)) { - if (v.size() == 4) { // 4-element chunks + constexpr size_t head_size = size - tail_size; + size_t i = 0; + if constexpr (head_size > 0) { + for (; i < head_size; i += 4) { using multi_type = simd::MultiVector_t; - simd::store1_x4(ptr, *(multi_type*)v.begin()); + simd::store1_x4(ptr, *(multi_type*)&vec_array[i]); ptr += (sizeof(intrinsic_type) / sizeof(*ptr)) * 4; // increment output pointer - } else { - if constexpr (tail_size == 1) { // 1-element tail - simd::store1(ptr, v.begin()); - } else if constexpr (tail_size == 2) { - using tail_multi_type = simd::MultiVector_t; - simd::store1_x2(ptr, *(tail_multi_type*)v.begin()); - } else if constexpr (tail_size == 3) { - using tail_multi_type = simd::MultiVector_t; - simd::store1_x3(ptr, *(tail_multi_type*)v.begin()); - } } } + if constexpr (tail_size == 1) { // 1-element tail + simd::store1(ptr, &vec_array[i]); + } else if constexpr (tail_size == 2) { + using tail_multi_type = simd::MultiVector_t; + simd::store1_x2(ptr, *(tail_multi_type*)&vec_array[i]); + } else if constexpr (tail_size == 3) { + using tail_multi_type = simd::MultiVector_t; + simd::store1_x3(ptr, *(tail_multi_type*)&vec_array[i]); + } } else { -#pragma unroll +#pragma GCC unroll size for (auto v : vec_array | std::views::chunk(stride)) { if constexpr (stride == 2) { store_interleaved<2>(ptr, v.begin()); diff --git a/include/argon/vectorize/load.hpp b/include/argon/vectorize/load.hpp index 4f2d2ee..a298a74 100644 --- a/include/argon/vectorize/load.hpp +++ b/include/argon/vectorize/load.hpp @@ -184,7 +184,8 @@ struct load : std::ranges::view_interface> { /// @brief Construct a load from a span /// @param span The span to load data from. - load(const std::span span) : start_{span.data()}, size_{vectorizeable_size(span.size()) / lanes} {} + load(const std::span span) + : start_{span.data()}, size_{helpers::vectorizeable_size(span.size()) / lanes} {} private: const ScalarType* start_; diff --git a/include/argon/vectorize/load_interleaved.hpp b/include/argon/vectorize/load_interleaved.hpp index fc40c3d..393ccea 100644 --- a/include/argon/vectorize/load_interleaved.hpp +++ b/include/argon/vectorize/load_interleaved.hpp @@ -27,7 +27,7 @@ struct load_interleaved : std::ranges::view_interface, Stride>; LoadInterleavedIterator() = default; - LoadInterleavedIterator(ScalarType* ptr) : ptr_{ptr} {} + LoadInterleavedIterator(const ScalarType* ptr) : ptr_{ptr} {} std::array, Stride> operator*() const { return Argon::template LoadInterleaved(ptr_); @@ -96,23 +96,24 @@ struct load_interleaved : std::ranges::view_interface); static_assert(std::bidirectional_iterator); static_assert(std::input_iterator); using iterator = LoadInterleavedIterator; + using sentinel = const ScalarType*; iterator begin() { return start_; } - ScalarType* end() { return start_ + size_; } + const ScalarType* end() { return start_ + size_; } size_t size() const { return size_ / (lanes * Stride); } template load_interleaved(R&& r) : start_{&*std::ranges::begin(r)}, size_{vectorizeable_size(std::ranges::size(r))} {} private: - ScalarType* start_; + const ScalarType* start_; size_t size_; }; From 41780c60e1300eb113e64e087cdc6ee32011920d Mon Sep 17 00:00:00 2001 From: Katherine Whitlock Date: Mon, 14 Apr 2025 12:44:23 -0400 Subject: [PATCH 2/2] test iterator and sentinel --- test/specs/vectorize/load_interleaved_spec.cpp | 4 ++-- test/specs/vectorize/load_spec.cpp | 4 ++-- 2 files changed, 4 insertions(+), 4 deletions(-) diff --git a/test/specs/vectorize/load_interleaved_spec.cpp b/test/specs/vectorize/load_interleaved_spec.cpp index 727fc76..e330e46 100644 --- a/test/specs/vectorize/load_interleaved_spec.cpp +++ b/test/specs/vectorize/load_interleaved_spec.cpp @@ -9,14 +9,14 @@ auto vectorize_load_interleaved = describe("vectorize_load_interleaved", ${ using element_type = int16_t; std::array vals; auto vec = argon::vectorize::load_interleaved(vals); - expect(std::is_same_v::iterator>).to_be_true(); + expect(std::is_same_v>>).to_be_true(); }); it("returns an end sentinel pointer when end() is called", _{ using element_type = int16_t; std::array vals; auto vec = argon::vectorize::load_interleaved(vals); - expect(std::is_same_v).to_be_true(); + expect(std::is_same_v>>).to_be_true(); }); it("can access all elements of vals", _{ diff --git a/test/specs/vectorize/load_spec.cpp b/test/specs/vectorize/load_spec.cpp index bda6fb2..4db7e53 100644 --- a/test/specs/vectorize/load_spec.cpp +++ b/test/specs/vectorize/load_spec.cpp @@ -9,14 +9,14 @@ auto vectorize_load = describe("vectorize_load", ${ using element_type = int16_t; std::array vals; auto vec = argon::vectorize::load(vals); - expect(std::is_same_v::iterator>).to_be_true(); + expect(std::is_same_v>>).to_be_true(); }); it("returns an end sentinel pointer when end() is called", _{ using element_type = int16_t; std::array vals; auto vec = argon::vectorize::load(vals); - expect(std::is_same_v::sentinel>).to_be_true(); + expect(std::is_same_v>>).to_be_true(); }); it("can access all elements of vals", _{