From 70704b4ec30b69952c70372e0549ff9f5541eddc Mon Sep 17 00:00:00 2001 From: uiop07558 Date: Tue, 6 Feb 2024 17:03:11 +0300 Subject: [PATCH 01/15] add reference implementation for IDW --- .vscode/settings.json | 3 +++ CMakeLists.txt | 1 + include/algos.hpp | 3 +++ src/bgr2gray.cpp | 47 +++++++++++++++++++++--------------------- src/idw.cpp | 48 +++++++++++++++++++++++++++++++++++++++++++ test/test_main.cpp | 19 +++++++++++++++++ 6 files changed, 98 insertions(+), 23 deletions(-) create mode 100644 .vscode/settings.json create mode 100644 src/idw.cpp diff --git a/.vscode/settings.json b/.vscode/settings.json new file mode 100644 index 0000000..70e34ec --- /dev/null +++ b/.vscode/settings.json @@ -0,0 +1,3 @@ +{ + "C_Cpp.errorSquiggles": "disabled" +} \ No newline at end of file diff --git a/CMakeLists.txt b/CMakeLists.txt index af70cbb..85f2881 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -114,6 +114,7 @@ add_executable(test_algo ${PERF_SOURCES}) target_link_libraries(test_algo algos "${binary_dir}/lib/libopencv_ts.a" + "${binary_dir}/lib/libopencv_imgcodecs.so" pthread ) target_include_directories(test_algo PUBLIC diff --git a/include/algos.hpp b/include/algos.hpp index c8517bc..a95cf5d 100644 --- a/include/algos.hpp +++ b/include/algos.hpp @@ -32,3 +32,6 @@ void convolution_nhwc_halide(float* src, float* kernel, float* dst, void convolution_opencv(const cv::Mat& src, const cv::Mat& weights, cv::Mat& dst, int inpChannels, int outChannels); #endif // HAVE_OPENCV_DNN + + +void idw_ref(const uint8_t* src, uint8_t* dst, int height, int width, int* points, float* weights); diff --git a/src/bgr2gray.cpp b/src/bgr2gray.cpp index 67049db..ad7ef5d 100644 --- a/src/bgr2gray.cpp +++ b/src/bgr2gray.cpp @@ -60,29 +60,30 @@ void bgr2gray_interleaved_halide(uint8_t* src, uint8_t* dst, int height, int wid int factor = 16; f.vectorize(x, factor); - - // Compile - Target target; - target.os = Target::OS::Linux; - target.arch = Target::Arch::RISCV; - target.bits = 64; - target.vector_bits = factor * sizeof(uint8_t) * 8; - - // Tested XuanTie C906 has 128-bit vector unit - CV_Assert(target.vector_bits <= 128); - - std::vector features; - features.push_back(Target::RVV); - features.push_back(Target::NoAsserts); - features.push_back(Target::NoRuntime); - target.set_features(features); - - std::cout << target << std::endl; - f.print_loop_nest(); - - // Dump AOT code - f.compile_to_header("bgr2gray_interleaved.h", {input}, "bgr2gray_interleaved", target); - f.compile_to_assembly("bgr2gray_interleaved.s", {input}, "bgr2gray_interleaved", target); + f.realize(output); + + // // Compile + // Target target; + // target.os = Target::OS::Linux; + // target.arch = Target::Arch::RISCV; + // target.bits = 64; + // target.vector_bits = factor * sizeof(uint8_t) * 8; + + // // Tested XuanTie C906 has 128-bit vector unit + // CV_Assert(target.vector_bits <= 128); + + // std::vector features; + // features.push_back(Target::RVV); + // features.push_back(Target::NoAsserts); + // features.push_back(Target::NoRuntime); + // target.set_features(features); + + // std::cout << target << std::endl; + // f.print_loop_nest(); + + // // Dump AOT code + // f.compile_to_header("bgr2gray_interleaved.h", {input}, "bgr2gray_interleaved", target); + // f.compile_to_assembly("bgr2gray_interleaved.s", {input}, "bgr2gray_interleaved", target); } #endif } diff --git a/src/idw.cpp b/src/idw.cpp new file mode 100644 index 0000000..28be2cd --- /dev/null +++ b/src/idw.cpp @@ -0,0 +1,48 @@ +#include + +#include "algos.hpp" + +#include + +using namespace std; + +static const int pointCount = 100; + +void idw_ref(const uint8_t* src, uint8_t* dst, int height, int width, int* points, float* weights) { + float* mask = new float[height * width](); + + float maxVal = -numeric_limits::infinity(); + float minVal = numeric_limits::infinity(); + + for (int y = 0; y < height; y++) { + for (int x = 0; x < width; x++) { + float dot = 0; + for (int i = 0; i < pointCount; i++) { + int x0 = points[i]; + int y0 = points[i + 1]; + int dx = x - x0; + int dy = y - y0; + + dot += sqrt(dx*dx + dy*dy) * weights[i]; + } + + mask[y*width + x] = dot; + if (dot < minVal) { + minVal = dot; + } + if (dot > maxVal) { + maxVal = dot; + } + } + } + + float diff = maxVal - minVal; + + for (int y = 0; y < height; y++) { + for (int x = 0; x < width; x++) { + dst[y*width + x] = (uint8_t) (255 * (mask[y*width + x] - minVal) / diff); + } + } + + delete[] mask; +} \ No newline at end of file diff --git a/test/test_main.cpp b/test/test_main.cpp index 87a4fa6..49e4035 100644 --- a/test/test_main.cpp +++ b/test/test_main.cpp @@ -143,3 +143,22 @@ TEST(convolution_nhwc, halide) { } #endif // HAVE_OPENCV_DNN + + +TEST(idw_ref, halide) { + Mat src(height, width, CV_8U), dst(height, width, CV_8U), cl_dst(height, width, CV_8UC3); + // randu(src, 0, 256); + + // 300 elems (100 points) + int points[] = {0, 0, 72, 0, 213, 79, 0, 426, 60, 0, 640, 76, 0, 853, 128, 0, 1066, 67, 0, 1280, 65, 0, 1493, 64, 0, 1706, 60, 0, 1920, 61, 120, 0, 81, 120, 213, 79, 120, 426, 58, 120, 640, 132, 120, 853, 149, 120, 1066, 142, 120, 1280, 64, 120, 1493, 69, 120, 1706, 65, 120, 1920, 64, 240, 0, 75, 240, 213, 68, 240, 426, 140, 240, 640, 153, 240, 853, 145, 240, 1066, 132, 240, 1280, 152, 240, 1493, 125, 240, 1706, 66, 240, 1920, 58, 360, 0, 78, 360, 213, 60, 360, 426, 139, 360, 640, 168, 360, 853, 154, 360, 1066, 138, 360, 1280, 145, 360, 1493, 160, 360, 1706, 68, 360, 1920, 60, 480, 0, 77, 480, 213, 59, 480, 426, 165, 480, 640, 183, 480, 853, 166, 480, 1066, 142, 480, 1280, 123, 480, 1493, 155, 480, 1706, 144, 480, 1920, 60, 600, 0, 83, 600, 213, 65, 600, 426, 178, 600, 640, 184, 600, 853, 138, 600, 1066, 124, 600, 1280, 132, 600, 1493, 175, 600, 1706, 175, 600, 1920, 60, 720, 0, 86, 720, 213, 60, 720, 426, 182, 720, 640, 179, 720, 853, 152, 720, 1066, 140, 720, 1280, 115, 720, 1493, 156, 720, 1706, 172, 720, 1920, 65, 840, 0, 90, 840, 213, 69, 840, 426, 164, 840, 640, 186, 840, 853, 146, 840, 1066, 147, 840, 1280, 122, 840, 1493, 152, 840, 1706, 166, 840, 1920, 69, 960, 0, 85, 960, 213, 67, 960, 426, 186, 960, 640, 169, 960, 853, 162, 960, 1066, 147, 960, 1280, 124, 960, 1493, 149, 960, 1706, 157, 960, 1920, 69, 1080, 0, 85, 1080, 213, 70, 1080, 426, 193, 1080, 640, 192, 1080, 853, 155, 1080, 1066, 135, 1080, 1280, 120, 1080, 1493, 138, 1080, 1706, 128, 1080, 1920, 70}; + // 100 elems + float weights[] = {0.04053167, -0.07571915, 0.0025228, 0.1974248, -0.14461569, 0.30541419, -0.06167972, -0.04958807, 0.00050641, -0.02023852, -0.05910183, -0.05512609, 0.37617088, -0.15904428, -0.10229038, -0.34355378, 0.39128499, 0.16502984, -0.00916588, -0.05264022, 0.01936275, 0.07156495, -0.35144818, -0.01955447, 0.07818357, 0.10850975, -0.34972095, -0.12186563, 0.04116577, 0.00378971, -0.05382689, 0.12480152, 0.0629668, 0.01070963, 0.03499341, 0.03218783, -0.00559502, -0.21214646, 0.34402987, -0.0246006, -0.01532894, 0.14944613, -0.12848858, -0.04539923, -0.1401344, -0.08143395, 0.18241941, 0.08353522, -0.20102434, 0.05168503, -0.05897654, 0.09669851, -0.13024191, -0.02125988, 0.19499928, 0.12311092, -0.05857943, -0.14954968, -0.19763496, 0.1172914, -0.04584096, 0.20130274, -0.198312, 0.04763513, -0.05524272, -0.07034198, 0.15155036, 0.0718368, -0.09612551, 0.09076872, -0.0863682, 0.08457486, 0.05520895, -0.10551672, 0.12577908, -0.05857014, 0.03529613, 0.01300365, -0.09736055, 0.06580921, -0.03074898, 0.1528181, -0.18295799, 0.18713497, -0.07368328, -0.06431868, 0.02485816, -0.02782657, -0.15182979, 0.06845911, -0.01974711, 0.23508139, -0.23124445, -0.1202987, 0.05380124, 0.04331648, 0.06285662, -0.01171756, 0.01128749, 0.06012665}; + + idw_ref(NULL, dst.ptr(), height, width, points, weights); + + applyColorMap(dst, cl_dst, COLORMAP_JET); + + // imwrite("src.png", src); + imwrite("res.png", dst); + imwrite("cres.png", cl_dst); +} \ No newline at end of file From c143a56a27473659b9c6bbf1af12e89ff44e4870 Mon Sep 17 00:00:00 2001 From: Dmitry Kurtaev Date: Tue, 6 Feb 2024 22:20:13 +0300 Subject: [PATCH 02/15] Minor fixes for reference IDW --- .vscode/settings.json | 3 --- CMakeLists.txt | 2 +- src/bgr2gray.cpp | 47 +++++++++++++++++++++---------------------- src/idw.cpp | 8 ++++---- 4 files changed, 28 insertions(+), 32 deletions(-) delete mode 100644 .vscode/settings.json diff --git a/.vscode/settings.json b/.vscode/settings.json deleted file mode 100644 index 70e34ec..0000000 --- a/.vscode/settings.json +++ /dev/null @@ -1,3 +0,0 @@ -{ - "C_Cpp.errorSquiggles": "disabled" -} \ No newline at end of file diff --git a/CMakeLists.txt b/CMakeLists.txt index 85f2881..27127bd 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -92,6 +92,7 @@ target_include_directories(algos PRIVATE target_link_libraries(algos "${binary_dir}/install/lib/libopencv_core.so" "${binary_dir}/install/lib/libopencv_imgproc.so" + "${binary_dir}/install/lib/libopencv_imgcodecs.so" ) @@ -114,7 +115,6 @@ add_executable(test_algo ${PERF_SOURCES}) target_link_libraries(test_algo algos "${binary_dir}/lib/libopencv_ts.a" - "${binary_dir}/lib/libopencv_imgcodecs.so" pthread ) target_include_directories(test_algo PUBLIC diff --git a/src/bgr2gray.cpp b/src/bgr2gray.cpp index ad7ef5d..67049db 100644 --- a/src/bgr2gray.cpp +++ b/src/bgr2gray.cpp @@ -60,30 +60,29 @@ void bgr2gray_interleaved_halide(uint8_t* src, uint8_t* dst, int height, int wid int factor = 16; f.vectorize(x, factor); - f.realize(output); - - // // Compile - // Target target; - // target.os = Target::OS::Linux; - // target.arch = Target::Arch::RISCV; - // target.bits = 64; - // target.vector_bits = factor * sizeof(uint8_t) * 8; - - // // Tested XuanTie C906 has 128-bit vector unit - // CV_Assert(target.vector_bits <= 128); - - // std::vector features; - // features.push_back(Target::RVV); - // features.push_back(Target::NoAsserts); - // features.push_back(Target::NoRuntime); - // target.set_features(features); - - // std::cout << target << std::endl; - // f.print_loop_nest(); - - // // Dump AOT code - // f.compile_to_header("bgr2gray_interleaved.h", {input}, "bgr2gray_interleaved", target); - // f.compile_to_assembly("bgr2gray_interleaved.s", {input}, "bgr2gray_interleaved", target); + + // Compile + Target target; + target.os = Target::OS::Linux; + target.arch = Target::Arch::RISCV; + target.bits = 64; + target.vector_bits = factor * sizeof(uint8_t) * 8; + + // Tested XuanTie C906 has 128-bit vector unit + CV_Assert(target.vector_bits <= 128); + + std::vector features; + features.push_back(Target::RVV); + features.push_back(Target::NoAsserts); + features.push_back(Target::NoRuntime); + target.set_features(features); + + std::cout << target << std::endl; + f.print_loop_nest(); + + // Dump AOT code + f.compile_to_header("bgr2gray_interleaved.h", {input}, "bgr2gray_interleaved", target); + f.compile_to_assembly("bgr2gray_interleaved.s", {input}, "bgr2gray_interleaved", target); } #endif } diff --git a/src/idw.cpp b/src/idw.cpp index 28be2cd..8c738f1 100644 --- a/src/idw.cpp +++ b/src/idw.cpp @@ -6,7 +6,7 @@ using namespace std; -static const int pointCount = 100; +static const int pointCount = 100; void idw_ref(const uint8_t* src, uint8_t* dst, int height, int width, int* points, float* weights) { float* mask = new float[height * width](); @@ -18,8 +18,8 @@ void idw_ref(const uint8_t* src, uint8_t* dst, int height, int width, int* point for (int x = 0; x < width; x++) { float dot = 0; for (int i = 0; i < pointCount; i++) { - int x0 = points[i]; - int y0 = points[i + 1]; + int x0 = points[3 * i + 1]; + int y0 = points[3 * i]; int dx = x - x0; int dy = y - y0; @@ -45,4 +45,4 @@ void idw_ref(const uint8_t* src, uint8_t* dst, int height, int width, int* point } delete[] mask; -} \ No newline at end of file +} From f11f48820f49b277f413b22538c63c8236122c03 Mon Sep 17 00:00:00 2001 From: uiop07558 Date: Wed, 7 Feb 2024 17:27:08 +0300 Subject: [PATCH 03/15] add idw_halide --- .vscode/c_cpp_properties.json | 16 +++++ .vscode/settings.json | 9 +++ include/algos.hpp | 3 +- perf/perf_main.cpp | 43 +++++++++++++ src/idw.cpp | 117 ++++++++++++++++++++++++++++++++++ test/test_main.cpp | 9 ++- 6 files changed, 194 insertions(+), 3 deletions(-) create mode 100644 .vscode/c_cpp_properties.json create mode 100644 .vscode/settings.json diff --git a/.vscode/c_cpp_properties.json b/.vscode/c_cpp_properties.json new file mode 100644 index 0000000..4039bef --- /dev/null +++ b/.vscode/c_cpp_properties.json @@ -0,0 +1,16 @@ +{ + "configurations": [ + { + "name": "Linux", + "includePath": [ + "${workspaceFolder}/**" + ], + "defines": [], + "compilerPath": "/usr/bin/gcc", + "cStandard": "c17", + "cppStandard": "gnu++17", + "intelliSenseMode": "linux-gcc-x64" + } + ], + "version": 4 +} \ No newline at end of file diff --git a/.vscode/settings.json b/.vscode/settings.json new file mode 100644 index 0000000..8104e93 --- /dev/null +++ b/.vscode/settings.json @@ -0,0 +1,9 @@ +{ + "C_Cpp.errorSquiggles": "disabled", + "files.associations": { + "limits": "cpp", + "iostream": "cpp", + "cmath": "cpp", + "optional": "cpp" + } +} \ No newline at end of file diff --git a/include/algos.hpp b/include/algos.hpp index a95cf5d..7ae4307 100644 --- a/include/algos.hpp +++ b/include/algos.hpp @@ -33,5 +33,6 @@ void convolution_opencv(const cv::Mat& src, const cv::Mat& weights, cv::Mat& dst int inpChannels, int outChannels); #endif // HAVE_OPENCV_DNN - void idw_ref(const uint8_t* src, uint8_t* dst, int height, int width, int* points, float* weights); + +void idw_halide(const uint8_t* src, uint8_t* dst, int height, int width, int* pointsBuf, float* weightsBuf); \ No newline at end of file diff --git a/perf/perf_main.cpp b/perf/perf_main.cpp index f99b78b..dc7296a 100644 --- a/perf/perf_main.cpp +++ b/perf/perf_main.cpp @@ -14,8 +14,14 @@ #include "algos.hpp" +#include + +// #include + using namespace cv; +// using namespace std; + CV_PERF_TEST_MAIN("") Mat src(1080, 1920, CV_8UC3); @@ -181,3 +187,40 @@ PERF_TEST(convolution_nhwc, halide) { } #endif // HAVE_OPENCV_DNN + + +PERF_TEST(idw, reference) { + const int width = 1920; + const int height = 1080; + + // 300 elems (100 points) + int points[] = {0, 0, 72, 0, 213, 79, 0, 426, 60, 0, 640, 76, 0, 853, 128, 0, 1066, 67, 0, 1280, 65, 0, 1493, 64, 0, 1706, 60, 0, 1920, 61, 120, 0, 81, 120, 213, 79, 120, 426, 58, 120, 640, 132, 120, 853, 149, 120, 1066, 142, 120, 1280, 64, 120, 1493, 69, 120, 1706, 65, 120, 1920, 64, 240, 0, 75, 240, 213, 68, 240, 426, 140, 240, 640, 153, 240, 853, 145, 240, 1066, 132, 240, 1280, 152, 240, 1493, 125, 240, 1706, 66, 240, 1920, 58, 360, 0, 78, 360, 213, 60, 360, 426, 139, 360, 640, 168, 360, 853, 154, 360, 1066, 138, 360, 1280, 145, 360, 1493, 160, 360, 1706, 68, 360, 1920, 60, 480, 0, 77, 480, 213, 59, 480, 426, 165, 480, 640, 183, 480, 853, 166, 480, 1066, 142, 480, 1280, 123, 480, 1493, 155, 480, 1706, 144, 480, 1920, 60, 600, 0, 83, 600, 213, 65, 600, 426, 178, 600, 640, 184, 600, 853, 138, 600, 1066, 124, 600, 1280, 132, 600, 1493, 175, 600, 1706, 175, 600, 1920, 60, 720, 0, 86, 720, 213, 60, 720, 426, 182, 720, 640, 179, 720, 853, 152, 720, 1066, 140, 720, 1280, 115, 720, 1493, 156, 720, 1706, 172, 720, 1920, 65, 840, 0, 90, 840, 213, 69, 840, 426, 164, 840, 640, 186, 840, 853, 146, 840, 1066, 147, 840, 1280, 122, 840, 1493, 152, 840, 1706, 166, 840, 1920, 69, 960, 0, 85, 960, 213, 67, 960, 426, 186, 960, 640, 169, 960, 853, 162, 960, 1066, 147, 960, 1280, 124, 960, 1493, 149, 960, 1706, 157, 960, 1920, 69, 1080, 0, 85, 1080, 213, 70, 1080, 426, 193, 1080, 640, 192, 1080, 853, 155, 1080, 1066, 135, 1080, 1280, 120, 1080, 1493, 138, 1080, 1706, 128, 1080, 1920, 70}; + // 100 elems + float weights[] = {0.04053167, -0.07571915, 0.0025228, 0.1974248, -0.14461569, 0.30541419, -0.06167972, -0.04958807, 0.00050641, -0.02023852, -0.05910183, -0.05512609, 0.37617088, -0.15904428, -0.10229038, -0.34355378, 0.39128499, 0.16502984, -0.00916588, -0.05264022, 0.01936275, 0.07156495, -0.35144818, -0.01955447, 0.07818357, 0.10850975, -0.34972095, -0.12186563, 0.04116577, 0.00378971, -0.05382689, 0.12480152, 0.0629668, 0.01070963, 0.03499341, 0.03218783, -0.00559502, -0.21214646, 0.34402987, -0.0246006, -0.01532894, 0.14944613, -0.12848858, -0.04539923, -0.1401344, -0.08143395, 0.18241941, 0.08353522, -0.20102434, 0.05168503, -0.05897654, 0.09669851, -0.13024191, -0.02125988, 0.19499928, 0.12311092, -0.05857943, -0.14954968, -0.19763496, 0.1172914, -0.04584096, 0.20130274, -0.198312, 0.04763513, -0.05524272, -0.07034198, 0.15155036, 0.0718368, -0.09612551, 0.09076872, -0.0863682, 0.08457486, 0.05520895, -0.10551672, 0.12577908, -0.05857014, 0.03529613, 0.01300365, -0.09736055, 0.06580921, -0.03074898, 0.1528181, -0.18295799, 0.18713497, -0.07368328, -0.06431868, 0.02485816, -0.02782657, -0.15182979, 0.06845911, -0.01974711, 0.23508139, -0.23124445, -0.1202987, 0.05380124, 0.04331648, 0.06285662, -0.01171756, 0.01128749, 0.06012665}; + + Mat dst(height, width, CV_8U); + + PERF_SAMPLE_BEGIN() + idw_ref(NULL, dst.ptr(), height, width, points, weights); + PERF_SAMPLE_END() + + SANITY_CHECK_NOTHING(); +} + +PERF_TEST(idw, halide) { + const int width = 1920; + const int height = 1080; + + // 300 elems (100 points) + int points[] = {0, 0, 72, 0, 213, 79, 0, 426, 60, 0, 640, 76, 0, 853, 128, 0, 1066, 67, 0, 1280, 65, 0, 1493, 64, 0, 1706, 60, 0, 1920, 61, 120, 0, 81, 120, 213, 79, 120, 426, 58, 120, 640, 132, 120, 853, 149, 120, 1066, 142, 120, 1280, 64, 120, 1493, 69, 120, 1706, 65, 120, 1920, 64, 240, 0, 75, 240, 213, 68, 240, 426, 140, 240, 640, 153, 240, 853, 145, 240, 1066, 132, 240, 1280, 152, 240, 1493, 125, 240, 1706, 66, 240, 1920, 58, 360, 0, 78, 360, 213, 60, 360, 426, 139, 360, 640, 168, 360, 853, 154, 360, 1066, 138, 360, 1280, 145, 360, 1493, 160, 360, 1706, 68, 360, 1920, 60, 480, 0, 77, 480, 213, 59, 480, 426, 165, 480, 640, 183, 480, 853, 166, 480, 1066, 142, 480, 1280, 123, 480, 1493, 155, 480, 1706, 144, 480, 1920, 60, 600, 0, 83, 600, 213, 65, 600, 426, 178, 600, 640, 184, 600, 853, 138, 600, 1066, 124, 600, 1280, 132, 600, 1493, 175, 600, 1706, 175, 600, 1920, 60, 720, 0, 86, 720, 213, 60, 720, 426, 182, 720, 640, 179, 720, 853, 152, 720, 1066, 140, 720, 1280, 115, 720, 1493, 156, 720, 1706, 172, 720, 1920, 65, 840, 0, 90, 840, 213, 69, 840, 426, 164, 840, 640, 186, 840, 853, 146, 840, 1066, 147, 840, 1280, 122, 840, 1493, 152, 840, 1706, 166, 840, 1920, 69, 960, 0, 85, 960, 213, 67, 960, 426, 186, 960, 640, 169, 960, 853, 162, 960, 1066, 147, 960, 1280, 124, 960, 1493, 149, 960, 1706, 157, 960, 1920, 69, 1080, 0, 85, 1080, 213, 70, 1080, 426, 193, 1080, 640, 192, 1080, 853, 155, 1080, 1066, 135, 1080, 1280, 120, 1080, 1493, 138, 1080, 1706, 128, 1080, 1920, 70}; + // 100 elems + float weights[] = {0.04053167, -0.07571915, 0.0025228, 0.1974248, -0.14461569, 0.30541419, -0.06167972, -0.04958807, 0.00050641, -0.02023852, -0.05910183, -0.05512609, 0.37617088, -0.15904428, -0.10229038, -0.34355378, 0.39128499, 0.16502984, -0.00916588, -0.05264022, 0.01936275, 0.07156495, -0.35144818, -0.01955447, 0.07818357, 0.10850975, -0.34972095, -0.12186563, 0.04116577, 0.00378971, -0.05382689, 0.12480152, 0.0629668, 0.01070963, 0.03499341, 0.03218783, -0.00559502, -0.21214646, 0.34402987, -0.0246006, -0.01532894, 0.14944613, -0.12848858, -0.04539923, -0.1401344, -0.08143395, 0.18241941, 0.08353522, -0.20102434, 0.05168503, -0.05897654, 0.09669851, -0.13024191, -0.02125988, 0.19499928, 0.12311092, -0.05857943, -0.14954968, -0.19763496, 0.1172914, -0.04584096, 0.20130274, -0.198312, 0.04763513, -0.05524272, -0.07034198, 0.15155036, 0.0718368, -0.09612551, 0.09076872, -0.0863682, 0.08457486, 0.05520895, -0.10551672, 0.12577908, -0.05857014, 0.03529613, 0.01300365, -0.09736055, 0.06580921, -0.03074898, 0.1528181, -0.18295799, 0.18713497, -0.07368328, -0.06431868, 0.02485816, -0.02782657, -0.15182979, 0.06845911, -0.01974711, 0.23508139, -0.23124445, -0.1202987, 0.05380124, 0.04331648, 0.06285662, -0.01171756, 0.01128749, 0.06012665}; + + Mat dst(height, width, CV_8U); + + PERF_SAMPLE_BEGIN() + idw_halide(NULL, dst.ptr(), height, width, points, weights); + PERF_SAMPLE_END() + + SANITY_CHECK_NOTHING(); +} \ No newline at end of file diff --git a/src/idw.cpp b/src/idw.cpp index 8c738f1..7ef9887 100644 --- a/src/idw.cpp +++ b/src/idw.cpp @@ -3,11 +3,128 @@ #include "algos.hpp" #include +#include using namespace std; +#ifdef __riscv + #include + #include "histogram.h" + using namespace Halide::Runtime; +#else + #include + using namespace Halide; +#endif + static const int pointCount = 100; +void idw_halide(const uint8_t* src, uint8_t* dst, int height, int width, int* pointsBuf, float* weightsBuf) { + float* maskBuf = new float[height * width](); + + Buffer mask(maskBuf, {width, height}); + Buffer output(dst, {width, height}); + + Buffer points(pointsBuf, {pointCount*3}); + Buffer weights(weightsBuf, {pointCount}); +#ifdef __riscv + idw(input, output); +#else + static Func f("idw"); + + // try { + if (!f.defined()) { + Var x("x"), y("y"); + RDom r(0, pointCount); + + f(x, y) = 0.F; + Expr x0 = points(3*r+1); + Expr y0 = points(3*r); + Expr dx = x - x0; + Expr dy = y - y0; + Expr weight = weights(r); + f(x, y) += hypot(dx, dy) * weight; + + // f.vectorize(r, 8); + const int factor = 4; + f.update().atomic().vectorize(r, factor); + + // Compile + Target target; + target.os = Target::OS::Linux; + target.arch = Target::Arch::RISCV; + target.bits = 64; + target.vector_bits = factor * sizeof(float) * 8; + + // Tested XuanTie C906 has 128-bit vector unit + CV_Assert(target.vector_bits <= 128); + + std::vector features; + features.push_back(Target::RVV); + features.push_back(Target::NoAsserts); + features.push_back(Target::NoRuntime); + target.set_features(features); + + std::cout << target << std::endl; + f.print_loop_nest(); + + // Dump AOT code + f.compile_to_header("idw.h", {}, "idw", target); + f.compile_to_assembly("idw.s", {}, "idw", target); + } + // } + // catch (Halide::Error &e) { + // cout << e.what() << '\n'; + // } + + f.realize(mask); +#endif + + // + float maxVal = -numeric_limits::infinity(); + float minVal = numeric_limits::infinity(); + for (int y = 0; y < height; y++) { + for (int x = 0; x < width; x++) { + if (maskBuf[y*width + x] < minVal) { + minVal = maskBuf[y*width + x]; + } + if (maskBuf[y*width + x] > maxVal) { + maxVal = maskBuf[y*width + x]; + } + } + } + + float diff = maxVal - minVal; + for (int y = 0; y < height; y++) { + for (int x = 0; x < width; x++) { + dst[y*width + x] = (uint8_t) (255 * (maskBuf[y*width + x] - minVal) / diff); + } + } + // + + delete[] maskBuf; +// #ifdef __riscv +// idw(input, output); +// #else +// static Func f("idw"); +// if (!f.defined()) { +// Var x("x"), y("y"); +// RDom r(0, pointCount); + +// f(x, y) = 0; +// Expr x0 = points(r.x, 0); +// Expr y0 = points(r.x, 1); +// Expr dx = x - x0; +// Expr dy = y - y0; +// f(x, y) += sqrt(dx*dx + dy*dy) * weights(r.x); +// f(x, y) = cast(f(x, y)); + +// // Dump AOT code +// f.compile_to_header("idw.h", {input}, "idw", target); +// f.compile_to_assembly("idw.s", {input}, "idw", target); +// } +// #endif +} + void idw_ref(const uint8_t* src, uint8_t* dst, int height, int width, int* points, float* weights) { float* mask = new float[height * width](); diff --git a/test/test_main.cpp b/test/test_main.cpp index 49e4035..6d591fd 100644 --- a/test/test_main.cpp +++ b/test/test_main.cpp @@ -145,8 +145,9 @@ TEST(convolution_nhwc, halide) { #endif // HAVE_OPENCV_DNN -TEST(idw_ref, halide) { - Mat src(height, width, CV_8U), dst(height, width, CV_8U), cl_dst(height, width, CV_8UC3); +TEST(idw, halide) { + Mat src(height, width, CV_8U); + Mat dst(height, width, CV_8U), cl_dst(height, width, CV_8UC3), dst_h(height, width, CV_8U), cl_dst_h(height, width, CV_8UC3); // randu(src, 0, 256); // 300 elems (100 points) @@ -155,10 +156,14 @@ TEST(idw_ref, halide) { float weights[] = {0.04053167, -0.07571915, 0.0025228, 0.1974248, -0.14461569, 0.30541419, -0.06167972, -0.04958807, 0.00050641, -0.02023852, -0.05910183, -0.05512609, 0.37617088, -0.15904428, -0.10229038, -0.34355378, 0.39128499, 0.16502984, -0.00916588, -0.05264022, 0.01936275, 0.07156495, -0.35144818, -0.01955447, 0.07818357, 0.10850975, -0.34972095, -0.12186563, 0.04116577, 0.00378971, -0.05382689, 0.12480152, 0.0629668, 0.01070963, 0.03499341, 0.03218783, -0.00559502, -0.21214646, 0.34402987, -0.0246006, -0.01532894, 0.14944613, -0.12848858, -0.04539923, -0.1401344, -0.08143395, 0.18241941, 0.08353522, -0.20102434, 0.05168503, -0.05897654, 0.09669851, -0.13024191, -0.02125988, 0.19499928, 0.12311092, -0.05857943, -0.14954968, -0.19763496, 0.1172914, -0.04584096, 0.20130274, -0.198312, 0.04763513, -0.05524272, -0.07034198, 0.15155036, 0.0718368, -0.09612551, 0.09076872, -0.0863682, 0.08457486, 0.05520895, -0.10551672, 0.12577908, -0.05857014, 0.03529613, 0.01300365, -0.09736055, 0.06580921, -0.03074898, 0.1528181, -0.18295799, 0.18713497, -0.07368328, -0.06431868, 0.02485816, -0.02782657, -0.15182979, 0.06845911, -0.01974711, 0.23508139, -0.23124445, -0.1202987, 0.05380124, 0.04331648, 0.06285662, -0.01171756, 0.01128749, 0.06012665}; idw_ref(NULL, dst.ptr(), height, width, points, weights); + idw_halide(NULL, dst_h.ptr(), height, width, points, weights); applyColorMap(dst, cl_dst, COLORMAP_JET); + applyColorMap(dst_h, cl_dst_h, COLORMAP_JET); // imwrite("src.png", src); imwrite("res.png", dst); imwrite("cres.png", cl_dst); + imwrite("res_halide.png", dst_h); + imwrite("cres_halide.png", cl_dst_h); } \ No newline at end of file From 3a6c7a49a63c33b5834eb011e9abb80ff3cb9300 Mon Sep 17 00:00:00 2001 From: uiop07558 Date: Wed, 7 Feb 2024 17:33:51 +0300 Subject: [PATCH 04/15] add aot files for idw_halide --- aot/idw.h | 56 +++++++ aot/idw.s | 450 ++++++++++++++++++++++++++++++++++++++++++++++++++++++ 2 files changed, 506 insertions(+) create mode 100644 aot/idw.h create mode 100644 aot/idw.s diff --git a/aot/idw.h b/aot/idw.h new file mode 100644 index 0000000..94728e2 --- /dev/null +++ b/aot/idw.h @@ -0,0 +1,56 @@ +#ifndef HALIDE__idw_h +#define HALIDE__idw_h +#include + +// Forward declarations of the types used in the interface +// to the Halide pipeline. +// +// For the definitions of these structs, include HalideRuntime.h + +// Halide's representation of a multi-dimensional array. +// Halide::Runtime::Buffer is a more user-friendly wrapper +// around this. Its declaration is in HalideBuffer.h +struct halide_buffer_t; + +// Metadata describing the arguments to the generated function. +// Used to construct calls to the _argv version of the function. +struct halide_filter_metadata_t; + +#ifndef HALIDE_MUST_USE_RESULT +#ifdef __has_attribute +#if __has_attribute(nodiscard) +#define HALIDE_MUST_USE_RESULT [[nodiscard]] +#elif __has_attribute(warn_unused_result) +#define HALIDE_MUST_USE_RESULT __attribute__((warn_unused_result)) +#else +#define HALIDE_MUST_USE_RESULT +#endif +#else +#define HALIDE_MUST_USE_RESULT +#endif +#endif + +#ifndef HALIDE_FUNCTION_ATTRS +#define HALIDE_FUNCTION_ATTRS +#endif + + + +#ifdef __cplusplus +extern "C" { +#endif + +HALIDE_FUNCTION_ATTRS +int idw(struct halide_buffer_t *_idw_buffer); + +HALIDE_FUNCTION_ATTRS +int idw_argv(void **args); + +HALIDE_FUNCTION_ATTRS +const struct halide_filter_metadata_t *idw_metadata(); + +#ifdef __cplusplus +} // extern "C" +#endif + +#endif diff --git a/aot/idw.s b/aot/idw.s new file mode 100644 index 0000000..45ac36e --- /dev/null +++ b/aot/idw.s @@ -0,0 +1,450 @@ + .text + .attribute 4, 16 + .attribute 5, "rv64gcv0p7" + .file "halide_buffer_t.cpp" + .section .text.idw,"ax",@progbits + .globl idw # -- Begin function idw + .p2align 1 + .type idw,@function +idw: # @idw +# %bb.0: # %entry + addi sp, sp, -192 + sd ra, 184(sp) # 8-byte Folded Spill + sd s0, 176(sp) # 8-byte Folded Spill + sd s1, 168(sp) # 8-byte Folded Spill + sd s2, 160(sp) # 8-byte Folded Spill + sd s3, 152(sp) # 8-byte Folded Spill + sd s4, 144(sp) # 8-byte Folded Spill + sd s5, 136(sp) # 8-byte Folded Spill + sd s6, 128(sp) # 8-byte Folded Spill + sd s7, 120(sp) # 8-byte Folded Spill + sd s8, 112(sp) # 8-byte Folded Spill + sd s9, 104(sp) # 8-byte Folded Spill + sd s10, 96(sp) # 8-byte Folded Spill + sd s11, 88(sp) # 8-byte Folded Spill +.Lpcrel_hi0: + auipc a1, %pcrel_hi(.Lb2.buffer) + addi a1, a1, %pcrel_lo(.Lpcrel_hi0) + ld s6, 16(a1) + ld a3, 40(a0) +.Lpcrel_hi1: + auipc a2, %pcrel_hi(.Lb3.buffer) + addi a2, a2, %pcrel_lo(.Lpcrel_hi1) + ld s9, 16(a2) + lw s4, 0(a3) + lwu s8, 4(a3) + lw a4, 16(a3) + sd a4, 8(sp) # 8-byte Folded Spill + ld a4, 0(a1) + lwu a5, 20(a3) + sd a5, 32(sp) # 8-byte Folded Spill + lw s1, 24(a3) + or a5, s6, a4 + mv a4, s9 + bnez a5, .LBB0_2 +# %bb.1: # %then_bb + lui a4, 128 + addiw a4, a4, 9 + slli a4, a4, 13 + sd a4, 32(a1) + ld a4, 40(a1) + sd zero, 0(a1) + sd zero, 8(a1) + sd zero, 16(a1) + sw zero, 0(a4) + li a5, 300 + sw a5, 4(a4) + li a5, 1 + sw a5, 8(a4) + sw zero, 12(a4) + ld a4, 16(a2) + sd zero, 24(a1) +.LBB0_2: # %after_bb + ld a5, 0(a2) + ld s11, 16(a0) + or a4, a4, a5 + bnez a4, .LBB0_4 +# %bb.3: # %then_bb2 + sd zero, 16(a2) + sd zero, 8(a2) + sd zero, 0(a2) + lui a4, 128 + addiw a4, a4, 9 + ld a5, 40(a2) + slli a4, a4, 13 + addi a4, a4, 2 + sd a4, 32(a2) + sw zero, 0(a5) + li a4, 100 + sw a4, 4(a5) + li a4, 1 + sw a4, 8(a5) + sw zero, 12(a5) + sd zero, 24(a2) +.LBB0_4: # %after_bb1 + sext.w a4, s8 + sd a4, 24(sp) # 8-byte Folded Spill + lw a4, 32(sp) # 8-byte Folded Reload + sd s1, 16(sp) # 8-byte Folded Spill + beqz s11, .LBB0_6 +# %bb.5: + li a0, 0 + j .LBB0_9 +.LBB0_6: # %_halide_buffer_is_bounds_query.exit36 + ld a5, 0(a0) + bnez a5, .LBB0_8 +# %bb.7: # %then_bb5 + sd zero, 16(a0) + sd zero, 8(a0) + sd zero, 0(a0) + lui a5, 256 + addiw a5, a5, 9 + slli a5, a5, 13 + addi a5, a5, 2 + sd a5, 32(a0) + sw s4, 0(a3) + ld s1, 24(sp) # 8-byte Folded Reload + sw s1, 4(a3) + li a5, 1 + sw a5, 8(a3) + sw zero, 12(a3) + ld a5, 8(sp) # 8-byte Folded Reload + sw a5, 16(a3) + sw a4, 20(a3) + sw s1, 24(a3) + sw zero, 28(a3) + sd zero, 24(a0) +.LBB0_8: # %land.rhs.i49 + ld a0, 0(a0) + seqz a0, a0 +.LBB0_9: # %_halide_buffer_is_bounds_query.exit50 + ld a3, 16(a1) + ld a1, 0(a1) + ld a5, 16(a2) + ld a2, 0(a2) + or a1, a1, a3 + seqz a1, a1 + or a2, a2, a5 + seqz a2, a2 + or a1, a1, a2 + or a0, a0, a1 + slti a1, a4, 1 + or a0, a0, a1 + bnez a0, .LBB0_20 +# %bb.10: # %"for idw.s0.y.rebased.preheader" + ld s3, 16(sp) # 8-byte Folded Reload + ld a0, 24(sp) # 8-byte Folded Reload + blez a0, .LBB0_13 +# %bb.11: # %"for idw.s0.y.rebased.us.preheader" + li s0, 0 + ld s2, 24(sp) # 8-byte Folded Reload + slli s2, s2, 2 + ld s1, 32(sp) # 8-byte Folded Reload +.LBB0_12: # %"for idw.s0.y.rebased.us" + # =>This Inner Loop Header: Depth=1 + slli a0, s0, 2 + add a0, a0, s11 + li a1, 0 + mv a2, s2 + call memset@plt + addi s1, s1, -1 + addw s0, s0, s3 + bnez s1, .LBB0_12 +.LBB0_13: # %"for idw.s1.y.rebased.preheader" + sd zero, 40(sp) # 8-byte Folded Spill + li a1, 6 + li a2, 1 + addi a3, sp, 64 + addi a7, sp, 76 + addi t0, sp, 72 + addi t1, sp, 68 + li s0, 4 + addi s1, sp, 48 + addi t2, sp, 52 + addi t3, sp, 60 + addi t6, sp, 56 + li ra, 2 + li a6, 3 +.LBB0_14: # %"for idw.s1.y.rebased" + # =>This Loop Header: Depth=1 + # Child Loop BB0_16 Depth 2 + # Child Loop BB0_17 Depth 3 + ld a0, 24(sp) # 8-byte Folded Reload + blez a0, .LBB0_19 +# %bb.15: # %"for idw.s1.x.rebased.preheader" + # in Loop: Header=BB0_14 Depth=1 + li t4, 0 + ld a0, 16(sp) # 8-byte Folded Reload + ld a4, 40(sp) # 8-byte Folded Reload + mulw t5, a0, a4 + ld a0, 8(sp) # 8-byte Folded Reload + addw a0, a0, a4 +.LBB0_16: # %"for idw.s1.x.rebased" + # Parent Loop BB0_14 Depth=1 + # => This Loop Header: Depth=2 + # Child Loop BB0_17 Depth 3 + add a4, t4, t5 + slli a4, a4, 2 + add s2, s11, a4 + flw ft0, 0(s2) + addw a4, s4, t4 + li s5, 25 + mv s3, s6 + mv s10, s9 +.LBB0_17: # %"for idw.s1.r4$x.r4$x" + # Parent Loop BB0_14 Depth=1 + # Parent Loop BB0_16 Depth=2 + # => This Inner Loop Header: Depth=3 + addi s7, s3, 4 + vsetvli zero, a1, e32, m2 + vlwu.v v8, (s7) + vsetvli zero, a2, e32, m2 + vslidedown.vi v10, v8, 3 + addi a5, s3, 20 + vsetvli zero, a1, e32, m2 + vlwu.v v12, (a5) + vsetvli zero, a2, e32, m2 + vslidedown.vi v14, v12, 2 + vslidedown.vi v12, v12, 5 + vsw.v v8, (a3) + vsw.v v12, (a7) + vsw.v v14, (t0) + vsw.v v10, (t1) + vsetvli zero, s0, e32, m1 + vlwu.v v8, (a3) + vsetvli a5, zero, e32, m1 + vrsub.vx v8, v8, a4 + vsetvli zero, a1, e32, m2 + vlwu.v v10, (s3) + vsetvli zero, a2, e32, m2 + vslidedown.vi v12, v10, 3 + addi a5, s3, 16 + vsetvli zero, a1, e32, m2 + vlwu.v v14, (a5) + vsetvli zero, a2, e32, m2 + vslidedown.vi v16, v14, 2 + vslidedown.vi v14, v14, 5 + vsw.v v10, (s1) + vsw.v v12, (t2) + vsw.v v14, (t3) + vsw.v v16, (t6) + vsetvli zero, s0, e32, m1 + vlwu.v v9, (s1) + vsetvli a5, zero, e32, m1 + vrsub.vx v9, v9, a0 + vmul.vv v9, v9, v9 + vmacc.vv v9, v8, v8 + vfcvt.f.x.v v8, v9 + vfmv.f.s ft1, v8 + fsqrt.s ft1, ft1 + vfmv.s.f v9, ft1 + vsetvli zero, a2, e32, m1 + vslidedown.vi v10, v8, 1 + vfmv.f.s ft1, v10 + fsqrt.s ft1, ft1 + vsetvli a5, zero, e32, m1 + vfmv.s.f v10, ft1 + vsetvli zero, ra, e32, m1 + vslideup.vi v9, v10, 1 + vsetvli zero, a2, e32, m1 + vslidedown.vi v10, v8, 2 + vfmv.f.s ft1, v10 + fsqrt.s ft1, ft1 + vsetvli a5, zero, e32, m1 + vfmv.s.f v10, ft1 + vsetvli zero, a6, e32, m1 + vslideup.vi v9, v10, 2 + vsetvli zero, a2, e32, m1 + vslidedown.vi v8, v8, 3 + vfmv.f.s ft1, v8 + fsqrt.s ft1, ft1 + vsetvli a5, zero, e32, m1 + vfmv.s.f v8, ft1 + vsetvli zero, s0, e32, m1 + vslideup.vi v9, v8, 3 + vsetvli zero, zero, e32, m1 + vlwu.v v8, (s10) + vfmul.vv v8, v9, v8 + vfmv.s.f v9, ft0 + vfredusum.vs v8, v8, v9 + vfmv.f.s ft0, v8 + addi s5, s5, -1 + addi s10, s10, 16 + addi s3, s3, 48 + bnez s5, .LBB0_17 +# %bb.18: # %"end for idw.s1.r4$x.r4$x" + # in Loop: Header=BB0_16 Depth=2 + addi t4, t4, 1 + fsw ft0, 0(s2) + bne t4, s8, .LBB0_16 +.LBB0_19: # %"end for idw.s1.x.rebased" + # in Loop: Header=BB0_14 Depth=1 + ld a4, 40(sp) # 8-byte Folded Reload + addi a4, a4, 1 + ld a0, 32(sp) # 8-byte Folded Reload + sd a4, 40(sp) # 8-byte Folded Spill + bne a4, a0, .LBB0_14 +.LBB0_20: # %destructor_block + li a0, 0 + ld ra, 184(sp) # 8-byte Folded Reload + ld s0, 176(sp) # 8-byte Folded Reload + ld s1, 168(sp) # 8-byte Folded Reload + ld s2, 160(sp) # 8-byte Folded Reload + ld s3, 152(sp) # 8-byte Folded Reload + ld s4, 144(sp) # 8-byte Folded Reload + ld s5, 136(sp) # 8-byte Folded Reload + ld s6, 128(sp) # 8-byte Folded Reload + ld s7, 120(sp) # 8-byte Folded Reload + ld s8, 112(sp) # 8-byte Folded Reload + ld s9, 104(sp) # 8-byte Folded Reload + ld s10, 96(sp) # 8-byte Folded Reload + ld s11, 88(sp) # 8-byte Folded Reload + addi sp, sp, 192 + ret +.Lfunc_end0: + .size idw, .Lfunc_end0-idw + # -- End function + .section .text.idw_argv,"ax",@progbits + .globl idw_argv # -- Begin function idw_argv + .p2align 1 + .type idw_argv,@function +idw_argv: # @idw_argv +# %bb.0: # %entry + addi sp, sp, -16 + sd ra, 8(sp) # 8-byte Folded Spill + ld a0, 0(a0) + call idw@plt + li a0, 0 + ld ra, 8(sp) # 8-byte Folded Reload + addi sp, sp, 16 + ret +.Lfunc_end1: + .size idw_argv, .Lfunc_end1-idw_argv + # -- End function + .section .text.idw_metadata,"ax",@progbits + .globl idw_metadata # -- Begin function idw_metadata + .p2align 1 + .type idw_metadata,@function +idw_metadata: # @idw_metadata +# %bb.0: # %entry +.Lpcrel_hi2: + auipc a0, %pcrel_hi(.Lidw_metadata_storage) + addi a0, a0, %pcrel_lo(.Lpcrel_hi2) + ret +.Lfunc_end2: + .size idw_metadata, .Lfunc_end2-idw_metadata + # -- End function + .type .Lb2.shape,@object # @b2.shape + .section .rodata,"a",@progbits + .p2align 5, 0x0 +.Lb2.shape: + .asciz "\000\000\000\000,\001\000\000\001\000\000\000\000\000\000" + .size .Lb2.shape, 16 + + .type .Lb2.data,@object # @b2.data + .p2align 5, 0x0 +.Lb2.data: + .asciz "\000\000\000\000\000\000\000\000H\000\000\000\000\000\000\000\325\000\000\000O\000\000\000\000\000\000\000\252\001\000\000<\000\000\000\000\000\000\000\200\002\000\000L\000\000\000\000\000\000\000U\003\000\000\200\000\000\000\000\000\000\000*\004\000\000C\000\000\000\000\000\000\000\000\005\000\000A\000\000\000\000\000\000\000\325\005\000\000@\000\000\000\000\000\000\000\252\006\000\000<\000\000\000\000\000\000\000\200\007\000\000=\000\000\000x\000\000\000\000\000\000\000Q\000\000\000x\000\000\000\325\000\000\000O\000\000\000x\000\000\000\252\001\000\000:\000\000\000x\000\000\000\200\002\000\000\204\000\000\000x\000\000\000U\003\000\000\225\000\000\000x\000\000\000*\004\000\000\216\000\000\000x\000\000\000\000\005\000\000@\000\000\000x\000\000\000\325\005\000\000E\000\000\000x\000\000\000\252\006\000\000A\000\000\000x\000\000\000\200\007\000\000@\000\000\000\360\000\000\000\000\000\000\000K\000\000\000\360\000\000\000\325\000\000\000D\000\000\000\360\000\000\000\252\001\000\000\214\000\000\000\360\000\000\000\200\002\000\000\231\000\000\000\360\000\000\000U\003\000\000\221\000\000\000\360\000\000\000*\004\000\000\204\000\000\000\360\000\000\000\000\005\000\000\230\000\000\000\360\000\000\000\325\005\000\000}\000\000\000\360\000\000\000\252\006\000\000B\000\000\000\360\000\000\000\200\007\000\000:\000\000\000h\001\000\000\000\000\000\000N\000\000\000h\001\000\000\325\000\000\000<\000\000\000h\001\000\000\252\001\000\000\213\000\000\000h\001\000\000\200\002\000\000\250\000\000\000h\001\000\000U\003\000\000\232\000\000\000h\001\000\000*\004\000\000\212\000\000\000h\001\000\000\000\005\000\000\221\000\000\000h\001\000\000\325\005\000\000\240\000\000\000h\001\000\000\252\006\000\000D\000\000\000h\001\000\000\200\007\000\000<\000\000\000\340\001\000\000\000\000\000\000M\000\000\000\340\001\000\000\325\000\000\000;\000\000\000\340\001\000\000\252\001\000\000\245\000\000\000\340\001\000\000\200\002\000\000\267\000\000\000\340\001\000\000U\003\000\000\246\000\000\000\340\001\000\000*\004\000\000\216\000\000\000\340\001\000\000\000\005\000\000{\000\000\000\340\001\000\000\325\005\000\000\233\000\000\000\340\001\000\000\252\006\000\000\220\000\000\000\340\001\000\000\200\007\000\000<\000\000\000X\002\000\000\000\000\000\000S\000\000\000X\002\000\000\325\000\000\000A\000\000\000X\002\000\000\252\001\000\000\262\000\000\000X\002\000\000\200\002\000\000\270\000\000\000X\002\000\000U\003\000\000\212\000\000\000X\002\000\000*\004\000\000|\000\000\000X\002\000\000\000\005\000\000\204\000\000\000X\002\000\000\325\005\000\000\257\000\000\000X\002\000\000\252\006\000\000\257\000\000\000X\002\000\000\200\007\000\000<\000\000\000\320\002\000\000\000\000\000\000V\000\000\000\320\002\000\000\325\000\000\000<\000\000\000\320\002\000\000\252\001\000\000\266\000\000\000\320\002\000\000\200\002\000\000\263\000\000\000\320\002\000\000U\003\000\000\230\000\000\000\320\002\000\000*\004\000\000\214\000\000\000\320\002\000\000\000\005\000\000s\000\000\000\320\002\000\000\325\005\000\000\234\000\000\000\320\002\000\000\252\006\000\000\254\000\000\000\320\002\000\000\200\007\000\000A\000\000\000H\003\000\000\000\000\000\000Z\000\000\000H\003\000\000\325\000\000\000E\000\000\000H\003\000\000\252\001\000\000\244\000\000\000H\003\000\000\200\002\000\000\272\000\000\000H\003\000\000U\003\000\000\222\000\000\000H\003\000\000*\004\000\000\223\000\000\000H\003\000\000\000\005\000\000z\000\000\000H\003\000\000\325\005\000\000\230\000\000\000H\003\000\000\252\006\000\000\246\000\000\000H\003\000\000\200\007\000\000E\000\000\000\300\003\000\000\000\000\000\000U\000\000\000\300\003\000\000\325\000\000\000C\000\000\000\300\003\000\000\252\001\000\000\272\000\000\000\300\003\000\000\200\002\000\000\251\000\000\000\300\003\000\000U\003\000\000\242\000\000\000\300\003\000\000*\004\000\000\223\000\000\000\300\003\000\000\000\005\000\000|\000\000\000\300\003\000\000\325\005\000\000\225\000\000\000\300\003\000\000\252\006\000\000\235\000\000\000\300\003\000\000\200\007\000\000E\000\000\0008\004\000\000\000\000\000\000U\000\000\0008\004\000\000\325\000\000\000F\000\000\0008\004\000\000\252\001\000\000\301\000\000\0008\004\000\000\200\002\000\000\300\000\000\0008\004\000\000U\003\000\000\233\000\000\0008\004\000\000*\004\000\000\207\000\000\0008\004\000\000\000\005\000\000x\000\000\0008\004\000\000\325\005\000\000\212\000\000\0008\004\000\000\252\006\000\000\200\000\000\0008\004\000\000\200\007\000\000F\000\000" + .size .Lb2.data, 1200 + + .type .Lb2.buffer,@object # @b2.buffer + .data + .p2align 4, 0x0 +.Lb2.buffer: + .quad 0 # 0x0 + .quad 0 + .quad .Lb2.data + .quad 1 # 0x1 + .byte 0 # 0x0 + .byte 32 # 0x20 + .half 1 # 0x1 + .word 1 # 0x1 + .quad .Lb2.shape + .quad 0 + .size .Lb2.buffer, 56 + + .type .Lb3.shape,@object # @b3.shape + .section .rodata,"a",@progbits + .p2align 5, 0x0 +.Lb3.shape: + .asciz "\000\000\000\000d\000\000\000\001\000\000\000\000\000\000" + .size .Lb3.shape, 16 + + .type .Lb3.data,@object # @b3.data + .p2align 5, 0x0 +.Lb3.data: + .ascii "\211\004&=\244\022\233\275\217U%;\272)J>#\026\024\276@_\234>\340\243|\275\334\034K\275\232\300\004:A\313\245\274\303\024r\275\345\313a\275x\231\300>\201\334\"\276\236}\321\275H\346\257\276\202V\310>\225\375(>},\026\274E\235W\275\241\236\236<\245\220\222=\004\361\263\276\2620\240\274\265\036\240=\\:\336=\240\016\263\276\260\224\371\275p\235(=\311\\x;\226y\\\275\360\227\377=\275\364\200=rw/<@U\017=c\327\003=nV\267\273\3543\207\311\274<&{\274h\b\031>\203\222\003\276\213\3649\275d\177\017\276\330\306\246\275'\314:>\203\024\253=S\331M\276\257\263S=b\221q\275\336\t\306=#^\005\2763)\256\274\344\255G>\224!\374=\374\360o\275\215#\031\276\322`J\276y6\360=\273\303;\275N\"N>M\022K\276\016\035C=1Fb\275u\017\220\275\0040\033>,\037\223=t\335\304\275\363\344\271=\320\341\260\275\2265\255=\310\"b=&\031\330\275;\314\000>>\347o\275\255\222\020=C\rU<\370d\307\275\373\306\206=I\345\373\274Y|\034>WY;\276O\240?>B\347\226\275\203\271\203\275W\243\313<\214\364\343\274Ey\033\276J4\214=\261\304\241\274-\271p>X\313l\276*_\366\275\260^\\=\237l1=\371\272\200=\002\373?\274*\3578<]Gv=" + .size .Lb3.data, 400 + + .type .Lb3.buffer,@object # @b3.buffer + .data + .p2align 4, 0x0 +.Lb3.buffer: + .quad 0 # 0x0 + .quad 0 + .quad .Lb3.data + .quad 1 # 0x1 + .byte 2 # 0x2 + .byte 32 # 0x20 + .half 1 # 0x1 + .word 1 # 0x1 + .quad .Lb3.shape + .quad 0 + .size .Lb3.buffer, 56 + + .type .L__unnamed_1,@object # @0 + .section .rodata,"a",@progbits + .p2align 4, 0x0 +.L__unnamed_1: + .zero 32 + .size .L__unnamed_1, 32 + + .type .Lstr,@object # @str + .p2align 5, 0x0 +.Lstr: + .asciz "idw" + .size .Lstr, 4 + + .type .L__unnamed_2,@object # @1 + .section .data.rel.ro,"aw",@progbits + .p2align 4, 0x0 +.L__unnamed_2: + .quad .Lstr + .word 2 # 0x2 + .word 2 # 0x2 + .byte 2 # 0x2 + .byte 32 # 0x20 + .half 1 # 0x1 + .zero 4 + .quad 0 + .quad 0 + .quad 0 + .quad 0 + .quad .L__unnamed_1 + .size .L__unnamed_2, 64 + + .type .Lstr.4,@object # @str.4 + .section .rodata,"a",@progbits + .p2align 5, 0x0 +.Lstr.4: + .asciz "riscv-64-linux-no_asserts-no_runtime-rvv-vector_bits_128" + .size .Lstr.4, 57 + + .type .Lidw_metadata_storage,@object # @idw_metadata_storage + .section .data.rel.ro,"aw",@progbits + .p2align 4, 0x0 +.Lidw_metadata_storage: + .word 1 # 0x1 + .word 1 # 0x1 + .quad .L__unnamed_2 + .quad .Lstr.4 + .quad .Lstr + .size .Lidw_metadata_storage, 32 + + .ident "clang version 16.0.0 (https://github.com/dkurt/llvm-rvv-071 b027aa1b59c9f53240bdc836f39656723fdf9df0)" + .ident "clang version 16.0.0 (https://github.com/dkurt/llvm-rvv-071 b027aa1b59c9f53240bdc836f39656723fdf9df0)" + .ident "clang version 16.0.0 (https://github.com/dkurt/llvm-rvv-071 b027aa1b59c9f53240bdc836f39656723fdf9df0)" + .ident "clang version 16.0.0 (https://github.com/dkurt/llvm-rvv-071 b027aa1b59c9f53240bdc836f39656723fdf9df0)" + .ident "clang version 16.0.0 (https://github.com/dkurt/llvm-rvv-071 b027aa1b59c9f53240bdc836f39656723fdf9df0)" + .ident "clang version 16.0.0 (https://github.com/dkurt/llvm-rvv-071 b027aa1b59c9f53240bdc836f39656723fdf9df0)" + .section ".note.GNU-stack","",@progbits From 1efccdffdc3a37b1f482f3674f36f935ff718f1e Mon Sep 17 00:00:00 2001 From: uiop07558 Date: Wed, 7 Feb 2024 17:54:23 +0300 Subject: [PATCH 05/15] hardcoded maxval and minval --- .vscode/settings.json | 3 ++- src/idw.cpp | 40 ++++++++++++++++++++-------------------- 2 files changed, 22 insertions(+), 21 deletions(-) diff --git a/.vscode/settings.json b/.vscode/settings.json index 8104e93..9b18adb 100644 --- a/.vscode/settings.json +++ b/.vscode/settings.json @@ -4,6 +4,7 @@ "limits": "cpp", "iostream": "cpp", "cmath": "cpp", - "optional": "cpp" + "optional": "cpp", + "new": "cpp" } } \ No newline at end of file diff --git a/src/idw.cpp b/src/idw.cpp index 7ef9887..7a08ac8 100644 --- a/src/idw.cpp +++ b/src/idw.cpp @@ -80,18 +80,18 @@ void idw_halide(const uint8_t* src, uint8_t* dst, int height, int width, int* po #endif // - float maxVal = -numeric_limits::infinity(); - float minVal = numeric_limits::infinity(); - for (int y = 0; y < height; y++) { - for (int x = 0; x < width; x++) { - if (maskBuf[y*width + x] < minVal) { - minVal = maskBuf[y*width + x]; - } - if (maskBuf[y*width + x] > maxVal) { - maxVal = maskBuf[y*width + x]; - } - } - } + float maxVal = 193.0 //-numeric_limits::infinity(); + float minVal = 58.0 //numeric_limits::infinity(); + // for (int y = 0; y < height; y++) { + // for (int x = 0; x < width; x++) { + // if (maskBuf[y*width + x] < minVal) { + // minVal = maskBuf[y*width + x]; + // } + // if (maskBuf[y*width + x] > maxVal) { + // maxVal = maskBuf[y*width + x]; + // } + // } + // } float diff = maxVal - minVal; for (int y = 0; y < height; y++) { @@ -128,8 +128,8 @@ void idw_halide(const uint8_t* src, uint8_t* dst, int height, int width, int* po void idw_ref(const uint8_t* src, uint8_t* dst, int height, int width, int* points, float* weights) { float* mask = new float[height * width](); - float maxVal = -numeric_limits::infinity(); - float minVal = numeric_limits::infinity(); + float maxVal = 193.0 //-numeric_limits::infinity(); + float minVal = 58.0 //numeric_limits::infinity(); for (int y = 0; y < height; y++) { for (int x = 0; x < width; x++) { @@ -144,12 +144,12 @@ void idw_ref(const uint8_t* src, uint8_t* dst, int height, int width, int* point } mask[y*width + x] = dot; - if (dot < minVal) { - minVal = dot; - } - if (dot > maxVal) { - maxVal = dot; - } + // if (dot < minVal) { + // minVal = dot; + // } + // if (dot > maxVal) { + // maxVal = dot; + // } } } From bda6cc0b8e8dda9f2352ee653c061126ae1a8113 Mon Sep 17 00:00:00 2001 From: uiop07558 Date: Tue, 6 Feb 2024 17:03:11 +0300 Subject: [PATCH 06/15] add reference implementation for IDW --- .vscode/settings.json | 3 +++ src/bgr2gray.cpp | 47 ++++++++++++++++++++++--------------------- 2 files changed, 27 insertions(+), 23 deletions(-) create mode 100644 .vscode/settings.json diff --git a/.vscode/settings.json b/.vscode/settings.json new file mode 100644 index 0000000..70e34ec --- /dev/null +++ b/.vscode/settings.json @@ -0,0 +1,3 @@ +{ + "C_Cpp.errorSquiggles": "disabled" +} \ No newline at end of file diff --git a/src/bgr2gray.cpp b/src/bgr2gray.cpp index 67049db..ad7ef5d 100644 --- a/src/bgr2gray.cpp +++ b/src/bgr2gray.cpp @@ -60,29 +60,30 @@ void bgr2gray_interleaved_halide(uint8_t* src, uint8_t* dst, int height, int wid int factor = 16; f.vectorize(x, factor); - - // Compile - Target target; - target.os = Target::OS::Linux; - target.arch = Target::Arch::RISCV; - target.bits = 64; - target.vector_bits = factor * sizeof(uint8_t) * 8; - - // Tested XuanTie C906 has 128-bit vector unit - CV_Assert(target.vector_bits <= 128); - - std::vector features; - features.push_back(Target::RVV); - features.push_back(Target::NoAsserts); - features.push_back(Target::NoRuntime); - target.set_features(features); - - std::cout << target << std::endl; - f.print_loop_nest(); - - // Dump AOT code - f.compile_to_header("bgr2gray_interleaved.h", {input}, "bgr2gray_interleaved", target); - f.compile_to_assembly("bgr2gray_interleaved.s", {input}, "bgr2gray_interleaved", target); + f.realize(output); + + // // Compile + // Target target; + // target.os = Target::OS::Linux; + // target.arch = Target::Arch::RISCV; + // target.bits = 64; + // target.vector_bits = factor * sizeof(uint8_t) * 8; + + // // Tested XuanTie C906 has 128-bit vector unit + // CV_Assert(target.vector_bits <= 128); + + // std::vector features; + // features.push_back(Target::RVV); + // features.push_back(Target::NoAsserts); + // features.push_back(Target::NoRuntime); + // target.set_features(features); + + // std::cout << target << std::endl; + // f.print_loop_nest(); + + // // Dump AOT code + // f.compile_to_header("bgr2gray_interleaved.h", {input}, "bgr2gray_interleaved", target); + // f.compile_to_assembly("bgr2gray_interleaved.s", {input}, "bgr2gray_interleaved", target); } #endif } From 086ea7b5acb670ea911ec94aec2242b16ab093f3 Mon Sep 17 00:00:00 2001 From: Dmitry Kurtaev Date: Tue, 6 Feb 2024 22:20:13 +0300 Subject: [PATCH 07/15] Minor fixes for reference IDW --- .vscode/settings.json | 3 --- CMakeLists.txt | 1 - src/bgr2gray.cpp | 47 +++++++++++++++++++++---------------------- 3 files changed, 23 insertions(+), 28 deletions(-) delete mode 100644 .vscode/settings.json diff --git a/.vscode/settings.json b/.vscode/settings.json deleted file mode 100644 index 70e34ec..0000000 --- a/.vscode/settings.json +++ /dev/null @@ -1,3 +0,0 @@ -{ - "C_Cpp.errorSquiggles": "disabled" -} \ No newline at end of file diff --git a/CMakeLists.txt b/CMakeLists.txt index a20e3f2..b4ee5cc 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -116,7 +116,6 @@ add_executable(test_algo ${PERF_SOURCES}) target_link_libraries(test_algo algos "${binary_dir}/lib/libopencv_ts.a" - "${binary_dir}/install/lib/libopencv_imgcodecs.so" pthread ) target_include_directories(test_algo PUBLIC diff --git a/src/bgr2gray.cpp b/src/bgr2gray.cpp index ad7ef5d..67049db 100644 --- a/src/bgr2gray.cpp +++ b/src/bgr2gray.cpp @@ -60,30 +60,29 @@ void bgr2gray_interleaved_halide(uint8_t* src, uint8_t* dst, int height, int wid int factor = 16; f.vectorize(x, factor); - f.realize(output); - - // // Compile - // Target target; - // target.os = Target::OS::Linux; - // target.arch = Target::Arch::RISCV; - // target.bits = 64; - // target.vector_bits = factor * sizeof(uint8_t) * 8; - - // // Tested XuanTie C906 has 128-bit vector unit - // CV_Assert(target.vector_bits <= 128); - - // std::vector features; - // features.push_back(Target::RVV); - // features.push_back(Target::NoAsserts); - // features.push_back(Target::NoRuntime); - // target.set_features(features); - - // std::cout << target << std::endl; - // f.print_loop_nest(); - - // // Dump AOT code - // f.compile_to_header("bgr2gray_interleaved.h", {input}, "bgr2gray_interleaved", target); - // f.compile_to_assembly("bgr2gray_interleaved.s", {input}, "bgr2gray_interleaved", target); + + // Compile + Target target; + target.os = Target::OS::Linux; + target.arch = Target::Arch::RISCV; + target.bits = 64; + target.vector_bits = factor * sizeof(uint8_t) * 8; + + // Tested XuanTie C906 has 128-bit vector unit + CV_Assert(target.vector_bits <= 128); + + std::vector features; + features.push_back(Target::RVV); + features.push_back(Target::NoAsserts); + features.push_back(Target::NoRuntime); + target.set_features(features); + + std::cout << target << std::endl; + f.print_loop_nest(); + + // Dump AOT code + f.compile_to_header("bgr2gray_interleaved.h", {input}, "bgr2gray_interleaved", target); + f.compile_to_assembly("bgr2gray_interleaved.s", {input}, "bgr2gray_interleaved", target); } #endif } From dde043eea1b06e786c536969a1c40e65f3f9f095 Mon Sep 17 00:00:00 2001 From: uiop07558 Date: Wed, 7 Feb 2024 17:27:08 +0300 Subject: [PATCH 08/15] add idw_halide --- .vscode/c_cpp_properties.json | 16 +++++ .vscode/settings.json | 9 +++ include/algos.hpp | 3 +- perf/perf_main.cpp | 43 +++++++++++++ src/idw.cpp | 117 ++++++++++++++++++++++++++++++++++ test/test_main.cpp | 9 ++- 6 files changed, 194 insertions(+), 3 deletions(-) create mode 100644 .vscode/c_cpp_properties.json create mode 100644 .vscode/settings.json diff --git a/.vscode/c_cpp_properties.json b/.vscode/c_cpp_properties.json new file mode 100644 index 0000000..4039bef --- /dev/null +++ b/.vscode/c_cpp_properties.json @@ -0,0 +1,16 @@ +{ + "configurations": [ + { + "name": "Linux", + "includePath": [ + "${workspaceFolder}/**" + ], + "defines": [], + "compilerPath": "/usr/bin/gcc", + "cStandard": "c17", + "cppStandard": "gnu++17", + "intelliSenseMode": "linux-gcc-x64" + } + ], + "version": 4 +} \ No newline at end of file diff --git a/.vscode/settings.json b/.vscode/settings.json new file mode 100644 index 0000000..8104e93 --- /dev/null +++ b/.vscode/settings.json @@ -0,0 +1,9 @@ +{ + "C_Cpp.errorSquiggles": "disabled", + "files.associations": { + "limits": "cpp", + "iostream": "cpp", + "cmath": "cpp", + "optional": "cpp" + } +} \ No newline at end of file diff --git a/include/algos.hpp b/include/algos.hpp index 7a291b9..9982b03 100644 --- a/include/algos.hpp +++ b/include/algos.hpp @@ -36,5 +36,6 @@ void convolution_opencv(const cv::Mat& src, const cv::Mat& weights, cv::Mat& dst int inpChannels, int outChannels); #endif // HAVE_OPENCV_DNN - void idw_ref(const uint8_t* src, uint8_t* dst, int height, int width, int* points, float* weights); + +void idw_halide(const uint8_t* src, uint8_t* dst, int height, int width, int* pointsBuf, float* weightsBuf); \ No newline at end of file diff --git a/perf/perf_main.cpp b/perf/perf_main.cpp index 4a4f4f2..9952623 100644 --- a/perf/perf_main.cpp +++ b/perf/perf_main.cpp @@ -14,8 +14,14 @@ #include "algos.hpp" +#include + +// #include + using namespace cv; +// using namespace std; + CV_PERF_TEST_MAIN("") Mat src(1080, 1920, CV_8UC3); @@ -204,3 +210,40 @@ PERF_TEST(convolution_nhwc, halide) { } #endif // HAVE_OPENCV_DNN + + +PERF_TEST(idw, reference) { + const int width = 1920; + const int height = 1080; + + // 300 elems (100 points) + int points[] = {0, 0, 72, 0, 213, 79, 0, 426, 60, 0, 640, 76, 0, 853, 128, 0, 1066, 67, 0, 1280, 65, 0, 1493, 64, 0, 1706, 60, 0, 1920, 61, 120, 0, 81, 120, 213, 79, 120, 426, 58, 120, 640, 132, 120, 853, 149, 120, 1066, 142, 120, 1280, 64, 120, 1493, 69, 120, 1706, 65, 120, 1920, 64, 240, 0, 75, 240, 213, 68, 240, 426, 140, 240, 640, 153, 240, 853, 145, 240, 1066, 132, 240, 1280, 152, 240, 1493, 125, 240, 1706, 66, 240, 1920, 58, 360, 0, 78, 360, 213, 60, 360, 426, 139, 360, 640, 168, 360, 853, 154, 360, 1066, 138, 360, 1280, 145, 360, 1493, 160, 360, 1706, 68, 360, 1920, 60, 480, 0, 77, 480, 213, 59, 480, 426, 165, 480, 640, 183, 480, 853, 166, 480, 1066, 142, 480, 1280, 123, 480, 1493, 155, 480, 1706, 144, 480, 1920, 60, 600, 0, 83, 600, 213, 65, 600, 426, 178, 600, 640, 184, 600, 853, 138, 600, 1066, 124, 600, 1280, 132, 600, 1493, 175, 600, 1706, 175, 600, 1920, 60, 720, 0, 86, 720, 213, 60, 720, 426, 182, 720, 640, 179, 720, 853, 152, 720, 1066, 140, 720, 1280, 115, 720, 1493, 156, 720, 1706, 172, 720, 1920, 65, 840, 0, 90, 840, 213, 69, 840, 426, 164, 840, 640, 186, 840, 853, 146, 840, 1066, 147, 840, 1280, 122, 840, 1493, 152, 840, 1706, 166, 840, 1920, 69, 960, 0, 85, 960, 213, 67, 960, 426, 186, 960, 640, 169, 960, 853, 162, 960, 1066, 147, 960, 1280, 124, 960, 1493, 149, 960, 1706, 157, 960, 1920, 69, 1080, 0, 85, 1080, 213, 70, 1080, 426, 193, 1080, 640, 192, 1080, 853, 155, 1080, 1066, 135, 1080, 1280, 120, 1080, 1493, 138, 1080, 1706, 128, 1080, 1920, 70}; + // 100 elems + float weights[] = {0.04053167, -0.07571915, 0.0025228, 0.1974248, -0.14461569, 0.30541419, -0.06167972, -0.04958807, 0.00050641, -0.02023852, -0.05910183, -0.05512609, 0.37617088, -0.15904428, -0.10229038, -0.34355378, 0.39128499, 0.16502984, -0.00916588, -0.05264022, 0.01936275, 0.07156495, -0.35144818, -0.01955447, 0.07818357, 0.10850975, -0.34972095, -0.12186563, 0.04116577, 0.00378971, -0.05382689, 0.12480152, 0.0629668, 0.01070963, 0.03499341, 0.03218783, -0.00559502, -0.21214646, 0.34402987, -0.0246006, -0.01532894, 0.14944613, -0.12848858, -0.04539923, -0.1401344, -0.08143395, 0.18241941, 0.08353522, -0.20102434, 0.05168503, -0.05897654, 0.09669851, -0.13024191, -0.02125988, 0.19499928, 0.12311092, -0.05857943, -0.14954968, -0.19763496, 0.1172914, -0.04584096, 0.20130274, -0.198312, 0.04763513, -0.05524272, -0.07034198, 0.15155036, 0.0718368, -0.09612551, 0.09076872, -0.0863682, 0.08457486, 0.05520895, -0.10551672, 0.12577908, -0.05857014, 0.03529613, 0.01300365, -0.09736055, 0.06580921, -0.03074898, 0.1528181, -0.18295799, 0.18713497, -0.07368328, -0.06431868, 0.02485816, -0.02782657, -0.15182979, 0.06845911, -0.01974711, 0.23508139, -0.23124445, -0.1202987, 0.05380124, 0.04331648, 0.06285662, -0.01171756, 0.01128749, 0.06012665}; + + Mat dst(height, width, CV_8U); + + PERF_SAMPLE_BEGIN() + idw_ref(NULL, dst.ptr(), height, width, points, weights); + PERF_SAMPLE_END() + + SANITY_CHECK_NOTHING(); +} + +PERF_TEST(idw, halide) { + const int width = 1920; + const int height = 1080; + + // 300 elems (100 points) + int points[] = {0, 0, 72, 0, 213, 79, 0, 426, 60, 0, 640, 76, 0, 853, 128, 0, 1066, 67, 0, 1280, 65, 0, 1493, 64, 0, 1706, 60, 0, 1920, 61, 120, 0, 81, 120, 213, 79, 120, 426, 58, 120, 640, 132, 120, 853, 149, 120, 1066, 142, 120, 1280, 64, 120, 1493, 69, 120, 1706, 65, 120, 1920, 64, 240, 0, 75, 240, 213, 68, 240, 426, 140, 240, 640, 153, 240, 853, 145, 240, 1066, 132, 240, 1280, 152, 240, 1493, 125, 240, 1706, 66, 240, 1920, 58, 360, 0, 78, 360, 213, 60, 360, 426, 139, 360, 640, 168, 360, 853, 154, 360, 1066, 138, 360, 1280, 145, 360, 1493, 160, 360, 1706, 68, 360, 1920, 60, 480, 0, 77, 480, 213, 59, 480, 426, 165, 480, 640, 183, 480, 853, 166, 480, 1066, 142, 480, 1280, 123, 480, 1493, 155, 480, 1706, 144, 480, 1920, 60, 600, 0, 83, 600, 213, 65, 600, 426, 178, 600, 640, 184, 600, 853, 138, 600, 1066, 124, 600, 1280, 132, 600, 1493, 175, 600, 1706, 175, 600, 1920, 60, 720, 0, 86, 720, 213, 60, 720, 426, 182, 720, 640, 179, 720, 853, 152, 720, 1066, 140, 720, 1280, 115, 720, 1493, 156, 720, 1706, 172, 720, 1920, 65, 840, 0, 90, 840, 213, 69, 840, 426, 164, 840, 640, 186, 840, 853, 146, 840, 1066, 147, 840, 1280, 122, 840, 1493, 152, 840, 1706, 166, 840, 1920, 69, 960, 0, 85, 960, 213, 67, 960, 426, 186, 960, 640, 169, 960, 853, 162, 960, 1066, 147, 960, 1280, 124, 960, 1493, 149, 960, 1706, 157, 960, 1920, 69, 1080, 0, 85, 1080, 213, 70, 1080, 426, 193, 1080, 640, 192, 1080, 853, 155, 1080, 1066, 135, 1080, 1280, 120, 1080, 1493, 138, 1080, 1706, 128, 1080, 1920, 70}; + // 100 elems + float weights[] = {0.04053167, -0.07571915, 0.0025228, 0.1974248, -0.14461569, 0.30541419, -0.06167972, -0.04958807, 0.00050641, -0.02023852, -0.05910183, -0.05512609, 0.37617088, -0.15904428, -0.10229038, -0.34355378, 0.39128499, 0.16502984, -0.00916588, -0.05264022, 0.01936275, 0.07156495, -0.35144818, -0.01955447, 0.07818357, 0.10850975, -0.34972095, -0.12186563, 0.04116577, 0.00378971, -0.05382689, 0.12480152, 0.0629668, 0.01070963, 0.03499341, 0.03218783, -0.00559502, -0.21214646, 0.34402987, -0.0246006, -0.01532894, 0.14944613, -0.12848858, -0.04539923, -0.1401344, -0.08143395, 0.18241941, 0.08353522, -0.20102434, 0.05168503, -0.05897654, 0.09669851, -0.13024191, -0.02125988, 0.19499928, 0.12311092, -0.05857943, -0.14954968, -0.19763496, 0.1172914, -0.04584096, 0.20130274, -0.198312, 0.04763513, -0.05524272, -0.07034198, 0.15155036, 0.0718368, -0.09612551, 0.09076872, -0.0863682, 0.08457486, 0.05520895, -0.10551672, 0.12577908, -0.05857014, 0.03529613, 0.01300365, -0.09736055, 0.06580921, -0.03074898, 0.1528181, -0.18295799, 0.18713497, -0.07368328, -0.06431868, 0.02485816, -0.02782657, -0.15182979, 0.06845911, -0.01974711, 0.23508139, -0.23124445, -0.1202987, 0.05380124, 0.04331648, 0.06285662, -0.01171756, 0.01128749, 0.06012665}; + + Mat dst(height, width, CV_8U); + + PERF_SAMPLE_BEGIN() + idw_halide(NULL, dst.ptr(), height, width, points, weights); + PERF_SAMPLE_END() + + SANITY_CHECK_NOTHING(); +} \ No newline at end of file diff --git a/src/idw.cpp b/src/idw.cpp index 8c738f1..7ef9887 100644 --- a/src/idw.cpp +++ b/src/idw.cpp @@ -3,11 +3,128 @@ #include "algos.hpp" #include +#include using namespace std; +#ifdef __riscv + #include + #include "histogram.h" + using namespace Halide::Runtime; +#else + #include + using namespace Halide; +#endif + static const int pointCount = 100; +void idw_halide(const uint8_t* src, uint8_t* dst, int height, int width, int* pointsBuf, float* weightsBuf) { + float* maskBuf = new float[height * width](); + + Buffer mask(maskBuf, {width, height}); + Buffer output(dst, {width, height}); + + Buffer points(pointsBuf, {pointCount*3}); + Buffer weights(weightsBuf, {pointCount}); +#ifdef __riscv + idw(input, output); +#else + static Func f("idw"); + + // try { + if (!f.defined()) { + Var x("x"), y("y"); + RDom r(0, pointCount); + + f(x, y) = 0.F; + Expr x0 = points(3*r+1); + Expr y0 = points(3*r); + Expr dx = x - x0; + Expr dy = y - y0; + Expr weight = weights(r); + f(x, y) += hypot(dx, dy) * weight; + + // f.vectorize(r, 8); + const int factor = 4; + f.update().atomic().vectorize(r, factor); + + // Compile + Target target; + target.os = Target::OS::Linux; + target.arch = Target::Arch::RISCV; + target.bits = 64; + target.vector_bits = factor * sizeof(float) * 8; + + // Tested XuanTie C906 has 128-bit vector unit + CV_Assert(target.vector_bits <= 128); + + std::vector features; + features.push_back(Target::RVV); + features.push_back(Target::NoAsserts); + features.push_back(Target::NoRuntime); + target.set_features(features); + + std::cout << target << std::endl; + f.print_loop_nest(); + + // Dump AOT code + f.compile_to_header("idw.h", {}, "idw", target); + f.compile_to_assembly("idw.s", {}, "idw", target); + } + // } + // catch (Halide::Error &e) { + // cout << e.what() << '\n'; + // } + + f.realize(mask); +#endif + + // + float maxVal = -numeric_limits::infinity(); + float minVal = numeric_limits::infinity(); + for (int y = 0; y < height; y++) { + for (int x = 0; x < width; x++) { + if (maskBuf[y*width + x] < minVal) { + minVal = maskBuf[y*width + x]; + } + if (maskBuf[y*width + x] > maxVal) { + maxVal = maskBuf[y*width + x]; + } + } + } + + float diff = maxVal - minVal; + for (int y = 0; y < height; y++) { + for (int x = 0; x < width; x++) { + dst[y*width + x] = (uint8_t) (255 * (maskBuf[y*width + x] - minVal) / diff); + } + } + // + + delete[] maskBuf; +// #ifdef __riscv +// idw(input, output); +// #else +// static Func f("idw"); +// if (!f.defined()) { +// Var x("x"), y("y"); +// RDom r(0, pointCount); + +// f(x, y) = 0; +// Expr x0 = points(r.x, 0); +// Expr y0 = points(r.x, 1); +// Expr dx = x - x0; +// Expr dy = y - y0; +// f(x, y) += sqrt(dx*dx + dy*dy) * weights(r.x); +// f(x, y) = cast(f(x, y)); + +// // Dump AOT code +// f.compile_to_header("idw.h", {input}, "idw", target); +// f.compile_to_assembly("idw.s", {input}, "idw", target); +// } +// #endif +} + void idw_ref(const uint8_t* src, uint8_t* dst, int height, int width, int* points, float* weights) { float* mask = new float[height * width](); diff --git a/test/test_main.cpp b/test/test_main.cpp index 05580f8..9fadfbd 100644 --- a/test/test_main.cpp +++ b/test/test_main.cpp @@ -164,8 +164,9 @@ TEST(convolution_nhwc, halide) { #endif // HAVE_OPENCV_DNN -TEST(idw_ref, halide) { - Mat src(height, width, CV_8U), dst(height, width, CV_8U), cl_dst(height, width, CV_8UC3); +TEST(idw, halide) { + Mat src(height, width, CV_8U); + Mat dst(height, width, CV_8U), cl_dst(height, width, CV_8UC3), dst_h(height, width, CV_8U), cl_dst_h(height, width, CV_8UC3); // randu(src, 0, 256); // 300 elems (100 points) @@ -174,10 +175,14 @@ TEST(idw_ref, halide) { float weights[] = {0.04053167, -0.07571915, 0.0025228, 0.1974248, -0.14461569, 0.30541419, -0.06167972, -0.04958807, 0.00050641, -0.02023852, -0.05910183, -0.05512609, 0.37617088, -0.15904428, -0.10229038, -0.34355378, 0.39128499, 0.16502984, -0.00916588, -0.05264022, 0.01936275, 0.07156495, -0.35144818, -0.01955447, 0.07818357, 0.10850975, -0.34972095, -0.12186563, 0.04116577, 0.00378971, -0.05382689, 0.12480152, 0.0629668, 0.01070963, 0.03499341, 0.03218783, -0.00559502, -0.21214646, 0.34402987, -0.0246006, -0.01532894, 0.14944613, -0.12848858, -0.04539923, -0.1401344, -0.08143395, 0.18241941, 0.08353522, -0.20102434, 0.05168503, -0.05897654, 0.09669851, -0.13024191, -0.02125988, 0.19499928, 0.12311092, -0.05857943, -0.14954968, -0.19763496, 0.1172914, -0.04584096, 0.20130274, -0.198312, 0.04763513, -0.05524272, -0.07034198, 0.15155036, 0.0718368, -0.09612551, 0.09076872, -0.0863682, 0.08457486, 0.05520895, -0.10551672, 0.12577908, -0.05857014, 0.03529613, 0.01300365, -0.09736055, 0.06580921, -0.03074898, 0.1528181, -0.18295799, 0.18713497, -0.07368328, -0.06431868, 0.02485816, -0.02782657, -0.15182979, 0.06845911, -0.01974711, 0.23508139, -0.23124445, -0.1202987, 0.05380124, 0.04331648, 0.06285662, -0.01171756, 0.01128749, 0.06012665}; idw_ref(NULL, dst.ptr(), height, width, points, weights); + idw_halide(NULL, dst_h.ptr(), height, width, points, weights); applyColorMap(dst, cl_dst, COLORMAP_JET); + applyColorMap(dst_h, cl_dst_h, COLORMAP_JET); // imwrite("src.png", src); imwrite("res.png", dst); imwrite("cres.png", cl_dst); + imwrite("res_halide.png", dst_h); + imwrite("cres_halide.png", cl_dst_h); } \ No newline at end of file From 0f6398316c692a485e7b55a251836fa63fef1e8e Mon Sep 17 00:00:00 2001 From: uiop07558 Date: Wed, 7 Feb 2024 17:33:51 +0300 Subject: [PATCH 09/15] add aot files for idw_halide --- aot/idw.h | 56 +++++++ aot/idw.s | 450 ++++++++++++++++++++++++++++++++++++++++++++++++++++++ 2 files changed, 506 insertions(+) create mode 100644 aot/idw.h create mode 100644 aot/idw.s diff --git a/aot/idw.h b/aot/idw.h new file mode 100644 index 0000000..94728e2 --- /dev/null +++ b/aot/idw.h @@ -0,0 +1,56 @@ +#ifndef HALIDE__idw_h +#define HALIDE__idw_h +#include + +// Forward declarations of the types used in the interface +// to the Halide pipeline. +// +// For the definitions of these structs, include HalideRuntime.h + +// Halide's representation of a multi-dimensional array. +// Halide::Runtime::Buffer is a more user-friendly wrapper +// around this. Its declaration is in HalideBuffer.h +struct halide_buffer_t; + +// Metadata describing the arguments to the generated function. +// Used to construct calls to the _argv version of the function. +struct halide_filter_metadata_t; + +#ifndef HALIDE_MUST_USE_RESULT +#ifdef __has_attribute +#if __has_attribute(nodiscard) +#define HALIDE_MUST_USE_RESULT [[nodiscard]] +#elif __has_attribute(warn_unused_result) +#define HALIDE_MUST_USE_RESULT __attribute__((warn_unused_result)) +#else +#define HALIDE_MUST_USE_RESULT +#endif +#else +#define HALIDE_MUST_USE_RESULT +#endif +#endif + +#ifndef HALIDE_FUNCTION_ATTRS +#define HALIDE_FUNCTION_ATTRS +#endif + + + +#ifdef __cplusplus +extern "C" { +#endif + +HALIDE_FUNCTION_ATTRS +int idw(struct halide_buffer_t *_idw_buffer); + +HALIDE_FUNCTION_ATTRS +int idw_argv(void **args); + +HALIDE_FUNCTION_ATTRS +const struct halide_filter_metadata_t *idw_metadata(); + +#ifdef __cplusplus +} // extern "C" +#endif + +#endif diff --git a/aot/idw.s b/aot/idw.s new file mode 100644 index 0000000..45ac36e --- /dev/null +++ b/aot/idw.s @@ -0,0 +1,450 @@ + .text + .attribute 4, 16 + .attribute 5, "rv64gcv0p7" + .file "halide_buffer_t.cpp" + .section .text.idw,"ax",@progbits + .globl idw # -- Begin function idw + .p2align 1 + .type idw,@function +idw: # @idw +# %bb.0: # %entry + addi sp, sp, -192 + sd ra, 184(sp) # 8-byte Folded Spill + sd s0, 176(sp) # 8-byte Folded Spill + sd s1, 168(sp) # 8-byte Folded Spill + sd s2, 160(sp) # 8-byte Folded Spill + sd s3, 152(sp) # 8-byte Folded Spill + sd s4, 144(sp) # 8-byte Folded Spill + sd s5, 136(sp) # 8-byte Folded Spill + sd s6, 128(sp) # 8-byte Folded Spill + sd s7, 120(sp) # 8-byte Folded Spill + sd s8, 112(sp) # 8-byte Folded Spill + sd s9, 104(sp) # 8-byte Folded Spill + sd s10, 96(sp) # 8-byte Folded Spill + sd s11, 88(sp) # 8-byte Folded Spill +.Lpcrel_hi0: + auipc a1, %pcrel_hi(.Lb2.buffer) + addi a1, a1, %pcrel_lo(.Lpcrel_hi0) + ld s6, 16(a1) + ld a3, 40(a0) +.Lpcrel_hi1: + auipc a2, %pcrel_hi(.Lb3.buffer) + addi a2, a2, %pcrel_lo(.Lpcrel_hi1) + ld s9, 16(a2) + lw s4, 0(a3) + lwu s8, 4(a3) + lw a4, 16(a3) + sd a4, 8(sp) # 8-byte Folded Spill + ld a4, 0(a1) + lwu a5, 20(a3) + sd a5, 32(sp) # 8-byte Folded Spill + lw s1, 24(a3) + or a5, s6, a4 + mv a4, s9 + bnez a5, .LBB0_2 +# %bb.1: # %then_bb + lui a4, 128 + addiw a4, a4, 9 + slli a4, a4, 13 + sd a4, 32(a1) + ld a4, 40(a1) + sd zero, 0(a1) + sd zero, 8(a1) + sd zero, 16(a1) + sw zero, 0(a4) + li a5, 300 + sw a5, 4(a4) + li a5, 1 + sw a5, 8(a4) + sw zero, 12(a4) + ld a4, 16(a2) + sd zero, 24(a1) +.LBB0_2: # %after_bb + ld a5, 0(a2) + ld s11, 16(a0) + or a4, a4, a5 + bnez a4, .LBB0_4 +# %bb.3: # %then_bb2 + sd zero, 16(a2) + sd zero, 8(a2) + sd zero, 0(a2) + lui a4, 128 + addiw a4, a4, 9 + ld a5, 40(a2) + slli a4, a4, 13 + addi a4, a4, 2 + sd a4, 32(a2) + sw zero, 0(a5) + li a4, 100 + sw a4, 4(a5) + li a4, 1 + sw a4, 8(a5) + sw zero, 12(a5) + sd zero, 24(a2) +.LBB0_4: # %after_bb1 + sext.w a4, s8 + sd a4, 24(sp) # 8-byte Folded Spill + lw a4, 32(sp) # 8-byte Folded Reload + sd s1, 16(sp) # 8-byte Folded Spill + beqz s11, .LBB0_6 +# %bb.5: + li a0, 0 + j .LBB0_9 +.LBB0_6: # %_halide_buffer_is_bounds_query.exit36 + ld a5, 0(a0) + bnez a5, .LBB0_8 +# %bb.7: # %then_bb5 + sd zero, 16(a0) + sd zero, 8(a0) + sd zero, 0(a0) + lui a5, 256 + addiw a5, a5, 9 + slli a5, a5, 13 + addi a5, a5, 2 + sd a5, 32(a0) + sw s4, 0(a3) + ld s1, 24(sp) # 8-byte Folded Reload + sw s1, 4(a3) + li a5, 1 + sw a5, 8(a3) + sw zero, 12(a3) + ld a5, 8(sp) # 8-byte Folded Reload + sw a5, 16(a3) + sw a4, 20(a3) + sw s1, 24(a3) + sw zero, 28(a3) + sd zero, 24(a0) +.LBB0_8: # %land.rhs.i49 + ld a0, 0(a0) + seqz a0, a0 +.LBB0_9: # %_halide_buffer_is_bounds_query.exit50 + ld a3, 16(a1) + ld a1, 0(a1) + ld a5, 16(a2) + ld a2, 0(a2) + or a1, a1, a3 + seqz a1, a1 + or a2, a2, a5 + seqz a2, a2 + or a1, a1, a2 + or a0, a0, a1 + slti a1, a4, 1 + or a0, a0, a1 + bnez a0, .LBB0_20 +# %bb.10: # %"for idw.s0.y.rebased.preheader" + ld s3, 16(sp) # 8-byte Folded Reload + ld a0, 24(sp) # 8-byte Folded Reload + blez a0, .LBB0_13 +# %bb.11: # %"for idw.s0.y.rebased.us.preheader" + li s0, 0 + ld s2, 24(sp) # 8-byte Folded Reload + slli s2, s2, 2 + ld s1, 32(sp) # 8-byte Folded Reload +.LBB0_12: # %"for idw.s0.y.rebased.us" + # =>This Inner Loop Header: Depth=1 + slli a0, s0, 2 + add a0, a0, s11 + li a1, 0 + mv a2, s2 + call memset@plt + addi s1, s1, -1 + addw s0, s0, s3 + bnez s1, .LBB0_12 +.LBB0_13: # %"for idw.s1.y.rebased.preheader" + sd zero, 40(sp) # 8-byte Folded Spill + li a1, 6 + li a2, 1 + addi a3, sp, 64 + addi a7, sp, 76 + addi t0, sp, 72 + addi t1, sp, 68 + li s0, 4 + addi s1, sp, 48 + addi t2, sp, 52 + addi t3, sp, 60 + addi t6, sp, 56 + li ra, 2 + li a6, 3 +.LBB0_14: # %"for idw.s1.y.rebased" + # =>This Loop Header: Depth=1 + # Child Loop BB0_16 Depth 2 + # Child Loop BB0_17 Depth 3 + ld a0, 24(sp) # 8-byte Folded Reload + blez a0, .LBB0_19 +# %bb.15: # %"for idw.s1.x.rebased.preheader" + # in Loop: Header=BB0_14 Depth=1 + li t4, 0 + ld a0, 16(sp) # 8-byte Folded Reload + ld a4, 40(sp) # 8-byte Folded Reload + mulw t5, a0, a4 + ld a0, 8(sp) # 8-byte Folded Reload + addw a0, a0, a4 +.LBB0_16: # %"for idw.s1.x.rebased" + # Parent Loop BB0_14 Depth=1 + # => This Loop Header: Depth=2 + # Child Loop BB0_17 Depth 3 + add a4, t4, t5 + slli a4, a4, 2 + add s2, s11, a4 + flw ft0, 0(s2) + addw a4, s4, t4 + li s5, 25 + mv s3, s6 + mv s10, s9 +.LBB0_17: # %"for idw.s1.r4$x.r4$x" + # Parent Loop BB0_14 Depth=1 + # Parent Loop BB0_16 Depth=2 + # => This Inner Loop Header: Depth=3 + addi s7, s3, 4 + vsetvli zero, a1, e32, m2 + vlwu.v v8, (s7) + vsetvli zero, a2, e32, m2 + vslidedown.vi v10, v8, 3 + addi a5, s3, 20 + vsetvli zero, a1, e32, m2 + vlwu.v v12, (a5) + vsetvli zero, a2, e32, m2 + vslidedown.vi v14, v12, 2 + vslidedown.vi v12, v12, 5 + vsw.v v8, (a3) + vsw.v v12, (a7) + vsw.v v14, (t0) + vsw.v v10, (t1) + vsetvli zero, s0, e32, m1 + vlwu.v v8, (a3) + vsetvli a5, zero, e32, m1 + vrsub.vx v8, v8, a4 + vsetvli zero, a1, e32, m2 + vlwu.v v10, (s3) + vsetvli zero, a2, e32, m2 + vslidedown.vi v12, v10, 3 + addi a5, s3, 16 + vsetvli zero, a1, e32, m2 + vlwu.v v14, (a5) + vsetvli zero, a2, e32, m2 + vslidedown.vi v16, v14, 2 + vslidedown.vi v14, v14, 5 + vsw.v v10, (s1) + vsw.v v12, (t2) + vsw.v v14, (t3) + vsw.v v16, (t6) + vsetvli zero, s0, e32, m1 + vlwu.v v9, (s1) + vsetvli a5, zero, e32, m1 + vrsub.vx v9, v9, a0 + vmul.vv v9, v9, v9 + vmacc.vv v9, v8, v8 + vfcvt.f.x.v v8, v9 + vfmv.f.s ft1, v8 + fsqrt.s ft1, ft1 + vfmv.s.f v9, ft1 + vsetvli zero, a2, e32, m1 + vslidedown.vi v10, v8, 1 + vfmv.f.s ft1, v10 + fsqrt.s ft1, ft1 + vsetvli a5, zero, e32, m1 + vfmv.s.f v10, ft1 + vsetvli zero, ra, e32, m1 + vslideup.vi v9, v10, 1 + vsetvli zero, a2, e32, m1 + vslidedown.vi v10, v8, 2 + vfmv.f.s ft1, v10 + fsqrt.s ft1, ft1 + vsetvli a5, zero, e32, m1 + vfmv.s.f v10, ft1 + vsetvli zero, a6, e32, m1 + vslideup.vi v9, v10, 2 + vsetvli zero, a2, e32, m1 + vslidedown.vi v8, v8, 3 + vfmv.f.s ft1, v8 + fsqrt.s ft1, ft1 + vsetvli a5, zero, e32, m1 + vfmv.s.f v8, ft1 + vsetvli zero, s0, e32, m1 + vslideup.vi v9, v8, 3 + vsetvli zero, zero, e32, m1 + vlwu.v v8, (s10) + vfmul.vv v8, v9, v8 + vfmv.s.f v9, ft0 + vfredusum.vs v8, v8, v9 + vfmv.f.s ft0, v8 + addi s5, s5, -1 + addi s10, s10, 16 + addi s3, s3, 48 + bnez s5, .LBB0_17 +# %bb.18: # %"end for idw.s1.r4$x.r4$x" + # in Loop: Header=BB0_16 Depth=2 + addi t4, t4, 1 + fsw ft0, 0(s2) + bne t4, s8, .LBB0_16 +.LBB0_19: # %"end for idw.s1.x.rebased" + # in Loop: Header=BB0_14 Depth=1 + ld a4, 40(sp) # 8-byte Folded Reload + addi a4, a4, 1 + ld a0, 32(sp) # 8-byte Folded Reload + sd a4, 40(sp) # 8-byte Folded Spill + bne a4, a0, .LBB0_14 +.LBB0_20: # %destructor_block + li a0, 0 + ld ra, 184(sp) # 8-byte Folded Reload + ld s0, 176(sp) # 8-byte Folded Reload + ld s1, 168(sp) # 8-byte Folded Reload + ld s2, 160(sp) # 8-byte Folded Reload + ld s3, 152(sp) # 8-byte Folded Reload + ld s4, 144(sp) # 8-byte Folded Reload + ld s5, 136(sp) # 8-byte Folded Reload + ld s6, 128(sp) # 8-byte Folded Reload + ld s7, 120(sp) # 8-byte Folded Reload + ld s8, 112(sp) # 8-byte Folded Reload + ld s9, 104(sp) # 8-byte Folded Reload + ld s10, 96(sp) # 8-byte Folded Reload + ld s11, 88(sp) # 8-byte Folded Reload + addi sp, sp, 192 + ret +.Lfunc_end0: + .size idw, .Lfunc_end0-idw + # -- End function + .section .text.idw_argv,"ax",@progbits + .globl idw_argv # -- Begin function idw_argv + .p2align 1 + .type idw_argv,@function +idw_argv: # @idw_argv +# %bb.0: # %entry + addi sp, sp, -16 + sd ra, 8(sp) # 8-byte Folded Spill + ld a0, 0(a0) + call idw@plt + li a0, 0 + ld ra, 8(sp) # 8-byte Folded Reload + addi sp, sp, 16 + ret +.Lfunc_end1: + .size idw_argv, .Lfunc_end1-idw_argv + # -- End function + .section .text.idw_metadata,"ax",@progbits + .globl idw_metadata # -- Begin function idw_metadata + .p2align 1 + .type idw_metadata,@function +idw_metadata: # @idw_metadata +# %bb.0: # %entry +.Lpcrel_hi2: + auipc a0, %pcrel_hi(.Lidw_metadata_storage) + addi a0, a0, %pcrel_lo(.Lpcrel_hi2) + ret +.Lfunc_end2: + .size idw_metadata, .Lfunc_end2-idw_metadata + # -- End function + .type .Lb2.shape,@object # @b2.shape + .section .rodata,"a",@progbits + .p2align 5, 0x0 +.Lb2.shape: + .asciz "\000\000\000\000,\001\000\000\001\000\000\000\000\000\000" + .size .Lb2.shape, 16 + + .type .Lb2.data,@object # @b2.data + .p2align 5, 0x0 +.Lb2.data: + .asciz "\000\000\000\000\000\000\000\000H\000\000\000\000\000\000\000\325\000\000\000O\000\000\000\000\000\000\000\252\001\000\000<\000\000\000\000\000\000\000\200\002\000\000L\000\000\000\000\000\000\000U\003\000\000\200\000\000\000\000\000\000\000*\004\000\000C\000\000\000\000\000\000\000\000\005\000\000A\000\000\000\000\000\000\000\325\005\000\000@\000\000\000\000\000\000\000\252\006\000\000<\000\000\000\000\000\000\000\200\007\000\000=\000\000\000x\000\000\000\000\000\000\000Q\000\000\000x\000\000\000\325\000\000\000O\000\000\000x\000\000\000\252\001\000\000:\000\000\000x\000\000\000\200\002\000\000\204\000\000\000x\000\000\000U\003\000\000\225\000\000\000x\000\000\000*\004\000\000\216\000\000\000x\000\000\000\000\005\000\000@\000\000\000x\000\000\000\325\005\000\000E\000\000\000x\000\000\000\252\006\000\000A\000\000\000x\000\000\000\200\007\000\000@\000\000\000\360\000\000\000\000\000\000\000K\000\000\000\360\000\000\000\325\000\000\000D\000\000\000\360\000\000\000\252\001\000\000\214\000\000\000\360\000\000\000\200\002\000\000\231\000\000\000\360\000\000\000U\003\000\000\221\000\000\000\360\000\000\000*\004\000\000\204\000\000\000\360\000\000\000\000\005\000\000\230\000\000\000\360\000\000\000\325\005\000\000}\000\000\000\360\000\000\000\252\006\000\000B\000\000\000\360\000\000\000\200\007\000\000:\000\000\000h\001\000\000\000\000\000\000N\000\000\000h\001\000\000\325\000\000\000<\000\000\000h\001\000\000\252\001\000\000\213\000\000\000h\001\000\000\200\002\000\000\250\000\000\000h\001\000\000U\003\000\000\232\000\000\000h\001\000\000*\004\000\000\212\000\000\000h\001\000\000\000\005\000\000\221\000\000\000h\001\000\000\325\005\000\000\240\000\000\000h\001\000\000\252\006\000\000D\000\000\000h\001\000\000\200\007\000\000<\000\000\000\340\001\000\000\000\000\000\000M\000\000\000\340\001\000\000\325\000\000\000;\000\000\000\340\001\000\000\252\001\000\000\245\000\000\000\340\001\000\000\200\002\000\000\267\000\000\000\340\001\000\000U\003\000\000\246\000\000\000\340\001\000\000*\004\000\000\216\000\000\000\340\001\000\000\000\005\000\000{\000\000\000\340\001\000\000\325\005\000\000\233\000\000\000\340\001\000\000\252\006\000\000\220\000\000\000\340\001\000\000\200\007\000\000<\000\000\000X\002\000\000\000\000\000\000S\000\000\000X\002\000\000\325\000\000\000A\000\000\000X\002\000\000\252\001\000\000\262\000\000\000X\002\000\000\200\002\000\000\270\000\000\000X\002\000\000U\003\000\000\212\000\000\000X\002\000\000*\004\000\000|\000\000\000X\002\000\000\000\005\000\000\204\000\000\000X\002\000\000\325\005\000\000\257\000\000\000X\002\000\000\252\006\000\000\257\000\000\000X\002\000\000\200\007\000\000<\000\000\000\320\002\000\000\000\000\000\000V\000\000\000\320\002\000\000\325\000\000\000<\000\000\000\320\002\000\000\252\001\000\000\266\000\000\000\320\002\000\000\200\002\000\000\263\000\000\000\320\002\000\000U\003\000\000\230\000\000\000\320\002\000\000*\004\000\000\214\000\000\000\320\002\000\000\000\005\000\000s\000\000\000\320\002\000\000\325\005\000\000\234\000\000\000\320\002\000\000\252\006\000\000\254\000\000\000\320\002\000\000\200\007\000\000A\000\000\000H\003\000\000\000\000\000\000Z\000\000\000H\003\000\000\325\000\000\000E\000\000\000H\003\000\000\252\001\000\000\244\000\000\000H\003\000\000\200\002\000\000\272\000\000\000H\003\000\000U\003\000\000\222\000\000\000H\003\000\000*\004\000\000\223\000\000\000H\003\000\000\000\005\000\000z\000\000\000H\003\000\000\325\005\000\000\230\000\000\000H\003\000\000\252\006\000\000\246\000\000\000H\003\000\000\200\007\000\000E\000\000\000\300\003\000\000\000\000\000\000U\000\000\000\300\003\000\000\325\000\000\000C\000\000\000\300\003\000\000\252\001\000\000\272\000\000\000\300\003\000\000\200\002\000\000\251\000\000\000\300\003\000\000U\003\000\000\242\000\000\000\300\003\000\000*\004\000\000\223\000\000\000\300\003\000\000\000\005\000\000|\000\000\000\300\003\000\000\325\005\000\000\225\000\000\000\300\003\000\000\252\006\000\000\235\000\000\000\300\003\000\000\200\007\000\000E\000\000\0008\004\000\000\000\000\000\000U\000\000\0008\004\000\000\325\000\000\000F\000\000\0008\004\000\000\252\001\000\000\301\000\000\0008\004\000\000\200\002\000\000\300\000\000\0008\004\000\000U\003\000\000\233\000\000\0008\004\000\000*\004\000\000\207\000\000\0008\004\000\000\000\005\000\000x\000\000\0008\004\000\000\325\005\000\000\212\000\000\0008\004\000\000\252\006\000\000\200\000\000\0008\004\000\000\200\007\000\000F\000\000" + .size .Lb2.data, 1200 + + .type .Lb2.buffer,@object # @b2.buffer + .data + .p2align 4, 0x0 +.Lb2.buffer: + .quad 0 # 0x0 + .quad 0 + .quad .Lb2.data + .quad 1 # 0x1 + .byte 0 # 0x0 + .byte 32 # 0x20 + .half 1 # 0x1 + .word 1 # 0x1 + .quad .Lb2.shape + .quad 0 + .size .Lb2.buffer, 56 + + .type .Lb3.shape,@object # @b3.shape + .section .rodata,"a",@progbits + .p2align 5, 0x0 +.Lb3.shape: + .asciz "\000\000\000\000d\000\000\000\001\000\000\000\000\000\000" + .size .Lb3.shape, 16 + + .type .Lb3.data,@object # @b3.data + .p2align 5, 0x0 +.Lb3.data: + .ascii "\211\004&=\244\022\233\275\217U%;\272)J>#\026\024\276@_\234>\340\243|\275\334\034K\275\232\300\004:A\313\245\274\303\024r\275\345\313a\275x\231\300>\201\334\"\276\236}\321\275H\346\257\276\202V\310>\225\375(>},\026\274E\235W\275\241\236\236<\245\220\222=\004\361\263\276\2620\240\274\265\036\240=\\:\336=\240\016\263\276\260\224\371\275p\235(=\311\\x;\226y\\\275\360\227\377=\275\364\200=rw/<@U\017=c\327\003=nV\267\273\3543\207\311\274<&{\274h\b\031>\203\222\003\276\213\3649\275d\177\017\276\330\306\246\275'\314:>\203\024\253=S\331M\276\257\263S=b\221q\275\336\t\306=#^\005\2763)\256\274\344\255G>\224!\374=\374\360o\275\215#\031\276\322`J\276y6\360=\273\303;\275N\"N>M\022K\276\016\035C=1Fb\275u\017\220\275\0040\033>,\037\223=t\335\304\275\363\344\271=\320\341\260\275\2265\255=\310\"b=&\031\330\275;\314\000>>\347o\275\255\222\020=C\rU<\370d\307\275\373\306\206=I\345\373\274Y|\034>WY;\276O\240?>B\347\226\275\203\271\203\275W\243\313<\214\364\343\274Ey\033\276J4\214=\261\304\241\274-\271p>X\313l\276*_\366\275\260^\\=\237l1=\371\272\200=\002\373?\274*\3578<]Gv=" + .size .Lb3.data, 400 + + .type .Lb3.buffer,@object # @b3.buffer + .data + .p2align 4, 0x0 +.Lb3.buffer: + .quad 0 # 0x0 + .quad 0 + .quad .Lb3.data + .quad 1 # 0x1 + .byte 2 # 0x2 + .byte 32 # 0x20 + .half 1 # 0x1 + .word 1 # 0x1 + .quad .Lb3.shape + .quad 0 + .size .Lb3.buffer, 56 + + .type .L__unnamed_1,@object # @0 + .section .rodata,"a",@progbits + .p2align 4, 0x0 +.L__unnamed_1: + .zero 32 + .size .L__unnamed_1, 32 + + .type .Lstr,@object # @str + .p2align 5, 0x0 +.Lstr: + .asciz "idw" + .size .Lstr, 4 + + .type .L__unnamed_2,@object # @1 + .section .data.rel.ro,"aw",@progbits + .p2align 4, 0x0 +.L__unnamed_2: + .quad .Lstr + .word 2 # 0x2 + .word 2 # 0x2 + .byte 2 # 0x2 + .byte 32 # 0x20 + .half 1 # 0x1 + .zero 4 + .quad 0 + .quad 0 + .quad 0 + .quad 0 + .quad .L__unnamed_1 + .size .L__unnamed_2, 64 + + .type .Lstr.4,@object # @str.4 + .section .rodata,"a",@progbits + .p2align 5, 0x0 +.Lstr.4: + .asciz "riscv-64-linux-no_asserts-no_runtime-rvv-vector_bits_128" + .size .Lstr.4, 57 + + .type .Lidw_metadata_storage,@object # @idw_metadata_storage + .section .data.rel.ro,"aw",@progbits + .p2align 4, 0x0 +.Lidw_metadata_storage: + .word 1 # 0x1 + .word 1 # 0x1 + .quad .L__unnamed_2 + .quad .Lstr.4 + .quad .Lstr + .size .Lidw_metadata_storage, 32 + + .ident "clang version 16.0.0 (https://github.com/dkurt/llvm-rvv-071 b027aa1b59c9f53240bdc836f39656723fdf9df0)" + .ident "clang version 16.0.0 (https://github.com/dkurt/llvm-rvv-071 b027aa1b59c9f53240bdc836f39656723fdf9df0)" + .ident "clang version 16.0.0 (https://github.com/dkurt/llvm-rvv-071 b027aa1b59c9f53240bdc836f39656723fdf9df0)" + .ident "clang version 16.0.0 (https://github.com/dkurt/llvm-rvv-071 b027aa1b59c9f53240bdc836f39656723fdf9df0)" + .ident "clang version 16.0.0 (https://github.com/dkurt/llvm-rvv-071 b027aa1b59c9f53240bdc836f39656723fdf9df0)" + .ident "clang version 16.0.0 (https://github.com/dkurt/llvm-rvv-071 b027aa1b59c9f53240bdc836f39656723fdf9df0)" + .section ".note.GNU-stack","",@progbits From 366b68a9040cf411535406e8b5b93142cf29ef31 Mon Sep 17 00:00:00 2001 From: uiop07558 Date: Wed, 7 Feb 2024 17:54:23 +0300 Subject: [PATCH 10/15] hardcoded maxval and minval --- .vscode/settings.json | 3 ++- src/idw.cpp | 40 ++++++++++++++++++++-------------------- 2 files changed, 22 insertions(+), 21 deletions(-) diff --git a/.vscode/settings.json b/.vscode/settings.json index 8104e93..9b18adb 100644 --- a/.vscode/settings.json +++ b/.vscode/settings.json @@ -4,6 +4,7 @@ "limits": "cpp", "iostream": "cpp", "cmath": "cpp", - "optional": "cpp" + "optional": "cpp", + "new": "cpp" } } \ No newline at end of file diff --git a/src/idw.cpp b/src/idw.cpp index 7ef9887..7a08ac8 100644 --- a/src/idw.cpp +++ b/src/idw.cpp @@ -80,18 +80,18 @@ void idw_halide(const uint8_t* src, uint8_t* dst, int height, int width, int* po #endif // - float maxVal = -numeric_limits::infinity(); - float minVal = numeric_limits::infinity(); - for (int y = 0; y < height; y++) { - for (int x = 0; x < width; x++) { - if (maskBuf[y*width + x] < minVal) { - minVal = maskBuf[y*width + x]; - } - if (maskBuf[y*width + x] > maxVal) { - maxVal = maskBuf[y*width + x]; - } - } - } + float maxVal = 193.0 //-numeric_limits::infinity(); + float minVal = 58.0 //numeric_limits::infinity(); + // for (int y = 0; y < height; y++) { + // for (int x = 0; x < width; x++) { + // if (maskBuf[y*width + x] < minVal) { + // minVal = maskBuf[y*width + x]; + // } + // if (maskBuf[y*width + x] > maxVal) { + // maxVal = maskBuf[y*width + x]; + // } + // } + // } float diff = maxVal - minVal; for (int y = 0; y < height; y++) { @@ -128,8 +128,8 @@ void idw_halide(const uint8_t* src, uint8_t* dst, int height, int width, int* po void idw_ref(const uint8_t* src, uint8_t* dst, int height, int width, int* points, float* weights) { float* mask = new float[height * width](); - float maxVal = -numeric_limits::infinity(); - float minVal = numeric_limits::infinity(); + float maxVal = 193.0 //-numeric_limits::infinity(); + float minVal = 58.0 //numeric_limits::infinity(); for (int y = 0; y < height; y++) { for (int x = 0; x < width; x++) { @@ -144,12 +144,12 @@ void idw_ref(const uint8_t* src, uint8_t* dst, int height, int width, int* point } mask[y*width + x] = dot; - if (dot < minVal) { - minVal = dot; - } - if (dot > maxVal) { - maxVal = dot; - } + // if (dot < minVal) { + // minVal = dot; + // } + // if (dot > maxVal) { + // maxVal = dot; + // } } } From fac19e5f0b5c9fdf8a82624c3d191ec001119e2e Mon Sep 17 00:00:00 2001 From: Dmitry Kurtaev Date: Wed, 7 Feb 2024 20:42:57 +0300 Subject: [PATCH 11/15] Remove commented code --- .vscode/c_cpp_properties.json | 16 ---------- .vscode/settings.json | 10 ------- perf/perf_main.cpp | 27 +++++------------ src/idw.cpp | 55 +++++------------------------------ 4 files changed, 15 insertions(+), 93 deletions(-) delete mode 100644 .vscode/c_cpp_properties.json delete mode 100644 .vscode/settings.json diff --git a/.vscode/c_cpp_properties.json b/.vscode/c_cpp_properties.json deleted file mode 100644 index 4039bef..0000000 --- a/.vscode/c_cpp_properties.json +++ /dev/null @@ -1,16 +0,0 @@ -{ - "configurations": [ - { - "name": "Linux", - "includePath": [ - "${workspaceFolder}/**" - ], - "defines": [], - "compilerPath": "/usr/bin/gcc", - "cStandard": "c17", - "cppStandard": "gnu++17", - "intelliSenseMode": "linux-gcc-x64" - } - ], - "version": 4 -} \ No newline at end of file diff --git a/.vscode/settings.json b/.vscode/settings.json deleted file mode 100644 index 9b18adb..0000000 --- a/.vscode/settings.json +++ /dev/null @@ -1,10 +0,0 @@ -{ - "C_Cpp.errorSquiggles": "disabled", - "files.associations": { - "limits": "cpp", - "iostream": "cpp", - "cmath": "cpp", - "optional": "cpp", - "new": "cpp" - } -} \ No newline at end of file diff --git a/perf/perf_main.cpp b/perf/perf_main.cpp index 9952623..8357ead 100644 --- a/perf/perf_main.cpp +++ b/perf/perf_main.cpp @@ -14,14 +14,8 @@ #include "algos.hpp" -#include - -// #include - using namespace cv; -// using namespace std; - CV_PERF_TEST_MAIN("") Mat src(1080, 1920, CV_8UC3); @@ -212,19 +206,19 @@ PERF_TEST(convolution_nhwc, halide) { #endif // HAVE_OPENCV_DNN +// 300 elems (100 points) +static int idwPoints[] = {0, 0, 72, 0, 213, 79, 0, 426, 60, 0, 640, 76, 0, 853, 128, 0, 1066, 67, 0, 1280, 65, 0, 1493, 64, 0, 1706, 60, 0, 1920, 61, 120, 0, 81, 120, 213, 79, 120, 426, 58, 120, 640, 132, 120, 853, 149, 120, 1066, 142, 120, 1280, 64, 120, 1493, 69, 120, 1706, 65, 120, 1920, 64, 240, 0, 75, 240, 213, 68, 240, 426, 140, 240, 640, 153, 240, 853, 145, 240, 1066, 132, 240, 1280, 152, 240, 1493, 125, 240, 1706, 66, 240, 1920, 58, 360, 0, 78, 360, 213, 60, 360, 426, 139, 360, 640, 168, 360, 853, 154, 360, 1066, 138, 360, 1280, 145, 360, 1493, 160, 360, 1706, 68, 360, 1920, 60, 480, 0, 77, 480, 213, 59, 480, 426, 165, 480, 640, 183, 480, 853, 166, 480, 1066, 142, 480, 1280, 123, 480, 1493, 155, 480, 1706, 144, 480, 1920, 60, 600, 0, 83, 600, 213, 65, 600, 426, 178, 600, 640, 184, 600, 853, 138, 600, 1066, 124, 600, 1280, 132, 600, 1493, 175, 600, 1706, 175, 600, 1920, 60, 720, 0, 86, 720, 213, 60, 720, 426, 182, 720, 640, 179, 720, 853, 152, 720, 1066, 140, 720, 1280, 115, 720, 1493, 156, 720, 1706, 172, 720, 1920, 65, 840, 0, 90, 840, 213, 69, 840, 426, 164, 840, 640, 186, 840, 853, 146, 840, 1066, 147, 840, 1280, 122, 840, 1493, 152, 840, 1706, 166, 840, 1920, 69, 960, 0, 85, 960, 213, 67, 960, 426, 186, 960, 640, 169, 960, 853, 162, 960, 1066, 147, 960, 1280, 124, 960, 1493, 149, 960, 1706, 157, 960, 1920, 69, 1080, 0, 85, 1080, 213, 70, 1080, 426, 193, 1080, 640, 192, 1080, 853, 155, 1080, 1066, 135, 1080, 1280, 120, 1080, 1493, 138, 1080, 1706, 128, 1080, 1920, 70}; +// 100 elems +float idwWeights[] = {0.04053167, -0.07571915, 0.0025228, 0.1974248, -0.14461569, 0.30541419, -0.06167972, -0.04958807, 0.00050641, -0.02023852, -0.05910183, -0.05512609, 0.37617088, -0.15904428, -0.10229038, -0.34355378, 0.39128499, 0.16502984, -0.00916588, -0.05264022, 0.01936275, 0.07156495, -0.35144818, -0.01955447, 0.07818357, 0.10850975, -0.34972095, -0.12186563, 0.04116577, 0.00378971, -0.05382689, 0.12480152, 0.0629668, 0.01070963, 0.03499341, 0.03218783, -0.00559502, -0.21214646, 0.34402987, -0.0246006, -0.01532894, 0.14944613, -0.12848858, -0.04539923, -0.1401344, -0.08143395, 0.18241941, 0.08353522, -0.20102434, 0.05168503, -0.05897654, 0.09669851, -0.13024191, -0.02125988, 0.19499928, 0.12311092, -0.05857943, -0.14954968, -0.19763496, 0.1172914, -0.04584096, 0.20130274, -0.198312, 0.04763513, -0.05524272, -0.07034198, 0.15155036, 0.0718368, -0.09612551, 0.09076872, -0.0863682, 0.08457486, 0.05520895, -0.10551672, 0.12577908, -0.05857014, 0.03529613, 0.01300365, -0.09736055, 0.06580921, -0.03074898, 0.1528181, -0.18295799, 0.18713497, -0.07368328, -0.06431868, 0.02485816, -0.02782657, -0.15182979, 0.06845911, -0.01974711, 0.23508139, -0.23124445, -0.1202987, 0.05380124, 0.04331648, 0.06285662, -0.01171756, 0.01128749, 0.06012665}; + PERF_TEST(idw, reference) { const int width = 1920; const int height = 1080; - // 300 elems (100 points) - int points[] = {0, 0, 72, 0, 213, 79, 0, 426, 60, 0, 640, 76, 0, 853, 128, 0, 1066, 67, 0, 1280, 65, 0, 1493, 64, 0, 1706, 60, 0, 1920, 61, 120, 0, 81, 120, 213, 79, 120, 426, 58, 120, 640, 132, 120, 853, 149, 120, 1066, 142, 120, 1280, 64, 120, 1493, 69, 120, 1706, 65, 120, 1920, 64, 240, 0, 75, 240, 213, 68, 240, 426, 140, 240, 640, 153, 240, 853, 145, 240, 1066, 132, 240, 1280, 152, 240, 1493, 125, 240, 1706, 66, 240, 1920, 58, 360, 0, 78, 360, 213, 60, 360, 426, 139, 360, 640, 168, 360, 853, 154, 360, 1066, 138, 360, 1280, 145, 360, 1493, 160, 360, 1706, 68, 360, 1920, 60, 480, 0, 77, 480, 213, 59, 480, 426, 165, 480, 640, 183, 480, 853, 166, 480, 1066, 142, 480, 1280, 123, 480, 1493, 155, 480, 1706, 144, 480, 1920, 60, 600, 0, 83, 600, 213, 65, 600, 426, 178, 600, 640, 184, 600, 853, 138, 600, 1066, 124, 600, 1280, 132, 600, 1493, 175, 600, 1706, 175, 600, 1920, 60, 720, 0, 86, 720, 213, 60, 720, 426, 182, 720, 640, 179, 720, 853, 152, 720, 1066, 140, 720, 1280, 115, 720, 1493, 156, 720, 1706, 172, 720, 1920, 65, 840, 0, 90, 840, 213, 69, 840, 426, 164, 840, 640, 186, 840, 853, 146, 840, 1066, 147, 840, 1280, 122, 840, 1493, 152, 840, 1706, 166, 840, 1920, 69, 960, 0, 85, 960, 213, 67, 960, 426, 186, 960, 640, 169, 960, 853, 162, 960, 1066, 147, 960, 1280, 124, 960, 1493, 149, 960, 1706, 157, 960, 1920, 69, 1080, 0, 85, 1080, 213, 70, 1080, 426, 193, 1080, 640, 192, 1080, 853, 155, 1080, 1066, 135, 1080, 1280, 120, 1080, 1493, 138, 1080, 1706, 128, 1080, 1920, 70}; - // 100 elems - float weights[] = {0.04053167, -0.07571915, 0.0025228, 0.1974248, -0.14461569, 0.30541419, -0.06167972, -0.04958807, 0.00050641, -0.02023852, -0.05910183, -0.05512609, 0.37617088, -0.15904428, -0.10229038, -0.34355378, 0.39128499, 0.16502984, -0.00916588, -0.05264022, 0.01936275, 0.07156495, -0.35144818, -0.01955447, 0.07818357, 0.10850975, -0.34972095, -0.12186563, 0.04116577, 0.00378971, -0.05382689, 0.12480152, 0.0629668, 0.01070963, 0.03499341, 0.03218783, -0.00559502, -0.21214646, 0.34402987, -0.0246006, -0.01532894, 0.14944613, -0.12848858, -0.04539923, -0.1401344, -0.08143395, 0.18241941, 0.08353522, -0.20102434, 0.05168503, -0.05897654, 0.09669851, -0.13024191, -0.02125988, 0.19499928, 0.12311092, -0.05857943, -0.14954968, -0.19763496, 0.1172914, -0.04584096, 0.20130274, -0.198312, 0.04763513, -0.05524272, -0.07034198, 0.15155036, 0.0718368, -0.09612551, 0.09076872, -0.0863682, 0.08457486, 0.05520895, -0.10551672, 0.12577908, -0.05857014, 0.03529613, 0.01300365, -0.09736055, 0.06580921, -0.03074898, 0.1528181, -0.18295799, 0.18713497, -0.07368328, -0.06431868, 0.02485816, -0.02782657, -0.15182979, 0.06845911, -0.01974711, 0.23508139, -0.23124445, -0.1202987, 0.05380124, 0.04331648, 0.06285662, -0.01171756, 0.01128749, 0.06012665}; - Mat dst(height, width, CV_8U); PERF_SAMPLE_BEGIN() - idw_ref(NULL, dst.ptr(), height, width, points, weights); + idw_ref(NULL, dst.ptr(), height, width, idwPoints, idwWeights); PERF_SAMPLE_END() SANITY_CHECK_NOTHING(); @@ -234,16 +228,11 @@ PERF_TEST(idw, halide) { const int width = 1920; const int height = 1080; - // 300 elems (100 points) - int points[] = {0, 0, 72, 0, 213, 79, 0, 426, 60, 0, 640, 76, 0, 853, 128, 0, 1066, 67, 0, 1280, 65, 0, 1493, 64, 0, 1706, 60, 0, 1920, 61, 120, 0, 81, 120, 213, 79, 120, 426, 58, 120, 640, 132, 120, 853, 149, 120, 1066, 142, 120, 1280, 64, 120, 1493, 69, 120, 1706, 65, 120, 1920, 64, 240, 0, 75, 240, 213, 68, 240, 426, 140, 240, 640, 153, 240, 853, 145, 240, 1066, 132, 240, 1280, 152, 240, 1493, 125, 240, 1706, 66, 240, 1920, 58, 360, 0, 78, 360, 213, 60, 360, 426, 139, 360, 640, 168, 360, 853, 154, 360, 1066, 138, 360, 1280, 145, 360, 1493, 160, 360, 1706, 68, 360, 1920, 60, 480, 0, 77, 480, 213, 59, 480, 426, 165, 480, 640, 183, 480, 853, 166, 480, 1066, 142, 480, 1280, 123, 480, 1493, 155, 480, 1706, 144, 480, 1920, 60, 600, 0, 83, 600, 213, 65, 600, 426, 178, 600, 640, 184, 600, 853, 138, 600, 1066, 124, 600, 1280, 132, 600, 1493, 175, 600, 1706, 175, 600, 1920, 60, 720, 0, 86, 720, 213, 60, 720, 426, 182, 720, 640, 179, 720, 853, 152, 720, 1066, 140, 720, 1280, 115, 720, 1493, 156, 720, 1706, 172, 720, 1920, 65, 840, 0, 90, 840, 213, 69, 840, 426, 164, 840, 640, 186, 840, 853, 146, 840, 1066, 147, 840, 1280, 122, 840, 1493, 152, 840, 1706, 166, 840, 1920, 69, 960, 0, 85, 960, 213, 67, 960, 426, 186, 960, 640, 169, 960, 853, 162, 960, 1066, 147, 960, 1280, 124, 960, 1493, 149, 960, 1706, 157, 960, 1920, 69, 1080, 0, 85, 1080, 213, 70, 1080, 426, 193, 1080, 640, 192, 1080, 853, 155, 1080, 1066, 135, 1080, 1280, 120, 1080, 1493, 138, 1080, 1706, 128, 1080, 1920, 70}; - // 100 elems - float weights[] = {0.04053167, -0.07571915, 0.0025228, 0.1974248, -0.14461569, 0.30541419, -0.06167972, -0.04958807, 0.00050641, -0.02023852, -0.05910183, -0.05512609, 0.37617088, -0.15904428, -0.10229038, -0.34355378, 0.39128499, 0.16502984, -0.00916588, -0.05264022, 0.01936275, 0.07156495, -0.35144818, -0.01955447, 0.07818357, 0.10850975, -0.34972095, -0.12186563, 0.04116577, 0.00378971, -0.05382689, 0.12480152, 0.0629668, 0.01070963, 0.03499341, 0.03218783, -0.00559502, -0.21214646, 0.34402987, -0.0246006, -0.01532894, 0.14944613, -0.12848858, -0.04539923, -0.1401344, -0.08143395, 0.18241941, 0.08353522, -0.20102434, 0.05168503, -0.05897654, 0.09669851, -0.13024191, -0.02125988, 0.19499928, 0.12311092, -0.05857943, -0.14954968, -0.19763496, 0.1172914, -0.04584096, 0.20130274, -0.198312, 0.04763513, -0.05524272, -0.07034198, 0.15155036, 0.0718368, -0.09612551, 0.09076872, -0.0863682, 0.08457486, 0.05520895, -0.10551672, 0.12577908, -0.05857014, 0.03529613, 0.01300365, -0.09736055, 0.06580921, -0.03074898, 0.1528181, -0.18295799, 0.18713497, -0.07368328, -0.06431868, 0.02485816, -0.02782657, -0.15182979, 0.06845911, -0.01974711, 0.23508139, -0.23124445, -0.1202987, 0.05380124, 0.04331648, 0.06285662, -0.01171756, 0.01128749, 0.06012665}; - Mat dst(height, width, CV_8U); PERF_SAMPLE_BEGIN() - idw_halide(NULL, dst.ptr(), height, width, points, weights); + idw_halide(NULL, dst.ptr(), height, width, idwPoints, idwWeights); PERF_SAMPLE_END() SANITY_CHECK_NOTHING(); -} \ No newline at end of file +} diff --git a/src/idw.cpp b/src/idw.cpp index 7a08ac8..4b9315f 100644 --- a/src/idw.cpp +++ b/src/idw.cpp @@ -30,12 +30,12 @@ void idw_halide(const uint8_t* src, uint8_t* dst, int height, int width, int* po idw(input, output); #else static Func f("idw"); - + // try { if (!f.defined()) { Var x("x"), y("y"); RDom r(0, pointCount); - + f(x, y) = 0.F; Expr x0 = points(3*r+1); Expr y0 = points(3*r); @@ -45,7 +45,7 @@ void idw_halide(const uint8_t* src, uint8_t* dst, int height, int width, int* po f(x, y) += hypot(dx, dy) * weight; // f.vectorize(r, 8); - const int factor = 4; + const int factor = 4; f.update().atomic().vectorize(r, factor); // Compile @@ -79,57 +79,22 @@ void idw_halide(const uint8_t* src, uint8_t* dst, int height, int width, int* po f.realize(mask); #endif - // - float maxVal = 193.0 //-numeric_limits::infinity(); - float minVal = 58.0 //numeric_limits::infinity(); - // for (int y = 0; y < height; y++) { - // for (int x = 0; x < width; x++) { - // if (maskBuf[y*width + x] < minVal) { - // minVal = maskBuf[y*width + x]; - // } - // if (maskBuf[y*width + x] > maxVal) { - // maxVal = maskBuf[y*width + x]; - // } - // } - // } - + float maxVal = 193.0; + float minVal = 58.0; float diff = maxVal - minVal; for (int y = 0; y < height; y++) { for (int x = 0; x < width; x++) { dst[y*width + x] = (uint8_t) (255 * (maskBuf[y*width + x] - minVal) / diff); } } - // - delete[] maskBuf; -// #ifdef __riscv -// idw(input, output); -// #else -// static Func f("idw"); -// if (!f.defined()) { -// Var x("x"), y("y"); -// RDom r(0, pointCount); - -// f(x, y) = 0; -// Expr x0 = points(r.x, 0); -// Expr y0 = points(r.x, 1); -// Expr dx = x - x0; -// Expr dy = y - y0; -// f(x, y) += sqrt(dx*dx + dy*dy) * weights(r.x); -// f(x, y) = cast(f(x, y)); - -// // Dump AOT code -// f.compile_to_header("idw.h", {input}, "idw", target); -// f.compile_to_assembly("idw.s", {input}, "idw", target); -// } -// #endif } void idw_ref(const uint8_t* src, uint8_t* dst, int height, int width, int* points, float* weights) { float* mask = new float[height * width](); - float maxVal = 193.0 //-numeric_limits::infinity(); - float minVal = 58.0 //numeric_limits::infinity(); + float maxVal = 193.0; + float minVal = 58.0; for (int y = 0; y < height; y++) { for (int x = 0; x < width; x++) { @@ -144,12 +109,6 @@ void idw_ref(const uint8_t* src, uint8_t* dst, int height, int width, int* point } mask[y*width + x] = dot; - // if (dot < minVal) { - // minVal = dot; - // } - // if (dot > maxVal) { - // maxVal = dot; - // } } } From 0bf2bfe4ae76b539361b0c985666c299ada3da25 Mon Sep 17 00:00:00 2001 From: Dmitry Kurtaev Date: Wed, 7 Feb 2024 21:03:44 +0300 Subject: [PATCH 12/15] Fix rvv --- aot/idw.s | 2 +- src/idw.cpp | 4 ++-- 2 files changed, 3 insertions(+), 3 deletions(-) diff --git a/aot/idw.s b/aot/idw.s index 45ac36e..647cee3 100644 --- a/aot/idw.s +++ b/aot/idw.s @@ -266,7 +266,7 @@ idw: # @idw vlwu.v v8, (s10) vfmul.vv v8, v9, v8 vfmv.s.f v9, ft0 - vfredusum.vs v8, v8, v9 + vfredsum.vs v8, v8, v9 vfmv.f.s ft0, v8 addi s5, s5, -1 addi s10, s10, 16 diff --git a/src/idw.cpp b/src/idw.cpp index 4b9315f..d974449 100644 --- a/src/idw.cpp +++ b/src/idw.cpp @@ -9,7 +9,7 @@ using namespace std; #ifdef __riscv #include - #include "histogram.h" + #include "idw.h" using namespace Halide::Runtime; #else #include @@ -27,7 +27,7 @@ void idw_halide(const uint8_t* src, uint8_t* dst, int height, int width, int* po Buffer points(pointsBuf, {pointCount*3}); Buffer weights(weightsBuf, {pointCount}); #ifdef __riscv - idw(input, output); + idw(mask); #else static Func f("idw"); From 3b97268b3450a5058fcae17b4ec4a816f514fcb9 Mon Sep 17 00:00:00 2001 From: Dmitry Kurtaev Date: Wed, 7 Feb 2024 23:02:45 +0300 Subject: [PATCH 13/15] Update test.yaml --- .github/workflows/test.yaml | 1 + 1 file changed, 1 insertion(+) diff --git a/.github/workflows/test.yaml b/.github/workflows/test.yaml index e1fd539..34d7e31 100644 --- a/.github/workflows/test.yaml +++ b/.github/workflows/test.yaml @@ -45,6 +45,7 @@ jobs: cd build rm ${{ github.workspace }}/aot/* ./perf_algo --gtest_filter=*halide* + sed -i 's|vfredusum|vfredsum|' idw.s mv *.h *.s halide_runtime.o ${{ github.workspace }}/aot - name: Build for RISC-V From 7d7906140ef0c117696662e38d0b89b8ceeff67f Mon Sep 17 00:00:00 2001 From: uiop07558 Date: Thu, 8 Feb 2024 10:44:01 +0300 Subject: [PATCH 14/15] fix merge artifacts --- aot/idw.s | 4 ---- 1 file changed, 4 deletions(-) diff --git a/aot/idw.s b/aot/idw.s index 7fa4310..647cee3 100644 --- a/aot/idw.s +++ b/aot/idw.s @@ -266,11 +266,7 @@ idw: # @idw vlwu.v v8, (s10) vfmul.vv v8, v9, v8 vfmv.s.f v9, ft0 -<<<<<<< HEAD - vfredusum.vs v8, v8, v9 -======= vfredsum.vs v8, v8, v9 ->>>>>>> a6d731b66e883d018fbaefd444edc23e7aa09c80 vfmv.f.s ft0, v8 addi s5, s5, -1 addi s10, s10, 16 From 1fcc642de0616d9fc74839b72beeb06244cd191e Mon Sep 17 00:00:00 2001 From: uiop07558 Date: Thu, 8 Feb 2024 13:08:26 +0300 Subject: [PATCH 15/15] add variants of idw_halide and minor bugfixes --- .vscode/settings.json | 4 +- aot/idw.s | 296 +++++--------- aot/idw_halide.h | 56 +++ aot/idw_halide.s | 362 +++++++++++++++++ aot/idw_halide_parallel.h | 56 +++ aot/idw_halide_parallel.s | 598 +++++++++++++++++++++++++++ aot/idw_halide_parallel_vec.h | 56 +++ aot/idw_halide_parallel_vec.s | 739 ++++++++++++++++++++++++++++++++++ aot/idw_halide_vec.h | 56 +++ aot/idw_halide_vec.s | 450 +++++++++++++++++++++ include/algos.hpp | 6 + perf/perf_main.cpp | 59 ++- src/idw.cpp | 234 ++++++++++- test/test_main.cpp | 14 + 14 files changed, 2778 insertions(+), 208 deletions(-) create mode 100644 aot/idw_halide.h create mode 100644 aot/idw_halide.s create mode 100644 aot/idw_halide_parallel.h create mode 100644 aot/idw_halide_parallel.s create mode 100644 aot/idw_halide_parallel_vec.h create mode 100644 aot/idw_halide_parallel_vec.s create mode 100644 aot/idw_halide_vec.h create mode 100644 aot/idw_halide_vec.s diff --git a/.vscode/settings.json b/.vscode/settings.json index 9b18adb..90a6474 100644 --- a/.vscode/settings.json +++ b/.vscode/settings.json @@ -5,6 +5,8 @@ "iostream": "cpp", "cmath": "cpp", "optional": "cpp", - "new": "cpp" + "new": "cpp", + "ostream": "cpp", + "numbers": "cpp" } } \ No newline at end of file diff --git a/aot/idw.s b/aot/idw.s index 647cee3..12c9a39 100644 --- a/aot/idw.s +++ b/aot/idw.s @@ -8,41 +8,49 @@ .type idw,@function idw: # @idw # %bb.0: # %entry - addi sp, sp, -192 - sd ra, 184(sp) # 8-byte Folded Spill - sd s0, 176(sp) # 8-byte Folded Spill - sd s1, 168(sp) # 8-byte Folded Spill - sd s2, 160(sp) # 8-byte Folded Spill - sd s3, 152(sp) # 8-byte Folded Spill - sd s4, 144(sp) # 8-byte Folded Spill - sd s5, 136(sp) # 8-byte Folded Spill - sd s6, 128(sp) # 8-byte Folded Spill - sd s7, 120(sp) # 8-byte Folded Spill - sd s8, 112(sp) # 8-byte Folded Spill - sd s9, 104(sp) # 8-byte Folded Spill - sd s10, 96(sp) # 8-byte Folded Spill - sd s11, 88(sp) # 8-byte Folded Spill + addi sp, sp, -96 + sd ra, 88(sp) # 8-byte Folded Spill + sd s0, 80(sp) # 8-byte Folded Spill + sd s1, 72(sp) # 8-byte Folded Spill + sd s2, 64(sp) # 8-byte Folded Spill + sd s3, 56(sp) # 8-byte Folded Spill + sd s4, 48(sp) # 8-byte Folded Spill + sd s5, 40(sp) # 8-byte Folded Spill + sd s6, 32(sp) # 8-byte Folded Spill + sd s7, 24(sp) # 8-byte Folded Spill + sd s8, 16(sp) # 8-byte Folded Spill + sd s9, 8(sp) # 8-byte Folded Spill + sd s10, 0(sp) # 8-byte Folded Spill .Lpcrel_hi0: auipc a1, %pcrel_hi(.Lb2.buffer) addi a1, a1, %pcrel_lo(.Lpcrel_hi0) - ld s6, 16(a1) + ld s8, 16(a1) ld a3, 40(a0) .Lpcrel_hi1: auipc a2, %pcrel_hi(.Lb3.buffer) addi a2, a2, %pcrel_lo(.Lpcrel_hi1) ld s9, 16(a2) - lw s4, 0(a3) - lwu s8, 4(a3) - lw a4, 16(a3) - sd a4, 8(sp) # 8-byte Folded Spill + lw s3, 0(a3) + lw s7, 4(a3) + lw s4, 16(a3) ld a4, 0(a1) - lwu a5, 20(a3) - sd a5, 32(sp) # 8-byte Folded Spill - lw s1, 24(a3) - or a5, s6, a4 + lwu s5, 20(a3) + lw s6, 24(a3) + or a5, s8, a4 mv a4, s9 - bnez a5, .LBB0_2 -# %bb.1: # %then_bb + beqz a5, .LBB0_4 +# %bb.1: # %after_bb + ld a5, 0(a2) + ld s10, 16(a0) + or a4, a4, a5 + beqz a4, .LBB0_5 +.LBB0_2: # %after_bb1 + sext.w a4, s5 + beqz s10, .LBB0_6 +.LBB0_3: + li a0, 0 + j .LBB0_9 +.LBB0_4: # %then_bb lui a4, 128 addiw a4, a4, 9 slli a4, a4, 13 @@ -59,12 +67,11 @@ idw: # @idw sw zero, 12(a4) ld a4, 16(a2) sd zero, 24(a1) -.LBB0_2: # %after_bb ld a5, 0(a2) - ld s11, 16(a0) + ld s10, 16(a0) or a4, a4, a5 - bnez a4, .LBB0_4 -# %bb.3: # %then_bb2 + bnez a4, .LBB0_2 +.LBB0_5: # %then_bb2 sd zero, 16(a2) sd zero, 8(a2) sd zero, 0(a2) @@ -81,15 +88,8 @@ idw: # @idw sw a4, 8(a5) sw zero, 12(a5) sd zero, 24(a2) -.LBB0_4: # %after_bb1 - sext.w a4, s8 - sd a4, 24(sp) # 8-byte Folded Spill - lw a4, 32(sp) # 8-byte Folded Reload - sd s1, 16(sp) # 8-byte Folded Spill - beqz s11, .LBB0_6 -# %bb.5: - li a0, 0 - j .LBB0_9 + sext.w a4, s5 + bnez s10, .LBB0_3 .LBB0_6: # %_halide_buffer_is_bounds_query.exit36 ld a5, 0(a0) bnez a5, .LBB0_8 @@ -102,16 +102,14 @@ idw: # @idw slli a5, a5, 13 addi a5, a5, 2 sd a5, 32(a0) - sw s4, 0(a3) - ld s1, 24(sp) # 8-byte Folded Reload - sw s1, 4(a3) + sw s3, 0(a3) + sw s7, 4(a3) li a5, 1 sw a5, 8(a3) sw zero, 12(a3) - ld a5, 8(sp) # 8-byte Folded Reload - sw a5, 16(a3) + sw s4, 16(a3) sw a4, 20(a3) - sw s1, 24(a3) + sw s7, 24(a3) sw zero, 28(a3) sd zero, 24(a0) .LBB0_8: # %land.rhs.i49 @@ -132,174 +130,88 @@ idw: # @idw or a0, a0, a1 bnez a0, .LBB0_20 # %bb.10: # %"for idw.s0.y.rebased.preheader" - ld s3, 16(sp) # 8-byte Folded Reload - ld a0, 24(sp) # 8-byte Folded Reload - blez a0, .LBB0_13 + blez s7, .LBB0_20 # %bb.11: # %"for idw.s0.y.rebased.us.preheader" li s0, 0 - ld s2, 24(sp) # 8-byte Folded Reload - slli s2, s2, 2 - ld s1, 32(sp) # 8-byte Folded Reload + slli s2, s7, 2 + mv s1, s5 .LBB0_12: # %"for idw.s0.y.rebased.us" # =>This Inner Loop Header: Depth=1 slli a0, s0, 2 - add a0, a0, s11 + add a0, a0, s10 li a1, 0 mv a2, s2 call memset@plt addi s1, s1, -1 - addw s0, s0, s3 + addw s0, s0, s6 bnez s1, .LBB0_12 -.LBB0_13: # %"for idw.s1.y.rebased.preheader" - sd zero, 40(sp) # 8-byte Folded Spill - li a1, 6 - li a2, 1 - addi a3, sp, 64 - addi a7, sp, 76 - addi t0, sp, 72 - addi t1, sp, 68 - li s0, 4 - addi s1, sp, 48 - addi t2, sp, 52 - addi t3, sp, 60 - addi t6, sp, 56 - li ra, 2 - li a6, 3 -.LBB0_14: # %"for idw.s1.y.rebased" +# %bb.13: # %"for idw.s1.y.rebased.preheader" + blez s7, .LBB0_20 +# %bb.14: # %"for idw.s1.y.rebased.us.preheader" + li a6, 0 + addi s8, s8, 4 +.LBB0_15: # %"for idw.s1.y.rebased.us" # =>This Loop Header: Depth=1 # Child Loop BB0_16 Depth 2 # Child Loop BB0_17 Depth 3 - ld a0, 24(sp) # 8-byte Folded Reload - blez a0, .LBB0_19 -# %bb.15: # %"for idw.s1.x.rebased.preheader" - # in Loop: Header=BB0_14 Depth=1 - li t4, 0 - ld a0, 16(sp) # 8-byte Folded Reload - ld a4, 40(sp) # 8-byte Folded Reload - mulw t5, a0, a4 - ld a0, 8(sp) # 8-byte Folded Reload - addw a0, a0, a4 -.LBB0_16: # %"for idw.s1.x.rebased" - # Parent Loop BB0_14 Depth=1 + li a1, 0 + mulw a7, s6, a6 + add a3, s4, a6 +.LBB0_16: # %"for idw.s1.x.rebased.us" + # Parent Loop BB0_15 Depth=1 # => This Loop Header: Depth=2 # Child Loop BB0_17 Depth 3 - add a4, t4, t5 - slli a4, a4, 2 - add s2, s11, a4 - flw ft0, 0(s2) - addw a4, s4, t4 - li s5, 25 - mv s3, s6 - mv s10, s9 -.LBB0_17: # %"for idw.s1.r4$x.r4$x" - # Parent Loop BB0_14 Depth=1 + add a0, a1, a7 + slli a0, a0, 2 + add t0, s10, a0 + flw ft0, 0(t0) + add a5, s3, a1 + li a0, 100 + mv s1, s8 + mv s0, s9 +.LBB0_17: # %"for idw.s1.r4$x.us" + # Parent Loop BB0_15 Depth=1 # Parent Loop BB0_16 Depth=2 # => This Inner Loop Header: Depth=3 - addi s7, s3, 4 - vsetvli zero, a1, e32, m2 - vlwu.v v8, (s7) - vsetvli zero, a2, e32, m2 - vslidedown.vi v10, v8, 3 - addi a5, s3, 20 - vsetvli zero, a1, e32, m2 - vlwu.v v12, (a5) - vsetvli zero, a2, e32, m2 - vslidedown.vi v14, v12, 2 - vslidedown.vi v12, v12, 5 - vsw.v v8, (a3) - vsw.v v12, (a7) - vsw.v v14, (t0) - vsw.v v10, (t1) - vsetvli zero, s0, e32, m1 - vlwu.v v8, (a3) - vsetvli a5, zero, e32, m1 - vrsub.vx v8, v8, a4 - vsetvli zero, a1, e32, m2 - vlwu.v v10, (s3) - vsetvli zero, a2, e32, m2 - vslidedown.vi v12, v10, 3 - addi a5, s3, 16 - vsetvli zero, a1, e32, m2 - vlwu.v v14, (a5) - vsetvli zero, a2, e32, m2 - vslidedown.vi v16, v14, 2 - vslidedown.vi v14, v14, 5 - vsw.v v10, (s1) - vsw.v v12, (t2) - vsw.v v14, (t3) - vsw.v v16, (t6) - vsetvli zero, s0, e32, m1 - vlwu.v v9, (s1) - vsetvli a5, zero, e32, m1 - vrsub.vx v9, v9, a0 - vmul.vv v9, v9, v9 - vmacc.vv v9, v8, v8 - vfcvt.f.x.v v8, v9 - vfmv.f.s ft1, v8 - fsqrt.s ft1, ft1 - vfmv.s.f v9, ft1 - vsetvli zero, a2, e32, m1 - vslidedown.vi v10, v8, 1 - vfmv.f.s ft1, v10 - fsqrt.s ft1, ft1 - vsetvli a5, zero, e32, m1 - vfmv.s.f v10, ft1 - vsetvli zero, ra, e32, m1 - vslideup.vi v9, v10, 1 - vsetvli zero, a2, e32, m1 - vslidedown.vi v10, v8, 2 - vfmv.f.s ft1, v10 - fsqrt.s ft1, ft1 - vsetvli a5, zero, e32, m1 - vfmv.s.f v10, ft1 - vsetvli zero, a6, e32, m1 - vslideup.vi v9, v10, 2 - vsetvli zero, a2, e32, m1 - vslidedown.vi v8, v8, 3 - vfmv.f.s ft1, v8 - fsqrt.s ft1, ft1 - vsetvli a5, zero, e32, m1 - vfmv.s.f v8, ft1 - vsetvli zero, s0, e32, m1 - vslideup.vi v9, v8, 3 - vsetvli zero, zero, e32, m1 - vlwu.v v8, (s10) - vfmul.vv v8, v9, v8 - vfmv.s.f v9, ft0 - vfredsum.vs v8, v8, v9 - vfmv.f.s ft0, v8 - addi s5, s5, -1 - addi s10, s10, 16 - addi s3, s3, 48 - bnez s5, .LBB0_17 -# %bb.18: # %"end for idw.s1.r4$x.r4$x" + lw a2, 0(s1) + lw a4, -4(s1) + subw a2, a5, a2 + subw a4, a3, a4 + mulw a2, a2, a2 + mulw a4, a4, a4 + flw ft1, 0(s0) + add a2, a2, a4 + fcvt.s.w ft2, a2 + fsqrt.s ft2, ft2 + fmadd.s ft0, ft2, ft1, ft0 + addi a0, a0, -1 + addi s0, s0, 4 + addi s1, s1, 12 + bnez a0, .LBB0_17 +# %bb.18: # %"end for idw.s1.r4$x.us" # in Loop: Header=BB0_16 Depth=2 - addi t4, t4, 1 - fsw ft0, 0(s2) - bne t4, s8, .LBB0_16 -.LBB0_19: # %"end for idw.s1.x.rebased" - # in Loop: Header=BB0_14 Depth=1 - ld a4, 40(sp) # 8-byte Folded Reload - addi a4, a4, 1 - ld a0, 32(sp) # 8-byte Folded Reload - sd a4, 40(sp) # 8-byte Folded Spill - bne a4, a0, .LBB0_14 + addi a1, a1, 1 + fsw ft0, 0(t0) + bne a1, s7, .LBB0_16 +# %bb.19: # %"end for idw.s1.x.rebased.loopexit.us" + # in Loop: Header=BB0_15 Depth=1 + addi a6, a6, 1 + bne a6, s5, .LBB0_15 .LBB0_20: # %destructor_block li a0, 0 - ld ra, 184(sp) # 8-byte Folded Reload - ld s0, 176(sp) # 8-byte Folded Reload - ld s1, 168(sp) # 8-byte Folded Reload - ld s2, 160(sp) # 8-byte Folded Reload - ld s3, 152(sp) # 8-byte Folded Reload - ld s4, 144(sp) # 8-byte Folded Reload - ld s5, 136(sp) # 8-byte Folded Reload - ld s6, 128(sp) # 8-byte Folded Reload - ld s7, 120(sp) # 8-byte Folded Reload - ld s8, 112(sp) # 8-byte Folded Reload - ld s9, 104(sp) # 8-byte Folded Reload - ld s10, 96(sp) # 8-byte Folded Reload - ld s11, 88(sp) # 8-byte Folded Reload - addi sp, sp, 192 + ld ra, 88(sp) # 8-byte Folded Reload + ld s0, 80(sp) # 8-byte Folded Reload + ld s1, 72(sp) # 8-byte Folded Reload + ld s2, 64(sp) # 8-byte Folded Reload + ld s3, 56(sp) # 8-byte Folded Reload + ld s4, 48(sp) # 8-byte Folded Reload + ld s5, 40(sp) # 8-byte Folded Reload + ld s6, 32(sp) # 8-byte Folded Reload + ld s7, 24(sp) # 8-byte Folded Reload + ld s8, 16(sp) # 8-byte Folded Reload + ld s9, 8(sp) # 8-byte Folded Reload + ld s10, 0(sp) # 8-byte Folded Reload + addi sp, sp, 96 ret .Lfunc_end0: .size idw, .Lfunc_end0-idw diff --git a/aot/idw_halide.h b/aot/idw_halide.h new file mode 100644 index 0000000..e06605e --- /dev/null +++ b/aot/idw_halide.h @@ -0,0 +1,56 @@ +#ifndef HALIDE__idw_halide_h +#define HALIDE__idw_halide_h +#include + +// Forward declarations of the types used in the interface +// to the Halide pipeline. +// +// For the definitions of these structs, include HalideRuntime.h + +// Halide's representation of a multi-dimensional array. +// Halide::Runtime::Buffer is a more user-friendly wrapper +// around this. Its declaration is in HalideBuffer.h +struct halide_buffer_t; + +// Metadata describing the arguments to the generated function. +// Used to construct calls to the _argv version of the function. +struct halide_filter_metadata_t; + +#ifndef HALIDE_MUST_USE_RESULT +#ifdef __has_attribute +#if __has_attribute(nodiscard) +#define HALIDE_MUST_USE_RESULT [[nodiscard]] +#elif __has_attribute(warn_unused_result) +#define HALIDE_MUST_USE_RESULT __attribute__((warn_unused_result)) +#else +#define HALIDE_MUST_USE_RESULT +#endif +#else +#define HALIDE_MUST_USE_RESULT +#endif +#endif + +#ifndef HALIDE_FUNCTION_ATTRS +#define HALIDE_FUNCTION_ATTRS +#endif + + + +#ifdef __cplusplus +extern "C" { +#endif + +HALIDE_FUNCTION_ATTRS +int idw_halide_(struct halide_buffer_t *_idw_halide__buffer); + +HALIDE_FUNCTION_ATTRS +int idw_halide__argv(void **args); + +HALIDE_FUNCTION_ATTRS +const struct halide_filter_metadata_t *idw_halide__metadata(); + +#ifdef __cplusplus +} // extern "C" +#endif + +#endif diff --git a/aot/idw_halide.s b/aot/idw_halide.s new file mode 100644 index 0000000..74b4a7e --- /dev/null +++ b/aot/idw_halide.s @@ -0,0 +1,362 @@ + .text + .attribute 4, 16 + .attribute 5, "rv64gcv0p7" + .file "halide_buffer_t.cpp" + .section .text.idw_halide_,"ax",@progbits + .globl idw_halide_ # -- Begin function idw_halide_ + .p2align 1 + .type idw_halide_,@function +idw_halide_: # @idw_halide_ +# %bb.0: # %entry + addi sp, sp, -96 + sd ra, 88(sp) # 8-byte Folded Spill + sd s0, 80(sp) # 8-byte Folded Spill + sd s1, 72(sp) # 8-byte Folded Spill + sd s2, 64(sp) # 8-byte Folded Spill + sd s3, 56(sp) # 8-byte Folded Spill + sd s4, 48(sp) # 8-byte Folded Spill + sd s5, 40(sp) # 8-byte Folded Spill + sd s6, 32(sp) # 8-byte Folded Spill + sd s7, 24(sp) # 8-byte Folded Spill + sd s8, 16(sp) # 8-byte Folded Spill + sd s9, 8(sp) # 8-byte Folded Spill + sd s10, 0(sp) # 8-byte Folded Spill +.Lpcrel_hi0: + auipc a1, %pcrel_hi(.Lb2.buffer) + addi a1, a1, %pcrel_lo(.Lpcrel_hi0) + ld s8, 16(a1) + ld a3, 40(a0) +.Lpcrel_hi1: + auipc a2, %pcrel_hi(.Lb3.buffer) + addi a2, a2, %pcrel_lo(.Lpcrel_hi1) + ld s9, 16(a2) + lw s3, 0(a3) + lw s7, 4(a3) + lw s4, 16(a3) + ld a4, 0(a1) + lwu s5, 20(a3) + lw s6, 24(a3) + or a5, s8, a4 + mv a4, s9 + beqz a5, .LBB0_4 +# %bb.1: # %after_bb + ld a5, 0(a2) + ld s10, 16(a0) + or a4, a4, a5 + beqz a4, .LBB0_5 +.LBB0_2: # %after_bb1 + sext.w a4, s5 + beqz s10, .LBB0_6 +.LBB0_3: + li a0, 0 + j .LBB0_9 +.LBB0_4: # %then_bb + lui a4, 128 + addiw a4, a4, 9 + slli a4, a4, 13 + sd a4, 32(a1) + ld a4, 40(a1) + sd zero, 0(a1) + sd zero, 8(a1) + sd zero, 16(a1) + sw zero, 0(a4) + li a5, 300 + sw a5, 4(a4) + li a5, 1 + sw a5, 8(a4) + sw zero, 12(a4) + ld a4, 16(a2) + sd zero, 24(a1) + ld a5, 0(a2) + ld s10, 16(a0) + or a4, a4, a5 + bnez a4, .LBB0_2 +.LBB0_5: # %then_bb2 + sd zero, 16(a2) + sd zero, 8(a2) + sd zero, 0(a2) + lui a4, 128 + addiw a4, a4, 9 + ld a5, 40(a2) + slli a4, a4, 13 + addi a4, a4, 2 + sd a4, 32(a2) + sw zero, 0(a5) + li a4, 100 + sw a4, 4(a5) + li a4, 1 + sw a4, 8(a5) + sw zero, 12(a5) + sd zero, 24(a2) + sext.w a4, s5 + bnez s10, .LBB0_3 +.LBB0_6: # %_halide_buffer_is_bounds_query.exit36 + ld a5, 0(a0) + bnez a5, .LBB0_8 +# %bb.7: # %then_bb5 + sd zero, 16(a0) + sd zero, 8(a0) + sd zero, 0(a0) + lui a5, 256 + addiw a5, a5, 9 + slli a5, a5, 13 + addi a5, a5, 2 + sd a5, 32(a0) + sw s3, 0(a3) + sw s7, 4(a3) + li a5, 1 + sw a5, 8(a3) + sw zero, 12(a3) + sw s4, 16(a3) + sw a4, 20(a3) + sw s7, 24(a3) + sw zero, 28(a3) + sd zero, 24(a0) +.LBB0_8: # %land.rhs.i49 + ld a0, 0(a0) + seqz a0, a0 +.LBB0_9: # %_halide_buffer_is_bounds_query.exit50 + ld a3, 16(a1) + ld a1, 0(a1) + ld a5, 16(a2) + ld a2, 0(a2) + or a1, a1, a3 + seqz a1, a1 + or a2, a2, a5 + seqz a2, a2 + or a1, a1, a2 + or a0, a0, a1 + slti a1, a4, 1 + or a0, a0, a1 + bnez a0, .LBB0_20 +# %bb.10: # %"for idw_halide_.s0.y.rebased.preheader" + blez s7, .LBB0_20 +# %bb.11: # %"for idw_halide_.s0.y.rebased.us.preheader" + li s0, 0 + slli s2, s7, 2 + mv s1, s5 +.LBB0_12: # %"for idw_halide_.s0.y.rebased.us" + # =>This Inner Loop Header: Depth=1 + slli a0, s0, 2 + add a0, a0, s10 + li a1, 0 + mv a2, s2 + call memset@plt + addi s1, s1, -1 + addw s0, s0, s6 + bnez s1, .LBB0_12 +# %bb.13: # %"for idw_halide_.s1.y.rebased.preheader" + blez s7, .LBB0_20 +# %bb.14: # %"for idw_halide_.s1.y.rebased.us.preheader" + li a6, 0 + addi s8, s8, 4 +.LBB0_15: # %"for idw_halide_.s1.y.rebased.us" + # =>This Loop Header: Depth=1 + # Child Loop BB0_16 Depth 2 + # Child Loop BB0_17 Depth 3 + li a1, 0 + mulw a7, s6, a6 + add a3, s4, a6 +.LBB0_16: # %"for idw_halide_.s1.x.rebased.us" + # Parent Loop BB0_15 Depth=1 + # => This Loop Header: Depth=2 + # Child Loop BB0_17 Depth 3 + add a0, a1, a7 + slli a0, a0, 2 + add t0, s10, a0 + flw ft0, 0(t0) + add a5, s3, a1 + li a0, 100 + mv s1, s8 + mv s0, s9 +.LBB0_17: # %"for idw_halide_.s1.r4$x.us" + # Parent Loop BB0_15 Depth=1 + # Parent Loop BB0_16 Depth=2 + # => This Inner Loop Header: Depth=3 + lw a2, 0(s1) + lw a4, -4(s1) + subw a2, a5, a2 + subw a4, a3, a4 + mulw a2, a2, a2 + mulw a4, a4, a4 + flw ft1, 0(s0) + add a2, a2, a4 + fcvt.s.w ft2, a2 + fsqrt.s ft2, ft2 + fmadd.s ft0, ft2, ft1, ft0 + addi a0, a0, -1 + addi s0, s0, 4 + addi s1, s1, 12 + bnez a0, .LBB0_17 +# %bb.18: # %"end for idw_halide_.s1.r4$x.us" + # in Loop: Header=BB0_16 Depth=2 + addi a1, a1, 1 + fsw ft0, 0(t0) + bne a1, s7, .LBB0_16 +# %bb.19: # %"end for idw_halide_.s1.x.rebased.loopexit.us" + # in Loop: Header=BB0_15 Depth=1 + addi a6, a6, 1 + bne a6, s5, .LBB0_15 +.LBB0_20: # %destructor_block + li a0, 0 + ld ra, 88(sp) # 8-byte Folded Reload + ld s0, 80(sp) # 8-byte Folded Reload + ld s1, 72(sp) # 8-byte Folded Reload + ld s2, 64(sp) # 8-byte Folded Reload + ld s3, 56(sp) # 8-byte Folded Reload + ld s4, 48(sp) # 8-byte Folded Reload + ld s5, 40(sp) # 8-byte Folded Reload + ld s6, 32(sp) # 8-byte Folded Reload + ld s7, 24(sp) # 8-byte Folded Reload + ld s8, 16(sp) # 8-byte Folded Reload + ld s9, 8(sp) # 8-byte Folded Reload + ld s10, 0(sp) # 8-byte Folded Reload + addi sp, sp, 96 + ret +.Lfunc_end0: + .size idw_halide_, .Lfunc_end0-idw_halide_ + # -- End function + .section .text.idw_halide__argv,"ax",@progbits + .globl idw_halide__argv # -- Begin function idw_halide__argv + .p2align 1 + .type idw_halide__argv,@function +idw_halide__argv: # @idw_halide__argv +# %bb.0: # %entry + addi sp, sp, -16 + sd ra, 8(sp) # 8-byte Folded Spill + ld a0, 0(a0) + call idw_halide_@plt + li a0, 0 + ld ra, 8(sp) # 8-byte Folded Reload + addi sp, sp, 16 + ret +.Lfunc_end1: + .size idw_halide__argv, .Lfunc_end1-idw_halide__argv + # -- End function + .section .text.idw_halide__metadata,"ax",@progbits + .globl idw_halide__metadata # -- Begin function idw_halide__metadata + .p2align 1 + .type idw_halide__metadata,@function +idw_halide__metadata: # @idw_halide__metadata +# %bb.0: # %entry +.Lpcrel_hi2: + auipc a0, %pcrel_hi(.Lidw_halide__metadata_storage) + addi a0, a0, %pcrel_lo(.Lpcrel_hi2) + ret +.Lfunc_end2: + .size idw_halide__metadata, .Lfunc_end2-idw_halide__metadata + # -- End function + .type .Lb2.shape,@object # @b2.shape + .section .rodata,"a",@progbits + .p2align 5, 0x0 +.Lb2.shape: + .asciz "\000\000\000\000,\001\000\000\001\000\000\000\000\000\000" + .size .Lb2.shape, 16 + + .type .Lb2.data,@object # @b2.data + .p2align 5, 0x0 +.Lb2.data: + .asciz "\000\000\000\000\000\000\000\000H\000\000\000\000\000\000\000\325\000\000\000O\000\000\000\000\000\000\000\252\001\000\000<\000\000\000\000\000\000\000\200\002\000\000L\000\000\000\000\000\000\000U\003\000\000\200\000\000\000\000\000\000\000*\004\000\000C\000\000\000\000\000\000\000\000\005\000\000A\000\000\000\000\000\000\000\325\005\000\000@\000\000\000\000\000\000\000\252\006\000\000<\000\000\000\000\000\000\000\200\007\000\000=\000\000\000x\000\000\000\000\000\000\000Q\000\000\000x\000\000\000\325\000\000\000O\000\000\000x\000\000\000\252\001\000\000:\000\000\000x\000\000\000\200\002\000\000\204\000\000\000x\000\000\000U\003\000\000\225\000\000\000x\000\000\000*\004\000\000\216\000\000\000x\000\000\000\000\005\000\000@\000\000\000x\000\000\000\325\005\000\000E\000\000\000x\000\000\000\252\006\000\000A\000\000\000x\000\000\000\200\007\000\000@\000\000\000\360\000\000\000\000\000\000\000K\000\000\000\360\000\000\000\325\000\000\000D\000\000\000\360\000\000\000\252\001\000\000\214\000\000\000\360\000\000\000\200\002\000\000\231\000\000\000\360\000\000\000U\003\000\000\221\000\000\000\360\000\000\000*\004\000\000\204\000\000\000\360\000\000\000\000\005\000\000\230\000\000\000\360\000\000\000\325\005\000\000}\000\000\000\360\000\000\000\252\006\000\000B\000\000\000\360\000\000\000\200\007\000\000:\000\000\000h\001\000\000\000\000\000\000N\000\000\000h\001\000\000\325\000\000\000<\000\000\000h\001\000\000\252\001\000\000\213\000\000\000h\001\000\000\200\002\000\000\250\000\000\000h\001\000\000U\003\000\000\232\000\000\000h\001\000\000*\004\000\000\212\000\000\000h\001\000\000\000\005\000\000\221\000\000\000h\001\000\000\325\005\000\000\240\000\000\000h\001\000\000\252\006\000\000D\000\000\000h\001\000\000\200\007\000\000<\000\000\000\340\001\000\000\000\000\000\000M\000\000\000\340\001\000\000\325\000\000\000;\000\000\000\340\001\000\000\252\001\000\000\245\000\000\000\340\001\000\000\200\002\000\000\267\000\000\000\340\001\000\000U\003\000\000\246\000\000\000\340\001\000\000*\004\000\000\216\000\000\000\340\001\000\000\000\005\000\000{\000\000\000\340\001\000\000\325\005\000\000\233\000\000\000\340\001\000\000\252\006\000\000\220\000\000\000\340\001\000\000\200\007\000\000<\000\000\000X\002\000\000\000\000\000\000S\000\000\000X\002\000\000\325\000\000\000A\000\000\000X\002\000\000\252\001\000\000\262\000\000\000X\002\000\000\200\002\000\000\270\000\000\000X\002\000\000U\003\000\000\212\000\000\000X\002\000\000*\004\000\000|\000\000\000X\002\000\000\000\005\000\000\204\000\000\000X\002\000\000\325\005\000\000\257\000\000\000X\002\000\000\252\006\000\000\257\000\000\000X\002\000\000\200\007\000\000<\000\000\000\320\002\000\000\000\000\000\000V\000\000\000\320\002\000\000\325\000\000\000<\000\000\000\320\002\000\000\252\001\000\000\266\000\000\000\320\002\000\000\200\002\000\000\263\000\000\000\320\002\000\000U\003\000\000\230\000\000\000\320\002\000\000*\004\000\000\214\000\000\000\320\002\000\000\000\005\000\000s\000\000\000\320\002\000\000\325\005\000\000\234\000\000\000\320\002\000\000\252\006\000\000\254\000\000\000\320\002\000\000\200\007\000\000A\000\000\000H\003\000\000\000\000\000\000Z\000\000\000H\003\000\000\325\000\000\000E\000\000\000H\003\000\000\252\001\000\000\244\000\000\000H\003\000\000\200\002\000\000\272\000\000\000H\003\000\000U\003\000\000\222\000\000\000H\003\000\000*\004\000\000\223\000\000\000H\003\000\000\000\005\000\000z\000\000\000H\003\000\000\325\005\000\000\230\000\000\000H\003\000\000\252\006\000\000\246\000\000\000H\003\000\000\200\007\000\000E\000\000\000\300\003\000\000\000\000\000\000U\000\000\000\300\003\000\000\325\000\000\000C\000\000\000\300\003\000\000\252\001\000\000\272\000\000\000\300\003\000\000\200\002\000\000\251\000\000\000\300\003\000\000U\003\000\000\242\000\000\000\300\003\000\000*\004\000\000\223\000\000\000\300\003\000\000\000\005\000\000|\000\000\000\300\003\000\000\325\005\000\000\225\000\000\000\300\003\000\000\252\006\000\000\235\000\000\000\300\003\000\000\200\007\000\000E\000\000\0008\004\000\000\000\000\000\000U\000\000\0008\004\000\000\325\000\000\000F\000\000\0008\004\000\000\252\001\000\000\301\000\000\0008\004\000\000\200\002\000\000\300\000\000\0008\004\000\000U\003\000\000\233\000\000\0008\004\000\000*\004\000\000\207\000\000\0008\004\000\000\000\005\000\000x\000\000\0008\004\000\000\325\005\000\000\212\000\000\0008\004\000\000\252\006\000\000\200\000\000\0008\004\000\000\200\007\000\000F\000\000" + .size .Lb2.data, 1200 + + .type .Lb2.buffer,@object # @b2.buffer + .data + .p2align 4, 0x0 +.Lb2.buffer: + .quad 0 # 0x0 + .quad 0 + .quad .Lb2.data + .quad 1 # 0x1 + .byte 0 # 0x0 + .byte 32 # 0x20 + .half 1 # 0x1 + .word 1 # 0x1 + .quad .Lb2.shape + .quad 0 + .size .Lb2.buffer, 56 + + .type .Lb3.shape,@object # @b3.shape + .section .rodata,"a",@progbits + .p2align 5, 0x0 +.Lb3.shape: + .asciz "\000\000\000\000d\000\000\000\001\000\000\000\000\000\000" + .size .Lb3.shape, 16 + + .type .Lb3.data,@object # @b3.data + .p2align 5, 0x0 +.Lb3.data: + .ascii "\211\004&=\244\022\233\275\217U%;\272)J>#\026\024\276@_\234>\340\243|\275\334\034K\275\232\300\004:A\313\245\274\303\024r\275\345\313a\275x\231\300>\201\334\"\276\236}\321\275H\346\257\276\202V\310>\225\375(>},\026\274E\235W\275\241\236\236<\245\220\222=\004\361\263\276\2620\240\274\265\036\240=\\:\336=\240\016\263\276\260\224\371\275p\235(=\311\\x;\226y\\\275\360\227\377=\275\364\200=rw/<@U\017=c\327\003=nV\267\273\3543\207\311\274<&{\274h\b\031>\203\222\003\276\213\3649\275d\177\017\276\330\306\246\275'\314:>\203\024\253=S\331M\276\257\263S=b\221q\275\336\t\306=#^\005\2763)\256\274\344\255G>\224!\374=\374\360o\275\215#\031\276\322`J\276y6\360=\273\303;\275N\"N>M\022K\276\016\035C=1Fb\275u\017\220\275\0040\033>,\037\223=t\335\304\275\363\344\271=\320\341\260\275\2265\255=\310\"b=&\031\330\275;\314\000>>\347o\275\255\222\020=C\rU<\370d\307\275\373\306\206=I\345\373\274Y|\034>WY;\276O\240?>B\347\226\275\203\271\203\275W\243\313<\214\364\343\274Ey\033\276J4\214=\261\304\241\274-\271p>X\313l\276*_\366\275\260^\\=\237l1=\371\272\200=\002\373?\274*\3578<]Gv=" + .size .Lb3.data, 400 + + .type .Lb3.buffer,@object # @b3.buffer + .data + .p2align 4, 0x0 +.Lb3.buffer: + .quad 0 # 0x0 + .quad 0 + .quad .Lb3.data + .quad 1 # 0x1 + .byte 2 # 0x2 + .byte 32 # 0x20 + .half 1 # 0x1 + .word 1 # 0x1 + .quad .Lb3.shape + .quad 0 + .size .Lb3.buffer, 56 + + .type .L__unnamed_1,@object # @0 + .section .rodata,"a",@progbits + .p2align 4, 0x0 +.L__unnamed_1: + .zero 32 + .size .L__unnamed_1, 32 + + .type .Lstr,@object # @str + .p2align 5, 0x0 +.Lstr: + .asciz "idw_halide_" + .size .Lstr, 12 + + .type .L__unnamed_2,@object # @1 + .section .data.rel.ro,"aw",@progbits + .p2align 4, 0x0 +.L__unnamed_2: + .quad .Lstr + .word 2 # 0x2 + .word 2 # 0x2 + .byte 2 # 0x2 + .byte 32 # 0x20 + .half 1 # 0x1 + .zero 4 + .quad 0 + .quad 0 + .quad 0 + .quad 0 + .quad .L__unnamed_1 + .size .L__unnamed_2, 64 + + .type .Lstr.4,@object # @str.4 + .section .rodata,"a",@progbits + .p2align 5, 0x0 +.Lstr.4: + .asciz "riscv-64-linux-no_asserts-no_runtime-rvv-vector_bits_128" + .size .Lstr.4, 57 + + .type .Lidw_halide__metadata_storage,@object # @idw_halide__metadata_storage + .section .data.rel.ro,"aw",@progbits + .p2align 4, 0x0 +.Lidw_halide__metadata_storage: + .word 1 # 0x1 + .word 1 # 0x1 + .quad .L__unnamed_2 + .quad .Lstr.4 + .quad .Lstr + .size .Lidw_halide__metadata_storage, 32 + + .ident "clang version 16.0.0 (https://github.com/dkurt/llvm-rvv-071 b027aa1b59c9f53240bdc836f39656723fdf9df0)" + .ident "clang version 16.0.0 (https://github.com/dkurt/llvm-rvv-071 b027aa1b59c9f53240bdc836f39656723fdf9df0)" + .ident "clang version 16.0.0 (https://github.com/dkurt/llvm-rvv-071 b027aa1b59c9f53240bdc836f39656723fdf9df0)" + .ident "clang version 16.0.0 (https://github.com/dkurt/llvm-rvv-071 b027aa1b59c9f53240bdc836f39656723fdf9df0)" + .ident "clang version 16.0.0 (https://github.com/dkurt/llvm-rvv-071 b027aa1b59c9f53240bdc836f39656723fdf9df0)" + .ident "clang version 16.0.0 (https://github.com/dkurt/llvm-rvv-071 b027aa1b59c9f53240bdc836f39656723fdf9df0)" + .section ".note.GNU-stack","",@progbits diff --git a/aot/idw_halide_parallel.h b/aot/idw_halide_parallel.h new file mode 100644 index 0000000..6f272cb --- /dev/null +++ b/aot/idw_halide_parallel.h @@ -0,0 +1,56 @@ +#ifndef HALIDE__idw_halide_parallel_h +#define HALIDE__idw_halide_parallel_h +#include + +// Forward declarations of the types used in the interface +// to the Halide pipeline. +// +// For the definitions of these structs, include HalideRuntime.h + +// Halide's representation of a multi-dimensional array. +// Halide::Runtime::Buffer is a more user-friendly wrapper +// around this. Its declaration is in HalideBuffer.h +struct halide_buffer_t; + +// Metadata describing the arguments to the generated function. +// Used to construct calls to the _argv version of the function. +struct halide_filter_metadata_t; + +#ifndef HALIDE_MUST_USE_RESULT +#ifdef __has_attribute +#if __has_attribute(nodiscard) +#define HALIDE_MUST_USE_RESULT [[nodiscard]] +#elif __has_attribute(warn_unused_result) +#define HALIDE_MUST_USE_RESULT __attribute__((warn_unused_result)) +#else +#define HALIDE_MUST_USE_RESULT +#endif +#else +#define HALIDE_MUST_USE_RESULT +#endif +#endif + +#ifndef HALIDE_FUNCTION_ATTRS +#define HALIDE_FUNCTION_ATTRS +#endif + + + +#ifdef __cplusplus +extern "C" { +#endif + +HALIDE_FUNCTION_ATTRS +int idw_halide_parallel_(struct halide_buffer_t *_idw_halide_parallel__buffer); + +HALIDE_FUNCTION_ATTRS +int idw_halide_parallel__argv(void **args); + +HALIDE_FUNCTION_ATTRS +const struct halide_filter_metadata_t *idw_halide_parallel__metadata(); + +#ifdef __cplusplus +} // extern "C" +#endif + +#endif diff --git a/aot/idw_halide_parallel.s b/aot/idw_halide_parallel.s new file mode 100644 index 0000000..7578098 --- /dev/null +++ b/aot/idw_halide_parallel.s @@ -0,0 +1,598 @@ + .text + .attribute 4, 16 + .attribute 5, "rv64gcv0p7" + .file "halide_buffer_t.cpp" + .section .text.idw_halide_parallel__par_for_idw_halide_parallel__s0_x_x_outer_tile_index__1,"ax",@progbits + .globl idw_halide_parallel__par_for_idw_halide_parallel__s0_x_x_outer_tile_index__1 # -- Begin function idw_halide_parallel__par_for_idw_halide_parallel__s0_x_x_outer_tile_index__1 + .p2align 1 + .type idw_halide_parallel__par_for_idw_halide_parallel__s0_x_x_outer_tile_index__1,@function +idw_halide_parallel__par_for_idw_halide_parallel__s0_x_x_outer_tile_index__1: # @idw_halide_parallel__par_for_idw_halide_parallel__s0_x_x_outer_tile_index__1 +# %bb.0: # %entry + addi sp, sp, -48 + sd ra, 40(sp) # 8-byte Folded Spill + sd s0, 32(sp) # 8-byte Folded Spill + sd s1, 24(sp) # 8-byte Folded Spill + sd s2, 16(sp) # 8-byte Folded Spill + sd s3, 8(sp) # 8-byte Folded Spill + lw a4, 24(a2) + lw a7, 8(a2) + lw a6, 12(a2) + sraiw a3, a1, 31 + seqz a5, a4 + negw a0, a5 + srai s1, a4, 31 + subw t0, a1, a3 + or a0, a0, a4 + remw a1, t0, a0 + xor s0, s1, a4 + not a0, s1 + add s0, s0, a0 + and s0, s0, a3 + add a1, a1, s0 + li s0, 480 + mulw a1, a1, s0 + addi s0, a5, -1 + and t2, s0, a1 + addiw a1, a7, -480 + blt t2, a1, .LBB0_2 +# %bb.1: # %entry + mv t2, a1 +.LBB0_2: # %entry + lw a7, 16(a2) + lw s2, 20(a2) + lw t1, 28(a2) + add a4, a4, a5 + divw a1, t0, a4 + subw a0, a0, s1 + and a0, a0, a3 + add a0, a0, a1 + li s1, 270 + mulw a0, a0, s1 + and a0, a0, s0 + addiw a1, a6, -270 + blt a0, a1, .LBB0_4 +# %bb.3: # %entry + mv a0, a1 +.LBB0_4: # %entry + ld s3, 0(a2) + add a0, a0, a7 + add t1, t1, t2 + mulw a0, s2, a0 + addw s0, t1, a0 +.LBB0_5: # %"for idw_halide_parallel_.s0.y.y_inner" + # =>This Inner Loop Header: Depth=1 + slli a0, s0, 2 + add a0, a0, s3 + li a2, 1920 + li a1, 0 + call memset@plt + addi s1, s1, -1 + addw s0, s0, s2 + bnez s1, .LBB0_5 +# %bb.6: # %destructor_block + li a0, 0 + ld ra, 40(sp) # 8-byte Folded Reload + ld s0, 32(sp) # 8-byte Folded Reload + ld s1, 24(sp) # 8-byte Folded Reload + ld s2, 16(sp) # 8-byte Folded Reload + ld s3, 8(sp) # 8-byte Folded Reload + addi sp, sp, 48 + ret +.Lfunc_end0: + .size idw_halide_parallel__par_for_idw_halide_parallel__s0_x_x_outer_tile_index__1, .Lfunc_end0-idw_halide_parallel__par_for_idw_halide_parallel__s0_x_x_outer_tile_index__1 + # -- End function + .section .text.idw_halide_parallel__par_for_idw_halide_parallel__s1_x_x_outer_tile_index__1,"ax",@progbits + .globl idw_halide_parallel__par_for_idw_halide_parallel__s1_x_x_outer_tile_index__1 # -- Begin function idw_halide_parallel__par_for_idw_halide_parallel__s1_x_x_outer_tile_index__1 + .p2align 1 + .type idw_halide_parallel__par_for_idw_halide_parallel__s1_x_x_outer_tile_index__1,@function +idw_halide_parallel__par_for_idw_halide_parallel__s1_x_x_outer_tile_index__1: # @idw_halide_parallel__par_for_idw_halide_parallel__s1_x_x_outer_tile_index__1 +# %bb.0: # %entry + addi sp, sp, -48 + sd s0, 40(sp) # 8-byte Folded Spill + sd s1, 32(sp) # 8-byte Folded Spill + sd s2, 24(sp) # 8-byte Folded Spill + sd s3, 16(sp) # 8-byte Folded Spill + sd s4, 8(sp) # 8-byte Folded Spill + sd s5, 0(sp) # 8-byte Folded Spill + li a7, 0 + ld s3, 0(a2) + ld t4, 8(a2) + ld t3, 16(a2) + lw a0, 36(a2) + lw t5, 24(a2) + lw t1, 28(a2) + sraiw a3, a1, 31 + seqz a4, a0 + negw a5, a4 + srai s1, a0, 31 + subw a1, a1, a3 + or a5, a5, a0 + remw a6, a1, a5 + xor s0, s1, a0 + not a5, s1 + add s0, s0, a5 + and s0, s0, a3 + add s0, s0, a6 + lw a6, 32(a2) + lw t6, 40(a2) + add a0, a0, a4 + divw a0, a1, a0 + subw a5, a5, s1 + and a3, a3, a5 + add a0, a0, a3 + li t2, 480 + mulw a1, s0, t2 + addi a4, a4, -1 + and s2, a4, a1 + li t0, 270 + mulw a0, a0, t0 + and a0, a0, a4 + add t1, t1, a0 + addi s3, s3, 4 +.LBB1_1: # %"for idw_halide_parallel_.s1.y.y_inner" + # =>This Loop Header: Depth=1 + # Child Loop BB1_2 Depth 2 + # Child Loop BB1_3 Depth 3 + li a4, 0 + add a3, t1, a7 + mulw s4, a3, a6 +.LBB1_2: # %"for idw_halide_parallel_.s1.x.x_inner" + # Parent Loop BB1_1 Depth=1 + # => This Loop Header: Depth=2 + # Child Loop BB1_3 Depth 3 + add s0, s2, a4 + add a0, s0, t6 + addw a0, a0, s4 + slli a0, a0, 2 + add s5, t3, a0 + flw ft0, 0(s5) + add s0, s0, t5 + li s1, 100 + mv a1, s3 + mv a0, t4 +.LBB1_3: # %"for idw_halide_parallel_.s1.r13$x" + # Parent Loop BB1_1 Depth=1 + # Parent Loop BB1_2 Depth=2 + # => This Inner Loop Header: Depth=3 + lw a5, 0(a1) + lw a2, -4(a1) + subw a5, s0, a5 + subw a2, a3, a2 + mulw a5, a5, a5 + mulw a2, a2, a2 + flw ft1, 0(a0) + add a2, a2, a5 + fcvt.s.w ft2, a2 + fsqrt.s ft2, ft2 + fmadd.s ft0, ft2, ft1, ft0 + addi s1, s1, -1 + addi a0, a0, 4 + addi a1, a1, 12 + bnez s1, .LBB1_3 +# %bb.4: # %"end for idw_halide_parallel_.s1.r13$x" + # in Loop: Header=BB1_2 Depth=2 + addi a4, a4, 1 + fsw ft0, 0(s5) + bne a4, t2, .LBB1_2 +# %bb.5: # %"end for idw_halide_parallel_.s1.x.x_inner" + # in Loop: Header=BB1_1 Depth=1 + addiw a7, a7, 1 + bne a7, t0, .LBB1_1 +# %bb.6: # %destructor_block + li a0, 0 + ld s0, 40(sp) # 8-byte Folded Reload + ld s1, 32(sp) # 8-byte Folded Reload + ld s2, 24(sp) # 8-byte Folded Reload + ld s3, 16(sp) # 8-byte Folded Reload + ld s4, 8(sp) # 8-byte Folded Reload + ld s5, 0(sp) # 8-byte Folded Reload + addi sp, sp, 48 + ret +.Lfunc_end1: + .size idw_halide_parallel__par_for_idw_halide_parallel__s1_x_x_outer_tile_index__1, .Lfunc_end1-idw_halide_parallel__par_for_idw_halide_parallel__s1_x_x_outer_tile_index__1 + # -- End function + .section .text.idw_halide_parallel_,"ax",@progbits + .globl idw_halide_parallel_ # -- Begin function idw_halide_parallel_ + .p2align 1 + .type idw_halide_parallel_,@function +idw_halide_parallel_: # @idw_halide_parallel_ +# %bb.0: # %entry + addi sp, sp, -176 + sd ra, 168(sp) # 8-byte Folded Spill + sd s0, 160(sp) # 8-byte Folded Spill + sd s1, 152(sp) # 8-byte Folded Spill + sd s2, 144(sp) # 8-byte Folded Spill + sd s3, 136(sp) # 8-byte Folded Spill + sd s4, 128(sp) # 8-byte Folded Spill + sd s5, 120(sp) # 8-byte Folded Spill + sd s6, 112(sp) # 8-byte Folded Spill + sd s7, 104(sp) # 8-byte Folded Spill + sd s8, 96(sp) # 8-byte Folded Spill + sd s9, 88(sp) # 8-byte Folded Spill + ld t5, 40(a0) + lw a4, 4(t5) + lw t2, 20(t5) + li a2, -480 + subw a6, a2, a4 + addiw a2, a4, -1 + blt a2, a6, .LBB2_2 +# %bb.1: # %entry + mv a6, a2 +.LBB2_2: # %entry +.Lpcrel_hi0: + auipc a2, %pcrel_hi(.Lb24.buffer) + addi a3, a2, %pcrel_lo(.Lpcrel_hi0) + sgtz t4, a4 + slti a5, a4, -479 + slti s0, t2, -269 + lui s1, 559241 + li a7, 480 + addiw t1, s1, -1911 + mv t0, a4 + blt a4, a7, .LBB2_4 +# %bb.3: # %entry + li t0, 480 +.LBB2_4: # %entry +.Lpcrel_hi1: + auipc t6, %pcrel_hi(.Lb25.buffer) + ld s2, 16(a3) + add a1, a4, a5 + addiw a1, a1, 479 + mul a2, a1, t1 + srli a2, a2, 32 + add a1, a1, a2 + srliw a2, a1, 31 + sraiw a1, a1, 8 + add a1, a1, a2 + subw s9, a1, a5 + add a1, t2, s0 + addiw a1, a1, 269 + lui a2, 994205 + addiw a2, a2, 1609 + mul a2, a1, a2 + srli a2, a2, 32 + add a1, a1, a2 + srliw a2, a1, 31 + sraiw a1, a1, 8 + add a1, a1, a2 + subw a1, a1, s0 + mulw s7, a1, s9 + slti a1, s7, 1 + negw a2, a1 + seqz s1, s9 + add a1, a1, s7 + addiw a1, a1, -1 + add s0, s9, s1 + divw a1, a1, s0 + add a5, a5, a5 + addiw a5, a5, -1 + and a2, a2, a5 + add a1, a1, a2 + li a2, 270 + mulw a1, a1, a2 + addi s1, s1, -1 + and t3, s1, a1 + addi t4, t4, -1 + and t4, t4, t3 + addiw a2, t2, -270 + addi a5, t6, %pcrel_lo(.Lpcrel_hi1) + blt t4, a2, .LBB2_6 +# %bb.5: # %entry + mv t4, a2 +.LBB2_6: # %entry + ld s4, 16(a5) + lw s3, 0(t5) + ld a1, 0(a3) + lw s6, 16(t5) + lw s5, 24(t5) + or a1, s2, a1 + mv s0, s4 + beqz a1, .LBB2_10 +# %bb.7: # %after_bb + ld a1, 0(a5) + ld s8, 16(a0) + or a1, a1, s0 + beqz a1, .LBB2_11 +.LBB2_8: # %after_bb1 + beqz s8, .LBB2_12 +.LBB2_9: + li a0, 0 + j .LBB2_18 +.LBB2_10: # %then_bb + lui a1, 128 + addiw a1, a1, 9 + slli a1, a1, 13 + sd a1, 32(a3) + ld a1, 40(a3) + sd zero, 0(a3) + sd zero, 8(a3) + sd zero, 16(a3) + sw zero, 0(a1) + li a2, 300 + sw a2, 4(a1) + li a2, 1 + sw a2, 8(a1) + sw zero, 12(a1) + ld s0, 16(a5) + sd zero, 24(a3) + ld a1, 0(a5) + ld s8, 16(a0) + or a1, a1, s0 + bnez a1, .LBB2_8 +.LBB2_11: # %then_bb2 + sd zero, 16(a5) + sd zero, 8(a5) + sd zero, 0(a5) + lui a1, 128 + addiw a1, a1, 9 + ld a2, 40(a5) + slli a1, a1, 13 + addi a1, a1, 2 + sd a1, 32(a5) + sw zero, 0(a2) + li a1, 100 + sw a1, 4(a2) + li a1, 1 + sw a1, 8(a2) + sw zero, 12(a2) + sd zero, 24(a5) + bnez s8, .LBB2_9 +.LBB2_12: # %_halide_buffer_is_bounds_query.exit31 + ld a1, 0(a0) + bnez a1, .LBB2_17 +# %bb.13: # %then_bb5 + srai a1, a6, 31 + subw a2, a6, a1 + mul s1, a2, t1 + srli s1, s1, 32 + add a2, a2, s1 + srliw s1, a2, 31 + sraiw a2, a2, 8 + addw a2, a2, s1 + add a1, a1, a2 + sgtz a2, a1 + negw a2, a2 + and a1, a1, a2 + mulw a1, a1, a7 + subw a1, a1, t0 + add t0, t0, s3 + addiw t0, t0, -480 + addiw a6, a1, 960 + add a7, t4, s6 + bgtz a4, .LBB2_15 +# %bb.14: # %then_bb5 + li s0, 270 + j .LBB2_16 +.LBB2_15: + addi s0, t3, 270 +.LBB2_16: # %then_bb5 + subw a1, s0, t4 + sd zero, 16(a0) + sd zero, 8(a0) + sd zero, 0(a0) + lui a2, 256 + addiw a2, a2, 9 + slli a2, a2, 13 + addi a2, a2, 2 + sd a2, 32(a0) + sw t0, 0(t5) + sw a6, 4(t5) + li a2, 1 + sw a2, 8(t5) + sw zero, 12(t5) + sw a7, 16(t5) + sw a1, 20(t5) + sw a6, 24(t5) + sw zero, 28(t5) + sd zero, 24(a0) +.LBB2_17: # %land.rhs.i44 + ld a0, 0(a0) + seqz a0, a0 +.LBB2_18: # %_halide_buffer_is_bounds_query.exit45 + ld a1, 16(a3) + ld a2, 0(a3) + ld a3, 16(a5) + ld a5, 0(a5) + or a1, a1, a2 + seqz a1, a1 + or a3, a3, a5 + seqz a2, a3 + or a1, a1, a2 + or a0, a0, a1 + bnez a0, .LBB2_20 +# %bb.19: # %then_bb8 + mulw a0, s6, s5 + negw s0, a0 + sd s8, 8(sp) + sw a4, 16(sp) + sw t2, 20(sp) + sw s6, 24(sp) + sw s5, 28(sp) + sw s9, 32(sp) + sw s0, 36(sp) +.Lpcrel_hi2: + auipc a0, %got_pcrel_hi(idw_halide_parallel__par_for_idw_halide_parallel__s0_x_x_outer_tile_index__1) + ld a1, %pcrel_lo(.Lpcrel_hi2)(a0) + addi a4, sp, 8 + li a0, 0 + li a2, 0 + mv a3, s7 + call halide_do_par_for@plt + sd s2, 40(sp) + sd s4, 48(sp) + sd s8, 56(sp) + sw s3, 64(sp) + sw s6, 68(sp) + sw s5, 72(sp) + sw s9, 76(sp) + sw s0, 80(sp) +.Lpcrel_hi3: + auipc a0, %got_pcrel_hi(idw_halide_parallel__par_for_idw_halide_parallel__s1_x_x_outer_tile_index__1) + ld a1, %pcrel_lo(.Lpcrel_hi3)(a0) + addi a4, sp, 40 + li a0, 0 + li a2, 0 + mv a3, s7 + call halide_do_par_for@plt +.LBB2_20: # %destructor_block + li a0, 0 + ld ra, 168(sp) # 8-byte Folded Reload + ld s0, 160(sp) # 8-byte Folded Reload + ld s1, 152(sp) # 8-byte Folded Reload + ld s2, 144(sp) # 8-byte Folded Reload + ld s3, 136(sp) # 8-byte Folded Reload + ld s4, 128(sp) # 8-byte Folded Reload + ld s5, 120(sp) # 8-byte Folded Reload + ld s6, 112(sp) # 8-byte Folded Reload + ld s7, 104(sp) # 8-byte Folded Reload + ld s8, 96(sp) # 8-byte Folded Reload + ld s9, 88(sp) # 8-byte Folded Reload + addi sp, sp, 176 + ret +.Lfunc_end2: + .size idw_halide_parallel_, .Lfunc_end2-idw_halide_parallel_ + # -- End function + .section .text.idw_halide_parallel__argv,"ax",@progbits + .globl idw_halide_parallel__argv # -- Begin function idw_halide_parallel__argv + .p2align 1 + .type idw_halide_parallel__argv,@function +idw_halide_parallel__argv: # @idw_halide_parallel__argv +# %bb.0: # %entry + addi sp, sp, -16 + sd ra, 8(sp) # 8-byte Folded Spill + ld a0, 0(a0) + call idw_halide_parallel_@plt + li a0, 0 + ld ra, 8(sp) # 8-byte Folded Reload + addi sp, sp, 16 + ret +.Lfunc_end3: + .size idw_halide_parallel__argv, .Lfunc_end3-idw_halide_parallel__argv + # -- End function + .section .text.idw_halide_parallel__metadata,"ax",@progbits + .globl idw_halide_parallel__metadata # -- Begin function idw_halide_parallel__metadata + .p2align 1 + .type idw_halide_parallel__metadata,@function +idw_halide_parallel__metadata: # @idw_halide_parallel__metadata +# %bb.0: # %entry +.Lpcrel_hi4: + auipc a0, %pcrel_hi(.Lidw_halide_parallel__metadata_storage) + addi a0, a0, %pcrel_lo(.Lpcrel_hi4) + ret +.Lfunc_end4: + .size idw_halide_parallel__metadata, .Lfunc_end4-idw_halide_parallel__metadata + # -- End function + .type .Lb24.shape,@object # @b24.shape + .section .rodata,"a",@progbits + .p2align 5, 0x0 +.Lb24.shape: + .asciz "\000\000\000\000,\001\000\000\001\000\000\000\000\000\000" + .size .Lb24.shape, 16 + + .type .Lb24.data,@object # @b24.data + .p2align 5, 0x0 +.Lb24.data: + .asciz "\000\000\000\000\000\000\000\000H\000\000\000\000\000\000\000\325\000\000\000O\000\000\000\000\000\000\000\252\001\000\000<\000\000\000\000\000\000\000\200\002\000\000L\000\000\000\000\000\000\000U\003\000\000\200\000\000\000\000\000\000\000*\004\000\000C\000\000\000\000\000\000\000\000\005\000\000A\000\000\000\000\000\000\000\325\005\000\000@\000\000\000\000\000\000\000\252\006\000\000<\000\000\000\000\000\000\000\200\007\000\000=\000\000\000x\000\000\000\000\000\000\000Q\000\000\000x\000\000\000\325\000\000\000O\000\000\000x\000\000\000\252\001\000\000:\000\000\000x\000\000\000\200\002\000\000\204\000\000\000x\000\000\000U\003\000\000\225\000\000\000x\000\000\000*\004\000\000\216\000\000\000x\000\000\000\000\005\000\000@\000\000\000x\000\000\000\325\005\000\000E\000\000\000x\000\000\000\252\006\000\000A\000\000\000x\000\000\000\200\007\000\000@\000\000\000\360\000\000\000\000\000\000\000K\000\000\000\360\000\000\000\325\000\000\000D\000\000\000\360\000\000\000\252\001\000\000\214\000\000\000\360\000\000\000\200\002\000\000\231\000\000\000\360\000\000\000U\003\000\000\221\000\000\000\360\000\000\000*\004\000\000\204\000\000\000\360\000\000\000\000\005\000\000\230\000\000\000\360\000\000\000\325\005\000\000}\000\000\000\360\000\000\000\252\006\000\000B\000\000\000\360\000\000\000\200\007\000\000:\000\000\000h\001\000\000\000\000\000\000N\000\000\000h\001\000\000\325\000\000\000<\000\000\000h\001\000\000\252\001\000\000\213\000\000\000h\001\000\000\200\002\000\000\250\000\000\000h\001\000\000U\003\000\000\232\000\000\000h\001\000\000*\004\000\000\212\000\000\000h\001\000\000\000\005\000\000\221\000\000\000h\001\000\000\325\005\000\000\240\000\000\000h\001\000\000\252\006\000\000D\000\000\000h\001\000\000\200\007\000\000<\000\000\000\340\001\000\000\000\000\000\000M\000\000\000\340\001\000\000\325\000\000\000;\000\000\000\340\001\000\000\252\001\000\000\245\000\000\000\340\001\000\000\200\002\000\000\267\000\000\000\340\001\000\000U\003\000\000\246\000\000\000\340\001\000\000*\004\000\000\216\000\000\000\340\001\000\000\000\005\000\000{\000\000\000\340\001\000\000\325\005\000\000\233\000\000\000\340\001\000\000\252\006\000\000\220\000\000\000\340\001\000\000\200\007\000\000<\000\000\000X\002\000\000\000\000\000\000S\000\000\000X\002\000\000\325\000\000\000A\000\000\000X\002\000\000\252\001\000\000\262\000\000\000X\002\000\000\200\002\000\000\270\000\000\000X\002\000\000U\003\000\000\212\000\000\000X\002\000\000*\004\000\000|\000\000\000X\002\000\000\000\005\000\000\204\000\000\000X\002\000\000\325\005\000\000\257\000\000\000X\002\000\000\252\006\000\000\257\000\000\000X\002\000\000\200\007\000\000<\000\000\000\320\002\000\000\000\000\000\000V\000\000\000\320\002\000\000\325\000\000\000<\000\000\000\320\002\000\000\252\001\000\000\266\000\000\000\320\002\000\000\200\002\000\000\263\000\000\000\320\002\000\000U\003\000\000\230\000\000\000\320\002\000\000*\004\000\000\214\000\000\000\320\002\000\000\000\005\000\000s\000\000\000\320\002\000\000\325\005\000\000\234\000\000\000\320\002\000\000\252\006\000\000\254\000\000\000\320\002\000\000\200\007\000\000A\000\000\000H\003\000\000\000\000\000\000Z\000\000\000H\003\000\000\325\000\000\000E\000\000\000H\003\000\000\252\001\000\000\244\000\000\000H\003\000\000\200\002\000\000\272\000\000\000H\003\000\000U\003\000\000\222\000\000\000H\003\000\000*\004\000\000\223\000\000\000H\003\000\000\000\005\000\000z\000\000\000H\003\000\000\325\005\000\000\230\000\000\000H\003\000\000\252\006\000\000\246\000\000\000H\003\000\000\200\007\000\000E\000\000\000\300\003\000\000\000\000\000\000U\000\000\000\300\003\000\000\325\000\000\000C\000\000\000\300\003\000\000\252\001\000\000\272\000\000\000\300\003\000\000\200\002\000\000\251\000\000\000\300\003\000\000U\003\000\000\242\000\000\000\300\003\000\000*\004\000\000\223\000\000\000\300\003\000\000\000\005\000\000|\000\000\000\300\003\000\000\325\005\000\000\225\000\000\000\300\003\000\000\252\006\000\000\235\000\000\000\300\003\000\000\200\007\000\000E\000\000\0008\004\000\000\000\000\000\000U\000\000\0008\004\000\000\325\000\000\000F\000\000\0008\004\000\000\252\001\000\000\301\000\000\0008\004\000\000\200\002\000\000\300\000\000\0008\004\000\000U\003\000\000\233\000\000\0008\004\000\000*\004\000\000\207\000\000\0008\004\000\000\000\005\000\000x\000\000\0008\004\000\000\325\005\000\000\212\000\000\0008\004\000\000\252\006\000\000\200\000\000\0008\004\000\000\200\007\000\000F\000\000" + .size .Lb24.data, 1200 + + .type .Lb24.buffer,@object # @b24.buffer + .data + .p2align 4, 0x0 +.Lb24.buffer: + .quad 0 # 0x0 + .quad 0 + .quad .Lb24.data + .quad 1 # 0x1 + .byte 0 # 0x0 + .byte 32 # 0x20 + .half 1 # 0x1 + .word 1 # 0x1 + .quad .Lb24.shape + .quad 0 + .size .Lb24.buffer, 56 + + .type .Lb25.shape,@object # @b25.shape + .section .rodata,"a",@progbits + .p2align 5, 0x0 +.Lb25.shape: + .asciz "\000\000\000\000d\000\000\000\001\000\000\000\000\000\000" + .size .Lb25.shape, 16 + + .type .Lb25.data,@object # @b25.data + .p2align 5, 0x0 +.Lb25.data: + .ascii "\211\004&=\244\022\233\275\217U%;\272)J>#\026\024\276@_\234>\340\243|\275\334\034K\275\232\300\004:A\313\245\274\303\024r\275\345\313a\275x\231\300>\201\334\"\276\236}\321\275H\346\257\276\202V\310>\225\375(>},\026\274E\235W\275\241\236\236<\245\220\222=\004\361\263\276\2620\240\274\265\036\240=\\:\336=\240\016\263\276\260\224\371\275p\235(=\311\\x;\226y\\\275\360\227\377=\275\364\200=rw/<@U\017=c\327\003=nV\267\273\3543\207\311\274<&{\274h\b\031>\203\222\003\276\213\3649\275d\177\017\276\330\306\246\275'\314:>\203\024\253=S\331M\276\257\263S=b\221q\275\336\t\306=#^\005\2763)\256\274\344\255G>\224!\374=\374\360o\275\215#\031\276\322`J\276y6\360=\273\303;\275N\"N>M\022K\276\016\035C=1Fb\275u\017\220\275\0040\033>,\037\223=t\335\304\275\363\344\271=\320\341\260\275\2265\255=\310\"b=&\031\330\275;\314\000>>\347o\275\255\222\020=C\rU<\370d\307\275\373\306\206=I\345\373\274Y|\034>WY;\276O\240?>B\347\226\275\203\271\203\275W\243\313<\214\364\343\274Ey\033\276J4\214=\261\304\241\274-\271p>X\313l\276*_\366\275\260^\\=\237l1=\371\272\200=\002\373?\274*\3578<]Gv=" + .size .Lb25.data, 400 + + .type .Lb25.buffer,@object # @b25.buffer + .data + .p2align 4, 0x0 +.Lb25.buffer: + .quad 0 # 0x0 + .quad 0 + .quad .Lb25.data + .quad 1 # 0x1 + .byte 2 # 0x2 + .byte 32 # 0x20 + .half 1 # 0x1 + .word 1 # 0x1 + .quad .Lb25.shape + .quad 0 + .size .Lb25.buffer, 56 + + .type .L__unnamed_1,@object # @0 + .section .rodata,"a",@progbits + .p2align 4, 0x0 +.L__unnamed_1: + .zero 32 + .size .L__unnamed_1, 32 + + .type .Lstr,@object # @str + .p2align 5, 0x0 +.Lstr: + .asciz "idw_halide_parallel_" + .size .Lstr, 21 + + .type .L__unnamed_2,@object # @1 + .section .data.rel.ro,"aw",@progbits + .p2align 4, 0x0 +.L__unnamed_2: + .quad .Lstr + .word 2 # 0x2 + .word 2 # 0x2 + .byte 2 # 0x2 + .byte 32 # 0x20 + .half 1 # 0x1 + .zero 4 + .quad 0 + .quad 0 + .quad 0 + .quad 0 + .quad .L__unnamed_1 + .size .L__unnamed_2, 64 + + .type .Lstr.4,@object # @str.4 + .section .rodata,"a",@progbits + .p2align 5, 0x0 +.Lstr.4: + .asciz "riscv-64-linux-no_asserts-no_runtime-rvv-vector_bits_128" + .size .Lstr.4, 57 + + .type .Lidw_halide_parallel__metadata_storage,@object # @idw_halide_parallel__metadata_storage + .section .data.rel.ro,"aw",@progbits + .p2align 4, 0x0 +.Lidw_halide_parallel__metadata_storage: + .word 1 # 0x1 + .word 1 # 0x1 + .quad .L__unnamed_2 + .quad .Lstr.4 + .quad .Lstr + .size .Lidw_halide_parallel__metadata_storage, 32 + + .ident "clang version 16.0.0 (https://github.com/dkurt/llvm-rvv-071 b027aa1b59c9f53240bdc836f39656723fdf9df0)" + .ident "clang version 16.0.0 (https://github.com/dkurt/llvm-rvv-071 b027aa1b59c9f53240bdc836f39656723fdf9df0)" + .ident "clang version 16.0.0 (https://github.com/dkurt/llvm-rvv-071 b027aa1b59c9f53240bdc836f39656723fdf9df0)" + .ident "clang version 16.0.0 (https://github.com/dkurt/llvm-rvv-071 b027aa1b59c9f53240bdc836f39656723fdf9df0)" + .ident "clang version 16.0.0 (https://github.com/dkurt/llvm-rvv-071 b027aa1b59c9f53240bdc836f39656723fdf9df0)" + .ident "clang version 16.0.0 (https://github.com/dkurt/llvm-rvv-071 b027aa1b59c9f53240bdc836f39656723fdf9df0)" + .section ".note.GNU-stack","",@progbits diff --git a/aot/idw_halide_parallel_vec.h b/aot/idw_halide_parallel_vec.h new file mode 100644 index 0000000..156229d --- /dev/null +++ b/aot/idw_halide_parallel_vec.h @@ -0,0 +1,56 @@ +#ifndef HALIDE__idw_halide_parallel_vec_h +#define HALIDE__idw_halide_parallel_vec_h +#include + +// Forward declarations of the types used in the interface +// to the Halide pipeline. +// +// For the definitions of these structs, include HalideRuntime.h + +// Halide's representation of a multi-dimensional array. +// Halide::Runtime::Buffer is a more user-friendly wrapper +// around this. Its declaration is in HalideBuffer.h +struct halide_buffer_t; + +// Metadata describing the arguments to the generated function. +// Used to construct calls to the _argv version of the function. +struct halide_filter_metadata_t; + +#ifndef HALIDE_MUST_USE_RESULT +#ifdef __has_attribute +#if __has_attribute(nodiscard) +#define HALIDE_MUST_USE_RESULT [[nodiscard]] +#elif __has_attribute(warn_unused_result) +#define HALIDE_MUST_USE_RESULT __attribute__((warn_unused_result)) +#else +#define HALIDE_MUST_USE_RESULT +#endif +#else +#define HALIDE_MUST_USE_RESULT +#endif +#endif + +#ifndef HALIDE_FUNCTION_ATTRS +#define HALIDE_FUNCTION_ATTRS +#endif + + + +#ifdef __cplusplus +extern "C" { +#endif + +HALIDE_FUNCTION_ATTRS +int idw_halide_parallel_vec_(struct halide_buffer_t *_idw_halide_parallel_vec__buffer); + +HALIDE_FUNCTION_ATTRS +int idw_halide_parallel_vec__argv(void **args); + +HALIDE_FUNCTION_ATTRS +const struct halide_filter_metadata_t *idw_halide_parallel_vec__metadata(); + +#ifdef __cplusplus +} // extern "C" +#endif + +#endif diff --git a/aot/idw_halide_parallel_vec.s b/aot/idw_halide_parallel_vec.s new file mode 100644 index 0000000..484d582 --- /dev/null +++ b/aot/idw_halide_parallel_vec.s @@ -0,0 +1,739 @@ + .text + .attribute 4, 16 + .attribute 5, "rv64gcv0p7" + .file "halide_buffer_t.cpp" + .section .text.idw_halide_parallel_vec__par_for_idw_halide_parallel_vec__s0_x_x_outer_tile_index__1,"ax",@progbits + .globl idw_halide_parallel_vec__par_for_idw_halide_parallel_vec__s0_x_x_outer_tile_index__1 # -- Begin function idw_halide_parallel_vec__par_for_idw_halide_parallel_vec__s0_x_x_outer_tile_index__1 + .p2align 1 + .type idw_halide_parallel_vec__par_for_idw_halide_parallel_vec__s0_x_x_outer_tile_index__1,@function +idw_halide_parallel_vec__par_for_idw_halide_parallel_vec__s0_x_x_outer_tile_index__1: # @idw_halide_parallel_vec__par_for_idw_halide_parallel_vec__s0_x_x_outer_tile_index__1 +# %bb.0: # %entry + addi sp, sp, -48 + sd ra, 40(sp) # 8-byte Folded Spill + sd s0, 32(sp) # 8-byte Folded Spill + sd s1, 24(sp) # 8-byte Folded Spill + sd s2, 16(sp) # 8-byte Folded Spill + sd s3, 8(sp) # 8-byte Folded Spill + lw a4, 24(a2) + lw a7, 8(a2) + lw a6, 12(a2) + sraiw a3, a1, 31 + seqz a5, a4 + negw a0, a5 + srai s1, a4, 31 + subw t0, a1, a3 + or a0, a0, a4 + remw a1, t0, a0 + xor s0, s1, a4 + not a0, s1 + add s0, s0, a0 + and s0, s0, a3 + add a1, a1, s0 + li s0, 480 + mulw a1, a1, s0 + addi s0, a5, -1 + and t2, s0, a1 + addiw a1, a7, -480 + blt t2, a1, .LBB0_2 +# %bb.1: # %entry + mv t2, a1 +.LBB0_2: # %entry + lw a7, 16(a2) + lw s2, 20(a2) + lw t1, 28(a2) + add a4, a4, a5 + divw a1, t0, a4 + subw a0, a0, s1 + and a0, a0, a3 + add a0, a0, a1 + li s1, 270 + mulw a0, a0, s1 + and a0, a0, s0 + addiw a1, a6, -270 + blt a0, a1, .LBB0_4 +# %bb.3: # %entry + mv a0, a1 +.LBB0_4: # %entry + ld s3, 0(a2) + add a0, a0, a7 + add t1, t1, t2 + mulw a0, s2, a0 + addw s0, t1, a0 +.LBB0_5: # %"for idw_halide_parallel_vec_.s0.y.y_inner" + # =>This Inner Loop Header: Depth=1 + slli a0, s0, 2 + add a0, a0, s3 + li a2, 1920 + li a1, 0 + call memset@plt + addi s1, s1, -1 + addw s0, s0, s2 + bnez s1, .LBB0_5 +# %bb.6: # %destructor_block + li a0, 0 + ld ra, 40(sp) # 8-byte Folded Reload + ld s0, 32(sp) # 8-byte Folded Reload + ld s1, 24(sp) # 8-byte Folded Reload + ld s2, 16(sp) # 8-byte Folded Reload + ld s3, 8(sp) # 8-byte Folded Reload + addi sp, sp, 48 + ret +.Lfunc_end0: + .size idw_halide_parallel_vec__par_for_idw_halide_parallel_vec__s0_x_x_outer_tile_index__1, .Lfunc_end0-idw_halide_parallel_vec__par_for_idw_halide_parallel_vec__s0_x_x_outer_tile_index__1 + # -- End function + .section .text.idw_halide_parallel_vec__par_for_idw_halide_parallel_vec__s1_x_x_outer_tile_index__1,"ax",@progbits + .globl idw_halide_parallel_vec__par_for_idw_halide_parallel_vec__s1_x_x_outer_tile_index__1 # -- Begin function idw_halide_parallel_vec__par_for_idw_halide_parallel_vec__s1_x_x_outer_tile_index__1 + .p2align 1 + .type idw_halide_parallel_vec__par_for_idw_halide_parallel_vec__s1_x_x_outer_tile_index__1,@function +idw_halide_parallel_vec__par_for_idw_halide_parallel_vec__s1_x_x_outer_tile_index__1: # @idw_halide_parallel_vec__par_for_idw_halide_parallel_vec__s1_x_x_outer_tile_index__1 +# %bb.0: # %entry + addi sp, sp, -208 + sd ra, 200(sp) # 8-byte Folded Spill + sd s0, 192(sp) # 8-byte Folded Spill + sd s1, 184(sp) # 8-byte Folded Spill + sd s2, 176(sp) # 8-byte Folded Spill + sd s3, 168(sp) # 8-byte Folded Spill + sd s4, 160(sp) # 8-byte Folded Spill + sd s5, 152(sp) # 8-byte Folded Spill + sd s6, 144(sp) # 8-byte Folded Spill + sd s7, 136(sp) # 8-byte Folded Spill + sd s8, 128(sp) # 8-byte Folded Spill + sd s9, 120(sp) # 8-byte Folded Spill + sd s10, 112(sp) # 8-byte Folded Spill + sd s11, 104(sp) # 8-byte Folded Spill + li a3, 0 + ld t4, 0(a2) + ld t0, 8(a2) + ld a0, 16(a2) + sd a0, 48(sp) # 8-byte Folded Spill + lw a0, 24(a2) + sd a0, 40(sp) # 8-byte Folded Spill + lw a7, 28(a2) + lw a4, 36(a2) + lw a0, 32(a2) + sd a0, 8(sp) # 8-byte Folded Spill + lw a0, 40(a2) + sd a0, 32(sp) # 8-byte Folded Spill + sraiw a2, a1, 31 + seqz a5, a4 + negw s1, a5 + srai s0, a4, 31 + subw a1, a1, a2 + or s1, s1, a4 + remw t1, a1, s1 + xor a0, s0, a4 + not s1, s0 + add a0, a0, s1 + and a0, a0, a2 + add a0, a0, t1 + add a4, a4, a5 + divw a1, a1, a4 + subw s1, s1, s0 + and a2, a2, s1 + add a1, a1, a2 + li a2, 480 + mulw a0, a0, a2 + addi a5, a5, -1 + and a0, a0, a5 + sd a0, 24(sp) # 8-byte Folded Spill + li a0, 270 + mulw a0, a1, a0 + and a0, a0, a5 + add a0, a0, a7 + sd a0, 0(sp) # 8-byte Folded Spill + li s2, 48 + li a0, 6 + li a2, 1 + addi t5, sp, 80 + addi s3, sp, 92 + addi s4, sp, 88 + addi s5, sp, 84 + li s0, 4 + addi t6, sp, 64 + addi s6, sp, 68 + addi s7, sp, 76 + addi s8, sp, 72 + li s9, 2 + li s10, 3 + li ra, 25 +.LBB1_1: # %"for idw_halide_parallel_vec_.s1.y.y_inner" + # =>This Loop Header: Depth=1 + # Child Loop BB1_2 Depth 2 + # Child Loop BB1_3 Depth 3 + # Child Loop BB1_4 Depth 4 + # Child Loop BB1_9 Depth 5 + li s11, 0 + sd a3, 16(sp) # 8-byte Folded Spill + ld a1, 0(sp) # 8-byte Folded Reload + addw t1, a1, a3 + ld a1, 8(sp) # 8-byte Folded Reload + mulw a1, t1, a1 + sd a1, 56(sp) # 8-byte Folded Spill +.LBB1_2: # %"for idw_halide_parallel_vec_.s1.x.x_inner" + # Parent Loop BB1_1 Depth=1 + # => This Loop Header: Depth=2 + # Child Loop BB1_3 Depth 3 + # Child Loop BB1_4 Depth 4 + # Child Loop BB1_9 Depth 5 + li a7, 0 + ld t2, 24(sp) # 8-byte Folded Reload + add t2, t2, s11 + ld a4, 32(sp) # 8-byte Folded Reload + add a4, a4, t2 + ld a1, 56(sp) # 8-byte Folded Reload + addw a4, a4, a1 + ld a1, 40(sp) # 8-byte Folded Reload + addw t2, t2, a1 + slli a4, a4, 2 + ld t3, 48(sp) # 8-byte Folded Reload + add t3, t3, a4 +.LBB1_3: # %"for idw_halide_parallel_vec_.s1.r52$x.r52$x" + # Parent Loop BB1_1 Depth=1 + # Parent Loop BB1_2 Depth=2 + # => This Loop Header: Depth=3 + # Child Loop BB1_4 Depth 4 + # Child Loop BB1_9 Depth 5 + mul a4, a7, s2 + add a4, a4, t4 + addi a6, a4, 4 + vsetvli zero, a0, e32, m2 + vlwu.v v8, (a6) + vsetvli zero, a2, e32, m2 + vslidedown.vi v10, v8, 3 + addi a3, a4, 20 + vsetvli zero, a0, e32, m2 + vlwu.v v12, (a3) + vsetvli zero, a2, e32, m2 + vslidedown.vi v14, v12, 2 + vslidedown.vi v12, v12, 5 + vsw.v v8, (t5) + vsw.v v12, (s3) + vsw.v v14, (s4) + vsw.v v10, (s5) + vsetvli zero, s0, e32, m1 + vlwu.v v8, (t5) + vsetvli a3, zero, e32, m1 + vrsub.vx v8, v8, t2 + vsetvli zero, a0, e32, m2 + vlwu.v v10, (a4) + vsetvli zero, a2, e32, m2 + vslidedown.vi v12, v10, 3 + addi a4, a4, 16 + vsetvli zero, a0, e32, m2 + vlwu.v v14, (a4) + vsetvli zero, a2, e32, m2 + vslidedown.vi v16, v14, 2 + vslidedown.vi v14, v14, 5 + vsw.v v10, (t6) + vsw.v v12, (s6) + vsw.v v14, (s7) + vsw.v v16, (s8) + vsetvli zero, s0, e32, m1 + vlwu.v v9, (t6) + vsetvli a3, zero, e32, m1 + lw a3, 0(t3) + vrsub.vx v9, v9, t1 + fmv.w.x ft0, a3 + vmul.vv v9, v9, v9 + vmacc.vv v9, v8, v8 + vfcvt.f.x.v v9, v9 + vfmv.f.s ft1, v9 + fsqrt.s ft1, ft1 + vfmv.s.f v8, ft1 + vsetvli zero, a2, e32, m1 + vslidedown.vi v10, v9, 1 + vfmv.f.s ft1, v10 + fsqrt.s ft1, ft1 + vsetvli a3, zero, e32, m1 + vfmv.s.f v10, ft1 + vsetvli zero, s9, e32, m1 + vslideup.vi v8, v10, 1 + vsetvli zero, a2, e32, m1 + vslidedown.vi v10, v9, 2 + vfmv.f.s ft1, v10 + fsqrt.s ft1, ft1 + vsetvli a3, zero, e32, m1 + vfmv.s.f v10, ft1 + vsetvli zero, s10, e32, m1 + vslideup.vi v8, v10, 2 + vsetvli zero, a2, e32, m1 + vslidedown.vi v9, v9, 3 + vfmv.f.s ft1, v9 + fsqrt.s ft1, ft1 + vsetvli a3, zero, e32, m1 + vfmv.s.f v9, ft1 + vsetvli zero, s0, e32, m1 + vslideup.vi v8, v9, 3 + slli a4, a7, 4 + add a4, a4, t0 +.LBB1_4: # %casloop.start + # Parent Loop BB1_1 Depth=1 + # Parent Loop BB1_2 Depth=2 + # Parent Loop BB1_3 Depth=3 + # => This Loop Header: Depth=4 + # Child Loop BB1_9 Depth 5 + flw ft1, 0(t3) + vsetvli zero, zero, e32, m1 + vlwu.v v9, (a4) + vfmul.vv v9, v8, v9 + vfmv.s.f v10, ft1 + vfredsum.vs v9, v9, v10 + vfmv.f.s ft1, v9 + fmv.x.w a3, ft1 + fmv.x.w a1, ft0 +.LBB1_9: # %casloop.start + # Parent Loop BB1_1 Depth=1 + # Parent Loop BB1_2 Depth=2 + # Parent Loop BB1_3 Depth=3 + # Parent Loop BB1_4 Depth=4 + # => This Inner Loop Header: Depth=5 + lr.w s1, (t3) + bne s1, a1, .LBB1_11 +# %bb.10: # %casloop.start + # in Loop: Header=BB1_9 Depth=5 + sc.w a5, a3, (t3) + bnez a5, .LBB1_9 +.LBB1_11: # %casloop.start + # in Loop: Header=BB1_4 Depth=4 + fmv.w.x ft0, s1 + bne s1, a1, .LBB1_4 +# %bb.5: # %casloop.end + # in Loop: Header=BB1_3 Depth=3 + addi a7, a7, 1 + bne a7, ra, .LBB1_3 +# %bb.6: # %"end for idw_halide_parallel_vec_.s1.r52$x.r52$x" + # in Loop: Header=BB1_2 Depth=2 + addi s11, s11, 1 + li a1, 480 + bne s11, a1, .LBB1_2 +# %bb.7: # %"end for idw_halide_parallel_vec_.s1.x.x_inner" + # in Loop: Header=BB1_1 Depth=1 + ld a3, 16(sp) # 8-byte Folded Reload + addiw a3, a3, 1 + li a1, 270 + bne a3, a1, .LBB1_1 +# %bb.8: # %destructor_block + li a0, 0 + ld ra, 200(sp) # 8-byte Folded Reload + ld s0, 192(sp) # 8-byte Folded Reload + ld s1, 184(sp) # 8-byte Folded Reload + ld s2, 176(sp) # 8-byte Folded Reload + ld s3, 168(sp) # 8-byte Folded Reload + ld s4, 160(sp) # 8-byte Folded Reload + ld s5, 152(sp) # 8-byte Folded Reload + ld s6, 144(sp) # 8-byte Folded Reload + ld s7, 136(sp) # 8-byte Folded Reload + ld s8, 128(sp) # 8-byte Folded Reload + ld s9, 120(sp) # 8-byte Folded Reload + ld s10, 112(sp) # 8-byte Folded Reload + ld s11, 104(sp) # 8-byte Folded Reload + addi sp, sp, 208 + ret +.Lfunc_end1: + .size idw_halide_parallel_vec__par_for_idw_halide_parallel_vec__s1_x_x_outer_tile_index__1, .Lfunc_end1-idw_halide_parallel_vec__par_for_idw_halide_parallel_vec__s1_x_x_outer_tile_index__1 + # -- End function + .section .text.idw_halide_parallel_vec_,"ax",@progbits + .globl idw_halide_parallel_vec_ # -- Begin function idw_halide_parallel_vec_ + .p2align 1 + .type idw_halide_parallel_vec_,@function +idw_halide_parallel_vec_: # @idw_halide_parallel_vec_ +# %bb.0: # %entry + addi sp, sp, -176 + sd ra, 168(sp) # 8-byte Folded Spill + sd s0, 160(sp) # 8-byte Folded Spill + sd s1, 152(sp) # 8-byte Folded Spill + sd s2, 144(sp) # 8-byte Folded Spill + sd s3, 136(sp) # 8-byte Folded Spill + sd s4, 128(sp) # 8-byte Folded Spill + sd s5, 120(sp) # 8-byte Folded Spill + sd s6, 112(sp) # 8-byte Folded Spill + sd s7, 104(sp) # 8-byte Folded Spill + sd s8, 96(sp) # 8-byte Folded Spill + sd s9, 88(sp) # 8-byte Folded Spill + ld t5, 40(a0) + lw a4, 4(t5) + lw t2, 20(t5) + li a2, -480 + subw a6, a2, a4 + addiw a2, a4, -1 + blt a2, a6, .LBB2_2 +# %bb.1: # %entry + mv a6, a2 +.LBB2_2: # %entry +.Lpcrel_hi0: + auipc a2, %pcrel_hi(.Lb34.buffer) + addi a3, a2, %pcrel_lo(.Lpcrel_hi0) + sgtz t4, a4 + slti a5, a4, -479 + slti s0, t2, -269 + lui s1, 559241 + li a7, 480 + addiw t1, s1, -1911 + mv t0, a4 + blt a4, a7, .LBB2_4 +# %bb.3: # %entry + li t0, 480 +.LBB2_4: # %entry +.Lpcrel_hi1: + auipc t6, %pcrel_hi(.Lb35.buffer) + ld s2, 16(a3) + add a1, a4, a5 + addiw a1, a1, 479 + mul a2, a1, t1 + srli a2, a2, 32 + add a1, a1, a2 + srliw a2, a1, 31 + sraiw a1, a1, 8 + add a1, a1, a2 + subw s9, a1, a5 + add a1, t2, s0 + addiw a1, a1, 269 + lui a2, 994205 + addiw a2, a2, 1609 + mul a2, a1, a2 + srli a2, a2, 32 + add a1, a1, a2 + srliw a2, a1, 31 + sraiw a1, a1, 8 + add a1, a1, a2 + subw a1, a1, s0 + mulw s7, a1, s9 + slti a1, s7, 1 + negw a2, a1 + seqz s1, s9 + add a1, a1, s7 + addiw a1, a1, -1 + add s0, s9, s1 + divw a1, a1, s0 + add a5, a5, a5 + addiw a5, a5, -1 + and a2, a2, a5 + add a1, a1, a2 + li a2, 270 + mulw a1, a1, a2 + addi s1, s1, -1 + and t3, s1, a1 + addi t4, t4, -1 + and t4, t4, t3 + addiw a2, t2, -270 + addi a5, t6, %pcrel_lo(.Lpcrel_hi1) + blt t4, a2, .LBB2_6 +# %bb.5: # %entry + mv t4, a2 +.LBB2_6: # %entry + ld s4, 16(a5) + lw s3, 0(t5) + ld a1, 0(a3) + lw s6, 16(t5) + lw s5, 24(t5) + or a1, s2, a1 + mv s0, s4 + beqz a1, .LBB2_10 +# %bb.7: # %after_bb + ld a1, 0(a5) + ld s8, 16(a0) + or a1, a1, s0 + beqz a1, .LBB2_11 +.LBB2_8: # %after_bb1 + beqz s8, .LBB2_12 +.LBB2_9: + li a0, 0 + j .LBB2_18 +.LBB2_10: # %then_bb + lui a1, 128 + addiw a1, a1, 9 + slli a1, a1, 13 + sd a1, 32(a3) + ld a1, 40(a3) + sd zero, 0(a3) + sd zero, 8(a3) + sd zero, 16(a3) + sw zero, 0(a1) + li a2, 300 + sw a2, 4(a1) + li a2, 1 + sw a2, 8(a1) + sw zero, 12(a1) + ld s0, 16(a5) + sd zero, 24(a3) + ld a1, 0(a5) + ld s8, 16(a0) + or a1, a1, s0 + bnez a1, .LBB2_8 +.LBB2_11: # %then_bb2 + sd zero, 16(a5) + sd zero, 8(a5) + sd zero, 0(a5) + lui a1, 128 + addiw a1, a1, 9 + ld a2, 40(a5) + slli a1, a1, 13 + addi a1, a1, 2 + sd a1, 32(a5) + sw zero, 0(a2) + li a1, 100 + sw a1, 4(a2) + li a1, 1 + sw a1, 8(a2) + sw zero, 12(a2) + sd zero, 24(a5) + bnez s8, .LBB2_9 +.LBB2_12: # %_halide_buffer_is_bounds_query.exit31 + ld a1, 0(a0) + bnez a1, .LBB2_17 +# %bb.13: # %then_bb5 + srai a1, a6, 31 + subw a2, a6, a1 + mul s1, a2, t1 + srli s1, s1, 32 + add a2, a2, s1 + srliw s1, a2, 31 + sraiw a2, a2, 8 + addw a2, a2, s1 + add a1, a1, a2 + sgtz a2, a1 + negw a2, a2 + and a1, a1, a2 + mulw a1, a1, a7 + subw a1, a1, t0 + add t0, t0, s3 + addiw t0, t0, -480 + addiw a6, a1, 960 + add a7, t4, s6 + bgtz a4, .LBB2_15 +# %bb.14: # %then_bb5 + li s0, 270 + j .LBB2_16 +.LBB2_15: + addi s0, t3, 270 +.LBB2_16: # %then_bb5 + subw a1, s0, t4 + sd zero, 16(a0) + sd zero, 8(a0) + sd zero, 0(a0) + lui a2, 256 + addiw a2, a2, 9 + slli a2, a2, 13 + addi a2, a2, 2 + sd a2, 32(a0) + sw t0, 0(t5) + sw a6, 4(t5) + li a2, 1 + sw a2, 8(t5) + sw zero, 12(t5) + sw a7, 16(t5) + sw a1, 20(t5) + sw a6, 24(t5) + sw zero, 28(t5) + sd zero, 24(a0) +.LBB2_17: # %land.rhs.i44 + ld a0, 0(a0) + seqz a0, a0 +.LBB2_18: # %_halide_buffer_is_bounds_query.exit45 + ld a1, 16(a3) + ld a2, 0(a3) + ld a3, 16(a5) + ld a5, 0(a5) + or a1, a1, a2 + seqz a1, a1 + or a3, a3, a5 + seqz a2, a3 + or a1, a1, a2 + or a0, a0, a1 + bnez a0, .LBB2_20 +# %bb.19: # %then_bb8 + mulw a0, s6, s5 + negw s0, a0 + sd s8, 8(sp) + sw a4, 16(sp) + sw t2, 20(sp) + sw s6, 24(sp) + sw s5, 28(sp) + sw s9, 32(sp) + sw s0, 36(sp) +.Lpcrel_hi2: + auipc a0, %got_pcrel_hi(idw_halide_parallel_vec__par_for_idw_halide_parallel_vec__s0_x_x_outer_tile_index__1) + ld a1, %pcrel_lo(.Lpcrel_hi2)(a0) + addi a4, sp, 8 + li a0, 0 + li a2, 0 + mv a3, s7 + call halide_do_par_for@plt + sd s2, 40(sp) + sd s4, 48(sp) + sd s8, 56(sp) + sw s3, 64(sp) + sw s6, 68(sp) + sw s5, 72(sp) + sw s9, 76(sp) + sw s0, 80(sp) +.Lpcrel_hi3: + auipc a0, %got_pcrel_hi(idw_halide_parallel_vec__par_for_idw_halide_parallel_vec__s1_x_x_outer_tile_index__1) + ld a1, %pcrel_lo(.Lpcrel_hi3)(a0) + addi a4, sp, 40 + li a0, 0 + li a2, 0 + mv a3, s7 + call halide_do_par_for@plt +.LBB2_20: # %destructor_block + li a0, 0 + ld ra, 168(sp) # 8-byte Folded Reload + ld s0, 160(sp) # 8-byte Folded Reload + ld s1, 152(sp) # 8-byte Folded Reload + ld s2, 144(sp) # 8-byte Folded Reload + ld s3, 136(sp) # 8-byte Folded Reload + ld s4, 128(sp) # 8-byte Folded Reload + ld s5, 120(sp) # 8-byte Folded Reload + ld s6, 112(sp) # 8-byte Folded Reload + ld s7, 104(sp) # 8-byte Folded Reload + ld s8, 96(sp) # 8-byte Folded Reload + ld s9, 88(sp) # 8-byte Folded Reload + addi sp, sp, 176 + ret +.Lfunc_end2: + .size idw_halide_parallel_vec_, .Lfunc_end2-idw_halide_parallel_vec_ + # -- End function + .section .text.idw_halide_parallel_vec__argv,"ax",@progbits + .globl idw_halide_parallel_vec__argv # -- Begin function idw_halide_parallel_vec__argv + .p2align 1 + .type idw_halide_parallel_vec__argv,@function +idw_halide_parallel_vec__argv: # @idw_halide_parallel_vec__argv +# %bb.0: # %entry + addi sp, sp, -16 + sd ra, 8(sp) # 8-byte Folded Spill + ld a0, 0(a0) + call idw_halide_parallel_vec_@plt + li a0, 0 + ld ra, 8(sp) # 8-byte Folded Reload + addi sp, sp, 16 + ret +.Lfunc_end3: + .size idw_halide_parallel_vec__argv, .Lfunc_end3-idw_halide_parallel_vec__argv + # -- End function + .section .text.idw_halide_parallel_vec__metadata,"ax",@progbits + .globl idw_halide_parallel_vec__metadata # -- Begin function idw_halide_parallel_vec__metadata + .p2align 1 + .type idw_halide_parallel_vec__metadata,@function +idw_halide_parallel_vec__metadata: # @idw_halide_parallel_vec__metadata +# %bb.0: # %entry +.Lpcrel_hi4: + auipc a0, %pcrel_hi(.Lidw_halide_parallel_vec__metadata_storage) + addi a0, a0, %pcrel_lo(.Lpcrel_hi4) + ret +.Lfunc_end4: + .size idw_halide_parallel_vec__metadata, .Lfunc_end4-idw_halide_parallel_vec__metadata + # -- End function + .type .Lb34.shape,@object # @b34.shape + .section .rodata,"a",@progbits + .p2align 5, 0x0 +.Lb34.shape: + .asciz "\000\000\000\000,\001\000\000\001\000\000\000\000\000\000" + .size .Lb34.shape, 16 + + .type .Lb34.data,@object # @b34.data + .p2align 5, 0x0 +.Lb34.data: + .asciz "\000\000\000\000\000\000\000\000H\000\000\000\000\000\000\000\325\000\000\000O\000\000\000\000\000\000\000\252\001\000\000<\000\000\000\000\000\000\000\200\002\000\000L\000\000\000\000\000\000\000U\003\000\000\200\000\000\000\000\000\000\000*\004\000\000C\000\000\000\000\000\000\000\000\005\000\000A\000\000\000\000\000\000\000\325\005\000\000@\000\000\000\000\000\000\000\252\006\000\000<\000\000\000\000\000\000\000\200\007\000\000=\000\000\000x\000\000\000\000\000\000\000Q\000\000\000x\000\000\000\325\000\000\000O\000\000\000x\000\000\000\252\001\000\000:\000\000\000x\000\000\000\200\002\000\000\204\000\000\000x\000\000\000U\003\000\000\225\000\000\000x\000\000\000*\004\000\000\216\000\000\000x\000\000\000\000\005\000\000@\000\000\000x\000\000\000\325\005\000\000E\000\000\000x\000\000\000\252\006\000\000A\000\000\000x\000\000\000\200\007\000\000@\000\000\000\360\000\000\000\000\000\000\000K\000\000\000\360\000\000\000\325\000\000\000D\000\000\000\360\000\000\000\252\001\000\000\214\000\000\000\360\000\000\000\200\002\000\000\231\000\000\000\360\000\000\000U\003\000\000\221\000\000\000\360\000\000\000*\004\000\000\204\000\000\000\360\000\000\000\000\005\000\000\230\000\000\000\360\000\000\000\325\005\000\000}\000\000\000\360\000\000\000\252\006\000\000B\000\000\000\360\000\000\000\200\007\000\000:\000\000\000h\001\000\000\000\000\000\000N\000\000\000h\001\000\000\325\000\000\000<\000\000\000h\001\000\000\252\001\000\000\213\000\000\000h\001\000\000\200\002\000\000\250\000\000\000h\001\000\000U\003\000\000\232\000\000\000h\001\000\000*\004\000\000\212\000\000\000h\001\000\000\000\005\000\000\221\000\000\000h\001\000\000\325\005\000\000\240\000\000\000h\001\000\000\252\006\000\000D\000\000\000h\001\000\000\200\007\000\000<\000\000\000\340\001\000\000\000\000\000\000M\000\000\000\340\001\000\000\325\000\000\000;\000\000\000\340\001\000\000\252\001\000\000\245\000\000\000\340\001\000\000\200\002\000\000\267\000\000\000\340\001\000\000U\003\000\000\246\000\000\000\340\001\000\000*\004\000\000\216\000\000\000\340\001\000\000\000\005\000\000{\000\000\000\340\001\000\000\325\005\000\000\233\000\000\000\340\001\000\000\252\006\000\000\220\000\000\000\340\001\000\000\200\007\000\000<\000\000\000X\002\000\000\000\000\000\000S\000\000\000X\002\000\000\325\000\000\000A\000\000\000X\002\000\000\252\001\000\000\262\000\000\000X\002\000\000\200\002\000\000\270\000\000\000X\002\000\000U\003\000\000\212\000\000\000X\002\000\000*\004\000\000|\000\000\000X\002\000\000\000\005\000\000\204\000\000\000X\002\000\000\325\005\000\000\257\000\000\000X\002\000\000\252\006\000\000\257\000\000\000X\002\000\000\200\007\000\000<\000\000\000\320\002\000\000\000\000\000\000V\000\000\000\320\002\000\000\325\000\000\000<\000\000\000\320\002\000\000\252\001\000\000\266\000\000\000\320\002\000\000\200\002\000\000\263\000\000\000\320\002\000\000U\003\000\000\230\000\000\000\320\002\000\000*\004\000\000\214\000\000\000\320\002\000\000\000\005\000\000s\000\000\000\320\002\000\000\325\005\000\000\234\000\000\000\320\002\000\000\252\006\000\000\254\000\000\000\320\002\000\000\200\007\000\000A\000\000\000H\003\000\000\000\000\000\000Z\000\000\000H\003\000\000\325\000\000\000E\000\000\000H\003\000\000\252\001\000\000\244\000\000\000H\003\000\000\200\002\000\000\272\000\000\000H\003\000\000U\003\000\000\222\000\000\000H\003\000\000*\004\000\000\223\000\000\000H\003\000\000\000\005\000\000z\000\000\000H\003\000\000\325\005\000\000\230\000\000\000H\003\000\000\252\006\000\000\246\000\000\000H\003\000\000\200\007\000\000E\000\000\000\300\003\000\000\000\000\000\000U\000\000\000\300\003\000\000\325\000\000\000C\000\000\000\300\003\000\000\252\001\000\000\272\000\000\000\300\003\000\000\200\002\000\000\251\000\000\000\300\003\000\000U\003\000\000\242\000\000\000\300\003\000\000*\004\000\000\223\000\000\000\300\003\000\000\000\005\000\000|\000\000\000\300\003\000\000\325\005\000\000\225\000\000\000\300\003\000\000\252\006\000\000\235\000\000\000\300\003\000\000\200\007\000\000E\000\000\0008\004\000\000\000\000\000\000U\000\000\0008\004\000\000\325\000\000\000F\000\000\0008\004\000\000\252\001\000\000\301\000\000\0008\004\000\000\200\002\000\000\300\000\000\0008\004\000\000U\003\000\000\233\000\000\0008\004\000\000*\004\000\000\207\000\000\0008\004\000\000\000\005\000\000x\000\000\0008\004\000\000\325\005\000\000\212\000\000\0008\004\000\000\252\006\000\000\200\000\000\0008\004\000\000\200\007\000\000F\000\000" + .size .Lb34.data, 1200 + + .type .Lb34.buffer,@object # @b34.buffer + .data + .p2align 4, 0x0 +.Lb34.buffer: + .quad 0 # 0x0 + .quad 0 + .quad .Lb34.data + .quad 1 # 0x1 + .byte 0 # 0x0 + .byte 32 # 0x20 + .half 1 # 0x1 + .word 1 # 0x1 + .quad .Lb34.shape + .quad 0 + .size .Lb34.buffer, 56 + + .type .Lb35.shape,@object # @b35.shape + .section .rodata,"a",@progbits + .p2align 5, 0x0 +.Lb35.shape: + .asciz "\000\000\000\000d\000\000\000\001\000\000\000\000\000\000" + .size .Lb35.shape, 16 + + .type .Lb35.data,@object # @b35.data + .p2align 5, 0x0 +.Lb35.data: + .ascii "\211\004&=\244\022\233\275\217U%;\272)J>#\026\024\276@_\234>\340\243|\275\334\034K\275\232\300\004:A\313\245\274\303\024r\275\345\313a\275x\231\300>\201\334\"\276\236}\321\275H\346\257\276\202V\310>\225\375(>},\026\274E\235W\275\241\236\236<\245\220\222=\004\361\263\276\2620\240\274\265\036\240=\\:\336=\240\016\263\276\260\224\371\275p\235(=\311\\x;\226y\\\275\360\227\377=\275\364\200=rw/<@U\017=c\327\003=nV\267\273\3543\207\311\274<&{\274h\b\031>\203\222\003\276\213\3649\275d\177\017\276\330\306\246\275'\314:>\203\024\253=S\331M\276\257\263S=b\221q\275\336\t\306=#^\005\2763)\256\274\344\255G>\224!\374=\374\360o\275\215#\031\276\322`J\276y6\360=\273\303;\275N\"N>M\022K\276\016\035C=1Fb\275u\017\220\275\0040\033>,\037\223=t\335\304\275\363\344\271=\320\341\260\275\2265\255=\310\"b=&\031\330\275;\314\000>>\347o\275\255\222\020=C\rU<\370d\307\275\373\306\206=I\345\373\274Y|\034>WY;\276O\240?>B\347\226\275\203\271\203\275W\243\313<\214\364\343\274Ey\033\276J4\214=\261\304\241\274-\271p>X\313l\276*_\366\275\260^\\=\237l1=\371\272\200=\002\373?\274*\3578<]Gv=" + .size .Lb35.data, 400 + + .type .Lb35.buffer,@object # @b35.buffer + .data + .p2align 4, 0x0 +.Lb35.buffer: + .quad 0 # 0x0 + .quad 0 + .quad .Lb35.data + .quad 1 # 0x1 + .byte 2 # 0x2 + .byte 32 # 0x20 + .half 1 # 0x1 + .word 1 # 0x1 + .quad .Lb35.shape + .quad 0 + .size .Lb35.buffer, 56 + + .type .L__unnamed_1,@object # @0 + .section .rodata,"a",@progbits + .p2align 4, 0x0 +.L__unnamed_1: + .zero 32 + .size .L__unnamed_1, 32 + + .type .Lstr,@object # @str + .p2align 5, 0x0 +.Lstr: + .asciz "idw_halide_parallel_vec_" + .size .Lstr, 25 + + .type .L__unnamed_2,@object # @1 + .section .data.rel.ro,"aw",@progbits + .p2align 4, 0x0 +.L__unnamed_2: + .quad .Lstr + .word 2 # 0x2 + .word 2 # 0x2 + .byte 2 # 0x2 + .byte 32 # 0x20 + .half 1 # 0x1 + .zero 4 + .quad 0 + .quad 0 + .quad 0 + .quad 0 + .quad .L__unnamed_1 + .size .L__unnamed_2, 64 + + .type .Lstr.4,@object # @str.4 + .section .rodata,"a",@progbits + .p2align 5, 0x0 +.Lstr.4: + .asciz "riscv-64-linux-no_asserts-no_runtime-rvv-vector_bits_128" + .size .Lstr.4, 57 + + .type .Lidw_halide_parallel_vec__metadata_storage,@object # @idw_halide_parallel_vec__metadata_storage + .section .data.rel.ro,"aw",@progbits + .p2align 4, 0x0 +.Lidw_halide_parallel_vec__metadata_storage: + .word 1 # 0x1 + .word 1 # 0x1 + .quad .L__unnamed_2 + .quad .Lstr.4 + .quad .Lstr + .size .Lidw_halide_parallel_vec__metadata_storage, 32 + + .ident "clang version 16.0.0 (https://github.com/dkurt/llvm-rvv-071 b027aa1b59c9f53240bdc836f39656723fdf9df0)" + .ident "clang version 16.0.0 (https://github.com/dkurt/llvm-rvv-071 b027aa1b59c9f53240bdc836f39656723fdf9df0)" + .ident "clang version 16.0.0 (https://github.com/dkurt/llvm-rvv-071 b027aa1b59c9f53240bdc836f39656723fdf9df0)" + .ident "clang version 16.0.0 (https://github.com/dkurt/llvm-rvv-071 b027aa1b59c9f53240bdc836f39656723fdf9df0)" + .ident "clang version 16.0.0 (https://github.com/dkurt/llvm-rvv-071 b027aa1b59c9f53240bdc836f39656723fdf9df0)" + .ident "clang version 16.0.0 (https://github.com/dkurt/llvm-rvv-071 b027aa1b59c9f53240bdc836f39656723fdf9df0)" + .section ".note.GNU-stack","",@progbits diff --git a/aot/idw_halide_vec.h b/aot/idw_halide_vec.h new file mode 100644 index 0000000..6de0b40 --- /dev/null +++ b/aot/idw_halide_vec.h @@ -0,0 +1,56 @@ +#ifndef HALIDE__idw_halide_vec_h +#define HALIDE__idw_halide_vec_h +#include + +// Forward declarations of the types used in the interface +// to the Halide pipeline. +// +// For the definitions of these structs, include HalideRuntime.h + +// Halide's representation of a multi-dimensional array. +// Halide::Runtime::Buffer is a more user-friendly wrapper +// around this. Its declaration is in HalideBuffer.h +struct halide_buffer_t; + +// Metadata describing the arguments to the generated function. +// Used to construct calls to the _argv version of the function. +struct halide_filter_metadata_t; + +#ifndef HALIDE_MUST_USE_RESULT +#ifdef __has_attribute +#if __has_attribute(nodiscard) +#define HALIDE_MUST_USE_RESULT [[nodiscard]] +#elif __has_attribute(warn_unused_result) +#define HALIDE_MUST_USE_RESULT __attribute__((warn_unused_result)) +#else +#define HALIDE_MUST_USE_RESULT +#endif +#else +#define HALIDE_MUST_USE_RESULT +#endif +#endif + +#ifndef HALIDE_FUNCTION_ATTRS +#define HALIDE_FUNCTION_ATTRS +#endif + + + +#ifdef __cplusplus +extern "C" { +#endif + +HALIDE_FUNCTION_ATTRS +int idw_halide_vec_(struct halide_buffer_t *_idw_halide_vec__buffer); + +HALIDE_FUNCTION_ATTRS +int idw_halide_vec__argv(void **args); + +HALIDE_FUNCTION_ATTRS +const struct halide_filter_metadata_t *idw_halide_vec__metadata(); + +#ifdef __cplusplus +} // extern "C" +#endif + +#endif diff --git a/aot/idw_halide_vec.s b/aot/idw_halide_vec.s new file mode 100644 index 0000000..f595e22 --- /dev/null +++ b/aot/idw_halide_vec.s @@ -0,0 +1,450 @@ + .text + .attribute 4, 16 + .attribute 5, "rv64gcv0p7" + .file "halide_buffer_t.cpp" + .section .text.idw_halide_vec_,"ax",@progbits + .globl idw_halide_vec_ # -- Begin function idw_halide_vec_ + .p2align 1 + .type idw_halide_vec_,@function +idw_halide_vec_: # @idw_halide_vec_ +# %bb.0: # %entry + addi sp, sp, -192 + sd ra, 184(sp) # 8-byte Folded Spill + sd s0, 176(sp) # 8-byte Folded Spill + sd s1, 168(sp) # 8-byte Folded Spill + sd s2, 160(sp) # 8-byte Folded Spill + sd s3, 152(sp) # 8-byte Folded Spill + sd s4, 144(sp) # 8-byte Folded Spill + sd s5, 136(sp) # 8-byte Folded Spill + sd s6, 128(sp) # 8-byte Folded Spill + sd s7, 120(sp) # 8-byte Folded Spill + sd s8, 112(sp) # 8-byte Folded Spill + sd s9, 104(sp) # 8-byte Folded Spill + sd s10, 96(sp) # 8-byte Folded Spill + sd s11, 88(sp) # 8-byte Folded Spill +.Lpcrel_hi0: + auipc a1, %pcrel_hi(.Lb44.buffer) + addi a1, a1, %pcrel_lo(.Lpcrel_hi0) + ld s6, 16(a1) + ld a3, 40(a0) +.Lpcrel_hi1: + auipc a2, %pcrel_hi(.Lb45.buffer) + addi a2, a2, %pcrel_lo(.Lpcrel_hi1) + ld s9, 16(a2) + lw s4, 0(a3) + lwu s8, 4(a3) + lw a4, 16(a3) + sd a4, 8(sp) # 8-byte Folded Spill + ld a4, 0(a1) + lwu a5, 20(a3) + sd a5, 32(sp) # 8-byte Folded Spill + lw s1, 24(a3) + or a5, s6, a4 + mv a4, s9 + bnez a5, .LBB0_2 +# %bb.1: # %then_bb + lui a4, 128 + addiw a4, a4, 9 + slli a4, a4, 13 + sd a4, 32(a1) + ld a4, 40(a1) + sd zero, 0(a1) + sd zero, 8(a1) + sd zero, 16(a1) + sw zero, 0(a4) + li a5, 300 + sw a5, 4(a4) + li a5, 1 + sw a5, 8(a4) + sw zero, 12(a4) + ld a4, 16(a2) + sd zero, 24(a1) +.LBB0_2: # %after_bb + ld a5, 0(a2) + ld s11, 16(a0) + or a4, a4, a5 + bnez a4, .LBB0_4 +# %bb.3: # %then_bb2 + sd zero, 16(a2) + sd zero, 8(a2) + sd zero, 0(a2) + lui a4, 128 + addiw a4, a4, 9 + ld a5, 40(a2) + slli a4, a4, 13 + addi a4, a4, 2 + sd a4, 32(a2) + sw zero, 0(a5) + li a4, 100 + sw a4, 4(a5) + li a4, 1 + sw a4, 8(a5) + sw zero, 12(a5) + sd zero, 24(a2) +.LBB0_4: # %after_bb1 + sext.w a4, s8 + sd a4, 24(sp) # 8-byte Folded Spill + lw a4, 32(sp) # 8-byte Folded Reload + sd s1, 16(sp) # 8-byte Folded Spill + beqz s11, .LBB0_6 +# %bb.5: + li a0, 0 + j .LBB0_9 +.LBB0_6: # %_halide_buffer_is_bounds_query.exit36 + ld a5, 0(a0) + bnez a5, .LBB0_8 +# %bb.7: # %then_bb5 + sd zero, 16(a0) + sd zero, 8(a0) + sd zero, 0(a0) + lui a5, 256 + addiw a5, a5, 9 + slli a5, a5, 13 + addi a5, a5, 2 + sd a5, 32(a0) + sw s4, 0(a3) + ld s1, 24(sp) # 8-byte Folded Reload + sw s1, 4(a3) + li a5, 1 + sw a5, 8(a3) + sw zero, 12(a3) + ld a5, 8(sp) # 8-byte Folded Reload + sw a5, 16(a3) + sw a4, 20(a3) + sw s1, 24(a3) + sw zero, 28(a3) + sd zero, 24(a0) +.LBB0_8: # %land.rhs.i49 + ld a0, 0(a0) + seqz a0, a0 +.LBB0_9: # %_halide_buffer_is_bounds_query.exit50 + ld a3, 16(a1) + ld a1, 0(a1) + ld a5, 16(a2) + ld a2, 0(a2) + or a1, a1, a3 + seqz a1, a1 + or a2, a2, a5 + seqz a2, a2 + or a1, a1, a2 + or a0, a0, a1 + slti a1, a4, 1 + or a0, a0, a1 + bnez a0, .LBB0_20 +# %bb.10: # %"for idw_halide_vec_.s0.y.rebased.preheader" + ld s3, 16(sp) # 8-byte Folded Reload + ld a0, 24(sp) # 8-byte Folded Reload + blez a0, .LBB0_13 +# %bb.11: # %"for idw_halide_vec_.s0.y.rebased.us.preheader" + li s0, 0 + ld s2, 24(sp) # 8-byte Folded Reload + slli s2, s2, 2 + ld s1, 32(sp) # 8-byte Folded Reload +.LBB0_12: # %"for idw_halide_vec_.s0.y.rebased.us" + # =>This Inner Loop Header: Depth=1 + slli a0, s0, 2 + add a0, a0, s11 + li a1, 0 + mv a2, s2 + call memset@plt + addi s1, s1, -1 + addw s0, s0, s3 + bnez s1, .LBB0_12 +.LBB0_13: # %"for idw_halide_vec_.s1.y.rebased.preheader" + sd zero, 40(sp) # 8-byte Folded Spill + li a1, 6 + li a2, 1 + addi a3, sp, 64 + addi a7, sp, 76 + addi t0, sp, 72 + addi t1, sp, 68 + li s0, 4 + addi s1, sp, 48 + addi t2, sp, 52 + addi t3, sp, 60 + addi t6, sp, 56 + li ra, 2 + li a6, 3 +.LBB0_14: # %"for idw_halide_vec_.s1.y.rebased" + # =>This Loop Header: Depth=1 + # Child Loop BB0_16 Depth 2 + # Child Loop BB0_17 Depth 3 + ld a0, 24(sp) # 8-byte Folded Reload + blez a0, .LBB0_19 +# %bb.15: # %"for idw_halide_vec_.s1.x.rebased.preheader" + # in Loop: Header=BB0_14 Depth=1 + li t4, 0 + ld a0, 16(sp) # 8-byte Folded Reload + ld a4, 40(sp) # 8-byte Folded Reload + mulw t5, a0, a4 + ld a0, 8(sp) # 8-byte Folded Reload + addw a0, a0, a4 +.LBB0_16: # %"for idw_halide_vec_.s1.x.rebased" + # Parent Loop BB0_14 Depth=1 + # => This Loop Header: Depth=2 + # Child Loop BB0_17 Depth 3 + add a4, t4, t5 + slli a4, a4, 2 + add s2, s11, a4 + flw ft0, 0(s2) + addw a4, s4, t4 + li s5, 25 + mv s3, s6 + mv s10, s9 +.LBB0_17: # %"for idw_halide_vec_.s1.r24$x.r24$x" + # Parent Loop BB0_14 Depth=1 + # Parent Loop BB0_16 Depth=2 + # => This Inner Loop Header: Depth=3 + addi s7, s3, 4 + vsetvli zero, a1, e32, m2 + vlwu.v v8, (s7) + vsetvli zero, a2, e32, m2 + vslidedown.vi v10, v8, 3 + addi a5, s3, 20 + vsetvli zero, a1, e32, m2 + vlwu.v v12, (a5) + vsetvli zero, a2, e32, m2 + vslidedown.vi v14, v12, 2 + vslidedown.vi v12, v12, 5 + vsw.v v8, (a3) + vsw.v v12, (a7) + vsw.v v14, (t0) + vsw.v v10, (t1) + vsetvli zero, s0, e32, m1 + vlwu.v v8, (a3) + vsetvli a5, zero, e32, m1 + vrsub.vx v8, v8, a4 + vsetvli zero, a1, e32, m2 + vlwu.v v10, (s3) + vsetvli zero, a2, e32, m2 + vslidedown.vi v12, v10, 3 + addi a5, s3, 16 + vsetvli zero, a1, e32, m2 + vlwu.v v14, (a5) + vsetvli zero, a2, e32, m2 + vslidedown.vi v16, v14, 2 + vslidedown.vi v14, v14, 5 + vsw.v v10, (s1) + vsw.v v12, (t2) + vsw.v v14, (t3) + vsw.v v16, (t6) + vsetvli zero, s0, e32, m1 + vlwu.v v9, (s1) + vsetvli a5, zero, e32, m1 + vrsub.vx v9, v9, a0 + vmul.vv v9, v9, v9 + vmacc.vv v9, v8, v8 + vfcvt.f.x.v v8, v9 + vfmv.f.s ft1, v8 + fsqrt.s ft1, ft1 + vfmv.s.f v9, ft1 + vsetvli zero, a2, e32, m1 + vslidedown.vi v10, v8, 1 + vfmv.f.s ft1, v10 + fsqrt.s ft1, ft1 + vsetvli a5, zero, e32, m1 + vfmv.s.f v10, ft1 + vsetvli zero, ra, e32, m1 + vslideup.vi v9, v10, 1 + vsetvli zero, a2, e32, m1 + vslidedown.vi v10, v8, 2 + vfmv.f.s ft1, v10 + fsqrt.s ft1, ft1 + vsetvli a5, zero, e32, m1 + vfmv.s.f v10, ft1 + vsetvli zero, a6, e32, m1 + vslideup.vi v9, v10, 2 + vsetvli zero, a2, e32, m1 + vslidedown.vi v8, v8, 3 + vfmv.f.s ft1, v8 + fsqrt.s ft1, ft1 + vsetvli a5, zero, e32, m1 + vfmv.s.f v8, ft1 + vsetvli zero, s0, e32, m1 + vslideup.vi v9, v8, 3 + vsetvli zero, zero, e32, m1 + vlwu.v v8, (s10) + vfmul.vv v8, v9, v8 + vfmv.s.f v9, ft0 + vfredsum.vs v8, v8, v9 + vfmv.f.s ft0, v8 + addi s5, s5, -1 + addi s10, s10, 16 + addi s3, s3, 48 + bnez s5, .LBB0_17 +# %bb.18: # %"end for idw_halide_vec_.s1.r24$x.r24$x" + # in Loop: Header=BB0_16 Depth=2 + addi t4, t4, 1 + fsw ft0, 0(s2) + bne t4, s8, .LBB0_16 +.LBB0_19: # %"end for idw_halide_vec_.s1.x.rebased" + # in Loop: Header=BB0_14 Depth=1 + ld a4, 40(sp) # 8-byte Folded Reload + addi a4, a4, 1 + ld a0, 32(sp) # 8-byte Folded Reload + sd a4, 40(sp) # 8-byte Folded Spill + bne a4, a0, .LBB0_14 +.LBB0_20: # %destructor_block + li a0, 0 + ld ra, 184(sp) # 8-byte Folded Reload + ld s0, 176(sp) # 8-byte Folded Reload + ld s1, 168(sp) # 8-byte Folded Reload + ld s2, 160(sp) # 8-byte Folded Reload + ld s3, 152(sp) # 8-byte Folded Reload + ld s4, 144(sp) # 8-byte Folded Reload + ld s5, 136(sp) # 8-byte Folded Reload + ld s6, 128(sp) # 8-byte Folded Reload + ld s7, 120(sp) # 8-byte Folded Reload + ld s8, 112(sp) # 8-byte Folded Reload + ld s9, 104(sp) # 8-byte Folded Reload + ld s10, 96(sp) # 8-byte Folded Reload + ld s11, 88(sp) # 8-byte Folded Reload + addi sp, sp, 192 + ret +.Lfunc_end0: + .size idw_halide_vec_, .Lfunc_end0-idw_halide_vec_ + # -- End function + .section .text.idw_halide_vec__argv,"ax",@progbits + .globl idw_halide_vec__argv # -- Begin function idw_halide_vec__argv + .p2align 1 + .type idw_halide_vec__argv,@function +idw_halide_vec__argv: # @idw_halide_vec__argv +# %bb.0: # %entry + addi sp, sp, -16 + sd ra, 8(sp) # 8-byte Folded Spill + ld a0, 0(a0) + call idw_halide_vec_@plt + li a0, 0 + ld ra, 8(sp) # 8-byte Folded Reload + addi sp, sp, 16 + ret +.Lfunc_end1: + .size idw_halide_vec__argv, .Lfunc_end1-idw_halide_vec__argv + # -- End function + .section .text.idw_halide_vec__metadata,"ax",@progbits + .globl idw_halide_vec__metadata # -- Begin function idw_halide_vec__metadata + .p2align 1 + .type idw_halide_vec__metadata,@function +idw_halide_vec__metadata: # @idw_halide_vec__metadata +# %bb.0: # %entry +.Lpcrel_hi2: + auipc a0, %pcrel_hi(.Lidw_halide_vec__metadata_storage) + addi a0, a0, %pcrel_lo(.Lpcrel_hi2) + ret +.Lfunc_end2: + .size idw_halide_vec__metadata, .Lfunc_end2-idw_halide_vec__metadata + # -- End function + .type .Lb44.shape,@object # @b44.shape + .section .rodata,"a",@progbits + .p2align 5, 0x0 +.Lb44.shape: + .asciz "\000\000\000\000,\001\000\000\001\000\000\000\000\000\000" + .size .Lb44.shape, 16 + + .type .Lb44.data,@object # @b44.data + .p2align 5, 0x0 +.Lb44.data: + .asciz "\000\000\000\000\000\000\000\000H\000\000\000\000\000\000\000\325\000\000\000O\000\000\000\000\000\000\000\252\001\000\000<\000\000\000\000\000\000\000\200\002\000\000L\000\000\000\000\000\000\000U\003\000\000\200\000\000\000\000\000\000\000*\004\000\000C\000\000\000\000\000\000\000\000\005\000\000A\000\000\000\000\000\000\000\325\005\000\000@\000\000\000\000\000\000\000\252\006\000\000<\000\000\000\000\000\000\000\200\007\000\000=\000\000\000x\000\000\000\000\000\000\000Q\000\000\000x\000\000\000\325\000\000\000O\000\000\000x\000\000\000\252\001\000\000:\000\000\000x\000\000\000\200\002\000\000\204\000\000\000x\000\000\000U\003\000\000\225\000\000\000x\000\000\000*\004\000\000\216\000\000\000x\000\000\000\000\005\000\000@\000\000\000x\000\000\000\325\005\000\000E\000\000\000x\000\000\000\252\006\000\000A\000\000\000x\000\000\000\200\007\000\000@\000\000\000\360\000\000\000\000\000\000\000K\000\000\000\360\000\000\000\325\000\000\000D\000\000\000\360\000\000\000\252\001\000\000\214\000\000\000\360\000\000\000\200\002\000\000\231\000\000\000\360\000\000\000U\003\000\000\221\000\000\000\360\000\000\000*\004\000\000\204\000\000\000\360\000\000\000\000\005\000\000\230\000\000\000\360\000\000\000\325\005\000\000}\000\000\000\360\000\000\000\252\006\000\000B\000\000\000\360\000\000\000\200\007\000\000:\000\000\000h\001\000\000\000\000\000\000N\000\000\000h\001\000\000\325\000\000\000<\000\000\000h\001\000\000\252\001\000\000\213\000\000\000h\001\000\000\200\002\000\000\250\000\000\000h\001\000\000U\003\000\000\232\000\000\000h\001\000\000*\004\000\000\212\000\000\000h\001\000\000\000\005\000\000\221\000\000\000h\001\000\000\325\005\000\000\240\000\000\000h\001\000\000\252\006\000\000D\000\000\000h\001\000\000\200\007\000\000<\000\000\000\340\001\000\000\000\000\000\000M\000\000\000\340\001\000\000\325\000\000\000;\000\000\000\340\001\000\000\252\001\000\000\245\000\000\000\340\001\000\000\200\002\000\000\267\000\000\000\340\001\000\000U\003\000\000\246\000\000\000\340\001\000\000*\004\000\000\216\000\000\000\340\001\000\000\000\005\000\000{\000\000\000\340\001\000\000\325\005\000\000\233\000\000\000\340\001\000\000\252\006\000\000\220\000\000\000\340\001\000\000\200\007\000\000<\000\000\000X\002\000\000\000\000\000\000S\000\000\000X\002\000\000\325\000\000\000A\000\000\000X\002\000\000\252\001\000\000\262\000\000\000X\002\000\000\200\002\000\000\270\000\000\000X\002\000\000U\003\000\000\212\000\000\000X\002\000\000*\004\000\000|\000\000\000X\002\000\000\000\005\000\000\204\000\000\000X\002\000\000\325\005\000\000\257\000\000\000X\002\000\000\252\006\000\000\257\000\000\000X\002\000\000\200\007\000\000<\000\000\000\320\002\000\000\000\000\000\000V\000\000\000\320\002\000\000\325\000\000\000<\000\000\000\320\002\000\000\252\001\000\000\266\000\000\000\320\002\000\000\200\002\000\000\263\000\000\000\320\002\000\000U\003\000\000\230\000\000\000\320\002\000\000*\004\000\000\214\000\000\000\320\002\000\000\000\005\000\000s\000\000\000\320\002\000\000\325\005\000\000\234\000\000\000\320\002\000\000\252\006\000\000\254\000\000\000\320\002\000\000\200\007\000\000A\000\000\000H\003\000\000\000\000\000\000Z\000\000\000H\003\000\000\325\000\000\000E\000\000\000H\003\000\000\252\001\000\000\244\000\000\000H\003\000\000\200\002\000\000\272\000\000\000H\003\000\000U\003\000\000\222\000\000\000H\003\000\000*\004\000\000\223\000\000\000H\003\000\000\000\005\000\000z\000\000\000H\003\000\000\325\005\000\000\230\000\000\000H\003\000\000\252\006\000\000\246\000\000\000H\003\000\000\200\007\000\000E\000\000\000\300\003\000\000\000\000\000\000U\000\000\000\300\003\000\000\325\000\000\000C\000\000\000\300\003\000\000\252\001\000\000\272\000\000\000\300\003\000\000\200\002\000\000\251\000\000\000\300\003\000\000U\003\000\000\242\000\000\000\300\003\000\000*\004\000\000\223\000\000\000\300\003\000\000\000\005\000\000|\000\000\000\300\003\000\000\325\005\000\000\225\000\000\000\300\003\000\000\252\006\000\000\235\000\000\000\300\003\000\000\200\007\000\000E\000\000\0008\004\000\000\000\000\000\000U\000\000\0008\004\000\000\325\000\000\000F\000\000\0008\004\000\000\252\001\000\000\301\000\000\0008\004\000\000\200\002\000\000\300\000\000\0008\004\000\000U\003\000\000\233\000\000\0008\004\000\000*\004\000\000\207\000\000\0008\004\000\000\000\005\000\000x\000\000\0008\004\000\000\325\005\000\000\212\000\000\0008\004\000\000\252\006\000\000\200\000\000\0008\004\000\000\200\007\000\000F\000\000" + .size .Lb44.data, 1200 + + .type .Lb44.buffer,@object # @b44.buffer + .data + .p2align 4, 0x0 +.Lb44.buffer: + .quad 0 # 0x0 + .quad 0 + .quad .Lb44.data + .quad 1 # 0x1 + .byte 0 # 0x0 + .byte 32 # 0x20 + .half 1 # 0x1 + .word 1 # 0x1 + .quad .Lb44.shape + .quad 0 + .size .Lb44.buffer, 56 + + .type .Lb45.shape,@object # @b45.shape + .section .rodata,"a",@progbits + .p2align 5, 0x0 +.Lb45.shape: + .asciz "\000\000\000\000d\000\000\000\001\000\000\000\000\000\000" + .size .Lb45.shape, 16 + + .type .Lb45.data,@object # @b45.data + .p2align 5, 0x0 +.Lb45.data: + .ascii "\211\004&=\244\022\233\275\217U%;\272)J>#\026\024\276@_\234>\340\243|\275\334\034K\275\232\300\004:A\313\245\274\303\024r\275\345\313a\275x\231\300>\201\334\"\276\236}\321\275H\346\257\276\202V\310>\225\375(>},\026\274E\235W\275\241\236\236<\245\220\222=\004\361\263\276\2620\240\274\265\036\240=\\:\336=\240\016\263\276\260\224\371\275p\235(=\311\\x;\226y\\\275\360\227\377=\275\364\200=rw/<@U\017=c\327\003=nV\267\273\3543\207\311\274<&{\274h\b\031>\203\222\003\276\213\3649\275d\177\017\276\330\306\246\275'\314:>\203\024\253=S\331M\276\257\263S=b\221q\275\336\t\306=#^\005\2763)\256\274\344\255G>\224!\374=\374\360o\275\215#\031\276\322`J\276y6\360=\273\303;\275N\"N>M\022K\276\016\035C=1Fb\275u\017\220\275\0040\033>,\037\223=t\335\304\275\363\344\271=\320\341\260\275\2265\255=\310\"b=&\031\330\275;\314\000>>\347o\275\255\222\020=C\rU<\370d\307\275\373\306\206=I\345\373\274Y|\034>WY;\276O\240?>B\347\226\275\203\271\203\275W\243\313<\214\364\343\274Ey\033\276J4\214=\261\304\241\274-\271p>X\313l\276*_\366\275\260^\\=\237l1=\371\272\200=\002\373?\274*\3578<]Gv=" + .size .Lb45.data, 400 + + .type .Lb45.buffer,@object # @b45.buffer + .data + .p2align 4, 0x0 +.Lb45.buffer: + .quad 0 # 0x0 + .quad 0 + .quad .Lb45.data + .quad 1 # 0x1 + .byte 2 # 0x2 + .byte 32 # 0x20 + .half 1 # 0x1 + .word 1 # 0x1 + .quad .Lb45.shape + .quad 0 + .size .Lb45.buffer, 56 + + .type .L__unnamed_1,@object # @0 + .section .rodata,"a",@progbits + .p2align 4, 0x0 +.L__unnamed_1: + .zero 32 + .size .L__unnamed_1, 32 + + .type .Lstr,@object # @str + .p2align 5, 0x0 +.Lstr: + .asciz "idw_halide_vec_" + .size .Lstr, 16 + + .type .L__unnamed_2,@object # @1 + .section .data.rel.ro,"aw",@progbits + .p2align 4, 0x0 +.L__unnamed_2: + .quad .Lstr + .word 2 # 0x2 + .word 2 # 0x2 + .byte 2 # 0x2 + .byte 32 # 0x20 + .half 1 # 0x1 + .zero 4 + .quad 0 + .quad 0 + .quad 0 + .quad 0 + .quad .L__unnamed_1 + .size .L__unnamed_2, 64 + + .type .Lstr.4,@object # @str.4 + .section .rodata,"a",@progbits + .p2align 5, 0x0 +.Lstr.4: + .asciz "riscv-64-linux-no_asserts-no_runtime-rvv-vector_bits_128" + .size .Lstr.4, 57 + + .type .Lidw_halide_vec__metadata_storage,@object # @idw_halide_vec__metadata_storage + .section .data.rel.ro,"aw",@progbits + .p2align 4, 0x0 +.Lidw_halide_vec__metadata_storage: + .word 1 # 0x1 + .word 1 # 0x1 + .quad .L__unnamed_2 + .quad .Lstr.4 + .quad .Lstr + .size .Lidw_halide_vec__metadata_storage, 32 + + .ident "clang version 16.0.0 (https://github.com/dkurt/llvm-rvv-071 b027aa1b59c9f53240bdc836f39656723fdf9df0)" + .ident "clang version 16.0.0 (https://github.com/dkurt/llvm-rvv-071 b027aa1b59c9f53240bdc836f39656723fdf9df0)" + .ident "clang version 16.0.0 (https://github.com/dkurt/llvm-rvv-071 b027aa1b59c9f53240bdc836f39656723fdf9df0)" + .ident "clang version 16.0.0 (https://github.com/dkurt/llvm-rvv-071 b027aa1b59c9f53240bdc836f39656723fdf9df0)" + .ident "clang version 16.0.0 (https://github.com/dkurt/llvm-rvv-071 b027aa1b59c9f53240bdc836f39656723fdf9df0)" + .ident "clang version 16.0.0 (https://github.com/dkurt/llvm-rvv-071 b027aa1b59c9f53240bdc836f39656723fdf9df0)" + .section ".note.GNU-stack","",@progbits diff --git a/include/algos.hpp b/include/algos.hpp index d6c9f0c..e1f005f 100644 --- a/include/algos.hpp +++ b/include/algos.hpp @@ -42,6 +42,12 @@ void idw_ref(const uint8_t* src, uint8_t* dst, int height, int width, int* point void idw_halide(const uint8_t* src, uint8_t* dst, int height, int width, int* pointsBuf, float* weightsBuf); +void idw_halide_parallel(const uint8_t* src, uint8_t* dst, int height, int width, int* pointsBuf, float* weightsBuf); + +void idw_halide_vec(const uint8_t* src, uint8_t* dst, int height, int width, int* pointsBuf, float* weightsBuf); + +void idw_halide_parallel_vec(const uint8_t* src, uint8_t* dst, int height, int width, int* pointsBuf, float* weightsBuf); + void voxel_up(float* src, float* kernel, float* dst, int inpChannels, int height, int width, int depth); void upscale(const std::vector img_path, int width, int height); diff --git a/perf/perf_main.cpp b/perf/perf_main.cpp index 905450a..a20069f 100644 --- a/perf/perf_main.cpp +++ b/perf/perf_main.cpp @@ -14,7 +14,14 @@ #include "algos.hpp" -#include +// #ifdef __riscv +// #include +// #include "idw.h" +// using namespace Halide::Runtime; +// #else +// #include +// using namespace Halide; +// #endif // #include @@ -215,21 +222,16 @@ PERF_TEST(convolution_nhwc, halide) { // 300 elems (100 points) static int idwPoints[] = {0, 0, 72, 0, 213, 79, 0, 426, 60, 0, 640, 76, 0, 853, 128, 0, 1066, 67, 0, 1280, 65, 0, 1493, 64, 0, 1706, 60, 0, 1920, 61, 120, 0, 81, 120, 213, 79, 120, 426, 58, 120, 640, 132, 120, 853, 149, 120, 1066, 142, 120, 1280, 64, 120, 1493, 69, 120, 1706, 65, 120, 1920, 64, 240, 0, 75, 240, 213, 68, 240, 426, 140, 240, 640, 153, 240, 853, 145, 240, 1066, 132, 240, 1280, 152, 240, 1493, 125, 240, 1706, 66, 240, 1920, 58, 360, 0, 78, 360, 213, 60, 360, 426, 139, 360, 640, 168, 360, 853, 154, 360, 1066, 138, 360, 1280, 145, 360, 1493, 160, 360, 1706, 68, 360, 1920, 60, 480, 0, 77, 480, 213, 59, 480, 426, 165, 480, 640, 183, 480, 853, 166, 480, 1066, 142, 480, 1280, 123, 480, 1493, 155, 480, 1706, 144, 480, 1920, 60, 600, 0, 83, 600, 213, 65, 600, 426, 178, 600, 640, 184, 600, 853, 138, 600, 1066, 124, 600, 1280, 132, 600, 1493, 175, 600, 1706, 175, 600, 1920, 60, 720, 0, 86, 720, 213, 60, 720, 426, 182, 720, 640, 179, 720, 853, 152, 720, 1066, 140, 720, 1280, 115, 720, 1493, 156, 720, 1706, 172, 720, 1920, 65, 840, 0, 90, 840, 213, 69, 840, 426, 164, 840, 640, 186, 840, 853, 146, 840, 1066, 147, 840, 1280, 122, 840, 1493, 152, 840, 1706, 166, 840, 1920, 69, 960, 0, 85, 960, 213, 67, 960, 426, 186, 960, 640, 169, 960, 853, 162, 960, 1066, 147, 960, 1280, 124, 960, 1493, 149, 960, 1706, 157, 960, 1920, 69, 1080, 0, 85, 1080, 213, 70, 1080, 426, 193, 1080, 640, 192, 1080, 853, 155, 1080, 1066, 135, 1080, 1280, 120, 1080, 1493, 138, 1080, 1706, 128, 1080, 1920, 70}; // 100 elems -float idwWeights[] = {0.04053167, -0.07571915, 0.0025228, 0.1974248, -0.14461569, 0.30541419, -0.06167972, -0.04958807, 0.00050641, -0.02023852, -0.05910183, -0.05512609, 0.37617088, -0.15904428, -0.10229038, -0.34355378, 0.39128499, 0.16502984, -0.00916588, -0.05264022, 0.01936275, 0.07156495, -0.35144818, -0.01955447, 0.07818357, 0.10850975, -0.34972095, -0.12186563, 0.04116577, 0.00378971, -0.05382689, 0.12480152, 0.0629668, 0.01070963, 0.03499341, 0.03218783, -0.00559502, -0.21214646, 0.34402987, -0.0246006, -0.01532894, 0.14944613, -0.12848858, -0.04539923, -0.1401344, -0.08143395, 0.18241941, 0.08353522, -0.20102434, 0.05168503, -0.05897654, 0.09669851, -0.13024191, -0.02125988, 0.19499928, 0.12311092, -0.05857943, -0.14954968, -0.19763496, 0.1172914, -0.04584096, 0.20130274, -0.198312, 0.04763513, -0.05524272, -0.07034198, 0.15155036, 0.0718368, -0.09612551, 0.09076872, -0.0863682, 0.08457486, 0.05520895, -0.10551672, 0.12577908, -0.05857014, 0.03529613, 0.01300365, -0.09736055, 0.06580921, -0.03074898, 0.1528181, -0.18295799, 0.18713497, -0.07368328, -0.06431868, 0.02485816, -0.02782657, -0.15182979, 0.06845911, -0.01974711, 0.23508139, -0.23124445, -0.1202987, 0.05380124, 0.04331648, 0.06285662, -0.01171756, 0.01128749, 0.06012665}; +static float idwWeights[] = {0.04053167, -0.07571915, 0.0025228, 0.1974248, -0.14461569, 0.30541419, -0.06167972, -0.04958807, 0.00050641, -0.02023852, -0.05910183, -0.05512609, 0.37617088, -0.15904428, -0.10229038, -0.34355378, 0.39128499, 0.16502984, -0.00916588, -0.05264022, 0.01936275, 0.07156495, -0.35144818, -0.01955447, 0.07818357, 0.10850975, -0.34972095, -0.12186563, 0.04116577, 0.00378971, -0.05382689, 0.12480152, 0.0629668, 0.01070963, 0.03499341, 0.03218783, -0.00559502, -0.21214646, 0.34402987, -0.0246006, -0.01532894, 0.14944613, -0.12848858, -0.04539923, -0.1401344, -0.08143395, 0.18241941, 0.08353522, -0.20102434, 0.05168503, -0.05897654, 0.09669851, -0.13024191, -0.02125988, 0.19499928, 0.12311092, -0.05857943, -0.14954968, -0.19763496, 0.1172914, -0.04584096, 0.20130274, -0.198312, 0.04763513, -0.05524272, -0.07034198, 0.15155036, 0.0718368, -0.09612551, 0.09076872, -0.0863682, 0.08457486, 0.05520895, -0.10551672, 0.12577908, -0.05857014, 0.03529613, 0.01300365, -0.09736055, 0.06580921, -0.03074898, 0.1528181, -0.18295799, 0.18713497, -0.07368328, -0.06431868, 0.02485816, -0.02782657, -0.15182979, 0.06845911, -0.01974711, 0.23508139, -0.23124445, -0.1202987, 0.05380124, 0.04331648, 0.06285662, -0.01171756, 0.01128749, 0.06012665}; PERF_TEST(idw, reference) { const int width = 1920; const int height = 1080; - - // 300 elems (100 points) - int points[] = {0, 0, 72, 0, 213, 79, 0, 426, 60, 0, 640, 76, 0, 853, 128, 0, 1066, 67, 0, 1280, 65, 0, 1493, 64, 0, 1706, 60, 0, 1920, 61, 120, 0, 81, 120, 213, 79, 120, 426, 58, 120, 640, 132, 120, 853, 149, 120, 1066, 142, 120, 1280, 64, 120, 1493, 69, 120, 1706, 65, 120, 1920, 64, 240, 0, 75, 240, 213, 68, 240, 426, 140, 240, 640, 153, 240, 853, 145, 240, 1066, 132, 240, 1280, 152, 240, 1493, 125, 240, 1706, 66, 240, 1920, 58, 360, 0, 78, 360, 213, 60, 360, 426, 139, 360, 640, 168, 360, 853, 154, 360, 1066, 138, 360, 1280, 145, 360, 1493, 160, 360, 1706, 68, 360, 1920, 60, 480, 0, 77, 480, 213, 59, 480, 426, 165, 480, 640, 183, 480, 853, 166, 480, 1066, 142, 480, 1280, 123, 480, 1493, 155, 480, 1706, 144, 480, 1920, 60, 600, 0, 83, 600, 213, 65, 600, 426, 178, 600, 640, 184, 600, 853, 138, 600, 1066, 124, 600, 1280, 132, 600, 1493, 175, 600, 1706, 175, 600, 1920, 60, 720, 0, 86, 720, 213, 60, 720, 426, 182, 720, 640, 179, 720, 853, 152, 720, 1066, 140, 720, 1280, 115, 720, 1493, 156, 720, 1706, 172, 720, 1920, 65, 840, 0, 90, 840, 213, 69, 840, 426, 164, 840, 640, 186, 840, 853, 146, 840, 1066, 147, 840, 1280, 122, 840, 1493, 152, 840, 1706, 166, 840, 1920, 69, 960, 0, 85, 960, 213, 67, 960, 426, 186, 960, 640, 169, 960, 853, 162, 960, 1066, 147, 960, 1280, 124, 960, 1493, 149, 960, 1706, 157, 960, 1920, 69, 1080, 0, 85, 1080, 213, 70, 1080, 426, 193, 1080, 640, 192, 1080, 853, 155, 1080, 1066, 135, 1080, 1280, 120, 1080, 1493, 138, 1080, 1706, 128, 1080, 1920, 70}; - // 100 elems - float weights[] = {0.04053167, -0.07571915, 0.0025228, 0.1974248, -0.14461569, 0.30541419, -0.06167972, -0.04958807, 0.00050641, -0.02023852, -0.05910183, -0.05512609, 0.37617088, -0.15904428, -0.10229038, -0.34355378, 0.39128499, 0.16502984, -0.00916588, -0.05264022, 0.01936275, 0.07156495, -0.35144818, -0.01955447, 0.07818357, 0.10850975, -0.34972095, -0.12186563, 0.04116577, 0.00378971, -0.05382689, 0.12480152, 0.0629668, 0.01070963, 0.03499341, 0.03218783, -0.00559502, -0.21214646, 0.34402987, -0.0246006, -0.01532894, 0.14944613, -0.12848858, -0.04539923, -0.1401344, -0.08143395, 0.18241941, 0.08353522, -0.20102434, 0.05168503, -0.05897654, 0.09669851, -0.13024191, -0.02125988, 0.19499928, 0.12311092, -0.05857943, -0.14954968, -0.19763496, 0.1172914, -0.04584096, 0.20130274, -0.198312, 0.04763513, -0.05524272, -0.07034198, 0.15155036, 0.0718368, -0.09612551, 0.09076872, -0.0863682, 0.08457486, 0.05520895, -0.10551672, 0.12577908, -0.05857014, 0.03529613, 0.01300365, -0.09736055, 0.06580921, -0.03074898, 0.1528181, -0.18295799, 0.18713497, -0.07368328, -0.06431868, 0.02485816, -0.02782657, -0.15182979, 0.06845911, -0.01974711, 0.23508139, -0.23124445, -0.1202987, 0.05380124, 0.04331648, 0.06285662, -0.01171756, 0.01128749, 0.06012665}; - + Mat dst(height, width, CV_8U); PERF_SAMPLE_BEGIN() - idw_ref(NULL, dst.ptr(), height, width, points, weights); + idw_ref(NULL, dst.ptr(), height, width, idwPoints, idwWeights); PERF_SAMPLE_END() SANITY_CHECK_NOTHING(); @@ -248,6 +250,45 @@ PERF_TEST(idw, halide) { SANITY_CHECK_NOTHING(); } +PERF_TEST(idw, halide_parallel) { + const int width = 1920; + const int height = 1080; + + Mat dst(height, width, CV_8U); + + PERF_SAMPLE_BEGIN() + idw_halide_parallel(NULL, dst.ptr(), height, width, idwPoints, idwWeights); + PERF_SAMPLE_END() + + SANITY_CHECK_NOTHING(); +} + +PERF_TEST(idw, halide_vec) { + const int width = 1920; + const int height = 1080; + + Mat dst(height, width, CV_8U); + + PERF_SAMPLE_BEGIN() + idw_halide_vec(NULL, dst.ptr(), height, width, idwPoints, idwWeights); + PERF_SAMPLE_END() + + SANITY_CHECK_NOTHING(); +} + +PERF_TEST(idw, halide_parallel_vec) { + const int width = 1920; + const int height = 1080; + + Mat dst(height, width, CV_8U); + + PERF_SAMPLE_BEGIN() + idw_halide_parallel_vec(NULL, dst.ptr(), height, width, idwPoints, idwWeights); + PERF_SAMPLE_END() + + SANITY_CHECK_NOTHING(); +} + PERF_TEST(voxel_up, halide) { static const int ic = 4; static const int height = 100; diff --git a/src/idw.cpp b/src/idw.cpp index 622867e..2933703 100644 --- a/src/idw.cpp +++ b/src/idw.cpp @@ -9,7 +9,10 @@ using namespace std; #ifdef __riscv #include - #include "idw.h" + #include "idw_halide.h" + #include "idw_halide_vec.h" + #include "idw_halide_parallel.h" + #include "idw_halide_parallel_vec.h" using namespace Halide::Runtime; #else #include @@ -27,9 +30,154 @@ void idw_halide(const uint8_t* src, uint8_t* dst, int height, int width, int* po Buffer points(pointsBuf, {pointCount*3}); Buffer weights(weightsBuf, {pointCount}); #ifdef __riscv - idw(mask); + idw_halide_(mask); #else - static Func f("idw"); + static Func f("idw_halide_"); + + // try { + if (!f.defined()) { + Var x("x"), y("y"); + RDom r(0, pointCount); + f(x, y) = 0.F; + Expr x0 = points(3*r+1); + Expr y0 = points(3*r); + Expr dx = x - x0; + Expr dy = y - y0; + Expr weight = weights(r); + f(x, y) += hypot(dx, dy) * weight; + + // f.vectorize(r, 8); + const int factor = 4; + // f.update().atomic().vectorize(r, factor); + // f.update().parallel(x); + + // Compile + Target target; + target.os = Target::OS::Linux; + target.arch = Target::Arch::RISCV; + target.bits = 64; + target.vector_bits = factor * sizeof(float) * 8; + + // Tested XuanTie C906 has 128-bit vector unit + CV_Assert(target.vector_bits <= 128); + + std::vector features; + features.push_back(Target::RVV); + features.push_back(Target::NoAsserts); + features.push_back(Target::NoRuntime); + target.set_features(features); + + std::cout << target << std::endl; + // f.print_loop_nest(); + + // Dump AOT code + f.compile_to_header("idw_halide.h", {}, "idw_halide_", target); + f.compile_to_assembly("idw_halide.s", {}, "idw_halide_", target); + } + // } + // catch (Halide::Error &e) { + // cout << e.what() << '\n'; + // } + + f.realize(mask); +#endif + + float maxVal = 193.0; + float minVal = 58.0; + float diff = maxVal - minVal; + for (int y = 0; y < height; y++) { + for (int x = 0; x < width; x++) { + dst[y*width + x] = (uint8_t) (255 * (maskBuf[y*width + x] - minVal) / diff); + } + } + delete[] maskBuf; +} + +void idw_halide_parallel(const uint8_t* src, uint8_t* dst, int height, int width, int* pointsBuf, float* weightsBuf) { + float* maskBuf = new float[height * width](); + + Buffer mask(maskBuf, {width, height}); + Buffer output(dst, {width, height}); + + Buffer points(pointsBuf, {pointCount*3}); + Buffer weights(weightsBuf, {pointCount}); +#ifdef __riscv + idw_halide_parallel_(mask); +#else + static Func f("idw_halide_parallel_"); + + try { + if (!f.defined()) { + Var x("x"), y("y"), x_outer("x_outer"), y_outer("y_outer"), x_inner("x_inner"), y_inner("y_inner"), tile_index("tile_index"); + RDom r(0, pointCount); + f(x, y) = 0.F; + Expr x0 = points(3*r+1); + Expr y0 = points(3*r); + Expr dx = x - x0; + Expr dy = y - y0; + Expr weight = weights(r); + f(x, y) += hypot(dx, dy) * weight; + + f.tile(x, y, x_outer, y_outer, x_inner, y_inner, width / 4, height / 4).fuse(x_outer, y_outer, tile_index).parallel(tile_index); + f.update().tile(x, y, x_outer, y_outer, x_inner, y_inner, width / 4, height / 4).fuse(x_outer, y_outer, tile_index).parallel(tile_index); + + const int factor = 4; + // f.update().atomic().vectorize(r, factor); + + // Compile + Target target; + target.os = Target::OS::Linux; + target.arch = Target::Arch::RISCV; + target.bits = 64; + target.vector_bits = factor * sizeof(float) * 8; + + // Tested XuanTie C906 has 128-bit vector unit + CV_Assert(target.vector_bits <= 128); + + std::vector features; + features.push_back(Target::RVV); + features.push_back(Target::NoAsserts); + features.push_back(Target::NoRuntime); + target.set_features(features); + + std::cout << target << std::endl; + // f.print_loop_nest(); + + // Dump AOT code + f.compile_to_header("idw_halide_parallel.h", {}, "idw_halide_parallel_", target); + f.compile_to_assembly("idw_halide_parallel.s", {}, "idw_halide_parallel_", target); + } + + f.realize(mask); + } + catch (Halide::Error &e) { + cout << e.what() << '\n'; + } +#endif + + float maxVal = 193.0; + float minVal = 58.0; + float diff = maxVal - minVal; + for (int y = 0; y < height; y++) { + for (int x = 0; x < width; x++) { + dst[y*width + x] = (uint8_t) (255 * (maskBuf[y*width + x] - minVal) / diff); + } + } + delete[] maskBuf; +} + +void idw_halide_vec(const uint8_t* src, uint8_t* dst, int height, int width, int* pointsBuf, float* weightsBuf) { + float* maskBuf = new float[height * width](); + + Buffer mask(maskBuf, {width, height}); + Buffer output(dst, {width, height}); + + Buffer points(pointsBuf, {pointCount*3}); + Buffer weights(weightsBuf, {pointCount}); +#ifdef __riscv + idw_halide_vec_(mask); +#else + static Func f("idw_halide_vec_"); // try { if (!f.defined()) { @@ -46,6 +194,7 @@ void idw_halide(const uint8_t* src, uint8_t* dst, int height, int width, int* po // f.vectorize(r, 8); const int factor = 4; f.update().atomic().vectorize(r, factor); + // f.update().parallel(x); // Compile Target target; @@ -64,11 +213,11 @@ void idw_halide(const uint8_t* src, uint8_t* dst, int height, int width, int* po target.set_features(features); std::cout << target << std::endl; - f.print_loop_nest(); + // f.print_loop_nest(); // Dump AOT code - f.compile_to_header("idw.h", {}, "idw", target); - f.compile_to_assembly("idw.s", {}, "idw", target); + f.compile_to_header("idw_halide_vec.h", {}, "idw_halide_vec_", target); + f.compile_to_assembly("idw_halide_vec.s", {}, "idw_halide_vec_", target); } // } // catch (Halide::Error &e) { @@ -89,6 +238,79 @@ void idw_halide(const uint8_t* src, uint8_t* dst, int height, int width, int* po delete[] maskBuf; } +void idw_halide_parallel_vec(const uint8_t* src, uint8_t* dst, int height, int width, int* pointsBuf, float* weightsBuf) { + float* maskBuf = new float[height * width](); + + Buffer mask(maskBuf, {width, height}); + Buffer output(dst, {width, height}); + + Buffer points(pointsBuf, {pointCount*3}); + Buffer weights(weightsBuf, {pointCount}); +#ifdef __riscv + idw_halide_parallel_vec_(mask); +#else + static Func f("idw_halide_parallel_vec_"); + + try { + if (!f.defined()) { + Var x("x"), y("y"), x_outer("x_outer"), y_outer("y_outer"), x_inner("x_inner"), y_inner("y_inner"), tile_index("tile_index"); + RDom r(0, pointCount); + f(x, y) = 0.F; + Expr x0 = points(3*r+1); + Expr y0 = points(3*r); + Expr dx = x - x0; + Expr dy = y - y0; + Expr weight = weights(r); + f(x, y) += hypot(dx, dy) * weight; + + f.tile(x, y, x_outer, y_outer, x_inner, y_inner, width / 4, height / 4).fuse(x_outer, y_outer, tile_index).parallel(tile_index); + f.update().tile(x, y, x_outer, y_outer, x_inner, y_inner, width / 4, height / 4).fuse(x_outer, y_outer, tile_index).parallel(tile_index); + + const int factor = 4; + f.update().atomic().vectorize(r, factor); + + // Compile + Target target; + target.os = Target::OS::Linux; + target.arch = Target::Arch::RISCV; + target.bits = 64; + target.vector_bits = factor * sizeof(float) * 8; + + // Tested XuanTie C906 has 128-bit vector unit + CV_Assert(target.vector_bits <= 128); + + std::vector features; + features.push_back(Target::RVV); + features.push_back(Target::NoAsserts); + features.push_back(Target::NoRuntime); + target.set_features(features); + + std::cout << target << std::endl; + // f.print_loop_nest(); + + // Dump AOT code + f.compile_to_header("idw_halide_parallel_vec.h", {}, "idw_halide_parallel_vec_", target); + f.compile_to_assembly("idw_halide_parallel_vec.s", {}, "idw_halide_parallel_vec_", target); + } + + f.realize(mask); + } + catch (Halide::Error &e) { + cout << e.what() << '\n'; + } +#endif + + float maxVal = 193.0; + float minVal = 58.0; + float diff = maxVal - minVal; + for (int y = 0; y < height; y++) { + for (int x = 0; x < width; x++) { + dst[y*width + x] = (uint8_t) (255 * (maskBuf[y*width + x] - minVal) / diff); + } + } + delete[] maskBuf; +} + void idw_ref(const uint8_t* src, uint8_t* dst, int height, int width, int* points, float* weights) { float* mask = new float[height * width](); diff --git a/test/test_main.cpp b/test/test_main.cpp index 9fadfbd..0e89acf 100644 --- a/test/test_main.cpp +++ b/test/test_main.cpp @@ -167,6 +167,8 @@ TEST(convolution_nhwc, halide) { TEST(idw, halide) { Mat src(height, width, CV_8U); Mat dst(height, width, CV_8U), cl_dst(height, width, CV_8UC3), dst_h(height, width, CV_8U), cl_dst_h(height, width, CV_8UC3); + Mat dst_hp(height, width, CV_8U), cl_dst_hp(height, width, CV_8UC3), dst_hv(height, width, CV_8U), cl_dst_hv(height, width, CV_8UC3), + dst_hpv(height, width, CV_8U), cl_dst_hpv(height, width, CV_8UC3); // randu(src, 0, 256); // 300 elems (100 points) @@ -176,13 +178,25 @@ TEST(idw, halide) { idw_ref(NULL, dst.ptr(), height, width, points, weights); idw_halide(NULL, dst_h.ptr(), height, width, points, weights); + idw_halide_parallel(NULL, dst_hp.ptr(), height, width, points, weights); + idw_halide_vec(NULL, dst_hv.ptr(), height, width, points, weights); + idw_halide_parallel_vec(NULL, dst_hpv.ptr(), height, width, points, weights); applyColorMap(dst, cl_dst, COLORMAP_JET); applyColorMap(dst_h, cl_dst_h, COLORMAP_JET); + applyColorMap(dst_hp, cl_dst_hp, COLORMAP_JET); + applyColorMap(dst_hv, cl_dst_hv, COLORMAP_JET); + applyColorMap(dst_hpv, cl_dst_hpv, COLORMAP_JET); // imwrite("src.png", src); imwrite("res.png", dst); imwrite("cres.png", cl_dst); imwrite("res_halide.png", dst_h); imwrite("cres_halide.png", cl_dst_h); + imwrite("res_halide_par.png", dst_hp); + imwrite("cres_halide_par.png", cl_dst_hp); + imwrite("res_halide_vec.png", dst_hv); + imwrite("cres_halide_vec.png", cl_dst_hv); + imwrite("res_halide_parvec.png", dst_hpv); + imwrite("cres_halide_parvec.png", cl_dst_hpv); } \ No newline at end of file