Skip to content

[Do not merge] Test KernelIntrinsics#688

Open
christiangnrd wants to merge 6 commits into
mainfrom
kaintr
Open

[Do not merge] Test KernelIntrinsics#688
christiangnrd wants to merge 6 commits into
mainfrom
kaintr

Conversation

@christiangnrd

Copy link
Copy Markdown
Member

Not a draft to also run benchmarks

@github-actions

github-actions Bot commented Oct 22, 2025

Copy link
Copy Markdown
Contributor

Your PR requires formatting changes to meet the project's style guidelines.
Please consider running Runic (git runic main) to apply these changes.

Click here to view the suggested changes.
diff --git a/src/MetalKernels.jl b/src/MetalKernels.jl
index 4e856194..7573c5e1 100644
--- a/src/MetalKernels.jl
+++ b/src/MetalKernels.jl
@@ -136,26 +136,26 @@ end
 
 KI.argconvert(::MetalBackend, arg) = mtlconvert(arg)
 
-function KI.kernel_function(::MetalBackend, f::F, tt::TT=Tuple{}; name=nothing, kwargs...) where {F,TT}
+function KI.kernel_function(::MetalBackend, f::F, tt::TT = Tuple{}; name = nothing, kwargs...) where {F, TT}
     kern = mtlfunction(f, tt; name, kwargs...)
-    KI.Kernel{MetalBackend, typeof(kern)}(MetalBackend(), kern)
+    return KI.Kernel{MetalBackend, typeof(kern)}(MetalBackend(), kern)
 end
 
-function (obj::KI.Kernel{MetalBackend})(args...; numworkgroups=1, workgroupsize=1)
+function (obj::KI.Kernel{MetalBackend})(args...; numworkgroups = 1, workgroupsize = 1)
     KI.check_launch_args(numworkgroups, workgroupsize)
 
-    obj.kern(args...; threads=workgroupsize, groups=numworkgroups)
+    return obj.kern(args...; threads = workgroupsize, groups = numworkgroups)
 end
 
 
-function KI.kernel_max_work_group_size(kikern::KI.Kernel{<:MetalBackend}; max_work_items::Int=typemax(Int))::Int
-    Int(min(kikern.kern.pipeline.maxTotalThreadsPerThreadgroup, max_work_items))
+function KI.kernel_max_work_group_size(kikern::KI.Kernel{<:MetalBackend}; max_work_items::Int = typemax(Int))::Int
+    return Int(min(kikern.kern.pipeline.maxTotalThreadsPerThreadgroup, max_work_items))
 end
 function KI.max_work_group_size(::MetalBackend)::Int
-    Int(device().maxThreadsPerThreadgroup.width)
+    return Int(device().maxThreadsPerThreadgroup.width)
 end
 function KI.multiprocessor_count(::MetalBackend)::Int
-    Metal.num_gpu_cores()
+    return Metal.num_gpu_cores()
 end
 
 
diff --git a/src/broadcast.jl b/src/broadcast.jl
index 5d107ec2..3455fad2 100644
--- a/src/broadcast.jl
+++ b/src/broadcast.jl
@@ -66,9 +66,9 @@ end
     if _broadcast_shapes[Is] > BROADCAST_SPECIALIZATION_THRESHOLD
         ## COV_EXCL_START
         function broadcast_cartesian_static(dest, bc, Is)
-             i = KI.get_global_id().x
-             stride = KI.get_global_size().x
-             while 1 <= i <= length(dest)
+            i = KI.get_global_id().x
+            stride = KI.get_global_size().x
+            while 1 <= i <= length(dest)
                 I = @inbounds Is[i]
                 @inbounds dest[I] = bc[I]
                 i += stride
@@ -91,13 +91,13 @@ end
        (isa(IndexStyle(dest), IndexLinear) && isa(IndexStyle(bc), IndexLinear))
         ## COV_EXCL_START
         function broadcast_linear(dest, bc)
-             i = KI.get_global_id().x
-             stride = KI.get_global_size().x
-             while 1 <= i <= length(dest)
-                 @inbounds dest[i] = bc[i]
-                 i += stride
-             end
-             return
+            i = KI.get_global_id().x
+            stride = KI.get_global_size().x
+            while 1 <= i <= length(dest)
+                @inbounds dest[i] = bc[i]
+                i += stride
+            end
+            return
         end
         ## COV_EXCL_STOP
 
@@ -168,9 +168,9 @@ end
     else
         ## COV_EXCL_START
         function broadcast_cartesian(dest, bc)
-             i = KI.get_global_id().x
-             stride = KI.get_global_size().x
-             while 1 <= i <= length(dest)
+            i = KI.get_global_id().x
+            stride = KI.get_global_size().x
+            while 1 <= i <= length(dest)
                 I = @inbounds CartesianIndices(dest)[i]
                 @inbounds dest[I] = bc[I]
                 i += stride
diff --git a/src/device/random.jl b/src/device/random.jl
index 12b053a2..edc999cd 100644
--- a/src/device/random.jl
+++ b/src/device/random.jl
@@ -89,8 +89,8 @@ end
         @inbounds global_random_counters()[simdgroupId]
     elseif field === :ctr2
         globalId = KI.get_global_id().x +
-                   (KI.get_global_id().y - 1i32) * KI.get_global_size().x +
-                   (KI.get_global_id().z - 1i32) * KI.get_global_size().x * KI.get_global_size().y
+            (KI.get_global_id().y - 1i32) * KI.get_global_size().x +
+            (KI.get_global_id().z - 1i32) * KI.get_global_size().x * KI.get_global_size().y
         globalId % UInt32
     end::UInt32
 end
diff --git a/src/mapreduce.jl b/src/mapreduce.jl
index 7be5ef43..a737e8d0 100644
--- a/src/mapreduce.jl
+++ b/src/mapreduce.jl
@@ -224,7 +224,8 @@ function GPUArrays.mapreducedim!(f::F, op::OP, R::WrappedMtlArray{T},
     # we might not be able to launch all those threads to reduce each slice in one go.
     # that's why each threads also loops across their inputs, processing multiple values
     # so that we can span the entire reduction dimension using a single item group.
-    kernel = KI.@kernel backend launch = false partial_mapreduce_device(f, op, init, Val(maxthreads), Val(Rreduce), Val(Rother),
+    kernel = KI.@kernel backend launch = false partial_mapreduce_device(
+        f, op, init, Val(maxthreads), Val(Rreduce), Val(Rother),
                                                           Val(UInt64(length(Rother))), Val(grain), Val(shuffle), R, A)
 
     # how many threads do we want?
@@ -260,7 +261,8 @@ function GPUArrays.mapreducedim!(f::F, op::OP, R::WrappedMtlArray{T},
         # we can cover the dimensions to reduce using a single group
         kernel(f, op, init, Val(maxthreads), Val(Rreduce), Val(Rother),
                Val(UInt64(length(Rother))), Val(grain), Val(shuffle), R, A;
-               workgroupsize = threads, numworkgroups = groups)
+            workgroupsize = threads, numworkgroups = groups
+        )
     else
         # temporary empty array whose type will match the final partial array
 	    partial = similar(R, ntuple(_ -> 0, Val(ndims(R)+1)))
@@ -287,7 +289,8 @@ function GPUArrays.mapreducedim!(f::F, op::OP, R::WrappedMtlArray{T},
         partial_kernel(f, op, init, Val(threads), Val(Rreduce),
                         Val(Rother), Val(UInt64(length(Rother))),
                         Val(grain), Val(shuffle), partial, A;
-                        numworkgroups = partial_groups, workgroupsize = partial_threads)
+            numworkgroups = partial_groups, workgroupsize = partial_threads
+        )
 
         GPUArrays.mapreducedim!(identity, op, R, partial; init=init)
     end
diff --git a/test/kernelabstractions.jl b/test/kernelabstractions.jl
index cda5b249..339fcbc8 100644
--- a/test/kernelabstractions.jl
+++ b/test/kernelabstractions.jl
@@ -7,6 +7,6 @@ Testsuite.testsuite(MetalBackend, "Metal", Metal, MtlArray, Metal.MtlDeviceArray
     "Convert",           # depends on https://github.com/JuliaGPU/Metal.jl/issues/69
     "SpecialFunctions",  # gamma and erfc not currently supported on Metal.jl
     "sparse",            # not supported yet
-    "CPU synchronization",
-    "fallback test: callable types",
+            "CPU synchronization",
+            "fallback test: callable types",
 ]))
diff --git a/test/runtests.jl b/test/runtests.jl
index 32b45c8c..14fcfb93 100644
--- a/test/runtests.jl
+++ b/test/runtests.jl
@@ -1,6 +1,6 @@
 @static if VERSION < v"1.11" && get(ENV, "BUILDKITE_PIPELINE_NAME", "Metal.jl") == "Metal.jl"
     using Pkg
-    Pkg.add(url="https://github.com/JuliaGPU/KernelAbstractions.jl", rev="main")
+    Pkg.add(url = "https://github.com/JuliaGPU/KernelAbstractions.jl", rev = "main")
 end
 
 using Metal

@christiangnrd christiangnrd force-pushed the kaintr branch 3 times, most recently from 9ac3d49 to 6314372 Compare October 22, 2025 04:31

@github-actions github-actions Bot left a comment

Copy link
Copy Markdown
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Metal Benchmarks

Details
Benchmark suite Current: e1c67c7 Previous: b8d46ce Ratio
array/accumulate/Float32/1d 575958 ns 515334 ns 1.12
array/accumulate/Float32/dims=1 568584 ns 506708 ns 1.12
array/accumulate/Float32/dims=1L 8862333 ns 8950000 ns 0.99
array/accumulate/Float32/dims=2 626208 ns 538958 ns 1.16
array/accumulate/Float32/dims=2L 3362292 ns 2913312.5 ns 1.15
array/accumulate/Int64/1d 960208 ns 931417 ns 1.03
array/accumulate/Int64/dims=1 1235917 ns 1114292 ns 1.11
array/accumulate/Int64/dims=1L 9714584 ns 9781937 ns 0.99
array/accumulate/Int64/dims=2 1593125 ns 1454062.5 ns 1.10
array/accumulate/Int64/dims=2L 6584729 ns 6944084 ns 0.95
array/broadcast 357417 ns 333167 ns 1.07
array/construct 3333 ns 3333 ns 1
array/permutedims/2d 704083 ns 619917 ns 1.14
array/permutedims/3d 1343625 ns 1099417 ns 1.22
array/permutedims/4d 1515770.5 ns 1231833 ns 1.23
array/private/copy 405188 ns 393104 ns 1.03
array/private/copyto!/cpu_to_gpu 375437.5 ns 368604.5 ns 1.02
array/private/copyto!/gpu_to_cpu 382333 ns 357542 ns 1.07
array/private/copyto!/gpu_to_gpu 349041.5 ns 325791 ns 1.07
array/private/iteration/findall/bool 1195667 ns 1192542 ns 1.00
array/private/iteration/findall/int 1337208 ns 1323145.5 ns 1.01
array/private/iteration/findfirst/bool 1425458 ns 1315083 ns 1.08
array/private/iteration/findfirst/int 1521709 ns 1375917 ns 1.11
array/private/iteration/findmin/1d 1621958 ns 1520792 ns 1.07
array/private/iteration/findmin/2d 1278541 ns 1229812.5 ns 1.04
array/private/iteration/logical 1972917 ns 1990958 ns 0.99
array/private/iteration/scalar 2710938 ns 1747333 ns 1.55
array/random/rand/Float32 613917 ns 603209 ns 1.02
array/random/rand/Int64 632666.5 ns 641812.5 ns 0.99
array/random/rand!/Float32 514209 ns 498917 ns 1.03
array/random/rand!/Int64 491604.5 ns 466875 ns 1.05
array/random/randn/Float32 567875 ns 561042 ns 1.01
array/random/randn!/Float32 476917 ns 454937.5 ns 1.05
array/reductions/mapreduce/Float32/1d 691375 ns 620042 ns 1.12
array/reductions/mapreduce/Float32/dims=1 472250 ns 442167 ns 1.07
array/reductions/mapreduce/Float32/dims=1L 689125 ns 701292 ns 0.98
array/reductions/mapreduce/Float32/dims=2 523417 ns 449542 ns 1.16
array/reductions/mapreduce/Float32/dims=2L 1381625 ns 1001125 ns 1.38
array/reductions/mapreduce/Int64/1d 903812.5 ns 808375 ns 1.12
array/reductions/mapreduce/Int64/dims=1 776750 ns 758375 ns 1.02
array/reductions/mapreduce/Int64/dims=1L 1078708 ns 1102250 ns 0.98
array/reductions/mapreduce/Int64/dims=2 1009166.5 ns 812583 ns 1.24
array/reductions/mapreduce/Int64/dims=2L 2290042 ns 2213417 ns 1.03
array/reductions/reduce/Float32/1d 683084 ns 610625 ns 1.12
array/reductions/reduce/Float32/dims=1 474458 ns 446333.5 ns 1.06
array/reductions/reduce/Float32/dims=1L 691167 ns 702500 ns 0.98
array/reductions/reduce/Float32/dims=2 386167 ns 356917 ns 1.08
array/reductions/reduce/Float32/dims=2L 495834 ns 497708 ns 1.00
array/reductions/reduce/Int64/1d 921812.5 ns 798708 ns 1.15
array/reductions/reduce/Int64/dims=1 795584 ns 760000 ns 1.05
array/reductions/reduce/Int64/dims=1L 1073125 ns 1097542 ns 0.98
array/reductions/reduce/Int64/dims=2 289167 ns 293833.5 ns 0.98
array/reductions/reduce/Int64/dims=2L 687500 ns 689500 ns 1.00
array/shared/copy 152958 ns 158167 ns 0.97
array/shared/copyto!/cpu_to_gpu 39541 ns 39584 ns 1.00
array/shared/copyto!/gpu_to_cpu 39708 ns 39917 ns 0.99
array/shared/copyto!/gpu_to_gpu 40333 ns 40292 ns 1.00
array/shared/iteration/findall/bool 1198584 ns 1199792 ns 1.00
array/shared/iteration/findall/int 1342416 ns 1330625 ns 1.01
array/shared/iteration/findfirst/bool 1152416 ns 994708 ns 1.16
array/shared/iteration/findfirst/int 1310209 ns 1020625 ns 1.28
array/shared/iteration/findmin/1d 1370750 ns 1281750 ns 1.07
array/shared/iteration/findmin/2d 1289417 ns 1231125 ns 1.05
array/shared/iteration/logical 1783083 ns 1796688 ns 0.99
array/shared/iteration/scalar 3484.375 ns 3437.5 ns 1.01
array/sorting/1d 2347709 ns 1993833.5 ns 1.18
array/sorting/2d 8372167 ns 8450062.5 ns 0.99
integration/byval/reference 1137042 ns 1150062.5 ns 0.99
integration/byval/slices=1 1134792 ns 1151750 ns 0.99
integration/byval/slices=2 2047000 ns 2079167 ns 0.98
integration/byval/slices=3 7460562.5 ns 18009271 ns 0.41
integration/metaldevrt 445959 ns 439250 ns 1.02
kernel/indexing 345417 ns 331500 ns 1.04
kernel/indexing_checked 530417 ns 492500 ns 1.08
kernel/launch 1908.3 ns 1954.1 ns 0.98
kernel/rand 516208 ns 497209 ns 1.04
latency/import 2049781375 ns 1812323292 ns 1.13
latency/precompile 44678686833 ns 39925435375 ns 1.12
latency/ttfp 2279228791 ns 2150481396 ns 1.06
metal/synchronization/context 531.46875 ns 534.3264248704663 ns 0.99
metal/synchronization/stream 278.220447284345 ns 274.8666666666667 ns 1.01

This comment was automatically generated by workflow using github-action-benchmark.

@christiangnrd christiangnrd force-pushed the kaintr branch 3 times, most recently from b0fd1b3 to 865af1a Compare February 19, 2026 23:30
@christiangnrd christiangnrd force-pushed the kaintr branch 3 times, most recently from c2c8426 to e52f747 Compare June 4, 2026 18:07
@christiangnrd christiangnrd force-pushed the kaintr branch 6 times, most recently from 9f95616 to fb61557 Compare June 12, 2026 15:19
@christiangnrd christiangnrd force-pushed the kaintr branch 3 times, most recently from 86196fa to 9ee5724 Compare June 17, 2026 18:54
skip scripts tests on 1.10

Project.toml

Better workaround
@christiangnrd christiangnrd force-pushed the kaintr branch 3 times, most recently from 8de4757 to dae5922 Compare July 2, 2026 15:28
@codecov

codecov Bot commented Jul 2, 2026

Copy link
Copy Markdown

Codecov Report

❌ Patch coverage is 96.15385% with 1 line in your changes missing coverage. Please review.
✅ Project coverage is 86.65%. Comparing base (b8d46ce) to head (e1c67c7).

Files with missing lines Patch % Lines
src/MetalKernels.jl 92.85% 1 Missing ⚠️
Additional details and impacted files
@@           Coverage Diff           @@
##             main     #688   +/-   ##
=======================================
  Coverage   86.65%   86.65%           
=======================================
  Files          76       76           
  Lines        5142     5157   +15     
=======================================
+ Hits         4456     4469   +13     
- Misses        686      688    +2     

☔ View full report in Codecov by Harness.
📢 Have feedback on the report? Share it here.

🚀 New features to boost your workflow:
  • ❄️ Test Analytics: Detect flaky tests, report on failures, and find test suite problems.

Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment

Labels

None yet

Projects

None yet

Development

Successfully merging this pull request may close these issues.

1 participant