[Do not merge] Test KernelIntrinsics#688
Open
christiangnrd wants to merge 6 commits into
Open
Conversation
Contributor
|
Your PR requires formatting changes to meet the project's style guidelines. Click here to view the suggested changes.diff --git a/src/MetalKernels.jl b/src/MetalKernels.jl
index 4e856194..7573c5e1 100644
--- a/src/MetalKernels.jl
+++ b/src/MetalKernels.jl
@@ -136,26 +136,26 @@ end
KI.argconvert(::MetalBackend, arg) = mtlconvert(arg)
-function KI.kernel_function(::MetalBackend, f::F, tt::TT=Tuple{}; name=nothing, kwargs...) where {F,TT}
+function KI.kernel_function(::MetalBackend, f::F, tt::TT = Tuple{}; name = nothing, kwargs...) where {F, TT}
kern = mtlfunction(f, tt; name, kwargs...)
- KI.Kernel{MetalBackend, typeof(kern)}(MetalBackend(), kern)
+ return KI.Kernel{MetalBackend, typeof(kern)}(MetalBackend(), kern)
end
-function (obj::KI.Kernel{MetalBackend})(args...; numworkgroups=1, workgroupsize=1)
+function (obj::KI.Kernel{MetalBackend})(args...; numworkgroups = 1, workgroupsize = 1)
KI.check_launch_args(numworkgroups, workgroupsize)
- obj.kern(args...; threads=workgroupsize, groups=numworkgroups)
+ return obj.kern(args...; threads = workgroupsize, groups = numworkgroups)
end
-function KI.kernel_max_work_group_size(kikern::KI.Kernel{<:MetalBackend}; max_work_items::Int=typemax(Int))::Int
- Int(min(kikern.kern.pipeline.maxTotalThreadsPerThreadgroup, max_work_items))
+function KI.kernel_max_work_group_size(kikern::KI.Kernel{<:MetalBackend}; max_work_items::Int = typemax(Int))::Int
+ return Int(min(kikern.kern.pipeline.maxTotalThreadsPerThreadgroup, max_work_items))
end
function KI.max_work_group_size(::MetalBackend)::Int
- Int(device().maxThreadsPerThreadgroup.width)
+ return Int(device().maxThreadsPerThreadgroup.width)
end
function KI.multiprocessor_count(::MetalBackend)::Int
- Metal.num_gpu_cores()
+ return Metal.num_gpu_cores()
end
diff --git a/src/broadcast.jl b/src/broadcast.jl
index 5d107ec2..3455fad2 100644
--- a/src/broadcast.jl
+++ b/src/broadcast.jl
@@ -66,9 +66,9 @@ end
if _broadcast_shapes[Is] > BROADCAST_SPECIALIZATION_THRESHOLD
## COV_EXCL_START
function broadcast_cartesian_static(dest, bc, Is)
- i = KI.get_global_id().x
- stride = KI.get_global_size().x
- while 1 <= i <= length(dest)
+ i = KI.get_global_id().x
+ stride = KI.get_global_size().x
+ while 1 <= i <= length(dest)
I = @inbounds Is[i]
@inbounds dest[I] = bc[I]
i += stride
@@ -91,13 +91,13 @@ end
(isa(IndexStyle(dest), IndexLinear) && isa(IndexStyle(bc), IndexLinear))
## COV_EXCL_START
function broadcast_linear(dest, bc)
- i = KI.get_global_id().x
- stride = KI.get_global_size().x
- while 1 <= i <= length(dest)
- @inbounds dest[i] = bc[i]
- i += stride
- end
- return
+ i = KI.get_global_id().x
+ stride = KI.get_global_size().x
+ while 1 <= i <= length(dest)
+ @inbounds dest[i] = bc[i]
+ i += stride
+ end
+ return
end
## COV_EXCL_STOP
@@ -168,9 +168,9 @@ end
else
## COV_EXCL_START
function broadcast_cartesian(dest, bc)
- i = KI.get_global_id().x
- stride = KI.get_global_size().x
- while 1 <= i <= length(dest)
+ i = KI.get_global_id().x
+ stride = KI.get_global_size().x
+ while 1 <= i <= length(dest)
I = @inbounds CartesianIndices(dest)[i]
@inbounds dest[I] = bc[I]
i += stride
diff --git a/src/device/random.jl b/src/device/random.jl
index 12b053a2..edc999cd 100644
--- a/src/device/random.jl
+++ b/src/device/random.jl
@@ -89,8 +89,8 @@ end
@inbounds global_random_counters()[simdgroupId]
elseif field === :ctr2
globalId = KI.get_global_id().x +
- (KI.get_global_id().y - 1i32) * KI.get_global_size().x +
- (KI.get_global_id().z - 1i32) * KI.get_global_size().x * KI.get_global_size().y
+ (KI.get_global_id().y - 1i32) * KI.get_global_size().x +
+ (KI.get_global_id().z - 1i32) * KI.get_global_size().x * KI.get_global_size().y
globalId % UInt32
end::UInt32
end
diff --git a/src/mapreduce.jl b/src/mapreduce.jl
index 7be5ef43..a737e8d0 100644
--- a/src/mapreduce.jl
+++ b/src/mapreduce.jl
@@ -224,7 +224,8 @@ function GPUArrays.mapreducedim!(f::F, op::OP, R::WrappedMtlArray{T},
# we might not be able to launch all those threads to reduce each slice in one go.
# that's why each threads also loops across their inputs, processing multiple values
# so that we can span the entire reduction dimension using a single item group.
- kernel = KI.@kernel backend launch = false partial_mapreduce_device(f, op, init, Val(maxthreads), Val(Rreduce), Val(Rother),
+ kernel = KI.@kernel backend launch = false partial_mapreduce_device(
+ f, op, init, Val(maxthreads), Val(Rreduce), Val(Rother),
Val(UInt64(length(Rother))), Val(grain), Val(shuffle), R, A)
# how many threads do we want?
@@ -260,7 +261,8 @@ function GPUArrays.mapreducedim!(f::F, op::OP, R::WrappedMtlArray{T},
# we can cover the dimensions to reduce using a single group
kernel(f, op, init, Val(maxthreads), Val(Rreduce), Val(Rother),
Val(UInt64(length(Rother))), Val(grain), Val(shuffle), R, A;
- workgroupsize = threads, numworkgroups = groups)
+ workgroupsize = threads, numworkgroups = groups
+ )
else
# temporary empty array whose type will match the final partial array
partial = similar(R, ntuple(_ -> 0, Val(ndims(R)+1)))
@@ -287,7 +289,8 @@ function GPUArrays.mapreducedim!(f::F, op::OP, R::WrappedMtlArray{T},
partial_kernel(f, op, init, Val(threads), Val(Rreduce),
Val(Rother), Val(UInt64(length(Rother))),
Val(grain), Val(shuffle), partial, A;
- numworkgroups = partial_groups, workgroupsize = partial_threads)
+ numworkgroups = partial_groups, workgroupsize = partial_threads
+ )
GPUArrays.mapreducedim!(identity, op, R, partial; init=init)
end
diff --git a/test/kernelabstractions.jl b/test/kernelabstractions.jl
index cda5b249..339fcbc8 100644
--- a/test/kernelabstractions.jl
+++ b/test/kernelabstractions.jl
@@ -7,6 +7,6 @@ Testsuite.testsuite(MetalBackend, "Metal", Metal, MtlArray, Metal.MtlDeviceArray
"Convert", # depends on https://github.com/JuliaGPU/Metal.jl/issues/69
"SpecialFunctions", # gamma and erfc not currently supported on Metal.jl
"sparse", # not supported yet
- "CPU synchronization",
- "fallback test: callable types",
+ "CPU synchronization",
+ "fallback test: callable types",
]))
diff --git a/test/runtests.jl b/test/runtests.jl
index 32b45c8c..14fcfb93 100644
--- a/test/runtests.jl
+++ b/test/runtests.jl
@@ -1,6 +1,6 @@
@static if VERSION < v"1.11" && get(ENV, "BUILDKITE_PIPELINE_NAME", "Metal.jl") == "Metal.jl"
using Pkg
- Pkg.add(url="https://github.com/JuliaGPU/KernelAbstractions.jl", rev="main")
+ Pkg.add(url = "https://github.com/JuliaGPU/KernelAbstractions.jl", rev = "main")
end
using Metal |
9ac3d49 to
6314372
Compare
Contributor
There was a problem hiding this comment.
Metal Benchmarks
Details
| Benchmark suite | Current: e1c67c7 | Previous: b8d46ce | Ratio |
|---|---|---|---|
array/accumulate/Float32/1d |
575958 ns |
515334 ns |
1.12 |
array/accumulate/Float32/dims=1 |
568584 ns |
506708 ns |
1.12 |
array/accumulate/Float32/dims=1L |
8862333 ns |
8950000 ns |
0.99 |
array/accumulate/Float32/dims=2 |
626208 ns |
538958 ns |
1.16 |
array/accumulate/Float32/dims=2L |
3362292 ns |
2913312.5 ns |
1.15 |
array/accumulate/Int64/1d |
960208 ns |
931417 ns |
1.03 |
array/accumulate/Int64/dims=1 |
1235917 ns |
1114292 ns |
1.11 |
array/accumulate/Int64/dims=1L |
9714584 ns |
9781937 ns |
0.99 |
array/accumulate/Int64/dims=2 |
1593125 ns |
1454062.5 ns |
1.10 |
array/accumulate/Int64/dims=2L |
6584729 ns |
6944084 ns |
0.95 |
array/broadcast |
357417 ns |
333167 ns |
1.07 |
array/construct |
3333 ns |
3333 ns |
1 |
array/permutedims/2d |
704083 ns |
619917 ns |
1.14 |
array/permutedims/3d |
1343625 ns |
1099417 ns |
1.22 |
array/permutedims/4d |
1515770.5 ns |
1231833 ns |
1.23 |
array/private/copy |
405188 ns |
393104 ns |
1.03 |
array/private/copyto!/cpu_to_gpu |
375437.5 ns |
368604.5 ns |
1.02 |
array/private/copyto!/gpu_to_cpu |
382333 ns |
357542 ns |
1.07 |
array/private/copyto!/gpu_to_gpu |
349041.5 ns |
325791 ns |
1.07 |
array/private/iteration/findall/bool |
1195667 ns |
1192542 ns |
1.00 |
array/private/iteration/findall/int |
1337208 ns |
1323145.5 ns |
1.01 |
array/private/iteration/findfirst/bool |
1425458 ns |
1315083 ns |
1.08 |
array/private/iteration/findfirst/int |
1521709 ns |
1375917 ns |
1.11 |
array/private/iteration/findmin/1d |
1621958 ns |
1520792 ns |
1.07 |
array/private/iteration/findmin/2d |
1278541 ns |
1229812.5 ns |
1.04 |
array/private/iteration/logical |
1972917 ns |
1990958 ns |
0.99 |
array/private/iteration/scalar |
2710938 ns |
1747333 ns |
1.55 |
array/random/rand/Float32 |
613917 ns |
603209 ns |
1.02 |
array/random/rand/Int64 |
632666.5 ns |
641812.5 ns |
0.99 |
array/random/rand!/Float32 |
514209 ns |
498917 ns |
1.03 |
array/random/rand!/Int64 |
491604.5 ns |
466875 ns |
1.05 |
array/random/randn/Float32 |
567875 ns |
561042 ns |
1.01 |
array/random/randn!/Float32 |
476917 ns |
454937.5 ns |
1.05 |
array/reductions/mapreduce/Float32/1d |
691375 ns |
620042 ns |
1.12 |
array/reductions/mapreduce/Float32/dims=1 |
472250 ns |
442167 ns |
1.07 |
array/reductions/mapreduce/Float32/dims=1L |
689125 ns |
701292 ns |
0.98 |
array/reductions/mapreduce/Float32/dims=2 |
523417 ns |
449542 ns |
1.16 |
array/reductions/mapreduce/Float32/dims=2L |
1381625 ns |
1001125 ns |
1.38 |
array/reductions/mapreduce/Int64/1d |
903812.5 ns |
808375 ns |
1.12 |
array/reductions/mapreduce/Int64/dims=1 |
776750 ns |
758375 ns |
1.02 |
array/reductions/mapreduce/Int64/dims=1L |
1078708 ns |
1102250 ns |
0.98 |
array/reductions/mapreduce/Int64/dims=2 |
1009166.5 ns |
812583 ns |
1.24 |
array/reductions/mapreduce/Int64/dims=2L |
2290042 ns |
2213417 ns |
1.03 |
array/reductions/reduce/Float32/1d |
683084 ns |
610625 ns |
1.12 |
array/reductions/reduce/Float32/dims=1 |
474458 ns |
446333.5 ns |
1.06 |
array/reductions/reduce/Float32/dims=1L |
691167 ns |
702500 ns |
0.98 |
array/reductions/reduce/Float32/dims=2 |
386167 ns |
356917 ns |
1.08 |
array/reductions/reduce/Float32/dims=2L |
495834 ns |
497708 ns |
1.00 |
array/reductions/reduce/Int64/1d |
921812.5 ns |
798708 ns |
1.15 |
array/reductions/reduce/Int64/dims=1 |
795584 ns |
760000 ns |
1.05 |
array/reductions/reduce/Int64/dims=1L |
1073125 ns |
1097542 ns |
0.98 |
array/reductions/reduce/Int64/dims=2 |
289167 ns |
293833.5 ns |
0.98 |
array/reductions/reduce/Int64/dims=2L |
687500 ns |
689500 ns |
1.00 |
array/shared/copy |
152958 ns |
158167 ns |
0.97 |
array/shared/copyto!/cpu_to_gpu |
39541 ns |
39584 ns |
1.00 |
array/shared/copyto!/gpu_to_cpu |
39708 ns |
39917 ns |
0.99 |
array/shared/copyto!/gpu_to_gpu |
40333 ns |
40292 ns |
1.00 |
array/shared/iteration/findall/bool |
1198584 ns |
1199792 ns |
1.00 |
array/shared/iteration/findall/int |
1342416 ns |
1330625 ns |
1.01 |
array/shared/iteration/findfirst/bool |
1152416 ns |
994708 ns |
1.16 |
array/shared/iteration/findfirst/int |
1310209 ns |
1020625 ns |
1.28 |
array/shared/iteration/findmin/1d |
1370750 ns |
1281750 ns |
1.07 |
array/shared/iteration/findmin/2d |
1289417 ns |
1231125 ns |
1.05 |
array/shared/iteration/logical |
1783083 ns |
1796688 ns |
0.99 |
array/shared/iteration/scalar |
3484.375 ns |
3437.5 ns |
1.01 |
array/sorting/1d |
2347709 ns |
1993833.5 ns |
1.18 |
array/sorting/2d |
8372167 ns |
8450062.5 ns |
0.99 |
integration/byval/reference |
1137042 ns |
1150062.5 ns |
0.99 |
integration/byval/slices=1 |
1134792 ns |
1151750 ns |
0.99 |
integration/byval/slices=2 |
2047000 ns |
2079167 ns |
0.98 |
integration/byval/slices=3 |
7460562.5 ns |
18009271 ns |
0.41 |
integration/metaldevrt |
445959 ns |
439250 ns |
1.02 |
kernel/indexing |
345417 ns |
331500 ns |
1.04 |
kernel/indexing_checked |
530417 ns |
492500 ns |
1.08 |
kernel/launch |
1908.3 ns |
1954.1 ns |
0.98 |
kernel/rand |
516208 ns |
497209 ns |
1.04 |
latency/import |
2049781375 ns |
1812323292 ns |
1.13 |
latency/precompile |
44678686833 ns |
39925435375 ns |
1.12 |
latency/ttfp |
2279228791 ns |
2150481396 ns |
1.06 |
metal/synchronization/context |
531.46875 ns |
534.3264248704663 ns |
0.99 |
metal/synchronization/stream |
278.220447284345 ns |
274.8666666666667 ns |
1.01 |
This comment was automatically generated by workflow using github-action-benchmark.
22e754e to
68db9c2
Compare
2b8dce1 to
0e76668
Compare
db9a7dc to
c802ccc
Compare
4b8f026 to
ce67b4c
Compare
ce67b4c to
03bb0dd
Compare
415079d to
90e4fb2
Compare
90e4fb2 to
ee7543a
Compare
b0fd1b3 to
865af1a
Compare
c2c8426 to
e52f747
Compare
9f95616 to
fb61557
Compare
86196fa to
9ee5724
Compare
8de4757 to
dae5922
Compare
Codecov Report❌ Patch coverage is
Additional details and impacted files@@ Coverage Diff @@
## main #688 +/- ##
=======================================
Coverage 86.65% 86.65%
=======================================
Files 76 76
Lines 5142 5157 +15
=======================================
+ Hits 4456 4469 +13
- Misses 686 688 +2 ☔ View full report in Codecov by Harness. 🚀 New features to boost your workflow:
|
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment
Add this suggestion to a batch that can be applied as a single commit.This suggestion is invalid because no changes were made to the code.Suggestions cannot be applied while the pull request is closed.Suggestions cannot be applied while viewing a subset of changes.Only one suggestion per line can be applied in a batch.Add this suggestion to a batch that can be applied as a single commit.Applying suggestions on deleted lines is not supported.You must change the existing code in this line in order to create a valid suggestion.Outdated suggestions cannot be applied.This suggestion has been applied or marked resolved.Suggestions cannot be applied from pending reviews.Suggestions cannot be applied on multi-line comments.Suggestions cannot be applied while the pull request is queued to merge.Suggestion cannot be applied right now. Please check back later.
Not a draft to also run benchmarks