From 61aff5766283e313b2a14573bb0c41dfb99bfb3f Mon Sep 17 00:00:00 2001 From: Renato Golin Date: Tue, 5 May 2026 16:12:17 +0100 Subject: [PATCH 1/2] Factor CPU matmul into more modular approach Uses previous changes in Pipeline, Descriptor, and the removal of bundles to construct a heterogeneous pipeline with schedules, passes, and descriptor files into a single pipeline. This should serve as the core example on how to compose a complex pipeline using Python, by creating on-the-fly custom schedules while reusing the existing infrastructure (passes, transforms) that we have in Lighthouse. --- examples/cpu/x86/matmul.py | 91 +++++++++++++---------------------- lighthouse/pipeline/driver.py | 4 ++ 2 files changed, 38 insertions(+), 57 deletions(-) diff --git a/examples/cpu/x86/matmul.py b/examples/cpu/x86/matmul.py index ebed973..74db59e 100644 --- a/examples/cpu/x86/matmul.py +++ b/examples/cpu/x86/matmul.py @@ -24,9 +24,9 @@ from lighthouse import dialects as lh_dialects from lighthouse.execution.runner import Runner -from lighthouse.pipeline.driver import TransformDriver +from lighthouse.pipeline.descriptor import Descriptor +from lighthouse.pipeline.driver import PipelineDriver from lighthouse.utils.numpy import numpy_to_mlir_type -from lighthouse.pipeline.helper import apply_registered_pass import lighthouse.utils as lh_utils from lighthouse import schedule as lh_schedule import lighthouse.schedule.x86 as lh_schedule_x86 @@ -65,6 +65,8 @@ def __init__(self, M: int, N: int, K: int, dtype=np.float32, tile_size: int = 32 self.K = K self.dtype = dtype self.tile_size = tile_size + self.mod = ir.Module.create() + self.context = self.mod.context @cached_property def _input_arrays(self) -> tuple[np.ndarray, np.ndarray, np.ndarray]: @@ -88,9 +90,7 @@ def get_complexity(self) -> tuple[int, int, int]: return (flop_count, memory_reads, memory_writes) def payload_module(self) -> ir.Module: - mod = ir.Module.create() - - with ir.InsertionPoint(mod.body): + with ir.InsertionPoint(self.mod.body): mlir_dtype = numpy_to_mlir_type(self.dtype) def tensor_t(shape, dtype=mlir_dtype): @@ -118,47 +118,41 @@ def payload(A, B, C): None, matmul, C, restrict=True, writable=True ) - return mod + return self.mod - def schedule_modules( + def get_pipeline( self, stop_at_stage: Optional[str] = None, parameters: Optional[dict] = None, - ) -> list[ir.Module]: - scheds = [] + ) -> PipelineDriver: + scheds = PipelineDriver(self.context) # Insert performance measurements. - scheds.append(Runner.get_bench_wrapper_schedule(self.payload_function_name)) + scheds.add_transform( + Runner.get_bench_wrapper_schedule(self.payload_function_name) + ) if stop_at_stage == "initial": return scheds # GEMM block packing. # Create cache-friendly access pattern across matmul tiles. - scheds.append( + scheds.add_transform( lh_schedule.block_pack_matmuls( block_factors=[self.tile_size, self.tile_size, self.tile_size], rhs_transpose_outer_block=True, rhs_transpose_inner_block=False, ) ) - scheds.append(lh_schedule_x86.lower_packs_unpacks(self.tile_size)) + scheds.add_transform(lh_schedule_x86.lower_packs_unpacks(self.tile_size)) # Convert to category ops for easier op matching. - with lh_schedule.schedule_boilerplate() as (sched, named_seq): - ops = lh_transform.match_op(named_seq.bodyTarget, "func.func") - transform.apply_registered_pass( - transform.any_op_t(), - ops, + scheds.add_pass( + Descriptor( "linalg-morph-ops", - options={ - "named-to-category": True, - "generic-to-category": True, - }, + opts={"named-to-category": True, "generic-to-category": True}, ) - lh_transform.cleanup(named_seq.bodyTarget) - transform.yield_() - scheds.append(sched) + ) # GEMM cache tiling. # Create memory friendly access pattern. @@ -171,11 +165,11 @@ def schedule_modules( ) transform.yield_() transform.yield_() - scheds.append(sched) + scheds.add_transform(sched) # Fold extra parallel outer unit dims before further tiling to help later # vectorization rewrites to recognize ops. - scheds.append(lh_schedule.linalg_contract_fold_unit_dims()) + scheds.add_transform(lh_schedule.linalg_contract_fold_unit_dims()) # GEMM register tiling. # Ensure that computation can fit into vector registers. @@ -189,7 +183,7 @@ def schedule_modules( reg_peel_loops.append(1) if self.tile_size % reg_tile_m != 0: reg_peel_loops.append(0) - scheds.append( + scheds.add_transform( lh_schedule.tile_ops( gemm_op, tile_sizes=[reg_tile_batch, reg_tile_m, reg_tile_n, reg_tile_k], @@ -209,7 +203,7 @@ def schedule_modules( reg_tile_n // reg_unroll_n, reg_tile_k // reg_unroll_k, ] - scheds.append( + scheds.add_transform( lh_schedule.tile_ops( gemm_op, tile_sizes=[0, reg_unroll_m, reg_unroll_n, reg_unroll_k], @@ -218,15 +212,15 @@ def schedule_modules( ) # Further tiling into hardware-friendly sizes for vectorization. - scheds.append(lh_schedule.tile_ops("linalg.fill", tile_sizes=[1, 1, 1])) - scheds.append(lh_schedule.tile_ops("linalg.generic", tile_sizes=[1, 8])) + scheds.add_transform(lh_schedule.tile_ops("linalg.fill", tile_sizes=[1, 1, 1])) + scheds.add_transform(lh_schedule.tile_ops("linalg.generic", tile_sizes=[1, 8])) if stop_at_stage == "tiled": return scheds # Vectorization. - scheds.append(lh_schedule.vectorize_linalg()) - scheds.append(lh_schedule.hoist_loops()) + scheds.add_transform(lh_schedule.vectorize_linalg()) + scheds.add_transform(lh_schedule.hoist_loops()) with lh_schedule.schedule_boilerplate() as (sched, named_seq): with ir.InsertionPoint( @@ -235,46 +229,31 @@ def schedule_modules( tensor.apply_patterns_tensor_fold_tensor_subset_ops_into_vector_transfers() transform.apply_patterns_canonicalization() transform.yield_() - scheds.append(sched) + scheds.add_transform(sched) # Rewrite vector ops into x86-specific sequences. - scheds.append(lh_schedule.x86_vectorization()) + scheds.add_transform(lh_schedule.x86_vectorization()) # Lower to memrefs. - scheds.append(lh_schedule.bufferize(deallocation_pipeline=True)) + scheds.add_descriptor(Descriptor("bufferization.yaml")) # Apply x86 vectorization again as some patterns require memref abstraction. - scheds.append(lh_schedule.x86_vectorization()) + scheds.add_transform(lh_schedule.x86_vectorization()) # Vectorize any remaining ops. - scheds.append(lh_schedule.vectorize_all()) + scheds.add_transform(lh_schedule.vectorize_all()) # Cleanup vector ops. with lh_schedule.schedule_boilerplate() as (sched, named_seq): lh_transform.flatten_vector_ops(named_seq.bodyTarget) lh_transform.cleanup(named_seq.bodyTarget) transform.yield_() - scheds.append(sched) + scheds.add_transform(sched) if stop_at_stage == "vectorized": return scheds # Lower to LLVM. - with lh_schedule.schedule_boilerplate() as (sched, named_seq): - target = named_seq.bodyTarget - target = apply_registered_pass(target, "convert-linalg-to-loops") - target = apply_registered_pass(target, "fold-memref-alias-ops") - target = apply_registered_pass(target, "expand-strided-metadata") - target = apply_registered_pass(target, "canonicalize") - target = apply_registered_pass(target, "convert-vector-to-scf") - target = apply_registered_pass(target, "lower-affine") - target = apply_registered_pass(target, "convert-scf-to-cf") - target = apply_registered_pass(target, "convert-vector-to-llvm") - target = apply_registered_pass(target, "convert-to-llvm") - target = apply_registered_pass(target, "reconcile-unrealized-casts") - lh_transform.cleanup(target) - - transform.yield_() - scheds.append(sched) + scheds.add_descriptor(Descriptor("llvm-lowering.yaml")) return scheds @@ -363,9 +342,7 @@ def parse_cli(): sys.exit(1) wload = Matmul(*args.sizes, dtype=in_dtype, tile_size=args.tile_size) - pipeline = TransformDriver( - wload.schedule_modules(stop_at_stage=args.dump_kernel) - ) + pipeline = wload.get_pipeline(stop_at_stage=args.dump_kernel) payload = pipeline.apply(wload.payload_module()) if args.dump_kernel or args.dump_schedule: diff --git a/lighthouse/pipeline/driver.py b/lighthouse/pipeline/driver.py index b706571..4fed7c3 100644 --- a/lighthouse/pipeline/driver.py +++ b/lighthouse/pipeline/driver.py @@ -67,6 +67,10 @@ def add_stages(self, stages: list[Descriptor]) -> None: for s in stages: self.add_stage(s) + def add_descriptor(self, stage: Descriptor) -> None: + for s in PipelineDescriptor(stage).get_stages(): + self.add_stage(s) + def apply(self, module: ir.Module, print_after_all: bool = False) -> ir.Module: if module.context != self.context: raise ValueError("Module context does not match driver context.") From e756494fe3850bae63f16bbb43d0bbf5edb05767 Mon Sep 17 00:00:00 2001 From: Renato Golin Date: Fri, 8 May 2026 10:46:24 +0100 Subject: [PATCH 2/2] fix --- examples/cpu/x86/matmul.py | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/examples/cpu/x86/matmul.py b/examples/cpu/x86/matmul.py index 74db59e..7f1c87a 100644 --- a/examples/cpu/x86/matmul.py +++ b/examples/cpu/x86/matmul.py @@ -236,6 +236,7 @@ def get_pipeline( # Lower to memrefs. scheds.add_descriptor(Descriptor("bufferization.yaml")) + scheds.add_descriptor(Descriptor("bufferization_cleanup.yaml")) # Apply x86 vectorization again as some patterns require memref abstraction. scheds.add_transform(lh_schedule.x86_vectorization()) @@ -253,7 +254,7 @@ def get_pipeline( return scheds # Lower to LLVM. - scheds.add_descriptor(Descriptor("llvm-lowering.yaml")) + scheds.add_descriptor(Descriptor("llvm_lowering.yaml")) return scheds