From ac49ac67e988f7ca6527db42babff0497190b1f5 Mon Sep 17 00:00:00 2001 From: "sm.wu" Date: Sat, 25 Apr 2026 23:18:37 +0800 Subject: [PATCH 01/25] refactor GPU compact tower witness flow --- Cargo.lock | 124 ++++++- Cargo.toml | 62 ++-- ceno_zkvm/src/bin/e2e.rs | 13 +- .../src/instructions/gpu/chips/keccak.rs | 24 +- .../src/instructions/gpu/chips/shard_ram.rs | 8 +- ceno_zkvm/src/instructions/gpu/dispatch.rs | 4 +- ceno_zkvm/src/instructions/gpu/utils/d2h.rs | 4 +- ceno_zkvm/src/scheme/gpu/memory.rs | 86 ++++- ceno_zkvm/src/scheme/gpu/mod.rs | 330 ++++++++++-------- ceno_zkvm/src/scheme/prover.rs | 15 +- ceno_zkvm/src/scheme/utils.rs | 8 +- ceno_zkvm/src/scheme/verifier.rs | 8 + gkr_iop/src/gkr/layer/gpu/utils.rs | 13 +- gkr_iop/src/gpu/mod.rs | 20 +- gkr_iop/src/utils.rs | 9 +- 15 files changed, 456 insertions(+), 272 deletions(-) diff --git a/Cargo.lock b/Cargo.lock index a83e37c45..ba90fc0e6 100644 --- a/Cargo.lock +++ b/Cargo.lock @@ -1600,10 +1600,49 @@ version = "0.0.6" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "e2931af7e13dc045d8e9d26afccc6fa115d64e115c9c84b1166288b46f6782c2" +[[package]] +name = "cuda-config" +version = "0.1.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "4ee74643f7430213a1a78320f88649de309b20b80818325575e393f848f79f5d" +dependencies = [ + "glob", +] + +[[package]] +name = "cuda-runtime-sys" +version = "0.3.0-alpha.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "5d070b301187fee3c611e75a425cf12247b7c75c09729dbdef95cb9cb64e8c39" +dependencies = [ + "cuda-config", +] + [[package]] name = "cuda_hal" version = "0.1.0" -source = "git+https://github.com/scroll-tech/ceno-gpu-mock.git?branch=main#fe8f7923b7d3a3823c27949fab0aab8e31011aa9" +dependencies = [ + "anyhow", + "cuda-runtime-sys", + "cudarc", + "downcast-rs", + "either", + "ff_ext", + "itertools 0.13.0", + "mpcs", + "multilinear_extensions", + "p3", + "rand 0.8.5", + "rayon", + "sha2", + 
"sppark", + "sppark_plug", + "sumcheck", + "thiserror 1.0.69", + "tracing", + "transcript", + "witness", +] [[package]] name = "cudarc" @@ -2237,7 +2276,6 @@ dependencies = [ [[package]] name = "ff_ext" version = "0.1.0" -source = "git+https://github.com/scroll-tech/gkr-backend.git?tag=v1.0.0-alpha.24#a3538e3529a7eb87e8867f4a87b760d7ad9991f7" dependencies = [ "once_cell", "p3", @@ -2671,6 +2709,15 @@ dependencies = [ "digest 0.10.7", ] +[[package]] +name = "home" +version = "0.5.12" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "cc627f471c528ff0c4a49e1d5e60450c8f6461dd6d10ba9dcd3a61d3dff7728d" +dependencies = [ + "windows-sys 0.61.1", +] + [[package]] name = "iana-time-zone" version = "0.1.64" @@ -3102,6 +3149,12 @@ dependencies = [ "vcpkg", ] +[[package]] +name = "linux-raw-sys" +version = "0.4.15" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "d26c52dbd32dccf2d10cac7725f8eae5296885fb5703b261f7d0a0739ec807ab" + [[package]] name = "linux-raw-sys" version = "0.9.4" @@ -3243,7 +3296,6 @@ dependencies = [ [[package]] name = "mpcs" version = "0.1.0" -source = "git+https://github.com/scroll-tech/gkr-backend.git?tag=v1.0.0-alpha.24#a3538e3529a7eb87e8867f4a87b760d7ad9991f7" dependencies = [ "bincode 1.3.3", "clap", @@ -3267,7 +3319,6 @@ dependencies = [ [[package]] name = "multilinear_extensions" version = "0.1.0" -source = "git+https://github.com/scroll-tech/gkr-backend.git?tag=v1.0.0-alpha.24#a3538e3529a7eb87e8867f4a87b760d7ad9991f7" dependencies = [ "either", "ff_ext", @@ -4558,7 +4609,6 @@ dependencies = [ [[package]] name = "p3" version = "0.1.0" -source = "git+https://github.com/scroll-tech/gkr-backend.git?tag=v1.0.0-alpha.24#a3538e3529a7eb87e8867f4a87b760d7ad9991f7" dependencies = [ "p3-air", "p3-baby-bear", @@ -5126,7 +5176,6 @@ dependencies = [ [[package]] name = "poseidon" version = "0.1.0" -source = 
"git+https://github.com/scroll-tech/gkr-backend.git?tag=v1.0.0-alpha.24#a3538e3529a7eb87e8867f4a87b760d7ad9991f7" dependencies = [ "ff_ext", "p3", @@ -5724,6 +5773,19 @@ dependencies = [ "semver 1.0.26", ] +[[package]] +name = "rustix" +version = "0.38.44" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "fdb5bc1ae2baa591800df16c9ca78619bf65c0488b41b96ccec5d11220d8c154" +dependencies = [ + "bitflags", + "errno", + "libc", + "linux-raw-sys 0.4.15", + "windows-sys 0.59.0", +] + [[package]] name = "rustix" version = "1.0.7" @@ -5733,7 +5795,7 @@ dependencies = [ "bitflags", "errno", "libc", - "linux-raw-sys", + "linux-raw-sys 0.9.4", "windows-sys 0.59.0", ] @@ -6083,7 +6145,6 @@ dependencies = [ [[package]] name = "sp1-curves" version = "0.1.0" -source = "git+https://github.com/scroll-tech/gkr-backend.git?tag=v1.0.0-alpha.24#a3538e3529a7eb87e8867f4a87b760d7ad9991f7" dependencies = [ "cfg-if", "dashu", @@ -6118,6 +6179,25 @@ dependencies = [ "der", ] +[[package]] +name = "sppark" +version = "0.1.11" +dependencies = [ + "cc", + "which", +] + +[[package]] +name = "sppark_plug" +version = "0.1.0" +dependencies = [ + "cc", + "ff_ext", + "itertools 0.13.0", + "p3", + "sppark", +] + [[package]] name = "stable_deref_trait" version = "1.2.0" @@ -6208,7 +6288,6 @@ checksum = "13c2bddecc57b384dee18652358fb23172facb8a2c51ccc10d74c157bdea3292" [[package]] name = "sumcheck" version = "0.1.0" -source = "git+https://github.com/scroll-tech/gkr-backend.git?tag=v1.0.0-alpha.24#a3538e3529a7eb87e8867f4a87b760d7ad9991f7" dependencies = [ "either", "ff_ext", @@ -6226,7 +6305,6 @@ dependencies = [ [[package]] name = "sumcheck_macro" version = "0.1.0" -source = "git+https://github.com/scroll-tech/gkr-backend.git?tag=v1.0.0-alpha.24#a3538e3529a7eb87e8867f4a87b760d7ad9991f7" dependencies = [ "itertools 0.13.0", "p3", @@ -6307,7 +6385,7 @@ dependencies = [ "fastrand", "getrandom 0.3.2", "once_cell", - "rustix", + "rustix 1.0.7", "windows-sys 0.59.0", ] @@ -6633,7 
+6711,6 @@ dependencies = [ [[package]] name = "transcript" version = "0.1.0" -source = "git+https://github.com/scroll-tech/gkr-backend.git?tag=v1.0.0-alpha.24#a3538e3529a7eb87e8867f4a87b760d7ad9991f7" dependencies = [ "ff_ext", "itertools 0.13.0", @@ -6924,10 +7001,21 @@ dependencies = [ "wasm-bindgen", ] +[[package]] +name = "which" +version = "4.4.2" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "87ba24419a2078cd2b0f2ede2691b6c66d8e47836da3b6db8265ebad47afbfc7" +dependencies = [ + "either", + "home", + "once_cell", + "rustix 0.38.44", +] + [[package]] name = "whir" version = "0.1.0" -source = "git+https://github.com/scroll-tech/gkr-backend.git?tag=v1.0.0-alpha.24#a3538e3529a7eb87e8867f4a87b760d7ad9991f7" dependencies = [ "bincode 1.3.3", "clap", @@ -7055,6 +7143,15 @@ dependencies = [ "windows-targets 0.53.4", ] +[[package]] +name = "windows-sys" +version = "0.61.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "6f109e41dd4a3c848907eb83d5a42ea98b3769495597450cf6d153507b166f0f" +dependencies = [ + "windows-link", +] + [[package]] name = "windows-targets" version = "0.52.6" @@ -7214,7 +7311,6 @@ dependencies = [ [[package]] name = "witness" version = "0.1.0" -source = "git+https://github.com/scroll-tech/gkr-backend.git?tag=v1.0.0-alpha.24#a3538e3529a7eb87e8867f4a87b760d7ad9991f7" dependencies = [ "ff_ext", "multilinear_extensions", diff --git a/Cargo.toml b/Cargo.toml index 8cc5823a5..8b79e59fe 100644 --- a/Cargo.toml +++ b/Cargo.toml @@ -1,16 +1,16 @@ [workspace] members = [ - "ceno_cli", - "ceno_emul", - "ceno_host", - "ceno_serde", - "ceno_rt", - "ceno_zkvm", - "ceno_recursion", - "derive", - "examples-builder", - "examples", - "guest_libs/*", + "ceno_cli", + "ceno_emul", + "ceno_host", + "ceno_serde", + "ceno_rt", + "ceno_zkvm", + "ceno_recursion", + "derive", + "examples-builder", + "examples", + "guest_libs/*", ] resolver = "2" @@ -66,11 +66,11 @@ secp = "0.4.1" serde = { version = "1.0", 
features = ["derive", "rc"] } serde_json = "1.0" smallvec = { version = "1.13.2", features = [ - "const_generics", - "const_new", - "serde", - "union", - "write", + "const_generics", + "const_new", + "serde", + "union", + "write", ] } strum = "0.26" strum_macros = "0.26" @@ -79,7 +79,7 @@ thiserror = "2" thread_local = "1.1" tiny-keccak = { version = "2.0.2", features = ["keccak"] } tracing = { version = "0.1", features = [ - "attributes", + "attributes", ] } tracing-forest = { version = "0.1.6" } tracing-subscriber = { version = "0.3", features = ["env-filter"] } @@ -127,20 +127,20 @@ lto = "thin" #ceno_crypto_primitives = { path = "../ceno-patch/crypto-primitives", package = "ceno_crypto_primitives" } #ceno_syscall = { path = "../ceno-patch/syscall", package = "ceno_syscall" } -#[patch."https://github.com/scroll-tech/ceno-gpu-mock.git"] -#ceno_gpu = { path = "../ceno-gpu/cuda_hal", package = "cuda_hal", default-features = false, features = ["bb31"] } - -#[patch."https://github.com/scroll-tech/gkr-backend"] -#ff_ext = { path = "../gkr-backend/crates/ff_ext", package = "ff_ext" } -#mpcs = { path = "../gkr-backend/crates/mpcs", package = "mpcs" } -#multilinear_extensions = { path = "../gkr-backend/crates/multilinear_extensions", package = "multilinear_extensions" } -#p3 = { path = "../gkr-backend/crates/p3", package = "p3" } -#poseidon = { path = "../gkr-backend/crates/poseidon", package = "poseidon" } -#sp1-curves = { path = "../gkr-backend/crates/curves", package = "sp1-curves" } -#sumcheck = { path = "../gkr-backend/crates/sumcheck", package = "sumcheck" } -#transcript = { path = "../gkr-backend/crates/transcript", package = "transcript" } -#whir = { path = "../gkr-backend/crates/whir", package = "whir" } -#witness = { path = "../gkr-backend/crates/witness", package = "witness" } +[patch."https://github.com/scroll-tech/ceno-gpu-mock.git"] +ceno_gpu = { path = "../ceno-gpu/cuda_hal", package = "cuda_hal", default-features = false, features = ["bb31"] } + 
+[patch."https://github.com/scroll-tech/gkr-backend"] +ff_ext = { path = "../gkr-backend/crates/ff_ext", package = "ff_ext" } +mpcs = { path = "../gkr-backend/crates/mpcs", package = "mpcs" } +multilinear_extensions = { path = "../gkr-backend/crates/multilinear_extensions", package = "multilinear_extensions" } +p3 = { path = "../gkr-backend/crates/p3", package = "p3" } +poseidon = { path = "../gkr-backend/crates/poseidon", package = "poseidon" } +sp1-curves = { path = "../gkr-backend/crates/curves", package = "sp1-curves" } +sumcheck = { path = "../gkr-backend/crates/sumcheck", package = "sumcheck" } +transcript = { path = "../gkr-backend/crates/transcript", package = "transcript" } +whir = { path = "../gkr-backend/crates/whir", package = "whir" } +witness = { path = "../gkr-backend/crates/witness", package = "witness" } # [patch."https://github.com/scroll-tech/openvm.git"] # openvm = { path = "../openvm-scroll-tech/crates/toolchain/openvm", default-features = false } diff --git a/ceno_zkvm/src/bin/e2e.rs b/ceno_zkvm/src/bin/e2e.rs index 95b15b581..721708389 100644 --- a/ceno_zkvm/src/bin/e2e.rs +++ b/ceno_zkvm/src/bin/e2e.rs @@ -352,17 +352,10 @@ fn run_inner< fs::write(&vk_file, vk_bytes).unwrap(); if checkpoint > Checkpoint::PrepVerify { + // `run_e2e_with_checkpoint` already performs the real verification for the + // complete flow. Re-running it here without the emulation exit code causes + // a false "Unfinished execution" error to be logged. 
let verifier = ZKVMVerifier::new(vk); - if target_shard_id.is_some() { - run_e2e_single_shard_debug_verify( - &verifier, - zkvm_proofs.first().cloned().expect("missing shard proof"), - None, - max_steps, - ); - } else { - run_e2e_full_trace_verify(&verifier, zkvm_proofs.clone(), None, max_steps); - } soundness_test(zkvm_proofs.first().cloned().unwrap(), &verifier); } } diff --git a/ceno_zkvm/src/instructions/gpu/chips/keccak.rs b/ceno_zkvm/src/instructions/gpu/chips/keccak.rs index 565e0dffa..4dc1bf289 100644 --- a/ceno_zkvm/src/instructions/gpu/chips/keccak.rs +++ b/ceno_zkvm/src/instructions/gpu/chips/keccak.rs @@ -348,8 +348,7 @@ fn replay_keccak_witness_only_from_packed( ) -> Result, ZKVMError> { use crate::precompiles::KECCAK_ROUNDS_CEIL_LOG2; - let num_padded_instances = num_instances.next_power_of_two().max(2); - let num_padded_rows = num_padded_instances * 32; + let num_rows = num_instances * 32; let rotation = KECCAK_ROUNDS_CEIL_LOG2; let col_map = info_span!("col_map").in_scope(|| extract_keccak_column_map(config, num_witin)); @@ -358,7 +357,7 @@ fn replay_keccak_witness_only_from_packed( .witgen_keccak( &col_map, packed_instances, - num_padded_rows, + num_rows, shard_offset, fetch_base_pc, fetch_num_slots, @@ -372,9 +371,10 @@ fn replay_keccak_witness_only_from_packed( let raw_witin = if crate::instructions::gpu::config::is_debug_compare_enabled() || !should_materialize_witness_on_gpu() { - info_span!("transpose_d2h", rows = num_padded_rows, cols = num_witin).in_scope(|| { + let produced_rows = gpu_result.witness.num_rows; + info_span!("transpose_d2h", rows = produced_rows, cols = num_witin).in_scope(|| { let mut rmm_buffer = hal - .alloc_elems_on_device(num_padded_rows * num_witin, false, None) + .alloc_elems_on_device(produced_rows * num_witin, false, None) .map_err(|e| { ZKVMError::InvalidWitness(format!("GPU alloc for transpose failed: {e}").into()) })?; @@ -382,7 +382,7 @@ fn replay_keccak_witness_only_from_packed( &hal.inner, &mut rmm_buffer, 
&gpu_result.witness.device_buffer, - num_padded_rows, + produced_rows, num_witin, ) .map_err(|e| ZKVMError::InvalidWitness(format!("GPU transpose failed: {e}").into()))?; @@ -445,8 +445,7 @@ fn gpu_assign_keccak_inner( use crate::precompiles::KECCAK_ROUNDS_CEIL_LOG2; let num_instances = step_indices.len(); - let num_padded_instances = num_instances.next_power_of_two().max(2); - let num_padded_rows = num_padded_instances * 32; // 2^5 = 32 rows per instance + let num_rows = num_instances * 32; // 2^5 = 32 rows per instance let rotation = KECCAK_ROUNDS_CEIL_LOG2; // = 5 let materialize_initial_witness = crate::instructions::gpu::config::is_debug_compare_enabled() || should_materialize_witness_on_initial_assign(); @@ -479,7 +478,7 @@ fn gpu_assign_keccak_inner( .witgen_keccak( &col_map, &packed_instances, - num_padded_rows, + num_rows, shard_ctx.current_shard_offset_cycle(), fetch_base_pc, fetch_num_slots, @@ -565,9 +564,10 @@ fn gpu_assign_keccak_inner( } else if crate::instructions::gpu::config::is_debug_compare_enabled() || !should_materialize_witness_on_gpu() { - info_span!("transpose_d2h", rows = num_padded_rows, cols = num_witin).in_scope(|| { + let produced_rows = gpu_result.witness.num_rows; + info_span!("transpose_d2h", rows = produced_rows, cols = num_witin).in_scope(|| { let mut rmm_buffer = hal - .alloc_elems_on_device(num_padded_rows * num_witin, false, None) + .alloc_elems_on_device(produced_rows * num_witin, false, None) .map_err(|e| { ZKVMError::InvalidWitness(format!("GPU alloc for transpose failed: {e}").into()) })?; @@ -575,7 +575,7 @@ fn gpu_assign_keccak_inner( &hal.inner, &mut rmm_buffer, &gpu_result.witness.device_buffer, - num_padded_rows, + produced_rows, num_witin, ) .map_err(|e| ZKVMError::InvalidWitness(format!("GPU transpose failed: {e}").into()))?; diff --git a/ceno_zkvm/src/instructions/gpu/chips/shard_ram.rs b/ceno_zkvm/src/instructions/gpu/chips/shard_ram.rs index 21f1f89a0..0813449e1 100644 --- 
a/ceno_zkvm/src/instructions/gpu/chips/shard_ram.rs +++ b/ceno_zkvm/src/instructions/gpu/chips/shard_ram.rs @@ -439,11 +439,11 @@ pub(crate) fn try_gpu_assign_shard_ram( { let struct_data = tracing::info_span!( "gpu_shard_ram_structural_transpose_d2h", - num_rows_padded, + rows = gpu_structural.num_rows, num_structural_witin, ) .in_scope(|| -> Result<_, ZKVMError> { - let wit_num_rows = num_rows_padded; + let wit_num_rows = gpu_structural.num_rows; let struct_num_cols = num_structural_witin; let mut struct_rmm_buf = hal .witgen @@ -684,11 +684,11 @@ pub(crate) fn try_gpu_assign_shard_ram_from_device( { let struct_data = tracing::info_span!( "gpu_shard_ram_structural_transpose_d2h_from_device", - num_rows_padded, + rows = gpu_structural.num_rows, num_structural_witin, ) .in_scope(|| -> Result<_, ZKVMError> { - let wit_num_rows = num_rows_padded; + let wit_num_rows = gpu_structural.num_rows; let struct_num_cols = num_structural_witin; let mut struct_rmm_buf = hal .witgen diff --git a/ceno_zkvm/src/instructions/gpu/dispatch.rs b/ceno_zkvm/src/instructions/gpu/dispatch.rs index be51108c4..f7cf58969 100644 --- a/ceno_zkvm/src/instructions/gpu/dispatch.rs +++ b/ceno_zkvm/src/instructions/gpu/dispatch.rs @@ -481,7 +481,7 @@ fn gpu_assign_instances_inner>( total_instances, num_witin, I::padding_strategy(), - ) + )? }; if materialize_initial_witness { raw_witin.padding_by_strategy(); @@ -1484,7 +1484,7 @@ fn replay_gpu_witness_from_resident_raw>( total_instances, replay.num_witin, I::padding_strategy(), - ); + )?; // Keep replayed witness immutable after attaching the col-major device backing. 
// Mutating/padding a RowMajorMatrix clears device metadata, but replay consumers diff --git a/ceno_zkvm/src/instructions/gpu/utils/d2h.rs b/ceno_zkvm/src/instructions/gpu/utils/d2h.rs index fc558046d..5647cef12 100644 --- a/ceno_zkvm/src/instructions/gpu/utils/d2h.rs +++ b/ceno_zkvm/src/instructions/gpu/utils/d2h.rs @@ -303,9 +303,9 @@ pub(crate) fn gpu_witness_to_rmm( num_rows: usize, num_cols: usize, padding: InstancePaddingStrategy, -) -> RowMajorMatrix { +) -> Result, ZKVMError> { let mut rmm = RowMajorMatrix::::new(num_rows, num_cols, padding); // Keep the original col-major witness buffer as the source of truth for GPU commit. rmm.set_device_backing(gpu_result.device_buffer, DeviceMatrixLayout::ColMajor); - rmm + Ok(rmm) } diff --git a/ceno_zkvm/src/scheme/gpu/memory.rs b/ceno_zkvm/src/scheme/gpu/memory.rs index 6421ad3c9..d4434509c 100644 --- a/ceno_zkvm/src/scheme/gpu/memory.rs +++ b/ceno_zkvm/src/scheme/gpu/memory.rs @@ -110,12 +110,14 @@ pub fn estimate_chip_proof_memory(replay_plan: &GpuReplayPlan) -> usize { + match replay_plan.kind { + GpuWitgenKind::Keccak => replay_plan + .keccak_instances + .as_ref() + .map(|instances| instances.len() * 32) + .unwrap_or(replay_plan.trace_height), + GpuWitgenKind::ShardRam => replay_plan.trace_height, + _ => replay_plan.step_indices.len(), + } +} + +fn replay_plan_actual_structural_rows(replay_plan: &GpuReplayPlan) -> usize { + match replay_plan.kind { + GpuWitgenKind::ShardRam => replay_plan.shard_ram_num_records, + _ => replay_plan.trace_height, + } +} + pub fn estimate_replay_materialization_bytes( num_witin: usize, _num_structural_witin: usize, @@ -273,7 +294,7 @@ pub fn estimate_replay_materialization_bytes_for_plan( _num_vars: usize, ) -> usize { let elem_size = std::mem::size_of::(); - let witness_bytes = replay_plan.trace_height * replay_plan.num_witin * elem_size; + let witness_bytes = replay_plan_actual_rows(replay_plan) * replay_plan.num_witin * elem_size; let replay_temp_bytes = match replay_plan.kind { 
GpuWitgenKind::Keccak => replay_plan .keccak_instances @@ -299,6 +320,7 @@ pub fn estimate_replay_materialization_bytes_for_plan( pub(crate) fn estimate_trace_bytes>( composed_cs: &ComposedConstrainSystem, input: &ProofInput<'_, GpuBackend>, + replay_plan: Option<&GpuReplayPlan>, witness_replayable: bool, structural_cached_on_device: bool, ) -> TraceEstimate { @@ -308,14 +330,36 @@ pub(crate) fn estimate_trace_bytes() + }) + .unwrap_or_else(|| { + estimate_structural_mle_bytes( + cs.num_structural_witin as usize, + num_var_with_rotation, + ) + }) } else { estimate_structural_mle_bytes(cs.num_structural_witin as usize, num_var_with_rotation) }; - let (witness_mle_bytes, trace_temporary_bytes) = estimate_trace_extraction_bytes( - cs.num_witin as usize, - num_var_with_rotation, - witness_replayable, - ); + let (witness_mle_bytes, trace_temporary_bytes) = + if should_materialize_witness_on_gpu() && witness_replayable { + let base_elem_size = std::mem::size_of::(); + let actual_rows = replay_plan + .map(replay_plan_actual_rows) + .unwrap_or(1usize << num_var_with_rotation); + (cs.num_witin as usize * actual_rows * base_elem_size, 0) + } else { + estimate_trace_extraction_bytes( + cs.num_witin as usize, + num_var_with_rotation, + witness_replayable, + ) + }; TraceEstimate { trace_resident_bytes: witness_mle_bytes + structural_mle_bytes, @@ -325,11 +369,10 @@ pub(crate) fn estimate_trace_bytes( composed_cs: &ComposedConstrainSystem, - num_var_with_rotation: usize, + occupied_rows: usize, ) -> usize { let elem_size = std::mem::size_of::(); - let record_len = 1usize << num_var_with_rotation; - tower_output_count(composed_cs) * record_len * elem_size + tower_output_count(composed_cs) * occupied_rows * elem_size } pub(crate) fn estimate_main_constraints_bytes< @@ -426,6 +469,23 @@ fn estimate_tower_stage_components 0 { + num_prod_towers * (1 << (num_vars + 1)) * elem_size + } else { + 0 + }; + let logup_split_bytes = if num_logup_towers > 0 { + let denominator_bytes = 
num_logup_towers * (1 << (num_vars + 1)) * elem_size; + let numerator_bytes = if has_logup_numerator { + denominator_bytes + } else { + 0 + }; + denominator_bytes + numerator_bytes + } else { + 0 + }; + let build_bytes = build_est.total_bytes + prod_split_bytes + logup_split_bytes; let prove_est = estimate_prove_tower_memory( num_prod_towers, num_logup_towers, @@ -439,11 +499,7 @@ fn estimate_tower_stage_components( + mles: impl IntoIterator>, +) -> Vec> { + let cuda_hal = gkr_iop::gpu::get_cuda_hal().expect("Failed to get CUDA HAL"); + let stream = gkr_iop::gpu::get_thread_stream(); + mles.into_iter() + .map(|mle| { + let mle_ref = mle.as_ref(); + let full_len = 1usize << mle_ref.num_vars(); + if mle_ref.evaluations_len() == full_len { + return mle; + } + let padded: gkr_iop::gpu::MultilinearExtensionGpu<'static, E> = match mle_ref.inner() { + gkr_iop::gpu::GpuFieldType::Base(poly) => { + let mut host = poly.to_cpu_vec(stream.as_ref()); + host.resize(full_len, BB31Base::ZERO); + unsafe { + std::mem::transmute( + gkr_iop::gpu::MultilinearExtensionGpu::::from_ceno_gpu_base( + ceno_gpu::bb31::GpuPolynomial::from_ceno_vec( + &cuda_hal, + &host, + mle_ref.num_vars(), + stream.as_ref(), + ) + .expect("pad base mle"), + ), + ) + } + } + gkr_iop::gpu::GpuFieldType::Ext(poly) => { + let mut host = poly.to_cpu_vec(stream.as_ref()); + host.resize(full_len, BB31Ext::ZERO); + unsafe { + std::mem::transmute( + gkr_iop::gpu::MultilinearExtensionGpu::::from_ceno_gpu_ext( + ceno_gpu::bb31::GpuPolynomialExt::from_ceno_vec( + &cuda_hal, + &host, + mle_ref.num_vars(), + stream.as_ref(), + ) + .expect("pad ext mle"), + ), + ) + } + } + gkr_iop::gpu::GpuFieldType::Unreachable => unreachable!(), + }; + Arc::new(padded) + }) + .collect() +} mod util; pub(crate) use memory::{ check_gpu_mem_estimation, estimate_chip_proof_memory, estimate_main_witness_bytes, @@ -101,6 +155,30 @@ struct PcsResidentStats { total_rmms: usize, } +fn rmm_device_backing_bytes(rmm: &witness::RowMajorMatrix) -> 
usize +where + T: FieldAlgebra + Default + Sync + Clone + Send + Copy + 'static, +{ + rmm.device_backing_ref::>() + .map(|device_buffer| device_buffer.len() * std::mem::size_of::()) + .unwrap_or(0) +} + +fn rmm_col_major_device_rows(rmm: &witness::RowMajorMatrix) -> Option +where + T: FieldAlgebra + Default + Sync + Clone + Send + Copy + 'static, +{ + if rmm.device_backing_layout() != Some(DeviceMatrixLayout::ColMajor) { + return None; + } + let cols = rmm.width(); + if cols == 0 { + return Some(0); + } + let device_buffer = rmm.device_backing_ref::>()?; + Some(device_buffer.len() / cols) +} + fn pcs_resident_stats( pcs_data_basefold: &BasefoldCommitmentWithWitnessGpu< BB31Base, @@ -141,7 +219,7 @@ fn pcs_resident_stats( ( rmms.iter() .filter(|rmm| rmm.has_device_backing()) - .map(|rmm| rmm.height() * rmm.width() * std::mem::size_of::()) + .map(rmm_device_backing_bytes) .sum::(), rmms.iter().filter(|rmm| rmm.has_device_backing()).count(), rmms.len(), @@ -331,24 +409,12 @@ pub fn prove_tower_relation_impl> = Vec::new(); - let mut _ones_buffer: Vec> = Vec::new(); - let mut _view_last_layers: Vec>>> = Vec::new(); let (prod_gpu, logup_gpu) = info_span!("[ceno] build_tower_witness_gpu").in_scope(|| { - build_tower_witness_gpu( - composed_cs, - input, - records, - challenges, - cuda_hal, - &mut _big_buffers, - &mut _ones_buffer, - &mut _view_last_layers, - ) - .map_err(|e| format!("build_tower_witness_gpu failed: {}", e)) - .unwrap() + build_tower_witness_gpu(composed_cs, input, records, challenges, cuda_hal) + .map_err(|e| format!("build_tower_witness_gpu failed: {}", e)) + .unwrap() }); exit_span!(span); @@ -473,11 +539,12 @@ pub fn prove_rotation_impl let log2_num_instances = input.log2_num_instances(); let num_threads = optimal_sumcheck_threads(log2_num_instances); let num_var_with_rotation = log2_num_instances + composed_cs.rotation_vars().unwrap_or(0); - let wit = LayerWitness( + let padded_wit_storage = pad_gpu_mles_to_full_domain( chain!(&input.witness, 
&input.fixed, &input.structural_witness) .cloned() - .collect_vec(), + .map(|mle| unsafe { std::mem::transmute(mle) }), ); + let wit = LayerWitness(padded_wit_storage); let (proof, points) = gkr_iop::gkr::layer::gpu::prove_rotation_gpu::( num_threads, @@ -691,11 +758,11 @@ pub fn prove_main_constraints_impl< num_threads, num_var_with_rotation, gkr::GKRCircuitWitness { - layers: vec![LayerWitness( + layers: vec![LayerWitness(pad_gpu_mles_to_full_domain( chain!(&input.witness, &input.fixed, &input.structural_witness,) .cloned() - .collect_vec(), - )], + .map(|mle| unsafe { std::mem::transmute(mle) }), + ))], }, &out_evals, &input @@ -1367,7 +1434,8 @@ where let device_buffer = witness_rmm .device_backing_ref::>() .unwrap_or_else(|| panic!("col-major replay witness device backing type mismatch")); - let rows = witness_rmm.height(); + let rows = rmm_col_major_device_rows(&witness_rmm) + .unwrap_or_else(|| panic!("col-major replay witness device backing row count mismatch")); let cols = witness_rmm.width(); let poly_len_bytes = rows * std::mem::size_of::(); @@ -1380,14 +1448,11 @@ where (0..cols) .map(|col_idx| { let src_byte_offset = col_idx * poly_len_bytes; - // Keep an owned handle to the parent GPU allocation instead of a - // borrowed CudaView. The resulting MLE outlives this helper. 
let view_buf = device_buffer.owned_subrange(src_byte_offset..src_byte_offset + poly_len_bytes); - let view_poly = GpuPolynomial::new(view_buf, rows.trailing_zeros() as usize); - let view_poly_static: GpuPolynomial<'static> = - unsafe { std::mem::transmute(view_poly) }; - let mle_static = MultilinearExtensionGpu::from_ceno_gpu_base(view_poly_static); + let view_poly = GpuPolynomial::new(view_buf, witness_rmm.num_vars()); + let poly_static: GpuPolynomial<'static> = unsafe { std::mem::transmute(view_poly) }; + let mle_static = MultilinearExtensionGpu::from_ceno_gpu_base(poly_static); Arc::new(unsafe { std::mem::transmute::< MultilinearExtensionGpu<'static, E>, @@ -1421,7 +1486,7 @@ pub fn clear_replayable_trace_device_backing( let before_device_bytes = rmms .iter() .filter(|rmm| rmm.has_device_backing()) - .map(|rmm| rmm.height() * rmm.width() * std::mem::size_of::()) + .map(rmm_device_backing_bytes) .sum::(); for (trace_idx, _) in replayable_traces { @@ -1432,7 +1497,7 @@ pub fn clear_replayable_trace_device_backing( let after_device_bytes = rmms .iter() .filter(|rmm| rmm.has_device_backing()) - .map(|rmm| rmm.height() * rmm.width() * std::mem::size_of::()) + .map(rmm_device_backing_bytes) .sum::(); tracing::info!( "[gpu] cleared replayable PCS RMM device backing: replayable_traces={}, rmms_device_before={:.2}MB ({}) -> after={:.2}MB ({})", @@ -1508,7 +1573,8 @@ where let device_buffer = structural_rmm .device_backing_ref::>() .unwrap_or_else(|| panic!("col-major structural device backing type mismatch")); - let rows = structural_rmm.height(); + let rows = rmm_col_major_device_rows(structural_rmm) + .unwrap_or_else(|| panic!("col-major structural device backing row count mismatch")); let cols = structural_rmm.width(); let poly_len_bytes = rows * std::mem::size_of::(); let total_bytes = cols * poly_len_bytes; @@ -1517,8 +1583,7 @@ where total_bytes, "structural col-major buffer size mismatch" ); - let num_vars_in_poly = rows.trailing_zeros() as usize; - 
assert_eq!(rows, 1usize << num_vars_in_poly); + let num_vars_in_poly = structural_rmm.num_vars(); (0..cols) .map(|col_idx| { @@ -1559,25 +1624,22 @@ where } #[allow(clippy::too_many_arguments)] -pub(crate) fn build_tower_witness_gpu<'buf, E: ExtensionField>( +pub(crate) fn build_tower_witness_gpu( composed_cs: &ComposedConstrainSystem, input: &ProofInput<'_, GpuBackend>>, records: &[ArcMultilinearExtensionGpu<'_, E>], challenges: &[E; 2], cuda_hal: &CudaHalBB31, - big_buffers: &'buf mut Vec>, - ones_buffer: &mut Vec>, - view_last_layers: &mut Vec>>>, ) -> Result< ( - Vec>, - Vec>, + Vec>, + Vec>, ), String, > { let stream = gkr_iop::gpu::get_thread_stream(); use crate::scheme::constants::{NUM_FANIN, NUM_FANIN_LOGUP}; - use ceno_gpu::{CudaHal as _, bb31::GpuPolynomialExt}; + use ceno_gpu::bb31::GpuPolynomialExt; use p3::field::FieldAlgebra; let ComposedConstrainSystem { @@ -1585,7 +1647,7 @@ pub(crate) fn build_tower_witness_gpu<'buf, E: ExtensionField>( } = composed_cs; let _num_instances_with_rotation = input.num_instances() << composed_cs.rotation_vars().unwrap_or(0); - let _chip_record_alpha = challenges[0]; + let chip_record_alpha: BB31Ext = unsafe { std::mem::transmute_copy(&challenges[0]) }; // SAFETY: The `records` slice is borrowed for the duration of this function call. // The lifetime is erased to 'static only to satisfy GPU API signatures that require @@ -1616,46 +1678,62 @@ pub(crate) fn build_tower_witness_gpu<'buf, E: ExtensionField>( &records[offset..][..cs.lk_expressions.len()] }; - assert_eq!(big_buffers.len(), 0, "expect no big buffers"); - - // prod: last layes & buffer - let mut is_prod_buffer_exists = false; + // prod: split last layer once, then build compact tower layers. 
let prod_last_layers = r_set_wit .iter() .chain(w_set_wit.iter()) - .map(|wit| wit.as_view_chunks(NUM_FANIN)) - .collect::>(); + .map(|wit| match wit.inner() { + gkr_iop::gpu::GpuFieldType::Ext(poly) => cuda_hal + .tower + .masked_mle_split_to_chunks( + &*cuda_hal, + poly, + NUM_FANIN, + BB31Ext::ONE, + stream.as_ref(), + ) + .map_err(|e| format!("Failed to split compact prod tower input: {e}")), + _ => return Err("tower witness expects extension-field record MLEs".to_string()), + }) + .collect::, String>>()?; if !prod_last_layers.is_empty() { let first_layer = &prod_last_layers[0]; assert_eq!(first_layer.len(), 2, "prod last_layer must have 2 MLEs"); - let num_vars = first_layer[0].num_vars(); - let num_towers = prod_last_layers.len(); - view_last_layers.push(prod_last_layers); - - // Allocate one big buffer for all product towers and add it to big_buffers - let tower_size = 1 << (num_vars + 1); // 2 * mle_len elements per tower - let total_buffer_size = num_towers * tower_size; - tracing::debug!( - "prod tower request buffer size: {:.2} MB", - (total_buffer_size * std::mem::size_of::()) as f64 / (1024.0 * 1024.0) - ); - let big_buffer = cuda_hal - .alloc_ext_elems_on_device(total_buffer_size, false, stream.as_ref()) - .map_err(|e| format!("Failed to allocate prod GPU buffer: {:?}", e))?; - big_buffers.push(big_buffer); - is_prod_buffer_exists = true; } - // logup: last layes - let mut is_logup_buffer_exists = false; + // logup: split last layer once, then build compact tower layers. 
let lk_numerator_last_layer = lk_n_wit .iter() - .map(|wit| wit.as_view_chunks(NUM_FANIN_LOGUP)) - .collect::>(); + .map(|wit| match wit.inner() { + gkr_iop::gpu::GpuFieldType::Ext(poly) => cuda_hal + .tower + .masked_mle_split_to_chunks( + &*cuda_hal, + poly, + NUM_FANIN_LOGUP, + chip_record_alpha, + stream.as_ref(), + ) + .map_err(|e| format!("Failed to split compact logup numerator: {e}")), + _ => Err("tower witness expects extension-field logup numerator MLEs".to_string()), + }) + .collect::, String>>()?; let lk_denominator_last_layer = lk_d_wit .iter() - .map(|wit| wit.as_view_chunks(NUM_FANIN_LOGUP)) - .collect::>(); + .map(|wit| match wit.inner() { + gkr_iop::gpu::GpuFieldType::Ext(poly) => cuda_hal + .tower + .masked_mle_split_to_chunks( + &*cuda_hal, + poly, + NUM_FANIN_LOGUP, + chip_record_alpha, + stream.as_ref(), + ) + .map_err(|e| format!("Failed to split compact logup denominator: {e}")), + _ => Err("tower witness expects extension-field logup denominator MLEs".to_string()), + }) + .collect::, String>>()?; let logup_last_layers = if !lk_numerator_last_layer.is_empty() { // Case when we have both numerator and denominator // Combine [p1, p2] from numerator and [q1, q2] from denominator @@ -1665,100 +1743,47 @@ pub(crate) fn build_tower_witness_gpu<'buf, E: ExtensionField>( .map(|(lk_n_chunks, lk_d_chunks)| { let mut last_layer = lk_n_chunks; last_layer.extend(lk_d_chunks); - last_layer + Ok(last_layer) }) - .collect::>() + .collect::, String>>()? } else if lk_denominator_last_layer.is_empty() { vec![] } else { - // Case when numerator is empty - create shared ones_buffer and use views - // This saves memory by having all p1, p2 polynomials reference the same buffer + // Case when numerator is empty: share one owned ones buffer across all p1/p2 polynomials. 
let nv = lk_denominator_last_layer[0][0].num_vars(); - // Create one shared ones_buffer as Owned (can be 'static) let ones_poly = GpuPolynomialExt::new_with_scalar(&cuda_hal.inner, nv, BB31Ext::ONE, stream.as_ref()) .map_err(|e| format!("Failed to create shared ones_buffer: {:?}", e)) .unwrap(); - // SAFETY: Owned buffer can be safely treated as 'static - let ones_poly_static: GpuPolynomialExt<'static> = unsafe { std::mem::transmute(ones_poly) }; - ones_buffer.push(ones_poly_static); - - // Get reference from storage to ensure proper lifetime - let ones_poly_ref = ones_buffer.last().unwrap(); - let mle_len_bytes = ones_poly_ref.evaluations().len() * std::mem::size_of::(); + let ones_poly: GpuPolynomialExt<'static> = unsafe { std::mem::transmute(ones_poly) }; + let mle_len_bytes = ones_poly.buf.len() * std::mem::size_of::(); - // Create views referencing the shared ones_buffer for each tower's p1, p2 lk_denominator_last_layer .into_iter() .map(|lk_d_chunks| { - // Create views of ones_buffer for p1 and p2 - let p1_view = ones_poly_ref.evaluations().as_slice_range(0..mle_len_bytes); - let p2_view = ones_poly_ref.evaluations().as_slice_range(0..mle_len_bytes); - let p1_gpu = GpuPolynomialExt::new(BufferImpl::new_from_view(p1_view), nv); - let p2_gpu = GpuPolynomialExt::new(BufferImpl::new_from_view(p2_view), nv); - // SAFETY: views from 'static buffer can be 'static - let p1_gpu: GpuPolynomialExt<'static> = unsafe { std::mem::transmute(p1_gpu) }; - let p2_gpu: GpuPolynomialExt<'static> = unsafe { std::mem::transmute(p2_gpu) }; - // Use [p1, p2, q1, q2] format for the last layer + let p1_gpu = + GpuPolynomialExt::new(ones_poly.buf.owned_subrange(0..mle_len_bytes), nv); + let p2_gpu = + GpuPolynomialExt::new(ones_poly.buf.owned_subrange(0..mle_len_bytes), nv); let mut last_layer = vec![p1_gpu, p2_gpu]; last_layer.extend(lk_d_chunks); - last_layer + Ok(last_layer) }) - .collect::>() + .collect::, String>>()? 
}; if !logup_last_layers.is_empty() { let first_layer = &logup_last_layers[0]; assert_eq!(first_layer.len(), 4, "logup last_layer must have 4 MLEs"); - let num_vars = first_layer[0].num_vars(); - let num_towers = logup_last_layers.len(); - view_last_layers.push(logup_last_layers); - - // Allocate one big buffer for all towers and add it to big_buffers - let tower_size = 1 << (num_vars + 2); // 4 * mle_len elements per tower - let total_buffer_size = num_towers * tower_size; - tracing::debug!( - "logup tower request buffer size: {:.2} MB", - (total_buffer_size * std::mem::size_of::()) as f64 / (1024.0 * 1024.0) - ); - let big_buffer = cuda_hal - .alloc_ext_elems_on_device(total_buffer_size, false, stream.as_ref()) - .unwrap(); - big_buffers.push(big_buffer); - is_logup_buffer_exists = true; } - let (_, pushed_big_buffers) = big_buffers.split_at_mut(0); - let (prod_big_buffer, logup_big_buffer) = match ( - is_prod_buffer_exists, - is_logup_buffer_exists, - pushed_big_buffers, - ) { - (false, false, []) => (None, None), - (true, false, [prod]) => (Some(prod), None), - (false, true, [logup]) => (None, Some(logup)), - (true, true, [prod, logup]) => (Some(prod), Some(logup)), - (prod_flag, logup_flag, slice) => { - panic!( - "unexpected state: prod={}, logup={}, newly_pushed_len={}", - prod_flag, - logup_flag, - slice.len() - ); - } - }; - // Build product GpuProverSpecs let mut prod_gpu_specs = Vec::new(); - if is_prod_buffer_exists { - let prod_last_layers = &view_last_layers[0]; + if !prod_last_layers.is_empty() { let first_layer = &prod_last_layers[0]; assert_eq!(first_layer.len(), 2, "prod last_layer must have 2 MLEs"); let num_vars = first_layer[0].num_vars(); let num_towers = prod_last_layers.len(); - let Some(prod_big_buffer) = prod_big_buffer else { - panic!("prod big buffer not found"); - }; let span_prod = entered_span!( "build_prod_tower", @@ -1770,7 +1795,6 @@ pub(crate) fn build_tower_witness_gpu<'buf, E: ExtensionField>( let gpu_specs = { 
cuda_hal.tower.build_prod_tower_from_gpu_polys_batch( cuda_hal, - prod_big_buffer, &last_layers_refs, num_vars, num_towers, @@ -1784,15 +1808,11 @@ pub(crate) fn build_tower_witness_gpu<'buf, E: ExtensionField>( // Build logup GpuProverSpecs let mut logup_gpu_specs = Vec::new(); - if is_logup_buffer_exists { - let logup_last_layers = view_last_layers.last().unwrap(); + if !logup_last_layers.is_empty() { let first_layer = &logup_last_layers[0]; assert_eq!(first_layer.len(), 4, "logup last_layer must have 4 MLEs"); let num_vars = first_layer[0].num_vars(); let num_towers = logup_last_layers.len(); - let Some(logup_big_buffer) = logup_big_buffer else { - panic!("logup big buffer not found"); - }; let span_logup = entered_span!( "build_logup_tower", @@ -1805,14 +1825,12 @@ pub(crate) fn build_tower_witness_gpu<'buf, E: ExtensionField>( .tower .build_logup_tower_from_gpu_polys_batch( cuda_hal, - logup_big_buffer, &last_layers_refs, num_vars, num_towers, stream.as_ref(), ) .map_err(|e| format!("build_logup_tower_from_gpu_polys_batch failed: {:?}", e))?; - logup_gpu_specs.extend(gpu_specs); exit_span!(span_logup); } @@ -2005,7 +2023,7 @@ impl> OpeningProver> task.circuit_name, estimated_replay_bytes as f64 / (1024.0 * 1024.0), ); - let witness_rmm = replay_plan.replay_witness().expect("GPU raw replay failed"); - check_gpu_mem_estimation(gpu_mem_tracker, estimated_replay_bytes); - task.input.witness = info_span!("[ceno] replay_gpu_witness_from_raw") - .in_scope(|| extract_witness_mles_for_trace_rmm::(witness_rmm)); + task.input.witness = if let Some(trace_idx) = task.witness_trace_idx { + check_gpu_mem_estimation(gpu_mem_tracker, 0); + info_span!("[ceno] extract_witness_mles").in_scope(|| { + extract_witness_mles_for_trace::( + pcs_data, + trace_idx, + task.num_witin, + num_vars, + ) + }) + } else { + let witness_rmm = replay_plan.replay_witness().expect("GPU raw replay failed"); + check_gpu_mem_estimation(gpu_mem_tracker, estimated_replay_bytes); + info_span!("[ceno] 
replay_gpu_witness_from_raw") + .in_scope(|| extract_witness_mles_for_trace_rmm::(witness_rmm)) + }; if let Some(rmm) = task.structural_rmm.as_ref() { task.input.structural_witness = info_span!("[ceno] transport_structural_witness") .in_scope(|| { diff --git a/ceno_zkvm/src/scheme/prover.rs b/ceno_zkvm/src/scheme/prover.rs index 516dee741..45f786c6f 100644 --- a/ceno_zkvm/src/scheme/prover.rs +++ b/ceno_zkvm/src/scheme/prover.rs @@ -1264,22 +1264,12 @@ where ); let tower_build_mem_tracker = crate::scheme::gpu::init_gpu_mem_tracker(&cuda_hal, "build_tower_witness_gpu"); - let mut big_buffers = Vec::new(); - let mut ones_buffer = Vec::new(); - let mut view_last_layers = Vec::new(); log_gpu_device_state(&format!("{name}:before_build_tower_witness")); log_gpu_pool_usage(&format!("{name}:before_build_tower_witness")); let (prod_gpu, logup_gpu, lk_out_evals, w_out_evals, r_out_evals) = info_span!("[ceno] build_tower_witness_gpu").in_scope(|| { let (prod_gpu, logup_gpu) = build_tower_witness_gpu( - cs, - &input, - &records, - challenges, - &cuda_hal, - &mut big_buffers, - &mut ones_buffer, - &mut view_last_layers, + cs, &input, &records, challenges, &cuda_hal, ) .map_err(|e| { ZKVMError::InvalidWitness(format!("build_tower_witness_gpu failed: {e}").into()) @@ -1332,9 +1322,6 @@ where check_gpu_mem_estimation(tower_prove_mem_tracker, tower_prove_estimated_bytes); drop(records); drop(tower_input); - drop(big_buffers); - drop(ones_buffer); - drop(view_last_layers); log_gpu_device_state(&format!("{name}:after_drop_tower")); exit_span!(span); diff --git a/ceno_zkvm/src/scheme/utils.rs b/ceno_zkvm/src/scheme/utils.rs index ead260f7d..a52925b9a 100644 --- a/ceno_zkvm/src/scheme/utils.rs +++ b/ceno_zkvm/src/scheme/utils.rs @@ -680,7 +680,10 @@ pub fn build_main_witness< .iter() .chain(&input.structural_witness) .chain(&input.fixed) - .all(|v| { v.evaluations_len() == 1 << num_var_with_rotation }) + .all(|v| { + v.num_vars() == num_var_with_rotation + && v.evaluations_len() <= (1 
<< num_var_with_rotation) + }) ); // GPU memory estimation @@ -704,8 +707,9 @@ pub fn build_main_witness< // GPU memory check: validate estimation against actual usage #[cfg(feature = "gpu")] { + let occupied_rows = input.num_instances() << composed_cs.rotation_vars().unwrap_or(0); let estimated_bytes = - crate::scheme::gpu::estimate_main_witness_bytes(composed_cs, num_var_with_rotation); + crate::scheme::gpu::estimate_main_witness_bytes(composed_cs, occupied_rows); crate::scheme::gpu::check_gpu_mem_estimation(gpu_mem_tracker, estimated_bytes); } diff --git a/ceno_zkvm/src/scheme/verifier.rs b/ceno_zkvm/src/scheme/verifier.rs index dbbae6326..3738c11ec 100644 --- a/ceno_zkvm/src/scheme/verifier.rs +++ b/ceno_zkvm/src/scheme/verifier.rs @@ -516,6 +516,14 @@ impl> .into(), )); }; + if q1 == E::ZERO || q2 == E::ZERO { + return Err(ZKVMError::InvalidProof( + format!( + "{shard_id}th shard {circuit_name} has zero logup denominator in lk_out_evals: {evals:?}" + ) + .into(), + )); + } Ok(p1 * q1.inverse() + p2 * q2.inverse()) }) .sum::>()?; diff --git a/gkr_iop/src/gkr/layer/gpu/utils.rs b/gkr_iop/src/gkr/layer/gpu/utils.rs index e67153cc2..3e185da47 100644 --- a/gkr_iop/src/gkr/layer/gpu/utils.rs +++ b/gkr_iop/src/gkr/layer/gpu/utils.rs @@ -251,8 +251,9 @@ pub fn build_rotation_mles_gpu panic!("should be base field"), _ => panic!("unimplemented input mle"), }; + let logical_len = 1usize << input_mle.mle.num_vars(); let mut output_buf = cuda_hal - .alloc_elems_on_device(input_buf.len(), false, stream.as_ref()) + .alloc_elems_on_device(logical_len, false, stream.as_ref()) .unwrap(); // Safety: GPU buffers are actually 'static lifetime. 
We only read from input_buf @@ -294,8 +295,8 @@ pub fn build_rotation_selector_gpu MultilinearExtensionGpu<'static, E> { let stream = crate::gpu::get_thread_stream(); - let total_len = wit[0].evaluations_len(); // Take first mle just to retrieve total length - assert!(total_len.is_power_of_two()); + let num_vars = wit[0].num_vars(); + let total_len = 1usize << num_vars; let mut output_buf = cuda_hal .alloc_ext_elems_on_device(total_len, false, stream.as_ref()) .unwrap(); @@ -322,10 +323,8 @@ pub fn build_rotation_selector_gpu, diff --git a/gkr_iop/src/gpu/mod.rs b/gkr_iop/src/gpu/mod.rs index 54ae3744a..62d19ca85 100644 --- a/gkr_iop/src/gpu/mod.rs +++ b/gkr_iop/src/gpu/mod.rs @@ -222,7 +222,7 @@ impl<'a, E: ExtensionField> MultilinearExtensionGpu<'a, E> { let cpu_evaluations = poly.to_cpu_vec(stream.as_ref()); let cpu_evaluations_base: Vec = unsafe { std::mem::transmute(cpu_evaluations) }; - MultilinearExtension::from_evaluations_vec( + MultilinearExtension::from_evaluations_vec_compact( self.mle.num_vars(), cpu_evaluations_base, ) @@ -230,7 +230,7 @@ impl<'a, E: ExtensionField> MultilinearExtensionGpu<'a, E> { GpuFieldType::Ext(poly) => { let cpu_evaluations = poly.to_cpu_vec(stream.as_ref()); let cpu_evaluations_ext: Vec = unsafe { std::mem::transmute(cpu_evaluations) }; - MultilinearExtension::from_evaluations_ext_vec( + MultilinearExtension::from_evaluations_ext_vec_compact( self.mle.num_vars(), cpu_evaluations_ext, ) @@ -506,13 +506,23 @@ impl> let all_witins_gpu_gl64: Vec<&MultilinearExtensionGpu> = unsafe { std::mem::transmute(all_witins_gpu) }; let all_witins_gpu_type_gl64 = all_witins_gpu_gl64.iter().map(|mle| &mle.mle).collect_vec(); + // Match the CPU witness inference path: layer outputs are materialized over + // the occupied prefix of the layer witness domain, not the maximum length of + // any referenced structural/fixed MLE. 
+ let output_len = all_witins_gpu_gl64 + .first() + .map(|mle| mle.evaluations_len()) + .unwrap_or(0); + let output_lengths = + std::iter::repeat_n(output_len, mle_indices_per_term.len()).collect_vec(); // buffer for output witness from gpu let cuda_hal = get_cuda_hal().unwrap(); - let mut next_witness_buf = (0..num_non_zero_expr) - .map(|_| { + let mut next_witness_buf = output_lengths + .iter() + .map(|&output_len| { cuda_hal - .alloc_ext_elems_on_device(1 << num_vars, false, stream.as_ref()) + .alloc_ext_elems_on_device(output_len, false, stream.as_ref()) .map_err(|e| format!("Failed to allocate prod GPU buffer: {:?}", e)) }) .collect::, _>>() diff --git a/gkr_iop/src/utils.rs b/gkr_iop/src/utils.rs index e1c8d7453..f133ae126 100644 --- a/gkr_iop/src/utils.rs +++ b/gkr_iop/src/utils.rs @@ -5,7 +5,6 @@ use itertools::Itertools; use multilinear_extensions::{ Fixed, WitIn, WitnessId, mle::{ArcMultilinearExtension, MultilinearExtension}, - util::ceil_log2, virtual_poly::{build_eq_x_r_vec, eq_eval}, }; use p3::field::FieldAlgebra; @@ -49,7 +48,7 @@ pub fn rotation_next_base_mle<'a, E: ExtensionField>( rotate_chunk[to] = original_chunk[from]; } }); - MultilinearExtension::from_evaluation_vec_smart(mle.num_vars(), rotated_mle_evals) + MultilinearExtension::from_evaluation_vec_smart_compact(mle.num_vars(), rotated_mle_evals) } pub fn rotation_selector<'a, E: ExtensionField>( @@ -59,7 +58,6 @@ pub fn rotation_selector<'a, E: ExtensionField>( cyclic_group_log2_size: usize, total_len: usize, ) -> MultilinearExtension<'a, E> { - assert!(total_len.is_power_of_two()); let cyclic_group_size = 1 << cyclic_group_log2_size; assert!(cyclic_subgroup_size <= cyclic_group_size); let rotation_index = bh.into_iter().take(cyclic_subgroup_size).collect_vec(); @@ -74,7 +72,10 @@ pub fn rotation_selector<'a, E: ExtensionField>( rotate_chunk[to] = eq_chunk[to]; } }); - MultilinearExtension::from_evaluation_vec_smart(ceil_log2(total_len), rotated_mle_evals) + 
MultilinearExtension::from_evaluation_vec_smart_compact( + eq.len().ilog2() as usize, + rotated_mle_evals, + ) } /// sel(rx) From 84a2631f8bfb446a50b0db3c75e2ef83dd00118d Mon Sep 17 00:00:00 2001 From: "sm.wu" Date: Sun, 26 Apr 2026 10:30:28 +0800 Subject: [PATCH 02/25] Fix compact tower memory accounting --- ceno_zkvm/src/scheme/gpu/memory.rs | 34 +++- ceno_zkvm/src/scheme/gpu/mod.rs | 34 ++-- summary.md | 288 +++++++++++++++++++++++++++++ 3 files changed, 335 insertions(+), 21 deletions(-) create mode 100644 summary.md diff --git a/ceno_zkvm/src/scheme/gpu/memory.rs b/ceno_zkvm/src/scheme/gpu/memory.rs index d4434509c..679c9e8c6 100644 --- a/ceno_zkvm/src/scheme/gpu/memory.rs +++ b/ceno_zkvm/src/scheme/gpu/memory.rs @@ -1,7 +1,7 @@ use crate::{ instructions::gpu::dispatch::GpuWitgenKind, scheme::{ - constants::{NUM_FANIN, SEPTIC_EXTENSION_DEGREE}, + constants::{NUM_FANIN, NUM_FANIN_LOGUP, SEPTIC_EXTENSION_DEGREE}, hal::ProofInput, utils::tower_output_count, }, @@ -461,27 +461,29 @@ fn estimate_tower_stage_components(); let has_logup_numerator = composed_cs.is_with_lk_table(); + let occupied_rows = input.num_instances() << composed_cs.rotation_vars().unwrap_or(0); let build_est = estimate_build_tower_memory( num_prod_towers, num_logup_towers, num_vars, num_vars, + occupied_rows, elem_size, has_logup_numerator, ); let prod_split_bytes = if num_prod_towers > 0 { - num_prod_towers * (1 << (num_vars + 1)) * elem_size + num_prod_towers + * compact_split_stored_elems(occupied_rows, 1 << (num_vars + 1), NUM_FANIN) + * elem_size } else { 0 }; let logup_split_bytes = if num_logup_towers > 0 { - let denominator_bytes = num_logup_towers * (1 << (num_vars + 1)) * elem_size; - let numerator_bytes = if has_logup_numerator { - denominator_bytes - } else { - 0 - }; - denominator_bytes + numerator_bytes + let denominator_bytes = num_logup_towers + * compact_split_stored_elems(occupied_rows, 1 << (num_vars + 1), NUM_FANIN_LOGUP) + * elem_size; + let numerator_or_ones_bytes = 
denominator_bytes; + denominator_bytes + numerator_or_ones_bytes } else { 0 }; @@ -491,6 +493,7 @@ fn estimate_tower_stage_components usize { + let chunk_size = logical_len / num_chunks; + (0..num_chunks) + .map(|chunk_idx| { + let chunk_start = chunk_idx * chunk_size; + occupied_len + .saturating_sub(chunk_start) + .min(chunk_size) + .max(1) + }) + .sum() +} + /// Estimate temporary GPU memory for the tower proving stage (build + prove). /// Used by prove_tower_relation to validate against actual mem_tracker measurements. pub(crate) fn estimate_tower_stage_bytes>( diff --git a/ceno_zkvm/src/scheme/gpu/mod.rs b/ceno_zkvm/src/scheme/gpu/mod.rs index 3d25202de..7ccc0b6ad 100644 --- a/ceno_zkvm/src/scheme/gpu/mod.rs +++ b/ceno_zkvm/src/scheme/gpu/mod.rs @@ -1749,23 +1749,33 @@ pub(crate) fn build_tower_witness_gpu( } else if lk_denominator_last_layer.is_empty() { vec![] } else { - // Case when numerator is empty: share one owned ones buffer across all p1/p2 polynomials. + // Case when numerator is empty: create one-polynomials matching each + // denominator chunk's stored length. 
let nv = lk_denominator_last_layer[0][0].num_vars(); - let ones_poly = - GpuPolynomialExt::new_with_scalar(&cuda_hal.inner, nv, BB31Ext::ONE, stream.as_ref()) - .map_err(|e| format!("Failed to create shared ones_buffer: {:?}", e)) - .unwrap(); - let ones_poly: GpuPolynomialExt<'static> = unsafe { std::mem::transmute(ones_poly) }; - let mle_len_bytes = ones_poly.buf.len() * std::mem::size_of::(); - lk_denominator_last_layer .into_iter() .map(|lk_d_chunks| { - let p1_gpu = - GpuPolynomialExt::new(ones_poly.buf.owned_subrange(0..mle_len_bytes), nv); - let p2_gpu = - GpuPolynomialExt::new(ones_poly.buf.owned_subrange(0..mle_len_bytes), nv); + let p1_len = lk_d_chunks[0].evaluations().len(); + let p2_len = lk_d_chunks[1].evaluations().len(); + let p1_gpu = GpuPolynomialExt::new_with_scalar_len( + &cuda_hal.inner, + nv, + p1_len, + BB31Ext::ONE, + stream.as_ref(), + ) + .map_err(|e| format!("Failed to create compact ones numerator p1: {e:?}"))?; + let p2_gpu = GpuPolynomialExt::new_with_scalar_len( + &cuda_hal.inner, + nv, + p2_len, + BB31Ext::ONE, + stream.as_ref(), + ) + .map_err(|e| format!("Failed to create compact ones numerator p2: {e:?}"))?; + let p1_gpu: GpuPolynomialExt<'static> = unsafe { std::mem::transmute(p1_gpu) }; + let p2_gpu: GpuPolynomialExt<'static> = unsafe { std::mem::transmute(p2_gpu) }; let mut last_layer = vec![p1_gpu, p2_gpu]; last_layer.extend(lk_d_chunks); Ok(last_layer) diff --git a/summary.md b/summary.md new file mode 100644 index 000000000..a062bb59f --- /dev/null +++ b/summary.md @@ -0,0 +1,288 @@ +# WIP Summary: non-pow2 prover storage / GPU tower + PCS follow-up + +Date: 2026-04-25 + +Repos involved +- current repo: `/home/wusm/rust/ceno` +- GPU repo: `/home/wusm/rust/ceno-gpu` +- backend repo: `/home/wusm/rust/gkr-backend` + +Primary goal +- Remove prover-side MLE zero padding to next power-of-two. +- Keep prover storage compact by occupied length. +- Verifier semantics stay unchanged. 
+ +Design agreed in this WIP +- Raw/original MLE inputs before sumcheck round 0 should use one unified policy: + - direct/native order + - occupied length respected + - this applies to both tower and PCS batch opening +- After round 0: + - folded values can use the normal later-round in-place buffer layout +- No separate application-specific policy for tower vs PCS. +- For tower specifically: + - within one tower layer, all MLEs should have the same `num_vars` + - tower should not rely on a meaningful “small MLE” mixed-size case + +What was fixed earlier in this WIP + +1. PCS / batch-open path +- Fixed missing round evaluations from GPU V2 sumcheck: + - `../ceno-gpu/cuda_hal/src/common/sumcheck/generic_v2.rs` +- Fixed compact raw-data handling in batch open and commit/open consistency. +- Fixed an earlier `RootMismatch` by correcting raw trace -> encode padding boundary in batch commit. +- PCS later reached `final_codeword.values[idx] != folded`, then was narrowed further. +- At one point PCS/basefold batch-open `eq` layout mismatch was fixed by using Ceno/direct order. +- CPU e2e for the lightweight repro still passes. + +2. Tower witness/materialization direction +- Compact CPU oracle for tower semantics was added in: + - `../ceno-gpu/cuda_hal/src/common/tower/utils.rs` +- GPU tower build path was refactored toward compact storage in: + - `../ceno-gpu/cuda_hal/src/common/tower/mod.rs` + - `../ceno-gpu/cpp/common/tower.cuh` + - `../ceno-gpu/cpp/bb31/kernels/tower.cu` + - `../ceno-gpu/cpp/gl64/kernels/tower.cu` +- A lifetime bug causing segfault in GPU tower eval extraction was fixed by retaining owned buffer backing: + - `../ceno-gpu/cuda_hal/src/common/buffer.rs` + - `../ceno-gpu/cuda_hal/src/lib.rs` + +3. Important debug correction +- There was a previous debug bug caused by cloning the transcript after GPU proving. +- That was fixed. +- Current CPU/GPU prover compares should assume transcript state is cloned before proof generation. 
+ +Current CPU/GPU status + +CPU baseline +- Command: + - `cargo run --release --package ceno_zkvm --features sanity-check --bin e2e -- --platform=ceno --max-cycle-per-shard=1000 --hints=2 --public-io=5 --shard-id=0 examples/target/riscv32im-ceno-zkvm-elf/release/examples/fibonacci` +- Result: + - passes + +GPU lightweight repro +- Command: + - `RUST_LOG=error CENO_CONCURRENT_CHIP_PROVING=0 cargo run --release --features gpu --package ceno_zkvm --features sanity-check --bin e2e -- --platform=ceno --max-cycle-per-shard=1000 --hints=2 --public-io=5 --shard-id=0 examples/target/riscv32im-ceno-zkvm-elf/release/examples/fibonacci` +- Current result: + - still fails with tower verification mismatch + - source: + - `ceno_zkvm/src/e2e.rs:2347` + - `VerifyError("mismatch tower evaluation")` + +Most important findings from the latest tower debug + +1. Tower witness is not the first bad stage +- CPU/GPU tower witness compare did not fail first. +- Tower witness transport/leaf construction is not the main active bug. + +2. The earlier isolated layer-2 compare proved: +- `cpu_direct == v1` +- `v2 != cpu_direct` +- This was on a tower layer where all MLEs were full occupied: + - debug payload showed `mle_shape=[(?, 2, 4), ...]` + - meaning `num_vars=2`, `len=4` for all MLEs in that isolated layer +- That means the tower failure is not because tower requires mixed-size/small-MLE semantics. + +3. The current design conclusion +- Tower should use the same original-input policy as PCS: + - direct order before round 0 + - later rounds use the in-place buffer +- Do NOT think of this as two policies. + +4. Terminology decision +- Do not call later-round folded storage “replay buffer”. 
+- Call it: + - in-place buffer +- Round 0: + - non-in-place, reading original inputs +- Round > 0: + - in-place + +Latest code changes in the current session + +In `../ceno-gpu/cuda_hal/src/common/sumcheck/generic_v2.rs` +- Renamed V2 metadata from `compact_layout_flags` to `original_layout_flags` +- This now means: + - `1` => original round-0 input is direct/native order +- This is intended to make the model explicit and shared across tower + PCS + +In `../ceno-gpu/cpp/common/sumcheck/generic_v2.cuh` +- Added `direct_pair_index_v2` +- Changed direct-order round-0 reads for full-size equal-`num_vars` originals to use adjacent pairs: + - `(2p, 2p+1)` + - not `(p, p + stride)` +- Restored small-MLE helper mapping back to high-bit based mapping: + - `suffix_small_index_v2(...)` currently uses: + - `tid >> (num_vars - 1 - mle_num_vars)` +- Reverted an incorrect attempt to bit-reverse first-fold writes into the in-place buffer +- Current code writes first-fold results contiguously into the in-place buffer + +In `../ceno-gpu/cuda_hal/src/common/tower/mod.rs` +- Relaxed tower assertions so layers can be compact-by-occupation, not necessarily full logical length at Rust-side checks + +What the latest tower debug showed + +Most recent trustworthy mismatch before the last interrupted run +- CPU/GPU tower compare failed at: + - `ceno_zkvm/src/scheme/gpu/mod.rs:665` +- Message: + - `CPU/GPU tower sumcheck proof mismatch: first_round=Some(2)` +- Interpretation: + - earlier proof entries already match + - divergence starts later, consistent with in-place-buffer semantics rather than original-input semantics + +Important caution about last run +- A later run was interrupted before producing a new useful payload. +- So do NOT assume the very latest in-place-buffer edits fixed anything. 
+- The last reliable signal is still: + - tower mismatch has moved later than round 0 + - current bug is likely in round > 0 in-place-buffer semantics + +Debug helpers currently present in `ceno_zkvm/src/scheme/gpu/mod.rs` +- `debug_compare_tower_cpu_gpu_prover(...)` +- `debug_compare_tower_eq_layers(...)` +- `debug_compare_tower_layer_v1_v2(..., round)` +- currently called for: + - `round = 2` + - `round = 3` + +Be careful +- Some helpers use fresh local transcripts like: + - `BasicTranscript::new(b"tower-layer2-debug")` +- These are only valid for isolated V1/V2/CPU direct comparisons. +- They are NOT end-to-end transcript or verifier oracles. + +Current best hypothesis +- The active tower bug is now in V2 later-round in-place-buffer semantics, not in: + - tower witness layout + - original round-0 direct-order policy + - transcript clone bugs + +Most relevant files to inspect next + +Current repo +- `ceno_zkvm/src/scheme/gpu/mod.rs` +- `ceno_zkvm/src/e2e.rs` + +GPU repo +- `../ceno-gpu/cuda_hal/src/common/sumcheck/generic_v2.rs` +- `../ceno-gpu/cpp/common/sumcheck/generic_v2.cuh` +- `../ceno-gpu/cuda_hal/src/common/tower/mod.rs` +- `../ceno-gpu/cuda_hal/src/common/tower/utils.rs` +- `../ceno-gpu/cuda_hal/src/lib.rs` +- `../ceno-gpu/cuda_hal/src/common/buffer.rs` + +Backend repo +- `../gkr-backend/crates/mpcs/...` +- `../gkr-backend/crates/sumcheck/...` + +Recommended next step for the new session +1. Read this file. +2. Keep CPU baseline as source of truth. +3. Continue from the latest tower state, focusing only on later-round in-place-buffer semantics in: + - `../ceno-gpu/cpp/common/sumcheck/generic_v2.cuh` +4. 
Run exactly one lightweight GPU repro at a time: + - `RUST_LOG=error CENO_CONCURRENT_CHIP_PROVING=0 cargo run --release --features gpu --package ceno_zkvm --features sanity-check --bin e2e -- --platform=ceno --max-cycle-per-shard=1000 --hints=2 --public-io=5 --shard-id=0 examples/target/riscv32im-ceno-zkvm-elf/release/examples/fibonacci` + +Backups / snapshots +- Earlier stash-save/apply snapshots were created in this workstream. +- There is also filesystem snapshot history under: + - `/home/wusm/rust/ceno/.codex-backups/` + + +## E2E / validation commands executed in compact tower batch + estimator work + +Context +- Full clean was run before validating newly added CUDA kernels, to avoid stale C++/CUDA artifacts. +- Heavy commands used `timeout 1800s` so compilation can be slow, but execution cannot hang indefinitely. +- Logs were written to `/tmp` for later inspection. + +Clean/build commands +```bash +cargo clean +cargo clean --manifest-path ../ceno-gpu/cuda_hal/Cargo.toml +``` + +```bash +cargo build --release --features gpu --package ceno_zkvm --features sanity-check --bin e2e +``` +Result +- Passed. +- Elapsed: `4:07.82`. + +Lightweight sanity e2e after clean +```bash +RUST_LOG=error CENO_CONCURRENT_CHIP_PROVING=0 target/release/e2e --platform=ceno --max-cycle-per-shard=1000 --hints=2 --public-io=5 --shard-id=0 examples/target/riscv32im-ceno-zkvm-elf/release/examples/fibonacci +``` +Result +- Passed. +- Elapsed: `0:09.29`. + +Cargo check after compact batch/estimator edits +```bash +timeout 300s cargo check --features gpu --package ceno_zkvm --bin e2e +``` +Result +- Passed. + +Final lightweight sanity e2e after removing temporary debug probe +```bash +RUST_LOG=error CENO_CONCURRENT_CHIP_PROVING=0 target/release/e2e --platform=ceno --max-cycle-per-shard=1000 --hints=2 --public-io=5 --shard-id=0 examples/target/riscv32im-ceno-zkvm-elf/release/examples/fibonacci +``` +Result +- Passed. +- Elapsed: `0:08.34`. 
+ +Heavy e2e command 1: serial proving + GPU mem tracking +```bash +CENO_GPU_MEM_TRACKING=1 CENO_CONCURRENT_CHIP_PROVING=0 CENO_GPU_ENABLE_WITGEN=1 cargo run --config net.git-fetch-with-cli=true --release --package ceno_zkvm --features gpu --bin e2e -- --platform=ceno --max-cycle-per-shard=1600 examples/target/riscv32im-ceno-zkvm-elf/release/examples/keccak_syscall +``` +Executed with timeout/log wrapper: +```bash +/usr/bin/time -f 'elapsed %E' timeout 1800s env CENO_GPU_MEM_TRACKING=1 CENO_CONCURRENT_CHIP_PROVING=0 CENO_GPU_ENABLE_WITGEN=1 cargo run --config net.git-fetch-with-cli=true --release --package ceno_zkvm --features gpu --bin e2e -- --platform=ceno --max-cycle-per-shard=1600 examples/target/riscv32im-ceno-zkvm-elf/release/examples/keccak_syscall 2>&1 | tee /tmp/ceno-keccak-memtracking-serial.log +``` +Initial result +- Failed due strict memory-estimator overestimate, not proof failure. +- Panic: + - `[memcheck] build_tower_witness_gpu: over-estimate! estimated=146.93MB, actual=126.43MB, diff=20.50MB, margin=10.00MB` +- Elapsed: `1:19.48`. + +After estimator fix, rerun with log: +```bash +/usr/bin/time -f 'elapsed %E' timeout 1800s env CENO_GPU_MEM_TRACKING=1 CENO_CONCURRENT_CHIP_PROVING=0 CENO_GPU_ENABLE_WITGEN=1 cargo run --config net.git-fetch-with-cli=true --release --package ceno_zkvm --features gpu --bin e2e -- --platform=ceno --max-cycle-per-shard=1600 examples/target/riscv32im-ceno-zkvm-elf/release/examples/keccak_syscall 2>&1 | tee /tmp/ceno-keccak-memtracking-serial-after-estimate.log +``` +Final result +- Passed. +- Elapsed: `1:15.43`. +- Log: `/tmp/ceno-keccak-memtracking-serial-after-estimate.log`. 
+ +Heavy e2e command 2: concurrent chip proving + GPU witgen +```bash +CENO_GPU_MEM_TRACKING=0 CENO_CONCURRENT_CHIP_PROVING=1 CENO_GPU_ENABLE_WITGEN=1 cargo run --config net.git-fetch-with-cli=true --release --package ceno_zkvm --features gpu --bin e2e -- --platform=ceno --max-cycle-per-shard=1600 examples/target/riscv32im-ceno-zkvm-elf/release/examples/keccak_syscall +``` +Executed with timeout/log wrapper before estimator fix: +```bash +/usr/bin/time -f 'elapsed %E' timeout 1800s env CENO_GPU_MEM_TRACKING=0 CENO_CONCURRENT_CHIP_PROVING=1 CENO_GPU_ENABLE_WITGEN=1 cargo run --config net.git-fetch-with-cli=true --release --package ceno_zkvm --features gpu --bin e2e -- --platform=ceno --max-cycle-per-shard=1600 examples/target/riscv32im-ceno-zkvm-elf/release/examples/keccak_syscall 2>&1 | tee /tmp/ceno-keccak-concurrent-witgen.log +``` +Result +- Passed. +- Elapsed: `0:10.02`. +- Final pool peak around `291MB`. +- Log: `/tmp/ceno-keccak-concurrent-witgen.log`. + +Executed again after estimator fix: +```bash +/usr/bin/time -f 'elapsed %E' timeout 1800s env CENO_GPU_MEM_TRACKING=0 CENO_CONCURRENT_CHIP_PROVING=1 CENO_GPU_ENABLE_WITGEN=1 cargo run --config net.git-fetch-with-cli=true --release --package ceno_zkvm --features gpu --bin e2e -- --platform=ceno --max-cycle-per-shard=1600 examples/target/riscv32im-ceno-zkvm-elf/release/examples/keccak_syscall 2>&1 | tee /tmp/ceno-keccak-concurrent-witgen-after-estimate.log +``` +Final result +- Passed. +- Elapsed: `0:10.74`. +- Log: `/tmp/ceno-keccak-concurrent-witgen-after-estimate.log`. + +Diff hygiene commands +```bash +git diff --check +git -C ../ceno-gpu diff --check +``` +Result +- Both passed. 
From 12453f6ef1bc22099a0f18587b7288396caaebab Mon Sep 17 00:00:00 2001 From: "sm.wu" Date: Sun, 26 Apr 2026 10:51:05 +0800 Subject: [PATCH 03/25] Optimize compact logup ones allocation --- ceno_zkvm/src/scheme/gpu/memory.rs | 6 ++++- ceno_zkvm/src/scheme/gpu/mod.rs | 37 +++++++++++++++--------------- 2 files changed, 24 insertions(+), 19 deletions(-) diff --git a/ceno_zkvm/src/scheme/gpu/memory.rs b/ceno_zkvm/src/scheme/gpu/memory.rs index 679c9e8c6..b225ede85 100644 --- a/ceno_zkvm/src/scheme/gpu/memory.rs +++ b/ceno_zkvm/src/scheme/gpu/memory.rs @@ -482,7 +482,11 @@ fn estimate_tower_stage_components( } else if lk_denominator_last_layer.is_empty() { vec![] } else { - // Case when numerator is empty: create one-polynomials matching each - // denominator chunk's stored length. + // Case when numerator is empty: share one scalar compact polynomial. + // Its tail default is also ONE, so all logical numerator entries read as ONE + // without materializing per-chunk denominator-sized buffers. 
let nv = lk_denominator_last_layer[0][0].num_vars(); + let ones_poly = GpuPolynomialExt::new_with_scalar_len( + &cuda_hal.inner, + nv, + 1, + BB31Ext::ONE, + stream.as_ref(), + ) + .map_err(|e| format!("Failed to create compact shared ones numerator: {e:?}"))?; + let ones_poly: GpuPolynomialExt<'static> = unsafe { std::mem::transmute(ones_poly) }; + let one_len_bytes = ones_poly.buf.len() * std::mem::size_of::(); lk_denominator_last_layer .into_iter() .map(|lk_d_chunks| { - let p1_len = lk_d_chunks[0].evaluations().len(); - let p2_len = lk_d_chunks[1].evaluations().len(); - let p1_gpu = GpuPolynomialExt::new_with_scalar_len( - &cuda_hal.inner, + let p1_gpu = GpuPolynomialExt::new_with_tail_default( + ones_poly.buf.owned_subrange(0..one_len_bytes), nv, - p1_len, BB31Ext::ONE, - stream.as_ref(), - ) - .map_err(|e| format!("Failed to create compact ones numerator p1: {e:?}"))?; - let p2_gpu = GpuPolynomialExt::new_with_scalar_len( - &cuda_hal.inner, + ); + let p2_gpu = GpuPolynomialExt::new_with_tail_default( + ones_poly.buf.owned_subrange(0..one_len_bytes), nv, - p2_len, BB31Ext::ONE, - stream.as_ref(), - ) - .map_err(|e| format!("Failed to create compact ones numerator p2: {e:?}"))?; - let p1_gpu: GpuPolynomialExt<'static> = unsafe { std::mem::transmute(p1_gpu) }; - let p2_gpu: GpuPolynomialExt<'static> = unsafe { std::mem::transmute(p2_gpu) }; + ); let mut last_layer = vec![p1_gpu, p2_gpu]; last_layer.extend(lk_d_chunks); Ok(last_layer) From 7d60f015ed6fe4fb57927ccee5178896b5ae8070 Mon Sep 17 00:00:00 2001 From: "sm.wu" Date: Sun, 26 Apr 2026 11:22:35 +0800 Subject: [PATCH 04/25] update dep --- Cargo.lock | 124 ++++++----------------------------------------------- Cargo.toml | 82 +++++++++++++++++------------------ 2 files changed, 55 insertions(+), 151 deletions(-) diff --git a/Cargo.lock b/Cargo.lock index ba90fc0e6..04bcabaf7 100644 --- a/Cargo.lock +++ b/Cargo.lock @@ -1600,49 +1600,10 @@ version = "0.0.6" source = 
"registry+https://github.com/rust-lang/crates.io-index" checksum = "e2931af7e13dc045d8e9d26afccc6fa115d64e115c9c84b1166288b46f6782c2" -[[package]] -name = "cuda-config" -version = "0.1.0" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "4ee74643f7430213a1a78320f88649de309b20b80818325575e393f848f79f5d" -dependencies = [ - "glob", -] - -[[package]] -name = "cuda-runtime-sys" -version = "0.3.0-alpha.1" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "5d070b301187fee3c611e75a425cf12247b7c75c09729dbdef95cb9cb64e8c39" -dependencies = [ - "cuda-config", -] - [[package]] name = "cuda_hal" version = "0.1.0" -dependencies = [ - "anyhow", - "cuda-runtime-sys", - "cudarc", - "downcast-rs", - "either", - "ff_ext", - "itertools 0.13.0", - "mpcs", - "multilinear_extensions", - "p3", - "rand 0.8.5", - "rayon", - "sha2", - "sppark", - "sppark_plug", - "sumcheck", - "thiserror 1.0.69", - "tracing", - "transcript", - "witness", -] +source = "git+https://github.com/scroll-tech/ceno-gpu-mock.git?branch=main#fe8f7923b7d3a3823c27949fab0aab8e31011aa9" [[package]] name = "cudarc" @@ -2276,6 +2237,7 @@ dependencies = [ [[package]] name = "ff_ext" version = "0.1.0" +source = "git+https://github.com/scroll-tech/gkr-backend.git?branch=feat%2Fmle_no_padding#1fc9f700b54dfb63415e3d4115d778fc10ad9131" dependencies = [ "once_cell", "p3", @@ -2709,15 +2671,6 @@ dependencies = [ "digest 0.10.7", ] -[[package]] -name = "home" -version = "0.5.12" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "cc627f471c528ff0c4a49e1d5e60450c8f6461dd6d10ba9dcd3a61d3dff7728d" -dependencies = [ - "windows-sys 0.61.1", -] - [[package]] name = "iana-time-zone" version = "0.1.64" @@ -3149,12 +3102,6 @@ dependencies = [ "vcpkg", ] -[[package]] -name = "linux-raw-sys" -version = "0.4.15" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "d26c52dbd32dccf2d10cac7725f8eae5296885fb5703b261f7d0a0739ec807ab" - 
[[package]] name = "linux-raw-sys" version = "0.9.4" @@ -3296,6 +3243,7 @@ dependencies = [ [[package]] name = "mpcs" version = "0.1.0" +source = "git+https://github.com/scroll-tech/gkr-backend.git?branch=feat%2Fmle_no_padding#1fc9f700b54dfb63415e3d4115d778fc10ad9131" dependencies = [ "bincode 1.3.3", "clap", @@ -3319,6 +3267,7 @@ dependencies = [ [[package]] name = "multilinear_extensions" version = "0.1.0" +source = "git+https://github.com/scroll-tech/gkr-backend.git?branch=feat%2Fmle_no_padding#1fc9f700b54dfb63415e3d4115d778fc10ad9131" dependencies = [ "either", "ff_ext", @@ -4609,6 +4558,7 @@ dependencies = [ [[package]] name = "p3" version = "0.1.0" +source = "git+https://github.com/scroll-tech/gkr-backend.git?branch=feat%2Fmle_no_padding#1fc9f700b54dfb63415e3d4115d778fc10ad9131" dependencies = [ "p3-air", "p3-baby-bear", @@ -5176,6 +5126,7 @@ dependencies = [ [[package]] name = "poseidon" version = "0.1.0" +source = "git+https://github.com/scroll-tech/gkr-backend.git?branch=feat%2Fmle_no_padding#1fc9f700b54dfb63415e3d4115d778fc10ad9131" dependencies = [ "ff_ext", "p3", @@ -5773,19 +5724,6 @@ dependencies = [ "semver 1.0.26", ] -[[package]] -name = "rustix" -version = "0.38.44" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "fdb5bc1ae2baa591800df16c9ca78619bf65c0488b41b96ccec5d11220d8c154" -dependencies = [ - "bitflags", - "errno", - "libc", - "linux-raw-sys 0.4.15", - "windows-sys 0.59.0", -] - [[package]] name = "rustix" version = "1.0.7" @@ -5795,7 +5733,7 @@ dependencies = [ "bitflags", "errno", "libc", - "linux-raw-sys 0.9.4", + "linux-raw-sys", "windows-sys 0.59.0", ] @@ -6145,6 +6083,7 @@ dependencies = [ [[package]] name = "sp1-curves" version = "0.1.0" +source = "git+https://github.com/scroll-tech/gkr-backend.git?branch=feat%2Fmle_no_padding#1fc9f700b54dfb63415e3d4115d778fc10ad9131" dependencies = [ "cfg-if", "dashu", @@ -6179,25 +6118,6 @@ dependencies = [ "der", ] -[[package]] -name = "sppark" -version = "0.1.11" 
-dependencies = [ - "cc", - "which", -] - -[[package]] -name = "sppark_plug" -version = "0.1.0" -dependencies = [ - "cc", - "ff_ext", - "itertools 0.13.0", - "p3", - "sppark", -] - [[package]] name = "stable_deref_trait" version = "1.2.0" @@ -6288,6 +6208,7 @@ checksum = "13c2bddecc57b384dee18652358fb23172facb8a2c51ccc10d74c157bdea3292" [[package]] name = "sumcheck" version = "0.1.0" +source = "git+https://github.com/scroll-tech/gkr-backend.git?branch=feat%2Fmle_no_padding#1fc9f700b54dfb63415e3d4115d778fc10ad9131" dependencies = [ "either", "ff_ext", @@ -6305,6 +6226,7 @@ dependencies = [ [[package]] name = "sumcheck_macro" version = "0.1.0" +source = "git+https://github.com/scroll-tech/gkr-backend.git?branch=feat%2Fmle_no_padding#1fc9f700b54dfb63415e3d4115d778fc10ad9131" dependencies = [ "itertools 0.13.0", "p3", @@ -6385,7 +6307,7 @@ dependencies = [ "fastrand", "getrandom 0.3.2", "once_cell", - "rustix 1.0.7", + "rustix", "windows-sys 0.59.0", ] @@ -6711,6 +6633,7 @@ dependencies = [ [[package]] name = "transcript" version = "0.1.0" +source = "git+https://github.com/scroll-tech/gkr-backend.git?branch=feat%2Fmle_no_padding#1fc9f700b54dfb63415e3d4115d778fc10ad9131" dependencies = [ "ff_ext", "itertools 0.13.0", @@ -7001,21 +6924,10 @@ dependencies = [ "wasm-bindgen", ] -[[package]] -name = "which" -version = "4.4.2" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "87ba24419a2078cd2b0f2ede2691b6c66d8e47836da3b6db8265ebad47afbfc7" -dependencies = [ - "either", - "home", - "once_cell", - "rustix 0.38.44", -] - [[package]] name = "whir" version = "0.1.0" +source = "git+https://github.com/scroll-tech/gkr-backend.git?branch=feat%2Fmle_no_padding#1fc9f700b54dfb63415e3d4115d778fc10ad9131" dependencies = [ "bincode 1.3.3", "clap", @@ -7143,15 +7055,6 @@ dependencies = [ "windows-targets 0.53.4", ] -[[package]] -name = "windows-sys" -version = "0.61.1" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = 
"6f109e41dd4a3c848907eb83d5a42ea98b3769495597450cf6d153507b166f0f" -dependencies = [ - "windows-link", -] - [[package]] name = "windows-targets" version = "0.52.6" @@ -7311,6 +7214,7 @@ dependencies = [ [[package]] name = "witness" version = "0.1.0" +source = "git+https://github.com/scroll-tech/gkr-backend.git?branch=feat%2Fmle_no_padding#1fc9f700b54dfb63415e3d4115d778fc10ad9131" dependencies = [ "ff_ext", "multilinear_extensions", diff --git a/Cargo.toml b/Cargo.toml index 8b79e59fe..1aa0a77fb 100644 --- a/Cargo.toml +++ b/Cargo.toml @@ -1,16 +1,16 @@ [workspace] members = [ - "ceno_cli", - "ceno_emul", - "ceno_host", - "ceno_serde", - "ceno_rt", - "ceno_zkvm", - "ceno_recursion", - "derive", - "examples-builder", - "examples", - "guest_libs/*", + "ceno_cli", + "ceno_emul", + "ceno_host", + "ceno_serde", + "ceno_rt", + "ceno_zkvm", + "ceno_recursion", + "derive", + "examples-builder", + "examples", + "guest_libs/*", ] resolver = "2" @@ -27,16 +27,16 @@ version = "0.1.0" ceno_crypto_primitives = { git = "https://github.com/scroll-tech/ceno-patch.git", package = "ceno_crypto_primitives", branch = "main" } ceno_syscall = { git = "https://github.com/scroll-tech/ceno-patch.git", package = "ceno_syscall", branch = "main" } -ff_ext = { git = "https://github.com/scroll-tech/gkr-backend.git", package = "ff_ext", tag = "v1.0.0-alpha.24" } -mpcs = { git = "https://github.com/scroll-tech/gkr-backend.git", package = "mpcs", tag = "v1.0.0-alpha.24" } -multilinear_extensions = { git = "https://github.com/scroll-tech/gkr-backend.git", package = "multilinear_extensions", tag = "v1.0.0-alpha.24" } -p3 = { git = "https://github.com/scroll-tech/gkr-backend.git", package = "p3", tag = "v1.0.0-alpha.24" } -poseidon = { git = "https://github.com/scroll-tech/gkr-backend.git", package = "poseidon", tag = "v1.0.0-alpha.24" } -sp1-curves = { git = "https://github.com/scroll-tech/gkr-backend.git", package = "sp1-curves", tag = "v1.0.0-alpha.24" } -sumcheck = { git = 
"https://github.com/scroll-tech/gkr-backend.git", package = "sumcheck", tag = "v1.0.0-alpha.24" } -transcript = { git = "https://github.com/scroll-tech/gkr-backend.git", package = "transcript", tag = "v1.0.0-alpha.24" } -whir = { git = "https://github.com/scroll-tech/gkr-backend.git", package = "whir", tag = "v1.0.0-alpha.24" } -witness = { git = "https://github.com/scroll-tech/gkr-backend.git", package = "witness", tag = "v1.0.0-alpha.24" } +ff_ext = { git = "https://github.com/scroll-tech/gkr-backend.git", package = "ff_ext", branch = "feat/mle_no_padding" } +mpcs = { git = "https://github.com/scroll-tech/gkr-backend.git", package = "mpcs", branch = "feat/mle_no_padding" } +multilinear_extensions = { git = "https://github.com/scroll-tech/gkr-backend.git", package = "multilinear_extensions", branch = "feat/mle_no_padding" } +p3 = { git = "https://github.com/scroll-tech/gkr-backend.git", package = "p3", branch = "feat/mle_no_padding" } +poseidon = { git = "https://github.com/scroll-tech/gkr-backend.git", package = "poseidon", branch = "feat/mle_no_padding" } +sp1-curves = { git = "https://github.com/scroll-tech/gkr-backend.git", package = "sp1-curves", branch = "feat/mle_no_padding" } +sumcheck = { git = "https://github.com/scroll-tech/gkr-backend.git", package = "sumcheck", branch = "feat/mle_no_padding" } +transcript = { git = "https://github.com/scroll-tech/gkr-backend.git", package = "transcript", branch = "feat/mle_no_padding" } +whir = { git = "https://github.com/scroll-tech/gkr-backend.git", package = "whir", branch = "feat/mle_no_padding" } +witness = { git = "https://github.com/scroll-tech/gkr-backend.git", package = "witness", branch = "feat/mle_no_padding" } anyhow = { version = "1.0", default-features = false } bincode = "1" @@ -66,11 +66,11 @@ secp = "0.4.1" serde = { version = "1.0", features = ["derive", "rc"] } serde_json = "1.0" smallvec = { version = "1.13.2", features = [ - "const_generics", - "const_new", - "serde", - "union", - "write", + 
"const_generics", + "const_new", + "serde", + "union", + "write", ] } strum = "0.26" strum_macros = "0.26" @@ -79,7 +79,7 @@ thiserror = "2" thread_local = "1.1" tiny-keccak = { version = "2.0.2", features = ["keccak"] } tracing = { version = "0.1", features = [ - "attributes", + "attributes", ] } tracing-forest = { version = "0.1.6" } tracing-subscriber = { version = "0.3", features = ["env-filter"] } @@ -127,20 +127,20 @@ lto = "thin" #ceno_crypto_primitives = { path = "../ceno-patch/crypto-primitives", package = "ceno_crypto_primitives" } #ceno_syscall = { path = "../ceno-patch/syscall", package = "ceno_syscall" } -[patch."https://github.com/scroll-tech/ceno-gpu-mock.git"] -ceno_gpu = { path = "../ceno-gpu/cuda_hal", package = "cuda_hal", default-features = false, features = ["bb31"] } - -[patch."https://github.com/scroll-tech/gkr-backend"] -ff_ext = { path = "../gkr-backend/crates/ff_ext", package = "ff_ext" } -mpcs = { path = "../gkr-backend/crates/mpcs", package = "mpcs" } -multilinear_extensions = { path = "../gkr-backend/crates/multilinear_extensions", package = "multilinear_extensions" } -p3 = { path = "../gkr-backend/crates/p3", package = "p3" } -poseidon = { path = "../gkr-backend/crates/poseidon", package = "poseidon" } -sp1-curves = { path = "../gkr-backend/crates/curves", package = "sp1-curves" } -sumcheck = { path = "../gkr-backend/crates/sumcheck", package = "sumcheck" } -transcript = { path = "../gkr-backend/crates/transcript", package = "transcript" } -whir = { path = "../gkr-backend/crates/whir", package = "whir" } -witness = { path = "../gkr-backend/crates/witness", package = "witness" } +#[patch."https://github.com/scroll-tech/ceno-gpu-mock.git"] +#ceno_gpu = { path = "../ceno-gpu/cuda_hal", package = "cuda_hal", default-features = false, features = ["bb31"] } + +#[patch."https://github.com/scroll-tech/gkr-backend"] +#ff_ext = { path = "../gkr-backend/crates/ff_ext", package = "ff_ext" } +#mpcs = { path = "../gkr-backend/crates/mpcs", package = 
"mpcs" } +#multilinear_extensions = { path = "../gkr-backend/crates/multilinear_extensions", package = "multilinear_extensions" } +#p3 = { path = "../gkr-backend/crates/p3", package = "p3" } +#poseidon = { path = "../gkr-backend/crates/poseidon", package = "poseidon" } +#sp1-curves = { path = "../gkr-backend/crates/curves", package = "sp1-curves" } +#sumcheck = { path = "../gkr-backend/crates/sumcheck", package = "sumcheck" } +#transcript = { path = "../gkr-backend/crates/transcript", package = "transcript" } +#whir = { path = "../gkr-backend/crates/whir", package = "whir" } +#witness = { path = "../gkr-backend/crates/witness", package = "witness" } # [patch."https://github.com/scroll-tech/openvm.git"] # openvm = { path = "../openvm-scroll-tech/crates/toolchain/openvm", default-features = false } From e9fbe9c7612a1dbb2e91c094ccd818509b50c350 Mon Sep 17 00:00:00 2001 From: "sm.wu" Date: Sun, 26 Apr 2026 13:30:35 +0800 Subject: [PATCH 05/25] fix main mem estimation --- ceno_zkvm/src/scheme/gpu/memory.rs | 31 ++++++++++++++++++++++-------- ceno_zkvm/src/scheme/utils.rs | 8 ++++++-- 2 files changed, 29 insertions(+), 10 deletions(-) diff --git a/ceno_zkvm/src/scheme/gpu/memory.rs b/ceno_zkvm/src/scheme/gpu/memory.rs index b225ede85..5aaaf982c 100644 --- a/ceno_zkvm/src/scheme/gpu/memory.rs +++ b/ceno_zkvm/src/scheme/gpu/memory.rs @@ -11,11 +11,15 @@ use ceno_gpu::{ estimate_build_tower_memory, estimate_prove_tower_memory, estimate_sumcheck_memory, }; use ff_ext::ExtensionField; -use gkr_iop::gpu::{ - BB31Base, GpuBackend, - gpu_prover::{ - BB31Ext, CacheLevel, CudaHalBB31, MemTracker, get_gpu_cache_level, get_mem_tracking_mode, +use gkr_iop::{ + gpu::{ + BB31Base, GpuBackend, + gpu_prover::{ + BB31Ext, CacheLevel, CudaHalBB31, MemTracker, get_gpu_cache_level, + get_mem_tracking_mode, + }, }, + hal::MultilinearPolynomial, }; use mpcs::PolynomialCommitmentScheme; @@ -116,8 +120,8 @@ pub fn estimate_chip_proof_memory( composed_cs: &ComposedConstrainSystem, - 
occupied_rows: usize, + output_rows: usize, ) -> usize { let elem_size = std::mem::size_of::(); - tower_output_count(composed_cs) * occupied_rows * elem_size + tower_output_count(composed_cs) * output_rows * elem_size +} + +pub fn main_witness_output_rows>( + composed_cs: &ComposedConstrainSystem, + input: &ProofInput<'_, GpuBackend>, +) -> usize { + input + .witness + .first() + .map(|mle| mle.evaluations_len()) + .unwrap_or_else(|| input.num_instances() << composed_cs.rotation_vars().unwrap_or(0)) } pub(crate) fn estimate_main_constraints_bytes< diff --git a/ceno_zkvm/src/scheme/utils.rs b/ceno_zkvm/src/scheme/utils.rs index a52925b9a..bc53168db 100644 --- a/ceno_zkvm/src/scheme/utils.rs +++ b/ceno_zkvm/src/scheme/utils.rs @@ -707,9 +707,13 @@ pub fn build_main_witness< // GPU memory check: validate estimation against actual usage #[cfg(feature = "gpu")] { - let occupied_rows = input.num_instances() << composed_cs.rotation_vars().unwrap_or(0); + let output_rows = input + .witness + .first() + .map(|mle| mle.evaluations_len()) + .unwrap_or_else(|| input.num_instances() << composed_cs.rotation_vars().unwrap_or(0)); let estimated_bytes = - crate::scheme::gpu::estimate_main_witness_bytes(composed_cs, occupied_rows); + crate::scheme::gpu::estimate_main_witness_bytes(composed_cs, output_rows); crate::scheme::gpu::check_gpu_mem_estimation(gpu_mem_tracker, estimated_bytes); } From 5ecce046212ce157936df7b7b856f571bd72869a Mon Sep 17 00:00:00 2001 From: "sm.wu" Date: Sun, 26 Apr 2026 13:43:30 +0800 Subject: [PATCH 06/25] fix mem estimator --- ceno_zkvm/src/scheme/gpu/memory.rs | 43 +++++++++++++++++++++++++++++- 1 file changed, 42 insertions(+), 1 deletion(-) diff --git a/ceno_zkvm/src/scheme/gpu/memory.rs b/ceno_zkvm/src/scheme/gpu/memory.rs index 5aaaf982c..ea59f4fb5 100644 --- a/ceno_zkvm/src/scheme/gpu/memory.rs +++ b/ceno_zkvm/src/scheme/gpu/memory.rs @@ -12,6 +12,7 @@ use ceno_gpu::{ }; use ff_ext::ExtensionField; use gkr_iop::{ + evaluation::EvalExpression, gpu::{ 
BB31Base, GpuBackend, gpu_prover::{ @@ -376,7 +377,47 @@ pub fn estimate_main_witness_bytes( output_rows: usize, ) -> usize { let elem_size = std::mem::size_of::(); - tower_output_count(composed_cs) * output_rows * elem_size + main_witness_materialized_output_count(composed_cs) * output_rows * elem_size +} + +fn main_witness_materialized_output_count( + composed_cs: &ComposedConstrainSystem, +) -> usize { + let Some(gkr_circuit) = composed_cs.gkr_circuit.as_ref() else { + return 0; + }; + let final_layer_output_count = tower_output_count(composed_cs); + + gkr_circuit + .layers + .iter() + .enumerate() + .map(|(layer_index, layer)| { + let final_layer = layer_index == 0; + let out_evals = layer + .out_sel_and_eval_exprs + .iter() + .flat_map(|(_, out_eval)| out_eval.iter()); + + if final_layer { + out_evals + .take(final_layer_output_count) + .filter(|out_eval| main_witness_materializes_output(out_eval)) + .count() + } else { + out_evals + .filter(|out_eval| main_witness_materializes_output(out_eval)) + .count() + } + }) + .sum() +} + +fn main_witness_materializes_output(out_eval: &EvalExpression) -> bool { + matches!( + out_eval, + EvalExpression::Single(_) | EvalExpression::Linear(_, _, _) + ) } pub fn main_witness_output_rows>( From be14006053ad4e1a8073753564bb5a59e9ca9e5b Mon Sep 17 00:00:00 2001 From: "sm.wu" Date: Sun, 26 Apr 2026 21:14:51 +0800 Subject: [PATCH 07/25] snapshot compact tower estimator state --- Cargo.lock | 124 ++++++++++-- Cargo.toml | 62 +++--- ceno_zkvm/src/scheme/gpu/memory.rs | 35 +++- ceno_zkvm/src/scheme/gpu/mod.rs | 48 ++++- ceno_zkvm/src/scheme/prover.rs | 29 ++- ceno_zkvm/src/scheme/utils.rs | 26 ++- summary.md | 295 +++++++++++++++++++++++++++++ 7 files changed, 550 insertions(+), 69 deletions(-) diff --git a/Cargo.lock b/Cargo.lock index 04bcabaf7..ba90fc0e6 100644 --- a/Cargo.lock +++ b/Cargo.lock @@ -1600,10 +1600,49 @@ version = "0.0.6" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = 
"e2931af7e13dc045d8e9d26afccc6fa115d64e115c9c84b1166288b46f6782c2" +[[package]] +name = "cuda-config" +version = "0.1.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "4ee74643f7430213a1a78320f88649de309b20b80818325575e393f848f79f5d" +dependencies = [ + "glob", +] + +[[package]] +name = "cuda-runtime-sys" +version = "0.3.0-alpha.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "5d070b301187fee3c611e75a425cf12247b7c75c09729dbdef95cb9cb64e8c39" +dependencies = [ + "cuda-config", +] + [[package]] name = "cuda_hal" version = "0.1.0" -source = "git+https://github.com/scroll-tech/ceno-gpu-mock.git?branch=main#fe8f7923b7d3a3823c27949fab0aab8e31011aa9" +dependencies = [ + "anyhow", + "cuda-runtime-sys", + "cudarc", + "downcast-rs", + "either", + "ff_ext", + "itertools 0.13.0", + "mpcs", + "multilinear_extensions", + "p3", + "rand 0.8.5", + "rayon", + "sha2", + "sppark", + "sppark_plug", + "sumcheck", + "thiserror 1.0.69", + "tracing", + "transcript", + "witness", +] [[package]] name = "cudarc" @@ -2237,7 +2276,6 @@ dependencies = [ [[package]] name = "ff_ext" version = "0.1.0" -source = "git+https://github.com/scroll-tech/gkr-backend.git?branch=feat%2Fmle_no_padding#1fc9f700b54dfb63415e3d4115d778fc10ad9131" dependencies = [ "once_cell", "p3", @@ -2671,6 +2709,15 @@ dependencies = [ "digest 0.10.7", ] +[[package]] +name = "home" +version = "0.5.12" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "cc627f471c528ff0c4a49e1d5e60450c8f6461dd6d10ba9dcd3a61d3dff7728d" +dependencies = [ + "windows-sys 0.61.1", +] + [[package]] name = "iana-time-zone" version = "0.1.64" @@ -3102,6 +3149,12 @@ dependencies = [ "vcpkg", ] +[[package]] +name = "linux-raw-sys" +version = "0.4.15" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "d26c52dbd32dccf2d10cac7725f8eae5296885fb5703b261f7d0a0739ec807ab" + [[package]] name = "linux-raw-sys" version = "0.9.4" @@ -3243,7 +3296,6 
@@ dependencies = [ [[package]] name = "mpcs" version = "0.1.0" -source = "git+https://github.com/scroll-tech/gkr-backend.git?branch=feat%2Fmle_no_padding#1fc9f700b54dfb63415e3d4115d778fc10ad9131" dependencies = [ "bincode 1.3.3", "clap", @@ -3267,7 +3319,6 @@ dependencies = [ [[package]] name = "multilinear_extensions" version = "0.1.0" -source = "git+https://github.com/scroll-tech/gkr-backend.git?branch=feat%2Fmle_no_padding#1fc9f700b54dfb63415e3d4115d778fc10ad9131" dependencies = [ "either", "ff_ext", @@ -4558,7 +4609,6 @@ dependencies = [ [[package]] name = "p3" version = "0.1.0" -source = "git+https://github.com/scroll-tech/gkr-backend.git?branch=feat%2Fmle_no_padding#1fc9f700b54dfb63415e3d4115d778fc10ad9131" dependencies = [ "p3-air", "p3-baby-bear", @@ -5126,7 +5176,6 @@ dependencies = [ [[package]] name = "poseidon" version = "0.1.0" -source = "git+https://github.com/scroll-tech/gkr-backend.git?branch=feat%2Fmle_no_padding#1fc9f700b54dfb63415e3d4115d778fc10ad9131" dependencies = [ "ff_ext", "p3", @@ -5724,6 +5773,19 @@ dependencies = [ "semver 1.0.26", ] +[[package]] +name = "rustix" +version = "0.38.44" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "fdb5bc1ae2baa591800df16c9ca78619bf65c0488b41b96ccec5d11220d8c154" +dependencies = [ + "bitflags", + "errno", + "libc", + "linux-raw-sys 0.4.15", + "windows-sys 0.59.0", +] + [[package]] name = "rustix" version = "1.0.7" @@ -5733,7 +5795,7 @@ dependencies = [ "bitflags", "errno", "libc", - "linux-raw-sys", + "linux-raw-sys 0.9.4", "windows-sys 0.59.0", ] @@ -6083,7 +6145,6 @@ dependencies = [ [[package]] name = "sp1-curves" version = "0.1.0" -source = "git+https://github.com/scroll-tech/gkr-backend.git?branch=feat%2Fmle_no_padding#1fc9f700b54dfb63415e3d4115d778fc10ad9131" dependencies = [ "cfg-if", "dashu", @@ -6118,6 +6179,25 @@ dependencies = [ "der", ] +[[package]] +name = "sppark" +version = "0.1.11" +dependencies = [ + "cc", + "which", +] + +[[package]] +name = "sppark_plug" 
+version = "0.1.0" +dependencies = [ + "cc", + "ff_ext", + "itertools 0.13.0", + "p3", + "sppark", +] + [[package]] name = "stable_deref_trait" version = "1.2.0" @@ -6208,7 +6288,6 @@ checksum = "13c2bddecc57b384dee18652358fb23172facb8a2c51ccc10d74c157bdea3292" [[package]] name = "sumcheck" version = "0.1.0" -source = "git+https://github.com/scroll-tech/gkr-backend.git?branch=feat%2Fmle_no_padding#1fc9f700b54dfb63415e3d4115d778fc10ad9131" dependencies = [ "either", "ff_ext", @@ -6226,7 +6305,6 @@ dependencies = [ [[package]] name = "sumcheck_macro" version = "0.1.0" -source = "git+https://github.com/scroll-tech/gkr-backend.git?branch=feat%2Fmle_no_padding#1fc9f700b54dfb63415e3d4115d778fc10ad9131" dependencies = [ "itertools 0.13.0", "p3", @@ -6307,7 +6385,7 @@ dependencies = [ "fastrand", "getrandom 0.3.2", "once_cell", - "rustix", + "rustix 1.0.7", "windows-sys 0.59.0", ] @@ -6633,7 +6711,6 @@ dependencies = [ [[package]] name = "transcript" version = "0.1.0" -source = "git+https://github.com/scroll-tech/gkr-backend.git?branch=feat%2Fmle_no_padding#1fc9f700b54dfb63415e3d4115d778fc10ad9131" dependencies = [ "ff_ext", "itertools 0.13.0", @@ -6924,10 +7001,21 @@ dependencies = [ "wasm-bindgen", ] +[[package]] +name = "which" +version = "4.4.2" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "87ba24419a2078cd2b0f2ede2691b6c66d8e47836da3b6db8265ebad47afbfc7" +dependencies = [ + "either", + "home", + "once_cell", + "rustix 0.38.44", +] + [[package]] name = "whir" version = "0.1.0" -source = "git+https://github.com/scroll-tech/gkr-backend.git?branch=feat%2Fmle_no_padding#1fc9f700b54dfb63415e3d4115d778fc10ad9131" dependencies = [ "bincode 1.3.3", "clap", @@ -7055,6 +7143,15 @@ dependencies = [ "windows-targets 0.53.4", ] +[[package]] +name = "windows-sys" +version = "0.61.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "6f109e41dd4a3c848907eb83d5a42ea98b3769495597450cf6d153507b166f0f" +dependencies = [ + 
"windows-link", +] + [[package]] name = "windows-targets" version = "0.52.6" @@ -7214,7 +7311,6 @@ dependencies = [ [[package]] name = "witness" version = "0.1.0" -source = "git+https://github.com/scroll-tech/gkr-backend.git?branch=feat%2Fmle_no_padding#1fc9f700b54dfb63415e3d4115d778fc10ad9131" dependencies = [ "ff_ext", "multilinear_extensions", diff --git a/Cargo.toml b/Cargo.toml index 1aa0a77fb..fbbbab29a 100644 --- a/Cargo.toml +++ b/Cargo.toml @@ -1,16 +1,16 @@ [workspace] members = [ - "ceno_cli", - "ceno_emul", - "ceno_host", - "ceno_serde", - "ceno_rt", - "ceno_zkvm", - "ceno_recursion", - "derive", - "examples-builder", - "examples", - "guest_libs/*", + "ceno_cli", + "ceno_emul", + "ceno_host", + "ceno_serde", + "ceno_rt", + "ceno_zkvm", + "ceno_recursion", + "derive", + "examples-builder", + "examples", + "guest_libs/*", ] resolver = "2" @@ -66,11 +66,11 @@ secp = "0.4.1" serde = { version = "1.0", features = ["derive", "rc"] } serde_json = "1.0" smallvec = { version = "1.13.2", features = [ - "const_generics", - "const_new", - "serde", - "union", - "write", + "const_generics", + "const_new", + "serde", + "union", + "write", ] } strum = "0.26" strum_macros = "0.26" @@ -79,7 +79,7 @@ thiserror = "2" thread_local = "1.1" tiny-keccak = { version = "2.0.2", features = ["keccak"] } tracing = { version = "0.1", features = [ - "attributes", + "attributes", ] } tracing-forest = { version = "0.1.6" } tracing-subscriber = { version = "0.3", features = ["env-filter"] } @@ -127,20 +127,20 @@ lto = "thin" #ceno_crypto_primitives = { path = "../ceno-patch/crypto-primitives", package = "ceno_crypto_primitives" } #ceno_syscall = { path = "../ceno-patch/syscall", package = "ceno_syscall" } -#[patch."https://github.com/scroll-tech/ceno-gpu-mock.git"] -#ceno_gpu = { path = "../ceno-gpu/cuda_hal", package = "cuda_hal", default-features = false, features = ["bb31"] } - -#[patch."https://github.com/scroll-tech/gkr-backend"] -#ff_ext = { path = "../gkr-backend/crates/ff_ext", 
package = "ff_ext" } -#mpcs = { path = "../gkr-backend/crates/mpcs", package = "mpcs" } -#multilinear_extensions = { path = "../gkr-backend/crates/multilinear_extensions", package = "multilinear_extensions" } -#p3 = { path = "../gkr-backend/crates/p3", package = "p3" } -#poseidon = { path = "../gkr-backend/crates/poseidon", package = "poseidon" } -#sp1-curves = { path = "../gkr-backend/crates/curves", package = "sp1-curves" } -#sumcheck = { path = "../gkr-backend/crates/sumcheck", package = "sumcheck" } -#transcript = { path = "../gkr-backend/crates/transcript", package = "transcript" } -#whir = { path = "../gkr-backend/crates/whir", package = "whir" } -#witness = { path = "../gkr-backend/crates/witness", package = "witness" } +[patch."https://github.com/scroll-tech/ceno-gpu-mock.git"] +ceno_gpu = { path = "../ceno-gpu/cuda_hal", package = "cuda_hal", default-features = false, features = ["bb31"] } + +[patch."https://github.com/scroll-tech/gkr-backend"] +ff_ext = { path = "../gkr-backend/crates/ff_ext", package = "ff_ext" } +mpcs = { path = "../gkr-backend/crates/mpcs", package = "mpcs" } +multilinear_extensions = { path = "../gkr-backend/crates/multilinear_extensions", package = "multilinear_extensions" } +p3 = { path = "../gkr-backend/crates/p3", package = "p3" } +poseidon = { path = "../gkr-backend/crates/poseidon", package = "poseidon" } +sp1-curves = { path = "../gkr-backend/crates/curves", package = "sp1-curves" } +sumcheck = { path = "../gkr-backend/crates/sumcheck", package = "sumcheck" } +transcript = { path = "../gkr-backend/crates/transcript", package = "transcript" } +whir = { path = "../gkr-backend/crates/whir", package = "whir" } +witness = { path = "../gkr-backend/crates/witness", package = "witness" } # [patch."https://github.com/scroll-tech/openvm.git"] # openvm = { path = "../openvm-scroll-tech/crates/toolchain/openvm", default-features = false } diff --git a/ceno_zkvm/src/scheme/gpu/memory.rs b/ceno_zkvm/src/scheme/gpu/memory.rs index 
ea59f4fb5..7ef24d36d 100644 --- a/ceno_zkvm/src/scheme/gpu/memory.rs +++ b/ceno_zkvm/src/scheme/gpu/memory.rs @@ -50,10 +50,22 @@ const ESTIMATION_SAFETY_MARGIN_BYTES: usize = 10 * 1024 * 1024; // reserved head /// - Under-estimate (actual > estimated): diff must be <= `ESTIMATION_TOLERANCE_BYTES` /// - Over-estimate (estimated > actual): diff must be <= `ESTIMATION_SAFETY_MARGIN_BYTES` pub fn check_gpu_mem_estimation(mem_tracker: Option, estimated_bytes: usize) { + check_gpu_mem_estimation_with_context(mem_tracker, estimated_bytes, None); +} + +pub fn check_gpu_mem_estimation_with_context( + mem_tracker: Option, + estimated_bytes: usize, + context: Option<&str>, +) { // `mem_tracker will` be Some only in sequential mode with mem tracking enabled, so if it's None, do nothing if let Some(mem_tracker) = mem_tracker { const ONE_MB: usize = 1024 * 1024; let label = mem_tracker.name(); + let label = context + .filter(|context| !context.is_empty()) + .map(|context| format!("{label}[{context}]")) + .unwrap_or_else(|| label.to_string()); let mem_stats = mem_tracker.finish(); let actual_bytes = mem_stats.mem_occupancy as usize; let diff = estimated_bytes as isize - actual_bytes as isize; @@ -424,6 +436,17 @@ pub fn main_witness_output_rows, input: &ProofInput<'_, GpuBackend>, ) -> usize { + if composed_cs + .gkr_circuit + .as_ref() + .and_then(|circuit| circuit.layers.last()) + .is_some_and(|input_layer| input_layer.in_eval_expr.is_empty()) + { + if let Some(structural_mle) = input.structural_witness.first() { + return structural_mle.evaluations_len(); + } + } + input .witness .first() @@ -547,7 +570,17 @@ fn estimate_tower_stage_components( } mod util; pub(crate) use memory::{ - check_gpu_mem_estimation, estimate_chip_proof_memory, estimate_main_witness_bytes, - estimate_replay_materialization_bytes_for_plan, estimate_tower_bytes, - estimate_tower_stage_bytes, init_gpu_mem_tracker, + check_gpu_mem_estimation, check_gpu_mem_estimation_with_context, 
estimate_chip_proof_memory, + estimate_main_witness_bytes, estimate_replay_materialization_bytes_for_plan, + estimate_tower_bytes, estimate_tower_stage_bytes, init_gpu_mem_tracker, }; use memory::{ estimate_ecc_quark_bytes_from_num_vars, estimate_main_constraints_bytes, @@ -1907,7 +1907,15 @@ impl> TowerProver(composed_cs, input); - check_gpu_mem_estimation(gpu_mem_tracker, estimated_bytes); + check_gpu_mem_estimation_with_context( + gpu_mem_tracker, + estimated_bytes, + composed_cs + .gkr_circuit + .as_ref() + .and_then(|circuit| circuit.layers.first()) + .map(|layer| layer.name.as_str()), + ); res } @@ -1956,7 +1964,15 @@ impl> MainSumcheckProver(composed_cs, input); - check_gpu_mem_estimation(gpu_mem_tracker, estimated_bytes); + check_gpu_mem_estimation_with_context( + gpu_mem_tracker, + estimated_bytes, + composed_cs + .gkr_circuit + .as_ref() + .and_then(|circuit| circuit.layers.first()) + .map(|layer| layer.name.as_str()), + ); res } @@ -1993,7 +2009,15 @@ impl> EccQuarkProver> estimated_replay_bytes as f64 / (1024.0 * 1024.0), ); task.input.witness = if let Some(trace_idx) = task.witness_trace_idx { - check_gpu_mem_estimation(gpu_mem_tracker, 0); + check_gpu_mem_estimation_with_context( + gpu_mem_tracker, + 0, + Some(task.circuit_name.as_str()), + ); info_span!("[ceno] extract_witness_mles").in_scope(|| { extract_witness_mles_for_trace::( pcs_data, @@ -2210,7 +2238,11 @@ impl> }) } else { let witness_rmm = replay_plan.replay_witness().expect("GPU raw replay failed"); - check_gpu_mem_estimation(gpu_mem_tracker, estimated_replay_bytes); + check_gpu_mem_estimation_with_context( + gpu_mem_tracker, + estimated_replay_bytes, + Some(task.circuit_name.as_str()), + ); info_span!("[ceno] replay_gpu_witness_from_raw") .in_scope(|| extract_witness_mles_for_trace_rmm::(witness_rmm)) }; diff --git a/ceno_zkvm/src/scheme/prover.rs b/ceno_zkvm/src/scheme/prover.rs index 45f786c6f..651810b30 100644 --- a/ceno_zkvm/src/scheme/prover.rs +++ b/ceno_zkvm/src/scheme/prover.rs @@ 
-1117,12 +1117,11 @@ where scheme::{ constants::NUM_FANIN, gpu::{ - build_tower_witness_gpu, check_gpu_mem_estimation, - estimate_replay_materialization_bytes_for_plan, estimate_tower_stage_bytes, - extract_out_evals_from_gpu_towers, extract_witness_mles_for_trace, - log_gpu_device_state, log_gpu_pool_usage, prove_ec_sum_quark_impl, - prove_main_constraints_impl, prove_rotation_impl, prove_tower_relation_impl, - transport_structural_witness_to_gpu, + build_tower_witness_gpu, estimate_replay_materialization_bytes_for_plan, + estimate_tower_stage_bytes, extract_out_evals_from_gpu_towers, + extract_witness_mles_for_trace, log_gpu_device_state, log_gpu_pool_usage, + prove_ec_sum_quark_impl, prove_main_constraints_impl, prove_rotation_impl, + prove_tower_relation_impl, transport_structural_witness_to_gpu, }, }, }; @@ -1164,7 +1163,11 @@ where log_gpu_device_state(&format!("{name}:before_replay")); log_gpu_pool_usage(&format!("{name}:before_replay")); let witness_rmm = replay_plan.replay_witness()?; - check_gpu_mem_estimation(gpu_mem_tracker, estimated_replay_bytes); + crate::scheme::gpu::check_gpu_mem_estimation_with_context( + gpu_mem_tracker, + estimated_replay_bytes, + Some(name), + ); input.witness = info_span!("[ceno] replay_gpu_witness_from_raw") .in_scope(|| crate::scheme::gpu::extract_witness_mles_for_trace_rmm::(witness_rmm)); if let Some(structural_rmm_cached) = structural_rmm.as_ref() { @@ -1278,7 +1281,11 @@ where extract_out_evals_from_gpu_towers(&prod_gpu, &logup_gpu, r_set_len); Ok::<_, ZKVMError>((prod_gpu, logup_gpu, lk_out_evals, w_out_evals, r_out_evals)) })?; - check_gpu_mem_estimation(tower_build_mem_tracker, tower_build_estimated_bytes); + crate::scheme::gpu::check_gpu_mem_estimation_with_context( + tower_build_mem_tracker, + tower_build_estimated_bytes, + Some(name), + ); log_gpu_device_state(&format!("{name}:after_build_tower_witness")); log_gpu_pool_usage(&format!("{name}:after_build_tower_witness")); @@ -1319,7 +1326,11 @@ where 
log_gpu_pool_usage(&format!("{name}:after_prove_tower")); let rt_tower: Point = unsafe { std::mem::transmute(rt_tower_gl) }; let tower_proof: TowerProofs = unsafe { std::mem::transmute(tower_proof_gpu) }; - check_gpu_mem_estimation(tower_prove_mem_tracker, tower_prove_estimated_bytes); + crate::scheme::gpu::check_gpu_mem_estimation_with_context( + tower_prove_mem_tracker, + tower_prove_estimated_bytes, + Some(name), + ); drop(records); drop(tower_input); log_gpu_device_state(&format!("{name}:after_drop_tower")); diff --git a/ceno_zkvm/src/scheme/utils.rs b/ceno_zkvm/src/scheme/utils.rs index bc53168db..4921d7f8c 100644 --- a/ceno_zkvm/src/scheme/utils.rs +++ b/ceno_zkvm/src/scheme/utils.rs @@ -707,14 +707,28 @@ pub fn build_main_witness< // GPU memory check: validate estimation against actual usage #[cfg(feature = "gpu")] { - let output_rows = input - .witness - .first() - .map(|mle| mle.evaluations_len()) - .unwrap_or_else(|| input.num_instances() << composed_cs.rotation_vars().unwrap_or(0)); + let input_layer_has_only_structural_inputs = composed_cs + .gkr_circuit + .as_ref() + .and_then(|circuit| circuit.layers.last()) + .is_some_and(|input_layer| input_layer.in_eval_expr.is_empty()); + let output_rows = if input_layer_has_only_structural_inputs { + input + .structural_witness + .first() + .map(|mle| mle.evaluations_len()) + } else { + None + } + .or_else(|| input.witness.first().map(|mle| mle.evaluations_len())) + .unwrap_or_else(|| input.num_instances() << composed_cs.rotation_vars().unwrap_or(0)); let estimated_bytes = crate::scheme::gpu::estimate_main_witness_bytes(composed_cs, output_rows); - crate::scheme::gpu::check_gpu_mem_estimation(gpu_mem_tracker, estimated_bytes); + crate::scheme::gpu::check_gpu_mem_estimation_with_context( + gpu_mem_tracker, + estimated_bytes, + gkr_circuit.layers.first().map(|layer| layer.name.as_str()), + ); } gkr_circuit_out.0.0 diff --git a/summary.md b/summary.md index a062bb59f..3b6979e24 100644 --- a/summary.md +++ 
b/summary.md @@ -286,3 +286,298 @@ git -C ../ceno-gpu diff --check ``` Result - Both passed. + +## Restart state: benchmark memcheck under-estimate follow-up + +Date: 2026-04-26 + +Current task +- User reported a remaining GPU memory under-estimate when running the top-entry repo `/home/wusm/rust/ceno-reth-benchmark` against the local `/home/wusm/rust/ceno` repo. +- The benchmark command must use `--rpc-url "$CENO_RPC"`; do not paste or persist concrete RPC URLs in logs or docs. +- In the current shell, `CENO_RPC` was not set, so the benchmark repro could not be completed before restart. + +Current repo state +- Main repo: `/home/wusm/rust/ceno` +- Current branch includes commit: + - `5ecce046 fix mem estimator` +- Important existing fix already present: + - `ceno_zkvm/src/scheme/gpu/memory.rs` now estimates `build_main_witness` by materialized GKR outputs, not only final tower outputs. + - This fixed the earlier `Ecall_Keccak` under-estimate where old estimate was around `11.73MB` and actual was `16.00MB`. +- Local root `Cargo.toml` and `Cargo.lock` are dirty from pre-existing dependency/local-path work; do not accidentally revert them unless explicitly requested. + +New diagnostic patch added before restart +- Added contextual labels to GPU memcheck output so future failures identify both stage and circuit. +- Files touched: + - `ceno_zkvm/src/scheme/gpu/memory.rs` + - added `check_gpu_mem_estimation_with_context(...)` + - labels now print like `build_main_witness[Ecall_Keccak]` + - `ceno_zkvm/src/scheme/utils.rs` + - `build_main_witness` memcheck now includes first GKR layer name + - `ceno_zkvm/src/scheme/prover.rs` + - replay/build-tower/prove-tower memchecks now include circuit name in sequential GPU proving path + - `ceno_zkvm/src/scheme/gpu/mod.rs` + - prover trait memchecks now include first GKR layer name or task circuit name where available +- This patch is diagnostic/safety oriented; it does not change memory estimates. 
+ +Validation already run after diagnostic patch +```bash +cargo fmt --check +``` +Result +- Passed. + +```bash +timeout 300s cargo check --features gpu --package ceno_zkvm --bin e2e +``` +Result +- Passed. + +Lightweight memcheck e2e command run after diagnostic patch +```bash +/usr/bin/time -f 'elapsed %E' timeout 900s env CENO_GPU_MEM_TRACKING=1 CENO_CONCURRENT_CHIP_PROVING=0 CENO_GPU_ENABLE_WITGEN=1 cargo run --config net.git-fetch-with-cli=true --release --package ceno_zkvm --features gpu --bin e2e -- --platform=ceno --max-cycle-per-shard=1600 examples/target/riscv32im-ceno-zkvm-elf/release/examples/keccak_syscall 2>&1 | tee /tmp/ceno-light-keccak-context-memcheck.log +``` +Result +- Memcheck stages passed; no under-estimate panic. +- The run still fails later at the known verifier assertion in `gkr_iop/src/gkr/layer/zerocheck_layer.rs:306`. +- Useful log examples: + - `replay_gpu_witness_from_raw[Ecall_Keccak]: estimated=11.23MB, actual=11.23MB` + - `build_main_witness[Ecall_Keccak]: estimated=32.41MB, actual=32.59MB` + - `build_tower_witness_gpu[Ecall_Keccak]: estimated=105.83MB, actual=106.01MB` + - `prove_tower_relation_gpu[Ecall_Keccak]: estimated=36.84MB, actual=37.26MB` + - `replay_gpu_witness_from_raw[ShardRamCircuit]: estimated=0.38MB, actual=0.38MB` + - `build_main_witness[ShardRamCircuit_main]: estimated=0.01MB, actual=0.01MB` + - `build_tower_witness_gpu[ShardRamCircuit]: estimated=0.01MB, actual=0.02MB` + +Important conclusion so far +- Lightweight Ceno `keccak_syscall` no longer reproduces the reported memcheck under-estimate. +- The remaining issue appears large-payload/top-entry specific and needs the benchmark repro with `CENO_RPC` exported. +- Because contextual memcheck labels are now in place, the next benchmark run should immediately identify the failing stage and circuit. 
+ +Required environment for next session +```bash +export CENO_RPC='' +``` +- The assistant cannot see shell variables unless they are present in the execution environment. +- Verify with: +```bash +if [ -n "${CENO_RPC:-}" ]; then echo 'CENO_RPC is set'; else echo 'CENO_RPC is NOT set'; fi +``` + +Benchmark repro command to run next +- Workdir: `/home/wusm/rust/ceno-reth-benchmark` +- Use timeout and tee log. +- Keep `--rpc-url "$CENO_RPC"` exactly; do not expand into a persisted command string. + +```bash +/usr/bin/time -f 'elapsed %E' timeout 2400s env \ + CENO_GPU_MEM_TRACKING=1 \ + CENO_CONCURRENT_CHIP_PROVING=0 \ + CENO_GPU_ENABLE_WITGEN=1 \ + RUST_MIN_STACK=16777216 \ + RUST_BACKTRACE=1 \ + CYCLE_TRACKER_MAX_DEPTH=4 \ + OUTPUT_PATH=metrics.json \ + CENO_GPU_CACHE_LEVEL=0 \ + RUSTFLAGS='-C target-feature=+avx2' \ + JEMALLOC_SYS_WITH_MALLOC_CONF='retain:true,metadata_thp:always,thp:always,dirty_decay_ms:-1,muzzy_decay_ms:-1' \ + RUST_LOG=debug \ + cargo run --features jemalloc --features metrics --features perf-metrics --features gpu --bin ceno-reth-benchmark-bin -- \ + --block-number 23587691 \ + --rpc-url "$CENO_RPC" \ + --cache-dir block_data \ + --mode prove-app \ + --app-proofs ./app_proof.bitcode \ + --shard-id 0 \ + --chain-id 1 \ + 2>&1 | tee /tmp/ceno-reth-benchmark-memcheck.log +``` + +After benchmark fails or completes +1. Extract memcheck failure context: +```bash +rg -n "under-estimate|over-estimate|\\[memcheck\\].*diff=-" /tmp/ceno-reth-benchmark-memcheck.log | tail -120 +``` +2. The failing line should now include a label like: + - `build_main_witness[]` + - `build_tower_witness_gpu[]` + - `prove_tower_relation_gpu[]` + - `replay_gpu_witness_from_raw[]` +3. Patch only the relevant estimator in `/home/wusm/rust/ceno`. +4. Re-run lightweight Ceno check first: +```bash +cargo fmt --check +timeout 300s cargo check --features gpu --package ceno_zkvm --bin e2e +``` +5. Then rerun the benchmark command above. 
+ +Security hygiene +- If a concrete RPC URL accidentally appears in any local log, scrub it immediately: +```bash +for f in /tmp/ceno-reth-benchmark-memcheck.log /home/wusm/rust/ceno-reth-benchmark/*.txt /home/wusm/rust/ceno-reth-benchmark/*.log; do + [ -f "$f" ] || continue + perl -0pi -e 's#https://eth-mainnet\\.g\\.alchemy\\.com/v2/[^\\s\\x27\\"]+#\\$CENO_RPC#g' "$f" +done +``` +- Verify no RPC string remains: +```bash +rg -n 'alchemy|eth-mainnet\.g\.alchemy' /tmp/ceno-reth-benchmark-memcheck.log /home/wusm/rust/ceno-reth-benchmark -g '*.txt' -g '*.log' -g '*.md' -g '*.json' 2>/dev/null || true +``` + +## Architecture refresher: compact GPU witness / memory-estimator terminology + +This section is intended for a fresh session before touching estimators or compact witness code. + +Core terminology +- `occupied rows` / `actual rows`: + - Real number of rows with data for a chip or replay plan. + - Usually `input.num_instances() << rotation_vars` for normal chip inputs. + - For replayed GPU witgen, prefer replay-plan-specific real rows when available. +- `logical domain` / `full domain`: + - Power-of-two domain implied by `num_vars`. + - Some protocols/verifier semantics still reason over this domain. + - Prover storage should avoid allocating it when compact storage is sufficient. +- `compact witness`: + - Device/host storage sized by occupied rows, not full logical domain. + - This is the intended design for the GPU witgen/prover path. +- `materialized output`: + - GKR layer output MLE that is actually allocated during `build_main_witness`. + - `EvalExpression::Single` and `EvalExpression::Linear` materialize; `Zero` does not. +- `final/output GKR layer`: + - `gkr_circuit.layers[0]` because circuit layers are ordered output-to-input. + - The `output_mask` is applied only to this final/output layer during tower witness build. +- `internal GKR layers`: + - Any layer after index 0 in `gkr_circuit.layers`. 
+ - These do not receive the final tower `output_mask`; all non-zero outputs are materialized. +- `replay path`: + - GPU witgen can replay raw records into device-backed witness matrices just-in-time. + - Large replay-heavy chips currently include `Ecall_Keccak` and `ShardRamCircuit`. +- `stage split`: + - Large replay chips materialize witness multiple times for separate stages to reduce peak VRAM. + - Estimator must model stage-local peaks, not sum all stages as simultaneously live. + +Important module map +- `ceno_zkvm/src/scheme/gpu/memory.rs` + - Central GPU memory estimator and memcheck assertion logic. + - Key functions: + - `estimate_chip_proof_memory` + - `estimate_trace_bytes` + - `estimate_main_witness_bytes` + - `estimate_tower_stage_components` + - `estimate_main_constraints_bytes` + - `estimate_replay_materialization_bytes_for_plan` + - `check_gpu_mem_estimation_with_context` +- `ceno_zkvm/src/scheme/utils.rs` + - Builds main GKR witness through `build_main_witness` / `gkr_witness`. + - Owns output materialization mask logic: + - `tower_output_count` + - `build_output_materialization_mask` + - `first_layer_output_group_stage_masks` + - Critical design point: + - `output_mask` is applied only to final/output GKR layer. +- `ceno_zkvm/src/scheme/prover.rs` + - Sequential per-chip GPU proving flow and replay stage splitting. + - Important stages: + - replay raw GPU witness + - build main witness + - build tower witness + - prove tower + - replay again for ECC/main constraints if needed +- `ceno_zkvm/src/scheme/gpu/mod.rs` + - GPU prover trait implementations and shared helpers. + - Includes trait-level memchecks for tower/main/ecc/replay helper paths. +- `../ceno-gpu/cuda_hal/src/common/tower/*` + - GPU tower witness/proof host-side implementation. +- `../ceno-gpu/cpp/common/tower.cuh` and kernel files under `../ceno-gpu/cpp/*/kernels/tower.cu` + - CUDA tower kernels and compact split logic. 
+- `../ceno-gpu/cuda_hal/src/common/sumcheck/generic_v2.rs` + - Rust host-side V2 sumcheck setup. +- `../ceno-gpu/cpp/common/sumcheck/generic_v2.cuh` + - CUDA V2 sumcheck logic. + +Current compact witness design assumptions +- Whole flow target: + - commit + - tower prove + - main prove + - rotation prove + - ECC prove + - batch opening + - should operate on compact witness storage wherever prover-side full-domain padding is not semantically required. +- Round-0 original inputs: + - Use direct/native order over real occupied data. + - Do not invent tower-specific order separate from PCS. +- Later folded rounds: + - Use normal in-place/folded buffer semantics. + - Do not call this a replay buffer; call it `in-place buffer`. +- Compact even/odd tails: + - Avoid branch-per-element loops for odd real lengths. + - Decide odd/even outside the loop and process the leftover tail separately. +- Cloning policy: + - Avoid full `clone` / `to_vec` on large witness buffers unless it is intentionally debug-only. + +Main witness memory-estimator design +- Old broken model: + - `tower_output_count(composed_cs) * rows * sizeof(BB31Ext)`. + - This only counts final tower outputs. +- Correct current model: + - Count final/output layer materialized tower outputs under the output mask. + - Plus count all internal layer non-zero outputs because internal layers are not masked. + - Multiply by real output rows, normally `input.witness.first().evaluations_len()`. +- Why this matters: + - Multi-layer GKR circuits like `Ecall_Keccak` materialize internal outputs during `build_main_witness`. + - Single-layer circuits like `ShardRamCircuit_main` usually do not have the same missing-internal-output issue. + +Replay / trace estimator design +- Normal non-replay path: + - Extracted witness and structural MLEs can stay resident across chip proof. + - Stage peak is resident trace plus max temporary stage. 
+- Replay-heavy path (`Ecall_Keccak`, `ShardRamCircuit`): + - Estimate replay materialization from replay plan real rows, not full logical domain. + - Replay witness is materialized for tower stages, then cleared before tower prove/main stages as designed. + - Estimator should use max of replay/build/prove/ecc/main stage peaks plus safety margin. +- Structural witness caveat: + - If structural RMM already has device backing, transport may be view-only and estimate zero new bytes. + - If not device-backed, estimate structural upload by real rows when possible. + +Tower estimator design +- Build stage estimate includes: + - CUDA tower build temporary allocations from `estimate_build_tower_memory`. + - Compact product split buffers. + - Compact logup split buffers. +- Prove stage estimate separates: + - live tower input buffers + - local create-proof temporary allocations +- For logup: + - If table lookup has numerator, numerator buffers are real compact buffers. + - If no numerator, ones/default numerator should not allocate a full domain buffer. + +Scheduler / memcheck relationship +- Sequential + `CENO_GPU_MEM_TRACKING=1`: + - Runs memcheck assertions stage-by-stage. + - This is the best mode for estimator debugging. +- Concurrent + mem tracking disabled: + - Uses estimator for booking/scheduling VRAM, not direct memcheck assertions. +- Booking can include extra safety margin for replay-heavy chips in concurrent mode. +- A stage-local memcheck pass does not automatically prove concurrent booking is optimal, but it strongly validates the per-stage estimator. + +Current known caveats +- Lightweight `keccak_syscall` memchecks pass after current estimator fixes. +- The lightweight run still hits a known verifier assertion later at: + - `gkr_iop/src/gkr/layer/zerocheck_layer.rs:306` +- The remaining reported under-estimate is only known from the top-entry benchmark payload and must be reproduced with `CENO_RPC` exported. 
+- Do not guess the failing estimator from the old generic label; use the new contextual memcheck label first. + +Recommended investigation discipline +1. Reproduce with sequential mem tracking first: + - `CENO_GPU_MEM_TRACKING=1` + - `CENO_CONCURRENT_CHIP_PROVING=0` +2. Read the exact contextual label: + - `build_main_witness[...]` + - `build_tower_witness_gpu[...]` + - `prove_tower_relation_gpu[...]` + - `replay_gpu_witness_from_raw[...]` +3. Patch only the estimator for that stage/circuit class. +4. Validate in `/home/wusm/rust/ceno` first: + - `cargo fmt --check` + - `timeout 300s cargo check --features gpu --package ceno_zkvm --bin e2e` +5. Then rerun the top-entry benchmark. From df88decc60fff7deac779c8b5318b2d821cfa1fe Mon Sep 17 00:00:00 2001 From: "sm.wu" Date: Mon, 27 Apr 2026 11:08:43 +0800 Subject: [PATCH 08/25] rollback Cargo.toml, Cargo.lock change --- Cargo.lock | 124 ++++++----------------------------------------------- Cargo.toml | 62 +++++++++++++-------------- 2 files changed, 45 insertions(+), 141 deletions(-) diff --git a/Cargo.lock b/Cargo.lock index ba90fc0e6..04bcabaf7 100644 --- a/Cargo.lock +++ b/Cargo.lock @@ -1600,49 +1600,10 @@ version = "0.0.6" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "e2931af7e13dc045d8e9d26afccc6fa115d64e115c9c84b1166288b46f6782c2" -[[package]] -name = "cuda-config" -version = "0.1.0" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "4ee74643f7430213a1a78320f88649de309b20b80818325575e393f848f79f5d" -dependencies = [ - "glob", -] - -[[package]] -name = "cuda-runtime-sys" -version = "0.3.0-alpha.1" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "5d070b301187fee3c611e75a425cf12247b7c75c09729dbdef95cb9cb64e8c39" -dependencies = [ - "cuda-config", -] - [[package]] name = "cuda_hal" version = "0.1.0" -dependencies = [ - "anyhow", - "cuda-runtime-sys", - "cudarc", - "downcast-rs", - "either", - "ff_ext", - "itertools 0.13.0", - 
"mpcs", - "multilinear_extensions", - "p3", - "rand 0.8.5", - "rayon", - "sha2", - "sppark", - "sppark_plug", - "sumcheck", - "thiserror 1.0.69", - "tracing", - "transcript", - "witness", -] +source = "git+https://github.com/scroll-tech/ceno-gpu-mock.git?branch=main#fe8f7923b7d3a3823c27949fab0aab8e31011aa9" [[package]] name = "cudarc" @@ -2276,6 +2237,7 @@ dependencies = [ [[package]] name = "ff_ext" version = "0.1.0" +source = "git+https://github.com/scroll-tech/gkr-backend.git?branch=feat%2Fmle_no_padding#1fc9f700b54dfb63415e3d4115d778fc10ad9131" dependencies = [ "once_cell", "p3", @@ -2709,15 +2671,6 @@ dependencies = [ "digest 0.10.7", ] -[[package]] -name = "home" -version = "0.5.12" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "cc627f471c528ff0c4a49e1d5e60450c8f6461dd6d10ba9dcd3a61d3dff7728d" -dependencies = [ - "windows-sys 0.61.1", -] - [[package]] name = "iana-time-zone" version = "0.1.64" @@ -3149,12 +3102,6 @@ dependencies = [ "vcpkg", ] -[[package]] -name = "linux-raw-sys" -version = "0.4.15" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "d26c52dbd32dccf2d10cac7725f8eae5296885fb5703b261f7d0a0739ec807ab" - [[package]] name = "linux-raw-sys" version = "0.9.4" @@ -3296,6 +3243,7 @@ dependencies = [ [[package]] name = "mpcs" version = "0.1.0" +source = "git+https://github.com/scroll-tech/gkr-backend.git?branch=feat%2Fmle_no_padding#1fc9f700b54dfb63415e3d4115d778fc10ad9131" dependencies = [ "bincode 1.3.3", "clap", @@ -3319,6 +3267,7 @@ dependencies = [ [[package]] name = "multilinear_extensions" version = "0.1.0" +source = "git+https://github.com/scroll-tech/gkr-backend.git?branch=feat%2Fmle_no_padding#1fc9f700b54dfb63415e3d4115d778fc10ad9131" dependencies = [ "either", "ff_ext", @@ -4609,6 +4558,7 @@ dependencies = [ [[package]] name = "p3" version = "0.1.0" +source = "git+https://github.com/scroll-tech/gkr-backend.git?branch=feat%2Fmle_no_padding#1fc9f700b54dfb63415e3d4115d778fc10ad9131" 
dependencies = [ "p3-air", "p3-baby-bear", @@ -5176,6 +5126,7 @@ dependencies = [ [[package]] name = "poseidon" version = "0.1.0" +source = "git+https://github.com/scroll-tech/gkr-backend.git?branch=feat%2Fmle_no_padding#1fc9f700b54dfb63415e3d4115d778fc10ad9131" dependencies = [ "ff_ext", "p3", @@ -5773,19 +5724,6 @@ dependencies = [ "semver 1.0.26", ] -[[package]] -name = "rustix" -version = "0.38.44" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "fdb5bc1ae2baa591800df16c9ca78619bf65c0488b41b96ccec5d11220d8c154" -dependencies = [ - "bitflags", - "errno", - "libc", - "linux-raw-sys 0.4.15", - "windows-sys 0.59.0", -] - [[package]] name = "rustix" version = "1.0.7" @@ -5795,7 +5733,7 @@ dependencies = [ "bitflags", "errno", "libc", - "linux-raw-sys 0.9.4", + "linux-raw-sys", "windows-sys 0.59.0", ] @@ -6145,6 +6083,7 @@ dependencies = [ [[package]] name = "sp1-curves" version = "0.1.0" +source = "git+https://github.com/scroll-tech/gkr-backend.git?branch=feat%2Fmle_no_padding#1fc9f700b54dfb63415e3d4115d778fc10ad9131" dependencies = [ "cfg-if", "dashu", @@ -6179,25 +6118,6 @@ dependencies = [ "der", ] -[[package]] -name = "sppark" -version = "0.1.11" -dependencies = [ - "cc", - "which", -] - -[[package]] -name = "sppark_plug" -version = "0.1.0" -dependencies = [ - "cc", - "ff_ext", - "itertools 0.13.0", - "p3", - "sppark", -] - [[package]] name = "stable_deref_trait" version = "1.2.0" @@ -6288,6 +6208,7 @@ checksum = "13c2bddecc57b384dee18652358fb23172facb8a2c51ccc10d74c157bdea3292" [[package]] name = "sumcheck" version = "0.1.0" +source = "git+https://github.com/scroll-tech/gkr-backend.git?branch=feat%2Fmle_no_padding#1fc9f700b54dfb63415e3d4115d778fc10ad9131" dependencies = [ "either", "ff_ext", @@ -6305,6 +6226,7 @@ dependencies = [ [[package]] name = "sumcheck_macro" version = "0.1.0" +source = "git+https://github.com/scroll-tech/gkr-backend.git?branch=feat%2Fmle_no_padding#1fc9f700b54dfb63415e3d4115d778fc10ad9131" dependencies = [ 
"itertools 0.13.0", "p3", @@ -6385,7 +6307,7 @@ dependencies = [ "fastrand", "getrandom 0.3.2", "once_cell", - "rustix 1.0.7", + "rustix", "windows-sys 0.59.0", ] @@ -6711,6 +6633,7 @@ dependencies = [ [[package]] name = "transcript" version = "0.1.0" +source = "git+https://github.com/scroll-tech/gkr-backend.git?branch=feat%2Fmle_no_padding#1fc9f700b54dfb63415e3d4115d778fc10ad9131" dependencies = [ "ff_ext", "itertools 0.13.0", @@ -7001,21 +6924,10 @@ dependencies = [ "wasm-bindgen", ] -[[package]] -name = "which" -version = "4.4.2" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "87ba24419a2078cd2b0f2ede2691b6c66d8e47836da3b6db8265ebad47afbfc7" -dependencies = [ - "either", - "home", - "once_cell", - "rustix 0.38.44", -] - [[package]] name = "whir" version = "0.1.0" +source = "git+https://github.com/scroll-tech/gkr-backend.git?branch=feat%2Fmle_no_padding#1fc9f700b54dfb63415e3d4115d778fc10ad9131" dependencies = [ "bincode 1.3.3", "clap", @@ -7143,15 +7055,6 @@ dependencies = [ "windows-targets 0.53.4", ] -[[package]] -name = "windows-sys" -version = "0.61.1" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "6f109e41dd4a3c848907eb83d5a42ea98b3769495597450cf6d153507b166f0f" -dependencies = [ - "windows-link", -] - [[package]] name = "windows-targets" version = "0.52.6" @@ -7311,6 +7214,7 @@ dependencies = [ [[package]] name = "witness" version = "0.1.0" +source = "git+https://github.com/scroll-tech/gkr-backend.git?branch=feat%2Fmle_no_padding#1fc9f700b54dfb63415e3d4115d778fc10ad9131" dependencies = [ "ff_ext", "multilinear_extensions", diff --git a/Cargo.toml b/Cargo.toml index fbbbab29a..59a7e8653 100644 --- a/Cargo.toml +++ b/Cargo.toml @@ -1,16 +1,16 @@ [workspace] members = [ - "ceno_cli", - "ceno_emul", - "ceno_host", - "ceno_serde", - "ceno_rt", - "ceno_zkvm", - "ceno_recursion", - "derive", - "examples-builder", - "examples", - "guest_libs/*", + "ceno_cli", + "ceno_emul", + "ceno_host", + 
"ceno_serde", + "ceno_rt", + "ceno_zkvm", + "ceno_recursion", + "derive", + "examples-builder", + "examples", + "guest_libs/*", ] resolver = "2" @@ -66,11 +66,11 @@ secp = "0.4.1" serde = { version = "1.0", features = ["derive", "rc"] } serde_json = "1.0" smallvec = { version = "1.13.2", features = [ - "const_generics", - "const_new", - "serde", - "union", - "write", + "const_generics", + "const_new", + "serde", + "union", + "write", ] } strum = "0.26" strum_macros = "0.26" @@ -79,7 +79,7 @@ thiserror = "2" thread_local = "1.1" tiny-keccak = { version = "2.0.2", features = ["keccak"] } tracing = { version = "0.1", features = [ - "attributes", + "attributes", ] } tracing-forest = { version = "0.1.6" } tracing-subscriber = { version = "0.3", features = ["env-filter"] } @@ -127,20 +127,20 @@ lto = "thin" #ceno_crypto_primitives = { path = "../ceno-patch/crypto-primitives", package = "ceno_crypto_primitives" } #ceno_syscall = { path = "../ceno-patch/syscall", package = "ceno_syscall" } -[patch."https://github.com/scroll-tech/ceno-gpu-mock.git"] -ceno_gpu = { path = "../ceno-gpu/cuda_hal", package = "cuda_hal", default-features = false, features = ["bb31"] } - -[patch."https://github.com/scroll-tech/gkr-backend"] -ff_ext = { path = "../gkr-backend/crates/ff_ext", package = "ff_ext" } -mpcs = { path = "../gkr-backend/crates/mpcs", package = "mpcs" } -multilinear_extensions = { path = "../gkr-backend/crates/multilinear_extensions", package = "multilinear_extensions" } -p3 = { path = "../gkr-backend/crates/p3", package = "p3" } -poseidon = { path = "../gkr-backend/crates/poseidon", package = "poseidon" } -sp1-curves = { path = "../gkr-backend/crates/curves", package = "sp1-curves" } -sumcheck = { path = "../gkr-backend/crates/sumcheck", package = "sumcheck" } -transcript = { path = "../gkr-backend/crates/transcript", package = "transcript" } -whir = { path = "../gkr-backend/crates/whir", package = "whir" } -witness = { path = "../gkr-backend/crates/witness", package = 
"witness" } +#[patch."https://github.com/scroll-tech/ceno-gpu-mock.git"] +#ceno_gpu = { path = "../ceno-gpu/cuda_hal", package = "cuda_hal", default-features = false, features = ["bb31"] } +# +#[patch."https://github.com/scroll-tech/gkr-backend"] +#ff_ext = { path = "../gkr-backend/crates/ff_ext", package = "ff_ext" } +#mpcs = { path = "../gkr-backend/crates/mpcs", package = "mpcs" } +#multilinear_extensions = { path = "../gkr-backend/crates/multilinear_extensions", package = "multilinear_extensions" } +#p3 = { path = "../gkr-backend/crates/p3", package = "p3" } +#poseidon = { path = "../gkr-backend/crates/poseidon", package = "poseidon" } +#sp1-curves = { path = "../gkr-backend/crates/curves", package = "sp1-curves" } +#sumcheck = { path = "../gkr-backend/crates/sumcheck", package = "sumcheck" } +#transcript = { path = "../gkr-backend/crates/transcript", package = "transcript" } +#whir = { path = "../gkr-backend/crates/whir", package = "whir" } +#witness = { path = "../gkr-backend/crates/witness", package = "witness" } # [patch."https://github.com/scroll-tech/openvm.git"] # openvm = { path = "../openvm-scroll-tech/crates/toolchain/openvm", default-features = false } From b57b6928000e82256b43477e04766dc48787db86 Mon Sep 17 00:00:00 2001 From: "sm.wu" Date: Mon, 27 Apr 2026 13:43:18 +0800 Subject: [PATCH 09/25] fix memory estimation --- ceno_zkvm/src/scheme/gpu/memory.rs | 11 +++++++---- ceno_zkvm/src/scheme/gpu/mod.rs | 13 +++++++++++-- 2 files changed, 18 insertions(+), 6 deletions(-) diff --git a/ceno_zkvm/src/scheme/gpu/memory.rs b/ceno_zkvm/src/scheme/gpu/memory.rs index 7ef24d36d..3baf10bc5 100644 --- a/ceno_zkvm/src/scheme/gpu/memory.rs +++ b/ceno_zkvm/src/scheme/gpu/memory.rs @@ -374,6 +374,7 @@ pub(crate) fn estimate_trace_bytes (usize, usize) { let base_elem_size = std::mem::size_of::(); let mle_len = 1usize << num_vars; - let poly_bytes = num_witin * mle_len * base_elem_size; + let compact_poly_bytes = num_witin * occupied_rows * base_elem_size; + let 
logical_poly_bytes = num_witin * mle_len * base_elem_size; if should_materialize_witness_on_gpu() { if should_retain_witness_device_backing_after_commit() { @@ -660,19 +663,19 @@ pub(crate) fn estimate_trace_extraction_bytes( // duration of the chip proof. There is no separate extraction temp // buffer, but the replayed witness itself must be accounted for as // resident task memory. - return (poly_bytes, 0); + return (compact_poly_bytes, 0); } // GPU witgen alone does not imply replayability. Non-replayable traces // still go through basefold::get_trace in cache-none mode, which // allocates the extracted witness plus a temporary 2x transpose buffer. - return (poly_bytes, 2 * poly_bytes); + return (compact_poly_bytes, 2 * logical_poly_bytes); } if matches!(get_gpu_cache_level(), CacheLevel::None) { // Default cache level is None // get_trace allocates poly copies (resident) + temp_buffer (2x, freed after) - (poly_bytes, 2 * poly_bytes) + (compact_poly_bytes, 2 * logical_poly_bytes) } else { (0, 0) } diff --git a/ceno_zkvm/src/scheme/gpu/mod.rs b/ceno_zkvm/src/scheme/gpu/mod.rs index 445a8dd98..bc81a20d0 100644 --- a/ceno_zkvm/src/scheme/gpu/mod.rs +++ b/ceno_zkvm/src/scheme/gpu/mod.rs @@ -1339,9 +1339,13 @@ impl> TraceCommitter>> = poly_group From c50b793cc702c8122a4cc99b9d7cb5458514a705 Mon Sep 17 00:00:00 2001 From: "sm.wu" Date: Mon, 27 Apr 2026 15:24:44 +0800 Subject: [PATCH 10/25] verifier log --- ceno_zkvm/src/scheme/verifier.rs | 7 +++++++ 1 file changed, 7 insertions(+) diff --git a/ceno_zkvm/src/scheme/verifier.rs b/ceno_zkvm/src/scheme/verifier.rs index 351e1d402..e45cae99d 100644 --- a/ceno_zkvm/src/scheme/verifier.rs +++ b/ceno_zkvm/src/scheme/verifier.rs @@ -340,6 +340,13 @@ impl> vm_proof: ZKVMProof, mut transcript: impl ForkableTranscript, ) -> Result, ZKVMError> { + tracing::info!( + "verifying shard proof: expected_shard_id={}, proof_shard_id={}, chip_groups={}", + shard_id, + vm_proof.public_values.shard_id, + vm_proof.chip_proofs.len() + ); + // 
main invariant between opcode circuits and table circuits let mut prod_r = E::ONE; let mut prod_w = E::ONE; From 89b86987b2d1598b4c4d591cf449b31700bf1d66 Mon Sep 17 00:00:00 2001 From: "sm.wu" Date: Mon, 27 Apr 2026 16:48:06 +0800 Subject: [PATCH 11/25] Pass tower input by value for GPU proving --- ceno_zkvm/src/scheme/gpu/memory.rs | 1 - ceno_zkvm/src/scheme/gpu/mod.rs | 2 +- ceno_zkvm/src/scheme/prover.rs | 3 +-- 3 files changed, 2 insertions(+), 4 deletions(-) diff --git a/ceno_zkvm/src/scheme/gpu/memory.rs b/ceno_zkvm/src/scheme/gpu/memory.rs index 3baf10bc5..69c39761e 100644 --- a/ceno_zkvm/src/scheme/gpu/memory.rs +++ b/ceno_zkvm/src/scheme/gpu/memory.rs @@ -148,7 +148,6 @@ pub fn estimate_chip_proof_memory Date: Mon, 27 Apr 2026 17:50:40 +0800 Subject: [PATCH 12/25] split tower layer by view --- ceno_zkvm/src/scheme/gpu/memory.rs | 40 ++---------------------------- ceno_zkvm/src/scheme/gpu/mod.rs | 12 +++------ 2 files changed, 5 insertions(+), 47 deletions(-) diff --git a/ceno_zkvm/src/scheme/gpu/memory.rs b/ceno_zkvm/src/scheme/gpu/memory.rs index 69c39761e..f0764c17f 100644 --- a/ceno_zkvm/src/scheme/gpu/memory.rs +++ b/ceno_zkvm/src/scheme/gpu/memory.rs @@ -1,7 +1,7 @@ use crate::{ instructions::gpu::dispatch::GpuWitgenKind, scheme::{ - constants::{NUM_FANIN, NUM_FANIN_LOGUP, SEPTIC_EXTENSION_DEGREE}, + constants::{NUM_FANIN, SEPTIC_EXTENSION_DEGREE}, hal::ProofInput, utils::tower_output_count, }, @@ -550,26 +550,6 @@ fn estimate_tower_stage_components 0 { - num_prod_towers - * compact_split_stored_elems(occupied_rows, 1 << (num_vars + 1), NUM_FANIN) - * elem_size - } else { - 0 - }; - let logup_split_bytes = if num_logup_towers > 0 { - let denominator_bytes = num_logup_towers - * compact_split_stored_elems(occupied_rows, 1 << (num_vars + 1), NUM_FANIN_LOGUP) - * elem_size; - let numerator_or_ones_bytes = if has_logup_numerator { - denominator_bytes - } else { - elem_size - }; - denominator_bytes + numerator_or_ones_bytes - } else { - 0 - }; let 
shard_ram_tower_batch_overhead = composed_cs .gkr_circuit .as_ref() @@ -577,10 +557,7 @@ fn estimate_tower_stage_components usize { - let chunk_size = logical_len / num_chunks; - (0..num_chunks) - .map(|chunk_idx| { - let chunk_start = chunk_idx * chunk_size; - occupied_len - .saturating_sub(chunk_start) - .min(chunk_size) - .max(1) - }) - .sum() -} - /// Estimate temporary GPU memory for the tower proving stage (build + prove). /// Used by prove_tower_relation to validate against actual mem_tracker measurements. pub(crate) fn estimate_tower_stage_bytes>( diff --git a/ceno_zkvm/src/scheme/gpu/mod.rs b/ceno_zkvm/src/scheme/gpu/mod.rs index 2113b3ed8..d2473faa6 100644 --- a/ceno_zkvm/src/scheme/gpu/mod.rs +++ b/ceno_zkvm/src/scheme/gpu/mod.rs @@ -1694,13 +1694,7 @@ pub(crate) fn build_tower_witness_gpu( .map(|wit| match wit.inner() { gkr_iop::gpu::GpuFieldType::Ext(poly) => cuda_hal .tower - .masked_mle_split_to_chunks( - &*cuda_hal, - poly, - NUM_FANIN, - BB31Ext::ONE, - stream.as_ref(), - ) + .masked_mle_view_chunks(&*cuda_hal, poly, NUM_FANIN, BB31Ext::ONE, stream.as_ref()) .map_err(|e| format!("Failed to split compact prod tower input: {e}")), _ => return Err("tower witness expects extension-field record MLEs".to_string()), }) @@ -1716,7 +1710,7 @@ pub(crate) fn build_tower_witness_gpu( .map(|wit| match wit.inner() { gkr_iop::gpu::GpuFieldType::Ext(poly) => cuda_hal .tower - .masked_mle_split_to_chunks( + .masked_mle_view_chunks( &*cuda_hal, poly, NUM_FANIN_LOGUP, @@ -1732,7 +1726,7 @@ pub(crate) fn build_tower_witness_gpu( .map(|wit| match wit.inner() { gkr_iop::gpu::GpuFieldType::Ext(poly) => cuda_hal .tower - .masked_mle_split_to_chunks( + .masked_mle_view_chunks( &*cuda_hal, poly, NUM_FANIN_LOGUP, From 99b7a94524ecf46f530fc2ae8b14d40b763fc069 Mon Sep 17 00:00:00 2001 From: "sm.wu" Date: Mon, 27 Apr 2026 21:10:27 +0800 Subject: [PATCH 13/25] Use dense tower build for compact GPU input --- ceno_zkvm/src/scheme/gpu/mod.rs | 86 +++++++++------------------------ 1 
file changed, 22 insertions(+), 64 deletions(-) diff --git a/ceno_zkvm/src/scheme/gpu/mod.rs b/ceno_zkvm/src/scheme/gpu/mod.rs index d2473faa6..ea7090807 100644 --- a/ceno_zkvm/src/scheme/gpu/mod.rs +++ b/ceno_zkvm/src/scheme/gpu/mod.rs @@ -32,7 +32,7 @@ use gkr_iop::{ layer::{LayerWitness, gpu::utils::extract_mle_relationships_from_monomial_terms}, }, gpu::{GpuBackend, GpuProver, gpu_prover::BB31Ext}, - hal::{MultilinearPolynomial, ProverBackend}, + hal::ProverBackend, }; use itertools::{Itertools, chain}; use mpcs::{Point, PolynomialCommitmentScheme}; @@ -69,59 +69,6 @@ use gkr_iop::gpu::gpu_prover::*; mod memory; -fn pad_gpu_mles_to_full_domain( - mles: impl IntoIterator>, -) -> Vec> { - let cuda_hal = gkr_iop::gpu::get_cuda_hal().expect("Failed to get CUDA HAL"); - let stream = gkr_iop::gpu::get_thread_stream(); - mles.into_iter() - .map(|mle| { - let mle_ref = mle.as_ref(); - let full_len = 1usize << mle_ref.num_vars(); - if mle_ref.evaluations_len() == full_len { - return mle; - } - let padded: gkr_iop::gpu::MultilinearExtensionGpu<'static, E> = match mle_ref.inner() { - gkr_iop::gpu::GpuFieldType::Base(poly) => { - let mut host = poly.to_cpu_vec(stream.as_ref()); - host.resize(full_len, BB31Base::ZERO); - unsafe { - std::mem::transmute( - gkr_iop::gpu::MultilinearExtensionGpu::::from_ceno_gpu_base( - ceno_gpu::bb31::GpuPolynomial::from_ceno_vec( - &cuda_hal, - &host, - mle_ref.num_vars(), - stream.as_ref(), - ) - .expect("pad base mle"), - ), - ) - } - } - gkr_iop::gpu::GpuFieldType::Ext(poly) => { - let mut host = poly.to_cpu_vec(stream.as_ref()); - host.resize(full_len, BB31Ext::ZERO); - unsafe { - std::mem::transmute( - gkr_iop::gpu::MultilinearExtensionGpu::::from_ceno_gpu_ext( - ceno_gpu::bb31::GpuPolynomialExt::from_ceno_vec( - &cuda_hal, - &host, - mle_ref.num_vars(), - stream.as_ref(), - ) - .expect("pad ext mle"), - ), - ) - } - } - gkr_iop::gpu::GpuFieldType::Unreachable => unreachable!(), - }; - Arc::new(padded) - }) - .collect() -} mod util; 
pub(crate) use memory::{ check_gpu_mem_estimation, check_gpu_mem_estimation_with_context, estimate_chip_proof_memory, @@ -539,12 +486,12 @@ pub fn prove_rotation_impl let log2_num_instances = input.log2_num_instances(); let num_threads = optimal_sumcheck_threads(log2_num_instances); let num_var_with_rotation = log2_num_instances + composed_cs.rotation_vars().unwrap_or(0); - let padded_wit_storage = pad_gpu_mles_to_full_domain( + let wit = LayerWitness( chain!(&input.witness, &input.fixed, &input.structural_witness) .cloned() - .map(|mle| unsafe { std::mem::transmute(mle) }), + .map(|mle| unsafe { std::mem::transmute(mle) }) + .collect(), ); - let wit = LayerWitness(padded_wit_storage); let (proof, points) = gkr_iop::gkr::layer::gpu::prove_rotation_gpu::( num_threads, @@ -758,11 +705,12 @@ pub fn prove_main_constraints_impl< num_threads, num_var_with_rotation, gkr::GKRCircuitWitness { - layers: vec![LayerWitness(pad_gpu_mles_to_full_domain( + layers: vec![LayerWitness( chain!(&input.witness, &input.fixed, &input.structural_witness,) .cloned() - .map(|mle| unsafe { std::mem::transmute(mle) }), - ))], + .map(|mle| unsafe { std::mem::transmute(mle) }) + .collect(), + )], }, &out_evals, &input @@ -1807,7 +1755,7 @@ pub(crate) fn build_tower_witness_gpu( let last_layers_refs: Vec<&[GpuPolynomialExt<'_>]> = prod_last_layers.iter().map(|v| v.as_slice()).collect(); let gpu_specs = { - cuda_hal.tower.build_prod_tower_from_gpu_polys_batch( + cuda_hal.tower.build_prod_tower_dense_from_gpu_polys_batch( cuda_hal, &last_layers_refs, num_vars, @@ -1815,7 +1763,12 @@ pub(crate) fn build_tower_witness_gpu( stream.as_ref(), ) } - .map_err(|e| format!("build_prod_tower_from_gpu_polys_batch failed: {:?}", e))?; + .map_err(|e| { + format!( + "build_prod_tower_dense_from_gpu_polys_batch failed: {:?}", + e + ) + })?; prod_gpu_specs.extend(gpu_specs); exit_span!(span_prod); } @@ -1837,14 +1790,19 @@ pub(crate) fn build_tower_witness_gpu( logup_last_layers.iter().map(|v| 
v.as_slice()).collect(); let gpu_specs = cuda_hal .tower - .build_logup_tower_from_gpu_polys_batch( + .build_logup_tower_dense_from_gpu_polys_batch( cuda_hal, &last_layers_refs, num_vars, num_towers, stream.as_ref(), ) - .map_err(|e| format!("build_logup_tower_from_gpu_polys_batch failed: {:?}", e))?; + .map_err(|e| { + format!( + "build_logup_tower_dense_from_gpu_polys_batch failed: {:?}", + e + ) + })?; logup_gpu_specs.extend(gpu_specs); exit_span!(span_logup); } From f0d81b641f730eb7824eaa7d0a893a4ef2cff6e0 Mon Sep 17 00:00:00 2001 From: "sm.wu" Date: Mon, 27 Apr 2026 21:40:40 +0800 Subject: [PATCH 14/25] Pass logup shape to tower prove estimator --- ceno_zkvm/src/scheme/gpu/memory.rs | 1 + 1 file changed, 1 insertion(+) diff --git a/ceno_zkvm/src/scheme/gpu/memory.rs b/ceno_zkvm/src/scheme/gpu/memory.rs index f0764c17f..6eaabd50b 100644 --- a/ceno_zkvm/src/scheme/gpu/memory.rs +++ b/ceno_zkvm/src/scheme/gpu/memory.rs @@ -566,6 +566,7 @@ fn estimate_tower_stage_components Date: Mon, 27 Apr 2026 22:19:45 +0800 Subject: [PATCH 15/25] Deduplicate borrowed tower input booking --- ceno_zkvm/src/scheme/gpu/memory.rs | 30 +++++++++++++++++++++++------- 1 file changed, 23 insertions(+), 7 deletions(-) diff --git a/ceno_zkvm/src/scheme/gpu/memory.rs b/ceno_zkvm/src/scheme/gpu/memory.rs index 6eaabd50b..68a200182 100644 --- a/ceno_zkvm/src/scheme/gpu/memory.rs +++ b/ceno_zkvm/src/scheme/gpu/memory.rs @@ -145,9 +145,17 @@ pub fn estimate_chip_proof_memory>( composed_cs: &ComposedConstrainSystem, input: &ProofInput<'_, GpuBackend>, -) -> (usize, usize, usize) { +) -> (usize, usize, usize, usize) { let cs = &composed_cs.zkvm_v1_css; let num_prod_towers = composed_cs.num_reads() + composed_cs.num_writes(); let num_logup_towers = if composed_cs.is_with_lk_table() { @@ -571,9 +579,16 @@ fn estimate_tower_stage_components, input: &ProofInput<'_, GpuBackend>, ) -> (usize, usize) { - let (build_bytes, prove_local_bytes, _) = estimate_tower_stage_components(composed_cs, input); + 
let (build_bytes, prove_local_bytes, _, _) = + estimate_tower_stage_components(composed_cs, input); (build_bytes, prove_local_bytes) } @@ -590,7 +606,7 @@ pub(crate) fn estimate_tower_bytes, input: &ProofInput<'_, GpuBackend>, ) -> usize { - let (build_bytes, prove_local_bytes, tower_input_live_bytes) = + let (build_bytes, prove_local_bytes, tower_input_live_bytes, _) = estimate_tower_stage_components(composed_cs, input); build_bytes.max(tower_input_live_bytes + prove_local_bytes) } From 4fc8daeb59eafb82b532cc6ba39ec61e6a2799d6 Mon Sep 17 00:00:00 2001 From: "sm.wu" Date: Mon, 27 Apr 2026 22:21:57 +0800 Subject: [PATCH 16/25] fix logging --- ceno_zkvm/src/scheme/gpu/memory.rs | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/ceno_zkvm/src/scheme/gpu/memory.rs b/ceno_zkvm/src/scheme/gpu/memory.rs index 68a200182..fb8e5871f 100644 --- a/ceno_zkvm/src/scheme/gpu/memory.rs +++ b/ceno_zkvm/src/scheme/gpu/memory.rs @@ -240,7 +240,7 @@ pub fn estimate_chip_proof_memory Date: Mon, 27 Apr 2026 23:11:57 +0800 Subject: [PATCH 17/25] Check scheduler memory estimate in mem tracking --- ceno_zkvm/src/scheme/gpu/memory.rs | 73 ++++++++++++++++++++++++++++-- ceno_zkvm/src/scheme/gpu/mod.rs | 3 +- ceno_zkvm/src/scheme/prover.rs | 11 +++++ 3 files changed, 82 insertions(+), 5 deletions(-) diff --git a/ceno_zkvm/src/scheme/gpu/memory.rs b/ceno_zkvm/src/scheme/gpu/memory.rs index fb8e5871f..f61bce28f 100644 --- a/ceno_zkvm/src/scheme/gpu/memory.rs +++ b/ceno_zkvm/src/scheme/gpu/memory.rs @@ -45,6 +45,7 @@ pub fn init_gpu_mem_tracker<'a>( const ESTIMATION_TOLERANCE_BYTES: usize = 2 * 1024 * 1024; // max under-estimation error: 2 MB const ESTIMATION_SAFETY_MARGIN_BYTES: usize = 10 * 1024 * 1024; // reserved headroom / allowed over-estimate margin: 10 MB +const SCHEDULER_ESTIMATION_WARNING_MARGIN_BYTES: usize = 512 * 1024 * 1024; /// Validate that the estimated GPU memory matches actual usage within tolerance. 
/// - Under-estimate (actual > estimated): diff must be <= `ESTIMATION_TOLERANCE_BYTES` @@ -57,6 +58,70 @@ pub fn check_gpu_mem_estimation_with_context( mem_tracker: Option, estimated_bytes: usize, context: Option<&str>, +) { + check_gpu_mem_estimation_with_margins( + mem_tracker, + estimated_bytes, + context, + ESTIMATION_TOLERANCE_BYTES, + ESTIMATION_SAFETY_MARGIN_BYTES, + ); +} + +pub fn check_gpu_scheduler_mem_estimation_with_context( + mem_tracker: Option, + estimated_bytes: usize, + context: Option<&str>, +) { + // Scheduler estimates are admission-control estimates, not exact stage-local allocation + // estimates. They intentionally include safety margins and conservative lifetime overlap, so + // large over-estimates should be surfaced as warnings rather than failing the proof. Under- + // estimates remain hard failures because they can admit unsafe concurrent work. + if let Some(mem_tracker) = mem_tracker { + const ONE_MB: usize = 1024 * 1024; + let label = mem_tracker.name(); + let label = context + .filter(|context| !context.is_empty()) + .map(|context| format!("{label}[{context}]")) + .unwrap_or_else(|| label.to_string()); + let mem_stats = mem_tracker.finish(); + let actual_bytes = mem_stats.mem_occupancy as usize; + let diff = estimated_bytes as isize - actual_bytes as isize; + let to_mb = |b: usize| b as f64 / ONE_MB as f64; + let diff_mb = diff as f64 / ONE_MB as f64; + tracing::info!( + "[memcheck] {label}: scheduler_estimated={:.2}MB, actual={:.2}MB, diff={:.2}MB", + to_mb(estimated_bytes), + to_mb(actual_bytes), + diff_mb + ); + if diff < 0 { + assert!( + (-diff) as usize <= ESTIMATION_TOLERANCE_BYTES, + "[memcheck] {label}: scheduler under-estimate! 
estimated={:.2}MB, actual={:.2}MB, diff={:.2}MB, tolerance={:.2}MB", + to_mb(estimated_bytes), + to_mb(actual_bytes), + diff_mb, + to_mb(ESTIMATION_TOLERANCE_BYTES), + ); + } else if diff as usize > SCHEDULER_ESTIMATION_WARNING_MARGIN_BYTES { + tracing::warn!( + "[memcheck] {label}: scheduler over-estimate warning: estimated={:.2}MB, actual={:.2}MB, diff={:.2}MB, warning_margin={:.2}MB", + to_mb(estimated_bytes), + to_mb(actual_bytes), + diff_mb, + to_mb(SCHEDULER_ESTIMATION_WARNING_MARGIN_BYTES), + ); + } + } +} + +fn check_gpu_mem_estimation_with_margins( + mem_tracker: Option, + estimated_bytes: usize, + context: Option<&str>, + under_tolerance_bytes: usize, + over_tolerance_bytes: usize, ) { // `mem_tracker will` be Some only in sequential mode with mem tracking enabled, so if it's None, do nothing if let Some(mem_tracker) = mem_tracker { @@ -80,22 +145,22 @@ pub fn check_gpu_mem_estimation_with_context( if diff < 0 { // Under-estimate: actual exceeds estimated assert!( - (-diff) as usize <= ESTIMATION_TOLERANCE_BYTES, + (-diff) as usize <= under_tolerance_bytes, "[memcheck] {label}: under-estimate! estimated={:.2}MB, actual={:.2}MB, diff={:.2}MB, tolerance={:.2}MB", to_mb(estimated_bytes), to_mb(actual_bytes), diff_mb, - to_mb(ESTIMATION_TOLERANCE_BYTES), + to_mb(under_tolerance_bytes), ); } else { // Over-estimate: estimated exceeds actual assert!( - diff as usize <= ESTIMATION_SAFETY_MARGIN_BYTES, + diff as usize <= over_tolerance_bytes, "[memcheck] {label}: over-estimate! 
estimated={:.2}MB, actual={:.2}MB, diff={:.2}MB, margin={:.2}MB", to_mb(estimated_bytes), to_mb(actual_bytes), diff_mb, - to_mb(ESTIMATION_SAFETY_MARGIN_BYTES), + to_mb(over_tolerance_bytes), ); } } diff --git a/ceno_zkvm/src/scheme/gpu/mod.rs b/ceno_zkvm/src/scheme/gpu/mod.rs index ea7090807..986b04439 100644 --- a/ceno_zkvm/src/scheme/gpu/mod.rs +++ b/ceno_zkvm/src/scheme/gpu/mod.rs @@ -71,7 +71,8 @@ mod memory; mod util; pub(crate) use memory::{ - check_gpu_mem_estimation, check_gpu_mem_estimation_with_context, estimate_chip_proof_memory, + check_gpu_mem_estimation, check_gpu_mem_estimation_with_context, + check_gpu_scheduler_mem_estimation_with_context, estimate_chip_proof_memory, estimate_main_witness_bytes, estimate_replay_materialization_bytes_for_plan, estimate_tower_bytes, estimate_tower_stage_bytes, init_gpu_mem_tracker, }; diff --git a/ceno_zkvm/src/scheme/prover.rs b/ceno_zkvm/src/scheme/prover.rs index b48aa017a..d1041aeb5 100644 --- a/ceno_zkvm/src/scheme/prover.rs +++ b/ceno_zkvm/src/scheme/prover.rs @@ -603,6 +603,12 @@ impl< task.circuit_idx as u64, )); + let task_name = task.circuit_name.clone(); + let estimated_memory_bytes = task.estimated_memory_bytes as usize; + let cuda_hal = gkr_iop::gpu::get_cuda_hal().expect("Failed to get CUDA HAL"); + let chip_mem_tracker = + crate::scheme::gpu::init_gpu_mem_tracker(&cuda_hal, "create_chip_proof"); + let gpu_input: ProofInput<'static, gkr_iop::gpu::GpuBackend> = unsafe { std::mem::transmute(task.input) }; @@ -619,6 +625,11 @@ impl< task.num_witin, task.structural_rmm, )?; + crate::scheme::gpu::check_gpu_scheduler_mem_estimation_with_context( + chip_mem_tracker, + estimated_memory_bytes, + Some(task_name.as_str()), + ); Ok(ChipTaskResult { task_id: task.task_id, From 011a8981324320c60de431bfcd5d738732b35f68 Mon Sep 17 00:00:00 2001 From: "sm.wu" Date: Mon, 27 Apr 2026 23:32:03 +0800 Subject: [PATCH 18/25] Refine replay tower proof memory estimate --- ceno_zkvm/src/scheme/prover.rs | 25 
+++++++++++++++++++++++-- 1 file changed, 23 insertions(+), 2 deletions(-) diff --git a/ceno_zkvm/src/scheme/prover.rs b/ceno_zkvm/src/scheme/prover.rs index d1041aeb5..33627ae68 100644 --- a/ceno_zkvm/src/scheme/prover.rs +++ b/ceno_zkvm/src/scheme/prover.rs @@ -1268,13 +1268,13 @@ where let span = entered_span!("prove_tower_relation", profiling_2 = true); let r_set_len = cs.zkvm_v1_css.r_expressions.len() + cs.zkvm_v1_css.r_table_expressions.len(); - let (tower_build_estimated_bytes, tower_prove_estimated_bytes) = + let (tower_build_estimated_bytes, tower_prove_prebuild_estimated_bytes) = estimate_tower_stage_bytes::(cs, &input); tracing::info!( "[gpu tower][{}] estimated: build_tower={:.2}MB, prove_tower={:.2}MB", name, tower_build_estimated_bytes as f64 / (1024.0 * 1024.0), - tower_prove_estimated_bytes as f64 / (1024.0 * 1024.0), + tower_prove_prebuild_estimated_bytes as f64 / (1024.0 * 1024.0), ); let tower_build_mem_tracker = crate::scheme::gpu::init_gpu_mem_tracker(&cuda_hal, "build_tower_witness_gpu"); @@ -1316,6 +1316,27 @@ where prod_specs: prod_gpu, logup_specs: logup_gpu, }; + let tower_prove_estimate = cuda_hal + .tower + .estimate_memory_requirements(&tower_input, NUM_FANIN); + let tower_input_live_bytes = tower_prove_estimate.prod_tower_buffer_bytes + + tower_prove_estimate.logup_tower_buffer_bytes; + let runtime_layout_prove_bytes = tower_prove_estimate + .total_bytes + .saturating_sub(tower_input_live_bytes); + let release_adjusted_prebuild_bytes = + tower_prove_prebuild_estimated_bytes / NUM_FANIN + 4 * 1024 * 1024; + let tower_prove_estimated_bytes = + runtime_layout_prove_bytes.max(release_adjusted_prebuild_bytes); + tracing::info!( + "[gpu tower][{}] refined prove_tower estimate: prebuild={:.2}MB, runtime_layout={:.2}MB, release_adjusted={:.2}MB, local={:.2}MB, tower_live={:.2}MB", + name, + tower_prove_prebuild_estimated_bytes as f64 / (1024.0 * 1024.0), + runtime_layout_prove_bytes as f64 / (1024.0 * 1024.0), + 
release_adjusted_prebuild_bytes as f64 / (1024.0 * 1024.0), + tower_prove_estimated_bytes as f64 / (1024.0 * 1024.0), + tower_input_live_bytes as f64 / (1024.0 * 1024.0), + ); let tower_prove_mem_tracker = crate::scheme::gpu::init_gpu_mem_tracker(&cuda_hal, "prove_tower_relation_gpu"); log_gpu_device_state(&format!("{name}:before_prove_tower")); From f3ca1cf35f4ad40db9b116693c4b1bce7832c32f Mon Sep 17 00:00:00 2001 From: "sm.wu" Date: Mon, 27 Apr 2026 23:38:32 +0800 Subject: [PATCH 19/25] clippy fix --- ceno_zkvm/src/bin/e2e.rs | 3 +- summary.md | 583 --------------------------------------- 2 files changed, 1 insertion(+), 585 deletions(-) delete mode 100644 summary.md diff --git a/ceno_zkvm/src/bin/e2e.rs b/ceno_zkvm/src/bin/e2e.rs index 721708389..c489567a7 100644 --- a/ceno_zkvm/src/bin/e2e.rs +++ b/ceno_zkvm/src/bin/e2e.rs @@ -5,8 +5,7 @@ use ceno_zkvm::print_allocated_bytes; use ceno_zkvm::{ e2e::{ Checkpoint, FieldType, MultiProver, PcsKind, Preset, public_io_words_to_digest_words, - run_e2e_full_trace_verify, run_e2e_single_shard_debug_verify, run_e2e_with_checkpoint, - setup_platform, setup_platform_debug, + run_e2e_with_checkpoint, setup_platform, setup_platform_debug, }, scheme::{ ZKVMProof, constants::MAX_NUM_VARIABLES, create_backend, create_prover, hal::ProverDevice, diff --git a/summary.md b/summary.md deleted file mode 100644 index 3b6979e24..000000000 --- a/summary.md +++ /dev/null @@ -1,583 +0,0 @@ -# WIP Summary: non-pow2 prover storage / GPU tower + PCS follow-up - -Date: 2026-04-25 - -Repos involved -- current repo: `/home/wusm/rust/ceno` -- GPU repo: `/home/wusm/rust/ceno-gpu` -- backend repo: `/home/wusm/rust/gkr-backend` - -Primary goal -- Remove prover-side MLE zero padding to next power-of-two. -- Keep prover storage compact by occupied length. -- Verifier semantics stay unchanged. 
- -Design agreed in this WIP -- Raw/original MLE inputs before sumcheck round 0 should use one unified policy: - - direct/native order - - occupied length respected - - this applies to both tower and PCS batch opening -- After round 0: - - folded values can use the normal later-round in-place buffer layout -- No separate application-specific policy for tower vs PCS. -- For tower specifically: - - within one tower layer, all MLEs should have the same `num_vars` - - tower should not rely on a meaningful “small MLE” mixed-size case - -What was fixed earlier in this WIP - -1. PCS / batch-open path -- Fixed missing round evaluations from GPU V2 sumcheck: - - `../ceno-gpu/cuda_hal/src/common/sumcheck/generic_v2.rs` -- Fixed compact raw-data handling in batch open and commit/open consistency. -- Fixed an earlier `RootMismatch` by correcting raw trace -> encode padding boundary in batch commit. -- PCS later reached `final_codeword.values[idx] != folded`, then was narrowed further. -- At one point PCS/basefold batch-open `eq` layout mismatch was fixed by using Ceno/direct order. -- CPU e2e for the lightweight repro still passes. - -2. Tower witness/materialization direction -- Compact CPU oracle for tower semantics was added in: - - `../ceno-gpu/cuda_hal/src/common/tower/utils.rs` -- GPU tower build path was refactored toward compact storage in: - - `../ceno-gpu/cuda_hal/src/common/tower/mod.rs` - - `../ceno-gpu/cpp/common/tower.cuh` - - `../ceno-gpu/cpp/bb31/kernels/tower.cu` - - `../ceno-gpu/cpp/gl64/kernels/tower.cu` -- A lifetime bug causing segfault in GPU tower eval extraction was fixed by retaining owned buffer backing: - - `../ceno-gpu/cuda_hal/src/common/buffer.rs` - - `../ceno-gpu/cuda_hal/src/lib.rs` - -3. Important debug correction -- There was a previous debug bug caused by cloning the transcript after GPU proving. -- That was fixed. -- Current CPU/GPU prover compares should assume transcript state is cloned before proof generation. 
- -Current CPU/GPU status - -CPU baseline -- Command: - - `cargo run --release --package ceno_zkvm --features sanity-check --bin e2e -- --platform=ceno --max-cycle-per-shard=1000 --hints=2 --public-io=5 --shard-id=0 examples/target/riscv32im-ceno-zkvm-elf/release/examples/fibonacci` -- Result: - - passes - -GPU lightweight repro -- Command: - - `RUST_LOG=error CENO_CONCURRENT_CHIP_PROVING=0 cargo run --release --features gpu --package ceno_zkvm --features sanity-check --bin e2e -- --platform=ceno --max-cycle-per-shard=1000 --hints=2 --public-io=5 --shard-id=0 examples/target/riscv32im-ceno-zkvm-elf/release/examples/fibonacci` -- Current result: - - still fails with tower verification mismatch - - source: - - `ceno_zkvm/src/e2e.rs:2347` - - `VerifyError("mismatch tower evaluation")` - -Most important findings from the latest tower debug - -1. Tower witness is not the first bad stage -- CPU/GPU tower witness compare did not fail first. -- Tower witness transport/leaf construction is not the main active bug. - -2. The earlier isolated layer-2 compare proved: -- `cpu_direct == v1` -- `v2 != cpu_direct` -- This was on a tower layer where all MLEs were full occupied: - - debug payload showed `mle_shape=[(?, 2, 4), ...]` - - meaning `num_vars=2`, `len=4` for all MLEs in that isolated layer -- That means the tower failure is not because tower requires mixed-size/small-MLE semantics. - -3. The current design conclusion -- Tower should use the same original-input policy as PCS: - - direct order before round 0 - - later rounds use the in-place buffer -- Do NOT think of this as two policies. - -4. Terminology decision -- Do not call later-round folded storage “replay buffer”. 
-- Call it: - - in-place buffer -- Round 0: - - non-in-place, reading original inputs -- Round > 0: - - in-place - -Latest code changes in the current session - -In `../ceno-gpu/cuda_hal/src/common/sumcheck/generic_v2.rs` -- Renamed V2 metadata from `compact_layout_flags` to `original_layout_flags` -- This now means: - - `1` => original round-0 input is direct/native order -- This is intended to make the model explicit and shared across tower + PCS - -In `../ceno-gpu/cpp/common/sumcheck/generic_v2.cuh` -- Added `direct_pair_index_v2` -- Changed direct-order round-0 reads for full-size equal-`num_vars` originals to use adjacent pairs: - - `(2p, 2p+1)` - - not `(p, p + stride)` -- Restored small-MLE helper mapping back to high-bit based mapping: - - `suffix_small_index_v2(...)` currently uses: - - `tid >> (num_vars - 1 - mle_num_vars)` -- Reverted an incorrect attempt to bit-reverse first-fold writes into the in-place buffer -- Current code writes first-fold results contiguously into the in-place buffer - -In `../ceno-gpu/cuda_hal/src/common/tower/mod.rs` -- Relaxed tower assertions so layers can be compact-by-occupation, not necessarily full logical length at Rust-side checks - -What the latest tower debug showed - -Most recent trustworthy mismatch before the last interrupted run -- CPU/GPU tower compare failed at: - - `ceno_zkvm/src/scheme/gpu/mod.rs:665` -- Message: - - `CPU/GPU tower sumcheck proof mismatch: first_round=Some(2)` -- Interpretation: - - earlier proof entries already match - - divergence starts later, consistent with in-place-buffer semantics rather than original-input semantics - -Important caution about last run -- A later run was interrupted before producing a new useful payload. -- So do NOT assume the very latest in-place-buffer edits fixed anything. 
-- The last reliable signal is still: - - tower mismatch has moved later than round 0 - - current bug is likely in round > 0 in-place-buffer semantics - -Debug helpers currently present in `ceno_zkvm/src/scheme/gpu/mod.rs` -- `debug_compare_tower_cpu_gpu_prover(...)` -- `debug_compare_tower_eq_layers(...)` -- `debug_compare_tower_layer_v1_v2(..., round)` -- currently called for: - - `round = 2` - - `round = 3` - -Be careful -- Some helpers use fresh local transcripts like: - - `BasicTranscript::new(b"tower-layer2-debug")` -- These are only valid for isolated V1/V2/CPU direct comparisons. -- They are NOT end-to-end transcript or verifier oracles. - -Current best hypothesis -- The active tower bug is now in V2 later-round in-place-buffer semantics, not in: - - tower witness layout - - original round-0 direct-order policy - - transcript clone bugs - -Most relevant files to inspect next - -Current repo -- `ceno_zkvm/src/scheme/gpu/mod.rs` -- `ceno_zkvm/src/e2e.rs` - -GPU repo -- `../ceno-gpu/cuda_hal/src/common/sumcheck/generic_v2.rs` -- `../ceno-gpu/cpp/common/sumcheck/generic_v2.cuh` -- `../ceno-gpu/cuda_hal/src/common/tower/mod.rs` -- `../ceno-gpu/cuda_hal/src/common/tower/utils.rs` -- `../ceno-gpu/cuda_hal/src/lib.rs` -- `../ceno-gpu/cuda_hal/src/common/buffer.rs` - -Backend repo -- `../gkr-backend/crates/mpcs/...` -- `../gkr-backend/crates/sumcheck/...` - -Recommended next step for the new session -1. Read this file. -2. Keep CPU baseline as source of truth. -3. Continue from the latest tower state, focusing only on later-round in-place-buffer semantics in: - - `../ceno-gpu/cpp/common/sumcheck/generic_v2.cuh` -4. 
Run exactly one lightweight GPU repro at a time: - - `RUST_LOG=error CENO_CONCURRENT_CHIP_PROVING=0 cargo run --release --features gpu --package ceno_zkvm --features sanity-check --bin e2e -- --platform=ceno --max-cycle-per-shard=1000 --hints=2 --public-io=5 --shard-id=0 examples/target/riscv32im-ceno-zkvm-elf/release/examples/fibonacci` - -Backups / snapshots -- Earlier stash-save/apply snapshots were created in this workstream. -- There is also filesystem snapshot history under: - - `/home/wusm/rust/ceno/.codex-backups/` - - -## E2E / validation commands executed in compact tower batch + estimator work - -Context -- Full clean was run before validating newly added CUDA kernels, to avoid stale C++/CUDA artifacts. -- Heavy commands used `timeout 1800s` so compilation can be slow, but execution cannot hang indefinitely. -- Logs were written to `/tmp` for later inspection. - -Clean/build commands -```bash -cargo clean -cargo clean --manifest-path ../ceno-gpu/cuda_hal/Cargo.toml -``` - -```bash -cargo build --release --features gpu --package ceno_zkvm --features sanity-check --bin e2e -``` -Result -- Passed. -- Elapsed: `4:07.82`. - -Lightweight sanity e2e after clean -```bash -RUST_LOG=error CENO_CONCURRENT_CHIP_PROVING=0 target/release/e2e --platform=ceno --max-cycle-per-shard=1000 --hints=2 --public-io=5 --shard-id=0 examples/target/riscv32im-ceno-zkvm-elf/release/examples/fibonacci -``` -Result -- Passed. -- Elapsed: `0:09.29`. - -Cargo check after compact batch/estimator edits -```bash -timeout 300s cargo check --features gpu --package ceno_zkvm --bin e2e -``` -Result -- Passed. - -Final lightweight sanity e2e after removing temporary debug probe -```bash -RUST_LOG=error CENO_CONCURRENT_CHIP_PROVING=0 target/release/e2e --platform=ceno --max-cycle-per-shard=1000 --hints=2 --public-io=5 --shard-id=0 examples/target/riscv32im-ceno-zkvm-elf/release/examples/fibonacci -``` -Result -- Passed. -- Elapsed: `0:08.34`. 
- -Heavy e2e command 1: serial proving + GPU mem tracking -```bash -CENO_GPU_MEM_TRACKING=1 CENO_CONCURRENT_CHIP_PROVING=0 CENO_GPU_ENABLE_WITGEN=1 cargo run --config net.git-fetch-with-cli=true --release --package ceno_zkvm --features gpu --bin e2e -- --platform=ceno --max-cycle-per-shard=1600 examples/target/riscv32im-ceno-zkvm-elf/release/examples/keccak_syscall -``` -Executed with timeout/log wrapper: -```bash -/usr/bin/time -f 'elapsed %E' timeout 1800s env CENO_GPU_MEM_TRACKING=1 CENO_CONCURRENT_CHIP_PROVING=0 CENO_GPU_ENABLE_WITGEN=1 cargo run --config net.git-fetch-with-cli=true --release --package ceno_zkvm --features gpu --bin e2e -- --platform=ceno --max-cycle-per-shard=1600 examples/target/riscv32im-ceno-zkvm-elf/release/examples/keccak_syscall 2>&1 | tee /tmp/ceno-keccak-memtracking-serial.log -``` -Initial result -- Failed due to strict memory-estimator overestimate, not proof failure. -- Panic: - - `[memcheck] build_tower_witness_gpu: over-estimate! estimated=146.93MB, actual=126.43MB, diff=20.50MB, margin=10.00MB` -- Elapsed: `1:19.48`. - -After estimator fix, rerun with log: -```bash -/usr/bin/time -f 'elapsed %E' timeout 1800s env CENO_GPU_MEM_TRACKING=1 CENO_CONCURRENT_CHIP_PROVING=0 CENO_GPU_ENABLE_WITGEN=1 cargo run --config net.git-fetch-with-cli=true --release --package ceno_zkvm --features gpu --bin e2e -- --platform=ceno --max-cycle-per-shard=1600 examples/target/riscv32im-ceno-zkvm-elf/release/examples/keccak_syscall 2>&1 | tee /tmp/ceno-keccak-memtracking-serial-after-estimate.log -``` -Final result -- Passed. -- Elapsed: `1:15.43`. -- Log: `/tmp/ceno-keccak-memtracking-serial-after-estimate.log`. 
- -Heavy e2e command 2: concurrent chip proving + GPU witgen -```bash -CENO_GPU_MEM_TRACKING=0 CENO_CONCURRENT_CHIP_PROVING=1 CENO_GPU_ENABLE_WITGEN=1 cargo run --config net.git-fetch-with-cli=true --release --package ceno_zkvm --features gpu --bin e2e -- --platform=ceno --max-cycle-per-shard=1600 examples/target/riscv32im-ceno-zkvm-elf/release/examples/keccak_syscall -``` -Executed with timeout/log wrapper before estimator fix: -```bash -/usr/bin/time -f 'elapsed %E' timeout 1800s env CENO_GPU_MEM_TRACKING=0 CENO_CONCURRENT_CHIP_PROVING=1 CENO_GPU_ENABLE_WITGEN=1 cargo run --config net.git-fetch-with-cli=true --release --package ceno_zkvm --features gpu --bin e2e -- --platform=ceno --max-cycle-per-shard=1600 examples/target/riscv32im-ceno-zkvm-elf/release/examples/keccak_syscall 2>&1 | tee /tmp/ceno-keccak-concurrent-witgen.log -``` -Result -- Passed. -- Elapsed: `0:10.02`. -- Final pool peak around `291MB`. -- Log: `/tmp/ceno-keccak-concurrent-witgen.log`. - -Executed again after estimator fix: -```bash -/usr/bin/time -f 'elapsed %E' timeout 1800s env CENO_GPU_MEM_TRACKING=0 CENO_CONCURRENT_CHIP_PROVING=1 CENO_GPU_ENABLE_WITGEN=1 cargo run --config net.git-fetch-with-cli=true --release --package ceno_zkvm --features gpu --bin e2e -- --platform=ceno --max-cycle-per-shard=1600 examples/target/riscv32im-ceno-zkvm-elf/release/examples/keccak_syscall 2>&1 | tee /tmp/ceno-keccak-concurrent-witgen-after-estimate.log -``` -Final result -- Passed. -- Elapsed: `0:10.74`. -- Log: `/tmp/ceno-keccak-concurrent-witgen-after-estimate.log`. - -Diff hygiene commands -```bash -git diff --check -git -C ../ceno-gpu diff --check -``` -Result -- Both passed. - -## Restart state: benchmark memcheck under-estimate follow-up - -Date: 2026-04-26 - -Current task -- User reported a remaining GPU memory under-estimate when running the top-entry repo `/home/wusm/rust/ceno-reth-benchmark` against the local `/home/wusm/rust/ceno` repo. 
-- The benchmark command must use `--rpc-url "$CENO_RPC"`; do not paste or persist concrete RPC URLs in logs or docs. -- In the current shell, `CENO_RPC` was not set, so the benchmark repro could not be completed before restart. - -Current repo state -- Main repo: `/home/wusm/rust/ceno` -- Current branch includes commit: - - `5ecce046 fix mem estimator` -- Important existing fix already present: - - `ceno_zkvm/src/scheme/gpu/memory.rs` now estimates `build_main_witness` by materialized GKR outputs, not only final tower outputs. - - This fixed the earlier `Ecall_Keccak` under-estimate where old estimate was around `11.73MB` and actual was `16.00MB`. -- Local root `Cargo.toml` and `Cargo.lock` are dirty from pre-existing dependency/local-path work; do not accidentally revert them unless explicitly requested. - -New diagnostic patch added before restart -- Added contextual labels to GPU memcheck output so future failures identify both stage and circuit. -- Files touched: - - `ceno_zkvm/src/scheme/gpu/memory.rs` - - added `check_gpu_mem_estimation_with_context(...)` - - labels now print like `build_main_witness[Ecall_Keccak]` - - `ceno_zkvm/src/scheme/utils.rs` - - `build_main_witness` memcheck now includes first GKR layer name - - `ceno_zkvm/src/scheme/prover.rs` - - replay/build-tower/prove-tower memchecks now include circuit name in sequential GPU proving path - - `ceno_zkvm/src/scheme/gpu/mod.rs` - - prover trait memchecks now include first GKR layer name or task circuit name where available -- This patch is diagnostic/safety oriented; it does not change memory estimates. - -Validation already run after diagnostic patch -```bash -cargo fmt --check -``` -Result -- Passed. - -```bash -timeout 300s cargo check --features gpu --package ceno_zkvm --bin e2e -``` -Result -- Passed. 
- -Lightweight memcheck e2e command run after diagnostic patch -```bash -/usr/bin/time -f 'elapsed %E' timeout 900s env CENO_GPU_MEM_TRACKING=1 CENO_CONCURRENT_CHIP_PROVING=0 CENO_GPU_ENABLE_WITGEN=1 cargo run --config net.git-fetch-with-cli=true --release --package ceno_zkvm --features gpu --bin e2e -- --platform=ceno --max-cycle-per-shard=1600 examples/target/riscv32im-ceno-zkvm-elf/release/examples/keccak_syscall 2>&1 | tee /tmp/ceno-light-keccak-context-memcheck.log -``` -Result -- Memcheck stages passed; no under-estimate panic. -- The run still fails later at the known verifier assertion in `gkr_iop/src/gkr/layer/zerocheck_layer.rs:306`. -- Useful log examples: - - `replay_gpu_witness_from_raw[Ecall_Keccak]: estimated=11.23MB, actual=11.23MB` - - `build_main_witness[Ecall_Keccak]: estimated=32.41MB, actual=32.59MB` - - `build_tower_witness_gpu[Ecall_Keccak]: estimated=105.83MB, actual=106.01MB` - - `prove_tower_relation_gpu[Ecall_Keccak]: estimated=36.84MB, actual=37.26MB` - - `replay_gpu_witness_from_raw[ShardRamCircuit]: estimated=0.38MB, actual=0.38MB` - - `build_main_witness[ShardRamCircuit_main]: estimated=0.01MB, actual=0.01MB` - - `build_tower_witness_gpu[ShardRamCircuit]: estimated=0.01MB, actual=0.02MB` - -Important conclusion so far -- Lightweight Ceno `keccak_syscall` no longer reproduces the reported memcheck under-estimate. -- The remaining issue appears large-payload/top-entry specific and needs the benchmark repro with `CENO_RPC` exported. -- Because contextual memcheck labels are now in place, the next benchmark run should immediately identify the failing stage and circuit. - -Required environment for next session -```bash -export CENO_RPC='' -``` -- The assistant cannot see shell variables unless they are present in the execution environment. 
-- Verify with: -```bash -if [ -n "${CENO_RPC:-}" ]; then echo 'CENO_RPC is set'; else echo 'CENO_RPC is NOT set'; fi -``` - -Benchmark repro command to run next -- Workdir: `/home/wusm/rust/ceno-reth-benchmark` -- Use timeout and tee log. -- Keep `--rpc-url "$CENO_RPC"` exactly; do not expand into a persisted command string. - -```bash -/usr/bin/time -f 'elapsed %E' timeout 2400s env \ - CENO_GPU_MEM_TRACKING=1 \ - CENO_CONCURRENT_CHIP_PROVING=0 \ - CENO_GPU_ENABLE_WITGEN=1 \ - RUST_MIN_STACK=16777216 \ - RUST_BACKTRACE=1 \ - CYCLE_TRACKER_MAX_DEPTH=4 \ - OUTPUT_PATH=metrics.json \ - CENO_GPU_CACHE_LEVEL=0 \ - RUSTFLAGS='-C target-feature=+avx2' \ - JEMALLOC_SYS_WITH_MALLOC_CONF='retain:true,metadata_thp:always,thp:always,dirty_decay_ms:-1,muzzy_decay_ms:-1' \ - RUST_LOG=debug \ - cargo run --features jemalloc --features metrics --features perf-metrics --features gpu --bin ceno-reth-benchmark-bin -- \ - --block-number 23587691 \ - --rpc-url "$CENO_RPC" \ - --cache-dir block_data \ - --mode prove-app \ - --app-proofs ./app_proof.bitcode \ - --shard-id 0 \ - --chain-id 1 \ - 2>&1 | tee /tmp/ceno-reth-benchmark-memcheck.log -``` - -After benchmark fails or completes -1. Extract memcheck failure context: -```bash -rg -n "under-estimate|over-estimate|\\[memcheck\\].*diff=-" /tmp/ceno-reth-benchmark-memcheck.log | tail -120 -``` -2. The failing line should now include a label like: - - `build_main_witness[...]` - - `build_tower_witness_gpu[...]` - - `prove_tower_relation_gpu[...]` - - `replay_gpu_witness_from_raw[...]` -3. Patch only the relevant estimator in `/home/wusm/rust/ceno`. -4. Re-run lightweight Ceno check first: -```bash -cargo fmt --check -timeout 300s cargo check --features gpu --package ceno_zkvm --bin e2e -``` -5. Then rerun the benchmark command above. 
- -Security hygiene -- If a concrete RPC URL accidentally appears in any local log, scrub it immediately: -```bash -for f in /tmp/ceno-reth-benchmark-memcheck.log /home/wusm/rust/ceno-reth-benchmark/*.txt /home/wusm/rust/ceno-reth-benchmark/*.log; do - [ -f "$f" ] || continue - perl -0pi -e 's#https://eth-mainnet\\.g\\.alchemy\\.com/v2/[^\\s\\x27\\"]+#\\$CENO_RPC#g' "$f" -done -``` -- Verify no RPC string remains: -```bash -rg -n 'alchemy|eth-mainnet\.g\.alchemy' /tmp/ceno-reth-benchmark-memcheck.log /home/wusm/rust/ceno-reth-benchmark -g '*.txt' -g '*.log' -g '*.md' -g '*.json' 2>/dev/null || true -``` - -## Architecture refresher: compact GPU witness / memory-estimator terminology - -This section is intended for a fresh session before touching estimators or compact witness code. - -Core terminology -- `occupied rows` / `actual rows`: - - Real number of rows with data for a chip or replay plan. - - Usually `input.num_instances() << rotation_vars` for normal chip inputs. - - For replayed GPU witgen, prefer replay-plan-specific real rows when available. -- `logical domain` / `full domain`: - - Power-of-two domain implied by `num_vars`. - - Some protocols/verifier semantics still reason over this domain. - - Prover storage should avoid allocating it when compact storage is sufficient. -- `compact witness`: - - Device/host storage sized by occupied rows, not full logical domain. - - This is the intended design for the GPU witgen/prover path. -- `materialized output`: - - GKR layer output MLE that is actually allocated during `build_main_witness`. - - `EvalExpression::Single` and `EvalExpression::Linear` materialize; `Zero` does not. -- `final/output GKR layer`: - - `gkr_circuit.layers[0]` because circuit layers are ordered output-to-input. - - The `output_mask` is applied only to this final/output layer during tower witness build. -- `internal GKR layers`: - - Any layer after index 0 in `gkr_circuit.layers`. 
- - These do not receive the final tower `output_mask`; all non-zero outputs are materialized. -- `replay path`: - - GPU witgen can replay raw records into device-backed witness matrices just-in-time. - - Large replay-heavy chips currently include `Ecall_Keccak` and `ShardRamCircuit`. -- `stage split`: - - Large replay chips materialize witness multiple times for separate stages to reduce peak VRAM. - - Estimator must model stage-local peaks, not sum all stages as simultaneously live. - -Important module map -- `ceno_zkvm/src/scheme/gpu/memory.rs` - - Central GPU memory estimator and memcheck assertion logic. - - Key functions: - - `estimate_chip_proof_memory` - - `estimate_trace_bytes` - - `estimate_main_witness_bytes` - - `estimate_tower_stage_components` - - `estimate_main_constraints_bytes` - - `estimate_replay_materialization_bytes_for_plan` - - `check_gpu_mem_estimation_with_context` -- `ceno_zkvm/src/scheme/utils.rs` - - Builds main GKR witness through `build_main_witness` / `gkr_witness`. - - Owns output materialization mask logic: - - `tower_output_count` - - `build_output_materialization_mask` - - `first_layer_output_group_stage_masks` - - Critical design point: - - `output_mask` is applied only to final/output GKR layer. -- `ceno_zkvm/src/scheme/prover.rs` - - Sequential per-chip GPU proving flow and replay stage splitting. - - Important stages: - - replay raw GPU witness - - build main witness - - build tower witness - - prove tower - - replay again for ECC/main constraints if needed -- `ceno_zkvm/src/scheme/gpu/mod.rs` - - GPU prover trait implementations and shared helpers. - - Includes trait-level memchecks for tower/main/ecc/replay helper paths. -- `../ceno-gpu/cuda_hal/src/common/tower/*` - - GPU tower witness/proof host-side implementation. -- `../ceno-gpu/cpp/common/tower.cuh` and kernel files under `../ceno-gpu/cpp/*/kernels/tower.cu` - - CUDA tower kernels and compact split logic. 
-- `../ceno-gpu/cuda_hal/src/common/sumcheck/generic_v2.rs` - - Rust host-side V2 sumcheck setup. -- `../ceno-gpu/cpp/common/sumcheck/generic_v2.cuh` - - CUDA V2 sumcheck logic. - -Current compact witness design assumptions -- Whole flow target: - - commit - - tower prove - - main prove - - rotation prove - - ECC prove - - batch opening - - should operate on compact witness storage wherever prover-side full-domain padding is not semantically required. -- Round-0 original inputs: - - Use direct/native order over real occupied data. - - Do not invent tower-specific order separate from PCS. -- Later folded rounds: - - Use normal in-place/folded buffer semantics. - - Do not call this a replay buffer; call it `in-place buffer`. -- Compact even/odd tails: - - Avoid branch-per-element loops for odd real lengths. - - Decide odd/even outside the loop and process the leftover tail separately. -- Cloning policy: - - Avoid full `clone` / `to_vec` on large witness buffers unless it is intentionally debug-only. - -Main witness memory-estimator design -- Old broken model: - - `tower_output_count(composed_cs) * rows * sizeof(BB31Ext)`. - - This only counts final tower outputs. -- Correct current model: - - Count final/output layer materialized tower outputs under the output mask. - - Plus count all internal layer non-zero outputs because internal layers are not masked. - - Multiply by real output rows, normally `input.witness.first().evaluations_len()`. -- Why this matters: - - Multi-layer GKR circuits like `Ecall_Keccak` materialize internal outputs during `build_main_witness`. - - Single-layer circuits like `ShardRamCircuit_main` usually do not have the same missing-internal-output issue. - -Replay / trace estimator design -- Normal non-replay path: - - Extracted witness and structural MLEs can stay resident across chip proof. - - Stage peak is resident trace plus max temporary stage. 
-- Replay-heavy path (`Ecall_Keccak`, `ShardRamCircuit`): - - Estimate replay materialization from replay plan real rows, not full logical domain. - - Replay witness is materialized for tower stages, then cleared before tower prove/main stages as designed. - - Estimator should use max of replay/build/prove/ecc/main stage peaks plus safety margin. -- Structural witness caveat: - - If structural RMM already has device backing, transport may be view-only and estimate zero new bytes. - - If not device-backed, estimate structural upload by real rows when possible. - -Tower estimator design -- Build stage estimate includes: - - CUDA tower build temporary allocations from `estimate_build_tower_memory`. - - Compact product split buffers. - - Compact logup split buffers. -- Prove stage estimate separates: - - live tower input buffers - - local create-proof temporary allocations -- For logup: - - If table lookup has numerator, numerator buffers are real compact buffers. - - If no numerator, ones/default numerator should not allocate a full domain buffer. - -Scheduler / memcheck relationship -- Sequential + `CENO_GPU_MEM_TRACKING=1`: - - Runs memcheck assertions stage-by-stage. - - This is the best mode for estimator debugging. -- Concurrent + mem tracking disabled: - - Uses estimator for booking/scheduling VRAM, not direct memcheck assertions. -- Booking can include extra safety margin for replay-heavy chips in concurrent mode. -- A stage-local memcheck pass does not automatically prove concurrent booking is optimal, but it strongly validates the per-stage estimator. - -Current known caveats -- Lightweight `keccak_syscall` memchecks pass after current estimator fixes. -- The lightweight run still hits a known verifier assertion later at: - - `gkr_iop/src/gkr/layer/zerocheck_layer.rs:306` -- The remaining reported under-estimate is only known from the top-entry benchmark payload and must be reproduced with `CENO_RPC` exported. 
-- Do not guess the failing estimator from the old generic label; use the new contextual memcheck label first. - -Recommended investigation discipline -1. Reproduce with sequential mem tracking first: - - `CENO_GPU_MEM_TRACKING=1` - - `CENO_CONCURRENT_CHIP_PROVING=0` -2. Read the exact contextual label: - - `build_main_witness[...]` - - `build_tower_witness_gpu[...]` - - `prove_tower_relation_gpu[...]` - - `replay_gpu_witness_from_raw[...]` -3. Patch only the estimator for that stage/circuit class. -4. Validate in `/home/wusm/rust/ceno` first: - - `cargo fmt --check` - - `timeout 300s cargo check --features gpu --package ceno_zkvm --bin e2e` -5. Then rerun the top-entry benchmark. From 147f5679142911e16a6057d06cd4f5fc0bbd4d89 Mon Sep 17 00:00:00 2001 From: "sm.wu" Date: Tue, 28 Apr 2026 08:11:12 +0800 Subject: [PATCH 20/25] add missing synchronization, avoid race condition --- ceno_zkvm/src/scheme/prover.rs | 17 +++++++++++++++++ 1 file changed, 17 insertions(+) diff --git a/ceno_zkvm/src/scheme/prover.rs b/ceno_zkvm/src/scheme/prover.rs index 33627ae68..0d5a9feab 100644 --- a/ceno_zkvm/src/scheme/prover.rs +++ b/ceno_zkvm/src/scheme/prover.rs @@ -1,6 +1,7 @@ use ff_ext::ExtensionField; use gkr_iop::{ cpu::{CpuBackend, CpuProver}, + error::BackendError, hal::ProverBackend, }; use std::{collections::BTreeMap, marker::PhantomData, sync::Arc}; @@ -1144,6 +1145,20 @@ where .get_pool_stream() .expect("should acquire stream"); let _thread_stream_guard = gkr_iop::gpu::bind_thread_stream(_stream.clone()); + let sync_concurrent_chip_stream = || -> Result<(), ZKVMError> { + if ChipScheduler::is_concurrent_mode() { + cuda_hal + .inner + .synchronize_stream(_stream.stream()) + .map_err(|e| { + ZKVMError::BackendError(BackendError::CircuitError( + format!("failed to synchronize GPU chip proof stream for {name}: {e:?}") + .into_boxed_str(), + )) + })?; + } + Ok(()) + }; let replay_stage_split = gpu_replay_plan .as_ref() .is_some_and(|plan| matches!(plan.kind, 
GpuWitgenKind::Keccak | GpuWitgenKind::ShardRam)); @@ -1399,6 +1414,7 @@ where wits_in_evals, fixed_in_evals, } = evals; + sync_concurrent_chip_stream()?; clear_materialized_input(&mut input); log_gpu_device_state(&format!("{name}:after_main_constraints")); exit_span!(span); @@ -1483,6 +1499,7 @@ where wits_in_evals, fixed_in_evals, } = evals; + sync_concurrent_chip_stream()?; exit_span!(span); Ok(( From 94fc7bfb1cd40b90488135e78ad25c0e69b35f9c Mon Sep 17 00:00:00 2001 From: "sm.wu" Date: Tue, 28 Apr 2026 10:41:27 +0800 Subject: [PATCH 21/25] Account ShardRam tower prove allocator overhead --- ceno_zkvm/src/scheme/gpu/memory.rs | 23 ++++++++++++++++++----- ceno_zkvm/src/scheme/gpu/mod.rs | 1 + ceno_zkvm/src/scheme/prover.rs | 10 +++++++--- 3 files changed, 26 insertions(+), 8 deletions(-) diff --git a/ceno_zkvm/src/scheme/gpu/memory.rs b/ceno_zkvm/src/scheme/gpu/memory.rs index f61bce28f..4abad8566 100644 --- a/ceno_zkvm/src/scheme/gpu/memory.rs +++ b/ceno_zkvm/src/scheme/gpu/memory.rs @@ -46,6 +46,15 @@ pub fn init_gpu_mem_tracker<'a>( const ESTIMATION_TOLERANCE_BYTES: usize = 2 * 1024 * 1024; // max under-estimation error: 2 MB const ESTIMATION_SAFETY_MARGIN_BYTES: usize = 10 * 1024 * 1024; // reserved headroom / allowed over-estimate margin: 10 MB const SCHEDULER_ESTIMATION_WARNING_MARGIN_BYTES: usize = 512 * 1024 * 1024; +const SHARD_RAM_TOWER_PROVE_ALLOCATOR_OVERHEAD_BYTES: usize = 16 * 1024 * 1024; + +pub(crate) fn tower_prove_allocator_overhead_bytes(circuit_name: &str) -> usize { + if circuit_name == "ShardRamCircuit" { + SHARD_RAM_TOWER_PROVE_ALLOCATOR_OVERHEAD_BYTES + } else { + 0 + } +} /// Validate that the estimated GPU memory matches actual usage within tolerance. 
/// - Under-estimate (actual > estimated): diff must be <= `ESTIMATION_TOLERANCE_BYTES` @@ -623,13 +632,12 @@ fn estimate_tower_stage_components Date: Tue, 28 Apr 2026 10:49:41 +0800 Subject: [PATCH 22/25] misc: clippy fix --- ceno_zkvm/src/scheme/prover.rs | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/ceno_zkvm/src/scheme/prover.rs b/ceno_zkvm/src/scheme/prover.rs index f4a525bf3..4294adfbb 100644 --- a/ceno_zkvm/src/scheme/prover.rs +++ b/ceno_zkvm/src/scheme/prover.rs @@ -1,7 +1,8 @@ use ff_ext::ExtensionField; +#[cfg(feature = "gpu")] +use gkr_iop::error::BackendError; use gkr_iop::{ cpu::{CpuBackend, CpuProver}, - error::BackendError, hal::ProverBackend, }; use std::{collections::BTreeMap, marker::PhantomData, sync::Arc}; From d14e66a2cf5bb75859a6ad433c686921a4414d41 Mon Sep 17 00:00:00 2001 From: "sm.wu" Date: Tue, 28 Apr 2026 14:52:38 +0800 Subject: [PATCH 23/25] Fix GPU proof memory estimation --- ceno_zkvm/src/scheme/gpu/memory.rs | 37 +++++++++++++++--------------- ceno_zkvm/src/scheme/gpu/mod.rs | 36 ++++++++++++++++++----------- ceno_zkvm/src/scheme/prover.rs | 11 ++++++--- 3 files changed, 49 insertions(+), 35 deletions(-) diff --git a/ceno_zkvm/src/scheme/gpu/memory.rs b/ceno_zkvm/src/scheme/gpu/memory.rs index 4abad8566..a2bf8f086 100644 --- a/ceno_zkvm/src/scheme/gpu/memory.rs +++ b/ceno_zkvm/src/scheme/gpu/memory.rs @@ -225,11 +225,11 @@ pub fn estimate_chip_proof_memory (usize, usize) { let base_elem_size = std::mem::size_of::(); - let mle_len = 1usize << num_vars; let compact_poly_bytes = num_witin * occupied_rows * base_elem_size; - let logical_poly_bytes = num_witin * mle_len * base_elem_size; + let transpose_temporary_bytes = 2 * compact_poly_bytes; if should_materialize_witness_on_gpu() { if should_retain_witness_device_backing_after_commit() { @@ -725,15 +724,17 @@ pub(crate) fn estimate_trace_extraction_bytes( } // GPU witgen alone does not imply replayability. 
Non-replayable traces - // still go through basefold::get_trace in cache-none mode, which - // allocates the extracted witness plus a temporary 2x transpose buffer. - return (compact_poly_bytes, 2 * logical_poly_bytes); + // still go through basefold::get_trace in cache-none mode. The fallback + // transpose buffer is 2x the compact RMM backing, not 2x the logical + // domain length. + return (compact_poly_bytes, transpose_temporary_bytes); } if matches!(get_gpu_cache_level(), CacheLevel::None) { // Default cache level is None - // get_trace allocates poly copies (resident) + temp_buffer (2x, freed after) - (compact_poly_bytes, 2 * logical_poly_bytes) + // get_trace allocates poly copies (resident) + temp_buffer over the + // compact RMM backing (2x, freed after). + (compact_poly_bytes, transpose_temporary_bytes) } else { (0, 0) } diff --git a/ceno_zkvm/src/scheme/gpu/mod.rs b/ceno_zkvm/src/scheme/gpu/mod.rs index f816f0e7b..4e5f3d0de 100644 --- a/ceno_zkvm/src/scheme/gpu/mod.rs +++ b/ceno_zkvm/src/scheme/gpu/mod.rs @@ -27,6 +27,7 @@ use ceno_gpu::{ use either::Either; use ff_ext::ExtensionField; use gkr_iop::{ + error::BackendError, gkr::{ self, Evaluation, GKRProof, GKRProverOutput, layer::{LayerWitness, gpu::utils::extract_mle_relationships_from_monomial_terms}, @@ -345,7 +346,7 @@ pub fn prove_tower_relation_impl as ProverBackend>::E>, cuda_hal: &Arc, -) -> TowerRelationOutput { +) -> Result, ZKVMError> { let stream = gkr_iop::gpu::get_thread_stream(); if std::any::TypeId::of::() != std::any::TypeId::of::() { panic!("GPU backend only supports Goldilocks base field"); @@ -360,11 +361,12 @@ pub fn prove_tower_relation_impl> TowerProver(composed_cs, input); check_gpu_mem_estimation_with_context( diff --git a/ceno_zkvm/src/scheme/prover.rs b/ceno_zkvm/src/scheme/prover.rs index 4294adfbb..eafad53ff 100644 --- a/ceno_zkvm/src/scheme/prover.rs +++ b/ceno_zkvm/src/scheme/prover.rs @@ -1372,8 +1372,13 @@ where basic_tr, gkr_iop::gpu::get_thread_stream().as_ref(), ) - 
.expect("gpu tower create_proof failed") - }); + .map_err(|e| { + ZKVMError::BackendError(BackendError::CircuitError( + format!("gpu tower create_proof failed for {name}: {e:?}") + .into_boxed_str(), + )) + }) + })?; log_gpu_device_state(&format!("{name}:after_prove_tower")); log_gpu_pool_usage(&format!("{name}:after_prove_tower")); let rt_tower: Point = unsafe { std::mem::transmute(rt_tower_gl) }; @@ -1469,7 +1474,7 @@ where prove_tower_relation_impl::( cs, &input, &records, challenges, transcript, &cuda_hal, ) - }); + })?; exit_span!(span); drop(records); From ceced51d0da6df0c6a767682f390f2b00ee72a59 Mon Sep 17 00:00:00 2001 From: "sm.wu" Date: Tue, 28 Apr 2026 15:26:48 +0800 Subject: [PATCH 24/25] Fix GPU proof estimate row basis --- ceno_zkvm/src/scheme/gpu/memory.rs | 44 ++++++++++++++++++++++++++---- ceno_zkvm/src/scheme/prover.rs | 19 +++++++++++++ ceno_zkvm/src/scheme/scheduler.rs | 3 ++ 3 files changed, 60 insertions(+), 6 deletions(-) diff --git a/ceno_zkvm/src/scheme/gpu/memory.rs b/ceno_zkvm/src/scheme/gpu/memory.rs index a2bf8f086..c4b54a2ff 100644 --- a/ceno_zkvm/src/scheme/gpu/memory.rs +++ b/ceno_zkvm/src/scheme/gpu/memory.rs @@ -183,11 +183,14 @@ pub fn estimate_chip_proof_memory>, circuit_name: &str, replay_plan: Option<&GpuReplayPlan>, + witness_trace_rows: Option, structural_cached_on_device: bool, ) -> u64 { let num_var_with_rotation = input.log2_num_instances() + composed_cs.rotation_vars().unwrap_or(0); let witness_replayable = replay_plan.is_some(); + let occupied_rows = + estimate_witness_occupied_rows(composed_cs, input, replay_plan, witness_trace_rows); let structural_resident_bytes = if structural_cached_on_device { 0 } else { @@ -204,10 +207,11 @@ pub fn estimate_chip_proof_memory>( + composed_cs: &ComposedConstrainSystem, + input: &ProofInput<'_, GpuBackend>, + replay_plan: Option<&GpuReplayPlan>, + witness_trace_rows: Option, +) -> usize { + if let Some(replay_plan) = replay_plan { + return replay_plan_actual_rows(replay_plan); + } + 
input + .witness + .first() + .map(|mle| mle.evaluations_len()) + .or(witness_trace_rows) + .unwrap_or_else(|| input.num_instances() << composed_cs.rotation_vars().unwrap_or(0)) +} + pub(crate) struct TraceEstimate { /// Persistent resident bytes (witness polys + structural MLEs) pub(crate) trace_resident_bytes: usize, @@ -421,6 +442,7 @@ pub(crate) fn estimate_trace_bytes>, witness_replayable: bool, structural_cached_on_device: bool, + occupied_rows_override: Option, ) -> TraceEstimate { let cs = &composed_cs.zkvm_v1_css; let num_var_with_rotation = @@ -455,7 +477,9 @@ pub(crate) fn estimate_trace_bytes(out_eval: &EvalExpression pub fn main_witness_output_rows>( composed_cs: &ComposedConstrainSystem, input: &ProofInput<'_, GpuBackend>, + occupied_rows_override: Option, ) -> usize { if composed_cs .gkr_circuit @@ -533,6 +558,7 @@ pub fn main_witness_output_rows>( composed_cs: &ComposedConstrainSystem, input: &ProofInput<'_, GpuBackend>, + occupied_rows_override: Option, ) -> (usize, usize, usize, usize) { let cs = &composed_cs.zkvm_v1_css; let num_prod_towers = composed_cs.num_reads() + composed_cs.num_writes(); @@ -622,7 +649,12 @@ fn estimate_tower_stage_components(); let has_logup_numerator = composed_cs.is_with_lk_table(); - let occupied_rows = input.num_instances() << composed_cs.rotation_vars().unwrap_or(0); + let occupied_rows = input + .witness + .first() + .map(|mle| mle.evaluations_len()) + .or(occupied_rows_override) + .unwrap_or_else(|| input.num_instances() << composed_cs.rotation_vars().unwrap_or(0)); let build_est = estimate_build_tower_memory( num_prod_towers, num_logup_towers, @@ -676,7 +708,7 @@ pub(crate) fn estimate_tower_stage_bytes>, ) -> (usize, usize) { let (build_bytes, prove_local_bytes, _, _) = - estimate_tower_stage_components(composed_cs, input); + estimate_tower_stage_components(composed_cs, input, None); (build_bytes, prove_local_bytes) } @@ -685,7 +717,7 @@ pub(crate) fn estimate_tower_bytes>, ) -> usize { let (build_bytes, 
prove_local_bytes, tower_input_live_bytes, _) = - estimate_tower_stage_components(composed_cs, input); + estimate_tower_stage_components(composed_cs, input, None); build_bytes.max(tower_input_live_bytes + prove_local_bytes) } diff --git a/ceno_zkvm/src/scheme/prover.rs b/ceno_zkvm/src/scheme/prover.rs index eafad53ff..eb31e8901 100644 --- a/ceno_zkvm/src/scheme/prover.rs +++ b/ceno_zkvm/src/scheme/prover.rs @@ -222,6 +222,8 @@ impl< let mut structural_rmms = Vec::with_capacity(name_and_instances.len()); #[cfg(feature = "gpu")] let mut gpu_replay_plans = Vec::with_capacity(name_and_instances.len()); + #[cfg(feature = "gpu")] + let mut witness_trace_rows = Vec::with_capacity(name_and_instances.len()); // commit to opcode circuits first and then commit to table circuits, sorted by name for (i, chip_input) in witnesses.into_iter_sorted().enumerate() { let crate::structs::ChipInput { @@ -235,6 +237,15 @@ impl< #[cfg(feature = "gpu")] let use_deferred_gpu_commit = crate::instructions::gpu::config::is_gpu_witgen_enabled() && !crate::instructions::gpu::config::should_retain_witness_device_backing_after_commit(); + #[cfg(feature = "gpu")] + let trace_rows_for_estimate = + if !crate::instructions::gpu::config::is_gpu_witgen_enabled() + && witness_rmm.num_instances() > 0 + { + Some(witness_rmm.height()) + } else { + None + }; #[cfg(feature = "gpu")] if use_deferred_gpu_commit { @@ -255,6 +266,8 @@ impl< } structural_rmms.push(structural_witness_rmm); #[cfg(feature = "gpu")] + witness_trace_rows.push(trace_rows_for_estimate); + #[cfg(feature = "gpu")] gpu_replay_plans.push(gpu_replay_plan); } @@ -366,6 +379,8 @@ impl< structural_rmms, #[cfg(feature = "gpu")] gpu_replay_plans, + #[cfg(feature = "gpu")] + witness_trace_rows, witness_mles, &witness_data, fixed_mles, @@ -873,6 +888,7 @@ impl< name_and_instances: Vec<(String, [usize; 2])>, structural_rmms: Vec>, #[cfg(feature = "gpu")] gpu_replay_plans: Vec>>, + #[cfg(feature = "gpu")] witness_trace_rows: Vec>, #[allow(unused_mut)] 
mut witness_mles: Vec>, witness_data: &PB::PcsData, mut fixed_mles: Vec>>, @@ -1001,6 +1017,7 @@ impl< gpu_input, &circuit_name, gpu_replay_plans[this_idx].as_ref(), + witness_trace_rows[this_idx], structural_cached_on_device, ) }; @@ -1054,6 +1071,8 @@ impl< witness_trace_idx, #[cfg(feature = "gpu")] gpu_replay_plan, + #[cfg(feature = "gpu")] + witness_trace_rows: witness_trace_rows[this_idx], num_witin: cs.num_witin(), structural_rmm: task_structural_rmm, }); diff --git a/ceno_zkvm/src/scheme/scheduler.rs b/ceno_zkvm/src/scheme/scheduler.rs index e792b6fd4..060f91083 100644 --- a/ceno_zkvm/src/scheme/scheduler.rs +++ b/ceno_zkvm/src/scheme/scheduler.rs @@ -90,6 +90,9 @@ pub struct ChipTask<'a, PB: ProverBackend> { /// Replay witness directly from shard-resident raw GPU data when available. #[cfg(feature = "gpu")] pub gpu_replay_plan: Option>, + /// Actual witness trace rows used for cache-none extraction estimates. + #[cfg(feature = "gpu")] + pub witness_trace_rows: Option, /// Expected number of witness polynomials for this circuit pub num_witin: usize, /// CPU-side structural witness RowMajorMatrix, transported to GPU on-demand From d1ab71a052c7ef5a49a9aa3b8daaec66e1c380f8 Mon Sep 17 00:00:00 2001 From: "sm.wu" Date: Tue, 28 Apr 2026 15:44:25 +0800 Subject: [PATCH 25/25] Tune ShardRam tower proof estimate --- ceno_zkvm/src/scheme/gpu/memory.rs | 39 ++++++++++++++++++------------ ceno_zkvm/src/scheme/gpu/mod.rs | 4 +-- ceno_zkvm/src/scheme/prover.rs | 12 +++------ 3 files changed, 30 insertions(+), 25 deletions(-) diff --git a/ceno_zkvm/src/scheme/gpu/memory.rs b/ceno_zkvm/src/scheme/gpu/memory.rs index c4b54a2ff..02617b83a 100644 --- a/ceno_zkvm/src/scheme/gpu/memory.rs +++ b/ceno_zkvm/src/scheme/gpu/memory.rs @@ -46,15 +46,7 @@ pub fn init_gpu_mem_tracker<'a>( const ESTIMATION_TOLERANCE_BYTES: usize = 2 * 1024 * 1024; // max under-estimation error: 2 MB const ESTIMATION_SAFETY_MARGIN_BYTES: usize = 10 * 1024 * 1024; // reserved headroom / allowed over-estimate 
margin: 10 MB const SCHEDULER_ESTIMATION_WARNING_MARGIN_BYTES: usize = 512 * 1024 * 1024; -const SHARD_RAM_TOWER_PROVE_ALLOCATOR_OVERHEAD_BYTES: usize = 16 * 1024 * 1024; - -pub(crate) fn tower_prove_allocator_overhead_bytes(circuit_name: &str) -> usize { - if circuit_name == "ShardRamCircuit" { - SHARD_RAM_TOWER_PROVE_ALLOCATOR_OVERHEAD_BYTES - } else { - 0 - } -} +const SHARD_RAM_TOWER_PROVE_TOLERANCE_BYTES: usize = 16 * 1024 * 1024; /// Validate that the estimated GPU memory matches actual usage within tolerance. /// - Under-estimate (actual > estimated): diff must be <= `ESTIMATION_TOLERANCE_BYTES` @@ -77,6 +69,28 @@ pub fn check_gpu_mem_estimation_with_context( ); } +pub(crate) fn check_gpu_tower_prove_mem_estimation_with_context( + mem_tracker: Option, + estimated_bytes: usize, + context: Option<&str>, +) { + let (under_tolerance_bytes, over_tolerance_bytes) = if context == Some("ShardRamCircuit") { + ( + SHARD_RAM_TOWER_PROVE_TOLERANCE_BYTES, + SHARD_RAM_TOWER_PROVE_TOLERANCE_BYTES, + ) + } else { + (ESTIMATION_TOLERANCE_BYTES, ESTIMATION_SAFETY_MARGIN_BYTES) + }; + check_gpu_mem_estimation_with_margins( + mem_tracker, + estimated_bytes, + context, + under_tolerance_bytes, + over_tolerance_bytes, + ); +} + pub fn check_gpu_scheduler_mem_estimation_with_context( mem_tracker: Option, estimated_bytes: usize, @@ -686,12 +700,7 @@ fn estimate_tower_stage_components = unsafe { std::mem::transmute(rt_tower_gl) }; let tower_proof: TowerProofs = unsafe { std::mem::transmute(tower_proof_gpu) }; - crate::scheme::gpu::check_gpu_mem_estimation_with_context( + crate::scheme::gpu::check_gpu_tower_prove_mem_estimation_with_context( tower_prove_mem_tracker, tower_prove_estimated_bytes, Some(name),