From ac49ac67e988f7ca6527db42babff0497190b1f5 Mon Sep 17 00:00:00 2001 From: "sm.wu" Date: Sat, 25 Apr 2026 23:18:37 +0800 Subject: [PATCH 01/25] refactor GPU compact tower witness flow --- Cargo.lock | 124 ++++++- Cargo.toml | 62 ++-- ceno_zkvm/src/bin/e2e.rs | 13 +- .../src/instructions/gpu/chips/keccak.rs | 24 +- .../src/instructions/gpu/chips/shard_ram.rs | 8 +- ceno_zkvm/src/instructions/gpu/dispatch.rs | 4 +- ceno_zkvm/src/instructions/gpu/utils/d2h.rs | 4 +- ceno_zkvm/src/scheme/gpu/memory.rs | 86 ++++- ceno_zkvm/src/scheme/gpu/mod.rs | 330 ++++++++++-------- ceno_zkvm/src/scheme/prover.rs | 15 +- ceno_zkvm/src/scheme/utils.rs | 8 +- ceno_zkvm/src/scheme/verifier.rs | 8 + gkr_iop/src/gkr/layer/gpu/utils.rs | 13 +- gkr_iop/src/gpu/mod.rs | 20 +- gkr_iop/src/utils.rs | 9 +- 15 files changed, 456 insertions(+), 272 deletions(-) diff --git a/Cargo.lock b/Cargo.lock index a83e37c45..ba90fc0e6 100644 --- a/Cargo.lock +++ b/Cargo.lock @@ -1600,10 +1600,49 @@ version = "0.0.6" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "e2931af7e13dc045d8e9d26afccc6fa115d64e115c9c84b1166288b46f6782c2" +[[package]] +name = "cuda-config" +version = "0.1.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "4ee74643f7430213a1a78320f88649de309b20b80818325575e393f848f79f5d" +dependencies = [ + "glob", +] + +[[package]] +name = "cuda-runtime-sys" +version = "0.3.0-alpha.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "5d070b301187fee3c611e75a425cf12247b7c75c09729dbdef95cb9cb64e8c39" +dependencies = [ + "cuda-config", +] + [[package]] name = "cuda_hal" version = "0.1.0" -source = "git+https://github.com/scroll-tech/ceno-gpu-mock.git?branch=main#fe8f7923b7d3a3823c27949fab0aab8e31011aa9" +dependencies = [ + "anyhow", + "cuda-runtime-sys", + "cudarc", + "downcast-rs", + "either", + "ff_ext", + "itertools 0.13.0", + "mpcs", + "multilinear_extensions", + "p3", + "rand 0.8.5", + "rayon", + "sha2", + 
"sppark", + "sppark_plug", + "sumcheck", + "thiserror 1.0.69", + "tracing", + "transcript", + "witness", +] [[package]] name = "cudarc" @@ -2237,7 +2276,6 @@ dependencies = [ [[package]] name = "ff_ext" version = "0.1.0" -source = "git+https://github.com/scroll-tech/gkr-backend.git?tag=v1.0.0-alpha.24#a3538e3529a7eb87e8867f4a87b760d7ad9991f7" dependencies = [ "once_cell", "p3", @@ -2671,6 +2709,15 @@ dependencies = [ "digest 0.10.7", ] +[[package]] +name = "home" +version = "0.5.12" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "cc627f471c528ff0c4a49e1d5e60450c8f6461dd6d10ba9dcd3a61d3dff7728d" +dependencies = [ + "windows-sys 0.61.1", +] + [[package]] name = "iana-time-zone" version = "0.1.64" @@ -3102,6 +3149,12 @@ dependencies = [ "vcpkg", ] +[[package]] +name = "linux-raw-sys" +version = "0.4.15" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "d26c52dbd32dccf2d10cac7725f8eae5296885fb5703b261f7d0a0739ec807ab" + [[package]] name = "linux-raw-sys" version = "0.9.4" @@ -3243,7 +3296,6 @@ dependencies = [ [[package]] name = "mpcs" version = "0.1.0" -source = "git+https://github.com/scroll-tech/gkr-backend.git?tag=v1.0.0-alpha.24#a3538e3529a7eb87e8867f4a87b760d7ad9991f7" dependencies = [ "bincode 1.3.3", "clap", @@ -3267,7 +3319,6 @@ dependencies = [ [[package]] name = "multilinear_extensions" version = "0.1.0" -source = "git+https://github.com/scroll-tech/gkr-backend.git?tag=v1.0.0-alpha.24#a3538e3529a7eb87e8867f4a87b760d7ad9991f7" dependencies = [ "either", "ff_ext", @@ -4558,7 +4609,6 @@ dependencies = [ [[package]] name = "p3" version = "0.1.0" -source = "git+https://github.com/scroll-tech/gkr-backend.git?tag=v1.0.0-alpha.24#a3538e3529a7eb87e8867f4a87b760d7ad9991f7" dependencies = [ "p3-air", "p3-baby-bear", @@ -5126,7 +5176,6 @@ dependencies = [ [[package]] name = "poseidon" version = "0.1.0" -source = 
"git+https://github.com/scroll-tech/gkr-backend.git?tag=v1.0.0-alpha.24#a3538e3529a7eb87e8867f4a87b760d7ad9991f7" dependencies = [ "ff_ext", "p3", @@ -5724,6 +5773,19 @@ dependencies = [ "semver 1.0.26", ] +[[package]] +name = "rustix" +version = "0.38.44" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "fdb5bc1ae2baa591800df16c9ca78619bf65c0488b41b96ccec5d11220d8c154" +dependencies = [ + "bitflags", + "errno", + "libc", + "linux-raw-sys 0.4.15", + "windows-sys 0.59.0", +] + [[package]] name = "rustix" version = "1.0.7" @@ -5733,7 +5795,7 @@ dependencies = [ "bitflags", "errno", "libc", - "linux-raw-sys", + "linux-raw-sys 0.9.4", "windows-sys 0.59.0", ] @@ -6083,7 +6145,6 @@ dependencies = [ [[package]] name = "sp1-curves" version = "0.1.0" -source = "git+https://github.com/scroll-tech/gkr-backend.git?tag=v1.0.0-alpha.24#a3538e3529a7eb87e8867f4a87b760d7ad9991f7" dependencies = [ "cfg-if", "dashu", @@ -6118,6 +6179,25 @@ dependencies = [ "der", ] +[[package]] +name = "sppark" +version = "0.1.11" +dependencies = [ + "cc", + "which", +] + +[[package]] +name = "sppark_plug" +version = "0.1.0" +dependencies = [ + "cc", + "ff_ext", + "itertools 0.13.0", + "p3", + "sppark", +] + [[package]] name = "stable_deref_trait" version = "1.2.0" @@ -6208,7 +6288,6 @@ checksum = "13c2bddecc57b384dee18652358fb23172facb8a2c51ccc10d74c157bdea3292" [[package]] name = "sumcheck" version = "0.1.0" -source = "git+https://github.com/scroll-tech/gkr-backend.git?tag=v1.0.0-alpha.24#a3538e3529a7eb87e8867f4a87b760d7ad9991f7" dependencies = [ "either", "ff_ext", @@ -6226,7 +6305,6 @@ dependencies = [ [[package]] name = "sumcheck_macro" version = "0.1.0" -source = "git+https://github.com/scroll-tech/gkr-backend.git?tag=v1.0.0-alpha.24#a3538e3529a7eb87e8867f4a87b760d7ad9991f7" dependencies = [ "itertools 0.13.0", "p3", @@ -6307,7 +6385,7 @@ dependencies = [ "fastrand", "getrandom 0.3.2", "once_cell", - "rustix", + "rustix 1.0.7", "windows-sys 0.59.0", ] @@ -6633,7 
+6711,6 @@ dependencies = [ [[package]] name = "transcript" version = "0.1.0" -source = "git+https://github.com/scroll-tech/gkr-backend.git?tag=v1.0.0-alpha.24#a3538e3529a7eb87e8867f4a87b760d7ad9991f7" dependencies = [ "ff_ext", "itertools 0.13.0", @@ -6924,10 +7001,21 @@ dependencies = [ "wasm-bindgen", ] +[[package]] +name = "which" +version = "4.4.2" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "87ba24419a2078cd2b0f2ede2691b6c66d8e47836da3b6db8265ebad47afbfc7" +dependencies = [ + "either", + "home", + "once_cell", + "rustix 0.38.44", +] + [[package]] name = "whir" version = "0.1.0" -source = "git+https://github.com/scroll-tech/gkr-backend.git?tag=v1.0.0-alpha.24#a3538e3529a7eb87e8867f4a87b760d7ad9991f7" dependencies = [ "bincode 1.3.3", "clap", @@ -7055,6 +7143,15 @@ dependencies = [ "windows-targets 0.53.4", ] +[[package]] +name = "windows-sys" +version = "0.61.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "6f109e41dd4a3c848907eb83d5a42ea98b3769495597450cf6d153507b166f0f" +dependencies = [ + "windows-link", +] + [[package]] name = "windows-targets" version = "0.52.6" @@ -7214,7 +7311,6 @@ dependencies = [ [[package]] name = "witness" version = "0.1.0" -source = "git+https://github.com/scroll-tech/gkr-backend.git?tag=v1.0.0-alpha.24#a3538e3529a7eb87e8867f4a87b760d7ad9991f7" dependencies = [ "ff_ext", "multilinear_extensions", diff --git a/Cargo.toml b/Cargo.toml index 8cc5823a5..8b79e59fe 100644 --- a/Cargo.toml +++ b/Cargo.toml @@ -1,16 +1,16 @@ [workspace] members = [ - "ceno_cli", - "ceno_emul", - "ceno_host", - "ceno_serde", - "ceno_rt", - "ceno_zkvm", - "ceno_recursion", - "derive", - "examples-builder", - "examples", - "guest_libs/*", + "ceno_cli", + "ceno_emul", + "ceno_host", + "ceno_serde", + "ceno_rt", + "ceno_zkvm", + "ceno_recursion", + "derive", + "examples-builder", + "examples", + "guest_libs/*", ] resolver = "2" @@ -66,11 +66,11 @@ secp = "0.4.1" serde = { version = "1.0", 
features = ["derive", "rc"] } serde_json = "1.0" smallvec = { version = "1.13.2", features = [ - "const_generics", - "const_new", - "serde", - "union", - "write", + "const_generics", + "const_new", + "serde", + "union", + "write", ] } strum = "0.26" strum_macros = "0.26" @@ -79,7 +79,7 @@ thiserror = "2" thread_local = "1.1" tiny-keccak = { version = "2.0.2", features = ["keccak"] } tracing = { version = "0.1", features = [ - "attributes", + "attributes", ] } tracing-forest = { version = "0.1.6" } tracing-subscriber = { version = "0.3", features = ["env-filter"] } @@ -127,20 +127,20 @@ lto = "thin" #ceno_crypto_primitives = { path = "../ceno-patch/crypto-primitives", package = "ceno_crypto_primitives" } #ceno_syscall = { path = "../ceno-patch/syscall", package = "ceno_syscall" } -#[patch."https://github.com/scroll-tech/ceno-gpu-mock.git"] -#ceno_gpu = { path = "../ceno-gpu/cuda_hal", package = "cuda_hal", default-features = false, features = ["bb31"] } - -#[patch."https://github.com/scroll-tech/gkr-backend"] -#ff_ext = { path = "../gkr-backend/crates/ff_ext", package = "ff_ext" } -#mpcs = { path = "../gkr-backend/crates/mpcs", package = "mpcs" } -#multilinear_extensions = { path = "../gkr-backend/crates/multilinear_extensions", package = "multilinear_extensions" } -#p3 = { path = "../gkr-backend/crates/p3", package = "p3" } -#poseidon = { path = "../gkr-backend/crates/poseidon", package = "poseidon" } -#sp1-curves = { path = "../gkr-backend/crates/curves", package = "sp1-curves" } -#sumcheck = { path = "../gkr-backend/crates/sumcheck", package = "sumcheck" } -#transcript = { path = "../gkr-backend/crates/transcript", package = "transcript" } -#whir = { path = "../gkr-backend/crates/whir", package = "whir" } -#witness = { path = "../gkr-backend/crates/witness", package = "witness" } +[patch."https://github.com/scroll-tech/ceno-gpu-mock.git"] +ceno_gpu = { path = "../ceno-gpu/cuda_hal", package = "cuda_hal", default-features = false, features = ["bb31"] } + 
+[patch."https://github.com/scroll-tech/gkr-backend"] +ff_ext = { path = "../gkr-backend/crates/ff_ext", package = "ff_ext" } +mpcs = { path = "../gkr-backend/crates/mpcs", package = "mpcs" } +multilinear_extensions = { path = "../gkr-backend/crates/multilinear_extensions", package = "multilinear_extensions" } +p3 = { path = "../gkr-backend/crates/p3", package = "p3" } +poseidon = { path = "../gkr-backend/crates/poseidon", package = "poseidon" } +sp1-curves = { path = "../gkr-backend/crates/curves", package = "sp1-curves" } +sumcheck = { path = "../gkr-backend/crates/sumcheck", package = "sumcheck" } +transcript = { path = "../gkr-backend/crates/transcript", package = "transcript" } +whir = { path = "../gkr-backend/crates/whir", package = "whir" } +witness = { path = "../gkr-backend/crates/witness", package = "witness" } # [patch."https://github.com/scroll-tech/openvm.git"] # openvm = { path = "../openvm-scroll-tech/crates/toolchain/openvm", default-features = false } diff --git a/ceno_zkvm/src/bin/e2e.rs b/ceno_zkvm/src/bin/e2e.rs index 95b15b581..721708389 100644 --- a/ceno_zkvm/src/bin/e2e.rs +++ b/ceno_zkvm/src/bin/e2e.rs @@ -352,17 +352,10 @@ fn run_inner< fs::write(&vk_file, vk_bytes).unwrap(); if checkpoint > Checkpoint::PrepVerify { + // `run_e2e_with_checkpoint` already performs the real verification for the + // complete flow. Re-running it here without the emulation exit code causes + // a false "Unfinished execution" error to be logged. 
let verifier = ZKVMVerifier::new(vk); - if target_shard_id.is_some() { - run_e2e_single_shard_debug_verify( - &verifier, - zkvm_proofs.first().cloned().expect("missing shard proof"), - None, - max_steps, - ); - } else { - run_e2e_full_trace_verify(&verifier, zkvm_proofs.clone(), None, max_steps); - } soundness_test(zkvm_proofs.first().cloned().unwrap(), &verifier); } } diff --git a/ceno_zkvm/src/instructions/gpu/chips/keccak.rs b/ceno_zkvm/src/instructions/gpu/chips/keccak.rs index 565e0dffa..4dc1bf289 100644 --- a/ceno_zkvm/src/instructions/gpu/chips/keccak.rs +++ b/ceno_zkvm/src/instructions/gpu/chips/keccak.rs @@ -348,8 +348,7 @@ fn replay_keccak_witness_only_from_packed( ) -> Result, ZKVMError> { use crate::precompiles::KECCAK_ROUNDS_CEIL_LOG2; - let num_padded_instances = num_instances.next_power_of_two().max(2); - let num_padded_rows = num_padded_instances * 32; + let num_rows = num_instances * 32; let rotation = KECCAK_ROUNDS_CEIL_LOG2; let col_map = info_span!("col_map").in_scope(|| extract_keccak_column_map(config, num_witin)); @@ -358,7 +357,7 @@ fn replay_keccak_witness_only_from_packed( .witgen_keccak( &col_map, packed_instances, - num_padded_rows, + num_rows, shard_offset, fetch_base_pc, fetch_num_slots, @@ -372,9 +371,10 @@ fn replay_keccak_witness_only_from_packed( let raw_witin = if crate::instructions::gpu::config::is_debug_compare_enabled() || !should_materialize_witness_on_gpu() { - info_span!("transpose_d2h", rows = num_padded_rows, cols = num_witin).in_scope(|| { + let produced_rows = gpu_result.witness.num_rows; + info_span!("transpose_d2h", rows = produced_rows, cols = num_witin).in_scope(|| { let mut rmm_buffer = hal - .alloc_elems_on_device(num_padded_rows * num_witin, false, None) + .alloc_elems_on_device(produced_rows * num_witin, false, None) .map_err(|e| { ZKVMError::InvalidWitness(format!("GPU alloc for transpose failed: {e}").into()) })?; @@ -382,7 +382,7 @@ fn replay_keccak_witness_only_from_packed( &hal.inner, &mut rmm_buffer, 
&gpu_result.witness.device_buffer, - num_padded_rows, + produced_rows, num_witin, ) .map_err(|e| ZKVMError::InvalidWitness(format!("GPU transpose failed: {e}").into()))?; @@ -445,8 +445,7 @@ fn gpu_assign_keccak_inner( use crate::precompiles::KECCAK_ROUNDS_CEIL_LOG2; let num_instances = step_indices.len(); - let num_padded_instances = num_instances.next_power_of_two().max(2); - let num_padded_rows = num_padded_instances * 32; // 2^5 = 32 rows per instance + let num_rows = num_instances * 32; // 2^5 = 32 rows per instance let rotation = KECCAK_ROUNDS_CEIL_LOG2; // = 5 let materialize_initial_witness = crate::instructions::gpu::config::is_debug_compare_enabled() || should_materialize_witness_on_initial_assign(); @@ -479,7 +478,7 @@ fn gpu_assign_keccak_inner( .witgen_keccak( &col_map, &packed_instances, - num_padded_rows, + num_rows, shard_ctx.current_shard_offset_cycle(), fetch_base_pc, fetch_num_slots, @@ -565,9 +564,10 @@ fn gpu_assign_keccak_inner( } else if crate::instructions::gpu::config::is_debug_compare_enabled() || !should_materialize_witness_on_gpu() { - info_span!("transpose_d2h", rows = num_padded_rows, cols = num_witin).in_scope(|| { + let produced_rows = gpu_result.witness.num_rows; + info_span!("transpose_d2h", rows = produced_rows, cols = num_witin).in_scope(|| { let mut rmm_buffer = hal - .alloc_elems_on_device(num_padded_rows * num_witin, false, None) + .alloc_elems_on_device(produced_rows * num_witin, false, None) .map_err(|e| { ZKVMError::InvalidWitness(format!("GPU alloc for transpose failed: {e}").into()) })?; @@ -575,7 +575,7 @@ fn gpu_assign_keccak_inner( &hal.inner, &mut rmm_buffer, &gpu_result.witness.device_buffer, - num_padded_rows, + produced_rows, num_witin, ) .map_err(|e| ZKVMError::InvalidWitness(format!("GPU transpose failed: {e}").into()))?; diff --git a/ceno_zkvm/src/instructions/gpu/chips/shard_ram.rs b/ceno_zkvm/src/instructions/gpu/chips/shard_ram.rs index 21f1f89a0..0813449e1 100644 --- 
a/ceno_zkvm/src/instructions/gpu/chips/shard_ram.rs +++ b/ceno_zkvm/src/instructions/gpu/chips/shard_ram.rs @@ -439,11 +439,11 @@ pub(crate) fn try_gpu_assign_shard_ram( { let struct_data = tracing::info_span!( "gpu_shard_ram_structural_transpose_d2h", - num_rows_padded, + rows = gpu_structural.num_rows, num_structural_witin, ) .in_scope(|| -> Result<_, ZKVMError> { - let wit_num_rows = num_rows_padded; + let wit_num_rows = gpu_structural.num_rows; let struct_num_cols = num_structural_witin; let mut struct_rmm_buf = hal .witgen @@ -684,11 +684,11 @@ pub(crate) fn try_gpu_assign_shard_ram_from_device( { let struct_data = tracing::info_span!( "gpu_shard_ram_structural_transpose_d2h_from_device", - num_rows_padded, + rows = gpu_structural.num_rows, num_structural_witin, ) .in_scope(|| -> Result<_, ZKVMError> { - let wit_num_rows = num_rows_padded; + let wit_num_rows = gpu_structural.num_rows; let struct_num_cols = num_structural_witin; let mut struct_rmm_buf = hal .witgen diff --git a/ceno_zkvm/src/instructions/gpu/dispatch.rs b/ceno_zkvm/src/instructions/gpu/dispatch.rs index be51108c4..f7cf58969 100644 --- a/ceno_zkvm/src/instructions/gpu/dispatch.rs +++ b/ceno_zkvm/src/instructions/gpu/dispatch.rs @@ -481,7 +481,7 @@ fn gpu_assign_instances_inner>( total_instances, num_witin, I::padding_strategy(), - ) + )? }; if materialize_initial_witness { raw_witin.padding_by_strategy(); @@ -1484,7 +1484,7 @@ fn replay_gpu_witness_from_resident_raw>( total_instances, replay.num_witin, I::padding_strategy(), - ); + )?; // Keep replayed witness immutable after attaching the col-major device backing. 
// Mutating/padding a RowMajorMatrix clears device metadata, but replay consumers diff --git a/ceno_zkvm/src/instructions/gpu/utils/d2h.rs b/ceno_zkvm/src/instructions/gpu/utils/d2h.rs index fc558046d..5647cef12 100644 --- a/ceno_zkvm/src/instructions/gpu/utils/d2h.rs +++ b/ceno_zkvm/src/instructions/gpu/utils/d2h.rs @@ -303,9 +303,9 @@ pub(crate) fn gpu_witness_to_rmm( num_rows: usize, num_cols: usize, padding: InstancePaddingStrategy, -) -> RowMajorMatrix { +) -> Result, ZKVMError> { let mut rmm = RowMajorMatrix::::new(num_rows, num_cols, padding); // Keep the original col-major witness buffer as the source of truth for GPU commit. rmm.set_device_backing(gpu_result.device_buffer, DeviceMatrixLayout::ColMajor); - rmm + Ok(rmm) } diff --git a/ceno_zkvm/src/scheme/gpu/memory.rs b/ceno_zkvm/src/scheme/gpu/memory.rs index 6421ad3c9..d4434509c 100644 --- a/ceno_zkvm/src/scheme/gpu/memory.rs +++ b/ceno_zkvm/src/scheme/gpu/memory.rs @@ -110,12 +110,14 @@ pub fn estimate_chip_proof_memory(replay_plan: &GpuReplayPlan) -> usize { + match replay_plan.kind { + GpuWitgenKind::Keccak => replay_plan + .keccak_instances + .as_ref() + .map(|instances| instances.len() * 32) + .unwrap_or(replay_plan.trace_height), + GpuWitgenKind::ShardRam => replay_plan.trace_height, + _ => replay_plan.step_indices.len(), + } +} + +fn replay_plan_actual_structural_rows(replay_plan: &GpuReplayPlan) -> usize { + match replay_plan.kind { + GpuWitgenKind::ShardRam => replay_plan.shard_ram_num_records, + _ => replay_plan.trace_height, + } +} + pub fn estimate_replay_materialization_bytes( num_witin: usize, _num_structural_witin: usize, @@ -273,7 +294,7 @@ pub fn estimate_replay_materialization_bytes_for_plan( _num_vars: usize, ) -> usize { let elem_size = std::mem::size_of::(); - let witness_bytes = replay_plan.trace_height * replay_plan.num_witin * elem_size; + let witness_bytes = replay_plan_actual_rows(replay_plan) * replay_plan.num_witin * elem_size; let replay_temp_bytes = match replay_plan.kind { 
GpuWitgenKind::Keccak => replay_plan .keccak_instances @@ -299,6 +320,7 @@ pub fn estimate_replay_materialization_bytes_for_plan( pub(crate) fn estimate_trace_bytes>( composed_cs: &ComposedConstrainSystem, input: &ProofInput<'_, GpuBackend>, + replay_plan: Option<&GpuReplayPlan>, witness_replayable: bool, structural_cached_on_device: bool, ) -> TraceEstimate { @@ -308,14 +330,36 @@ pub(crate) fn estimate_trace_bytes() + }) + .unwrap_or_else(|| { + estimate_structural_mle_bytes( + cs.num_structural_witin as usize, + num_var_with_rotation, + ) + }) } else { estimate_structural_mle_bytes(cs.num_structural_witin as usize, num_var_with_rotation) }; - let (witness_mle_bytes, trace_temporary_bytes) = estimate_trace_extraction_bytes( - cs.num_witin as usize, - num_var_with_rotation, - witness_replayable, - ); + let (witness_mle_bytes, trace_temporary_bytes) = + if should_materialize_witness_on_gpu() && witness_replayable { + let base_elem_size = std::mem::size_of::(); + let actual_rows = replay_plan + .map(replay_plan_actual_rows) + .unwrap_or(1usize << num_var_with_rotation); + (cs.num_witin as usize * actual_rows * base_elem_size, 0) + } else { + estimate_trace_extraction_bytes( + cs.num_witin as usize, + num_var_with_rotation, + witness_replayable, + ) + }; TraceEstimate { trace_resident_bytes: witness_mle_bytes + structural_mle_bytes, @@ -325,11 +369,10 @@ pub(crate) fn estimate_trace_bytes( composed_cs: &ComposedConstrainSystem, - num_var_with_rotation: usize, + occupied_rows: usize, ) -> usize { let elem_size = std::mem::size_of::(); - let record_len = 1usize << num_var_with_rotation; - tower_output_count(composed_cs) * record_len * elem_size + tower_output_count(composed_cs) * occupied_rows * elem_size } pub(crate) fn estimate_main_constraints_bytes< @@ -426,6 +469,23 @@ fn estimate_tower_stage_components 0 { + num_prod_towers * (1 << (num_vars + 1)) * elem_size + } else { + 0 + }; + let logup_split_bytes = if num_logup_towers > 0 { + let denominator_bytes = 
num_logup_towers * (1 << (num_vars + 1)) * elem_size; + let numerator_bytes = if has_logup_numerator { + denominator_bytes + } else { + 0 + }; + denominator_bytes + numerator_bytes + } else { + 0 + }; + let build_bytes = build_est.total_bytes + prod_split_bytes + logup_split_bytes; let prove_est = estimate_prove_tower_memory( num_prod_towers, num_logup_towers, @@ -439,11 +499,7 @@ fn estimate_tower_stage_components( + mles: impl IntoIterator>, +) -> Vec> { + let cuda_hal = gkr_iop::gpu::get_cuda_hal().expect("Failed to get CUDA HAL"); + let stream = gkr_iop::gpu::get_thread_stream(); + mles.into_iter() + .map(|mle| { + let mle_ref = mle.as_ref(); + let full_len = 1usize << mle_ref.num_vars(); + if mle_ref.evaluations_len() == full_len { + return mle; + } + let padded: gkr_iop::gpu::MultilinearExtensionGpu<'static, E> = match mle_ref.inner() { + gkr_iop::gpu::GpuFieldType::Base(poly) => { + let mut host = poly.to_cpu_vec(stream.as_ref()); + host.resize(full_len, BB31Base::ZERO); + unsafe { + std::mem::transmute( + gkr_iop::gpu::MultilinearExtensionGpu::::from_ceno_gpu_base( + ceno_gpu::bb31::GpuPolynomial::from_ceno_vec( + &cuda_hal, + &host, + mle_ref.num_vars(), + stream.as_ref(), + ) + .expect("pad base mle"), + ), + ) + } + } + gkr_iop::gpu::GpuFieldType::Ext(poly) => { + let mut host = poly.to_cpu_vec(stream.as_ref()); + host.resize(full_len, BB31Ext::ZERO); + unsafe { + std::mem::transmute( + gkr_iop::gpu::MultilinearExtensionGpu::::from_ceno_gpu_ext( + ceno_gpu::bb31::GpuPolynomialExt::from_ceno_vec( + &cuda_hal, + &host, + mle_ref.num_vars(), + stream.as_ref(), + ) + .expect("pad ext mle"), + ), + ) + } + } + gkr_iop::gpu::GpuFieldType::Unreachable => unreachable!(), + }; + Arc::new(padded) + }) + .collect() +} mod util; pub(crate) use memory::{ check_gpu_mem_estimation, estimate_chip_proof_memory, estimate_main_witness_bytes, @@ -101,6 +155,30 @@ struct PcsResidentStats { total_rmms: usize, } +fn rmm_device_backing_bytes(rmm: &witness::RowMajorMatrix) -> 
usize +where + T: FieldAlgebra + Default + Sync + Clone + Send + Copy + 'static, +{ + rmm.device_backing_ref::>() + .map(|device_buffer| device_buffer.len() * std::mem::size_of::()) + .unwrap_or(0) +} + +fn rmm_col_major_device_rows(rmm: &witness::RowMajorMatrix) -> Option +where + T: FieldAlgebra + Default + Sync + Clone + Send + Copy + 'static, +{ + if rmm.device_backing_layout() != Some(DeviceMatrixLayout::ColMajor) { + return None; + } + let cols = rmm.width(); + if cols == 0 { + return Some(0); + } + let device_buffer = rmm.device_backing_ref::>()?; + Some(device_buffer.len() / cols) +} + fn pcs_resident_stats( pcs_data_basefold: &BasefoldCommitmentWithWitnessGpu< BB31Base, @@ -141,7 +219,7 @@ fn pcs_resident_stats( ( rmms.iter() .filter(|rmm| rmm.has_device_backing()) - .map(|rmm| rmm.height() * rmm.width() * std::mem::size_of::()) + .map(rmm_device_backing_bytes) .sum::(), rmms.iter().filter(|rmm| rmm.has_device_backing()).count(), rmms.len(), @@ -331,24 +409,12 @@ pub fn prove_tower_relation_impl> = Vec::new(); - let mut _ones_buffer: Vec> = Vec::new(); - let mut _view_last_layers: Vec>>> = Vec::new(); let (prod_gpu, logup_gpu) = info_span!("[ceno] build_tower_witness_gpu").in_scope(|| { - build_tower_witness_gpu( - composed_cs, - input, - records, - challenges, - cuda_hal, - &mut _big_buffers, - &mut _ones_buffer, - &mut _view_last_layers, - ) - .map_err(|e| format!("build_tower_witness_gpu failed: {}", e)) - .unwrap() + build_tower_witness_gpu(composed_cs, input, records, challenges, cuda_hal) + .map_err(|e| format!("build_tower_witness_gpu failed: {}", e)) + .unwrap() }); exit_span!(span); @@ -473,11 +539,12 @@ pub fn prove_rotation_impl let log2_num_instances = input.log2_num_instances(); let num_threads = optimal_sumcheck_threads(log2_num_instances); let num_var_with_rotation = log2_num_instances + composed_cs.rotation_vars().unwrap_or(0); - let wit = LayerWitness( + let padded_wit_storage = pad_gpu_mles_to_full_domain( chain!(&input.witness, 
&input.fixed, &input.structural_witness) .cloned() - .collect_vec(), + .map(|mle| unsafe { std::mem::transmute(mle) }), ); + let wit = LayerWitness(padded_wit_storage); let (proof, points) = gkr_iop::gkr::layer::gpu::prove_rotation_gpu::( num_threads, @@ -691,11 +758,11 @@ pub fn prove_main_constraints_impl< num_threads, num_var_with_rotation, gkr::GKRCircuitWitness { - layers: vec![LayerWitness( + layers: vec![LayerWitness(pad_gpu_mles_to_full_domain( chain!(&input.witness, &input.fixed, &input.structural_witness,) .cloned() - .collect_vec(), - )], + .map(|mle| unsafe { std::mem::transmute(mle) }), + ))], }, &out_evals, &input @@ -1367,7 +1434,8 @@ where let device_buffer = witness_rmm .device_backing_ref::>() .unwrap_or_else(|| panic!("col-major replay witness device backing type mismatch")); - let rows = witness_rmm.height(); + let rows = rmm_col_major_device_rows(&witness_rmm) + .unwrap_or_else(|| panic!("col-major replay witness device backing row count mismatch")); let cols = witness_rmm.width(); let poly_len_bytes = rows * std::mem::size_of::(); @@ -1380,14 +1448,11 @@ where (0..cols) .map(|col_idx| { let src_byte_offset = col_idx * poly_len_bytes; - // Keep an owned handle to the parent GPU allocation instead of a - // borrowed CudaView. The resulting MLE outlives this helper. 
let view_buf = device_buffer.owned_subrange(src_byte_offset..src_byte_offset + poly_len_bytes); - let view_poly = GpuPolynomial::new(view_buf, rows.trailing_zeros() as usize); - let view_poly_static: GpuPolynomial<'static> = - unsafe { std::mem::transmute(view_poly) }; - let mle_static = MultilinearExtensionGpu::from_ceno_gpu_base(view_poly_static); + let view_poly = GpuPolynomial::new(view_buf, witness_rmm.num_vars()); + let poly_static: GpuPolynomial<'static> = unsafe { std::mem::transmute(view_poly) }; + let mle_static = MultilinearExtensionGpu::from_ceno_gpu_base(poly_static); Arc::new(unsafe { std::mem::transmute::< MultilinearExtensionGpu<'static, E>, @@ -1421,7 +1486,7 @@ pub fn clear_replayable_trace_device_backing( let before_device_bytes = rmms .iter() .filter(|rmm| rmm.has_device_backing()) - .map(|rmm| rmm.height() * rmm.width() * std::mem::size_of::()) + .map(rmm_device_backing_bytes) .sum::(); for (trace_idx, _) in replayable_traces { @@ -1432,7 +1497,7 @@ pub fn clear_replayable_trace_device_backing( let after_device_bytes = rmms .iter() .filter(|rmm| rmm.has_device_backing()) - .map(|rmm| rmm.height() * rmm.width() * std::mem::size_of::()) + .map(rmm_device_backing_bytes) .sum::(); tracing::info!( "[gpu] cleared replayable PCS RMM device backing: replayable_traces={}, rmms_device_before={:.2}MB ({}) -> after={:.2}MB ({})", @@ -1508,7 +1573,8 @@ where let device_buffer = structural_rmm .device_backing_ref::>() .unwrap_or_else(|| panic!("col-major structural device backing type mismatch")); - let rows = structural_rmm.height(); + let rows = rmm_col_major_device_rows(structural_rmm) + .unwrap_or_else(|| panic!("col-major structural device backing row count mismatch")); let cols = structural_rmm.width(); let poly_len_bytes = rows * std::mem::size_of::(); let total_bytes = cols * poly_len_bytes; @@ -1517,8 +1583,7 @@ where total_bytes, "structural col-major buffer size mismatch" ); - let num_vars_in_poly = rows.trailing_zeros() as usize; - 
assert_eq!(rows, 1usize << num_vars_in_poly); + let num_vars_in_poly = structural_rmm.num_vars(); (0..cols) .map(|col_idx| { @@ -1559,25 +1624,22 @@ where } #[allow(clippy::too_many_arguments)] -pub(crate) fn build_tower_witness_gpu<'buf, E: ExtensionField>( +pub(crate) fn build_tower_witness_gpu( composed_cs: &ComposedConstrainSystem, input: &ProofInput<'_, GpuBackend>>, records: &[ArcMultilinearExtensionGpu<'_, E>], challenges: &[E; 2], cuda_hal: &CudaHalBB31, - big_buffers: &'buf mut Vec>, - ones_buffer: &mut Vec>, - view_last_layers: &mut Vec>>>, ) -> Result< ( - Vec>, - Vec>, + Vec>, + Vec>, ), String, > { let stream = gkr_iop::gpu::get_thread_stream(); use crate::scheme::constants::{NUM_FANIN, NUM_FANIN_LOGUP}; - use ceno_gpu::{CudaHal as _, bb31::GpuPolynomialExt}; + use ceno_gpu::bb31::GpuPolynomialExt; use p3::field::FieldAlgebra; let ComposedConstrainSystem { @@ -1585,7 +1647,7 @@ pub(crate) fn build_tower_witness_gpu<'buf, E: ExtensionField>( } = composed_cs; let _num_instances_with_rotation = input.num_instances() << composed_cs.rotation_vars().unwrap_or(0); - let _chip_record_alpha = challenges[0]; + let chip_record_alpha: BB31Ext = unsafe { std::mem::transmute_copy(&challenges[0]) }; // SAFETY: The `records` slice is borrowed for the duration of this function call. // The lifetime is erased to 'static only to satisfy GPU API signatures that require @@ -1616,46 +1678,62 @@ pub(crate) fn build_tower_witness_gpu<'buf, E: ExtensionField>( &records[offset..][..cs.lk_expressions.len()] }; - assert_eq!(big_buffers.len(), 0, "expect no big buffers"); - - // prod: last layes & buffer - let mut is_prod_buffer_exists = false; + // prod: split last layer once, then build compact tower layers. 
let prod_last_layers = r_set_wit .iter() .chain(w_set_wit.iter()) - .map(|wit| wit.as_view_chunks(NUM_FANIN)) - .collect::>(); + .map(|wit| match wit.inner() { + gkr_iop::gpu::GpuFieldType::Ext(poly) => cuda_hal + .tower + .masked_mle_split_to_chunks( + &*cuda_hal, + poly, + NUM_FANIN, + BB31Ext::ONE, + stream.as_ref(), + ) + .map_err(|e| format!("Failed to split compact prod tower input: {e}")), + _ => return Err("tower witness expects extension-field record MLEs".to_string()), + }) + .collect::, String>>()?; if !prod_last_layers.is_empty() { let first_layer = &prod_last_layers[0]; assert_eq!(first_layer.len(), 2, "prod last_layer must have 2 MLEs"); - let num_vars = first_layer[0].num_vars(); - let num_towers = prod_last_layers.len(); - view_last_layers.push(prod_last_layers); - - // Allocate one big buffer for all product towers and add it to big_buffers - let tower_size = 1 << (num_vars + 1); // 2 * mle_len elements per tower - let total_buffer_size = num_towers * tower_size; - tracing::debug!( - "prod tower request buffer size: {:.2} MB", - (total_buffer_size * std::mem::size_of::()) as f64 / (1024.0 * 1024.0) - ); - let big_buffer = cuda_hal - .alloc_ext_elems_on_device(total_buffer_size, false, stream.as_ref()) - .map_err(|e| format!("Failed to allocate prod GPU buffer: {:?}", e))?; - big_buffers.push(big_buffer); - is_prod_buffer_exists = true; } - // logup: last layes - let mut is_logup_buffer_exists = false; + // logup: split last layer once, then build compact tower layers. 
let lk_numerator_last_layer = lk_n_wit .iter() - .map(|wit| wit.as_view_chunks(NUM_FANIN_LOGUP)) - .collect::>(); + .map(|wit| match wit.inner() { + gkr_iop::gpu::GpuFieldType::Ext(poly) => cuda_hal + .tower + .masked_mle_split_to_chunks( + &*cuda_hal, + poly, + NUM_FANIN_LOGUP, + chip_record_alpha, + stream.as_ref(), + ) + .map_err(|e| format!("Failed to split compact logup numerator: {e}")), + _ => Err("tower witness expects extension-field logup numerator MLEs".to_string()), + }) + .collect::, String>>()?; let lk_denominator_last_layer = lk_d_wit .iter() - .map(|wit| wit.as_view_chunks(NUM_FANIN_LOGUP)) - .collect::>(); + .map(|wit| match wit.inner() { + gkr_iop::gpu::GpuFieldType::Ext(poly) => cuda_hal + .tower + .masked_mle_split_to_chunks( + &*cuda_hal, + poly, + NUM_FANIN_LOGUP, + chip_record_alpha, + stream.as_ref(), + ) + .map_err(|e| format!("Failed to split compact logup denominator: {e}")), + _ => Err("tower witness expects extension-field logup denominator MLEs".to_string()), + }) + .collect::, String>>()?; let logup_last_layers = if !lk_numerator_last_layer.is_empty() { // Case when we have both numerator and denominator // Combine [p1, p2] from numerator and [q1, q2] from denominator @@ -1665,100 +1743,47 @@ pub(crate) fn build_tower_witness_gpu<'buf, E: ExtensionField>( .map(|(lk_n_chunks, lk_d_chunks)| { let mut last_layer = lk_n_chunks; last_layer.extend(lk_d_chunks); - last_layer + Ok(last_layer) }) - .collect::>() + .collect::, String>>()? } else if lk_denominator_last_layer.is_empty() { vec![] } else { - // Case when numerator is empty - create shared ones_buffer and use views - // This saves memory by having all p1, p2 polynomials reference the same buffer + // Case when numerator is empty: share one owned ones buffer across all p1/p2 polynomials. 
let nv = lk_denominator_last_layer[0][0].num_vars(); - // Create one shared ones_buffer as Owned (can be 'static) let ones_poly = GpuPolynomialExt::new_with_scalar(&cuda_hal.inner, nv, BB31Ext::ONE, stream.as_ref()) .map_err(|e| format!("Failed to create shared ones_buffer: {:?}", e)) .unwrap(); - // SAFETY: Owned buffer can be safely treated as 'static - let ones_poly_static: GpuPolynomialExt<'static> = unsafe { std::mem::transmute(ones_poly) }; - ones_buffer.push(ones_poly_static); - - // Get reference from storage to ensure proper lifetime - let ones_poly_ref = ones_buffer.last().unwrap(); - let mle_len_bytes = ones_poly_ref.evaluations().len() * std::mem::size_of::(); + let ones_poly: GpuPolynomialExt<'static> = unsafe { std::mem::transmute(ones_poly) }; + let mle_len_bytes = ones_poly.buf.len() * std::mem::size_of::(); - // Create views referencing the shared ones_buffer for each tower's p1, p2 lk_denominator_last_layer .into_iter() .map(|lk_d_chunks| { - // Create views of ones_buffer for p1 and p2 - let p1_view = ones_poly_ref.evaluations().as_slice_range(0..mle_len_bytes); - let p2_view = ones_poly_ref.evaluations().as_slice_range(0..mle_len_bytes); - let p1_gpu = GpuPolynomialExt::new(BufferImpl::new_from_view(p1_view), nv); - let p2_gpu = GpuPolynomialExt::new(BufferImpl::new_from_view(p2_view), nv); - // SAFETY: views from 'static buffer can be 'static - let p1_gpu: GpuPolynomialExt<'static> = unsafe { std::mem::transmute(p1_gpu) }; - let p2_gpu: GpuPolynomialExt<'static> = unsafe { std::mem::transmute(p2_gpu) }; - // Use [p1, p2, q1, q2] format for the last layer + let p1_gpu = + GpuPolynomialExt::new(ones_poly.buf.owned_subrange(0..mle_len_bytes), nv); + let p2_gpu = + GpuPolynomialExt::new(ones_poly.buf.owned_subrange(0..mle_len_bytes), nv); let mut last_layer = vec![p1_gpu, p2_gpu]; last_layer.extend(lk_d_chunks); - last_layer + Ok(last_layer) }) - .collect::>() + .collect::, String>>()? 
}; if !logup_last_layers.is_empty() { let first_layer = &logup_last_layers[0]; assert_eq!(first_layer.len(), 4, "logup last_layer must have 4 MLEs"); - let num_vars = first_layer[0].num_vars(); - let num_towers = logup_last_layers.len(); - view_last_layers.push(logup_last_layers); - - // Allocate one big buffer for all towers and add it to big_buffers - let tower_size = 1 << (num_vars + 2); // 4 * mle_len elements per tower - let total_buffer_size = num_towers * tower_size; - tracing::debug!( - "logup tower request buffer size: {:.2} MB", - (total_buffer_size * std::mem::size_of::()) as f64 / (1024.0 * 1024.0) - ); - let big_buffer = cuda_hal - .alloc_ext_elems_on_device(total_buffer_size, false, stream.as_ref()) - .unwrap(); - big_buffers.push(big_buffer); - is_logup_buffer_exists = true; } - let (_, pushed_big_buffers) = big_buffers.split_at_mut(0); - let (prod_big_buffer, logup_big_buffer) = match ( - is_prod_buffer_exists, - is_logup_buffer_exists, - pushed_big_buffers, - ) { - (false, false, []) => (None, None), - (true, false, [prod]) => (Some(prod), None), - (false, true, [logup]) => (None, Some(logup)), - (true, true, [prod, logup]) => (Some(prod), Some(logup)), - (prod_flag, logup_flag, slice) => { - panic!( - "unexpected state: prod={}, logup={}, newly_pushed_len={}", - prod_flag, - logup_flag, - slice.len() - ); - } - }; - // Build product GpuProverSpecs let mut prod_gpu_specs = Vec::new(); - if is_prod_buffer_exists { - let prod_last_layers = &view_last_layers[0]; + if !prod_last_layers.is_empty() { let first_layer = &prod_last_layers[0]; assert_eq!(first_layer.len(), 2, "prod last_layer must have 2 MLEs"); let num_vars = first_layer[0].num_vars(); let num_towers = prod_last_layers.len(); - let Some(prod_big_buffer) = prod_big_buffer else { - panic!("prod big buffer not found"); - }; let span_prod = entered_span!( "build_prod_tower", @@ -1770,7 +1795,6 @@ pub(crate) fn build_tower_witness_gpu<'buf, E: ExtensionField>( let gpu_specs = { 
cuda_hal.tower.build_prod_tower_from_gpu_polys_batch( cuda_hal, - prod_big_buffer, &last_layers_refs, num_vars, num_towers, @@ -1784,15 +1808,11 @@ pub(crate) fn build_tower_witness_gpu<'buf, E: ExtensionField>( // Build logup GpuProverSpecs let mut logup_gpu_specs = Vec::new(); - if is_logup_buffer_exists { - let logup_last_layers = view_last_layers.last().unwrap(); + if !logup_last_layers.is_empty() { let first_layer = &logup_last_layers[0]; assert_eq!(first_layer.len(), 4, "logup last_layer must have 4 MLEs"); let num_vars = first_layer[0].num_vars(); let num_towers = logup_last_layers.len(); - let Some(logup_big_buffer) = logup_big_buffer else { - panic!("logup big buffer not found"); - }; let span_logup = entered_span!( "build_logup_tower", @@ -1805,14 +1825,12 @@ pub(crate) fn build_tower_witness_gpu<'buf, E: ExtensionField>( .tower .build_logup_tower_from_gpu_polys_batch( cuda_hal, - logup_big_buffer, &last_layers_refs, num_vars, num_towers, stream.as_ref(), ) .map_err(|e| format!("build_logup_tower_from_gpu_polys_batch failed: {:?}", e))?; - logup_gpu_specs.extend(gpu_specs); exit_span!(span_logup); } @@ -2005,7 +2023,7 @@ impl> OpeningProver> task.circuit_name, estimated_replay_bytes as f64 / (1024.0 * 1024.0), ); - let witness_rmm = replay_plan.replay_witness().expect("GPU raw replay failed"); - check_gpu_mem_estimation(gpu_mem_tracker, estimated_replay_bytes); - task.input.witness = info_span!("[ceno] replay_gpu_witness_from_raw") - .in_scope(|| extract_witness_mles_for_trace_rmm::(witness_rmm)); + task.input.witness = if let Some(trace_idx) = task.witness_trace_idx { + check_gpu_mem_estimation(gpu_mem_tracker, 0); + info_span!("[ceno] extract_witness_mles").in_scope(|| { + extract_witness_mles_for_trace::( + pcs_data, + trace_idx, + task.num_witin, + num_vars, + ) + }) + } else { + let witness_rmm = replay_plan.replay_witness().expect("GPU raw replay failed"); + check_gpu_mem_estimation(gpu_mem_tracker, estimated_replay_bytes); + info_span!("[ceno] 
replay_gpu_witness_from_raw") + .in_scope(|| extract_witness_mles_for_trace_rmm::(witness_rmm)) + }; if let Some(rmm) = task.structural_rmm.as_ref() { task.input.structural_witness = info_span!("[ceno] transport_structural_witness") .in_scope(|| { diff --git a/ceno_zkvm/src/scheme/prover.rs b/ceno_zkvm/src/scheme/prover.rs index 516dee741..45f786c6f 100644 --- a/ceno_zkvm/src/scheme/prover.rs +++ b/ceno_zkvm/src/scheme/prover.rs @@ -1264,22 +1264,12 @@ where ); let tower_build_mem_tracker = crate::scheme::gpu::init_gpu_mem_tracker(&cuda_hal, "build_tower_witness_gpu"); - let mut big_buffers = Vec::new(); - let mut ones_buffer = Vec::new(); - let mut view_last_layers = Vec::new(); log_gpu_device_state(&format!("{name}:before_build_tower_witness")); log_gpu_pool_usage(&format!("{name}:before_build_tower_witness")); let (prod_gpu, logup_gpu, lk_out_evals, w_out_evals, r_out_evals) = info_span!("[ceno] build_tower_witness_gpu").in_scope(|| { let (prod_gpu, logup_gpu) = build_tower_witness_gpu( - cs, - &input, - &records, - challenges, - &cuda_hal, - &mut big_buffers, - &mut ones_buffer, - &mut view_last_layers, + cs, &input, &records, challenges, &cuda_hal, ) .map_err(|e| { ZKVMError::InvalidWitness(format!("build_tower_witness_gpu failed: {e}").into()) @@ -1332,9 +1322,6 @@ where check_gpu_mem_estimation(tower_prove_mem_tracker, tower_prove_estimated_bytes); drop(records); drop(tower_input); - drop(big_buffers); - drop(ones_buffer); - drop(view_last_layers); log_gpu_device_state(&format!("{name}:after_drop_tower")); exit_span!(span); diff --git a/ceno_zkvm/src/scheme/utils.rs b/ceno_zkvm/src/scheme/utils.rs index ead260f7d..a52925b9a 100644 --- a/ceno_zkvm/src/scheme/utils.rs +++ b/ceno_zkvm/src/scheme/utils.rs @@ -680,7 +680,10 @@ pub fn build_main_witness< .iter() .chain(&input.structural_witness) .chain(&input.fixed) - .all(|v| { v.evaluations_len() == 1 << num_var_with_rotation }) + .all(|v| { + v.num_vars() == num_var_with_rotation + && v.evaluations_len() <= (1 
<< num_var_with_rotation) + }) ); // GPU memory estimation @@ -704,8 +707,9 @@ pub fn build_main_witness< // GPU memory check: validate estimation against actual usage #[cfg(feature = "gpu")] { + let occupied_rows = input.num_instances() << composed_cs.rotation_vars().unwrap_or(0); let estimated_bytes = - crate::scheme::gpu::estimate_main_witness_bytes(composed_cs, num_var_with_rotation); + crate::scheme::gpu::estimate_main_witness_bytes(composed_cs, occupied_rows); crate::scheme::gpu::check_gpu_mem_estimation(gpu_mem_tracker, estimated_bytes); } diff --git a/ceno_zkvm/src/scheme/verifier.rs b/ceno_zkvm/src/scheme/verifier.rs index dbbae6326..3738c11ec 100644 --- a/ceno_zkvm/src/scheme/verifier.rs +++ b/ceno_zkvm/src/scheme/verifier.rs @@ -516,6 +516,14 @@ impl> .into(), )); }; + if q1 == E::ZERO || q2 == E::ZERO { + return Err(ZKVMError::InvalidProof( + format!( + "{shard_id}th shard {circuit_name} has zero logup denominator in lk_out_evals: {evals:?}" + ) + .into(), + )); + } Ok(p1 * q1.inverse() + p2 * q2.inverse()) }) .sum::>()?; diff --git a/gkr_iop/src/gkr/layer/gpu/utils.rs b/gkr_iop/src/gkr/layer/gpu/utils.rs index e67153cc2..3e185da47 100644 --- a/gkr_iop/src/gkr/layer/gpu/utils.rs +++ b/gkr_iop/src/gkr/layer/gpu/utils.rs @@ -251,8 +251,9 @@ pub fn build_rotation_mles_gpu panic!("should be base field"), _ => panic!("unimplemented input mle"), }; + let logical_len = 1usize << input_mle.mle.num_vars(); let mut output_buf = cuda_hal - .alloc_elems_on_device(input_buf.len(), false, stream.as_ref()) + .alloc_elems_on_device(logical_len, false, stream.as_ref()) .unwrap(); // Safety: GPU buffers are actually 'static lifetime. 
We only read from input_buf @@ -294,8 +295,8 @@ pub fn build_rotation_selector_gpu MultilinearExtensionGpu<'static, E> { let stream = crate::gpu::get_thread_stream(); - let total_len = wit[0].evaluations_len(); // Take first mle just to retrieve total length - assert!(total_len.is_power_of_two()); + let num_vars = wit[0].num_vars(); + let total_len = 1usize << num_vars; let mut output_buf = cuda_hal .alloc_ext_elems_on_device(total_len, false, stream.as_ref()) .unwrap(); @@ -322,10 +323,8 @@ pub fn build_rotation_selector_gpu, diff --git a/gkr_iop/src/gpu/mod.rs b/gkr_iop/src/gpu/mod.rs index 54ae3744a..62d19ca85 100644 --- a/gkr_iop/src/gpu/mod.rs +++ b/gkr_iop/src/gpu/mod.rs @@ -222,7 +222,7 @@ impl<'a, E: ExtensionField> MultilinearExtensionGpu<'a, E> { let cpu_evaluations = poly.to_cpu_vec(stream.as_ref()); let cpu_evaluations_base: Vec = unsafe { std::mem::transmute(cpu_evaluations) }; - MultilinearExtension::from_evaluations_vec( + MultilinearExtension::from_evaluations_vec_compact( self.mle.num_vars(), cpu_evaluations_base, ) @@ -230,7 +230,7 @@ impl<'a, E: ExtensionField> MultilinearExtensionGpu<'a, E> { GpuFieldType::Ext(poly) => { let cpu_evaluations = poly.to_cpu_vec(stream.as_ref()); let cpu_evaluations_ext: Vec = unsafe { std::mem::transmute(cpu_evaluations) }; - MultilinearExtension::from_evaluations_ext_vec( + MultilinearExtension::from_evaluations_ext_vec_compact( self.mle.num_vars(), cpu_evaluations_ext, ) @@ -506,13 +506,23 @@ impl> let all_witins_gpu_gl64: Vec<&MultilinearExtensionGpu> = unsafe { std::mem::transmute(all_witins_gpu) }; let all_witins_gpu_type_gl64 = all_witins_gpu_gl64.iter().map(|mle| &mle.mle).collect_vec(); + // Match the CPU witness inference path: layer outputs are materialized over + // the occupied prefix of the layer witness domain, not the maximum length of + // any referenced structural/fixed MLE. 
+ let output_len = all_witins_gpu_gl64 + .first() + .map(|mle| mle.evaluations_len()) + .unwrap_or(0); + let output_lengths = + std::iter::repeat_n(output_len, mle_indices_per_term.len()).collect_vec(); // buffer for output witness from gpu let cuda_hal = get_cuda_hal().unwrap(); - let mut next_witness_buf = (0..num_non_zero_expr) - .map(|_| { + let mut next_witness_buf = output_lengths + .iter() + .map(|&output_len| { cuda_hal - .alloc_ext_elems_on_device(1 << num_vars, false, stream.as_ref()) + .alloc_ext_elems_on_device(output_len, false, stream.as_ref()) .map_err(|e| format!("Failed to allocate prod GPU buffer: {:?}", e)) }) .collect::, _>>() diff --git a/gkr_iop/src/utils.rs b/gkr_iop/src/utils.rs index e1c8d7453..f133ae126 100644 --- a/gkr_iop/src/utils.rs +++ b/gkr_iop/src/utils.rs @@ -5,7 +5,6 @@ use itertools::Itertools; use multilinear_extensions::{ Fixed, WitIn, WitnessId, mle::{ArcMultilinearExtension, MultilinearExtension}, - util::ceil_log2, virtual_poly::{build_eq_x_r_vec, eq_eval}, }; use p3::field::FieldAlgebra; @@ -49,7 +48,7 @@ pub fn rotation_next_base_mle<'a, E: ExtensionField>( rotate_chunk[to] = original_chunk[from]; } }); - MultilinearExtension::from_evaluation_vec_smart(mle.num_vars(), rotated_mle_evals) + MultilinearExtension::from_evaluation_vec_smart_compact(mle.num_vars(), rotated_mle_evals) } pub fn rotation_selector<'a, E: ExtensionField>( @@ -59,7 +58,6 @@ pub fn rotation_selector<'a, E: ExtensionField>( cyclic_group_log2_size: usize, total_len: usize, ) -> MultilinearExtension<'a, E> { - assert!(total_len.is_power_of_two()); let cyclic_group_size = 1 << cyclic_group_log2_size; assert!(cyclic_subgroup_size <= cyclic_group_size); let rotation_index = bh.into_iter().take(cyclic_subgroup_size).collect_vec(); @@ -74,7 +72,10 @@ pub fn rotation_selector<'a, E: ExtensionField>( rotate_chunk[to] = eq_chunk[to]; } }); - MultilinearExtension::from_evaluation_vec_smart(ceil_log2(total_len), rotated_mle_evals) + 
MultilinearExtension::from_evaluation_vec_smart_compact( + eq.len().ilog2() as usize, + rotated_mle_evals, + ) } /// sel(rx) From 84a2631f8bfb446a50b0db3c75e2ef83dd00118d Mon Sep 17 00:00:00 2001 From: "sm.wu" Date: Sun, 26 Apr 2026 10:30:28 +0800 Subject: [PATCH 02/25] Fix compact tower memory accounting --- ceno_zkvm/src/scheme/gpu/memory.rs | 34 +++- ceno_zkvm/src/scheme/gpu/mod.rs | 34 ++-- summary.md | 288 +++++++++++++++++++++++++++++ 3 files changed, 335 insertions(+), 21 deletions(-) create mode 100644 summary.md diff --git a/ceno_zkvm/src/scheme/gpu/memory.rs b/ceno_zkvm/src/scheme/gpu/memory.rs index d4434509c..679c9e8c6 100644 --- a/ceno_zkvm/src/scheme/gpu/memory.rs +++ b/ceno_zkvm/src/scheme/gpu/memory.rs @@ -1,7 +1,7 @@ use crate::{ instructions::gpu::dispatch::GpuWitgenKind, scheme::{ - constants::{NUM_FANIN, SEPTIC_EXTENSION_DEGREE}, + constants::{NUM_FANIN, NUM_FANIN_LOGUP, SEPTIC_EXTENSION_DEGREE}, hal::ProofInput, utils::tower_output_count, }, @@ -461,27 +461,29 @@ fn estimate_tower_stage_components(); let has_logup_numerator = composed_cs.is_with_lk_table(); + let occupied_rows = input.num_instances() << composed_cs.rotation_vars().unwrap_or(0); let build_est = estimate_build_tower_memory( num_prod_towers, num_logup_towers, num_vars, num_vars, + occupied_rows, elem_size, has_logup_numerator, ); let prod_split_bytes = if num_prod_towers > 0 { - num_prod_towers * (1 << (num_vars + 1)) * elem_size + num_prod_towers + * compact_split_stored_elems(occupied_rows, 1 << (num_vars + 1), NUM_FANIN) + * elem_size } else { 0 }; let logup_split_bytes = if num_logup_towers > 0 { - let denominator_bytes = num_logup_towers * (1 << (num_vars + 1)) * elem_size; - let numerator_bytes = if has_logup_numerator { - denominator_bytes - } else { - 0 - }; - denominator_bytes + numerator_bytes + let denominator_bytes = num_logup_towers + * compact_split_stored_elems(occupied_rows, 1 << (num_vars + 1), NUM_FANIN_LOGUP) + * elem_size; + let numerator_or_ones_bytes = 
denominator_bytes; + denominator_bytes + numerator_or_ones_bytes } else { 0 }; @@ -491,6 +493,7 @@ fn estimate_tower_stage_components usize { + let chunk_size = logical_len / num_chunks; + (0..num_chunks) + .map(|chunk_idx| { + let chunk_start = chunk_idx * chunk_size; + occupied_len + .saturating_sub(chunk_start) + .min(chunk_size) + .max(1) + }) + .sum() +} + /// Estimate temporary GPU memory for the tower proving stage (build + prove). /// Used by prove_tower_relation to validate against actual mem_tracker measurements. pub(crate) fn estimate_tower_stage_bytes>( diff --git a/ceno_zkvm/src/scheme/gpu/mod.rs b/ceno_zkvm/src/scheme/gpu/mod.rs index 3d25202de..7ccc0b6ad 100644 --- a/ceno_zkvm/src/scheme/gpu/mod.rs +++ b/ceno_zkvm/src/scheme/gpu/mod.rs @@ -1749,23 +1749,33 @@ pub(crate) fn build_tower_witness_gpu( } else if lk_denominator_last_layer.is_empty() { vec![] } else { - // Case when numerator is empty: share one owned ones buffer across all p1/p2 polynomials. + // Case when numerator is empty: create one-polynomials matching each + // denominator chunk's stored length. 
let nv = lk_denominator_last_layer[0][0].num_vars(); - let ones_poly = - GpuPolynomialExt::new_with_scalar(&cuda_hal.inner, nv, BB31Ext::ONE, stream.as_ref()) - .map_err(|e| format!("Failed to create shared ones_buffer: {:?}", e)) - .unwrap(); - let ones_poly: GpuPolynomialExt<'static> = unsafe { std::mem::transmute(ones_poly) }; - let mle_len_bytes = ones_poly.buf.len() * std::mem::size_of::(); - lk_denominator_last_layer .into_iter() .map(|lk_d_chunks| { - let p1_gpu = - GpuPolynomialExt::new(ones_poly.buf.owned_subrange(0..mle_len_bytes), nv); - let p2_gpu = - GpuPolynomialExt::new(ones_poly.buf.owned_subrange(0..mle_len_bytes), nv); + let p1_len = lk_d_chunks[0].evaluations().len(); + let p2_len = lk_d_chunks[1].evaluations().len(); + let p1_gpu = GpuPolynomialExt::new_with_scalar_len( + &cuda_hal.inner, + nv, + p1_len, + BB31Ext::ONE, + stream.as_ref(), + ) + .map_err(|e| format!("Failed to create compact ones numerator p1: {e:?}"))?; + let p2_gpu = GpuPolynomialExt::new_with_scalar_len( + &cuda_hal.inner, + nv, + p2_len, + BB31Ext::ONE, + stream.as_ref(), + ) + .map_err(|e| format!("Failed to create compact ones numerator p2: {e:?}"))?; + let p1_gpu: GpuPolynomialExt<'static> = unsafe { std::mem::transmute(p1_gpu) }; + let p2_gpu: GpuPolynomialExt<'static> = unsafe { std::mem::transmute(p2_gpu) }; let mut last_layer = vec![p1_gpu, p2_gpu]; last_layer.extend(lk_d_chunks); Ok(last_layer) diff --git a/summary.md b/summary.md new file mode 100644 index 000000000..a062bb59f --- /dev/null +++ b/summary.md @@ -0,0 +1,288 @@ +# WIP Summary: non-pow2 prover storage / GPU tower + PCS follow-up + +Date: 2026-04-25 + +Repos involved +- current repo: `/home/wusm/rust/ceno` +- GPU repo: `/home/wusm/rust/ceno-gpu` +- backend repo: `/home/wusm/rust/gkr-backend` + +Primary goal +- Remove prover-side MLE zero padding to next power-of-two. +- Keep prover storage compact by occupied length. +- Verifier semantics stay unchanged. 
+ +Design agreed in this WIP +- Raw/original MLE inputs before sumcheck round 0 should use one unified policy: + - direct/native order + - occupied length respected + - this applies to both tower and PCS batch opening +- After round 0: + - folded values can use the normal later-round in-place buffer layout +- No separate application-specific policy for tower vs PCS. +- For tower specifically: + - within one tower layer, all MLEs should have the same `num_vars` + - tower should not rely on a meaningful “small MLE” mixed-size case + +What was fixed earlier in this WIP + +1. PCS / batch-open path +- Fixed missing round evaluations from GPU V2 sumcheck: + - `../ceno-gpu/cuda_hal/src/common/sumcheck/generic_v2.rs` +- Fixed compact raw-data handling in batch open and commit/open consistency. +- Fixed an earlier `RootMismatch` by correcting raw trace -> encode padding boundary in batch commit. +- PCS later reached `final_codeword.values[idx] != folded`, then was narrowed further. +- At one point PCS/basefold batch-open `eq` layout mismatch was fixed by using Ceno/direct order. +- CPU e2e for the lightweight repro still passes. + +2. Tower witness/materialization direction +- Compact CPU oracle for tower semantics was added in: + - `../ceno-gpu/cuda_hal/src/common/tower/utils.rs` +- GPU tower build path was refactored toward compact storage in: + - `../ceno-gpu/cuda_hal/src/common/tower/mod.rs` + - `../ceno-gpu/cpp/common/tower.cuh` + - `../ceno-gpu/cpp/bb31/kernels/tower.cu` + - `../ceno-gpu/cpp/gl64/kernels/tower.cu` +- A lifetime bug causing segfault in GPU tower eval extraction was fixed by retaining owned buffer backing: + - `../ceno-gpu/cuda_hal/src/common/buffer.rs` + - `../ceno-gpu/cuda_hal/src/lib.rs` + +3. Important debug correction +- There was a previous debug bug caused by cloning the transcript after GPU proving. +- That was fixed. +- Current CPU/GPU prover compares should assume transcript state is cloned before proof generation. 
+ +Current CPU/GPU status + +CPU baseline +- Command: + - `cargo run --release --package ceno_zkvm --features sanity-check --bin e2e -- --platform=ceno --max-cycle-per-shard=1000 --hints=2 --public-io=5 --shard-id=0 examples/target/riscv32im-ceno-zkvm-elf/release/examples/fibonacci` +- Result: + - passes + +GPU lightweight repro +- Command: + - `RUST_LOG=error CENO_CONCURRENT_CHIP_PROVING=0 cargo run --release --features gpu --package ceno_zkvm --features sanity-check --bin e2e -- --platform=ceno --max-cycle-per-shard=1000 --hints=2 --public-io=5 --shard-id=0 examples/target/riscv32im-ceno-zkvm-elf/release/examples/fibonacci` +- Current result: + - still fails with tower verification mismatch + - source: + - `ceno_zkvm/src/e2e.rs:2347` + - `VerifyError("mismatch tower evaluation")` + +Most important findings from the latest tower debug + +1. Tower witness is not the first bad stage +- CPU/GPU tower witness compare did not fail first. +- Tower witness transport/leaf construction is not the main active bug. + +2. The earlier isolated layer-2 compare proved: +- `cpu_direct == v1` +- `v2 != cpu_direct` +- This was on a tower layer where all MLEs were full occupied: + - debug payload showed `mle_shape=[(?, 2, 4), ...]` + - meaning `num_vars=2`, `len=4` for all MLEs in that isolated layer +- That means the tower failure is not because tower requires mixed-size/small-MLE semantics. + +3. The current design conclusion +- Tower should use the same original-input policy as PCS: + - direct order before round 0 + - later rounds use the in-place buffer +- Do NOT think of this as two policies. + +4. Terminology decision +- Do not call later-round folded storage “replay buffer”. 
+- Call it: + - in-place buffer +- Round 0: + - non-in-place, reading original inputs +- Round > 0: + - in-place + +Latest code changes in the current session + +In `../ceno-gpu/cuda_hal/src/common/sumcheck/generic_v2.rs` +- Renamed V2 metadata from `compact_layout_flags` to `original_layout_flags` +- This now means: + - `1` => original round-0 input is direct/native order +- This is intended to make the model explicit and shared across tower + PCS + +In `../ceno-gpu/cpp/common/sumcheck/generic_v2.cuh` +- Added `direct_pair_index_v2` +- Changed direct-order round-0 reads for full-size equal-`num_vars` originals to use adjacent pairs: + - `(2p, 2p+1)` + - not `(p, p + stride)` +- Restored small-MLE helper mapping back to high-bit based mapping: + - `suffix_small_index_v2(...)` currently uses: + - `tid >> (num_vars - 1 - mle_num_vars)` +- Reverted an incorrect attempt to bit-reverse first-fold writes into the in-place buffer +- Current code writes first-fold results contiguously into the in-place buffer + +In `../ceno-gpu/cuda_hal/src/common/tower/mod.rs` +- Relaxed tower assertions so layers can be compact-by-occupation, not necessarily full logical length at Rust-side checks + +What the latest tower debug showed + +Most recent trustworthy mismatch before the last interrupted run +- CPU/GPU tower compare failed at: + - `ceno_zkvm/src/scheme/gpu/mod.rs:665` +- Message: + - `CPU/GPU tower sumcheck proof mismatch: first_round=Some(2)` +- Interpretation: + - earlier proof entries already match + - divergence starts later, consistent with in-place-buffer semantics rather than original-input semantics + +Important caution about last run +- A later run was interrupted before producing a new useful payload. +- So do NOT assume the very latest in-place-buffer edits fixed anything. 
+- The last reliable signal is still: + - tower mismatch has moved later than round 0 + - current bug is likely in round > 0 in-place-buffer semantics + +Debug helpers currently present in `ceno_zkvm/src/scheme/gpu/mod.rs` +- `debug_compare_tower_cpu_gpu_prover(...)` +- `debug_compare_tower_eq_layers(...)` +- `debug_compare_tower_layer_v1_v2(..., round)` +- currently called for: + - `round = 2` + - `round = 3` + +Be careful +- Some helpers use fresh local transcripts like: + - `BasicTranscript::new(b"tower-layer2-debug")` +- These are only valid for isolated V1/V2/CPU direct comparisons. +- They are NOT end-to-end transcript or verifier oracles. + +Current best hypothesis +- The active tower bug is now in V2 later-round in-place-buffer semantics, not in: + - tower witness layout + - original round-0 direct-order policy + - transcript clone bugs + +Most relevant files to inspect next + +Current repo +- `ceno_zkvm/src/scheme/gpu/mod.rs` +- `ceno_zkvm/src/e2e.rs` + +GPU repo +- `../ceno-gpu/cuda_hal/src/common/sumcheck/generic_v2.rs` +- `../ceno-gpu/cpp/common/sumcheck/generic_v2.cuh` +- `../ceno-gpu/cuda_hal/src/common/tower/mod.rs` +- `../ceno-gpu/cuda_hal/src/common/tower/utils.rs` +- `../ceno-gpu/cuda_hal/src/lib.rs` +- `../ceno-gpu/cuda_hal/src/common/buffer.rs` + +Backend repo +- `../gkr-backend/crates/mpcs/...` +- `../gkr-backend/crates/sumcheck/...` + +Recommended next step for the new session +1. Read this file. +2. Keep CPU baseline as source of truth. +3. Continue from the latest tower state, focusing only on later-round in-place-buffer semantics in: + - `../ceno-gpu/cpp/common/sumcheck/generic_v2.cuh` +4. 
Run exactly one lightweight GPU repro at a time: + - `RUST_LOG=error CENO_CONCURRENT_CHIP_PROVING=0 cargo run --release --features gpu --package ceno_zkvm --features sanity-check --bin e2e -- --platform=ceno --max-cycle-per-shard=1000 --hints=2 --public-io=5 --shard-id=0 examples/target/riscv32im-ceno-zkvm-elf/release/examples/fibonacci` + +Backups / snapshots +- Earlier stash-save/apply snapshots were created in this workstream. +- There is also filesystem snapshot history under: + - `/home/wusm/rust/ceno/.codex-backups/` + + +## E2E / validation commands executed in compact tower batch + estimator work + +Context +- Full clean was run before validating newly added CUDA kernels, to avoid stale C++/CUDA artifacts. +- Heavy commands used `timeout 1800s` so compilation can be slow, but execution cannot hang indefinitely. +- Logs were written to `/tmp` for later inspection. + +Clean/build commands +```bash +cargo clean +cargo clean --manifest-path ../ceno-gpu/cuda_hal/Cargo.toml +``` + +```bash +cargo build --release --features gpu --package ceno_zkvm --features sanity-check --bin e2e +``` +Result +- Passed. +- Elapsed: `4:07.82`. + +Lightweight sanity e2e after clean +```bash +RUST_LOG=error CENO_CONCURRENT_CHIP_PROVING=0 target/release/e2e --platform=ceno --max-cycle-per-shard=1000 --hints=2 --public-io=5 --shard-id=0 examples/target/riscv32im-ceno-zkvm-elf/release/examples/fibonacci +``` +Result +- Passed. +- Elapsed: `0:09.29`. + +Cargo check after compact batch/estimator edits +```bash +timeout 300s cargo check --features gpu --package ceno_zkvm --bin e2e +``` +Result +- Passed. + +Final lightweight sanity e2e after removing temporary debug probe +```bash +RUST_LOG=error CENO_CONCURRENT_CHIP_PROVING=0 target/release/e2e --platform=ceno --max-cycle-per-shard=1000 --hints=2 --public-io=5 --shard-id=0 examples/target/riscv32im-ceno-zkvm-elf/release/examples/fibonacci +``` +Result +- Passed. +- Elapsed: `0:08.34`. 
+ +Heavy e2e command 1: serial proving + GPU mem tracking +```bash +CENO_GPU_MEM_TRACKING=1 CENO_CONCURRENT_CHIP_PROVING=0 CENO_GPU_ENABLE_WITGEN=1 cargo run --config net.git-fetch-with-cli=true --release --package ceno_zkvm --features gpu --bin e2e -- --platform=ceno --max-cycle-per-shard=1600 examples/target/riscv32im-ceno-zkvm-elf/release/examples/keccak_syscall +``` +Executed with timeout/log wrapper: +```bash +/usr/bin/time -f 'elapsed %E' timeout 1800s env CENO_GPU_MEM_TRACKING=1 CENO_CONCURRENT_CHIP_PROVING=0 CENO_GPU_ENABLE_WITGEN=1 cargo run --config net.git-fetch-with-cli=true --release --package ceno_zkvm --features gpu --bin e2e -- --platform=ceno --max-cycle-per-shard=1600 examples/target/riscv32im-ceno-zkvm-elf/release/examples/keccak_syscall 2>&1 | tee /tmp/ceno-keccak-memtracking-serial.log +``` +Initial result +- Failed due strict memory-estimator overestimate, not proof failure. +- Panic: + - `[memcheck] build_tower_witness_gpu: over-estimate! estimated=146.93MB, actual=126.43MB, diff=20.50MB, margin=10.00MB` +- Elapsed: `1:19.48`. + +After estimator fix, rerun with log: +```bash +/usr/bin/time -f 'elapsed %E' timeout 1800s env CENO_GPU_MEM_TRACKING=1 CENO_CONCURRENT_CHIP_PROVING=0 CENO_GPU_ENABLE_WITGEN=1 cargo run --config net.git-fetch-with-cli=true --release --package ceno_zkvm --features gpu --bin e2e -- --platform=ceno --max-cycle-per-shard=1600 examples/target/riscv32im-ceno-zkvm-elf/release/examples/keccak_syscall 2>&1 | tee /tmp/ceno-keccak-memtracking-serial-after-estimate.log +``` +Final result +- Passed. +- Elapsed: `1:15.43`. +- Log: `/tmp/ceno-keccak-memtracking-serial-after-estimate.log`. 
+ +Heavy e2e command 2: concurrent chip proving + GPU witgen +```bash +CENO_GPU_MEM_TRACKING=0 CENO_CONCURRENT_CHIP_PROVING=1 CENO_GPU_ENABLE_WITGEN=1 cargo run --config net.git-fetch-with-cli=true --release --package ceno_zkvm --features gpu --bin e2e -- --platform=ceno --max-cycle-per-shard=1600 examples/target/riscv32im-ceno-zkvm-elf/release/examples/keccak_syscall +``` +Executed with timeout/log wrapper before estimator fix: +```bash +/usr/bin/time -f 'elapsed %E' timeout 1800s env CENO_GPU_MEM_TRACKING=0 CENO_CONCURRENT_CHIP_PROVING=1 CENO_GPU_ENABLE_WITGEN=1 cargo run --config net.git-fetch-with-cli=true --release --package ceno_zkvm --features gpu --bin e2e -- --platform=ceno --max-cycle-per-shard=1600 examples/target/riscv32im-ceno-zkvm-elf/release/examples/keccak_syscall 2>&1 | tee /tmp/ceno-keccak-concurrent-witgen.log +``` +Result +- Passed. +- Elapsed: `0:10.02`. +- Final pool peak around `291MB`. +- Log: `/tmp/ceno-keccak-concurrent-witgen.log`. + +Executed again after estimator fix: +```bash +/usr/bin/time -f 'elapsed %E' timeout 1800s env CENO_GPU_MEM_TRACKING=0 CENO_CONCURRENT_CHIP_PROVING=1 CENO_GPU_ENABLE_WITGEN=1 cargo run --config net.git-fetch-with-cli=true --release --package ceno_zkvm --features gpu --bin e2e -- --platform=ceno --max-cycle-per-shard=1600 examples/target/riscv32im-ceno-zkvm-elf/release/examples/keccak_syscall 2>&1 | tee /tmp/ceno-keccak-concurrent-witgen-after-estimate.log +``` +Final result +- Passed. +- Elapsed: `0:10.74`. +- Log: `/tmp/ceno-keccak-concurrent-witgen-after-estimate.log`. + +Diff hygiene commands +```bash +git diff --check +git -C ../ceno-gpu diff --check +``` +Result +- Both passed. 
From 12453f6ef1bc22099a0f18587b7288396caaebab Mon Sep 17 00:00:00 2001 From: "sm.wu" Date: Sun, 26 Apr 2026 10:51:05 +0800 Subject: [PATCH 03/25] Optimize compact logup ones allocation --- ceno_zkvm/src/scheme/gpu/memory.rs | 6 ++++- ceno_zkvm/src/scheme/gpu/mod.rs | 37 +++++++++++++++--------------- 2 files changed, 24 insertions(+), 19 deletions(-) diff --git a/ceno_zkvm/src/scheme/gpu/memory.rs b/ceno_zkvm/src/scheme/gpu/memory.rs index 679c9e8c6..b225ede85 100644 --- a/ceno_zkvm/src/scheme/gpu/memory.rs +++ b/ceno_zkvm/src/scheme/gpu/memory.rs @@ -482,7 +482,11 @@ fn estimate_tower_stage_components( } else if lk_denominator_last_layer.is_empty() { vec![] } else { - // Case when numerator is empty: create one-polynomials matching each - // denominator chunk's stored length. + // Case when numerator is empty: share one scalar compact polynomial. + // Its tail default is also ONE, so all logical numerator entries read as ONE + // without materializing per-chunk denominator-sized buffers. 
let nv = lk_denominator_last_layer[0][0].num_vars(); + let ones_poly = GpuPolynomialExt::new_with_scalar_len( + &cuda_hal.inner, + nv, + 1, + BB31Ext::ONE, + stream.as_ref(), + ) + .map_err(|e| format!("Failed to create compact shared ones numerator: {e:?}"))?; + let ones_poly: GpuPolynomialExt<'static> = unsafe { std::mem::transmute(ones_poly) }; + let one_len_bytes = ones_poly.buf.len() * std::mem::size_of::(); lk_denominator_last_layer .into_iter() .map(|lk_d_chunks| { - let p1_len = lk_d_chunks[0].evaluations().len(); - let p2_len = lk_d_chunks[1].evaluations().len(); - let p1_gpu = GpuPolynomialExt::new_with_scalar_len( - &cuda_hal.inner, + let p1_gpu = GpuPolynomialExt::new_with_tail_default( + ones_poly.buf.owned_subrange(0..one_len_bytes), nv, - p1_len, BB31Ext::ONE, - stream.as_ref(), - ) - .map_err(|e| format!("Failed to create compact ones numerator p1: {e:?}"))?; - let p2_gpu = GpuPolynomialExt::new_with_scalar_len( - &cuda_hal.inner, + ); + let p2_gpu = GpuPolynomialExt::new_with_tail_default( + ones_poly.buf.owned_subrange(0..one_len_bytes), nv, - p2_len, BB31Ext::ONE, - stream.as_ref(), - ) - .map_err(|e| format!("Failed to create compact ones numerator p2: {e:?}"))?; - let p1_gpu: GpuPolynomialExt<'static> = unsafe { std::mem::transmute(p1_gpu) }; - let p2_gpu: GpuPolynomialExt<'static> = unsafe { std::mem::transmute(p2_gpu) }; + ); let mut last_layer = vec![p1_gpu, p2_gpu]; last_layer.extend(lk_d_chunks); Ok(last_layer) From 7d60f015ed6fe4fb57927ccee5178896b5ae8070 Mon Sep 17 00:00:00 2001 From: "sm.wu" Date: Sun, 26 Apr 2026 11:22:35 +0800 Subject: [PATCH 04/25] update dep --- Cargo.lock | 124 ++++++----------------------------------------------- Cargo.toml | 82 +++++++++++++++++------------------ 2 files changed, 55 insertions(+), 151 deletions(-) diff --git a/Cargo.lock b/Cargo.lock index ba90fc0e6..04bcabaf7 100644 --- a/Cargo.lock +++ b/Cargo.lock @@ -1600,49 +1600,10 @@ version = "0.0.6" source = 
"registry+https://github.com/rust-lang/crates.io-index" checksum = "e2931af7e13dc045d8e9d26afccc6fa115d64e115c9c84b1166288b46f6782c2" -[[package]] -name = "cuda-config" -version = "0.1.0" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "4ee74643f7430213a1a78320f88649de309b20b80818325575e393f848f79f5d" -dependencies = [ - "glob", -] - -[[package]] -name = "cuda-runtime-sys" -version = "0.3.0-alpha.1" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "5d070b301187fee3c611e75a425cf12247b7c75c09729dbdef95cb9cb64e8c39" -dependencies = [ - "cuda-config", -] - [[package]] name = "cuda_hal" version = "0.1.0" -dependencies = [ - "anyhow", - "cuda-runtime-sys", - "cudarc", - "downcast-rs", - "either", - "ff_ext", - "itertools 0.13.0", - "mpcs", - "multilinear_extensions", - "p3", - "rand 0.8.5", - "rayon", - "sha2", - "sppark", - "sppark_plug", - "sumcheck", - "thiserror 1.0.69", - "tracing", - "transcript", - "witness", -] +source = "git+https://github.com/scroll-tech/ceno-gpu-mock.git?branch=main#fe8f7923b7d3a3823c27949fab0aab8e31011aa9" [[package]] name = "cudarc" @@ -2276,6 +2237,7 @@ dependencies = [ [[package]] name = "ff_ext" version = "0.1.0" +source = "git+https://github.com/scroll-tech/gkr-backend.git?branch=feat%2Fmle_no_padding#1fc9f700b54dfb63415e3d4115d778fc10ad9131" dependencies = [ "once_cell", "p3", @@ -2709,15 +2671,6 @@ dependencies = [ "digest 0.10.7", ] -[[package]] -name = "home" -version = "0.5.12" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "cc627f471c528ff0c4a49e1d5e60450c8f6461dd6d10ba9dcd3a61d3dff7728d" -dependencies = [ - "windows-sys 0.61.1", -] - [[package]] name = "iana-time-zone" version = "0.1.64" @@ -3149,12 +3102,6 @@ dependencies = [ "vcpkg", ] -[[package]] -name = "linux-raw-sys" -version = "0.4.15" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "d26c52dbd32dccf2d10cac7725f8eae5296885fb5703b261f7d0a0739ec807ab" - 
[[package]] name = "linux-raw-sys" version = "0.9.4" @@ -3296,6 +3243,7 @@ dependencies = [ [[package]] name = "mpcs" version = "0.1.0" +source = "git+https://github.com/scroll-tech/gkr-backend.git?branch=feat%2Fmle_no_padding#1fc9f700b54dfb63415e3d4115d778fc10ad9131" dependencies = [ "bincode 1.3.3", "clap", @@ -3319,6 +3267,7 @@ dependencies = [ [[package]] name = "multilinear_extensions" version = "0.1.0" +source = "git+https://github.com/scroll-tech/gkr-backend.git?branch=feat%2Fmle_no_padding#1fc9f700b54dfb63415e3d4115d778fc10ad9131" dependencies = [ "either", "ff_ext", @@ -4609,6 +4558,7 @@ dependencies = [ [[package]] name = "p3" version = "0.1.0" +source = "git+https://github.com/scroll-tech/gkr-backend.git?branch=feat%2Fmle_no_padding#1fc9f700b54dfb63415e3d4115d778fc10ad9131" dependencies = [ "p3-air", "p3-baby-bear", @@ -5176,6 +5126,7 @@ dependencies = [ [[package]] name = "poseidon" version = "0.1.0" +source = "git+https://github.com/scroll-tech/gkr-backend.git?branch=feat%2Fmle_no_padding#1fc9f700b54dfb63415e3d4115d778fc10ad9131" dependencies = [ "ff_ext", "p3", @@ -5773,19 +5724,6 @@ dependencies = [ "semver 1.0.26", ] -[[package]] -name = "rustix" -version = "0.38.44" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "fdb5bc1ae2baa591800df16c9ca78619bf65c0488b41b96ccec5d11220d8c154" -dependencies = [ - "bitflags", - "errno", - "libc", - "linux-raw-sys 0.4.15", - "windows-sys 0.59.0", -] - [[package]] name = "rustix" version = "1.0.7" @@ -5795,7 +5733,7 @@ dependencies = [ "bitflags", "errno", "libc", - "linux-raw-sys 0.9.4", + "linux-raw-sys", "windows-sys 0.59.0", ] @@ -6145,6 +6083,7 @@ dependencies = [ [[package]] name = "sp1-curves" version = "0.1.0" +source = "git+https://github.com/scroll-tech/gkr-backend.git?branch=feat%2Fmle_no_padding#1fc9f700b54dfb63415e3d4115d778fc10ad9131" dependencies = [ "cfg-if", "dashu", @@ -6179,25 +6118,6 @@ dependencies = [ "der", ] -[[package]] -name = "sppark" -version = "0.1.11" 
-dependencies = [ - "cc", - "which", -] - -[[package]] -name = "sppark_plug" -version = "0.1.0" -dependencies = [ - "cc", - "ff_ext", - "itertools 0.13.0", - "p3", - "sppark", -] - [[package]] name = "stable_deref_trait" version = "1.2.0" @@ -6288,6 +6208,7 @@ checksum = "13c2bddecc57b384dee18652358fb23172facb8a2c51ccc10d74c157bdea3292" [[package]] name = "sumcheck" version = "0.1.0" +source = "git+https://github.com/scroll-tech/gkr-backend.git?branch=feat%2Fmle_no_padding#1fc9f700b54dfb63415e3d4115d778fc10ad9131" dependencies = [ "either", "ff_ext", @@ -6305,6 +6226,7 @@ dependencies = [ [[package]] name = "sumcheck_macro" version = "0.1.0" +source = "git+https://github.com/scroll-tech/gkr-backend.git?branch=feat%2Fmle_no_padding#1fc9f700b54dfb63415e3d4115d778fc10ad9131" dependencies = [ "itertools 0.13.0", "p3", @@ -6385,7 +6307,7 @@ dependencies = [ "fastrand", "getrandom 0.3.2", "once_cell", - "rustix 1.0.7", + "rustix", "windows-sys 0.59.0", ] @@ -6711,6 +6633,7 @@ dependencies = [ [[package]] name = "transcript" version = "0.1.0" +source = "git+https://github.com/scroll-tech/gkr-backend.git?branch=feat%2Fmle_no_padding#1fc9f700b54dfb63415e3d4115d778fc10ad9131" dependencies = [ "ff_ext", "itertools 0.13.0", @@ -7001,21 +6924,10 @@ dependencies = [ "wasm-bindgen", ] -[[package]] -name = "which" -version = "4.4.2" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "87ba24419a2078cd2b0f2ede2691b6c66d8e47836da3b6db8265ebad47afbfc7" -dependencies = [ - "either", - "home", - "once_cell", - "rustix 0.38.44", -] - [[package]] name = "whir" version = "0.1.0" +source = "git+https://github.com/scroll-tech/gkr-backend.git?branch=feat%2Fmle_no_padding#1fc9f700b54dfb63415e3d4115d778fc10ad9131" dependencies = [ "bincode 1.3.3", "clap", @@ -7143,15 +7055,6 @@ dependencies = [ "windows-targets 0.53.4", ] -[[package]] -name = "windows-sys" -version = "0.61.1" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = 
"6f109e41dd4a3c848907eb83d5a42ea98b3769495597450cf6d153507b166f0f" -dependencies = [ - "windows-link", -] - [[package]] name = "windows-targets" version = "0.52.6" @@ -7311,6 +7214,7 @@ dependencies = [ [[package]] name = "witness" version = "0.1.0" +source = "git+https://github.com/scroll-tech/gkr-backend.git?branch=feat%2Fmle_no_padding#1fc9f700b54dfb63415e3d4115d778fc10ad9131" dependencies = [ "ff_ext", "multilinear_extensions", diff --git a/Cargo.toml b/Cargo.toml index 8b79e59fe..1aa0a77fb 100644 --- a/Cargo.toml +++ b/Cargo.toml @@ -1,16 +1,16 @@ [workspace] members = [ - "ceno_cli", - "ceno_emul", - "ceno_host", - "ceno_serde", - "ceno_rt", - "ceno_zkvm", - "ceno_recursion", - "derive", - "examples-builder", - "examples", - "guest_libs/*", + "ceno_cli", + "ceno_emul", + "ceno_host", + "ceno_serde", + "ceno_rt", + "ceno_zkvm", + "ceno_recursion", + "derive", + "examples-builder", + "examples", + "guest_libs/*", ] resolver = "2" @@ -27,16 +27,16 @@ version = "0.1.0" ceno_crypto_primitives = { git = "https://github.com/scroll-tech/ceno-patch.git", package = "ceno_crypto_primitives", branch = "main" } ceno_syscall = { git = "https://github.com/scroll-tech/ceno-patch.git", package = "ceno_syscall", branch = "main" } -ff_ext = { git = "https://github.com/scroll-tech/gkr-backend.git", package = "ff_ext", tag = "v1.0.0-alpha.24" } -mpcs = { git = "https://github.com/scroll-tech/gkr-backend.git", package = "mpcs", tag = "v1.0.0-alpha.24" } -multilinear_extensions = { git = "https://github.com/scroll-tech/gkr-backend.git", package = "multilinear_extensions", tag = "v1.0.0-alpha.24" } -p3 = { git = "https://github.com/scroll-tech/gkr-backend.git", package = "p3", tag = "v1.0.0-alpha.24" } -poseidon = { git = "https://github.com/scroll-tech/gkr-backend.git", package = "poseidon", tag = "v1.0.0-alpha.24" } -sp1-curves = { git = "https://github.com/scroll-tech/gkr-backend.git", package = "sp1-curves", tag = "v1.0.0-alpha.24" } -sumcheck = { git = 
"https://github.com/scroll-tech/gkr-backend.git", package = "sumcheck", tag = "v1.0.0-alpha.24" } -transcript = { git = "https://github.com/scroll-tech/gkr-backend.git", package = "transcript", tag = "v1.0.0-alpha.24" } -whir = { git = "https://github.com/scroll-tech/gkr-backend.git", package = "whir", tag = "v1.0.0-alpha.24" } -witness = { git = "https://github.com/scroll-tech/gkr-backend.git", package = "witness", tag = "v1.0.0-alpha.24" } +ff_ext = { git = "https://github.com/scroll-tech/gkr-backend.git", package = "ff_ext", branch = "feat/mle_no_padding" } +mpcs = { git = "https://github.com/scroll-tech/gkr-backend.git", package = "mpcs", branch = "feat/mle_no_padding" } +multilinear_extensions = { git = "https://github.com/scroll-tech/gkr-backend.git", package = "multilinear_extensions", branch = "feat/mle_no_padding" } +p3 = { git = "https://github.com/scroll-tech/gkr-backend.git", package = "p3", branch = "feat/mle_no_padding" } +poseidon = { git = "https://github.com/scroll-tech/gkr-backend.git", package = "poseidon", branch = "feat/mle_no_padding" } +sp1-curves = { git = "https://github.com/scroll-tech/gkr-backend.git", package = "sp1-curves", branch = "feat/mle_no_padding" } +sumcheck = { git = "https://github.com/scroll-tech/gkr-backend.git", package = "sumcheck", branch = "feat/mle_no_padding" } +transcript = { git = "https://github.com/scroll-tech/gkr-backend.git", package = "transcript", branch = "feat/mle_no_padding" } +whir = { git = "https://github.com/scroll-tech/gkr-backend.git", package = "whir", branch = "feat/mle_no_padding" } +witness = { git = "https://github.com/scroll-tech/gkr-backend.git", package = "witness", branch = "feat/mle_no_padding" } anyhow = { version = "1.0", default-features = false } bincode = "1" @@ -66,11 +66,11 @@ secp = "0.4.1" serde = { version = "1.0", features = ["derive", "rc"] } serde_json = "1.0" smallvec = { version = "1.13.2", features = [ - "const_generics", - "const_new", - "serde", - "union", - "write", + 
"const_generics", + "const_new", + "serde", + "union", + "write", ] } strum = "0.26" strum_macros = "0.26" @@ -79,7 +79,7 @@ thiserror = "2" thread_local = "1.1" tiny-keccak = { version = "2.0.2", features = ["keccak"] } tracing = { version = "0.1", features = [ - "attributes", + "attributes", ] } tracing-forest = { version = "0.1.6" } tracing-subscriber = { version = "0.3", features = ["env-filter"] } @@ -127,20 +127,20 @@ lto = "thin" #ceno_crypto_primitives = { path = "../ceno-patch/crypto-primitives", package = "ceno_crypto_primitives" } #ceno_syscall = { path = "../ceno-patch/syscall", package = "ceno_syscall" } -[patch."https://github.com/scroll-tech/ceno-gpu-mock.git"] -ceno_gpu = { path = "../ceno-gpu/cuda_hal", package = "cuda_hal", default-features = false, features = ["bb31"] } - -[patch."https://github.com/scroll-tech/gkr-backend"] -ff_ext = { path = "../gkr-backend/crates/ff_ext", package = "ff_ext" } -mpcs = { path = "../gkr-backend/crates/mpcs", package = "mpcs" } -multilinear_extensions = { path = "../gkr-backend/crates/multilinear_extensions", package = "multilinear_extensions" } -p3 = { path = "../gkr-backend/crates/p3", package = "p3" } -poseidon = { path = "../gkr-backend/crates/poseidon", package = "poseidon" } -sp1-curves = { path = "../gkr-backend/crates/curves", package = "sp1-curves" } -sumcheck = { path = "../gkr-backend/crates/sumcheck", package = "sumcheck" } -transcript = { path = "../gkr-backend/crates/transcript", package = "transcript" } -whir = { path = "../gkr-backend/crates/whir", package = "whir" } -witness = { path = "../gkr-backend/crates/witness", package = "witness" } +#[patch."https://github.com/scroll-tech/ceno-gpu-mock.git"] +#ceno_gpu = { path = "../ceno-gpu/cuda_hal", package = "cuda_hal", default-features = false, features = ["bb31"] } + +#[patch."https://github.com/scroll-tech/gkr-backend"] +#ff_ext = { path = "../gkr-backend/crates/ff_ext", package = "ff_ext" } +#mpcs = { path = "../gkr-backend/crates/mpcs", package = 
"mpcs" } +#multilinear_extensions = { path = "../gkr-backend/crates/multilinear_extensions", package = "multilinear_extensions" } +#p3 = { path = "../gkr-backend/crates/p3", package = "p3" } +#poseidon = { path = "../gkr-backend/crates/poseidon", package = "poseidon" } +#sp1-curves = { path = "../gkr-backend/crates/curves", package = "sp1-curves" } +#sumcheck = { path = "../gkr-backend/crates/sumcheck", package = "sumcheck" } +#transcript = { path = "../gkr-backend/crates/transcript", package = "transcript" } +#whir = { path = "../gkr-backend/crates/whir", package = "whir" } +#witness = { path = "../gkr-backend/crates/witness", package = "witness" } # [patch."https://github.com/scroll-tech/openvm.git"] # openvm = { path = "../openvm-scroll-tech/crates/toolchain/openvm", default-features = false } From e9fbe9c7612a1dbb2e91c094ccd818509b50c350 Mon Sep 17 00:00:00 2001 From: "sm.wu" Date: Sun, 26 Apr 2026 13:30:35 +0800 Subject: [PATCH 05/25] fix main mem estimation --- ceno_zkvm/src/scheme/gpu/memory.rs | 31 ++++++++++++++++++++++-------- ceno_zkvm/src/scheme/utils.rs | 8 ++++++-- 2 files changed, 29 insertions(+), 10 deletions(-) diff --git a/ceno_zkvm/src/scheme/gpu/memory.rs b/ceno_zkvm/src/scheme/gpu/memory.rs index b225ede85..5aaaf982c 100644 --- a/ceno_zkvm/src/scheme/gpu/memory.rs +++ b/ceno_zkvm/src/scheme/gpu/memory.rs @@ -11,11 +11,15 @@ use ceno_gpu::{ estimate_build_tower_memory, estimate_prove_tower_memory, estimate_sumcheck_memory, }; use ff_ext::ExtensionField; -use gkr_iop::gpu::{ - BB31Base, GpuBackend, - gpu_prover::{ - BB31Ext, CacheLevel, CudaHalBB31, MemTracker, get_gpu_cache_level, get_mem_tracking_mode, +use gkr_iop::{ + gpu::{ + BB31Base, GpuBackend, + gpu_prover::{ + BB31Ext, CacheLevel, CudaHalBB31, MemTracker, get_gpu_cache_level, + get_mem_tracking_mode, + }, }, + hal::MultilinearPolynomial, }; use mpcs::PolynomialCommitmentScheme; @@ -116,8 +120,8 @@ pub fn estimate_chip_proof_memory( composed_cs: &ComposedConstrainSystem, - 
occupied_rows: usize, + output_rows: usize, ) -> usize { let elem_size = std::mem::size_of::(); - tower_output_count(composed_cs) * occupied_rows * elem_size + tower_output_count(composed_cs) * output_rows * elem_size +} + +pub fn main_witness_output_rows>( + composed_cs: &ComposedConstrainSystem, + input: &ProofInput<'_, GpuBackend>, +) -> usize { + input + .witness + .first() + .map(|mle| mle.evaluations_len()) + .unwrap_or_else(|| input.num_instances() << composed_cs.rotation_vars().unwrap_or(0)) } pub(crate) fn estimate_main_constraints_bytes< diff --git a/ceno_zkvm/src/scheme/utils.rs b/ceno_zkvm/src/scheme/utils.rs index a52925b9a..bc53168db 100644 --- a/ceno_zkvm/src/scheme/utils.rs +++ b/ceno_zkvm/src/scheme/utils.rs @@ -707,9 +707,13 @@ pub fn build_main_witness< // GPU memory check: validate estimation against actual usage #[cfg(feature = "gpu")] { - let occupied_rows = input.num_instances() << composed_cs.rotation_vars().unwrap_or(0); + let output_rows = input + .witness + .first() + .map(|mle| mle.evaluations_len()) + .unwrap_or_else(|| input.num_instances() << composed_cs.rotation_vars().unwrap_or(0)); let estimated_bytes = - crate::scheme::gpu::estimate_main_witness_bytes(composed_cs, occupied_rows); + crate::scheme::gpu::estimate_main_witness_bytes(composed_cs, output_rows); crate::scheme::gpu::check_gpu_mem_estimation(gpu_mem_tracker, estimated_bytes); } From 5ecce046212ce157936df7b7b856f571bd72869a Mon Sep 17 00:00:00 2001 From: "sm.wu" Date: Sun, 26 Apr 2026 13:43:30 +0800 Subject: [PATCH 06/25] fix mem estimator --- ceno_zkvm/src/scheme/gpu/memory.rs | 43 +++++++++++++++++++++++++++++- 1 file changed, 42 insertions(+), 1 deletion(-) diff --git a/ceno_zkvm/src/scheme/gpu/memory.rs b/ceno_zkvm/src/scheme/gpu/memory.rs index 5aaaf982c..ea59f4fb5 100644 --- a/ceno_zkvm/src/scheme/gpu/memory.rs +++ b/ceno_zkvm/src/scheme/gpu/memory.rs @@ -12,6 +12,7 @@ use ceno_gpu::{ }; use ff_ext::ExtensionField; use gkr_iop::{ + evaluation::EvalExpression, gpu::{ 
BB31Base, GpuBackend, gpu_prover::{ @@ -376,7 +377,47 @@ pub fn estimate_main_witness_bytes( output_rows: usize, ) -> usize { let elem_size = std::mem::size_of::(); - tower_output_count(composed_cs) * output_rows * elem_size + main_witness_materialized_output_count(composed_cs) * output_rows * elem_size +} + +fn main_witness_materialized_output_count( + composed_cs: &ComposedConstrainSystem, +) -> usize { + let Some(gkr_circuit) = composed_cs.gkr_circuit.as_ref() else { + return 0; + }; + let final_layer_output_count = tower_output_count(composed_cs); + + gkr_circuit + .layers + .iter() + .enumerate() + .map(|(layer_index, layer)| { + let final_layer = layer_index == 0; + let out_evals = layer + .out_sel_and_eval_exprs + .iter() + .flat_map(|(_, out_eval)| out_eval.iter()); + + if final_layer { + out_evals + .take(final_layer_output_count) + .filter(|out_eval| main_witness_materializes_output(out_eval)) + .count() + } else { + out_evals + .filter(|out_eval| main_witness_materializes_output(out_eval)) + .count() + } + }) + .sum() +} + +fn main_witness_materializes_output(out_eval: &EvalExpression) -> bool { + matches!( + out_eval, + EvalExpression::Single(_) | EvalExpression::Linear(_, _, _) + ) } pub fn main_witness_output_rows>( From be14006053ad4e1a8073753564bb5a59e9ca9e5b Mon Sep 17 00:00:00 2001 From: "sm.wu" Date: Sun, 26 Apr 2026 21:14:51 +0800 Subject: [PATCH 07/25] snapshot compact tower estimator state --- Cargo.lock | 124 ++++++++++-- Cargo.toml | 62 +++--- ceno_zkvm/src/scheme/gpu/memory.rs | 35 +++- ceno_zkvm/src/scheme/gpu/mod.rs | 48 ++++- ceno_zkvm/src/scheme/prover.rs | 29 ++- ceno_zkvm/src/scheme/utils.rs | 26 ++- summary.md | 295 +++++++++++++++++++++++++++++ 7 files changed, 550 insertions(+), 69 deletions(-) diff --git a/Cargo.lock b/Cargo.lock index 04bcabaf7..ba90fc0e6 100644 --- a/Cargo.lock +++ b/Cargo.lock @@ -1600,10 +1600,49 @@ version = "0.0.6" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = 
"e2931af7e13dc045d8e9d26afccc6fa115d64e115c9c84b1166288b46f6782c2" +[[package]] +name = "cuda-config" +version = "0.1.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "4ee74643f7430213a1a78320f88649de309b20b80818325575e393f848f79f5d" +dependencies = [ + "glob", +] + +[[package]] +name = "cuda-runtime-sys" +version = "0.3.0-alpha.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "5d070b301187fee3c611e75a425cf12247b7c75c09729dbdef95cb9cb64e8c39" +dependencies = [ + "cuda-config", +] + [[package]] name = "cuda_hal" version = "0.1.0" -source = "git+https://github.com/scroll-tech/ceno-gpu-mock.git?branch=main#fe8f7923b7d3a3823c27949fab0aab8e31011aa9" +dependencies = [ + "anyhow", + "cuda-runtime-sys", + "cudarc", + "downcast-rs", + "either", + "ff_ext", + "itertools 0.13.0", + "mpcs", + "multilinear_extensions", + "p3", + "rand 0.8.5", + "rayon", + "sha2", + "sppark", + "sppark_plug", + "sumcheck", + "thiserror 1.0.69", + "tracing", + "transcript", + "witness", +] [[package]] name = "cudarc" @@ -2237,7 +2276,6 @@ dependencies = [ [[package]] name = "ff_ext" version = "0.1.0" -source = "git+https://github.com/scroll-tech/gkr-backend.git?branch=feat%2Fmle_no_padding#1fc9f700b54dfb63415e3d4115d778fc10ad9131" dependencies = [ "once_cell", "p3", @@ -2671,6 +2709,15 @@ dependencies = [ "digest 0.10.7", ] +[[package]] +name = "home" +version = "0.5.12" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "cc627f471c528ff0c4a49e1d5e60450c8f6461dd6d10ba9dcd3a61d3dff7728d" +dependencies = [ + "windows-sys 0.61.1", +] + [[package]] name = "iana-time-zone" version = "0.1.64" @@ -3102,6 +3149,12 @@ dependencies = [ "vcpkg", ] +[[package]] +name = "linux-raw-sys" +version = "0.4.15" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "d26c52dbd32dccf2d10cac7725f8eae5296885fb5703b261f7d0a0739ec807ab" + [[package]] name = "linux-raw-sys" version = "0.9.4" @@ -3243,7 +3296,6 
@@ dependencies = [ [[package]] name = "mpcs" version = "0.1.0" -source = "git+https://github.com/scroll-tech/gkr-backend.git?branch=feat%2Fmle_no_padding#1fc9f700b54dfb63415e3d4115d778fc10ad9131" dependencies = [ "bincode 1.3.3", "clap", @@ -3267,7 +3319,6 @@ dependencies = [ [[package]] name = "multilinear_extensions" version = "0.1.0" -source = "git+https://github.com/scroll-tech/gkr-backend.git?branch=feat%2Fmle_no_padding#1fc9f700b54dfb63415e3d4115d778fc10ad9131" dependencies = [ "either", "ff_ext", @@ -4558,7 +4609,6 @@ dependencies = [ [[package]] name = "p3" version = "0.1.0" -source = "git+https://github.com/scroll-tech/gkr-backend.git?branch=feat%2Fmle_no_padding#1fc9f700b54dfb63415e3d4115d778fc10ad9131" dependencies = [ "p3-air", "p3-baby-bear", @@ -5126,7 +5176,6 @@ dependencies = [ [[package]] name = "poseidon" version = "0.1.0" -source = "git+https://github.com/scroll-tech/gkr-backend.git?branch=feat%2Fmle_no_padding#1fc9f700b54dfb63415e3d4115d778fc10ad9131" dependencies = [ "ff_ext", "p3", @@ -5724,6 +5773,19 @@ dependencies = [ "semver 1.0.26", ] +[[package]] +name = "rustix" +version = "0.38.44" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "fdb5bc1ae2baa591800df16c9ca78619bf65c0488b41b96ccec5d11220d8c154" +dependencies = [ + "bitflags", + "errno", + "libc", + "linux-raw-sys 0.4.15", + "windows-sys 0.59.0", +] + [[package]] name = "rustix" version = "1.0.7" @@ -5733,7 +5795,7 @@ dependencies = [ "bitflags", "errno", "libc", - "linux-raw-sys", + "linux-raw-sys 0.9.4", "windows-sys 0.59.0", ] @@ -6083,7 +6145,6 @@ dependencies = [ [[package]] name = "sp1-curves" version = "0.1.0" -source = "git+https://github.com/scroll-tech/gkr-backend.git?branch=feat%2Fmle_no_padding#1fc9f700b54dfb63415e3d4115d778fc10ad9131" dependencies = [ "cfg-if", "dashu", @@ -6118,6 +6179,25 @@ dependencies = [ "der", ] +[[package]] +name = "sppark" +version = "0.1.11" +dependencies = [ + "cc", + "which", +] + +[[package]] +name = "sppark_plug" 
+version = "0.1.0" +dependencies = [ + "cc", + "ff_ext", + "itertools 0.13.0", + "p3", + "sppark", +] + [[package]] name = "stable_deref_trait" version = "1.2.0" @@ -6208,7 +6288,6 @@ checksum = "13c2bddecc57b384dee18652358fb23172facb8a2c51ccc10d74c157bdea3292" [[package]] name = "sumcheck" version = "0.1.0" -source = "git+https://github.com/scroll-tech/gkr-backend.git?branch=feat%2Fmle_no_padding#1fc9f700b54dfb63415e3d4115d778fc10ad9131" dependencies = [ "either", "ff_ext", @@ -6226,7 +6305,6 @@ dependencies = [ [[package]] name = "sumcheck_macro" version = "0.1.0" -source = "git+https://github.com/scroll-tech/gkr-backend.git?branch=feat%2Fmle_no_padding#1fc9f700b54dfb63415e3d4115d778fc10ad9131" dependencies = [ "itertools 0.13.0", "p3", @@ -6307,7 +6385,7 @@ dependencies = [ "fastrand", "getrandom 0.3.2", "once_cell", - "rustix", + "rustix 1.0.7", "windows-sys 0.59.0", ] @@ -6633,7 +6711,6 @@ dependencies = [ [[package]] name = "transcript" version = "0.1.0" -source = "git+https://github.com/scroll-tech/gkr-backend.git?branch=feat%2Fmle_no_padding#1fc9f700b54dfb63415e3d4115d778fc10ad9131" dependencies = [ "ff_ext", "itertools 0.13.0", @@ -6924,10 +7001,21 @@ dependencies = [ "wasm-bindgen", ] +[[package]] +name = "which" +version = "4.4.2" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "87ba24419a2078cd2b0f2ede2691b6c66d8e47836da3b6db8265ebad47afbfc7" +dependencies = [ + "either", + "home", + "once_cell", + "rustix 0.38.44", +] + [[package]] name = "whir" version = "0.1.0" -source = "git+https://github.com/scroll-tech/gkr-backend.git?branch=feat%2Fmle_no_padding#1fc9f700b54dfb63415e3d4115d778fc10ad9131" dependencies = [ "bincode 1.3.3", "clap", @@ -7055,6 +7143,15 @@ dependencies = [ "windows-targets 0.53.4", ] +[[package]] +name = "windows-sys" +version = "0.61.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "6f109e41dd4a3c848907eb83d5a42ea98b3769495597450cf6d153507b166f0f" +dependencies = [ + 
"windows-link", +] + [[package]] name = "windows-targets" version = "0.52.6" @@ -7214,7 +7311,6 @@ dependencies = [ [[package]] name = "witness" version = "0.1.0" -source = "git+https://github.com/scroll-tech/gkr-backend.git?branch=feat%2Fmle_no_padding#1fc9f700b54dfb63415e3d4115d778fc10ad9131" dependencies = [ "ff_ext", "multilinear_extensions", diff --git a/Cargo.toml b/Cargo.toml index 1aa0a77fb..fbbbab29a 100644 --- a/Cargo.toml +++ b/Cargo.toml @@ -1,16 +1,16 @@ [workspace] members = [ - "ceno_cli", - "ceno_emul", - "ceno_host", - "ceno_serde", - "ceno_rt", - "ceno_zkvm", - "ceno_recursion", - "derive", - "examples-builder", - "examples", - "guest_libs/*", + "ceno_cli", + "ceno_emul", + "ceno_host", + "ceno_serde", + "ceno_rt", + "ceno_zkvm", + "ceno_recursion", + "derive", + "examples-builder", + "examples", + "guest_libs/*", ] resolver = "2" @@ -66,11 +66,11 @@ secp = "0.4.1" serde = { version = "1.0", features = ["derive", "rc"] } serde_json = "1.0" smallvec = { version = "1.13.2", features = [ - "const_generics", - "const_new", - "serde", - "union", - "write", + "const_generics", + "const_new", + "serde", + "union", + "write", ] } strum = "0.26" strum_macros = "0.26" @@ -79,7 +79,7 @@ thiserror = "2" thread_local = "1.1" tiny-keccak = { version = "2.0.2", features = ["keccak"] } tracing = { version = "0.1", features = [ - "attributes", + "attributes", ] } tracing-forest = { version = "0.1.6" } tracing-subscriber = { version = "0.3", features = ["env-filter"] } @@ -127,20 +127,20 @@ lto = "thin" #ceno_crypto_primitives = { path = "../ceno-patch/crypto-primitives", package = "ceno_crypto_primitives" } #ceno_syscall = { path = "../ceno-patch/syscall", package = "ceno_syscall" } -#[patch."https://github.com/scroll-tech/ceno-gpu-mock.git"] -#ceno_gpu = { path = "../ceno-gpu/cuda_hal", package = "cuda_hal", default-features = false, features = ["bb31"] } - -#[patch."https://github.com/scroll-tech/gkr-backend"] -#ff_ext = { path = "../gkr-backend/crates/ff_ext", 
package = "ff_ext" } -#mpcs = { path = "../gkr-backend/crates/mpcs", package = "mpcs" } -#multilinear_extensions = { path = "../gkr-backend/crates/multilinear_extensions", package = "multilinear_extensions" } -#p3 = { path = "../gkr-backend/crates/p3", package = "p3" } -#poseidon = { path = "../gkr-backend/crates/poseidon", package = "poseidon" } -#sp1-curves = { path = "../gkr-backend/crates/curves", package = "sp1-curves" } -#sumcheck = { path = "../gkr-backend/crates/sumcheck", package = "sumcheck" } -#transcript = { path = "../gkr-backend/crates/transcript", package = "transcript" } -#whir = { path = "../gkr-backend/crates/whir", package = "whir" } -#witness = { path = "../gkr-backend/crates/witness", package = "witness" } +[patch."https://github.com/scroll-tech/ceno-gpu-mock.git"] +ceno_gpu = { path = "../ceno-gpu/cuda_hal", package = "cuda_hal", default-features = false, features = ["bb31"] } + +[patch."https://github.com/scroll-tech/gkr-backend"] +ff_ext = { path = "../gkr-backend/crates/ff_ext", package = "ff_ext" } +mpcs = { path = "../gkr-backend/crates/mpcs", package = "mpcs" } +multilinear_extensions = { path = "../gkr-backend/crates/multilinear_extensions", package = "multilinear_extensions" } +p3 = { path = "../gkr-backend/crates/p3", package = "p3" } +poseidon = { path = "../gkr-backend/crates/poseidon", package = "poseidon" } +sp1-curves = { path = "../gkr-backend/crates/curves", package = "sp1-curves" } +sumcheck = { path = "../gkr-backend/crates/sumcheck", package = "sumcheck" } +transcript = { path = "../gkr-backend/crates/transcript", package = "transcript" } +whir = { path = "../gkr-backend/crates/whir", package = "whir" } +witness = { path = "../gkr-backend/crates/witness", package = "witness" } # [patch."https://github.com/scroll-tech/openvm.git"] # openvm = { path = "../openvm-scroll-tech/crates/toolchain/openvm", default-features = false } diff --git a/ceno_zkvm/src/scheme/gpu/memory.rs b/ceno_zkvm/src/scheme/gpu/memory.rs index 
ea59f4fb5..7ef24d36d 100644 --- a/ceno_zkvm/src/scheme/gpu/memory.rs +++ b/ceno_zkvm/src/scheme/gpu/memory.rs @@ -50,10 +50,22 @@ const ESTIMATION_SAFETY_MARGIN_BYTES: usize = 10 * 1024 * 1024; // reserved head /// - Under-estimate (actual > estimated): diff must be <= `ESTIMATION_TOLERANCE_BYTES` /// - Over-estimate (estimated > actual): diff must be <= `ESTIMATION_SAFETY_MARGIN_BYTES` pub fn check_gpu_mem_estimation(mem_tracker: Option, estimated_bytes: usize) { + check_gpu_mem_estimation_with_context(mem_tracker, estimated_bytes, None); +} + +pub fn check_gpu_mem_estimation_with_context( + mem_tracker: Option, + estimated_bytes: usize, + context: Option<&str>, +) { // `mem_tracker will` be Some only in sequential mode with mem tracking enabled, so if it's None, do nothing if let Some(mem_tracker) = mem_tracker { const ONE_MB: usize = 1024 * 1024; let label = mem_tracker.name(); + let label = context + .filter(|context| !context.is_empty()) + .map(|context| format!("{label}[{context}]")) + .unwrap_or_else(|| label.to_string()); let mem_stats = mem_tracker.finish(); let actual_bytes = mem_stats.mem_occupancy as usize; let diff = estimated_bytes as isize - actual_bytes as isize; @@ -424,6 +436,17 @@ pub fn main_witness_output_rows, input: &ProofInput<'_, GpuBackend>, ) -> usize { + if composed_cs + .gkr_circuit + .as_ref() + .and_then(|circuit| circuit.layers.last()) + .is_some_and(|input_layer| input_layer.in_eval_expr.is_empty()) + { + if let Some(structural_mle) = input.structural_witness.first() { + return structural_mle.evaluations_len(); + } + } + input .witness .first() @@ -547,7 +570,17 @@ fn estimate_tower_stage_components( } mod util; pub(crate) use memory::{ - check_gpu_mem_estimation, estimate_chip_proof_memory, estimate_main_witness_bytes, - estimate_replay_materialization_bytes_for_plan, estimate_tower_bytes, - estimate_tower_stage_bytes, init_gpu_mem_tracker, + check_gpu_mem_estimation, check_gpu_mem_estimation_with_context, 
estimate_chip_proof_memory, + estimate_main_witness_bytes, estimate_replay_materialization_bytes_for_plan, + estimate_tower_bytes, estimate_tower_stage_bytes, init_gpu_mem_tracker, }; use memory::{ estimate_ecc_quark_bytes_from_num_vars, estimate_main_constraints_bytes, @@ -1907,7 +1907,15 @@ impl> TowerProver(composed_cs, input); - check_gpu_mem_estimation(gpu_mem_tracker, estimated_bytes); + check_gpu_mem_estimation_with_context( + gpu_mem_tracker, + estimated_bytes, + composed_cs + .gkr_circuit + .as_ref() + .and_then(|circuit| circuit.layers.first()) + .map(|layer| layer.name.as_str()), + ); res } @@ -1956,7 +1964,15 @@ impl> MainSumcheckProver(composed_cs, input); - check_gpu_mem_estimation(gpu_mem_tracker, estimated_bytes); + check_gpu_mem_estimation_with_context( + gpu_mem_tracker, + estimated_bytes, + composed_cs + .gkr_circuit + .as_ref() + .and_then(|circuit| circuit.layers.first()) + .map(|layer| layer.name.as_str()), + ); res } @@ -1993,7 +2009,15 @@ impl> EccQuarkProver> estimated_replay_bytes as f64 / (1024.0 * 1024.0), ); task.input.witness = if let Some(trace_idx) = task.witness_trace_idx { - check_gpu_mem_estimation(gpu_mem_tracker, 0); + check_gpu_mem_estimation_with_context( + gpu_mem_tracker, + 0, + Some(task.circuit_name.as_str()), + ); info_span!("[ceno] extract_witness_mles").in_scope(|| { extract_witness_mles_for_trace::( pcs_data, @@ -2210,7 +2238,11 @@ impl> }) } else { let witness_rmm = replay_plan.replay_witness().expect("GPU raw replay failed"); - check_gpu_mem_estimation(gpu_mem_tracker, estimated_replay_bytes); + check_gpu_mem_estimation_with_context( + gpu_mem_tracker, + estimated_replay_bytes, + Some(task.circuit_name.as_str()), + ); info_span!("[ceno] replay_gpu_witness_from_raw") .in_scope(|| extract_witness_mles_for_trace_rmm::(witness_rmm)) }; diff --git a/ceno_zkvm/src/scheme/prover.rs b/ceno_zkvm/src/scheme/prover.rs index 45f786c6f..651810b30 100644 --- a/ceno_zkvm/src/scheme/prover.rs +++ b/ceno_zkvm/src/scheme/prover.rs @@ 
-1117,12 +1117,11 @@ where scheme::{ constants::NUM_FANIN, gpu::{ - build_tower_witness_gpu, check_gpu_mem_estimation, - estimate_replay_materialization_bytes_for_plan, estimate_tower_stage_bytes, - extract_out_evals_from_gpu_towers, extract_witness_mles_for_trace, - log_gpu_device_state, log_gpu_pool_usage, prove_ec_sum_quark_impl, - prove_main_constraints_impl, prove_rotation_impl, prove_tower_relation_impl, - transport_structural_witness_to_gpu, + build_tower_witness_gpu, estimate_replay_materialization_bytes_for_plan, + estimate_tower_stage_bytes, extract_out_evals_from_gpu_towers, + extract_witness_mles_for_trace, log_gpu_device_state, log_gpu_pool_usage, + prove_ec_sum_quark_impl, prove_main_constraints_impl, prove_rotation_impl, + prove_tower_relation_impl, transport_structural_witness_to_gpu, }, }, }; @@ -1164,7 +1163,11 @@ where log_gpu_device_state(&format!("{name}:before_replay")); log_gpu_pool_usage(&format!("{name}:before_replay")); let witness_rmm = replay_plan.replay_witness()?; - check_gpu_mem_estimation(gpu_mem_tracker, estimated_replay_bytes); + crate::scheme::gpu::check_gpu_mem_estimation_with_context( + gpu_mem_tracker, + estimated_replay_bytes, + Some(name), + ); input.witness = info_span!("[ceno] replay_gpu_witness_from_raw") .in_scope(|| crate::scheme::gpu::extract_witness_mles_for_trace_rmm::(witness_rmm)); if let Some(structural_rmm_cached) = structural_rmm.as_ref() { @@ -1278,7 +1281,11 @@ where extract_out_evals_from_gpu_towers(&prod_gpu, &logup_gpu, r_set_len); Ok::<_, ZKVMError>((prod_gpu, logup_gpu, lk_out_evals, w_out_evals, r_out_evals)) })?; - check_gpu_mem_estimation(tower_build_mem_tracker, tower_build_estimated_bytes); + crate::scheme::gpu::check_gpu_mem_estimation_with_context( + tower_build_mem_tracker, + tower_build_estimated_bytes, + Some(name), + ); log_gpu_device_state(&format!("{name}:after_build_tower_witness")); log_gpu_pool_usage(&format!("{name}:after_build_tower_witness")); @@ -1319,7 +1326,11 @@ where 
log_gpu_pool_usage(&format!("{name}:after_prove_tower")); let rt_tower: Point = unsafe { std::mem::transmute(rt_tower_gl) }; let tower_proof: TowerProofs = unsafe { std::mem::transmute(tower_proof_gpu) }; - check_gpu_mem_estimation(tower_prove_mem_tracker, tower_prove_estimated_bytes); + crate::scheme::gpu::check_gpu_mem_estimation_with_context( + tower_prove_mem_tracker, + tower_prove_estimated_bytes, + Some(name), + ); drop(records); drop(tower_input); log_gpu_device_state(&format!("{name}:after_drop_tower")); diff --git a/ceno_zkvm/src/scheme/utils.rs b/ceno_zkvm/src/scheme/utils.rs index bc53168db..4921d7f8c 100644 --- a/ceno_zkvm/src/scheme/utils.rs +++ b/ceno_zkvm/src/scheme/utils.rs @@ -707,14 +707,28 @@ pub fn build_main_witness< // GPU memory check: validate estimation against actual usage #[cfg(feature = "gpu")] { - let output_rows = input - .witness - .first() - .map(|mle| mle.evaluations_len()) - .unwrap_or_else(|| input.num_instances() << composed_cs.rotation_vars().unwrap_or(0)); + let input_layer_has_only_structural_inputs = composed_cs + .gkr_circuit + .as_ref() + .and_then(|circuit| circuit.layers.last()) + .is_some_and(|input_layer| input_layer.in_eval_expr.is_empty()); + let output_rows = if input_layer_has_only_structural_inputs { + input + .structural_witness + .first() + .map(|mle| mle.evaluations_len()) + } else { + None + } + .or_else(|| input.witness.first().map(|mle| mle.evaluations_len())) + .unwrap_or_else(|| input.num_instances() << composed_cs.rotation_vars().unwrap_or(0)); let estimated_bytes = crate::scheme::gpu::estimate_main_witness_bytes(composed_cs, output_rows); - crate::scheme::gpu::check_gpu_mem_estimation(gpu_mem_tracker, estimated_bytes); + crate::scheme::gpu::check_gpu_mem_estimation_with_context( + gpu_mem_tracker, + estimated_bytes, + gkr_circuit.layers.first().map(|layer| layer.name.as_str()), + ); } gkr_circuit_out.0.0 diff --git a/summary.md b/summary.md index a062bb59f..3b6979e24 100644 --- a/summary.md +++ 
b/summary.md @@ -286,3 +286,298 @@ git -C ../ceno-gpu diff --check ``` Result - Both passed. + +## Restart state: benchmark memcheck under-estimate follow-up + +Date: 2026-04-26 + +Current task +- User reported a remaining GPU memory under-estimate when running the top-entry repo `/home/wusm/rust/ceno-reth-benchmark` against the local `/home/wusm/rust/ceno` repo. +- The benchmark command must use `--rpc-url "$CENO_RPC"`; do not paste or persist concrete RPC URLs in logs or docs. +- In the current shell, `CENO_RPC` was not set, so the benchmark repro could not be completed before restart. + +Current repo state +- Main repo: `/home/wusm/rust/ceno` +- Current branch includes commit: + - `5ecce046 fix mem estimator` +- Important existing fix already present: + - `ceno_zkvm/src/scheme/gpu/memory.rs` now estimates `build_main_witness` by materialized GKR outputs, not only final tower outputs. + - This fixed the earlier `Ecall_Keccak` under-estimate where old estimate was around `11.73MB` and actual was `16.00MB`. +- Local root `Cargo.toml` and `Cargo.lock` are dirty from pre-existing dependency/local-path work; do not accidentally revert them unless explicitly requested. + +New diagnostic patch added before restart +- Added contextual labels to GPU memcheck output so future failures identify both stage and circuit. +- Files touched: + - `ceno_zkvm/src/scheme/gpu/memory.rs` + - added `check_gpu_mem_estimation_with_context(...)` + - labels now print like `build_main_witness[Ecall_Keccak]` + - `ceno_zkvm/src/scheme/utils.rs` + - `build_main_witness` memcheck now includes first GKR layer name + - `ceno_zkvm/src/scheme/prover.rs` + - replay/build-tower/prove-tower memchecks now include circuit name in sequential GPU proving path + - `ceno_zkvm/src/scheme/gpu/mod.rs` + - prover trait memchecks now include first GKR layer name or task circuit name where available +- This patch is diagnostic/safety oriented; it does not change memory estimates. 
+ +Validation already run after diagnostic patch +```bash +cargo fmt --check +``` +Result +- Passed. + +```bash +timeout 300s cargo check --features gpu --package ceno_zkvm --bin e2e +``` +Result +- Passed. + +Lightweight memcheck e2e command run after diagnostic patch +```bash +/usr/bin/time -f 'elapsed %E' timeout 900s env CENO_GPU_MEM_TRACKING=1 CENO_CONCURRENT_CHIP_PROVING=0 CENO_GPU_ENABLE_WITGEN=1 cargo run --config net.git-fetch-with-cli=true --release --package ceno_zkvm --features gpu --bin e2e -- --platform=ceno --max-cycle-per-shard=1600 examples/target/riscv32im-ceno-zkvm-elf/release/examples/keccak_syscall 2>&1 | tee /tmp/ceno-light-keccak-context-memcheck.log +``` +Result +- Memcheck stages passed; no under-estimate panic. +- The run still fails later at the known verifier assertion in `gkr_iop/src/gkr/layer/zerocheck_layer.rs:306`. +- Useful log examples: + - `replay_gpu_witness_from_raw[Ecall_Keccak]: estimated=11.23MB, actual=11.23MB` + - `build_main_witness[Ecall_Keccak]: estimated=32.41MB, actual=32.59MB` + - `build_tower_witness_gpu[Ecall_Keccak]: estimated=105.83MB, actual=106.01MB` + - `prove_tower_relation_gpu[Ecall_Keccak]: estimated=36.84MB, actual=37.26MB` + - `replay_gpu_witness_from_raw[ShardRamCircuit]: estimated=0.38MB, actual=0.38MB` + - `build_main_witness[ShardRamCircuit_main]: estimated=0.01MB, actual=0.01MB` + - `build_tower_witness_gpu[ShardRamCircuit]: estimated=0.01MB, actual=0.02MB` + +Important conclusion so far +- Lightweight Ceno `keccak_syscall` no longer reproduces the reported memcheck under-estimate. +- The remaining issue appears large-payload/top-entry specific and needs the benchmark repro with `CENO_RPC` exported. +- Because contextual memcheck labels are now in place, the next benchmark run should immediately identify the failing stage and circuit. 
+ +Required environment for next session +```bash +export CENO_RPC='' +``` +- The assistant cannot see shell variables unless they are present in the execution environment. +- Verify with: +```bash +if [ -n "${CENO_RPC:-}" ]; then echo 'CENO_RPC is set'; else echo 'CENO_RPC is NOT set'; fi +``` + +Benchmark repro command to run next +- Workdir: `/home/wusm/rust/ceno-reth-benchmark` +- Use timeout and tee log. +- Keep `--rpc-url "$CENO_RPC"` exactly; do not expand into a persisted command string. + +```bash +/usr/bin/time -f 'elapsed %E' timeout 2400s env \ + CENO_GPU_MEM_TRACKING=1 \ + CENO_CONCURRENT_CHIP_PROVING=0 \ + CENO_GPU_ENABLE_WITGEN=1 \ + RUST_MIN_STACK=16777216 \ + RUST_BACKTRACE=1 \ + CYCLE_TRACKER_MAX_DEPTH=4 \ + OUTPUT_PATH=metrics.json \ + CENO_GPU_CACHE_LEVEL=0 \ + RUSTFLAGS='-C target-feature=+avx2' \ + JEMALLOC_SYS_WITH_MALLOC_CONF='retain:true,metadata_thp:always,thp:always,dirty_decay_ms:-1,muzzy_decay_ms:-1' \ + RUST_LOG=debug \ + cargo run --features jemalloc --features metrics --features perf-metrics --features gpu --bin ceno-reth-benchmark-bin -- \ + --block-number 23587691 \ + --rpc-url "$CENO_RPC" \ + --cache-dir block_data \ + --mode prove-app \ + --app-proofs ./app_proof.bitcode \ + --shard-id 0 \ + --chain-id 1 \ + 2>&1 | tee /tmp/ceno-reth-benchmark-memcheck.log +``` + +After benchmark fails or completes +1. Extract memcheck failure context: +```bash +rg -n "under-estimate|over-estimate|\\[memcheck\\].*diff=-" /tmp/ceno-reth-benchmark-memcheck.log | tail -120 +``` +2. The failing line should now include a label like: + - `build_main_witness[]` + - `build_tower_witness_gpu[]` + - `prove_tower_relation_gpu[]` + - `replay_gpu_witness_from_raw[]` +3. Patch only the relevant estimator in `/home/wusm/rust/ceno`. +4. Re-run lightweight Ceno check first: +```bash +cargo fmt --check +timeout 300s cargo check --features gpu --package ceno_zkvm --bin e2e +``` +5. Then rerun the benchmark command above. 
+ +Security hygiene +- If a concrete RPC URL accidentally appears in any local log, scrub it immediately: +```bash +for f in /tmp/ceno-reth-benchmark-memcheck.log /home/wusm/rust/ceno-reth-benchmark/*.txt /home/wusm/rust/ceno-reth-benchmark/*.log; do + [ -f "$f" ] || continue + perl -0pi -e 's#https://eth-mainnet\\.g\\.alchemy\\.com/v2/[^\\s\\x27\\"]+#\\$CENO_RPC#g' "$f" +done +``` +- Verify no RPC string remains: +```bash +rg -n 'alchemy|eth-mainnet\.g\.alchemy' /tmp/ceno-reth-benchmark-memcheck.log /home/wusm/rust/ceno-reth-benchmark -g '*.txt' -g '*.log' -g '*.md' -g '*.json' 2>/dev/null || true +``` + +## Architecture refresher: compact GPU witness / memory-estimator terminology + +This section is intended for a fresh session before touching estimators or compact witness code. + +Core terminology +- `occupied rows` / `actual rows`: + - Real number of rows with data for a chip or replay plan. + - Usually `input.num_instances() << rotation_vars` for normal chip inputs. + - For replayed GPU witgen, prefer replay-plan-specific real rows when available. +- `logical domain` / `full domain`: + - Power-of-two domain implied by `num_vars`. + - Some protocols/verifier semantics still reason over this domain. + - Prover storage should avoid allocating it when compact storage is sufficient. +- `compact witness`: + - Device/host storage sized by occupied rows, not full logical domain. + - This is the intended design for the GPU witgen/prover path. +- `materialized output`: + - GKR layer output MLE that is actually allocated during `build_main_witness`. + - `EvalExpression::Single` and `EvalExpression::Linear` materialize; `Zero` does not. +- `final/output GKR layer`: + - `gkr_circuit.layers[0]` because circuit layers are ordered output-to-input. + - The `output_mask` is applied only to this final/output layer during tower witness build. +- `internal GKR layers`: + - Any layer after index 0 in `gkr_circuit.layers`. 
+ - These do not receive the final tower `output_mask`; all non-zero outputs are materialized. +- `replay path`: + - GPU witgen can replay raw records into device-backed witness matrices just-in-time. + - Large replay-heavy chips currently include `Ecall_Keccak` and `ShardRamCircuit`. +- `stage split`: + - Large replay chips materialize witness multiple times for separate stages to reduce peak VRAM. + - Estimator must model stage-local peaks, not sum all stages as simultaneously live. + +Important module map +- `ceno_zkvm/src/scheme/gpu/memory.rs` + - Central GPU memory estimator and memcheck assertion logic. + - Key functions: + - `estimate_chip_proof_memory` + - `estimate_trace_bytes` + - `estimate_main_witness_bytes` + - `estimate_tower_stage_components` + - `estimate_main_constraints_bytes` + - `estimate_replay_materialization_bytes_for_plan` + - `check_gpu_mem_estimation_with_context` +- `ceno_zkvm/src/scheme/utils.rs` + - Builds main GKR witness through `build_main_witness` / `gkr_witness`. + - Owns output materialization mask logic: + - `tower_output_count` + - `build_output_materialization_mask` + - `first_layer_output_group_stage_masks` + - Critical design point: + - `output_mask` is applied only to final/output GKR layer. +- `ceno_zkvm/src/scheme/prover.rs` + - Sequential per-chip GPU proving flow and replay stage splitting. + - Important stages: + - replay raw GPU witness + - build main witness + - build tower witness + - prove tower + - replay again for ECC/main constraints if needed +- `ceno_zkvm/src/scheme/gpu/mod.rs` + - GPU prover trait implementations and shared helpers. + - Includes trait-level memchecks for tower/main/ecc/replay helper paths. +- `../ceno-gpu/cuda_hal/src/common/tower/*` + - GPU tower witness/proof host-side implementation. +- `../ceno-gpu/cpp/common/tower.cuh` and kernel files under `../ceno-gpu/cpp/*/kernels/tower.cu` + - CUDA tower kernels and compact split logic. 
+- `../ceno-gpu/cuda_hal/src/common/sumcheck/generic_v2.rs` + - Rust host-side V2 sumcheck setup. +- `../ceno-gpu/cpp/common/sumcheck/generic_v2.cuh` + - CUDA V2 sumcheck logic. + +Current compact witness design assumptions +- Whole flow target: + - commit + - tower prove + - main prove + - rotation prove + - ECC prove + - batch opening + - should operate on compact witness storage wherever prover-side full-domain padding is not semantically required. +- Round-0 original inputs: + - Use direct/native order over real occupied data. + - Do not invent tower-specific order separate from PCS. +- Later folded rounds: + - Use normal in-place/folded buffer semantics. + - Do not call this a replay buffer; call it `in-place buffer`. +- Compact even/odd tails: + - Avoid branch-per-element loops for odd real lengths. + - Decide odd/even outside the loop and process the leftover tail separately. +- Cloning policy: + - Avoid full `clone` / `to_vec` on large witness buffers unless it is intentionally debug-only. + +Main witness memory-estimator design +- Old broken model: + - `tower_output_count(composed_cs) * rows * sizeof(BB31Ext)`. + - This only counts final tower outputs. +- Correct current model: + - Count final/output layer materialized tower outputs under the output mask. + - Plus count all internal layer non-zero outputs because internal layers are not masked. + - Multiply by real output rows, normally `input.witness.first().evaluations_len()`. +- Why this matters: + - Multi-layer GKR circuits like `Ecall_Keccak` materialize internal outputs during `build_main_witness`. + - Single-layer circuits like `ShardRamCircuit_main` usually do not have the same missing-internal-output issue. + +Replay / trace estimator design +- Normal non-replay path: + - Extracted witness and structural MLEs can stay resident across chip proof. + - Stage peak is resident trace plus max temporary stage. 
+- Replay-heavy path (`Ecall_Keccak`, `ShardRamCircuit`): + - Estimate replay materialization from replay plan real rows, not full logical domain. + - Replay witness is materialized for tower stages, then cleared before tower prove/main stages as designed. + - Estimator should use max of replay/build/prove/ecc/main stage peaks plus safety margin. +- Structural witness caveat: + - If structural RMM already has device backing, transport may be view-only and estimate zero new bytes. + - If not device-backed, estimate structural upload by real rows when possible. + +Tower estimator design +- Build stage estimate includes: + - CUDA tower build temporary allocations from `estimate_build_tower_memory`. + - Compact product split buffers. + - Compact logup split buffers. +- Prove stage estimate separates: + - live tower input buffers + - local create-proof temporary allocations +- For logup: + - If table lookup has numerator, numerator buffers are real compact buffers. + - If no numerator, ones/default numerator should not allocate a full domain buffer. + +Scheduler / memcheck relationship +- Sequential + `CENO_GPU_MEM_TRACKING=1`: + - Runs memcheck assertions stage-by-stage. + - This is the best mode for estimator debugging. +- Concurrent + mem tracking disabled: + - Uses estimator for booking/scheduling VRAM, not direct memcheck assertions. +- Booking can include extra safety margin for replay-heavy chips in concurrent mode. +- A stage-local memcheck pass does not automatically prove concurrent booking is optimal, but it strongly validates the per-stage estimator. + +Current known caveats +- Lightweight `keccak_syscall` memchecks pass after current estimator fixes. +- The lightweight run still hits a known verifier assertion later at: + - `gkr_iop/src/gkr/layer/zerocheck_layer.rs:306` +- The remaining reported under-estimate is only known from the top-entry benchmark payload and must be reproduced with `CENO_RPC` exported. 
+- Do not guess the failing estimator from the old generic label; use the new contextual memcheck label first. + +Recommended investigation discipline +1. Reproduce with sequential mem tracking first: + - `CENO_GPU_MEM_TRACKING=1` + - `CENO_CONCURRENT_CHIP_PROVING=0` +2. Read the exact contextual label: + - `build_main_witness[...]` + - `build_tower_witness_gpu[...]` + - `prove_tower_relation_gpu[...]` + - `replay_gpu_witness_from_raw[...]` +3. Patch only the estimator for that stage/circuit class. +4. Validate in `/home/wusm/rust/ceno` first: + - `cargo fmt --check` + - `timeout 300s cargo check --features gpu --package ceno_zkvm --bin e2e` +5. Then rerun the top-entry benchmark. From df88decc60fff7deac779c8b5318b2d821cfa1fe Mon Sep 17 00:00:00 2001 From: "sm.wu" Date: Mon, 27 Apr 2026 11:08:43 +0800 Subject: [PATCH 08/25] rollback Cargo.toml, Cargo.lock change --- Cargo.lock | 124 ++++++----------------------------------------------- Cargo.toml | 62 +++++++++++++-------------- 2 files changed, 45 insertions(+), 141 deletions(-) diff --git a/Cargo.lock b/Cargo.lock index ba90fc0e6..04bcabaf7 100644 --- a/Cargo.lock +++ b/Cargo.lock @@ -1600,49 +1600,10 @@ version = "0.0.6" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "e2931af7e13dc045d8e9d26afccc6fa115d64e115c9c84b1166288b46f6782c2" -[[package]] -name = "cuda-config" -version = "0.1.0" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "4ee74643f7430213a1a78320f88649de309b20b80818325575e393f848f79f5d" -dependencies = [ - "glob", -] - -[[package]] -name = "cuda-runtime-sys" -version = "0.3.0-alpha.1" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "5d070b301187fee3c611e75a425cf12247b7c75c09729dbdef95cb9cb64e8c39" -dependencies = [ - "cuda-config", -] - [[package]] name = "cuda_hal" version = "0.1.0" -dependencies = [ - "anyhow", - "cuda-runtime-sys", - "cudarc", - "downcast-rs", - "either", - "ff_ext", - "itertools 0.13.0", - 
"mpcs", - "multilinear_extensions", - "p3", - "rand 0.8.5", - "rayon", - "sha2", - "sppark", - "sppark_plug", - "sumcheck", - "thiserror 1.0.69", - "tracing", - "transcript", - "witness", -] +source = "git+https://github.com/scroll-tech/ceno-gpu-mock.git?branch=main#fe8f7923b7d3a3823c27949fab0aab8e31011aa9" [[package]] name = "cudarc" @@ -2276,6 +2237,7 @@ dependencies = [ [[package]] name = "ff_ext" version = "0.1.0" +source = "git+https://github.com/scroll-tech/gkr-backend.git?branch=feat%2Fmle_no_padding#1fc9f700b54dfb63415e3d4115d778fc10ad9131" dependencies = [ "once_cell", "p3", @@ -2709,15 +2671,6 @@ dependencies = [ "digest 0.10.7", ] -[[package]] -name = "home" -version = "0.5.12" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "cc627f471c528ff0c4a49e1d5e60450c8f6461dd6d10ba9dcd3a61d3dff7728d" -dependencies = [ - "windows-sys 0.61.1", -] - [[package]] name = "iana-time-zone" version = "0.1.64" @@ -3149,12 +3102,6 @@ dependencies = [ "vcpkg", ] -[[package]] -name = "linux-raw-sys" -version = "0.4.15" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "d26c52dbd32dccf2d10cac7725f8eae5296885fb5703b261f7d0a0739ec807ab" - [[package]] name = "linux-raw-sys" version = "0.9.4" @@ -3296,6 +3243,7 @@ dependencies = [ [[package]] name = "mpcs" version = "0.1.0" +source = "git+https://github.com/scroll-tech/gkr-backend.git?branch=feat%2Fmle_no_padding#1fc9f700b54dfb63415e3d4115d778fc10ad9131" dependencies = [ "bincode 1.3.3", "clap", @@ -3319,6 +3267,7 @@ dependencies = [ [[package]] name = "multilinear_extensions" version = "0.1.0" +source = "git+https://github.com/scroll-tech/gkr-backend.git?branch=feat%2Fmle_no_padding#1fc9f700b54dfb63415e3d4115d778fc10ad9131" dependencies = [ "either", "ff_ext", @@ -4609,6 +4558,7 @@ dependencies = [ [[package]] name = "p3" version = "0.1.0" +source = "git+https://github.com/scroll-tech/gkr-backend.git?branch=feat%2Fmle_no_padding#1fc9f700b54dfb63415e3d4115d778fc10ad9131" 
dependencies = [ "p3-air", "p3-baby-bear", @@ -5176,6 +5126,7 @@ dependencies = [ [[package]] name = "poseidon" version = "0.1.0" +source = "git+https://github.com/scroll-tech/gkr-backend.git?branch=feat%2Fmle_no_padding#1fc9f700b54dfb63415e3d4115d778fc10ad9131" dependencies = [ "ff_ext", "p3", @@ -5773,19 +5724,6 @@ dependencies = [ "semver 1.0.26", ] -[[package]] -name = "rustix" -version = "0.38.44" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "fdb5bc1ae2baa591800df16c9ca78619bf65c0488b41b96ccec5d11220d8c154" -dependencies = [ - "bitflags", - "errno", - "libc", - "linux-raw-sys 0.4.15", - "windows-sys 0.59.0", -] - [[package]] name = "rustix" version = "1.0.7" @@ -5795,7 +5733,7 @@ dependencies = [ "bitflags", "errno", "libc", - "linux-raw-sys 0.9.4", + "linux-raw-sys", "windows-sys 0.59.0", ] @@ -6145,6 +6083,7 @@ dependencies = [ [[package]] name = "sp1-curves" version = "0.1.0" +source = "git+https://github.com/scroll-tech/gkr-backend.git?branch=feat%2Fmle_no_padding#1fc9f700b54dfb63415e3d4115d778fc10ad9131" dependencies = [ "cfg-if", "dashu", @@ -6179,25 +6118,6 @@ dependencies = [ "der", ] -[[package]] -name = "sppark" -version = "0.1.11" -dependencies = [ - "cc", - "which", -] - -[[package]] -name = "sppark_plug" -version = "0.1.0" -dependencies = [ - "cc", - "ff_ext", - "itertools 0.13.0", - "p3", - "sppark", -] - [[package]] name = "stable_deref_trait" version = "1.2.0" @@ -6288,6 +6208,7 @@ checksum = "13c2bddecc57b384dee18652358fb23172facb8a2c51ccc10d74c157bdea3292" [[package]] name = "sumcheck" version = "0.1.0" +source = "git+https://github.com/scroll-tech/gkr-backend.git?branch=feat%2Fmle_no_padding#1fc9f700b54dfb63415e3d4115d778fc10ad9131" dependencies = [ "either", "ff_ext", @@ -6305,6 +6226,7 @@ dependencies = [ [[package]] name = "sumcheck_macro" version = "0.1.0" +source = "git+https://github.com/scroll-tech/gkr-backend.git?branch=feat%2Fmle_no_padding#1fc9f700b54dfb63415e3d4115d778fc10ad9131" dependencies = [ 
"itertools 0.13.0", "p3", @@ -6385,7 +6307,7 @@ dependencies = [ "fastrand", "getrandom 0.3.2", "once_cell", - "rustix 1.0.7", + "rustix", "windows-sys 0.59.0", ] @@ -6711,6 +6633,7 @@ dependencies = [ [[package]] name = "transcript" version = "0.1.0" +source = "git+https://github.com/scroll-tech/gkr-backend.git?branch=feat%2Fmle_no_padding#1fc9f700b54dfb63415e3d4115d778fc10ad9131" dependencies = [ "ff_ext", "itertools 0.13.0", @@ -7001,21 +6924,10 @@ dependencies = [ "wasm-bindgen", ] -[[package]] -name = "which" -version = "4.4.2" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "87ba24419a2078cd2b0f2ede2691b6c66d8e47836da3b6db8265ebad47afbfc7" -dependencies = [ - "either", - "home", - "once_cell", - "rustix 0.38.44", -] - [[package]] name = "whir" version = "0.1.0" +source = "git+https://github.com/scroll-tech/gkr-backend.git?branch=feat%2Fmle_no_padding#1fc9f700b54dfb63415e3d4115d778fc10ad9131" dependencies = [ "bincode 1.3.3", "clap", @@ -7143,15 +7055,6 @@ dependencies = [ "windows-targets 0.53.4", ] -[[package]] -name = "windows-sys" -version = "0.61.1" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "6f109e41dd4a3c848907eb83d5a42ea98b3769495597450cf6d153507b166f0f" -dependencies = [ - "windows-link", -] - [[package]] name = "windows-targets" version = "0.52.6" @@ -7311,6 +7214,7 @@ dependencies = [ [[package]] name = "witness" version = "0.1.0" +source = "git+https://github.com/scroll-tech/gkr-backend.git?branch=feat%2Fmle_no_padding#1fc9f700b54dfb63415e3d4115d778fc10ad9131" dependencies = [ "ff_ext", "multilinear_extensions", diff --git a/Cargo.toml b/Cargo.toml index fbbbab29a..59a7e8653 100644 --- a/Cargo.toml +++ b/Cargo.toml @@ -1,16 +1,16 @@ [workspace] members = [ - "ceno_cli", - "ceno_emul", - "ceno_host", - "ceno_serde", - "ceno_rt", - "ceno_zkvm", - "ceno_recursion", - "derive", - "examples-builder", - "examples", - "guest_libs/*", + "ceno_cli", + "ceno_emul", + "ceno_host", + 
"ceno_serde", + "ceno_rt", + "ceno_zkvm", + "ceno_recursion", + "derive", + "examples-builder", + "examples", + "guest_libs/*", ] resolver = "2" @@ -66,11 +66,11 @@ secp = "0.4.1" serde = { version = "1.0", features = ["derive", "rc"] } serde_json = "1.0" smallvec = { version = "1.13.2", features = [ - "const_generics", - "const_new", - "serde", - "union", - "write", + "const_generics", + "const_new", + "serde", + "union", + "write", ] } strum = "0.26" strum_macros = "0.26" @@ -79,7 +79,7 @@ thiserror = "2" thread_local = "1.1" tiny-keccak = { version = "2.0.2", features = ["keccak"] } tracing = { version = "0.1", features = [ - "attributes", + "attributes", ] } tracing-forest = { version = "0.1.6" } tracing-subscriber = { version = "0.3", features = ["env-filter"] } @@ -127,20 +127,20 @@ lto = "thin" #ceno_crypto_primitives = { path = "../ceno-patch/crypto-primitives", package = "ceno_crypto_primitives" } #ceno_syscall = { path = "../ceno-patch/syscall", package = "ceno_syscall" } -[patch."https://github.com/scroll-tech/ceno-gpu-mock.git"] -ceno_gpu = { path = "../ceno-gpu/cuda_hal", package = "cuda_hal", default-features = false, features = ["bb31"] } - -[patch."https://github.com/scroll-tech/gkr-backend"] -ff_ext = { path = "../gkr-backend/crates/ff_ext", package = "ff_ext" } -mpcs = { path = "../gkr-backend/crates/mpcs", package = "mpcs" } -multilinear_extensions = { path = "../gkr-backend/crates/multilinear_extensions", package = "multilinear_extensions" } -p3 = { path = "../gkr-backend/crates/p3", package = "p3" } -poseidon = { path = "../gkr-backend/crates/poseidon", package = "poseidon" } -sp1-curves = { path = "../gkr-backend/crates/curves", package = "sp1-curves" } -sumcheck = { path = "../gkr-backend/crates/sumcheck", package = "sumcheck" } -transcript = { path = "../gkr-backend/crates/transcript", package = "transcript" } -whir = { path = "../gkr-backend/crates/whir", package = "whir" } -witness = { path = "../gkr-backend/crates/witness", package = 
"witness" } +#[patch."https://github.com/scroll-tech/ceno-gpu-mock.git"] +#ceno_gpu = { path = "../ceno-gpu/cuda_hal", package = "cuda_hal", default-features = false, features = ["bb31"] } +# +#[patch."https://github.com/scroll-tech/gkr-backend"] +#ff_ext = { path = "../gkr-backend/crates/ff_ext", package = "ff_ext" } +#mpcs = { path = "../gkr-backend/crates/mpcs", package = "mpcs" } +#multilinear_extensions = { path = "../gkr-backend/crates/multilinear_extensions", package = "multilinear_extensions" } +#p3 = { path = "../gkr-backend/crates/p3", package = "p3" } +#poseidon = { path = "../gkr-backend/crates/poseidon", package = "poseidon" } +#sp1-curves = { path = "../gkr-backend/crates/curves", package = "sp1-curves" } +#sumcheck = { path = "../gkr-backend/crates/sumcheck", package = "sumcheck" } +#transcript = { path = "../gkr-backend/crates/transcript", package = "transcript" } +#whir = { path = "../gkr-backend/crates/whir", package = "whir" } +#witness = { path = "../gkr-backend/crates/witness", package = "witness" } # [patch."https://github.com/scroll-tech/openvm.git"] # openvm = { path = "../openvm-scroll-tech/crates/toolchain/openvm", default-features = false } From b57b6928000e82256b43477e04766dc48787db86 Mon Sep 17 00:00:00 2001 From: "sm.wu" Date: Mon, 27 Apr 2026 13:43:18 +0800 Subject: [PATCH 09/25] fix memory estimation --- ceno_zkvm/src/scheme/gpu/memory.rs | 11 +++++++---- ceno_zkvm/src/scheme/gpu/mod.rs | 13 +++++++++++-- 2 files changed, 18 insertions(+), 6 deletions(-) diff --git a/ceno_zkvm/src/scheme/gpu/memory.rs b/ceno_zkvm/src/scheme/gpu/memory.rs index 7ef24d36d..3baf10bc5 100644 --- a/ceno_zkvm/src/scheme/gpu/memory.rs +++ b/ceno_zkvm/src/scheme/gpu/memory.rs @@ -374,6 +374,7 @@ pub(crate) fn estimate_trace_bytes (usize, usize) { let base_elem_size = std::mem::size_of::(); let mle_len = 1usize << num_vars; - let poly_bytes = num_witin * mle_len * base_elem_size; + let compact_poly_bytes = num_witin * occupied_rows * base_elem_size; + let 
logical_poly_bytes = num_witin * mle_len * base_elem_size; if should_materialize_witness_on_gpu() { if should_retain_witness_device_backing_after_commit() { @@ -660,19 +663,19 @@ pub(crate) fn estimate_trace_extraction_bytes( // duration of the chip proof. There is no separate extraction temp // buffer, but the replayed witness itself must be accounted for as // resident task memory. - return (poly_bytes, 0); + return (compact_poly_bytes, 0); } // GPU witgen alone does not imply replayability. Non-replayable traces // still go through basefold::get_trace in cache-none mode, which // allocates the extracted witness plus a temporary 2x transpose buffer. - return (poly_bytes, 2 * poly_bytes); + return (compact_poly_bytes, 2 * logical_poly_bytes); } if matches!(get_gpu_cache_level(), CacheLevel::None) { // Default cache level is None // get_trace allocates poly copies (resident) + temp_buffer (2x, freed after) - (poly_bytes, 2 * poly_bytes) + (compact_poly_bytes, 2 * logical_poly_bytes) } else { (0, 0) } diff --git a/ceno_zkvm/src/scheme/gpu/mod.rs b/ceno_zkvm/src/scheme/gpu/mod.rs index 445a8dd98..bc81a20d0 100644 --- a/ceno_zkvm/src/scheme/gpu/mod.rs +++ b/ceno_zkvm/src/scheme/gpu/mod.rs @@ -1339,9 +1339,13 @@ impl> TraceCommitter>> = poly_group From c50b793cc702c8122a4cc99b9d7cb5458514a705 Mon Sep 17 00:00:00 2001 From: "sm.wu" Date: Mon, 27 Apr 2026 15:24:44 +0800 Subject: [PATCH 10/25] verifier log --- ceno_zkvm/src/scheme/verifier.rs | 7 +++++++ 1 file changed, 7 insertions(+) diff --git a/ceno_zkvm/src/scheme/verifier.rs b/ceno_zkvm/src/scheme/verifier.rs index 351e1d402..e45cae99d 100644 --- a/ceno_zkvm/src/scheme/verifier.rs +++ b/ceno_zkvm/src/scheme/verifier.rs @@ -340,6 +340,13 @@ impl> vm_proof: ZKVMProof, mut transcript: impl ForkableTranscript, ) -> Result, ZKVMError> { + tracing::info!( + "verifying shard proof: expected_shard_id={}, proof_shard_id={}, chip_groups={}", + shard_id, + vm_proof.public_values.shard_id, + vm_proof.chip_proofs.len() + ); + // 
main invariant between opcode circuits and table circuits let mut prod_r = E::ONE; let mut prod_w = E::ONE; From 89b86987b2d1598b4c4d591cf449b31700bf1d66 Mon Sep 17 00:00:00 2001 From: "sm.wu" Date: Mon, 27 Apr 2026 16:48:06 +0800 Subject: [PATCH 11/25] Pass tower input by value for GPU proving --- ceno_zkvm/src/scheme/gpu/memory.rs | 1 - ceno_zkvm/src/scheme/gpu/mod.rs | 2 +- ceno_zkvm/src/scheme/prover.rs | 3 +-- 3 files changed, 2 insertions(+), 4 deletions(-) diff --git a/ceno_zkvm/src/scheme/gpu/memory.rs b/ceno_zkvm/src/scheme/gpu/memory.rs index 3baf10bc5..69c39761e 100644 --- a/ceno_zkvm/src/scheme/gpu/memory.rs +++ b/ceno_zkvm/src/scheme/gpu/memory.rs @@ -148,7 +148,6 @@ pub fn estimate_chip_proof_memory Date: Mon, 27 Apr 2026 17:50:40 +0800 Subject: [PATCH 12/25] split tower layer by view --- ceno_zkvm/src/scheme/gpu/memory.rs | 40 ++---------------------------- ceno_zkvm/src/scheme/gpu/mod.rs | 12 +++------ 2 files changed, 5 insertions(+), 47 deletions(-) diff --git a/ceno_zkvm/src/scheme/gpu/memory.rs b/ceno_zkvm/src/scheme/gpu/memory.rs index 69c39761e..f0764c17f 100644 --- a/ceno_zkvm/src/scheme/gpu/memory.rs +++ b/ceno_zkvm/src/scheme/gpu/memory.rs @@ -1,7 +1,7 @@ use crate::{ instructions::gpu::dispatch::GpuWitgenKind, scheme::{ - constants::{NUM_FANIN, NUM_FANIN_LOGUP, SEPTIC_EXTENSION_DEGREE}, + constants::{NUM_FANIN, SEPTIC_EXTENSION_DEGREE}, hal::ProofInput, utils::tower_output_count, }, @@ -550,26 +550,6 @@ fn estimate_tower_stage_components 0 { - num_prod_towers - * compact_split_stored_elems(occupied_rows, 1 << (num_vars + 1), NUM_FANIN) - * elem_size - } else { - 0 - }; - let logup_split_bytes = if num_logup_towers > 0 { - let denominator_bytes = num_logup_towers - * compact_split_stored_elems(occupied_rows, 1 << (num_vars + 1), NUM_FANIN_LOGUP) - * elem_size; - let numerator_or_ones_bytes = if has_logup_numerator { - denominator_bytes - } else { - elem_size - }; - denominator_bytes + numerator_or_ones_bytes - } else { - 0 - }; let 
shard_ram_tower_batch_overhead = composed_cs .gkr_circuit .as_ref() @@ -577,10 +557,7 @@ fn estimate_tower_stage_components usize { - let chunk_size = logical_len / num_chunks; - (0..num_chunks) - .map(|chunk_idx| { - let chunk_start = chunk_idx * chunk_size; - occupied_len - .saturating_sub(chunk_start) - .min(chunk_size) - .max(1) - }) - .sum() -} - /// Estimate temporary GPU memory for the tower proving stage (build + prove). /// Used by prove_tower_relation to validate against actual mem_tracker measurements. pub(crate) fn estimate_tower_stage_bytes>( diff --git a/ceno_zkvm/src/scheme/gpu/mod.rs b/ceno_zkvm/src/scheme/gpu/mod.rs index 2113b3ed8..d2473faa6 100644 --- a/ceno_zkvm/src/scheme/gpu/mod.rs +++ b/ceno_zkvm/src/scheme/gpu/mod.rs @@ -1694,13 +1694,7 @@ pub(crate) fn build_tower_witness_gpu( .map(|wit| match wit.inner() { gkr_iop::gpu::GpuFieldType::Ext(poly) => cuda_hal .tower - .masked_mle_split_to_chunks( - &*cuda_hal, - poly, - NUM_FANIN, - BB31Ext::ONE, - stream.as_ref(), - ) + .masked_mle_view_chunks(&*cuda_hal, poly, NUM_FANIN, BB31Ext::ONE, stream.as_ref()) .map_err(|e| format!("Failed to split compact prod tower input: {e}")), _ => return Err("tower witness expects extension-field record MLEs".to_string()), }) @@ -1716,7 +1710,7 @@ pub(crate) fn build_tower_witness_gpu( .map(|wit| match wit.inner() { gkr_iop::gpu::GpuFieldType::Ext(poly) => cuda_hal .tower - .masked_mle_split_to_chunks( + .masked_mle_view_chunks( &*cuda_hal, poly, NUM_FANIN_LOGUP, @@ -1732,7 +1726,7 @@ pub(crate) fn build_tower_witness_gpu( .map(|wit| match wit.inner() { gkr_iop::gpu::GpuFieldType::Ext(poly) => cuda_hal .tower - .masked_mle_split_to_chunks( + .masked_mle_view_chunks( &*cuda_hal, poly, NUM_FANIN_LOGUP, From 99b7a94524ecf46f530fc2ae8b14d40b763fc069 Mon Sep 17 00:00:00 2001 From: "sm.wu" Date: Mon, 27 Apr 2026 21:10:27 +0800 Subject: [PATCH 13/25] Use dense tower build for compact GPU input --- ceno_zkvm/src/scheme/gpu/mod.rs | 86 +++++++++------------------------ 1 
file changed, 22 insertions(+), 64 deletions(-) diff --git a/ceno_zkvm/src/scheme/gpu/mod.rs b/ceno_zkvm/src/scheme/gpu/mod.rs index d2473faa6..ea7090807 100644 --- a/ceno_zkvm/src/scheme/gpu/mod.rs +++ b/ceno_zkvm/src/scheme/gpu/mod.rs @@ -32,7 +32,7 @@ use gkr_iop::{ layer::{LayerWitness, gpu::utils::extract_mle_relationships_from_monomial_terms}, }, gpu::{GpuBackend, GpuProver, gpu_prover::BB31Ext}, - hal::{MultilinearPolynomial, ProverBackend}, + hal::ProverBackend, }; use itertools::{Itertools, chain}; use mpcs::{Point, PolynomialCommitmentScheme}; @@ -69,59 +69,6 @@ use gkr_iop::gpu::gpu_prover::*; mod memory; -fn pad_gpu_mles_to_full_domain( - mles: impl IntoIterator>, -) -> Vec> { - let cuda_hal = gkr_iop::gpu::get_cuda_hal().expect("Failed to get CUDA HAL"); - let stream = gkr_iop::gpu::get_thread_stream(); - mles.into_iter() - .map(|mle| { - let mle_ref = mle.as_ref(); - let full_len = 1usize << mle_ref.num_vars(); - if mle_ref.evaluations_len() == full_len { - return mle; - } - let padded: gkr_iop::gpu::MultilinearExtensionGpu<'static, E> = match mle_ref.inner() { - gkr_iop::gpu::GpuFieldType::Base(poly) => { - let mut host = poly.to_cpu_vec(stream.as_ref()); - host.resize(full_len, BB31Base::ZERO); - unsafe { - std::mem::transmute( - gkr_iop::gpu::MultilinearExtensionGpu::::from_ceno_gpu_base( - ceno_gpu::bb31::GpuPolynomial::from_ceno_vec( - &cuda_hal, - &host, - mle_ref.num_vars(), - stream.as_ref(), - ) - .expect("pad base mle"), - ), - ) - } - } - gkr_iop::gpu::GpuFieldType::Ext(poly) => { - let mut host = poly.to_cpu_vec(stream.as_ref()); - host.resize(full_len, BB31Ext::ZERO); - unsafe { - std::mem::transmute( - gkr_iop::gpu::MultilinearExtensionGpu::::from_ceno_gpu_ext( - ceno_gpu::bb31::GpuPolynomialExt::from_ceno_vec( - &cuda_hal, - &host, - mle_ref.num_vars(), - stream.as_ref(), - ) - .expect("pad ext mle"), - ), - ) - } - } - gkr_iop::gpu::GpuFieldType::Unreachable => unreachable!(), - }; - Arc::new(padded) - }) - .collect() -} mod util; 
pub(crate) use memory::{ check_gpu_mem_estimation, check_gpu_mem_estimation_with_context, estimate_chip_proof_memory, @@ -539,12 +486,12 @@ pub fn prove_rotation_impl let log2_num_instances = input.log2_num_instances(); let num_threads = optimal_sumcheck_threads(log2_num_instances); let num_var_with_rotation = log2_num_instances + composed_cs.rotation_vars().unwrap_or(0); - let padded_wit_storage = pad_gpu_mles_to_full_domain( + let wit = LayerWitness( chain!(&input.witness, &input.fixed, &input.structural_witness) .cloned() - .map(|mle| unsafe { std::mem::transmute(mle) }), + .map(|mle| unsafe { std::mem::transmute(mle) }) + .collect(), ); - let wit = LayerWitness(padded_wit_storage); let (proof, points) = gkr_iop::gkr::layer::gpu::prove_rotation_gpu::( num_threads, @@ -758,11 +705,12 @@ pub fn prove_main_constraints_impl< num_threads, num_var_with_rotation, gkr::GKRCircuitWitness { - layers: vec![LayerWitness(pad_gpu_mles_to_full_domain( + layers: vec![LayerWitness( chain!(&input.witness, &input.fixed, &input.structural_witness,) .cloned() - .map(|mle| unsafe { std::mem::transmute(mle) }), - ))], + .map(|mle| unsafe { std::mem::transmute(mle) }) + .collect(), + )], }, &out_evals, &input @@ -1807,7 +1755,7 @@ pub(crate) fn build_tower_witness_gpu( let last_layers_refs: Vec<&[GpuPolynomialExt<'_>]> = prod_last_layers.iter().map(|v| v.as_slice()).collect(); let gpu_specs = { - cuda_hal.tower.build_prod_tower_from_gpu_polys_batch( + cuda_hal.tower.build_prod_tower_dense_from_gpu_polys_batch( cuda_hal, &last_layers_refs, num_vars, @@ -1815,7 +1763,12 @@ pub(crate) fn build_tower_witness_gpu( stream.as_ref(), ) } - .map_err(|e| format!("build_prod_tower_from_gpu_polys_batch failed: {:?}", e))?; + .map_err(|e| { + format!( + "build_prod_tower_dense_from_gpu_polys_batch failed: {:?}", + e + ) + })?; prod_gpu_specs.extend(gpu_specs); exit_span!(span_prod); } @@ -1837,14 +1790,19 @@ pub(crate) fn build_tower_witness_gpu( logup_last_layers.iter().map(|v| 
v.as_slice()).collect(); let gpu_specs = cuda_hal .tower - .build_logup_tower_from_gpu_polys_batch( + .build_logup_tower_dense_from_gpu_polys_batch( cuda_hal, &last_layers_refs, num_vars, num_towers, stream.as_ref(), ) - .map_err(|e| format!("build_logup_tower_from_gpu_polys_batch failed: {:?}", e))?; + .map_err(|e| { + format!( + "build_logup_tower_dense_from_gpu_polys_batch failed: {:?}", + e + ) + })?; logup_gpu_specs.extend(gpu_specs); exit_span!(span_logup); } From f0d81b641f730eb7824eaa7d0a893a4ef2cff6e0 Mon Sep 17 00:00:00 2001 From: "sm.wu" Date: Mon, 27 Apr 2026 21:40:40 +0800 Subject: [PATCH 14/25] Pass logup shape to tower prove estimator --- ceno_zkvm/src/scheme/gpu/memory.rs | 1 + 1 file changed, 1 insertion(+) diff --git a/ceno_zkvm/src/scheme/gpu/memory.rs b/ceno_zkvm/src/scheme/gpu/memory.rs index f0764c17f..6eaabd50b 100644 --- a/ceno_zkvm/src/scheme/gpu/memory.rs +++ b/ceno_zkvm/src/scheme/gpu/memory.rs @@ -566,6 +566,7 @@ fn estimate_tower_stage_components Date: Mon, 27 Apr 2026 22:19:45 +0800 Subject: [PATCH 15/25] Deduplicate borrowed tower input booking --- ceno_zkvm/src/scheme/gpu/memory.rs | 30 +++++++++++++++++++++++------- 1 file changed, 23 insertions(+), 7 deletions(-) diff --git a/ceno_zkvm/src/scheme/gpu/memory.rs b/ceno_zkvm/src/scheme/gpu/memory.rs index 6eaabd50b..68a200182 100644 --- a/ceno_zkvm/src/scheme/gpu/memory.rs +++ b/ceno_zkvm/src/scheme/gpu/memory.rs @@ -145,9 +145,17 @@ pub fn estimate_chip_proof_memory>( composed_cs: &ComposedConstrainSystem, input: &ProofInput<'_, GpuBackend>, -) -> (usize, usize, usize) { +) -> (usize, usize, usize, usize) { let cs = &composed_cs.zkvm_v1_css; let num_prod_towers = composed_cs.num_reads() + composed_cs.num_writes(); let num_logup_towers = if composed_cs.is_with_lk_table() { @@ -571,9 +579,16 @@ fn estimate_tower_stage_components, input: &ProofInput<'_, GpuBackend>, ) -> (usize, usize) { - let (build_bytes, prove_local_bytes, _) = estimate_tower_stage_components(composed_cs, input); + 
let (build_bytes, prove_local_bytes, _, _) = + estimate_tower_stage_components(composed_cs, input); (build_bytes, prove_local_bytes) } @@ -590,7 +606,7 @@ pub(crate) fn estimate_tower_bytes, input: &ProofInput<'_, GpuBackend>, ) -> usize { - let (build_bytes, prove_local_bytes, tower_input_live_bytes) = + let (build_bytes, prove_local_bytes, tower_input_live_bytes, _) = estimate_tower_stage_components(composed_cs, input); build_bytes.max(tower_input_live_bytes + prove_local_bytes) } From 4fc8daeb59eafb82b532cc6ba39ec61e6a2799d6 Mon Sep 17 00:00:00 2001 From: "sm.wu" Date: Mon, 27 Apr 2026 22:21:57 +0800 Subject: [PATCH 16/25] fix logging --- ceno_zkvm/src/scheme/gpu/memory.rs | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/ceno_zkvm/src/scheme/gpu/memory.rs b/ceno_zkvm/src/scheme/gpu/memory.rs index 68a200182..fb8e5871f 100644 --- a/ceno_zkvm/src/scheme/gpu/memory.rs +++ b/ceno_zkvm/src/scheme/gpu/memory.rs @@ -240,7 +240,7 @@ pub fn estimate_chip_proof_memory Date: Mon, 27 Apr 2026 23:11:57 +0800 Subject: [PATCH 17/25] Check scheduler memory estimate in mem tracking --- ceno_zkvm/src/scheme/gpu/memory.rs | 73 ++++++++++++++++++++++++++++-- ceno_zkvm/src/scheme/gpu/mod.rs | 3 +- ceno_zkvm/src/scheme/prover.rs | 11 +++++ 3 files changed, 82 insertions(+), 5 deletions(-) diff --git a/ceno_zkvm/src/scheme/gpu/memory.rs b/ceno_zkvm/src/scheme/gpu/memory.rs index fb8e5871f..f61bce28f 100644 --- a/ceno_zkvm/src/scheme/gpu/memory.rs +++ b/ceno_zkvm/src/scheme/gpu/memory.rs @@ -45,6 +45,7 @@ pub fn init_gpu_mem_tracker<'a>( const ESTIMATION_TOLERANCE_BYTES: usize = 2 * 1024 * 1024; // max under-estimation error: 2 MB const ESTIMATION_SAFETY_MARGIN_BYTES: usize = 10 * 1024 * 1024; // reserved headroom / allowed over-estimate margin: 10 MB +const SCHEDULER_ESTIMATION_WARNING_MARGIN_BYTES: usize = 512 * 1024 * 1024; /// Validate that the estimated GPU memory matches actual usage within tolerance. 
/// - Under-estimate (actual > estimated): diff must be <= `ESTIMATION_TOLERANCE_BYTES` @@ -57,6 +58,70 @@ pub fn check_gpu_mem_estimation_with_context( mem_tracker: Option, estimated_bytes: usize, context: Option<&str>, +) { + check_gpu_mem_estimation_with_margins( + mem_tracker, + estimated_bytes, + context, + ESTIMATION_TOLERANCE_BYTES, + ESTIMATION_SAFETY_MARGIN_BYTES, + ); +} + +pub fn check_gpu_scheduler_mem_estimation_with_context( + mem_tracker: Option, + estimated_bytes: usize, + context: Option<&str>, +) { + // Scheduler estimates are admission-control estimates, not exact stage-local allocation + // estimates. They intentionally include safety margins and conservative lifetime overlap, so + // large over-estimates should be surfaced as warnings rather than failing the proof. Under- + // estimates remain hard failures because they can admit unsafe concurrent work. + if let Some(mem_tracker) = mem_tracker { + const ONE_MB: usize = 1024 * 1024; + let label = mem_tracker.name(); + let label = context + .filter(|context| !context.is_empty()) + .map(|context| format!("{label}[{context}]")) + .unwrap_or_else(|| label.to_string()); + let mem_stats = mem_tracker.finish(); + let actual_bytes = mem_stats.mem_occupancy as usize; + let diff = estimated_bytes as isize - actual_bytes as isize; + let to_mb = |b: usize| b as f64 / ONE_MB as f64; + let diff_mb = diff as f64 / ONE_MB as f64; + tracing::info!( + "[memcheck] {label}: scheduler_estimated={:.2}MB, actual={:.2}MB, diff={:.2}MB", + to_mb(estimated_bytes), + to_mb(actual_bytes), + diff_mb + ); + if diff < 0 { + assert!( + (-diff) as usize <= ESTIMATION_TOLERANCE_BYTES, + "[memcheck] {label}: scheduler under-estimate! 
estimated={:.2}MB, actual={:.2}MB, diff={:.2}MB, tolerance={:.2}MB", + to_mb(estimated_bytes), + to_mb(actual_bytes), + diff_mb, + to_mb(ESTIMATION_TOLERANCE_BYTES), + ); + } else if diff as usize > SCHEDULER_ESTIMATION_WARNING_MARGIN_BYTES { + tracing::warn!( + "[memcheck] {label}: scheduler over-estimate warning: estimated={:.2}MB, actual={:.2}MB, diff={:.2}MB, warning_margin={:.2}MB", + to_mb(estimated_bytes), + to_mb(actual_bytes), + diff_mb, + to_mb(SCHEDULER_ESTIMATION_WARNING_MARGIN_BYTES), + ); + } + } +} + +fn check_gpu_mem_estimation_with_margins( + mem_tracker: Option, + estimated_bytes: usize, + context: Option<&str>, + under_tolerance_bytes: usize, + over_tolerance_bytes: usize, ) { // `mem_tracker will` be Some only in sequential mode with mem tracking enabled, so if it's None, do nothing if let Some(mem_tracker) = mem_tracker { @@ -80,22 +145,22 @@ pub fn check_gpu_mem_estimation_with_context( if diff < 0 { // Under-estimate: actual exceeds estimated assert!( - (-diff) as usize <= ESTIMATION_TOLERANCE_BYTES, + (-diff) as usize <= under_tolerance_bytes, "[memcheck] {label}: under-estimate! estimated={:.2}MB, actual={:.2}MB, diff={:.2}MB, tolerance={:.2}MB", to_mb(estimated_bytes), to_mb(actual_bytes), diff_mb, - to_mb(ESTIMATION_TOLERANCE_BYTES), + to_mb(under_tolerance_bytes), ); } else { // Over-estimate: estimated exceeds actual assert!( - diff as usize <= ESTIMATION_SAFETY_MARGIN_BYTES, + diff as usize <= over_tolerance_bytes, "[memcheck] {label}: over-estimate! 
estimated={:.2}MB, actual={:.2}MB, diff={:.2}MB, margin={:.2}MB", to_mb(estimated_bytes), to_mb(actual_bytes), diff_mb, - to_mb(ESTIMATION_SAFETY_MARGIN_BYTES), + to_mb(over_tolerance_bytes), ); } } diff --git a/ceno_zkvm/src/scheme/gpu/mod.rs b/ceno_zkvm/src/scheme/gpu/mod.rs index ea7090807..986b04439 100644 --- a/ceno_zkvm/src/scheme/gpu/mod.rs +++ b/ceno_zkvm/src/scheme/gpu/mod.rs @@ -71,7 +71,8 @@ mod memory; mod util; pub(crate) use memory::{ - check_gpu_mem_estimation, check_gpu_mem_estimation_with_context, estimate_chip_proof_memory, + check_gpu_mem_estimation, check_gpu_mem_estimation_with_context, + check_gpu_scheduler_mem_estimation_with_context, estimate_chip_proof_memory, estimate_main_witness_bytes, estimate_replay_materialization_bytes_for_plan, estimate_tower_bytes, estimate_tower_stage_bytes, init_gpu_mem_tracker, }; diff --git a/ceno_zkvm/src/scheme/prover.rs b/ceno_zkvm/src/scheme/prover.rs index b48aa017a..d1041aeb5 100644 --- a/ceno_zkvm/src/scheme/prover.rs +++ b/ceno_zkvm/src/scheme/prover.rs @@ -603,6 +603,12 @@ impl< task.circuit_idx as u64, )); + let task_name = task.circuit_name.clone(); + let estimated_memory_bytes = task.estimated_memory_bytes as usize; + let cuda_hal = gkr_iop::gpu::get_cuda_hal().expect("Failed to get CUDA HAL"); + let chip_mem_tracker = + crate::scheme::gpu::init_gpu_mem_tracker(&cuda_hal, "create_chip_proof"); + let gpu_input: ProofInput<'static, gkr_iop::gpu::GpuBackend> = unsafe { std::mem::transmute(task.input) }; @@ -619,6 +625,11 @@ impl< task.num_witin, task.structural_rmm, )?; + crate::scheme::gpu::check_gpu_scheduler_mem_estimation_with_context( + chip_mem_tracker, + estimated_memory_bytes, + Some(task_name.as_str()), + ); Ok(ChipTaskResult { task_id: task.task_id, From 011a8981324320c60de431bfcd5d738732b35f68 Mon Sep 17 00:00:00 2001 From: "sm.wu" Date: Mon, 27 Apr 2026 23:32:03 +0800 Subject: [PATCH 18/25] Refine replay tower proof memory estimate --- ceno_zkvm/src/scheme/prover.rs | 25 
+++++++++++++++++++++++-- 1 file changed, 23 insertions(+), 2 deletions(-) diff --git a/ceno_zkvm/src/scheme/prover.rs b/ceno_zkvm/src/scheme/prover.rs index d1041aeb5..33627ae68 100644 --- a/ceno_zkvm/src/scheme/prover.rs +++ b/ceno_zkvm/src/scheme/prover.rs @@ -1268,13 +1268,13 @@ where let span = entered_span!("prove_tower_relation", profiling_2 = true); let r_set_len = cs.zkvm_v1_css.r_expressions.len() + cs.zkvm_v1_css.r_table_expressions.len(); - let (tower_build_estimated_bytes, tower_prove_estimated_bytes) = + let (tower_build_estimated_bytes, tower_prove_prebuild_estimated_bytes) = estimate_tower_stage_bytes::(cs, &input); tracing::info!( "[gpu tower][{}] estimated: build_tower={:.2}MB, prove_tower={:.2}MB", name, tower_build_estimated_bytes as f64 / (1024.0 * 1024.0), - tower_prove_estimated_bytes as f64 / (1024.0 * 1024.0), + tower_prove_prebuild_estimated_bytes as f64 / (1024.0 * 1024.0), ); let tower_build_mem_tracker = crate::scheme::gpu::init_gpu_mem_tracker(&cuda_hal, "build_tower_witness_gpu"); @@ -1316,6 +1316,27 @@ where prod_specs: prod_gpu, logup_specs: logup_gpu, }; + let tower_prove_estimate = cuda_hal + .tower + .estimate_memory_requirements(&tower_input, NUM_FANIN); + let tower_input_live_bytes = tower_prove_estimate.prod_tower_buffer_bytes + + tower_prove_estimate.logup_tower_buffer_bytes; + let runtime_layout_prove_bytes = tower_prove_estimate + .total_bytes + .saturating_sub(tower_input_live_bytes); + let release_adjusted_prebuild_bytes = + tower_prove_prebuild_estimated_bytes / NUM_FANIN + 4 * 1024 * 1024; + let tower_prove_estimated_bytes = + runtime_layout_prove_bytes.max(release_adjusted_prebuild_bytes); + tracing::info!( + "[gpu tower][{}] refined prove_tower estimate: prebuild={:.2}MB, runtime_layout={:.2}MB, release_adjusted={:.2}MB, local={:.2}MB, tower_live={:.2}MB", + name, + tower_prove_prebuild_estimated_bytes as f64 / (1024.0 * 1024.0), + runtime_layout_prove_bytes as f64 / (1024.0 * 1024.0), + 
release_adjusted_prebuild_bytes as f64 / (1024.0 * 1024.0), + tower_prove_estimated_bytes as f64 / (1024.0 * 1024.0), + tower_input_live_bytes as f64 / (1024.0 * 1024.0), + ); let tower_prove_mem_tracker = crate::scheme::gpu::init_gpu_mem_tracker(&cuda_hal, "prove_tower_relation_gpu"); log_gpu_device_state(&format!("{name}:before_prove_tower")); From f3ca1cf35f4ad40db9b116693c4b1bce7832c32f Mon Sep 17 00:00:00 2001 From: "sm.wu" Date: Mon, 27 Apr 2026 23:38:32 +0800 Subject: [PATCH 19/25] clippy fix --- ceno_zkvm/src/bin/e2e.rs | 3 +- summary.md | 583 --------------------------------------- 2 files changed, 1 insertion(+), 585 deletions(-) delete mode 100644 summary.md diff --git a/ceno_zkvm/src/bin/e2e.rs b/ceno_zkvm/src/bin/e2e.rs index 721708389..c489567a7 100644 --- a/ceno_zkvm/src/bin/e2e.rs +++ b/ceno_zkvm/src/bin/e2e.rs @@ -5,8 +5,7 @@ use ceno_zkvm::print_allocated_bytes; use ceno_zkvm::{ e2e::{ Checkpoint, FieldType, MultiProver, PcsKind, Preset, public_io_words_to_digest_words, - run_e2e_full_trace_verify, run_e2e_single_shard_debug_verify, run_e2e_with_checkpoint, - setup_platform, setup_platform_debug, + run_e2e_with_checkpoint, setup_platform, setup_platform_debug, }, scheme::{ ZKVMProof, constants::MAX_NUM_VARIABLES, create_backend, create_prover, hal::ProverDevice, diff --git a/summary.md b/summary.md deleted file mode 100644 index 3b6979e24..000000000 --- a/summary.md +++ /dev/null @@ -1,583 +0,0 @@ -# WIP Summary: non-pow2 prover storage / GPU tower + PCS follow-up - -Date: 2026-04-25 - -Repos involved -- current repo: `/home/wusm/rust/ceno` -- GPU repo: `/home/wusm/rust/ceno-gpu` -- backend repo: `/home/wusm/rust/gkr-backend` - -Primary goal -- Remove prover-side MLE zero padding to next power-of-two. -- Keep prover storage compact by occupied length. -- Verifier semantics stay unchanged. 
- -Design agreed in this WIP -- Raw/original MLE inputs before sumcheck round 0 should use one unified policy: - - direct/native order - - occupied length respected - - this applies to both tower and PCS batch opening -- After round 0: - - folded values can use the normal later-round in-place buffer layout -- No separate application-specific policy for tower vs PCS. -- For tower specifically: - - within one tower layer, all MLEs should have the same `num_vars` - - tower should not rely on a meaningful “small MLE” mixed-size case - -What was fixed earlier in this WIP - -1. PCS / batch-open path -- Fixed missing round evaluations from GPU V2 sumcheck: - - `../ceno-gpu/cuda_hal/src/common/sumcheck/generic_v2.rs` -- Fixed compact raw-data handling in batch open and commit/open consistency. -- Fixed an earlier `RootMismatch` by correcting raw trace -> encode padding boundary in batch commit. -- PCS later reached `final_codeword.values[idx] != folded`, then was narrowed further. -- At one point PCS/basefold batch-open `eq` layout mismatch was fixed by using Ceno/direct order. -- CPU e2e for the lightweight repro still passes. - -2. Tower witness/materialization direction -- Compact CPU oracle for tower semantics was added in: - - `../ceno-gpu/cuda_hal/src/common/tower/utils.rs` -- GPU tower build path was refactored toward compact storage in: - - `../ceno-gpu/cuda_hal/src/common/tower/mod.rs` - - `../ceno-gpu/cpp/common/tower.cuh` - - `../ceno-gpu/cpp/bb31/kernels/tower.cu` - - `../ceno-gpu/cpp/gl64/kernels/tower.cu` -- A lifetime bug causing segfault in GPU tower eval extraction was fixed by retaining owned buffer backing: - - `../ceno-gpu/cuda_hal/src/common/buffer.rs` - - `../ceno-gpu/cuda_hal/src/lib.rs` - -3. Important debug correction -- There was a previous debug bug caused by cloning the transcript after GPU proving. -- That was fixed. -- Current CPU/GPU prover compares should assume transcript state is cloned before proof generation. 
- -Current CPU/GPU status - -CPU baseline -- Command: - - `cargo run --release --package ceno_zkvm --features sanity-check --bin e2e -- --platform=ceno --max-cycle-per-shard=1000 --hints=2 --public-io=5 --shard-id=0 examples/target/riscv32im-ceno-zkvm-elf/release/examples/fibonacci` -- Result: - - passes - -GPU lightweight repro -- Command: - - `RUST_LOG=error CENO_CONCURRENT_CHIP_PROVING=0 cargo run --release --features gpu --package ceno_zkvm --features sanity-check --bin e2e -- --platform=ceno --max-cycle-per-shard=1000 --hints=2 --public-io=5 --shard-id=0 examples/target/riscv32im-ceno-zkvm-elf/release/examples/fibonacci` -- Current result: - - still fails with tower verification mismatch - - source: - - `ceno_zkvm/src/e2e.rs:2347` - - `VerifyError("mismatch tower evaluation")` - -Most important findings from the latest tower debug - -1. Tower witness is not the first bad stage -- CPU/GPU tower witness compare did not fail first. -- Tower witness transport/leaf construction is not the main active bug. - -2. The earlier isolated layer-2 compare proved: -- `cpu_direct == v1` -- `v2 != cpu_direct` -- This was on a tower layer where all MLEs were full occupied: - - debug payload showed `mle_shape=[(?, 2, 4), ...]` - - meaning `num_vars=2`, `len=4` for all MLEs in that isolated layer -- That means the tower failure is not because tower requires mixed-size/small-MLE semantics. - -3. The current design conclusion -- Tower should use the same original-input policy as PCS: - - direct order before round 0 - - later rounds use the in-place buffer -- Do NOT think of this as two policies. - -4. Terminology decision -- Do not call later-round folded storage “replay buffer”. 
-- Call it: - - in-place buffer -- Round 0: - - non-in-place, reading original inputs -- Round > 0: - - in-place - -Latest code changes in the current session - -In `../ceno-gpu/cuda_hal/src/common/sumcheck/generic_v2.rs` -- Renamed V2 metadata from `compact_layout_flags` to `original_layout_flags` -- This now means: - - `1` => original round-0 input is direct/native order -- This is intended to make the model explicit and shared across tower + PCS - -In `../ceno-gpu/cpp/common/sumcheck/generic_v2.cuh` -- Added `direct_pair_index_v2` -- Changed direct-order round-0 reads for full-size equal-`num_vars` originals to use adjacent pairs: - - `(2p, 2p+1)` - - not `(p, p + stride)` -- Restored small-MLE helper mapping back to high-bit based mapping: - - `suffix_small_index_v2(...)` currently uses: - - `tid >> (num_vars - 1 - mle_num_vars)` -- Reverted an incorrect attempt to bit-reverse first-fold writes into the in-place buffer -- Current code writes first-fold results contiguously into the in-place buffer - -In `../ceno-gpu/cuda_hal/src/common/tower/mod.rs` -- Relaxed tower assertions so layers can be compact-by-occupation, not necessarily full logical length at Rust-side checks - -What the latest tower debug showed - -Most recent trustworthy mismatch before the last interrupted run -- CPU/GPU tower compare failed at: - - `ceno_zkvm/src/scheme/gpu/mod.rs:665` -- Message: - - `CPU/GPU tower sumcheck proof mismatch: first_round=Some(2)` -- Interpretation: - - earlier proof entries already match - - divergence starts later, consistent with in-place-buffer semantics rather than original-input semantics - -Important caution about last run -- A later run was interrupted before producing a new useful payload. -- So do NOT assume the very latest in-place-buffer edits fixed anything. 
-- The last reliable signal is still: - - tower mismatch has moved later than round 0 - - current bug is likely in round > 0 in-place-buffer semantics - -Debug helpers currently present in `ceno_zkvm/src/scheme/gpu/mod.rs` -- `debug_compare_tower_cpu_gpu_prover(...)` -- `debug_compare_tower_eq_layers(...)` -- `debug_compare_tower_layer_v1_v2(..., round)` -- currently called for: - - `round = 2` - - `round = 3` - -Be careful -- Some helpers use fresh local transcripts like: - - `BasicTranscript::new(b"tower-layer2-debug")` -- These are only valid for isolated V1/V2/CPU direct comparisons. -- They are NOT end-to-end transcript or verifier oracles. - -Current best hypothesis -- The active tower bug is now in V2 later-round in-place-buffer semantics, not in: - - tower witness layout - - original round-0 direct-order policy - - transcript clone bugs - -Most relevant files to inspect next - -Current repo -- `ceno_zkvm/src/scheme/gpu/mod.rs` -- `ceno_zkvm/src/e2e.rs` - -GPU repo -- `../ceno-gpu/cuda_hal/src/common/sumcheck/generic_v2.rs` -- `../ceno-gpu/cpp/common/sumcheck/generic_v2.cuh` -- `../ceno-gpu/cuda_hal/src/common/tower/mod.rs` -- `../ceno-gpu/cuda_hal/src/common/tower/utils.rs` -- `../ceno-gpu/cuda_hal/src/lib.rs` -- `../ceno-gpu/cuda_hal/src/common/buffer.rs` - -Backend repo -- `../gkr-backend/crates/mpcs/...` -- `../gkr-backend/crates/sumcheck/...` - -Recommended next step for the new session -1. Read this file. -2. Keep CPU baseline as source of truth. -3. Continue from the latest tower state, focusing only on later-round in-place-buffer semantics in: - - `../ceno-gpu/cpp/common/sumcheck/generic_v2.cuh` -4. 
Run exactly one lightweight GPU repro at a time: - - `RUST_LOG=error CENO_CONCURRENT_CHIP_PROVING=0 cargo run --release --features gpu --package ceno_zkvm --features sanity-check --bin e2e -- --platform=ceno --max-cycle-per-shard=1000 --hints=2 --public-io=5 --shard-id=0 examples/target/riscv32im-ceno-zkvm-elf/release/examples/fibonacci` - -Backups / snapshots -- Earlier stash-save/apply snapshots were created in this workstream. -- There is also filesystem snapshot history under: - - `/home/wusm/rust/ceno/.codex-backups/` - - -## E2E / validation commands executed in compact tower batch + estimator work - -Context -- Full clean was run before validating newly added CUDA kernels, to avoid stale C++/CUDA artifacts. -- Heavy commands used `timeout 1800s` so compilation can be slow, but execution cannot hang indefinitely. -- Logs were written to `/tmp` for later inspection. - -Clean/build commands -```bash -cargo clean -cargo clean --manifest-path ../ceno-gpu/cuda_hal/Cargo.toml -``` - -```bash -cargo build --release --features gpu --package ceno_zkvm --features sanity-check --bin e2e -``` -Result -- Passed. -- Elapsed: `4:07.82`. - -Lightweight sanity e2e after clean -```bash -RUST_LOG=error CENO_CONCURRENT_CHIP_PROVING=0 target/release/e2e --platform=ceno --max-cycle-per-shard=1000 --hints=2 --public-io=5 --shard-id=0 examples/target/riscv32im-ceno-zkvm-elf/release/examples/fibonacci -``` -Result -- Passed. -- Elapsed: `0:09.29`. - -Cargo check after compact batch/estimator edits -```bash -timeout 300s cargo check --features gpu --package ceno_zkvm --bin e2e -``` -Result -- Passed. - -Final lightweight sanity e2e after removing temporary debug probe -```bash -RUST_LOG=error CENO_CONCURRENT_CHIP_PROVING=0 target/release/e2e --platform=ceno --max-cycle-per-shard=1000 --hints=2 --public-io=5 --shard-id=0 examples/target/riscv32im-ceno-zkvm-elf/release/examples/fibonacci -``` -Result -- Passed. -- Elapsed: `0:08.34`. 
- -Heavy e2e command 1: serial proving + GPU mem tracking -```bash -CENO_GPU_MEM_TRACKING=1 CENO_CONCURRENT_CHIP_PROVING=0 CENO_GPU_ENABLE_WITGEN=1 cargo run --config net.git-fetch-with-cli=true --release --package ceno_zkvm --features gpu --bin e2e -- --platform=ceno --max-cycle-per-shard=1600 examples/target/riscv32im-ceno-zkvm-elf/release/examples/keccak_syscall -``` -Executed with timeout/log wrapper: -```bash -/usr/bin/time -f 'elapsed %E' timeout 1800s env CENO_GPU_MEM_TRACKING=1 CENO_CONCURRENT_CHIP_PROVING=0 CENO_GPU_ENABLE_WITGEN=1 cargo run --config net.git-fetch-with-cli=true --release --package ceno_zkvm --features gpu --bin e2e -- --platform=ceno --max-cycle-per-shard=1600 examples/target/riscv32im-ceno-zkvm-elf/release/examples/keccak_syscall 2>&1 | tee /tmp/ceno-keccak-memtracking-serial.log -``` -Initial result -- Failed due to strict memory-estimator overestimate, not proof failure. -- Panic: - - `[memcheck] build_tower_witness_gpu: over-estimate! estimated=146.93MB, actual=126.43MB, diff=20.50MB, margin=10.00MB` -- Elapsed: `1:19.48`. - -After estimator fix, rerun with log: -```bash -/usr/bin/time -f 'elapsed %E' timeout 1800s env CENO_GPU_MEM_TRACKING=1 CENO_CONCURRENT_CHIP_PROVING=0 CENO_GPU_ENABLE_WITGEN=1 cargo run --config net.git-fetch-with-cli=true --release --package ceno_zkvm --features gpu --bin e2e -- --platform=ceno --max-cycle-per-shard=1600 examples/target/riscv32im-ceno-zkvm-elf/release/examples/keccak_syscall 2>&1 | tee /tmp/ceno-keccak-memtracking-serial-after-estimate.log -``` -Final result -- Passed. -- Elapsed: `1:15.43`. -- Log: `/tmp/ceno-keccak-memtracking-serial-after-estimate.log`. 
- -Heavy e2e command 2: concurrent chip proving + GPU witgen -```bash -CENO_GPU_MEM_TRACKING=0 CENO_CONCURRENT_CHIP_PROVING=1 CENO_GPU_ENABLE_WITGEN=1 cargo run --config net.git-fetch-with-cli=true --release --package ceno_zkvm --features gpu --bin e2e -- --platform=ceno --max-cycle-per-shard=1600 examples/target/riscv32im-ceno-zkvm-elf/release/examples/keccak_syscall -``` -Executed with timeout/log wrapper before estimator fix: -```bash -/usr/bin/time -f 'elapsed %E' timeout 1800s env CENO_GPU_MEM_TRACKING=0 CENO_CONCURRENT_CHIP_PROVING=1 CENO_GPU_ENABLE_WITGEN=1 cargo run --config net.git-fetch-with-cli=true --release --package ceno_zkvm --features gpu --bin e2e -- --platform=ceno --max-cycle-per-shard=1600 examples/target/riscv32im-ceno-zkvm-elf/release/examples/keccak_syscall 2>&1 | tee /tmp/ceno-keccak-concurrent-witgen.log -``` -Result -- Passed. -- Elapsed: `0:10.02`. -- Final pool peak around `291MB`. -- Log: `/tmp/ceno-keccak-concurrent-witgen.log`. - -Executed again after estimator fix: -```bash -/usr/bin/time -f 'elapsed %E' timeout 1800s env CENO_GPU_MEM_TRACKING=0 CENO_CONCURRENT_CHIP_PROVING=1 CENO_GPU_ENABLE_WITGEN=1 cargo run --config net.git-fetch-with-cli=true --release --package ceno_zkvm --features gpu --bin e2e -- --platform=ceno --max-cycle-per-shard=1600 examples/target/riscv32im-ceno-zkvm-elf/release/examples/keccak_syscall 2>&1 | tee /tmp/ceno-keccak-concurrent-witgen-after-estimate.log -``` -Final result -- Passed. -- Elapsed: `0:10.74`. -- Log: `/tmp/ceno-keccak-concurrent-witgen-after-estimate.log`. - -Diff hygiene commands -```bash -git diff --check -git -C ../ceno-gpu diff --check -``` -Result -- Both passed. - -## Restart state: benchmark memcheck under-estimate follow-up - -Date: 2026-04-26 - -Current task -- User reported a remaining GPU memory under-estimate when running the top-entry repo `/home/wusm/rust/ceno-reth-benchmark` against the local `/home/wusm/rust/ceno` repo. 
-- The benchmark command must use `--rpc-url "$CENO_RPC"`; do not paste or persist concrete RPC URLs in logs or docs. -- In the current shell, `CENO_RPC` was not set, so the benchmark repro could not be completed before restart. - -Current repo state -- Main repo: `/home/wusm/rust/ceno` -- Current branch includes commit: - - `5ecce046 fix mem estimator` -- Important existing fix already present: - - `ceno_zkvm/src/scheme/gpu/memory.rs` now estimates `build_main_witness` by materialized GKR outputs, not only final tower outputs. - - This fixed the earlier `Ecall_Keccak` under-estimate where old estimate was around `11.73MB` and actual was `16.00MB`. -- Local root `Cargo.toml` and `Cargo.lock` are dirty from pre-existing dependency/local-path work; do not accidentally revert them unless explicitly requested. - -New diagnostic patch added before restart -- Added contextual labels to GPU memcheck output so future failures identify both stage and circuit. -- Files touched: - - `ceno_zkvm/src/scheme/gpu/memory.rs` - - added `check_gpu_mem_estimation_with_context(...)` - - labels now print like `build_main_witness[Ecall_Keccak]` - - `ceno_zkvm/src/scheme/utils.rs` - - `build_main_witness` memcheck now includes first GKR layer name - - `ceno_zkvm/src/scheme/prover.rs` - - replay/build-tower/prove-tower memchecks now include circuit name in sequential GPU proving path - - `ceno_zkvm/src/scheme/gpu/mod.rs` - - prover trait memchecks now include first GKR layer name or task circuit name where available -- This patch is diagnostic/safety oriented; it does not change memory estimates. - -Validation already run after diagnostic patch -```bash -cargo fmt --check -``` -Result -- Passed. - -```bash -timeout 300s cargo check --features gpu --package ceno_zkvm --bin e2e -``` -Result -- Passed. 
- -Lightweight memcheck e2e command run after diagnostic patch -```bash -/usr/bin/time -f 'elapsed %E' timeout 900s env CENO_GPU_MEM_TRACKING=1 CENO_CONCURRENT_CHIP_PROVING=0 CENO_GPU_ENABLE_WITGEN=1 cargo run --config net.git-fetch-with-cli=true --release --package ceno_zkvm --features gpu --bin e2e -- --platform=ceno --max-cycle-per-shard=1600 examples/target/riscv32im-ceno-zkvm-elf/release/examples/keccak_syscall 2>&1 | tee /tmp/ceno-light-keccak-context-memcheck.log -``` -Result -- Memcheck stages passed; no under-estimate panic. -- The run still fails later at the known verifier assertion in `gkr_iop/src/gkr/layer/zerocheck_layer.rs:306`. -- Useful log examples: - - `replay_gpu_witness_from_raw[Ecall_Keccak]: estimated=11.23MB, actual=11.23MB` - - `build_main_witness[Ecall_Keccak]: estimated=32.41MB, actual=32.59MB` - - `build_tower_witness_gpu[Ecall_Keccak]: estimated=105.83MB, actual=106.01MB` - - `prove_tower_relation_gpu[Ecall_Keccak]: estimated=36.84MB, actual=37.26MB` - - `replay_gpu_witness_from_raw[ShardRamCircuit]: estimated=0.38MB, actual=0.38MB` - - `build_main_witness[ShardRamCircuit_main]: estimated=0.01MB, actual=0.01MB` - - `build_tower_witness_gpu[ShardRamCircuit]: estimated=0.01MB, actual=0.02MB` - -Important conclusion so far -- Lightweight Ceno `keccak_syscall` no longer reproduces the reported memcheck under-estimate. -- The remaining issue appears large-payload/top-entry specific and needs the benchmark repro with `CENO_RPC` exported. -- Because contextual memcheck labels are now in place, the next benchmark run should immediately identify the failing stage and circuit. - -Required environment for next session -```bash -export CENO_RPC='' -``` -- The assistant cannot see shell variables unless they are present in the execution environment. 
-- Verify with: -```bash -if [ -n "${CENO_RPC:-}" ]; then echo 'CENO_RPC is set'; else echo 'CENO_RPC is NOT set'; fi -``` - -Benchmark repro command to run next -- Workdir: `/home/wusm/rust/ceno-reth-benchmark` -- Use timeout and tee log. -- Keep `--rpc-url "$CENO_RPC"` exactly; do not expand into a persisted command string. - -```bash -/usr/bin/time -f 'elapsed %E' timeout 2400s env \ - CENO_GPU_MEM_TRACKING=1 \ - CENO_CONCURRENT_CHIP_PROVING=0 \ - CENO_GPU_ENABLE_WITGEN=1 \ - RUST_MIN_STACK=16777216 \ - RUST_BACKTRACE=1 \ - CYCLE_TRACKER_MAX_DEPTH=4 \ - OUTPUT_PATH=metrics.json \ - CENO_GPU_CACHE_LEVEL=0 \ - RUSTFLAGS='-C target-feature=+avx2' \ - JEMALLOC_SYS_WITH_MALLOC_CONF='retain:true,metadata_thp:always,thp:always,dirty_decay_ms:-1,muzzy_decay_ms:-1' \ - RUST_LOG=debug \ - cargo run --features jemalloc --features metrics --features perf-metrics --features gpu --bin ceno-reth-benchmark-bin -- \ - --block-number 23587691 \ - --rpc-url "$CENO_RPC" \ - --cache-dir block_data \ - --mode prove-app \ - --app-proofs ./app_proof.bitcode \ - --shard-id 0 \ - --chain-id 1 \ - 2>&1 | tee /tmp/ceno-reth-benchmark-memcheck.log -``` - -After benchmark fails or completes -1. Extract memcheck failure context: -```bash -rg -n "under-estimate|over-estimate|\\[memcheck\\].*diff=-" /tmp/ceno-reth-benchmark-memcheck.log | tail -120 -``` -2. The failing line should now include a label like: - - `build_main_witness[...]` - - `build_tower_witness_gpu[...]` - - `prove_tower_relation_gpu[...]` - - `replay_gpu_witness_from_raw[...]` -3. Patch only the relevant estimator in `/home/wusm/rust/ceno`. -4. Re-run lightweight Ceno check first: -```bash -cargo fmt --check -timeout 300s cargo check --features gpu --package ceno_zkvm --bin e2e -``` -5. Then rerun the benchmark command above. 
- -Security hygiene -- If a concrete RPC URL accidentally appears in any local log, scrub it immediately: -```bash -for f in /tmp/ceno-reth-benchmark-memcheck.log /home/wusm/rust/ceno-reth-benchmark/*.txt /home/wusm/rust/ceno-reth-benchmark/*.log; do - [ -f "$f" ] || continue - perl -0pi -e 's#https://eth-mainnet\\.g\\.alchemy\\.com/v2/[^\\s\\x27\\"]+#\\$CENO_RPC#g' "$f" -done -``` -- Verify no RPC string remains: -```bash -rg -n 'alchemy|eth-mainnet\.g\.alchemy' /tmp/ceno-reth-benchmark-memcheck.log /home/wusm/rust/ceno-reth-benchmark -g '*.txt' -g '*.log' -g '*.md' -g '*.json' 2>/dev/null || true -``` - -## Architecture refresher: compact GPU witness / memory-estimator terminology - -This section is intended for a fresh session before touching estimators or compact witness code. - -Core terminology -- `occupied rows` / `actual rows`: - - Real number of rows with data for a chip or replay plan. - - Usually `input.num_instances() << rotation_vars` for normal chip inputs. - - For replayed GPU witgen, prefer replay-plan-specific real rows when available. -- `logical domain` / `full domain`: - - Power-of-two domain implied by `num_vars`. - - Some protocols/verifier semantics still reason over this domain. - - Prover storage should avoid allocating it when compact storage is sufficient. -- `compact witness`: - - Device/host storage sized by occupied rows, not full logical domain. - - This is the intended design for the GPU witgen/prover path. -- `materialized output`: - - GKR layer output MLE that is actually allocated during `build_main_witness`. - - `EvalExpression::Single` and `EvalExpression::Linear` materialize; `Zero` does not. -- `final/output GKR layer`: - - `gkr_circuit.layers[0]` because circuit layers are ordered output-to-input. - - The `output_mask` is applied only to this final/output layer during tower witness build. -- `internal GKR layers`: - - Any layer after index 0 in `gkr_circuit.layers`. 
- - These do not receive the final tower `output_mask`; all non-zero outputs are materialized. -- `replay path`: - - GPU witgen can replay raw records into device-backed witness matrices just-in-time. - - Large replay-heavy chips currently include `Ecall_Keccak` and `ShardRamCircuit`. -- `stage split`: - - Large replay chips materialize witness multiple times for separate stages to reduce peak VRAM. - - Estimator must model stage-local peaks, not sum all stages as simultaneously live. - -Important module map -- `ceno_zkvm/src/scheme/gpu/memory.rs` - - Central GPU memory estimator and memcheck assertion logic. - - Key functions: - - `estimate_chip_proof_memory` - - `estimate_trace_bytes` - - `estimate_main_witness_bytes` - - `estimate_tower_stage_components` - - `estimate_main_constraints_bytes` - - `estimate_replay_materialization_bytes_for_plan` - - `check_gpu_mem_estimation_with_context` -- `ceno_zkvm/src/scheme/utils.rs` - - Builds main GKR witness through `build_main_witness` / `gkr_witness`. - - Owns output materialization mask logic: - - `tower_output_count` - - `build_output_materialization_mask` - - `first_layer_output_group_stage_masks` - - Critical design point: - - `output_mask` is applied only to final/output GKR layer. -- `ceno_zkvm/src/scheme/prover.rs` - - Sequential per-chip GPU proving flow and replay stage splitting. - - Important stages: - - replay raw GPU witness - - build main witness - - build tower witness - - prove tower - - replay again for ECC/main constraints if needed -- `ceno_zkvm/src/scheme/gpu/mod.rs` - - GPU prover trait implementations and shared helpers. - - Includes trait-level memchecks for tower/main/ecc/replay helper paths. -- `../ceno-gpu/cuda_hal/src/common/tower/*` - - GPU tower witness/proof host-side implementation. -- `../ceno-gpu/cpp/common/tower.cuh` and kernel files under `../ceno-gpu/cpp/*/kernels/tower.cu` - - CUDA tower kernels and compact split logic. 
-- `../ceno-gpu/cuda_hal/src/common/sumcheck/generic_v2.rs` - - Rust host-side V2 sumcheck setup. -- `../ceno-gpu/cpp/common/sumcheck/generic_v2.cuh` - - CUDA V2 sumcheck logic. - -Current compact witness design assumptions -- Whole flow target: - - commit - - tower prove - - main prove - - rotation prove - - ECC prove - - batch opening - - should operate on compact witness storage wherever prover-side full-domain padding is not semantically required. -- Round-0 original inputs: - - Use direct/native order over real occupied data. - - Do not invent tower-specific order separate from PCS. -- Later folded rounds: - - Use normal in-place/folded buffer semantics. - - Do not call this a replay buffer; call it `in-place buffer`. -- Compact even/odd tails: - - Avoid branch-per-element loops for odd real lengths. - - Decide odd/even outside the loop and process the leftover tail separately. -- Cloning policy: - - Avoid full `clone` / `to_vec` on large witness buffers unless it is intentionally debug-only. - -Main witness memory-estimator design -- Old broken model: - - `tower_output_count(composed_cs) * rows * sizeof(BB31Ext)`. - - This only counts final tower outputs. -- Correct current model: - - Count final/output layer materialized tower outputs under the output mask. - - Plus count all internal layer non-zero outputs because internal layers are not masked. - - Multiply by real output rows, normally `input.witness.first().evaluations_len()`. -- Why this matters: - - Multi-layer GKR circuits like `Ecall_Keccak` materialize internal outputs during `build_main_witness`. - - Single-layer circuits like `ShardRamCircuit_main` usually do not have the same missing-internal-output issue. - -Replay / trace estimator design -- Normal non-replay path: - - Extracted witness and structural MLEs can stay resident across chip proof. - - Stage peak is resident trace plus max temporary stage. 
-- Replay-heavy path (`Ecall_Keccak`, `ShardRamCircuit`): - - Estimate replay materialization from replay plan real rows, not full logical domain. - - Replay witness is materialized for tower stages, then cleared before tower prove/main stages as designed. - - Estimator should use max of replay/build/prove/ecc/main stage peaks plus safety margin. -- Structural witness caveat: - - If structural RMM already has device backing, transport may be view-only and estimate zero new bytes. - - If not device-backed, estimate structural upload by real rows when possible. - -Tower estimator design -- Build stage estimate includes: - - CUDA tower build temporary allocations from `estimate_build_tower_memory`. - - Compact product split buffers. - - Compact logup split buffers. -- Prove stage estimate separates: - - live tower input buffers - - local create-proof temporary allocations -- For logup: - - If table lookup has numerator, numerator buffers are real compact buffers. - - If no numerator, ones/default numerator should not allocate a full domain buffer. - -Scheduler / memcheck relationship -- Sequential + `CENO_GPU_MEM_TRACKING=1`: - - Runs memcheck assertions stage-by-stage. - - This is the best mode for estimator debugging. -- Concurrent + mem tracking disabled: - - Uses estimator for booking/scheduling VRAM, not direct memcheck assertions. -- Booking can include extra safety margin for replay-heavy chips in concurrent mode. -- A stage-local memcheck pass does not automatically prove concurrent booking is optimal, but it strongly validates the per-stage estimator. - -Current known caveats -- Lightweight `keccak_syscall` memchecks pass after current estimator fixes. -- The lightweight run still hits a known verifier assertion later at: - - `gkr_iop/src/gkr/layer/zerocheck_layer.rs:306` -- The remaining reported under-estimate is only known from the top-entry benchmark payload and must be reproduced with `CENO_RPC` exported. 
-- Do not guess the failing estimator from the old generic label; use the new contextual memcheck label first. - -Recommended investigation discipline -1. Reproduce with sequential mem tracking first: - - `CENO_GPU_MEM_TRACKING=1` - - `CENO_CONCURRENT_CHIP_PROVING=0` -2. Read the exact contextual label: - - `build_main_witness[...]` - - `build_tower_witness_gpu[...]` - - `prove_tower_relation_gpu[...]` - - `replay_gpu_witness_from_raw[...]` -3. Patch only the estimator for that stage/circuit class. -4. Validate in `/home/wusm/rust/ceno` first: - - `cargo fmt --check` - - `timeout 300s cargo check --features gpu --package ceno_zkvm --bin e2e` -5. Then rerun the top-entry benchmark. From 147f5679142911e16a6057d06cd4f5fc0bbd4d89 Mon Sep 17 00:00:00 2001 From: "sm.wu" Date: Tue, 28 Apr 2026 08:11:12 +0800 Subject: [PATCH 20/25] add missing synchronization, avoid race condition --- ceno_zkvm/src/scheme/prover.rs | 17 +++++++++++++++++ 1 file changed, 17 insertions(+) diff --git a/ceno_zkvm/src/scheme/prover.rs b/ceno_zkvm/src/scheme/prover.rs index 33627ae68..0d5a9feab 100644 --- a/ceno_zkvm/src/scheme/prover.rs +++ b/ceno_zkvm/src/scheme/prover.rs @@ -1,6 +1,7 @@ use ff_ext::ExtensionField; use gkr_iop::{ cpu::{CpuBackend, CpuProver}, + error::BackendError, hal::ProverBackend, }; use std::{collections::BTreeMap, marker::PhantomData, sync::Arc}; @@ -1144,6 +1145,20 @@ where .get_pool_stream() .expect("should acquire stream"); let _thread_stream_guard = gkr_iop::gpu::bind_thread_stream(_stream.clone()); + let sync_concurrent_chip_stream = || -> Result<(), ZKVMError> { + if ChipScheduler::is_concurrent_mode() { + cuda_hal + .inner + .synchronize_stream(_stream.stream()) + .map_err(|e| { + ZKVMError::BackendError(BackendError::CircuitError( + format!("failed to synchronize GPU chip proof stream for {name}: {e:?}") + .into_boxed_str(), + )) + })?; + } + Ok(()) + }; let replay_stage_split = gpu_replay_plan .as_ref() .is_some_and(|plan| matches!(plan.kind, 
GpuWitgenKind::Keccak | GpuWitgenKind::ShardRam)); @@ -1399,6 +1414,7 @@ where wits_in_evals, fixed_in_evals, } = evals; + sync_concurrent_chip_stream()?; clear_materialized_input(&mut input); log_gpu_device_state(&format!("{name}:after_main_constraints")); exit_span!(span); @@ -1483,6 +1499,7 @@ where wits_in_evals, fixed_in_evals, } = evals; + sync_concurrent_chip_stream()?; exit_span!(span); Ok(( From 94fc7bfb1cd40b90488135e78ad25c0e69b35f9c Mon Sep 17 00:00:00 2001 From: "sm.wu" Date: Tue, 28 Apr 2026 10:41:27 +0800 Subject: [PATCH 21/25] Account ShardRam tower prove allocator overhead --- ceno_zkvm/src/scheme/gpu/memory.rs | 23 ++++++++++++++++++----- ceno_zkvm/src/scheme/gpu/mod.rs | 1 + ceno_zkvm/src/scheme/prover.rs | 10 +++++++--- 3 files changed, 26 insertions(+), 8 deletions(-) diff --git a/ceno_zkvm/src/scheme/gpu/memory.rs b/ceno_zkvm/src/scheme/gpu/memory.rs index f61bce28f..4abad8566 100644 --- a/ceno_zkvm/src/scheme/gpu/memory.rs +++ b/ceno_zkvm/src/scheme/gpu/memory.rs @@ -46,6 +46,15 @@ pub fn init_gpu_mem_tracker<'a>( const ESTIMATION_TOLERANCE_BYTES: usize = 2 * 1024 * 1024; // max under-estimation error: 2 MB const ESTIMATION_SAFETY_MARGIN_BYTES: usize = 10 * 1024 * 1024; // reserved headroom / allowed over-estimate margin: 10 MB const SCHEDULER_ESTIMATION_WARNING_MARGIN_BYTES: usize = 512 * 1024 * 1024; +const SHARD_RAM_TOWER_PROVE_ALLOCATOR_OVERHEAD_BYTES: usize = 16 * 1024 * 1024; + +pub(crate) fn tower_prove_allocator_overhead_bytes(circuit_name: &str) -> usize { + if circuit_name == "ShardRamCircuit" { + SHARD_RAM_TOWER_PROVE_ALLOCATOR_OVERHEAD_BYTES + } else { + 0 + } +} /// Validate that the estimated GPU memory matches actual usage within tolerance. 
/// - Under-estimate (actual > estimated): diff must be <= `ESTIMATION_TOLERANCE_BYTES` @@ -623,13 +632,12 @@ fn estimate_tower_stage_components Date: Tue, 28 Apr 2026 10:49:41 +0800 Subject: [PATCH 22/25] misc: clippy fix --- ceno_zkvm/src/scheme/prover.rs | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/ceno_zkvm/src/scheme/prover.rs b/ceno_zkvm/src/scheme/prover.rs index f4a525bf3..4294adfbb 100644 --- a/ceno_zkvm/src/scheme/prover.rs +++ b/ceno_zkvm/src/scheme/prover.rs @@ -1,7 +1,8 @@ use ff_ext::ExtensionField; +#[cfg(feature = "gpu")] +use gkr_iop::error::BackendError; use gkr_iop::{ cpu::{CpuBackend, CpuProver}, - error::BackendError, hal::ProverBackend, }; use std::{collections::BTreeMap, marker::PhantomData, sync::Arc}; From d14e66a2cf5bb75859a6ad433c686921a4414d41 Mon Sep 17 00:00:00 2001 From: "sm.wu" Date: Tue, 28 Apr 2026 14:52:38 +0800 Subject: [PATCH 23/25] Fix GPU proof memory estimation --- ceno_zkvm/src/scheme/gpu/memory.rs | 37 +++++++++++++++--------------- ceno_zkvm/src/scheme/gpu/mod.rs | 36 ++++++++++++++++++----------- ceno_zkvm/src/scheme/prover.rs | 11 ++++++--- 3 files changed, 49 insertions(+), 35 deletions(-) diff --git a/ceno_zkvm/src/scheme/gpu/memory.rs b/ceno_zkvm/src/scheme/gpu/memory.rs index 4abad8566..a2bf8f086 100644 --- a/ceno_zkvm/src/scheme/gpu/memory.rs +++ b/ceno_zkvm/src/scheme/gpu/memory.rs @@ -225,11 +225,11 @@ pub fn estimate_chip_proof_memory (usize, usize) { let base_elem_size = std::mem::size_of::(); - let mle_len = 1usize << num_vars; let compact_poly_bytes = num_witin * occupied_rows * base_elem_size; - let logical_poly_bytes = num_witin * mle_len * base_elem_size; + let transpose_temporary_bytes = 2 * compact_poly_bytes; if should_materialize_witness_on_gpu() { if should_retain_witness_device_backing_after_commit() { @@ -725,15 +724,17 @@ pub(crate) fn estimate_trace_extraction_bytes( } // GPU witgen alone does not imply replayability. 
Non-replayable traces - // still go through basefold::get_trace in cache-none mode, which - // allocates the extracted witness plus a temporary 2x transpose buffer. - return (compact_poly_bytes, 2 * logical_poly_bytes); + // still go through basefold::get_trace in cache-none mode. The fallback + // transpose buffer is 2x the compact RMM backing, not 2x the logical + // domain length. + return (compact_poly_bytes, transpose_temporary_bytes); } if matches!(get_gpu_cache_level(), CacheLevel::None) { // Default cache level is None - // get_trace allocates poly copies (resident) + temp_buffer (2x, freed after) - (compact_poly_bytes, 2 * logical_poly_bytes) + // get_trace allocates poly copies (resident) + temp_buffer over the + // compact RMM backing (2x, freed after). + (compact_poly_bytes, transpose_temporary_bytes) } else { (0, 0) } diff --git a/ceno_zkvm/src/scheme/gpu/mod.rs b/ceno_zkvm/src/scheme/gpu/mod.rs index f816f0e7b..4e5f3d0de 100644 --- a/ceno_zkvm/src/scheme/gpu/mod.rs +++ b/ceno_zkvm/src/scheme/gpu/mod.rs @@ -27,6 +27,7 @@ use ceno_gpu::{ use either::Either; use ff_ext::ExtensionField; use gkr_iop::{ + error::BackendError, gkr::{ self, Evaluation, GKRProof, GKRProverOutput, layer::{LayerWitness, gpu::utils::extract_mle_relationships_from_monomial_terms}, @@ -345,7 +346,7 @@ pub fn prove_tower_relation_impl as ProverBackend>::E>, cuda_hal: &Arc, -) -> TowerRelationOutput { +) -> Result, ZKVMError> { let stream = gkr_iop::gpu::get_thread_stream(); if std::any::TypeId::of::() != std::any::TypeId::of::() { panic!("GPU backend only supports Goldilocks base field"); @@ -360,11 +361,12 @@ pub fn prove_tower_relation_impl> TowerProver(composed_cs, input); check_gpu_mem_estimation_with_context( diff --git a/ceno_zkvm/src/scheme/prover.rs b/ceno_zkvm/src/scheme/prover.rs index 4294adfbb..eafad53ff 100644 --- a/ceno_zkvm/src/scheme/prover.rs +++ b/ceno_zkvm/src/scheme/prover.rs @@ -1372,8 +1372,13 @@ where basic_tr, gkr_iop::gpu::get_thread_stream().as_ref(), ) - 
.expect("gpu tower create_proof failed") - }); + .map_err(|e| { + ZKVMError::BackendError(BackendError::CircuitError( + format!("gpu tower create_proof failed for {name}: {e:?}") + .into_boxed_str(), + )) + }) + })?; log_gpu_device_state(&format!("{name}:after_prove_tower")); log_gpu_pool_usage(&format!("{name}:after_prove_tower")); let rt_tower: Point = unsafe { std::mem::transmute(rt_tower_gl) }; @@ -1469,7 +1474,7 @@ where prove_tower_relation_impl::( cs, &input, &records, challenges, transcript, &cuda_hal, ) - }); + })?; exit_span!(span); drop(records); From ceced51d0da6df0c6a767682f390f2b00ee72a59 Mon Sep 17 00:00:00 2001 From: "sm.wu" Date: Tue, 28 Apr 2026 15:26:48 +0800 Subject: [PATCH 24/25] Fix GPU proof estimate row basis --- ceno_zkvm/src/scheme/gpu/memory.rs | 44 ++++++++++++++++++++++++++---- ceno_zkvm/src/scheme/prover.rs | 19 +++++++++++++ ceno_zkvm/src/scheme/scheduler.rs | 3 ++ 3 files changed, 60 insertions(+), 6 deletions(-) diff --git a/ceno_zkvm/src/scheme/gpu/memory.rs b/ceno_zkvm/src/scheme/gpu/memory.rs index a2bf8f086..c4b54a2ff 100644 --- a/ceno_zkvm/src/scheme/gpu/memory.rs +++ b/ceno_zkvm/src/scheme/gpu/memory.rs @@ -183,11 +183,14 @@ pub fn estimate_chip_proof_memory>, circuit_name: &str, replay_plan: Option<&GpuReplayPlan>, + witness_trace_rows: Option, structural_cached_on_device: bool, ) -> u64 { let num_var_with_rotation = input.log2_num_instances() + composed_cs.rotation_vars().unwrap_or(0); let witness_replayable = replay_plan.is_some(); + let occupied_rows = + estimate_witness_occupied_rows(composed_cs, input, replay_plan, witness_trace_rows); let structural_resident_bytes = if structural_cached_on_device { 0 } else { @@ -204,10 +207,11 @@ pub fn estimate_chip_proof_memory>( + composed_cs: &ComposedConstrainSystem, + input: &ProofInput<'_, GpuBackend>, + replay_plan: Option<&GpuReplayPlan>, + witness_trace_rows: Option, +) -> usize { + if let Some(replay_plan) = replay_plan { + return replay_plan_actual_rows(replay_plan); + } + 
input + .witness + .first() + .map(|mle| mle.evaluations_len()) + .or(witness_trace_rows) + .unwrap_or_else(|| input.num_instances() << composed_cs.rotation_vars().unwrap_or(0)) +} + pub(crate) struct TraceEstimate { /// Persistent resident bytes (witness polys + structural MLEs) pub(crate) trace_resident_bytes: usize, @@ -421,6 +442,7 @@ pub(crate) fn estimate_trace_bytes>, witness_replayable: bool, structural_cached_on_device: bool, + occupied_rows_override: Option, ) -> TraceEstimate { let cs = &composed_cs.zkvm_v1_css; let num_var_with_rotation = @@ -455,7 +477,9 @@ pub(crate) fn estimate_trace_bytes(out_eval: &EvalExpression pub fn main_witness_output_rows>( composed_cs: &ComposedConstrainSystem, input: &ProofInput<'_, GpuBackend>, + occupied_rows_override: Option, ) -> usize { if composed_cs .gkr_circuit @@ -533,6 +558,7 @@ pub fn main_witness_output_rows>( composed_cs: &ComposedConstrainSystem, input: &ProofInput<'_, GpuBackend>, + occupied_rows_override: Option, ) -> (usize, usize, usize, usize) { let cs = &composed_cs.zkvm_v1_css; let num_prod_towers = composed_cs.num_reads() + composed_cs.num_writes(); @@ -622,7 +649,12 @@ fn estimate_tower_stage_components(); let has_logup_numerator = composed_cs.is_with_lk_table(); - let occupied_rows = input.num_instances() << composed_cs.rotation_vars().unwrap_or(0); + let occupied_rows = input + .witness + .first() + .map(|mle| mle.evaluations_len()) + .or(occupied_rows_override) + .unwrap_or_else(|| input.num_instances() << composed_cs.rotation_vars().unwrap_or(0)); let build_est = estimate_build_tower_memory( num_prod_towers, num_logup_towers, @@ -676,7 +708,7 @@ pub(crate) fn estimate_tower_stage_bytes>, ) -> (usize, usize) { let (build_bytes, prove_local_bytes, _, _) = - estimate_tower_stage_components(composed_cs, input); + estimate_tower_stage_components(composed_cs, input, None); (build_bytes, prove_local_bytes) } @@ -685,7 +717,7 @@ pub(crate) fn estimate_tower_bytes>, ) -> usize { let (build_bytes, 
prove_local_bytes, tower_input_live_bytes, _) = - estimate_tower_stage_components(composed_cs, input); + estimate_tower_stage_components(composed_cs, input, None); build_bytes.max(tower_input_live_bytes + prove_local_bytes) } diff --git a/ceno_zkvm/src/scheme/prover.rs b/ceno_zkvm/src/scheme/prover.rs index eafad53ff..eb31e8901 100644 --- a/ceno_zkvm/src/scheme/prover.rs +++ b/ceno_zkvm/src/scheme/prover.rs @@ -222,6 +222,8 @@ impl< let mut structural_rmms = Vec::with_capacity(name_and_instances.len()); #[cfg(feature = "gpu")] let mut gpu_replay_plans = Vec::with_capacity(name_and_instances.len()); + #[cfg(feature = "gpu")] + let mut witness_trace_rows = Vec::with_capacity(name_and_instances.len()); // commit to opcode circuits first and then commit to table circuits, sorted by name for (i, chip_input) in witnesses.into_iter_sorted().enumerate() { let crate::structs::ChipInput { @@ -235,6 +237,15 @@ impl< #[cfg(feature = "gpu")] let use_deferred_gpu_commit = crate::instructions::gpu::config::is_gpu_witgen_enabled() && !crate::instructions::gpu::config::should_retain_witness_device_backing_after_commit(); + #[cfg(feature = "gpu")] + let trace_rows_for_estimate = + if !crate::instructions::gpu::config::is_gpu_witgen_enabled() + && witness_rmm.num_instances() > 0 + { + Some(witness_rmm.height()) + } else { + None + }; #[cfg(feature = "gpu")] if use_deferred_gpu_commit { @@ -255,6 +266,8 @@ impl< } structural_rmms.push(structural_witness_rmm); #[cfg(feature = "gpu")] + witness_trace_rows.push(trace_rows_for_estimate); + #[cfg(feature = "gpu")] gpu_replay_plans.push(gpu_replay_plan); } @@ -366,6 +379,8 @@ impl< structural_rmms, #[cfg(feature = "gpu")] gpu_replay_plans, + #[cfg(feature = "gpu")] + witness_trace_rows, witness_mles, &witness_data, fixed_mles, @@ -873,6 +888,7 @@ impl< name_and_instances: Vec<(String, [usize; 2])>, structural_rmms: Vec>, #[cfg(feature = "gpu")] gpu_replay_plans: Vec>>, + #[cfg(feature = "gpu")] witness_trace_rows: Vec>, #[allow(unused_mut)] 
mut witness_mles: Vec>, witness_data: &PB::PcsData, mut fixed_mles: Vec>>, @@ -1001,6 +1017,7 @@ impl< gpu_input, &circuit_name, gpu_replay_plans[this_idx].as_ref(), + witness_trace_rows[this_idx], structural_cached_on_device, ) }; @@ -1054,6 +1071,8 @@ impl< witness_trace_idx, #[cfg(feature = "gpu")] gpu_replay_plan, + #[cfg(feature = "gpu")] + witness_trace_rows: witness_trace_rows[this_idx], num_witin: cs.num_witin(), structural_rmm: task_structural_rmm, }); diff --git a/ceno_zkvm/src/scheme/scheduler.rs b/ceno_zkvm/src/scheme/scheduler.rs index e792b6fd4..060f91083 100644 --- a/ceno_zkvm/src/scheme/scheduler.rs +++ b/ceno_zkvm/src/scheme/scheduler.rs @@ -90,6 +90,9 @@ pub struct ChipTask<'a, PB: ProverBackend> { /// Replay witness directly from shard-resident raw GPU data when available. #[cfg(feature = "gpu")] pub gpu_replay_plan: Option>, + /// Actual witness trace rows used for cache-none extraction estimates. + #[cfg(feature = "gpu")] + pub witness_trace_rows: Option, /// Expected number of witness polynomials for this circuit pub num_witin: usize, /// CPU-side structural witness RowMajorMatrix, transported to GPU on-demand From d1ab71a052c7ef5a49a9aa3b8daaec66e1c380f8 Mon Sep 17 00:00:00 2001 From: "sm.wu" Date: Tue, 28 Apr 2026 15:44:25 +0800 Subject: [PATCH 25/25] Tune ShardRam tower proof estimate --- ceno_zkvm/src/scheme/gpu/memory.rs | 39 ++++++++++++++++++------------ ceno_zkvm/src/scheme/gpu/mod.rs | 4 +-- ceno_zkvm/src/scheme/prover.rs | 12 +++------ 3 files changed, 30 insertions(+), 25 deletions(-) diff --git a/ceno_zkvm/src/scheme/gpu/memory.rs b/ceno_zkvm/src/scheme/gpu/memory.rs index c4b54a2ff..02617b83a 100644 --- a/ceno_zkvm/src/scheme/gpu/memory.rs +++ b/ceno_zkvm/src/scheme/gpu/memory.rs @@ -46,15 +46,7 @@ pub fn init_gpu_mem_tracker<'a>( const ESTIMATION_TOLERANCE_BYTES: usize = 2 * 1024 * 1024; // max under-estimation error: 2 MB const ESTIMATION_SAFETY_MARGIN_BYTES: usize = 10 * 1024 * 1024; // reserved headroom / allowed over-estimate 
margin: 10 MB const SCHEDULER_ESTIMATION_WARNING_MARGIN_BYTES: usize = 512 * 1024 * 1024; -const SHARD_RAM_TOWER_PROVE_ALLOCATOR_OVERHEAD_BYTES: usize = 16 * 1024 * 1024; - -pub(crate) fn tower_prove_allocator_overhead_bytes(circuit_name: &str) -> usize { - if circuit_name == "ShardRamCircuit" { - SHARD_RAM_TOWER_PROVE_ALLOCATOR_OVERHEAD_BYTES - } else { - 0 - } -} +const SHARD_RAM_TOWER_PROVE_TOLERANCE_BYTES: usize = 16 * 1024 * 1024; /// Validate that the estimated GPU memory matches actual usage within tolerance. /// - Under-estimate (actual > estimated): diff must be <= `ESTIMATION_TOLERANCE_BYTES` @@ -77,6 +69,28 @@ pub fn check_gpu_mem_estimation_with_context( ); } +pub(crate) fn check_gpu_tower_prove_mem_estimation_with_context( + mem_tracker: Option, + estimated_bytes: usize, + context: Option<&str>, +) { + let (under_tolerance_bytes, over_tolerance_bytes) = if context == Some("ShardRamCircuit") { + ( + SHARD_RAM_TOWER_PROVE_TOLERANCE_BYTES, + SHARD_RAM_TOWER_PROVE_TOLERANCE_BYTES, + ) + } else { + (ESTIMATION_TOLERANCE_BYTES, ESTIMATION_SAFETY_MARGIN_BYTES) + }; + check_gpu_mem_estimation_with_margins( + mem_tracker, + estimated_bytes, + context, + under_tolerance_bytes, + over_tolerance_bytes, + ); +} + pub fn check_gpu_scheduler_mem_estimation_with_context( mem_tracker: Option, estimated_bytes: usize, @@ -686,12 +700,7 @@ fn estimate_tower_stage_components = unsafe { std::mem::transmute(rt_tower_gl) }; let tower_proof: TowerProofs = unsafe { std::mem::transmute(tower_proof_gpu) }; - crate::scheme::gpu::check_gpu_mem_estimation_with_context( + crate::scheme::gpu::check_gpu_tower_prove_mem_estimation_with_context( tower_prove_mem_tracker, tower_prove_estimated_bytes, Some(name),