diff --git a/crates/exec-harness/preload/codspeed_preload.c b/crates/exec-harness/preload/codspeed_preload.c
index 0b908e04..418af143 100644
--- a/crates/exec-harness/preload/codspeed_preload.c
+++ b/crates/exec-harness/preload/codspeed_preload.c
@@ -7,8 +7,6 @@
 //
 // Environment variables:
 //   CODSPEED_BENCH_URI - The benchmark URI to report (required)
-//   CODSPEED_PRELOAD_LOCK - Set by the first process to prevent child processes
-//                           from re-initializing instrumentation
 
 #include
 #include
@@ -22,8 +20,6 @@
 #define RUNNING_ON_VALGRIND 0
 #endif
-static const char *LOCK_ENV = "CODSPEED_PRELOAD_LOCK";
-
 // These constants are defined by the build script (build.rs) via -D flags
 #ifndef CODSPEED_URI_ENV
 #error "CODSPEED_URI_ENV must be defined by the build system"
 #endif
@@ -54,14 +50,6 @@ __attribute__((constructor)) static void codspeed_preload_init(void) {
     return;
   }
 
-  // Check if another process already owns the instrumentation
-  if (getenv(LOCK_ENV)) {
-    return;
-  }
-
-  // Set the lock to prevent child processes from initializing
-  setenv(LOCK_ENV, "1", 1);
-
   g_bench_uri = getenv(URI_ENV);
   if (!g_bench_uri) {
     return;
diff --git a/crates/exec-harness/src/analysis/mod.rs b/crates/exec-harness/src/analysis/mod.rs
index 65e220d7..430a6ef9 100644
--- a/crates/exec-harness/src/analysis/mod.rs
+++ b/crates/exec-harness/src/analysis/mod.rs
@@ -6,6 +6,7 @@ use crate::BenchmarkCommand;
 use crate::constants;
 use crate::uri;
 use instrument_hooks_bindings::InstrumentHooks;
+use std::path::PathBuf;
 use std::process::Command;
 
 mod ld_preload_check;
@@ -54,9 +55,15 @@ pub fn perform_with_valgrind(commands: Vec<BenchmarkCommand>) -> Result<()> {
         cmd.args(&benchmark_cmd.command[1..]);
         // Use LD_PRELOAD to inject instrumentation into the child process
         cmd.env("LD_PRELOAD", preload_lib_path);
+        // Make sure python processes output perf maps. This is usually done by `pytest-codspeed`
+        cmd.env("PYTHONPERFSUPPORT", "1");
         cmd.env(constants::URI_ENV, &name_and_uri.uri);
 
-        let status = cmd.status().context("Failed to execute command")?;
+        let mut child = cmd.spawn().context("Failed to spawn command")?;
+
+        let status = child.wait().context("Failed to execute command")?;
+
+        bail_if_command_spawned_subprocesses_under_valgrind(child.id())?;
 
         if !status.success() {
             bail!("Command exited with non-zero status: {status}");
@@ -65,3 +72,48 @@ pub fn perform_with_valgrind(commands: Vec<BenchmarkCommand>) -> Result<()> {
 
     Ok(())
 }
+
+/// Checks if the benchmark process spawned subprocesses under valgrind by looking for <pid>.out
+/// files in the profile folder.
+///
+/// The presence of <pid>.out files where <pid> is greater than the benchmark process pid indicates
+/// that the benchmark process spawned subprocesses. Such a <pid>.out file will be almost empty,
+/// with a 0 cost reported due to the disabled instrumentation.
+///
+/// We currently do not support measuring processes that spawn subprocesses under valgrind, because
+/// valgrind will not have its instrumentation in the new process.
+/// The LD_PRELOAD trick that we use to inject our instrumentation into the benchmark process only
+/// works for the first process.
+///
+/// TODO(COD-2163): Remove this once we support nested processes under valgrind
+fn bail_if_command_spawned_subprocesses_under_valgrind(pid: u32) -> Result<()> {
+    let Some(profile_folder) = std::env::var_os("CODSPEED_PROFILE_FOLDER") else {
+        debug!("CODSPEED_PROFILE_FOLDER is not set, skipping subprocess detection");
+        return Ok(());
+    };
+
+    let profile_folder = PathBuf::from(profile_folder);
+
+    // Bail if any <pid>.out where <pid> > pid of the benchmark process exists in the profile
+    // folder, which indicates that the benchmark process spawned subprocesses.
+    for entry in std::fs::read_dir(profile_folder)? {
+        let entry = entry?;
+        let file_name = entry.file_name();
+        let file_name = file_name.to_string_lossy();
+
+        if let Some(stripped) = file_name.strip_suffix(".out") {
+            if let Ok(subprocess_pid) = stripped.parse::<u32>() {
+                if subprocess_pid > pid {
+                    bail!(
+                        "The codspeed CLI in CPU Simulation mode does not support measuring processes that spawn other processes yet.\n\n\
+                        Please either:\n\
+                        - Use the walltime measurement mode, or\n\
+                        - Benchmark a process that does not create subprocesses"
+                    )
+                }
+            }
+        }
+    }
+
+    Ok(())
+}
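
Note on the perf-map handling above: with PYTHONPERFSUPPORT=1, CPython writes a perf map to /tmp/perf-<pid>.map describing its frames, and the runner later copies those maps into the profile folder. The sketch below shows that harvesting step in isolation; it is a simplified stand-in rather than the crate's actual harvest_perf_maps_for_pids, and the name harvest_python_perf_maps is made up for the example.

// Illustrative sketch only -- not the runner's real implementation.
use std::collections::HashSet;
use std::path::{Path, PathBuf};

async fn harvest_python_perf_maps(profile_folder: &Path, pids: &HashSet<u32>) -> std::io::Result<()> {
    for pid in pids {
        // CPython (with PYTHONPERFSUPPORT=1) emits its perf map at /tmp/perf-<pid>.map.
        let src = PathBuf::from("/tmp").join(format!("perf-{pid}.map"));
        if !src.exists() {
            // Not every pid belongs to a Python process with perf support enabled.
            continue;
        }
        let dst = profile_folder.join(format!("perf-{pid}.map"));
        // tokio::fs::copy mirrors std::fs::copy without blocking the async runtime,
        // which is why the diff above switches harvest_perf_maps_for_pids to tokio::fs.
        tokio::fs::copy(&src, &dst).await?;
    }
    Ok(())
}

#[tokio::main]
async fn main() -> std::io::Result<()> {
    let pids = HashSet::from([std::process::id()]);
    harvest_python_perf_maps(&std::env::temp_dir(), &pids).await
}
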
diff --git a/src/executor/helpers/harvest_perf_maps_for_pids.rs b/src/executor/helpers/harvest_perf_maps_for_pids.rs
index 67270359..4ffda881 100644
--- a/src/executor/helpers/harvest_perf_maps_for_pids.rs
+++ b/src/executor/helpers/harvest_perf_maps_for_pids.rs
@@ -1,7 +1,7 @@
 use crate::prelude::*;
 use std::collections::HashSet;
-use std::fs;
 use std::path::{Path, PathBuf};
+use tokio::fs;
 
 pub async fn harvest_perf_maps_for_pids(
     profile_folder: &Path,
@@ -21,7 +21,7 @@
     debug!("Found {} perf maps", perf_maps.len());
 
     for (src_path, dst_path) in perf_maps {
-        fs::copy(&src_path, &dst_path).map_err(|e| {
+        fs::copy(&src_path, &dst_path).await.map_err(|e| {
             anyhow!(
                 "Failed to copy perf map file: {:?} to {}: {}",
                 src_path.file_name(),
diff --git a/src/executor/shared/fifo.rs b/src/executor/shared/fifo.rs
index 40f2cc81..723f921c 100644
--- a/src/executor/shared/fifo.rs
+++ b/src/executor/shared/fifo.rs
@@ -69,7 +69,6 @@ impl GenericFifo {
 pub struct FifoBenchmarkData {
     /// Name and version of the integration
     pub integration: Option<(String, String)>,
-    pub bench_pids: HashSet<u32>,
 }
 
 pub struct RunnerFifo {
@@ -255,10 +254,7 @@ impl RunnerFifo {
                 );
                 let marker_result = ExecutionTimestamps::new(&bench_order_by_timestamp, &markers);
 
-                let fifo_data = FifoBenchmarkData {
-                    integration,
-                    bench_pids,
-                };
+                let fifo_data = FifoBenchmarkData { integration };
                 return Ok((marker_result, fifo_data, exit_status));
             }
             Err(e) => return Err(anyhow::Error::from(e)),
diff --git a/src/executor/wall_time/perf/jit_dump.rs b/src/executor/wall_time/perf/jit_dump.rs
index a337f263..9f92c4f4 100644
--- a/src/executor/wall_time/perf/jit_dump.rs
+++ b/src/executor/wall_time/perf/jit_dump.rs
@@ -102,7 +102,10 @@ impl JitDump {
 }
 
 /// Converts all the `jit-<pid>.dump` into unwind data and copies it to the profile folder.
-pub async fn harvest_perf_jit_for_pids(profile_folder: &Path, pids: &HashSet<u32>) -> Result<()> {
+pub async fn harvest_perf_jit_for_pids(
+    profile_folder: &Path,
+    pids: &HashSet<u32>,
+) -> Result<()> {
     for pid in pids {
         let name = format!("jit-{pid}.dump");
         let path = PathBuf::from("/tmp").join(&name);
@@ -131,7 +134,7 @@ pub async fn harvest_perf_jit_for_pids(profile_folder: &Path, pids: &HashSet<u32>) -> Result<()> {
+    pub async fn save_to<P: AsRef<Path>>(
         &self,
         path: P,
         perf_file_path: P,
     ) -> Result<(), BenchmarkDataSaveError> {
         self.marker_result.save_to(&path).unwrap();
+        let path_ref = path.as_ref();
+
         debug!("Reading perf data from file for mmap extraction");
         let MemmapRecordsOutput {
             symbols_by_pid,
@@ -294,7 +293,23 @@ impl BenchmarkData {
             })?
         };
 
-        let path_ref = path.as_ref();
+        // Harvest the perf maps generated by python. This will copy the perf
+        // maps from /tmp to the profile folder. We have to write our own perf
+        // maps to these files AFTERWARDS, otherwise they'll be overwritten!
+        let bench_pids = symbols_by_pid.keys().copied().collect();
+        harvest_perf_maps_for_pids(path_ref, &bench_pids)
+            .await
+            .map_err(|e| {
+                error!("Failed to harvest perf maps: {e}");
+                BenchmarkDataSaveError::FailedToHarvestPerfMaps
+            })?;
+        harvest_perf_jit_for_pids(path_ref, &bench_pids)
+            .await
+            .map_err(|e| {
+                error!("Failed to harvest jit dumps: {e}");
+                BenchmarkDataSaveError::FailedToHarvestJitDumps
+            })?;
+
         debug!("Saving symbols addresses");
         symbols_by_pid.par_iter().for_each(|(_, proc_sym)| {
             proc_sym.save_to(path_ref).unwrap();
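
A note on the new ordering in save_to: the harvested Python perf maps and the runner's own symbol entries end up in the same perf-<pid>.map files in the profile folder, so whichever write happens last wins. The sketch below illustrates the idea, assuming the runner appends its own entries after harvesting (the exact write path is not shown in this diff); append_symbol and its arguments are hypothetical names for the example.

// Hedged sketch: appending keeps the Python-generated entries that harvesting copied in,
// whereas recreating the file (File::create) would truncate and lose them.
use std::fs::OpenOptions;
use std::io::Write;
use std::path::Path;

fn append_symbol(
    profile_folder: &Path,
    pid: u32,
    start: u64,
    size: u64,
    name: &str,
) -> std::io::Result<()> {
    let path = profile_folder.join(format!("perf-{pid}.map"));
    let mut file = OpenOptions::new()
        .create(true)
        .append(true) // keep whatever harvesting already copied into this file
        .open(path)?;
    // perf map lines are "START SIZE symbol_name", with START and SIZE in hex.
    writeln!(file, "{start:x} {size:x} {name}")
}

fn main() -> std::io::Result<()> {
    append_symbol(
        &std::env::temp_dir(),
        std::process::id(),
        0x7f00_0000,
        0x40,
        "my_benchmark_fn",
    )
}

Either way, the last writer to a given perf-<pid>.map wins, which is why save_to now harvests before writing the runner's own symbol data.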