diff --git a/benchmarks/tpc/README.md b/benchmarks/tpc/README.md
index 319288df05..fac54a7894 100644
--- a/benchmarks/tpc/README.md
+++ b/benchmarks/tpc/README.md
@@ -38,17 +38,21 @@ All benchmarks are run via `run.py`:
 
 ```shell
 python3 run.py --engine <engine> --benchmark <benchmark> [options]
 ```
 
-| Option         | Description                                               |
-| -------------- | --------------------------------------------------------- |
-| `--engine`     | Engine name (matches a TOML file in `engines/`)           |
-| `--benchmark`  | `tpch` or `tpcds`                                         |
-| `--iterations` | Number of iterations (default: 1)                         |
-| `--output`     | Output directory (default: `.`)                           |
-| `--query`      | Run a single query number                                 |
-| `--no-restart` | Skip Spark master/worker restart                          |
-| `--dry-run`    | Print the spark-submit command without executing          |
-| `--jfr`        | Enable Java Flight Recorder profiling                     |
-| `--jfr-dir`    | Directory for JFR output files (default: `/results/jfr`)  |
+| Option                    | Description                                                                      |
+| ------------------------- | -------------------------------------------------------------------------------- |
+| `--engine`                | Engine name (matches a TOML file in `engines/`)                                  |
+| `--benchmark`             | `tpch` or `tpcds`                                                                |
+| `--iterations`            | Number of iterations (default: 1)                                                |
+| `--output`                | Output directory (default: `.`)                                                  |
+| `--query`                 | Run a single query number                                                        |
+| `--no-restart`            | Skip Spark master/worker restart                                                 |
+| `--dry-run`               | Print the spark-submit command without executing                                 |
+| `--jfr`                   | Enable Java Flight Recorder profiling                                            |
+| `--jfr-dir`               | Directory for JFR output files (default: `/results/jfr`)                         |
+| `--async-profiler`        | Enable async-profiler (profiles Java + native code)                              |
+| `--async-profiler-dir`    | Directory for async-profiler output (default: `/results/async-profiler`)         |
+| `--async-profiler-event`  | Event type: `cpu`, `wall`, `alloc`, `lock`, etc. (default: `cpu`)                |
+| `--async-profiler-format` | Output format: `flamegraph`, `jfr`, `collapsed`, `text` (default: `flamegraph`)  |
 
 Available engines: `spark`, `comet`, `comet-iceberg`, `gluten`
@@ -392,3 +396,88 @@ docker compose -f benchmarks/tpc/infra/docker/docker-compose.yml \
 
 Open the `.jfr` files with [JDK Mission Control](https://jdk.java.net/jmc/),
 IntelliJ IDEA's profiler, or `jfr` CLI tool (`jfr summary driver.jfr`).
+
+## async-profiler Profiling
+
+Use the `--async-profiler` flag to capture profiles with
+[async-profiler](https://github.com/async-profiler/async-profiler). Unlike JFR,
+async-profiler can profile **both Java and native (Rust/C++) code** in the same
+flame graph, making it especially useful for profiling Comet workloads.
+
+### Prerequisites
+
+async-profiler must be installed on every node where the driver or executors
+run. Set `ASYNC_PROFILER_HOME` to the installation directory:
+
+```shell
+# Download and extract (Linux x64 example)
+wget https://github.com/async-profiler/async-profiler/releases/download/v3.0/async-profiler-3.0-linux-x64.tar.gz
+sudo mkdir -p /opt/async-profiler
+sudo tar xzf async-profiler-3.0-linux-x64.tar.gz -C /opt/async-profiler --strip-components=1
+export ASYNC_PROFILER_HOME=/opt/async-profiler
+```
+
+On Linux, `kernel.perf_event_paranoid` must be set to `1` or lower to allow
+perf-based profiling:
+
+```shell
+sudo sysctl kernel.perf_event_paranoid=1 # or 0 / -1 for full access
+sudo sysctl kernel.kptr_restrict=0       # optional: enable kernel symbols
+```
+
+### Basic usage
+
+```shell
+python3 run.py --engine comet --benchmark tpch --async-profiler
+```
+
+This produces HTML flame graphs in `/results/async-profiler/` by default
+(`driver.html` and `executor.html`).
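+
+For a more targeted flame graph, you can profile a single query rather than
+the whole suite by combining `--async-profiler` with `--query` (both flags are
+listed in the options table above):
+
+```shell
+# Example: profile only TPC-H query 5
+python3 run.py --engine comet --benchmark tpch --query 5 --async-profiler
+```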
+
+### Choosing events and output format
+
+```shell
+# Wall-clock profiling (includes time spent waiting/sleeping)
+python3 run.py --engine comet --benchmark tpch \
+    --async-profiler --async-profiler-event wall
+
+# Allocation profiling with JFR output
+python3 run.py --engine comet --benchmark tpch \
+    --async-profiler --async-profiler-event alloc --async-profiler-format jfr
+
+# Lock contention profiling
+python3 run.py --engine comet --benchmark tpch \
+    --async-profiler --async-profiler-event lock
+```
+
+| Event   | Description                                          |
+| ------- | ---------------------------------------------------- |
+| `cpu`   | On-CPU time (default). Shows where CPU cycles go.    |
+| `wall`  | Wall-clock time. Includes threads that are blocked.  |
+| `alloc` | Heap allocation profiling.                           |
+| `lock`  | Lock contention profiling.                           |
+
+| Format       | Extension | Description                               |
+| ------------ | --------- | ----------------------------------------- |
+| `flamegraph` | `.html`   | Interactive HTML flame graph (default).   |
+| `jfr`        | `.jfr`    | JFR format, viewable in JMC or IntelliJ.  |
+| `collapsed`  | `.txt`    | Collapsed stacks for FlameGraph scripts.  |
+| `text`       | `.txt`    | Flat text summary of hot methods.         |
+
+### Docker usage
+
+The Docker image includes async-profiler pre-installed at
+`/opt/async-profiler`. The `ASYNC_PROFILER_HOME` environment variable is
+already set in the compose files, so no extra configuration is needed:
+
+```shell
+docker compose -f benchmarks/tpc/infra/docker/docker-compose.yml \
+  run --rm bench \
+  python3 /opt/benchmarks/run.py \
+  --engine comet --benchmark tpch --output /results --no-restart --async-profiler
+```
+
+Output files are collected in `$RESULTS_DIR/async-profiler/` on the host.
+
+**Note:** On Linux, `cpu`/`wall` events require the container to run with
+`--privileged` or the `SYS_PTRACE` capability, and `perf_event_paranoid <= 1`
+on the host. Allocation (`alloc`) and lock (`lock`) events work without
+special privileges.
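+
+### Verifying the configuration
+
+To inspect the generated `-agentpath` options without running the benchmark,
+combine `--async-profiler` with `--dry-run`, which prints the `spark-submit`
+command without executing it:
+
+```shell
+python3 run.py --engine comet --benchmark tpch --async-profiler --dry-run
+```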
diff --git a/benchmarks/tpc/infra/docker/Dockerfile b/benchmarks/tpc/infra/docker/Dockerfile
index 60567536ad..9bf5ae3935 100644
--- a/benchmarks/tpc/infra/docker/Dockerfile
+++ b/benchmarks/tpc/infra/docker/Dockerfile
@@ -29,10 +29,23 @@ RUN apt-get update \
     && apt-get install -y --no-install-recommends \
     openjdk-8-jdk-headless \
     openjdk-17-jdk-headless \
-    python3 python3-pip procps \
+    python3 python3-pip procps wget \
     && apt-get clean \
     && rm -rf /var/lib/apt/lists/*
 
+# Install async-profiler for profiling Java + native (Rust/C++) code.
+ARG ASYNC_PROFILER_VERSION=3.0
+RUN ARCH=$(uname -m) && \
+    if [ "$ARCH" = "x86_64" ]; then AP_ARCH="linux-x64"; \
+    elif [ "$ARCH" = "aarch64" ]; then AP_ARCH="linux-aarch64"; \
+    else echo "Unsupported architecture: $ARCH" && exit 1; fi && \
+    wget -q "https://github.com/async-profiler/async-profiler/releases/download/v${ASYNC_PROFILER_VERSION}/async-profiler-${ASYNC_PROFILER_VERSION}-${AP_ARCH}.tar.gz" \
+    -O /tmp/async-profiler.tar.gz && \
+    mkdir -p /opt/async-profiler && \
+    tar xzf /tmp/async-profiler.tar.gz -C /opt/async-profiler --strip-components=1 && \
+    rm /tmp/async-profiler.tar.gz
+ENV ASYNC_PROFILER_HOME=/opt/async-profiler
+
 # Default to Java 17 (override with JAVA_HOME at runtime for Gluten).
 # Detect architecture (amd64 or arm64) so the image works on both Linux and macOS.
 ARG TARGETARCH
diff --git a/benchmarks/tpc/infra/docker/docker-compose-laptop.yml b/benchmarks/tpc/infra/docker/docker-compose-laptop.yml
index 7272684065..02a520894d 100644
--- a/benchmarks/tpc/infra/docker/docker-compose-laptop.yml
+++ b/benchmarks/tpc/infra/docker/docker-compose-laptop.yml
@@ -30,6 +30,7 @@
 # ICEBERG_JAR      - Host path to Iceberg Spark runtime JAR
 # BENCH_JAVA_HOME  - Java home inside container (default: /usr/lib/jvm/java-17-openjdk)
 #                    Set to /usr/lib/jvm/java-8-openjdk for Gluten
+# ASYNC_PROFILER_HOME - async-profiler install path (default: /opt/async-profiler)
 
 x-volumes: &volumes
   - ${DATA_DIR:-/tmp/tpc-data}:/data:ro
@@ -95,5 +96,6 @@ services:
       - TPCH_DATA=/data
       - TPCDS_DATA=/data
       - SPARK_EVENT_LOG_DIR=/results/spark-events
+      - ASYNC_PROFILER_HOME=/opt/async-profiler
     mem_limit: 4g
     memswap_limit: 4g
diff --git a/benchmarks/tpc/infra/docker/docker-compose.yml b/benchmarks/tpc/infra/docker/docker-compose.yml
index f5c9f0ebe9..4e9f45d031 100644
--- a/benchmarks/tpc/infra/docker/docker-compose.yml
+++ b/benchmarks/tpc/infra/docker/docker-compose.yml
@@ -33,6 +33,7 @@
 # BENCH_MEM_LIMIT  - Hard memory limit for the bench runner (default: 10g)
 # BENCH_JAVA_HOME  - Java home inside container (default: /usr/lib/jvm/java-17-openjdk)
 #                    Set to /usr/lib/jvm/java-8-openjdk for Gluten
+# ASYNC_PROFILER_HOME - async-profiler install path (default: /opt/async-profiler)
 
 x-volumes: &volumes
   - ${DATA_DIR:-/tmp/tpc-data}:/data:ro
@@ -109,6 +110,7 @@ services:
       - TPCH_DATA=/data
       - TPCDS_DATA=/data
       - SPARK_EVENT_LOG_DIR=/results/spark-events
+      - ASYNC_PROFILER_HOME=/opt/async-profiler
     mem_limit: ${BENCH_MEM_LIMIT:-10g}
     memswap_limit: ${BENCH_MEM_LIMIT:-10g}
diff --git a/benchmarks/tpc/run.py b/benchmarks/tpc/run.py
index 58afc0bbea..5a89166cda 100755
--- a/benchmarks/tpc/run.py
+++ b/benchmarks/tpc/run.py
@@ -279,6 +279,38 @@ def build_spark_submit_cmd(config, benchmark, args):
         existing = conf.get(spark_key, "")
         conf[spark_key] = f"{existing} {jfr_opts}".strip()
 
+    # async-profiler: attach as a Java agent via -agentpath
+    if args.async_profiler:
+        ap_home = os.environ.get("ASYNC_PROFILER_HOME", "")
+        if not ap_home:
+            print(
+                "Error: ASYNC_PROFILER_HOME is not set. "
+                "Set it to the async-profiler installation directory.",
+                file=sys.stderr,
+            )
+            sys.exit(1)
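+        # The agent library ships in lib/ as libasyncProfiler.so
+        # (libasyncProfiler.dylib on macOS); -agentpath attaches it at JVM
+        # startup, so the entire run is profiled.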
" + "Set it to the async-profiler installation directory.", + file=sys.stderr, + ) + sys.exit(1) + lib_ext = "dylib" if sys.platform == "darwin" else "so" + ap_lib = os.path.join(ap_home, "lib", f"libasyncProfiler.{lib_ext}") + ap_dir = args.async_profiler_dir + ap_event = args.async_profiler_event + ap_fmt = args.async_profiler_format + ext = {"flamegraph": "html", "jfr": "jfr", "collapsed": "txt", "text": "txt"}[ap_fmt] + + driver_ap = ( + f"-agentpath:{ap_lib}=start,event={ap_event}," + f"{ap_fmt},file={ap_dir}/driver.{ext}" + ) + executor_ap = ( + f"-agentpath:{ap_lib}=start,event={ap_event}," + f"{ap_fmt},file={ap_dir}/executor.{ext}" + ) + for spark_key, ap_opts in [ + ("spark.driver.extraJavaOptions", driver_ap), + ("spark.executor.extraJavaOptions", executor_ap), + ]: + existing = conf.get(spark_key, "") + conf[spark_key] = f"{existing} {ap_opts}".strip() + for key, val in sorted(conf.items()): cmd += ["--conf", f"{key}={val}"] @@ -385,6 +417,27 @@ def main(): default="/results/jfr", help="Directory for JFR output files (default: /results/jfr)", ) + parser.add_argument( + "--async-profiler", + action="store_true", + help="Enable async-profiler for driver and executors (profiles Java + native code)", + ) + parser.add_argument( + "--async-profiler-dir", + default="/results/async-profiler", + help="Directory for async-profiler output files (default: /results/async-profiler)", + ) + parser.add_argument( + "--async-profiler-event", + default="cpu", + help="async-profiler event type: cpu, wall, alloc, lock, etc. (default: cpu)", + ) + parser.add_argument( + "--async-profiler-format", + default="flamegraph", + choices=["flamegraph", "jfr", "collapsed", "text"], + help="async-profiler output format (default: flamegraph)", + ) args = parser.parse_args() config = load_engine_config(args.engine) @@ -401,9 +454,12 @@ def main(): if not args.no_restart and not args.dry_run: restart_spark() - # Create JFR output directory if profiling is enabled - if args.jfr: - os.makedirs(args.jfr_dir, exist_ok=True) + # Create profiling output directories (skip for dry-run) + if not args.dry_run: + if args.jfr: + os.makedirs(args.jfr_dir, exist_ok=True) + if args.async_profiler: + os.makedirs(args.async_profiler_dir, exist_ok=True) cmd = build_spark_submit_cmd(config, args.benchmark, args)