Skip to content

Commit aa57898

Browse files
authored
Add files via upload
1 parent fb68dd7 commit aa57898

22 files changed

Lines changed: 6631 additions & 1 deletion

Cargo.lock

Lines changed: 2960 additions & 0 deletions
Some generated files are not rendered by default. Learn more about customizing how changed files appear on GitHub.

Cargo.toml

Lines changed: 57 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,57 @@
1+
[package]
2+
name = "bazelbench-rs"
3+
version = "1.0.0"
4+
edition = "2021"
5+
6+
# Overrides for the release profile.
[profile.release]
7+
# Incremental compilation is switched on for release builds
# (presumably to shorten rebuild times — confirm).
incremental = true
8+
# overflow-checks = true
9+
# debug = true
10+
# lto = true
11+
# Highest optimization level.
opt-level = 3
12+
13+
[lib]
14+
name = "bazelbench"
15+
path = "src/lib.rs"
16+
17+
# Optional cargo features. `cuda` is opt-in: the `default` line below is
# commented out, so it is only active when requested with `--features cuda`.
[features]
18+
#default = ["cuda"]
19+
# Enables the optional `cust` and `gpu_rand` dependencies declared under [dependencies].
cuda = ["dep:cust", "dep:gpu_rand"]
20+
21+
[dependencies]
22+
anyhow = "1.0.82"
23+
base64 = "0.22.0"
24+
bazel-remote-apis-rs = { git = "https://github.com/zachgrayio/bazel-remote-apis-rs", tag = "v0.4.1"}
25+
clap = { version = "4.5.4", features = ["derive", "env"] }
26+
dotenv = "0.15.0"
27+
futures = "0.3.17"
28+
hex = "0.4.3"
29+
lazy_static = "1.4.0"
30+
prometheus = "0.13.3"
31+
prost = "0.11.6"
32+
prost-types = "0.11.6"
33+
rand = "0.8.5"
34+
reqwest = { version = "0.12.3", features = ["json"] }
35+
serde = { version = "1.0.152", features = ["derive"] }
36+
sha2 = "0.10.6"
37+
time = "0.3.36"
38+
tokio = { version = "1.37.0", features = ["rt", "rt-multi-thread", "macros"] }
39+
tokio-stream = { version = "0.1.15", features = ["fs", "sync"] }
40+
tokio-graceful-shutdown = "0.15.0"
41+
tonic = { version = "0.8.3", features = ["tls"]}
42+
uuid = { version = "1.8.0", features = ["v4"] }
43+
warp = "0.3.7"
44+
fastrand = "2.0.2"
45+
blake3 = { version = "1.5.1", features = ["rayon"] }
46+
47+
gpu_rand = { version = "0.1.3", optional = true }
48+
cust = { version = "0.3.2", optional = true }
49+
rand_core = "0.6.4"
50+
51+
[dev-dependencies]
52+
criterion = { version = "0.5.1", features = ["html_reports"] }
53+
54+
[[bench]]
55+
name = "rust_benchmarks"
56+
path = "src/rust_bench.rs"
57+
harness = false

Makefile

Lines changed: 152 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,152 @@
1+
#!make
2+
3+
# set up env from env file, similar to rust's dotenv
4+
# Load variables from an optional .env file and export them to every recipe.
# Only lines shaped like NAME=value contribute a name to `export`, so comments
# and blank lines in .env can no longer inject stray words into the directive.
ifneq ($(wildcard .env),)
include .env
export $(shell sed -n 's/^[[:space:]]*\([A-Za-z_][A-Za-z0-9_]*\)[[:space:]]*=.*/\1/p' .env)
endif
8+
9+
# Check if CUDA_HOME is set or nvcc is in PATH
10+
# 1 when CUDA_HOME is set in the environment or nvcc is found on PATH, else 0.
# `command -v` is the POSIX lookup (unlike `which`), and both of its output
# streams are discarded so a missing nvcc prints nothing at parse time.
CUDA_INSTALLED := $(shell (test -n "$$CUDA_HOME" || command -v nvcc >/dev/null 2>&1) && echo 1 || echo 0)
11+
12+
# choose the appropriate base command for building and running the benchmark with or without the cuda feature
13+
# Base command shared by the benchmark targets. The `cuda` cargo feature is
# requested only when CUDA was detected. The leading `@` stops make from
# echoing the command; the trailing `--` separates cargo's flags from the
# benchmark's own flags, which each target appends.
ifeq ($(CUDA_INSTALLED),1)
cargo_feature_args := --features cuda
else
cargo_feature_args :=
endif
CARGO_RUN_CMD = @cargo run --release --bin bazelbench-rs $(cargo_feature_args) --
18+
19+
# Print the benchmark binary's command-line usage.
.PHONY: help
help:
	$(CARGO_RUN_CMD) --help
21+
22+
# Profile a single-task run (one thread, read amplification 1) with
# `cargo flamegraph`, after starting grafana and bazel-remote.
# CARGO_PROFILE_RELEASE_DEBUG=true turns on debug info for the release profile.
# NOTE(review): `--root` presumably makes cargo-flamegraph run elevated — confirm.
.PHONY: flamegraph
flamegraph: grafana bazel_remote
	CARGO_PROFILE_RELEASE_DEBUG=true cargo flamegraph --root --bin bazelbench-rs -- --read_amplification 1 --flamegraph --num_threads 1
24+
25+
# Write/read benchmark against the endpoint at localhost:1337, reported under
# the "garnet" label, using blake3 digests, 100 tasks and 100x read amplification.
# NOTE(review): no prerequisite starts that backend; presumably it must
# already be listening on port 1337 — confirm.
.PHONY: bench_garnet
bench_garnet: grafana
	$(CARGO_RUN_CMD) \
		--target_endpoint http://localhost:1337 \
		--ac_read_sleep_ms 0 \
		--digest_function blake3 \
		--label garnet \
		--num_threads 100 --read_amplification 100
32+
33+
# Write-only benchmark against the endpoint at localhost:1337, reported under
# the "garnet" label, using blake3 digests and 100 tasks.
# NOTE(review): no prerequisite starts that backend; presumably it must
# already be listening on port 1337 — confirm.
.PHONY: bench_garnet_write_only
bench_garnet_write_only: grafana
	$(CARGO_RUN_CMD) \
		--target_endpoint http://localhost:1337 \
		--write_only \
		--num_threads 100 \
		--digest_function blake3 \
		--label garnet
40+
41+
# bazel-remote benchmarks
42+
# Default write/read benchmark against a freshly started bazel-remote, with a
# 100 ms pause before each action-cache read.
# NOTE(review): no --target_endpoint is passed; presumably it is supplied via
# TARGET_ENDPOINT from .env — confirm.
.PHONY: bench_bazelremote
bench_bazelremote: grafana bazel_remote
	$(CARGO_RUN_CMD) --label bazelremote --ac_read_sleep_ms 100
44+
45+
# Write-only benchmark against a freshly started bazel-remote using 100 tasks.
# NOTE(review): no --target_endpoint is passed; presumably it is supplied via
# TARGET_ENDPOINT from .env — confirm.
.PHONY: bench_bazelremote_write_only
bench_bazelremote_write_only: grafana bazel_remote
	$(CARGO_RUN_CMD) \
		--write_only \
		--num_threads 100 \
		--label bazelremote
50+
51+
# Write-only benchmark against a freshly started bazel-remote with large
# payloads: 10 MiB blobs (10485760 bytes) sent in 1 MiB chunks (1048576 bytes)
# from 40 tasks.
# NOTE(review): no --target_endpoint is passed; presumably it is supplied via
# TARGET_ENDPOINT from .env — confirm.
.PHONY: bench_bazelremote_write_only_big
bench_bazelremote_write_only_big: grafana bazel_remote
	$(CARGO_RUN_CMD) \
		--write_only \
		--chunk_size 1048576 \
		--blob_size_bytes 10485760 \
		--num_threads 40 \
		--label bazelremote
58+
59+
# nativelink benchmarks
60+
# Write/read benchmark against a freshly started nativelink on localhost:50051,
# using blake3 digests, 100 tasks, 100x read amplification and no delay before
# action-cache reads.
.PHONY: bench_nativelink
bench_nativelink: grafana nativelink
	$(CARGO_RUN_CMD) \
		--target_endpoint http://localhost:50051 \
		--ac_read_sleep_ms 0 \
		--digest_function blake3 \
		--label nativelink \
		--num_threads 100 --read_amplification 100
67+
68+
# Write-only benchmark against a freshly started nativelink on localhost:50051,
# using blake3 digests and 100 tasks.
.PHONY: bench_nativelink_write_only
bench_nativelink_write_only: grafana nativelink
	$(CARGO_RUN_CMD) \
		--write_only \
		--num_threads 100 \
		--target_endpoint http://localhost:50051 \
		--digest_function blake3 \
		--label nativelink
75+
76+
# Write-only benchmark against a freshly started nativelink on localhost:50051
# with large payloads: 10 MiB blobs (10485760 bytes) sent in 1 MiB chunks
# (1048576 bytes) from 40 tasks, using blake3 digests.
.PHONY: bench_nativelink_write_only_big
bench_nativelink_write_only_big: grafana nativelink
	$(CARGO_RUN_CMD) \
		--write_only \
		--chunk_size 1048576 \
		--blob_size_bytes 10485760 \
		--num_threads 40 \
		--target_endpoint http://localhost:50051 \
		--digest_function blake3 \
		--label nativelink
85+
86+
# Same large-payload write-only nativelink benchmark (10 MiB blobs, 1 MiB
# chunks, 40 tasks, blake3), but run with `--allow_gpu false`.
# NOTE(review): presumably this forces CPU blob generation even in a
# CUDA-enabled build — confirm against the CLI definition.
.PHONY: bench_nativelink_write_only_big_cpu
bench_nativelink_write_only_big_cpu: grafana nativelink
	$(CARGO_RUN_CMD) \
		--write_only \
		--chunk_size 1048576 \
		--blob_size_bytes 10485760 \
		--num_threads 40 \
		--target_endpoint http://localhost:50051 \
		--digest_function blake3 \
		--label nativelink \
		--allow_gpu false
96+
97+
98+
# (Re)start a Grafana container on the host network with anonymous Admin
# access, provisioning datasources and dashboards from this directory.
# Any existing `grafana` container is killed and removed first; failures of
# those two steps are ignored because the container may simply not exist.
# $(CURDIR) is used for the mounts so they stay correct under `make -C <dir>`.
.PHONY: grafana
grafana: prometheus
	@docker kill grafana >/dev/null 2>&1 || true
	@docker rm grafana >/dev/null 2>&1 || true
	@docker run -d --name grafana \
		--network="host" \
		-v "$(CURDIR)/datasources:/etc/grafana/provisioning/datasources" \
		-v "$(CURDIR)/dashboards:/etc/grafana/provisioning/dashboards" \
		-e "GF_PROMETHEUS_URL=http://localhost:9090" \
		-e "GF_AUTH_ANONYMOUS_ENABLED=true" \
		-e "GF_AUTH_ANONYMOUS_ORG_ROLE=Admin" \
		grafana/grafana:latest >/dev/null
	@# printf, not echo: whether echo expands "\n" depends on the shell.
	@printf '\nbench dash available at http://localhost:3000/d/bdj4arf1qn0g0c/benchmark-metrics?orgId=1\n'
110+
111+
# Start a Prometheus container on the host network using ./prometheus.yml.
# An existing `prometheus` container is deliberately not killed first: if the
# name is already taken, `docker run` fails and that failure is ignored, so
# the running instance is left in place.
# $(CURDIR) is used for the mount so it stays correct under `make -C <dir>`.
.PHONY: prometheus
prometheus:
	@docker run -d --name prometheus \
		--rm \
		--network="host" \
		-v "$(CURDIR)/prometheus.yml:/etc/prometheus/prometheus.yml" \
		prom/prometheus >/dev/null 2>&1 || true
119+
120+
# Start bazel-remote in the background with a fresh cache directory at /tmp/br,
# logging to /tmp/bazel_remote.log. Any previous instance is stopped first.
.PHONY: bazel_remote
bazel_remote: stop_bazel_remote
	@mkdir -p /tmp/br
	@bazel-remote --dir "/tmp/br" --max_size 10 --experimental_remote_asset_api > /tmp/bazel_remote.log 2>&1 &
	@# printf, not echo: whether echo expands "\n" depends on the shell.
	@printf 'bazel-remote running, tail logs with tail -f /tmp/bazel_remote.log\n\n'
124+
125+
# Stop any running bazel-remote and delete its cache directory.
# The process is matched by exact process name (-x without -f). The previous
# `-f -x` form required the entire command line to equal "bazel-remote", which
# never matches a server started with arguments.
.PHONY: stop_bazel_remote
stop_bazel_remote:
	@pkill -x bazel-remote || true
	@sleep 1
	@rm -rf "/tmp/br" || true
129+
130+
# Install the nativelink binary with cargo, pinned to a fixed upstream revision.
.PHONY: install_nativelink
install_nativelink:
	cargo install --git https://github.com/TraceMachina/nativelink --rev ea508561d8faf1de3a7188867c70b7ef36069572
132+
133+
# Stop any nativelink started with a config from this repo's nativelink/
# directory and delete its data directory.
# The pattern is matched against the full command line (-f) as a substring.
# The previous `-f -x` form demanded an exact match and so never matched a
# command line that continues with the config file name. The `[n]` bracket
# keeps the pattern from matching the shell that runs this very recipe line.
.PHONY: stop_nativelink
stop_nativelink:
	@pkill -f '[n]ativelink $(PWD)/nativelink/' || true
	@sleep 1
	@rm -rf "/tmp/nativelink" || true
137+
138+
# Start nativelink in the background with nativelink/memory_and_disk.json,
# logging to /tmp/nativelink.log. Any previous instance is stopped and the
# pinned binary (re)installed first.
# Declared phony because the recipe itself references a `nativelink/`
# directory next to this Makefile, which would otherwise count as the target.
.PHONY: nativelink
nativelink: stop_nativelink install_nativelink
	@mkdir -p /tmp/nativelink
	@nativelink $(PWD)/nativelink/memory_and_disk.json > /tmp/nativelink.log 2>&1 &
	@# printf, not echo: whether echo expands "\n" depends on the shell.
	@printf 'nativelink running, tail logs with tail -f /tmp/nativelink.log\n\n'
142+
143+
# Tear everything down: stop both cache backends, then kill the prometheus and
# grafana containers. Kill failures are ignored because a container may not be running.
.PHONY: stop
stop: stop_bazel_remote stop_nativelink
	@docker kill prometheus >/dev/null 2>&1 || true
	@docker kill grafana >/dev/null 2>&1 || true
146+
147+
# Run the cargo benchmarks, enabling the `cuda` feature when CUDA was detected.
.PHONY: cargo_bench
cargo_bench:
ifeq ($(CUDA_INSTALLED),1)
	cargo bench --features cuda
else
	cargo bench
endif

README.md

Lines changed: 131 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -1 +1,131 @@
1-
# cache-load-utils
1+
# bazelbench-rs
2+
3+
A Bazel reAPI benchmarking and load-testing tool.
4+
5+
## Overview
6+
7+
- Random files are generated on the CPU with `fastrand`; if a GPU is available, blobs are generated via `CUDA` instead, allowing more load to be generated
8+
- There are 2 modes: write-only, and the default write-read mode
9+
- Only gRPC backends are supported, as this tool is built on top of `tonic` and `tokio`
10+
- This tool makes heavy use of `prometheus` metrics to expose performance information; these are simply exposed as counters by the binary over `http`
11+
- As such, `prometheus` and `grafana` are needed to scrape and display this information, see definitions in the `Makefile`
12+
13+
## Dashboard Example
14+
15+
![dash.png](images%2Fdash.png)
16+
17+
## Usage
18+
19+
### Makefile
20+
21+
- Test `bazel-remote` with `make bench_bazelremote`
22+
- Test `nativelink` with `make bench_nativelink`
23+
24+
### CLI Invocation
25+
26+
See the definition of the `Makefile` tasks for a complete example of how `bazel-remote` and `nativelink` are benchmarked and compared to one another.
27+
28+
Run `bazelbench-rs --help` or `make help` to see options:
29+
30+
```text
31+
Usage: bazelbench-rs [OPTIONS] --target_endpoint <TARGET_ENDPOINT>
32+
33+
Options:
34+
--auth_header <AUTH_HEADER>
35+
The optional auth header to be included in request metadata to authenticate requests against a secure backend
36+
37+
[env: AUTH_HEADER=]
38+
39+
--target_endpoint <TARGET_ENDPOINT>
40+
The reAPI gRPC endpoint under test; Note it is required to use http:// and https:// schemes even while this is gRPC only
41+
42+
[env: TARGET_ENDPOINT=http://localhost:9092]
43+
44+
--remote_instance_name <REMOTE_INSTANCE_NAME>
45+
The instance_name field passed to the RE API calls
46+
47+
[env: REMOTE_INSTANCE_NAME=]
48+
[default: main]
49+
50+
--blob_size_bytes <BLOB_SIZE_BYTES>
51+
The size in bytes to use for the randomly generated blobs used to test the backend
52+
53+
[env: BLOB_SIZE_BYTES=1048576]
54+
[default: 32768]
55+
56+
--prometheus_port <PROMETHEUS_PORT>
57+
The port on which prometheus metrics are made available for scraping at /metrics by prometheus
58+
59+
[env: PROMETHEUS_PORT=9091]
60+
[default: 9091]
61+
62+
--container_port <CONTAINER_PORT>
63+
The port on which this program will respond to HTTP requests regarding its health and readiness status, useful if running in kubernetes
64+
65+
[env: UTILITY_PORT=3001]
66+
[default: 3001]
67+
68+
--num_threads <NUM_THREADS>
69+
The number of concurrent benchmark tasks to spawn.
70+
Note these are Tokio tasks spawned into the tokio default runtime's threadpool (thread-per-core and work-stealing).
71+
72+
- Going beyond ~2x physical core count will harm performance
73+
- For each num_thread, a new connection is made to the backend
74+
- Within each "iteration" running on a "thread" additional tasks are spawned;
75+
- A task to call get_capabilities
76+
- A task to call bs_write, then N tasks to call bs_read up to read_amplification
77+
- A task to call ac_write, then ac_read;
78+
note this task also has a configurable sleep on it to work around a race condition present in some cache backends
79+
80+
[env: NUM_THREADS=]
81+
[default: 20]
82+
83+
--write_only
84+
Only generate and write data with Bytestream.Write() RPC
85+
86+
[env: WRITE_ONLY=]
87+
88+
--read_amplification <READ_AMPLIFICATION>
89+
For each iteration of requests, how many additional read requests to send; allows for simulation of a more realistic read-heavy workload with a single client
90+
91+
[env: READ_AMPLIFICATION=]
92+
[default: 20]
93+
94+
--chunk_size <CHUNK_SIZE>
95+
The size in which chunks should be created and sent up to the backend. Bigger generally is better for speed on a good network, but bazel defaults this to quite small at 16k, so that is the default here
96+
97+
[env: CHUNK_SIZE=]
98+
[default: 16384]
99+
100+
--zero_pad_blob_pct <ZERO_PAD_BLOB_PCT>
101+
A mechanism by which to increase write performance (less RNG) and also test data deduplication, by replacing the specified percentage of each blob by 0s
102+
103+
[env: ZERO_PAD_BLOB_PCT=]
104+
[default: 50]
105+
106+
--ac_read_sleep_ms <AC_READ_SLEEP_MS>
107+
Some caches will return not-found to requests coming too soon after an AC entry is written. This delay offsets that, but in a single threaded mode it will significantly affect RPS. Set as close to 0 as possible while minimizing ac_read error rates in the Grafana dashboard
108+
109+
[env: AC_READ_SLEEP_MS=]
110+
[default: 20]
111+
112+
--digest_function <DIGEST_FUNCTION>
113+
The reAPI digest function to use to generate hashes; blake3 is preferred but not supported in bazel-remote, so the default is sha256. If the cache under test supports blake3, be sure to use this to remove the CPU-bound nature of large write benchmarks caused by sha256 hashing
114+
115+
[env: DIGEST_FUNCTION=]
116+
[default: sha256]
117+
[possible values: sha256, blake3]
118+
119+
--label <LABEL>
120+
The label to associate with metrics reported into prometheus, so that separate lines can be shown for each cache backend under test; comes in as a value on the "cache_backend" label
121+
122+
[env: LABEL=]
123+
[default: default]
124+
125+
-h, --help
126+
Print help (see a summary with '-h')
127+
128+
-V, --version
129+
Print version
130+
131+
```

0 commit comments

Comments
 (0)