From 305bcb2fee468046332725512ffcca44abf7abe4 Mon Sep 17 00:00:00 2001 From: sainijit Date: Tue, 26 May 2026 20:40:54 +0530 Subject: [PATCH] feat(poi): benchmark device/precision propagation and reliability fixes MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit - Add --resource_config CLI arg to poi_stream_density.py so DEVICE and MODEL_PRECISION are propagated through all stream-density re-init iterations (make benchmark DEVICE=all-gpu.env now works end-to-end) - Catch PermissionError in _save_alert_thumbnails: logs a clear warning instead of crashing when results/ is root-owned - Clarify true E2E latency measurement: the primary VLM metrics-file latency (e.g. 128ms) is frame capture → alert dispatch (start set via user_log_start_time with DLStreamer frame timestamp, end via log_end_time at alert dispatch). Corrects misleading comments that said the metric excluded DLStreamer pipeline latency. Co-authored-by: Copilot <223556219+Copilot@users.noreply.github.com> --- benchmark-scripts/poi_stream_density.py | 89 ++++++++++++++++++++----- 1 file changed, 73 insertions(+), 16 deletions(-) diff --git a/benchmark-scripts/poi_stream_density.py b/benchmark-scripts/poi_stream_density.py index 64694c3..d1319ac 100644 --- a/benchmark-scripts/poi_stream_density.py +++ b/benchmark-scripts/poi_stream_density.py @@ -23,8 +23,14 @@ poi-alert-service, and poi-ui stay running Latency is measured from ``vlm_application_metrics`` files written by the -``vlm_metrics_logger`` package (user_log_start_time / log_end_time calls -in poi-backend). +``vlm_metrics_logger`` package. In ``alert_service.py``: + - **start**: ``user_log_start_time(frame_ts_ms, ...)`` — the DLStreamer frame + capture timestamp from the MQTT payload (epoch ms). This is the moment + the camera frame was decoded by the pipeline. + - **end**: ``log_end_time(...)`` — wall-clock time at alert dispatch. + +This measures **true end-to-end latency**: + camera frame capture → DLStreamer decode/detect/reid → MQTT → FAISS match → alert dispatch. Sub-commands ------------ @@ -43,6 +49,12 @@ BENCHMARK_DURATION Max wait for single benchmark in seconds (default: 120) RESULTS_DIR Where to write results (default: ./results) MAX_ITERATIONS Safety cap on iterations (default: 50) +RESOURCE_CONFIG Path to device resource config file relative to app_dir + (e.g. configs/res/all-gpu.env). Passed to init.sh on + every re-init so device and model precision are preserved + across stream-density iterations. Prefer passing + --resource_config on the CLI (set automatically by + ``make benchmark DEVICE=...``). """ import argparse @@ -444,16 +456,33 @@ def _generate_dlstreamer_config(app_dir: str, num_scenes: int) -> None: logger.info("Generated DLStreamer configs for %d total cameras", base_camera_count + num_scenes - 1) -def _reinit_env(app_dir: str) -> None: - """Re-run init.sh to regenerate .env with updated config.""" +def _reinit_env(app_dir: str, resource_config: str = "") -> None: + """Re-run init.sh to regenerate .env with updated config. + + Parameters + ---------- + app_dir: + Absolute path to the person-of-interest/ directory. + resource_config: + Path to the device resource config file (e.g. configs/res/all-gpu.env). + Passed as the ``RESOURCE_CONFIG`` env var to init.sh so that device, + precision, and pre-process settings are preserved across stream-density + iterations. When empty, init.sh uses its own default (all-gpu-cpu.env). + """ init_script = Path(app_dir) / ".." / "scenescape" / "scripts" / "init.sh" if not init_script.exists(): logger.warning("init.sh not found at %s — skipping .env regeneration", init_script) return + env = os.environ.copy() + if resource_config: + env["RESOURCE_CONFIG"] = resource_config + logger.info("Re-running init.sh with RESOURCE_CONFIG=%s …", resource_config) + else: + logger.info("Re-running init.sh to update .env …") + cmd = f"bash {shlex.quote(str(init_script))} {shlex.quote(app_dir)}" - logger.info("Re-running init.sh to update .env …") - result = subprocess.run(cmd, shell=True, capture_output=True, text=True) + result = subprocess.run(cmd, shell=True, capture_output=True, text=True, env=env) if result.returncode != 0: logger.warning("init.sh returned non-zero:\n%s", result.stderr[-500:]) else: @@ -479,7 +508,7 @@ def _wait_for_web_healthy(timeout: int = 300) -> None: logger.warning("Web container did not become healthy after %ds — continuing anyway", timeout) -def _scale_pipeline_services(app_dir: str, num_scenes: int, wait: int = 90) -> None: +def _scale_pipeline_services(app_dir: str, num_scenes: int, wait: int = 90, resource_config: str = "") -> None: """ Scale the POI video pipeline to N scenes. @@ -497,7 +526,7 @@ def _scale_pipeline_services(app_dir: str, num_scenes: int, wait: int = 90) -> N _set_stream_density(app_dir, num_scenes) _generate_cameras_override(app_dir, num_scenes) - _reinit_env(app_dir) + _reinit_env(app_dir, resource_config=resource_config) _generate_dlstreamer_config(app_dir, num_scenes) # Bring up any new camera services @@ -734,7 +763,16 @@ def _save_alert_thumbnails( import urllib.error thumbs_dir = os.path.join(results_dir, f"thumbnails_iter{iteration}") - os.makedirs(thumbs_dir, exist_ok=True) + try: + os.makedirs(thumbs_dir, exist_ok=True) + except PermissionError: + logger.warning( + "Cannot create thumbnails directory %s (permission denied). " + "The results/ directory may be owned by root (written by a Docker container). " + "Run: sudo chown -R $USER results/", + thumbs_dir, + ) + return 0 saved = 0 try: @@ -858,10 +896,15 @@ def _extract_poi_latency(stats: Dict[str, float], metric: str) -> float: Extract a single representative POI latency value from collected stats. Priority: - 1. vlm_application_metrics file values (MQTT receive → alert dispatch, - measures POI application latency only, excludes DLStreamer pipeline) - 2. Alerts API fallback (``mqtt_received_at`` → ``dispatched_at``) - 3. Returns 0 if no data available + 1. vlm_application_metrics file values — TRUE end-to-end latency: + start = DLStreamer frame capture timestamp (set by user_log_start_time + in alert_service.py using the MQTT payload's frame timestamp field), + end = wall-clock time at alert dispatch (log_end_time). + This spans: camera frame capture → DLStreamer pipeline → FAISS match + → alert dispatch. + 2. Alerts API fallback (``mqtt_received_at`` → ``dispatched_at``) — + POI application latency only, excludes DLStreamer pipeline latency. + 3. Returns 0 if no data available. Note: Docker-log-based ``log_detection_to_alert_ms`` is excluded because log timestamps have only second-level precision and the first-match-to- @@ -940,6 +983,7 @@ def __init__( single_run: bool = False, single_run_scenes: int = 1, benchmark_duration: int = 120, + resource_config: str = "", ): self.app_dir = os.path.abspath(app_dir) self.target_latency_ms = target_latency_ms @@ -952,6 +996,7 @@ def __init__( self.single_run = single_run self.single_run_scenes = single_run_scenes self.benchmark_duration = benchmark_duration + self.resource_config = resource_config os.makedirs(self.results_dir, exist_ok=True) def _services_running(self) -> bool: @@ -1029,7 +1074,8 @@ def run(self) -> StreamDensityResult: logger.info("Services already running — skipping scaling for single benchmark") else: # Scale to desired scene count - _scale_pipeline_services(self.app_dir, num_scenes, wait=self.init_duration) + _scale_pipeline_services(self.app_dir, num_scenes, wait=self.init_duration, + resource_config=self.resource_config) # Wait for data collection if self.single_run: @@ -1250,6 +1296,7 @@ def cmd_run(args) -> None: single_run=args.single_run, single_run_scenes=args.scenes, benchmark_duration=args.benchmark_duration, + resource_config=args.resource_config, ) result = tester.run() sys.exit(0 if result.met_target else 1) @@ -1260,7 +1307,7 @@ def cmd_generate(args) -> None: _set_stream_density(args.app_dir, num) _generate_dlstreamer_config(args.app_dir, num) _generate_cameras_override(args.app_dir, num) - _reinit_env(args.app_dir) + _reinit_env(args.app_dir, resource_config=args.resource_config) print(f"Generated overrides for {num} scene(s). Run 'make demo' to start.") @@ -1275,7 +1322,7 @@ def cmd_clean(args) -> None: _set_stream_density(app_dir, 1) _generate_dlstreamer_config(app_dir, 1) _clean_cameras_override(app_dir) - _reinit_env(app_dir) + _reinit_env(app_dir, resource_config=getattr(args, "resource_config", "")) print("Cleaned up – stream_density reset to 1.") @@ -1317,17 +1364,27 @@ def main() -> None: "Exits early when an alert is received.") p_run.add_argument("--scenes", type=int, default=1, help="Number of scenes for single-run mode") + p_run.add_argument("--resource_config", default="", + help="Absolute path to device resource config " + "(e.g. /path/to/configs/res/all-gpu.env). " + "Passed as RESOURCE_CONFIG to init.sh on every " + "re-init so device and precision are preserved " + "across stream-density iterations.") p_run.set_defaults(func=cmd_run) # --- generate --- p_gen = sub.add_parser("generate", help="Generate overrides for N scenes") p_gen.add_argument("app_dir") p_gen.add_argument("--scenes", type=int, default=1) + p_gen.add_argument("--resource_config", default="", + help="Absolute path to device resource config file.") p_gen.set_defaults(func=cmd_generate) # --- clean --- p_clean = sub.add_parser("clean", help="Revert to single scene") p_clean.add_argument("app_dir") + p_clean.add_argument("--resource_config", default="", + help="Absolute path to device resource config file.") p_clean.set_defaults(func=cmd_clean) # --- down ---