From c2758bc929160b02c8c651ed970bf8a0e0efae65 Mon Sep 17 00:00:00 2001 From: Marko Jeremic Date: Fri, 20 Mar 2026 13:39:35 +0100 Subject: [PATCH] Add processing for acceptance criteria for ShieldBenchmarkDataMapper --- .github/actions/collect_data/src/benchmark.py | 64 ++++++++- .../test/test_benchmark_mapper.py | 134 ++++++++++++++++++ 2 files changed, 196 insertions(+), 2 deletions(-) diff --git a/.github/actions/collect_data/src/benchmark.py b/.github/actions/collect_data/src/benchmark.py index 8e56934..102da1d 100644 --- a/.github/actions/collect_data/src/benchmark.py +++ b/.github/actions/collect_data/src/benchmark.py @@ -188,7 +188,14 @@ def map_benchmark_data(self, pipeline, job_id, report_data, model_spec_data=None metadata, model_spec_data, ) - return benchmark_runs + benchmark_summary_runs + eval_runs + acceptance_summary_runs = self._process_acceptance_summary( + pipeline, + job, + report_data.get("acceptance_summary", {}), + metadata, + model_spec_data, + ) + return benchmark_runs + benchmark_summary_runs + eval_runs + acceptance_summary_runs except ValidationError as e: failure_happened() logger.error(f"Validation error: {e}") @@ -380,6 +387,58 @@ def _process_evals(self, pipeline, job, evals, metadata=None, model_spec_data=No ) return results + def _process_acceptance_summary(self, pipeline, job, acceptance_summary, metadata=None, model_spec_data=None): + """ + Processes acceptance summary entries and creates CompleteBenchmarkRun objects for each entry. + """ + results = [] + if acceptance_summary: + if metadata: + logger.debug(f"Processing acceptance summary with metadata included...") + acceptance_summary = {**acceptance_summary, **metadata} # metadata values take precedence + measurements = self._create_measurements( + job, + "acceptance_summary", + acceptance_summary, + [ + "acceptance_criteria", + ], + ) + + # Merge leftover key-values from acceptance_summary into model_spec_data for config_params + acceptance_criteria_metadata = model_spec_data.copy() if model_spec_data else {} + for key, value in acceptance_summary.items(): + if key not in acceptance_criteria_metadata: + acceptance_criteria_metadata[key] = value + results.append( + self._create_complete_benchmark_run( + pipeline=pipeline, + job=job, + data=acceptance_summary, + run_type="acceptance_summary", + measurements=measurements, + device_info=acceptance_summary.get("device"), + model_name=acceptance_summary.get("model"), + model_type=model_spec_data.get("model_type") if model_spec_data else None, + input_seq_length=None, + output_seq_length=None, + dataset_name=None, + batch_size=None, + config_params=acceptance_criteria_metadata, + ) + ) + return results + + def _normalize_measurement_value(self, value): + if isinstance(value, str): + if value.lower() == "true": + return 1.0 + elif value.lower() == "false": + return 0.0 + elif value == "": + return None + return value + def _create_measurements(self, job, step_name, data, keys): """ Creates BenchmarkMeasurement objects for the specified keys in the data. @@ -388,6 +447,7 @@ def _create_measurements(self, job, step_name, data, keys): for key in keys: if key in data: try: + value = self._normalize_measurement_value(data.get(key)) measurement = BenchmarkMeasurement( step_start_ts=job.job_start_ts, step_end_ts=job.job_end_ts, @@ -395,7 +455,7 @@ def _create_measurements(self, job, step_name, data, keys): step_name=step_name, step_warm_up_num_iterations=None, name=key, - value=data.get(key), + value=value, target=None, device_power=None, device_temperature=None, diff --git a/.github/actions/collect_data/test/test_benchmark_mapper.py b/.github/actions/collect_data/test/test_benchmark_mapper.py index c8dddee..2d05972 100644 --- a/.github/actions/collect_data/test/test_benchmark_mapper.py +++ b/.github/actions/collect_data/test/test_benchmark_mapper.py @@ -255,6 +255,140 @@ def test_evals_model_type_without_model_spec(mapper, pipeline): assert result[0].ml_model_type is None +def test_process_acceptance_summary(mapper, pipeline): + report_data = { + "metadata": { + "report_id": "test_report", + "model": "test_model", + "device": "test_device", + "model_id": "id_test_spec_test_model_test_device", + "inference_engine": "vllm", + }, + "acceptance_summary": { + "acceptance_criteria": "true", + "acceptance_blockers": "", + "acceptance_summary_markdown": "## Summary\nAll good", + }, + } + result = mapper.map_benchmark_data( + pipeline, 1, report_data, {"model_name": "test_model", "device_type": "test_device"} + ) + assert len(result) == 1 + assert isinstance(result[0], CompleteBenchmarkRun) + assert result[0].run_type == "acceptance_summary" + assert len(result[0].measurements) == 1 + assert result[0].measurements[0].name == "acceptance_criteria" + assert result[0].measurements[0].value == 1.0 + + +def test_process_acceptance_summary_config_params(mapper, pipeline): + report_data = { + "metadata": { + "report_id": "test_report", + "model": "test_model", + "device": "test_device", + "model_id": "id_test_spec_test_model_test_device", + "inference_engine": "vllm", + }, + "acceptance_summary": { + "acceptance_criteria": "true", + "acceptance_blockers": "", + "acceptance_summary_markdown": "## Summary\nAll good", + }, + } + model_spec_data = { + "model_name": "test_model", + "device_type": "test_device", + "extra_param": "extra_value", + } + result = mapper.map_benchmark_data(pipeline, 1, report_data, model_spec_data) + assert len(result) == 1 + assert isinstance(result[0], CompleteBenchmarkRun) + assert result[0].run_type == "acceptance_summary" + assert isinstance(result[0].config_params, dict) + assert result[0].config_params.get("extra_param") == "extra_value" + assert result[0].config_params.get("acceptance_blockers") == "" + assert result[0].config_params.get("acceptance_summary_markdown") == "## Summary\nAll good" + + +def test_process_acceptance_summary_with_metadata(mapper, pipeline): + report_data = { + "metadata": { + "report_id": "test_report", + "model": "test_model", + "device": "test_device", + "model_id": "id_test_spec_test_model_test_device", + "inference_engine": "vllm", + }, + "acceptance_summary": { + "model": "test_model_2", + "device": "test_device_2", + "acceptance_criteria": "false", + "acceptance_blockers": "Test blocker 1, Test blocker 2", + "acceptance_summary_markdown": "## Summary\nSome issues found", + }, + } + result = mapper.map_benchmark_data(pipeline, 1, report_data, {"model_name": "test_model"}) + assert len(result) == 1 + assert isinstance(result[0], CompleteBenchmarkRun) + assert result[0].run_type == "acceptance_summary" + assert result[0].ml_model_name == "test_model" + assert result[0].device_info == {"device_name": "test_device"} + assert len(result[0].measurements) == 1 + assert result[0].measurements[0].name == "acceptance_criteria" + assert result[0].measurements[0].value == 0.0 + + +def test_process_acceptance_summary_empty(mapper, pipeline): + report_data = {"acceptance_summary": {}} + result = mapper.map_benchmark_data(pipeline, 1, report_data) + assert len(result) == 0 + + +def test_process_acceptance_summary_missing(mapper, pipeline): + report_data = {} + result = mapper.map_benchmark_data(pipeline, 1, report_data) + assert len(result) == 0 + + +def test_process_acceptance_summary_model_spec_precedence(mapper, pipeline): + model_spec_data = { + "model_name": "model_from_spec", + "device_type": "tt", + "shared_key": "value_from_model_spec", + "only_in_spec": "spec_value", + } + report_data = { + "metadata": { + "report_id": "test_report", + "model": "test_model", + "device": "test_device", + "model_id": "id_test_spec_test_model_test_device", + "inference_engine": "vllm", + }, + "acceptance_summary": { + "acceptance_criteria": "true", + "shared_key": "value_from_acceptance_summary", + "only_in_acceptance": "acceptance_value", + }, + } + result = mapper.map_benchmark_data(pipeline, 1, report_data, model_spec_data) + assert len(result) == 1 + assert isinstance(result[0], CompleteBenchmarkRun) + assert result[0].run_type == "acceptance_summary" + + # Verify model_spec_data key takes precedence + assert result[0].config_params.get("shared_key") == "value_from_model_spec" + + # Verify both unique keys are present + assert result[0].config_params.get("only_in_spec") == "spec_value" + assert result[0].config_params.get("only_in_acceptance") == "acceptance_value" + + # Verify model_spec_data keys are present + assert result[0].config_params.get("model_name") == "model_from_spec" + assert result[0].config_params.get("device_type") == "tt" + + @pytest.mark.parametrize( "input_val, expected", [