diff --git a/inference/eval_harness.py b/inference/eval_harness.py new file mode 100644 index 0000000..f2d3323 --- /dev/null +++ b/inference/eval_harness.py @@ -0,0 +1,35 @@ +def exact_match_accuracy(predictions, references): + if not references: + return 0.0 + correct = sum(1 for p, r in zip(predictions, references) if p == r) + return correct / len(references) + +def eval_step_trace(generated_traces, expected_traces): + if not expected_traces: + return 0.0 + score = sum(1 for g, e in zip(generated_traces, expected_traces) if g == e) + return score / len(expected_traces) + +def compare_models_v1_placeholder(model_out, fallback_out, groq_out, ground_truth): + return { + "model_exact_match": model_out == ground_truth, + "fallback_exact_match": fallback_out == ground_truth, + "groq_exact_match": groq_out == ground_truth + } + +def categorize_error_v1_placeholder(prediction, reference): + if not prediction: + return "empty_output" + if len(prediction) < len(reference) / 2: + return "severe_truncation" + if prediction.lower() == reference.lower(): + return "casing_mismatch" + return "logic_or_hallucination" + +def run_error_analysis_v1_placeholder(predictions, references): + report = {} + for p, r in zip(predictions, references): + if p != r: + category = categorize_error_v1_placeholder(p, r) + report[category] = report.get(category, 0) + 1 + return report \ No newline at end of file diff --git a/inference/router.py b/inference/router.py new file mode 100644 index 0000000..bedf774 --- /dev/null +++ b/inference/router.py @@ -0,0 +1,11 @@ +def standalone_inference(checkpoint_path, input_data, strategy="beam"): + from inference.core import CalculusSolverInference + + inferencer = CalculusSolverInference(checkpoint_path) + + if strategy == "beam": + return inferencer.beam_search_decode(input_data) + elif strategy == "standard": + return inferencer.solve(input_data) + + raise ValueError("Unknown strategy") \ No newline at end of file diff --git a/model/checkpoint_utils.py b/model/checkpoint_utils.py new file mode 100644 index 0000000..7d59a0c --- /dev/null +++ b/model/checkpoint_utils.py @@ -0,0 +1,23 @@ +import torch +import torch.nn as nn + +class DummyModel(nn.Module): + def __init__(self): + super().__init__() + self.linear = nn.Linear(10, 10) + + def forward(self, x): + return self.linear(x) + +def create_dummy_checkpoint(path): + model = DummyModel() + torch.save(model.state_dict(), path) + +def validate_checkpoint(checkpoint_path, expected_shapes): + state_dict = torch.load(checkpoint_path) + for key, shape in expected_shapes.items(): + if key not in state_dict: + raise KeyError(key) + if state_dict[key].shape != shape: + raise ValueError(shape) + return True \ No newline at end of file diff --git a/run_pipeline.py b/run_pipeline.py new file mode 100644 index 0000000..33b27a5 --- /dev/null +++ b/run_pipeline.py @@ -0,0 +1,72 @@ +import os +import json +import glob +from model.checkpoint_utils import create_dummy_checkpoint, validate_checkpoint +from inference.router import standalone_inference +from inference.eval_harness import ( + exact_match_accuracy, + eval_step_trace, + compare_models_v1_placeholder, + run_error_analysis_v1_placeholder, +) + +def load_benchmark_data(benchmark_dir="eval/benchmarks"): + inputs = [] + ground_truths = [] + expected_traces = [] + + if not os.path.exists(benchmark_dir): + return inputs, ground_truths, expected_traces + + for filepath in glob.glob(os.path.join(benchmark_dir, "*.json")): + with open(filepath, 'r') as f: + data = json.load(f) + for item in data: + inputs.append(item.get("input", "")) + ground_truths.append(item.get("output", "")) + expected_traces.append(item.get("trace", [])) + + return inputs, ground_truths, expected_traces + +def run_end_to_end_pipeline(): + checkpoint_path = "checkpoints/dummy_model.pt" + is_dummy = "dummy" in checkpoint_path + + if is_dummy: + expected_shapes = { + "linear.weight": (10, 10), + "linear.bias": (10,) + } + if not os.path.exists(checkpoint_path): + create_dummy_checkpoint(checkpoint_path) + else: + expected_shapes = { + "CalculusSolverModel.real_key_placeholder": (0, 0) + } + + validate_checkpoint(checkpoint_path, expected_shapes) + + inputs, ground_truths, expected_traces = load_benchmark_data() + + predictions = [] + generated_traces = [] + + for x in inputs: + output = standalone_inference(checkpoint_path, x, strategy="beam") + if isinstance(output, dict): + predictions.append(output.get("prediction", "")) + generated_traces.append(output.get("trace", [])) + else: + predictions.append(output) + generated_traces.append([]) + + em_score = exact_match_accuracy(predictions, ground_truths) + trace_score = eval_step_trace(generated_traces, expected_traces) + error_report = run_error_analysis_v1_placeholder(predictions, ground_truths) + + print(f"Accuracy: {em_score}") + print(f"Trace Score: {trace_score}") + print(f"Errors: {error_report}") + +if __name__ == "__main__": + run_end_to_end_pipeline() \ No newline at end of file