MarkRagg · MarkRagg · Apr 14, 2026 · Feb 3, 2026 · Feb 9, 2026 · Feb 9, 2026
diff --git a/.github/workflows/deploy.yml b/.github/workflows/deploy.yml
@@ -60,7 +60,7 @@ jobs:
           npm install
           npx semantic-release
         env:
-          PYPI_TOKEN: ${{ secrets.PYPI_TOKEN }}
+          # PYPI_TOKEN: ${{ secrets.PYPI_TOKEN }}
           GITHUB_TOKEN: ${{ secrets.RELEASE_TOKEN }}
           RELEASE_TEST_PYPI: ${{ github.event.repository.is_template || contains(github.repository, 'template') }}
           # dry run if not on main/master branch, or if initial commit

diff --git a/.gitignore b/.gitignore
@@ -142,3 +142,6 @@ dmypy.json
 
 # Pyre type checker
 .pyre/
+
+# lm eval cache
+hf_cache/
diff --git a/GoT/__init__.py b/GoT/__init__.py
@@ -1,39 +1,75 @@
 import json
 import logging
+from dotenv import load_dotenv
 
 from lm_eval import evaluator, tasks
-from GoT.model.graph_model import invoke_graph, set_prompt
-from GoT.model.lm_wrapper import LangGraphLMWrapper
+from GoT.model.graph_model import call_graph
+from GoT.model.lm_wrapper import LangGraphBigBenchWrapper, TestBigBenchWrapper
+from GoT.model.utils.parse_args import call_benchmark, defining_and_parse_args
+from GoT.model.utils.utils import (
+    print_benchmark_result,
+    print_benchmark_result_loglikehood,
+)
 
 logging.basicConfig(level=logging.DEBUG)
 logger = logging.getLogger("GoT")
 
+load_dotenv()
 
-def lm_eval_benchmark():
-    task_list = ["gsm8k"]
-    lm = LangGraphLMWrapper()
+# Possible filter values: "flexible", "none", "strict" (NOTE(review): lm_eval_test_benchmark passes "strict-match" — confirm the exact names)
+
+
+def lm_eval_test_benchmark():  # Evaluate TestBigBenchWrapper on gpqa_diamond_zeroshot, dump samples, print the result
+    task_name = "gpqa_diamond_zeroshot"
+    task_list = [task_name]
+    test_lm = TestBigBenchWrapper()
+    task_dict = tasks.get_task_dict(task_list)
+
+    results = evaluator.evaluate(
+        lm=test_lm,
+        task_dict=task_dict,
+        limit=2,  # Limit the run to 2 samples
+        log_samples=True,  # presumably required for the "samples" entry dumped below — confirm
+        # Alternative to limit (presumably explicit sample indices): samples={task_name: [20, 25, 100]},
+    )
+
+    # Save only the per-sample records (results["samples"]) to a JSON file
+    with open("test_benchmark_results.json", "w") as f:
+        json.dump(results["samples"], f, indent=2)
+
+    print_benchmark_result(results, task_name, filter="strict-match")
+
+
+def lm_eval_graph_benchmark():  # Evaluate LangGraphBigBenchWrapper on gpqa_diamond_zeroshot, dump all results, print the result
+    # Alternative task name (presumably): hendrycks_math_geometry
+    task_name = "gpqa_diamond_zeroshot"
+    task_list = [task_name]
+    lm = LangGraphBigBenchWrapper()
     task_dict = tasks.get_task_dict(task_list)
 
     results = evaluator.evaluate(
         lm=lm,
+        # limit=1,  (disabled; the explicit samples= selection below is used instead)
         task_dict=task_dict,
-        limit=5,  # Limit to 2 samples for quick testing
+        samples={task_name: [20, 25]},  # presumably evaluates only sample indices 20 and 25 — confirm against lm_eval
         log_samples=True,
     )
 
     # Save results to a JSON file
     with open("graph_benchmark_results.json", "w") as f:
-        json.dump(results, f, indent=2)
+        json.dump(results, f, indent=2, default=str)  # default=str: values json cannot encode are written as str(value)
+
+    print_benchmark_result_loglikehood(results, task_name, filter_val="none")
 
 
 def custom_test():
-    set_prompt("What is 4726621 + 2 * 392 - 3432?")
-    invoke_graph()
+    call_graph("Solve this integral ∫x2⋅ex2dx")  # NOTE(review): exponents look flattened (x^2 * e^(x^2)?) — confirm the intended prompt
 
 
 def main():
     # It could be changed with custom_test() to test a custom problem instead of the benchmark
-    lm_eval_benchmark()
+    args = defining_and_parse_args()  # build the args object consumed by call_benchmark
+    call_benchmark(args)  # presumably selects and runs the benchmark named in args — confirm in parse_args
 
 
 # let this be the last line of this file