MarkRagg · MarkRagg · Apr 15, 2026 · Apr 16, 2026 · Apr 16, 2026 · Apr 16, 2026
diff --git a/.github/workflows/deploy.yml b/.github/workflows/deploy.yml
@@ -60,7 +60,7 @@ jobs:
           npm install
           npx semantic-release
         env:
-          # PYPI_TOKEN: ${{ secrets.PYPI_TOKEN }}
+          PYPI_TOKEN: ${{ secrets.PYPI_TOKEN }}
           GITHUB_TOKEN: ${{ secrets.RELEASE_TOKEN }}
           RELEASE_TEST_PYPI: ${{ github.event.repository.is_template || contains(github.repository, 'template') }}
           # dry run if not on main/master branch, or if initial commit

diff --git a/GoT/__init__.py b/GoT/__init__.py
@@ -3,10 +3,9 @@
 from dotenv import load_dotenv
 
 from lm_eval import evaluator, tasks
-from GoT.model.graph_model import call_graph
-from GoT.model.lm_wrapper import LangGraphBigBenchWrapper, TestBigBenchWrapper
-from GoT.model.utils.parse_args import call_benchmark, defining_and_parse_args
-from GoT.model.utils.utils import (
+from GoT.experiments.lm_wrapper import LangGraphBigBenchWrapper, TestBigBenchWrapper
+from GoT.cli.parse_args import call_benchmark, defining_and_parse_args
+from GoT.utils.utils import (
     print_benchmark_result,
     print_benchmark_result_loglikehood,
 )
@@ -62,14 +61,10 @@ def lm_eval_graph_benchmark():
     print_benchmark_result_loglikehood(results, task_name, filter_val="none")
 
 
-def custom_test():
-    call_graph("Solve this integral ∫x2⋅ex2dx")
-
-
 def main():
-    # It could be changed with custom_test() to test a custom problem instead of the benchmark
     args = defining_and_parse_args()
     call_benchmark(args)
+    # download_mlflow_traces(50)
 
 
 # let this be the last line of this file

diff --git a/GoT/tools/ai_tool.py → GoT/agent_tools/ai_tool.py b/GoT/tools/ai_tool.py → GoT/agent_tools/ai_tool.py
diff --git a/GoT/tools/craft_tool.py → GoT/agent_tools/craft_tool.py b/GoT/tools/craft_tool.py → GoT/agent_tools/craft_tool.py
@@ -56,6 +56,8 @@ def craft_tool(tool_function: str) -> str:
     """Save the function definition provided by the LLM as a tool that can be used by other agents.
     The function should be defined as a python function.
     The function should be general and reusable, and should not be specific to the current problem.
+    The function must not use tuple as args type.
+    The function must be defined as gemini api format, with type annotations for all arguments and return type.
     The function should be defined in a way that it can be imported and used by other agents."""
 
     def sanitize_input(query: str) -> str:
@@ -81,6 +83,13 @@ def sanitize_input(query: str) -> str:
 
     func = functions[0]
 
+    try:
+        docstring = ast.get_docstring(func)
+        if not docstring or not docstring.strip():
+            return "Error: missing docstring. A description of the function is mandatory for Gemini tools."
+    except TypeError:
+        return "Error: missing docstring. A description of the function is mandatory for Gemini tools."
+
     for arg in func.args.args:
         if arg.annotation is None:
             return f"Error: missing type annotation for '{arg.arg}'"

diff --git a/GoT/tools/math_tool.py → GoT/agent_tools/math_tool.py b/GoT/tools/math_tool.py → GoT/agent_tools/math_tool.py
diff --git a/GoT/tools/runtime_graph_tool.py → GoT/agent_tools/runtime_graph_tool.py b/GoT/tools/runtime_graph_tool.py → GoT/agent_tools/runtime_graph_tool.py
@@ -1,9 +1,9 @@
 from langchain.messages import HumanMessage, SystemMessage
 from langchain.tools import tool
 
-from GoT.model.ollama_llm import LLM
-from GoT.model.runtime_graph import ReasoningNode, RuntimeGraph
-from GoT.model.utils.utils import parse_response
+from GoT.core.llm import LLM
+from GoT.core.runtime_graph import ReasoningNode, RuntimeGraph
+from GoT.utils.utils import parse_response
 
 MAX_INTERACTIONS = 10
 
@@ -21,6 +21,8 @@ def divide_thought(
     HOW TO USE THIS TOOL:
     - Call it when you think the problem is complex.
     - The two parts must be as independent as possible.
+    IMPORTANT NOTES:
+    - You can't use the result of the first part to reason about the second part, and vice versa. The two parts must be as independent as possible.
      Arguments:
     - first_part: the first part of the thought process
     - second_part: the second part of the thought process

diff --git a/GoT/agent_tools/web_tool.py b/GoT/agent_tools/web_tool.py
@@ -0,0 +1,54 @@
+import arxiv
+from langchain.tools import tool
+import wikipedia
+
+
+@tool
+def search_wikipedia(query: str) -> str:
+    """Fetch a brief summary from Wikipedia.
+
+    Args:
+        query (str): The keyword or topic to search for.
+
+    Returns:
+        str: A 3-sentence summary of the topic, the first option if
+             ambiguous, or an error message if not found.
+    """
+    try:
+        try:
+            return wikipedia.summary(query, sentences=3)
+        except wikipedia.DisambiguationError as e:
+            # Ambiguous query: retry with the first suggested option.
+            return wikipedia.summary(e.options[0], sentences=3)
+    except (wikipedia.PageError, wikipedia.DisambiguationError, IndexError):
+        return "Page not found"
+
+
+@tool
+def search_arxiv(query: str) -> str:
+    """Search ArXiv for scientific papers on a given topic.
+    Use this when you need to find research papers, abstracts or academic references."""
+
+    try:
+        arxiv_client = arxiv.Client()
+        # Ask only for the three most relevant papers.
+        request = arxiv.Search(
+            query=query, max_results=3, sort_by=arxiv.SortCriterion.Relevance
+        )
+
+        # Render one text entry per paper returned by the client.
+        entries = [
+            f"Title: {paper.title}\n"
+            f"Authors: {', '.join(author.name for author in paper.authors)}\n"
+            f"Published: {paper.published.strftime('%Y-%m-%d')}\n"
+            f"Summary: {paper.summary[:300]}...\n"
+            f"URL: {paper.entry_id}\n"
+            for paper in arxiv_client.results(request)
+        ]
+
+        if entries:
+            return "\n---\n".join(entries)
+        return "No papers found for this query."
+
+    except Exception as e:
+        return f"Error searching ArXiv: {str(e)}"
diff --git a/GoT/model/utils/parse_args.py → GoT/cli/parse_args.py b/GoT/model/utils/parse_args.py → GoT/cli/parse_args.py
@@ -1,7 +1,13 @@
 import argparse
 import sys
 
-from GoT.model.utils.hf_formatter import use_gpqa, use_gsm8k, use_hendrycks_math
+from GoT.experiments.hf_formatter import (
+    use_gaia,
+    use_gpqa,
+    use_gsm8k,
+    use_hendrycks_math,
+)
+from GoT.experiments.runner_custom import custom_test
 
 
 def defining_and_parse_args():
@@ -12,7 +18,7 @@ def defining_and_parse_args():
         "--benchmark",
         required=True,
         type=str,
-        choices=["gsm8k", "gpqa", "hendrycks_math"],
+        choices=["gsm8k", "gpqa", "hendrycks_math", "gaia", "custom"],
         help="The benchmark to run the model on.",
     )
     parser.add_argument(
@@ -22,14 +28,17 @@ def defining_and_parse_args():
         choices=["graph", "standard"],
         help="Whether to run the standard model or the graph model.",
     )
+    parser.add_argument(
+        "--prompt", type=str, default="", help="Insert a prompt during a custom run."
+    )
     parser.add_argument(
         "--max_run",
         type=int,
         default=1,
         help="The maximum number of runs for the benchmark.",
     )
     parser.add_argument(
-        "--type",
+        "--category",
         type=str,
         default="algebra",
         choices=[
@@ -39,7 +48,7 @@ def defining_and_parse_args():
             "intermediate_algebra",
             "number_theory",
             "precalculus",
-            "statistics",
+            "prealgebra",
         ],
         help="The type of math problems to run, only for hendrycks_math.",
     )
@@ -61,4 +70,10 @@ def call_benchmark(args):
     elif args.benchmark == "gpqa":
         use_gpqa(max_run=max_run, test=test, model_name=mode)
     elif args.benchmark == "hendrycks_math":
-        use_hendrycks_math(max_run=max_run, test=test, model_name=mode, type=args.type)
+        use_hendrycks_math(
+            max_run=max_run, test=test, model_name=mode, type=args.category
+        )
+    elif args.benchmark == "gaia":
+        use_gaia(max_run=max_run, test=test, model_name=mode)
+    elif args.benchmark == "custom" and args.prompt != "":
+        custom_test(args.prompt, test)