From fa71e8fdff52490915fdd7e937786e96550343f5 Mon Sep 17 00:00:00 2001
From: Raggini Marco <marco.raggini2@studio.unibo.it>
Date: Wed, 15 Apr 2026 10:35:13 +0200
Subject: [PATCH 01/27] chore: improve prompt

---
 GoT/model/graph_model.py        | 1 +
 GoT/tools/runtime_graph_tool.py | 2 ++
 2 files changed, 3 insertions(+)

diff --git a/GoT/model/graph_model.py b/GoT/model/graph_model.py
index 57491ad..db01896 100644
--- a/GoT/model/graph_model.py
+++ b/GoT/model/graph_model.py
@@ -55,6 +55,7 @@
 
         Rules:
         - You MUST respond ONLY using the Score function.
+        - You must consider if the format of the answer follow the instruction
         - You cannot give the full solution, only hints.
         - If a response suggest the need of crafting a tool, score it with 1 or less and specify clearly the need of a new tool to solve the problem.
         - Do not write natural language outside the function.
diff --git a/GoT/tools/runtime_graph_tool.py b/GoT/tools/runtime_graph_tool.py
index 77df52a..6b2a7ef 100644
--- a/GoT/tools/runtime_graph_tool.py
+++ b/GoT/tools/runtime_graph_tool.py
@@ -21,6 +21,8 @@ def divide_thought(
     HOW TO USE THIS TOOL:
     - Call it when you think the problem is complex.
     - The two parts must be as independent as possible.
+    IMPORTANT NOTES:
+    - You can't use the result of the first part to reason about the second part, and vice versa. The two parts must be as independent as possible.
      Arguments:
     - first_part: the first part of the thought process
     - second_part: the second part of the thought process

From 5569cb605f452347588f29c8416183488fca91b2 Mon Sep 17 00:00:00 2001
From: Raggini Marco <marco.raggini2@studio.unibo.it>
Date: Thu, 16 Apr 2026 10:12:49 +0200
Subject: [PATCH 02/27] chore: remove craft tools in hf formatter

---
 GoT/model/utils/hf_formatter.py | 8 ++++----
 GoT/tools/craft_tool.py         | 2 ++
 2 files changed, 6 insertions(+), 4 deletions(-)

diff --git a/GoT/model/utils/hf_formatter.py b/GoT/model/utils/hf_formatter.py
index 7b1eddf..eadd6b0 100644
--- a/GoT/model/utils/hf_formatter.py
+++ b/GoT/model/utils/hf_formatter.py
@@ -103,7 +103,7 @@ def gpqa_format(dataset: Dataset) -> list[ResultEval]:
 def gpqa_run(questions: list[ResultEval], max_run: int, test: bool) -> list[ResultEval]:
     responses = []
     run_counter = 0
-    agent = LLM().create_custom_agent(LLM().get_tools() + LLM().get_craft_tool())
+    agent = LLM().create_custom_agent(LLM().get_tools())
     for q in questions[25:]:
         if run_counter >= max_run:
             break
@@ -188,7 +188,7 @@ def gsm8k_run(
 ) -> list[ResultEval]:
     responses = []
     run_counter = 0
-    agent = LLM().create_custom_agent(LLM().get_tools() + LLM().get_craft_tool())
+    agent = LLM().create_custom_agent(LLM().get_tools())
     for q in questions:
         if run_counter >= max_run:
             break
@@ -277,8 +277,8 @@ def hendrycks_math_run(
 ) -> list[ResultEval]:
     responses = []
     run_counter = 0
-    agent = LLM().create_custom_agent(LLM().get_tools() + LLM().get_craft_tool())
-    for q in questions:
+    agent = LLM().create_custom_agent(LLM().get_tools())
+    for q in questions[10:]:
         if run_counter >= max_run:
             break
         prompt = q.question
diff --git a/GoT/tools/craft_tool.py b/GoT/tools/craft_tool.py
index 8db13dd..dc1d2d1 100644
--- a/GoT/tools/craft_tool.py
+++ b/GoT/tools/craft_tool.py
@@ -56,6 +56,8 @@ def craft_tool(tool_function: str) -> str:
     """Save the function definition provided by the LLM as a tool that can be used by other agents.
     The function should be defined as a python function.
     The function should be general and reusable, and should not be specific to the current problem.
+    The function must not use tuple as args type.
+    The function must be defined as gemini api format, with type annotations for all arguments and return type.
     The function should be defined in a way that it can be imported and used by other agents."""
 
     def sanitize_input(query: str) -> str:

From 53344dd0d4227fd77b3bbccf8f055e5a0ffc7fca Mon Sep 17 00:00:00 2001
From: Raggini Marco <marco.raggini2@studio.unibo.it>
Date: Thu, 16 Apr 2026 16:55:26 +0200
Subject: [PATCH 03/27] chore: create a single benchmark_run for all datasets

---
 GoT/model/utils/hf_formatter.py | 145 ++++++--------------------------
 1 file changed, 27 insertions(+), 118 deletions(-)

diff --git a/GoT/model/utils/hf_formatter.py b/GoT/model/utils/hf_formatter.py
index eadd6b0..fbbdb59 100644
--- a/GoT/model/utils/hf_formatter.py
+++ b/GoT/model/utils/hf_formatter.py
@@ -100,50 +100,6 @@ def gpqa_format(dataset: Dataset) -> list[ResultEval]:
     return questions
 
 
-def gpqa_run(questions: list[ResultEval], max_run: int, test: bool) -> list[ResultEval]:
-    responses = []
-    run_counter = 0
-    agent = LLM().create_custom_agent(LLM().get_tools())
-    for q in questions[25:]:
-        if run_counter >= max_run:
-            break
-        prompt = q.question
-        correct_letter = q.correct_answer
-        try:
-            if test:
-                response = extract_output(
-                    agent.invoke(
-                        {"messages": [HumanMessage(content=prompt)]},
-                        config={"recursion_limit": 10},
-                    )
-                )
-            else:
-                response = extract_output(call_graph(prompt))
-            norm_res = normalize_number(response)
-            responses.append(
-                ResultEval(
-                    question=prompt,
-                    response=norm_res,
-                    filtered_answer="",
-                    correct_answer=correct_letter,
-                    answer_success=0.0,
-                )
-            )
-        except Exception as e:
-            print(f"Error processing question: {e}")
-            responses.append(
-                ResultEval(
-                    question=prompt,
-                    response="Error",
-                    filtered_answer="",
-                    correct_answer=correct_letter,
-                    answer_success=0.0,
-                )
-            )
-        run_counter += 1
-    return responses
-
-
 def gpqa_eval(responses: list[ResultEval]):
     correct = 0
 
@@ -183,52 +139,6 @@ def gsm8k_format(dataset: Dataset) -> list[ResultEval]:
     return questions
 
 
-def gsm8k_run(
-    questions: list[ResultEval], max_run: int, test: bool
-) -> list[ResultEval]:
-    responses = []
-    run_counter = 0
-    agent = LLM().create_custom_agent(LLM().get_tools())
-    for q in questions:
-        if run_counter >= max_run:
-            break
-        prompt = q.question
-        correct_answer = q.correct_answer
-        try:
-            if test:
-                response = extract_output(
-                    agent.invoke(
-                        {"messages": [HumanMessage(content=prompt)]},
-                        config={"recursion_limit": 20},
-                    )
-                )
-            else:
-                response = extract_output(call_graph(prompt))
-            norm_res = normalize_number(response)
-            responses.append(
-                ResultEval(
-                    question=prompt,
-                    response=norm_res,
-                    filtered_answer="",
-                    correct_answer=correct_answer,
-                    answer_success=0.0,
-                )
-            )
-        except Exception as e:
-            print(f"Error processing question: {e}")
-            responses.append(
-                ResultEval(
-                    question=prompt,
-                    response="Error",
-                    filtered_answer="",
-                    correct_answer=correct_answer,
-                    answer_success=0.0,
-                )
-            )
-        run_counter += 1
-    return responses
-
-
 def gsm8k_eval(responses: list[ResultEval]):
     correct = 0
 
@@ -272,13 +182,35 @@ def hendrycks_math_format(dataset: Dataset) -> list[ResultEval]:
     return questions
 
 
-def hendrycks_math_run(
+def hendrycks_math_eval(responses: list[ResultEval]):
+    correct = 0
+
+    for res in responses:
+        opt_res = re.search(r"\\boxed\{(.*)\}", res.response)
+        norm_res = opt_res.group(1) if opt_res else "N/A"
+        norm_correct = normalize_number(res.correct_answer)
+        res.filtered_answer = norm_res
+
+        if (
+            (norm_res in norm_correct)
+            or (normalize_list(norm_res) == normalize_list(norm_correct))
+            or (symbolic_equal(norm_res, norm_correct))
+        ):
+            correct += 1
+            res.answer_success = 1.0
+
+    accuracy = correct / len(responses) * 100
+    print(f"Accuracy: {accuracy:.2f}%")
+    print(f"Total: {len(responses)}")
+    print(f"Correct: {correct}")
+
+def benchmark_run(
     questions: list[ResultEval], max_run: int, test: bool
 ) -> list[ResultEval]:
     responses = []
     run_counter = 0
     agent = LLM().create_custom_agent(LLM().get_tools())
-    for q in questions[10:]:
+    for q in questions[40:]:
         if run_counter >= max_run:
             break
         prompt = q.question
@@ -318,34 +250,11 @@ def hendrycks_math_run(
     return responses
 
 
-def hendrycks_math_eval(responses: list[ResultEval]):
-    correct = 0
-
-    for res in responses:
-        opt_res = re.search(r"\\boxed\{(.*)\}", res.response)
-        norm_res = opt_res.group(1) if opt_res else "N/A"
-        norm_correct = normalize_number(res.correct_answer)
-        res.filtered_answer = norm_res
-
-        if (
-            (norm_res in norm_correct)
-            or (normalize_list(norm_res) == normalize_list(norm_correct))
-            or (symbolic_equal(norm_res, norm_correct))
-        ):
-            correct += 1
-            res.answer_success = 1.0
-
-    accuracy = correct / len(responses) * 100
-    print(f"Accuracy: {accuracy:.2f}%")
-    print(f"Total: {len(responses)}")
-    print(f"Correct: {correct}")
-
-
 def use_gpqa(max_run: int, test: bool, model_name: str):
     ds = load_dataset("Idavidrein/gpqa", "gpqa_diamond")
     data = ds["train"]
     questions = gpqa_format(data)
-    responses = gpqa_run(questions, max_run=max_run, test=test)
+    responses = benchmark_run(questions, max_run=max_run, test=test)
     gpqa_eval(responses)
     save_eval_results(responses, model_name=model_name)
 
@@ -354,7 +263,7 @@ def use_gsm8k(max_run: int, test: bool, model_name: str):
     ds = load_dataset("gsm8k", "main")
     data = ds["test"]
     questions = gsm8k_format(data)
-    responses = gsm8k_run(questions, max_run=max_run, test=test)
+    responses = benchmark_run(questions, max_run=max_run, test=test)
     gsm8k_eval(responses)
     save_eval_results(responses, model_name=model_name)
 
@@ -363,6 +272,6 @@ def use_hendrycks_math(max_run: int, test: bool, model_name: str, type: str):
     ds = load_dataset("EleutherAI/hendrycks_math", type)
     data = ds["test"]
     questions = hendrycks_math_format(data)
-    responses = hendrycks_math_run(questions, max_run=max_run, test=test)
+    responses = benchmark_run(questions, max_run=max_run, test=test)
     hendrycks_math_eval(responses)
     save_eval_results(responses, model_name=model_name)

From 4cf3a015caf88f9aaea3016555d9c1bbb241a387 Mon Sep 17 00:00:00 2001
From: Raggini Marco <marco.raggini2@studio.unibo.it>
Date: Thu, 16 Apr 2026 17:15:31 +0200
Subject: [PATCH 04/27] feat: gaia benchmark added

---
 GoT/model/utils/hf_formatter.py | 45 +++++++++++++++++++++++++++++++++
 GoT/model/utils/parse_args.py   |  6 +++--
 2 files changed, 49 insertions(+), 2 deletions(-)

diff --git a/GoT/model/utils/hf_formatter.py b/GoT/model/utils/hf_formatter.py
index fbbdb59..a26ecce 100644
--- a/GoT/model/utils/hf_formatter.py
+++ b/GoT/model/utils/hf_formatter.py
@@ -250,6 +250,43 @@ def benchmark_run(
     return responses
 
 
+def gaia_format(dataset: Dataset) -> list[ResultEval]:
+    questions = []
+    for data in dataset:
+        sample = data
+        question = sample["Question"]
+        correct_answer = sample["Final answer"]
+        prompt = (
+            "Answer the following question. Think step by step before answering.\n\n"
+            f"{question}\n"
+            "Answer:"
+        )
+
+        questions.append(
+            ResultEval.create_empty_result(
+                question=prompt, correct_answer=correct_answer
+            )
+        )
+
+    return questions
+
+def gaia_eval(responses: list[ResultEval]):
+    correct = 0
+
+    for res in responses:
+        norm_res = normalize_number(res.response)
+        norm_correct = normalize_number(res.correct_answer)
+        res.filtered_answer = norm_res
+
+        if norm_res in norm_correct:
+            correct += 1
+            res.answer_success = 1.0
+
+    accuracy = correct / len(responses) * 100
+    print(f"Accuracy: {accuracy:.2f}%")
+    print(f"Total: {len(responses)}")
+    print(f"Correct: {correct}")
+
 def use_gpqa(max_run: int, test: bool, model_name: str):
     ds = load_dataset("Idavidrein/gpqa", "gpqa_diamond")
     data = ds["train"]
@@ -275,3 +312,11 @@ def use_hendrycks_math(max_run: int, test: bool, model_name: str, type: str):
     responses = benchmark_run(questions, max_run=max_run, test=test)
     hendrycks_math_eval(responses)
     save_eval_results(responses, model_name=model_name)
+
+def use_gaia(max_run: int, test: bool, model_name: str):
+    ds = load_dataset("gaia-benchmark/GAIA", "2023_level1")
+    data = ds["test"]
+    questions = gaia_format(data)
+    responses = benchmark_run(questions, max_run=max_run, test=test)
+    gaia_eval(responses)
+    save_eval_results(responses, model_name=model_name)
diff --git a/GoT/model/utils/parse_args.py b/GoT/model/utils/parse_args.py
index d2437a3..6cb3b9d 100644
--- a/GoT/model/utils/parse_args.py
+++ b/GoT/model/utils/parse_args.py
@@ -1,7 +1,7 @@
 import argparse
 import sys
 
-from GoT.model.utils.hf_formatter import use_gpqa, use_gsm8k, use_hendrycks_math
+from GoT.model.utils.hf_formatter import use_gaia, use_gpqa, use_gsm8k, use_hendrycks_math
 
 
 def defining_and_parse_args():
@@ -12,7 +12,7 @@ def defining_and_parse_args():
         "--benchmark",
         required=True,
         type=str,
-        choices=["gsm8k", "gpqa", "hendrycks_math"],
+        choices=["gsm8k", "gpqa", "hendrycks_math", "gaia"],
         help="The benchmark to run the model on.",
     )
     parser.add_argument(
@@ -62,3 +62,5 @@ def call_benchmark(args):
         use_gpqa(max_run=max_run, test=test, model_name=mode)
     elif args.benchmark == "hendrycks_math":
         use_hendrycks_math(max_run=max_run, test=test, model_name=mode, type=args.type)
+    elif args.benchmark == "gaia":
+        use_gaia(max_run=max_run, test=test, model_name=mode)

From 929ebd036f103aad615b2192274fcbb744b80b7c Mon Sep 17 00:00:00 2001
From: Raggini Marco <marco.raggini2@studio.unibo.it>
Date: Fri, 17 Apr 2026 15:34:16 +0200
Subject: [PATCH 05/27] chore: simplify code

---
 GoT/model/utils/hf_formatter.py | 35 +++++++++++++++++++++------------
 1 file changed, 22 insertions(+), 13 deletions(-)

diff --git a/GoT/model/utils/hf_formatter.py b/GoT/model/utils/hf_formatter.py
index a26ecce..5c9b162 100644
--- a/GoT/model/utils/hf_formatter.py
+++ b/GoT/model/utils/hf_formatter.py
@@ -1,7 +1,9 @@
 import json
+import os
 from random import shuffle
 import re
 from datasets import Dataset, load_dataset
+from huggingface_hub import hf_hub_download
 
 from langchain.messages import HumanMessage
 
@@ -14,6 +16,7 @@
     symbolic_equal,
 )
 
+TOKEN = os.getenv("HF_TOKEN")
 
 class ResultEval:
     def __init__(
@@ -210,7 +213,7 @@ def benchmark_run(
     responses = []
     run_counter = 0
     agent = LLM().create_custom_agent(LLM().get_tools())
-    for q in questions[40:]:
+    for q in questions:
         if run_counter >= max_run:
             break
         prompt = q.question
@@ -255,6 +258,16 @@ def gaia_format(dataset: Dataset) -> list[ResultEval]:
     for data in dataset:
         sample = data
         question = sample["Question"]
+        attachment = sample.get("file_name", None)
+        if attachment:
+            abs_path = hf_hub_download(
+                    repo_id="gaia-benchmark/GAIA",
+                    filename=f"2023/validation/{attachment}",
+                    repo_type="dataset",
+                    token=TOKEN
+                )
+            print(abs_path)
+            question += f"\nAttachment file path: {abs_path}"
         correct_answer = sample["Final answer"]
         prompt = (
             "Answer the following question. Think step by step before answering.\n\n"
@@ -288,35 +301,31 @@ def gaia_eval(responses: list[ResultEval]):
     print(f"Correct: {correct}")
 
 def use_gpqa(max_run: int, test: bool, model_name: str):
-    ds = load_dataset("Idavidrein/gpqa", "gpqa_diamond")
-    data = ds["train"]
-    questions = gpqa_format(data)
+    ds = load_dataset("Idavidrein/gpqa", "gpqa_diamond", split="train")
+    questions = gpqa_format(ds)
     responses = benchmark_run(questions, max_run=max_run, test=test)
     gpqa_eval(responses)
     save_eval_results(responses, model_name=model_name)
 
 
 def use_gsm8k(max_run: int, test: bool, model_name: str):
-    ds = load_dataset("gsm8k", "main")
-    data = ds["test"]
-    questions = gsm8k_format(data)
+    ds = load_dataset("gsm8k", "main", split="test")
+    questions = gsm8k_format(ds)
     responses = benchmark_run(questions, max_run=max_run, test=test)
     gsm8k_eval(responses)
     save_eval_results(responses, model_name=model_name)
 
 
 def use_hendrycks_math(max_run: int, test: bool, model_name: str, type: str):
-    ds = load_dataset("EleutherAI/hendrycks_math", type)
-    data = ds["test"]
-    questions = hendrycks_math_format(data)
+    ds = load_dataset("EleutherAI/hendrycks_math", type, split="test")
+    questions = hendrycks_math_format(ds)
     responses = benchmark_run(questions, max_run=max_run, test=test)
     hendrycks_math_eval(responses)
     save_eval_results(responses, model_name=model_name)
 
 def use_gaia(max_run: int, test: bool, model_name: str):
-    ds = load_dataset("gaia-benchmark/GAIA", "2023_level1")
-    data = ds["test"]
-    questions = gaia_format(data)
+    ds = load_dataset("gaia-benchmark/GAIA", "2023_level1", split="validation")
+    questions = gaia_format(ds)
     responses = benchmark_run(questions, max_run=max_run, test=test)
     gaia_eval(responses)
     save_eval_results(responses, model_name=model_name)

From 104510152b31e288879e284d27614c40d1cd4a1a Mon Sep 17 00:00:00 2001
From: Raggini Marco <marco.raggini2@studio.unibo.it>
Date: Fri, 17 Apr 2026 15:34:47 +0200
Subject: [PATCH 06/27] chore: add explanation of tool needed

---
 GoT/model/graph_model.py   | 4 ++++
 GoT/model/runtime_graph.py | 2 ++
 2 files changed, 6 insertions(+)

diff --git a/GoT/model/graph_model.py b/GoT/model/graph_model.py
index db01896..ab6cc1c 100644
--- a/GoT/model/graph_model.py
+++ b/GoT/model/graph_model.py
@@ -82,6 +82,7 @@
     LLM().get_craft_tool(),
     SystemMessage(
         """
+        You are a master coder specialized in crafting tools for other agents.
         You create reusable Python tools for other agents.
 
         The tool must be GENERAL and parameterized.
@@ -236,6 +237,7 @@ def tool_call(messages: MessagesState):
         )
     tool_used = extract_tool_used(res)
     runtime_graph.temp_response.response = parse_response_for_tool_node(res).response
+    runtime_graph.temp_response.explanation = parse_response_for_tool_node(res).explanation
     parsed_res = f"Response: {parse_response_for_tool_node(res).response}\nExplanation: {parse_response_for_tool_node(res).explanation}"
     runtime_graph.resolve_node(call_node, parsed_res)
 
@@ -289,8 +291,10 @@ def crafting(messages: MessagesState):
     runtime_graph.add_node(crafting_node)
     runtime_graph.add_edge(runtime_graph.temp_node, crafting_node)
     runtime_graph.temp_node = crafting_node
+    ai_feedback = runtime_graph.temp_response.explanation
     crafting_messages = [
         HumanMessage(content="Original task:\n" + parse_response(runtime_graph.goal)),
+        AIMessage(content=ai_feedback),
         SystemMessage(
             content="Craft a tool to solve this problem using craft_tool. It must be a function"
         ),
diff --git a/GoT/model/runtime_graph.py b/GoT/model/runtime_graph.py
index 506f3d9..6047e23 100644
--- a/GoT/model/runtime_graph.py
+++ b/GoT/model/runtime_graph.py
@@ -107,10 +107,12 @@ class ResponseNode(RuntimeNode):
     def __init__(
         self,
         response: str,
+        explanation: str = "",
         resolved: bool = False,
     ):
         super().__init__(resolved)
         self.response = response
+        self.explanation = explanation
 
 
 class CraftingNode(RuntimeNode):

From c840f4743490d3dc4a544d0706898f45be169698 Mon Sep 17 00:00:00 2001
From: Raggini Marco <marco.raggini2@studio.unibo.it>
Date: Tue, 21 Apr 2026 10:22:33 +0200
Subject: [PATCH 07/27] chore: remove comments

---
 GoT/model/graph_model.py | 18 ------------------
 1 file changed, 18 deletions(-)

diff --git a/GoT/model/graph_model.py b/GoT/model/graph_model.py
index ab6cc1c..d289e95 100644
--- a/GoT/model/graph_model.py
+++ b/GoT/model/graph_model.py
@@ -158,22 +158,6 @@ def goal(prompt: MessagesState):
     return prompt
 
 
-# def tool_expand(goal: MessagesState):
-#     msg = parse_response(goal)
-#     sys_msg = "Please make a list using '-' to denote each tool in a probabilistic order, don't use this character for other reasons. Select only the tool(s) you want to use to solve this problem."
-#     messages = [
-#         HumanMessage(msg),
-#         SystemMessage(sys_msg),
-#     ]
-#     res = starting_agent.invoke({"messages": messages}, config={"recursion_limit": MAX_INTERACTIONS})
-#     str_res = parse_response(res)
-#     goal["messages"].append(AIMessage(content=str_res))
-#     # tool_list = parse_tool_list(str_res)  # Toglie elementi inutili
-#     # add tool nodes in the runtime graph
-
-#     return goal
-
-
 def tool_reasoning(messages: MessagesState):
     messages["messages"].append(
         HumanMessage(
@@ -438,7 +422,6 @@ def call_graph(prompt: str):
 def invoke_graph():
     graph = StateGraph(MessagesState)
     graph.add_node(goal)
-    # graph.add_node(tool_expand)
     graph.add_node(tool_reasoning)
     graph.add_node(tool_call)
     graph.add_node(backtrack)
@@ -447,7 +430,6 @@ def invoke_graph():
     graph.add_node(response_evaluation)
     graph.add_node(reasoning_mode)
     graph.add_edge(START, "goal")
-    # graph.add_edge("goal", "tool_expand")
     graph.add_edge("goal", "tool_reasoning")
     graph.add_edge("tool_reasoning", "tool_call")
     graph.add_edge("tool_call", "response_evaluation")

From 5bd0c625b26434fb6e3524ac424be1ce96b2002e Mon Sep 17 00:00:00 2001
From: Raggini Marco <marco.raggini2@studio.unibo.it>
Date: Wed, 22 Apr 2026 10:14:56 +0200
Subject: [PATCH 08/27] chore: add method to download mlflow traces

---
 GoT/model/utils/utils.py | 5 +++++
 1 file changed, 5 insertions(+)

diff --git a/GoT/model/utils/utils.py b/GoT/model/utils/utils.py
index ac14a65..3ae103a 100644
--- a/GoT/model/utils/utils.py
+++ b/GoT/model/utils/utils.py
@@ -1,6 +1,7 @@
 import json
 import re
 
+import mlflow
 import numpy as np
 from sympy import simplify, sympify
 
@@ -227,3 +228,7 @@ def print_benchmark_result_loglikehood(
     print(f"Total: {n_total}")
     print(f"Correct: {n_correct}")
     print(f"Wrong: {n_wrong}")
+
+def download_mlflow_traces(n_max: int):
+    traces = mlflow.search_traces(max_results=n_max, order_by=["timestamp DESC"])
+    traces.to_csv("traces.csv", index=False)

From b4a4dda80f734ec444575911e281e31dd64771d6 Mon Sep 17 00:00:00 2001
From: Raggini Marco <marco.raggini2@studio.unibo.it>
Date: Thu, 23 Apr 2026 16:27:23 +0200
Subject: [PATCH 09/27] chore: change var name and simplify code

---
 GoT/model/graph_model.py | 22 +++++++++-------------
 1 file changed, 9 insertions(+), 13 deletions(-)

diff --git a/GoT/model/graph_model.py b/GoT/model/graph_model.py
index d289e95..7769781 100644
--- a/GoT/model/graph_model.py
+++ b/GoT/model/graph_model.py
@@ -82,7 +82,7 @@
     LLM().get_craft_tool(),
     SystemMessage(
         """
-        You are a master coder specialized in crafting tools for other agents.
+        You are a specialized in coding and write new useful method.
         You create reusable Python tools for other agents.
 
         The tool must be GENERAL and parameterized.
@@ -311,28 +311,24 @@ def crafting(messages: MessagesState):
 
 
 def test_result(messages: MessagesState):
-    n = runtime_graph.exist_tool_available()
+    is_tool_path_available = runtime_graph.exist_tool_available()
     test_node = runtime_graph.temp_node
     if not isinstance(test_node, TestNode):
         raise TypeError("Expected TestNode for scoring")
-
-    if test_node.score >= (
-        COMPLEXITY_THRESHOLD - COMPLEXITY_COEFFICIENT * test_node.problem_complexity
-    ):
+    threshold = COMPLEXITY_THRESHOLD - COMPLEXITY_COEFFICIENT * test_node.problem_complexity
+    if test_node.score >= threshold:
         runtime_graph.add_edge(test_node, runtime_graph.temp_response)
         runtime_graph.temp_response.resolved = True
         return END
     elif (
-        test_node.score
-        < (COMPLEXITY_THRESHOLD - COMPLEXITY_COEFFICIENT * test_node.problem_complexity)
-        and n is True
+        test_node.score < threshold
+        and is_tool_path_available is True
         and test_node.need_tool_crafting is True
     ):
         return "crafting"
     elif (
-        test_node.score
-        < (COMPLEXITY_THRESHOLD - COMPLEXITY_COEFFICIENT * test_node.problem_complexity)
-        and n is True
+        test_node.score < threshold
+        and is_tool_path_available is True
     ):
         if test_node.need_tool_crafting is True:
             test_node.response = "The problem is too complex to craft a new tool, try reason step by step or divide complexity."
@@ -340,7 +336,7 @@ def test_result(messages: MessagesState):
     elif (
         test_node.score
         < (COMPLEXITY_THRESHOLD - COMPLEXITY_COEFFICIENT * test_node.problem_complexity)
-        and n is False
+        and is_tool_path_available is False
         and runtime_graph.exist_reasoning_node_available()
     ):
         return "reasoning_mode"

From 0c93f80a6bb0d352a8447b2bc6cdc8ec7a1c85dd Mon Sep 17 00:00:00 2001
From: Raggini Marco <marco.raggini2@studio.unibo.it>
Date: Fri, 24 Apr 2026 10:47:38 +0200
Subject: [PATCH 10/27] feat: add wikipedia and arxiv tools

---
 GoT/model/ollama_llm.py |  3 +-
 GoT/tools/web_tool.py   | 54 +++++++++++++++++++++++++
 poetry.lock             | 89 ++++++++++++++++++++++++++++++++++++++++-
 pyproject.toml          |  2 +
 4 files changed, 146 insertions(+), 2 deletions(-)
 create mode 100644 GoT/tools/web_tool.py

diff --git a/GoT/model/ollama_llm.py b/GoT/model/ollama_llm.py
index 65bfcf2..5a59369 100644
--- a/GoT/model/ollama_llm.py
+++ b/GoT/model/ollama_llm.py
@@ -20,6 +20,7 @@
 )
 
 from GoT.tools.craft_tool import craft_tool, install_dependency
+from GoT.tools.web_tool import search_arxiv, search_wikipedia
 
 load_dotenv()
 
@@ -71,7 +72,7 @@ def __init__(self):
             self.system_prompt = SystemMessage(SYSTEM_PROMPT_GENERAL)
 
     def get_tools(self):
-        initial_tools = [summing, minus, square_root, multiply, divide]
+        initial_tools = [summing, minus, square_root, multiply, divide, search_wikipedia, search_arxiv]
         crafted_tools = self.get_crafted_tools()
         return initial_tools + crafted_tools
 
diff --git a/GoT/tools/web_tool.py b/GoT/tools/web_tool.py
new file mode 100644
index 0000000..5987bce
--- /dev/null
+++ b/GoT/tools/web_tool.py
@@ -0,0 +1,54 @@
+import arxiv
+from langchain.tools import tool
+import wikipedia
+
+@tool
+def search_wikipedia(query: str) -> str:
+    """
+    Fetch a brief summary from Wikipedia.
+
+    Args:
+        query (str): The keyword or topic to search for.
+
+    Returns:
+        str: A 3-sentence summary of the topic, the first option if 
+             ambiguous, or an error message if not found.
+    """
+    try:
+        return wikipedia.summary(query, sentences=3)
+    except wikipedia.DisambiguationError as e:
+        # happens when query is ambiguous, pick first option
+        return wikipedia.summary(e.options[0], sentences=3)
+    except wikipedia.PageError:
+        return "Page not found"
+    
+@tool
+def search_arxiv(query: str) -> str:
+    """Search ArXiv for scientific papers on a given topic. 
+    Use this when you need to find research papers, abstracts or academic references."""
+    
+    try:
+        client = arxiv.Client()
+        search = arxiv.Search(
+            query=query,
+            max_results=3,
+            sort_by=arxiv.SortCriterion.Relevance
+        )
+        
+        results = []
+        for paper in client.results(search):
+            results.append(
+                f"Title: {paper.title}\n"
+                f"Authors: {', '.join(a.name for a in paper.authors)}\n"
+                f"Published: {paper.published.strftime('%Y-%m-%d')}\n"
+                f"Summary: {paper.summary[:300]}...\n"
+                f"URL: {paper.entry_id}\n"
+            )
+        
+        if not results:
+            return "No papers found for this query."
+        
+        return "\n---\n".join(results)
+    
+    except Exception as e:
+        return f"Error searching ArXiv: {str(e)}"
\ No newline at end of file
diff --git a/poetry.lock b/poetry.lock
index e6e3dce..f5f1ce4 100644
--- a/poetry.lock
+++ b/poetry.lock
@@ -251,6 +251,22 @@ typing_extensions = {version = ">=4.5", markers = "python_version < \"3.13\""}
 [package.extras]
 trio = ["trio (>=0.31.0)", "trio (>=0.32.0)"]
 
+[[package]]
+name = "arxiv"
+version = "3.0.0"
+description = "Python wrapper for the arXiv API"
+optional = false
+python-versions = ">=3.10"
+files = [
+    {file = "arxiv-3.0.0-py3-none-any.whl", hash = "sha256:8b4d4e2e336bfeb71ea653623d7dadb260f682f0475cee2aecad0560a23b34db"},
+    {file = "arxiv-3.0.0.tar.gz", hash = "sha256:c8cb0d31208afbc1ceb17bd3f9816c8d4c5ca1e0abf199d211e216715440498d"},
+]
+
+[package.dependencies]
+feedparser = ">=6.0.10,<6.1.0"
+requests = ">=2.32,<2.34"
+typing-extensions = {version = ">=4.0.0", markers = "python_version < \"3.11\""}
+
 [[package]]
 name = "async-timeout"
 version = "5.0.1"
@@ -273,6 +289,28 @@ files = [
     {file = "attrs-25.4.0.tar.gz", hash = "sha256:16d5969b87f0859ef33a48b35d55ac1be6e42ae49d5e853b597db70c35c57e11"},
 ]
 
+[[package]]
+name = "beautifulsoup4"
+version = "4.14.3"
+description = "Screen-scraping library"
+optional = false
+python-versions = ">=3.7.0"
+files = [
+    {file = "beautifulsoup4-4.14.3-py3-none-any.whl", hash = "sha256:0918bfe44902e6ad8d57732ba310582e98da931428d231a5ecb9e7c703a735bb"},
+    {file = "beautifulsoup4-4.14.3.tar.gz", hash = "sha256:6292b1c5186d356bba669ef9f7f051757099565ad9ada5dd630bd9de5fa7fb86"},
+]
+
+[package.dependencies]
+soupsieve = ">=1.6.1"
+typing-extensions = ">=4.0.0"
+
+[package.extras]
+cchardet = ["cchardet"]
+chardet = ["chardet"]
+charset-normalizer = ["charset-normalizer"]
+html5lib = ["html5lib"]
+lxml = ["lxml"]
+
 [[package]]
 name = "blinker"
 version = "1.9.0"
@@ -1290,6 +1328,20 @@ files = [
 [package.extras]
 devel = ["colorama", "json-spec", "jsonschema", "pylint", "pytest", "pytest-benchmark", "pytest-cache", "validictory"]
 
+[[package]]
+name = "feedparser"
+version = "6.0.12"
+description = "Universal feed parser, handles RSS 0.9x, RSS 1.0, RSS 2.0, CDF, Atom 0.3, and Atom 1.0 feeds"
+optional = false
+python-versions = ">=3.6"
+files = [
+    {file = "feedparser-6.0.12-py3-none-any.whl", hash = "sha256:6bbff10f5a52662c00a2e3f86a38928c37c48f77b3c511aedcd51de933549324"},
+    {file = "feedparser-6.0.12.tar.gz", hash = "sha256:64f76ce90ae3e8ef5d1ede0f8d3b50ce26bcce71dd8ae5e82b1cd2d4a5f94228"},
+]
+
+[package.dependencies]
+sgmllib3k = "*"
+
 [[package]]
 name = "filelock"
 version = "3.24.2"
@@ -5611,6 +5663,16 @@ enabler = ["pytest-enabler (>=2.2)"]
 test = ["build[virtualenv] (>=1.0.3)", "filelock (>=3.4.0)", "ini2toml[lite] (>=0.14)", "jaraco.develop (>=7.21)", "jaraco.envs (>=2.2)", "jaraco.path (>=3.7.2)", "jaraco.test (>=5.5)", "packaging (>=24.2)", "pip (>=19.1)", "pyproject-hooks (!=1.1)", "pytest (>=6,!=8.1.*)", "pytest-home (>=0.5)", "pytest-perf", "pytest-subprocess", "pytest-timeout", "pytest-xdist (>=3)", "tomli-w (>=1.0.0)", "virtualenv (>=13.0.0)", "wheel (>=0.44.0)"]
 type = ["importlib_metadata (>=7.0.2)", "jaraco.develop (>=7.21)", "mypy (==1.18.*)", "pytest-mypy"]
 
+[[package]]
+name = "sgmllib3k"
+version = "1.0.0"
+description = "Py3k port of sgmllib."
+optional = false
+python-versions = "*"
+files = [
+    {file = "sgmllib3k-1.0.0.tar.gz", hash = "sha256:7868fb1c8bfa764c1ac563d3cf369c381d1325d36124933a726f29fcdaa812e9"},
+]
+
 [[package]]
 name = "shellingham"
 version = "1.5.4"
@@ -5676,6 +5738,17 @@ files = [
     {file = "sniffio-1.3.1.tar.gz", hash = "sha256:f4324edc670a0f49750a81b895f35c3adb843cca46f0530f79fc1babb23789dc"},
 ]
 
+[[package]]
+name = "soupsieve"
+version = "2.8.3"
+description = "A modern CSS selector implementation for Beautiful Soup."
+optional = false
+python-versions = ">=3.9"
+files = [
+    {file = "soupsieve-2.8.3-py3-none-any.whl", hash = "sha256:ed64f2ba4eebeab06cc4962affce381647455978ffc1e36bb79a545b91f45a95"},
+    {file = "soupsieve-2.8.3.tar.gz", hash = "sha256:3267f1eeea4251fb42728b6dfb746edc9acaffc4a45b27e19450b676586e8349"},
+]
+
 [[package]]
 name = "sqlalchemy"
 version = "2.0.46"
@@ -6367,6 +6440,20 @@ markupsafe = ">=2.1.1"
 [package.extras]
 watchdog = ["watchdog (>=2.3)"]
 
+[[package]]
+name = "wikipedia"
+version = "1.4.0"
+description = "Wikipedia API for Python"
+optional = false
+python-versions = "*"
+files = [
+    {file = "wikipedia-1.4.0.tar.gz", hash = "sha256:db0fad1829fdd441b1852306e9856398204dc0786d2996dd2e0c8bb8e26133b2"},
+]
+
+[package.dependencies]
+beautifulsoup4 = "*"
+requests = ">=2.0.0,<3.0.0"
+
 [[package]]
 name = "word2number"
 version = "1.1"
@@ -6868,4 +6955,4 @@ cffi = ["cffi (>=1.17,<2.0)", "cffi (>=2.0.0b)"]
 [metadata]
 lock-version = "2.0"
 python-versions = ">= 3.10.0 < 3.14.0"
-content-hash = "c7c8e8591227891fa161988b58a7a8e708d8769ce133faf007103f95df4a0ef4"
+content-hash = "afb0288bd77331357bfc74c7862dee098b73e2fce3028027cc78717cf022970b"
diff --git a/pyproject.toml b/pyproject.toml
index 95c44ad..20e60bf 100644
--- a/pyproject.toml
+++ b/pyproject.toml
@@ -19,6 +19,8 @@ dotenv = "^0.9.9"
 mlflow = "^3.9.0"
 lm-eval = {extras = ["math"], version = "^0.4.11"}
 boto3 = "^1.42.51"
+wikipedia = "^1.4.0"
+arxiv = "^3.0.0"
 
 [tool.poetry.group.dev.dependencies]
 coverage = "^7.4.0"

From ea2f8e25ffb4e2b16f88edf4a5b3e0b67876b72e Mon Sep 17 00:00:00 2001
From: Raggini Marco <marco.raggini2@studio.unibo.it>
Date: Fri, 24 Apr 2026 10:47:51 +0200
Subject: [PATCH 11/27] fix: fix names in type arg

---
 GoT/model/utils/parse_args.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/GoT/model/utils/parse_args.py b/GoT/model/utils/parse_args.py
index 6cb3b9d..818903a 100644
--- a/GoT/model/utils/parse_args.py
+++ b/GoT/model/utils/parse_args.py
@@ -39,7 +39,7 @@ def defining_and_parse_args():
             "intermediate_algebra",
             "number_theory",
             "precalculus",
-            "statistics",
+            "prealgebra"
         ],
         help="The type of math problems to run, only for hendrycks_math.",
     )

From 8e8ff331842f12d4340ea5489c9d74b4b350759d Mon Sep 17 00:00:00 2001
From: Raggini Marco <marco.raggini2@studio.unibo.it>
Date: Sat, 25 Apr 2026 14:24:23 +0200
Subject: [PATCH 12/27] chore: add specific crafter LLM and improve prompt

---
 GoT/model/graph_model.py | 89 ++++++++++++++++++++++++++++++++++------
 GoT/model/ollama_llm.py  |  8 +++-
 2 files changed, 84 insertions(+), 13 deletions(-)

diff --git a/GoT/model/graph_model.py b/GoT/model/graph_model.py
index 7769781..8fc881a 100644
--- a/GoT/model/graph_model.py
+++ b/GoT/model/graph_model.py
@@ -103,18 +103,80 @@ def multiply(a: float, b: float) -> float:
         '
             return a * b
 
+        
+        Bad example (hardcoded/placeholder result):
+        def search_papers(query: str) -> str:
+            return "Results about " + query  # WRONG: never return hardcoded strings
+
+        Good example (real API call):
+        def search_papers(query: str) -> str:
+            '
+            Arguments:
+            query: the search query string
+            Returns:
+            A string with real results fetched from the API
+            '
+            import arxiv
+            client = arxiv.Client()
+            search = arxiv.Search(query=query, max_results=3)
+            results = [p.title + ": " + p.summary[:200] for p in client.results(search)]
+            return "\\n".join(results)    
+
+        Bad example: Too specific
+        def get_oldest_blu_ray_title(spreadsheet_path: str) -> str:
+            "
+            Analyzes a spreadsheet to find the oldest Blu-Ray title.
+
+            Arguments:
+            spreadsheet_path: The file path to the spreadsheet (e.g., 'C:/Users/user/data.xlsx').
+
+            Returns:
+            The title of the oldest Blu-Ray as it appears in the spreadsheet.
+            "
+            import pandas as pd
+
+            df = pd.read_excel(spreadsheet_path)
+
+            # Assuming 'Format' column for media type and 'Recording Date' for date
+            blu_rays = df[df['Format'] == 'Blu-Ray']
+
+            if blu_rays.empty:
+                return "No Blu-Ray titles found."
+
+            # Ensure 'Recording Date' is in datetime format for proper comparison
+            blu_rays['Recording Date'] = pd.to_datetime(blu_rays['Recording Date'])
+
+            oldest_blu_ray = blu_rays.sort_values(by='Recording Date', ascending=True).iloc[0]
+
+            return oldest_blu_ray['Title']
+
+        Good example
+        def open_excel_files(excel_path: str) -> str:
+            "
+            Analyzes a spreadsheet.
+            Arguments:
+            excel_path: The file path to the spreadsheet (e.g., 'C:/Users/user/data.xlsx').
+
+            Returns:
+            The content of the Excel file as a string
+            "
+            import pandas as pd
+
+            df = pd.read_excel(excel_path)
+            return df.to_string()
+
         Rules:
         - Prefer generic names and parameters, never craft specific functions.
         - If the function contains specific numbers or values, it is wrong.
-        - Craft only one function, it must contains always the docs.
+        - Never return hardcoded or placeholder strings, the function must fetch real data.
+        - Craft a maximum of 3 tools, it must contains always the docs. If the number of tool crafted exceed, you fail.
         - Never craft tool that raise exceptions.
-        - Respond ONLY using the tool available.
         - No natural language.
-        - No comments in the python interpreter.
+        - No more than 1 line comments in the python codes.
         """
     ),
     response_format=Response,
-    type="remote_response_format",
+    type="remote_crafter",
 )
 
 reasoning_agent = LLM().create_custom_agent(
@@ -280,16 +342,19 @@ def crafting(messages: MessagesState):
         HumanMessage(content="Original task:\n" + parse_response(runtime_graph.goal)),
         AIMessage(content=ai_feedback),
         SystemMessage(
-            content="Craft a tool to solve this problem using craft_tool. It must be a function"
+            content="Use the context given to craft a tool to solve this problem using craft_tool. It must be a function"
         ),
     ]
-    craft_res = crafter_agent.invoke(
-        {"messages": crafting_messages}, config={"recursion_limit": MAX_INTERACTIONS}
-    )
-    runtime_graph.temp_response.response = parse_response_for_tool_node(
-        craft_res
-    ).response
-    parsed_res = f"Response: {parse_response_for_tool_node(craft_res).response}\nExplanation: {parse_response_for_tool_node(craft_res).explanation}"
+    try:    
+        craft_res = crafter_agent.invoke(
+            {"messages": crafting_messages}, config={"recursion_limit": MAX_INTERACTIONS}
+        )
+        parsed_res = parse_response(craft_res)
+    except Exception:   
+        parsed_res = ""
+    # runtime_graph.temp_response.response = parse_response_for_tool_node(
+    #     craft_res
+    # ).response
     runtime_graph.resolve_node(crafting_node, parsed_res)
     runtime_graph.temp_node = runtime_graph.call_tool_node()
     runtime_graph.add_edge(crafting_node, runtime_graph.temp_node)
diff --git a/GoT/model/ollama_llm.py b/GoT/model/ollama_llm.py
index 5a59369..e06bfe9 100644
--- a/GoT/model/ollama_llm.py
+++ b/GoT/model/ollama_llm.py
@@ -57,6 +57,11 @@ def __init__(self):
                 api_key=os.environ.get("GEMINI_API_KEY"),
                 temperature=1.0,  # Gemini 3.0+ defaults to 1.0
             )
+            self.remoteLLMCrafter = ChatGoogleGenerativeAI(
+                model="gemini-3-flash-preview",
+                api_key=os.environ.get("GEMINI_API_KEY"),
+                temperature=1.0,  # Gemini 3.0+ defaults to 1.0
+            )
             self.remoteLLMScoreFormat = ChatGoogleGenerativeAI(
                 model="gemini-2.5-flash",
                 api_key=os.environ.get("GEMINI_API_KEY"),
@@ -67,12 +72,13 @@ def __init__(self):
                 "remote_standard": self.remoteLLMStandard,
                 "remote_response_format": self.remoteLLMResponseFormat,
                 "remote_score_format": self.remoteLLMScoreFormat,
+                "remote_crafter": self.remoteLLMCrafter
             }
 
             self.system_prompt = SystemMessage(SYSTEM_PROMPT_GENERAL)
 
     def get_tools(self):
-        initial_tools = [summing, minus, square_root, multiply, divide, search_wikipedia, search_arxiv]
+        initial_tools = [summing, minus, square_root, multiply, divide]
         crafted_tools = self.get_crafted_tools()
         return initial_tools + crafted_tools
 

From f0cec829dd2559beae94b771245869e7df0ea768 Mon Sep 17 00:00:00 2001
From: Raggini Marco <marco.raggini2@studio.unibo.it>
Date: Sat, 25 Apr 2026 15:21:25 +0200
Subject: [PATCH 13/27] style: change system folder architecture

---
 GoT/__init__.py                                  | 8 ++++----
 GoT/{tools => agent_tools}/ai_tool.py            | 0
 GoT/{tools => agent_tools}/craft_tool.py         | 0
 GoT/{tools => agent_tools}/math_tool.py          | 0
 GoT/{tools => agent_tools}/runtime_graph_tool.py | 6 +++---
 GoT/{tools => agent_tools}/web_tool.py           | 0
 GoT/{model/utils => cli}/parse_args.py           | 2 +-
 GoT/{model => core}/graph_model.py               | 8 ++++----
 GoT/{model/ollama_llm.py => core/llm.py}         | 7 +++----
 GoT/{model => core}/runtime_graph.py             | 0
 GoT/{model/utils => experiments}/hf_formatter.py | 6 +++---
 GoT/{model => experiments}/lm_wrapper.py         | 6 +++---
 GoT/{model => }/utils/utils.py                   | 2 +-
 13 files changed, 22 insertions(+), 23 deletions(-)
 rename GoT/{tools => agent_tools}/ai_tool.py (100%)
 rename GoT/{tools => agent_tools}/craft_tool.py (100%)
 rename GoT/{tools => agent_tools}/math_tool.py (100%)
 rename GoT/{tools => agent_tools}/runtime_graph_tool.py (95%)
 rename GoT/{tools => agent_tools}/web_tool.py (100%)
 rename GoT/{model/utils => cli}/parse_args.py (96%)
 rename GoT/{model => core}/graph_model.py (99%)
 rename GoT/{model/ollama_llm.py => core/llm.py} (94%)
 rename GoT/{model => core}/runtime_graph.py (100%)
 rename GoT/{model/utils => experiments}/hf_formatter.py (98%)
 rename GoT/{model => experiments}/lm_wrapper.py (98%)
 rename GoT/{model => }/utils/utils.py (99%)

diff --git a/GoT/__init__.py b/GoT/__init__.py
index c9d8fc7..a5df598 100644
--- a/GoT/__init__.py
+++ b/GoT/__init__.py
@@ -3,10 +3,10 @@
 from dotenv import load_dotenv
 
 from lm_eval import evaluator, tasks
-from GoT.model.graph_model import call_graph
-from GoT.model.lm_wrapper import LangGraphBigBenchWrapper, TestBigBenchWrapper
-from GoT.model.utils.parse_args import call_benchmark, defining_and_parse_args
-from GoT.model.utils.utils import (
+from GoT.core.graph_model import call_graph
+from GoT.experiments.lm_wrapper import LangGraphBigBenchWrapper, TestBigBenchWrapper
+from GoT.cli.parse_args import call_benchmark, defining_and_parse_args
+from GoT.utils.utils import (
     print_benchmark_result,
     print_benchmark_result_loglikehood,
 )
diff --git a/GoT/tools/ai_tool.py b/GoT/agent_tools/ai_tool.py
similarity index 100%
rename from GoT/tools/ai_tool.py
rename to GoT/agent_tools/ai_tool.py
diff --git a/GoT/tools/craft_tool.py b/GoT/agent_tools/craft_tool.py
similarity index 100%
rename from GoT/tools/craft_tool.py
rename to GoT/agent_tools/craft_tool.py
diff --git a/GoT/tools/math_tool.py b/GoT/agent_tools/math_tool.py
similarity index 100%
rename from GoT/tools/math_tool.py
rename to GoT/agent_tools/math_tool.py
diff --git a/GoT/tools/runtime_graph_tool.py b/GoT/agent_tools/runtime_graph_tool.py
similarity index 95%
rename from GoT/tools/runtime_graph_tool.py
rename to GoT/agent_tools/runtime_graph_tool.py
index 6b2a7ef..7db745c 100644
--- a/GoT/tools/runtime_graph_tool.py
+++ b/GoT/agent_tools/runtime_graph_tool.py
@@ -1,9 +1,9 @@
 from langchain.messages import HumanMessage, SystemMessage
 from langchain.tools import tool
 
-from GoT.model.ollama_llm import LLM
-from GoT.model.runtime_graph import ReasoningNode, RuntimeGraph
-from GoT.model.utils.utils import parse_response
+from GoT.core.llm import LLM
+from GoT.core.runtime_graph import ReasoningNode, RuntimeGraph
+from GoT.utils.utils import parse_response
 
 MAX_INTERACTIONS = 10
 
diff --git a/GoT/tools/web_tool.py b/GoT/agent_tools/web_tool.py
similarity index 100%
rename from GoT/tools/web_tool.py
rename to GoT/agent_tools/web_tool.py
diff --git a/GoT/model/utils/parse_args.py b/GoT/cli/parse_args.py
similarity index 96%
rename from GoT/model/utils/parse_args.py
rename to GoT/cli/parse_args.py
index 818903a..b7605b4 100644
--- a/GoT/model/utils/parse_args.py
+++ b/GoT/cli/parse_args.py
@@ -1,7 +1,7 @@
 import argparse
 import sys
 
-from GoT.model.utils.hf_formatter import use_gaia, use_gpqa, use_gsm8k, use_hendrycks_math
+from GoT.experiments.hf_formatter import use_gaia, use_gpqa, use_gsm8k, use_hendrycks_math
 
 
 def defining_and_parse_args():
diff --git a/GoT/model/graph_model.py b/GoT/core/graph_model.py
similarity index 99%
rename from GoT/model/graph_model.py
rename to GoT/core/graph_model.py
index 8fc881a..b1ab35b 100644
--- a/GoT/model/graph_model.py
+++ b/GoT/core/graph_model.py
@@ -2,8 +2,8 @@
 from langchain_core.messages import HumanMessage, SystemMessage, AIMessage
 from langgraph.graph import StateGraph, MessagesState, START, END
 
-from GoT.model.ollama_llm import LLM
-from GoT.model.runtime_graph import (
+from GoT.core.llm import LLM
+from GoT.core.runtime_graph import (
     BacktrackNode,
     CompletitionNode,
     CraftingNode,
@@ -15,13 +15,13 @@
     TestNode,
     ToolNode,
 )
-from GoT.model.utils.utils import (
+from GoT.utils.utils import (
     extract_tool_used,
     parse_response,
     parse_response_for_tool_node,
     parse_score,
 )
-from GoT.tools.runtime_graph_tool import divide_thought
+from GoT.agent_tools.runtime_graph_tool import divide_thought
 
 SCORE_THRESHOLD = 5
 COMPLEXITY_COEFFICIENT = 0.5
diff --git a/GoT/model/ollama_llm.py b/GoT/core/llm.py
similarity index 94%
rename from GoT/model/ollama_llm.py
rename to GoT/core/llm.py
index e06bfe9..29d4513 100644
--- a/GoT/model/ollama_llm.py
+++ b/GoT/core/llm.py
@@ -11,7 +11,7 @@
 from langchain.agents import create_agent
 import mlflow
 
-from GoT.tools.math_tool import (
+from GoT.agent_tools.math_tool import (
     multiply,
     summing,
     minus,
@@ -19,8 +19,7 @@
     divide,
 )
 
-from GoT.tools.craft_tool import craft_tool, install_dependency
-from GoT.tools.web_tool import search_arxiv, search_wikipedia
+from GoT.agent_tools.craft_tool import craft_tool, install_dependency
 
 load_dotenv()
 
@@ -86,7 +85,7 @@ def get_craft_tool(self):
         return [craft_tool, install_dependency]
 
     def get_crafted_tools(self) -> list[BaseTool]:
-        module_name = "GoT.tools.ai_tool"
+        module_name = "GoT.agent_tools.ai_tool"
         if module_name in sys.modules:
             module = importlib.reload(sys.modules[module_name])
         else:
diff --git a/GoT/model/runtime_graph.py b/GoT/core/runtime_graph.py
similarity index 100%
rename from GoT/model/runtime_graph.py
rename to GoT/core/runtime_graph.py
diff --git a/GoT/model/utils/hf_formatter.py b/GoT/experiments/hf_formatter.py
similarity index 98%
rename from GoT/model/utils/hf_formatter.py
rename to GoT/experiments/hf_formatter.py
index 5c9b162..f7127a4 100644
--- a/GoT/model/utils/hf_formatter.py
+++ b/GoT/experiments/hf_formatter.py
@@ -7,9 +7,9 @@
 
 from langchain.messages import HumanMessage
 
-from GoT.model.graph_model import call_graph
-from GoT.model.ollama_llm import LLM
-from GoT.model.utils.utils import (
+from GoT.core.graph_model import call_graph
+from GoT.core.llm import LLM
+from GoT.utils.utils import (
     extract_output,
     normalize_list,
     normalize_number,
diff --git a/GoT/model/lm_wrapper.py b/GoT/experiments/lm_wrapper.py
similarity index 98%
rename from GoT/model/lm_wrapper.py
rename to GoT/experiments/lm_wrapper.py
index 1013c0b..8f9bc42 100644
--- a/GoT/model/lm_wrapper.py
+++ b/GoT/experiments/lm_wrapper.py
@@ -1,10 +1,10 @@
-from GoT.model.graph_model import call_graph
+from GoT.core.graph_model import call_graph
 from lm_eval.api.registry import register_model
 from lm_eval.api.model import LM
 
-from GoT.model.ollama_llm import LLM
+from GoT.core.llm import LLM
 from langchain_core.messages import HumanMessage
-from GoT.model.utils.utils import extract_output, normalize_number, parse_response
+from GoT.utils.utils import extract_output, normalize_number, parse_response
 
 
 class LangGraphLM:
diff --git a/GoT/model/utils/utils.py b/GoT/utils/utils.py
similarity index 99%
rename from GoT/model/utils/utils.py
rename to GoT/utils/utils.py
index 3ae103a..d156a87 100644
--- a/GoT/model/utils/utils.py
+++ b/GoT/utils/utils.py
@@ -5,7 +5,7 @@
 import numpy as np
 from sympy import simplify, sympify
 
-from GoT.model.runtime_graph import Response, Score
+from GoT.core.runtime_graph import Response, Score
 from langgraph.graph import MessagesState
 from langchain_core.messages import AIMessage
 

From fab5a9cb58b91fa1396fb191f13e619ff78a3253 Mon Sep 17 00:00:00 2001
From: Raggini Marco <marco.raggini2@studio.unibo.it>
Date: Sat, 25 Apr 2026 15:26:29 +0200
Subject: [PATCH 14/27] style: ruff format + ignores arxiv, wikipedia stubs

---
 GoT/agent_tools/web_tool.py     | 24 ++++++++++++------------
 GoT/cli/parse_args.py           |  9 +++++++--
 GoT/core/graph_model.py         | 20 +++++++++++---------
 GoT/core/llm.py                 |  2 +-
 GoT/experiments/hf_formatter.py | 15 ++++++++++-----
 GoT/utils/utils.py              |  1 +
 pyproject.toml                  |  2 +-
 7 files changed, 43 insertions(+), 30 deletions(-)

diff --git a/GoT/agent_tools/web_tool.py b/GoT/agent_tools/web_tool.py
index 5987bce..b56413d 100644
--- a/GoT/agent_tools/web_tool.py
+++ b/GoT/agent_tools/web_tool.py
@@ -2,6 +2,7 @@
 from langchain.tools import tool
 import wikipedia
 
+
 @tool
 def search_wikipedia(query: str) -> str:
     """
@@ -11,7 +12,7 @@ def search_wikipedia(query: str) -> str:
         query (str): The keyword or topic to search for.
 
     Returns:
-        str: A 3-sentence summary of the topic, the first option if 
+        str: A 3-sentence summary of the topic, the first option if
              ambiguous, or an error message if not found.
     """
     try:
@@ -21,20 +22,19 @@ def search_wikipedia(query: str) -> str:
         return wikipedia.summary(e.options[0], sentences=3)
     except wikipedia.PageError:
         return "Page not found"
-    
+
+
 @tool
 def search_arxiv(query: str) -> str:
-    """Search ArXiv for scientific papers on a given topic. 
+    """Search ArXiv for scientific papers on a given topic.
     Use this when you need to find research papers, abstracts or academic references."""
-    
+
     try:
         client = arxiv.Client()
         search = arxiv.Search(
-            query=query,
-            max_results=3,
-            sort_by=arxiv.SortCriterion.Relevance
+            query=query, max_results=3, sort_by=arxiv.SortCriterion.Relevance
         )
-        
+
         results = []
         for paper in client.results(search):
             results.append(
@@ -44,11 +44,11 @@ def search_arxiv(query: str) -> str:
                 f"Summary: {paper.summary[:300]}...\n"
                 f"URL: {paper.entry_id}\n"
             )
-        
+
         if not results:
             return "No papers found for this query."
-        
+
         return "\n---\n".join(results)
-    
+
     except Exception as e:
-        return f"Error searching ArXiv: {str(e)}"
\ No newline at end of file
+        return f"Error searching ArXiv: {str(e)}"
diff --git a/GoT/cli/parse_args.py b/GoT/cli/parse_args.py
index b7605b4..f52e05e 100644
--- a/GoT/cli/parse_args.py
+++ b/GoT/cli/parse_args.py
@@ -1,7 +1,12 @@
 import argparse
 import sys
 
-from GoT.experiments.hf_formatter import use_gaia, use_gpqa, use_gsm8k, use_hendrycks_math
+from GoT.experiments.hf_formatter import (
+    use_gaia,
+    use_gpqa,
+    use_gsm8k,
+    use_hendrycks_math,
+)
 
 
 def defining_and_parse_args():
@@ -39,7 +44,7 @@ def defining_and_parse_args():
             "intermediate_algebra",
             "number_theory",
             "precalculus",
-            "prealgebra"
+            "prealgebra",
         ],
         help="The type of math problems to run, only for hendrycks_math.",
     )
diff --git a/GoT/core/graph_model.py b/GoT/core/graph_model.py
index b1ab35b..c64e3b1 100644
--- a/GoT/core/graph_model.py
+++ b/GoT/core/graph_model.py
@@ -283,7 +283,9 @@ def tool_call(messages: MessagesState):
         )
     tool_used = extract_tool_used(res)
     runtime_graph.temp_response.response = parse_response_for_tool_node(res).response
-    runtime_graph.temp_response.explanation = parse_response_for_tool_node(res).explanation
+    runtime_graph.temp_response.explanation = parse_response_for_tool_node(
+        res
+    ).explanation
     parsed_res = f"Response: {parse_response_for_tool_node(res).response}\nExplanation: {parse_response_for_tool_node(res).explanation}"
     runtime_graph.resolve_node(call_node, parsed_res)
 
@@ -345,12 +347,13 @@ def crafting(messages: MessagesState):
             content="Use the context given to craft a tool to solve this problem using craft_tool. It must be a function"
         ),
     ]
-    try:    
+    try:
         craft_res = crafter_agent.invoke(
-            {"messages": crafting_messages}, config={"recursion_limit": MAX_INTERACTIONS}
+            {"messages": crafting_messages},
+            config={"recursion_limit": MAX_INTERACTIONS},
         )
         parsed_res = parse_response(craft_res)
-    except Exception:   
+    except Exception:
         parsed_res = ""
     # runtime_graph.temp_response.response = parse_response_for_tool_node(
     #     craft_res
@@ -380,7 +383,9 @@ def test_result(messages: MessagesState):
     test_node = runtime_graph.temp_node
     if not isinstance(test_node, TestNode):
         raise TypeError("Expected TestNode for scoring")
-    threshold = COMPLEXITY_THRESHOLD - COMPLEXITY_COEFFICIENT * test_node.problem_complexity
+    threshold = (
+        COMPLEXITY_THRESHOLD - COMPLEXITY_COEFFICIENT * test_node.problem_complexity
+    )
     if test_node.score >= threshold:
         runtime_graph.add_edge(test_node, runtime_graph.temp_response)
         runtime_graph.temp_response.resolved = True
@@ -391,10 +396,7 @@ def test_result(messages: MessagesState):
         and test_node.need_tool_crafting is True
     ):
         return "crafting"
-    elif (
-        test_node.score < threshold
-        and is_tool_path_available is True
-    ):
+    elif test_node.score < threshold and is_tool_path_available is True:
         if test_node.need_tool_crafting is True:
             test_node.response = "The problem is too complex to craft a new tool, try reason step by step or divide complexity."
         return "backtrack"
diff --git a/GoT/core/llm.py b/GoT/core/llm.py
index 29d4513..f76b50f 100644
--- a/GoT/core/llm.py
+++ b/GoT/core/llm.py
@@ -71,7 +71,7 @@ def __init__(self):
                 "remote_standard": self.remoteLLMStandard,
                 "remote_response_format": self.remoteLLMResponseFormat,
                 "remote_score_format": self.remoteLLMScoreFormat,
-                "remote_crafter": self.remoteLLMCrafter
+                "remote_crafter": self.remoteLLMCrafter,
             }
 
             self.system_prompt = SystemMessage(SYSTEM_PROMPT_GENERAL)
diff --git a/GoT/experiments/hf_formatter.py b/GoT/experiments/hf_formatter.py
index f7127a4..6e2ebc8 100644
--- a/GoT/experiments/hf_formatter.py
+++ b/GoT/experiments/hf_formatter.py
@@ -18,6 +18,7 @@
 
 TOKEN = os.getenv("HF_TOKEN")
 
+
 class ResultEval:
     def __init__(
         self,
@@ -207,6 +208,7 @@ def hendrycks_math_eval(responses: list[ResultEval]):
     print(f"Total: {len(responses)}")
     print(f"Correct: {correct}")
 
+
 def benchmark_run(
     questions: list[ResultEval], max_run: int, test: bool
 ) -> list[ResultEval]:
@@ -261,11 +263,11 @@ def gaia_format(dataset: Dataset) -> list[ResultEval]:
         attachment = sample.get("file_name", None)
         if attachment:
             abs_path = hf_hub_download(
-                    repo_id="gaia-benchmark/GAIA",
-                    filename=f"2023/validation/{attachment}",
-                    repo_type="dataset",
-                    token=TOKEN
-                )
+                repo_id="gaia-benchmark/GAIA",
+                filename=f"2023/validation/{attachment}",
+                repo_type="dataset",
+                token=TOKEN,
+            )
             print(abs_path)
             question += f"\nAttachment file path: {abs_path}"
         correct_answer = sample["Final answer"]
@@ -283,6 +285,7 @@ def gaia_format(dataset: Dataset) -> list[ResultEval]:
 
     return questions
 
+
 def gaia_eval(responses: list[ResultEval]):
     correct = 0
 
@@ -300,6 +303,7 @@ def gaia_eval(responses: list[ResultEval]):
     print(f"Total: {len(responses)}")
     print(f"Correct: {correct}")
 
+
 def use_gpqa(max_run: int, test: bool, model_name: str):
     ds = load_dataset("Idavidrein/gpqa", "gpqa_diamond", split="train")
     questions = gpqa_format(ds)
@@ -323,6 +327,7 @@ def use_hendrycks_math(max_run: int, test: bool, model_name: str, type: str):
     hendrycks_math_eval(responses)
     save_eval_results(responses, model_name=model_name)
 
+
 def use_gaia(max_run: int, test: bool, model_name: str):
     ds = load_dataset("gaia-benchmark/GAIA", "2023_level1", split="validation")
     questions = gaia_format(ds)
diff --git a/GoT/utils/utils.py b/GoT/utils/utils.py
index d156a87..1b936b4 100644
--- a/GoT/utils/utils.py
+++ b/GoT/utils/utils.py
@@ -229,6 +229,7 @@ def print_benchmark_result_loglikehood(
     print(f"Correct: {n_correct}")
     print(f"Wrong: {n_wrong}")
 
+
 def download_mlflow_traces(n_max: int):
     traces = mlflow.search_traces(max_results=n_max, order_by=["timestamp DESC"])
     traces.to_csv("traces.csv", index=False)
diff --git a/pyproject.toml b/pyproject.toml
index 20e60bf..27f02e5 100644
--- a/pyproject.toml
+++ b/pyproject.toml
@@ -51,5 +51,5 @@ requires = ["poetry-core"]
 build-backend = "poetry.core.masonry.api"
 
 [[tool.mypy.overrides]]
-module = ["lm_eval.*", "datasets.*", "sympy.*"]
+module = ["lm_eval.*", "datasets.*", "sympy.*", "arxiv.*", "wikipedia.*"]
 follow_untyped_imports = true

From 345ce80b33533f88ecf79d90b2e7d24a9d254f89 Mon Sep 17 00:00:00 2001
From: Raggini Marco <marco.raggini2@studio.unibo.it>
Date: Thu, 7 May 2026 15:36:26 +0200
Subject: [PATCH 15/27] chore: improve evaluation

---
 GoT/experiments/hf_formatter.py |  4 ++--
 GoT/utils/utils.py              | 42 +++++++++++++++++++++++++++++++--
 2 files changed, 42 insertions(+), 4 deletions(-)

diff --git a/GoT/experiments/hf_formatter.py b/GoT/experiments/hf_formatter.py
index 6e2ebc8..42c333b 100644
--- a/GoT/experiments/hf_formatter.py
+++ b/GoT/experiments/hf_formatter.py
@@ -10,6 +10,7 @@
 from GoT.core.graph_model import call_graph
 from GoT.core.llm import LLM
 from GoT.utils.utils import (
+    extract_answer_from_response,
     extract_output,
     normalize_list,
     normalize_number,
@@ -190,8 +191,7 @@ def hendrycks_math_eval(responses: list[ResultEval]):
     correct = 0
 
     for res in responses:
-        opt_res = re.search(r"\\boxed\{(.*)\}", res.response)
-        norm_res = opt_res.group(1) if opt_res else "N/A"
+        norm_res = extract_answer_from_response(res.response)
         norm_correct = normalize_number(res.correct_answer)
         res.filtered_answer = norm_res
 
diff --git a/GoT/utils/utils.py b/GoT/utils/utils.py
index 1b936b4..c0714cd 100644
--- a/GoT/utils/utils.py
+++ b/GoT/utils/utils.py
@@ -83,8 +83,14 @@ def parse_response_for_tool_node(response: MessagesState) -> Response:
     if isinstance(structured_response, Response):
         return structured_response
     elif score_res is not None:
-        data = json.loads(score_res)
-        return Response.model_validate(data)
+        try:
+            data = json.loads(score_res)
+            return Response.model_validate(data)
+        except json.JSONDecodeError:
+            return Response(
+                response=score_res,
+                explanation="",
+            )
     else:
         return Response(
             response="Failed to parse response",
@@ -180,6 +186,38 @@ def symbolic_equal(a, b):
     except Exception:
         return False
 
+def extract_answer_from_response(response: str) -> str:
+    """
+    Extract the final answer from an LLM response.
+
+    Patterns are tried in order: ``\\boxed{...}``, then a bare ``oxed{...}``,
+    then ``Answer: ...``, then a response that is nothing but a number. Braces
+    nested inside a box are balanced, so ``\\boxed{\\frac{1}{2}}`` is kept whole.
+
+    :param response: The LLM response
+    :type response: str
+    :return: The extracted answer, or "N/A" when no pattern matches
+    :rtype: str
+    """
+    for opener in (r"\\boxed\{", r"oxed\{"):
+        for found in re.finditer(opener, response):
+            depth = 1
+            # Walk to the brace that closes the box, skipping nested pairs.
+            for pos in range(found.end(), len(response)):
+                depth += {"{": 1, "}": -1}.get(response[pos], 0)
+                if depth == 0:
+                    return response[found.end() : pos].strip()
+
+    answer_match = re.search(r"Answer:\s*(.*)", response)
+    if answer_match:
+        return answer_match.group(1).strip()
+
+    try:
+        float(response.strip())  # parse only; "0" is a valid answer too
+    except ValueError:
+        return "N/A"
+    return response.strip()
+
 
 def print_benchmark_result(results: dict, task_name: str, filter: str) -> None:
     samples = results["samples"][task_name]

From 2b4f6171715bd334781ddc67e95bd7f062cc31c0 Mon Sep 17 00:00:00 2001
From: Raggini Marco <marco.raggini2@studio.unibo.it>
Date: Fri, 8 May 2026 11:01:05 +0200
Subject: [PATCH 16/27] chore: code refinement

---
 GoT/core/graph_model.py   |  3 +--
 GoT/core/runtime_graph.py | 28 +++-------------------------
 2 files changed, 4 insertions(+), 27 deletions(-)

diff --git a/GoT/core/graph_model.py b/GoT/core/graph_model.py
index c64e3b1..9fc8460 100644
--- a/GoT/core/graph_model.py
+++ b/GoT/core/graph_model.py
@@ -168,6 +168,7 @@ def open_excel_files(excel_path: str)
         Rules:
         - Prefer generic names and parameters, never craft specific functions.
         - If the function contains specific numbers or values, it is wrong.
+        - The Tool must follow the Json schema protocol, Tuple is banned.
         - Never return hardcoded or placeholder strings, the function must fetch real data.
         - Craft a maximum of 3 tools, it must contains always the docs. If the number of tool crafted exceed, you fail.
         - Never craft tool that raise exceptions.
@@ -205,7 +206,6 @@ def goal(prompt: MessagesState):
         call_node = ToolNode(
             "Please, resolve the problem with the tools given, you MUST follow the previous reasoning.",
             "",
-            tool_name="",
         )
         reasoning_node = ReasoningNode("")
         runtime_graph.add_node(reasoning_node)
@@ -454,7 +454,6 @@ def backtrack(messages: MessagesState):
     runtime_graph.add_edge(
         backtrack_node, runtime_graph.temp_node
     )  # tool call node that we want to resolve
-    # messages = runtime_graph.append_prompt_to_messages_state(runtime_graph.temp_node)
     messages.get("messages", []).append(AIMessage(backtrack_node.feedback))
     return messages
 
diff --git a/GoT/core/runtime_graph.py b/GoT/core/runtime_graph.py
index 6047e23..56225fe 100644
--- a/GoT/core/runtime_graph.py
+++ b/GoT/core/runtime_graph.py
@@ -1,17 +1,16 @@
 from typing import Dict, List
 from langgraph.graph import MessagesState
-from langchain_core.messages import AnyMessage, HumanMessage
 from pydantic import BaseModel, Field
 
 
 class RuntimeNode:
-    _id_counter = 0  # Contatore globale per ID unici
+    _id_counter = 0  # global ID counter
 
     def __init__(
         self,
         resolved: bool = False,
     ):
-        self.id = RuntimeNode._id_counter  # ID unico per ogni nodo
+        self.id = RuntimeNode._id_counter
         RuntimeNode._id_counter += 1
         self.resolved = resolved
 
@@ -52,13 +51,11 @@ def __init__(
         self,
         prompt: str,
         response: str,
-        tool_name: str,
         resolved: bool = False,
     ):
         super().__init__(resolved)
         self.prompt = prompt
         self.response = response
-        self.tool_name = tool_name
 
 
 class GoalNode(RuntimeNode):
@@ -170,7 +167,6 @@ class RuntimeGraph:
     def __init__(self):
         self.goal: MessagesState = MessagesState(messages=[])
         self.nodes: Dict[RuntimeNode, List[RuntimeNode]] = {}
-        self.tools_available: Dict[RuntimeNode, str] = {}
         self.temp_node: RuntimeNode = RuntimeNode()
         self.temp_response: ResponseNode = ResponseNode(response="", resolved=False)
 
@@ -181,9 +177,6 @@ def add_edge(self, n1: RuntimeNode, n2: RuntimeNode):
         self.nodes.setdefault(n1, []).append(n2)
         self.nodes.setdefault(n2, [])
 
-    def add_tool_link(self, call_node: RuntimeNode, tool_name: str):
-        self.tools_available.setdefault(call_node, tool_name)
-
     def resolve_node(self, node: RuntimeNode, response: str) -> None:
         if isinstance(node, (ToolNode, TestNode, CompletitionNode)):
             node.response = response
@@ -208,31 +201,16 @@ def exist_tool_available(self) -> bool:
         call_nodes = [n for n in nodes if (isinstance(n, ToolNode) and not n.resolved)]
         return True if call_nodes else False
 
-    def get_resolved_tools(self):
-        resolved_nodes = [t for t in self.tools_available.keys() if t.resolved is True]
-        return [self.tools_available[n] for n in resolved_nodes]
-
-    def is_craftin_node_resolved(self) -> bool:
+    def is_crafting_node_resolved(self) -> bool:
         nodes = list(self.nodes.keys())
         crafting_nodes = [
             n for n in nodes if (isinstance(n, CraftingNode) and n.resolved)
         ]
         return True if crafting_nodes else False
 
-    def append_prompt_to_messages_state(
-        self, node: TestNode | ToolNode | CompletitionNode | GoalNode
-    ) -> MessagesState:
-        messages: list[AnyMessage] = []
-
-        if node.prompt:
-            messages.append(HumanMessage(content=node.prompt))
-
-        return MessagesState(messages=messages)
-
     def clear(self):
         RuntimeNode._id_counter = 0
         self.nodes = {}
-        self.tools_available = {}
         self.temp_node = RuntimeNode()
         self.temp_response = ResponseNode(response="", resolved=False)
 

From 47ef8b4a50dd5072bf7b145b35f6ed29e566caa1 Mon Sep 17 00:00:00 2001
From: Raggini Marco <marco.raggini2@studio.unibo.it>
Date: Thu, 14 May 2026 14:46:39 +0200
Subject: [PATCH 17/27] feat: add custom runs via CLI args

---
 GoT/__init__.py                  |  7 +------
 GoT/cli/parse_args.py            | 11 ++++++++++-
 GoT/experiments/runner_custom.py | 15 +++++++++++++++
 3 files changed, 26 insertions(+), 7 deletions(-)
 create mode 100644 GoT/experiments/runner_custom.py

diff --git a/GoT/__init__.py b/GoT/__init__.py
index a5df598..8a62a54 100644
--- a/GoT/__init__.py
+++ b/GoT/__init__.py
@@ -3,7 +3,6 @@
 from dotenv import load_dotenv
 
 from lm_eval import evaluator, tasks
-from GoT.core.graph_model import call_graph
 from GoT.experiments.lm_wrapper import LangGraphBigBenchWrapper, TestBigBenchWrapper
 from GoT.cli.parse_args import call_benchmark, defining_and_parse_args
 from GoT.utils.utils import (
@@ -62,14 +61,10 @@ def lm_eval_graph_benchmark():
     print_benchmark_result_loglikehood(results, task_name, filter_val="none")
 
 
-def custom_test():
-    call_graph("Solve this integral ∫x2⋅ex2dx")
-
-
 def main():
-    # It could be changed with custom_test() to test a custom problem instead of the benchmark
     args = defining_and_parse_args()
     call_benchmark(args)
+    # download_mlflow_traces(50)
 
 
 # let this be the last line of this file
diff --git a/GoT/cli/parse_args.py b/GoT/cli/parse_args.py
index f52e05e..635b67f 100644
--- a/GoT/cli/parse_args.py
+++ b/GoT/cli/parse_args.py
@@ -7,6 +7,7 @@
     use_gsm8k,
     use_hendrycks_math,
 )
+from GoT.experiments.runner_custom import custom_test
 
 
 def defining_and_parse_args():
@@ -17,7 +18,7 @@ def defining_and_parse_args():
         "--benchmark",
         required=True,
         type=str,
-        choices=["gsm8k", "gpqa", "hendrycks_math", "gaia"],
+        choices=["gsm8k", "gpqa", "hendrycks_math", "gaia", "custom"],
         help="The benchmark to run the model on.",
     )
     parser.add_argument(
@@ -27,6 +28,12 @@ def defining_and_parse_args():
         choices=["graph", "standard"],
         help="Whether to run the standard model or the graph model.",
     )
+    parser.add_argument(
+        "--prompt",
+        type=str,
+        default="",
+        help="Insert a prompt during a custom run."
+    )
     parser.add_argument(
         "--max_run",
         type=int,
@@ -69,3 +76,5 @@ def call_benchmark(args):
         use_hendrycks_math(max_run=max_run, test=test, model_name=mode, type=args.type)
     elif args.benchmark == "gaia":
         use_gaia(max_run=max_run, test=test, model_name=mode)
+    elif args.benchmark == "custom" and args.prompt != "":
+        custom_test(args.prompt, test)
diff --git a/GoT/experiments/runner_custom.py b/GoT/experiments/runner_custom.py
new file mode 100644
index 0000000..5db80a9
--- /dev/null
+++ b/GoT/experiments/runner_custom.py
@@ -0,0 +1,15 @@
+from langchain.messages import HumanMessage
+
+from GoT.core.graph_model import call_graph
+from GoT.core.llm import LLM
+
+
+def custom_test(text: str, is_graph_mode: bool):
+    if not is_graph_mode:
+        call_graph(text)
+    else:
+        agent = LLM().create_custom_agent(LLM().get_tools())
+        agent.invoke(
+            {"messages": [HumanMessage(content=text)]},
+            config={"recursion_limit": 20},
+        )
\ No newline at end of file

From 89e468de056c8cfc899e5de1990ca998f6d5d52c Mon Sep 17 00:00:00 2001
From: Raggini Marco <marco.raggini2@studio.unibo.it>
Date: Fri, 15 May 2026 14:38:20 +0200
Subject: [PATCH 18/27] chore: rename argument 'type' to 'category'

---
 GoT/cli/parse_args.py | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/GoT/cli/parse_args.py b/GoT/cli/parse_args.py
index 635b67f..2cd4275 100644
--- a/GoT/cli/parse_args.py
+++ b/GoT/cli/parse_args.py
@@ -41,7 +41,7 @@ def defining_and_parse_args():
         help="The maximum number of runs for the benchmark.",
     )
     parser.add_argument(
-        "--type",
+        "--category",
         type=str,
         default="algebra",
         choices=[
@@ -73,7 +73,7 @@ def call_benchmark(args):
     elif args.benchmark == "gpqa":
         use_gpqa(max_run=max_run, test=test, model_name=mode)
     elif args.benchmark == "hendrycks_math":
-        use_hendrycks_math(max_run=max_run, test=test, model_name=mode, type=args.type)
+        use_hendrycks_math(max_run=max_run, test=test, model_name=mode, type=args.category)
     elif args.benchmark == "gaia":
         use_gaia(max_run=max_run, test=test, model_name=mode)
     elif args.benchmark == "custom" and args.prompt != "":

From d2f886fbf1d629ce9effd7ec5dc0e441bfd89456 Mon Sep 17 00:00:00 2001
From: Raggini Marco <marco.raggini2@studio.unibo.it>
Date: Wed, 20 May 2026 10:31:46 +0200
Subject: [PATCH 19/27] chore: store crafted tools in each CraftingNode

---
 GoT/core/graph_model.py   | 21 +++++++--------------
 GoT/core/runtime_graph.py |  4 ++--
 2 files changed, 9 insertions(+), 16 deletions(-)

diff --git a/GoT/core/graph_model.py b/GoT/core/graph_model.py
index 9fc8460..4b1a76b 100644
--- a/GoT/core/graph_model.py
+++ b/GoT/core/graph_model.py
@@ -17,6 +17,7 @@
 )
 from GoT.utils.utils import (
     extract_tool_used,
+    extract_tools_crafted,
     parse_response,
     parse_response_for_tool_node,
     parse_score,
@@ -54,16 +55,16 @@
         Your duty is to score, from 0 to 5, the response that user gives and assign a score.
 
         Rules:
+        - If a response suggest the need of crafting a tool, score it with 1 or less and specify clearly the need of a new tool to solve the problem.
         - You MUST respond ONLY using the Score function.
         - You must consider if the format of the answer follow the instruction
         - You cannot give the full solution, only hints.
-        - If a response suggest the need of crafting a tool, score it with 1 or less and specify clearly the need of a new tool to solve the problem.
         - Do not write natural language outside the function.
         - Always consider creating a tool if it makes the response correct or reusable.
 
         Score meanings:
         0: Impossible to understand / completely wrong
-        1: Nearly completely wrong
+        1: Nearly completely wrong / need to craft a tool
         2: Correct language but does not follow instruction
         3: Tries to solve but fails instruction / wrong
         4: Follows instruction but result wrong or incomplete
@@ -203,11 +204,11 @@ def goal(prompt: MessagesState):
     runtime_graph.add_node(goal_node)
     runtime_graph.temp_node = goal_node
     for i in range(0, 3):
+        reasoning_node = ReasoningNode("")
         call_node = ToolNode(
             "Please, resolve the problem with the tools given, you MUST follow the previous reasoning.",
             "",
         )
-        reasoning_node = ReasoningNode("")
         runtime_graph.add_node(reasoning_node)
         runtime_graph.add_node(call_node)
         runtime_graph.add_edge(goal_node, reasoning_node)
@@ -242,7 +243,7 @@ def tool_call(messages: MessagesState):
     # It calls the llm and it resolves the call node
     call_node = runtime_graph.temp_node
     tool_agent = LLM().create_custom_agent(
-        LLM().get_tools() + [divide_thought],
+        LLM().get_tools(),
         SystemMessage(
             "You are an assistant specialized in tools. Your goal is to resolve the problem with "
             " the tool that the user indicates to you. You HAVE to use or craft the tool that the assistant indicates to you."
@@ -335,7 +336,7 @@ def response_evaluation(messages: MessagesState):
 
 
 def crafting(messages: MessagesState):
-    crafting_node = CraftingNode(response="", tool_crafted="", resolved=False)
+    crafting_node = CraftingNode(response="", tools_crafted="", resolved=False)
     runtime_graph.add_node(crafting_node)
     runtime_graph.add_edge(runtime_graph.temp_node, crafting_node)
     runtime_graph.temp_node = crafting_node
@@ -352,12 +353,10 @@ def crafting(messages: MessagesState):
             {"messages": crafting_messages},
             config={"recursion_limit": MAX_INTERACTIONS},
         )
+        crafting_node.tools_crafted = extract_tools_crafted(craft_res)
         parsed_res = parse_response(craft_res)
     except Exception:
         parsed_res = ""
-    # runtime_graph.temp_response.response = parse_response_for_tool_node(
-    #     craft_res
-    # ).response
     runtime_graph.resolve_node(crafting_node, parsed_res)
     runtime_graph.temp_node = runtime_graph.call_tool_node()
     runtime_graph.add_edge(crafting_node, runtime_graph.temp_node)
@@ -390,12 +389,6 @@ def test_result(messages: MessagesState):
         runtime_graph.add_edge(test_node, runtime_graph.temp_response)
         runtime_graph.temp_response.resolved = True
         return END
-    elif (
-        test_node.score < threshold
-        and is_tool_path_available is True
-        and test_node.need_tool_crafting is True
-    ):
-        return "crafting"
     elif test_node.score < threshold and is_tool_path_available is True:
         if test_node.need_tool_crafting is True:
             test_node.response = "The problem is too complex to craft a new tool, try reason step by step or divide complexity."
diff --git a/GoT/core/runtime_graph.py b/GoT/core/runtime_graph.py
index 56225fe..6661e15 100644
--- a/GoT/core/runtime_graph.py
+++ b/GoT/core/runtime_graph.py
@@ -116,12 +116,12 @@ class CraftingNode(RuntimeNode):
     def __init__(
         self,
         response: str,
-        tool_crafted: str = "",
+        tools_crafted: list[str] | None = None,
         resolved: bool = False,
     ):
         super().__init__(resolved)
         self.response = response
-        self.tool_crafted = tool_crafted
+        self.tools_crafted = [] if tools_crafted is None else tools_crafted  # fresh list per node, never a shared default
 
 
 class Score(BaseModel):

From 2558b007b45aebd697ac5767291e8d8c96d86312 Mon Sep 17 00:00:00 2001
From: Raggini Marco <marco.raggini2@studio.unibo.it>
Date: Wed, 20 May 2026 10:32:25 +0200
Subject: [PATCH 20/27] chore: improve parsing

---
 GoT/experiments/hf_formatter.py |  3 +-
 GoT/utils/utils.py              | 71 ++++++++++++++++++++++++---------
 2 files changed, 54 insertions(+), 20 deletions(-)

diff --git a/GoT/experiments/hf_formatter.py b/GoT/experiments/hf_formatter.py
index 42c333b..a3973c9 100644
--- a/GoT/experiments/hf_formatter.py
+++ b/GoT/experiments/hf_formatter.py
@@ -14,6 +14,7 @@
     extract_output,
     normalize_list,
     normalize_number,
+    parse_response,
     symbolic_equal,
 )
 
@@ -222,7 +223,7 @@ def benchmark_run(
         correct_answer = q.correct_answer
         try:
             if test:
-                response = extract_output(
+                response = parse_response(
                     agent.invoke(
                         {"messages": [HumanMessage(content=prompt)]},
                         config={"recursion_limit": 20},
diff --git a/GoT/utils/utils.py b/GoT/utils/utils.py
index c0714cd..fd88910 100644
--- a/GoT/utils/utils.py
+++ b/GoT/utils/utils.py
@@ -17,7 +17,10 @@ def parse_response(res) -> str:
     :param res: the MessagesState
     :return: The response in string
     """
-    return res["messages"][-1].content
+    response = res["messages"][-1].text
+    if response is None or response == "":
+        response = res["messages"][-1].content
+    return response
 
 
 def parse_tool_list(response: str) -> list[str]:
@@ -114,6 +117,45 @@ def extract_tool_used(response: MessagesState) -> list[str]:
                 tools_used.append(tool_call["name"])
     return tools_used
 
+def extract_function_signature(tool_crafted: dict) -> str:
+    """
+    Extract name and arguments from function string.
+    """
+    func_str = tool_crafted.get("tool_function", "")
+    match = re.search(r"def (\w+)\(([^)]*)\)", func_str)
+    if not match:
+        return ""
+    
+    func_name = match.group(1)
+    args = match.group(2)
+    
+    clean_args = ", ".join(
+        arg.split("=")[0].strip()
+        for arg in args.split(",")
+        if arg.strip() and arg.strip() != "self"
+        )
+    
+    return f"{func_name}({clean_args})"
+
+def extract_tools_crafted(response: MessagesState) -> list[str]:
+    """
+    Extract the tools that LLM has crafted.
+
+    :param response: The LLM response
+    :type response: MessagesState
+    :return: The list of tools crafted
+    :rtype: list[str]
+    """
+    tools_crafted = []
+    for msg in response.get("messages", []):
+        if isinstance(msg, AIMessage):
+            for tool_call in msg.tool_calls:
+                tool_crafted = tool_call["args"]
+                signature = extract_function_signature(tool_crafted)
+                if signature != '':
+                    tools_crafted.append(signature)
+    return tools_crafted
+
 
 def remove_tools_from_list(tool_list, tools_to_remove):
     """
@@ -187,36 +229,27 @@ def symbolic_equal(a, b):
         return False
 
 def extract_answer_from_response(response: str) -> str:
-    """
-    Extract the answer from the LLM response. 
-
-    :param response: The LLM response
-    :type response: str
-    :return: The extracted answer
-    :rtype: str
-    """
-    # Try to extract using \\boxed{answer}
-    boxed_match = re.search(r"\\boxed\{([^}]*)\}", response)
+    boxed_match = re.search(r"\\boxed\{(.*)\}", response)
     if boxed_match:
         return boxed_match.group(1).strip()
     
-    # Try to extract using boxed{answer}
-    boxed_match_alt = re.search(r"oxed\{([^}]*)\}", response)
+    boxed_match_alt = re.search(r"boxed\{(.*)\}", response)
     if boxed_match_alt:
         return boxed_match_alt.group(1).strip()
 
-    # Try to extract using Answer: answer
-    answer_match = re.search(r"Answer:\s*(.*)", response)
+    answer_match = re.search(r"Answer:\s*(.*)", response, re.IGNORECASE)
     if answer_match:
         return answer_match.group(1).strip()
     
+    clean_response = response.strip()
+    if not clean_response:
+        return "N/A"
+        
     try:
-        if float(response.strip()):
-            return response.strip()
+        float(clean_response)
+        return clean_response
     except ValueError:
         return "N/A"
-    # If no pattern matched, return the original response
-    return "N/A"
 
 
 def print_benchmark_result(results: dict, task_name: str, filter: str) -> None:

From 0468e3ee1cc319e7caf66d0123ccb86e0c0ada5d Mon Sep 17 00:00:00 2001
From: Raggini Marco <marco.raggini2@studio.unibo.it>
Date: Wed, 20 May 2026 10:59:04 +0200
Subject: [PATCH 21/27] style: change var names

---
 GoT/core/llm.py | 12 ++++++------
 1 file changed, 6 insertions(+), 6 deletions(-)

diff --git a/GoT/core/llm.py b/GoT/core/llm.py
index f76b50f..6ae2014 100644
--- a/GoT/core/llm.py
+++ b/GoT/core/llm.py
@@ -51,26 +51,26 @@ def __init__(self):
                 api_key=os.environ.get("GEMINI_API_KEY"),
                 temperature=1.0,  # Gemini 3.0+ defaults to 1.0
             )
-            self.remoteLLMResponseFormat = ChatGoogleGenerativeAI(
+            self.remoteLLMReasoning = ChatGoogleGenerativeAI(
                 model="gemini-2.5-flash",
                 api_key=os.environ.get("GEMINI_API_KEY"),
                 temperature=1.0,  # Gemini 3.0+ defaults to 1.0
             )
             self.remoteLLMCrafter = ChatGoogleGenerativeAI(
-                model="gemini-3-flash-preview",
+                model="gemini-2.5-flash",
                 api_key=os.environ.get("GEMINI_API_KEY"),
                 temperature=1.0,  # Gemini 3.0+ defaults to 1.0
             )
-            self.remoteLLMScoreFormat = ChatGoogleGenerativeAI(
+            self.remoteLLMEvaluator = ChatGoogleGenerativeAI(
                 model="gemini-2.5-flash",
                 api_key=os.environ.get("GEMINI_API_KEY"),
-                temperature=0.7,  # Gemini 3.0+ defaults to 1.0
+                temperature=1.0,  # Gemini 3.0+ defaults to 1.0
             )
 
             self.remoteLLMs = {
                 "remote_standard": self.remoteLLMStandard,
-                "remote_response_format": self.remoteLLMResponseFormat,
-                "remote_score_format": self.remoteLLMScoreFormat,
+                "remote_response_format": self.remoteLLMReasoning,
+                "remote_score_format": self.remoteLLMEvaluator,
                 "remote_crafter": self.remoteLLMCrafter,
             }
 

From ac7289b8cc5b6134953c8dbeeb2c40d67e06dba0 Mon Sep 17 00:00:00 2001
From: Raggini Marco <marco.raggini2@studio.unibo.it>
Date: Wed, 20 May 2026 10:59:50 +0200
Subject: [PATCH 22/27] chore: comment out pypi release

---
 .github/workflows/deploy.yml |  2 +-
 release.config.mjs           | 22 +++++++++++-----------
 2 files changed, 12 insertions(+), 12 deletions(-)

diff --git a/.github/workflows/deploy.yml b/.github/workflows/deploy.yml
index 555263c..0bfbde1 100644
--- a/.github/workflows/deploy.yml
+++ b/.github/workflows/deploy.yml
@@ -62,6 +62,6 @@ jobs:
         env:
           # PYPI_TOKEN: ${{ secrets.PYPI_TOKEN }}
           GITHUB_TOKEN: ${{ secrets.RELEASE_TOKEN }}
-          RELEASE_TEST_PYPI: ${{ github.event.repository.is_template || contains(github.repository, 'template') }}
+          # RELEASE_TEST_PYPI: ${{ github.event.repository.is_template || contains(github.repository, 'template') }}
           # dry run if not on main/master branch, or if initial commit
           RELEASE_DRY_RUN: ${{ steps.skip_release.outputs.skip == 'true' || (github.ref_name != 'master' && github.ref_name != 'main') }}
diff --git a/release.config.mjs b/release.config.mjs
index 2501f2f..2a1bc9b 100644
--- a/release.config.mjs
+++ b/release.config.mjs
@@ -1,25 +1,25 @@
 let dryRun = (process.env.RELEASE_DRY_RUN || "false").toLowerCase() === "true";
-let testPypi = (process.env.RELEASE_TEST_PYPI || "false").toLowerCase() === "true";
-const pypiToken = process.env.PYPI_TOKEN;
+// let testPypi = (process.env.RELEASE_TEST_PYPI || "false").toLowerCase() === "true";
+// const pypiToken = process.env.PYPI_TOKEN;
 
 let prepareCmd = "poetry version -- \${nextRelease.version}" + ` && poetry config pypi-token.pypi ${pypiToken}`;
-let publishCmd = `poetry publish --build`;
+// let publishCmd = `poetry publish --build`;
 
-if (testPypi) {
-    publishCmd += ` --repository testpypi`;
-    prepareCmd = prepareCmd.replace("pypi-token.pypi", "pypi-token.testpypi");
-}
+// if (testPypi) {
+//     publishCmd += ` --repository testpypi`;
+//     prepareCmd = prepareCmd.replace("pypi-token.pypi", "pypi-token.testpypi");
+// }
 
-if (dryRun) {
-    publishCmd += " --dry-run";
-}
+// if (dryRun) {
+//     publishCmd += " --dry-run";
+// }
 
 import config from 'semantic-release-preconfigured-conventional-commits' with {type: 'json'};
 
 config.plugins.push(
     ["@semantic-release/exec", {
         "prepareCmd" : prepareCmd,
-        "publishCmd": publishCmd,
+        // "publishCmd": publishCmd,
     }]
 )
 

From b787ac37e5baa33e0d94fe6cbcd882da7bcafef3 Mon Sep 17 00:00:00 2001
From: Raggini Marco <marco.raggini2@studio.unibo.it>
Date: Wed, 20 May 2026 11:04:38 +0200
Subject: [PATCH 23/27] fix: restore accidentally deleted crafting condition

---
 GoT/core/graph_model.py | 6 ++++++
 1 file changed, 6 insertions(+)

diff --git a/GoT/core/graph_model.py b/GoT/core/graph_model.py
index 4b1a76b..194d551 100644
--- a/GoT/core/graph_model.py
+++ b/GoT/core/graph_model.py
@@ -389,6 +389,12 @@ def test_result(messages: MessagesState):
         runtime_graph.add_edge(test_node, runtime_graph.temp_response)
         runtime_graph.temp_response.resolved = True
         return END
+    elif (
+        test_node.score < threshold
+        and is_tool_path_available is True
+        and test_node.need_tool_crafting is True
+    ):
+        return "crafting"
     elif test_node.score < threshold and is_tool_path_available is True:
         if test_node.need_tool_crafting is True:
             test_node.response = "The problem is too complex to craft a new tool, try reason step by step or divide complexity."

From 868b3d07a2c339ad819dc8b679ffeee14863167d Mon Sep 17 00:00:00 2001
From: Raggini Marco <marco.raggini2@studio.unibo.it>
Date: Wed, 20 May 2026 17:49:12 +0200
Subject: [PATCH 24/27] chore: add docstring control

---
 GoT/agent_tools/craft_tool.py | 7 +++++++
 1 file changed, 7 insertions(+)

diff --git a/GoT/agent_tools/craft_tool.py b/GoT/agent_tools/craft_tool.py
index dc1d2d1..d2569b4 100644
--- a/GoT/agent_tools/craft_tool.py
+++ b/GoT/agent_tools/craft_tool.py
@@ -83,6 +83,13 @@ def sanitize_input(query: str) -> str:
 
     func = functions[0]
 
+    try:
+        docstring = ast.get_docstring(func)
+        if not docstring or not docstring.strip():
+            return "Error: missing docstring. A description of the function is mandatory for Gemini tools."
+    except TypeError:
+        return "Error: missing docstring. A description of the function is mandatory for Gemini tools."
+
     for arg in func.args.args:
         if arg.annotation is None:
             return f"Error: missing type annotation for '{arg.arg}'"

From b417846230ca4845db12ab4fac65293c76008231 Mon Sep 17 00:00:00 2001
From: Raggini Marco <marco.raggini2@studio.unibo.it>
Date: Fri, 22 May 2026 13:44:02 +0200
Subject: [PATCH 25/27] style: ruff format

---
 GoT/cli/parse_args.py            |  9 ++++-----
 GoT/experiments/runner_custom.py |  2 +-
 GoT/utils/utils.py               | 19 +++++++++++--------
 3 files changed, 16 insertions(+), 14 deletions(-)

diff --git a/GoT/cli/parse_args.py b/GoT/cli/parse_args.py
index 2cd4275..8169319 100644
--- a/GoT/cli/parse_args.py
+++ b/GoT/cli/parse_args.py
@@ -29,10 +29,7 @@ def defining_and_parse_args():
         help="Whether to run the standard model or the graph model.",
     )
     parser.add_argument(
-        "--prompt",
-        type=str,
-        default="",
-        help="Insert a prompt during a custom run."
+        "--prompt", type=str, default="", help="Insert a prompt during a custom run."
     )
     parser.add_argument(
         "--max_run",
@@ -73,7 +70,9 @@ def call_benchmark(args):
     elif args.benchmark == "gpqa":
         use_gpqa(max_run=max_run, test=test, model_name=mode)
     elif args.benchmark == "hendrycks_math":
-        use_hendrycks_math(max_run=max_run, test=test, model_name=mode, type=args.category)
+        use_hendrycks_math(
+            max_run=max_run, test=test, model_name=mode, type=args.category
+        )
     elif args.benchmark == "gaia":
         use_gaia(max_run=max_run, test=test, model_name=mode)
     elif args.benchmark == "custom" and args.prompt != "":
diff --git a/GoT/experiments/runner_custom.py b/GoT/experiments/runner_custom.py
index 5db80a9..ab9ec3e 100644
--- a/GoT/experiments/runner_custom.py
+++ b/GoT/experiments/runner_custom.py
@@ -12,4 +12,4 @@ def custom_test(text: str, is_graph_mode: bool):
         agent.invoke(
             {"messages": [HumanMessage(content=text)]},
             config={"recursion_limit": 20},
-        )
\ No newline at end of file
+        )
diff --git a/GoT/utils/utils.py b/GoT/utils/utils.py
index fd88910..37f8e64 100644
--- a/GoT/utils/utils.py
+++ b/GoT/utils/utils.py
@@ -117,6 +117,7 @@ def extract_tool_used(response: MessagesState) -> list[str]:
                 tools_used.append(tool_call["name"])
     return tools_used
 
+
 def extract_function_signature(tool_crafted: dict) -> str:
     """
     Extract name and arguments from function string.
@@ -125,18 +126,19 @@ def extract_function_signature(tool_crafted: dict) -> str:
     match = re.search(r"def (\w+)\(([^)]*)\)", func_str)
     if not match:
         return ""
-    
+
     func_name = match.group(1)
     args = match.group(2)
-    
+
     clean_args = ", ".join(
         arg.split("=")[0].strip()
         for arg in args.split(",")
         if arg.strip() and arg.strip() != "self"
-        )
-    
+    )
+
     return f"{func_name}({clean_args})"
 
+
 def extract_tools_crafted(response: MessagesState) -> list[str]:
     """
     Extract the tools that LLM has crafted.
@@ -152,7 +154,7 @@ def extract_tools_crafted(response: MessagesState) -> list[str]:
             for tool_call in msg.tool_calls:
                 tool_crafted = tool_call["args"]
                 signature = extract_function_signature(tool_crafted)
-                if signature != '':
+                if signature != "":
                     tools_crafted.append(signature)
     return tools_crafted
 
@@ -228,11 +230,12 @@ def symbolic_equal(a, b):
     except Exception:
         return False
 
+
 def extract_answer_from_response(response: str) -> str:
     boxed_match = re.search(r"\\boxed\{(.*)\}", response)
     if boxed_match:
         return boxed_match.group(1).strip()
-    
+
     boxed_match_alt = re.search(r"boxed\{(.*)\}", response)
     if boxed_match_alt:
         return boxed_match_alt.group(1).strip()
@@ -240,11 +243,11 @@ def extract_answer_from_response(response: str) -> str:
     answer_match = re.search(r"Answer:\s*(.*)", response, re.IGNORECASE)
     if answer_match:
         return answer_match.group(1).strip()
-    
+
     clean_response = response.strip()
     if not clean_response:
         return "N/A"
-        
+
     try:
         float(clean_response)
         return clean_response

From 52d5dbbf59caa0e4ea3751d8a19753f87e5f76d5 Mon Sep 17 00:00:00 2001
From: Raggini Marco <marco.raggini2@studio.unibo.it>
Date: Fri, 22 May 2026 13:47:21 +0200
Subject: [PATCH 26/27] chore: mypy check

---
 GoT/core/graph_model.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/GoT/core/graph_model.py b/GoT/core/graph_model.py
index 194d551..6ace4dc 100644
--- a/GoT/core/graph_model.py
+++ b/GoT/core/graph_model.py
@@ -336,7 +336,7 @@ def response_evaluation(messages: MessagesState):
 
 
 def crafting(messages: MessagesState):
-    crafting_node = CraftingNode(response="", tools_crafted="", resolved=False)
+    crafting_node = CraftingNode(response="", tools_crafted=[], resolved=False)
     runtime_graph.add_node(crafting_node)
     runtime_graph.add_edge(runtime_graph.temp_node, crafting_node)
     runtime_graph.temp_node = crafting_node

From 6bfc5c437e7ff4087a2f6e1bdde2825923169786 Mon Sep 17 00:00:00 2001
From: Raggini Marco <marco.raggini2@studio.unibo.it>
Date: Fri, 22 May 2026 14:59:29 +0200
Subject: [PATCH 27/27] chore: uncomment pypi config

---
 .github/workflows/deploy.yml |  4 ++--
 release.config.mjs           | 22 +++++++++++-----------
 2 files changed, 13 insertions(+), 13 deletions(-)

diff --git a/.github/workflows/deploy.yml b/.github/workflows/deploy.yml
index 0bfbde1..77a591e 100644
--- a/.github/workflows/deploy.yml
+++ b/.github/workflows/deploy.yml
@@ -60,8 +60,8 @@ jobs:
           npm install
           npx semantic-release
         env:
-          # PYPI_TOKEN: ${{ secrets.PYPI_TOKEN }}
+          PYPI_TOKEN: ${{ secrets.PYPI_TOKEN }}
           GITHUB_TOKEN: ${{ secrets.RELEASE_TOKEN }}
-          # RELEASE_TEST_PYPI: ${{ github.event.repository.is_template || contains(github.repository, 'template') }}
+          RELEASE_TEST_PYPI: ${{ github.event.repository.is_template || contains(github.repository, 'template') }}
           # dry run if not on main/master branch, or if initial commit
           RELEASE_DRY_RUN: ${{ steps.skip_release.outputs.skip == 'true' || (github.ref_name != 'master' && github.ref_name != 'main') }}
diff --git a/release.config.mjs b/release.config.mjs
index 2a1bc9b..2501f2f 100644
--- a/release.config.mjs
+++ b/release.config.mjs
@@ -1,25 +1,25 @@
 let dryRun = (process.env.RELEASE_DRY_RUN || "false").toLowerCase() === "true";
-// let testPypi = (process.env.RELEASE_TEST_PYPI || "false").toLowerCase() === "true";
-// const pypiToken = process.env.PYPI_TOKEN;
+let testPypi = (process.env.RELEASE_TEST_PYPI || "false").toLowerCase() === "true";
+const pypiToken = process.env.PYPI_TOKEN;
 
 let prepareCmd = "poetry version -- \${nextRelease.version}" + ` && poetry config pypi-token.pypi ${pypiToken}`;
-// let publishCmd = `poetry publish --build`;
+let publishCmd = `poetry publish --build`;
 
-// if (testPypi) {
-//     publishCmd += ` --repository testpypi`;
-//     prepareCmd = prepareCmd.replace("pypi-token.pypi", "pypi-token.testpypi");
-// }
+if (testPypi) {
+    publishCmd += ` --repository testpypi`;
+    prepareCmd = prepareCmd.replace("pypi-token.pypi", "pypi-token.testpypi");
+}
 
-// if (dryRun) {
-//     publishCmd += " --dry-run";
-// }
+if (dryRun) {
+    publishCmd += " --dry-run";
+}
 
 import config from 'semantic-release-preconfigured-conventional-commits' with {type: 'json'};
 
 config.plugins.push(
     ["@semantic-release/exec", {
         "prepareCmd" : prepareCmd,
-        // "publishCmd": publishCmd,
+        "publishCmd": publishCmd,
     }]
 )