From fa71e8fdff52490915fdd7e937786e96550343f5 Mon Sep 17 00:00:00 2001 From: Raggini Marco Date: Wed, 15 Apr 2026 10:35:13 +0200 Subject: [PATCH 01/27] chore: improve prompt --- GoT/model/graph_model.py | 1 + GoT/tools/runtime_graph_tool.py | 2 ++ 2 files changed, 3 insertions(+) diff --git a/GoT/model/graph_model.py b/GoT/model/graph_model.py index 57491ad..db01896 100644 --- a/GoT/model/graph_model.py +++ b/GoT/model/graph_model.py @@ -55,6 +55,7 @@ Rules: - You MUST respond ONLY using the Score function. + - You must consider if the format of the answer follow the instruction - You cannot give the full solution, only hints. - If a response suggest the need of crafting a tool, score it with 1 or less and specify clearly the need of a new tool to solve the problem. - Do not write natural language outside the function. diff --git a/GoT/tools/runtime_graph_tool.py b/GoT/tools/runtime_graph_tool.py index 77df52a..6b2a7ef 100644 --- a/GoT/tools/runtime_graph_tool.py +++ b/GoT/tools/runtime_graph_tool.py @@ -21,6 +21,8 @@ def divide_thought( HOW TO USE THIS TOOL: - Call it when you think the problem is complex. - The two parts must be as independent as possible. + IMPORTANT NOTES: + - You can't use the result of the first part to reason about the second part, and vice versa. The two parts must be as independent as possible. Arguments: - first_part: the first part of the thought process - second_part: the second part of the thought process From 5569cb605f452347588f29c8416183488fca91b2 Mon Sep 17 00:00:00 2001 From: Raggini Marco Date: Thu, 16 Apr 2026 10:12:49 +0200 Subject: [PATCH 02/27] chore: remove craft tools in hf formatter --- GoT/model/utils/hf_formatter.py | 8 ++++---- GoT/tools/craft_tool.py | 2 ++ 2 files changed, 6 insertions(+), 4 deletions(-) diff --git a/GoT/model/utils/hf_formatter.py b/GoT/model/utils/hf_formatter.py index 7b1eddf..eadd6b0 100644 --- a/GoT/model/utils/hf_formatter.py +++ b/GoT/model/utils/hf_formatter.py @@ -103,7 +103,7 @@ def gpqa_format(dataset: Dataset) -> list[ResultEval]: def gpqa_run(questions: list[ResultEval], max_run: int, test: bool) -> list[ResultEval]: responses = [] run_counter = 0 - agent = LLM().create_custom_agent(LLM().get_tools() + LLM().get_craft_tool()) + agent = LLM().create_custom_agent(LLM().get_tools()) for q in questions[25:]: if run_counter >= max_run: break @@ -188,7 +188,7 @@ def gsm8k_run( ) -> list[ResultEval]: responses = [] run_counter = 0 - agent = LLM().create_custom_agent(LLM().get_tools() + LLM().get_craft_tool()) + agent = LLM().create_custom_agent(LLM().get_tools()) for q in questions: if run_counter >= max_run: break @@ -277,8 +277,8 @@ def hendrycks_math_run( ) -> list[ResultEval]: responses = [] run_counter = 0 - agent = LLM().create_custom_agent(LLM().get_tools() + LLM().get_craft_tool()) - for q in questions: + agent = LLM().create_custom_agent(LLM().get_tools()) + for q in questions[10:]: if run_counter >= max_run: break prompt = q.question diff --git a/GoT/tools/craft_tool.py b/GoT/tools/craft_tool.py index 8db13dd..dc1d2d1 100644 --- a/GoT/tools/craft_tool.py +++ b/GoT/tools/craft_tool.py @@ -56,6 +56,8 @@ def craft_tool(tool_function: str) -> str: """Save the function definition provided by the LLM as a tool that can be used by other agents. The function should be defined as a python function. The function should be general and reusable, and should not be specific to the current problem. + The function must not use tuple as args type. + The function must be defined as gemini api format, with type annotations for all arguments and return type. The function should be defined in a way that it can be imported and used by other agents.""" def sanitize_input(query: str) -> str: From 53344dd0d4227fd77b3bbccf8f055e5a0ffc7fca Mon Sep 17 00:00:00 2001 From: Raggini Marco Date: Thu, 16 Apr 2026 16:55:26 +0200 Subject: [PATCH 03/27] chore: create a single benchmark_run for all datasets --- GoT/model/utils/hf_formatter.py | 145 ++++++-------------------------- 1 file changed, 27 insertions(+), 118 deletions(-) diff --git a/GoT/model/utils/hf_formatter.py b/GoT/model/utils/hf_formatter.py index eadd6b0..fbbdb59 100644 --- a/GoT/model/utils/hf_formatter.py +++ b/GoT/model/utils/hf_formatter.py @@ -100,50 +100,6 @@ def gpqa_format(dataset: Dataset) -> list[ResultEval]: return questions -def gpqa_run(questions: list[ResultEval], max_run: int, test: bool) -> list[ResultEval]: - responses = [] - run_counter = 0 - agent = LLM().create_custom_agent(LLM().get_tools()) - for q in questions[25:]: - if run_counter >= max_run: - break - prompt = q.question - correct_letter = q.correct_answer - try: - if test: - response = extract_output( - agent.invoke( - {"messages": [HumanMessage(content=prompt)]}, - config={"recursion_limit": 10}, - ) - ) - else: - response = extract_output(call_graph(prompt)) - norm_res = normalize_number(response) - responses.append( - ResultEval( - question=prompt, - response=norm_res, - filtered_answer="", - correct_answer=correct_letter, - answer_success=0.0, - ) - ) - except Exception as e: - print(f"Error processing question: {e}") - responses.append( - ResultEval( - question=prompt, - response="Error", - filtered_answer="", - correct_answer=correct_letter, - answer_success=0.0, - ) - ) - run_counter += 1 - return responses - - def gpqa_eval(responses: list[ResultEval]): correct = 0 @@ -183,52 +139,6 @@ def gsm8k_format(dataset: Dataset) -> list[ResultEval]: return questions -def gsm8k_run( - questions: list[ResultEval], max_run: int, test: bool -) -> list[ResultEval]: - responses = [] - run_counter = 0 - agent = LLM().create_custom_agent(LLM().get_tools()) - for q in questions: - if run_counter >= max_run: - break - prompt = q.question - correct_answer = q.correct_answer - try: - if test: - response = extract_output( - agent.invoke( - {"messages": [HumanMessage(content=prompt)]}, - config={"recursion_limit": 20}, - ) - ) - else: - response = extract_output(call_graph(prompt)) - norm_res = normalize_number(response) - responses.append( - ResultEval( - question=prompt, - response=norm_res, - filtered_answer="", - correct_answer=correct_answer, - answer_success=0.0, - ) - ) - except Exception as e: - print(f"Error processing question: {e}") - responses.append( - ResultEval( - question=prompt, - response="Error", - filtered_answer="", - correct_answer=correct_answer, - answer_success=0.0, - ) - ) - run_counter += 1 - return responses - - def gsm8k_eval(responses: list[ResultEval]): correct = 0 @@ -272,13 +182,35 @@ def hendrycks_math_format(dataset: Dataset) -> list[ResultEval]: return questions -def hendrycks_math_run( +def hendrycks_math_eval(responses: list[ResultEval]): + correct = 0 + + for res in responses: + opt_res = re.search(r"\\boxed\{(.*)\}", res.response) + norm_res = opt_res.group(1) if opt_res else "N/A" + norm_correct = normalize_number(res.correct_answer) + res.filtered_answer = norm_res + + if ( + (norm_res in norm_correct) + or (normalize_list(norm_res) == normalize_list(norm_correct)) + or (symbolic_equal(norm_res, norm_correct)) + ): + correct += 1 + res.answer_success = 1.0 + + accuracy = correct / len(responses) * 100 + print(f"Accuracy: {accuracy:.2f}%") + print(f"Total: {len(responses)}") + print(f"Correct: {correct}") + +def benchmark_run( questions: list[ResultEval], max_run: int, test: bool ) -> list[ResultEval]: responses = [] run_counter = 0 agent = LLM().create_custom_agent(LLM().get_tools()) - for q in questions[10:]: + for q in questions[40:]: if run_counter >= max_run: break prompt = q.question @@ -318,34 +250,11 @@ def hendrycks_math_run( return responses -def hendrycks_math_eval(responses: list[ResultEval]): - correct = 0 - - for res in responses: - opt_res = re.search(r"\\boxed\{(.*)\}", res.response) - norm_res = opt_res.group(1) if opt_res else "N/A" - norm_correct = normalize_number(res.correct_answer) - res.filtered_answer = norm_res - - if ( - (norm_res in norm_correct) - or (normalize_list(norm_res) == normalize_list(norm_correct)) - or (symbolic_equal(norm_res, norm_correct)) - ): - correct += 1 - res.answer_success = 1.0 - - accuracy = correct / len(responses) * 100 - print(f"Accuracy: {accuracy:.2f}%") - print(f"Total: {len(responses)}") - print(f"Correct: {correct}") - - def use_gpqa(max_run: int, test: bool, model_name: str): ds = load_dataset("Idavidrein/gpqa", "gpqa_diamond") data = ds["train"] questions = gpqa_format(data) - responses = gpqa_run(questions, max_run=max_run, test=test) + responses = benchmark_run(questions, max_run=max_run, test=test) gpqa_eval(responses) save_eval_results(responses, model_name=model_name) @@ -354,7 +263,7 @@ def use_gsm8k(max_run: int, test: bool, model_name: str): ds = load_dataset("gsm8k", "main") data = ds["test"] questions = gsm8k_format(data) - responses = gsm8k_run(questions, max_run=max_run, test=test) + responses = benchmark_run(questions, max_run=max_run, test=test) gsm8k_eval(responses) save_eval_results(responses, model_name=model_name) @@ -363,6 +272,6 @@ def use_hendrycks_math(max_run: int, test: bool, model_name: str, type: str): ds = load_dataset("EleutherAI/hendrycks_math", type) data = ds["test"] questions = hendrycks_math_format(data) - responses = hendrycks_math_run(questions, max_run=max_run, test=test) + responses = benchmark_run(questions, max_run=max_run, test=test) hendrycks_math_eval(responses) save_eval_results(responses, model_name=model_name) From 4cf3a015caf88f9aaea3016555d9c1bbb241a387 Mon Sep 17 00:00:00 2001 From: Raggini Marco Date: Thu, 16 Apr 2026 17:15:31 +0200 Subject: [PATCH 04/27] feat: gaia benchmark added --- GoT/model/utils/hf_formatter.py | 45 +++++++++++++++++++++++++++++++++ GoT/model/utils/parse_args.py | 6 +++-- 2 files changed, 49 insertions(+), 2 deletions(-) diff --git a/GoT/model/utils/hf_formatter.py b/GoT/model/utils/hf_formatter.py index fbbdb59..a26ecce 100644 --- a/GoT/model/utils/hf_formatter.py +++ b/GoT/model/utils/hf_formatter.py @@ -250,6 +250,43 @@ def benchmark_run( return responses +def gaia_format(dataset: Dataset) -> list[ResultEval]: + questions = [] + for data in dataset: + sample = data + question = sample["Question"] + correct_answer = sample["Final answer"] + prompt = ( + "Answer the following question. Think step by step before answering.\n\n" + f"{question}\n" + "Answer:" + ) + + questions.append( + ResultEval.create_empty_result( + question=prompt, correct_answer=correct_answer + ) + ) + + return questions + +def gaia_eval(responses: list[ResultEval]): + correct = 0 + + for res in responses: + norm_res = normalize_number(res.response) + norm_correct = normalize_number(res.correct_answer) + res.filtered_answer = norm_res + + if norm_res in norm_correct: + correct += 1 + res.answer_success = 1.0 + + accuracy = correct / len(responses) * 100 + print(f"Accuracy: {accuracy:.2f}%") + print(f"Total: {len(responses)}") + print(f"Correct: {correct}") + def use_gpqa(max_run: int, test: bool, model_name: str): ds = load_dataset("Idavidrein/gpqa", "gpqa_diamond") data = ds["train"] @@ -275,3 +312,11 @@ def use_hendrycks_math(max_run: int, test: bool, model_name: str, type: str): responses = benchmark_run(questions, max_run=max_run, test=test) hendrycks_math_eval(responses) save_eval_results(responses, model_name=model_name) + +def use_gaia(max_run: int, test: bool, model_name: str): + ds = load_dataset("gaia-benchmark/GAIA", "2023_level1") + data = ds["test"] + questions = gaia_format(data) + responses = benchmark_run(questions, max_run=max_run, test=test) + gaia_eval(responses) + save_eval_results(responses, model_name=model_name) diff --git a/GoT/model/utils/parse_args.py b/GoT/model/utils/parse_args.py index d2437a3..6cb3b9d 100644 --- a/GoT/model/utils/parse_args.py +++ b/GoT/model/utils/parse_args.py @@ -1,7 +1,7 @@ import argparse import sys -from GoT.model.utils.hf_formatter import use_gpqa, use_gsm8k, use_hendrycks_math +from GoT.model.utils.hf_formatter import use_gaia, use_gpqa, use_gsm8k, use_hendrycks_math def defining_and_parse_args(): @@ -12,7 +12,7 @@ def defining_and_parse_args(): "--benchmark", required=True, type=str, - choices=["gsm8k", "gpqa", "hendrycks_math"], + choices=["gsm8k", "gpqa", "hendrycks_math", "gaia"], help="The benchmark to run the model on.", ) parser.add_argument( @@ -62,3 +62,5 @@ def call_benchmark(args): use_gpqa(max_run=max_run, test=test, model_name=mode) elif args.benchmark == "hendrycks_math": use_hendrycks_math(max_run=max_run, test=test, model_name=mode, type=args.type) + elif args.benchmark == "gaia": + use_gaia(max_run=max_run, test=test, model_name=mode) From 929ebd036f103aad615b2192274fcbb744b80b7c Mon Sep 17 00:00:00 2001 From: Raggini Marco Date: Fri, 17 Apr 2026 15:34:16 +0200 Subject: [PATCH 05/27] chore: simplify codes --- GoT/model/utils/hf_formatter.py | 35 +++++++++++++++++++++------------ 1 file changed, 22 insertions(+), 13 deletions(-) diff --git a/GoT/model/utils/hf_formatter.py b/GoT/model/utils/hf_formatter.py index a26ecce..5c9b162 100644 --- a/GoT/model/utils/hf_formatter.py +++ b/GoT/model/utils/hf_formatter.py @@ -1,7 +1,9 @@ import json +import os from random import shuffle import re from datasets import Dataset, load_dataset +from huggingface_hub import hf_hub_download from langchain.messages import HumanMessage @@ -14,6 +16,7 @@ symbolic_equal, ) +TOKEN = os.getenv("HF_TOKEN") class ResultEval: def __init__( @@ -210,7 +213,7 @@ def benchmark_run( responses = [] run_counter = 0 agent = LLM().create_custom_agent(LLM().get_tools()) - for q in questions[40:]: + for q in questions: if run_counter >= max_run: break prompt = q.question @@ -255,6 +258,16 @@ def gaia_format(dataset: Dataset) -> list[ResultEval]: for data in dataset: sample = data question = sample["Question"] + attachment = sample.get("file_name", None) + if attachment: + abs_path = hf_hub_download( + repo_id="gaia-benchmark/GAIA", + filename=f"2023/validation/{attachment}", + repo_type="dataset", + token=TOKEN + ) + print(abs_path) + question += f"\nAttachment file path: {abs_path}" correct_answer = sample["Final answer"] prompt = ( "Answer the following question. Think step by step before answering.\n\n" @@ -288,35 +301,31 @@ def gaia_eval(responses: list[ResultEval]): print(f"Correct: {correct}") def use_gpqa(max_run: int, test: bool, model_name: str): - ds = load_dataset("Idavidrein/gpqa", "gpqa_diamond") - data = ds["train"] - questions = gpqa_format(data) + ds = load_dataset("Idavidrein/gpqa", "gpqa_diamond", split="train") + questions = gpqa_format(ds) responses = benchmark_run(questions, max_run=max_run, test=test) gpqa_eval(responses) save_eval_results(responses, model_name=model_name) def use_gsm8k(max_run: int, test: bool, model_name: str): - ds = load_dataset("gsm8k", "main") - data = ds["test"] - questions = gsm8k_format(data) + ds = load_dataset("gsm8k", "main", split="test") + questions = gsm8k_format(ds) responses = benchmark_run(questions, max_run=max_run, test=test) gsm8k_eval(responses) save_eval_results(responses, model_name=model_name) def use_hendrycks_math(max_run: int, test: bool, model_name: str, type: str): - ds = load_dataset("EleutherAI/hendrycks_math", type) - data = ds["test"] - questions = hendrycks_math_format(data) + ds = load_dataset("EleutherAI/hendrycks_math", type, split="test") + questions = hendrycks_math_format(ds) responses = benchmark_run(questions, max_run=max_run, test=test) hendrycks_math_eval(responses) save_eval_results(responses, model_name=model_name) def use_gaia(max_run: int, test: bool, model_name: str): - ds = load_dataset("gaia-benchmark/GAIA", "2023_level1") - data = ds["test"] - questions = gaia_format(data) + ds = load_dataset("gaia-benchmark/GAIA", "2023_level1", split="validation") + questions = gaia_format(ds) responses = benchmark_run(questions, max_run=max_run, test=test) gaia_eval(responses) save_eval_results(responses, model_name=model_name) From 104510152b31e288879e284d27614c40d1cd4a1a Mon Sep 17 00:00:00 2001 From: Raggini Marco Date: Fri, 17 Apr 2026 15:34:47 +0200 Subject: [PATCH 06/27] chore: add explanation of tool needed --- GoT/model/graph_model.py | 4 ++++ GoT/model/runtime_graph.py | 2 ++ 2 files changed, 6 insertions(+) diff --git a/GoT/model/graph_model.py b/GoT/model/graph_model.py index db01896..ab6cc1c 100644 --- a/GoT/model/graph_model.py +++ b/GoT/model/graph_model.py @@ -82,6 +82,7 @@ LLM().get_craft_tool(), SystemMessage( """ + You are a master coder specialized in crafting tools for other agents. You create reusable Python tools for other agents. The tool must be GENERAL and parameterized. @@ -236,6 +237,7 @@ def tool_call(messages: MessagesState): ) tool_used = extract_tool_used(res) runtime_graph.temp_response.response = parse_response_for_tool_node(res).response + runtime_graph.temp_response.explanation = parse_response_for_tool_node(res).explanation parsed_res = f"Response: {parse_response_for_tool_node(res).response}\nExplanation: {parse_response_for_tool_node(res).explanation}" runtime_graph.resolve_node(call_node, parsed_res) @@ -289,8 +291,10 @@ def crafting(messages: MessagesState): runtime_graph.add_node(crafting_node) runtime_graph.add_edge(runtime_graph.temp_node, crafting_node) runtime_graph.temp_node = crafting_node + ai_feedback = runtime_graph.temp_response.explanation crafting_messages = [ HumanMessage(content="Original task:\n" + parse_response(runtime_graph.goal)), + AIMessage(content=ai_feedback), SystemMessage( content="Craft a tool to solve this problem using craft_tool. It must be a function" ), diff --git a/GoT/model/runtime_graph.py b/GoT/model/runtime_graph.py index 506f3d9..6047e23 100644 --- a/GoT/model/runtime_graph.py +++ b/GoT/model/runtime_graph.py @@ -107,10 +107,12 @@ class ResponseNode(RuntimeNode): def __init__( self, response: str, + explanation: str = "", resolved: bool = False, ): super().__init__(resolved) self.response = response + self.explanation = explanation class CraftingNode(RuntimeNode): From c840f4743490d3dc4a544d0706898f45be169698 Mon Sep 17 00:00:00 2001 From: Raggini Marco Date: Tue, 21 Apr 2026 10:22:33 +0200 Subject: [PATCH 07/27] chore: remove comments --- GoT/model/graph_model.py | 18 ------------------ 1 file changed, 18 deletions(-) diff --git a/GoT/model/graph_model.py b/GoT/model/graph_model.py index ab6cc1c..d289e95 100644 --- a/GoT/model/graph_model.py +++ b/GoT/model/graph_model.py @@ -158,22 +158,6 @@ def goal(prompt: MessagesState): return prompt -# def tool_expand(goal: MessagesState): -# msg = parse_response(goal) -# sys_msg = "Please make a list using '-' to denote each tool in a probabilistic order, don't use this character for other reasons. Select only the tool(s) you want to use to solve this problem." -# messages = [ -# HumanMessage(msg), -# SystemMessage(sys_msg), -# ] -# res = starting_agent.invoke({"messages": messages}, config={"recursion_limit": MAX_INTERACTIONS}) -# str_res = parse_response(res) -# goal["messages"].append(AIMessage(content=str_res)) -# # tool_list = parse_tool_list(str_res) # Toglie elementi inutili -# # add tool nodes in the runtime graph - -# return goal - - def tool_reasoning(messages: MessagesState): messages["messages"].append( HumanMessage( @@ -438,7 +422,6 @@ def call_graph(prompt: str): def invoke_graph(): graph = StateGraph(MessagesState) graph.add_node(goal) - # graph.add_node(tool_expand) graph.add_node(tool_reasoning) graph.add_node(tool_call) graph.add_node(backtrack) @@ -447,7 +430,6 @@ def invoke_graph(): graph.add_node(response_evaluation) graph.add_node(reasoning_mode) graph.add_edge(START, "goal") - # graph.add_edge("goal", "tool_expand") graph.add_edge("goal", "tool_reasoning") graph.add_edge("tool_reasoning", "tool_call") graph.add_edge("tool_call", "response_evaluation") From 5bd0c625b26434fb6e3524ac424be1ce96b2002e Mon Sep 17 00:00:00 2001 From: Raggini Marco Date: Wed, 22 Apr 2026 10:14:56 +0200 Subject: [PATCH 08/27] chore: add method to download mlflow traces --- GoT/model/utils/utils.py | 5 +++++ 1 file changed, 5 insertions(+) diff --git a/GoT/model/utils/utils.py b/GoT/model/utils/utils.py index ac14a65..3ae103a 100644 --- a/GoT/model/utils/utils.py +++ b/GoT/model/utils/utils.py @@ -1,6 +1,7 @@ import json import re +import mlflow import numpy as np from sympy import simplify, sympify @@ -227,3 +228,7 @@ def print_benchmark_result_loglikehood( print(f"Total: {n_total}") print(f"Correct: {n_correct}") print(f"Wrong: {n_wrong}") + +def download_mlflow_traces(n_max: int): + traces = mlflow.search_traces(max_results=n_max, order_by=["timestamp DESC"]) + traces.to_csv("traces.csv", index=False) From b4a4dda80f734ec444575911e281e31dd64771d6 Mon Sep 17 00:00:00 2001 From: Raggini Marco Date: Thu, 23 Apr 2026 16:27:23 +0200 Subject: [PATCH 09/27] chore: change var name and simplify codes --- GoT/model/graph_model.py | 22 +++++++++------------- 1 file changed, 9 insertions(+), 13 deletions(-) diff --git a/GoT/model/graph_model.py b/GoT/model/graph_model.py index d289e95..7769781 100644 --- a/GoT/model/graph_model.py +++ b/GoT/model/graph_model.py @@ -82,7 +82,7 @@ LLM().get_craft_tool(), SystemMessage( """ - You are a master coder specialized in crafting tools for other agents. + You are a specialized in coding and write new useful method. You create reusable Python tools for other agents. The tool must be GENERAL and parameterized. @@ -311,28 +311,24 @@ def crafting(messages: MessagesState): def test_result(messages: MessagesState): - n = runtime_graph.exist_tool_available() + is_tool_path_available = runtime_graph.exist_tool_available() test_node = runtime_graph.temp_node if not isinstance(test_node, TestNode): raise TypeError("Expected TestNode for scoring") - - if test_node.score >= ( - COMPLEXITY_THRESHOLD - COMPLEXITY_COEFFICIENT * test_node.problem_complexity - ): + threshold = COMPLEXITY_THRESHOLD - COMPLEXITY_COEFFICIENT * test_node.problem_complexity + if test_node.score >= threshold: runtime_graph.add_edge(test_node, runtime_graph.temp_response) runtime_graph.temp_response.resolved = True return END elif ( - test_node.score - < (COMPLEXITY_THRESHOLD - COMPLEXITY_COEFFICIENT * test_node.problem_complexity) - and n is True + test_node.score < threshold + and is_tool_path_available is True and test_node.need_tool_crafting is True ): return "crafting" elif ( - test_node.score - < (COMPLEXITY_THRESHOLD - COMPLEXITY_COEFFICIENT * test_node.problem_complexity) - and n is True + test_node.score < threshold + and is_tool_path_available is True ): if test_node.need_tool_crafting is True: test_node.response = "The problem is too complex to craft a new tool, try reason step by step or divide complexity." @@ -340,7 +336,7 @@ def test_result(messages: MessagesState): elif ( test_node.score < (COMPLEXITY_THRESHOLD - COMPLEXITY_COEFFICIENT * test_node.problem_complexity) - and n is False + and is_tool_path_available is False and runtime_graph.exist_reasoning_node_available() ): return "reasoning_mode" From 0c93f80a6bb0d352a8447b2bc6cdc8ec7a1c85dd Mon Sep 17 00:00:00 2001 From: Raggini Marco Date: Fri, 24 Apr 2026 10:47:38 +0200 Subject: [PATCH 10/27] feat: add wikipedia and arxiv tools --- GoT/model/ollama_llm.py | 3 +- GoT/tools/web_tool.py | 54 +++++++++++++++++++++++++ poetry.lock | 89 ++++++++++++++++++++++++++++++++++++++++- pyproject.toml | 2 + 4 files changed, 146 insertions(+), 2 deletions(-) create mode 100644 GoT/tools/web_tool.py diff --git a/GoT/model/ollama_llm.py b/GoT/model/ollama_llm.py index 65bfcf2..5a59369 100644 --- a/GoT/model/ollama_llm.py +++ b/GoT/model/ollama_llm.py @@ -20,6 +20,7 @@ ) from GoT.tools.craft_tool import craft_tool, install_dependency +from GoT.tools.web_tool import search_arxiv, search_wikipedia load_dotenv() @@ -71,7 +72,7 @@ def __init__(self): self.system_prompt = SystemMessage(SYSTEM_PROMPT_GENERAL) def get_tools(self): - initial_tools = [summing, minus, square_root, multiply, divide] + initial_tools = [summing, minus, square_root, multiply, divide, search_wikipedia, search_arxiv] crafted_tools = self.get_crafted_tools() return initial_tools + crafted_tools diff --git a/GoT/tools/web_tool.py b/GoT/tools/web_tool.py new file mode 100644 index 0000000..5987bce --- /dev/null +++ b/GoT/tools/web_tool.py @@ -0,0 +1,54 @@ +import arxiv +from langchain.tools import tool +import wikipedia + +@tool +def search_wikipedia(query: str) -> str: + """ + Fetch a brief summary from Wikipedia. + + Args: + query (str): The keyword or topic to search for. + + Returns: + str: A 3-sentence summary of the topic, the first option if + ambiguous, or an error message if not found. + """ + try: + return wikipedia.search(query) + except wikipedia.DisambiguationError as e: + # happens when query is ambiguous, pick first option + return wikipedia.summary(e.options[0], sentences=3) + except wikipedia.PageError: + return "Page not found" + +@tool +def search_arxiv(query: str) -> str: + """Search ArXiv for scientific papers on a given topic. + Use this when you need to find research papers, abstracts or academic references.""" + + try: + client = arxiv.Client() + search = arxiv.Search( + query=query, + max_results=3, + sort_by=arxiv.SortCriterion.Relevance + ) + + results = [] + for paper in client.results(search): + results.append( + f"Title: {paper.title}\n" + f"Authors: {', '.join(a.name for a in paper.authors)}\n" + f"Published: {paper.published.strftime('%Y-%m-%d')}\n" + f"Summary: {paper.summary[:300]}...\n" + f"URL: {paper.entry_id}\n" + ) + + if not results: + return "No papers found for this query." + + return "\n---\n".join(results) + + except Exception as e: + return f"Error searching ArXiv: {str(e)}" \ No newline at end of file diff --git a/poetry.lock b/poetry.lock index e6e3dce..f5f1ce4 100644 --- a/poetry.lock +++ b/poetry.lock @@ -251,6 +251,22 @@ typing_extensions = {version = ">=4.5", markers = "python_version < \"3.13\""} [package.extras] trio = ["trio (>=0.31.0)", "trio (>=0.32.0)"] +[[package]] +name = "arxiv" +version = "3.0.0" +description = "Python wrapper for the arXiv API" +optional = false +python-versions = ">=3.10" +files = [ + {file = "arxiv-3.0.0-py3-none-any.whl", hash = "sha256:8b4d4e2e336bfeb71ea653623d7dadb260f682f0475cee2aecad0560a23b34db"}, + {file = "arxiv-3.0.0.tar.gz", hash = "sha256:c8cb0d31208afbc1ceb17bd3f9816c8d4c5ca1e0abf199d211e216715440498d"}, +] + +[package.dependencies] +feedparser = ">=6.0.10,<6.1.0" +requests = ">=2.32,<2.34" +typing-extensions = {version = ">=4.0.0", markers = "python_version < \"3.11\""} + [[package]] name = "async-timeout" version = "5.0.1" @@ -273,6 +289,28 @@ files = [ {file = "attrs-25.4.0.tar.gz", hash = "sha256:16d5969b87f0859ef33a48b35d55ac1be6e42ae49d5e853b597db70c35c57e11"}, ] +[[package]] +name = "beautifulsoup4" +version = "4.14.3" +description = "Screen-scraping library" +optional = false +python-versions = ">=3.7.0" +files = [ + {file = "beautifulsoup4-4.14.3-py3-none-any.whl", hash = "sha256:0918bfe44902e6ad8d57732ba310582e98da931428d231a5ecb9e7c703a735bb"}, + {file = "beautifulsoup4-4.14.3.tar.gz", hash = "sha256:6292b1c5186d356bba669ef9f7f051757099565ad9ada5dd630bd9de5fa7fb86"}, +] + +[package.dependencies] +soupsieve = ">=1.6.1" +typing-extensions = ">=4.0.0" + +[package.extras] +cchardet = ["cchardet"] +chardet = ["chardet"] +charset-normalizer = ["charset-normalizer"] +html5lib = ["html5lib"] +lxml = ["lxml"] + [[package]] name = "blinker" version = "1.9.0" @@ -1290,6 +1328,20 @@ files = [ [package.extras] devel = ["colorama", "json-spec", "jsonschema", "pylint", "pytest", "pytest-benchmark", "pytest-cache", "validictory"] +[[package]] +name = "feedparser" +version = "6.0.12" +description = "Universal feed parser, handles RSS 0.9x, RSS 1.0, RSS 2.0, CDF, Atom 0.3, and Atom 1.0 feeds" +optional = false +python-versions = ">=3.6" +files = [ + {file = "feedparser-6.0.12-py3-none-any.whl", hash = "sha256:6bbff10f5a52662c00a2e3f86a38928c37c48f77b3c511aedcd51de933549324"}, + {file = "feedparser-6.0.12.tar.gz", hash = "sha256:64f76ce90ae3e8ef5d1ede0f8d3b50ce26bcce71dd8ae5e82b1cd2d4a5f94228"}, +] + +[package.dependencies] +sgmllib3k = "*" + [[package]] name = "filelock" version = "3.24.2" @@ -5611,6 +5663,16 @@ enabler = ["pytest-enabler (>=2.2)"] test = ["build[virtualenv] (>=1.0.3)", "filelock (>=3.4.0)", "ini2toml[lite] (>=0.14)", "jaraco.develop (>=7.21)", "jaraco.envs (>=2.2)", "jaraco.path (>=3.7.2)", "jaraco.test (>=5.5)", "packaging (>=24.2)", "pip (>=19.1)", "pyproject-hooks (!=1.1)", "pytest (>=6,!=8.1.*)", "pytest-home (>=0.5)", "pytest-perf", "pytest-subprocess", "pytest-timeout", "pytest-xdist (>=3)", "tomli-w (>=1.0.0)", "virtualenv (>=13.0.0)", "wheel (>=0.44.0)"] type = ["importlib_metadata (>=7.0.2)", "jaraco.develop (>=7.21)", "mypy (==1.18.*)", "pytest-mypy"] +[[package]] +name = "sgmllib3k" +version = "1.0.0" +description = "Py3k port of sgmllib." +optional = false +python-versions = "*" +files = [ + {file = "sgmllib3k-1.0.0.tar.gz", hash = "sha256:7868fb1c8bfa764c1ac563d3cf369c381d1325d36124933a726f29fcdaa812e9"}, +] + [[package]] name = "shellingham" version = "1.5.4" @@ -5676,6 +5738,17 @@ files = [ {file = "sniffio-1.3.1.tar.gz", hash = "sha256:f4324edc670a0f49750a81b895f35c3adb843cca46f0530f79fc1babb23789dc"}, ] +[[package]] +name = "soupsieve" +version = "2.8.3" +description = "A modern CSS selector implementation for Beautiful Soup." +optional = false +python-versions = ">=3.9" +files = [ + {file = "soupsieve-2.8.3-py3-none-any.whl", hash = "sha256:ed64f2ba4eebeab06cc4962affce381647455978ffc1e36bb79a545b91f45a95"}, + {file = "soupsieve-2.8.3.tar.gz", hash = "sha256:3267f1eeea4251fb42728b6dfb746edc9acaffc4a45b27e19450b676586e8349"}, +] + [[package]] name = "sqlalchemy" version = "2.0.46" @@ -6367,6 +6440,20 @@ markupsafe = ">=2.1.1" [package.extras] watchdog = ["watchdog (>=2.3)"] +[[package]] +name = "wikipedia" +version = "1.4.0" +description = "Wikipedia API for Python" +optional = false +python-versions = "*" +files = [ + {file = "wikipedia-1.4.0.tar.gz", hash = "sha256:db0fad1829fdd441b1852306e9856398204dc0786d2996dd2e0c8bb8e26133b2"}, +] + +[package.dependencies] +beautifulsoup4 = "*" +requests = ">=2.0.0,<3.0.0" + [[package]] name = "word2number" version = "1.1" @@ -6868,4 +6955,4 @@ cffi = ["cffi (>=1.17,<2.0)", "cffi (>=2.0.0b)"] [metadata] lock-version = "2.0" python-versions = ">= 3.10.0 < 3.14.0" -content-hash = "c7c8e8591227891fa161988b58a7a8e708d8769ce133faf007103f95df4a0ef4" +content-hash = "afb0288bd77331357bfc74c7862dee098b73e2fce3028027cc78717cf022970b" diff --git a/pyproject.toml b/pyproject.toml index 95c44ad..20e60bf 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -19,6 +19,8 @@ dotenv = "^0.9.9" mlflow = "^3.9.0" lm-eval = {extras = ["math"], version = "^0.4.11"} boto3 = "^1.42.51" +wikipedia = "^1.4.0" +arxiv = "^3.0.0" [tool.poetry.group.dev.dependencies] coverage = "^7.4.0" From ea2f8e25ffb4e2b16f88edf4a5b3e0b67876b72e Mon Sep 17 00:00:00 2001 From: Raggini Marco Date: Fri, 24 Apr 2026 10:47:51 +0200 Subject: [PATCH 11/27] fix: fix names in type arg --- GoT/model/utils/parse_args.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/GoT/model/utils/parse_args.py b/GoT/model/utils/parse_args.py index 6cb3b9d..818903a 100644 --- a/GoT/model/utils/parse_args.py +++ b/GoT/model/utils/parse_args.py @@ -39,7 +39,7 @@ def defining_and_parse_args(): "intermediate_algebra", "number_theory", "precalculus", - "statistics", + "prealgebra" ], help="The type of math problems to run, only for hendrycks_math.", ) From 8e8ff331842f12d4340ea5489c9d74b4b350759d Mon Sep 17 00:00:00 2001 From: Raggini Marco Date: Sat, 25 Apr 2026 14:24:23 +0200 Subject: [PATCH 12/27] chore: add specific crafter LLM and improve prompt --- GoT/model/graph_model.py | 89 ++++++++++++++++++++++++++++++++++------ GoT/model/ollama_llm.py | 8 +++- 2 files changed, 84 insertions(+), 13 deletions(-) diff --git a/GoT/model/graph_model.py b/GoT/model/graph_model.py index 7769781..8fc881a 100644 --- a/GoT/model/graph_model.py +++ b/GoT/model/graph_model.py @@ -103,18 +103,80 @@ def multiply(a: float, b: float) -> float: ' return a * b + + Bad example (hardcoded/placeholder result): + def search_papers(query: str) -> str: + return "Results about " + query # WRONG: never return hardcoded strings + + Good example (real API call): + def search_papers(query: str) -> str: + ' + Arguments: + query: the search query string + Returns: + A string with real results fetched from the API + ' + import arxiv + client = arxiv.Client() + search = arxiv.Search(query=query, max_results=3) + results = [p.title + ": " + p.summary[:200] for p in client.results(search)] + return "\\n".join(results) + + Bad example: Too specific + def get_oldest_blu_ray_title(spreadsheet_path: str) -> str: + " + Analyzes a spreadsheet to find the oldest Blu-Ray title. + + Arguments: + spreadsheet_path: The file path to the spreadsheet (e.g., 'C:/Users/user/data.xlsx'). + + Returns: + The title of the oldest Blu-Ray as it appears in the spreadsheet. + " + import pandas as pd + + df = pd.read_excel(spreadsheet_path) + + # Assuming 'Format' column for media type and 'Recording Date' for date + blu_rays = df[df['Format'] == 'Blu-Ray'] + + if blu_rays.empty: + return "No Blu-Ray titles found." + + # Ensure 'Recording Date' is in datetime format for proper comparison + blu_rays['Recording Date'] = pd.to_datetime(blu_rays['Recording Date']) + + oldest_blu_ray = blu_rays.sort_values(by='Recording Date', ascending=True).iloc[0] + + return oldest_blu_ray['Title'] + + Good example + def open_excel_files(excel_path: str) + Analyzes a spreadsheet. + + Arguments: + spreadsheet_path: The file path to the spreadsheet (e.g., 'C:/Users/user/data.xlsx'). + + Returns: + The excel file in string + " + import pandas as pd + + df = pd.read_excel(spreadsheet_path) + return df.to_string() + Rules: - Prefer generic names and parameters, never craft specific functions. - If the function contains specific numbers or values, it is wrong. - - Craft only one function, it must contains always the docs. + - Never return hardcoded or placeholder strings, the function must fetch real data. + - Craft a maximum of 3 tools, it must contains always the docs. If the number of tool crafted exceed, you fail. - Never craft tool that raise exceptions. - - Respond ONLY using the tool available. - No natural language. - - No comments in the python interpreter. + - No more than 1 line comments in the python codes. """ ), response_format=Response, - type="remote_response_format", + type="remote_crafter", ) reasoning_agent = LLM().create_custom_agent( @@ -280,16 +342,19 @@ def crafting(messages: MessagesState): HumanMessage(content="Original task:\n" + parse_response(runtime_graph.goal)), AIMessage(content=ai_feedback), SystemMessage( - content="Craft a tool to solve this problem using craft_tool. It must be a function" + content="Use the context given to craft a tool to solve this problem using craft_tool. It must be a function" ), ] - craft_res = crafter_agent.invoke( - {"messages": crafting_messages}, config={"recursion_limit": MAX_INTERACTIONS} - ) - runtime_graph.temp_response.response = parse_response_for_tool_node( - craft_res - ).response - parsed_res = f"Response: {parse_response_for_tool_node(craft_res).response}\nExplanation: {parse_response_for_tool_node(craft_res).explanation}" + try: + craft_res = crafter_agent.invoke( + {"messages": crafting_messages}, config={"recursion_limit": MAX_INTERACTIONS} + ) + parsed_res = parse_response(craft_res) + except Exception: + parsed_res = "" + # runtime_graph.temp_response.response = parse_response_for_tool_node( + # craft_res + # ).response runtime_graph.resolve_node(crafting_node, parsed_res) runtime_graph.temp_node = runtime_graph.call_tool_node() runtime_graph.add_edge(crafting_node, runtime_graph.temp_node) diff --git a/GoT/model/ollama_llm.py b/GoT/model/ollama_llm.py index 5a59369..e06bfe9 100644 --- a/GoT/model/ollama_llm.py +++ b/GoT/model/ollama_llm.py @@ -57,6 +57,11 @@ def __init__(self): api_key=os.environ.get("GEMINI_API_KEY"), temperature=1.0, # Gemini 3.0+ defaults to 1.0 ) + self.remoteLLMCrafter = ChatGoogleGenerativeAI( + model="gemini-3-flash-preview", + api_key=os.environ.get("GEMINI_API_KEY"), + temperature=1.0, # Gemini 3.0+ defaults to 1.0 + ) self.remoteLLMScoreFormat = ChatGoogleGenerativeAI( model="gemini-2.5-flash", api_key=os.environ.get("GEMINI_API_KEY"), @@ -67,12 +72,13 @@ def __init__(self): "remote_standard": self.remoteLLMStandard, "remote_response_format": self.remoteLLMResponseFormat, "remote_score_format": self.remoteLLMScoreFormat, + "remote_crafter": self.remoteLLMCrafter } self.system_prompt = SystemMessage(SYSTEM_PROMPT_GENERAL) def get_tools(self): - initial_tools = [summing, minus, square_root, multiply, divide, search_wikipedia, search_arxiv] + initial_tools = [summing, minus, square_root, multiply, divide] crafted_tools = self.get_crafted_tools() return initial_tools + crafted_tools From f0cec829dd2559beae94b771245869e7df0ea768 Mon Sep 17 00:00:00 2001 From: Raggini Marco Date: Sat, 25 Apr 2026 15:21:25 +0200 Subject: [PATCH 13/27] style: change system folder architecture --- GoT/__init__.py | 8 ++++---- GoT/{tools => agent_tools}/ai_tool.py | 0 GoT/{tools => agent_tools}/craft_tool.py | 0 GoT/{tools => agent_tools}/math_tool.py | 0 GoT/{tools => agent_tools}/runtime_graph_tool.py | 6 +++--- GoT/{tools => agent_tools}/web_tool.py | 0 GoT/{model/utils => cli}/parse_args.py | 2 +- GoT/{model => core}/graph_model.py | 8 ++++---- GoT/{model/ollama_llm.py => core/llm.py} | 7 +++---- GoT/{model => core}/runtime_graph.py | 0 GoT/{model/utils => experiments}/hf_formatter.py | 6 +++--- GoT/{model => experiments}/lm_wrapper.py | 6 +++--- GoT/{model => }/utils/utils.py | 2 +- 13 files changed, 22 insertions(+), 23 deletions(-) rename GoT/{tools => agent_tools}/ai_tool.py (100%) rename GoT/{tools => agent_tools}/craft_tool.py (100%) rename GoT/{tools => agent_tools}/math_tool.py (100%) rename GoT/{tools => agent_tools}/runtime_graph_tool.py (95%) rename GoT/{tools => agent_tools}/web_tool.py (100%) rename GoT/{model/utils => cli}/parse_args.py (96%) rename GoT/{model => core}/graph_model.py (99%) rename GoT/{model/ollama_llm.py => core/llm.py} (94%) rename GoT/{model => core}/runtime_graph.py (100%) rename GoT/{model/utils => experiments}/hf_formatter.py (98%) rename GoT/{model => experiments}/lm_wrapper.py (98%) rename GoT/{model => }/utils/utils.py (99%) diff --git a/GoT/__init__.py b/GoT/__init__.py index c9d8fc7..a5df598 100644 --- a/GoT/__init__.py +++ b/GoT/__init__.py @@ -3,10 +3,10 @@ from dotenv import load_dotenv from lm_eval import evaluator, tasks -from GoT.model.graph_model import call_graph -from GoT.model.lm_wrapper import LangGraphBigBenchWrapper, TestBigBenchWrapper -from GoT.model.utils.parse_args import call_benchmark, defining_and_parse_args -from GoT.model.utils.utils import ( +from GoT.core.graph_model import call_graph +from GoT.experiments.lm_wrapper import LangGraphBigBenchWrapper, TestBigBenchWrapper +from GoT.cli.parse_args import call_benchmark, defining_and_parse_args +from GoT.utils.utils import ( print_benchmark_result, print_benchmark_result_loglikehood, ) diff --git a/GoT/tools/ai_tool.py b/GoT/agent_tools/ai_tool.py similarity index 100% rename from GoT/tools/ai_tool.py rename to GoT/agent_tools/ai_tool.py diff --git a/GoT/tools/craft_tool.py b/GoT/agent_tools/craft_tool.py similarity index 100% rename from GoT/tools/craft_tool.py rename to GoT/agent_tools/craft_tool.py diff --git a/GoT/tools/math_tool.py b/GoT/agent_tools/math_tool.py similarity index 100% rename from GoT/tools/math_tool.py rename to GoT/agent_tools/math_tool.py diff --git a/GoT/tools/runtime_graph_tool.py b/GoT/agent_tools/runtime_graph_tool.py similarity index 95% rename from GoT/tools/runtime_graph_tool.py rename to GoT/agent_tools/runtime_graph_tool.py index 6b2a7ef..7db745c 100644 --- a/GoT/tools/runtime_graph_tool.py +++ b/GoT/agent_tools/runtime_graph_tool.py @@ -1,9 +1,9 @@ from langchain.messages import HumanMessage, SystemMessage from langchain.tools import tool -from GoT.model.ollama_llm import LLM -from GoT.model.runtime_graph import ReasoningNode, RuntimeGraph -from GoT.model.utils.utils import parse_response +from GoT.core.llm import LLM +from GoT.core.runtime_graph import ReasoningNode, RuntimeGraph +from GoT.utils.utils import parse_response MAX_INTERACTIONS = 10 diff --git a/GoT/tools/web_tool.py b/GoT/agent_tools/web_tool.py similarity index 100% rename from GoT/tools/web_tool.py rename to GoT/agent_tools/web_tool.py diff --git a/GoT/model/utils/parse_args.py b/GoT/cli/parse_args.py similarity index 96% rename from GoT/model/utils/parse_args.py rename to GoT/cli/parse_args.py index 818903a..b7605b4 100644 --- a/GoT/model/utils/parse_args.py +++ b/GoT/cli/parse_args.py @@ -1,7 +1,7 @@ import argparse import sys -from GoT.model.utils.hf_formatter import use_gaia, use_gpqa, use_gsm8k, use_hendrycks_math +from GoT.experiments.hf_formatter import use_gaia, use_gpqa, use_gsm8k, use_hendrycks_math def defining_and_parse_args(): diff --git a/GoT/model/graph_model.py b/GoT/core/graph_model.py similarity index 99% rename from GoT/model/graph_model.py rename to GoT/core/graph_model.py index 8fc881a..b1ab35b 100644 --- a/GoT/model/graph_model.py +++ b/GoT/core/graph_model.py @@ -2,8 +2,8 @@ from langchain_core.messages import HumanMessage, SystemMessage, AIMessage from langgraph.graph import StateGraph, MessagesState, START, END -from GoT.model.ollama_llm import LLM -from GoT.model.runtime_graph import ( +from GoT.core.llm import LLM +from GoT.core.runtime_graph import ( BacktrackNode, CompletitionNode, CraftingNode, @@ -15,13 +15,13 @@ TestNode, ToolNode, ) -from GoT.model.utils.utils import ( +from GoT.utils.utils import ( extract_tool_used, parse_response, parse_response_for_tool_node, parse_score, ) -from GoT.tools.runtime_graph_tool import divide_thought +from GoT.agent_tools.runtime_graph_tool import divide_thought SCORE_THRESHOLD = 5 COMPLEXITY_COEFFICIENT = 0.5 diff --git a/GoT/model/ollama_llm.py b/GoT/core/llm.py similarity index 94% rename from GoT/model/ollama_llm.py rename to GoT/core/llm.py index e06bfe9..29d4513 100644 --- a/GoT/model/ollama_llm.py +++ b/GoT/core/llm.py @@ -11,7 +11,7 @@ from langchain.agents import create_agent import mlflow -from GoT.tools.math_tool import ( +from GoT.agent_tools.math_tool import ( multiply, summing, minus, @@ -19,8 +19,7 @@ divide, ) -from GoT.tools.craft_tool import craft_tool, install_dependency -from GoT.tools.web_tool import search_arxiv, search_wikipedia +from GoT.agent_tools.craft_tool import craft_tool, install_dependency load_dotenv() @@ -86,7 +85,7 @@ def get_craft_tool(self): return [craft_tool, install_dependency] def get_crafted_tools(self) -> list[BaseTool]: - module_name = "GoT.tools.ai_tool" + module_name = "GoT.agent_tools.ai_tool" if module_name in sys.modules: module = importlib.reload(sys.modules[module_name]) else: diff --git a/GoT/model/runtime_graph.py b/GoT/core/runtime_graph.py similarity index 100% rename from GoT/model/runtime_graph.py rename to GoT/core/runtime_graph.py diff --git a/GoT/model/utils/hf_formatter.py b/GoT/experiments/hf_formatter.py similarity index 98% rename from GoT/model/utils/hf_formatter.py rename to GoT/experiments/hf_formatter.py index 5c9b162..f7127a4 100644 --- a/GoT/model/utils/hf_formatter.py +++ b/GoT/experiments/hf_formatter.py @@ -7,9 +7,9 @@ from langchain.messages import HumanMessage -from GoT.model.graph_model import call_graph -from GoT.model.ollama_llm import LLM -from GoT.model.utils.utils import ( +from GoT.core.graph_model import call_graph +from GoT.core.llm import LLM +from GoT.utils.utils import ( extract_output, normalize_list, normalize_number, diff --git a/GoT/model/lm_wrapper.py b/GoT/experiments/lm_wrapper.py similarity index 98% rename from GoT/model/lm_wrapper.py rename to GoT/experiments/lm_wrapper.py index 1013c0b..8f9bc42 100644 --- a/GoT/model/lm_wrapper.py +++ b/GoT/experiments/lm_wrapper.py @@ -1,10 +1,10 @@ -from GoT.model.graph_model import call_graph +from GoT.core.graph_model import call_graph from lm_eval.api.registry import register_model from lm_eval.api.model import LM -from GoT.model.ollama_llm import LLM +from GoT.core.llm import LLM from langchain_core.messages import HumanMessage -from GoT.model.utils.utils import extract_output, normalize_number, parse_response +from GoT.utils.utils import extract_output, normalize_number, parse_response class LangGraphLM: diff --git a/GoT/model/utils/utils.py b/GoT/utils/utils.py similarity index 99% rename from GoT/model/utils/utils.py rename to GoT/utils/utils.py index 3ae103a..d156a87 100644 --- a/GoT/model/utils/utils.py +++ b/GoT/utils/utils.py @@ -5,7 +5,7 @@ import numpy as np from sympy import simplify, sympify -from GoT.model.runtime_graph import Response, Score +from GoT.core.runtime_graph import Response, Score from langgraph.graph import MessagesState from langchain_core.messages import AIMessage From fab5a9cb58b91fa1396fb191f13e619ff78a3253 Mon Sep 17 00:00:00 2001 From: Raggini Marco Date: Sat, 25 Apr 2026 15:26:29 +0200 Subject: [PATCH 14/27] style: ruff format + ignores arxiv, wikipedia stubs --- GoT/agent_tools/web_tool.py | 24 ++++++++++++------------ GoT/cli/parse_args.py | 9 +++++++-- GoT/core/graph_model.py | 20 +++++++++++--------- GoT/core/llm.py | 2 +- GoT/experiments/hf_formatter.py | 15 ++++++++++----- GoT/utils/utils.py | 1 + pyproject.toml | 2 +- 7 files changed, 43 insertions(+), 30 deletions(-) diff --git a/GoT/agent_tools/web_tool.py b/GoT/agent_tools/web_tool.py index 5987bce..b56413d 100644 --- a/GoT/agent_tools/web_tool.py +++ b/GoT/agent_tools/web_tool.py @@ -2,6 +2,7 @@ from langchain.tools import tool import wikipedia + @tool def search_wikipedia(query: str) -> str: """ @@ -11,7 +12,7 @@ def search_wikipedia(query: str) -> str: query (str): The keyword or topic to search for. Returns: - str: A 3-sentence summary of the topic, the first option if + str: A 3-sentence summary of the topic, the first option if ambiguous, or an error message if not found. """ try: @@ -21,20 +22,19 @@ def search_wikipedia(query: str) -> str: return wikipedia.summary(e.options[0], sentences=3) except wikipedia.PageError: return "Page not found" - + + @tool def search_arxiv(query: str) -> str: - """Search ArXiv for scientific papers on a given topic. + """Search ArXiv for scientific papers on a given topic. Use this when you need to find research papers, abstracts or academic references.""" - + try: client = arxiv.Client() search = arxiv.Search( - query=query, - max_results=3, - sort_by=arxiv.SortCriterion.Relevance + query=query, max_results=3, sort_by=arxiv.SortCriterion.Relevance ) - + results = [] for paper in client.results(search): results.append( @@ -44,11 +44,11 @@ def search_arxiv(query: str) -> str: f"Summary: {paper.summary[:300]}...\n" f"URL: {paper.entry_id}\n" ) - + if not results: return "No papers found for this query." - + return "\n---\n".join(results) - + except Exception as e: - return f"Error searching ArXiv: {str(e)}" \ No newline at end of file + return f"Error searching ArXiv: {str(e)}" diff --git a/GoT/cli/parse_args.py b/GoT/cli/parse_args.py index b7605b4..f52e05e 100644 --- a/GoT/cli/parse_args.py +++ b/GoT/cli/parse_args.py @@ -1,7 +1,12 @@ import argparse import sys -from GoT.experiments.hf_formatter import use_gaia, use_gpqa, use_gsm8k, use_hendrycks_math +from GoT.experiments.hf_formatter import ( + use_gaia, + use_gpqa, + use_gsm8k, + use_hendrycks_math, +) def defining_and_parse_args(): @@ -39,7 +44,7 @@ def defining_and_parse_args(): "intermediate_algebra", "number_theory", "precalculus", - "prealgebra" + "prealgebra", ], help="The type of math problems to run, only for hendrycks_math.", ) diff --git a/GoT/core/graph_model.py b/GoT/core/graph_model.py index b1ab35b..c64e3b1 100644 --- a/GoT/core/graph_model.py +++ b/GoT/core/graph_model.py @@ -283,7 +283,9 @@ def tool_call(messages: MessagesState): ) tool_used = extract_tool_used(res) runtime_graph.temp_response.response = parse_response_for_tool_node(res).response - runtime_graph.temp_response.explanation = parse_response_for_tool_node(res).explanation + runtime_graph.temp_response.explanation = parse_response_for_tool_node( + res + ).explanation parsed_res = f"Response: {parse_response_for_tool_node(res).response}\nExplanation: {parse_response_for_tool_node(res).explanation}" runtime_graph.resolve_node(call_node, parsed_res) @@ -345,12 +347,13 @@ def crafting(messages: MessagesState): content="Use the context given to craft a tool to solve this problem using craft_tool. It must be a function" ), ] - try: + try: craft_res = crafter_agent.invoke( - {"messages": crafting_messages}, config={"recursion_limit": MAX_INTERACTIONS} + {"messages": crafting_messages}, + config={"recursion_limit": MAX_INTERACTIONS}, ) parsed_res = parse_response(craft_res) - except Exception: + except Exception: parsed_res = "" # runtime_graph.temp_response.response = parse_response_for_tool_node( # craft_res @@ -380,7 +383,9 @@ def test_result(messages: MessagesState): test_node = runtime_graph.temp_node if not isinstance(test_node, TestNode): raise TypeError("Expected TestNode for scoring") - threshold = COMPLEXITY_THRESHOLD - COMPLEXITY_COEFFICIENT * test_node.problem_complexity + threshold = ( + COMPLEXITY_THRESHOLD - COMPLEXITY_COEFFICIENT * test_node.problem_complexity + ) if test_node.score >= threshold: runtime_graph.add_edge(test_node, runtime_graph.temp_response) runtime_graph.temp_response.resolved = True @@ -391,10 +396,7 @@ def test_result(messages: MessagesState): and test_node.need_tool_crafting is True ): return "crafting" - elif ( - test_node.score < threshold - and is_tool_path_available is True - ): + elif test_node.score < threshold and is_tool_path_available is True: if test_node.need_tool_crafting is True: test_node.response = "The problem is too complex to craft a new tool, try reason step by step or divide complexity." return "backtrack" diff --git a/GoT/core/llm.py b/GoT/core/llm.py index 29d4513..f76b50f 100644 --- a/GoT/core/llm.py +++ b/GoT/core/llm.py @@ -71,7 +71,7 @@ def __init__(self): "remote_standard": self.remoteLLMStandard, "remote_response_format": self.remoteLLMResponseFormat, "remote_score_format": self.remoteLLMScoreFormat, - "remote_crafter": self.remoteLLMCrafter + "remote_crafter": self.remoteLLMCrafter, } self.system_prompt = SystemMessage(SYSTEM_PROMPT_GENERAL) diff --git a/GoT/experiments/hf_formatter.py b/GoT/experiments/hf_formatter.py index f7127a4..6e2ebc8 100644 --- a/GoT/experiments/hf_formatter.py +++ b/GoT/experiments/hf_formatter.py @@ -18,6 +18,7 @@ TOKEN = os.getenv("HF_TOKEN") + class ResultEval: def __init__( self, @@ -207,6 +208,7 @@ def hendrycks_math_eval(responses: list[ResultEval]): print(f"Total: {len(responses)}") print(f"Correct: {correct}") + def benchmark_run( questions: list[ResultEval], max_run: int, test: bool ) -> list[ResultEval]: @@ -261,11 +263,11 @@ def gaia_format(dataset: Dataset) -> list[ResultEval]: attachment = sample.get("file_name", None) if attachment: abs_path = hf_hub_download( - repo_id="gaia-benchmark/GAIA", - filename=f"2023/validation/{attachment}", - repo_type="dataset", - token=TOKEN - ) + repo_id="gaia-benchmark/GAIA", + filename=f"2023/validation/{attachment}", + repo_type="dataset", + token=TOKEN, + ) print(abs_path) question += f"\nAttachment file path: {abs_path}" correct_answer = sample["Final answer"] @@ -283,6 +285,7 @@ def gaia_format(dataset: Dataset) -> list[ResultEval]: return questions + def gaia_eval(responses: list[ResultEval]): correct = 0 @@ -300,6 +303,7 @@ def gaia_eval(responses: list[ResultEval]): print(f"Total: {len(responses)}") print(f"Correct: {correct}") + def use_gpqa(max_run: int, test: bool, model_name: str): ds = load_dataset("Idavidrein/gpqa", "gpqa_diamond", split="train") questions = gpqa_format(ds) @@ -323,6 +327,7 @@ def use_hendrycks_math(max_run: int, test: bool, model_name: str, type: str): hendrycks_math_eval(responses) save_eval_results(responses, model_name=model_name) + def use_gaia(max_run: int, test: bool, model_name: str): ds = load_dataset("gaia-benchmark/GAIA", "2023_level1", split="validation") questions = gaia_format(ds) diff --git a/GoT/utils/utils.py b/GoT/utils/utils.py index d156a87..1b936b4 100644 --- a/GoT/utils/utils.py +++ b/GoT/utils/utils.py @@ -229,6 +229,7 @@ def print_benchmark_result_loglikehood( print(f"Correct: {n_correct}") print(f"Wrong: {n_wrong}") + def download_mlflow_traces(n_max: int): traces = mlflow.search_traces(max_results=n_max, order_by=["timestamp DESC"]) traces.to_csv("traces.csv", index=False) diff --git a/pyproject.toml b/pyproject.toml index 20e60bf..27f02e5 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -51,5 +51,5 @@ requires = ["poetry-core"] build-backend = "poetry.core.masonry.api" [[tool.mypy.overrides]] -module = ["lm_eval.*", "datasets.*", "sympy.*"] +module = ["lm_eval.*", "datasets.*", "sympy.*", "arxiv.*", "wikipedia.*"] follow_untyped_imports = true From 345ce80b33533f88ecf79d90b2e7d24a9d254f89 Mon Sep 17 00:00:00 2001 From: Raggini Marco Date: Thu, 7 May 2026 15:36:26 +0200 Subject: [PATCH 15/27] chore: improve evaluation --- GoT/experiments/hf_formatter.py | 4 ++-- GoT/utils/utils.py | 42 +++++++++++++++++++++++++++++++-- 2 files changed, 42 insertions(+), 4 deletions(-) diff --git a/GoT/experiments/hf_formatter.py b/GoT/experiments/hf_formatter.py index 6e2ebc8..42c333b 100644 --- a/GoT/experiments/hf_formatter.py +++ b/GoT/experiments/hf_formatter.py @@ -10,6 +10,7 @@ from GoT.core.graph_model import call_graph from GoT.core.llm import LLM from GoT.utils.utils import ( + extract_answer_from_response, extract_output, normalize_list, normalize_number, @@ -190,8 +191,7 @@ def hendrycks_math_eval(responses: list[ResultEval]): correct = 0 for res in responses: - opt_res = re.search(r"\\boxed\{(.*)\}", res.response) - norm_res = opt_res.group(1) if opt_res else "N/A" + norm_res = extract_answer_from_response(res.response) norm_correct = normalize_number(res.correct_answer) res.filtered_answer = norm_res diff --git a/GoT/utils/utils.py b/GoT/utils/utils.py index 1b936b4..c0714cd 100644 --- a/GoT/utils/utils.py +++ b/GoT/utils/utils.py @@ -83,8 +83,14 @@ def parse_response_for_tool_node(response: MessagesState) -> Response: if isinstance(structured_response, Response): return structured_response elif score_res is not None: - data = json.loads(score_res) - return Response.model_validate(data) + try: + data = json.loads(score_res) + return Response.model_validate(data) + except json.JSONDecodeError: + return Response( + response=score_res, + explanation="", + ) else: return Response( response="Failed to parse response", @@ -180,6 +186,38 @@ def symbolic_equal(a, b): except Exception: return False +def extract_answer_from_response(response: str) -> str: + """ + Extract the answer from the LLM response. + + :param response: The LLM response + :type response: str + :return: The extracted answer + :rtype: str + """ + # Try to extract using \\boxed{answer} + boxed_match = re.search(r"\\boxed\{([^}]*)\}", response) + if boxed_match: + return boxed_match.group(1).strip() + + # Try to extract using boxed{answer} + boxed_match_alt = re.search(r"oxed\{([^}]*)\}", response) + if boxed_match_alt: + return boxed_match_alt.group(1).strip() + + # Try to extract using Answer: answer + answer_match = re.search(r"Answer:\s*(.*)", response) + if answer_match: + return answer_match.group(1).strip() + + try: + if float(response.strip()): + return response.strip() + except ValueError: + return "N/A" + # If no pattern matched, return the original response + return "N/A" + def print_benchmark_result(results: dict, task_name: str, filter: str) -> None: samples = results["samples"][task_name] From 2b4f6171715bd334781ddc67e95bd7f062cc31c0 Mon Sep 17 00:00:00 2001 From: Raggini Marco Date: Fri, 8 May 2026 11:01:05 +0200 Subject: [PATCH 16/27] chore: code refinement --- GoT/core/graph_model.py | 3 +-- GoT/core/runtime_graph.py | 28 +++------------------------- 2 files changed, 4 insertions(+), 27 deletions(-) diff --git a/GoT/core/graph_model.py b/GoT/core/graph_model.py index c64e3b1..9fc8460 100644 --- a/GoT/core/graph_model.py +++ b/GoT/core/graph_model.py @@ -168,6 +168,7 @@ def open_excel_files(excel_path: str) Rules: - Prefer generic names and parameters, never craft specific functions. - If the function contains specific numbers or values, it is wrong. + - The Tool must follow the Json schema protocol, Tuple is banned. - Never return hardcoded or placeholder strings, the function must fetch real data. - Craft a maximum of 3 tools, it must contains always the docs. If the number of tool crafted exceed, you fail. - Never craft tool that raise exceptions. @@ -205,7 +206,6 @@ def goal(prompt: MessagesState): call_node = ToolNode( "Please, resolve the problem with the tools given, you MUST follow the previous reasoning.", "", - tool_name="", ) reasoning_node = ReasoningNode("") runtime_graph.add_node(reasoning_node) @@ -454,7 +454,6 @@ def backtrack(messages: MessagesState): runtime_graph.add_edge( backtrack_node, runtime_graph.temp_node ) # tool call node that we want to resolve - # messages = runtime_graph.append_prompt_to_messages_state(runtime_graph.temp_node) messages.get("messages", []).append(AIMessage(backtrack_node.feedback)) return messages diff --git a/GoT/core/runtime_graph.py b/GoT/core/runtime_graph.py index 6047e23..56225fe 100644 --- a/GoT/core/runtime_graph.py +++ b/GoT/core/runtime_graph.py @@ -1,17 +1,16 @@ from typing import Dict, List from langgraph.graph import MessagesState -from langchain_core.messages import AnyMessage, HumanMessage from pydantic import BaseModel, Field class RuntimeNode: - _id_counter = 0 # Contatore globale per ID unici + _id_counter = 0 # global ID counter def __init__( self, resolved: bool = False, ): - self.id = RuntimeNode._id_counter # ID unico per ogni nodo + self.id = RuntimeNode._id_counter RuntimeNode._id_counter += 1 self.resolved = resolved @@ -52,13 +51,11 @@ def __init__( self, prompt: str, response: str, - tool_name: str, resolved: bool = False, ): super().__init__(resolved) self.prompt = prompt self.response = response - self.tool_name = tool_name class GoalNode(RuntimeNode): @@ -170,7 +167,6 @@ class RuntimeGraph: def __init__(self): self.goal: MessagesState = MessagesState(messages=[]) self.nodes: Dict[RuntimeNode, List[RuntimeNode]] = {} - self.tools_available: Dict[RuntimeNode, str] = {} self.temp_node: RuntimeNode = RuntimeNode() self.temp_response: ResponseNode = ResponseNode(response="", resolved=False) @@ -181,9 +177,6 @@ def add_edge(self, n1: RuntimeNode, n2: RuntimeNode): self.nodes.setdefault(n1, []).append(n2) self.nodes.setdefault(n2, []) - def add_tool_link(self, call_node: RuntimeNode, tool_name: str): - self.tools_available.setdefault(call_node, tool_name) - def resolve_node(self, node: RuntimeNode, response: str) -> None: if isinstance(node, (ToolNode, TestNode, CompletitionNode)): node.response = response @@ -208,31 +201,16 @@ def exist_tool_available(self) -> bool: call_nodes = [n for n in nodes if (isinstance(n, ToolNode) and not n.resolved)] return True if call_nodes else False - def get_resolved_tools(self): - resolved_nodes = [t for t in self.tools_available.keys() if t.resolved is True] - return [self.tools_available[n] for n in resolved_nodes] - - def is_craftin_node_resolved(self) -> bool: + def is_crafting_node_resolved(self) -> bool: nodes = list(self.nodes.keys()) crafting_nodes = [ n for n in nodes if (isinstance(n, CraftingNode) and n.resolved) ] return True if crafting_nodes else False - def append_prompt_to_messages_state( - self, node: TestNode | ToolNode | CompletitionNode | GoalNode - ) -> MessagesState: - messages: list[AnyMessage] = [] - - if node.prompt: - messages.append(HumanMessage(content=node.prompt)) - - return MessagesState(messages=messages) - def clear(self): RuntimeNode._id_counter = 0 self.nodes = {} - self.tools_available = {} self.temp_node = RuntimeNode() self.temp_response = ResponseNode(response="", resolved=False) From 47ef8b4a50dd5072bf7b145b35f6ed29e566caa1 Mon Sep 17 00:00:00 2001 From: Raggini Marco Date: Thu, 14 May 2026 14:46:39 +0200 Subject: [PATCH 17/27] feat: add custom runs on cli args --- GoT/__init__.py | 7 +------ GoT/cli/parse_args.py | 11 ++++++++++- GoT/experiments/runner_custom.py | 15 +++++++++++++++ 3 files changed, 26 insertions(+), 7 deletions(-) create mode 100644 GoT/experiments/runner_custom.py diff --git a/GoT/__init__.py b/GoT/__init__.py index a5df598..8a62a54 100644 --- a/GoT/__init__.py +++ b/GoT/__init__.py @@ -3,7 +3,6 @@ from dotenv import load_dotenv from lm_eval import evaluator, tasks -from GoT.core.graph_model import call_graph from GoT.experiments.lm_wrapper import LangGraphBigBenchWrapper, TestBigBenchWrapper from GoT.cli.parse_args import call_benchmark, defining_and_parse_args from GoT.utils.utils import ( @@ -62,14 +61,10 @@ def lm_eval_graph_benchmark(): print_benchmark_result_loglikehood(results, task_name, filter_val="none") -def custom_test(): - call_graph("Solve this integral ∫x2⋅ex2dx") - - def main(): - # It could be changed with custom_test() to test a custom problem instead of the benchmark args = defining_and_parse_args() call_benchmark(args) + # download_mlflow_traces(50) # let this be the last line of this file diff --git a/GoT/cli/parse_args.py b/GoT/cli/parse_args.py index f52e05e..635b67f 100644 --- a/GoT/cli/parse_args.py +++ b/GoT/cli/parse_args.py @@ -7,6 +7,7 @@ use_gsm8k, use_hendrycks_math, ) +from GoT.experiments.runner_custom import custom_test def defining_and_parse_args(): @@ -17,7 +18,7 @@ def defining_and_parse_args(): "--benchmark", required=True, type=str, - choices=["gsm8k", "gpqa", "hendrycks_math", "gaia"], + choices=["gsm8k", "gpqa", "hendrycks_math", "gaia", "custom"], help="The benchmark to run the model on.", ) parser.add_argument( @@ -27,6 +28,12 @@ def defining_and_parse_args(): choices=["graph", "standard"], help="Whether to run the standard model or the graph model.", ) + parser.add_argument( + "--prompt", + type=str, + default="", + help="Insert a prompt during a custom run." + ) parser.add_argument( "--max_run", type=int, @@ -69,3 +76,5 @@ def call_benchmark(args): use_hendrycks_math(max_run=max_run, test=test, model_name=mode, type=args.type) elif args.benchmark == "gaia": use_gaia(max_run=max_run, test=test, model_name=mode) + elif args.benchmark == "custom" and args.prompt != "": + custom_test(args.prompt, test) diff --git a/GoT/experiments/runner_custom.py b/GoT/experiments/runner_custom.py new file mode 100644 index 0000000..5db80a9 --- /dev/null +++ b/GoT/experiments/runner_custom.py @@ -0,0 +1,15 @@ +from langchain.messages import HumanMessage + +from GoT.core.graph_model import call_graph +from GoT.core.llm import LLM + + +def custom_test(text: str, is_graph_mode: bool): + if not is_graph_mode: + call_graph(text) + else: + agent = LLM().create_custom_agent(LLM().get_tools()) + agent.invoke( + {"messages": [HumanMessage(content=text)]}, + config={"recursion_limit": 20}, + ) \ No newline at end of file From 89e468de056c8cfc899e5de1990ca998f6d5d52c Mon Sep 17 00:00:00 2001 From: Raggini Marco Date: Fri, 15 May 2026 14:38:20 +0200 Subject: [PATCH 18/27] chore: change argument 'type' in 'category' --- GoT/cli/parse_args.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/GoT/cli/parse_args.py b/GoT/cli/parse_args.py index 635b67f..2cd4275 100644 --- a/GoT/cli/parse_args.py +++ b/GoT/cli/parse_args.py @@ -41,7 +41,7 @@ def defining_and_parse_args(): help="The maximum number of runs for the benchmark.", ) parser.add_argument( - "--type", + "--category", type=str, default="algebra", choices=[ @@ -73,7 +73,7 @@ def call_benchmark(args): elif args.benchmark == "gpqa": use_gpqa(max_run=max_run, test=test, model_name=mode) elif args.benchmark == "hendrycks_math": - use_hendrycks_math(max_run=max_run, test=test, model_name=mode, type=args.type) + use_hendrycks_math(max_run=max_run, test=test, model_name=mode, type=args.category) elif args.benchmark == "gaia": use_gaia(max_run=max_run, test=test, model_name=mode) elif args.benchmark == "custom" and args.prompt != "": From d2f886fbf1d629ce9effd7ec5dc0e441bfd89456 Mon Sep 17 00:00:00 2001 From: Raggini Marco Date: Wed, 20 May 2026 10:31:46 +0200 Subject: [PATCH 19/27] chore: memorize tool crafted in each CraftingNode --- GoT/core/graph_model.py | 21 +++++++-------------- GoT/core/runtime_graph.py | 4 ++-- 2 files changed, 9 insertions(+), 16 deletions(-) diff --git a/GoT/core/graph_model.py b/GoT/core/graph_model.py index 9fc8460..4b1a76b 100644 --- a/GoT/core/graph_model.py +++ b/GoT/core/graph_model.py @@ -17,6 +17,7 @@ ) from GoT.utils.utils import ( extract_tool_used, + extract_tools_crafted, parse_response, parse_response_for_tool_node, parse_score, @@ -54,16 +55,16 @@ Your duty is to score, from 0 to 5, the response that user gives and assign a score. Rules: + - If a response suggest the need of crafting a tool, score it with 1 or less and specify clearly the need of a new tool to solve the problem. - You MUST respond ONLY using the Score function. - You must consider if the format of the answer follow the instruction - You cannot give the full solution, only hints. - - If a response suggest the need of crafting a tool, score it with 1 or less and specify clearly the need of a new tool to solve the problem. - Do not write natural language outside the function. - Always consider creating a tool if it makes the response correct or reusable. Score meanings: 0: Impossible to understand / completely wrong - 1: Nearly completely wrong + 1: Nearly completely wrong / need to craft a tool 2: Correct language but does not follow instruction 3: Tries to solve but fails instruction / wrong 4: Follows instruction but result wrong or incomplete @@ -203,11 +204,11 @@ def goal(prompt: MessagesState): runtime_graph.add_node(goal_node) runtime_graph.temp_node = goal_node for i in range(0, 3): + reasoning_node = ReasoningNode("") call_node = ToolNode( "Please, resolve the problem with the tools given, you MUST follow the previous reasoning.", "", ) - reasoning_node = ReasoningNode("") runtime_graph.add_node(reasoning_node) runtime_graph.add_node(call_node) runtime_graph.add_edge(goal_node, reasoning_node) @@ -242,7 +243,7 @@ def tool_call(messages: MessagesState): # It calls the llm and it resolves the call node call_node = runtime_graph.temp_node tool_agent = LLM().create_custom_agent( - LLM().get_tools() + [divide_thought], + LLM().get_tools(), SystemMessage( "You are an assistant specialized in tools. Your goal is to resolve the problem with " " the tool that the user indicates to you. You HAVE to use or craft the tool that the assistant indicates to you." @@ -335,7 +336,7 @@ def response_evaluation(messages: MessagesState): def crafting(messages: MessagesState): - crafting_node = CraftingNode(response="", tool_crafted="", resolved=False) + crafting_node = CraftingNode(response="", tools_crafted="", resolved=False) runtime_graph.add_node(crafting_node) runtime_graph.add_edge(runtime_graph.temp_node, crafting_node) runtime_graph.temp_node = crafting_node @@ -352,12 +353,10 @@ def crafting(messages: MessagesState): {"messages": crafting_messages}, config={"recursion_limit": MAX_INTERACTIONS}, ) + crafting_node.tools_crafted = extract_tools_crafted(craft_res) parsed_res = parse_response(craft_res) except Exception: parsed_res = "" - # runtime_graph.temp_response.response = parse_response_for_tool_node( - # craft_res - # ).response runtime_graph.resolve_node(crafting_node, parsed_res) runtime_graph.temp_node = runtime_graph.call_tool_node() runtime_graph.add_edge(crafting_node, runtime_graph.temp_node) @@ -390,12 +389,6 @@ def test_result(messages: MessagesState): runtime_graph.add_edge(test_node, runtime_graph.temp_response) runtime_graph.temp_response.resolved = True return END - elif ( - test_node.score < threshold - and is_tool_path_available is True - and test_node.need_tool_crafting is True - ): - return "crafting" elif test_node.score < threshold and is_tool_path_available is True: if test_node.need_tool_crafting is True: test_node.response = "The problem is too complex to craft a new tool, try reason step by step or divide complexity." diff --git a/GoT/core/runtime_graph.py b/GoT/core/runtime_graph.py index 56225fe..6661e15 100644 --- a/GoT/core/runtime_graph.py +++ b/GoT/core/runtime_graph.py @@ -116,12 +116,12 @@ class CraftingNode(RuntimeNode): def __init__( self, response: str, - tool_crafted: str = "", + tools_crafted: list[str] = [], resolved: bool = False, ): super().__init__(resolved) self.response = response - self.tool_crafted = tool_crafted + self.tools_crafted = tools_crafted class Score(BaseModel): From 2558b007b45aebd697ac5767291e8d8c96d86312 Mon Sep 17 00:00:00 2001 From: Raggini Marco Date: Wed, 20 May 2026 10:32:25 +0200 Subject: [PATCH 20/27] chore: improve parsing --- GoT/experiments/hf_formatter.py | 3 +- GoT/utils/utils.py | 71 ++++++++++++++++++++++++--------- 2 files changed, 54 insertions(+), 20 deletions(-) diff --git a/GoT/experiments/hf_formatter.py b/GoT/experiments/hf_formatter.py index 42c333b..a3973c9 100644 --- a/GoT/experiments/hf_formatter.py +++ b/GoT/experiments/hf_formatter.py @@ -14,6 +14,7 @@ extract_output, normalize_list, normalize_number, + parse_response, symbolic_equal, ) @@ -222,7 +223,7 @@ def benchmark_run( correct_answer = q.correct_answer try: if test: - response = extract_output( + response = parse_response( agent.invoke( {"messages": [HumanMessage(content=prompt)]}, config={"recursion_limit": 20}, diff --git a/GoT/utils/utils.py b/GoT/utils/utils.py index c0714cd..fd88910 100644 --- a/GoT/utils/utils.py +++ b/GoT/utils/utils.py @@ -17,7 +17,10 @@ def parse_response(res) -> str: :param res: the MessagesState :return: The response in string """ - return res["messages"][-1].content + response = res["messages"][-1].text + if response is None or response == "": + response = res["messages"][-1].content + return response def parse_tool_list(response: str) -> list[str]: @@ -114,6 +117,45 @@ def extract_tool_used(response: MessagesState) -> list[str]: tools_used.append(tool_call["name"]) return tools_used +def extract_function_signature(tool_crafted: dict) -> str: + """ + Extract name and arguments from function string. + """ + func_str = tool_crafted.get("tool_function", "") + match = re.search(r"def (\w+)\(([^)]*)\)", func_str) + if not match: + return "" + + func_name = match.group(1) + args = match.group(2) + + clean_args = ", ".join( + arg.split("=")[0].strip() + for arg in args.split(",") + if arg.strip() and arg.strip() != "self" + ) + + return f"{func_name}({clean_args})" + +def extract_tools_crafted(response: MessagesState) -> list[str]: + """ + Extract the tools that LLM has crafted. + + :param response: The LLM response + :type response: MessagesState + :return: The list of tools crafted + :rtype: list[str] + """ + tools_crafted = [] + for msg in response.get("messages", []): + if isinstance(msg, AIMessage): + for tool_call in msg.tool_calls: + tool_crafted = tool_call["args"] + signature = extract_function_signature(tool_crafted) + if signature != '': + tools_crafted.append(signature) + return tools_crafted + def remove_tools_from_list(tool_list, tools_to_remove): """ @@ -187,36 +229,27 @@ def symbolic_equal(a, b): return False def extract_answer_from_response(response: str) -> str: - """ - Extract the answer from the LLM response. - - :param response: The LLM response - :type response: str - :return: The extracted answer - :rtype: str - """ - # Try to extract using \\boxed{answer} - boxed_match = re.search(r"\\boxed\{([^}]*)\}", response) + boxed_match = re.search(r"\\boxed\{(.*)\}", response) if boxed_match: return boxed_match.group(1).strip() - # Try to extract using boxed{answer} - boxed_match_alt = re.search(r"oxed\{([^}]*)\}", response) + boxed_match_alt = re.search(r"boxed\{(.*)\}", response) if boxed_match_alt: return boxed_match_alt.group(1).strip() - # Try to extract using Answer: answer - answer_match = re.search(r"Answer:\s*(.*)", response) + answer_match = re.search(r"Answer:\s*(.*)", response, re.IGNORECASE) if answer_match: return answer_match.group(1).strip() + clean_response = response.strip() + if not clean_response: + return "N/A" + try: - if float(response.strip()): - return response.strip() + float(clean_response) + return clean_response except ValueError: return "N/A" - # If no pattern matched, return the original response - return "N/A" def print_benchmark_result(results: dict, task_name: str, filter: str) -> None: From 0468e3ee1cc319e7caf66d0123ccb86e0c0ada5d Mon Sep 17 00:00:00 2001 From: Raggini Marco Date: Wed, 20 May 2026 10:59:04 +0200 Subject: [PATCH 21/27] style: change var names --- GoT/core/llm.py | 12 ++++++------ 1 file changed, 6 insertions(+), 6 deletions(-) diff --git a/GoT/core/llm.py b/GoT/core/llm.py index f76b50f..6ae2014 100644 --- a/GoT/core/llm.py +++ b/GoT/core/llm.py @@ -51,26 +51,26 @@ def __init__(self): api_key=os.environ.get("GEMINI_API_KEY"), temperature=1.0, # Gemini 3.0+ defaults to 1.0 ) - self.remoteLLMResponseFormat = ChatGoogleGenerativeAI( + self.remoteLLMReasoning = ChatGoogleGenerativeAI( model="gemini-2.5-flash", api_key=os.environ.get("GEMINI_API_KEY"), temperature=1.0, # Gemini 3.0+ defaults to 1.0 ) self.remoteLLMCrafter = ChatGoogleGenerativeAI( - model="gemini-3-flash-preview", + model="gemini-2.5-flash", api_key=os.environ.get("GEMINI_API_KEY"), temperature=1.0, # Gemini 3.0+ defaults to 1.0 ) - self.remoteLLMScoreFormat = ChatGoogleGenerativeAI( + self.remoteLLMEvaluator = ChatGoogleGenerativeAI( model="gemini-2.5-flash", api_key=os.environ.get("GEMINI_API_KEY"), - temperature=0.7, # Gemini 3.0+ defaults to 1.0 + temperature=1.0, # Gemini 3.0+ defaults to 1.0 ) self.remoteLLMs = { "remote_standard": self.remoteLLMStandard, - "remote_response_format": self.remoteLLMResponseFormat, - "remote_score_format": self.remoteLLMScoreFormat, + "remote_response_format": self.remoteLLMReasoning, + "remote_score_format": self.remoteLLMEvaluator, "remote_crafter": self.remoteLLMCrafter, } From ac7289b8cc5b6134953c8dbeeb2c40d67e06dba0 Mon Sep 17 00:00:00 2001 From: Raggini Marco Date: Wed, 20 May 2026 10:59:50 +0200 Subject: [PATCH 22/27] chore: comment pypi release --- .github/workflows/deploy.yml | 2 +- release.config.mjs | 22 +++++++++++----------- 2 files changed, 12 insertions(+), 12 deletions(-) diff --git a/.github/workflows/deploy.yml b/.github/workflows/deploy.yml index 555263c..0bfbde1 100644 --- a/.github/workflows/deploy.yml +++ b/.github/workflows/deploy.yml @@ -62,6 +62,6 @@ jobs: env: # PYPI_TOKEN: ${{ secrets.PYPI_TOKEN }} GITHUB_TOKEN: ${{ secrets.RELEASE_TOKEN }} - RELEASE_TEST_PYPI: ${{ github.event.repository.is_template || contains(github.repository, 'template') }} + # RELEASE_TEST_PYPI: ${{ github.event.repository.is_template || contains(github.repository, 'template') }} # dry run if not on main/master branch, or if initial commit RELEASE_DRY_RUN: ${{ steps.skip_release.outputs.skip == 'true' || (github.ref_name != 'master' && github.ref_name != 'main') }} diff --git a/release.config.mjs b/release.config.mjs index 2501f2f..2a1bc9b 100644 --- a/release.config.mjs +++ b/release.config.mjs @@ -1,25 +1,25 @@ let dryRun = (process.env.RELEASE_DRY_RUN || "false").toLowerCase() === "true"; -let testPypi = (process.env.RELEASE_TEST_PYPI || "false").toLowerCase() === "true"; -const pypiToken = process.env.PYPI_TOKEN; +// let testPypi = (process.env.RELEASE_TEST_PYPI || "false").toLowerCase() === "true"; +// const pypiToken = process.env.PYPI_TOKEN; let prepareCmd = "poetry version -- \${nextRelease.version}" + ` && poetry config pypi-token.pypi ${pypiToken}`; -let publishCmd = `poetry publish --build`; +// let publishCmd = `poetry publish --build`; -if (testPypi) { - publishCmd += ` --repository testpypi`; - prepareCmd = prepareCmd.replace("pypi-token.pypi", "pypi-token.testpypi"); -} +// if (testPypi) { +// publishCmd += ` --repository testpypi`; +// prepareCmd = prepareCmd.replace("pypi-token.pypi", "pypi-token.testpypi"); +// } -if (dryRun) { - publishCmd += " --dry-run"; -} +// if (dryRun) { +// publishCmd += " --dry-run"; +// } import config from 'semantic-release-preconfigured-conventional-commits' with {type: 'json'}; config.plugins.push( ["@semantic-release/exec", { "prepareCmd" : prepareCmd, - "publishCmd": publishCmd, + // "publishCmd": publishCmd, }] ) From b787ac37e5baa33e0d94fe6cbcd882da7bcafef3 Mon Sep 17 00:00:00 2001 From: Raggini Marco Date: Wed, 20 May 2026 11:04:38 +0200 Subject: [PATCH 23/27] fix: accidentally delete crafting condition --- GoT/core/graph_model.py | 6 ++++++ 1 file changed, 6 insertions(+) diff --git a/GoT/core/graph_model.py b/GoT/core/graph_model.py index 4b1a76b..194d551 100644 --- a/GoT/core/graph_model.py +++ b/GoT/core/graph_model.py @@ -389,6 +389,12 @@ def test_result(messages: MessagesState): runtime_graph.add_edge(test_node, runtime_graph.temp_response) runtime_graph.temp_response.resolved = True return END + elif ( + test_node.score < threshold + and is_tool_path_available is True + and test_node.need_tool_crafting is True + ): + return "crafting" elif test_node.score < threshold and is_tool_path_available is True: if test_node.need_tool_crafting is True: test_node.response = "The problem is too complex to craft a new tool, try reason step by step or divide complexity." From 868b3d07a2c339ad819dc8b679ffeee14863167d Mon Sep 17 00:00:00 2001 From: Raggini Marco Date: Wed, 20 May 2026 17:49:12 +0200 Subject: [PATCH 24/27] chore: add docstring control --- GoT/agent_tools/craft_tool.py | 7 +++++++ 1 file changed, 7 insertions(+) diff --git a/GoT/agent_tools/craft_tool.py b/GoT/agent_tools/craft_tool.py index dc1d2d1..d2569b4 100644 --- a/GoT/agent_tools/craft_tool.py +++ b/GoT/agent_tools/craft_tool.py @@ -83,6 +83,13 @@ def sanitize_input(query: str) -> str: func = functions[0] + try: + docstring = ast.get_docstring(func) + if not docstring or not docstring.strip(): + return "Error: missing docstring. A description of the function is mandatory for Gemini tools." + except TypeError: + return "Error: missing docstring. A description of the function is mandatory for Gemini tools." + for arg in func.args.args: if arg.annotation is None: return f"Error: missing type annotation for '{arg.arg}'" From b417846230ca4845db12ab4fac65293c76008231 Mon Sep 17 00:00:00 2001 From: Raggini Marco Date: Fri, 22 May 2026 13:44:02 +0200 Subject: [PATCH 25/27] style: ruff format --- GoT/cli/parse_args.py | 9 ++++----- GoT/experiments/runner_custom.py | 2 +- GoT/utils/utils.py | 19 +++++++++++-------- 3 files changed, 16 insertions(+), 14 deletions(-) diff --git a/GoT/cli/parse_args.py b/GoT/cli/parse_args.py index 2cd4275..8169319 100644 --- a/GoT/cli/parse_args.py +++ b/GoT/cli/parse_args.py @@ -29,10 +29,7 @@ def defining_and_parse_args(): help="Whether to run the standard model or the graph model.", ) parser.add_argument( - "--prompt", - type=str, - default="", - help="Insert a prompt during a custom run." + "--prompt", type=str, default="", help="Insert a prompt during a custom run." ) parser.add_argument( "--max_run", @@ -73,7 +70,9 @@ def call_benchmark(args): elif args.benchmark == "gpqa": use_gpqa(max_run=max_run, test=test, model_name=mode) elif args.benchmark == "hendrycks_math": - use_hendrycks_math(max_run=max_run, test=test, model_name=mode, type=args.category) + use_hendrycks_math( + max_run=max_run, test=test, model_name=mode, type=args.category + ) elif args.benchmark == "gaia": use_gaia(max_run=max_run, test=test, model_name=mode) elif args.benchmark == "custom" and args.prompt != "": diff --git a/GoT/experiments/runner_custom.py b/GoT/experiments/runner_custom.py index 5db80a9..ab9ec3e 100644 --- a/GoT/experiments/runner_custom.py +++ b/GoT/experiments/runner_custom.py @@ -12,4 +12,4 @@ def custom_test(text: str, is_graph_mode: bool): agent.invoke( {"messages": [HumanMessage(content=text)]}, config={"recursion_limit": 20}, - ) \ No newline at end of file + ) diff --git a/GoT/utils/utils.py b/GoT/utils/utils.py index fd88910..37f8e64 100644 --- a/GoT/utils/utils.py +++ b/GoT/utils/utils.py @@ -117,6 +117,7 @@ def extract_tool_used(response: MessagesState) -> list[str]: tools_used.append(tool_call["name"]) return tools_used + def extract_function_signature(tool_crafted: dict) -> str: """ Extract name and arguments from function string. @@ -125,18 +126,19 @@ def extract_function_signature(tool_crafted: dict) -> str: match = re.search(r"def (\w+)\(([^)]*)\)", func_str) if not match: return "" - + func_name = match.group(1) args = match.group(2) - + clean_args = ", ".join( arg.split("=")[0].strip() for arg in args.split(",") if arg.strip() and arg.strip() != "self" - ) - + ) + return f"{func_name}({clean_args})" + def extract_tools_crafted(response: MessagesState) -> list[str]: """ Extract the tools that LLM has crafted. @@ -152,7 +154,7 @@ def extract_tools_crafted(response: MessagesState) -> list[str]: for tool_call in msg.tool_calls: tool_crafted = tool_call["args"] signature = extract_function_signature(tool_crafted) - if signature != '': + if signature != "": tools_crafted.append(signature) return tools_crafted @@ -228,11 +230,12 @@ def symbolic_equal(a, b): except Exception: return False + def extract_answer_from_response(response: str) -> str: boxed_match = re.search(r"\\boxed\{(.*)\}", response) if boxed_match: return boxed_match.group(1).strip() - + boxed_match_alt = re.search(r"boxed\{(.*)\}", response) if boxed_match_alt: return boxed_match_alt.group(1).strip() @@ -240,11 +243,11 @@ def extract_answer_from_response(response: str) -> str: answer_match = re.search(r"Answer:\s*(.*)", response, re.IGNORECASE) if answer_match: return answer_match.group(1).strip() - + clean_response = response.strip() if not clean_response: return "N/A" - + try: float(clean_response) return clean_response From 52d5dbbf59caa0e4ea3751d8a19753f87e5f76d5 Mon Sep 17 00:00:00 2001 From: Raggini Marco Date: Fri, 22 May 2026 13:47:21 +0200 Subject: [PATCH 26/27] chore: mypy check --- GoT/core/graph_model.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/GoT/core/graph_model.py b/GoT/core/graph_model.py index 194d551..6ace4dc 100644 --- a/GoT/core/graph_model.py +++ b/GoT/core/graph_model.py @@ -336,7 +336,7 @@ def response_evaluation(messages: MessagesState): def crafting(messages: MessagesState): - crafting_node = CraftingNode(response="", tools_crafted="", resolved=False) + crafting_node = CraftingNode(response="", tools_crafted=[], resolved=False) runtime_graph.add_node(crafting_node) runtime_graph.add_edge(runtime_graph.temp_node, crafting_node) runtime_graph.temp_node = crafting_node From 6bfc5c437e7ff4087a2f6e1bdde2825923169786 Mon Sep 17 00:00:00 2001 From: Raggini Marco Date: Fri, 22 May 2026 14:59:29 +0200 Subject: [PATCH 27/27] chore: uncomment pypi config --- .github/workflows/deploy.yml | 4 ++-- release.config.mjs | 22 +++++++++++----------- 2 files changed, 13 insertions(+), 13 deletions(-) diff --git a/.github/workflows/deploy.yml b/.github/workflows/deploy.yml index 0bfbde1..77a591e 100644 --- a/.github/workflows/deploy.yml +++ b/.github/workflows/deploy.yml @@ -60,8 +60,8 @@ jobs: npm install npx semantic-release env: - # PYPI_TOKEN: ${{ secrets.PYPI_TOKEN }} + PYPI_TOKEN: ${{ secrets.PYPI_TOKEN }} GITHUB_TOKEN: ${{ secrets.RELEASE_TOKEN }} - # RELEASE_TEST_PYPI: ${{ github.event.repository.is_template || contains(github.repository, 'template') }} + RELEASE_TEST_PYPI: ${{ github.event.repository.is_template || contains(github.repository, 'template') }} # dry run if not on main/master branch, or if initial commit RELEASE_DRY_RUN: ${{ steps.skip_release.outputs.skip == 'true' || (github.ref_name != 'master' && github.ref_name != 'main') }} diff --git a/release.config.mjs b/release.config.mjs index 2a1bc9b..2501f2f 100644 --- a/release.config.mjs +++ b/release.config.mjs @@ -1,25 +1,25 @@ let dryRun = (process.env.RELEASE_DRY_RUN || "false").toLowerCase() === "true"; -// let testPypi = (process.env.RELEASE_TEST_PYPI || "false").toLowerCase() === "true"; -// const pypiToken = process.env.PYPI_TOKEN; +let testPypi = (process.env.RELEASE_TEST_PYPI || "false").toLowerCase() === "true"; +const pypiToken = process.env.PYPI_TOKEN; let prepareCmd = "poetry version -- \${nextRelease.version}" + ` && poetry config pypi-token.pypi ${pypiToken}`; -// let publishCmd = `poetry publish --build`; +let publishCmd = `poetry publish --build`; -// if (testPypi) { -// publishCmd += ` --repository testpypi`; -// prepareCmd = prepareCmd.replace("pypi-token.pypi", "pypi-token.testpypi"); -// } +if (testPypi) { + publishCmd += ` --repository testpypi`; + prepareCmd = prepareCmd.replace("pypi-token.pypi", "pypi-token.testpypi"); +} -// if (dryRun) { -// publishCmd += " --dry-run"; -// } +if (dryRun) { + publishCmd += " --dry-run"; +} import config from 'semantic-release-preconfigured-conventional-commits' with {type: 'json'}; config.plugins.push( ["@semantic-release/exec", { "prepareCmd" : prepareCmd, - // "publishCmd": publishCmd, + "publishCmd": publishCmd, }] )