Skip to content
Closed
Show file tree
Hide file tree
Changes from all commits
Commits
Show all changes
27 commits
Select commit Hold shift + click to select a range
fa71e8f
chore: improve prompt
MarkRagg Apr 15, 2026
5569cb6
chore: remove craft tools in hf formatter
MarkRagg Apr 16, 2026
53344dd
chore: create a single benchmark_run for all datasets
MarkRagg Apr 16, 2026
4cf3a01
feat: gaia benchmark added
MarkRagg Apr 16, 2026
929ebd0
chore: simplify codes
MarkRagg Apr 17, 2026
1045101
chore: add explanation of tool needed
MarkRagg Apr 17, 2026
c840f47
chore: remove comments
MarkRagg Apr 21, 2026
5bd0c62
chore: add method to download mlflow traces
MarkRagg Apr 22, 2026
b4a4dda
chore: change var name and simplify codes
MarkRagg Apr 23, 2026
0c93f80
feat: add wikipedia and arxiv tools
MarkRagg Apr 24, 2026
ea2f8e2
fix: fix names in type arg
MarkRagg Apr 24, 2026
8e8ff33
chore: add specific crafter LLM and improve prompt
MarkRagg Apr 25, 2026
f0cec82
style: change system folder architecture
MarkRagg Apr 25, 2026
fab5a9c
style: ruff format + ignores arxiv, wikipedia stubs
MarkRagg Apr 25, 2026
345ce80
chore: improve evaluation
MarkRagg May 7, 2026
2b4f617
chore: code refinement
MarkRagg May 8, 2026
47ef8b4
feat: add custom runs on cli args
MarkRagg May 14, 2026
89e468d
chore: change argument 'type' in 'category'
MarkRagg May 15, 2026
d2f886f
chore: memorize tool crafted in each CraftingNode
MarkRagg May 20, 2026
2558b00
chore: improve parsing
MarkRagg May 20, 2026
0468e3e
style: change var names
MarkRagg May 20, 2026
ac7289b
chore: comment pypi release
MarkRagg May 20, 2026
b787ac3
fix: accidentally delete crafting condition
MarkRagg May 20, 2026
868b3d0
chore: add docstring control
MarkRagg May 20, 2026
b417846
style: ruff format
MarkRagg May 22, 2026
52d5dbb
chore: mypy check
MarkRagg May 22, 2026
6bfc5c4
chore: uncomment pypi config
MarkRagg May 22, 2026
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
2 changes: 1 addition & 1 deletion .github/workflows/deploy.yml
Original file line number Diff line number Diff line change
Expand Up @@ -60,7 +60,7 @@ jobs:
npm install
npx semantic-release
env:
# PYPI_TOKEN: ${{ secrets.PYPI_TOKEN }}
PYPI_TOKEN: ${{ secrets.PYPI_TOKEN }}
GITHUB_TOKEN: ${{ secrets.RELEASE_TOKEN }}
RELEASE_TEST_PYPI: ${{ github.event.repository.is_template || contains(github.repository, 'template') }}
# dry run if not on main/master branch, or if initial commit
Expand Down
13 changes: 4 additions & 9 deletions GoT/__init__.py
Original file line number Diff line number Diff line change
Expand Up @@ -3,10 +3,9 @@
from dotenv import load_dotenv

from lm_eval import evaluator, tasks
from GoT.model.graph_model import call_graph
from GoT.model.lm_wrapper import LangGraphBigBenchWrapper, TestBigBenchWrapper
from GoT.model.utils.parse_args import call_benchmark, defining_and_parse_args
from GoT.model.utils.utils import (
from GoT.experiments.lm_wrapper import LangGraphBigBenchWrapper, TestBigBenchWrapper
from GoT.cli.parse_args import call_benchmark, defining_and_parse_args
from GoT.utils.utils import (
print_benchmark_result,
print_benchmark_result_loglikehood,
)
Expand Down Expand Up @@ -62,14 +61,10 @@ def lm_eval_graph_benchmark():
print_benchmark_result_loglikehood(results, task_name, filter_val="none")


def custom_test():
call_graph("Solve this integral ∫x2⋅ex2dx")


def main():
# It could be changed with custom_test() to test a custom problem instead of the benchmark
args = defining_and_parse_args()
call_benchmark(args)
# download_mlflow_traces(50)


# let this be the last line of this file
Expand Down
File renamed without changes.
9 changes: 9 additions & 0 deletions GoT/tools/craft_tool.py → GoT/agent_tools/craft_tool.py
Original file line number Diff line number Diff line change
Expand Up @@ -56,6 +56,8 @@ def craft_tool(tool_function: str) -> str:
"""Save the function definition provided by the LLM as a tool that can be used by other agents.
The function should be defined as a python function.
The function should be general and reusable, and should not be specific to the current problem.
The function must not use tuple as args type.
The function must be defined as gemini api format, with type annotations for all arguments and return type.
The function should be defined in a way that it can be imported and used by other agents."""

def sanitize_input(query: str) -> str:
Expand All @@ -81,6 +83,13 @@ def sanitize_input(query: str) -> str:

func = functions[0]

try:
docstring = ast.get_docstring(func)
if not docstring or not docstring.strip():
return "Error: missing docstring. A description of the function is mandatory for Gemini tools."
except TypeError:
return "Error: missing docstring. A description of the function is mandatory for Gemini tools."

for arg in func.args.args:
if arg.annotation is None:
return f"Error: missing type annotation for '{arg.arg}'"
Expand Down
File renamed without changes.
Original file line number Diff line number Diff line change
@@ -1,9 +1,9 @@
from langchain.messages import HumanMessage, SystemMessage
from langchain.tools import tool

from GoT.model.ollama_llm import LLM
from GoT.model.runtime_graph import ReasoningNode, RuntimeGraph
from GoT.model.utils.utils import parse_response
from GoT.core.llm import LLM
from GoT.core.runtime_graph import ReasoningNode, RuntimeGraph
from GoT.utils.utils import parse_response

MAX_INTERACTIONS = 10

Expand All @@ -21,6 +21,8 @@ def divide_thought(
HOW TO USE THIS TOOL:
- Call it when you think the problem is complex.
- The two parts must be as independent as possible.
IMPORTANT NOTES:
- You can't use the result of the first part to reason about the second part, and vice versa. The two parts must be as independent as possible.
Arguments:
- first_part: the first part of the thought process
- second_part: the second part of the thought process
Expand Down
54 changes: 54 additions & 0 deletions GoT/agent_tools/web_tool.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,54 @@
import arxiv
from langchain.tools import tool
import wikipedia


@tool
def search_wikipedia(query: str) -> str:
    """
    Fetch a brief summary from Wikipedia.

    Args:
        query (str): The keyword or topic to search for.

    Returns:
        str: A 3-sentence summary of the topic, the first option if
        ambiguous, or an error message if not found.
    """
    try:
        # `wikipedia.summary` is the call that yields text and raises the
        # disambiguation / page errors handled below; `wikipedia.search`
        # only returns a list of candidate titles.
        return wikipedia.summary(query, sentences=3)
    except wikipedia.DisambiguationError as e:
        # happens when query is ambiguous, pick first option
        if not e.options:
            return "Page not found"
        try:
            return wikipedia.summary(e.options[0], sentences=3)
        except (wikipedia.DisambiguationError, wikipedia.PageError):
            # The first option can itself be ambiguous or missing; report
            # it as not found instead of letting the exception escape.
            return "Page not found"
    except wikipedia.PageError:
        return "Page not found"


@tool
def search_arxiv(query: str) -> str:
    """Search ArXiv for scientific papers on a given topic.
    Use this when you need to find research papers, abstracts or academic references."""

    # Any failure while querying is reported back as a plain string rather
    # than raised, so the whole lookup sits inside one try block.
    try:
        api = arxiv.Client()
        lookup = arxiv.Search(
            query=query, max_results=3, sort_by=arxiv.SortCriterion.Relevance
        )

        # One formatted entry per paper: title, authors, date, the first
        # 300 characters of the abstract, and the entry URL.
        entries = [
            f"Title: {paper.title}\n"
            f"Authors: {', '.join(a.name for a in paper.authors)}\n"
            f"Published: {paper.published.strftime('%Y-%m-%d')}\n"
            f"Summary: {paper.summary[:300]}...\n"
            f"URL: {paper.entry_id}\n"
            for paper in api.results(lookup)
        ]

        if entries:
            return "\n---\n".join(entries)
        return "No papers found for this query."

    except Exception as err:
        return f"Error searching ArXiv: {str(err)}"
25 changes: 20 additions & 5 deletions GoT/model/utils/parse_args.py → GoT/cli/parse_args.py
Original file line number Diff line number Diff line change
@@ -1,7 +1,13 @@
import argparse
import sys

from GoT.model.utils.hf_formatter import use_gpqa, use_gsm8k, use_hendrycks_math
from GoT.experiments.hf_formatter import (
use_gaia,
use_gpqa,
use_gsm8k,
use_hendrycks_math,
)
from GoT.experiments.runner_custom import custom_test


def defining_and_parse_args():
Expand All @@ -12,7 +18,7 @@ def defining_and_parse_args():
"--benchmark",
required=True,
type=str,
choices=["gsm8k", "gpqa", "hendrycks_math"],
choices=["gsm8k", "gpqa", "hendrycks_math", "gaia", "custom"],
help="The benchmark to run the model on.",
)
parser.add_argument(
Expand All @@ -22,14 +28,17 @@ def defining_and_parse_args():
choices=["graph", "standard"],
help="Whether to run the standard model or the graph model.",
)
parser.add_argument(
"--prompt", type=str, default="", help="Insert a prompt during a custom run."
)
parser.add_argument(
"--max_run",
type=int,
default=1,
help="The maximum number of runs for the benchmark.",
)
parser.add_argument(
"--type",
"--category",
type=str,
default="algebra",
choices=[
Expand All @@ -39,7 +48,7 @@ def defining_and_parse_args():
"intermediate_algebra",
"number_theory",
"precalculus",
"statistics",
"prealgebra",
],
help="The type of math problems to run, only for hendrycks_math.",
)
Expand All @@ -61,4 +70,10 @@ def call_benchmark(args):
elif args.benchmark == "gpqa":
use_gpqa(max_run=max_run, test=test, model_name=mode)
elif args.benchmark == "hendrycks_math":
use_hendrycks_math(max_run=max_run, test=test, model_name=mode, type=args.type)
use_hendrycks_math(
max_run=max_run, test=test, model_name=mode, type=args.category
)
elif args.benchmark == "gaia":
use_gaia(max_run=max_run, test=test, model_name=mode)
elif args.benchmark == "custom" and args.prompt != "":
custom_test(args.prompt, test)
Loading
Loading