Skip to content
Closed
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
25 changes: 12 additions & 13 deletions biome.json
Original file line number Diff line number Diff line change
@@ -1,20 +1,19 @@
{
"$schema": "https://biomejs.dev/schemas/1.9.4/schema.json",
"$schema": "https://biomejs.dev/schemas/2.4.14/schema.json",
"files": {
"ignore": [
"dist",
"node_modules",
"coverage",
".turbo",
"results",
"trajectories",
"package-lock.json",
"pnpm-lock.yaml"
"includes": [
"**",
"!**/dist",
"!**/node_modules",
"!**/coverage",
"!**/.turbo",
"!**/results",
"!**/trajectories",
"!**/package-lock.json",
"!**/pnpm-lock.yaml"
]
},
"organizeImports": {
"enabled": true
},
"assist": { "actions": { "source": { "organizeImports": "off" } } },
"linter": {
"enabled": true,
"rules": {
Expand Down
9 changes: 6 additions & 3 deletions package.json
Original file line number Diff line number Diff line change
Expand Up @@ -20,7 +20,7 @@
"prepare": "husky"
},
"devDependencies": {
"@biomejs/biome": "^1.9.4",
"@biomejs/biome": "^2.4.14",
"@changesets/changelog-github": "^0.6.0",
"@changesets/cli": "^2.28.1",
"@vitest/coverage-v8": "^3.2.4",
Expand All @@ -43,14 +43,17 @@
},
"homepage": "https://github.com/reaatech/agent-eval-harness#readme",
"pnpm": {
"onlyBuiltDependencies": ["@biomejs/biome", "esbuild"],
"onlyBuiltDependencies": [
"@biomejs/biome",
"esbuild"
],
"overrides": {
"uuid@<14.0.0": ">=14.0.0",
"hono@<4.12.18": ">=4.12.18",
"ip-address@<=10.1.0": ">=10.1.1",
"fast-uri@<=3.1.1": ">=3.1.2",
"@opentelemetry/sdk-node@<0.217.0": ">=0.217.0",
"@opentelemetry/auto-instrumentations-node@<0.75.0": ">=0.75.0",
"@opentelemetry/auto-instrumentations-node@<0.75.0": ">=0.75.0",
"@opentelemetry/api@<1.9.0": ">=1.9.0"
}
},
Expand Down
4 changes: 3 additions & 1 deletion packages/cli/package.json
Original file line number Diff line number Diff line change
Expand Up @@ -17,7 +17,9 @@
}
},
"sideEffects": false,
"files": ["dist"],
"files": [
"dist"
],
"publishConfig": {
"access": "public"
},
Expand Down
4 changes: 3 additions & 1 deletion packages/cost/package.json
Original file line number Diff line number Diff line change
Expand Up @@ -14,7 +14,9 @@
}
},
"sideEffects": false,
"files": ["dist"],
"files": [
"dist"
],
"publishConfig": {
"access": "public"
},
Expand Down
4 changes: 3 additions & 1 deletion packages/gate/package.json
Original file line number Diff line number Diff line change
Expand Up @@ -14,7 +14,9 @@
}
},
"sideEffects": false,
"files": ["dist"],
"files": [
"dist"
],
"publishConfig": {
"access": "public"
},
Expand Down
4 changes: 3 additions & 1 deletion packages/golden/package.json
Original file line number Diff line number Diff line change
Expand Up @@ -14,7 +14,9 @@
}
},
"sideEffects": false,
"files": ["dist"],
"files": [
"dist"
],
"publishConfig": {
"access": "public"
},
Expand Down
4 changes: 3 additions & 1 deletion packages/judge/package.json
Original file line number Diff line number Diff line change
Expand Up @@ -14,7 +14,9 @@
}
},
"sideEffects": false,
"files": ["dist"],
"files": [
"dist"
],
"publishConfig": {
"access": "public"
},
Expand Down
52 changes: 17 additions & 35 deletions packages/judge/src/prompts.ts
Original file line number Diff line number Diff line change
Expand Up @@ -41,10 +41,10 @@ Your task is to determine if the response:
4. Accurately represents the information provided`,

user: `Context:
\({context}\)
{context}

Assistant Response:
\({response}\)
{response}

Rate the faithfulness on a scale from 0.0 to 1.0:
- 1.0: Completely faithful - response only uses information from context
Expand All @@ -62,11 +62,7 @@ Provide your response in this exact JSON format:
Examples of faithful vs unfaithful responses:
{examples}

Now evaluate the response above.`
.replace('${context}', '{context}')
.replace('${response}', '{response}')
.replace('${rubric}', '{rubric}')
.replace('${examples}', '{examples}'),
Now evaluate the response above.`,

responseFormat: `{
"score": <number between 0.0 and 1.0>,
Expand All @@ -91,10 +87,10 @@ Your task is to determine if the response:
4. Matches the expected response type (e.g., answer vs question)`,

user: `User Intent:
\({intent}\)
{intent}

Assistant Response:
\({response}\)
{response}

Rate the relevance on a scale from 0.0 to 1.0:
- 1.0: Perfectly relevant - directly and completely addresses intent
Expand All @@ -109,11 +105,7 @@ Rate the relevance on a scale from 0.0 to 1.0:
Provide your response in this exact JSON format:
{responseFormat}

Now evaluate the response above.`
.replace('${intent}', '{intent}')
.replace('${response}', '{response}')
.replace('${rubric}', '{rubric}')
.replace('${responseFormat}', '{responseFormat}'),
Now evaluate the response above.`,

responseFormat: `{
"score": <number between 0.0 and 1.0>,
Expand All @@ -137,9 +129,9 @@ Your task is to determine if:
3. The tool was used in the right context
4. The tool result was interpreted correctly`,

user: `Expected Tool: \({expected_tool}\)
Actual Tool: \({actual_tool}\)
Arguments: \({arguments}\)
user: `Expected Tool: {expected_tool}
Actual Tool: {actual_tool}
Arguments: {arguments}

{rubric}

Expand All @@ -152,14 +144,9 @@ Rate the tool correctness on a scale from 0.0 to 1.0:
- 0.0: Terrible - completely wrong tool and usage

Provide your response in this exact JSON format:
\({responseFormat}\)
{responseFormat}

Now evaluate the tool usage above.`
.replace('${expected_tool}', '{expected_tool}')
.replace('${actual_tool}', '{actual_tool}')
.replace('${arguments}', '{arguments}')
.replace('${rubric}', '{rubric}')
.replace('${responseFormat}', '{responseFormat}'),
Now evaluate the tool usage above.`,

responseFormat: `{
"score": <number between 0.0 and 1.0>,
Expand Down Expand Up @@ -187,13 +174,13 @@ Your task is to provide a holistic quality assessment considering:
6. Proper use of tools (if applicable)`,

user: `Context:
\({context}\)
{context}

User Intent:
\({intent}\)
{intent}

Assistant Response:
\({response}\)
{response}

{rubric}

Expand All @@ -206,14 +193,9 @@ Rate the overall quality on a scale from 0.0 to 1.0:
- 0.0: Unacceptable - completely inadequate

Provide your response in this exact JSON format:
\({responseFormat}\)

Now evaluate the response above.`
.replace('${context}', '{context}')
.replace('${intent}', '{intent}')
.replace('${response}', '{response}')
.replace('${rubric}', '{rubric}')
.replace('${responseFormat}', '{responseFormat}'),
{responseFormat}

Now evaluate the response above.`,

responseFormat: `{
"score": <number between 0.0 and 1.0>,
Expand Down
4 changes: 3 additions & 1 deletion packages/latency/package.json
Original file line number Diff line number Diff line change
Expand Up @@ -14,7 +14,9 @@
}
},
"sideEffects": false,
"files": ["dist"],
"files": [
"dist"
],
"publishConfig": {
"access": "public"
},
Expand Down
4 changes: 3 additions & 1 deletion packages/mcp-server/package.json
Original file line number Diff line number Diff line change
Expand Up @@ -14,7 +14,9 @@
}
},
"sideEffects": false,
"files": ["dist"],
"files": [
"dist"
],
"publishConfig": {
"access": "public"
},
Expand Down
4 changes: 3 additions & 1 deletion packages/observability/package.json
Original file line number Diff line number Diff line change
Expand Up @@ -14,7 +14,9 @@
}
},
"sideEffects": false,
"files": ["dist"],
"files": [
"dist"
],
"publishConfig": {
"access": "public"
},
Expand Down
4 changes: 3 additions & 1 deletion packages/suite/package.json
Original file line number Diff line number Diff line change
Expand Up @@ -14,7 +14,9 @@
}
},
"sideEffects": false,
"files": ["dist"],
"files": [
"dist"
],
"publishConfig": {
"access": "public"
},
Expand Down
4 changes: 3 additions & 1 deletion packages/tool-use/package.json
Original file line number Diff line number Diff line change
Expand Up @@ -14,7 +14,9 @@
}
},
"sideEffects": false,
"files": ["dist"],
"files": [
"dist"
],
"publishConfig": {
"access": "public"
},
Expand Down
4 changes: 3 additions & 1 deletion packages/trajectory/package.json
Original file line number Diff line number Diff line change
Expand Up @@ -14,7 +14,9 @@
}
},
"sideEffects": false,
"files": ["dist"],
"files": [
"dist"
],
"publishConfig": {
"access": "public"
},
Expand Down
4 changes: 3 additions & 1 deletion packages/types/package.json
Original file line number Diff line number Diff line change
Expand Up @@ -14,7 +14,9 @@
}
},
"sideEffects": false,
"files": ["dist"],
"files": [
"dist"
],
"publishConfig": {
"access": "public"
},
Expand Down
Loading
Loading