-
Notifications
You must be signed in to change notification settings - Fork 0
Expand file tree
/
Copy pathtask.eval.js
More file actions
152 lines (125 loc) · 5.21 KB
/
Copy pathtask.eval.js
File metadata and controls
152 lines (125 loc) · 5.21 KB
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
import { execFileSync } from "node:child_process";
import { readFileSync } from "node:fs";
import { describe, it, prompt } from "katt";
// "owner/repo" slug captured from a github.com URL found in package.json.
// Must be initialised first: getExpectedIssues() reads REPO when it is called below.
const REPO = getRepositorySlug();
// Reference answer for the eval, built once at module load from `gh issue list` output.
const EXPECTED_RESULT = { issues: getExpectedIssues() };
/**
 * Resolves the GitHub "owner/repo" slug from the package.json located next to
 * this file.
 *
 * Candidates are tried in order and the first one that yields a slug wins:
 *   1. `repository` — its `url` property, or the field itself when it is a plain
 *      string (npm accepts a URL as well as the "owner/repo" and
 *      "github:owner/repo" shorthands there).
 *   2. `homepage`.
 *
 * @returns {string} The slug, e.g. "owner/repo".
 * @throws {Error} When none of the candidates contains a GitHub slug.
 */
function getRepositorySlug() {
  const packageJson = JSON.parse(readFileSync(new URL("./package.json", import.meta.url), "utf8"));
  const { repository, homepage } = packageJson;

  // Matches https, git+https and ssh style github.com URLs, with optional ".git" and "#fragment".
  const githubUrlPattern = /github\.com[/:]([^/]+\/[^/.#]+)(?:\.git)?(?:#.*)?$/;
  // Matches the npm shorthands "owner/repo" and "github:owner/repo" (other hosts use a different prefix).
  const shorthandPattern = /^(?:github:)?([^/:@\s]+\/[^/.#\s]+)$/;

  const candidates = [];
  if (typeof repository === "string") {
    candidates.push({ value: repository, allowShorthand: true });
  } else if (repository?.url != null) {
    candidates.push({ value: repository.url, allowShorthand: false });
  }
  if (homepage != null) {
    // The shorthand form is never applied to homepage: a path such as "example.com/docs" would look like a slug.
    candidates.push({ value: homepage, allowShorthand: false });
  }

  for (const { value, allowShorthand } of candidates) {
    const text = String(value);
    const match = text.match(githubUrlPattern) ?? (allowShorthand ? text.match(shorthandPattern) : null);
    if (match) {
      return match[1];
    }
  }

  throw new Error(`Could not determine GitHub repository slug from package.json: ${candidates[0]?.value}`);
}
/**
 * Builds the reference list of issues by asking the `gh` CLI for issues of
 * REPO (any state, at most 20) and keeping the five newest.
 *
 * Each entry has the keys `id`, `title`, `labels` (label names) and
 * `created_at`, in that order. Entries are ordered newest first; equal
 * creation timestamps are ordered by descending issue number.
 *
 * @returns {Array<{id: number, title: string, labels: string[], created_at: string}>}
 * @throws {Error} When fewer than five issues are available.
 */
function getExpectedIssues() {
  const ghArguments = ["issue", "list", "-R", REPO, "--limit", "20", "--state", "all", "--json", "number,title,createdAt,labels"];
  const stdout = execFileSync("gh", ghArguments, { encoding: "utf8" });

  // Map the gh field names onto the key names (and key order) used by the eval.
  const toExpectedShape = ({ number, title, labels, createdAt }) => ({
    id: number,
    title,
    labels: labels.map(({ name }) => name),
    created_at: createdAt
  });

  // Newest first; ties on the timestamp are broken by the higher issue number.
  const newestFirst = (left, right) => {
    const byCreation = Date.parse(right.created_at) - Date.parse(left.created_at);
    if (byCreation !== 0) {
      return byCreation;
    }
    return right.id - left.id;
  };

  const normalized = JSON.parse(stdout).map(toExpectedShape);
  normalized.sort(newestFirst);
  const issues = normalized.slice(0, 5);

  if (issues.length === 5) {
    return issues;
  }
  throw new Error(`Expected at least 5 issues from ${REPO}, got ${issues.length}`);
}
/**
 * Parses the raw model output and checks that it has exactly the expected shape:
 * a JSON object whose only key is "issues", holding an array of exactly five
 * issue objects (see validateIssueShape for the per-issue rules).
 *
 * @param {string} result - Raw text returned by the model.
 * @returns {{issues: Array<{id: number, title: string, labels: string[], created_at: string}>}}
 *   The parsed object, unchanged.
 * @throws {Error} With a descriptive message on the first violation found. When
 *   the text is not valid JSON, the original parse error is attached as `cause`.
 */
function parseAndValidateResult(result) {
  let parsed;
  try {
    parsed = JSON.parse(result);
  } catch (error) {
    // Keep the underlying SyntaxError reachable for debugging instead of only stringifying it.
    throw new Error(`Result was not valid JSON.\nRaw result:\n${result}\nParse error: ${String(error)}`, {
      cause: error
    });
  }

  if (!isJsonObject(parsed)) {
    throw new Error(`Result must be a JSON object.\nRaw result:\n${result}`);
  }

  // Keys are sorted before comparing, so the check does not depend on key order.
  const topLevelKeys = Object.keys(parsed).sort();
  if (JSON.stringify(topLevelKeys) !== JSON.stringify(["issues"])) {
    throw new Error(`Result must contain only the top-level key "issues". Received keys: ${topLevelKeys.join(", ")}`);
  }
  if (!Array.isArray(parsed.issues)) {
    throw new Error(`"issues" must be an array.\nRaw result:\n${result}`);
  }
  if (parsed.issues.length !== 5) {
    throw new Error(`"issues" must contain exactly 5 items. Received ${parsed.issues.length}.\nRaw result:\n${result}`);
  }

  parsed.issues.forEach((issue, index) => validateIssueShape(issue, index, result));
  return parsed;
}

/** Reports whether `value` is a non-null, non-array object. */
function isJsonObject(value) {
  return Boolean(value) && !Array.isArray(value) && typeof value === "object";
}

/**
 * Throws unless `issue` is an object with exactly the keys created_at, id, labels
 * and title, where id is an integer, title a string, labels an array of strings
 * and created_at a string that Date.parse can read.
 *
 * @param {unknown} issue - Candidate entry of the "issues" array.
 * @param {number} index - Position of the entry, used in error messages.
 * @param {string} result - Raw model output, echoed in error messages.
 * @throws {Error} On the first rule the entry violates.
 */
function validateIssueShape(issue, index, result) {
  if (!isJsonObject(issue)) {
    throw new Error(`Issue at index ${index} must be an object.\nRaw result:\n${result}`);
  }

  const issueKeys = Object.keys(issue).sort();
  const expectedKeys = ["created_at", "id", "labels", "title"];
  if (JSON.stringify(issueKeys) !== JSON.stringify(expectedKeys)) {
    throw new Error(
      `Issue at index ${index} must contain only ${expectedKeys.join(", ")}. Received keys: ${issueKeys.join(", ")}.\nRaw result:\n${result}`
    );
  }
  if (!Number.isInteger(issue.id)) {
    throw new Error(`Issue at index ${index} has a non-integer "id".\nRaw result:\n${result}`);
  }
  if (typeof issue.title !== "string") {
    throw new Error(`Issue at index ${index} has a non-string "title".\nRaw result:\n${result}`);
  }
  if (!Array.isArray(issue.labels) || issue.labels.some((label) => typeof label !== "string")) {
    throw new Error(`Issue at index ${index} must have a string array in "labels".\nRaw result:\n${result}`);
  }
  if (typeof issue.created_at !== "string" || Number.isNaN(Date.parse(issue.created_at))) {
    throw new Error(`Issue at index ${index} has an invalid "created_at".\nRaw result:\n${result}`);
  }
}
/**
 * Validates the raw model output and asserts that it lists the same issues, in
 * the same order, as EXPECTED_RESULT.
 *
 * Both sides are rebuilt with one fixed key order before being serialised, so
 * the comparison depends on the values only and not on the order in which the
 * model happened to emit the keys of each issue object.
 *
 * @param {string} result - Raw text returned by the model.
 * @throws {Error} When the output is malformed (see parseAndValidateResult) or
 *   differs from the expected issues.
 */
function assertMatchesLiveIssues(result) {
  const parsed = parseAndValidateResult(result);

  // JSON.stringify emits keys in insertion order; normalise it on both sides.
  const canonicalize = ({ issues }) => ({
    issues: issues.map(({ id, title, labels, created_at }) => ({ id, title, labels, created_at }))
  });

  const actual = JSON.stringify(canonicalize(parsed));
  const expected = JSON.stringify(canonicalize(EXPECTED_RESULT));
  if (actual !== expected) {
    throw new Error(
      [
        `Result did not match the 5 most recent live issues from ${REPO}.`,
        `Expected: ${expected}`,
        `Actual: ${actual}`
      ].join("\n")
    );
  }
}
describe("Issues listing", () => {
  // The identical eval is registered three times so that prompt stability can be
  // tracked for a stochastic model.
  const instruction = 'Follow the instructions in "task.prompt.md"';
  const model = "gpt-5-mini";

  for (const attempt of [1, 2, 3]) {
    it(`should list the issues - ${attempt}`, async () => {
      // A fresh options object per call, exactly as each hand-written case had.
      const result = await prompt(instruction, { model });
      assertMatchesLiveIssues(result);
    });
  }
});