-
Notifications
You must be signed in to change notification settings - Fork 1.8k
fix(knowledge-graph): deduplication, validation, GC, and locking (6 fixes) #813
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
Changes from all commits
20fc5d2
511f335
671d5cc
bf4e57d
1b39a37
ba4dcef
File filter
Filter by extension
Conversations
Jump to
Diff view
Diff view
There are no files selected for viewing
| Original file line number | Diff line number | Diff line change |
|---|---|---|
|
|
@@ -7,7 +7,7 @@ import type { | |
| CompressedObservation, | ||
| MemoryProvider, | ||
| } from "../types.js"; | ||
| import { KV, generateId } from "../state/schema.js"; | ||
| import { KV, generateId, jaccardSimilarity } from "../state/schema.js"; | ||
| import type { StateKV } from "../state/kv.js"; | ||
| import { | ||
| GRAPH_EXTRACTION_SYSTEM, | ||
|
|
@@ -399,6 +399,9 @@ function parseGraphXml( | |
| const type = attrs["type"] as GraphNode["type"] | undefined; | ||
| const name = attrs["name"]; | ||
| if (!type || !name) return; | ||
| const VALID_TYPES = new Set(["file", "function", "concept", "error", "decision", "pattern", "library", "person"]); | ||
| // Must match types listed in GRAPH_EXTRACTION_SYSTEM prompt — update both together | ||
| if (!VALID_TYPES.has(type)) return; // skip corrupt/invalid types from LLM | ||
| const properties: Record<string, string> = {}; | ||
| const propRegex = /<property\s+key="([^"]+)">([^<]*)<\/property>/g; | ||
| let propMatch; | ||
|
|
@@ -490,6 +493,9 @@ export function registerGraphFunction( | |
| let newNodeCount = 0; | ||
| let newEdgeCount = 0; | ||
| const newEdgesForTopCheck: GraphEdge[] = []; | ||
| // #813: cache all nodes once for fuzzy dedup, avoiding per-node kv.list calls | ||
| const allNodes = await kv.list<GraphNode>(KV.graphNodes); | ||
|
Comment on lines
+496
to
+497
Contributor
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Avoid reintroducing a full graph scan in Line 497 brings back the exact 🤖 Prompt for AI Agents |
||
| const nodeIdMap = new Map<string, string>(); | ||
|
Comment on lines
+496
to
+498
Contributor
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Keep the fuzzy-dedup candidate cache in sync within the same extraction run.
Suggested fix } else {
await Promise.all([
kv.set(KV.graphNodes, node.id, node),
kv.set(KV.graphNameIndex, indexKey, node.id),
kv.set(KV.graphNodeDegree, node.id, 0),
]);
+ allNodes.push(node);
snap.stats.totalNodes += 1;
snap.stats.nodesByType[node.type] =
(snap.stats.nodesByType[node.type] ?? 0) + 1;
newNodeCount += 1;
nodeIdMap.set(node.id, node.id);Also applies to: 536-545 🤖 Prompt for AI Agents |
||
|
|
||
| for (const node of nodes) { | ||
| const indexKey = nameIndexKey(node.type, node.name); | ||
|
|
@@ -501,8 +507,20 @@ export function registerGraphFunction( | |
| let existing: GraphNode | null = null; | ||
| if (existingId) { | ||
| existing = await kv.get<GraphNode>(KV.graphNodes, existingId); | ||
| // #813: skip stale nodes — they shouldn't merge | ||
| if (existing?.stale) existing = null; | ||
| } | ||
|
|
||
| // #813: Fuzzy dedup via Jaccard similarity (if exact match failed) | ||
| if (!existing && node.name.length >= 3) { | ||
| existing = allNodes.find( | ||
| (n) => | ||
| !n.stale && | ||
| n.type === node.type && | ||
| n.name.length >= 3 && | ||
| jaccardSimilarity(n.name.toLowerCase(), node.name.toLowerCase()) > 0.8, | ||
| ) ?? null; | ||
| } | ||
| if (existing) { | ||
| const merged = mergeNode(existing, node, obsIds, capturedAt); | ||
| await kv.set(KV.graphNodes, existing.id, merged); | ||
|
|
@@ -512,14 +530,19 @@ export function registerGraphFunction( | |
| (n) => n.id === existing!.id, | ||
| ); | ||
| if (topIdx !== -1) snap.topNodes[topIdx] = merged; | ||
| // #813: track mapping from parsed ID -> persisted ID | ||
| nodeIdMap.set(node.id, existing.id); | ||
| } else { | ||
| await kv.set(KV.graphNodes, node.id, node); | ||
| await kv.set(KV.graphNameIndex, indexKey, node.id); | ||
| await kv.set(KV.graphNodeDegree, node.id, 0); | ||
| await Promise.all([ | ||
| kv.set(KV.graphNodes, node.id, node), | ||
| kv.set(KV.graphNameIndex, indexKey, node.id), | ||
| kv.set(KV.graphNodeDegree, node.id, 0), | ||
| ]); | ||
| snap.stats.totalNodes += 1; | ||
| snap.stats.nodesByType[node.type] = | ||
| (snap.stats.nodesByType[node.type] ?? 0) + 1; | ||
| newNodeCount += 1; | ||
| nodeIdMap.set(node.id, node.id); | ||
| if (snap.topNodes.length < SNAPSHOT_TOP_NODES) { | ||
| // Degree 0 still beats an empty slot — sit at the tail | ||
| // until edges arrive and promote. | ||
|
|
@@ -530,6 +553,20 @@ export function registerGraphFunction( | |
| } | ||
|
|
||
| for (const edge of edges) { | ||
| // #813: remap transient parsed node IDs to persisted node IDs | ||
| const srcId = nodeIdMap.get(edge.sourceNodeId); | ||
| const tgtId = nodeIdMap.get(edge.targetNodeId); | ||
| if (!srcId || !tgtId) { | ||
| logger.warn("Edge references unknown node, skipping", { | ||
| edgeId: edge.id, | ||
| sourceNodeId: edge.sourceNodeId, | ||
| targetNodeId: edge.targetNodeId, | ||
| }); | ||
| continue; | ||
| } | ||
| edge.sourceNodeId = srcId; | ||
| edge.targetNodeId = tgtId; | ||
|
|
||
| const eKey = edgeIndexKey( | ||
| edge.sourceNodeId, | ||
| edge.targetNodeId, | ||
|
|
@@ -551,8 +588,10 @@ export function registerGraphFunction( | |
| ); | ||
| if (topIdx !== -1) snap.topEdges[topIdx] = merged; | ||
| } else { | ||
| await kv.set(KV.graphEdges, edge.id, edge); | ||
| await kv.set(KV.graphEdgeKey, eKey, edge.id); | ||
| await Promise.all([ | ||
| kv.set(KV.graphEdges, edge.id, edge), | ||
| kv.set(KV.graphEdgeKey, eKey, edge.id), | ||
| ]); | ||
| snap.stats.totalEdges += 1; | ||
| snap.stats.edgesByType[edge.type] = | ||
| (snap.stats.edgesByType[edge.type] ?? 0) + 1; | ||
|
|
||
Uh oh!
There was an error while loading. Please reload this page.