Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
146 changes: 146 additions & 0 deletions code_review_graph/parser.py
Original file line number Diff line number Diff line change
Expand Up @@ -142,6 +142,14 @@ class EdgeInfo:
".v": "verilog",
".vh": "verilog",
".sql": "sql",
# Knowledge formats (.md/.yaml): no tree-sitter grammar bundled, so they use
# regex-based extraction (same approach as ReScript). This lets documentation and
# config live in the same graph as code, linkable via CONTAINS/REFERENCES edges.
".md": "markdown",
".mdx": "markdown",
".qmd": "markdown",
".yaml": "yaml",
".yml": "yaml",
}

# Shebang interpreter → language mapping for extension-less Unix scripts.
Expand Down Expand Up @@ -939,6 +947,12 @@ def parse_bytes(self, path: Path, source: bytes) -> tuple[list[NodeInfo], list[E
if language == "sql":
return self._parse_sql(path, source)

# Knowledge formats: regex-based extraction (no tree-sitter grammar bundled).
if language == "markdown":
return self._parse_markdown(path, source)
if language == "yaml":
return self._parse_yaml(path, source)

parser = self._get_parser(language)
if not parser:
return [], []
Expand Down Expand Up @@ -1553,6 +1567,138 @@ def _parse_databricks_py_notebook(
# is bundled for ReScript, so we extract best-effort structure via
# comment-stripping + line-anchored regex + brace-counted module scan).
# ------------------------------------------------------------------
# Knowledge extractors (regex-based, no tree-sitter grammar): .md and .yaml.
# Markdown headings -> Section nodes; nesting -> CONTAINS; links -> REFERENCES.
# YAML top-level keys / registry entries -> nodes, so config and docs share the
# graph with code via the same edge kinds.
# ------------------------------------------------------------------

def _parse_markdown(
    self, path: Path, source: bytes,
) -> tuple[list[NodeInfo], list[EdgeInfo]]:
    """Extract a best-effort structural graph from a Markdown document.

    No tree-sitter grammar is used; the document is scanned line by line
    with regular expressions.

    Nodes:
        * one ``File`` node named after ``path.name``;
        * one ``Section`` node per ATX heading (``#`` .. ``######``), named
          ``"<stem>#<title>"``. A repeated title gets an ``@L<line>`` suffix
          so node names stay unique within the file.

    Edges:
        * ``CONTAINS`` from the nearest enclosing shallower heading (or the
          file when there is none) to each section;
        * ``REFERENCES`` from the file to every distinct target found in
          inline code spans, ``[[wiki links]]`` and ``[text](target)`` links
          that either points at a ``.md`` file or looks like a bare
          identifier (3-60 word / ``.`` / ``-`` characters).

    Lines inside fenced code blocks (backtick or tilde fences) are ignored,
    and heading lines themselves are not scanned for references.

    Args:
        path: Location of the document; used for node names and file paths.
        source: Raw file content. Decoded as UTF-8 with replacement of
            invalid bytes.

    Returns:
        A ``(nodes, edges)`` tuple.
    """
    text = source.decode("utf-8", errors="replace")
    fp = str(path)
    stem = path.stem
    lines = text.splitlines()
    nodes: list[NodeInfo] = [
        NodeInfo(
            kind="File", name=path.name, file_path=fp,
            line_start=1, line_end=max(1, len(lines)), language="markdown",
        )
    ]
    edges: list[EdgeInfo] = []
    h_re = re.compile(r"^(#{1,6})\s+(.+)")
    # Optional ATX closing sequence: whitespace followed by a run of '#'.
    closing_re = re.compile(r"\s+#+\s*$")
    # A fence is a run of three or more backticks or tildes.
    fence_re = re.compile(r"^(`{3,}|~{3,})")
    bt_re = re.compile(r"`([^`\n]+)`")
    wl_re = re.compile(r"\[\[([^\]\|]+)")
    ml_re = re.compile(r"\[[^\]]+\]\(([^)]+)\)")
    ident_re = re.compile(r"[\w.\-]{3,60}")
    stack: list[tuple[int, str]] = []
    seen_sec: set[str] = set()
    seen_ref: set[str] = set()
    # Marker that opened the current fenced block; empty when outside one.
    open_fence = ""
    for i, line in enumerate(lines, 1):
        s = line.strip()
        fm = fence_re.match(s)
        if fm:
            marker = fm.group(1)
            if not open_fence:
                open_fence = marker
            elif marker[0] == open_fence[0] and len(marker) >= len(open_fence):
                # Only a fence of the same character, at least as long as
                # the opener, closes the block; anything else is content.
                open_fence = ""
            continue
        if open_fence:
            continue
        m = h_re.match(line)
        if m:
            level = len(m.group(1))
            # Drop a trailing "##" closing sequence so "## A ##" and "## A"
            # produce the same section title.
            title = closing_re.sub("", m.group(2).strip())
            qn = f"{stem}#{title}"
            if qn in seen_sec:
                qn = f"{qn}@L{i}"
            seen_sec.add(qn)
            # Pop siblings and deeper headings so the stack top is the parent.
            while stack and stack[-1][0] >= level:
                stack.pop()
            parent = stack[-1][1] if stack else path.name
            nodes.append(
                NodeInfo(
                    kind="Section", name=qn, file_path=fp, line_start=i,
                    line_end=i, language="markdown", parent_name=parent,
                )
            )
            edges.append(
                EdgeInfo(kind="CONTAINS", source=parent, target=qn, file_path=fp, line=i)
            )
            stack.append((level, qn))
            continue
        for tok in bt_re.findall(line) + wl_re.findall(line) + ml_re.findall(line):
            tok_s = tok.strip()
            # Separate the document path from any "#anchor" suffix so that
            # "guide.md#setup" is still recognised as a link to guide.md.
            doc_path = tok_s.split("#")[0]
            t = doc_path.rsplit("/", 1)[-1]
            t = t[:-3] if t.endswith(".md") else t
            is_md = tok_s.endswith(".md") or doc_path.endswith(".md")
            looks_like = bool(ident_re.fullmatch(tok_s))
            if (is_md or looks_like) and t and t != stem and t not in seen_ref:
                seen_ref.add(t)
                edges.append(
                    EdgeInfo(
                        kind="REFERENCES", source=path.name, target=t,
                        file_path=fp, line=i,
                    )
                )
    return nodes, edges

def _parse_yaml(
    self, path: Path, source: bytes,
) -> tuple[list[NodeInfo], list[EdgeInfo]]:
    """Extract a best-effort structural graph from a YAML document via regex.

    Produces one ``File`` node, a ``Section`` node (``"<stem>:<key>"``) for
    each distinct key that starts in column 0, and a ``Type`` node
    (``"<stem>::<value>"``) for each distinct ``id:`` / ``name:`` value found
    on any other line. Every Section is linked from the file, and every Type
    from the most recently seen top-level key (or the file if none was seen
    yet), with a ``CONTAINS`` edge.

    Args:
        path: Location of the document; used for node names and file paths.
        source: Raw file content. Decoded as UTF-8 with replacement of
            invalid bytes.

    Returns:
        A ``(nodes, edges)`` tuple.
    """
    file_path = str(path)
    file_name = path.name
    stem = path.stem
    rows = source.decode("utf-8", errors="replace").splitlines()

    top_key_pat = re.compile(r"^([A-Za-z_][\w-]*):")
    entry_pat = re.compile(r"""^\s*-?\s*(?:id|name):\s*["']?([\w./-]{2,60})""")

    nodes: list[NodeInfo] = [
        NodeInfo(
            kind="File", name=file_name, file_path=file_path,
            line_start=1, line_end=max(1, len(rows)), language="yaml",
        )
    ]
    edges: list[EdgeInfo] = []
    emitted: set[str] = set()

    def emit(kind: str, qualified: str, parent: str, lineno: int) -> None:
        # Record a node and its CONTAINS edge once per qualified name.
        if qualified in emitted:
            return
        emitted.add(qualified)
        nodes.append(
            NodeInfo(
                kind=kind, name=qualified, file_path=file_path, line_start=lineno,
                line_end=lineno, language="yaml", parent_name=parent,
            )
        )
        edges.append(
            EdgeInfo(
                kind="CONTAINS", source=parent, target=qualified,
                file_path=file_path, line=lineno,
            )
        )

    # Entries attach to the file itself until the first top-level key appears.
    section = file_name
    for lineno, row in enumerate(rows, start=1):
        top = top_key_pat.match(row)
        if top is not None:
            # A repeated top-level key still becomes the current parent,
            # even though no second node is emitted for it.
            section = f"{stem}:{top.group(1)}"
            emit("Section", section, file_name, lineno)
            continue
        entry = entry_pat.match(row)
        if entry is not None:
            emit("Type", f"{stem}::{entry.group(1)}", section, lineno)
    return nodes, edges

def _parse_rescript(
self, path: Path, source: bytes,
Expand Down
63 changes: 63 additions & 0 deletions tests/test_knowledge_extractors.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,63 @@
"""Tests for the Markdown and YAML knowledge extractors.

These formats have no bundled tree-sitter grammar, so they are parsed with
regex-based extractors (same approach as ReScript / SQL CREATE PROCEDURE).
Markdown headings and YAML registry entries become graph nodes, letting the
graph link documentation/config to code via the shared CONTAINS/REFERENCES
edge kinds.
"""
import tempfile
from pathlib import Path

from code_review_graph.parser import CodeParser


class TestKnowledgeExtractors:
    """Exercise the regex-based Markdown and YAML extractors via ``parse_file``."""

    def setup_method(self):
        # Build the parser first so a failing constructor cannot leave a
        # temporary directory behind.
        self.parser = CodeParser()
        self._tmp = tempfile.TemporaryDirectory()

    def teardown_method(self):
        # Remove the per-test scratch directory and everything written to it.
        self._tmp.cleanup()

    def _write(self, suffix: str, content: str) -> Path:
        """Write *content* to ``sample<suffix>`` in the scratch dir and return its path."""
        p = Path(self._tmp.name) / f"sample{suffix}"
        p.write_text(content, encoding="utf-8")
        return p

    def test_markdown_headings_become_nodes(self):
        p = self._write(".md", "# Title\n## Section A\n### Sub\n## Section B\n")
        nodes, edges = self.parser.parse_file(p)
        kinds = {n.kind for n in nodes}
        assert "File" in kinds
        assert "Section" in kinds
        # 1 File + 4 headings
        assert sum(1 for n in nodes if n.kind == "Section") == 4
        contains = [e for e in edges if e.kind == "CONTAINS"]
        assert len(contains) == 4

    def test_markdown_nesting_and_code_fence(self):
        p = self._write(".md", "# Top\n## Child\n```\n# not a heading\n```\n")
        nodes, edges = self.parser.parse_file(p)
        labels = {n.name for n in nodes if n.kind == "Section"}
        # fenced "# not a heading" must NOT become a node
        assert not any("not a heading" in lbl for lbl in labels)
        # Child nests under Top, not the file
        nest = [
            e for e in edges
            if e.kind == "CONTAINS" and "Top" in e.source and "Child" in e.target
        ]
        assert nest

    def test_markdown_references(self):
        p = self._write(".md", "# Doc\nSee `other_doc.md` and [link](target.md).\n")
        _, edges = self.parser.parse_file(p)
        refs = {e.target for e in edges if e.kind == "REFERENCES"}
        assert "other_doc" in refs
        assert "target" in refs

    def test_yaml_top_keys_and_entries(self):
        p = self._write(".yaml", "crons:\n - id: CRON-1\n name: alpha\n - id: CRON-2\n")
        nodes, edges = self.parser.parse_file(p)
        names = {n.name for n in nodes}
        assert any(n.kind == "File" for n in nodes)
        assert any("crons" in nm for nm in names)  # top-level key node
        assert any("CRON-1" in nm for nm in names)  # registry entry node
        assert [e for e in edges if e.kind == "CONTAINS"]