tirth8205 · otalrapha · Jun 8, 2026
diff --git a/code_review_graph/parser.py b/code_review_graph/parser.py
@@ -142,6 +142,14 @@ class EdgeInfo:
     ".v": "verilog",
     ".vh": "verilog",
     ".sql": "sql",
+    # Knowledge formats (.md/.yaml): no tree-sitter grammar bundled, so they use
+    # regex-based extraction (same approach as ReScript). This lets documentation and
+    # config live in the same graph as code, linkable via CONTAINS/REFERENCES edges.
+    ".md": "markdown",
+    ".mdx": "markdown",
+    ".qmd": "markdown",
+    ".yaml": "yaml",
+    ".yml": "yaml",
 }
 
 # Shebang interpreter → language mapping for extension-less Unix scripts.
@@ -939,6 +947,12 @@ def parse_bytes(self, path: Path, source: bytes) -> tuple[list[NodeInfo], list[E
         if language == "sql":
             return self._parse_sql(path, source)
 
+        # Knowledge formats: regex-based extraction (no tree-sitter grammar bundled).
+        if language == "markdown":
+            return self._parse_markdown(path, source)
+        if language == "yaml":
+            return self._parse_yaml(path, source)
+
         parser = self._get_parser(language)
         if not parser:
             return [], []
@@ -1553,6 +1567,138 @@ def _parse_databricks_py_notebook(
     # is bundled for ReScript, so we extract best-effort structure via
     # comment-stripping + line-anchored regex + brace-counted module scan).
     # ------------------------------------------------------------------
+    # Knowledge extractors (regex-based, no tree-sitter grammar): .md and .yaml.
+    # Markdown headings -> Section nodes; nesting -> CONTAINS; links -> REFERENCES.
+    # YAML top-level keys / registry entries -> nodes, so config and docs share the
+    # graph with code via the same edge kinds.
+    # ------------------------------------------------------------------
+
+    def _parse_markdown(
+        self, path: Path, source: bytes,
+    ) -> tuple[list[NodeInfo], list[EdgeInfo]]:
+        """Markdown: File + Section(heading) nodes; CONTAINS (nesting) + REFERENCES (links)."""
+        text = source.decode("utf-8", errors="replace")
+        fp = str(path)
+        stem = path.stem
+        lines = text.splitlines()
+        nodes: list[NodeInfo] = [
+            NodeInfo(
+                kind="File", name=path.name, file_path=fp,
+                line_start=1, line_end=max(1, len(lines)), language="markdown",
+            )
+        ]
+        edges: list[EdgeInfo] = []
+        h_re = re.compile(r"^(#{1,6})\s+(.+)")
+        bt_re = re.compile(r"`([^`\n]+)`")
+        wl_re = re.compile(r"\[\[([^\]\|]+)")
+        ml_re = re.compile(r"\[[^\]]+\]\(([^)]+)\)")
+        stack: list[tuple[int, str]] = []
+        seen_sec: set[str] = set()
+        seen_ref: set[str] = set()
+        in_code = False
+        for i, line in enumerate(lines, 1):
+            s = line.strip()
+            if s.startswith("```"):
+                in_code = not in_code
+                continue
+            if in_code:
+                continue
+            m = h_re.match(line)
+            if m:
+                level = len(m.group(1))
+                title = m.group(2).strip()
+                qn = f"{stem}#{title}"
+                if qn in seen_sec:
+                    qn = f"{qn}@L{i}"
+                seen_sec.add(qn)
+                while stack and stack[-1][0] >= level:
+                    stack.pop()
+                parent = stack[-1][1] if stack else path.name
+                nodes.append(
+                    NodeInfo(
+                        kind="Section", name=qn, file_path=fp, line_start=i,
+                        line_end=i, language="markdown", parent_name=parent,
+                    )
+                )
+                edges.append(
+                    EdgeInfo(kind="CONTAINS", source=parent, target=qn, file_path=fp, line=i)
+                )
+                stack.append((level, qn))
+                continue
+            for tok in bt_re.findall(line) + wl_re.findall(line) + ml_re.findall(line):
+                tok_s = tok.strip()
+                t = tok_s.split("#")[0].rsplit("/", 1)[-1]
+                t = t[:-3] if t.endswith(".md") else t
+                looks_like = bool(re.fullmatch(r"[\w.\-]{3,60}", tok_s))
+                if (tok_s.endswith(".md") or looks_like) and t and t != stem and t not in seen_ref:
+                    seen_ref.add(t)
+                    edges.append(
+                        EdgeInfo(
+                            kind="REFERENCES", source=path.name, target=t,
+                            file_path=fp, line=i,
+                        )
+                    )
+        return nodes, edges
+
+    def _parse_yaml(
+        self, path: Path, source: bytes,
+    ) -> tuple[list[NodeInfo], list[EdgeInfo]]:
+        """YAML (regex): File + Section(top-key) + Type(entry) nodes + CONTAINS edges."""
+        text = source.decode("utf-8", errors="replace")
+        fp = str(path)
+        stem = path.stem
+        lines = text.splitlines()
+        nodes: list[NodeInfo] = [
+            NodeInfo(
+                kind="File", name=path.name, file_path=fp,
+                line_start=1, line_end=max(1, len(lines)), language="yaml",
+            )
+        ]
+        edges: list[EdgeInfo] = []
+        topkey_re = re.compile(r"^([A-Za-z_][\w-]*):")
+        entry_re = re.compile(r"""^\s*-?\s*(?:id|name):\s*["']?([\w./-]{2,60})""")
+        cur_top = path.name
+        seen: set[str] = set()
+        for i, line in enumerate(lines, 1):
+            mt = topkey_re.match(line)
+            if mt:
+                key = mt.group(1)
+                qn = f"{stem}:{key}"
+                if qn not in seen:
+                    seen.add(qn)
+                    nodes.append(
+                        NodeInfo(
+                            kind="Section", name=qn, file_path=fp, line_start=i,
+                            line_end=i, language="yaml", parent_name=path.name,
+                        )
+                    )
+                    edges.append(
+                        EdgeInfo(
+                            kind="CONTAINS", source=path.name, target=qn,
+                            file_path=fp, line=i,
+                        )
+                    )
+                    cur_top = qn
+                continue
+            me = entry_re.match(line)
+            if me:
+                eid = me.group(1)
+                qn = f"{stem}::{eid}"
+                if qn not in seen:
+                    seen.add(qn)
+                    nodes.append(
+                        NodeInfo(
+                            kind="Type", name=qn, file_path=fp, line_start=i,
+                            line_end=i, language="yaml", parent_name=cur_top,
+                        )
+                    )
+                    edges.append(
+                        EdgeInfo(
+                            kind="CONTAINS", source=cur_top, target=qn,
+                            file_path=fp, line=i,
+                        )
+                    )
+        return nodes, edges
 
     def _parse_rescript(
         self, path: Path, source: bytes,

diff --git a/tests/test_knowledge_extractors.py b/tests/test_knowledge_extractors.py
@@ -0,0 +1,63 @@
+"""Tests for the Markdown and YAML knowledge extractors.
+
+These formats have no bundled tree-sitter grammar, so they are parsed with
+regex-based extractors (same approach as ReScript / SQL CREATE PROCEDURE).
+Markdown headings and YAML registry entries become graph nodes, letting the
+graph link documentation/config to code via the shared CONTAINS/REFERENCES
+edge kinds.
+"""
+import tempfile
+from pathlib import Path
+
+from code_review_graph.parser import CodeParser
+
+
+class TestKnowledgeExtractors:
+    def setup_method(self):
+        self.parser = CodeParser()
+
+    def _write(self, suffix: str, content: str) -> Path:
+        d = Path(tempfile.mkdtemp())
+        p = d / f"sample{suffix}"
+        p.write_text(content, encoding="utf-8")
+        return p
+
+    def test_markdown_headings_become_nodes(self):
+        p = self._write(".md", "# Title\n## Section A\n### Sub\n## Section B\n")
+        nodes, edges = self.parser.parse_file(p)
+        kinds = {n.kind for n in nodes}
+        assert "File" in kinds
+        assert "Section" in kinds
+        # 1 File + 4 headings
+        assert sum(1 for n in nodes if n.kind == "Section") == 4
+        contains = [e for e in edges if e.kind == "CONTAINS"]
+        assert len(contains) == 4
+
+    def test_markdown_nesting_and_code_fence(self):
+        p = self._write(".md", "# Top\n## Child\n```\n# not a heading\n```\n")
+        nodes, edges = self.parser.parse_file(p)
+        labels = {n.name for n in nodes if n.kind == "Section"}
+        # fenced "# not a heading" must NOT become a node
+        assert not any("not a heading" in lbl for lbl in labels)
+        # Child nests under Top, not the file
+        nest = [
+            e for e in edges
+            if e.kind == "CONTAINS" and "Top" in e.source and "Child" in e.target
+        ]
+        assert nest
+
+    def test_markdown_references(self):
+        p = self._write(".md", "# Doc\nSee `other_doc.md` and [link](target.md).\n")
+        _, edges = self.parser.parse_file(p)
+        refs = {e.target for e in edges if e.kind == "REFERENCES"}
+        assert "other_doc" in refs
+        assert "target" in refs
+
+    def test_yaml_top_keys_and_entries(self):
+        p = self._write(".yaml", "crons:\n  - id: CRON-1\n    name: alpha\n  - id: CRON-2\n")
+        nodes, edges = self.parser.parse_file(p)
+        names = {n.name for n in nodes}
+        assert any(n.kind == "File" for n in nodes)
+        assert any("crons" in nm for nm in names)          # top-level key node
+        assert any("CRON-1" in nm for nm in names)         # registry entry node
+        assert [e for e in edges if e.kind == "CONTAINS"]