Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
327 changes: 327 additions & 0 deletions .bmad-core/scripts/validate_docs_consistency.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,327 @@
#!/usr/bin/env python3
"""
BMAD-METHOD Utility Script: validate_docs_consistency.py

Audits Markdown/HTML documentation for enterprise documentation consistency.

Checks covered:
- Broken local Markdown/HTML links and anchors.
- Prohibited emoji/pictograph characters in Markdown files (R-14).
- Common mojibake/encoding artifacts (R-03).
- Obsolete or forbidden stack references in docs (R-16/R-20).
- Missing bilingual counterpart files for common English/Spanish doc patterns (R-01).

The script is read-only and exits non-zero when issues are found.
"""

from __future__ import annotations

import argparse
import os
import re
import sys
import unicodedata
from dataclasses import dataclass
from pathlib import Path
from urllib.parse import unquote, urlparse

REPO_ROOT = Path(__file__).resolve().parents[2]
DEFAULT_TARGETS = [REPO_ROOT / "README.md", REPO_ROOT / "docs"]

SKIP_DIRS = {
".git",
".nx",
".venv",
"bin",
"build",
"dist",
"node_modules",
"obj",
}

LOCAL_LINK_RE = re.compile(r"(?<!!)\[[^\]]+\]\(([^)]+)\)|(?:href|src)=[\"']([^\"']+)[\"']", re.IGNORECASE)
HEADING_RE = re.compile(r"^(#{1,6})\s+(.+?)\s*$", re.MULTILINE)

MOJIBAKE_TOKENS = (
"Ã",
"Â",
"“",
"â€",
"’",
"–",
"—",
"",
)

FORBIDDEN_STACK_PATTERNS = [
(re.compile(r"\bPostgreSQL\b", re.IGNORECASE), "PostgreSQL reference found. UMS authoritative database is SQL Server unless explicitly marked as external comparison."),
(re.compile(r"\.NET\s+8\b", re.IGNORECASE), ".NET 8 reference found. UMS authoritative backend stack is .NET 10."),
(re.compile(r"SQL\s+Server\s+2019\b", re.IGNORECASE), "SQL Server 2019 reference found. Current project baseline is SQL Server 2022."),
]

EMOJI_RANGES = [
(0x1F000, 0x1FAFF),
(0x2600, 0x27BF),
(0x1F100, 0x1F1FF),
]
EXTRA_EMOJI_CODEPOINTS = {0x2B50, 0x2B55, 0x2B1B, 0x2B1C, 0xFE0F, 0xFE0E, 0x200D, 0x200B, 0x200C, 0x2060, 0xFEFF}


@dataclass(frozen=True)
class Issue:
severity: str
rule: str
path: Path
line: int
issue_type: str
message: str
recommendation: str


def iter_files(targets: list[Path]) -> list[Path]:
files: list[Path] = []
for target in targets:
if not target.exists():
continue
if target.is_file() and target.suffix.lower() in {".md", ".html"}:
files.append(target)
continue
if target.is_dir():
for root, dirs, names in os.walk(target):
dirs[:] = [d for d in dirs if d not in SKIP_DIRS]
for name in names:
path = Path(root) / name
if path.suffix.lower() in {".md", ".html"}:
files.append(path)
return sorted(set(files))


def line_number(text: str, index: int) -> int:
return text.count("\n", 0, index) + 1


def is_emoji(ch: str) -> bool:
cp = ord(ch)
if cp in EXTRA_EMOJI_CODEPOINTS:
return True
return any(start <= cp <= end for start, end in EMOJI_RANGES)


def slugify_heading(heading: str) -> str:
heading = re.sub(r"<[^>]+>", "", heading)
heading = re.sub(r"[`*_~]", "", heading)
heading = unicodedata.normalize("NFKD", heading)
heading = "".join(ch for ch in heading if not unicodedata.combining(ch))
heading = heading.strip().lower()
heading = re.sub(r"[^a-z0-9\s-]", "", heading)
heading = re.sub(r"\s+", "-", heading)
heading = re.sub(r"-+", "-", heading).strip("-")
return heading


def collect_anchors(text: str) -> set[str]:
anchors: set[str] = set()
seen: dict[str, int] = {}
for match in HEADING_RE.finditer(text):
base = slugify_heading(match.group(2))
if not base:
continue
count = seen.get(base, 0)
slug = base if count == 0 else f"{base}-{count}"
seen[base] = count + 1
anchors.add(slug)
anchors.update(re.findall(r"<a\s+(?:[^>]*?\s+)?name=[\"']([^\"']+)[\"']", text, flags=re.IGNORECASE))
anchors.update(re.findall(r"id=[\"']([^\"']+)[\"']", text, flags=re.IGNORECASE))
return anchors


def normalize_link(raw: str) -> str:
raw = raw.strip()
if raw.startswith("<") and raw.endswith(">"):
raw = raw[1:-1].strip()
return raw


def is_external_or_special(link: str) -> bool:
parsed = urlparse(link)
return bool(parsed.scheme in {"http", "https", "mailto", "tel", "data"}) or link.startswith("#")


def validate_links(path: Path, text: str, file_cache: dict[Path, str]) -> list[Issue]:
issues: list[Issue] = []
for match in LOCAL_LINK_RE.finditer(text):
link = normalize_link(match.group(1) or match.group(2) or "")
if not link or is_external_or_special(link):
continue
if link.startswith(".") or link.startswith("/") or not urlparse(link).scheme:
link_path, _, anchor = link.partition("#")
clean_path = unquote(link_path)
clean_path = clean_path.split("?", 1)[0]
target = (path.parent / clean_path).resolve() if clean_path else path.resolve()
if clean_path and not target.exists():
issues.append(Issue(
"critical",
"R-10/R-13",
path,
line_number(text, match.start()),
"broken-link",
f"Broken local link: {link}",
"Fix the relative path from the current document or remove the stale link.",
))
continue
if anchor and target.suffix.lower() in {".md", ".html"} and target.exists():
target_text = file_cache.get(target)
if target_text is None:
target_text = target.read_text(encoding="utf-8", errors="replace")
file_cache[target] = target_text
if anchor and unquote(anchor).lower() not in collect_anchors(target_text):
issues.append(Issue(
"warning",
"R-10/R-13",
path,
line_number(text, match.start()),
"broken-anchor",
f"Anchor not found: {link}",
"Update the anchor to match the target heading generated by GitHub Markdown.",
))
return issues


def validate_encoding_and_professionalism(path: Path, text: str) -> list[Issue]:
issues: list[Issue] = []
for token in MOJIBAKE_TOKENS:
index = text.find(token)
if index >= 0:
issues.append(Issue(
"critical",
"R-03",
path,
line_number(text, index),
"encoding",
f"Possible mojibake token found: {token}",
"Run cleanup_markdown_encoding.py and review the affected sentence manually.",
))
if path.suffix.lower() == ".md":
for index, ch in enumerate(text):
if is_emoji(ch):
issues.append(Issue(
"warning",
"R-14",
path,
line_number(text, index),
"decorative-character",
f"Prohibited emoji/decorative character found: U+{ord(ch):04X}",
"Remove the emoji/icon and keep the Markdown enterprise-professional.",
))
break
return issues


def validate_stack(path: Path, text: str) -> list[Issue]:
issues: list[Issue] = []
lower_path = str(path).replace("\\", "/").lower()
if "/docs/" not in lower_path and not lower_path.endswith("readme.md"):
return issues
for pattern, message in FORBIDDEN_STACK_PATTERNS:
for match in pattern.finditer(text):
context = text[max(0, match.start() - 100): match.end() + 100].lower()
if "external comparison" in context or "comparación externa" in context:
continue
issues.append(Issue(
"critical",
"R-16/R-20",
path,
line_number(text, match.start()),
"stack-consistency",
message,
"Align the document with .NET 10 + SQL Server 2022 + EF Core, or mark the reference as an explicit external comparison.",
))
return issues


def counterpart_candidates(path: Path) -> list[Path]:
s = str(path)
candidates: list[str] = []
if s.endswith(".es.md"):
candidates.append(s[:-6] + ".md")
elif s.endswith(".md"):
candidates.append(s[:-3] + ".es.md")
candidates.append(s.replace("/product-es/", "/product/"))
candidates.append(s.replace("/product/", "/product-es/"))
candidates.append(s.replace("/project-es/", "/project/"))
candidates.append(s.replace("/project/", "/project-es/"))
candidates.append(s.replace("/requirements-es/", "/requirements/"))
candidates.append(s.replace("/requirements/", "/requirements-es/"))
candidates.append(s.replace("/blueprints-es/", "/blueprints/"))
candidates.append(s.replace("/blueprints/", "/blueprints-es/"))
return [Path(c) for c in candidates if c != s]


def validate_bilingual(files: list[Path]) -> list[Issue]:
file_set = {p.resolve() for p in files if p.suffix.lower() == ".md"}
issues: list[Issue] = []
for path in file_set:
rel = path.relative_to(REPO_ROOT) if path.is_relative_to(REPO_ROOT) else path
rel_s = str(rel).replace("\\", "/")
if not rel_s.startswith("docs/"):
continue
if rel_s.startswith("docs/qa/"):
continue
candidates = [c.resolve() for c in counterpart_candidates(path)]
if not any(c in file_set for c in candidates):
issues.append(Issue(
"warning",
"R-01",
path,
1,
"bilingual-sync",
"No obvious bilingual counterpart found for this documentation file.",
"Create or link the English/Spanish counterpart, or document why the file is intentionally single-language.",
))
return issues


def format_issue(issue: Issue) -> str:
rel = issue.path.relative_to(REPO_ROOT) if issue.path.is_relative_to(REPO_ROOT) else issue.path
return (
f"- [{issue.severity.upper()}] {issue.rule} {rel}:{issue.line}\n"
f" Type: {issue.issue_type}\n"
f" Issue: {issue.message}\n"
f" Fix: {issue.recommendation}"
)


def main() -> int:
parser = argparse.ArgumentParser(description="Validate UMS documentation consistency.")
parser.add_argument("paths", nargs="*", help="Files or directories to audit. Defaults to README.md and docs/.")
parser.add_argument("--no-bilingual", action="store_true", help="Skip bilingual counterpart scan.")
args = parser.parse_args()

targets = [Path(p).resolve() for p in args.paths] if args.paths else DEFAULT_TARGETS
files = iter_files(targets)
file_cache: dict[Path, str] = {}
issues: list[Issue] = []

for path in files:
text = path.read_text(encoding="utf-8", errors="replace")
file_cache[path.resolve()] = text
issues.extend(validate_encoding_and_professionalism(path, text))
issues.extend(validate_stack(path, text))
issues.extend(validate_links(path, text, file_cache))

if not args.no_bilingual:
issues.extend(validate_bilingual(files))

if issues:
print("Documentation consistency audit failed.\n")
for issue in sorted(issues, key=lambda x: (str(x.path), x.line, x.rule, x.issue_type)):
print(format_issue(issue))
print(f"\nTotal issues: {len(issues)}")
return 1

print(f"Documentation consistency audit passed. Scanned {len(files)} files.")
return 0


if __name__ == "__main__":
sys.exit(main())
Loading
Loading