diff --git a/docs/developer/architecture.md b/docs/developer/architecture.md index a26de46..9bd9df0 100644 --- a/docs/developer/architecture.md +++ b/docs/developer/architecture.md @@ -17,7 +17,7 @@ Technical documentation for developers working on the QuantEcon Style Guide Chec ▼ ┌─────────────────────────────────────────────────────┐ │ action.yml │ -│ Sets up Python, installs deps, invokes action.py │ +│ Installs uv, syncs deps, invokes action.py via uv │ └──────────────────────────┬──────────────────────────┘ │ ┌────────────┴────────────┐ @@ -32,18 +32,16 @@ Technical documentation for developers working on the QuantEcon Style Guide Chec ┌───────────────────┼──────────────┐ ▼ ▼ ▼ ┌──────────────┐ ┌──────────────┐ ┌─────────────┐ - │prompt_loader │ │ fix_applier │ │Anthropic API│ - │Load prompts │ │Apply fixes │ │(Claude) │ - │Load rules │ │Validate │ └─────────────┘ - └──────┬───────┘ └──────────────┘ - ┌──────┴───────┐ - ▼ ▼ - ┌──────────┐ ┌──────────┐ - │prompts/ │ │ rules/ │ - │(8 files) │ │(8 files) │ - └──────────┘ └──────────┘ + │ prompts/ │ │ fix_applier │ │Anthropic API│ + │ prompt.md │ │Apply fixes │ │(Claude) │ + │ + rules/*.md │ │Validate │ └─────────────┘ + └──────────────┘ └──────────────┘ ``` +`categories.py` is the single source of truth for the 8 category names +(`writing`, `math`, `code`, `jax`, `figures`, `references`, `links`, +`admonitions`); every other module imports `VALID_CATEGORIES` from it. + ## Two Entry Points, One Engine - **`action.py`**: GitHub Action entry point. Reads files via GitHub API, creates PRs with fixes. @@ -87,15 +85,21 @@ Key classes: - `AnthropicProvider` — Claude API wrapper with extended thinking and streaming fallback - `StyleReviewer` — Main review orchestrator -### Prompt Loader (`prompt_loader.py`) +### Prompt Construction (`reviewer.create_single_rule_prompt`) -Loads and combines category-specific prompts and rules: +For each rule, the reviewer builds an LLM prompt as: ``` -[Category Prompt] + [Style Guide Rules] + [Lecture Content] → LLM +[Shared base prompt (prompts/prompt.md)] + + [Single rule definition from rules/{category}-rules.md] + + [Lecture content] + → LLM ``` -The prompt is rule-agnostic — all 8 category prompts are identical. Scope and analysis context come from the rule definitions themselves. This prevents signal dilution from category-specific instructions. +The base prompt is rule-agnostic — a single `prompts/prompt.md` file is +shared across all 8 categories. Scope and analysis context come from the +rule definitions themselves, which prevents signal dilution from +category-specific instructions. ### Fix Applier (`fix_applier.py`) @@ -228,16 +232,18 @@ Depends on lecture length and violations found. ``` action-style-guide/ ├── action.yml # GitHub Action definition +├── pyproject.toml # Package + dep manifest (uv-managed) +├── uv.lock # Reproducible dep lockfile ├── style_checker/ # Main package │ ├── __init__.py # Version (__version__) +│ ├── categories.py # Single source of truth for VALID_CATEGORIES │ ├── cli.py # Local CLI entry point (qestyle) │ ├── action.py # GitHub Action entry point │ ├── reviewer.py # LLM review engine (shared) │ ├── fix_applier.py # Apply fixes to files (shared) │ ├── github_handler.py # GitHub API (action only) -│ ├── prompt_loader.py # Load prompts + rules (shared) -│ ├── prompts/ # Minimal rule-agnostic prompts -│ └── rules/ # Category-specific rule definitions +│ ├── prompts/ # Single shared prompt.md (+ v0.6.1 archive) +│ └── rules/ # Per-category rule definitions ├── tests/ # Test suite ├── docs/ # Documentation (this site) └── examples/ # Example workflows diff --git a/docs/developer/contributing.md b/docs/developer/contributing.md index 6277723..a3cf097 100644 --- a/docs/developer/contributing.md +++ b/docs/developer/contributing.md @@ -88,16 +88,15 @@ Rules are in `style_checker/rules/` and are read directly by the LLM — **no co [Good and bad examples] ``` -3. Update corresponding prompt file in `style_checker/prompts/` if needed +3. The base prompt (`prompts/prompt.md`) is shared across all categories; usually no edit needed there. 4. Test with real lecture files ### Adding a New Category -1. Create `prompts/category-prompt.md` -2. Create `rules/category-rules.md` -3. Add category to `VALID_CATEGORIES` in `github_handler.py` and `prompt_loader.py` -4. Add to category list in `review_lecture_smart()` -5. Test end-to-end +1. Create `style_checker/rules/{category}-rules.md` +2. Add the new name to `VALID_CATEGORIES` in `style_checker/categories.py` +3. Add an entry for it in `RULE_EVALUATION_ORDER` in `style_checker/reviewer.py` (the test suite will fail loudly if the keys drift) +4. Test end-to-end ## Pull Request Process diff --git a/docs/developer/extended-thinking.md b/docs/developer/extended-thinking.md index 7a8c6be..9907d86 100644 --- a/docs/developer/extended-thinking.md +++ b/docs/developer/extended-thinking.md @@ -117,5 +117,5 @@ This is 40 lines vs the previous 120-line category-specific prompts, and it prod |----------|-----------| | `thinking_budget=10000` | Enough for careful analysis, not excessive cost | | `temperature=1.0` | Required by Anthropic for extended thinking | -| 8 identical prompt files (for now) | Consolidation to single file planned (validated on writing, pending other categories) | +| Single shared `prompts/prompt.md` | Consolidated from 8 byte-identical files (validated on writing, then rolled out across all categories) | | Archive v0.6.1 prompts | Reference for regression testing and comparison | diff --git a/docs/developer/roadmap.md b/docs/developer/roadmap.md index a4c0f53..d147a44 100644 --- a/docs/developer/roadmap.md +++ b/docs/developer/roadmap.md @@ -28,7 +28,7 @@ The project is in **active development**. Breaking changes are acceptable — th ### Phase 3: Test Suite Improvements ✅ - Fixed `test_parsing.py` to test real methods -- Added tests for `fix_applier.py`, `prompt_loader.py`, `reviewer.py` +- Added tests for `fix_applier.py`, `reviewer.py` (incl. RULE_EVALUATION_ORDER drift detection and prompt file existence) - Set up CI pipeline (GitHub Actions, ruff linting, Python 3.11/3.12/3.13) ## In Progress @@ -44,7 +44,7 @@ Focus: reduce LLM hallucinations, improve fix accuracy, move mechanical rules to | 4.3 Deterministic Checkers | ~13 mechanical rules via regex (zero hallucination risk) | Planned | | 4.4 Rule Clarity | Improve 12 rule descriptions to reduce misinterpretation | Planned | | 4.5 Scope Reduction | Reduce noise from overly subjective rules | Planned | -| 4.6 Prompt Consolidation | Merge 8 identical prompt files into single `prompt.md` | Planned | +| 4.6 Prompt Consolidation | Merge 8 identical prompt files into single `prompt.md` | **Done** (PR #17) | | 4.7 Extended Thinking | Claude reasons internally → 0% false positives | **Done** (v0.7.0) | ### Phase 5: Style Suggestion UX diff --git a/docs/developer/testing.md b/docs/developer/testing.md index e48f7f9..6bd9c0a 100644 --- a/docs/developer/testing.md +++ b/docs/developer/testing.md @@ -27,8 +27,7 @@ tests/ ├── test_github_handler.py # GitHub API interaction, comment parsing ├── test_markdown_parser.py # LLM response parsing ├── test_parsing.py # Comment trigger pattern matching -├── test_prompt_loader.py # Prompt/rules file loading -├── test_reviewer.py # Rule extraction and evaluation order +├── test_reviewer.py # Rule extraction, RULE_EVALUATION_ORDER, prompt file ├── test_llm_integration.py # Real LLM API calls (@integration) └── test_cli.py # CLI argument parsing ``` @@ -43,8 +42,7 @@ Run automatically with `pytest`: | `test_github_handler.py` | GitHub API interaction, comment parsing | | `test_markdown_parser.py` | LLM response parsing | | `test_parsing.py` | Comment trigger pattern matching (real method) | -| `test_prompt_loader.py` | Prompt/rules file loading | -| `test_reviewer.py` | Rule extraction and evaluation order | +| `test_reviewer.py` | Rule extraction, RULE_EVALUATION_ORDER consistency, prompt file existence | | `test_cli.py` | CLI argument parsing | ### Integration Tests (Slow, Costs Money) @@ -69,15 +67,14 @@ pytest --cov=style_checker --cov-report=html open htmlcov/index.html ``` -Current coverage: +Current coverage (approximate — re-measure with `pytest --cov`): | File | Coverage | |------|----------| -| `fix_applier.py` | 92% | -| `prompt_loader.py` | 86% | -| `github_handler.py` | 55% | -| `reviewer.py` | 47% | -| `action.py` | 0% (needs integration mocking) | +| `fix_applier.py` | high | +| `github_handler.py` | medium | +| `reviewer.py` | medium | +| `action.py` | 0% (needs integration mocking — tracked in TECHNICAL-REVIEW §6.1) | ## CI Pipeline diff --git a/style_checker/categories.py b/style_checker/categories.py new file mode 100644 index 0000000..dceefab --- /dev/null +++ b/style_checker/categories.py @@ -0,0 +1,21 @@ +""" +Single source of truth for the style-rule category names. + +Every category here must have a corresponding `{name}-rules.md` file in +`style_checker/rules/` and a matching entry in +`reviewer.RULE_EVALUATION_ORDER`. The consistency between this tuple and +`RULE_EVALUATION_ORDER.keys()` is enforced by a test in `tests/test_reviewer.py`. +""" + +# Ordered — index is the default category processing order used by +# `StyleReviewer.review_lecture_smart`. +VALID_CATEGORIES = ( + "writing", + "math", + "code", + "jax", + "figures", + "references", + "links", + "admonitions", +) diff --git a/style_checker/cli.py b/style_checker/cli.py index 8ccb4a7..b6c07f0 100644 --- a/style_checker/cli.py +++ b/style_checker/cli.py @@ -22,16 +22,10 @@ from datetime import datetime from style_checker import __version__ +from style_checker.categories import VALID_CATEGORIES from style_checker.reviewer import StyleReviewer -# All available categories (matches reviewer.RULE_EVALUATION_ORDER keys) -ALL_CATEGORIES = [ - "writing", "math", "code", "jax", - "figures", "references", "links", "admonitions", -] - - def display_width(s: str) -> int: """Calculate terminal display width, accounting for wide/emoji characters.""" w = 0 @@ -278,13 +272,13 @@ def main(): # Parse categories if args.categories: categories = [c.strip() for c in args.categories.split(",")] - invalid = [c for c in categories if c not in ALL_CATEGORIES] + invalid = [c for c in categories if c not in VALID_CATEGORIES] if invalid: print(f"Error: invalid categories: {', '.join(invalid)}", file=sys.stderr) - print(f"Valid categories: {', '.join(ALL_CATEGORIES)}", file=sys.stderr) + print(f"Valid categories: {', '.join(VALID_CATEGORIES)}", file=sys.stderr) sys.exit(1) else: - categories = list(ALL_CATEGORIES) + categories = list(VALID_CATEGORIES) # API key api_key = args.api_key or os.environ.get("ANTHROPIC_API_KEY") diff --git a/style_checker/github_handler.py b/style_checker/github_handler.py index 3281c42..47af966 100644 --- a/style_checker/github_handler.py +++ b/style_checker/github_handler.py @@ -11,16 +11,12 @@ from github import Github, GithubException from datetime import datetime +from .categories import VALID_CATEGORIES + class GitHubHandler: """Handles GitHub API interactions for PR and issue management""" - # Valid category names (must match files in style_checker/rules/) - VALID_CATEGORIES = { - 'writing', 'math', 'code', 'jax', - 'figures', 'references', 'links', 'admonitions' - } - def __init__(self, token: str, repository: str): """ Initialize GitHub handler @@ -79,10 +75,10 @@ def extract_lecture_from_comment(self, comment_body: str) -> Optional[Tuple[str, # Validate categories if categories != ['all']: - invalid = [c for c in categories if c not in self.VALID_CATEGORIES] + invalid = [c for c in categories if c not in VALID_CATEGORIES] if invalid: print(f"⚠️ Invalid categories: {', '.join(invalid)}") - print(f" Valid categories: {', '.join(sorted(self.VALID_CATEGORIES))}") + print(f" Valid categories: {', '.join(sorted(VALID_CATEGORIES))}") return None else: categories = ['all'] # Default to all categories diff --git a/style_checker/prompt_loader.py b/style_checker/prompt_loader.py deleted file mode 100644 index 0414abf..0000000 --- a/style_checker/prompt_loader.py +++ /dev/null @@ -1,231 +0,0 @@ -""" -Simple utility to load markdown prompt templates by category. - -Loads concise instruction prompts from prompts/ and detailed rules from rules/, -then combines them for the LLM. -""" - -from pathlib import Path -import re -from typing import List, Optional - - -class PromptLoader: - """Loads category-specific style guide prompts and rules from markdown files.""" - - VALID_CATEGORIES = [ - "writing", "math", "code", "jax", - "figures", "references", "links", "admonitions" - ] - - def __init__(self, prompts_dir: Optional[Path] = None, rules_dir: Optional[Path] = None): - """ - Initialize the prompt loader. - - Args: - prompts_dir: Directory containing prompt markdown files. - Defaults to style_checker/prompts/ - rules_dir: Directory containing rule markdown files. - Defaults to style_checker/rules/ - """ - if prompts_dir is None: - prompts_dir = Path(__file__).parent / "prompts" - if rules_dir is None: - rules_dir = Path(__file__).parent / "rules" - - self.prompts_dir = Path(prompts_dir) - self.rules_dir = Path(rules_dir) - - if not self.prompts_dir.exists(): - raise FileNotFoundError( - f"Prompts directory not found: {self.prompts_dir}" - ) - if not self.rules_dir.exists(): - raise FileNotFoundError( - f"Rules directory not found: {self.rules_dir}" - ) - - def load_prompt(self, categories: List[str], lecture_content: str) -> str: - """ - Load and combine prompts + rules for specified categories. - - Approach: - 1. Load concise instruction prompt - 2. Load detailed rules - 3. Combine: prompt + rules + lecture_content - - Args: - categories: List of category names (e.g., ["writing", "math"]) - lecture_content: The lecture markdown content to review - - Returns: - Complete prompt with instructions, rules, and lecture content - - Raises: - ValueError: If invalid category specified - FileNotFoundError: If prompt or rules file not found - """ - # Validate categories - invalid = [c for c in categories if c not in self.VALID_CATEGORIES] - if invalid: - raise ValueError( - f"Invalid categories: {invalid}. " - f"Valid options: {self.VALID_CATEGORIES}" - ) - - # For single category, use simple combination - if len(categories) == 1: - return self._load_single_category(categories[0], lecture_content) - - # For multiple categories, combine them - return self._combine_categories(categories, lecture_content) - - def _load_single_category(self, category: str, lecture_content: str) -> str: - """ - Load prompt and rules for a single category. - - - prompt file: category-prompt.md - - rules file: category-rules.md - """ - # Load prompt (instruction template) - prompt_file = self.prompts_dir / f"{category}-prompt.md" - if not prompt_file.exists(): - raise FileNotFoundError(f"Prompt file not found: {prompt_file}") - - with open(prompt_file, 'r') as f: - prompt = f.read() - - # Debug: Check prompt version - if category == "writing": - # Check for version comment - version_match = re.search(r' - -You are a style checker for QuantEcon lecture files written in MyST Markdown. - -## Task - -Find all violations of the provided rule in the lecture document. - -First, silently analyze the entire document and identify candidate violations. -Then, verify each candidate — confirm the current text actually violates the rule and the fix changes the text. -Only include confirmed violations in your response. Report 0 if none exist. - -## Response Format - -```markdown -# Review Results - -## Summary -[1-2 sentence summary] - -## Issues Found -[NUMBER ONLY] - -## Violations - -### Violation 1: [rule-code] - [Rule Title] -**Severity:** error -**Location:** Line [X] / Section "[name]" -**Description:** [Why this violates the rule] -**Current text:** -~~~markdown -[exact quote] -~~~ -**Suggested fix:** -~~~markdown -[corrected version — MUST be different from current text] -~~~ -**Explanation:** [Why this fix resolves the violation] -``` - -If Issues Found is 0, do not include a Violations section. diff --git a/style_checker/prompts/figures-prompt.md b/style_checker/prompts/figures-prompt.md deleted file mode 100644 index 2508759..0000000 --- a/style_checker/prompts/figures-prompt.md +++ /dev/null @@ -1,41 +0,0 @@ - - -You are a style checker for QuantEcon lecture files written in MyST Markdown. - -## Task - -Find all violations of the provided rule in the lecture document. - -First, silently analyze the entire document and identify candidate violations. -Then, verify each candidate — confirm the current text actually violates the rule and the fix changes the text. -Only include confirmed violations in your response. Report 0 if none exist. - -## Response Format - -```markdown -# Review Results - -## Summary -[1-2 sentence summary] - -## Issues Found -[NUMBER ONLY] - -## Violations - -### Violation 1: [rule-code] - [Rule Title] -**Severity:** error -**Location:** Line [X] / Section "[name]" -**Description:** [Why this violates the rule] -**Current text:** -~~~markdown -[exact quote] -~~~ -**Suggested fix:** -~~~markdown -[corrected version — MUST be different from current text] -~~~ -**Explanation:** [Why this fix resolves the violation] -``` - -If Issues Found is 0, do not include a Violations section. diff --git a/style_checker/prompts/jax-prompt.md b/style_checker/prompts/jax-prompt.md deleted file mode 100644 index 2508759..0000000 --- a/style_checker/prompts/jax-prompt.md +++ /dev/null @@ -1,41 +0,0 @@ - - -You are a style checker for QuantEcon lecture files written in MyST Markdown. - -## Task - -Find all violations of the provided rule in the lecture document. - -First, silently analyze the entire document and identify candidate violations. -Then, verify each candidate — confirm the current text actually violates the rule and the fix changes the text. -Only include confirmed violations in your response. Report 0 if none exist. - -## Response Format - -```markdown -# Review Results - -## Summary -[1-2 sentence summary] - -## Issues Found -[NUMBER ONLY] - -## Violations - -### Violation 1: [rule-code] - [Rule Title] -**Severity:** error -**Location:** Line [X] / Section "[name]" -**Description:** [Why this violates the rule] -**Current text:** -~~~markdown -[exact quote] -~~~ -**Suggested fix:** -~~~markdown -[corrected version — MUST be different from current text] -~~~ -**Explanation:** [Why this fix resolves the violation] -``` - -If Issues Found is 0, do not include a Violations section. diff --git a/style_checker/prompts/links-prompt.md b/style_checker/prompts/links-prompt.md deleted file mode 100644 index 2508759..0000000 --- a/style_checker/prompts/links-prompt.md +++ /dev/null @@ -1,41 +0,0 @@ - - -You are a style checker for QuantEcon lecture files written in MyST Markdown. - -## Task - -Find all violations of the provided rule in the lecture document. - -First, silently analyze the entire document and identify candidate violations. -Then, verify each candidate — confirm the current text actually violates the rule and the fix changes the text. -Only include confirmed violations in your response. Report 0 if none exist. - -## Response Format - -```markdown -# Review Results - -## Summary -[1-2 sentence summary] - -## Issues Found -[NUMBER ONLY] - -## Violations - -### Violation 1: [rule-code] - [Rule Title] -**Severity:** error -**Location:** Line [X] / Section "[name]" -**Description:** [Why this violates the rule] -**Current text:** -~~~markdown -[exact quote] -~~~ -**Suggested fix:** -~~~markdown -[corrected version — MUST be different from current text] -~~~ -**Explanation:** [Why this fix resolves the violation] -``` - -If Issues Found is 0, do not include a Violations section. diff --git a/style_checker/prompts/math-prompt.md b/style_checker/prompts/math-prompt.md deleted file mode 100644 index 2508759..0000000 --- a/style_checker/prompts/math-prompt.md +++ /dev/null @@ -1,41 +0,0 @@ - - -You are a style checker for QuantEcon lecture files written in MyST Markdown. - -## Task - -Find all violations of the provided rule in the lecture document. - -First, silently analyze the entire document and identify candidate violations. -Then, verify each candidate — confirm the current text actually violates the rule and the fix changes the text. -Only include confirmed violations in your response. Report 0 if none exist. - -## Response Format - -```markdown -# Review Results - -## Summary -[1-2 sentence summary] - -## Issues Found -[NUMBER ONLY] - -## Violations - -### Violation 1: [rule-code] - [Rule Title] -**Severity:** error -**Location:** Line [X] / Section "[name]" -**Description:** [Why this violates the rule] -**Current text:** -~~~markdown -[exact quote] -~~~ -**Suggested fix:** -~~~markdown -[corrected version — MUST be different from current text] -~~~ -**Explanation:** [Why this fix resolves the violation] -``` - -If Issues Found is 0, do not include a Violations section. diff --git a/style_checker/prompts/admonitions-prompt.md b/style_checker/prompts/prompt.md similarity index 100% rename from style_checker/prompts/admonitions-prompt.md rename to style_checker/prompts/prompt.md diff --git a/style_checker/prompts/references-prompt.md b/style_checker/prompts/references-prompt.md deleted file mode 100644 index 2508759..0000000 --- a/style_checker/prompts/references-prompt.md +++ /dev/null @@ -1,41 +0,0 @@ - - -You are a style checker for QuantEcon lecture files written in MyST Markdown. - -## Task - -Find all violations of the provided rule in the lecture document. - -First, silently analyze the entire document and identify candidate violations. -Then, verify each candidate — confirm the current text actually violates the rule and the fix changes the text. -Only include confirmed violations in your response. Report 0 if none exist. - -## Response Format - -```markdown -# Review Results - -## Summary -[1-2 sentence summary] - -## Issues Found -[NUMBER ONLY] - -## Violations - -### Violation 1: [rule-code] - [Rule Title] -**Severity:** error -**Location:** Line [X] / Section "[name]" -**Description:** [Why this violates the rule] -**Current text:** -~~~markdown -[exact quote] -~~~ -**Suggested fix:** -~~~markdown -[corrected version — MUST be different from current text] -~~~ -**Explanation:** [Why this fix resolves the violation] -``` - -If Issues Found is 0, do not include a Violations section. diff --git a/style_checker/prompts/writing-prompt.md b/style_checker/prompts/writing-prompt.md deleted file mode 100644 index 2508759..0000000 --- a/style_checker/prompts/writing-prompt.md +++ /dev/null @@ -1,41 +0,0 @@ - - -You are a style checker for QuantEcon lecture files written in MyST Markdown. - -## Task - -Find all violations of the provided rule in the lecture document. - -First, silently analyze the entire document and identify candidate violations. -Then, verify each candidate — confirm the current text actually violates the rule and the fix changes the text. -Only include confirmed violations in your response. Report 0 if none exist. - -## Response Format - -```markdown -# Review Results - -## Summary -[1-2 sentence summary] - -## Issues Found -[NUMBER ONLY] - -## Violations - -### Violation 1: [rule-code] - [Rule Title] -**Severity:** error -**Location:** Line [X] / Section "[name]" -**Description:** [Why this violates the rule] -**Current text:** -~~~markdown -[exact quote] -~~~ -**Suggested fix:** -~~~markdown -[corrected version — MUST be different from current text] -~~~ -**Explanation:** [Why this fix resolves the violation] -``` - -If Issues Found is 0, do not include a Violations section. diff --git a/style_checker/reviewer.py b/style_checker/reviewer.py index 444af28..c0886fb 100644 --- a/style_checker/reviewer.py +++ b/style_checker/reviewer.py @@ -13,6 +13,7 @@ import anthropic +from .categories import VALID_CATEGORIES from .fix_applier import apply_fixes, validate_fix_quality @@ -153,25 +154,28 @@ def extract_individual_rules(category: str) -> List[Dict[str, str]]: def create_single_rule_prompt(category: str, rule: Dict[str, str], lecture_content: str) -> str: """ Create a focused prompt for checking a single rule. - + + The base prompt is rule-agnostic — a single `prompts/prompt.md` is shared + across all categories. The `category` arg is currently unused but kept in + the signature so callers don't need to change if a category-specific + prompt is ever reintroduced. + Args: - category: Category name (e.g., 'writing') + category: Category name (e.g., 'writing') — currently unused rule: Dict with 'rule_id', 'title', and 'content' lecture_content: The lecture to check - + Returns: Complete prompt focused on one specific rule """ - # Load the base prompt (without rules) prompts_dir = Path(__file__).parent / "prompts" - prompt_file = prompts_dir / f"{category}-prompt.md" - + prompt_file = prompts_dir / "prompt.md" + if not prompt_file.exists(): raise FileNotFoundError(f"Prompt file not found: {prompt_file}") - - with open(prompt_file, 'r') as f: - base_prompt = f.read() - + + base_prompt = prompt_file.read_text() + # Create focused prompt with single rule focused_prompt = f"""{base_prompt} @@ -185,7 +189,7 @@ def create_single_rule_prompt(category: str, rule: Dict[str, str], lecture_conte {lecture_content} """ - + return focused_prompt def parse_markdown_response(response: str) -> Dict[str, Any]: @@ -575,24 +579,15 @@ def review_lecture_smart( Returns: Dictionary with all violations found across all categories """ - # Define all categories to check (matches files in style_checker/rules/) - all_categories = [ - 'writing', - 'math', - 'code', - 'jax', - 'figures', - 'references', - 'links', - 'admonitions' - ] - + # Single source of truth for the category list lives in categories.py. + all_categories = list(VALID_CATEGORIES) + print(f"\n🤖 Starting AI-powered review using single-rule evaluation...") print(f"📊 Lecture: {lecture_name}") print(f"\n📦 Processing {len(all_categories)} categories:") for category in all_categories: print(f" • {category}") - + # Delegate to review_lecture_single_rule which handles: # - Single-rule-per-LLM-call evaluation # - Sequential fix application between rules diff --git a/tests/README.md b/tests/README.md index 1e06f48..9d72d0a 100644 --- a/tests/README.md +++ b/tests/README.md @@ -33,21 +33,14 @@ Tests the fix application engine: - First-occurrence-only replacement - Fix quality validation warnings -### `test_prompt_loader.py` -Tests prompt and rules loading: -- Single and multi-category prompt loading -- All 8 categories loadable -- Invalid category error handling -- Prompt version tracking presence -- Rules file content validation - ### `test_reviewer.py` -Tests rule extraction and evaluation order: +Tests rule extraction, evaluation order, and prompt-file invariants: - Rule counts per category (49 total) - Rule type distribution (32 rule, 13 style, 4 migrate) -- RULE_EVALUATION_ORDER consistency with rule files +- RULE_EVALUATION_ORDER consistency with rule files AND with VALID_CATEGORIES (drift detection) - Rule field validation and ID format - No duplicate rule IDs +- Shared `prompts/prompt.md` exists and carries a version header ### `test_llm_integration.py` **Integration tests** that make real LLM API calls (marked with `@pytest.mark.integration`): @@ -107,7 +100,8 @@ pytest -m integration -v Install test dependencies: ```bash -pip install -r requirements.txt +uv sync --all-extras # recommended (uses uv.lock) +# or: pip install -e ".[dev,action]" ``` This includes: diff --git a/tests/test_cli.py b/tests/test_cli.py index 096bfd7..5bb216f 100644 --- a/tests/test_cli.py +++ b/tests/test_cli.py @@ -15,7 +15,8 @@ import tempfile from pathlib import Path -from style_checker.cli import format_report, default_report_path, check_git_dirty, ALL_CATEGORIES +from style_checker.cli import format_report, default_report_path, check_git_dirty +from style_checker.categories import VALID_CATEGORIES from style_checker import __version__ @@ -244,12 +245,12 @@ class TestCategories: """Tests for category validation.""" def test_all_categories_present(self): - """ALL_CATEGORIES should match the 8 standard categories.""" + """VALID_CATEGORIES should match the 8 standard categories.""" expected = {'writing', 'math', 'code', 'jax', 'figures', 'references', 'links', 'admonitions'} - assert set(ALL_CATEGORIES) == expected + assert set(VALID_CATEGORIES) == expected def test_all_categories_count(self): - assert len(ALL_CATEGORIES) == 8 + assert len(VALID_CATEGORIES) == 8 # --------------------------------------------------------------------------- diff --git a/tests/test_parsing.py b/tests/test_parsing.py index fbb1d17..25ca07b 100644 --- a/tests/test_parsing.py +++ b/tests/test_parsing.py @@ -13,12 +13,13 @@ @pytest.fixture def handler(): - """Create a GitHubHandler without connecting to GitHub.""" + """Create a GitHubHandler without connecting to GitHub. + + The method under test reads VALID_CATEGORIES from the module-level + `style_checker.categories` import, so no class-attribute setup needed. + """ with patch.object(GitHubHandler, '__init__', lambda self, *a, **kw: None): - h = GitHubHandler.__new__(GitHubHandler) - # Set the class attribute that extract_lecture_from_comment uses - h.VALID_CATEGORIES = GitHubHandler.VALID_CATEGORIES - return h + return GitHubHandler.__new__(GitHubHandler) class TestCommentParsing: diff --git a/tests/test_prompt_loader.py b/tests/test_prompt_loader.py deleted file mode 100644 index a8e5b83..0000000 --- a/tests/test_prompt_loader.py +++ /dev/null @@ -1,77 +0,0 @@ -""" -Tests for prompt_loader.py — PromptLoader class -""" - -import pytest -from style_checker.prompt_loader import PromptLoader, load_prompt - - -@pytest.fixture -def loader(): - """Create a PromptLoader using the real prompts/rules directories.""" - return PromptLoader() - - -class TestPromptLoader: - """Test PromptLoader class""" - - def test_load_single_category(self, loader): - """Loading a single category should return prompt + rules + lecture""" - result = loader.load_prompt(["writing"], "# Test Lecture\n\nSome content.") - assert "# Test Lecture" in result - assert "Some content." in result - # Should include rules content - assert "qe-writing-001" in result - - def test_load_multiple_categories(self, loader): - """Loading multiple categories should include rules from all""" - result = loader.load_prompt(["writing", "math"], "# Test\n\nContent.") - assert "qe-writing-001" in result - assert "qe-math-001" in result - - def test_all_categories_loadable(self, loader): - """Every valid category should load without error""" - for category in PromptLoader.VALID_CATEGORIES: - result = loader.load_prompt([category], "# Test\n\nContent.") - assert len(result) > 100, f"Category '{category}' returned suspiciously short prompt" - - def test_invalid_category_raises(self, loader): - """Invalid category name should raise ValueError""" - with pytest.raises(ValueError, match="Invalid categories"): - loader.load_prompt(["nonexistent"], "# Test") - - def test_prompt_includes_lecture_content(self, loader): - """Lecture content should appear in the final prompt""" - marker = "UNIQUE_MARKER_12345" - result = loader.load_prompt(["writing"], f"# Test\n\n{marker}") - assert marker in result - - def test_prompt_file_has_version(self, loader): - """All prompt files should have version comments""" - for category in PromptLoader.VALID_CATEGORIES: - prompt_file = loader.prompts_dir / f"{category}-prompt.md" - content = prompt_file.read_text() - assert "Prompt Version:" in content, \ - f"{category}-prompt.md missing version comment" - - def test_rules_file_has_rules(self, loader): - """All rules files should contain at least one rule""" - for category in PromptLoader.VALID_CATEGORIES: - rules_file = loader.rules_dir / f"{category}-rules.md" - content = rules_file.read_text() - assert "### Rule:" in content, \ - f"{category}-rules.md has no rules" - - -class TestConvenienceFunction: - """Test the load_prompt() convenience function""" - - def test_load_prompt_function(self): - """Convenience function should work identically to class method""" - result = load_prompt(["math"], "# Test\n\nContent.") - assert "qe-math-001" in result - - def test_load_prompt_invalid_category(self): - """Convenience function should raise on invalid category""" - with pytest.raises(ValueError): - load_prompt(["invalid"], "# Test") diff --git a/tests/test_reviewer.py b/tests/test_reviewer.py index b33368d..9dd7fda 100644 --- a/tests/test_reviewer.py +++ b/tests/test_reviewer.py @@ -2,11 +2,17 @@ Tests for reviewer.py — extract_individual_rules() and RULE_EVALUATION_ORDER """ +from pathlib import Path + +import style_checker +from style_checker.categories import VALID_CATEGORIES from style_checker.reviewer import ( extract_individual_rules, RULE_EVALUATION_ORDER, ) -from style_checker.prompt_loader import PromptLoader + + +PROMPTS_DIR = Path(style_checker.__file__).parent / "prompts" # Expected rule counts per category (from rule files) @@ -27,7 +33,7 @@ class TestExtractIndividualRules: def test_all_categories_extract(self): """Every category should extract at least one rule""" - for category in PromptLoader.VALID_CATEGORIES: + for category in VALID_CATEGORIES: rules = extract_individual_rules(category) assert len(rules) > 0, f"No rules extracted for '{category}'" @@ -42,7 +48,7 @@ def test_total_rule_count(self): """Total rules across all categories should be 49""" total = sum( len(extract_individual_rules(cat)) - for cat in PromptLoader.VALID_CATEGORIES + for cat in VALID_CATEGORIES ) assert total == 49 @@ -57,7 +63,7 @@ def test_rule_has_required_fields(self): def test_rule_type_values(self): """Rule types should only be 'rule', 'style', or 'migrate'""" - for category in PromptLoader.VALID_CATEGORIES: + for category in VALID_CATEGORIES: rules = extract_individual_rules(category) for rule in rules: assert rule['rule_type'] in ('rule', 'style', 'migrate'), \ @@ -65,7 +71,7 @@ def test_rule_type_values(self): def test_rule_id_format(self): """Rule IDs should follow qe-category-NNN pattern""" - for category in PromptLoader.VALID_CATEGORIES: + for category in VALID_CATEGORIES: rules = extract_individual_rules(category) for rule in rules: assert rule['rule_id'].startswith('qe-'), \ @@ -74,7 +80,7 @@ def test_rule_id_format(self): def test_no_duplicate_rule_ids(self): """No duplicate rule IDs across all categories""" all_ids = [] - for category in PromptLoader.VALID_CATEGORIES: + for category in VALID_CATEGORIES: rules = extract_individual_rules(category) all_ids.extend(r['rule_id'] for r in rules) assert len(all_ids) == len(set(all_ids)), \ @@ -88,15 +94,24 @@ def test_nonexistent_category_returns_empty(self): class TestRuleEvaluationOrder: """Test RULE_EVALUATION_ORDER configuration""" + def test_rule_evaluation_order_keys_match_valid_categories(self): + """RULE_EVALUATION_ORDER must have exactly the same keys as VALID_CATEGORIES, + in the same order. Catches drift between the two when either is edited.""" + assert tuple(RULE_EVALUATION_ORDER.keys()) == tuple(VALID_CATEGORIES), ( + "RULE_EVALUATION_ORDER keys diverged from VALID_CATEGORIES.\n" + f" RULE_EVALUATION_ORDER: {tuple(RULE_EVALUATION_ORDER.keys())}\n" + f" VALID_CATEGORIES: {tuple(VALID_CATEGORIES)}" + ) + def test_all_categories_have_order(self): """Every valid category should have an evaluation order defined""" - for category in PromptLoader.VALID_CATEGORIES: + for category in VALID_CATEGORIES: assert category in RULE_EVALUATION_ORDER, \ f"'{category}' missing from RULE_EVALUATION_ORDER" def test_order_matches_extracted_rules(self): """Evaluation order should include exactly the rules in the rule files""" - for category in PromptLoader.VALID_CATEGORIES: + for category in VALID_CATEGORIES: rules = extract_individual_rules(category) extracted_ids = [r['rule_id'] for r in rules] ordered_ids = RULE_EVALUATION_ORDER[category] @@ -106,7 +121,7 @@ def test_order_matches_extracted_rules(self): def test_evaluation_order_is_respected(self): """extract_individual_rules should return rules in RULE_EVALUATION_ORDER""" - for category in PromptLoader.VALID_CATEGORIES: + for category in VALID_CATEGORIES: rules = extract_individual_rules(category) ids = [r['rule_id'] for r in rules] expected = RULE_EVALUATION_ORDER[category] @@ -125,13 +140,36 @@ def test_no_extra_ids_in_order(self): f"'{category}': '{rule_id}' in order but not in rule file" +class TestPromptFile: + """Test the single shared prompt file used by create_single_rule_prompt. + + Replaces coverage from the deleted test_prompt_loader.py — but for a + single consolidated prompt instead of one file per category. + """ + + def test_prompt_file_exists(self): + """create_single_rule_prompt() raises FileNotFoundError at runtime if + the prompt file is missing — assert at test time so we catch packaging + regressions before they hit production.""" + prompt_file = PROMPTS_DIR / "prompt.md" + assert prompt_file.exists(), \ + f"Shared prompt file missing: {prompt_file}" + + def test_prompt_file_has_version_header(self): + """The prompt should carry a `` comment + so logs can record which prompt produced a given review.""" + content = (PROMPTS_DIR / "prompt.md").read_text() + assert "Prompt Version:" in content, \ + "prompt.md missing the 'Prompt Version: ...' header comment" + + class TestRuleTypeCounts: """Test that rule type distribution matches expectations""" def test_rule_type_count(self): """Should have 32 'rule' type rules""" count = sum( - 1 for cat in PromptLoader.VALID_CATEGORIES + 1 for cat in VALID_CATEGORIES for r in extract_individual_rules(cat) if r['rule_type'] == 'rule' ) @@ -140,7 +178,7 @@ def test_rule_type_count(self): def test_style_type_count(self): """Should have 13 'style' type rules""" count = sum( - 1 for cat in PromptLoader.VALID_CATEGORIES + 1 for cat in VALID_CATEGORIES for r in extract_individual_rules(cat) if r['rule_type'] == 'style' ) @@ -149,7 +187,7 @@ def test_style_type_count(self): def test_migrate_type_count(self): """Should have 4 'migrate' type rules""" count = sum( - 1 for cat in PromptLoader.VALID_CATEGORIES + 1 for cat in VALID_CATEGORIES for r in extract_individual_rules(cat) if r['rule_type'] == 'migrate' )