diff --git a/CHANGELOG.md b/CHANGELOG.md index 44ab995..b39364e 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -5,6 +5,28 @@ All notable changes to this project will be documented in this file. The format is based on [Keep a Changelog](https://keepachangelog.com/en/1.1.0/), and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0.html). +## [1.7.1] - 2026-06-02 + +### Added + +- **Syllable-span probe (opt-in neural enhancement).** A frozen-encoder probe that improves detection recall on broken-compound, over-segmentation, and consonant-substitution errors. Three strategies share one small model (~430 MB) with a single inference per sentence: + - `ProbeBoostedCompoundStrategy` (priority 24) — combines neural span scoring with dictionary lookup to flag whitespace-broken compounds. + - `ProbeSegmenterRescueStrategy` (priority 26) — rescues typos the segmenter splits into adjacent dictionary-valid tokens by scoring the boundary and recovering a high-frequency merged-form candidate. + - `ProbeValidationStrategy` (priority 85) — a compact frozen-architecture replacement for the legacy token-classification corrector. +- **Probe configuration.** Opt-in via `use_probe_corrector`, `use_probe_compound`, and `use_probe_segmenter_rescue` config flags (or the `MSC_USE_PROBE_*` environment variables), with tunable confidence thresholds and dictionary-frequency floors. All default off. + +### Changed + +- The probe strategies are registered as suppression-immune so their detections survive the downstream filter cascade. + +### Benchmark + +- With the probe enabled (all flags), spelling composite improves `0.6436` → `0.6561` (**+0.0125**): +47 true positives, recall +3.2pp, top-1 accuracy +1.7pp, clean false-positive sentences 84 → 92. Default behavior (probe off) is unchanged. + +### Compatibility + +- No change to default behavior; existing deployments are unaffected unless the probe flags are enabled. 
Adds three public strategy classes in `myspellchecker.core.validation_strategies`. The probe model artifact is required at runtime only when the flags are enabled. + ## [1.7.0] - 2026-04-30 ### Added diff --git a/README.md b/README.md index eedbcd8..0ab7d30 100644 --- a/README.md +++ b/README.md @@ -33,6 +33,7 @@ * **Suffix-Aware Re-Segmentation**: DefaultSegmenter post-processes oversized tokens and colloquial-locative merges (e.g. `ရန်ကုန်မာ` → `[ရန်, ကုန်, မာ]`) for cleaner downstream validation. * **Compound & Morpheme Handling**: DP-based compound resolution, ternary compound splits in morpheme correction, productive reduplication validation. * **AI Semantic Checking (Optional)**: ONNX masked language model for context-aware validation. +* **Syllable-Span Probe (opt-in, v1.7.1)**: A frozen-encoder neural probe that improves recall on broken-compound, over-segmentation, and consonant-substitution errors. Three strategies share one small model; default-off, enabled via `use_probe_*` config flags or `MSC_USE_PROBE_*` environment variables. * **Named Entity Recognition**: Heuristic and Transformer-based NER to reduce false positives on names and places. ### Dictionary Building Pipeline @@ -64,7 +65,7 @@ Full documentation is available at **[docs.myspellchecker.com](https://docs.myspellchecker.com/)**. -> **What's new in v1.6.0?** See the **[Release Notes](https://docs.myspellchecker.com/reference/release-notes)** for new validation strategies (mined-confusable, pre-segmenter raw probe), the compound-split confusable boost, the skip-rule confidence gate, consonant-gated Tall-AA normalization, the flat-AA dictionary migration, and spelling-first benchmark labeling. 
+> **What's new in v1.7.1?** See the **[Release Notes](https://docs.myspellchecker.com/reference/release-notes)** for the opt-in **syllable-span probe** — a frozen-encoder neural enhancement (three default-off strategies sharing one small model) that improves recall on broken-compound, over-segmentation, and consonant-substitution errors (+0.0125 composite when enabled). Earlier v1.7.x work added mined-confusable detection, the cross-whitespace and compound-merge probes, the skip-rule confidence gate, and benchmark-hygiene reclassification. ### Getting Started * **[Introduction](https://docs.myspellchecker.com/introduction)**: Overview of the library and its architecture. diff --git a/benchmarks/run_benchmark.py b/benchmarks/run_benchmark.py index 18fb18f..aa8e79f 100644 --- a/benchmarks/run_benchmark.py +++ b/benchmarks/run_benchmark.py @@ -899,6 +899,85 @@ def run_benchmark( existing_immune.add("ByT5SafetyNetStrategy") config.validation.suppression_immune_strategies = frozenset(existing_immune) + # GECToR neural corrector overrides + gector_env = _os.environ.get("MSC_USE_GECTOR", "").strip().lower() + gector_path_env = _os.environ.get("MSC_GECTOR_MODEL_PATH", "").strip() + gector_min_conf_env = _os.environ.get("MSC_GECTOR_MIN_CONFIDENCE", "").strip() + gector_conf_env = _os.environ.get("MSC_GECTOR_CONFIDENCE", "").strip() + gector_max_existing_env = _os.environ.get("MSC_GECTOR_MAX_EXISTING_ERRORS", "").strip() + if gector_env in ("1", "true", "yes", "on"): + config.validation.use_gector = True + print(" use_gector: True") + if gector_path_env: + config.validation.gector_model_path = gector_path_env + print(f" gector_model_path: {gector_path_env}") + if gector_min_conf_env: + config.validation.gector_min_confidence = float(gector_min_conf_env) + print(f" gector_min_confidence: {gector_min_conf_env}") + if gector_conf_env: + config.validation.gector_confidence = float(gector_conf_env) + print(f" gector_confidence: {gector_conf_env}") + if gector_max_existing_env: + 
config.validation.gector_max_existing_errors = int(gector_max_existing_env) + print(f" gector_max_existing_errors: {gector_max_existing_env}") + if config.validation.use_gector: + existing_immune = set(config.validation.suppression_immune_strategies or ()) + existing_immune.add("GECToRValidationStrategy") + config.validation.suppression_immune_strategies = frozenset(existing_immune) + + # Probe-based syllable-span detection overrides (v1.7.x neural enhancement). + # Three strategies share one probe model; toggle each independently. + probe_corr_env = _os.environ.get("MSC_USE_PROBE_CORRECTOR", "").strip().lower() + probe_comp_env = _os.environ.get("MSC_USE_PROBE_COMPOUND", "").strip().lower() + probe_rescue_env = _os.environ.get("MSC_USE_PROBE_RESCUE", "").strip().lower() + probe_path_env = _os.environ.get("MSC_PROBE_MODEL_PATH", "").strip() + probe_corr_thr_env = _os.environ.get("MSC_PROBE_CORRECTOR_THRESHOLD", "").strip() + probe_comp_thr_env = _os.environ.get("MSC_PROBE_COMPOUND_THRESHOLD", "").strip() + probe_comp_freq_env = _os.environ.get("MSC_PROBE_COMPOUND_MIN_FREQ", "").strip() + probe_rescue_thr_env = _os.environ.get("MSC_PROBE_RESCUE_THRESHOLD", "").strip() + probe_rescue_freq_env = _os.environ.get("MSC_PROBE_RESCUE_MIN_FREQ", "").strip() + probe_max_existing_env = _os.environ.get("MSC_PROBE_MAX_EXISTING_ERRORS", "").strip() + if probe_corr_env in ("1", "true", "yes", "on"): + config.validation.use_probe_corrector = True + print(" use_probe_corrector: True") + if probe_comp_env in ("1", "true", "yes", "on"): + config.validation.use_probe_compound = True + print(" use_probe_compound: True") + if probe_rescue_env in ("1", "true", "yes", "on"): + config.validation.use_probe_segmenter_rescue = True + print(" use_probe_segmenter_rescue: True") + if probe_path_env: + config.validation.probe_model_path = probe_path_env + print(f" probe_model_path: {probe_path_env}") + if probe_corr_thr_env: + config.validation.probe_corrector_threshold = 
float(probe_corr_thr_env) + print(f" probe_corrector_threshold: {probe_corr_thr_env}") + if probe_comp_thr_env: + config.validation.probe_compound_threshold = float(probe_comp_thr_env) + print(f" probe_compound_threshold: {probe_comp_thr_env}") + if probe_comp_freq_env: + config.validation.probe_compound_min_freq = int(probe_comp_freq_env) + print(f" probe_compound_min_freq: {probe_comp_freq_env}") + if probe_rescue_thr_env: + config.validation.probe_rescue_threshold = float(probe_rescue_thr_env) + print(f" probe_rescue_threshold: {probe_rescue_thr_env}") + if probe_rescue_freq_env: + config.validation.probe_rescue_min_freq = int(probe_rescue_freq_env) + print(f" probe_rescue_min_freq: {probe_rescue_freq_env}") + if probe_max_existing_env: + config.validation.probe_max_existing_errors = int(probe_max_existing_env) + print(f" probe_max_existing_errors: {probe_max_existing_env}") + # Auto-register ProbeValidationStrategy as suppression-immune (it uses the + # GECToRValidationStrategy class label so the downstream filters treat it + # the same as the legacy GECToR strategy for fusion / meta-classifier + # bypass purposes). ProbeBoostedCompoundStrategy is NOT added to this set + # — its dictionary-gated suggestions need to flow through the normal + # suppression cascade (verified empirically: adding it costs +25 FP). 
+ if config.validation.use_probe_corrector: + existing_immune = set(config.validation.suppression_immune_strategies or ()) + existing_immune.add("GECToRValidationStrategy") + config.validation.suppression_immune_strategies = frozenset(existing_immune) + # Initialize checker with specified database provider = SQLiteProvider(database_path=str(db_path)) checker = SpellChecker(config=config, provider=provider) diff --git a/pyproject.toml b/pyproject.toml index b0da2c9..2b9d980 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -4,7 +4,7 @@ build-backend = "setuptools.build_meta" [project] name = "myspellchecker" -version = "1.7.0" +version = "1.7.1" description = "Myanmar (Burmese) text intelligence library — spell checking, grammar validation, dictionary building, and AI model training" readme = "README.md" # NOTE: Codebase uses Python 3.10+ syntax (PEP 604 unions like `str | None`). diff --git a/src/myspellchecker/algorithms/probe/__init__.py b/src/myspellchecker/algorithms/probe/__init__.py new file mode 100644 index 0000000..34d6245 --- /dev/null +++ b/src/myspellchecker/algorithms/probe/__init__.py @@ -0,0 +1,16 @@ +"""Probe-based syllable-span detection for Myanmar spell checking. + +Provides a frozen-encoder + thin-Linear-head detector that achieves +0.0067 +composite when paired with rule-based correction strategies via the +ProbeBoostedCompoundStrategy and ProbeValidationStrategy. + +See [[Probe Hybrid Ships at +0.0067 2026-05-03]] for design and benchmark +results. 
+""" + +from myspellchecker.algorithms.probe.syllable_span_probe import ( + FrozenSyllableSpanProbe, + ProbeInferenceEngine, +) + +__all__ = ["FrozenSyllableSpanProbe", "ProbeInferenceEngine"] diff --git a/src/myspellchecker/algorithms/probe/syllable_span_probe.py b/src/myspellchecker/algorithms/probe/syllable_span_probe.py new file mode 100644 index 0000000..a8cb32c --- /dev/null +++ b/src/myspellchecker/algorithms/probe/syllable_span_probe.py @@ -0,0 +1,198 @@ +"""Frozen-encoder + thin-Linear-head syllable span probe. + +Production module for the v1.7.x neural enhancement. Wraps a frozen BERT-class +encoder with a single Linear layer that emits per-syllable binary span scores. +Trained for ~5 minutes on 50K examples, head-only (no encoder fine-tuning). + +Run-time inference helpers project per-syllable scores onto words via: + - direct char-span overlap, OR + - whitespace-adjacency (high-prob whitespace syllable attaches to the + preceding Myanmar word — the broken_compound signal). + +Validated artifact: ``models/probe-syllable-span-v1/`` (head.pt + config.json). +See ``30_Audits/Probe Hybrid Ships at +0.0067 2026-05-03.md``. 
def _detect_device() -> str:
    """Return the best available torch device string.

    Preference order is CUDA, then Apple MPS, then CPU. ``torch`` is imported
    inside the function, so it is only needed when this helper is called.

    Returns:
        ``"cuda"``, ``"mps"`` or ``"cpu"``.
    """
    import torch

    if torch.cuda.is_available():
        return "cuda"
    # Older torch builds have no ``torch.backends.mps`` attribute at all.
    if hasattr(torch.backends, "mps") and torch.backends.mps.is_available():
        return "mps"
    return "cpu"


class FrozenSyllableSpanProbe:
    """Frozen BERT encoder + thin per-syllable binary head.

    The encoder is loaded with ``AutoModel.from_pretrained`` and every encoder
    parameter has ``requires_grad`` switched off. The only other module is a
    single ``nn.Linear(hidden_size, 1)`` head, which is freshly initialised
    here; callers load trained weights into ``self.head`` themselves.
    """

    def __init__(self, encoder_path: str | Path):
        """Load the encoder and create the (untrained) linear head.

        Args:
            encoder_path: Encoder identifier or directory; converted to ``str``
                and handed to ``AutoModel.from_pretrained``.
        """
        # Heavy dependencies are imported lazily so the module can be imported
        # without torch/transformers installed.
        import torch
        import torch.nn as nn
        from transformers import AutoModel

        self.encoder = AutoModel.from_pretrained(str(encoder_path))
        for param in self.encoder.parameters():
            param.requires_grad = False
        self.head = nn.Linear(self.encoder.config.hidden_size, 1)
        self._torch = torch
        self._nn = nn

    def to(self, device: str) -> "FrozenSyllableSpanProbe":
        """Move encoder and head to *device* and return ``self`` for chaining."""
        self.encoder.to(device)
        self.head.to(device)
        return self

    def eval(self) -> "FrozenSyllableSpanProbe":
        """Put encoder and head into eval mode and return ``self`` for chaining."""
        self.encoder.eval()
        self.head.eval()
        return self

    @property
    def hidden_size(self) -> int:
        """Hidden size reported by the encoder config."""
        return int(self.encoder.config.hidden_size)

    def predict_logits(self, input_ids, attention_mask, syl_to_subword_mask):
        """Return per-syllable logits (B, S).

        Each syllable representation is the mean of the encoder hidden states
        of the subwords selected by ``syl_to_subword_mask``; the linear head
        maps that mean to one logit. The whole computation runs under
        ``torch.no_grad()``, so the result never tracks gradients.

        Args:
            input_ids: (B, T) tensor of subword ids.
            attention_mask: (B, T) tensor of 0/1.
            syl_to_subword_mask: (B, S, T) tensor; per syllable, mask
                of which subwords belong to it (overlap-based). Any numeric
                or boolean dtype is accepted.

        Returns:
            Tensor of shape (B, S) in the encoder's hidden-state dtype.
        """
        torch = self._torch
        with torch.no_grad():
            out = self.encoder(input_ids=input_ids, attention_mask=attention_mask)
            hidden = out.last_hidden_state  # (B, T, H)
            # Match the hidden states' dtype/device: einsum does not promote
            # mixed dtypes, so a hard float32 mask breaks half/double encoders.
            mask = syl_to_subword_mask.to(device=hidden.device, dtype=hidden.dtype)
            # Syllables with no subword keep a zero vector (denominator clamped to 1).
            denom = mask.sum(dim=-1, keepdim=True).clamp(min=1)
            syl_hidden = torch.einsum("bst,bth->bsh", mask, hidden) / denom
            return self.head(syl_hidden).squeeze(-1)
@dataclass
class _SyllableSpan:
    """Helper: a syllable's text and char span in the input."""

    text: str
    start: int
    end: int


class ProbeInferenceEngine:
    """High-level inference helper used by validation strategies.

    Loads probe + tokenizer + syllable segmenter once and exposes a single
    ``score_sentence(text)`` that returns per-syllable probabilities and the
    underlying syllable spans.
    """

    def __init__(
        self,
        model_path: str | Path,
        device: str | None = None,
        max_length: int = 256,
    ):
        """Load the probe artifact found in *model_path*.

        Args:
            model_path: Directory holding ``config.json`` (with an
                ``"encoder"`` entry) and ``head.pt`` (state dict of the
                linear head).
            device: Torch device string; when ``None`` the result of
                ``_detect_device()`` is used.
            max_length: Truncation length passed to the tokenizer.

        Raises:
            FileNotFoundError: If the directory, ``config.json`` or
                ``head.pt`` is missing.
            KeyError: If ``config.json`` has no ``"encoder"`` entry.
        """
        import torch
        from transformers import AutoTokenizer

        from myspellchecker.segmenters.regex import RegexSegmenter

        model_path = Path(model_path)
        if not model_path.exists():
            raise FileNotFoundError(
                f"Probe model directory not found: {model_path}. Expected head.pt + config.json."
            )

        config_path = model_path / "config.json"
        if not config_path.exists():
            raise FileNotFoundError(f"Missing probe config.json at {config_path}")
        # JSON is UTF-8 by specification; do not depend on the locale encoding.
        cfg = json.loads(config_path.read_text(encoding="utf-8"))
        try:
            encoder_path = cfg["encoder"]
        except KeyError as err:
            raise KeyError(f"Probe config {config_path} has no 'encoder' entry") from err

        head_path = model_path / "head.pt"
        if not head_path.exists():
            raise FileNotFoundError(f"Missing probe head.pt at {head_path}")

        self._torch = torch
        self.tokenizer = AutoTokenizer.from_pretrained(str(encoder_path))
        self.model = FrozenSyllableSpanProbe(encoder_path)
        # head.pt is a plain state dict, so restrict unpickling to tensors
        # (weights_only) and load on CPU; the model is moved to the target
        # device right after.
        head_state = torch.load(str(head_path), map_location="cpu", weights_only=True)
        self.model.head.load_state_dict(head_state)
        self.device = device or _detect_device()
        self.model.to(self.device)
        self.model.eval()
        self.max_length = max_length
        self.segmenter = RegexSegmenter()
        logger.info(
            "ProbeInferenceEngine loaded: encoder=%s head=%s device=%s",
            encoder_path,
            head_path,
            self.device,
        )

    def score_sentence(self, text: str) -> tuple[list[float], list[_SyllableSpan]]:
        """Return (per-syllable probability list, syllable span list).

        The text is split into syllables by the segmenter, each syllable is
        located in *text* to obtain its character span, and every tokenizer
        subword is assigned to the syllables whose span it overlaps. The
        model's per-syllable logits are passed through a sigmoid.

        Syllables that no subword overlaps (for example those beyond the
        tokenizer truncation at ``max_length``) get probability ``0.0``.

        Args:
            text: Sentence to score.

        Returns:
            ``(probs, spans)`` with one probability per span; both lists are
            empty for empty text or when the segmenter yields nothing.
        """
        if not text:
            return [], []

        syllables = self.segmenter.segment_syllables(text)
        if not syllables:
            return [], []

        # Recover character offsets by scanning forward through the text.
        spans: list[_SyllableSpan] = []
        cursor = 0
        for syllable in syllables:
            start = text.find(syllable, cursor)
            if start == -1:
                # Syllable text not found verbatim: fall back to the cursor.
                start = cursor
            end = start + len(syllable)
            spans.append(_SyllableSpan(text=syllable, start=start, end=end))
            cursor = end

        enc = self.tokenizer(
            text,
            return_offsets_mapping=True,
            truncation=True,
            max_length=self.max_length,
            return_tensors=None,
            padding=False,
        )

        # Vectorised overlap test: subword t belongs to syllable s when the
        # half-open char ranges intersect. Zero-width offsets are never
        # assigned to a syllable.
        offsets = np.asarray(enc["offset_mapping"], dtype=np.int64).reshape(-1, 2)
        tok_start = offsets[:, 0]
        tok_end = offsets[:, 1]
        syl_start = np.array([span.start for span in spans], dtype=np.int64)
        syl_end = np.array([span.end for span in spans], dtype=np.int64)
        overlap = (
            (tok_start[None, :] < syl_end[:, None])
            & (tok_end[None, :] > syl_start[:, None])
            & (tok_start != tok_end)[None, :]
        )
        mask = overlap.astype(np.float32)
        valid = overlap.any(axis=1)
        if not valid.any():
            return [0.0] * len(spans), spans

        torch = self._torch
        input_ids_t = torch.tensor(enc["input_ids"]).unsqueeze(0).to(self.device)
        am_t = torch.tensor(enc["attention_mask"]).unsqueeze(0).to(self.device)
        mask_t = torch.from_numpy(mask).unsqueeze(0).to(self.device)
        with torch.no_grad():
            logits = self.model.predict_logits(input_ids_t, am_t, mask_t)
        # detach(): .numpy() refuses tensors that track gradients.
        # float(): .numpy() does not support bfloat16.
        probs = torch.sigmoid(logits[0]).detach().float().cpu().numpy()
        probs = np.where(valid, probs, 0.0)
        return probs.tolist(), spans
+ ), + ) + + # Probe-based syllable-span detection (v1.7.x neural enhancement). + # Frozen GKLMIP-BERT + thin Linear head. Two strategies share one model: + # - ProbeValidationStrategy at priority 85 (replaces v3 GECToR) + # - ProbeBoostedCompoundStrategy at priority 24 (boosts BrokenCompoundStrategy) + # See `30_Audits/Probe Hybrid Ships at +0.0067 2026-05-03.md`. + use_probe_corrector: bool = Field( + default=False, + description=( + "Enable ProbeValidationStrategy (priority 85). Uses a probe model " + "(frozen encoder + thin head) instead of the v3 GECToR corrector. " + "Requires `probe_model_path` to point at a probe artifact " + "directory (head.pt + config.json)." + ), + ) + use_probe_compound: bool = Field( + default=False, + description=( + "Enable ProbeBoostedCompoundStrategy (priority 24). Pre-filter " + "before BrokenCompoundStrategy: emits broken_compound errors when " + "the probe scores a whitespace syllable highly AND the merged " + "compound exists in the dictionary at sufficient frequency." + ), + ) + probe_model_path: str | None = Field( + default=None, + description=( + "Path to probe model directory containing head.pt + config.json. " + "Used by both ProbeValidationStrategy and ProbeBoostedCompoundStrategy." + ), + ) + probe_corrector_threshold: float = Field( + default=0.75, + ge=0.0, + le=1.0, + description=( + "Per-syllable confidence threshold for ProbeValidationStrategy. " + "Optimal sweet spot is 0.75 (per benchmark sweep)." + ), + ) + probe_compound_threshold: float = Field( + default=0.7, + ge=0.0, + le=1.0, + description=( + "Whitespace-syllable confidence threshold for " + "ProbeBoostedCompoundStrategy. Optimal value is 0.7 — lower " + "values add FPs faster than TPs." + ), + ) + probe_compound_min_freq: int = Field( + default=50, + ge=0, + description=( + "Minimum corpus frequency for a merged compound to be suggested " + "by ProbeBoostedCompoundStrategy. Filters out rare compound " + "candidates that may be noisy." 
+ ), + ) + probe_max_existing_errors: int = Field( + default=100, + ge=0, + description=( + "Maximum existing-errors count allowed before either probe " + "strategy fires. Default 100 = effectively no gate (probe runs " + "regardless). Lower values restrict probes to safety-net mode." + ), + ) + use_probe_segmenter_rescue: bool = Field( + default=False, + description=( + "Enable ProbeSegmenterRescueStrategy (priority 26). Targets the " + "no-whitespace over-segmentation residual: when the segmenter " + "splits a typo into adjacent dict-valid tokens, the probe " + "scores the merge boundary and SymSpell ed=1 lookup on the " + "merged form rescues a high-frequency dict word as the " + "broken_compound suggestion." + ), + ) + probe_rescue_threshold: float = Field( + default=0.75, + ge=0.0, + le=1.0, + description=( + "Probe per-syllable threshold for ProbeSegmenterRescueStrategy. " + "Optimal sweep value 0.75 (paired with min_freq=2000)." + ), + ) + probe_rescue_min_freq: int = Field( + default=2000, + ge=0, + description=( + "Minimum corpus frequency for the SymSpell ed=1 candidate to be " + "emitted by ProbeSegmenterRescueStrategy. Higher values reduce " + "FPs from low-confidence dict matches; 2000 is the sweep optimum." + ), + ) + # MLM post-filter for invalid_word / dangling_word FP suppression mlm_plausibility_threshold: float = Field( default=3.0, diff --git a/src/myspellchecker/core/factories/builders.py b/src/myspellchecker/core/factories/builders.py index 0baf211..2b426e8 100644 --- a/src/myspellchecker/core/factories/builders.py +++ b/src/myspellchecker/core/factories/builders.py @@ -400,6 +400,7 @@ def build_context_validation_strategies( 4. SyllableWindowOOVStrategy (priority 22, needs symspell) 5. HiddenCompoundStrategy (priority 23) 6. StatisticalConfusableStrategy (priority 24) + ProbeBoostedCompoundStrategy (priority 24, optional) 7. BrokenCompoundStrategy (priority 25) 8. POSSequenceValidationStrategy (priority 30) 9. 
QuestionStructureValidationStrategy (priority 40) @@ -408,6 +409,8 @@ def build_context_validation_strategies( 12. ConfusableSemanticStrategy (priority 48) 13. NgramContextValidationStrategy (priority 50) 14. SemanticValidationStrategy (priority 70) + 15. GECToRValidationStrategy (priority 85, optional) + ProbeValidationStrategy (priority 85, optional) This is the canonical strategy building logic used by both DI factories and ComponentFactory. @@ -880,6 +883,92 @@ def build_context_validation_strategies( ) logger.debug("Added ByT5SafetyNetStrategy (priority 80)") + # Priority 85: GECToR neural corrector (safety-net). + # Only fires on sentences where the rule pipeline found zero errors. + if validation_config.use_gector and validation_config.gector_model_path: + from myspellchecker.algorithms.gector_corrector import GECToRCorrector + from myspellchecker.core.validation_strategies.gector_strategy import ( + GECToRValidationStrategy, + ) + + gector_corrector = GECToRCorrector(validation_config.gector_model_path) + strategies.append( + GECToRValidationStrategy( + corrector=gector_corrector, + enabled=True, + min_confidence=validation_config.gector_min_confidence, + confidence=validation_config.gector_confidence, + max_existing_errors=validation_config.gector_max_existing_errors, + ) + ) + logger.debug("Added GECToRValidationStrategy (priority 85)") + + # Priority 24/26/85: probe-based syllable-span detection (v1.7.x neural). + # Frozen GKLMIP-BERT + thin Linear head. Three strategies share one engine: + # - ProbeBoostedCompoundStrategy at priority 24 (whitespace compound boost) + # - ProbeSegmenterRescueStrategy at priority 26 (no-whitespace over-seg rescue) + # - ProbeValidationStrategy at priority 85 (replaces v3 GECToR) + # See `30_Audits/Probe Hybrid Ships at +0.0067 2026-05-03.md` and + # `30_Audits/Probe Phase 2A Ships at +0.0111 2026-05-04.md`. 
+ needs_probe = ( + validation_config.use_probe_corrector + or validation_config.use_probe_compound + or validation_config.use_probe_segmenter_rescue + ) + if needs_probe and validation_config.probe_model_path: + from myspellchecker.algorithms.probe.syllable_span_probe import ( + ProbeInferenceEngine, + ) + + probe_engine = ProbeInferenceEngine(validation_config.probe_model_path) + + if validation_config.use_probe_compound: + from myspellchecker.core.validation_strategies.probe_boosted_compound_strategy import ( + ProbeBoostedCompoundStrategy, + ) + + strategies.append( + ProbeBoostedCompoundStrategy( + engine=probe_engine, + provider=provider, + threshold=validation_config.probe_compound_threshold, + compound_min_freq=validation_config.probe_compound_min_freq, + max_existing_errors=validation_config.probe_max_existing_errors, + ) + ) + logger.debug("Added ProbeBoostedCompoundStrategy (priority 24)") + + if validation_config.use_probe_segmenter_rescue and symspell is not None: + from myspellchecker.core.validation_strategies.probe_segmenter_rescue_strategy import ( + ProbeSegmenterRescueStrategy, + ) + + strategies.append( + ProbeSegmenterRescueStrategy( + engine=probe_engine, + provider=provider, + symspell=symspell, + threshold=validation_config.probe_rescue_threshold, + min_freq=validation_config.probe_rescue_min_freq, + max_existing_errors=validation_config.probe_max_existing_errors, + ) + ) + logger.debug("Added ProbeSegmenterRescueStrategy (priority 26)") + + if validation_config.use_probe_corrector: + from myspellchecker.core.validation_strategies.probe_strategy import ( + ProbeValidationStrategy, + ) + + strategies.append( + ProbeValidationStrategy( + engine=probe_engine, + threshold=validation_config.probe_corrector_threshold, + max_existing_errors=validation_config.probe_max_existing_errors, + ) + ) + logger.debug("Added ProbeValidationStrategy (priority 85)") + logger.info( f"Built {len(strategies)} validation strategies: " f"{[s.__class__.__name__ for s 
in strategies]}" diff --git a/src/myspellchecker/core/validation_strategies/meta_fusion.py b/src/myspellchecker/core/validation_strategies/meta_fusion.py index 8bc3fb8..bd46843 100644 --- a/src/myspellchecker/core/validation_strategies/meta_fusion.py +++ b/src/myspellchecker/core/validation_strategies/meta_fusion.py @@ -61,6 +61,9 @@ { "LoanWordValidationStrategy", "VisargaStrategy", + "GECToRValidationStrategy", + "ProbeBoostedCompoundStrategy", + "ProbeSegmenterRescueStrategy", } ) diff --git a/src/myspellchecker/core/validation_strategies/probe_boosted_compound_strategy.py b/src/myspellchecker/core/validation_strategies/probe_boosted_compound_strategy.py new file mode 100644 index 0000000..546d6ba --- /dev/null +++ b/src/myspellchecker/core/validation_strategies/probe_boosted_compound_strategy.py @@ -0,0 +1,143 @@ +"""Probe-boosted broken-compound detection strategy (priority 24). + +Uses the same trained probe as ProbeValidationStrategy, but operates as a +pre-filter for the existing BrokenCompoundStrategy (priority 25). For each +adjacent word pair separated by whitespace, this strategy: + +1. Reads the probe's per-syllable score on the whitespace syllable between + the words (the broken_compound signal). +2. If the score >= threshold AND the merged compound exists in the dictionary + at sufficient frequency, emits a broken_compound error with the merged + compound as the top-1 suggestion. + +This bypasses BrokenCompoundStrategy's rare_threshold and compound_ratio +heuristic gates (which reject many true positives) and replaces them with +neural evidence + dictionary membership. + +Composes with ProbeValidationStrategy at priority 85. +See ``30_Audits/Probe Hybrid Ships at +0.0067 2026-05-03.md``. 
class ProbeBoostedCompoundStrategy(ValidationStrategy):
    """Probe + dict gate for broken-compound detection (priority 24).

    For every adjacent word pair that has a gap between the two words, the
    strategy reads the probe probability of the whitespace syllable starting
    right after the first word. When that probability reaches ``threshold``
    and the concatenation of both words is a valid dictionary word with
    frequency of at least ``compound_min_freq``, a ``broken_compound`` error
    is emitted with the merged word as the only suggestion.
    """

    bypass_fast_path = True

    def __init__(
        self,
        engine: "ProbeInferenceEngine",
        provider: "DictionaryProvider",
        threshold: float = 0.7,
        compound_min_freq: int = 50,
        max_existing_errors: int = 100,
    ):
        """Store the collaborators and gates.

        Args:
            engine: Probe engine; only ``score_sentence`` is used.
            provider: Dictionary; ``is_valid_word`` and
                ``get_word_frequency`` are used.
            threshold: Minimum whitespace-syllable probability.
            compound_min_freq: Minimum frequency of the merged compound.
            max_existing_errors: The strategy is skipped when the context
                already holds more errors than this.
        """
        self._engine = engine
        self._provider = provider
        self._threshold = threshold
        self._compound_min_freq = compound_min_freq
        self._max_existing_errors = max_existing_errors

    def priority(self) -> int:
        """Return the strategy priority (24)."""
        return 24

    def validate(self, context: ValidationContext) -> list[Error]:
        """Emit ``broken_compound`` errors for probe-flagged word pairs.

        Besides returning the errors, each hit is recorded on *context*:
        both word positions are marked in ``existing_errors`` and the first
        position receives the suggestion and the confidence.

        Args:
            context: Validation context. ``word_positions`` are used as
                character offsets into ``context.sentence`` (they are
                compared with the probe's character spans).

        Returns:
            The new errors; empty when a gate rejects the sentence or probe
            inference fails.
        """
        words = context.words
        positions = context.word_positions
        if len(words) < 2:
            return []
        if len(context.existing_errors) > self._max_existing_errors:
            return []
        if not context.sentence:
            return []

        try:
            probs, syl_spans = self._engine.score_sentence(context.sentence)
        except Exception:
            # Best effort: a broken probe must not take the whole pipeline down.
            logger.error("Probe inference failed in compound strategy", exc_info=True)
            return []
        if not syl_spans:
            return []

        # Index probability of whitespace syllables by their start char position.
        ws_prob_at_pos: dict[int, float] = {
            span.start: float(prob)
            for prob, span in zip(probs, syl_spans)
            if span.text.strip() == ""
        }

        virama = "\u1039"  # Pali stacking marker
        name_mask = context.is_name_mask
        # Only pairs for which both a word and a position exist.
        pair_count = min(len(words), len(positions)) - 1

        errors: list[Error] = []
        for i in range(pair_count):
            pos_i = positions[i]
            pos_next = positions[i + 1]
            if pos_i in context.existing_errors or pos_next in context.existing_errors:
                continue
            w1 = words[i]
            w2 = words[i + 1]
            if name_mask:
                if i < len(name_mask) and name_mask[i]:
                    continue
                if i + 1 < len(name_mask) and name_mask[i + 1]:
                    continue
            w1_end = pos_i + len(w1)
            # Skip if no whitespace gap between the two words
            if pos_next == w1_end:
                continue
            # Skip Pali stacking compounds — segmenter splits these unreliably
            if virama in w1 or virama in w2:
                continue
            # Skip reduplication
            if w1 == w2:
                continue

            ws_prob = ws_prob_at_pos.get(w1_end, 0.0)
            if ws_prob < self._threshold:
                continue

            compound = w1 + w2
            try:
                if not self._provider.is_valid_word(compound):
                    continue
                compound_freq = self._provider.get_word_frequency(compound)
                if compound_freq < self._compound_min_freq:
                    continue
            except Exception:
                # Still best effort, but leave a trace instead of hiding
                # provider problems completely.
                logger.debug(
                    "Dictionary lookup failed for compound candidate %r",
                    compound,
                    exc_info=True,
                )
                continue

            # One capped value for both the emitted error and the context
            # record, so later strategies see the same confidence.
            confidence = min(0.95, ws_prob)
            errors.append(
                Error(
                    text=f"{w1} {w2}",
                    position=pos_i,
                    error_type=ET_BROKEN_COMPOUND,
                    suggestions=[Suggestion(text=compound)],
                    confidence=confidence,
                    source_strategy="ProbeBoostedCompoundStrategy",
                )
            )
            context.existing_errors[pos_i] = ET_BROKEN_COMPOUND
            context.existing_errors[pos_next] = ET_BROKEN_COMPOUND
            context.existing_suggestions[pos_i] = [compound]
            context.existing_confidences[pos_i] = confidence
        return errors
index 0000000..13b2b0e --- /dev/null +++ b/src/myspellchecker/core/validation_strategies/probe_segmenter_rescue_strategy.py @@ -0,0 +1,215 @@ +"""Probe-boosted no-whitespace over-segmentation rescue strategy (priority 26). + +Targets the residual broken-compound bucket where the segmenter splits a +typo into adjacent dictionary-valid tokens (no whitespace between them). +Example: ``သံဂါ`` → segmented as ``သံ`` + ``ဂါ`` (both valid standalone), +so WordValidator never sees the typo. SymSpell would find the gold +``သံဃာ`` if given the merged form, but no upstream strategy passes it. + +Algorithm: for each adjacent (no-whitespace) Myanmar pair (w1, w2), if the +trained probe scores syllables on the merge boundary above threshold AND a +SymSpell ed=1 lookup on (w1+w2) returns a high-frequency dict word that +isn't the merged form itself, emit a broken_compound error with that +candidate as the top-1 suggestion. + +Composes with ProbeBoostedCompoundStrategy at priority 24 (which handles +the *whitespace-adjacent* compound case) and ProbeValidationStrategy at +priority 85 (general detection). All three share one ProbeInferenceEngine. + +See ``30_Audits/Probe Phase 2A Ships at +0.0111 2026-05-04.md``. 
class ProbeSegmenterRescueStrategy(ValidationStrategy):
    """No-whitespace over-segmentation rescue (priority 26).

    For each pair of adjacent words with no gap between them, the strategy
    takes the highest probe probability over the syllables overlapping the
    pair. When that reaches ``threshold``, the concatenation is not itself a
    valid word, and SymSpell yields an edit-distance-1 dictionary word with
    frequency of at least ``min_freq``, a ``broken_compound`` error is
    emitted with that word as its only suggestion.

    Args:
        engine: Probe engine; ``score_sentence(sentence)`` must return
            ``(probs, syl_spans)`` where spans expose ``start`` and ``end``.
        provider: Dictionary used for validity and frequency lookups.
        symspell: SymSpell instance queried with the merged form.
        threshold: Minimum probe probability required to consider a pair.
        min_freq: Minimum dictionary frequency for a rescue candidate.
        max_existing_errors: The strategy is skipped entirely when the
            context already holds more errors than this.
    """

    bypass_fast_path = True

    def __init__(
        self,
        engine: "ProbeInferenceEngine",
        provider: "DictionaryProvider",
        symspell: "SymSpell",
        threshold: float = 0.75,
        min_freq: int = 2000,
        max_existing_errors: int = 100,
    ):
        self._engine = engine
        self._provider = provider
        self._symspell = symspell
        self._threshold = threshold
        self._min_freq = min_freq
        self._max_existing_errors = max_existing_errors

    def priority(self) -> int:
        """Return the execution priority of this strategy."""
        return 26  # after BrokenCompoundStrategy (25), so it doesn't preempt

    def _is_excluded(self, w1: str, w2: str) -> bool:
        """Return True when the pair must never be merged.

        A pair is excluded when either word contains the virama, the words
        are identical (reduplication), either word is a never-merge
        particle, or either word is at most one character long.
        """
        if VIRAMA in w1 or VIRAMA in w2:
            return True
        if w1 == w2:
            return True
        if w1 in NEVER_MERGE_PARTICLES or w2 in NEVER_MERGE_PARTICLES:
            return True
        return len(w1) <= 1 or len(w2) <= 1

    def _lookup_merged(self, merged: str) -> str | None:
        """Return top dict-valid candidate at ed=1 with sufficient frequency.

        Restricted to ed=1 because ed=2 introduces too many false matches in
        the compound-merge use case (a different consonant cluster would
        silently substitute valid but unrelated words).

        Returns ``None`` when the lookup fails, returns nothing, or no
        candidate passes the validity and frequency gates.
        """
        try:
            results = self._symspell.lookup(
                merged, level="word", max_suggestions=5, include_known=False
            )
        except Exception:
            # Best-effort: a failed lookup means "no rescue", but leave a trace.
            logger.debug("SymSpell lookup failed for %r", merged, exc_info=True)
            return None
        for result in results or ():
            candidate = result.term
            if candidate == merged or result.edit_distance > 1:
                continue
            try:
                if not self._provider.is_valid_word(candidate):
                    continue
                freq = self._provider.get_word_frequency(candidate)
            except Exception:
                logger.debug(
                    "Dictionary lookup failed for %r", candidate, exc_info=True
                )
                continue
            if freq is not None and freq >= self._min_freq:
                return candidate
        return None

    @staticmethod
    def _max_overlap_prob(probs, syl_spans, start: int, end: int) -> float:
        """Return the highest probability among spans overlapping [start, end)."""
        best = 0.0
        # zip() stops at the shorter sequence, so a probs/spans length
        # mismatch cannot raise IndexError.
        for prob, span in zip(probs, syl_spans):
            if span.start < end and span.end > start and prob > best:
                best = float(prob)
        return best

    @staticmethod
    def _is_name(name_mask, index: int) -> bool:
        """Return True when ``index`` is inside the mask and flagged as a name."""
        return bool(name_mask) and index < len(name_mask) and bool(name_mask[index])

    def validate(self, context: ValidationContext) -> list[Error]:
        """Emit broken-compound errors for adjacent no-whitespace word pairs.

        Every emitted error is also recorded in ``context.existing_errors``
        (both word positions), ``context.existing_suggestions`` and
        ``context.existing_confidences`` (first word position only).

        Returns:
            The errors found; empty when the context has fewer than two
            words, already holds too many errors, has no sentence, or the
            probe fails or returns no spans.
        """
        if len(context.words) < 2:
            return []
        if len(context.existing_errors) > self._max_existing_errors:
            return []
        if not context.sentence:
            return []

        try:
            probs, syl_spans = self._engine.score_sentence(context.sentence)
        except Exception:
            logger.error("Probe inference failed in rescue strategy", exc_info=True)
            return []
        if not syl_spans:
            return []

        name_mask = context.is_name_mask
        # Only pairs whose two words both have a recorded position are usable.
        pair_count = min(len(context.words), len(context.word_positions)) - 1

        errors: list[Error] = []
        for i in range(pair_count):
            pos_i = context.word_positions[i]
            pos_next = context.word_positions[i + 1]
            if pos_i in context.existing_errors or pos_next in context.existing_errors:
                continue
            w1 = context.words[i]
            w2 = context.words[i + 1]
            # Only the no-whitespace path; whitespace cases are handled by
            # ProbeBoostedCompoundStrategy at priority 24.
            if pos_next != pos_i + len(w1):
                continue
            if self._is_excluded(w1, w2):
                continue
            if self._is_name(name_mask, i) or self._is_name(name_mask, i + 1):
                continue

            max_prob = self._max_overlap_prob(
                probs, syl_spans, pos_i, pos_next + len(w2)
            )
            if max_prob < self._threshold:
                continue

            merged = w1 + w2
            try:
                # Don't flag if the merged form itself is already a valid word
                # (no fix needed; user may have intentionally typed it).
                # Checked before the SymSpell lookup because it is the cheaper gate.
                if self._provider.is_valid_word(merged):
                    continue
            except Exception:
                logger.debug(
                    "Dictionary lookup failed for %r", merged, exc_info=True
                )
                continue

            cand = self._lookup_merged(merged)
            if cand is None:
                continue

            errors.append(
                Error(
                    text=merged,
                    position=pos_i,
                    error_type=ET_BROKEN_COMPOUND,
                    suggestions=[Suggestion(text=cand)],
                    confidence=min(0.9, max_prob),
                    source_strategy="ProbeSegmenterRescueStrategy",
                )
            )
            context.existing_errors[pos_i] = ET_BROKEN_COMPOUND
            context.existing_errors[pos_next] = ET_BROKEN_COMPOUND
            context.existing_suggestions[pos_i] = [cand]
            # NOTE(review): the context keeps the uncapped probability while the
            # Error reports the value capped at 0.9 - confirm this is intended.
            context.existing_confidences[pos_i] = max_prob
        return errors
class ProbeValidationStrategy(ValidationStrategy):
    """Frozen-encoder syllable-span detection (priority 85).

    Each syllable whose probe probability reaches ``threshold`` is projected
    onto every word it overlaps; a whitespace syllable is also attached to
    the word that ends exactly where it starts. A word is flagged with the
    highest probability projected onto it. Suggestions are left empty.

    Args:
        engine: Probe engine; ``score_sentence(sentence)`` must return
            ``(probs, syl_spans)`` where spans expose ``text``, ``start``
            and ``end``.
        threshold: Minimum syllable probability that can flag a word.
        max_existing_errors: The strategy is skipped entirely when the
            context already holds more errors than this.
    """

    bypass_fast_path = True

    def __init__(
        self,
        engine: "ProbeInferenceEngine",
        threshold: float = 0.75,
        max_existing_errors: int = 100,
    ):
        self._engine = engine
        self._threshold = threshold
        self._max_existing_errors = max_existing_errors

    def priority(self) -> int:
        """Return the execution priority of this strategy."""
        return 85

    def validate(self, context: ValidationContext) -> list[Error]:
        """Flag words covered by (or followed by) a high-probability syllable.

        Positions already present in ``context.existing_errors`` are left
        untouched so that the error type, suggestions and confidence written
        by earlier strategies are not overwritten with an empty suggestion
        list.

        Returns:
            The new errors; empty when the context has no words or no
            sentence, already holds too many errors, or the probe fails or
            returns no spans.
        """
        if not context.words:
            return []
        if len(context.existing_errors) > self._max_existing_errors:
            return []
        if not context.sentence:
            return []

        try:
            probs, syl_spans = self._engine.score_sentence(context.sentence)
        except Exception:
            logger.error("Probe inference failed", exc_info=True)
            return []
        if not syl_spans:
            return []

        # Only words that have a recorded position can be mapped to spans.
        word_count = min(len(context.words), len(context.word_positions))
        word_spans = [
            (
                context.word_positions[i],
                context.word_positions[i] + len(context.words[i]),
            )
            for i in range(word_count)
        ]

        # Per-word max prob: include syllables that overlap the word OR are a
        # whitespace syllable immediately following the word (compound boundary).
        word_max_prob = [0.0] * word_count
        # zip() stops at the shorter sequence, so a probs/spans length
        # mismatch cannot raise IndexError.
        for prob, span in zip(probs, syl_spans):
            if prob < self._threshold:
                continue
            is_whitespace = span.text.strip() == ""
            for w_idx, (w_start, w_end) in enumerate(word_spans):
                overlaps = span.start < w_end and span.end > w_start
                adjacent = is_whitespace and span.start == w_end
                if (overlaps or adjacent) and prob > word_max_prob[w_idx]:
                    word_max_prob[w_idx] = float(prob)

        name_mask = context.is_name_mask
        errors: list[Error] = []
        for w_idx, max_prob in enumerate(word_max_prob):
            if max_prob < self._threshold:
                continue
            if name_mask and w_idx < len(name_mask) and name_mask[w_idx]:
                continue
            position = context.word_positions[w_idx]
            # An earlier strategy already owns this position; re-flagging it
            # here would replace its suggestions with an empty list.
            if position in context.existing_errors:
                continue
            errors.append(
                Error(
                    text=context.words[w_idx],
                    position=position,
                    error_type=ET_PROBE,
                    suggestions=[],
                    confidence=max_prob,
                    # NOTE(review): reported under the legacy GECToR strategy
                    # name - presumably kept for downstream compatibility;
                    # confirm before renaming.
                    source_strategy="GECToRValidationStrategy",
                )
            )
            context.existing_errors[position] = ET_PROBE
            context.existing_suggestions[position] = []
            context.existing_confidences[position] = max_prob
        return errors
PROBE_MODEL_PATH = Path("models/probe-syllable-span-v1")
# The probe counts as available only when the directory and both artifacts exist.
PROBE_AVAILABLE = all(
    required.exists()
    for required in (
        PROBE_MODEL_PATH,
        PROBE_MODEL_PATH / "head.pt",
        PROBE_MODEL_PATH / "config.json",
    )
)

# Module-wide skip: every test in this file is skipped without the model.
pytestmark = pytest.mark.skipif(
    not PROBE_AVAILABLE,
    reason=f"Probe model not present at {PROBE_MODEL_PATH}; skip integration tests.",
)


@pytest.fixture(scope="module")
def probe_engine():
    """Load the probe inference engine once per test module."""
    from myspellchecker.algorithms.probe.syllable_span_probe import (
        ProbeInferenceEngine,
    )

    model_dir = str(PROBE_MODEL_PATH)
    return ProbeInferenceEngine(model_dir)


@pytest.fixture(scope="module")
def provider():
    """Open the production dictionary, skipping when the DB file is absent."""
    from myspellchecker.providers.sqlite import SQLiteProvider

    db_path = Path("data/mySpellChecker_production.db")
    if db_path.exists():
        return SQLiteProvider(database_path=str(db_path))
    pytest.skip(f"Production DB not present at {db_path}")


def _build_context(text: str, provider):
    """Segment ``text`` and wrap its Myanmar words in a ValidationContext.

    Each token's offset is found by searching forward from the end of the
    previous token; a token that cannot be found is assigned the current
    search offset. Blank tokens, punctuation and tokens without Myanmar
    characters are dropped. The name mask is all ``False``.
    """
    from myspellchecker.core.constants.myanmar_constants import contains_myanmar
    from myspellchecker.core.spellchecker import SpellChecker
    from myspellchecker.core.validation_strategies.base import ValidationContext
    from myspellchecker.core.validators.base import Validator

    tokens = SpellChecker(provider=provider).segmenter.segment_words(text)

    kept_words: list[str] = []
    kept_positions: list[int] = []
    search_from = 0
    for token in tokens:
        found_at = text.find(token, search_from)
        start = search_from if found_at == -1 else found_at
        search_from = start + len(token)
        if not token.strip():
            continue
        if Validator.is_punctuation(token):
            continue
        if not contains_myanmar(token, allow_extended=False):
            continue
        kept_words.append(token)
        kept_positions.append(start)

    return ValidationContext(
        sentence=text,
        words=kept_words,
        word_positions=kept_positions,
        is_name_mask=[False for _ in kept_words],
    )
# BM-EXP-E003 — broken_compound FN shared by the fire tests below.
_BROKEN_COMPOUND_SENTENCE = (
    "Delos သည် သေးငယ်သော စတိုးဆိုင်ဖြစ်ပြီး ထိုနေ ရာကို မသွားတော့ပါ။"
)


def _suggestion_text(suggestion) -> str:
    """Return the plain text of a suggestion, whether it is an object or a str."""
    return suggestion.text if hasattr(suggestion, "text") else str(suggestion)


def test_inference_engine_loads_and_scores(probe_engine):
    """Engine returns per-syllable probabilities and matching syllable spans."""
    text = "မြန်မာနိုင်ငံ"
    probs, spans = probe_engine.score_sentence(text)
    assert len(probs) == len(spans)
    assert all(0.0 <= p <= 1.0 for p in probs)


def test_probe_strategy_fires_on_broken_compound(probe_engine, provider):
    """ProbeValidationStrategy emits an error on a known broken-compound FN."""
    from myspellchecker.core.validation_strategies.probe_strategy import (
        ProbeValidationStrategy,
    )

    strategy = ProbeValidationStrategy(engine=probe_engine, threshold=0.75, max_existing_errors=100)
    ctx = _build_context(_BROKEN_COMPOUND_SENTENCE, provider)
    errors = strategy.validate(ctx)
    assert len(errors) >= 1
    assert any(e.text == "နေ" for e in errors), (
        f"Expected fire on 'နေ', got: {[(e.position, e.text) for e in errors]}"
    )


def test_probe_strategy_silent_on_clean_text(probe_engine, provider):
    """Probe should not over-fire on clean Myanmar sentences."""
    from myspellchecker.core.validation_strategies.probe_strategy import (
        ProbeValidationStrategy,
    )

    strategy = ProbeValidationStrategy(engine=probe_engine, threshold=0.75, max_existing_errors=100)
    # Short clean sentence
    text = "သူသည်ကျောင်းသို့သွားသည်။"
    ctx = _build_context(text, provider)
    errors = strategy.validate(ctx)
    # We tolerate up to 1 false fire on a single clean sentence (the model
    # trained without per-sentence calibration); production aggregate FPR
    # is the real gate.
    assert len(errors) <= 1


def test_probe_boosted_compound_fires_with_dict_gate(probe_engine, provider):
    """ProbeBoostedCompoundStrategy fires on whitespace + dict hit."""
    from myspellchecker.core.validation_strategies.probe_boosted_compound_strategy import (
        ProbeBoostedCompoundStrategy,
    )

    strategy = ProbeBoostedCompoundStrategy(
        engine=probe_engine,
        provider=provider,
        threshold=0.7,
        compound_min_freq=50,
        max_existing_errors=100,
    )
    ctx = _build_context(_BROKEN_COMPOUND_SENTENCE, provider)
    errors = strategy.validate(ctx)
    assert len(errors) == 1
    err = errors[0]
    assert err.text == "နေ ရာ"
    # Compare on plain text so the check does not depend on how suggestion
    # objects implement equality.
    assert [_suggestion_text(s) for s in err.suggestions] == ["နေရာ"]
    assert err.error_type == "broken_compound"
    assert err.source_strategy == "ProbeBoostedCompoundStrategy"


def test_probe_boosted_compound_silent_when_compound_not_in_dict(probe_engine, provider):
    """ProbeBoostedCompoundStrategy must NOT fire if merged form is not a known word."""
    from myspellchecker.core.validation_strategies.probe_boosted_compound_strategy import (
        ProbeBoostedCompoundStrategy,
    )

    strategy = ProbeBoostedCompoundStrategy(
        engine=probe_engine,
        provider=provider,
        threshold=0.7,
        compound_min_freq=50,
        max_existing_errors=100,
    )
    # Two unrelated words separated by space — concatenation is gibberish
    text = "ရှေ့ ပိတ်"
    ctx = _build_context(text, provider)
    errors = strategy.validate(ctx)
    # Either dict-gate rejected or probe didn't fire — both acceptable.
    # If anything fires, its top suggestion must be a real dict word.
    for e in errors:
        assert e.suggestions, f"Error on {e.text!r} carries no suggestion"
        assert provider.is_valid_word(_suggestion_text(e.suggestions[0]))


def test_config_flags_wire_through_builder(provider):
    """Enabling probe flags via config registers both strategies."""
    from myspellchecker.core.config.main import SpellCheckerConfig
    from myspellchecker.core.spellchecker import SpellChecker

    config = SpellCheckerConfig()
    config.validation.use_probe_corrector = True
    config.validation.use_probe_compound = True
    config.validation.probe_model_path = str(PROBE_MODEL_PATH)
    checker = SpellChecker(config=config, provider=provider)
    classes = {s.__class__.__name__ for s in checker.context_validator.strategies}
    assert "ProbeValidationStrategy" in classes
    assert "ProbeBoostedCompoundStrategy" in classes


def test_disabled_by_default(provider):
    """With default config, none of the probe strategies are registered."""
    from myspellchecker.core.config.main import SpellCheckerConfig
    from myspellchecker.core.spellchecker import SpellChecker

    config = SpellCheckerConfig()
    # Defaults: use_probe_corrector / compound / segmenter_rescue all False
    checker = SpellChecker(config=config, provider=provider)
    classes = {s.__class__.__name__ for s in checker.context_validator.strategies}
    assert "ProbeValidationStrategy" not in classes
    assert "ProbeBoostedCompoundStrategy" not in classes
    assert "ProbeSegmenterRescueStrategy" not in classes


def test_probe_segmenter_rescue_runs_without_error(probe_engine, provider):
    """ProbeSegmenterRescueStrategy runs and returns a list (any cardinality)."""
    from myspellchecker.core.spellchecker import SpellChecker
    from myspellchecker.core.validation_strategies.probe_segmenter_rescue_strategy import (
        ProbeSegmenterRescueStrategy,
    )

    symspell = SpellChecker(provider=provider).symspell
    strategy = ProbeSegmenterRescueStrategy(
        engine=probe_engine,
        provider=provider,
        symspell=symspell,
        threshold=0.75,
        min_freq=2000,
        max_existing_errors=100,
    )
    text = "သံဂါအတွက် သူ စျေးကို စောစော သွားခဲ့တယ် လို့ မေမေက ပြောတယ်။"
    ctx = _build_context(text, provider)
    errors = strategy.validate(ctx)
    # The strategy may or may not fire on this specific sentence; only requirement
    # is that it runs without exception and returns a list of errors. Any errors
    # emitted must have non-empty suggestions and ed=1 dict-valid candidates.
    assert isinstance(errors, list)
    for e in errors:
        assert e.error_type == "broken_compound"
        assert e.source_strategy == "ProbeSegmenterRescueStrategy"
        assert len(e.suggestions) >= 1
        for s in e.suggestions:
            cand = _suggestion_text(s)
            assert provider.is_valid_word(cand), f"Emitted suggestion {cand!r} is not in dict"


def test_probe_segmenter_rescue_silent_on_pali_stacking(probe_engine, provider):
    """ProbeSegmenterRescueStrategy must NOT fire on Pali stacking sequences."""
    from myspellchecker.core.spellchecker import SpellChecker
    from myspellchecker.core.validation_strategies.probe_segmenter_rescue_strategy import (
        ProbeSegmenterRescueStrategy,
    )

    # Use production-configured SymSpell (matches the shipping pipeline)
    symspell = SpellChecker(provider=provider).symspell

    strategy = ProbeSegmenterRescueStrategy(
        engine=probe_engine,
        provider=provider,
        symspell=symspell,
        threshold=0.75,
        min_freq=2000,
        max_existing_errors=100,
    )
    # Word containing Pali virama — should be excluded by linguistic gate
    text = "ဗုဒ္ဓဘာသာ ဆိုသည်မှာ"  # has ္ (virama)
    ctx = _build_context(text, provider)
    errors = strategy.validate(ctx)
    # The strategy should not flag any Pali-stacking adjacency
    for e in errors:
        assert "္" not in e.text, f"Should not fire on virama-containing text: {e.text}"


def test_all_three_probe_flags_register(provider):
    """Enabling all three probe flags registers all three strategies."""
    from myspellchecker.core.config.main import SpellCheckerConfig
    from myspellchecker.core.spellchecker import SpellChecker

    config = SpellCheckerConfig()
    config.validation.use_probe_corrector = True
    config.validation.use_probe_compound = True
    config.validation.use_probe_segmenter_rescue = True
    config.validation.probe_model_path = str(PROBE_MODEL_PATH)
    checker = SpellChecker(config=config, provider=provider)
    classes = {s.__class__.__name__ for s in checker.context_validator.strategies}
    assert "ProbeValidationStrategy" in classes
    assert "ProbeBoostedCompoundStrategy" in classes
    assert "ProbeSegmenterRescueStrategy" in classes