From 9f630d1d9345ab6dd8d8b3086b7f4f7cf3f70dea Mon Sep 17 00:00:00 2001 From: Thet Twe Date: Tue, 2 Jun 2026 18:50:37 +0800 Subject: [PATCH 01/13] feat(fn): ortho-insertion rescue + honest FN telemetry (v1.8.0 seed) Stage-2 carve-out in _suppress_compound_split_valid_words: keep (do not suppress) compound-split tokens whose ed=1 SymSpell top-1 is a single in-syllable-diacritic INSERTION, emitting the corrected syllable at the narrow gold span. +0.0007 composite / +21 TP within the clean-FP cap, behind default-off flag compound_split_ortho_insertion_rescue. Also: split the candidate_not_generated FN telemetry into an honest candidate_suppressed bucket (generatable but not surfaced), and make the benchmark output filename collision-proof (microsecond resolution). Workstream: ortho-suppressor-rescue --- benchmarks/run_benchmark.py | 107 ++++++++++- .../core/config/validation_configs.py | 9 + src/myspellchecker/core/error_suppression.py | 178 +++++++++++++++++- 3 files changed, 286 insertions(+), 8 deletions(-) diff --git a/benchmarks/run_benchmark.py b/benchmarks/run_benchmark.py index 18fb18f..a09b62a 100644 --- a/benchmarks/run_benchmark.py +++ b/benchmarks/run_benchmark.py @@ -330,14 +330,20 @@ def _is_token_boundary_hidden(target_text: str, words: list[str]) -> bool: return False -def classify_false_negative_reason( +def _classify_fn_raw( *, checker: Any, input_text: str, gold_error: dict[str, Any], segmented_words: list[str], ) -> str: - """Classify likely cause for one false negative.""" + """Semantic-MLM heuristic for one false negative (raw verdict). + + NOTE: this consults only the semantic checker, so it cannot distinguish a + true generation gap from a candidate that the word-path generated and + suppression/ranking later killed. The :func:`classify_false_negative_reason` + wrapper refines the ``candidate_not_generated`` verdict using SymSpell. 
+ """ span = gold_error.get("span", {}) start = int(span.get("start", -1)) end = int(span.get("end", -1)) @@ -412,6 +418,69 @@ def classify_false_negative_reason( return "candidate_not_generated" +def _symspell_can_generate(checker: Any, target_text: str, gold_correction: Any) -> bool: + """True when the word-candidate path (SymSpell) can produce the gold for + ``target_text`` — i.e. a candidate WAS generatable, so a false negative + here is a surfacing / suppression / ranking failure, not a generation gap. + Used to split the conflated ``candidate_not_generated`` bucket honestly. + """ + symspell = getattr(checker, "symspell", None) + if symspell is None or not target_text: + return False + try: + candidates = symspell.lookup(target_text, level="word", max_suggestions=10) + except Exception: + return False + if not candidates: + return False + norm_gold = gold_correction.strip() if isinstance(gold_correction, str) else "" + for cand in candidates: + term = getattr(cand, "term", None) + if not term: + continue + if norm_gold: + if term == norm_gold: + return True + elif float(getattr(cand, "edit_distance", 99)) <= 2: + return True + return False + + +def classify_false_negative_reason( + *, + checker: Any, + input_text: str, + gold_error: dict[str, Any], + segmented_words: list[str], +) -> str: + """Classify the likely cause of one false negative (honest split). + + Wraps the semantic-MLM heuristic (:func:`_classify_fn_raw`) and refines its + ``candidate_not_generated`` verdict: when the word-candidate path (SymSpell) + CAN produce the gold but it did not reach the final response, the candidate + was generated-then-killed → ``candidate_suppressed``. Only when NEITHER the + MLM heuristic NOR SymSpell can produce the gold is it a true + ``candidate_not_generated`` (Stage-0 generation gap). This stops the bucket + from conflating the ~88% true-gap with the generated-then-suppressed slice. 
+ """ + raw = _classify_fn_raw( + checker=checker, + input_text=input_text, + gold_error=gold_error, + segmented_words=segmented_words, + ) + if raw != "candidate_not_generated": + return raw + span = gold_error.get("span", {}) + start = int(span.get("start", -1)) + end = int(span.get("end", -1)) + if 0 <= start < end <= len(input_text): + target_text = input_text[start:end] + if _symspell_can_generate(checker, target_text, gold_error.get("gold_correction")): + return "candidate_suppressed" + return raw + + def merge_strategy_debug_telemetry( aggregate: dict[str, Any], check_telemetry: dict[str, Any], @@ -681,6 +750,9 @@ def run_benchmark( if _os.environ.get("MSC_USE_SEGMENTER_MERGE_RESCUE", "").lower() in ("1", "true", "yes"): config.validation.use_segmenter_post_merge_rescue = True print(" use_segmenter_post_merge_rescue: ENABLED (via MSC_USE_SEGMENTER_MERGE_RESCUE)") + if _os.environ.get("MSC_USE_ORTHO_RESCUE", "").lower() in ("1", "true", "yes", "on"): + config.validation.compound_split_ortho_insertion_rescue = True + print(" compound_split_ortho_insertion_rescue: ENABLED (via MSC_USE_ORTHO_RESCUE)") sme_bigram_env = _os.environ.get("MSC_SEG_MERGE_BIGRAM_THRESHOLD", "").strip() if sme_bigram_env: try: @@ -945,8 +1017,21 @@ def _in_scope(err: dict) -> bool: return err.get("scope", "spelling") in _scope_set # Domain filtering — spelling-first (benchmark v1.4.0+). + # 'lexical' is 'spelling' minus whitespace-only subtypes — used by the v1.11 + # FN-conditioned training pipeline which delegates spacing to the segmenter + # pre-pass and trains the corrector on lexical-only positions. 
+ _LEXICAL_EXCLUDED_SUBTYPES = frozenset( + { + "spacing", + "spacing_error", + "extra_space", + "word_boundary", + "incorrect_word_segmentation", + } + ) + _domain_norm = domain.strip().lower() if domain else "all" - if _domain_norm not in {"spelling", "grammar", "both", "all"}: + if _domain_norm not in {"spelling", "grammar", "both", "all", "lexical"}: print( f"WARNING: unknown --domain {_domain_norm!r}; falling back to 'all'", file=sys.stderr, @@ -961,6 +1046,11 @@ def _in_domain(err: dict) -> bool: if err_domain is None: # v1.3.x benchmarks or hand-added spans without a label — pass only for 'all'. return False + if _domain_norm == "lexical": + # Lexical = spelling AND not a whitespace-only subtype. + if err_domain != "spelling": + return False + return err.get("error_subtype") not in _LEXICAL_EXCLUDED_SUBTYPES return err_domain == _domain_norm def _is_scorable(err: dict) -> bool: @@ -1911,13 +2001,15 @@ def main(): parser.add_argument( "--domain", type=str, - choices=["spelling", "grammar", "both", "all"], + choices=["spelling", "grammar", "both", "all", "lexical"], default="all", help=( "Spelling-first domain filter (benchmark v1.4.0+). Reads the per-span " "`domain` field assigned by scripts/label_benchmark_domains.py. " "Combined with --scope as AND. Errors without a `domain` field are " - "excluded from non-'all' runs. (default: all)" + "excluded from non-'all' runs. 'lexical' is 'spelling' minus " + "whitespace-only subtypes (spacing/extra_space/word_boundary/etc.) — " + "use this for word-level lexical correction evaluation. (default: all)" ), ) @@ -1972,7 +2064,10 @@ def main(): output_dir.mkdir(parents=True, exist_ok=True) db_name = args.db.stem - timestamp = datetime.now().strftime("%Y%m%d_%H%M%S") + # Microsecond resolution makes the output filename collision-proof for + # concurrent runs writing to the same --output dir (per-second timestamps + # collided and interleaved two json.dump writes into one corrupt file). 
+ timestamp = datetime.now().strftime("%Y%m%d_%H%M%S_%f") semantic_tag = "_semantic" if args.semantic else "" ner_tag = "_ner" if args.ner else "" targeted_tags = "" diff --git a/src/myspellchecker/core/config/validation_configs.py b/src/myspellchecker/core/config/validation_configs.py index 3a0a9c1..373c7e4 100644 --- a/src/myspellchecker/core/config/validation_configs.py +++ b/src/myspellchecker/core/config/validation_configs.py @@ -779,6 +779,15 @@ class ValidationConfig(BaseModel): "against recall recovery; raising it tightens precision." ), ) + compound_split_ortho_insertion_rescue: bool = Field( + default=False, + description=( + "Carve-out: keep (do not suppress) compound-split tokens whose ed=1 " + "SymSpell top-1 is a single in-syllable diacritic INSERTION " + "(asat/visarga/dot-below/ha-htoe/ya-medial), non-Latin. Recovers " + "orthographic-insertion typos the skip-rule freq gate kills." + ), + ) # Broken compound detection (wrongly split compound words) use_broken_compound_detection: bool = Field( diff --git a/src/myspellchecker/core/error_suppression.py b/src/myspellchecker/core/error_suppression.py index 1aa31f6..19d4532 100644 --- a/src/myspellchecker/core/error_suppression.py +++ b/src/myspellchecker/core/error_suppression.py @@ -180,6 +180,24 @@ # sites from drifting. _COMPOUND_SPLIT_MIN_SYLLABLES = 4 +# In-syllable diacritic characters whose single INSERTION (gold = typo + 1 +# codepoint) marks an orthographic-insertion typo, distinguished from the +# deletion / whole-word-swap shapes that the compound-split suppressor must +# keep killing. Members: asat (U+103A), visarga (U+1038), dot-below (U+1037), +# ha-htoe pair (U+103D/U+103E), ya-medial pair (U+103B/U+103C). Used by +# ``_is_orthographic_insertion_typo`` as the SHAPE gate — never a freq/ed gate. 
+_ORTHO_INSERTION_CHARS = frozenset( + { + chr(0x103A), # asat (killer / virama-marker) + chr(0x1038), # visarga + chr(0x1037), # dot below + chr(0x103D), # medial wa / ha-htoe + chr(0x103E), # medial ha + chr(0x103B), # medial ya + chr(0x103C), # medial ra + } +) + def _greedy_syllable_reassembly( syllables: Sequence[str], @@ -881,6 +899,11 @@ def _suppress_compound_split_valid_words( if provider is None or segmenter is None: return + # Config flag for the orthographic-insertion carve-out (default off). + config = getattr(self, "config", None) + validation = getattr(config, "validation", None) if config else None + ortho_rescue = bool(getattr(validation, "compound_split_ortho_insertion_rescue", False)) + to_remove: set[int] = set() for idx, err in enumerate(errors): if err.error_type != ET_WORD: @@ -897,8 +920,28 @@ def _suppress_compound_split_valid_words( if result is None: continue word, _syllables, _parts = result - if not self._skip_rule_has_confident_candidate(word): - to_remove.add(idx) + # Keep (do not suppress) when either: + # (a) the existing skip-rule freq/ed gate has a confident top-1, OR + # (b) the ortho-rescue flag is on AND the token's ed=1 top-1 is a + # single in-syllable diacritic INSERTION (asat/visarga/ + # dot-below/ha-htoe/ya-medial), non-Latin, non-Latin-adjacent. + # The (b) carve-out is a pure SHAPE gate — it does not relax any + # freq/ed threshold; it recovers orthographic-insertion typos that + # the freq gate kills because TP and FP overlap in SymSpell freq. + if self._skip_rule_has_confident_candidate(word): + continue + if ortho_rescue: + detail = self._ortho_insertion_detail(word) + if detail is not None: + # Keep — and narrow the emission to the affected syllable + # with the corrected syllable as rank-1 suggestion, so the + # benchmark credits top-1/MRR (gold is the narrow syllable, + # not the whole merged token). Only when offset-clean + # (word == err.text) so positions map 1:1; else keep wide. 
+ if word == err.text: + self._narrow_ortho_insertion_error(err, detail) + continue + to_remove.add(idx) if to_remove: errors[:] = [e for i, e in enumerate(errors) if i not in to_remove] @@ -1121,6 +1164,137 @@ def _skip_rule_has_confident_candidate(self, word: str) -> bool: min_freq=getattr(validation, "skip_rule_gate_min_freq", 1000), ) + def _is_orthographic_insertion_typo(self, word: str) -> bool: + """SHAPE gate: True iff the token's ed=1 SymSpell top-1 is a single + in-syllable diacritic INSERTION (gold = typo + 1 codepoint). + + Recovers orthographic-insertion typos (e.g. ``ရုပ`` → ``ရုပ်`` with an + asat inserted) that ``_suppress_compound_split_valid_words`` kills. + This is a *shape* gate only — it never relaxes a freq/ed threshold; + the FP-class and TP-class overlap in SymSpell frequency, so only the + correction shape (length-diff + identity of the inserted codepoint) + separates them. + + Returns True iff ALL of: + * ``word`` contains no ASCII/Latin codepoint (loanword grey-zone + guard — 'algorithm'+la / 'website'+asat share this shape but are + clean-text FPs); + * SymSpell returns a top-1 at edit_distance <= 1; + * ``len(top1) == len(word) + 1`` (insertion shape — deletions + (len_diff <= 0) and whole-word swaps (len_diff == 0) are the + FP-class shape and stay suppressed); + * removing the single divergent codepoint from ``top1`` reproduces + ``word`` exactly (a genuine single insertion, not a same-length + offset swap); + * that inserted codepoint is in ``_ORTHO_INSERTION_CHARS``. + + Defensive against a missing SymSpell handle, ``None``/empty top-1, + and length edge cases. + """ + return self._ortho_insertion_detail(word) is not None + + def _ortho_insertion_detail(self, word: str) -> tuple[int, str, str] | None: + """Detail variant of :meth:`_is_orthographic_insertion_typo`. + + Returns ``(insert_index, inserted_char, corrected_top1)`` when the + token's ed=1 SymSpell top-1 is a single in-syllable-diacritic + insertion, else ``None``. 
``insert_index`` is the offset in ``word`` + at which ``inserted_char`` is added to yield ``corrected_top1``. Pure + SHAPE gate — never relaxes a freq/ed threshold. + """ + if not word: + return None + # Loanword grey-zone guard: any ASCII/Latin char in the token (or + # Latin-adjacency, which for a single OOV token reduces to "contains + # a Latin codepoint") disqualifies the rescue. Re-admitting these + # would resurrect clean-FP BM-EXT-C137 / BM-EXT-E026. + if any(ch.isascii() and ch.isalpha() for ch in word): + return None + + symspell = getattr(self, "symspell", None) + if symspell is None: + return None + try: + candidates = symspell.lookup(word, level="word", max_suggestions=1) + except (RuntimeError, ValueError, KeyError): + return None + if not candidates: + return None + top = candidates[0] + top1 = getattr(top, "term", None) + if not top1 or top1 == word: + return None + # SHAPE, not freq: only edit_distance is gated (<= 1) — never freq. + if float(getattr(top, "edit_distance", 99)) > 1: + return None + # Insertion shape: gold is exactly one codepoint longer than the typo. + if len(top1) != len(word) + 1: + return None + + # Locate the single inserted codepoint via the left-divergence point. + i = 0 + n = len(word) + while i < n and word[i] == top1[i]: + i += 1 + # `i` is the first divergent index; the inserted char is top1[i]. + inserted = top1[i] + # Confirm this is a true single insertion: deleting top1[i] must + # reproduce the typo exactly (guards same-length-offset swaps that + # merely happen to differ by one codepoint here and there). + if top1[:i] + top1[i + 1 :] != word: + return None + if inserted not in _ORTHO_INSERTION_CHARS: + return None + return i, inserted, top1 + + def _narrow_ortho_insertion_error(self, err: "Error", detail: tuple[int, str, str]) -> None: + """Narrow a kept compound-split ortho-insertion error to the affected + syllable, carrying the corrected syllable as the sole rank-1 + suggestion. 
Mutates ``err`` in place and marks it to survive + downstream filters. The benchmark scores top-1/MRR against the NARROW + gold correction (the corrected syllable, not the whole merged token), + so a wide-span suggestion never matches; narrowing recovers the + suggestion credit for the +TP this carve-out keeps. Safe no-op when + syllable segmentation does not line up (keeps the wide emission). + """ + i, inserted, _top1 = detail + segmenter = getattr(self, "segmenter", None) + if segmenter is None or i <= 0: + return + try: + syllables = segmenter.segment_syllables(err.text) + except Exception: + return + if not syllables or "".join(syllables) != err.text: + return + # The inserted diacritic attaches to the syllable containing index + # i-1 (the char immediately before the insertion point). + start = 0 + target: tuple[int, int, str] | None = None + for syl in syllables: + end = start + len(syl) + if start <= i - 1 < end: + target = (start, end, syl) + break + start = end + if target is None: + return + s_start, _s_end, syl = target + local = i - s_start # insertion offset within the syllable + if not (0 < local <= len(syl)): + return + corrected = syl[:local] + inserted + syl[local:] + err.position = err.position + s_start + err.text = syl + err.suggestions = [ + Suggestion( + text=corrected, + confidence=0.9, + source="ortho_insertion_rescue", + ) + ] + err._structural_early_exit = True + def _suppress_low_value_semantic_errors( self, errors: list[Error], From 188e8efad2ee93d93fbb3956032df86fdf73d52b Mon Sep 17 00:00:00 2001 From: Thet Twe Date: Tue, 2 Jun 2026 21:42:19 +0800 Subject: [PATCH 02/13] chore(infra): hygiene-03 add models/ to .gitignore Move the models/ ignore rule from .git/info/exclude into the tracked .gitignore so model artifacts (symlinked to external storage) are never committed regardless of checkout. Part of v1.8.0 WS-1 baseline-lock. 
Workstream: v18-hygiene-baseline-lock --- .gitignore | 3 +++ 1 file changed, 3 insertions(+) diff --git a/.gitignore b/.gitignore index 7246d6d..e58a7e4 100644 --- a/.gitignore +++ b/.gitignore @@ -37,6 +37,9 @@ benchmarks/_tmp/ *.db data/ +# Model artifacts (local only — symlinked to external storage; never committed) +models/ + # N-gram checkpoint (pipeline resume state) ngram_checkpoint/ From d5a83b48b98486a8f0723ec61fb1c3c3d389582a Mon Sep 17 00:00:00 2001 From: Thet Twe Date: Tue, 2 Jun 2026 23:16:14 +0800 Subject: [PATCH 03/13] fix(bench): taxonomy-01 clean-set audit + flat-aa residue + cap mislabels Full clean-set sweep (Opus 4.8) over all 93 clean-FP rows recalibrated the true clean-FP to 91 (+1 cap headroom). Fixes: 7 corrupted gold answers + 17 corrupted clean-row inputs ({ed-pa-kha}+flat-aa residue from the 2026-04-15 sweep, normalized to tall-aa, kept clean) + 2 cap mislabels reclassified clean->error (BM-424 broken stacking, BM-EXT-E026 missing loan-final asat) + 32 confusable_semantic->synonym_substitution relabels. ~3800 other flat-aa occurrences are correct and untouched (narrow whitelist only). Workstream: v18-taxonomy-cleanup Benchmark: myspellchecker_benchmark.yaml@1.5.0-v18b-clean-audit --- benchmarks/myspellchecker_benchmark.yaml | 151 +++++++++++++---------- 1 file changed, 89 insertions(+), 62 deletions(-) diff --git a/benchmarks/myspellchecker_benchmark.yaml b/benchmarks/myspellchecker_benchmark.yaml index 62ab135..07d4637 100644 --- a/benchmarks/myspellchecker_benchmark.yaml +++ b/benchmarks/myspellchecker_benchmark.yaml @@ -1,4 +1,4 @@ -version: 1.5.0-bhv17-byte-identical +version: 1.5.0-v18b-clean-audit category: benchmark description: 'Myanmar spell checker benchmark v1.5. Spelling-first domain labeling refined (2026-04-18): added text_overrides for 18 particle-family spans mislabeled as spelling by v1.4.0 (aukmyit_confusion case-markers + homophone_confusion sentence-final particles). 
Mechanical mapping resolves 100% of 1,716 spans (spelling 82.4% / grammar 17.6% / both 0%). v1.4.0: initial domain labeling. Based on v1.3.0 (v1.2 audited 2026-04-15 via 6-agent + Gemini 2.5 Flash pipeline).' metadata: @@ -5245,12 +5245,25 @@ sentences: notes: Annotated sentence with 1 error(s). logical_contradiction span removed (reclassified as writing_quality, not a spelling error). - id: BM-424 input: တက္ကသိုလ်မှာ ဝိဇ်ဇာဘာသာရပ်ကို အထူးပြု တက်ခဲ့ပါတယ်။ - is_clean: true + is_clean: false domain: academic register: formal difficulty_tier: 2 - expected_errors: [] - notes: Natural sentence with planted spelling error. + expected_errors: + - error_id: BM-424-E1 + error_type: wrong_word + error_subtype: stacking_error + detection_layer: word + span: + start: 13 + end: 26 + erroneous_text: ဝိဇ်ဇာဘာသာရပ် + gold_correction: ဝိဇ္ဇာဘာသာရပ် + edit_distance: 1 + context_required: false + notes: 'Broken stacking: first ဇ joined with asat U+103A instead of stacking virama U+1039 (vijja must stack as ဝိဇ္ဇာ).' + domain: spelling + notes: 'WS-0b reclassified from clean (was mislabeled): Broken stacking: first ဇ joined with asat U+103A instead of stacking virama U+1039 (vijja must stack as ဝိဇ္ဇာ).' - id: BM-426 input: အင်ဂလိပ် ဘက်ထရီအကြောင်း ရှာကြည့်နေတယ်။ is_clean: false @@ -7659,11 +7672,25 @@ sentences: notes: Annotated sentence with 1 error(s). Error can be detected without surrounding context. - id: BM-EXT-E026 input: ဝက်ဘဆိုက်ရဲ့ ပါဖောမန့်ကို တိုင်းတာပါ။ - is_clean: true + is_clean: false domain: technical register: formal difficulty_tier: 2 - notes: Annotated sentence with 1 error(s). Error can be detected without surrounding context. + notes: "WS-0b reclassified from clean (was mislabeled): Loan 'website': syllable-final stop ဘ missing its asat U+103A (ဝက်ဘ်ဆိုက်)." 
+ expected_errors: + - error_id: BM-EXT-E026-E1 + error_type: wrong_word + error_subtype: missing_asat + detection_layer: word + span: + start: 0 + end: 9 + erroneous_text: ဝက်ဘဆိုက် + gold_correction: ဝက်ဘ်ဆိုက် + edit_distance: 1 + context_required: false + notes: "Loan 'website': syllable-final stop ဘ missing its asat U+103A (ဝက်ဘ်ဆိုက်)." + domain: spelling - id: BM-EXT-E027 input: ဒီပရောဂျက်ကို ဖရိန်းဝပ် သုံးပြီး ဆောက်မယ်။ is_clean: false @@ -9167,7 +9194,7 @@ sentences: expected_errors: [] notes: Clean control sentence from myanmar-c4-dataset.txt (27 words). - id: BM-EXP-C030 - input: ပထမဆုံး စပြောရမယ့်သူကတော့ NLD အစိုးရ၊ အထူးသဖြင့် နိုင်ငံတော် အတိုင်ပင်ခံပုဂ္ဂိုလ် ဒော်အောင်ဆန်းစုကြည်နဲ့ ဆက်ဆံရေး ကောင်းတယ်လို့ သတင်းထွက်နေသူ ဒုတိယ ဗိုလ်ချုပ်မှူးကြီး စိုးဝင်းဟာ စစ်တက္ကသိုလ် အပတ်စဉ် က ဖြစ်ပါတယ်။ + input: ပထမဆုံး စပြောရမယ့်သူကတော့ NLD အစိုးရ၊ အထူးသဖြင့် နိုင်ငံတော် အတိုင်ပင်ခံပုဂ္ဂိုလ် ဒေါ်အောင်ဆန်းစုကြည်နဲ့ ဆက်ဆံရေး ကောင်းတယ်လို့ သတင်းထွက်နေသူ ဒုတိယ ဗိုလ်ချုပ်မှူးကြီး စိုးဝင်းဟာ စစ်တက္ကသိုလ် အပတ်စဉ် က ဖြစ်ပါတယ်။ is_clean: true domain: technical register: formal @@ -9474,7 +9501,7 @@ sentences: expected_errors: [] notes: Clean control sentence from URajinda-myanmar_spoken_corpus.txt (9 words). 
- id: BM-EXP-C050 - input: တကယ်လို့ အမေဟာ ဒီလို စိတ်ဓာတ် ပြင်းထန်သူမျိုး မဟုတ်ဘဲ နူးညံ့တဲ့ စိတ်သဘောထား ရှိသူမျိုး ဖြစ်ရင်တောင် အမေဟာ သူ့ရဲ့ အနာဂတ် သူရဲကောင်း၊ သူ့ရဲ့ အိပ်မက်ထဲက ဟီးရိုးကို သူ့ရဲ့ လက်ရှိအားနွဲ့တဲ့ ကလေးဘဝ၊ တစ်နည်းပြောရရင် အမေ့ အပော်မှာ မှီခိုနေရတဲ့ ဘဝကို ပြန်ပြီး သတိရသွားအောင် ကလေးကို ဆက်ဆံ ပြတတ်ပါတယ်။ + input: တကယ်လို့ အမေဟာ ဒီလို စိတ်ဓာတ် ပြင်းထန်သူမျိုး မဟုတ်ဘဲ နူးညံ့တဲ့ စိတ်သဘောထား ရှိသူမျိုး ဖြစ်ရင်တောင် အမေဟာ သူ့ရဲ့ အနာဂတ် သူရဲကောင်း၊ သူ့ရဲ့ အိပ်မက်ထဲက ဟီးရိုးကို သူ့ရဲ့ လက်ရှိအားနွဲ့တဲ့ ကလေးဘဝ၊ တစ်နည်းပြောရရင် အမေ့ အပေါ်မှာ မှီခိုနေရတဲ့ ဘဝကို ပြန်ပြီး သတိရသွားအောင် ကလေးကို ဆက်ဆံ ပြတတ်ပါတယ်။ is_clean: true domain: literary register: formal @@ -9482,7 +9509,7 @@ sentences: expected_errors: [] notes: Clean control sentence from myanmar-literature-corpus.txt (30 words). - id: BM-EXP-C051 - input: “ကိုယ်က အနုပညာအလုပ်တွေလည်း လုပ်နေတဲ့လူ ဆေးကျောင်းက ဘွဲ့ရတဲ့အချိန်မှာတော့ ဆေးကောင်စီက ခော်ပြောတယ် အလုပ် ခုထဲက တစ်ခုပဲ လုပ်ရမယ်ဆိုပြီးတော့ ရွေးတဲ့အချိန်မှာ ညီမက ဆရာဝန်ဘက်ကို ရွေးလိုက်တော့ အဲဒီတုန်းကတည်းက ဆရာဝန်ပဲလုပ်ဖြစ်နေတာပါ။ ဆေးကျောင်းတက်ရင်း အနုပညာအလုပ်တွေ လုပ်နေရတုန်းက တစ်ချိန်ချိန်မှာ အခုလို ရွေးရမယ်ဆိုတာ သိနေခဲ့တယ်။ ဆရာဝန်အလုပ် လုပ်ဖြစ်မယ်ဆိုတာလည်း သိနေတယ်။ ဒါပေမယ့် တကယ်တမ်း ရွေးရမယ့်အချိန်မှာ တော်တော် ခက်ခဲခဲ့ပါတယ်။ စိတ်ညစ်ခဲ့ရတဲ့ အချိန်တွေလည်းရှိပါတယ်” လို့ ဆိုပါတယ်။ + input: “ကိုယ်က အနုပညာအလုပ်တွေလည်း လုပ်နေတဲ့လူ ဆေးကျောင်းက ဘွဲ့ရတဲ့အချိန်မှာတော့ ဆေးကောင်စီက ခေါ်ပြောတယ် အလုပ် ခုထဲက တစ်ခုပဲ လုပ်ရမယ်ဆိုပြီးတော့ ရွေးတဲ့အချိန်မှာ ညီမက ဆရာဝန်ဘက်ကို ရွေးလိုက်တော့ အဲဒီတုန်းကတည်းက ဆရာဝန်ပဲလုပ်ဖြစ်နေတာပါ။ ဆေးကျောင်းတက်ရင်း အနုပညာအလုပ်တွေ လုပ်နေရတုန်းက တစ်ချိန်ချိန်မှာ အခုလို ရွေးရမယ်ဆိုတာ သိနေခဲ့တယ်။ ဆရာဝန်အလုပ် လုပ်ဖြစ်မယ်ဆိုတာလည်း သိနေတယ်။ ဒါပေမယ့် တကယ်တမ်း ရွေးရမယ့်အချိန်မှာ တော်တော် ခက်ခဲခဲ့ပါတယ်။ စိတ်ညစ်ခဲ့ရတဲ့ အချိန်တွေလည်းရှိပါတယ်” လို့ ဆိုပါတယ်။ is_clean: true domain: technical register: formal @@ -9499,7 +9526,7 @@ sentences: notes: Clean control sentence from tipitaka_myanmar_translation_books.txt (42 words). 
note: 'Re-labeled: contains formatting issues (hyphens/ASCII commas/no-space-after-fullstop/numbers-attached)' - id: BM-EXP-C053 - input: ၁၄-၁၅။ အလုံးစုံ တန်ဆာတို့ဖြင့် တန်ဆာဆင်အပ်ကုန်သော အမျိုးအားဖြင့် အာဇာနည် မျိုးဖြစ်ကုန်သော လျင်မြန်စွာ ဝန်ကို ဆောင်တတ်ကုန်သော သက်တင်ရေးနှင့် တူသော လေးကို ဆောင်ကုန်သော မြင်းစီးသူရဲတို့သည် တက်စီးအပ်ကုန်သော သိန္ဓောမြင်းပောင်း ခြောက်သောင်းတို့သည် ဤသူကို အမြဲ ခြံရံကုန်လတ္တံ့။ ဤသို့ ဖြစ်ရခြင်းသည် ဘုရားကို ပူဇော်ရခြင်း၏ အကျိုးပေတည်း။ + input: ၁၄-၁၅။ အလုံးစုံ တန်ဆာတို့ဖြင့် တန်ဆာဆင်အပ်ကုန်သော အမျိုးအားဖြင့် အာဇာနည် မျိုးဖြစ်ကုန်သော လျင်မြန်စွာ ဝန်ကို ဆောင်တတ်ကုန်သော သက်တင်ရေးနှင့် တူသော လေးကို ဆောင်ကုန်သော မြင်းစီးသူရဲတို့သည် တက်စီးအပ်ကုန်သော သိန္ဓောမြင်းပေါင်း ခြောက်သောင်းတို့သည် ဤသူကို အမြဲ ခြံရံကုန်လတ္တံ့။ ဤသို့ ဖြစ်ရခြင်းသည် ဘုရားကို ပူဇော်ရခြင်း၏ အကျိုးပေတည်း။ is_clean: true domain: religious register: informal @@ -9582,7 +9609,7 @@ sentences: expected_errors: [] notes: Clean control sentence from myanmar-literature-corpus.txt (8 words). - id: BM-EXP-C060 - input: စာမေးပွဲတွေပြီးလို့ နွေကျောင်းပိတ်ရက် ရောက်လာတော့ ကျွန်တော် အိမ်ခဏ ပြန်ရမယ်ပော့ဗျာ။ ဒါပေမဲ့ ဒော်လေးမြတ် အိမ်မှာက အဖော်ရှိတာ မဟုတ်တော့ အကြာကြီး ပြန်လို့ မရဘူးပော့။ ဒီတစ်ခါ ဒော်လေးမြတ်က သူလည်း ကျွန်တော်တို့ မြို့ကို လိုက်လည်မယ်တဲ့။ ဇော်မင်းအောင်ကို အိမ်စောင့်ဖို့ ထားခဲ့ပြီး ကျွန်တော်တို့ တူအရီး နှစ်ယောက်ပဲ ထွက်လာခဲ့ကြတာပော့။ + input: စာမေးပွဲတွေပြီးလို့ နွေကျောင်းပိတ်ရက် ရောက်လာတော့ ကျွန်တော် အိမ်ခဏ ပြန်ရမယ်ပေါ့ဗျာ။ ဒါပေမဲ့ ဒေါ်လေးမြတ် အိမ်မှာက အဖော်ရှိတာ မဟုတ်တော့ အကြာကြီး ပြန်လို့ မရဘူးပေါ့။ ဒီတစ်ခါ ဒေါ်လေးမြတ်က သူလည်း ကျွန်တော်တို့ မြို့ကို လိုက်လည်မယ်တဲ့။ ဇော်မင်းအောင်ကို အိမ်စောင့်ဖို့ ထားခဲ့ပြီး ကျွန်တော်တို့ တူအရီး နှစ်ယောက်ပဲ ထွက်လာခဲ့ကြတာပေါ့။ is_clean: true domain: technical register: informal @@ -9646,7 +9673,7 @@ sentences: expected_errors: [] notes: Clean control sentence from myanmar-xnli_sentence2_my.txt (6 words). 
- id: BM-EXP-C068 - input: ဆိုပြီး ထီပောက်တဲ့အကြောင်းကို တင်ထားတာ ဖြစ်ပါတယ် ဦးကံပွင့်ကတော့ သူထိုးထားတဲ့ ထီ(၁၀) စောင်တွဲကနေ တစ်သိန်းဆုတစ်ဆုကို ဆွတ်ခူးရရှိသွားတာပဲ ဖြစ်ပါတယ်နော်။ + input: ဆိုပြီး ထီပေါက်တဲ့အကြောင်းကို တင်ထားတာ ဖြစ်ပါတယ် ဦးကံပွင့်ကတော့ သူထိုးထားတဲ့ ထီ(၁၀) စောင်တွဲကနေ တစ်သိန်းဆုတစ်ဆုကို ဆွတ်ခူးရရှိသွားတာပဲ ဖြစ်ပါတယ်နော်။ is_clean: true domain: general register: colloquial @@ -9686,7 +9713,7 @@ sentences: expected_errors: [] notes: Clean control sentence from myanmar-cc100-dataset.txt (7 words). - id: BM-EXP-C073 - input: ဟိုဘက်ရက်တွေတုန်းက ကျော်အထိ တက်သွားတဲ့ အောက်တိန်း ဆီဈေးက ဒော်လာကို ဗဟိုဘဏ်က ထိန်းချုပ်လိုက်ပြီးတဲ့ နောက်မှာတော့ ဆိုပြီး ဆီဆိုင်တွေက အယ်လ်အီးဒီဘုတ်တွေမှာ တွေ့လာရတယ်။ + input: ဟိုဘက်ရက်တွေတုန်းက ကျော်အထိ တက်သွားတဲ့ အောက်တိန်း ဆီဈေးက ဒေါ်လာကို ဗဟိုဘဏ်က ထိန်းချုပ်လိုက်ပြီးတဲ့ နောက်မှာတော့ ဆိုပြီး ဆီဆိုင်တွေက အယ်လ်အီးဒီဘုတ်တွေမှာ တွေ့လာရတယ်။ is_clean: true domain: conversational register: colloquial @@ -10642,7 +10669,7 @@ sentences: expected_errors: [] notes: Input fixed (Zawgyi U+102C→U+102B) by Pro 2026-04-15. The word 'စုစုပောင်း' uses U+102C (ာ) instead of U+102B (ါ). It should be 'စုစုပေါင်း'. This is a common Zawgyi-to-Unicode conversion issue.. The word 'ပောင်း' uses U+102C (ာ) instead of U+102B (ါ). It should be 'ပေါင်း'. This is a common Zawgyi-to-Unicode conversion issue.. The word 'စုစုပောင်း' uses U+102C (ာ) instead of U+102B (ါ). It should be 'စုစုပေါင်း'. This is a common Zawgyi-to-Unicode conversion issue.. The word 'ပောင်း' uses U+102C (ာ) instead of U+102B (ါ). It should be 'ပေါင်း'. This is a common Zawgyi-to-Unicode conversion issue.. Clean control sentence from myanmar-c4-dataset.txt (30 words). 
- id: BM-EXP-C118 - input: ကျွန်တော်တို့နှစ်ဦးစလုံး၏ အပောင်းအသင်းများကလည်း အခါ အခွင့်သင့်တိုင်း ကျွန်တော်တို့နှစ်ယောက်၏ မတူညီမှုများကို ပြောပြတတ် ကြသည်။ + input: ကျွန်တော်တို့နှစ်ဦးစလုံး၏ အပေါင်းအသင်းများကလည်း အခါ အခွင့်သင့်တိုင်း ကျွန်တော်တို့နှစ်ယောက်၏ မတူညီမှုများကို ပြောပြတတ် ကြသည်။ is_clean: true domain: literary register: polite @@ -10740,7 +10767,7 @@ sentences: expected_errors: [] notes: Clean control sentence from myanmar-culturax-dataset.txt (13 words). - id: BM-EXP-C130 - input: အလောင်းမင်းတရားလက်ထက် မေ တွင် ရန်ကုန်မြို့တော်ကို တည်ခဲ့သည်။ တောအုပ်ထူထပ်လှသဖြင့် မြေတိုင်းတာရာ၌ အခက်အခဲအချို့ ကြုံတွေ့ခဲ့ရ၏။ တစ်ခါတစ်ရံ ရေအိုင်များအတွင်းသို့ ဖြတ်သန်းတိုင်းတာရသည်လည်း ရှိသည်။ ထိုစဉ်က ကျုံးကို လက်ဝဲသုန္ဒရ (ဦးမြတ်သာနေ)နှင့် ဝေဠုရာဇာတို့ စီစဉ်ဆောင်ရွက်ခဲ့ရကြောင်း အလောင်းမင်းတရား အရေးတော်ပုံ ရာဇဝင်၌ တွေ့ရှိရ၏။ ဝေးလံခောင်ဖျားဒေသ ဖြစ်သည့်အပြင် စစ်ရေးအတွက်ပါ အရေးတကြီးပြင်ဆင်နေချိန်ဖြစ်သဖြင့် ရန်ကုန်မြို့ ကျုံးတူးဖော်ရေး၌ ပါဝင်ခဲ့ခြင်း မရှိပေ။ + input: အလောင်းမင်းတရားလက်ထက် မေ တွင် ရန်ကုန်မြို့တော်ကို တည်ခဲ့သည်။ တောအုပ်ထူထပ်လှသဖြင့် မြေတိုင်းတာရာ၌ အခက်အခဲအချို့ ကြုံတွေ့ခဲ့ရ၏။ တစ်ခါတစ်ရံ ရေအိုင်များအတွင်းသို့ ဖြတ်သန်းတိုင်းတာရသည်လည်း ရှိသည်။ ထိုစဉ်က ကျုံးကို လက်ဝဲသုန္ဒရ (ဦးမြတ်သာနေ)နှင့် ဝေဠုရာဇာတို့ စီစဉ်ဆောင်ရွက်ခဲ့ရကြောင်း အလောင်းမင်းတရား အရေးတော်ပုံ ရာဇဝင်၌ တွေ့ရှိရ၏။ ဝေးလံခေါင်ဖျားဒေသ ဖြစ်သည့်အပြင် စစ်ရေးအတွက်ပါ အရေးတကြီးပြင်ဆင်နေချိန်ဖြစ်သဖြင့် ရန်ကုန်မြို့ ကျုံးတူးဖော်ရေး၌ ပါဝင်ခဲ့ခြင်း မရှိပေ။ is_clean: true domain: academic register: informal @@ -11137,7 +11164,7 @@ sentences: expected_errors: [] notes: Marked clean during benchmark audit 2026-04-15. 
- id: BM-EXP-C158 - input: ဆရာမင်းလူ ရသ အစွမ်းနဲ့ တည်ဆောက်ခဲ့တဲ့ ဝတ္ထုထဲက မြို့မကျ၊ လူ့အဖွဲ့အစည်း တခုဟာ တကယ်တော့ ဗမာပြည် တပြည်လုံးကို ကိုယ်စားပြုတယ်။ အခြေခံလူတန်းစား တခု ကို အခြေခံပြီး တိုင်းပြည်မှာ အခြေတည်သင့်၊ ကျင့်သုံးသင့် တဲ့ အကျင့်၊ အကြံ၊ အတွေးအခော်၊ အယူအဆ၊ သဘောတရားတွေကို ရသပညာ နဲ့ သွတ်သွင်း ဖန်တီး လမ်းညွှန် ပြထားခြင်း ဖြစ်တယ်။ + input: ဆရာမင်းလူ ရသ အစွမ်းနဲ့ တည်ဆောက်ခဲ့တဲ့ ဝတ္ထုထဲက မြို့မကျ၊ လူ့အဖွဲ့အစည်း တခုဟာ တကယ်တော့ ဗမာပြည် တပြည်လုံးကို ကိုယ်စားပြုတယ်။ အခြေခံလူတန်းစား တခု ကို အခြေခံပြီး တိုင်းပြည်မှာ အခြေတည်သင့်၊ ကျင့်သုံးသင့် တဲ့ အကျင့်၊ အကြံ၊ အတွေးအခေါ်၊ အယူအဆ၊ သဘောတရားတွေကို ရသပညာ နဲ့ သွတ်သွင်း ဖန်တီး လမ်းညွှန် ပြထားခြင်း ဖြစ်တယ်။ is_clean: true domain: general register: colloquial @@ -12149,7 +12176,7 @@ sentences: expected_errors: [] notes: Clean control sentence from alpaca_myanmar_burmese_taco.txt (30 words). - id: BM-EXP-C209 - input: ဗီယက်နမ် နိုင်ငံရဲ့ ပထမဆုံး နျူကလီးယား ဓာတ်ပောင်းဖိုကို ခုနှစ် မတိုင်မီ လုပ်ငန်း စတင် လည်ပတ်မှာ ဖြစ်ပြီး စွမ်းအင် မီဂါဝပ် တစ်သောင်း ခုနစ်ရာ ထုတ်လုပ်နိုင်မှာ ဖြစ်ပါတယ်။ + input: ဗီယက်နမ် နိုင်ငံရဲ့ ပထမဆုံး နျူကလီးယား ဓာတ်ပေါင်းဖိုကို ခုနှစ် မတိုင်မီ လုပ်ငန်း စတင် လည်ပတ်မှာ ဖြစ်ပြီး စွမ်းအင် မီဂါဝပ် တစ်သောင်း ခုနစ်ရာ ထုတ်လုပ်နိုင်မှာ ဖြစ်ပါတယ်။ is_clean: true domain: conversational register: formal @@ -12173,7 +12200,7 @@ sentences: expected_errors: [] notes: Clean control sentence from URajinda-myanmar_spoken_corpus.txt (12 words). 
- id: BM-EXP-C212 - input: “ကျွန်တော့်ကို ခုနှစ် စက်တင်ဘာ အရေးအခင်းပြီးကတည်းက ဇာတ်ကားရိုက်ကူးခွင့် ပိတ်ထား တာပါ။ နောက်ပိုင်း ရိုက်ခွင့်ပြုတော့လည်း ကျွန်တော် ပြန်မရိုက်တော့ဘူး။ ပရဟိတ လုပ်ငန်းတွေနဲ့ပဲ ဘဝကို ဖြတ်သန်းနေတယ်။ ဦးသုခတို့လို ပြည်သူကို အကျိုးပြုတဲ့ ဇာတ်လမ်းမျိုးဆို ရိုက်မယ်လို့ ဆုံးဖြတ်ထားတာ။ ဒီကားက ဆရာကြီး မင်းသိင်္ခရဲ့ ဇာတ်လည်းဖြစ်တော့ ဆရာကြီးကိုလည်း ဂါရဝပြုချင် တယ်။ ဒါရိုက်တာ ခင်စောမျိုး ဆိုတာကလည်း ကျွန်တော်နဲ့ လက်တွဲဖူးတဲ့အတွက် သူ့ကို ကူညီချင်တာလည်း ပါပါတယ်” ဟု ဇာတ်ကားရိုက်ကူးမှုများ ပရော်ဖက်ဆာ ဒောက်တာဆိတ်ဖွား ရုပ်ရှင်ဇာတ် ကားကို လက်ခံရိုက်ကူးဖြစ်ပုံကို ထပ်လောင်းပြောကြားသည်။ + input: “ကျွန်တော့်ကို ခုနှစ် စက်တင်ဘာ အရေးအခင်းပြီးကတည်းက ဇာတ်ကားရိုက်ကူးခွင့် ပိတ်ထား တာပါ။ နောက်ပိုင်း ရိုက်ခွင့်ပြုတော့လည်း ကျွန်တော် ပြန်မရိုက်တော့ဘူး။ ပရဟိတ လုပ်ငန်းတွေနဲ့ပဲ ဘဝကို ဖြတ်သန်းနေတယ်။ ဦးသုခတို့လို ပြည်သူကို အကျိုးပြုတဲ့ ဇာတ်လမ်းမျိုးဆို ရိုက်မယ်လို့ ဆုံးဖြတ်ထားတာ။ ဒီကားက ဆရာကြီး မင်းသိင်္ခရဲ့ ဇာတ်လည်းဖြစ်တော့ ဆရာကြီးကိုလည်း ဂါရဝပြုချင် တယ်။ ဒါရိုက်တာ ခင်စောမျိုး ဆိုတာကလည်း ကျွန်တော်နဲ့ လက်တွဲဖူးတဲ့အတွက် သူ့ကို ကူညီချင်တာလည်း ပါပါတယ်” ဟု ဇာတ်ကားရိုက်ကူးမှုများ ပရော်ဖက်ဆာ ဒေါက်တာဆိတ်ဖွား ရုပ်ရှင်ဇာတ် ကားကို လက်ခံရိုက်ကူးဖြစ်ပုံကို ထပ်လောင်းပြောကြားသည်။ is_clean: true domain: news register: polite @@ -12215,7 +12242,7 @@ sentences: expected_errors: [] notes: Clean control sentence from URajinda-myanmar_spoken_corpus.txt (27 words). 
- id: BM-EXP-C217 - input: အာဆာအကြမ်းဖက်အဖွဲ့သည် နိုင်ငံတကာအကြမ်းဖက်အဖွဲ့များဖြစ်သည့် Al-Qaeda ISIL နှင့် Tehrik-e-Taliban Pakistan (TTP) တို့၏ ကူညီပံ့ပိုးမှုများ ရရှိနေကြောင်း၊ မကြာသေးမီက နိုင်ငံခြားသား အကြမ်းဖက်သမားများက အိမ်နီးချင်းနိုင်ငံ တစ်နိုင်ငံမှတစ်ဆင့် မြန်မာနိုင်ငံအတွင်းသို့ ဝင်ရောက်၍ အာဆာအဖွဲ့အား ပူးပောင်းကူညီရန် ကြိုးစားခဲ့သည့် သာဓကရှိကြောင်း၊ + input: အာဆာအကြမ်းဖက်အဖွဲ့သည် နိုင်ငံတကာအကြမ်းဖက်အဖွဲ့များဖြစ်သည့် Al-Qaeda ISIL နှင့် Tehrik-e-Taliban Pakistan (TTP) တို့၏ ကူညီပံ့ပိုးမှုများ ရရှိနေကြောင်း၊ မကြာသေးမီက နိုင်ငံခြားသား အကြမ်းဖက်သမားများက အိမ်နီးချင်းနိုင်ငံ တစ်နိုင်ငံမှတစ်ဆင့် မြန်မာနိုင်ငံအတွင်းသို့ ဝင်ရောက်၍ အာဆာအဖွဲ့အား ပူးပေါင်းကူညီရန် ကြိုးစားခဲ့သည့် သာဓကရှိကြောင်း၊ is_clean: true domain: news register: informal @@ -12420,7 +12447,7 @@ sentences: expected_errors: [] notes: Clean control sentence from myanmar-literature-corpus.txt (6 words). - id: BM-EXP-C229 - input: ၆၉၁.၆ မီတာ ရှည်ပြီး လာအိုဘက် ချဉ်းကပ်လမ်း မီတာ၊ မြန်မာဘက် ချဉ်းကပ်လမ်း မီတာ ရှိသည်။ ဒော်လာ မီလီယံ သုံးခဲ့ပြီး ခု၊ ဖေဖော်ဝါရီလ ရက်တွင် စဆောက်သည်။ လ အပြီးဆောက်ရန် မှန်းထားသော်လည်း လနှင့် ပြီးသည်။ တန် ဝန် ခံနိုင်သည်။ + input: ၆၉၁.၆ မီတာ ရှည်ပြီး လာအိုဘက် ချဉ်းကပ်လမ်း မီတာ၊ မြန်မာဘက် ချဉ်းကပ်လမ်း မီတာ ရှိသည်။ ဒေါ်လာ မီလီယံ သုံးခဲ့ပြီး ခု၊ ဖေဖော်ဝါရီလ ရက်တွင် စဆောက်သည်။ လ အပြီးဆောက်ရန် မှန်းထားသော်လည်း လနှင့် ပြီးသည်။ တန် ဝန် ခံနိုင်သည်။ is_clean: true domain: academic register: formal @@ -12453,7 +12480,7 @@ sentences: expected_errors: [] notes: Clean control sentence from URajinda-myanmar_spoken_corpus.txt (9 words). 
- id: BM-EXP-C233 - input: အမေရိကန်တပ်ဖွဲ့တွေရဲ့ ပူးတွဲ စစ်ဦးစီးချုပ် ဥက္ကဌအဖြစ် သမ္မတ ဒော်နယ်ထရမ့်က အဆိုပြုထားသူ ကြည်းတပ် ဗိုလ်ချုပ်ကြီး က အမေရိကန်တပ်ဖွဲ့တွေကို အာဖဂန် နစ္စတန်ကနေ အချိန်မတိုင်ခင် ရုပ်သိမ်းမယ်ဆိုရင် မဟာဗျူဟာ အမှားတစ်ခု ဖြစ်လိမ့်မယ်လို့ ဆိုပါတယ်။ + input: အမေရိကန်တပ်ဖွဲ့တွေရဲ့ ပူးတွဲ စစ်ဦးစီးချုပ် ဥက္ကဌအဖြစ် သမ္မတ ဒေါ်နယ်ထရမ့်က အဆိုပြုထားသူ ကြည်းတပ် ဗိုလ်ချုပ်ကြီး က အမေရိကန်တပ်ဖွဲ့တွေကို အာဖဂန် နစ္စတန်ကနေ အချိန်မတိုင်ခင် ရုပ်သိမ်းမယ်ဆိုရင် မဟာဗျူဟာ အမှားတစ်ခု ဖြစ်လိမ့်မယ်လို့ ဆိုပါတယ်။ is_clean: true domain: conversational register: polite @@ -13031,7 +13058,7 @@ sentences: domain: spelling notes: Generated error sentence with 1 compound_confusion error. - id: BM-EXP-E030 - input: '''''ဘယ်လိုဖြစ်သွားလဲ ဆို တော့ ကလေးရဲ့ သဘောသဘာဝ က သူတို့တစ်ခုခုလိုချင်ရင် မိဘ ကို တစ်နည်းနည်းနဲ့ပြပြီးတောင်း တာ။ ပေါက်ကွဲပြပြီး တောင်းတဲ့ လူကြီးကလိုက်လျော လို့ရှိရင် ကလေးစိတ်ထဲမှာ ငါ ကောင်းကောင်းမွန်မွန်တောင်း တုန်းကတော့ မပေးဘူး။ လိုချင်တာရ တယ်ဆိုပြီး နောက်အခါတွေမှာ ပေါက်ကွဲတဲ့အပြုအမူတွေ လုပ်ပြီး တောင်းတာ။ အဲဒီလိုနဲ့ တချို့ ကလေးတွေမှာ ဝုန်းဒိုင်းကြဲပြီး ပေါက်ကွဲတဲ့ ကလေးလေးတွေ ဖြစ်လာတာပေါ့'''' ဟု ကလေးငယ် များ၏ အပြုအမူပိုင်းဆိုင်ရာ အဆင့်စီမံခန့်ခွဲပေးသော ဒောက်တာ ဌေးဌေး (စိတ်ပညာ) ကဆိုသည်။' + input: '''''ဘယ်လိုဖြစ်သွားလဲ ဆို တော့ ကလေးရဲ့ သဘောသဘာဝ က သူတို့တစ်ခုခုလိုချင်ရင် မိဘ ကို တစ်နည်းနည်းနဲ့ပြပြီးတောင်း တာ။ ပေါက်ကွဲပြပြီး တောင်းတဲ့ လူကြီးကလိုက်လျော လို့ရှိရင် ကလေးစိတ်ထဲမှာ ငါ ကောင်းကောင်းမွန်မွန်တောင်း တုန်းကတော့ မပေးဘူး။ လိုချင်တာရ တယ်ဆိုပြီး နောက်အခါတွေမှာ ပေါက်ကွဲတဲ့အပြုအမူတွေ လုပ်ပြီး တောင်းတာ။ အဲဒီလိုနဲ့ တချို့ ကလေးတွေမှာ ဝုန်းဒိုင်းကြဲပြီး ပေါက်ကွဲတဲ့ ကလေးလေးတွေ ဖြစ်လာတာပေါ့'''' ဟု ကလေးငယ် များ၏ အပြုအမူပိုင်းဆိုင်ရာ အဆင့်စီမံခန့်ခွဲပေးသော ဒေါက်တာ ဌေးဌေး (စိတ်ပညာ) ကဆိုသည်။' is_clean: true domain: general register: formal @@ -13138,7 +13165,7 @@ sentences: start: 206 end: 216 erroneous_text: စုဆုပောင်း - gold_correction: စုစုပောင်း + gold_correction: စုစုပေါင်း edit_distance: 1 context_required: false notes: Injected consonant_substitution error. 
@@ -13328,7 +13355,7 @@ sentences: start: 228 end: 251 erroneous_text: ကပ်သောသိပောက်မျှကိုလည်း - gold_correction: ကပ်သောဆီပောက်မျှကိုလည်း + gold_correction: ကပ်သောဆီပေါက်မျှကိုလည်း edit_distance: 1 context_required: false notes: Injected compound_confusion error. @@ -13413,7 +13440,7 @@ sentences: start: 11 end: 25 erroneous_text: ကိုခော င်းကြီး - gold_correction: ကိုခောင်းကြီး + gold_correction: ကိုခေါင်းကြီး edit_distance: 1 context_required: true notes: Injected broken_compound error. @@ -13726,7 +13753,7 @@ sentences: domain: spelling notes: Generated error sentence with 1 broken_compound error. - id: BM-EXP-E063 - input: တောင်း။ အဲဒီတော့ ကိုဆွေဝင်းကို သြဂုတ်လ ရက်နေ့မှာ ရုံးချိန်းပြန်ခော်ထားတယ်ဆိုတော့ ကိုဆွေဝင်းက ဆက်သွားပြီး အမှုရင်ဆိုင်ရမှာပော့နော်။ + input: တောင်း။ အဲဒီတော့ ကိုဆွေဝင်းကို သြဂုတ်လ ရက်နေ့မှာ ရုံးချိန်းပြန်ခေါ်ထားတယ်ဆိုတော့ ကိုဆွေဝင်းက ဆက်သွားပြီး အမှုရင်ဆိုင်ရမှာပေါ့နော်။ is_clean: true domain: general register: colloquial @@ -14111,7 +14138,7 @@ sentences: start: 203 end: 214 erroneous_text: စုစု ပောင်း - gold_correction: စုစုပောင်း + gold_correction: စုစုပေါင်း edit_distance: 1 context_required: true notes: Injected broken_compound error. @@ -14775,7 +14802,7 @@ sentences: start: 17 end: 36 erroneous_text: ကောင်းဆောင်ကြီးများ - gold_correction: ခောင်းဆောင်ကြီးများ + gold_correction: ခေါင်းဆောင်ကြီးများ edit_distance: 1 context_required: false notes: Injected compound_confusion error. @@ -15853,7 +15880,7 @@ sentences: start: 203 end: 212 erroneous_text: စုစုပောင် - gold_correction: စုစုပောင်း + gold_correction: စုစုပေါင်း edit_distance: 1 context_required: false notes: Injected missing_visarga error. @@ -16042,7 +16069,7 @@ sentences: start: 245 end: 271 erroneous_text: သားသမီးအပော်ပြော့ရေးဆိုရေး - gold_correction: သားသမီးအပော်ပြောရေးဆိုရေး + gold_correction: သားသမီးအပေါ်ပြောရေးဆိုရေး edit_distance: 1 context_required: false notes: Injected aukmyit_confusion error. 
@@ -19900,7 +19927,7 @@ sentences: expected_errors: - error_id: BM-1146-E1 error_type: wrong_word - error_subtype: confusable_semantic + error_subtype: synonym_substitution detection_layer: context span: start: 7 @@ -20740,7 +20767,7 @@ sentences: domain: spelling - error_id: BM-1184-E2 error_type: wrong_word - error_subtype: confusable_semantic + error_subtype: synonym_substitution detection_layer: context span: start: 25 @@ -20973,7 +21000,7 @@ sentences: expected_errors: - error_id: BM-1195-E1 error_type: wrong_word - error_subtype: confusable_semantic + error_subtype: synonym_substitution detection_layer: context span: start: 39 @@ -21797,7 +21824,7 @@ sentences: expected_errors: - error_id: BM-1234-E1 error_type: wrong_word - error_subtype: confusable_semantic + error_subtype: synonym_substitution detection_layer: context span: start: 37 @@ -23326,7 +23353,7 @@ sentences: expected_errors: - error_id: BM-1306-E1 error_type: wrong_word - error_subtype: confusable_semantic + error_subtype: synonym_substitution detection_layer: context span: start: 22 @@ -25661,7 +25688,7 @@ sentences: expected_errors: - error_id: BM-1416-E1 error_type: wrong_word - error_subtype: confusable_semantic + error_subtype: synonym_substitution detection_layer: context span: start: 20 @@ -25947,7 +25974,7 @@ sentences: expected_errors: - error_id: BM-1429-E1 error_type: wrong_word - error_subtype: confusable_semantic + error_subtype: synonym_substitution detection_layer: context span: start: 14 @@ -26833,7 +26860,7 @@ sentences: expected_errors: - error_id: BM-1473-E1 error_type: wrong_word - error_subtype: confusable_semantic + error_subtype: synonym_substitution detection_layer: context span: start: 33 @@ -26875,7 +26902,7 @@ sentences: expected_errors: - error_id: BM-1475-E1 error_type: wrong_word - error_subtype: confusable_semantic + error_subtype: synonym_substitution detection_layer: context span: start: 30 @@ -27241,7 +27268,7 @@ sentences: expected_errors: - error_id: BM-1494-E1 
error_type: wrong_word - error_subtype: confusable_semantic + error_subtype: synonym_substitution detection_layer: context span: start: 46 @@ -27346,7 +27373,7 @@ sentences: expected_errors: - error_id: BM-1499-E1 error_type: wrong_word - error_subtype: confusable_semantic + error_subtype: synonym_substitution detection_layer: context span: start: 70 @@ -27474,7 +27501,7 @@ sentences: expected_errors: - error_id: BM-1505-E1 error_type: wrong_word - error_subtype: confusable_semantic + error_subtype: synonym_substitution detection_layer: context span: start: 69 @@ -27692,7 +27719,7 @@ sentences: expected_errors: - error_id: BM-1516-E1 error_type: wrong_word - error_subtype: confusable_semantic + error_subtype: synonym_substitution detection_layer: context span: start: 15 @@ -27755,7 +27782,7 @@ sentences: expected_errors: - error_id: BM-1519-E1 error_type: wrong_word - error_subtype: confusable_semantic + error_subtype: synonym_substitution detection_layer: context span: start: 11 @@ -28107,7 +28134,7 @@ sentences: expected_errors: - error_id: BM-1535-E1 error_type: wrong_word - error_subtype: confusable_semantic + error_subtype: synonym_substitution detection_layer: context span: start: 35 @@ -28217,7 +28244,7 @@ sentences: domain: spelling - error_id: BM-1539-E2 error_type: wrong_word - error_subtype: confusable_semantic + error_subtype: synonym_substitution detection_layer: context span: start: 35 @@ -28259,7 +28286,7 @@ sentences: expected_errors: - error_id: BM-1541-E1 error_type: wrong_word - error_subtype: confusable_semantic + error_subtype: synonym_substitution detection_layer: context span: start: 40 @@ -28540,7 +28567,7 @@ sentences: expected_errors: - error_id: BM-1555-E1 error_type: wrong_word - error_subtype: confusable_semantic + error_subtype: synonym_substitution detection_layer: context span: start: 41 @@ -28721,7 +28748,7 @@ sentences: expected_errors: - error_id: BM-1563-E1 error_type: wrong_word - error_subtype: confusable_semantic + 
error_subtype: synonym_substitution detection_layer: context span: start: 23 @@ -28742,7 +28769,7 @@ sentences: expected_errors: - error_id: BM-1564-E1 error_type: wrong_word - error_subtype: confusable_semantic + error_subtype: synonym_substitution detection_layer: context span: start: 28 @@ -28784,7 +28811,7 @@ sentences: expected_errors: - error_id: BM-1566-E1 error_type: wrong_word - error_subtype: confusable_semantic + error_subtype: synonym_substitution detection_layer: context span: start: 0 @@ -28978,7 +29005,7 @@ sentences: expected_errors: - error_id: BM-1574-E1 error_type: wrong_word - error_subtype: confusable_semantic + error_subtype: synonym_substitution detection_layer: context span: start: 38 @@ -29251,7 +29278,7 @@ sentences: expected_errors: - error_id: BM-1587-E1 error_type: wrong_word - error_subtype: confusable_semantic + error_subtype: synonym_substitution detection_layer: context span: start: 38 @@ -29658,7 +29685,7 @@ sentences: expected_errors: - error_id: BM-1607-E1 error_type: wrong_word - error_subtype: confusable_semantic + error_subtype: synonym_substitution detection_layer: context span: start: 11 @@ -30947,7 +30974,7 @@ sentences: expected_errors: - error_id: BM-1672-E1 error_type: wrong_word - error_subtype: confusable_semantic + error_subtype: synonym_substitution detection_layer: context span: start: 37 @@ -32108,7 +32135,7 @@ sentences: expected_errors: - error_id: BM-1725-E1 error_type: wrong_word - error_subtype: confusable_semantic + error_subtype: synonym_substitution detection_layer: context span: start: 21 @@ -33439,7 +33466,7 @@ sentences: expected_errors: - error_id: BM-1790-E1 error_type: wrong_word - error_subtype: confusable_semantic + error_subtype: synonym_substitution detection_layer: context span: start: 16 @@ -34465,7 +34492,7 @@ sentences: expected_errors: - error_id: BM-1838-E1 error_type: wrong_word - error_subtype: confusable_semantic + error_subtype: synonym_substitution detection_layer: context span: start: 
32 @@ -35158,7 +35185,7 @@ sentences: expected_errors: - error_id: BM-1873-E1 error_type: wrong_word - error_subtype: confusable_semantic + error_subtype: synonym_substitution detection_layer: context span: start: 41 @@ -35624,7 +35651,7 @@ sentences: expected_errors: - error_id: BM-1898-E1 error_type: wrong_word - error_subtype: confusable_semantic + error_subtype: synonym_substitution detection_layer: context span: start: 31 @@ -37882,7 +37909,7 @@ sentences: domain: spelling - error_id: BM-2004-E2 error_type: wrong_word - error_subtype: confusable_semantic + error_subtype: synonym_substitution detection_layer: context span: start: 40 @@ -38734,7 +38761,7 @@ sentences: expected_errors: - error_id: BM-2045-E1 error_type: wrong_word - error_subtype: confusable_semantic + error_subtype: synonym_substitution detection_layer: context span: start: 43 From cbf27da39978ca8802dee52f99cdbe3d5121842f Mon Sep 17 00:00:00 2001 From: "dependabot[bot]" <49699333+dependabot[bot]@users.noreply.github.com> Date: Sat, 6 Jun 2026 05:42:20 +0000 Subject: [PATCH 04/13] chore(deps): bump actions/checkout from 6.0.2 to 6.0.3 Bumps [actions/checkout](https://github.com/actions/checkout) from 6.0.2 to 6.0.3. - [Release notes](https://github.com/actions/checkout/releases) - [Changelog](https://github.com/actions/checkout/blob/main/CHANGELOG.md) - [Commits](https://github.com/actions/checkout/compare/de0fac2e4500dabe0009e67214ff5f5447ce83dd...df4cb1c069e1874edd31b4311f1884172cec0e10) --- updated-dependencies: - dependency-name: actions/checkout dependency-version: 6.0.3 dependency-type: direct:production update-type: version-update:semver-patch ... 
Signed-off-by: dependabot[bot] --- .github/workflows/audit.yml | 2 +- .github/workflows/ci.yml | 4 ++-- .github/workflows/publish.yml | 2 +- 3 files changed, 4 insertions(+), 4 deletions(-) diff --git a/.github/workflows/audit.yml b/.github/workflows/audit.yml index 7c5e38d..7052a18 100644 --- a/.github/workflows/audit.yml +++ b/.github/workflows/audit.yml @@ -9,7 +9,7 @@ jobs: audit: runs-on: ubuntu-latest steps: - - uses: actions/checkout@de0fac2e4500dabe0009e67214ff5f5447ce83dd # v6 + - uses: actions/checkout@df4cb1c069e1874edd31b4311f1884172cec0e10 # v6 - uses: actions/setup-python@a309ff8b426b58ec0e2a45f0f869d46889d02405 # v6 with: diff --git a/.github/workflows/ci.yml b/.github/workflows/ci.yml index 0560482..ad543b5 100644 --- a/.github/workflows/ci.yml +++ b/.github/workflows/ci.yml @@ -28,7 +28,7 @@ jobs: lint: runs-on: ubuntu-latest steps: - - uses: actions/checkout@de0fac2e4500dabe0009e67214ff5f5447ce83dd # v6 + - uses: actions/checkout@df4cb1c069e1874edd31b4311f1884172cec0e10 # v6 - uses: actions/setup-python@a309ff8b426b58ec0e2a45f0f869d46889d02405 # v6 with: @@ -55,7 +55,7 @@ jobs: matrix: python-version: ["3.10", "3.11", "3.12", "3.13", "3.14"] steps: - - uses: actions/checkout@de0fac2e4500dabe0009e67214ff5f5447ce83dd # v6 + - uses: actions/checkout@df4cb1c069e1874edd31b4311f1884172cec0e10 # v6 - uses: actions/setup-python@a309ff8b426b58ec0e2a45f0f869d46889d02405 # v6 with: diff --git a/.github/workflows/publish.yml b/.github/workflows/publish.yml index 558c6cd..5e75eba 100644 --- a/.github/workflows/publish.yml +++ b/.github/workflows/publish.yml @@ -13,7 +13,7 @@ jobs: id-token: write contents: read steps: - - uses: actions/checkout@de0fac2e4500dabe0009e67214ff5f5447ce83dd # v6 + - uses: actions/checkout@df4cb1c069e1874edd31b4311f1884172cec0e10 # v6 - uses: actions/setup-python@a309ff8b426b58ec0e2a45f0f869d46889d02405 # v6 with: From 28dacc74c381142d1fc9945867abc8921816518f Mon Sep 17 00:00:00 2001 From: "dependabot[bot]" 
<49699333+dependabot[bot]@users.noreply.github.com> Date: Sat, 6 Jun 2026 05:42:25 +0000 Subject: [PATCH 05/13] chore(deps): bump astral-sh/setup-uv from 8.1.0 to 8.2.0 Bumps [astral-sh/setup-uv](https://github.com/astral-sh/setup-uv) from 8.1.0 to 8.2.0. - [Release notes](https://github.com/astral-sh/setup-uv/releases) - [Commits](https://github.com/astral-sh/setup-uv/compare/08807647e7069bb48b6ef5acd8ec9567f424441b...fac544c07dec837d0ccb6301d7b5580bf5edae39) --- updated-dependencies: - dependency-name: astral-sh/setup-uv dependency-version: 8.2.0 dependency-type: direct:production update-type: version-update:semver-minor ... Signed-off-by: dependabot[bot] --- .github/workflows/audit.yml | 2 +- .github/workflows/ci.yml | 4 ++-- .github/workflows/publish.yml | 2 +- 3 files changed, 4 insertions(+), 4 deletions(-) diff --git a/.github/workflows/audit.yml b/.github/workflows/audit.yml index 7c5e38d..79bec89 100644 --- a/.github/workflows/audit.yml +++ b/.github/workflows/audit.yml @@ -15,7 +15,7 @@ jobs: with: python-version: "3.12" - - uses: astral-sh/setup-uv@08807647e7069bb48b6ef5acd8ec9567f424441b # v8.1.0 + - uses: astral-sh/setup-uv@fac544c07dec837d0ccb6301d7b5580bf5edae39 # v8.2.0 - name: Install dependencies run: uv pip install --system pip-audit diff --git a/.github/workflows/ci.yml b/.github/workflows/ci.yml index 0560482..2ce55d1 100644 --- a/.github/workflows/ci.yml +++ b/.github/workflows/ci.yml @@ -34,7 +34,7 @@ jobs: with: python-version: "3.12" - - uses: astral-sh/setup-uv@08807647e7069bb48b6ef5acd8ec9567f424441b # v8.1.0 + - uses: astral-sh/setup-uv@fac544c07dec837d0ccb6301d7b5580bf5edae39 # v8.2.0 - name: Install dependencies run: uv pip install --system ruff @@ -62,7 +62,7 @@ jobs: python-version: ${{ matrix.python-version }} allow-prereleases: true - - uses: astral-sh/setup-uv@08807647e7069bb48b6ef5acd8ec9567f424441b # v8.1.0 + - uses: astral-sh/setup-uv@fac544c07dec837d0ccb6301d7b5580bf5edae39 # v8.2.0 - name: Install dependencies run: | diff 
--git a/.github/workflows/publish.yml b/.github/workflows/publish.yml index 558c6cd..10711fa 100644 --- a/.github/workflows/publish.yml +++ b/.github/workflows/publish.yml @@ -19,7 +19,7 @@ jobs: with: python-version: "3.12" - - uses: astral-sh/setup-uv@08807647e7069bb48b6ef5acd8ec9567f424441b # v8.1.0 + - uses: astral-sh/setup-uv@fac544c07dec837d0ccb6301d7b5580bf5edae39 # v8.2.0 - name: Install build tools run: uv pip install --system build From 2b32d2d2bd9bb28d9dbad82d0f48348b050a32e3 Mon Sep 17 00:00:00 2001 From: Thet Twe Date: Sun, 7 Jun 2026 02:13:30 +0800 Subject: [PATCH 06/13] feat(strategy): unmask-02 aw-vowel un-mask detector + v18c bench fix Pre-norm detector surfaces aw-vowel typos the normalizer silently repairs; deterministic canonical suggestion, default-off flag. Benchmark v18c annotates 67 generator-planted aw typos it missed. Workstream: v18-normalizer-unmask-aw-vowel Benchmark: myspellchecker_benchmark.yaml@1.5.0-v18c-aw-unmask-annotations Metrics: composite 0.6520 -> 0.6870 (+0.0350) --- benchmarks/myspellchecker_benchmark.yaml | 1931 ++++++++++++----- benchmarks/run_benchmark.py | 3 + .../core/config/validation_configs.py | 14 + .../core/detectors/pre_normalization.py | 271 ++- src/myspellchecker/core/spellchecker.py | 15 +- .../rules/detector_confidences.yaml | 1 + src/myspellchecker/text/normalize.py | 36 +- tests/test_aw_vowel_unmask_detector.py | 298 +++ tests/test_normalize_e_vowel_tall_aa.py | 35 + 9 files changed, 2060 insertions(+), 544 deletions(-) create mode 100644 tests/test_aw_vowel_unmask_detector.py diff --git a/benchmarks/myspellchecker_benchmark.yaml b/benchmarks/myspellchecker_benchmark.yaml index 07d4637..910d704 100644 --- a/benchmarks/myspellchecker_benchmark.yaml +++ b/benchmarks/myspellchecker_benchmark.yaml @@ -1,4 +1,4 @@ -version: 1.5.0-v18b-clean-audit +version: 1.5.0-v18c-aw-unmask-annotations category: benchmark description: 'Myanmar spell checker benchmark v1.5. 
Spelling-first domain labeling refined (2026-04-18): added text_overrides for 18 particle-family spans mislabeled as spelling by v1.4.0 (aukmyit_confusion case-markers + homophone_confusion sentence-final particles). Mechanical mapping resolves 100% of 1,716 spans (spelling 82.4% / grammar 17.6% / both 0%). v1.4.0: initial domain labeling. Based on v1.3.0 (v1.2 audited 2026-04-15 via 6-agent + Gemini 2.5 Flash pipeline).' metadata: @@ -1021,7 +1021,7 @@ sentences: is_clean: true domain: news register: formal - difficulty_tier: null + difficulty_tier: expected_errors: [] notes: No observable surface error can be marked in this sentence under the revised annotation policy, so it is treated as a clean control. - id: BM-073 @@ -1134,7 +1134,7 @@ sentences: is_clean: true domain: news register: formal - difficulty_tier: null + difficulty_tier: expected_errors: [] notes: No observable surface error can be marked in this sentence under the revised annotation policy, so it is treated as a clean control. - id: BM-087 @@ -2227,7 +2227,7 @@ sentences: is_clean: true domain: conversational register: colloquial - difficulty_tier: null + difficulty_tier: expected_errors: [] notes: No observable surface error can be marked in this sentence under the revised annotation policy, so it is treated as a clean control. - id: BM-197 @@ -2277,7 +2277,7 @@ sentences: is_clean: true domain: conversational register: colloquial - difficulty_tier: null + difficulty_tier: expected_errors: [] notes: No observable surface error can be marked in this sentence under the revised annotation policy, so it is treated as a clean control. - id: BM-211 @@ -2663,7 +2663,7 @@ sentences: is_clean: true domain: conversational register: colloquial - difficulty_tier: null + difficulty_tier: expected_errors: [] notes: No observable surface error can be marked in this sentence under the revised annotation policy, so it is treated as a clean control. 
- id: BM-251 @@ -2797,7 +2797,7 @@ sentences: is_clean: true domain: conversational register: colloquial - difficulty_tier: null + difficulty_tier: expected_errors: [] notes: No observable surface error can be marked in this sentence under the revised annotation policy, so it is treated as a clean control. - id: BM-258 @@ -5346,7 +5346,7 @@ sentences: is_clean: true domain: conversational register: colloquial - difficulty_tier: null + difficulty_tier: expected_errors: [] notes: Clean control sentence with no intentional spelling, lexical, grammar, or semantic error. - id: BM-472 @@ -5354,7 +5354,7 @@ sentences: is_clean: true domain: news register: formal - difficulty_tier: null + difficulty_tier: expected_errors: [] notes: Clean control sentence with no intentional spelling, lexical, grammar, or semantic error. - id: BM-476 @@ -5362,7 +5362,7 @@ sentences: is_clean: true domain: technical register: formal - difficulty_tier: null + difficulty_tier: expected_errors: [] notes: Clean control sentence with no intentional spelling, lexical, grammar, or semantic error. - id: BM-478 @@ -5370,7 +5370,7 @@ sentences: is_clean: true domain: technical register: formal - difficulty_tier: null + difficulty_tier: expected_errors: [] notes: Clean control sentence with no intentional spelling, lexical, grammar, or semantic error. - id: BM-479 @@ -5378,7 +5378,7 @@ sentences: is_clean: true domain: technical register: formal - difficulty_tier: null + difficulty_tier: expected_errors: [] notes: Clean control sentence with no intentional spelling, lexical, grammar, or semantic error. - id: BM-483 @@ -5386,7 +5386,7 @@ sentences: is_clean: true domain: conversational register: colloquial - difficulty_tier: null + difficulty_tier: expected_errors: [] notes: Clean control sentence with no intentional spelling, lexical, grammar, or semantic error. 
- id: BM-486 @@ -5394,7 +5394,7 @@ sentences: is_clean: true domain: news register: formal - difficulty_tier: null + difficulty_tier: expected_errors: [] notes: Clean control sentence with no intentional spelling, lexical, grammar, or semantic error. - id: BM-488 @@ -5402,7 +5402,7 @@ sentences: is_clean: true domain: academic register: formal - difficulty_tier: null + difficulty_tier: expected_errors: [] notes: Clean control sentence with no intentional spelling, lexical, grammar, or semantic error. - id: BM-490 @@ -5410,7 +5410,7 @@ sentences: is_clean: true domain: technical register: formal - difficulty_tier: null + difficulty_tier: expected_errors: [] notes: Clean control sentence with no intentional spelling, lexical, grammar, or semantic error. - id: BM-493 @@ -5418,7 +5418,7 @@ sentences: is_clean: true domain: news register: formal - difficulty_tier: null + difficulty_tier: expected_errors: [] notes: Clean control sentence with no intentional spelling, lexical, grammar, or semantic error. - id: BM-495 @@ -5426,7 +5426,7 @@ sentences: is_clean: true domain: conversational register: colloquial - difficulty_tier: null + difficulty_tier: expected_errors: [] notes: Clean control sentence with no intentional spelling, lexical, grammar, or semantic error. - id: BM-497 @@ -5434,7 +5434,7 @@ sentences: is_clean: true domain: conversational register: colloquial - difficulty_tier: null + difficulty_tier: expected_errors: [] notes: Clean control sentence with no intentional spelling, lexical, grammar, or semantic error. - id: BM-500 @@ -5442,7 +5442,7 @@ sentences: is_clean: true domain: academic register: formal - difficulty_tier: null + difficulty_tier: expected_errors: [] notes: Clean control sentence with no intentional spelling, lexical, grammar, or semantic error. 
- id: BM-502 @@ -5450,7 +5450,7 @@ sentences: is_clean: true domain: technical register: formal - difficulty_tier: null + difficulty_tier: expected_errors: [] notes: Clean control sentence with no intentional spelling, lexical, grammar, or semantic error. - id: BM-504 @@ -5458,7 +5458,7 @@ sentences: is_clean: true domain: technical register: formal - difficulty_tier: null + difficulty_tier: expected_errors: [] notes: Clean control sentence with no intentional spelling, lexical, grammar, or semantic error. - id: BM-507 @@ -5466,7 +5466,7 @@ sentences: is_clean: true domain: academic register: formal - difficulty_tier: null + difficulty_tier: expected_errors: [] notes: Clean control sentence with no intentional spelling, lexical, grammar, or semantic error. - id: BM-509 @@ -5474,7 +5474,7 @@ sentences: is_clean: true domain: conversational register: colloquial - difficulty_tier: null + difficulty_tier: expected_errors: [] notes: Clean control sentence with no intentional spelling, lexical, grammar, or semantic error. - id: BM-510 @@ -5482,7 +5482,7 @@ sentences: is_clean: true domain: conversational register: colloquial - difficulty_tier: null + difficulty_tier: expected_errors: [] notes: Clean control sentence with no intentional spelling, lexical, grammar, or semantic error. - id: BM-511 @@ -5490,7 +5490,7 @@ sentences: is_clean: true domain: news register: formal - difficulty_tier: null + difficulty_tier: expected_errors: [] notes: Clean control sentence with no intentional spelling, lexical, grammar, or semantic error. - id: BM-512 @@ -5498,7 +5498,7 @@ sentences: is_clean: true domain: news register: formal - difficulty_tier: null + difficulty_tier: expected_errors: [] notes: Clean control sentence with no intentional spelling, lexical, grammar, or semantic error. 
- id: BM-513 @@ -5506,7 +5506,7 @@ sentences: is_clean: false domain: academic register: formal - difficulty_tier: null + difficulty_tier: expected_errors: - error_id: BM-513-E1 error_type: semantic_error @@ -5528,7 +5528,7 @@ sentences: is_clean: false domain: academic register: formal - difficulty_tier: null + difficulty_tier: expected_errors: - error_id: BM-514-E1 error_type: grammar @@ -5550,7 +5550,7 @@ sentences: is_clean: true domain: technical register: formal - difficulty_tier: null + difficulty_tier: expected_errors: [] notes: Clean control sentence with no intentional spelling, lexical, grammar, or semantic error. - id: BM-518 @@ -5558,7 +5558,7 @@ sentences: is_clean: true domain: technical register: formal - difficulty_tier: null + difficulty_tier: expected_errors: [] notes: Clean control sentence with no intentional spelling, lexical, grammar, or semantic error. - id: BM-519 @@ -5566,7 +5566,7 @@ sentences: is_clean: true domain: news register: formal - difficulty_tier: null + difficulty_tier: expected_errors: [] notes: Clean control sentence with no intentional spelling, lexical, grammar, or semantic error. - id: BM-520 @@ -5574,7 +5574,7 @@ sentences: is_clean: true domain: academic register: formal - difficulty_tier: null + difficulty_tier: expected_errors: [] notes: Clean control sentence with no intentional spelling, lexical, grammar, or semantic error. - id: BM-521 @@ -6089,7 +6089,7 @@ sentences: is_clean: true domain: conversational register: colloquial - difficulty_tier: null + difficulty_tier: expected_errors: [] notes: Clean control sentence with no intentional spelling, lexical, grammar, or semantic error. - id: BM-EXT-C002 @@ -6097,7 +6097,7 @@ sentences: is_clean: true domain: conversational register: colloquial - difficulty_tier: null + difficulty_tier: expected_errors: [] notes: Clean control sentence with no intentional spelling, lexical, grammar, or semantic error. 
- id: BM-EXT-C003 @@ -6105,7 +6105,7 @@ sentences: is_clean: true domain: conversational register: colloquial - difficulty_tier: null + difficulty_tier: expected_errors: [] notes: Clean control sentence with no intentional spelling, lexical, grammar, or semantic error. - id: BM-EXT-C004 @@ -6113,7 +6113,7 @@ sentences: is_clean: true domain: conversational register: colloquial - difficulty_tier: null + difficulty_tier: expected_errors: [] notes: Clean control sentence with no intentional spelling, lexical, grammar, or semantic error. - id: BM-EXT-C005 @@ -6121,7 +6121,7 @@ sentences: is_clean: true domain: conversational register: colloquial - difficulty_tier: null + difficulty_tier: expected_errors: [] notes: Clean control sentence with no intentional spelling, lexical, grammar, or semantic error. - id: BM-EXT-C006 @@ -6129,7 +6129,7 @@ sentences: is_clean: true domain: conversational register: colloquial - difficulty_tier: null + difficulty_tier: expected_errors: [] notes: Clean control sentence with no intentional spelling, lexical, grammar, or semantic error. - id: BM-EXT-C007 @@ -6137,7 +6137,7 @@ sentences: is_clean: true domain: conversational register: colloquial - difficulty_tier: null + difficulty_tier: expected_errors: [] notes: Clean control sentence with no intentional spelling, lexical, grammar, or semantic error. - id: BM-EXT-C008 @@ -6145,7 +6145,7 @@ sentences: is_clean: true domain: conversational register: colloquial - difficulty_tier: null + difficulty_tier: expected_errors: [] notes: Clean control sentence with no intentional spelling, lexical, grammar, or semantic error. - id: BM-EXT-C009 @@ -6153,7 +6153,7 @@ sentences: is_clean: true domain: conversational register: colloquial - difficulty_tier: null + difficulty_tier: expected_errors: [] notes: Clean control sentence with no intentional spelling, lexical, grammar, or semantic error. 
- id: BM-EXT-C010 @@ -6161,7 +6161,7 @@ sentences: is_clean: true domain: conversational register: colloquial - difficulty_tier: null + difficulty_tier: expected_errors: [] notes: Clean control sentence with no intentional spelling, lexical, grammar, or semantic error. - id: BM-EXT-C011 @@ -6169,7 +6169,7 @@ sentences: is_clean: true domain: conversational register: colloquial - difficulty_tier: null + difficulty_tier: expected_errors: [] notes: Clean control sentence with no intentional spelling, lexical, grammar, or semantic error. - id: BM-EXT-C012 @@ -6177,7 +6177,7 @@ sentences: is_clean: true domain: conversational register: colloquial - difficulty_tier: null + difficulty_tier: expected_errors: [] notes: Clean control sentence with no intentional spelling, lexical, grammar, or semantic error. - id: BM-EXT-C013 @@ -6185,7 +6185,7 @@ sentences: is_clean: true domain: conversational register: colloquial - difficulty_tier: null + difficulty_tier: expected_errors: [] notes: Clean control sentence with no intentional spelling, lexical, grammar, or semantic error. - id: BM-EXT-C014 @@ -6193,7 +6193,7 @@ sentences: is_clean: true domain: conversational register: colloquial - difficulty_tier: null + difficulty_tier: expected_errors: [] notes: Clean control sentence with no intentional spelling, lexical, grammar, or semantic error. - id: BM-EXT-C015 @@ -6201,7 +6201,7 @@ sentences: is_clean: true domain: conversational register: colloquial - difficulty_tier: null + difficulty_tier: expected_errors: [] notes: Clean control sentence with no intentional spelling, lexical, grammar, or semantic error. - id: BM-EXT-C016 @@ -6209,7 +6209,7 @@ sentences: is_clean: true domain: conversational register: colloquial - difficulty_tier: null + difficulty_tier: expected_errors: [] notes: Clean control sentence with no intentional spelling, lexical, grammar, or semantic error. 
- id: BM-EXT-C017 @@ -6217,7 +6217,7 @@ sentences: is_clean: true domain: conversational register: colloquial - difficulty_tier: null + difficulty_tier: expected_errors: [] notes: Clean control sentence with no intentional spelling, lexical, grammar, or semantic error. - id: BM-EXT-C018 @@ -6225,7 +6225,7 @@ sentences: is_clean: true domain: conversational register: colloquial - difficulty_tier: null + difficulty_tier: expected_errors: [] notes: Clean control sentence with no intentional spelling, lexical, grammar, or semantic error. - id: BM-EXT-C019 @@ -6233,7 +6233,7 @@ sentences: is_clean: true domain: conversational register: colloquial - difficulty_tier: null + difficulty_tier: expected_errors: [] notes: Clean control sentence with no intentional spelling, lexical, grammar, or semantic error. - id: BM-EXT-C020 @@ -6241,7 +6241,7 @@ sentences: is_clean: true domain: conversational register: colloquial - difficulty_tier: null + difficulty_tier: expected_errors: [] notes: Clean control sentence with no intentional spelling, lexical, grammar, or semantic error. - id: BM-EXT-C021 @@ -6249,7 +6249,7 @@ sentences: is_clean: true domain: conversational register: colloquial - difficulty_tier: null + difficulty_tier: expected_errors: [] notes: Clean control sentence with no intentional spelling, lexical, grammar, or semantic error. - id: BM-EXT-C022 @@ -6257,7 +6257,7 @@ sentences: is_clean: true domain: conversational register: colloquial - difficulty_tier: null + difficulty_tier: expected_errors: [] notes: Clean control sentence with no intentional spelling, lexical, grammar, or semantic error. - id: BM-EXT-C023 @@ -6265,7 +6265,7 @@ sentences: is_clean: true domain: conversational register: colloquial - difficulty_tier: null + difficulty_tier: expected_errors: [] notes: Clean control sentence with no intentional spelling, lexical, grammar, or semantic error. 
- id: BM-EXT-C024 @@ -6273,7 +6273,7 @@ sentences: is_clean: true domain: conversational register: colloquial - difficulty_tier: null + difficulty_tier: expected_errors: [] notes: Clean control sentence with no intentional spelling, lexical, grammar, or semantic error. - id: BM-EXT-C025 @@ -6281,7 +6281,7 @@ sentences: is_clean: true domain: conversational register: colloquial - difficulty_tier: null + difficulty_tier: expected_errors: [] notes: Clean control sentence with no intentional spelling, lexical, grammar, or semantic error. - id: BM-EXT-C026 @@ -6289,7 +6289,7 @@ sentences: is_clean: true domain: conversational register: colloquial - difficulty_tier: null + difficulty_tier: expected_errors: [] notes: Clean control sentence with no intentional spelling, lexical, grammar, or semantic error. - id: BM-EXT-C027 @@ -6297,7 +6297,7 @@ sentences: is_clean: true domain: conversational register: colloquial - difficulty_tier: null + difficulty_tier: expected_errors: [] notes: Clean control sentence with no intentional spelling, lexical, grammar, or semantic error. - id: BM-EXT-C028 @@ -6305,7 +6305,7 @@ sentences: is_clean: true domain: conversational register: colloquial - difficulty_tier: null + difficulty_tier: expected_errors: [] notes: Clean control sentence with no intentional spelling, lexical, grammar, or semantic error. - id: BM-EXT-C029 @@ -6313,7 +6313,7 @@ sentences: is_clean: true domain: conversational register: colloquial - difficulty_tier: null + difficulty_tier: expected_errors: [] notes: Clean control sentence with no intentional spelling, lexical, grammar, or semantic error. - id: BM-EXT-C030 @@ -6321,7 +6321,7 @@ sentences: is_clean: true domain: conversational register: colloquial - difficulty_tier: null + difficulty_tier: expected_errors: [] notes: Clean control sentence with no intentional spelling, lexical, grammar, or semantic error. 
- id: BM-EXT-C031 @@ -6329,7 +6329,7 @@ sentences: is_clean: true domain: religious register: formal - difficulty_tier: null + difficulty_tier: expected_errors: [] notes: Clean control sentence with no intentional spelling, lexical, grammar, or semantic error. - id: BM-EXT-C032 @@ -6337,7 +6337,7 @@ sentences: is_clean: true domain: religious register: formal - difficulty_tier: null + difficulty_tier: expected_errors: [] notes: Clean control sentence with no intentional spelling, lexical, grammar, or semantic error. - id: BM-EXT-C033 @@ -6345,7 +6345,7 @@ sentences: is_clean: true domain: religious register: formal - difficulty_tier: null + difficulty_tier: expected_errors: [] notes: Clean control sentence with no intentional spelling, lexical, grammar, or semantic error. - id: BM-EXT-C034 @@ -6353,7 +6353,7 @@ sentences: is_clean: true domain: religious register: formal - difficulty_tier: null + difficulty_tier: expected_errors: [] notes: Clean control sentence with no intentional spelling, lexical, grammar, or semantic error. - id: BM-EXT-C035 @@ -6361,7 +6361,7 @@ sentences: is_clean: true domain: religious register: formal - difficulty_tier: null + difficulty_tier: expected_errors: [] notes: Clean control sentence with no intentional spelling, lexical, grammar, or semantic error. - id: BM-EXT-C036 @@ -6369,7 +6369,7 @@ sentences: is_clean: true domain: religious register: formal - difficulty_tier: null + difficulty_tier: expected_errors: [] notes: Clean control sentence with no intentional spelling, lexical, grammar, or semantic error. - id: BM-EXT-C037 @@ -6377,7 +6377,7 @@ sentences: is_clean: true domain: religious register: formal - difficulty_tier: null + difficulty_tier: expected_errors: [] notes: Clean control sentence with no intentional spelling, lexical, grammar, or semantic error. 
- id: BM-EXT-C038 @@ -6385,7 +6385,7 @@ sentences: is_clean: true domain: religious register: formal - difficulty_tier: null + difficulty_tier: expected_errors: [] notes: Clean control sentence with no intentional spelling, lexical, grammar, or semantic error. - id: BM-EXT-C039 @@ -6393,7 +6393,7 @@ sentences: is_clean: true domain: religious register: formal - difficulty_tier: null + difficulty_tier: expected_errors: [] notes: Clean control sentence with no intentional spelling, lexical, grammar, or semantic error. - id: BM-EXT-C040 @@ -6401,7 +6401,7 @@ sentences: is_clean: true domain: religious register: formal - difficulty_tier: null + difficulty_tier: expected_errors: [] notes: Clean control sentence with no intentional spelling, lexical, grammar, or semantic error. - id: BM-EXT-C041 @@ -6409,7 +6409,7 @@ sentences: is_clean: true domain: religious register: formal - difficulty_tier: null + difficulty_tier: expected_errors: [] notes: Clean control sentence with no intentional spelling, lexical, grammar, or semantic error. - id: BM-EXT-C042 @@ -6417,7 +6417,7 @@ sentences: is_clean: true domain: religious register: formal - difficulty_tier: null + difficulty_tier: expected_errors: [] notes: Clean control sentence with no intentional spelling, lexical, grammar, or semantic error. - id: BM-EXT-C043 @@ -6425,7 +6425,7 @@ sentences: is_clean: true domain: religious register: formal - difficulty_tier: null + difficulty_tier: expected_errors: [] notes: Clean control sentence with no intentional spelling, lexical, grammar, or semantic error. - id: BM-EXT-C044 @@ -6433,7 +6433,7 @@ sentences: is_clean: true domain: religious register: formal - difficulty_tier: null + difficulty_tier: expected_errors: [] notes: Clean control sentence with no intentional spelling, lexical, grammar, or semantic error. 
- id: BM-EXT-C045 @@ -6441,7 +6441,7 @@ sentences: is_clean: true domain: religious register: formal - difficulty_tier: null + difficulty_tier: expected_errors: [] notes: Clean control sentence with no intentional spelling, lexical, grammar, or semantic error. - id: BM-EXT-C046 @@ -6449,7 +6449,7 @@ sentences: is_clean: true domain: religious register: formal - difficulty_tier: null + difficulty_tier: expected_errors: [] notes: Clean control sentence with no intentional spelling, lexical, grammar, or semantic error. - id: BM-EXT-C047 @@ -6457,7 +6457,7 @@ sentences: is_clean: true domain: religious register: formal - difficulty_tier: null + difficulty_tier: expected_errors: [] notes: Clean control sentence with no intentional spelling, lexical, grammar, or semantic error. - id: BM-EXT-C048 @@ -6465,7 +6465,7 @@ sentences: is_clean: true domain: religious register: formal - difficulty_tier: null + difficulty_tier: expected_errors: [] notes: Clean control sentence with no intentional spelling, lexical, grammar, or semantic error. - id: BM-EXT-C049 @@ -6473,7 +6473,7 @@ sentences: is_clean: true domain: religious register: formal - difficulty_tier: null + difficulty_tier: expected_errors: [] notes: Clean control sentence with no intentional spelling, lexical, grammar, or semantic error. - id: BM-EXT-C050 @@ -6481,7 +6481,7 @@ sentences: is_clean: true domain: religious register: formal - difficulty_tier: null + difficulty_tier: expected_errors: [] notes: Clean control sentence with no intentional spelling, lexical, grammar, or semantic error. - id: BM-EXT-C051 @@ -6489,7 +6489,7 @@ sentences: is_clean: true domain: religious register: formal - difficulty_tier: null + difficulty_tier: expected_errors: [] notes: Clean control sentence with no intentional spelling, lexical, grammar, or semantic error. 
- id: BM-EXT-C052 @@ -6497,7 +6497,7 @@ sentences: is_clean: true domain: religious register: formal - difficulty_tier: null + difficulty_tier: expected_errors: [] notes: Clean control sentence with no intentional spelling, lexical, grammar, or semantic error. - id: BM-EXT-C053 @@ -6505,7 +6505,7 @@ sentences: is_clean: true domain: religious register: formal - difficulty_tier: null + difficulty_tier: expected_errors: [] notes: Clean control sentence with no intentional spelling, lexical, grammar, or semantic error. - id: BM-EXT-C054 @@ -6513,7 +6513,7 @@ sentences: is_clean: true domain: religious register: formal - difficulty_tier: null + difficulty_tier: expected_errors: [] notes: Clean control sentence with no intentional spelling, lexical, grammar, or semantic error. - id: BM-EXT-C055 @@ -6521,7 +6521,7 @@ sentences: is_clean: true domain: religious register: formal - difficulty_tier: null + difficulty_tier: expected_errors: [] notes: Clean control sentence with no intentional spelling, lexical, grammar, or semantic error. - id: BM-EXT-C056 @@ -6529,7 +6529,7 @@ sentences: is_clean: true domain: literary register: formal - difficulty_tier: null + difficulty_tier: expected_errors: [] notes: Clean control sentence with no intentional spelling, lexical, grammar, or semantic error. - id: BM-EXT-C057 @@ -6537,7 +6537,7 @@ sentences: is_clean: true domain: literary register: formal - difficulty_tier: null + difficulty_tier: expected_errors: [] notes: Clean control sentence with no intentional spelling, lexical, grammar, or semantic error. - id: BM-EXT-C058 @@ -6545,7 +6545,7 @@ sentences: is_clean: true domain: literary register: formal - difficulty_tier: null + difficulty_tier: expected_errors: [] notes: Clean control sentence with no intentional spelling, lexical, grammar, or semantic error. 
- id: BM-EXT-C059 @@ -6553,7 +6553,7 @@ sentences: is_clean: true domain: literary register: formal - difficulty_tier: null + difficulty_tier: expected_errors: [] notes: Clean control sentence with no intentional spelling, lexical, grammar, or semantic error. - id: BM-EXT-C060 @@ -6561,7 +6561,7 @@ sentences: is_clean: true domain: literary register: formal - difficulty_tier: null + difficulty_tier: expected_errors: [] notes: Clean control sentence with no intentional spelling, lexical, grammar, or semantic error. - id: BM-EXT-C061 @@ -6569,7 +6569,7 @@ sentences: is_clean: true domain: literary register: formal - difficulty_tier: null + difficulty_tier: expected_errors: [] notes: Clean control sentence with no intentional spelling, lexical, grammar, or semantic error. - id: BM-EXT-C062 @@ -6577,7 +6577,7 @@ sentences: is_clean: true domain: literary register: formal - difficulty_tier: null + difficulty_tier: expected_errors: [] notes: Clean control sentence with no intentional spelling, lexical, grammar, or semantic error. - id: BM-EXT-C063 @@ -6585,7 +6585,7 @@ sentences: is_clean: true domain: literary register: formal - difficulty_tier: null + difficulty_tier: expected_errors: [] notes: Clean control sentence with no intentional spelling, lexical, grammar, or semantic error. - id: BM-EXT-C064 @@ -6593,7 +6593,7 @@ sentences: is_clean: true domain: literary register: formal - difficulty_tier: null + difficulty_tier: expected_errors: [] notes: Clean control sentence with no intentional spelling, lexical, grammar, or semantic error. - id: BM-EXT-C065 @@ -6601,7 +6601,7 @@ sentences: is_clean: true domain: literary register: formal - difficulty_tier: null + difficulty_tier: expected_errors: [] notes: Clean control sentence with no intentional spelling, lexical, grammar, or semantic error. 
- id: BM-EXT-C066 @@ -6609,7 +6609,7 @@ sentences: is_clean: true domain: literary register: formal - difficulty_tier: null + difficulty_tier: expected_errors: [] notes: Clean control sentence with no intentional spelling, lexical, grammar, or semantic error. - id: BM-EXT-C067 @@ -6617,7 +6617,7 @@ sentences: is_clean: true domain: literary register: formal - difficulty_tier: null + difficulty_tier: expected_errors: [] notes: Clean control sentence with no intentional spelling, lexical, grammar, or semantic error. - id: BM-EXT-C068 @@ -6625,7 +6625,7 @@ sentences: is_clean: true domain: literary register: formal - difficulty_tier: null + difficulty_tier: expected_errors: [] notes: Clean control sentence with no intentional spelling, lexical, grammar, or semantic error. - id: BM-EXT-C069 @@ -6633,7 +6633,7 @@ sentences: is_clean: true domain: literary register: formal - difficulty_tier: null + difficulty_tier: expected_errors: [] notes: Clean control sentence with no intentional spelling, lexical, grammar, or semantic error. - id: BM-EXT-C070 @@ -6641,7 +6641,7 @@ sentences: is_clean: true domain: literary register: formal - difficulty_tier: null + difficulty_tier: expected_errors: [] notes: Clean control sentence with no intentional spelling, lexical, grammar, or semantic error. - id: BM-EXT-C071 @@ -6649,7 +6649,7 @@ sentences: is_clean: true domain: literary register: formal - difficulty_tier: null + difficulty_tier: expected_errors: [] notes: Clean control sentence with no intentional spelling, lexical, grammar, or semantic error. - id: BM-EXT-C072 @@ -6657,7 +6657,7 @@ sentences: is_clean: true domain: literary register: formal - difficulty_tier: null + difficulty_tier: expected_errors: [] notes: Clean control sentence with no intentional spelling, lexical, grammar, or semantic error. 
- id: BM-EXT-C073 @@ -6665,7 +6665,7 @@ sentences: is_clean: true domain: literary register: formal - difficulty_tier: null + difficulty_tier: expected_errors: [] notes: Clean control sentence with no intentional spelling, lexical, grammar, or semantic error. - id: BM-EXT-C074 @@ -6673,7 +6673,7 @@ sentences: is_clean: true domain: literary register: formal - difficulty_tier: null + difficulty_tier: expected_errors: [] notes: Clean control sentence with no intentional spelling, lexical, grammar, or semantic error. - id: BM-EXT-C075 @@ -6681,7 +6681,7 @@ sentences: is_clean: true domain: literary register: formal - difficulty_tier: null + difficulty_tier: expected_errors: [] notes: Clean control sentence with no intentional spelling, lexical, grammar, or semantic error. - id: BM-EXT-C076 @@ -6689,7 +6689,7 @@ sentences: is_clean: true domain: news register: formal - difficulty_tier: null + difficulty_tier: expected_errors: [] notes: Clean control sentence with no intentional spelling, lexical, grammar, or semantic error. - id: BM-EXT-C077 @@ -6697,7 +6697,7 @@ sentences: is_clean: true domain: news register: formal - difficulty_tier: null + difficulty_tier: expected_errors: [] notes: Clean control sentence with no intentional spelling, lexical, grammar, or semantic error. - id: BM-EXT-C078 @@ -6705,7 +6705,7 @@ sentences: is_clean: true domain: news register: formal - difficulty_tier: null + difficulty_tier: expected_errors: [] notes: Clean control sentence with no intentional spelling, lexical, grammar, or semantic error. - id: BM-EXT-C079 @@ -6713,7 +6713,7 @@ sentences: is_clean: true domain: news register: formal - difficulty_tier: null + difficulty_tier: expected_errors: [] notes: Clean control sentence with no intentional spelling, lexical, grammar, or semantic error. 
- id: BM-EXT-C080 @@ -6721,7 +6721,7 @@ sentences: is_clean: true domain: news register: formal - difficulty_tier: null + difficulty_tier: expected_errors: [] notes: Clean control sentence with no intentional spelling, lexical, grammar, or semantic error. - id: BM-EXT-C081 @@ -6729,7 +6729,7 @@ sentences: is_clean: true domain: news register: formal - difficulty_tier: null + difficulty_tier: expected_errors: [] notes: Clean control sentence with no intentional spelling, lexical, grammar, or semantic error. - id: BM-EXT-C082 @@ -6737,7 +6737,7 @@ sentences: is_clean: true domain: news register: formal - difficulty_tier: null + difficulty_tier: expected_errors: [] notes: Clean control sentence with no intentional spelling, lexical, grammar, or semantic error. - id: BM-EXT-C083 @@ -6745,7 +6745,7 @@ sentences: is_clean: true domain: news register: formal - difficulty_tier: null + difficulty_tier: expected_errors: [] notes: Clean control sentence with no intentional spelling, lexical, grammar, or semantic error. - id: BM-EXT-C084 @@ -6753,7 +6753,7 @@ sentences: is_clean: true domain: news register: formal - difficulty_tier: null + difficulty_tier: expected_errors: [] notes: Clean control sentence with no intentional spelling, lexical, grammar, or semantic error. - id: BM-EXT-C085 @@ -6761,7 +6761,7 @@ sentences: is_clean: true domain: news register: formal - difficulty_tier: null + difficulty_tier: expected_errors: [] notes: Clean control sentence with no intentional spelling, lexical, grammar, or semantic error. - id: BM-EXT-C086 @@ -6769,7 +6769,7 @@ sentences: is_clean: true domain: news register: formal - difficulty_tier: null + difficulty_tier: expected_errors: [] notes: Clean control sentence with no intentional spelling, lexical, grammar, or semantic error. 
- id: BM-EXT-C087 @@ -6777,7 +6777,7 @@ sentences: is_clean: true domain: news register: formal - difficulty_tier: null + difficulty_tier: expected_errors: [] notes: Clean control sentence with no intentional spelling, lexical, grammar, or semantic error. - id: BM-EXT-C088 @@ -6785,7 +6785,7 @@ sentences: is_clean: true domain: news register: formal - difficulty_tier: null + difficulty_tier: expected_errors: [] notes: Clean control sentence with no intentional spelling, lexical, grammar, or semantic error. - id: BM-EXT-C089 @@ -6793,7 +6793,7 @@ sentences: is_clean: true domain: news register: formal - difficulty_tier: null + difficulty_tier: expected_errors: [] notes: Clean control sentence with no intentional spelling, lexical, grammar, or semantic error. - id: BM-EXT-C090 @@ -6801,7 +6801,7 @@ sentences: is_clean: true domain: news register: formal - difficulty_tier: null + difficulty_tier: expected_errors: [] notes: Clean control sentence with no intentional spelling, lexical, grammar, or semantic error. - id: BM-EXT-C091 @@ -6809,7 +6809,7 @@ sentences: is_clean: true domain: news register: formal - difficulty_tier: null + difficulty_tier: expected_errors: [] notes: Clean control sentence with no intentional spelling, lexical, grammar, or semantic error. - id: BM-EXT-C092 @@ -6817,7 +6817,7 @@ sentences: is_clean: true domain: news register: formal - difficulty_tier: null + difficulty_tier: expected_errors: [] notes: Clean control sentence with no intentional spelling, lexical, grammar, or semantic error. - id: BM-EXT-C093 @@ -6825,7 +6825,7 @@ sentences: is_clean: true domain: news register: formal - difficulty_tier: null + difficulty_tier: expected_errors: [] notes: Clean control sentence with no intentional spelling, lexical, grammar, or semantic error. 
- id: BM-EXT-C094 @@ -6833,7 +6833,7 @@ sentences: is_clean: true domain: news register: formal - difficulty_tier: null + difficulty_tier: expected_errors: [] notes: Clean control sentence with no intentional spelling, lexical, grammar, or semantic error. - id: BM-EXT-C095 @@ -6841,7 +6841,7 @@ sentences: is_clean: true domain: news register: formal - difficulty_tier: null + difficulty_tier: expected_errors: [] notes: Clean control sentence with no intentional spelling, lexical, grammar, or semantic error. - id: BM-EXT-C096 @@ -6849,7 +6849,7 @@ sentences: is_clean: true domain: news register: formal - difficulty_tier: null + difficulty_tier: expected_errors: [] notes: Clean control sentence with no intentional spelling, lexical, grammar, or semantic error. - id: BM-EXT-C097 @@ -6857,7 +6857,7 @@ sentences: is_clean: true domain: news register: formal - difficulty_tier: null + difficulty_tier: expected_errors: [] notes: Clean control sentence with no intentional spelling, lexical, grammar, or semantic error. - id: BM-EXT-C098 @@ -6865,7 +6865,7 @@ sentences: is_clean: true domain: news register: formal - difficulty_tier: null + difficulty_tier: expected_errors: [] notes: Clean control sentence with no intentional spelling, lexical, grammar, or semantic error. - id: BM-EXT-C099 @@ -6873,7 +6873,7 @@ sentences: is_clean: true domain: news register: formal - difficulty_tier: null + difficulty_tier: expected_errors: [] notes: Clean control sentence with no intentional spelling, lexical, grammar, or semantic error. - id: BM-EXT-C100 @@ -6881,7 +6881,7 @@ sentences: is_clean: true domain: news register: formal - difficulty_tier: null + difficulty_tier: expected_errors: [] notes: Clean control sentence with no intentional spelling, lexical, grammar, or semantic error. 
- id: BM-EXT-C101 @@ -6889,7 +6889,7 @@ sentences: is_clean: true domain: conversational register: polite - difficulty_tier: null + difficulty_tier: expected_errors: [] notes: Clean control sentence with no intentional spelling, lexical, grammar, or semantic error. - id: BM-EXT-C102 @@ -6897,7 +6897,7 @@ sentences: is_clean: true domain: conversational register: polite - difficulty_tier: null + difficulty_tier: expected_errors: [] notes: Clean control sentence with no intentional spelling, lexical, grammar, or semantic error. - id: BM-EXT-C103 @@ -6905,7 +6905,7 @@ sentences: is_clean: true domain: conversational register: polite - difficulty_tier: null + difficulty_tier: expected_errors: [] notes: Clean control sentence with no intentional spelling, lexical, grammar, or semantic error. - id: BM-EXT-C104 @@ -6913,7 +6913,7 @@ sentences: is_clean: true domain: conversational register: polite - difficulty_tier: null + difficulty_tier: expected_errors: [] notes: Clean control sentence with no intentional spelling, lexical, grammar, or semantic error. - id: BM-EXT-C105 @@ -6921,7 +6921,7 @@ sentences: is_clean: true domain: conversational register: polite - difficulty_tier: null + difficulty_tier: expected_errors: [] notes: Clean control sentence with no intentional spelling, lexical, grammar, or semantic error. - id: BM-EXT-C106 @@ -6929,7 +6929,7 @@ sentences: is_clean: true domain: conversational register: polite - difficulty_tier: null + difficulty_tier: expected_errors: [] notes: Clean control sentence with no intentional spelling, lexical, grammar, or semantic error. - id: BM-EXT-C107 @@ -6937,7 +6937,7 @@ sentences: is_clean: true domain: conversational register: polite - difficulty_tier: null + difficulty_tier: expected_errors: [] notes: Clean control sentence with no intentional spelling, lexical, grammar, or semantic error. 
- id: BM-EXT-C108 @@ -6945,7 +6945,7 @@ sentences: is_clean: true domain: conversational register: polite - difficulty_tier: null + difficulty_tier: expected_errors: [] notes: Clean control sentence with no intentional spelling, lexical, grammar, or semantic error. - id: BM-EXT-C109 @@ -6953,7 +6953,7 @@ sentences: is_clean: true domain: conversational register: polite - difficulty_tier: null + difficulty_tier: expected_errors: [] notes: Clean control sentence with no intentional spelling, lexical, grammar, or semantic error. - id: BM-EXT-C110 @@ -6961,7 +6961,7 @@ sentences: is_clean: true domain: conversational register: polite - difficulty_tier: null + difficulty_tier: expected_errors: [] notes: Clean control sentence with no intentional spelling, lexical, grammar, or semantic error. - id: BM-EXT-C111 @@ -6969,7 +6969,7 @@ sentences: is_clean: true domain: conversational register: polite - difficulty_tier: null + difficulty_tier: expected_errors: [] notes: Clean control sentence with no intentional spelling, lexical, grammar, or semantic error. - id: BM-EXT-C112 @@ -6977,7 +6977,7 @@ sentences: is_clean: true domain: conversational register: polite - difficulty_tier: null + difficulty_tier: expected_errors: [] notes: Clean control sentence with no intentional spelling, lexical, grammar, or semantic error. - id: BM-EXT-C113 @@ -6985,7 +6985,7 @@ sentences: is_clean: true domain: conversational register: polite - difficulty_tier: null + difficulty_tier: expected_errors: [] notes: Clean control sentence with no intentional spelling, lexical, grammar, or semantic error. - id: BM-EXT-C114 @@ -6993,7 +6993,7 @@ sentences: is_clean: true domain: conversational register: polite - difficulty_tier: null + difficulty_tier: expected_errors: [] notes: Clean control sentence with no intentional spelling, lexical, grammar, or semantic error. 
- id: BM-EXT-C115 @@ -7001,7 +7001,7 @@ sentences: is_clean: true domain: conversational register: polite - difficulty_tier: null + difficulty_tier: expected_errors: [] notes: Clean control sentence with no intentional spelling, lexical, grammar, or semantic error. - id: BM-EXT-C116 @@ -7009,7 +7009,7 @@ sentences: is_clean: true domain: conversational register: polite - difficulty_tier: null + difficulty_tier: expected_errors: [] notes: Clean control sentence with no intentional spelling, lexical, grammar, or semantic error. - id: BM-EXT-C117 @@ -7017,7 +7017,7 @@ sentences: is_clean: true domain: conversational register: polite - difficulty_tier: null + difficulty_tier: expected_errors: [] notes: Clean control sentence with no intentional spelling, lexical, grammar, or semantic error. - id: BM-EXT-C118 @@ -7025,7 +7025,7 @@ sentences: is_clean: true domain: conversational register: polite - difficulty_tier: null + difficulty_tier: expected_errors: [] notes: Clean control sentence with no intentional spelling, lexical, grammar, or semantic error. - id: BM-EXT-C119 @@ -7033,7 +7033,7 @@ sentences: is_clean: true domain: conversational register: polite - difficulty_tier: null + difficulty_tier: expected_errors: [] notes: Clean control sentence with no intentional spelling, lexical, grammar, or semantic error. - id: BM-EXT-C120 @@ -7041,7 +7041,7 @@ sentences: is_clean: true domain: conversational register: polite - difficulty_tier: null + difficulty_tier: expected_errors: [] notes: Clean control sentence with no intentional spelling, lexical, grammar, or semantic error. - id: BM-EXT-C121 @@ -7049,7 +7049,7 @@ sentences: is_clean: true domain: conversational register: polite - difficulty_tier: null + difficulty_tier: expected_errors: [] notes: Clean control sentence with no intentional spelling, lexical, grammar, or semantic error. 
- id: BM-EXT-C122 @@ -7057,7 +7057,7 @@ sentences: is_clean: true domain: conversational register: polite - difficulty_tier: null + difficulty_tier: expected_errors: [] notes: Clean control sentence with no intentional spelling, lexical, grammar, or semantic error. - id: BM-EXT-C123 @@ -7065,7 +7065,7 @@ sentences: is_clean: true domain: conversational register: polite - difficulty_tier: null + difficulty_tier: expected_errors: [] notes: Clean control sentence with no intentional spelling, lexical, grammar, or semantic error. - id: BM-EXT-C124 @@ -7073,7 +7073,7 @@ sentences: is_clean: true domain: conversational register: polite - difficulty_tier: null + difficulty_tier: expected_errors: [] notes: Clean control sentence with no intentional spelling, lexical, grammar, or semantic error. - id: BM-EXT-C125 @@ -7081,7 +7081,7 @@ sentences: is_clean: true domain: conversational register: polite - difficulty_tier: null + difficulty_tier: expected_errors: [] notes: Clean control sentence with no intentional spelling, lexical, grammar, or semantic error. - id: BM-EXT-C126 @@ -7089,7 +7089,7 @@ sentences: is_clean: true domain: technical register: formal - difficulty_tier: null + difficulty_tier: expected_errors: [] notes: Clean control sentence with no intentional spelling, lexical, grammar, or semantic error. - id: BM-EXT-C127 @@ -7097,7 +7097,7 @@ sentences: is_clean: true domain: technical register: formal - difficulty_tier: null + difficulty_tier: expected_errors: [] notes: Clean control sentence with no intentional spelling, lexical, grammar, or semantic error. - id: BM-EXT-C128 @@ -7105,7 +7105,7 @@ sentences: is_clean: true domain: technical register: formal - difficulty_tier: null + difficulty_tier: expected_errors: [] notes: Clean control sentence with no intentional spelling, lexical, grammar, or semantic error. 
- id: BM-EXT-C129 @@ -7113,7 +7113,7 @@ sentences: is_clean: true domain: technical register: formal - difficulty_tier: null + difficulty_tier: expected_errors: [] notes: Clean control sentence with no intentional spelling, lexical, grammar, or semantic error. - id: BM-EXT-C130 @@ -7121,7 +7121,7 @@ sentences: is_clean: true domain: technical register: formal - difficulty_tier: null + difficulty_tier: expected_errors: [] notes: Clean control sentence with no intentional spelling, lexical, grammar, or semantic error. - id: BM-EXT-C131 @@ -7129,7 +7129,7 @@ sentences: is_clean: true domain: technical register: formal - difficulty_tier: null + difficulty_tier: expected_errors: [] notes: Clean control sentence with no intentional spelling, lexical, grammar, or semantic error. - id: BM-EXT-C132 @@ -7137,7 +7137,7 @@ sentences: is_clean: true domain: technical register: formal - difficulty_tier: null + difficulty_tier: expected_errors: [] notes: Clean control sentence with no intentional spelling, lexical, grammar, or semantic error. - id: BM-EXT-C133 @@ -7145,7 +7145,7 @@ sentences: is_clean: true domain: technical register: formal - difficulty_tier: null + difficulty_tier: expected_errors: [] notes: Clean control sentence with no intentional spelling, lexical, grammar, or semantic error. - id: BM-EXT-C134 @@ -7153,7 +7153,7 @@ sentences: is_clean: true domain: technical register: formal - difficulty_tier: null + difficulty_tier: expected_errors: [] notes: Clean control sentence with no intentional spelling, lexical, grammar, or semantic error. - id: BM-EXT-C135 @@ -7161,7 +7161,7 @@ sentences: is_clean: true domain: technical register: formal - difficulty_tier: null + difficulty_tier: expected_errors: [] notes: Clean control sentence with no intentional spelling, lexical, grammar, or semantic error. 
- id: BM-EXT-C136 @@ -7169,7 +7169,7 @@ sentences: is_clean: true domain: technical register: formal - difficulty_tier: null + difficulty_tier: expected_errors: [] notes: Clean control sentence with no intentional spelling, lexical, grammar, or semantic error. - id: BM-EXT-C137 @@ -7177,7 +7177,7 @@ sentences: is_clean: true domain: technical register: formal - difficulty_tier: null + difficulty_tier: expected_errors: [] notes: Clean control sentence with no intentional spelling, lexical, grammar, or semantic error. - id: BM-EXT-C138 @@ -7185,7 +7185,7 @@ sentences: is_clean: true domain: technical register: formal - difficulty_tier: null + difficulty_tier: expected_errors: [] notes: Clean control sentence with no intentional spelling, lexical, grammar, or semantic error. - id: BM-EXT-C139 @@ -7193,7 +7193,7 @@ sentences: is_clean: true domain: technical register: formal - difficulty_tier: null + difficulty_tier: expected_errors: [] notes: Clean control sentence with no intentional spelling, lexical, grammar, or semantic error. - id: BM-EXT-C140 @@ -7201,7 +7201,7 @@ sentences: is_clean: true domain: technical register: formal - difficulty_tier: null + difficulty_tier: expected_errors: [] notes: Clean control sentence with no intentional spelling, lexical, grammar, or semantic error. - id: BM-EXT-C141 @@ -7209,7 +7209,7 @@ sentences: is_clean: true domain: technical register: formal - difficulty_tier: null + difficulty_tier: expected_errors: [] notes: Clean control sentence with no intentional spelling, lexical, grammar, or semantic error. - id: BM-EXT-C142 @@ -7217,7 +7217,7 @@ sentences: is_clean: true domain: technical register: formal - difficulty_tier: null + difficulty_tier: expected_errors: [] notes: Clean control sentence with no intentional spelling, lexical, grammar, or semantic error. 
- id: BM-EXT-C143 @@ -7225,7 +7225,7 @@ sentences: is_clean: true domain: technical register: formal - difficulty_tier: null + difficulty_tier: expected_errors: [] notes: Clean control sentence with no intentional spelling, lexical, grammar, or semantic error. - id: BM-EXT-C144 @@ -7233,7 +7233,7 @@ sentences: is_clean: true domain: technical register: formal - difficulty_tier: null + difficulty_tier: expected_errors: [] notes: Clean control sentence with no intentional spelling, lexical, grammar, or semantic error. - id: BM-EXT-C145 @@ -7241,7 +7241,7 @@ sentences: is_clean: true domain: technical register: formal - difficulty_tier: null + difficulty_tier: expected_errors: [] notes: Clean control sentence with no intentional spelling, lexical, grammar, or semantic error. - id: BM-EXT-C146 @@ -7249,7 +7249,7 @@ sentences: is_clean: true domain: technical register: formal - difficulty_tier: null + difficulty_tier: expected_errors: [] notes: Clean control sentence with no intentional spelling, lexical, grammar, or semantic error. - id: BM-EXT-C147 @@ -7257,7 +7257,7 @@ sentences: is_clean: true domain: technical register: formal - difficulty_tier: null + difficulty_tier: expected_errors: [] notes: Clean control sentence with no intentional spelling, lexical, grammar, or semantic error. - id: BM-EXT-C148 @@ -7265,7 +7265,7 @@ sentences: is_clean: true domain: technical register: formal - difficulty_tier: null + difficulty_tier: expected_errors: [] notes: Clean control sentence with no intentional spelling, lexical, grammar, or semantic error. - id: BM-EXT-C149 @@ -7273,7 +7273,7 @@ sentences: is_clean: true domain: technical register: formal - difficulty_tier: null + difficulty_tier: expected_errors: [] notes: Clean control sentence with no intentional spelling, lexical, grammar, or semantic error. 
- id: BM-EXT-C150 @@ -7281,7 +7281,7 @@ sentences: is_clean: true domain: technical register: formal - difficulty_tier: null + difficulty_tier: expected_errors: [] notes: Clean control sentence with no intentional spelling, lexical, grammar, or semantic error. - id: BM-EXT-E001 @@ -8834,7 +8834,7 @@ sentences: is_clean: true domain: news register: informal - difficulty_tier: null + difficulty_tier: expected_errors: [] notes: Clean control sentence from myanmar-c4-dataset.txt (33 words). - id: BM-EXP-C002 @@ -8842,7 +8842,7 @@ sentences: is_clean: true domain: technical register: informal - difficulty_tier: null + difficulty_tier: expected_errors: [] notes: Clean control sentence from myanmar-fineweb-2-dataset-filtered.txt (9 words). - id: BM-EXP-C003 @@ -8850,7 +8850,7 @@ sentences: is_clean: true domain: technical register: formal - difficulty_tier: null + difficulty_tier: expected_errors: [] notes: Clean control sentence from myanmar-fineweb-2-dataset-filtered.txt (8 words). - id: BM-EXP-C004 @@ -8858,7 +8858,7 @@ sentences: is_clean: true domain: religious register: informal - difficulty_tier: null + difficulty_tier: expected_errors: [] notes: Clean control sentence from pali-myanmar-dictionary-corpus.txt (4 words). - id: BM-EXP-C005 @@ -8866,7 +8866,7 @@ sentences: is_clean: true domain: religious register: informal - difficulty_tier: null + difficulty_tier: expected_errors: [] notes: Clean control sentence from tipitaka_myanmar_translation_books.txt (22 words). - id: BM-EXP-C006 @@ -8874,7 +8874,7 @@ sentences: is_clean: true domain: academic register: formal - difficulty_tier: null + difficulty_tier: expected_errors: [] notes: Clean control sentence from myanmar-xnli_sentence2_my.txt (4 words). 
- id: BM-EXP-C007 @@ -8882,7 +8882,7 @@ sentences: is_clean: false domain: general register: formal - difficulty_tier: null + difficulty_tier: expected_errors: - error_id: BM-EXP-C007-E1 error_type: semantic_error @@ -8904,7 +8904,7 @@ sentences: is_clean: true domain: general register: formal - difficulty_tier: null + difficulty_tier: expected_errors: [] notes: Clean control sentence from myanmar-culturax-dataset.txt (7 words). - id: BM-EXP-C009 @@ -8912,7 +8912,7 @@ sentences: is_clean: true domain: general register: formal - difficulty_tier: null + difficulty_tier: expected_errors: [] notes: Clean control sentence from myanmar-culturax-dataset.txt (19 words). - id: BM-EXP-C010 @@ -8920,7 +8920,7 @@ sentences: is_clean: true domain: literary register: colloquial - difficulty_tier: null + difficulty_tier: expected_errors: [] notes: Input fixed (Zawgyi U+102C→U+102B) by Pro 2026-04-15. U+102C (ော) used instead of U+102B (ေါ) in 'ပောများ'. Should be 'ပေါများ'. This is a common Zawgyi-to-Unicode conversion error.. U+102C (ော) used instead of U+102B (ေါ) in 'ပောများ'. Should be 'ပေါများ'. This is a common Zawgyi-to-Unicode conversion error.. Clean control sentence from myanmar-literature-corpus.txt (26 words). - id: BM-EXP-C011 @@ -8928,7 +8928,7 @@ sentences: is_clean: true domain: conversational register: formal - difficulty_tier: null + difficulty_tier: expected_errors: [] notes: Input fixed (Zawgyi U+102C→U+102B) by Pro 2026-04-15. U+102C (ော) used instead of U+102B (ေါ) in 'ပော်ပောက်'. Should be 'ပေါ်ပေါက်'. This is a common Zawgyi-to-Unicode conversion error.. U+102C (ော) used instead of U+102B (ေါ) in 'ခောင်းဆောင်'. Should be 'ခေါင်းဆောင်'. This is a common Zawgyi-to-Unicode conversion error.. U+102C (ော) used instead of U+102B (ေါ) in 'အပော်'. Should be 'အပေါ်'. This is a common Zawgyi-to-Unicode conversion error.. U+102C (ော) used instead of U+102B (ေါ) in 'ပော်ပောက်'. Should be 'ပေါ်ပေါက်'. This is a common Zawgyi-to-Unicode conversion error.. 
U+102C (ော) used instead of U+102B (ေါ) in 'ခောင်းဆောင်'. Should be 'ခေါင်းဆောင်'. This is a common Zawgyi-to-Unicode conversion error.. U+102C (ော) used instead of U+102B (ေါ) in 'အပော်'. Should be 'အပေါ်'. This is a common Zawgyi-to-Unicode conversion error.. Clean control sentence from myanmar_spoken_corpus.txt (31 words). - id: BM-EXP-C012 @@ -8936,7 +8936,7 @@ sentences: is_clean: true domain: conversational register: colloquial - difficulty_tier: null + difficulty_tier: expected_errors: [] notes: Clean control sentence from myanmar_spoken_corpus.txt (4 words). - id: BM-EXP-C013 @@ -8944,7 +8944,7 @@ sentences: is_clean: true domain: conversational register: formal - difficulty_tier: null + difficulty_tier: expected_errors: [] notes: Input fixed (Zawgyi U+102C→U+102B) by Pro 2026-04-15. U+102C (ော) used instead of U+102B (ေါ) in 'ပောက်ထွက်'. Should be 'ပေါ်ထွက်'. This is a common Zawgyi-to-Unicode conversion error.. U+102C (ော) used instead of U+102B (ေါ) in 'ပောက်ထွက်'. Should be 'ပေါ်ထွက်'. This is a common Zawgyi-to-Unicode conversion error.. Clean control sentence from URajinda-myanmar_spoken_corpus.txt (48 words). - id: BM-EXP-C014 @@ -8952,7 +8952,7 @@ sentences: is_clean: true domain: religious register: informal - difficulty_tier: null + difficulty_tier: expected_errors: [] notes: Clean control sentence from tipitaka_myanmar_translation_books.txt (8 words). - id: BM-EXP-C015 @@ -8960,7 +8960,7 @@ sentences: is_clean: true domain: general register: formal - difficulty_tier: null + difficulty_tier: expected_errors: [] notes: Input fixed (Zawgyi U+102C→U+102B) by Pro 2026-04-15. U+102C (ော) used instead of U+102B (ေါ) in 'ပော့'. Should be 'ပေါ့'. This is a common Zawgyi-to-Unicode conversion error.. U+102C (ော) used instead of U+102B (ေါ) in 'ပောက်ကွဲ'. Should be 'ပေါက်ကွဲ'. This is a common Zawgyi-to-Unicode conversion error.. U+102C (ော) used instead of U+102B (ေါ) in 'ပောက်ကွဲ'. Should be 'ပေါက်ကွဲ'. 
This is a common Zawgyi-to-Unicode conversion error.. U+102C (ော) used instead of U+102B (ေါ) in 'ပောက်ကွဲ'. Should be 'ပေါက်ကွဲ'. This is a common Zawgyi-to-Unicode conversion error.. U+102C (ော) used instead of U+102B (ေါ) in 'ပော့'. Should be 'ပေါ့'. This is a common Zawgyi-to-Unicode conversion error.. U+102C (ော) used instead of U+102B (ေါ) in 'ပောက်ကွဲ'. Should be 'ပေါက်ကွဲ'. This is a common Zawgyi-to-Unicode conversion error.. U+102C (ော) used instead of U+102B (ေါ) in 'ပောက်ကွဲ'. Should be 'ပေါက်ကွဲ'. This is a common Zawgyi-to-Unicode conversion error.. U+102C (ော) used instead of U+102B (ေါ) in 'ပောက်ကွဲ'. Should be 'ပေါက်ကွဲ'. This is a common Zawgyi-to-Unicode conversion error.. Clean control sentence from myanmar-culturax-dataset.txt (43 words). - id: BM-EXP-C016 @@ -8968,7 +8968,7 @@ sentences: is_clean: true domain: academic register: polite - difficulty_tier: null + difficulty_tier: expected_errors: [] notes: Input fixed (Zawgyi U+102C→U+102B) by Pro 2026-04-15. U+102C (ော) used instead of U+102B (ေါ) in 'စုစုပောင်း'. Should be 'စုစုပေါင်း'. This is a common Zawgyi-to-Unicode conversion error.. U+102C (ော) used instead of U+102B (ေါ) in 'စုစုပောင်း'. Should be 'စုစုပေါင်း'. This is a common Zawgyi-to-Unicode conversion error.. Clean control sentence from myanmar-wikipedia-dataset.txt (18 words). - id: BM-EXP-C017 @@ -8976,7 +8976,7 @@ sentences: is_clean: true domain: news register: formal - difficulty_tier: null + difficulty_tier: expected_errors: [] notes: Clean control sentence from myanmar-c4-dataset.txt (33 words). - id: BM-EXP-C018 @@ -8984,7 +8984,7 @@ sentences: is_clean: false domain: news register: polite - difficulty_tier: null + difficulty_tier: expected_errors: - error_id: BM-EXP-C018-E1 error_type: wrong_word @@ -9005,7 +9005,7 @@ sentences: is_clean: true domain: academic register: informal - difficulty_tier: null + difficulty_tier: expected_errors: [] notes: Input fixed (Zawgyi U+102C→U+102B) by Pro 2026-04-15. 
The word 'ဆီပောက်မျှ' uses U+102B (ါ) incorrectly. It should be 'ဆီပေါက်မျှ' using U+102C (ာ) for the meaning 'a drop/spot of oil'. This is a common Z. The word 'ဆီပောက်မျှ' uses U+102B (ါ) incorrectly. It should be 'ဆီပေါက်မျှ' using U+102C (ာ) for the meaning 'a drop/spot of oil'. This is a common Z. Clean control sentence from myanmar-wikipedia-dataset.txt (30 words). - id: BM-EXP-C020 @@ -9013,7 +9013,7 @@ sentences: is_clean: true domain: conversational register: informal - difficulty_tier: null + difficulty_tier: expected_errors: [] notes: Pro-confirmed clean 2026-04-15. In Myanmar script, spaces are not strictly required between words. 'ကျွန်တော်တို့ဘယ်လောက်' is perfec - id: BM-EXP-C021 @@ -9021,7 +9021,7 @@ sentences: is_clean: false domain: technical register: informal - difficulty_tier: null + difficulty_tier: expected_errors: - error_id: BM-EXP-C021-E1 error_type: wrong_word @@ -9042,7 +9042,7 @@ sentences: is_clean: true domain: literary register: informal - difficulty_tier: null + difficulty_tier: expected_errors: [] notes: Input fixed (Zawgyi U+102C→U+102B) by Pro 2026-04-15. The word 'ကိုခောင်းကြီး' uses U+102C (ာ) incorrectly. It should be 'ကိုခေါင်းကြီး' using U+102B (ါ) for 'head'. This is a common Zawgyi-to-Unicode con. The word 'ကိုခောင်းကြီး' uses U+102C (ာ) incorrectly. It should be 'ကိုခေါင်းကြီး' using U+102B (ါ) for 'head'. This is a common Zawgyi-to-Unicode con. Clean control sentence from myanmar-literature-corpus.txt (8 words). - id: BM-EXP-C023 @@ -9050,7 +9050,7 @@ sentences: is_clean: true domain: religious register: informal - difficulty_tier: null + difficulty_tier: expected_errors: [] notes: '[ZAWGYI_CHECK] The word ''ဖောက်ကြံ့ပင်'' uses U+102C (ာ) incorrectly. It should be ''ဖေါက်ကြံ့ပင်'' using U+102B (ါ) for the name of the tree. This is a common Zawgyi-to. [ZAWGYI_CHECK] The word ''ဖောက်ကြံ့ပင်'' uses U+102C (ာ) incorrectly. It should be ''ဖေါက်ကြံ့ပင်'' using U+102B (ါ) for the name of the tree. This is a common Zawgyi-to. 
Clean control sentence from tipitaka_myanmar_translation_books.txt (18 words).' - id: BM-EXP-C024 @@ -9058,7 +9058,7 @@ sentences: is_clean: true domain: literary register: formal - difficulty_tier: null + difficulty_tier: expected_errors: [] notes: Input fixed (Zawgyi U+102C→U+102B) by Pro 2026-04-15. The word 'ခောင်းဆောင်ကြီးများ' uses U+102C (ာ) incorrectly. It should be 'ခေါင်းဆောင်ကြီးများ' using U+102B (ါ) for 'leader'. This is a common Zawgyi-. The word 'ခောင်းဆောင်ကြီးများ' uses U+102C (ာ) incorrectly. It should be 'ခေါင်းဆောင်ကြီးများ' using U+102B (ါ) for 'leader'. This is a common Zawgyi-. Clean control sentence from myanmar-literature-corpus.txt (19 words). - id: BM-EXP-C025 @@ -9066,7 +9066,7 @@ sentences: is_clean: false domain: conversational register: colloquial - difficulty_tier: null + difficulty_tier: expected_errors: - error_id: BM-EXP-C025-E1 error_type: word_error @@ -9114,7 +9114,7 @@ sentences: is_clean: false domain: technical register: informal - difficulty_tier: null + difficulty_tier: expected_errors: - error_id: BM-EXP-C026-E1 error_type: invalid_syllable @@ -9148,7 +9148,7 @@ sentences: is_clean: false domain: academic register: informal - difficulty_tier: null + difficulty_tier: expected_errors: - error_id: BM-EXP-C027-E1 error_type: wrong_word @@ -9182,7 +9182,7 @@ sentences: is_clean: true domain: general register: polite - difficulty_tier: null + difficulty_tier: expected_errors: [] notes: Clean control sentence from myanmar-culturax-dataset.txt (12 words). - id: BM-EXP-C029 @@ -9190,7 +9190,7 @@ sentences: is_clean: true domain: news register: formal - difficulty_tier: null + difficulty_tier: expected_errors: [] notes: Clean control sentence from myanmar-c4-dataset.txt (27 words). - id: BM-EXP-C030 @@ -9198,7 +9198,7 @@ sentences: is_clean: true domain: technical register: formal - difficulty_tier: null + difficulty_tier: expected_errors: [] notes: Clean control sentence from myanmar-fineweb-2-dataset-filtered.txt (17 words). 
- id: BM-EXP-C031 @@ -9206,7 +9206,7 @@ sentences: is_clean: false domain: religious register: formal - difficulty_tier: null + difficulty_tier: expected_errors: - error_id: BM-EXP-C031-E1 error_type: wrong_word @@ -9228,7 +9228,7 @@ sentences: is_clean: true domain: academic register: informal - difficulty_tier: null + difficulty_tier: expected_errors: [] notes: Clean control sentence from myanmar-xnli_sentence2_my.txt (5 words). - id: BM-EXP-C033 @@ -9236,7 +9236,7 @@ sentences: is_clean: true domain: news register: informal - difficulty_tier: null + difficulty_tier: expected_errors: [] notes: 'Input fixed (Zawgyi U+102C→U+102B) by Pro 2026-04-15. [AUDIT] word_segmentation: The compound verb ''ဖတ်ရွတ်ရပါမည်'' (must recite) is incorrectly segmented as ''ဖတ်ရွတ် ရပါမည်''.. [AUDIT] word_segmentation: The compound word ''လွတ်မြောက်ရန်'' (to be free from) is incorrectly segmented as ''လွတ် မြောက် ရန်''.. [AUDIT] word_segmentation: The compound verb ''ဖတ်ရွတ်ရပါမည်'' (must recite) is incorrectly segmented as ''ဖတ်ရွတ် ရပါမည်''.. [AUDIT] word_segmentation: The compound word ''လွတ်မြောက်ရန်'' (to be free from) is incorrectly segmented as ''လွတ် မြောက် ရန်''.. Clean control sentence from myanmar-c4-dataset.txt (11 words).' - id: BM-EXP-C034 @@ -9244,7 +9244,7 @@ sentences: is_clean: true domain: academic register: informal - difficulty_tier: null + difficulty_tier: expected_errors: [] notes: Clean control sentence from myanmar-wikipedia-dataset.txt (5 words). - id: BM-EXP-C035 @@ -9252,7 +9252,7 @@ sentences: is_clean: true domain: news register: colloquial - difficulty_tier: null + difficulty_tier: expected_errors: [] notes: 'Input fixed (Zawgyi U+102C→U+102B) by Pro 2026-04-15. [AUDIT] word_segmentation: The verb phrase ''သက်သာရာရစေပါလိမ့်မယ်'' (will make feel better) is incorrectly segmented as ''သက်သာရာရ. [AUDIT] word_segmentation: The verb phrase ''သက်သာရာရစေပါလိမ့်မယ်'' (will make feel better) is incorrectly segmented as ''သက်သာရာရ. 
Clean control sentence from myanmar-c4-dataset.txt (9 words).' - id: BM-EXP-C036 @@ -9260,7 +9260,7 @@ sentences: is_clean: true domain: technical register: formal - difficulty_tier: null + difficulty_tier: expected_errors: [] notes: 'Input fixed (Zawgyi U+102C→U+102B) by Pro 2026-04-15. [AUDIT] word_segmentation: The compound verb ''ရရှိမည်ဖြစ်သည်။'' (will be obtained) is incorrectly segmented as ''ရရှိ မည်ဖြစ်သည်။. [AUDIT] word_segmentation: The adverb ''လုံလောက်စွာ'' (sufficiently) is incorrectly segmented as ''လုံလောက် စွာ''.. [AUDIT] word_segmentation: The compound phrase ''လာမည့်ဆယ်နှစ်အတွင်း'' (within the next ten years) is incorrectly segmented as ''လ. [AUDIT] word_segmentation: The compound phrase ''ဤခန့်မှန်းချက်အရ'' (according to this estimate) is incorrectly segmented as ''ဤ ခ. [AUDIT] punctuation_error: A dot is incorrectly used in the number ''၁. သန်း''.. [AUDIT] word_segmentation: The compound verb ''ဖြည့်ဆည်းပေးနိုင်သည်။'' (can fulfill) is incorrectly segmented as ''ဖြည့်ဆည်း ပေးနိ. [AUDIT] punctuation_error: A dot is incorrectly used in the number ''၂၇. သန်း''.. [AUDIT] grammatical_error: The particle ''ကို'' is incorrectly used before ''အပြည့်အဝ ပါက''. It should likely be ''အပြည့်အဝ ပါက'' or . [AUDIT] word_segmentation: The compound verb ''ရရှိမည်ဖြစ်သည်။'' (will be obtained) is incorrectly segmented as ''ရရှိ မည်ဖြစ်သည်။. [AUDIT] word_segmentation: The adverb ''လုံလောက်စွာ'' (sufficiently) is incorrectly segmented as ''လုံလောက် စွာ''.. [AUDIT] word_segmentation: The compound phrase ''လာမည့်ဆယ်နှစ်အတွင်း'' (within the next ten years) is incorrectly segmented as ''လ. [AUDIT] word_segmentation: The compound phrase ''ဤခန့်မှန်းချက်အရ'' (according to this estimate) is incorrectly segmented as ''ဤ ခ. [AUDIT] punctuation_error: A dot is incorrectly used in the number ''၁. သန်း''.. [AUDIT] word_segmentation: The compound verb ''ဖြည့်ဆည်းပေးနိုင်သည်။'' (can fulfill) is incorrectly segmented as ''ဖြည့်ဆည်း ပေးနိ. 
[AUDIT] punctuation_error: A dot is incorrectly used in the number ''၂၇. သန်း''.. [AUDIT] grammatical_error: The particle ''ကို'' is incorrectly used before ''အပြည့်အဝ ပါက''. It should likely be ''အပြည့်အဝ ပါက'' or . Clean control sentence from huggingface_myanmar_english_translation.txt (34 words).' - id: BM-EXP-C037 @@ -9268,7 +9268,7 @@ sentences: is_clean: false domain: religious register: informal - difficulty_tier: null + difficulty_tier: expected_errors: - error_id: BM-EXP-C037-E1 error_type: wrong_word @@ -9289,7 +9289,7 @@ sentences: is_clean: false domain: technical register: polite - difficulty_tier: null + difficulty_tier: expected_errors: - error_id: BM-EXP-C038-E1 error_type: wrong_word @@ -9323,7 +9323,7 @@ sentences: is_clean: true domain: technical register: formal - difficulty_tier: null + difficulty_tier: expected_errors: [] notes: Input fixed (Zawgyi U+102C→U+102B) by Pro 2026-04-15. The word 'ပော်' uses U+102B (ါ) incorrectly. It should be 'ပေါ်' using U+102C (ာ). This is a common Zawgyi-to-Unicode conversion issue.. The word 'ပော်' uses U+102B (ါ) incorrectly. It should be 'ပေါ်' using U+102C (ာ). This is a common Zawgyi-to-Unicode conversion issue.. Clean control sentence from huggingface_myanmar_english_translation.txt (5 words). 
- id: BM-EXP-C040 @@ -9331,7 +9331,7 @@ sentences: is_clean: false domain: religious register: informal - difficulty_tier: null + difficulty_tier: expected_errors: - error_id: BM-EXP-C040-E1 error_type: spelling @@ -9352,7 +9352,7 @@ sentences: is_clean: false domain: technical register: informal - difficulty_tier: null + difficulty_tier: expected_errors: - error_id: BM-EXP-C041-E1 error_type: spelling @@ -9400,7 +9400,7 @@ sentences: is_clean: false domain: general register: formal - difficulty_tier: null + difficulty_tier: expected_errors: - error_id: BM-EXP-C042-E1 error_type: spelling @@ -9421,7 +9421,7 @@ sentences: is_clean: true domain: literary register: polite - difficulty_tier: null + difficulty_tier: expected_errors: [] notes: Input fixed (Zawgyi U+102C→U+102B) by Pro 2026-04-15. The word 'ပော်' uses U+102B (ါ) incorrectly. It should be 'ပေါ်' using U+102C (ာ). This is a common Zawgyi-to-Unicode conversion issue.. The word 'ပော်' uses U+102B (ါ) incorrectly. It should be 'ပေါ်' using U+102C (ာ). This is a common Zawgyi-to-Unicode conversion issue.. Clean control sentence from myanmar-literature-corpus.txt (8 words). - id: BM-EXP-C044 @@ -9429,7 +9429,7 @@ sentences: is_clean: false domain: academic register: formal - difficulty_tier: null + difficulty_tier: expected_errors: - error_id: BM-EXP-C044-E1 error_type: spelling @@ -9450,7 +9450,7 @@ sentences: is_clean: true domain: religious register: informal - difficulty_tier: null + difficulty_tier: expected_errors: [] notes: Clean control sentence from tipitaka_myanmar_translation_books.txt (8 words). note: 'Re-labeled: pure Pali/Sanskrit verse, not Myanmar language' @@ -9459,7 +9459,7 @@ sentences: is_clean: true domain: conversational register: formal - difficulty_tier: null + difficulty_tier: expected_errors: [] notes: Input fixed (Zawgyi U+102C→U+102B) by Pro 2026-04-15. Marked clean during benchmark audit 2026-04-15. 
- id: BM-EXP-C047 @@ -9467,7 +9467,7 @@ sentences: is_clean: false domain: technical register: informal - difficulty_tier: null + difficulty_tier: expected_errors: - error_id: BM-EXP-C047-E1 error_type: orthography @@ -9489,7 +9489,7 @@ sentences: is_clean: true domain: academic register: informal - difficulty_tier: null + difficulty_tier: expected_errors: [] notes: Clean control sentence from myanmar-wikipedia-dataset.txt (10 words). - id: BM-EXP-C049 @@ -9497,7 +9497,7 @@ sentences: is_clean: true domain: conversational register: colloquial - difficulty_tier: null + difficulty_tier: expected_errors: [] notes: Clean control sentence from URajinda-myanmar_spoken_corpus.txt (9 words). - id: BM-EXP-C050 @@ -9505,7 +9505,7 @@ sentences: is_clean: true domain: literary register: formal - difficulty_tier: null + difficulty_tier: expected_errors: [] notes: Clean control sentence from myanmar-literature-corpus.txt (30 words). - id: BM-EXP-C051 @@ -9513,7 +9513,7 @@ sentences: is_clean: true domain: technical register: formal - difficulty_tier: null + difficulty_tier: expected_errors: [] notes: Clean control sentence from huggingface_myanmar_english_translation.txt (35 words). - id: BM-EXP-C052 @@ -9521,7 +9521,7 @@ sentences: is_clean: true domain: religious register: informal - difficulty_tier: null + difficulty_tier: expected_errors: [] notes: Clean control sentence from tipitaka_myanmar_translation_books.txt (42 words). note: 'Re-labeled: contains formatting issues (hyphens/ASCII commas/no-space-after-fullstop/numbers-attached)' @@ -9530,7 +9530,7 @@ sentences: is_clean: true domain: religious register: informal - difficulty_tier: null + difficulty_tier: expected_errors: [] notes: Clean control sentence from tipitaka_myanmar_translation_books.txt (29 words). 
- id: BM-EXP-C054 @@ -9538,7 +9538,7 @@ sentences: is_clean: true domain: technical register: formal - difficulty_tier: null + difficulty_tier: expected_errors: [] notes: Clean control sentence from huggingface_myanmar_english_translation.txt (7 words). - id: BM-EXP-C055 @@ -9546,7 +9546,7 @@ sentences: is_clean: true domain: academic register: polite - difficulty_tier: null + difficulty_tier: expected_errors: [] notes: 'Input fixed (Zawgyi U+102C→U+102B) by Pro 2026-04-15. Audit 2026-04-15: has unannotated error(s). Gemini: The sentence is semantically incomplete. It states ''ရွာနေရာကုတ်မှာ ဖြစ်သည်။'' (The village code is) but provides no code. Similarly, ''ကျား ဦး၊ မ ဦး၊ လူ. Needs manual error annotation.' - id: BM-EXP-C056 @@ -9554,7 +9554,7 @@ sentences: is_clean: true domain: general register: colloquial - difficulty_tier: null + difficulty_tier: expected_errors: [] notes: Clean control sentence from mm_corpus_003.txt (4 words). - id: BM-EXP-C057 @@ -9562,7 +9562,7 @@ sentences: is_clean: false domain: technical register: polite - difficulty_tier: null + difficulty_tier: expected_errors: - error_id: BM-EXP-C057-E1 error_type: semantic_error @@ -9584,7 +9584,7 @@ sentences: is_clean: false domain: news register: polite - difficulty_tier: null + difficulty_tier: expected_errors: - error_id: BM-EXP-C058-E1 error_type: spacing @@ -9605,7 +9605,7 @@ sentences: is_clean: true domain: literary register: formal - difficulty_tier: null + difficulty_tier: expected_errors: [] notes: Clean control sentence from myanmar-literature-corpus.txt (8 words). - id: BM-EXP-C060 @@ -9613,7 +9613,7 @@ sentences: is_clean: true domain: technical register: informal - difficulty_tier: null + difficulty_tier: expected_errors: [] notes: Clean control sentence from myanmar-fineweb-2-dataset-filtered.txt (27 words). 
- id: BM-EXP-C061 @@ -9621,7 +9621,7 @@ sentences: is_clean: true domain: academic register: formal - difficulty_tier: null + difficulty_tier: expected_errors: [] notes: Clean control sentence from myanmar-wikipedia-dataset.txt (4 words). - id: BM-EXP-C062 @@ -9629,7 +9629,7 @@ sentences: is_clean: true domain: literary register: colloquial - difficulty_tier: null + difficulty_tier: expected_errors: [] notes: Clean control sentence from myanmar-literature-corpus.txt (14 words). - id: BM-EXP-C063 @@ -9637,7 +9637,7 @@ sentences: is_clean: true domain: technical register: polite - difficulty_tier: null + difficulty_tier: expected_errors: [] notes: Clean control sentence from myanmar-fineweb-2-dataset-filtered.txt (28 words). - id: BM-EXP-C064 @@ -9645,7 +9645,7 @@ sentences: is_clean: true domain: technical register: formal - difficulty_tier: null + difficulty_tier: expected_errors: [] notes: Clean control sentence from huggingface_myanmar_english_translation.txt (16 words). - id: BM-EXP-C065 @@ -9653,7 +9653,7 @@ sentences: is_clean: true domain: conversational register: formal - difficulty_tier: null + difficulty_tier: expected_errors: [] notes: 'Input fixed (Zawgyi U+102C→U+102B) by Pro 2026-04-15. U+102B vs U+102C error: ''တိုက်ပွဲပောင်း'' (U+1015 U+1031 U+102C U+1004 U+1039) should be ''တိုက်ပွဲပေါင်း'' (U+1015 U+1031 U+102B U+1004 U+1039) for ''tot. U+102B vs U+102C error: ''တိုက်ပွဲပောင်း'' (U+1015 U+1031 U+102C U+1004 U+1039) should be ''တိုက်ပွဲပေါင်း'' (U+1015 U+1031 U+102B U+1004 U+1039) for ''tot. Clean control sentence from URajinda-myanmar_spoken_corpus.txt (27 words).' - id: BM-EXP-C066 @@ -9661,7 +9661,7 @@ sentences: is_clean: true domain: academic register: informal - difficulty_tier: null + difficulty_tier: expected_errors: [] notes: 'Input fixed (Zawgyi U+102C→U+102B) by Pro 2026-04-15. U+102B vs U+102C error: ''စုစုပောင်း'' (U+1015 U+1031 U+102C U+1004 U+1039) should be ''စုစုပေါင်း'' (U+1015 U+1031 U+102B U+1004 U+1039) for ''total''.. 
U+102B vs U+102C error: ''သန်းခောင်စာရင်း'' (U+1001 U+1031 U+102C U+1004 U+1039) should be ''သန်းခေါင်စာရင်း'' (U+1001 U+1031 U+102B U+1004 U+1039) for ''c. U+102B vs U+102C error: ''စုစုပောင်း'' (U+1015 U+1031 U+102C U+1004 U+1039) should be ''စုစုပေါင်း'' (U+1015 U+1031 U+102B U+1004 U+1039) for ''total''.. U+102B vs U+102C error: ''သန်းခောင်စာရင်း'' (U+1001 U+1031 U+102C U+1004 U+1039) should be ''သန်းခေါင်စာရင်း'' (U+1001 U+1031 U+102B U+1004 U+1039) for ''c. Clean control sentence from kalixlouiis-myanmar-wikipedia_text_not_cleaned.txt (21 words).' - id: BM-EXP-C067 @@ -9669,7 +9669,7 @@ sentences: is_clean: true domain: academic register: informal - difficulty_tier: null + difficulty_tier: expected_errors: [] notes: Clean control sentence from myanmar-xnli_sentence2_my.txt (6 words). - id: BM-EXP-C068 @@ -9677,7 +9677,7 @@ sentences: is_clean: true domain: general register: colloquial - difficulty_tier: null + difficulty_tier: expected_errors: [] notes: Clean control sentence from myanmar-culturax-dataset.txt (13 words). - id: BM-EXP-C069 @@ -9685,7 +9685,7 @@ sentences: is_clean: true domain: academic register: polite - difficulty_tier: null + difficulty_tier: expected_errors: [] notes: 'Input fixed (Zawgyi U+102C→U+102B) by Pro 2026-04-15. Audit 2026-04-15: has unannotated error(s). Gemini: The sentence fragment ''ရွာနေရာကုတ်မှာ ဖြစ်သည်။'' is incomplete as it states ''The village code is .'' without providing the actual code.. Needs manual error annotation.' - id: BM-EXP-C070 @@ -9693,7 +9693,7 @@ sentences: is_clean: true domain: literary register: informal - difficulty_tier: null + difficulty_tier: expected_errors: [] notes: Clean control sentence from myanmar-literature-corpus.txt (4 words). - id: BM-EXP-C071 @@ -9701,7 +9701,7 @@ sentences: is_clean: true domain: academic register: formal - difficulty_tier: null + difficulty_tier: expected_errors: [] notes: 'Input fixed (Zawgyi U+102C→U+102B) by Pro 2026-04-15. 
Incorrect vowel sign: ''ခော်'' (U+1001 U+102D U+101B) should be ''ခေါ်'' (U+1001 U+102B U+101B) for ''called''. This is a common Zawgyi conversion error.. Audit 2026-04-15: has unannotated error(s). Gemini: Typo/wrong word: ''ဖြေဖြန်'' is not a standard word. It should likely be ''ဖြေဖျန့်'' (to mediate) or ''ဖြေရှင်း'' (to resolve) in this context.. Needs manual error annotation.' - id: BM-EXP-C072 @@ -9709,7 +9709,7 @@ sentences: is_clean: true domain: general register: informal - difficulty_tier: null + difficulty_tier: expected_errors: [] notes: Clean control sentence from myanmar-cc100-dataset.txt (7 words). - id: BM-EXP-C073 @@ -9717,7 +9717,7 @@ sentences: is_clean: true domain: conversational register: colloquial - difficulty_tier: null + difficulty_tier: expected_errors: [] notes: Clean control sentence from URajinda-myanmar_spoken_corpus.txt (13 words). - id: BM-EXP-C074 @@ -9725,7 +9725,7 @@ sentences: is_clean: true domain: technical register: informal - difficulty_tier: null + difficulty_tier: expected_errors: [] notes: Clean control sentence from huggingface_myanmar_english_translation.txt (9 words). - id: BM-EXP-C075 @@ -9733,7 +9733,7 @@ sentences: is_clean: true domain: literary register: polite - difficulty_tier: null + difficulty_tier: expected_errors: [] notes: 'Input fixed (Zawgyi U+102C→U+102B) by Pro 2026-04-15. Incorrect vowel sign: ''ပော်'' (U+1015 U+102D U+101B) should be ''ပေါ်'' (U+1015 U+102B U+101B) for ''on/upon''. This is a common Zawgyi conversion error.. Incorrect vowel sign: ''ပော်'' (U+1015 U+102D U+101B) should be ''ပေါ်'' (U+1015 U+102B U+101B) for ''on/upon''. This is a common Zawgyi conversion error.. Clean control sentence from myanmar-literature-corpus.txt (26 words).' 
- id: BM-EXP-C076 @@ -9741,7 +9741,7 @@ sentences: is_clean: false domain: academic register: polite - difficulty_tier: null + difficulty_tier: expected_errors: - error_id: BM-EXP-C076-E1 error_type: register_error @@ -9791,7 +9791,7 @@ sentences: is_clean: true domain: general register: informal - difficulty_tier: null + difficulty_tier: expected_errors: [] notes: Clean control sentence from myanmar-culturax-dataset.txt (20 words). - id: BM-EXP-C078 @@ -9799,7 +9799,7 @@ sentences: is_clean: false domain: news register: polite - difficulty_tier: null + difficulty_tier: expected_errors: - error_id: BM-EXP-C078-E1 error_type: grammar_error @@ -9887,7 +9887,7 @@ sentences: is_clean: true domain: religious register: informal - difficulty_tier: null + difficulty_tier: expected_errors: [] notes: 'spacing_error: Missing space between `ဤ` and `အကြောင်းကိုလျှောက်ကြကုန်၏`.. spacing_error: Missing space between `ဤ` and `အကြောင်းကိုလျှောက်ကြကုန်၏`.. Clean control sentence from tipitaka_myanmar_translation_books.txt (8 words).' - id: BM-EXP-C080 @@ -9895,7 +9895,7 @@ sentences: is_clean: false domain: academic register: informal - difficulty_tier: null + difficulty_tier: expected_errors: - error_id: BM-EXP-C080-E1 error_type: spelling_error @@ -9929,7 +9929,7 @@ sentences: is_clean: true domain: religious register: informal - difficulty_tier: null + difficulty_tier: expected_errors: [] notes: 'spacing_error: Missing space after comma: `ရေ,ရေအပါအဝင်ဖြစ်သော` should be `ရေ, ရေအပါအဝင်ဖြစ်သော`.. spacing_error: Missing space after comma: `ရေ,ရေအပါအဝင်ဖြစ်သော` should be `ရေ, ရေအပါအဝင်ဖြစ်သော`.. Clean control sentence from tipitaka_myanmar_translation_books.txt (41 words).' 
note: 'Re-labeled: contains formatting issues (hyphens/ASCII commas/no-space-after-fullstop/numbers-attached)' @@ -9938,7 +9938,7 @@ sentences: is_clean: false domain: news register: informal - difficulty_tier: null + difficulty_tier: expected_errors: - error_id: BM-EXP-C082-E1 error_type: spelling_error @@ -9960,7 +9960,7 @@ sentences: is_clean: false domain: academic register: informal - difficulty_tier: null + difficulty_tier: expected_errors: - error_id: BM-EXP-C083-E1 error_type: spelling_error @@ -9975,13 +9975,26 @@ sentences: context_required: false notes: Added by Gemini 3.1 Flash-Lite native review 2026-04-15. domain: spelling + - error_id: BM-EXP-C083-E2 + error_type: wrong_word + error_subtype: zawgyi_conversion_error + detection_layer: word + span: + start: 138 + end: 153 + erroneous_text: သန်းခောင်စာရင်း + gold_correction: သန်းခေါင်စာရင်း + edit_distance: 1 + context_required: false + notes: 'Added by aw-vowel unmask audit 2026-06-07 (unmask-02): generator-planted typo left unannotated because the pre-lookup normalizer silently repaired it before detection; raw form is corpus freq-0.' + domain: spelling notes: 'data_quality: Missing numerical values for `ကျား ဦး`, `မ ဦး`, and `လူဦးရေ စုစုပောင်း ဦး`.. data_quality: Missing value after `ရွာနေရာကုတ်မှာ` (village code is).. data_quality: Empty parentheses `()` after `ကြည့်မြင့်ရွာ` indicate missing information.. data_quality: Missing numerical values for `ကျား ဦး`, `မ ဦး`, and `လူဦးရေ စုစုပောင်း ဦး`.. data_quality: Missing value after `ရွာနေရာကုတ်မှာ` (village code is).. data_quality: Empty parentheses `()` after `ကြည့်မြင့်ရွာ` indicate missing information.. Clean control sentence from kalixlouiis-myanmar-wikipedia_text_not_cleaned.txt (21 words).' 
- id: BM-EXP-C084 input: သတင်းကြားခဲ့ရပါသည်။ ထို့ပြင် A မျိုးသားဒီမိုကရေစီA ဖွဲ့ချုပ်Aား ကျင်းပဆဲ A နုတ်ထက is_clean: false domain: general register: informal - difficulty_tier: null + difficulty_tier: expected_errors: - error_id: BM-EXP-C084-E1 error_type: ocr_error @@ -10015,7 +10028,7 @@ sentences: is_clean: false domain: general register: formal - difficulty_tier: null + difficulty_tier: expected_errors: - error_id: BM-EXP-C085-E1 error_type: spelling_error @@ -10036,7 +10049,7 @@ sentences: is_clean: false domain: conversational register: informal - difficulty_tier: null + difficulty_tier: expected_errors: - error_id: BM-EXP-C086-E1 error_type: grammar_error @@ -10057,7 +10070,7 @@ sentences: is_clean: false domain: religious register: informal - difficulty_tier: null + difficulty_tier: expected_errors: - error_id: BM-EXP-C087-E1 error_type: spelling_error @@ -10078,7 +10091,7 @@ sentences: is_clean: false domain: literary register: colloquial - difficulty_tier: null + difficulty_tier: expected_errors: - error_id: BM-EXP-C088-E1 error_type: spelling @@ -10099,7 +10112,7 @@ sentences: is_clean: true domain: religious register: informal - difficulty_tier: null + difficulty_tier: expected_errors: [] notes: Clean control sentence from tipitaka_myanmar_translation_books.txt (20 words). - id: BM-EXP-C090 @@ -10107,7 +10120,7 @@ sentences: is_clean: false domain: literary register: polite - difficulty_tier: null + difficulty_tier: expected_errors: - error_id: BM-EXP-C090-E1 error_type: spacing @@ -10154,7 +10167,7 @@ sentences: is_clean: true domain: literary register: informal - difficulty_tier: null + difficulty_tier: expected_errors: [] notes: Clean control sentence from myanmar-literature-corpus.txt (19 words). 
- id: BM-EXP-C092 @@ -10162,7 +10175,7 @@ sentences: is_clean: false domain: technical register: polite - difficulty_tier: null + difficulty_tier: expected_errors: - error_id: BM-EXP-C092-E1 error_type: spacing @@ -10229,13 +10242,26 @@ sentences: context_required: false notes: Added by Gemini 3.1 Flash-Lite native review 2026-04-15. domain: spelling + - error_id: BM-EXP-C092-E6 + error_type: wrong_word + error_subtype: zawgyi_conversion_error + detection_layer: word + span: + start: 45 + end: 59 + erroneous_text: မြန်မာဝမ်ပောင် + gold_correction: မြန်မာဝမ်ပေါင် + edit_distance: 1 + context_required: false + notes: 'Added by aw-vowel unmask audit 2026-06-07 (unmask-02): generator-planted typo left unannotated because the pre-lookup normalizer silently repaired it before detection; raw form is corpus freq-0.' + domain: spelling notes: 'Audit 2026-04-15: has unannotated error(s). Gemini: The word ''ထိမ်းသိန်း'' is misspelled. It should be ''ထိန်းသိမ်း'' (to detain/conserve).. Needs manual error annotation.' - id: BM-EXP-C093 input: တောင်ကိုရီးယားသမ္မတဟောင်း လီမြောင်ဘတ် (Lee Myung-bak) ကို အဂတိလိုက်စားမှုဖြင့် ပြီးခဲ့တဲ့ မတ်လ ရက် ညပိုင်းမှာ ဖမ်းဆီးထိမ်းသိန်းခဲ့ပြီး ရာဇဝတ်မှုအရ ဖမ်းဆီးခံရတဲ့ သက်ရှိထင်ရှားရှိနေဆဲ စတုတ္တမြောက် တောင်ကိုရီးယားသမ္မတဟောင်းလည်း ဖြစ်ပါတယ်။ is_clean: false domain: technical register: formal - difficulty_tier: null + difficulty_tier: expected_errors: - error_id: BM-EXP-C093-E1 error_type: lexical_error @@ -10282,7 +10308,7 @@ sentences: is_clean: false domain: general register: formal - difficulty_tier: null + difficulty_tier: expected_errors: - error_id: BM-EXP-C094-E1 error_type: spacing @@ -10446,7 +10472,7 @@ sentences: is_clean: true domain: conversational register: colloquial - difficulty_tier: null + difficulty_tier: expected_errors: [] notes: Clean control sentence from myanmar_spoken_corpus.txt (5 words). 
- id: BM-EXP-C096 @@ -10454,7 +10480,7 @@ sentences: is_clean: true domain: news register: informal - difficulty_tier: null + difficulty_tier: expected_errors: [] notes: Clean control sentence from myanmar-c4-dataset.txt (4 words). - id: BM-EXP-C097 @@ -10462,7 +10488,7 @@ sentences: is_clean: false domain: general register: formal - difficulty_tier: null + difficulty_tier: expected_errors: - error_id: BM-EXP-C097-E1 error_type: lexical_error @@ -10512,7 +10538,7 @@ sentences: is_clean: true domain: academic register: polite - difficulty_tier: null + difficulty_tier: expected_errors: [] notes: 'Input fixed (Zawgyi U+102C→U+102B) by Pro 2026-04-15. The word ''စုစုပောင်း'' should be ''စုစုပေါင်း''. This is a U+102B vs U+102C issue.. The word ''စုစုပောင်း'' should be ''စုစုပေါင်း''. This is a U+102B vs U+102C issue.. Audit 2026-04-15: has unannotated error(s). Gemini: The word ''စုစုပောင်း'' should be ''စုစုပေါင်း''. This is a common Zawgyi-to-Unicode conversion error where U+102B (ါ) is incorrectly converted to U+102C . Needs manual error annotation.' - id: BM-EXP-C099 @@ -10520,7 +10546,7 @@ sentences: is_clean: true domain: literary register: formal - difficulty_tier: null + difficulty_tier: expected_errors: [] notes: 'Input fixed (Zawgyi U+102C→U+102B) by Pro 2026-04-15. The word ''တံခါးပောက်ကြီး'' should be ''တံခါးပေါက်ကြီး''. This is a U+102B vs U+102C issue.. The word ''တံခါးပောက်ကြီး'' should be ''တံခါးပေါက်ကြီး''. This is a U+102B vs U+102C issue.. Audit 2026-04-15: has unannotated error(s). Gemini: The word ''တံခါးပောက်ကြီး'' should be ''တံခါးပေါက်ကြီး''. This is a common Zawgyi-to-Unicode conversion error where U+102B (ါ) is incorrectly converted to. Needs manual error annotation.' - id: BM-EXP-C100 @@ -10528,7 +10554,7 @@ sentences: is_clean: true domain: literary register: colloquial - difficulty_tier: null + difficulty_tier: expected_errors: [] notes: 'Input fixed (Zawgyi U+102C→U+102B) by Pro 2026-04-15. The word ''အပော်'' should be ''အပေါ်''. 
This is a U+102B vs U+102C issue.. The word ''အပော်'' should be ''အပေါ်''. This is a U+102B vs U+102C issue.. Audit 2026-04-15: has unannotated error(s). Gemini: The word ''အပော်'' should be ''အပေါ်''. This is a common Zawgyi-to-Unicode conversion error where U+102B (ါ) is incorrectly converted to U+102C (ာ).. Needs manual error annotation.' - id: BM-EXP-C101 @@ -10536,7 +10562,7 @@ sentences: is_clean: true domain: literary register: informal - difficulty_tier: null + difficulty_tier: expected_errors: [] notes: 'Input fixed (Zawgyi U+102C→U+102B) by Pro 2026-04-15. The word ''ပောက်ပါတယ်'' should be ''ပေါက်ပါတယ်''. This is a U+102B vs U+102C issue.. The word ''ပောက်ပါတယ်'' should be ''ပေါက်ပါတယ်''. This is a U+102B vs U+102C issue.. Audit 2026-04-15: has unannotated error(s). Gemini: The word ''ပောက်ပါတယ်'' should be ''ပေါက်ပါတယ်''. This is a common Zawgyi-to-Unicode conversion error where U+102B (ါ) is incorrectly converted to U+102C . Needs manual error annotation.' - id: BM-EXP-C102 @@ -10544,7 +10570,7 @@ sentences: is_clean: true domain: general register: formal - difficulty_tier: null + difficulty_tier: expected_errors: [] notes: Clean control sentence from myanmar-pretrain-data2.txt (8 words). - id: BM-EXP-C103 @@ -10552,7 +10578,7 @@ sentences: is_clean: true domain: technical register: colloquial - difficulty_tier: null + difficulty_tier: expected_errors: [] notes: Clean control sentence from myanmar-fineweb-2-dataset-filtered.txt (6 words). - id: BM-EXP-C104 @@ -10560,7 +10586,7 @@ sentences: is_clean: true domain: religious register: informal - difficulty_tier: null + difficulty_tier: expected_errors: [] notes: Clean control sentence from tipitaka_myanmar_translation_books.txt (9 words). - id: BM-EXP-C105 @@ -10568,7 +10594,7 @@ sentences: is_clean: true domain: news register: informal - difficulty_tier: null + difficulty_tier: expected_errors: [] notes: Clean control sentence from myanmar-written-corpus.txt (14 words). 
- id: BM-EXP-C106 @@ -10576,7 +10602,7 @@ sentences: is_clean: true domain: news register: informal - difficulty_tier: null + difficulty_tier: expected_errors: [] notes: 'Input fixed (Zawgyi U+102C→U+102B) by Pro 2026-04-15. The word ''ဆိုလိုတာပော့'' should be ''ဆိုလိုတာပေါ့''. This is a U+102B vs U+102C issue.. The word ''ဆိုလိုတာပော့'' should be ''ဆိုလိုတာပေါ့''. This is a U+102B vs U+102C issue.. Audit 2026-04-15: has unannotated error(s). Gemini: The word ''ဆိုလိုတာပော့'' should be ''ဆိုလိုတာပေါ့''. This is a common Zawgyi-to-Unicode conversion error where U+102B (ါ) is incorrectly converted to U+1. Needs manual error annotation.' note: 'Re-labeled: contains formatting issues (hyphens/ASCII commas/no-space-after-fullstop/numbers-attached)' @@ -10585,7 +10611,7 @@ sentences: is_clean: true domain: conversational register: colloquial - difficulty_tier: null + difficulty_tier: expected_errors: [] notes: 'Input fixed (Zawgyi U+102C→U+102B) by Pro 2026-04-15. The word ''ဒောက်တာ'' should be ''ဒေါက်တာ''. This is a U+102B vs U+102C issue.. The word ''ဒောက်တာ'' should be ''ဒေါက်တာ''. This is a U+102B vs U+102C issue.. Audit 2026-04-15: has unannotated error(s). Gemini: The word ''ဒောက်တာ'' should be ''ဒေါက်တာ''. This is a common Zawgyi-to-Unicode conversion error where U+102B (ါ) is incorrectly converted to U+102C (ာ).. Needs manual error annotation.' - id: BM-EXP-C108 @@ -10593,7 +10619,7 @@ sentences: is_clean: true domain: academic register: informal - difficulty_tier: null + difficulty_tier: expected_errors: [] notes: Input fixed (Zawgyi U+102C→U+102B) by Pro 2026-04-15. The word 'ပူးပောင်း' uses U+102C (ာ) instead of U+102B (ါ). It should be 'ပူးပေါင်း'. This is a common Zawgyi-to-Unicode conversion issue.. The word 'ပူးပောင်း' uses U+102C (ာ) instead of U+102B (ါ). It should be 'ပူးပေါင်း'. This is a common Zawgyi-to-Unicode conversion issue.. Clean control sentence from myanmar-wikipedia-dataset.txt (37 words). 
- id: BM-EXP-C109 @@ -10601,7 +10627,7 @@ sentences: is_clean: true domain: general register: formal - difficulty_tier: null + difficulty_tier: expected_errors: [] notes: Clean control sentence from myanmar-culturax-dataset.txt (21 words). - id: BM-EXP-C110 @@ -10609,7 +10635,7 @@ sentences: is_clean: true domain: conversational register: colloquial - difficulty_tier: null + difficulty_tier: expected_errors: [] notes: Input fixed (Zawgyi U+102C→U+102B) by Pro 2026-04-15. The word 'အရေးပော်' uses U+102C (ာ) instead of U+102B (ါ). It should be 'အရေးပေါ်'. This is a common Zawgyi-to-Unicode conversion issue.. The word 'အရေးပော်' uses U+102C (ာ) instead of U+102B (ါ). It should be 'အရေးပေါ်'. This is a common Zawgyi-to-Unicode conversion issue.. Clean control sentence from myanmar_spoken_corpus.txt (13 words). - id: BM-EXP-C111 @@ -10617,7 +10643,7 @@ sentences: is_clean: true domain: academic register: colloquial - difficulty_tier: null + difficulty_tier: expected_errors: [] notes: Clean control sentence from myanmar-xnli_sentence2_my.txt (4 words). - id: BM-EXP-C112 @@ -10625,7 +10651,7 @@ sentences: is_clean: true domain: literary register: informal - difficulty_tier: null + difficulty_tier: expected_errors: [] notes: Input fixed (Zawgyi U+102C→U+102B) by Pro 2026-04-15. The words 'တဈဘဝလုံး' and 'ကြိနျစာဆိုး' contain Zawgyi rendering for 'တစ်' (တဈ -> တစ်) and 'ကျိန်' (ကြိနျ -> ကျိန်).. The words 'တဈဘဝလုံး' and 'ကြိနျစာဆိုး' contain Zawgyi rendering for 'တစ်' (တဈ -> တစ်) and 'ကျိန်' (ကြိနျ -> ကျိန်).. Clean control sentence from myanmar-literature-corpus.txt (4 words). - id: BM-EXP-C113 @@ -10633,7 +10659,7 @@ sentences: is_clean: true domain: academic register: informal - difficulty_tier: null + difficulty_tier: expected_errors: [] notes: Input fixed (Zawgyi U+102C→U+102B) by Pro 2026-04-15. The word 'ပောက်ကုန်း' uses U+102C (ာ) instead of U+102B (ါ). It should be 'ပေါက်ကုန်း'. This is a common Zawgyi-to-Unicode conversion issue.. 
The word 'ပောက်ကုန်း' uses U+102C (ာ) instead of U+102B (ါ). It should be 'ပေါက်ကုန်း'. This is a common Zawgyi-to-Unicode conversion issue.. Clean control sentence from myanmar-wikipedia-dataset.txt (5 words). - id: BM-EXP-C114 @@ -10641,7 +10667,7 @@ sentences: is_clean: true domain: general register: informal - difficulty_tier: null + difficulty_tier: expected_errors: [] notes: Clean control sentence from myanmar-pretrain-data2.txt (22 words). - id: BM-EXP-C115 @@ -10649,7 +10675,7 @@ sentences: is_clean: true domain: academic register: formal - difficulty_tier: null + difficulty_tier: expected_errors: [] notes: Input fixed (Zawgyi U+102C→U+102B) by Pro 2026-04-15. The word 'စိမ့်မြေပော်' uses U+102C (ာ) instead of U+102B (ါ). It should be 'စိမ့်မြေပေါ်'. This is a common Zawgyi-to-Unicode conversion issue.. The word 'စိမ့်မြေပော်' uses U+102C (ာ) instead of U+102B (ါ). It should be 'စိမ့်မြေပေါ်'. This is a common Zawgyi-to-Unicode conversion issue.. Clean control sentence from myanmar-xnli_sentence1_my.txt (14 words). - id: BM-EXP-C116 @@ -10657,7 +10683,7 @@ sentences: is_clean: true domain: news register: informal - difficulty_tier: null + difficulty_tier: expected_errors: [] notes: Clean control sentence from myanmar-c4-dataset.txt (8 words). - id: BM-EXP-C117 @@ -10665,7 +10691,7 @@ sentences: is_clean: true domain: news register: polite - difficulty_tier: null + difficulty_tier: expected_errors: [] notes: Input fixed (Zawgyi U+102C→U+102B) by Pro 2026-04-15. The word 'စုစုပောင်း' uses U+102C (ာ) instead of U+102B (ါ). It should be 'စုစုပေါင်း'. This is a common Zawgyi-to-Unicode conversion issue.. The word 'ပောင်း' uses U+102C (ာ) instead of U+102B (ါ). It should be 'ပေါင်း'. This is a common Zawgyi-to-Unicode conversion issue.. The word 'စုစုပောင်း' uses U+102C (ာ) instead of U+102B (ါ). It should be 'စုစုပေါင်း'. This is a common Zawgyi-to-Unicode conversion issue.. The word 'ပောင်း' uses U+102C (ာ) instead of U+102B (ါ). It should be 'ပေါင်း'. 
This is a common Zawgyi-to-Unicode conversion issue.. Clean control sentence from myanmar-c4-dataset.txt (30 words). - id: BM-EXP-C118 @@ -10673,7 +10699,7 @@ sentences: is_clean: true domain: literary register: polite - difficulty_tier: null + difficulty_tier: expected_errors: [] notes: Clean control sentence from myanmar-literature-corpus.txt (8 words). - id: BM-EXP-C119 @@ -10681,7 +10707,7 @@ sentences: is_clean: true domain: news register: informal - difficulty_tier: null + difficulty_tier: expected_errors: [] notes: Input fixed (Zawgyi U+102C→U+102B) by Pro 2026-04-15. Marked clean during benchmark audit 2026-04-15. note: 'Re-labeled: contains formatting issues (hyphens/ASCII commas/no-space-after-fullstop/numbers-attached)' @@ -10690,7 +10716,7 @@ sentences: is_clean: true domain: religious register: informal - difficulty_tier: null + difficulty_tier: expected_errors: [] notes: Clean control sentence from tipitaka_myanmar_translation_books.txt (11 words). - id: BM-EXP-C121 @@ -10698,7 +10724,7 @@ sentences: is_clean: true domain: academic register: polite - difficulty_tier: null + difficulty_tier: expected_errors: [] notes: Clean control sentence from myanmar-xnli_sentence2_my.txt (14 words). - id: BM-EXP-C122 @@ -10706,7 +10732,7 @@ sentences: is_clean: true domain: conversational register: formal - difficulty_tier: null + difficulty_tier: expected_errors: [] notes: Clean control sentence from myanmar_spoken_corpus.txt (29 words). - id: BM-EXP-C123 @@ -10714,7 +10740,7 @@ sentences: is_clean: true domain: technical register: informal - difficulty_tier: null + difficulty_tier: expected_errors: [] notes: Clean control sentence from huggingface_myanmar_english_translation.txt (14 words). - id: BM-EXP-C124 @@ -10722,7 +10748,7 @@ sentences: is_clean: true domain: technical register: formal - difficulty_tier: null + difficulty_tier: expected_errors: [] notes: Clean control sentence from huggingface_myanmar_english_translation.txt (9 words). 
note: 'Re-labeled: contains formatting issues (hyphens/ASCII commas/no-space-after-fullstop/numbers-attached)' @@ -10731,7 +10757,7 @@ sentences: is_clean: true domain: literary register: formal - difficulty_tier: null + difficulty_tier: expected_errors: [] notes: Pro-confirmed clean 2026-04-15. The phrase 'အ ကား' is contextually valid here, likely meaning 'As for [a character named Ah]', which - id: BM-EXP-C126 @@ -10739,7 +10765,7 @@ sentences: is_clean: true domain: literary register: informal - difficulty_tier: null + difficulty_tier: expected_errors: [] notes: Input fixed (Zawgyi U+102C→U+102B) by Pro 2026-04-15. Marked clean during benchmark audit 2026-04-15. - id: BM-EXP-C127 @@ -10747,7 +10773,7 @@ sentences: is_clean: true domain: literary register: formal - difficulty_tier: null + difficulty_tier: expected_errors: [] notes: Clean control sentence from myanmar-literature-corpus.txt (7 words). - id: BM-EXP-C128 @@ -10755,7 +10781,7 @@ sentences: is_clean: true domain: news register: formal - difficulty_tier: null + difficulty_tier: expected_errors: [] notes: Clean control sentence from myanmar-written-corpus.txt (4 words). - id: BM-EXP-C129 @@ -10763,7 +10789,7 @@ sentences: is_clean: true domain: general register: informal - difficulty_tier: null + difficulty_tier: expected_errors: [] notes: Clean control sentence from myanmar-culturax-dataset.txt (13 words). - id: BM-EXP-C130 @@ -10771,7 +10797,7 @@ sentences: is_clean: true domain: academic register: informal - difficulty_tier: null + difficulty_tier: expected_errors: [] notes: Clean control sentence from myanmar-wikipedia-dataset.txt (31 words). - id: BM-EXP-C131 @@ -10779,7 +10805,7 @@ sentences: is_clean: true domain: religious register: informal - difficulty_tier: null + difficulty_tier: expected_errors: [] notes: 'Input fixed (Zawgyi U+102C→U+102B) by Pro 2026-04-15. The word ''ပော်'' (U+102C) should be ''ပေါ်'' (U+102B), indicating a Zawgyi-to-Unicode conversion issue.. 
The word ''ပော်'' (U+102C) should be ''ပေါ်'' (U+102B), indicating a Zawgyi-to-Unicode conversion issue.. Audit 2026-04-15: has unannotated error(s). Gemini: The word ''ပော်'' (U+102C) is incorrect. It should be ''ပေါ်'' (U+102B), which is a common Zawgyi-to-Unicode conversion error.. Needs manual error annotation.' - id: BM-EXP-C132 @@ -10787,7 +10813,7 @@ sentences: is_clean: true domain: academic register: formal - difficulty_tier: null + difficulty_tier: expected_errors: [] notes: 'Input fixed (Zawgyi U+102C→U+102B) by Pro 2026-04-15. The word ''ခော်'' (U+102C) should be ''ခေါ်'' (U+102B), indicating a Zawgyi-to-Unicode conversion issue.. The word ''ခော်'' (U+102C) should be ''ခေါ်'' (U+102B), indicating a Zawgyi-to-Unicode conversion issue.. Audit 2026-04-15: has unannotated error(s). Gemini: The word ''ခော်'' (U+102C) is incorrect. It should be ''ခေါ်'' (U+102B), which is a common Zawgyi-to-Unicode conversion error.. Needs manual error annotation.' - id: BM-EXP-C133 @@ -10795,7 +10821,7 @@ sentences: is_clean: false domain: religious register: informal - difficulty_tier: null + difficulty_tier: expected_errors: - error_id: BM-EXP-C133-E1 error_type: wrong_word @@ -10829,7 +10855,7 @@ sentences: is_clean: true domain: general register: formal - difficulty_tier: null + difficulty_tier: expected_errors: [] notes: Clean control sentence from myanmar-cc100-dataset.txt (26 words). 
- id: BM-EXP-C135 @@ -10837,7 +10863,7 @@ sentences: is_clean: false domain: news register: polite - difficulty_tier: null + difficulty_tier: expected_errors: - error_id: BM-EXP-C135-E1 error_type: wrong_word @@ -10858,7 +10884,7 @@ sentences: is_clean: false domain: conversational register: polite - difficulty_tier: null + difficulty_tier: expected_errors: - error_id: BM-EXP-C136-E1 error_type: grammar_error @@ -10880,7 +10906,7 @@ sentences: is_clean: false domain: general register: formal - difficulty_tier: null + difficulty_tier: expected_errors: - error_id: BM-EXP-C137-E1 error_type: grammar_error @@ -10901,7 +10927,7 @@ sentences: is_clean: true domain: religious register: informal - difficulty_tier: null + difficulty_tier: expected_errors: [] notes: Clean control sentence from pali-myanmar-dictionary-corpus.txt (10 words). note: 'Re-labeled: contains formatting issues (hyphens/ASCII commas/no-space-after-fullstop/numbers-attached)' @@ -10910,7 +10936,7 @@ sentences: is_clean: false domain: literary register: formal - difficulty_tier: null + difficulty_tier: expected_errors: - error_id: BM-EXP-C139-E1 error_type: wrong_word @@ -10944,7 +10970,7 @@ sentences: is_clean: false domain: conversational register: formal - difficulty_tier: null + difficulty_tier: expected_errors: - error_id: BM-EXP-C140-E1 error_type: grammar_error @@ -10966,7 +10992,7 @@ sentences: is_clean: true domain: literary register: formal - difficulty_tier: null + difficulty_tier: expected_errors: [] notes: Clean control sentence from myanmar-literature-corpus.txt (5 words). - id: BM-EXP-C142 @@ -10974,7 +11000,7 @@ sentences: is_clean: true domain: religious register: informal - difficulty_tier: null + difficulty_tier: expected_errors: [] notes: Clean control sentence from tipitaka_myanmar_translation_books.txt (7 words). 
- id: BM-EXP-C143 @@ -10982,7 +11008,7 @@ sentences: is_clean: true domain: technical register: formal - difficulty_tier: null + difficulty_tier: expected_errors: [] notes: Clean control sentence from huggingface_myanmar_english_translation.txt (14 words). - id: BM-EXP-C144 @@ -10990,7 +11016,7 @@ sentences: is_clean: true domain: academic register: informal - difficulty_tier: null + difficulty_tier: expected_errors: [] notes: Input fixed (Zawgyi U+102C→U+102B) by Pro 2026-04-15. The word 'ပော်' (U+101D U+102C U+103D) is incorrect. It should be 'ပေါ်' (U+101D U+102B U+103D), which means 'on' or 'appear'. This is a common Zawgyi. The word 'ပော်' (U+101D U+102C U+103D) is incorrect. It should be 'ပေါ်' (U+101D U+102B U+103D), which means 'on' or 'appear'. This is a common Zawgyi. Clean control sentence from myanmar-xnli_sentence2_my.txt (5 words). - id: BM-EXP-C145 @@ -10998,7 +11024,7 @@ sentences: is_clean: true domain: news register: informal - difficulty_tier: null + difficulty_tier: expected_errors: [] notes: Clean control sentence from myanmar-c4-dataset.txt (8 words). - id: BM-EXP-C146 @@ -11006,7 +11032,7 @@ sentences: is_clean: true domain: religious register: informal - difficulty_tier: null + difficulty_tier: expected_errors: [] notes: Clean control sentence from tipitaka_myanmar_translation_books.txt (6 words). - id: BM-EXP-C147 @@ -11014,7 +11040,7 @@ sentences: is_clean: true domain: literary register: formal - difficulty_tier: null + difficulty_tier: expected_errors: [] notes: Clean control sentence from myanmar-literature-corpus.txt (20 words). - id: BM-EXP-C148 @@ -11022,7 +11048,7 @@ sentences: is_clean: true domain: conversational register: formal - difficulty_tier: null + difficulty_tier: expected_errors: [] notes: Clean control sentence from myanmar_spoken_corpus.txt (6 words). 
- id: BM-EXP-C149 @@ -11030,7 +11056,7 @@ sentences: is_clean: true domain: news register: polite - difficulty_tier: null + difficulty_tier: expected_errors: [] notes: Input fixed (Zawgyi U+102C→U+102B) by Pro 2026-04-15. Marked clean during benchmark audit 2026-04-15. - id: BM-EXP-C150 @@ -11038,7 +11064,7 @@ sentences: is_clean: true domain: general register: informal - difficulty_tier: null + difficulty_tier: expected_errors: [] notes: Clean control sentence from myanmar-cc100-dataset.txt (9 words). - id: BM-EXP-C151 @@ -11046,7 +11072,7 @@ sentences: is_clean: false domain: general register: colloquial - difficulty_tier: null + difficulty_tier: expected_errors: - error_id: BM-EXP-C151-E1 error_type: wrong_word @@ -11067,7 +11093,7 @@ sentences: is_clean: true domain: technical register: colloquial - difficulty_tier: null + difficulty_tier: expected_errors: [] notes: Clean control sentence from myanmar-fineweb-2-dataset-filtered.txt (17 words). - id: BM-EXP-C153 @@ -11075,7 +11101,7 @@ sentences: is_clean: false domain: technical register: formal - difficulty_tier: null + difficulty_tier: expected_errors: - error_id: BM-EXP-C153-E1 error_type: wrong_word @@ -11090,6 +11116,19 @@ sentences: context_required: false notes: Added by Gemini 3.1 Pro arbiter 2026-04-15. domain: spelling + - error_id: BM-EXP-C153-E2 + error_type: wrong_word + error_subtype: zawgyi_conversion_error + detection_layer: word + span: + start: 221 + end: 231 + erroneous_text: ဒိန်းဒောင် + gold_correction: ဒိန်းဒေါင် + edit_distance: 1 + context_required: false + notes: 'Added by aw-vowel unmask audit 2026-06-07 (unmask-02): generator-planted typo left unannotated because the pre-lookup normalizer silently repaired it before detection; raw form is corpus freq-0.' + domain: spelling notes: Input fixed (Zawgyi U+102C→U+102B) by Pro 2026-04-15. Marked clean during benchmark audit 2026-04-15. 
note: 'Re-labeled: contains formatting issues (hyphens/ASCII commas/no-space-after-fullstop/numbers-attached)' - id: BM-EXP-C154 @@ -11097,7 +11136,7 @@ sentences: is_clean: true domain: literary register: formal - difficulty_tier: null + difficulty_tier: expected_errors: [] notes: Clean control sentence from myanmar-literature-corpus.txt (4 words). - id: BM-EXP-C155 @@ -11105,7 +11144,7 @@ sentences: is_clean: false domain: academic register: formal - difficulty_tier: null + difficulty_tier: expected_errors: - error_id: BM-EXP-C155-E1 error_type: spacing @@ -11139,7 +11178,7 @@ sentences: is_clean: false domain: conversational register: formal - difficulty_tier: null + difficulty_tier: expected_errors: - error_id: BM-EXP-C156-E1 error_type: spacing @@ -11160,7 +11199,7 @@ sentences: is_clean: true domain: academic register: informal - difficulty_tier: null + difficulty_tier: expected_errors: [] notes: Marked clean during benchmark audit 2026-04-15. - id: BM-EXP-C158 @@ -11168,7 +11207,7 @@ sentences: is_clean: true domain: general register: colloquial - difficulty_tier: null + difficulty_tier: expected_errors: [] notes: Clean control sentence from myanmar-culturax-dataset.txt (32 words). - id: BM-EXP-C159 @@ -11176,7 +11215,7 @@ sentences: is_clean: true domain: religious register: informal - difficulty_tier: null + difficulty_tier: expected_errors: [] notes: Clean control sentence from tipitaka_myanmar_translation_books.txt (22 words). - id: BM-EXP-C160 @@ -11184,7 +11223,7 @@ sentences: is_clean: true domain: religious register: informal - difficulty_tier: null + difficulty_tier: expected_errors: [] notes: Clean control sentence from tipitaka_myanmar_translation_books.txt (4 words). - id: BM-EXP-C161 @@ -11192,7 +11231,7 @@ sentences: is_clean: true domain: academic register: colloquial - difficulty_tier: null + difficulty_tier: expected_errors: [] notes: Clean control sentence from myanmar-xnli_sentence1_my.txt (6 words). 
- id: BM-EXP-C162 @@ -11200,7 +11239,7 @@ sentences: is_clean: false domain: technical register: informal - difficulty_tier: null + difficulty_tier: expected_errors: - error_id: BM-EXP-C162-E1 error_type: wrong_word @@ -11235,7 +11274,7 @@ sentences: is_clean: false domain: general register: informal - difficulty_tier: null + difficulty_tier: expected_errors: - error_id: BM-EXP-C163-E1 error_type: grammar @@ -11271,7 +11310,7 @@ sentences: is_clean: true domain: technical register: informal - difficulty_tier: null + difficulty_tier: expected_errors: [] notes: Clean control sentence from huggingface_myanmar_english_translation.txt (5 words). - id: BM-EXP-C165 @@ -11279,7 +11318,7 @@ sentences: is_clean: true domain: general register: informal - difficulty_tier: null + difficulty_tier: expected_errors: [] notes: Clean control sentence from myanmar-culturax-dataset.txt (16 words). - id: BM-EXP-C166 @@ -11287,7 +11326,7 @@ sentences: is_clean: true domain: literary register: formal - difficulty_tier: null + difficulty_tier: expected_errors: [] notes: Clean control sentence from myanmar-literature-corpus.txt (5 words). - id: BM-EXP-C167 @@ -11295,7 +11334,7 @@ sentences: is_clean: false domain: technical register: formal - difficulty_tier: null + difficulty_tier: expected_errors: - error_id: BM-EXP-C167-E1 error_type: orthography @@ -11368,7 +11407,7 @@ sentences: is_clean: false domain: academic register: formal - difficulty_tier: null + difficulty_tier: expected_errors: - error_id: BM-EXP-C168-E1 error_type: orthography @@ -11389,7 +11428,7 @@ sentences: is_clean: false domain: news register: formal - difficulty_tier: null + difficulty_tier: expected_errors: - error_id: BM-EXP-C169-E1 error_type: grammar @@ -11411,7 +11450,7 @@ sentences: is_clean: true domain: conversational register: formal - difficulty_tier: null + difficulty_tier: expected_errors: [] notes: Clean control sentence from URajinda-myanmar_spoken_corpus.txt (7 words). 
- id: BM-EXP-C171 @@ -11419,7 +11458,7 @@ sentences: is_clean: true domain: academic register: formal - difficulty_tier: null + difficulty_tier: expected_errors: [] notes: Clean control sentence from myanmar-wikipedia-dataset.txt (27 words). - id: BM-EXP-C172 @@ -11427,7 +11466,7 @@ sentences: is_clean: true domain: conversational register: formal - difficulty_tier: null + difficulty_tier: expected_errors: [] notes: Clean control sentence from URajinda-myanmar_spoken_corpus.txt (16 words). - id: BM-EXP-C173 @@ -11435,7 +11474,7 @@ sentences: is_clean: true domain: academic register: formal - difficulty_tier: null + difficulty_tier: expected_errors: [] notes: Clean control sentence from myanmar-xnli_sentence2_my.txt (4 words). - id: BM-EXP-C174 @@ -11443,7 +11482,7 @@ sentences: is_clean: false domain: news register: informal - difficulty_tier: null + difficulty_tier: expected_errors: - error_id: BM-EXP-C174-E1 error_type: orthography @@ -11529,7 +11568,7 @@ sentences: is_clean: true domain: conversational register: informal - difficulty_tier: null + difficulty_tier: expected_errors: [] notes: Clean control sentence from myanmar_spoken_corpus.txt (5 words). - id: BM-EXP-C176 @@ -11537,7 +11576,7 @@ sentences: is_clean: false domain: academic register: polite - difficulty_tier: null + difficulty_tier: expected_errors: - error_id: BM-EXP-C176-E1 error_type: invalid_syllable @@ -11623,7 +11662,7 @@ sentences: is_clean: false domain: technical register: formal - difficulty_tier: null + difficulty_tier: expected_errors: - error_id: BM-EXP-C177-E1 error_type: spelling @@ -11638,13 +11677,26 @@ sentences: context_required: false notes: Added by Gemini 3.1 Flash-Lite native review 2026-04-15. 
domain: spelling + - error_id: BM-EXP-C177-E2 + error_type: wrong_word + error_subtype: zawgyi_conversion_error + detection_layer: word + span: + start: 68 + end: 87 + erroneous_text: ဒော်အောင်ဆန်းစုကြည် + gold_correction: ဒေါ်အောင်ဆန်းစုကြည် + edit_distance: 1 + context_required: false + notes: 'Added by aw-vowel unmask audit 2026-06-07 (unmask-02): generator-planted typo left unannotated because the pre-lookup normalizer silently repaired it before detection; raw form is corpus freq-0.' + domain: spelling notes: Clean control sentence from myanmar-fineweb-2-dataset-filtered.txt (12 words). - id: BM-EXP-C178 input: ကိုသက်ထွန်း --- ဘာကြောင့်လဲဆိုတော့ အဲဒီအချိန်မှာ တာဝန်ယူနေတဲ့ စစ်တပ်တွေပါ လှည့်တယ်။ တွေ့တဲ့လူ၊ သူတို့ မသင်္ကာဘူးဆိုရင် အကုန်ဖမ်းသွားတာ။ သူတို့က အမှားအမှန်ကို ရှင်းလင်းတာ။ အဲဒီအတွက် အရင်ဆုံး သူတို့ မကျေနပ်တာ။ သို့တည်းမဟုတ် မသင်္ကာတာ တွေ့သမျှကို အကုန်ဖမ်းတယ်။ ဥပဒေအရ လုံခြုံမှုတော့ ကျနော်တို့ လက်ထဲမှာ မရှိဘူး။ is_clean: true domain: news register: colloquial - difficulty_tier: null + difficulty_tier: expected_errors: [] notes: Clean control sentence from myanmar-c4-dataset.txt (26 words). - id: BM-EXP-C179 @@ -11652,7 +11704,7 @@ sentences: is_clean: true domain: general register: polite - difficulty_tier: null + difficulty_tier: expected_errors: [] notes: Input fixed (Zawgyi U+102C→U+102B) by Pro 2026-04-15. The word 'ပောက်' in 'ပြတင်း ပောက်' should be 'ပေါက်' (U+102B). This is a common Zawgyi-to-Unicode conversion error.. The word 'သဘောပောက်' should be 'သဘောပေါက်' (U+102B). This is a common Zawgyi-to-Unicode conversion error.. The word 'ပော်' should be 'ပေါ်' (U+102B). This is a common Zawgyi-to-Unicode conversion error.. The word 'ပောက်' in 'ပြတင်း ပောက်' should be 'ပေါက်' (U+102B). This is a common Zawgyi-to-Unicode conversion error.. The word 'သဘောပောက်' should be 'သဘောပေါက်' (U+102B). This is a common Zawgyi-to-Unicode conversion error.. The word 'ပော်' should be 'ပေါ်' (U+102B). This is a common Zawgyi-to-Unicode conversion error.. 
Clean control sentence from myanmar-culturax-dataset.txt (48 words). - id: BM-EXP-C180 @@ -11660,7 +11712,7 @@ sentences: is_clean: true domain: religious register: informal - difficulty_tier: null + difficulty_tier: expected_errors: [] notes: Clean control sentence from tipitaka_myanmar_translation_books.txt (14 words). - id: BM-EXP-C181 @@ -11668,7 +11720,7 @@ sentences: is_clean: true domain: religious register: informal - difficulty_tier: null + difficulty_tier: expected_errors: [] notes: Clean control sentence from tipitaka_myanmar_translation_books.txt (27 words). note: 'Re-labeled: contains formatting issues (hyphens/ASCII commas/no-space-after-fullstop/numbers-attached)' @@ -11677,7 +11729,7 @@ sentences: is_clean: true domain: news register: colloquial - difficulty_tier: null + difficulty_tier: expected_errors: [] notes: Clean control sentence from myanmar-c4-dataset.txt (30 words). note: 'Re-labeled: contains formatting issues (hyphens/ASCII commas/no-space-after-fullstop/numbers-attached)' @@ -11686,7 +11738,7 @@ sentences: is_clean: true domain: technical register: formal - difficulty_tier: null + difficulty_tier: expected_errors: [] notes: Clean control sentence from huggingface_myanmar_english_translation.txt (9 words). - id: BM-EXP-C184 @@ -11694,7 +11746,7 @@ sentences: is_clean: true domain: academic register: informal - difficulty_tier: null + difficulty_tier: expected_errors: [] notes: Clean control sentence from myanmar-xnli_sentence1_my.txt (44 words). 
- id: BM-EXP-C185 @@ -11702,7 +11754,7 @@ sentences: is_clean: false domain: news register: formal - difficulty_tier: null + difficulty_tier: expected_errors: - error_id: BM-EXP-C185-E1 error_type: spacing @@ -11775,7 +11827,7 @@ sentences: is_clean: false domain: academic register: formal - difficulty_tier: null + difficulty_tier: expected_errors: - error_id: BM-EXP-C186-E1 error_type: grammar @@ -11868,7 +11920,7 @@ sentences: is_clean: true domain: literary register: formal - difficulty_tier: null + difficulty_tier: expected_errors: [] notes: 'Input fixed (Zawgyi U+102C→U+102B) by Pro 2026-04-15. The word ''ခော်သွားကာ'' should be ''ခေါ်သွားကာ'' (U+102B). This is a critical Zawgyi-to-Unicode conversion error.. The word ''ခော်သွားကာ'' should be ''ခေါ်သွားကာ'' (U+102B). This is a critical Zawgyi-to-Unicode conversion error.. Audit 2026-04-15: has unannotated error(s). Gemini: Spacing error: ''ရောက်ရှိ လာသော'' should be ''ရောက်ရှိလာသော''.. Needs manual error annotation.' - id: BM-EXP-C188 @@ -11876,7 +11928,7 @@ sentences: is_clean: true domain: general register: informal - difficulty_tier: null + difficulty_tier: expected_errors: [] notes: Clean control sentence from myanmar-cc100-dataset.txt (7 words). - id: BM-EXP-C189 @@ -11884,7 +11936,7 @@ sentences: is_clean: false domain: academic register: informal - difficulty_tier: null + difficulty_tier: expected_errors: - error_id: BM-EXP-C189-E1 error_type: wrong_word @@ -11931,7 +11983,7 @@ sentences: is_clean: false domain: literary register: informal - difficulty_tier: null + difficulty_tier: expected_errors: - error_id: BM-EXP-C190-E1 error_type: wrong_word @@ -11952,7 +12004,7 @@ sentences: is_clean: true domain: literary register: formal - difficulty_tier: null + difficulty_tier: expected_errors: [] notes: Clean control sentence from myanmar-literature-corpus.txt (11 words). 
- id: BM-EXP-C192 @@ -11960,7 +12012,7 @@ sentences: is_clean: true domain: literary register: formal - difficulty_tier: null + difficulty_tier: expected_errors: [] notes: Clean control sentence from myanmar-literature-corpus.txt (9 words). - id: BM-EXP-C193 @@ -11968,7 +12020,7 @@ sentences: is_clean: true domain: conversational register: polite - difficulty_tier: null + difficulty_tier: expected_errors: [] notes: Clean control sentence from URajinda-myanmar_spoken_corpus.txt (12 words). - id: BM-EXP-C194 @@ -11976,7 +12028,7 @@ sentences: is_clean: true domain: religious register: informal - difficulty_tier: null + difficulty_tier: expected_errors: [] notes: Clean control sentence from tipitaka_myanmar_translation_books.txt (9 words). - id: BM-EXP-C195 @@ -11984,7 +12036,7 @@ sentences: is_clean: true domain: conversational register: formal - difficulty_tier: null + difficulty_tier: expected_errors: [] notes: Input fixed (Zawgyi U+102C→U+102B) by Pro 2026-04-15. Sentence is marked as clean but contains U+102C instead of U+102B, which is a common Zawgyi conversion error.. Sentence is marked as clean but contains U+102C instead of U+102B, which is a common Zawgyi conversion error.. Clean control sentence from myanmar_qna_dataset.txt (28 words). - id: BM-EXP-C196 @@ -11992,7 +12044,7 @@ sentences: is_clean: false domain: academic register: informal - difficulty_tier: null + difficulty_tier: expected_errors: - error_id: BM-EXP-C196-E1 error_type: grammar @@ -12028,7 +12080,7 @@ sentences: is_clean: true domain: conversational register: formal - difficulty_tier: null + difficulty_tier: expected_errors: [] notes: Clean control sentence from URajinda-myanmar_spoken_corpus.txt (23 words). - id: BM-EXP-C198 @@ -12036,7 +12088,7 @@ sentences: is_clean: true domain: conversational register: colloquial - difficulty_tier: null + difficulty_tier: expected_errors: [] notes: Clean control sentence from myanmar_spoken_corpus.txt (5 words). 
- id: BM-EXP-C199 @@ -12044,7 +12096,7 @@ sentences: is_clean: true domain: academic register: polite - difficulty_tier: null + difficulty_tier: expected_errors: [] notes: 'Input fixed (Zawgyi U+102C→U+102B) by Pro 2026-04-15. The word ''ပောင်း'' (U+1015 U+102C U+103B U+1004) uses U+102C (ာ) where U+102B (ါ) is required for the correct pronunciation and meaning (''ပေါင်း'' - to . The word ''ပောင်း'' (U+1015 U+102C U+103B U+1004) uses U+102C (ာ) where U+102B (ါ) is required for the correct pronunciation and meaning (''ပေါင်း'' - to . Audit 2026-04-15: has unannotated error(s). Gemini: The word ''ပောင်း'' (U+1015 U+102C U+103B U+1004) is incorrect. It should be ''ပေါင်း'' (U+1015 U+102B U+103B U+1004) with U+102B (ါ) for the ''aw'' sound, . Needs manual error annotation.' - id: BM-EXP-C200 @@ -12052,7 +12104,7 @@ sentences: is_clean: true domain: religious register: informal - difficulty_tier: null + difficulty_tier: expected_errors: [] notes: Clean control sentence from tipitaka_myanmar_translation_books.txt (35 words). - id: BM-EXP-C201 @@ -12060,7 +12112,7 @@ sentences: is_clean: true domain: technical register: polite - difficulty_tier: null + difficulty_tier: expected_errors: [] notes: Clean control sentence from huggingface_myanmar_english_translation.txt (14 words). - id: BM-EXP-C202 @@ -12068,7 +12120,7 @@ sentences: is_clean: false domain: technical register: formal - difficulty_tier: null + difficulty_tier: expected_errors: - error_id: BM-EXP-C202-E1 error_type: semantic_error @@ -12090,7 +12142,7 @@ sentences: is_clean: false domain: news register: formal - difficulty_tier: null + difficulty_tier: expected_errors: - error_id: BM-EXP-C203-E1 error_type: semantics @@ -12112,7 +12164,7 @@ sentences: is_clean: true domain: news register: formal - difficulty_tier: null + difficulty_tier: expected_errors: [] notes: 'Input fixed (Zawgyi U+102C→U+102B) by Pro 2026-04-15. 
The word ''ခော်ဝော်'' (U+101E U+102C U+103B U+101D U+102C U+103B) incorrectly uses U+102C (ာ) twice where U+102B (ါ) is required for the correct pronunc. The word ''ခော်ဝော်'' (U+101E U+102C U+103B U+101D U+102C U+103B) incorrectly uses U+102C (ာ) twice where U+102B (ါ) is required for the correct pronunc. Audit 2026-04-15: has unannotated error(s). Gemini: The word ''ခော်ဝော်'' (U+101E U+102C U+103B U+101D U+102C U+103B) is misspelled. It should be ''ခေါ်ဝေါ်'' (U+101E U+102B U+103B U+101D U+102B U+103B), me. Needs manual error annotation.' - id: BM-EXP-C205 @@ -12120,7 +12172,7 @@ sentences: is_clean: false domain: conversational register: formal - difficulty_tier: null + difficulty_tier: expected_errors: - error_id: BM-EXP-C205-E1 error_type: semantic_error @@ -12156,7 +12208,7 @@ sentences: is_clean: true domain: academic register: informal - difficulty_tier: null + difficulty_tier: expected_errors: [] notes: 'Input fixed (Zawgyi U+102C→U+102B) by Pro 2026-04-15. The word ''စင်ပော်'' (U+1021 U+1004 U+103B U+1015 U+102C U+103B) likely uses U+102C (ာ) where U+102B (ါ) is required for the correct word ''စင်ပေါ်'' (pla. The word ''စင်ပော်'' (U+1021 U+1004 U+103B U+1015 U+102C U+103B) likely uses U+102C (ာ) where U+102B (ါ) is required for the correct word ''စင်ပေါ်'' (pla. Audit 2026-04-15: has unannotated error(s). Gemini: The sentence is incomplete. It ends abruptly with ''လွှသွားများ'' (saw blades) and lacks a verb or predicate to form a complete thought.. Needs manual error annotation.' - id: BM-EXP-C207 @@ -12164,7 +12216,7 @@ sentences: is_clean: true domain: news register: informal - difficulty_tier: null + difficulty_tier: expected_errors: [] notes: Clean control sentence from myanmar-c4-dataset.txt (32 words). 
- id: BM-EXP-C208 @@ -12172,7 +12224,7 @@ sentences: is_clean: true domain: conversational register: colloquial - difficulty_tier: null + difficulty_tier: expected_errors: [] notes: Clean control sentence from alpaca_myanmar_burmese_taco.txt (30 words). - id: BM-EXP-C209 @@ -12180,7 +12232,7 @@ sentences: is_clean: true domain: conversational register: formal - difficulty_tier: null + difficulty_tier: expected_errors: [] notes: Clean control sentence from URajinda-myanmar_spoken_corpus.txt (17 words). - id: BM-EXP-C210 @@ -12188,7 +12240,7 @@ sentences: is_clean: true domain: news register: polite - difficulty_tier: null + difficulty_tier: expected_errors: [] notes: Clean control sentence from myanmar-c4-dataset.txt (17 words). - id: BM-EXP-C211 @@ -12196,7 +12248,7 @@ sentences: is_clean: true domain: conversational register: informal - difficulty_tier: null + difficulty_tier: expected_errors: [] notes: Clean control sentence from URajinda-myanmar_spoken_corpus.txt (12 words). - id: BM-EXP-C212 @@ -12204,7 +12256,7 @@ sentences: is_clean: true domain: news register: polite - difficulty_tier: null + difficulty_tier: expected_errors: [] notes: Clean control sentence from myanmar-c4-dataset.txt (43 words). - id: BM-EXP-C213 @@ -12212,7 +12264,7 @@ sentences: is_clean: true domain: news register: informal - difficulty_tier: null + difficulty_tier: expected_errors: [] notes: Input fixed (Zawgyi U+102C→U+102B) by Pro 2026-04-15. The word 'အပော်' (a paw) is incorrectly spelled using U+102C (ာ) instead of U+102B (ါ). It should be 'အပေါ်'. This is a common Zawgyi-to-Unicode conve. The word 'ပော်ထွက်' (paw htwet) is incorrectly spelled using U+102C (ာ) instead of U+102B (ါ). It should be 'ပေါ်ထွက်'. This is a common Zawgyi-to-Uni. The word 'အပော်' (a paw) is incorrectly spelled using U+102C (ာ) instead of U+102B (ါ). It should be 'အပေါ်'. This is a common Zawgyi-to-Unicode conve. The word 'ပော်ထွက်' (paw htwet) is incorrectly spelled using U+102C (ာ) instead of U+102B (ါ). 
It should be 'ပေါ်ထွက်'. This is a common Zawgyi-to-Uni. Clean control sentence from myanmar-c4-dataset.txt (44 words). note: 'Re-labeled: contains formatting issues (hyphens/ASCII commas/no-space-after-fullstop/numbers-attached)' @@ -12221,7 +12273,7 @@ sentences: is_clean: true domain: religious register: informal - difficulty_tier: null + difficulty_tier: expected_errors: [] notes: Input fixed (Zawgyi U+102C→U+102B) by Pro 2026-04-15. The word 'မပော်' (ma paw) is incorrectly spelled using U+102C (ာ) instead of U+102B (ါ). It should be 'မပေါ်'. This is a common Zawgyi-to-Unicode conv. The word 'မပော်' (ma paw) is incorrectly spelled using U+102C (ာ) instead of U+102B (ါ). It should be 'မပေါ်'. This is a common Zawgyi-to-Unicode conv. Clean control sentence from tipitaka_myanmar_translation_books.txt (29 words). note: 'Re-labeled: contains formatting issues (hyphens/ASCII commas/no-space-after-fullstop/numbers-attached)' @@ -12230,7 +12282,7 @@ sentences: is_clean: true domain: technical register: polite - difficulty_tier: null + difficulty_tier: expected_errors: [] notes: Clean control sentence from myanmar-fineweb-2-dataset-filtered.txt (11 words). - id: BM-EXP-C216 @@ -12238,7 +12290,7 @@ sentences: is_clean: true domain: conversational register: formal - difficulty_tier: null + difficulty_tier: expected_errors: [] notes: Clean control sentence from URajinda-myanmar_spoken_corpus.txt (27 words). - id: BM-EXP-C217 @@ -12246,7 +12298,7 @@ sentences: is_clean: true domain: news register: informal - difficulty_tier: null + difficulty_tier: expected_errors: [] notes: Clean control sentence from myanmar-c4-dataset.txt (17 words). - id: BM-EXP-C218 @@ -12254,7 +12306,7 @@ sentences: is_clean: true domain: religious register: informal - difficulty_tier: null + difficulty_tier: expected_errors: [] notes: Clean control sentence from pali-myanmar-dictionary-corpus.txt (5 words). 
note: 'Re-labeled: contains formatting issues (hyphens/ASCII commas/no-space-after-fullstop/numbers-attached)' @@ -12263,7 +12315,7 @@ sentences: is_clean: false domain: academic register: polite - difficulty_tier: null + difficulty_tier: expected_errors: - error_id: BM-EXP-C219-E1 error_type: wrong_word @@ -12310,7 +12362,7 @@ sentences: is_clean: true domain: technical register: informal - difficulty_tier: null + difficulty_tier: expected_errors: [] notes: Clean control sentence from huggingface_myanmar_english_translation.txt (13 words). - id: BM-EXP-C221 @@ -12318,7 +12370,7 @@ sentences: is_clean: false domain: literary register: formal - difficulty_tier: null + difficulty_tier: expected_errors: - error_id: BM-EXP-C221-E1 error_type: grammar @@ -12340,7 +12392,7 @@ sentences: is_clean: false domain: general register: informal - difficulty_tier: null + difficulty_tier: expected_errors: - error_id: BM-EXP-C222-E1 error_type: grammar_error @@ -12402,7 +12454,7 @@ sentences: is_clean: true domain: conversational register: formal - difficulty_tier: null + difficulty_tier: expected_errors: [] notes: Clean control sentence from myanmar_spoken_corpus.txt (5 words). - id: BM-EXP-C224 @@ -12410,7 +12462,7 @@ sentences: is_clean: true domain: general register: informal - difficulty_tier: null + difficulty_tier: expected_errors: [] notes: 'Pro-confirmed clean 2026-04-15. The sentence is truncated at the end (''ရှေ့ဆုံး''), which is a corpus extraction issue rather than a ' - id: BM-EXP-C225 @@ -12418,7 +12470,7 @@ sentences: is_clean: true domain: religious register: informal - difficulty_tier: null + difficulty_tier: expected_errors: [] notes: Clean control sentence from tipitaka_myanmar_translation_books.txt (4 words). - id: BM-EXP-C226 @@ -12426,7 +12478,7 @@ sentences: is_clean: true domain: religious register: informal - difficulty_tier: null + difficulty_tier: expected_errors: [] notes: Clean control sentence from tipitaka_myanmar_translation_books.txt (7 words). 
note: 'Re-labeled: pure Pali/Sanskrit verse, not Myanmar language' @@ -12435,7 +12487,7 @@ sentences: is_clean: true domain: news register: formal - difficulty_tier: null + difficulty_tier: expected_errors: [] notes: Clean control sentence from myanmar-c4-dataset.txt (5 words). - id: BM-EXP-C228 @@ -12443,7 +12495,7 @@ sentences: is_clean: true domain: literary register: colloquial - difficulty_tier: null + difficulty_tier: expected_errors: [] notes: Clean control sentence from myanmar-literature-corpus.txt (6 words). - id: BM-EXP-C229 @@ -12451,7 +12503,7 @@ sentences: is_clean: true domain: academic register: formal - difficulty_tier: null + difficulty_tier: expected_errors: [] notes: Clean control sentence from myanmar-wikipedia-dataset.txt (27 words). - id: BM-EXP-C230 @@ -12459,7 +12511,7 @@ sentences: is_clean: true domain: religious register: informal - difficulty_tier: null + difficulty_tier: expected_errors: [] notes: Clean control sentence from tipitaka_myanmar_translation_books.txt (29 words). - id: BM-EXP-C231 @@ -12467,7 +12519,7 @@ sentences: is_clean: true domain: religious register: informal - difficulty_tier: null + difficulty_tier: expected_errors: [] notes: Clean control sentence from tipitaka_myanmar_translation_books.txt (12 words). note: 'Re-labeled: contains formatting issues (hyphens/ASCII commas/no-space-after-fullstop/numbers-attached)' @@ -12476,7 +12528,7 @@ sentences: is_clean: true domain: conversational register: colloquial - difficulty_tier: null + difficulty_tier: expected_errors: [] notes: Clean control sentence from URajinda-myanmar_spoken_corpus.txt (9 words). - id: BM-EXP-C233 @@ -12484,7 +12536,7 @@ sentences: is_clean: true domain: conversational register: polite - difficulty_tier: null + difficulty_tier: expected_errors: [] notes: Clean control sentence from URajinda-myanmar_spoken_corpus.txt (19 words). 
- id: BM-EXP-C234 @@ -12492,7 +12544,7 @@ sentences: is_clean: true domain: news register: informal - difficulty_tier: null + difficulty_tier: expected_errors: [] notes: Clean control sentence from myanmar-written-corpus.txt (8 words). - id: BM-EXP-E001 @@ -12578,6 +12630,45 @@ sentences: context_required: true notes: Injected broken_compound error. domain: spelling + - error_id: BM-EXP-E004-E2 + error_type: wrong_word + error_subtype: zawgyi_conversion_error + detection_layer: word + span: + start: 113 + end: 118 + erroneous_text: အပော် + gold_correction: အပေါ် + edit_distance: 1 + context_required: false + notes: 'Added by aw-vowel unmask audit 2026-06-07 (unmask-02): generator-planted typo left unannotated because the pre-lookup normalizer silently repaired it before detection; raw form is corpus freq-0.' + domain: spelling + - error_id: BM-EXP-E004-E4 + error_type: wrong_word + error_subtype: zawgyi_conversion_error + detection_layer: word + span: + start: 139 + end: 150 + erroneous_text: ခောင်းဆောင် + gold_correction: ခေါင်းဆောင် + edit_distance: 1 + context_required: false + notes: 'Added by aw-vowel unmask audit 2026-06-07 (unmask-02): generator-planted typo left unannotated because the pre-lookup normalizer silently repaired it before detection; raw form is corpus freq-0.' + domain: spelling + - error_id: BM-EXP-E004-E6 + error_type: wrong_word + error_subtype: zawgyi_conversion_error + detection_layer: word + span: + start: 348 + end: 357 + erroneous_text: ပော်ပောက် + gold_correction: ပေါ်ပေါက် + edit_distance: 1 + context_required: false + notes: 'Added by aw-vowel unmask audit 2026-06-07 (unmask-02): generator-planted typo left unannotated because the pre-lookup normalizer silently repaired it before detection; raw form is corpus freq-0.' + domain: spelling notes: Generated error sentence with 1 broken_compound error. 
- id: BM-EXP-E005 input: ဌာနနှင့် ဌာနီကို မခွဲခြားဘဲ ဆိုခြင်း၊ ဌာန၏အမည်ကို ဌာနီ၌ ထင်စား၍ ဆိုခြင်း။ @@ -12691,6 +12782,19 @@ sentences: context_required: false notes: Injected vowel_medial_substitution error. domain: spelling + - error_id: BM-EXP-E010-E2 + error_type: wrong_word + error_subtype: zawgyi_conversion_error + detection_layer: word + span: + start: 237 + end: 242 + erroneous_text: ပောက် + gold_correction: ပေါက် + edit_distance: 1 + context_required: false + notes: 'Added by aw-vowel unmask audit 2026-06-07 (unmask-02): generator-planted typo left unannotated because the pre-lookup normalizer silently repaired it before detection; raw form is corpus freq-0.' + domain: spelling notes: Generated error sentence with 1 vowel_medial_substitution error. - id: BM-EXP-E011 input: 'သိန်းကျင်းရွာ (အင်္ဂလိပ်: Theinkyin)သည် မကွေးတိုင်းဒေသကြီး၊ ဂန့်ဂောခရိုင်၊ ဆောမြို့နယ်၊ ဆီမီးကျေးရွာအုပ်စု၌ တည်ရှိသည်။ ရွာနေရာကုတ်မှာ ဖြစ်သည်။ သန်းခောင်စာရင်းအရ ဆီမီးကျေးရွာအုပ်စုတွင် ကျား ဦး၊ မ ဦး၊ လူဦရေ စုစုပောင်း ဦးနေထိုင်သည်။' @@ -12712,6 +12816,32 @@ sentences: context_required: false notes: Injected missing_visarga error. domain: spelling + - error_id: BM-EXP-E011-E2 + error_type: wrong_word + error_subtype: zawgyi_conversion_error + detection_layer: word + span: + start: 143 + end: 158 + erroneous_text: သန်းခောင်စာရင်း + gold_correction: သန်းခေါင်စာရင်း + edit_distance: 1 + context_required: false + notes: 'Added by aw-vowel unmask audit 2026-06-07 (unmask-02): generator-planted typo left unannotated because the pre-lookup normalizer silently repaired it before detection; raw form is corpus freq-0.' 
+ domain: spelling + - error_id: BM-EXP-E011-E4 + error_type: wrong_word + error_subtype: zawgyi_conversion_error + detection_layer: word + span: + start: 205 + end: 215 + erroneous_text: စုစုပောင်း + gold_correction: စုစုပေါင်း + edit_distance: 1 + context_required: false + notes: 'Added by aw-vowel unmask audit 2026-06-07 (unmask-02): generator-planted typo left unannotated because the pre-lookup normalizer silently repaired it before detection; raw form is corpus freq-0.' + domain: spelling notes: Generated error sentence with 1 missing_visarga error. - id: BM-EXP-E013 input: တောင်တွင်းကြီးမြို့နယ်မှာ စစ်တပ်က အာဏာသိမ်းပြီးနောက် စစ်ကောင်စီတပ်တွေရဲ့ ရိုက်နှက်ပစ်ခတ် သတ်ဖြတ်မှုကြောင့် ဒေသခံပြည်သူ ခုနစ်ဦးအထိ သေဆုံးခဲ့ရတယ်လို့ ဒေသခံတွေရဲ့ မတ်တမ်းတွေအရ သိရပါတယ်။ @@ -12817,6 +12947,19 @@ sentences: context_required: false notes: Injected compound_confusion error. domain: spelling + - error_id: BM-EXP-E017-E2 + error_type: wrong_word + error_subtype: zawgyi_conversion_error + detection_layer: word + span: + start: 215 + end: 225 + erroneous_text: ပောင်းစည်း + gold_correction: ပေါင်းစည်း + edit_distance: 1 + context_required: false + notes: 'Added by aw-vowel unmask audit 2026-06-07 (unmask-02): generator-planted typo left unannotated because the pre-lookup normalizer silently repaired it before detection; raw form is corpus freq-0.' + domain: spelling notes: Generated error sentence with 1 compound_confusion error. - id: BM-EXP-E018 input: ၅၂၉။ သကဝါဒီ။ ကုသိုလ်စိတ်သည် ဖြစ်စေအပ်သော ကာယကံရု ပ်သည် အာရုံမရှိသော ကုသိုလ်လော။ @@ -12880,6 +13023,32 @@ sentences: context_required: false notes: Injected compound_confusion error. 
domain: spelling + - error_id: BM-EXP-E020-E2 + error_type: wrong_word + error_subtype: zawgyi_conversion_error + detection_layer: word + span: + start: 134 + end: 149 + erroneous_text: သန်းခောင်စာရင်း + gold_correction: သန်းခေါင်စာရင်း + edit_distance: 1 + context_required: false + notes: 'Added by aw-vowel unmask audit 2026-06-07 (unmask-02): generator-planted typo left unannotated because the pre-lookup normalizer silently repaired it before detection; raw form is corpus freq-0.' + domain: spelling + - error_id: BM-EXP-E020-E4 + error_type: wrong_word + error_subtype: zawgyi_conversion_error + detection_layer: word + span: + start: 203 + end: 213 + erroneous_text: စုစုပောင်း + gold_correction: စုစုပေါင်း + edit_distance: 1 + context_required: false + notes: 'Added by aw-vowel unmask audit 2026-06-07 (unmask-02): generator-planted typo left unannotated because the pre-lookup normalizer silently repaired it before detection; raw form is corpus freq-0.' + domain: spelling notes: Generated error sentence with 1 compound_confusion error. - id: BM-EXP-E021 input: ကိုဘုထန်ပိုင်က "မနေ့က ညကတည်းက အင်တာနက်လိုင်းတွေတော့ ပြန်ပွင့်တယ်။ ဒါပေမယ့် လိုင်းကတော့ သိပ် မကောင်းဘူး။ ကျနော့် ဖုန်းကြောင့် ဖြစ်တာလား။ ဘယ် လို ဖွင့်ပေးတာလည်း ဆိုတာတော့ မသိတော့ဘူး" ဟု ဧရာဝတီသို့ ပြောသည်။ @@ -12922,6 +13091,19 @@ sentences: context_required: false notes: Injected aukmyit_confusion error. domain: spelling + - error_id: BM-EXP-E022-E2 + error_type: wrong_word + error_subtype: zawgyi_conversion_error + detection_layer: word + span: + start: 237 + end: 242 + erroneous_text: ပောက် + gold_correction: ပေါက် + edit_distance: 1 + context_required: false + notes: 'Added by aw-vowel unmask audit 2026-06-07 (unmask-02): generator-planted typo left unannotated because the pre-lookup normalizer silently repaired it before detection; raw form is corpus freq-0.' + domain: spelling notes: Generated error sentence with 1 aukmyit_confusion error. 
- id: BM-EXP-E023 input: ထိုအခါ သမင်မသည် အရှင် မကြောက်ပါလင့် မိမိကိုယ်စွမ်းဖြင့် မုဆိုးကို တောင်းပန်ကာ အရှင်၏အသက်ကို ချမ်းသာစေမည်၊ အကယ်၍ တောင်းပန်မရပါမူ အရှင်၏အသက်ကို ကယ်ဆယ်ပါမည် ဟု ပြော့ဆို၍ သမင်မင်းအား သက်သာရာကို ရစေပြီးလျှင် သမင်မင်းကို မိမိကိုယ်၌ မှီစေလျက် ရပ်တည်လေ၏။ @@ -12985,6 +13167,32 @@ sentences: context_required: true notes: Injected broken_compound error. domain: spelling + - error_id: BM-EXP-E025-E2 + error_type: wrong_word + error_subtype: zawgyi_conversion_error + detection_layer: word + span: + start: 144 + end: 159 + erroneous_text: သန်းခောင်စာရင်း + gold_correction: သန်းခေါင်စာရင်း + edit_distance: 1 + context_required: false + notes: 'Added by aw-vowel unmask audit 2026-06-07 (unmask-02): generator-planted typo left unannotated because the pre-lookup normalizer silently repaired it before detection; raw form is corpus freq-0.' + domain: spelling + - error_id: BM-EXP-E025-E4 + error_type: wrong_word + error_subtype: zawgyi_conversion_error + detection_layer: word + span: + start: 207 + end: 217 + erroneous_text: စုစုပောင်း + gold_correction: စုစုပေါင်း + edit_distance: 1 + context_required: false + notes: 'Added by aw-vowel unmask audit 2026-06-07 (unmask-02): generator-planted typo left unannotated because the pre-lookup normalizer silently repaired it before detection; raw form is corpus freq-0.' + domain: spelling notes: Generated error sentence with 1 broken_compound error. - id: BM-EXP-E026 input: ကဲတပည့်တို့ လောကကြီးမှာ လူတွေဝင်ငွေရရှိနိုင်တဲ့ နည်းလမ်းတွေ ဘယ်လောက်ပောများ လဲဆိုတာ အခုခင်ဗျားတို့ မြင်တွေ့ခဲ့ရပြီ၊ အဲဒီနည်းလမ်း သို့မဟုတ် လုပ်ငန်းမှန်သမျှဟာ လူတွေဆီ ကို စမ်းရေစီးသလို စီးဆင်းလာနေတဲ့ ရွှေဒင်္ဂါးတန်း ငွေဒင်္ဂါးတန်းတွေပဲ၊ အလုပ်လုပ်တဲ့ လူတိုင်းဟာ ဒီစီးကြောင်းထဲက အသပြာခို ကိုယ့်အိတ်ထဲ မြောင်းသွယ်လို့ ရသလောက် သွယ်ယူနေကြတဲ့ သဘောပါပဲ။ @@ -13006,6 +13214,19 @@ sentences: context_required: false notes: Injected consonant_substitution error. 
domain: spelling + - error_id: BM-EXP-E026-E2 + error_type: wrong_word + error_subtype: zawgyi_conversion_error + detection_layer: word + span: + start: 68 + end: 71 + erroneous_text: ပော + gold_correction: ပေါ + edit_distance: 1 + context_required: false + notes: 'Added by aw-vowel unmask audit 2026-06-07 (unmask-02): generator-planted typo left unannotated because the pre-lookup normalizer silently repaired it before detection; raw form is corpus freq-0.' + domain: spelling notes: Generated error sentence with 1 consonant_substitution error. - id: BM-EXP-E027 input: ရှမ်းပြည်နယ် တောင်ပိုင်းရှိ တောင်ကြီး-ညောင်ရွှေ ယာဉ်လိုင်းကားခများ ဆခွဲဈေးတက်သွား၍ ခရီးသည်များ ဒုက္ခရောက်နေကြသည်ဟု ဒေသခံများက ပြော့သည်။ @@ -13170,6 +13391,19 @@ sentences: context_required: false notes: Injected consonant_substitution error. domain: spelling + - error_id: BM-EXP-E035-E2 + error_type: wrong_word + error_subtype: zawgyi_conversion_error + detection_layer: word + span: + start: 143 + end: 158 + erroneous_text: သန်းခောင်စာရင်း + gold_correction: သန်းခေါင်စာရင်း + edit_distance: 1 + context_required: false + notes: 'Added by aw-vowel unmask audit 2026-06-07 (unmask-02): generator-planted typo left unannotated because the pre-lookup normalizer silently repaired it before detection; raw form is corpus freq-0.' + domain: spelling notes: Generated error sentence with 1 consonant_substitution error. - id: BM-EXP-E036 input: ဝတ္ထု, ဒွာရ၊ ဝတ္ထု နှင့် ဒွာရ။ @@ -13212,6 +13446,19 @@ sentences: context_required: false notes: Injected vowel_medial_substitution error. 
domain: spelling + - error_id: BM-EXP-E037-E2 + error_type: wrong_word + error_subtype: zawgyi_conversion_error + detection_layer: word + span: + start: 17 + end: 32 + erroneous_text: ခောင်းဆောင်ကြီး + gold_correction: ခေါင်းဆောင်ကြီး + edit_distance: 1 + context_required: false + notes: 'Added by aw-vowel unmask audit 2026-06-07 (unmask-02): generator-planted typo left unannotated because the pre-lookup normalizer silently repaired it before detection; raw form is corpus freq-0.' + domain: spelling notes: Generated error sentence with 1 vowel_medial_substitution error. - id: BM-EXP-E038 input: ထိုကြောင့် ကိုခောင်းကြီး လမ်း ဆိုင်တွေကို မသုံးဘဲ လမ်ဘက် ဆိုင်များကို စိတ်ကူးထား၏။ @@ -13233,6 +13480,19 @@ sentences: context_required: false notes: Injected missing_visarga error. domain: spelling + - error_id: BM-EXP-E038-E2 + error_type: wrong_word + error_subtype: zawgyi_conversion_error + detection_layer: word + span: + start: 14 + end: 20 + erroneous_text: ခောင်း + gold_correction: ခေါင်း + edit_distance: 1 + context_required: false + notes: 'Added by aw-vowel unmask audit 2026-06-07 (unmask-02): generator-planted typo left unannotated because the pre-lookup normalizer silently repaired it before detection; raw form is corpus freq-0.' + domain: spelling notes: Generated error sentence with 1 missing_visarga error. - id: BM-EXP-E039 input: '''''ဘယ်လိုဖြစ်သွားလဲ ပြော တော့ ခလေးရဲ့ သဘောသဘာဝ က သူတို့တစ်ခုခုလိုချင်ရင် မိဘ ကို တစ်နည်းနည်းနဲ့ပြပြီးတောင်း တာ။ ပောက်ကွဲပြပြီး တောင်းတဲ့ လူကြီးကလိုက်လျော လို့ရှိရင် ကလေးစိတ်ထဲမှာ ငါ ကောင်းကောင်းမွန်မွန်တောင်း တုန်းကတော့ မပေးဘူး။ လိုချင်တာရ တယ်ဆိုပြီး နောက်အခါတွေမှာ ပောက်ကွဲတဲ့အပြုအမူတွေ လုပ်ပြီး တောင်းတာ။ အဲဒီလိုနဲ့ တချို့ ကလေးတွေမှာ ဝုန်းဒိုင်းကြဲပြီး ပောက်ကွဲတဲ့ ကလေးလေးတွေ ဖြစ်လာတာပော့'''' ဟု ကလေးငယ် များ၏ အပြုအမူပိုင်းဆိုင်ရာ အ ဆင့်စီမံခန့်ခွဲပေးသော ဒောက်တာ ဌေးဌေး (စိတ်ပညာ) ကဆိုသည်။' @@ -13254,6 +13514,71 @@ sentences: context_required: false notes: 'Rewritten from BM-EXP-E039-E1 (was: not a spelling error). 
Now: consonant_substitution.' domain: spelling + - error_id: BM-EXP-E039-E2 + error_type: wrong_word + error_subtype: zawgyi_conversion_error + detection_layer: word + span: + start: 111 + end: 119 + erroneous_text: ပောက်ကွဲ + gold_correction: ပေါက်ကွဲ + edit_distance: 1 + context_required: false + notes: 'Added by aw-vowel unmask audit 2026-06-07 (unmask-02): generator-planted typo left unannotated because the pre-lookup normalizer silently repaired it before detection; raw form is corpus freq-0.' + domain: spelling + - error_id: BM-EXP-E039-E4 + error_type: wrong_word + error_subtype: zawgyi_conversion_error + detection_layer: word + span: + start: 265 + end: 273 + erroneous_text: ပောက်ကွဲ + gold_correction: ပေါက်ကွဲ + edit_distance: 1 + context_required: false + notes: 'Added by aw-vowel unmask audit 2026-06-07 (unmask-02): generator-planted typo left unannotated because the pre-lookup normalizer silently repaired it before detection; raw form is corpus freq-0.' + domain: spelling + - error_id: BM-EXP-E039-E6 + error_type: wrong_word + error_subtype: zawgyi_conversion_error + detection_layer: word + span: + start: 354 + end: 362 + erroneous_text: ပောက်ကွဲ + gold_correction: ပေါက်ကွဲ + edit_distance: 1 + context_required: false + notes: 'Added by aw-vowel unmask audit 2026-06-07 (unmask-02): generator-planted typo left unannotated because the pre-lookup normalizer silently repaired it before detection; raw form is corpus freq-0.' + domain: spelling + - error_id: BM-EXP-E039-E8 + error_type: wrong_word + error_subtype: zawgyi_conversion_error + detection_layer: word + span: + start: 385 + end: 389 + erroneous_text: ပော့ + gold_correction: ပေါ့ + edit_distance: 1 + context_required: false + notes: 'Added by aw-vowel unmask audit 2026-06-07 (unmask-02): generator-planted typo left unannotated because the pre-lookup normalizer silently repaired it before detection; raw form is corpus freq-0.' 
+ domain: spelling + - error_id: BM-EXP-E039-E10 + error_type: wrong_word + error_subtype: zawgyi_conversion_error + detection_layer: word + span: + start: 454 + end: 461 + erroneous_text: ဒောက်တာ + gold_correction: ဒေါက်တာ + edit_distance: 1 + context_required: false + notes: 'Added by aw-vowel unmask audit 2026-06-07 (unmask-02): generator-planted typo left unannotated because the pre-lookup normalizer silently repaired it before detection; raw form is corpus freq-0.' + domain: spelling notes: Generated error sentence with 1 confusable_semantic error. - id: BM-EXP-E040 input: '- Cussilage သည် မိခင်၏ချစ်ခြင်းမေတ္တာဟု အဓိပ္ပါယ်ရသော မိခင်၏ချစ်ခြင်းမေတ္တာကို ရည်ညွှန်းသောကြောင့် လက်ဆောင်အဖြစ် ပေးက မ်းရန် အကောင်းဆုံးဖြစ်သည်။' @@ -13338,22 +13663,87 @@ sentences: context_required: false notes: Injected compound_confusion error. domain: spelling - notes: Generated error sentence with 1 compound_confusion error. -- id: BM-EXP-E044 - input: ထိုအခါ ရဟန်းတို့သည် တရားသဘင်၌ စကားစပ်မိ၍ ထိုရဟန်းအကြောင်းကို ပြောဆိုနေကြရာ မြတ်စွာဘုရား ကြွလာတော်မူ၍ ထိုအကြောင်းကို သိတော်မူလျှင်– “ရဟန်းတို့...ရှေးအခါ ဤရဟန်းသည် သဒ္ဓါတရားမရှိ၊ အလှူခံတို့ကို မြင်၍ ကြည်ညိုခြင်းမရှိ၊ မြက်ဖျားတွင် ကပ်သောသိပောက်မျှကိုလည်း သူတပါးအား မပေးလေ၊ ထိုသူကြွယ်ကို ငါဆုံးမ၍ အလှူကို လှူစေပြီးလျှင် ဘဝတပါးသို့ ရောက်၍ပင် မစွန့်”ဟု မိန့်တော်မူကာ အောက်ပါ အတိတီဇာတ်ကို ဆောင်တော်မူသတည်း။ - is_clean: false - domain: academic - register: informal - difficulty_tier: 2 - expected_errors: - - error_id: BM-EXP-E044-E1 + - error_id: BM-EXP-E043-E2 error_type: wrong_word - error_subtype: consonant_substitution - fix_owner: dict + error_subtype: zawgyi_conversion_error detection_layer: word span: - start: 228 - end: 251 + start: 110 + end: 118 + erroneous_text: ပောက်ကွဲ + gold_correction: ပေါက်ကွဲ + edit_distance: 1 + context_required: false + notes: 'Added by aw-vowel unmask audit 2026-06-07 (unmask-02): generator-planted typo left unannotated because the pre-lookup normalizer silently repaired it before detection; raw form is corpus 
freq-0.' + domain: spelling + - error_id: BM-EXP-E043-E4 + error_type: wrong_word + error_subtype: zawgyi_conversion_error + detection_layer: word + span: + start: 264 + end: 272 + erroneous_text: ပောက်ကွဲ + gold_correction: ပေါက်ကွဲ + edit_distance: 1 + context_required: false + notes: 'Added by aw-vowel unmask audit 2026-06-07 (unmask-02): generator-planted typo left unannotated because the pre-lookup normalizer silently repaired it before detection; raw form is corpus freq-0.' + domain: spelling + - error_id: BM-EXP-E043-E6 + error_type: wrong_word + error_subtype: zawgyi_conversion_error + detection_layer: word + span: + start: 353 + end: 361 + erroneous_text: ပောက်ကွဲ + gold_correction: ပေါက်ကွဲ + edit_distance: 1 + context_required: false + notes: 'Added by aw-vowel unmask audit 2026-06-07 (unmask-02): generator-planted typo left unannotated because the pre-lookup normalizer silently repaired it before detection; raw form is corpus freq-0.' + domain: spelling + - error_id: BM-EXP-E043-E8 + error_type: wrong_word + error_subtype: zawgyi_conversion_error + detection_layer: word + span: + start: 384 + end: 388 + erroneous_text: ပော့ + gold_correction: ပေါ့ + edit_distance: 1 + context_required: false + notes: 'Added by aw-vowel unmask audit 2026-06-07 (unmask-02): generator-planted typo left unannotated because the pre-lookup normalizer silently repaired it before detection; raw form is corpus freq-0.' + domain: spelling + - error_id: BM-EXP-E043-E10 + error_type: wrong_word + error_subtype: zawgyi_conversion_error + detection_layer: word + span: + start: 453 + end: 460 + erroneous_text: ဒောက်တာ + gold_correction: ဒေါက်တာ + edit_distance: 1 + context_required: false + notes: 'Added by aw-vowel unmask audit 2026-06-07 (unmask-02): generator-planted typo left unannotated because the pre-lookup normalizer silently repaired it before detection; raw form is corpus freq-0.' + domain: spelling + notes: Generated error sentence with 1 compound_confusion error. 
+- id: BM-EXP-E044 + input: ထိုအခါ ရဟန်းတို့သည် တရားသဘင်၌ စကားစပ်မိ၍ ထိုရဟန်းအကြောင်းကို ပြောဆိုနေကြရာ မြတ်စွာဘုရား ကြွလာတော်မူ၍ ထိုအကြောင်းကို သိတော်မူလျှင်– “ရဟန်းတို့...ရှေးအခါ ဤရဟန်းသည် သဒ္ဓါတရားမရှိ၊ အလှူခံတို့ကို မြင်၍ ကြည်ညိုခြင်းမရှိ၊ မြက်ဖျားတွင် ကပ်သောသိပောက်မျှကိုလည်း သူတပါးအား မပေးလေ၊ ထိုသူကြွယ်ကို ငါဆုံးမ၍ အလှူကို လှူစေပြီးလျှင် ဘဝတပါးသို့ ရောက်၍ပင် မစွန့်”ဟု မိန့်တော်မူကာ အောက်ပါ အတိတီဇာတ်ကို ဆောင်တော်မူသတည်း။ + is_clean: false + domain: academic + register: informal + difficulty_tier: 2 + expected_errors: + - error_id: BM-EXP-E044-E1 + error_type: wrong_word + error_subtype: consonant_substitution + fix_owner: dict + detection_layer: word + span: + start: 228 + end: 251 erroneous_text: ကပ်သောသိပောက်မျှကိုလည်း gold_correction: ကပ်သောဆီပေါက်မျှကိုလည်း edit_distance: 1 @@ -13751,6 +14141,19 @@ sentences: context_required: true notes: Injected broken_compound error. domain: spelling + - error_id: BM-EXP-E062-E2 + error_type: wrong_word + error_subtype: zawgyi_conversion_error + detection_layer: word + span: + start: 82 + end: 101 + erroneous_text: ဒော်အောင်ဆန်းစုကြည် + gold_correction: ဒေါ်အောင်ဆန်းစုကြည် + edit_distance: 1 + context_required: false + notes: 'Added by aw-vowel unmask audit 2026-06-07 (unmask-02): generator-planted typo left unannotated because the pre-lookup normalizer silently repaired it before detection; raw form is corpus freq-0.' + domain: spelling notes: Generated error sentence with 1 broken_compound error. - id: BM-EXP-E063 input: တောင်း။ အဲဒီတော့ ကိုဆွေဝင်းကို သြဂုတ်လ ရက်နေ့မှာ ရုံးချိန်းပြန်ခေါ်ထားတယ်ဆိုတော့ ကိုဆွေဝင်းက ဆက်သွားပြီး အမှုရင်ဆိုင်ရမှာပေါ့နော်။ @@ -13863,6 +14266,32 @@ sentences: context_required: false notes: Injected compound_confusion error. 
domain: spelling + - error_id: BM-EXP-E068-E2 + error_type: wrong_word + error_subtype: zawgyi_conversion_error + detection_layer: word + span: + start: 143 + end: 158 + erroneous_text: သန်းခောင်စာရင်း + gold_correction: သန်းခေါင်စာရင်း + edit_distance: 1 + context_required: false + notes: 'Added by aw-vowel unmask audit 2026-06-07 (unmask-02): generator-planted typo left unannotated because the pre-lookup normalizer silently repaired it before detection; raw form is corpus freq-0.' + domain: spelling + - error_id: BM-EXP-E068-E4 + error_type: wrong_word + error_subtype: zawgyi_conversion_error + detection_layer: word + span: + start: 206 + end: 216 + erroneous_text: စုစုပောင်း + gold_correction: စုစုပေါင်း + edit_distance: 1 + context_required: false + notes: 'Added by aw-vowel unmask audit 2026-06-07 (unmask-02): generator-planted typo left unannotated because the pre-lookup normalizer silently repaired it before detection; raw form is corpus freq-0.' + domain: spelling notes: Generated error sentence with 1 compound_confusion error. - id: BM-EXP-E069 input: အင်း နော်၊ ကိုဆန်းလွင် ပြော့တော့ မ သဘော ပောက်ပါတယ်၊ မက ကိုဆန်းလွင်လို မပြောတတ်လို့၊ မ ဆိုလိုတာလည်း ဒီသဘောပါပဲ၊ မ အဖြေ ပေးသင့်တဲ့ အချိန်ဟာ ဘယ်အချိန် ဖြစ်မလဲ ဆိုတာ မ အနေနှင့် ကြိုမသိနိုင်လို့ ကြိုပြီး အသိပေးတဲ့ သဘော ပြောတာပါ @@ -13996,6 +14425,32 @@ sentences: context_required: false notes: Injected consonant_substitution error. domain: spelling + - error_id: BM-EXP-E075-E3 + error_type: wrong_word + error_subtype: zawgyi_conversion_error + detection_layer: word + span: + start: 138 + end: 149 + erroneous_text: ခောင်းဆောင် + gold_correction: ခေါင်းဆောင် + edit_distance: 1 + context_required: false + notes: 'Added by aw-vowel unmask audit 2026-06-07 (unmask-02): generator-planted typo left unannotated because the pre-lookup normalizer silently repaired it before detection; raw form is corpus freq-0.' 
+ domain: spelling + - error_id: BM-EXP-E075-E5 + error_type: wrong_word + error_subtype: zawgyi_conversion_error + detection_layer: word + span: + start: 347 + end: 356 + erroneous_text: ပော်ပောက် + gold_correction: ပေါ်ပေါက် + edit_distance: 1 + context_required: false + notes: 'Added by aw-vowel unmask audit 2026-06-07 (unmask-02): generator-planted typo left unannotated because the pre-lookup normalizer silently repaired it before detection; raw form is corpus freq-0.' + domain: spelling notes: Generated error sentence with 1 consonant_substitution error. - id: BM-EXP-E076 input: ပြည်ထောင်စု ဒီမိုကရေစီပါတီ ဥက္ကဌ ကိုဖြိုးမင်းသိန်း ဖြစ်ပါတယ်။ တပ်မတော်အစိုးရဟာ နိုင်ငံဝန်ထမ်းတွေ ဖြစ်တယ်လို့ စစ်အစိုးရက တချိန်က ပြောခဲ့ဖူးပါတယ်။ ဒါကြောင့် အခုလက်ရှိ ရွေးကောက်ပွဲဥပဒေအရ နိုင်ငံရေး မလုပ်ရ၊ နိုင်ငံရေးပါတီမထောင်ရ ကန့်သတ်မှုရှိတာကြောင့် ရွေးကောက်ပွဲဥပဒေနဲ့များ ငြိစွန်းနေသလား စဉ်းစားသူတွေ လည်း ရှိပါတယ်။ ဒီအ တိုင်း ခွင့်ပြုချက်ရသွားပြီး နိုင်ငံရေးပါတီထောင်ပြီး ရွေးကောက်ပွဲဝင်မယ် ဆိုရင်တော့ ရွေးကောက်ပွဲ အခြေအနေက တဘက်သတ်ဖြစ်သွားမယ်လို့ ကိုဝင်းမင်းက ယူဆပါတယ်။ @@ -14143,6 +14598,19 @@ sentences: context_required: true notes: Injected broken_compound error. domain: spelling + - error_id: BM-EXP-E082-E2 + error_type: wrong_word + error_subtype: zawgyi_conversion_error + detection_layer: word + span: + start: 134 + end: 149 + erroneous_text: သန်းခောင်စာရင်း + gold_correction: သန်းခေါင်စာရင်း + edit_distance: 1 + context_required: false + notes: 'Added by aw-vowel unmask audit 2026-06-07 (unmask-02): generator-planted typo left unannotated because the pre-lookup normalizer silently repaired it before detection; raw form is corpus freq-0.' + domain: spelling notes: Generated error sentence with 1 broken_compound error. 
- id: BM-EXP-E083 input: ကိုဘုထန်ပိုင်က "မနေ့က ညကတည်းက အင်တာနက်လိုင်းတွေတော့ ပြန်ပွင့်တယ်။ ဒါပေမယ့် လိုင်းကတော့ သိပ် မကောင်းဘူး။ ကျနော့် ဖုန်းကြောင့် ဖြစ်တာလား။ ဘယ်လို ဖွင့်ပေးတာလည်း ဆိုတာတော့ မသိတော့ဘူး" ဟု ဧရာဝတီသို့ ပြော့ပြသည်။ @@ -14206,6 +14674,45 @@ sentences: context_required: false notes: Injected compound_confusion error. domain: spelling + - error_id: BM-EXP-E085-E2 + error_type: wrong_word + error_subtype: zawgyi_conversion_error + detection_layer: word + span: + start: 111 + end: 116 + erroneous_text: အပော် + gold_correction: အပေါ် + edit_distance: 1 + context_required: false + notes: 'Added by aw-vowel unmask audit 2026-06-07 (unmask-02): generator-planted typo left unannotated because the pre-lookup normalizer silently repaired it before detection; raw form is corpus freq-0.' + domain: spelling + - error_id: BM-EXP-E085-E4 + error_type: wrong_word + error_subtype: zawgyi_conversion_error + detection_layer: word + span: + start: 137 + end: 148 + erroneous_text: ခောင်းဆောင် + gold_correction: ခေါင်းဆောင် + edit_distance: 1 + context_required: false + notes: 'Added by aw-vowel unmask audit 2026-06-07 (unmask-02): generator-planted typo left unannotated because the pre-lookup normalizer silently repaired it before detection; raw form is corpus freq-0.' + domain: spelling + - error_id: BM-EXP-E085-E6 + error_type: wrong_word + error_subtype: zawgyi_conversion_error + detection_layer: word + span: + start: 346 + end: 355 + erroneous_text: ပော်ပောက် + gold_correction: ပေါ်ပေါက် + edit_distance: 1 + context_required: false + notes: 'Added by aw-vowel unmask audit 2026-06-07 (unmask-02): generator-planted typo left unannotated because the pre-lookup normalizer silently repaired it before detection; raw form is corpus freq-0.' + domain: spelling notes: Generated error sentence with 1 compound_confusion error. 
- id: BM-EXP-E086 input: '- Cussilage သည် မိခင်၏ချစ်ခြင်မေတ္တာဟု အဓိပ္ပါယ်ရသော မိခင်၏ချစ်ခြင်းမေတ္တာကို ရည်ညွှန်းသောကြောင့် လက်ဆောင်အဖြစ် ပေးကမ်းရန် အကောင်းဆုံးဖြစ်သည်။' @@ -14380,6 +14887,19 @@ sentences: context_required: false notes: Injected missing_visarga error. domain: spelling + - error_id: BM-EXP-E092-E2 + error_type: wrong_word + error_subtype: zawgyi_conversion_error + detection_layer: word + span: + start: 235 + end: 240 + erroneous_text: ပောက် + gold_correction: ပေါက် + edit_distance: 1 + context_required: false + notes: 'Added by aw-vowel unmask audit 2026-06-07 (unmask-02): generator-planted typo left unannotated because the pre-lookup normalizer silently repaired it before detection; raw form is corpus freq-0.' + domain: spelling notes: Generated error sentence with 1 missing_visarga error. - id: BM-EXP-E093 input: မိသားစုနှင့် ခွဲခွာနေထိုင်ရသဖြင့် အဆင်မပြေသောကြောင့် အိုဟိုင်ယိုးပြည်နယ် အုပ်ချုပ်ရေးမှူးရာထူးအတွက် ပြန်သွားအရွေးခံရာ နှင့် တို့တွင် နှစ်ကြိမ်ဆက် အရွေးခံရသည်။ @@ -14506,6 +15026,19 @@ sentences: context_required: false notes: Injected missing_visarga error. domain: spelling + - error_id: BM-EXP-E098-E2 + error_type: wrong_word + error_subtype: zawgyi_conversion_error + detection_layer: word + span: + start: 17 + end: 32 + erroneous_text: ခောင်းဆောင်ကြီး + gold_correction: ခေါင်းဆောင်ကြီး + edit_distance: 1 + context_required: false + notes: 'Added by aw-vowel unmask audit 2026-06-07 (unmask-02): generator-planted typo left unannotated because the pre-lookup normalizer silently repaired it before detection; raw form is corpus freq-0.' + domain: spelling notes: Generated error sentence with 1 missing_visarga error. 
- id: BM-EXP-E099 input: ကဲတပည့်တို့ လောကကြီးမှာ လူတွေဝင်ငွေရရှိနိုင်တဲ့ နည်းလမ်းတွေ ဘယ်လောက်ပောများ လဲဆိုတာ အခုခင်ဗျားတို့ မြင်တွေ့ခဲ့ရပြီ၊ အဲဒီနည်းလမ်း သို့မဟုတ် လုပ်ငန်းမှန်သမျှဟာ လူတွေဆီ ကို စမ်းရေစီးသလို စီးဆင်းလာနေတဲ့ ရွှေဒင်္ဂါးတန်း ငွေဒင်္ဂါးတန်းတွေပဲ၊ အလုပ်လုပ်တဲ့ လူတိုင်းဟာ ဒီစီးကြောင်းထဲက အသပြာ ကို ကိုယ့်အိတ်ထဲ မြောင်းသွယ်လို့ ရသလောက် သွယ်ယူနေကြတဲ့ သဘောပါပဲ။ @@ -14527,6 +15060,19 @@ sentences: context_required: true notes: Injected broken_compound error. domain: spelling + - error_id: BM-EXP-E099-E2 + error_type: wrong_word + error_subtype: zawgyi_conversion_error + detection_layer: word + span: + start: 68 + end: 71 + erroneous_text: ပော + gold_correction: ပေါ + edit_distance: 1 + context_required: false + notes: 'Added by aw-vowel unmask audit 2026-06-07 (unmask-02): generator-planted typo left unannotated because the pre-lookup normalizer silently repaired it before detection; raw form is corpus freq-0.' + domain: spelling notes: Generated error sentence with 1 broken_compound error. - id: BM-EXP-E100 input: ဒါးကွင်းဆည်တည်ဆောက်ရေးနှင့် ပတ်သက်၍ အစီအစဉ်ရေဆွဲခြင်းနှင့် ဖြစ်နိုင်ခြေ စမ်းသပ်လေ့လာနေဆဲ အဆင့်၌သာ ရှိပါသေးသည်။ @@ -14632,6 +15178,19 @@ sentences: context_required: false notes: Injected aukmyit_confusion error. domain: spelling + - error_id: BM-EXP-E104-E2 + error_type: wrong_word + error_subtype: zawgyi_conversion_error + detection_layer: word + span: + start: 83 + end: 102 + erroneous_text: ဒော်အောင်ဆန်းစုကြည် + gold_correction: ဒေါ်အောင်ဆန်းစုကြည် + edit_distance: 1 + context_required: false + notes: 'Added by aw-vowel unmask audit 2026-06-07 (unmask-02): generator-planted typo left unannotated because the pre-lookup normalizer silently repaired it before detection; raw form is corpus freq-0.' + domain: spelling notes: Generated error sentence with 1 aukmyit_confusion error. 
- id: BM-EXP-E105 input: ကဲတပည့်တို့ လောကကြီးမှာ လူတွေဝင်ငွေရရှိနိုင်တဲ့ နည်းလမ်းတွေ ဘယ်လောက်ပောများ လဲဆိုတာ အခုခင်ဗျားတို့ မြင်တွေ့ခဲ့ရပြီ၊ အဲဒီနည်းလမ်း သို့မဟုတ် လုပ်ငန်းမှန်သမျှဟာ လူတွေသိ ကို စမ်းရေစီးသလို စီးဆင်းလာနေတဲ့ ရွှေဒင်္ဂါးတန်း ငွေဒင်္ဂါးတန်းတွေပဲ၊ အလုပ်လုပ်တဲ့ လူတိုင်းဟာ ဒီစီးကြောင်းထဲက အသပြာကို ကိုယ့်အိတ်ထဲ မြောင်းသွယ်လို့ ရသလောက် သွယ်ယူနေကြတဲ့ သဘောပါပဲ။ @@ -14653,6 +15212,19 @@ sentences: context_required: false notes: Injected compound_confusion error. domain: spelling + - error_id: BM-EXP-E105-E2 + error_type: wrong_word + error_subtype: zawgyi_conversion_error + detection_layer: word + span: + start: 68 + end: 71 + erroneous_text: ပော + gold_correction: ပေါ + edit_distance: 1 + context_required: false + notes: 'Added by aw-vowel unmask audit 2026-06-07 (unmask-02): generator-planted typo left unannotated because the pre-lookup normalizer silently repaired it before detection; raw form is corpus freq-0.' + domain: spelling notes: Generated error sentence with 1 compound_confusion error. - id: BM-EXP-E106 input: ပြည်ထောင်စု ဒီမိုကရေစီပါတီ ဥက္ကဌ ကိုဖြိုးမင်းသိန်း ဖြစ်ပါတယ်။ တပ်မတော်အစိုးရဟာ နိုင်ငံဝန်ထမ်းတွေ ဖြစ်တယ်လို့ စစ်အစိုးရက တချိန်က ပြော့ခဲ့ဖူးပါတယ်။ ဒါကြောင့် အခုလက်ရှိ ရွေးကောက်ပွဲဥပဒေအရ နိုင်ငံရေး မလုပ်ရ၊ နိုင်ငံရေးပါတီမထောင်ရ ကန့်သတ်မှုရှိတာကြောင့် ရွေးကောက်ပွဲဥပဒေနဲ့များ ငြိစွန်းနေသလား စဉ်းစားသူတွေ လည်း ရှိပါတယ်။ ဒီအတိုင်း ခွင့်ပြုချက်ရသွားပြီး နိုင်ငံရေးပါတီထောင်ပြီး ရွေးကောက်ပွဲဝင်မယ် ဆိုရင်တော့ ရွေးကောက်ပွဲ အခြေအနေက တဘက်သတ်ဖြစ်သွားမယ်လို့ ကိုဝင်းမင်းက ယူဆပါတယ်။ @@ -14779,6 +15351,19 @@ sentences: context_required: false notes: Injected aukmyit_confusion error. 
domain: spelling + - error_id: BM-EXP-E111-E2 + error_type: wrong_word + error_subtype: zawgyi_conversion_error + detection_layer: word + span: + start: 26 + end: 38 + erroneous_text: အပောင်းအသင်း + gold_correction: အပေါင်းအသင်း + edit_distance: 1 + context_required: false + notes: 'Added by aw-vowel unmask audit 2026-06-07 (unmask-02): generator-planted typo left unannotated because the pre-lookup normalizer silently repaired it before detection; raw form is corpus freq-0.' + domain: spelling notes: Generated error sentence with 1 aukmyit_confusion error. - id: BM-EXP-E112 input: နာတာလူး ပွဲတော်ပြန်လာစဉ် သောက် နမ်းစဉ်က ယခုလို မိသွားလျှင်၊ သို့မဟုတ် မြိုင်သာယာက သစ်သီးခြံဝင်းတွင် သူနှင့် အတူ ထွက်ပြေးကြဖို့ ပြောစဉ်က မိသွားလျှင်၊ သူတို့နှစ်ယောက် မရိုးမဖြောင့် တွေ့သည့်အခါမျိုးတွင် မိသွားလျှင် စကားလက် ခံနိုင်သည်။ @@ -14951,6 +15536,19 @@ sentences: context_required: false notes: Injected aukmyit_confusion error. domain: spelling + - error_id: BM-EXP-E118-E2 + error_type: wrong_word + error_subtype: zawgyi_conversion_error + detection_layer: word + span: + start: 76 + end: 80 + erroneous_text: ခော် + gold_correction: ခေါ် + edit_distance: 1 + context_required: false + notes: 'Added by aw-vowel unmask audit 2026-06-07 (unmask-02): generator-planted typo left unannotated because the pre-lookup normalizer silently repaired it before detection; raw form is corpus freq-0.' + domain: spelling notes: Generated error sentence with 1 aukmyit_confusion error. - id: BM-EXP-E119 input: တပ်မတော်သို့ အသက် နှစ် မပြည်သေးဘဲ ဝင်ရောက်သူများကို စိစစ်ပြီး မိဘ အုပ်ထိန်းသူများ ထံသို့ ပြန်လွှဲ ပြော့င်းပေးမှု အကြိမ်ရေ ကြိမ်တွင် စစ်သည် ဦး ရှိကြောင်း၊ ယင်းသို့ စုဆောင်း မှုတွင် တာဝန်ရှိသူတပ်မ တော်သား အရာရှိ ဦး၊ အခြား အဆင့် ဦးကို စစ်စည်းကမ်းအရ အရေးယူခဲ့ကြောင်း တပ်မတော်သားကိုယ် စားလှယ်များ၏ ဆွေးနွေးချက်အရ သိရသည်။ @@ -15035,6 +15633,71 @@ sentences: context_required: true notes: Injected broken_compound error. 
domain: spelling + - error_id: BM-EXP-E122-E2 + error_type: wrong_word + error_subtype: zawgyi_conversion_error + detection_layer: word + span: + start: 110 + end: 118 + erroneous_text: ပောက်ကွဲ + gold_correction: ပေါက်ကွဲ + edit_distance: 1 + context_required: false + notes: 'Added by aw-vowel unmask audit 2026-06-07 (unmask-02): generator-planted typo left unannotated because the pre-lookup normalizer silently repaired it before detection; raw form is corpus freq-0.' + domain: spelling + - error_id: BM-EXP-E122-E4 + error_type: wrong_word + error_subtype: zawgyi_conversion_error + detection_layer: word + span: + start: 264 + end: 272 + erroneous_text: ပောက်ကွဲ + gold_correction: ပေါက်ကွဲ + edit_distance: 1 + context_required: false + notes: 'Added by aw-vowel unmask audit 2026-06-07 (unmask-02): generator-planted typo left unannotated because the pre-lookup normalizer silently repaired it before detection; raw form is corpus freq-0.' + domain: spelling + - error_id: BM-EXP-E122-E6 + error_type: wrong_word + error_subtype: zawgyi_conversion_error + detection_layer: word + span: + start: 353 + end: 361 + erroneous_text: ပောက်ကွဲ + gold_correction: ပေါက်ကွဲ + edit_distance: 1 + context_required: false + notes: 'Added by aw-vowel unmask audit 2026-06-07 (unmask-02): generator-planted typo left unannotated because the pre-lookup normalizer silently repaired it before detection; raw form is corpus freq-0.' + domain: spelling + - error_id: BM-EXP-E122-E8 + error_type: wrong_word + error_subtype: zawgyi_conversion_error + detection_layer: word + span: + start: 384 + end: 388 + erroneous_text: ပော့ + gold_correction: ပေါ့ + edit_distance: 1 + context_required: false + notes: 'Added by aw-vowel unmask audit 2026-06-07 (unmask-02): generator-planted typo left unannotated because the pre-lookup normalizer silently repaired it before detection; raw form is corpus freq-0.' 
+ domain: spelling + - error_id: BM-EXP-E122-E10 + error_type: wrong_word + error_subtype: zawgyi_conversion_error + detection_layer: word + span: + start: 454 + end: 461 + erroneous_text: ဒောက်တာ + gold_correction: ဒေါက်တာ + edit_distance: 1 + context_required: false + notes: 'Added by aw-vowel unmask audit 2026-06-07 (unmask-02): generator-planted typo left unannotated because the pre-lookup normalizer silently repaired it before detection; raw form is corpus freq-0.' + domain: spelling notes: Generated error sentence with 1 broken_compound error. - id: BM-EXP-E123 input: '- ကျောက်စရစ်ခဲများနှင့် အပင်အပိုင်းအစများကို မြေသိလွှာ၏ “ခြေနင်းများ” ပော်တွင် ပံ့ပိုးထားသည်။' @@ -15426,6 +16089,71 @@ sentences: context_required: false notes: Injected missing_visarga error. domain: spelling + - error_id: BM-EXP-E135-E2 + error_type: wrong_word + error_subtype: zawgyi_conversion_error + detection_layer: word + span: + start: 110 + end: 118 + erroneous_text: ပောက်ကွဲ + gold_correction: ပေါက်ကွဲ + edit_distance: 1 + context_required: false + notes: 'Added by aw-vowel unmask audit 2026-06-07 (unmask-02): generator-planted typo left unannotated because the pre-lookup normalizer silently repaired it before detection; raw form is corpus freq-0.' + domain: spelling + - error_id: BM-EXP-E135-E4 + error_type: wrong_word + error_subtype: zawgyi_conversion_error + detection_layer: word + span: + start: 264 + end: 272 + erroneous_text: ပောက်ကွဲ + gold_correction: ပေါက်ကွဲ + edit_distance: 1 + context_required: false + notes: 'Added by aw-vowel unmask audit 2026-06-07 (unmask-02): generator-planted typo left unannotated because the pre-lookup normalizer silently repaired it before detection; raw form is corpus freq-0.' 
+ domain: spelling + - error_id: BM-EXP-E135-E6 + error_type: wrong_word + error_subtype: zawgyi_conversion_error + detection_layer: word + span: + start: 352 + end: 360 + erroneous_text: ပောက်ကွဲ + gold_correction: ပေါက်ကွဲ + edit_distance: 1 + context_required: false + notes: 'Added by aw-vowel unmask audit 2026-06-07 (unmask-02): generator-planted typo left unannotated because the pre-lookup normalizer silently repaired it before detection; raw form is corpus freq-0.' + domain: spelling + - error_id: BM-EXP-E135-E8 + error_type: wrong_word + error_subtype: zawgyi_conversion_error + detection_layer: word + span: + start: 383 + end: 387 + erroneous_text: ပော့ + gold_correction: ပေါ့ + edit_distance: 1 + context_required: false + notes: 'Added by aw-vowel unmask audit 2026-06-07 (unmask-02): generator-planted typo left unannotated because the pre-lookup normalizer silently repaired it before detection; raw form is corpus freq-0.' + domain: spelling + - error_id: BM-EXP-E135-E10 + error_type: wrong_word + error_subtype: zawgyi_conversion_error + detection_layer: word + span: + start: 452 + end: 459 + erroneous_text: ဒောက်တာ + gold_correction: ဒေါက်တာ + edit_distance: 1 + context_required: false + notes: 'Added by aw-vowel unmask audit 2026-06-07 (unmask-02): generator-planted typo left unannotated because the pre-lookup normalizer silently repaired it before detection; raw form is corpus freq-0.' + domain: spelling notes: Generated error sentence with 1 missing_visarga error. - id: BM-EXP-E136 input: လူနည်းနည်းနေတဲ့၊အုတ်နံရံကာ၊မျက်နှာကြက်လုံအခန်းဆိုတဲ့အတွက်6 နဲ့ထပ်မြှောက်ရပါမယ်။ဒီအခန်းအတွက်ဆိုရင်1500 ft3 x 6=9000 Btu8000 နဲ့ စာတော့ (1.1 Hp)ကျော်ကျော်ရပါတယ်။အရပ်ပြော့ပြောရရင် 1 Hp တကောင် စက်တပ်ရမယ်လို့ဆိုတာပါ။ @@ -15552,6 +16280,19 @@ sentences: context_required: true notes: Injected broken_compound error. 
domain: spelling + - error_id: BM-EXP-E141-E2 + error_type: wrong_word + error_subtype: zawgyi_conversion_error + detection_layer: word + span: + start: 237 + end: 242 + erroneous_text: ပောက် + gold_correction: ပေါက် + edit_distance: 1 + context_required: false + notes: 'Added by aw-vowel unmask audit 2026-06-07 (unmask-02): generator-planted typo left unannotated because the pre-lookup normalizer silently repaired it before detection; raw form is corpus freq-0.' + domain: spelling notes: Generated error sentence with 1 broken_compound error. - id: BM-EXP-E142 input: “ကိုယ်က အနုပညာအလုပ်တွေလည်း လုပ်နေတဲ့လူ ဆေးကျောင်းက ဘွဲ့ရတဲ့အချိန်မှာတော့ ဆေးကောင်စီက ခော်ပြော့တယ် အလုပ် ခုထဲက တစ်ခုပဲ လုပ်ရမယ်ဆိုပြီးတော့ ရွေးတဲ့အချိန်မှာ ညီမက ဆရာဝန်ဘက်ကို ရွေးလိုက်တော့ အဲဒီတုန်းကတည်းက ဆရာဝန်ပဲလုပ်ဖြစ်နေတာပါ။ ဆေးကျောင်းတက်ရင်း အနုပညာအလုပ်တွေ လုပ်နေရတုန်းက တစ်ချိန်ချိန်မှာ အခုလို ရွေးရမယ်ဆိုတာ သိနေခဲ့တယ်။ ဆရာဝန်အလုပ် လုပ်ဖြစ်မယ်ဆိုတာလည်း သိနေတယ်။ ဒါပေမယ့် တကယ်တမ်း ရွေးရမယ့်အချိန်မှာ တော်တော် ခက်ခဲခဲ့ပါတယ်။ စိတ်ညစ်ခဲ့ရတဲ့ အချိန်တွေလည်းရှိပါတယ်” လို့ ဆိုပါတယ်။ @@ -15573,6 +16314,19 @@ sentences: context_required: false notes: Token-level aukmyit visarga removal (ပြော့→ပြော). Relabeled in Sprint E to match single-token detection semantics. domain: spelling + - error_id: BM-EXP-E142-E2 + error_type: wrong_word + error_subtype: zawgyi_conversion_error + detection_layer: word + span: + start: 85 + end: 89 + erroneous_text: ခော် + gold_correction: ခေါ် + edit_distance: 1 + context_required: false + notes: 'Added by aw-vowel unmask audit 2026-06-07 (unmask-02): generator-planted typo left unannotated because the pre-lookup normalizer silently repaired it before detection; raw form is corpus freq-0.' + domain: spelling notes: Generated error sentence with 1 aukmyit_confusion error. 
- id: BM-EXP-E143 input: ပထမဆုံး စပြောရမယ့်သူကတော့ NLD အစိုးရ၊ အထူးသဖြင့် နိုင်ငံတော် အတိုင်ပင်ခံပုဂ္ဂိုလ် ဒော်အောင်ဆန်းစုကြည်နဲ့ ဆက်ဆံရေ ကောင်းတယ်လို့ သတင်းထွက်နေသူ ဒုတိယ ဗိုလ်ချုပ်မှူးကြီး စိုးဝင်းဟာ စစ်တက္ကသိုလ် အပတ်စဉ် က ဖြစ်ပါတယ်။ @@ -15594,6 +16348,19 @@ sentences: context_required: false notes: Injected missing_visarga error. domain: spelling + - error_id: BM-EXP-E143-E2 + error_type: wrong_word + error_subtype: zawgyi_conversion_error + detection_layer: word + span: + start: 82 + end: 101 + erroneous_text: ဒော်အောင်ဆန်းစုကြည် + gold_correction: ဒေါ်အောင်ဆန်းစုကြည် + edit_distance: 1 + context_required: false + notes: 'Added by aw-vowel unmask audit 2026-06-07 (unmask-02): generator-planted typo left unannotated because the pre-lookup normalizer silently repaired it before detection; raw form is corpus freq-0.' + domain: spelling notes: Generated error sentence with 1 missing_visarga error. - id: BM-EXP-E144 input: ကမ္ဘာ့ပစ္စည်းမဲ့ ခောင်းဆောင်ကြီးများ ဖြစ်ကြသော ကားလ်မတ်နှင့် အိန်ဂျယ်တို့က အာရှတိုက်တွင် နယ်ချဲ့ သမားတို့ ကျူးလွန်နေသော ကိုလိုနီစစ်ပွဲများကို ဝေဖန်တိုက်ခိုက်နေသည့် ကြားကပင် အင်္ဂလိပ်တို့သည် နတ်မြစ်ဝရှိ ရှင်မဖြူကျွန်းအရေးကို အကြောင်းပြု၍ မြန်မာနို င်ငံကို စစ်အင်အားဖြင့် ချီတက်တိုက်ခိုက်လေတော့သည်။ @@ -15615,6 +16382,19 @@ sentences: context_required: true notes: Injected broken_compound error. domain: spelling + - error_id: BM-EXP-E144-E2 + error_type: wrong_word + error_subtype: zawgyi_conversion_error + detection_layer: word + span: + start: 17 + end: 32 + erroneous_text: ခောင်းဆောင်ကြီး + gold_correction: ခေါင်းဆောင်ကြီး + edit_distance: 1 + context_required: false + notes: 'Added by aw-vowel unmask audit 2026-06-07 (unmask-02): generator-planted typo left unannotated because the pre-lookup normalizer silently repaired it before detection; raw form is corpus freq-0.' + domain: spelling notes: Generated error sentence with 1 broken_compound error. 
- id: BM-EXP-E145 input: စစ်ကိုင်းတိုင်း၊ ဒီပဲယင်းမြို့နယ်၊ စိုင်ပြင်ကြီးမြို့နယ်ခွဲ၊ မုံရွာ ရေဦး ကားလမ်းပော်မှာ မနေ့ အောက်တိုဘာ ရက် ညနေပိုင်းက ပြည်ခိုင်ဖြိုးပါတီ အောင်နိုင်ရေး မဲဆွယ်စည်းရုံးရေးယာဉ်တန်းမှာ လိုက်ပါလာတဲ့ ဆိုင်ကယ် တင် ယာဉ်တိုက်မှုဖြစ်ရာ လူ ယောက်သေဆုံးပြီး ယောက် ဒဏ်ရာရရှိခဲ့တယ်လို့ သိရပါတယ်။ @@ -15691,6 +16471,58 @@ sentences: context_required: false notes: Injected vowel_medial_substitution error. domain: spelling + - error_id: BM-EXP-E147-E3 + error_type: wrong_word + error_subtype: zawgyi_conversion_error + detection_layer: word + span: + start: 265 + end: 273 + erroneous_text: ပောက်ကွဲ + gold_correction: ပေါက်ကွဲ + edit_distance: 1 + context_required: false + notes: 'Added by aw-vowel unmask audit 2026-06-07 (unmask-02): generator-planted typo left unannotated because the pre-lookup normalizer silently repaired it before detection; raw form is corpus freq-0.' + domain: spelling + - error_id: BM-EXP-E147-E5 + error_type: wrong_word + error_subtype: zawgyi_conversion_error + detection_layer: word + span: + start: 354 + end: 362 + erroneous_text: ပောက်ကွဲ + gold_correction: ပေါက်ကွဲ + edit_distance: 1 + context_required: false + notes: 'Added by aw-vowel unmask audit 2026-06-07 (unmask-02): generator-planted typo left unannotated because the pre-lookup normalizer silently repaired it before detection; raw form is corpus freq-0.' + domain: spelling + - error_id: BM-EXP-E147-E7 + error_type: wrong_word + error_subtype: zawgyi_conversion_error + detection_layer: word + span: + start: 385 + end: 389 + erroneous_text: ပော့ + gold_correction: ပေါ့ + edit_distance: 1 + context_required: false + notes: 'Added by aw-vowel unmask audit 2026-06-07 (unmask-02): generator-planted typo left unannotated because the pre-lookup normalizer silently repaired it before detection; raw form is corpus freq-0.' 
+ domain: spelling + - error_id: BM-EXP-E147-E9 + error_type: wrong_word + error_subtype: zawgyi_conversion_error + detection_layer: word + span: + start: 454 + end: 461 + erroneous_text: ဒောက်တာ + gold_correction: ဒေါက်တာ + edit_distance: 1 + context_required: false + notes: 'Added by aw-vowel unmask audit 2026-06-07 (unmask-02): generator-planted typo left unannotated because the pre-lookup normalizer silently repaired it before detection; raw form is corpus freq-0.' + domain: spelling notes: Generated error sentence with 1 vowel_medial_substitution error. - id: BM-EXP-E148 input: ကျနော်က လွှတ်တော်မှာ ဗဟိုဝန်ထမ်းတက္ကသိုလ်ကို ဖျက်ပစ်ဖို့ ဒါမှမဟုတ် ပြုပြင်ပြော့င်းလဲရေး လုပ်ဖို့ တင်ခဲ့ပေမယ့် မအောင်မြင်ခဲ့ဘူး။ @@ -15885,6 +16717,19 @@ sentences: context_required: false notes: Injected missing_visarga error. domain: spelling + - error_id: BM-EXP-E155-E2 + error_type: wrong_word + error_subtype: zawgyi_conversion_error + detection_layer: word + span: + start: 134 + end: 149 + erroneous_text: သန်းခောင်စာရင်း + gold_correction: သန်းခေါင်စာရင်း + edit_distance: 1 + context_required: false + notes: 'Added by aw-vowel unmask audit 2026-06-07 (unmask-02): generator-planted typo left unannotated because the pre-lookup normalizer silently repaired it before detection; raw form is corpus freq-0.' + domain: spelling notes: Generated error sentence with 1 missing_visarga error. - id: BM-EXP-E156 input: ငါ့သျှင်တို့ တစ်ဖန်ထို့ပြင်လည်း သီလ ရှိသော သီလနှင့်ပြည့်စုံသော သူသည် ခန္ဓာကိုယ်ပျက်စီး၍ နောက်၌ ကောင်းသော လားရာဖြစ်သော နတ်ပြည်၌ ဖြစ်ရ၏။ ဤကား သီလရှိသော သူ၏ သီလနှင့်ပြည့်စုံခြင်း၌ ငါးခုမြောက် အကျိုး 'အာနိသင်' ပေတည်း။ (၁၄) @@ -15990,6 +16835,19 @@ sentences: context_required: true notes: Injected broken_compound error. 
domain: spelling + - error_id: BM-EXP-E160-E2 + error_type: wrong_word + error_subtype: zawgyi_conversion_error + detection_layer: word + span: + start: 66 + end: 70 + erroneous_text: ခော် + gold_correction: ခေါ် + edit_distance: 1 + context_required: false + notes: 'Added by aw-vowel unmask audit 2026-06-07 (unmask-02): generator-planted typo left unannotated because the pre-lookup normalizer silently repaired it before detection; raw form is corpus freq-0.' + domain: spelling notes: Generated error sentence with 1 broken_compound error. - id: BM-EXP-E161 input: ဦးလင်းဇော်ထွန်းသည် လက်ရှိ တပ်မတော် ကာကွယ်ရေးဦးစီးချုပ် ဗိုလ်ချုပ်မှူးကြီး မင်းအောင်လှိုင်၏ ကိုယ်ရေးလုံခြုံရေး အရာရှိအဖြစ်တာဝန်ယူခဲ့ဖူးသည်။ လုပ်ကြံမှုကို ကြိုတင်သိရှိပြီး သတင်းထိန်ချိန်ထားခြင်း ဖြစ်နိုင်သည်ဟု သူ့ ကို လူမှုကွန်ရက်တွင် အကြီးအကျယ် စွပ်စွဲကြသည်။ သတင်းသမားများက မေးမြန်းသောအခါ အောင်ဝင်းခိုင်နှင့် ဇေယျာဖြိုးကို သူသိကြောင်း ဝန်ခံသည်။ @@ -16032,6 +16890,19 @@ sentences: context_required: false notes: Injected missing_visarga error. domain: spelling + - error_id: BM-EXP-E162-E2 + error_type: wrong_word + error_subtype: zawgyi_conversion_error + detection_layer: word + span: + start: 68 + end: 71 + erroneous_text: ပော + gold_correction: ပေါ + edit_distance: 1 + context_required: false + notes: 'Added by aw-vowel unmask audit 2026-06-07 (unmask-02): generator-planted typo left unannotated because the pre-lookup normalizer silently repaired it before detection; raw form is corpus freq-0.' + domain: spelling notes: Generated error sentence with 1 missing_visarga error. 
- id: BM-EXP-E163 input: ဦးလင်းဇော်ထွန်းသည် လက်ရှိ တပ်မတော် ကာကွယ်ရေးဦးစီးချုပ် ဗိုလ်ချုပ်မှူးကြီး မင်းအောင်လှိုင်၏ ကိုယ်ရေးလုံခြုံရေး အရာရှိအဖြစ်တာဝန်ယူခဲ့ဖူးသည်။ လုပ်ကြံမှုကို ကြိုတင်သိရှိပြီး သတင်းထိန်ချိန်ထားခြင်း ဖြစ်နိုင်သည်ဟု သူ့ကို လူမှုကွန်ရက်တွင် အကြီးအကျယ် စွပ်စွဲကြသည်။ သတင်းသမားများက မေးမျြန်းသောအခါ အောင်ဝင်းခိုင်နှင့် ဇေယျာဖြိုးကို သူသိကြောင်း ဝန်ခံသည်။ @@ -17900,7 +18771,7 @@ sentences: is_clean: true domain: religious register: colloquial - difficulty_tier: null + difficulty_tier: scope: spelling notes: Clean sentence from pragmatics dataset. expected_errors: [] @@ -17909,7 +18780,7 @@ sentences: is_clean: true domain: literary register: colloquial - difficulty_tier: null + difficulty_tier: scope: spelling notes: Clean sentence from pragmatics dataset. expected_errors: [] @@ -17918,7 +18789,7 @@ sentences: is_clean: true domain: general register: colloquial - difficulty_tier: null + difficulty_tier: scope: spelling notes: Clean sentence from pragmatics dataset. expected_errors: [] @@ -17927,7 +18798,7 @@ sentences: is_clean: true domain: conversational register: colloquial - difficulty_tier: null + difficulty_tier: scope: spelling notes: Clean sentence from pragmatics dataset. expected_errors: [] @@ -17936,7 +18807,7 @@ sentences: is_clean: true domain: academic register: colloquial - difficulty_tier: null + difficulty_tier: scope: spelling notes: Clean sentence from pragmatics dataset. expected_errors: [] @@ -17945,7 +18816,7 @@ sentences: is_clean: true domain: news register: colloquial - difficulty_tier: null + difficulty_tier: scope: spelling notes: Clean sentence from pragmatics dataset. expected_errors: [] @@ -17954,7 +18825,7 @@ sentences: is_clean: true domain: technical register: colloquial - difficulty_tier: null + difficulty_tier: scope: spelling notes: Clean sentence from pragmatics dataset. 
expected_errors: [] @@ -17963,7 +18834,7 @@ sentences: is_clean: true domain: religious register: colloquial - difficulty_tier: null + difficulty_tier: scope: spelling notes: Clean sentence from pragmatics dataset. expected_errors: [] @@ -17972,7 +18843,7 @@ sentences: is_clean: true domain: literary register: colloquial - difficulty_tier: null + difficulty_tier: scope: spelling notes: Clean sentence from pragmatics dataset. expected_errors: [] @@ -17981,7 +18852,7 @@ sentences: is_clean: true domain: general register: colloquial - difficulty_tier: null + difficulty_tier: scope: spelling notes: Clean sentence from pragmatics dataset. expected_errors: [] @@ -17990,7 +18861,7 @@ sentences: is_clean: true domain: conversational register: colloquial - difficulty_tier: null + difficulty_tier: scope: spelling notes: Clean sentence from pragmatics dataset. expected_errors: [] @@ -17999,7 +18870,7 @@ sentences: is_clean: true domain: academic register: colloquial - difficulty_tier: null + difficulty_tier: scope: spelling notes: Clean sentence from pragmatics dataset. expected_errors: [] @@ -18008,7 +18879,7 @@ sentences: is_clean: true domain: news register: colloquial - difficulty_tier: null + difficulty_tier: scope: spelling notes: Clean sentence from pragmatics dataset. expected_errors: [] @@ -18017,7 +18888,7 @@ sentences: is_clean: true domain: technical register: colloquial - difficulty_tier: null + difficulty_tier: scope: spelling notes: Clean sentence from pragmatics dataset. expected_errors: [] @@ -18026,7 +18897,7 @@ sentences: is_clean: true domain: religious register: colloquial - difficulty_tier: null + difficulty_tier: scope: spelling notes: Clean sentence from pragmatics dataset. expected_errors: [] @@ -18035,7 +18906,7 @@ sentences: is_clean: true domain: literary register: formal - difficulty_tier: null + difficulty_tier: scope: spelling notes: Clean sentence from pragmatics dataset. 
expected_errors: [] @@ -18044,7 +18915,7 @@ sentences: is_clean: true domain: general register: formal - difficulty_tier: null + difficulty_tier: scope: spelling notes: Clean sentence from pragmatics dataset. expected_errors: [] @@ -18053,7 +18924,7 @@ sentences: is_clean: true domain: conversational register: formal - difficulty_tier: null + difficulty_tier: scope: spelling notes: Clean sentence from pragmatics dataset. expected_errors: [] @@ -18062,7 +18933,7 @@ sentences: is_clean: true domain: academic register: formal - difficulty_tier: null + difficulty_tier: scope: spelling notes: Clean sentence from pragmatics dataset. expected_errors: [] @@ -18071,7 +18942,7 @@ sentences: is_clean: true domain: news register: formal - difficulty_tier: null + difficulty_tier: scope: spelling notes: Clean sentence from pragmatics dataset. expected_errors: [] @@ -18080,7 +18951,7 @@ sentences: is_clean: true domain: technical register: formal - difficulty_tier: null + difficulty_tier: scope: spelling notes: Clean sentence from pragmatics dataset. expected_errors: [] @@ -18089,7 +18960,7 @@ sentences: is_clean: true domain: religious register: formal - difficulty_tier: null + difficulty_tier: scope: spelling notes: Clean sentence from pragmatics dataset. expected_errors: [] @@ -18098,7 +18969,7 @@ sentences: is_clean: true domain: literary register: formal - difficulty_tier: null + difficulty_tier: scope: spelling notes: Clean sentence from pragmatics dataset. expected_errors: [] @@ -18107,7 +18978,7 @@ sentences: is_clean: true domain: general register: formal - difficulty_tier: null + difficulty_tier: scope: spelling notes: Clean sentence from pragmatics dataset. expected_errors: [] @@ -18116,7 +18987,7 @@ sentences: is_clean: true domain: conversational register: formal - difficulty_tier: null + difficulty_tier: scope: spelling notes: Clean sentence from pragmatics dataset. 
expected_errors: [] @@ -18125,7 +18996,7 @@ sentences: is_clean: true domain: academic register: formal - difficulty_tier: null + difficulty_tier: scope: spelling notes: Clean sentence from pragmatics dataset. expected_errors: [] @@ -18134,7 +19005,7 @@ sentences: is_clean: true domain: news register: formal - difficulty_tier: null + difficulty_tier: scope: spelling notes: Clean sentence from pragmatics dataset. expected_errors: [] @@ -18143,7 +19014,7 @@ sentences: is_clean: true domain: technical register: formal - difficulty_tier: null + difficulty_tier: scope: spelling notes: Clean sentence from pragmatics dataset. expected_errors: [] @@ -18152,7 +19023,7 @@ sentences: is_clean: true domain: religious register: formal - difficulty_tier: null + difficulty_tier: scope: spelling notes: Clean sentence from pragmatics dataset. expected_errors: [] @@ -18161,7 +19032,7 @@ sentences: is_clean: true domain: literary register: formal - difficulty_tier: null + difficulty_tier: scope: spelling notes: Clean sentence from pragmatics dataset. expected_errors: [] @@ -18170,7 +19041,7 @@ sentences: is_clean: true domain: general register: formal - difficulty_tier: null + difficulty_tier: scope: spelling notes: Clean sentence from pragmatics dataset. expected_errors: [] @@ -18179,7 +19050,7 @@ sentences: is_clean: true domain: conversational register: formal - difficulty_tier: null + difficulty_tier: scope: spelling notes: Clean sentence from pragmatics dataset. expected_errors: [] @@ -18188,7 +19059,7 @@ sentences: is_clean: true domain: academic register: formal - difficulty_tier: null + difficulty_tier: scope: spelling notes: Clean sentence from pragmatics dataset. expected_errors: [] @@ -18197,7 +19068,7 @@ sentences: is_clean: true domain: news register: formal - difficulty_tier: null + difficulty_tier: scope: spelling notes: Clean sentence from pragmatics dataset. 
expected_errors: [] @@ -18206,7 +19077,7 @@ sentences: is_clean: true domain: technical register: formal - difficulty_tier: null + difficulty_tier: scope: spelling notes: Clean sentence from pragmatics dataset. expected_errors: [] @@ -18215,7 +19086,7 @@ sentences: is_clean: true domain: religious register: formal - difficulty_tier: null + difficulty_tier: scope: spelling notes: Clean sentence from pragmatics dataset. expected_errors: [] @@ -18224,7 +19095,7 @@ sentences: is_clean: true domain: literary register: formal - difficulty_tier: null + difficulty_tier: scope: spelling notes: Clean sentence from pragmatics dataset. expected_errors: [] @@ -18233,7 +19104,7 @@ sentences: is_clean: true domain: general register: formal - difficulty_tier: null + difficulty_tier: scope: spelling notes: Clean sentence from pragmatics dataset. expected_errors: [] @@ -18242,7 +19113,7 @@ sentences: is_clean: true domain: conversational register: formal - difficulty_tier: null + difficulty_tier: scope: spelling notes: Clean sentence from pragmatics dataset. expected_errors: [] @@ -18251,7 +19122,7 @@ sentences: is_clean: true domain: academic register: formal - difficulty_tier: null + difficulty_tier: scope: spelling notes: Clean sentence from pragmatics dataset. expected_errors: [] @@ -18260,7 +19131,7 @@ sentences: is_clean: true domain: news register: formal - difficulty_tier: null + difficulty_tier: scope: spelling notes: Clean sentence from pragmatics dataset. expected_errors: [] @@ -18269,7 +19140,7 @@ sentences: is_clean: true domain: technical register: formal - difficulty_tier: null + difficulty_tier: scope: spelling notes: Clean sentence from pragmatics dataset. expected_errors: [] @@ -18278,7 +19149,7 @@ sentences: is_clean: true domain: religious register: formal - difficulty_tier: null + difficulty_tier: scope: spelling notes: Clean sentence from pragmatics dataset. 
expected_errors: [] @@ -18287,7 +19158,7 @@ sentences: is_clean: true domain: literary register: formal - difficulty_tier: null + difficulty_tier: scope: spelling notes: Clean sentence from pragmatics dataset. expected_errors: [] @@ -18296,7 +19167,7 @@ sentences: is_clean: true domain: general register: formal - difficulty_tier: null + difficulty_tier: scope: spelling notes: Clean sentence from pragmatics dataset. expected_errors: [] @@ -18305,7 +19176,7 @@ sentences: is_clean: true domain: conversational register: informal - difficulty_tier: null + difficulty_tier: scope: spelling notes: Clean sentence from pragmatics dataset. expected_errors: [] @@ -18314,7 +19185,7 @@ sentences: is_clean: true domain: academic register: informal - difficulty_tier: null + difficulty_tier: scope: spelling notes: Clean sentence from pragmatics dataset. expected_errors: [] @@ -18323,7 +19194,7 @@ sentences: is_clean: true domain: news register: informal - difficulty_tier: null + difficulty_tier: scope: spelling notes: Clean sentence from pragmatics dataset. expected_errors: [] @@ -18332,7 +19203,7 @@ sentences: is_clean: true domain: technical register: informal - difficulty_tier: null + difficulty_tier: scope: spelling notes: Clean sentence from pragmatics dataset. expected_errors: [] @@ -18341,7 +19212,7 @@ sentences: is_clean: true domain: religious register: informal - difficulty_tier: null + difficulty_tier: scope: spelling notes: Clean sentence from pragmatics dataset. expected_errors: [] @@ -18350,7 +19221,7 @@ sentences: is_clean: true domain: literary register: informal - difficulty_tier: null + difficulty_tier: scope: spelling notes: Clean sentence from pragmatics dataset. expected_errors: [] @@ -18359,7 +19230,7 @@ sentences: is_clean: true domain: general register: informal - difficulty_tier: null + difficulty_tier: scope: spelling notes: Clean sentence from pragmatics dataset. 
expected_errors: [] @@ -18368,7 +19239,7 @@ sentences: is_clean: true domain: conversational register: informal - difficulty_tier: null + difficulty_tier: scope: spelling notes: Clean sentence from pragmatics dataset. expected_errors: [] @@ -18377,7 +19248,7 @@ sentences: is_clean: true domain: academic register: informal - difficulty_tier: null + difficulty_tier: scope: spelling notes: Clean sentence from pragmatics dataset. expected_errors: [] @@ -18386,7 +19257,7 @@ sentences: is_clean: true domain: news register: informal - difficulty_tier: null + difficulty_tier: scope: spelling notes: Clean sentence from pragmatics dataset. expected_errors: [] @@ -18395,7 +19266,7 @@ sentences: is_clean: true domain: technical register: informal - difficulty_tier: null + difficulty_tier: scope: spelling notes: Clean sentence from pragmatics dataset. expected_errors: [] @@ -18404,7 +19275,7 @@ sentences: is_clean: true domain: religious register: informal - difficulty_tier: null + difficulty_tier: scope: spelling notes: Clean sentence from pragmatics dataset. expected_errors: [] @@ -18413,7 +19284,7 @@ sentences: is_clean: true domain: literary register: informal - difficulty_tier: null + difficulty_tier: scope: spelling notes: Clean sentence from pragmatics dataset. expected_errors: [] @@ -18422,7 +19293,7 @@ sentences: is_clean: true domain: general register: informal - difficulty_tier: null + difficulty_tier: scope: spelling notes: Clean sentence from pragmatics dataset. expected_errors: [] @@ -18431,7 +19302,7 @@ sentences: is_clean: true domain: conversational register: informal - difficulty_tier: null + difficulty_tier: scope: spelling notes: Clean sentence from pragmatics dataset. expected_errors: [] @@ -18440,7 +19311,7 @@ sentences: is_clean: true domain: news register: colloquial - difficulty_tier: null + difficulty_tier: scope: spelling notes: Clean sentence from pragmatics dataset. 
expected_errors: [] @@ -18449,7 +19320,7 @@ sentences: is_clean: true domain: technical register: colloquial - difficulty_tier: null + difficulty_tier: scope: spelling notes: Clean sentence from pragmatics dataset. expected_errors: [] @@ -18458,7 +19329,7 @@ sentences: is_clean: true domain: religious register: colloquial - difficulty_tier: null + difficulty_tier: scope: spelling notes: Clean sentence from pragmatics dataset. expected_errors: [] @@ -18467,7 +19338,7 @@ sentences: is_clean: true domain: literary register: colloquial - difficulty_tier: null + difficulty_tier: scope: spelling notes: Clean sentence from pragmatics dataset. expected_errors: [] @@ -18476,7 +19347,7 @@ sentences: is_clean: true domain: general register: colloquial - difficulty_tier: null + difficulty_tier: scope: spelling notes: Clean sentence from pragmatics dataset. expected_errors: [] @@ -18485,7 +19356,7 @@ sentences: is_clean: true domain: conversational register: colloquial - difficulty_tier: null + difficulty_tier: scope: spelling notes: Clean sentence from pragmatics dataset. expected_errors: [] @@ -18494,7 +19365,7 @@ sentences: is_clean: true domain: academic register: formal - difficulty_tier: null + difficulty_tier: scope: spelling notes: Clean sentence from pragmatics dataset. expected_errors: [] @@ -18503,7 +19374,7 @@ sentences: is_clean: true domain: news register: colloquial - difficulty_tier: null + difficulty_tier: scope: spelling notes: Clean sentence from pragmatics dataset. expected_errors: [] @@ -18512,7 +19383,7 @@ sentences: is_clean: true domain: technical register: colloquial - difficulty_tier: null + difficulty_tier: scope: spelling notes: Clean sentence from pragmatics dataset. expected_errors: [] @@ -18521,7 +19392,7 @@ sentences: is_clean: true domain: religious register: colloquial - difficulty_tier: null + difficulty_tier: scope: spelling notes: Clean sentence from pragmatics dataset. 
expected_errors: [] @@ -18530,7 +19401,7 @@ sentences: is_clean: true domain: literary register: colloquial - difficulty_tier: null + difficulty_tier: scope: spelling notes: Clean sentence from pragmatics dataset. expected_errors: [] @@ -18539,7 +19410,7 @@ sentences: is_clean: true domain: general register: formal - difficulty_tier: null + difficulty_tier: scope: spelling notes: Clean sentence from pragmatics dataset. expected_errors: [] @@ -18548,7 +19419,7 @@ sentences: is_clean: true domain: conversational register: formal - difficulty_tier: null + difficulty_tier: scope: spelling notes: Clean sentence from pragmatics dataset. expected_errors: [] @@ -18557,7 +19428,7 @@ sentences: is_clean: true domain: academic register: colloquial - difficulty_tier: null + difficulty_tier: scope: spelling notes: Clean sentence from pragmatics dataset. expected_errors: [] @@ -18566,7 +19437,7 @@ sentences: is_clean: true domain: news register: colloquial - difficulty_tier: null + difficulty_tier: scope: spelling notes: Clean sentence from pragmatics dataset. expected_errors: [] @@ -18575,7 +19446,7 @@ sentences: is_clean: true domain: technical register: colloquial - difficulty_tier: null + difficulty_tier: scope: spelling notes: Clean sentence from pragmatics dataset. expected_errors: [] @@ -18584,7 +19455,7 @@ sentences: is_clean: true domain: religious register: formal - difficulty_tier: null + difficulty_tier: scope: spelling notes: Clean sentence from pragmatics dataset. expected_errors: [] @@ -18593,7 +19464,7 @@ sentences: is_clean: true domain: literary register: colloquial - difficulty_tier: null + difficulty_tier: scope: spelling notes: Clean sentence from pragmatics dataset. expected_errors: [] @@ -18602,7 +19473,7 @@ sentences: is_clean: true domain: general register: colloquial - difficulty_tier: null + difficulty_tier: scope: spelling notes: Clean sentence from pragmatics dataset. 
expected_errors: [] @@ -18611,7 +19482,7 @@ sentences: is_clean: true domain: conversational register: colloquial - difficulty_tier: null + difficulty_tier: scope: spelling notes: Clean sentence from pragmatics dataset. expected_errors: [] @@ -18620,7 +19491,7 @@ sentences: is_clean: true domain: academic register: colloquial - difficulty_tier: null + difficulty_tier: scope: spelling notes: Clean sentence from pragmatics dataset. expected_errors: [] @@ -18629,7 +19500,7 @@ sentences: is_clean: true domain: news register: colloquial - difficulty_tier: null + difficulty_tier: scope: spelling notes: Clean sentence from pragmatics dataset. expected_errors: [] @@ -18638,7 +19509,7 @@ sentences: is_clean: true domain: technical register: colloquial - difficulty_tier: null + difficulty_tier: scope: spelling notes: Clean sentence from pragmatics dataset. expected_errors: [] @@ -18647,7 +19518,7 @@ sentences: is_clean: true domain: religious register: colloquial - difficulty_tier: null + difficulty_tier: scope: spelling notes: Clean sentence from pragmatics dataset. expected_errors: [] @@ -18656,7 +19527,7 @@ sentences: is_clean: true domain: literary register: colloquial - difficulty_tier: null + difficulty_tier: scope: spelling notes: Clean sentence from pragmatics dataset. expected_errors: [] @@ -18665,7 +19536,7 @@ sentences: is_clean: true domain: general register: colloquial - difficulty_tier: null + difficulty_tier: scope: spelling notes: Clean sentence from pragmatics dataset. expected_errors: [] @@ -18674,7 +19545,7 @@ sentences: is_clean: true domain: conversational register: colloquial - difficulty_tier: null + difficulty_tier: scope: spelling notes: Clean sentence from pragmatics dataset. expected_errors: [] @@ -18683,7 +19554,7 @@ sentences: is_clean: true domain: academic register: colloquial - difficulty_tier: null + difficulty_tier: scope: spelling notes: Clean sentence from pragmatics dataset. 
expected_errors: [] @@ -18692,7 +19563,7 @@ sentences: is_clean: true domain: news register: colloquial - difficulty_tier: null + difficulty_tier: scope: spelling notes: Clean sentence from pragmatics dataset. expected_errors: [] @@ -18701,7 +19572,7 @@ sentences: is_clean: true domain: technical register: formal - difficulty_tier: null + difficulty_tier: scope: spelling notes: Clean sentence from pragmatics dataset. expected_errors: [] @@ -18710,7 +19581,7 @@ sentences: is_clean: true domain: religious register: colloquial - difficulty_tier: null + difficulty_tier: scope: spelling notes: Clean sentence from pragmatics dataset. expected_errors: [] @@ -18719,7 +19590,7 @@ sentences: is_clean: true domain: literary register: colloquial - difficulty_tier: null + difficulty_tier: scope: spelling notes: Clean sentence from pragmatics dataset. expected_errors: [] @@ -18728,7 +19599,7 @@ sentences: is_clean: true domain: general register: colloquial - difficulty_tier: null + difficulty_tier: scope: spelling notes: Clean sentence from pragmatics dataset. expected_errors: [] @@ -18737,7 +19608,7 @@ sentences: is_clean: true domain: conversational register: colloquial - difficulty_tier: null + difficulty_tier: scope: spelling notes: Clean sentence from pragmatics dataset. expected_errors: [] @@ -18746,7 +19617,7 @@ sentences: is_clean: true domain: academic register: informal - difficulty_tier: null + difficulty_tier: scope: spelling notes: Clean sentence from pragmatics dataset. expected_errors: [] @@ -18755,7 +19626,7 @@ sentences: is_clean: true domain: news register: colloquial - difficulty_tier: null + difficulty_tier: scope: spelling notes: Clean sentence from pragmatics dataset. expected_errors: [] @@ -18764,7 +19635,7 @@ sentences: is_clean: true domain: technical register: colloquial - difficulty_tier: null + difficulty_tier: scope: spelling notes: Clean sentence from pragmatics dataset. 
expected_errors: [] @@ -18773,7 +19644,7 @@ sentences: is_clean: true domain: religious register: colloquial - difficulty_tier: null + difficulty_tier: scope: spelling notes: Clean sentence from pragmatics dataset. expected_errors: [] @@ -18782,7 +19653,7 @@ sentences: is_clean: true domain: literary register: colloquial - difficulty_tier: null + difficulty_tier: scope: spelling notes: Clean sentence from pragmatics dataset. expected_errors: [] @@ -18791,7 +19662,7 @@ sentences: is_clean: true domain: general register: formal - difficulty_tier: null + difficulty_tier: scope: spelling notes: Clean sentence from pragmatics dataset. expected_errors: [] diff --git a/benchmarks/run_benchmark.py b/benchmarks/run_benchmark.py index 092e44e..76d8d39 100644 --- a/benchmarks/run_benchmark.py +++ b/benchmarks/run_benchmark.py @@ -753,6 +753,9 @@ def run_benchmark( if _os.environ.get("MSC_USE_ORTHO_RESCUE", "").lower() in ("1", "true", "yes", "on"): config.validation.compound_split_ortho_insertion_rescue = True print(" compound_split_ortho_insertion_rescue: ENABLED (via MSC_USE_ORTHO_RESCUE)") + if _os.environ.get("MSC_DETECT_AW_VOWEL_UNMASK", "").lower() in ("1", "true", "yes", "on"): + config.validation.detect_aw_vowel_unmask = True + print(" detect_aw_vowel_unmask: ENABLED (via MSC_DETECT_AW_VOWEL_UNMASK)") sme_bigram_env = _os.environ.get("MSC_SEG_MERGE_BIGRAM_THRESHOLD", "").strip() if sme_bigram_env: try: diff --git a/src/myspellchecker/core/config/validation_configs.py b/src/myspellchecker/core/config/validation_configs.py index 424ecb4..d86b52a 100644 --- a/src/myspellchecker/core/config/validation_configs.py +++ b/src/myspellchecker/core/config/validation_configs.py @@ -96,6 +96,20 @@ class ValidationConfig(BaseModel): le=1.0, description="Confidence score for medial confusion corrections (ျ vs ြ)", ) + detect_aw_vowel_unmask: bool = Field( + default=False, + description=( + "Enable the pre-normalization aw-vowel un-mask detector. 
The " + "pre-lookup normalizer silently repairs aw-vowel typos " + "(flat ော → tall ေါ after {ပ,ခ,ဒ}; stray ေါ → ော after other " + "bases) before the validator judges the token, masking genuine " + "spelling errors like ခော်. When True, a raw-text detector " + "emits a confusable error with the canonical form as the single " + "suggestion for tokens that are dictionary-OOV as typed but " + "valid after the aw-vowel repair. Loanword bases {ဂ,င,ဝ} are " + "excluded from the tall→flat direction. Default-off." + ), + ) raise_on_strategy_error: bool = Field( default=False, description=( diff --git a/src/myspellchecker/core/detectors/pre_normalization.py b/src/myspellchecker/core/detectors/pre_normalization.py index 0d2f15b..e4b8a11 100644 --- a/src/myspellchecker/core/detectors/pre_normalization.py +++ b/src/myspellchecker/core/detectors/pre_normalization.py @@ -1,9 +1,10 @@ """Pre-normalization text-level detectors. These detectors run on the raw (un-normalized) text to catch errors that -normalization would silently fix, destroying the evidence. They all -return ``list[SyllableError]`` — the caller merges them into the main -error list after normalization. +normalization would silently fix, destroying the evidence. They return +``list[SyllableError]`` (or ``list[WordError]`` for the aw-vowel un-mask +detector) — the caller merges them into the main error list after +normalization. Extracted from ``spellchecker.py`` to reduce file size while preserving the exact same method signatures and behaviour. 
@@ -11,10 +12,12 @@ from __future__ import annotations +import re from typing import TYPE_CHECKING, Any from myspellchecker.core.constants import ( ET_BROKEN_VIRAMA, + ET_CONFUSABLE_ERROR, ET_INCOMPLETE_STACKING, ET_LEADING_VOWEL_E, ET_MEDIAL_ORDER_ERROR, @@ -22,6 +25,7 @@ ET_VOWEL_AFTER_ASAT, ET_ZAWGYI_ENCODING, MEDIALS, + SKIPPED_CONTEXT_WORDS, VOWEL_SIGNS, ) @@ -30,7 +34,8 @@ from myspellchecker.providers.base import DictionaryProvider from myspellchecker.core.detector_data import TEXT_DETECTOR_CONFIDENCES from myspellchecker.core.detectors.utils import iter_occurrences -from myspellchecker.core.response import Error, SyllableError +from myspellchecker.core.response import Error, Suggestion, SyllableError, WordError +from myspellchecker.text.normalize import _ROUND_BOTTOM_CONSONANTS_FOR_TALL_AA, normalize from myspellchecker.text.zawgyi_support import convert_zawgyi_to_unicode, get_zawgyi_detector @@ -44,6 +49,7 @@ class PreNormalizationDetectorsMixin: # --- Type stubs for attributes provided by SpellChecker --- config: "SpellCheckerConfig" provider: "DictionaryProvider" + segmenter: Any logger: Any # ------------------------------------------------------------------ # @@ -109,6 +115,47 @@ class PreNormalizationDetectorsMixin: "ေကာင်း": ["ကောင်း"], } + # --- Aw-vowel (ော ↔ ေါ) un-mask detector data ------------------------- + # Prefilter for potential aw-vowel canonicalization targets: either a + # flat AA after a round-bottom whitelist consonant ({ပ,ခ,ဒ} + ေ + ာ) or + # a stray TALL AA after any other base character ( + ေ + ါ). 
+ _AW_VOWEL_VIOLATION_RE = re.compile( + "[ပခဒ]ော" # {ပ,ခ,ဒ} + ေ + flat ာ + "|[^ပခဒ]ေါ" # other base + ေ + tall ါ + ) + # Classical round-bottom consonants deliberately EXCLUDED from the narrow + # normalizer whitelist (see _ROUND_BOTTOM_CONSONANTS_FOR_TALL_AA): the + # post-flat-AA-migration dictionary keys these words FLAT, but modern + # loanwords are legitimately written with TALL AA after them (ဝေါလ် + # "Wall", ဂေါ ...), so the tall→flat unmask direction must not fire on + # them — they are exactly the clean-text false-positive risk measured in + # the unmask-probe-01 kill-gate (2 clean flips, both in this set). + _AW_TALL_AA_AMBIGUOUS_BASES: frozenset[str] = frozenset( + { + "ဂ", # ဂ GA + "င", # င NGA + "ဝ", # ဝ WA + } + ) + + # Particles/function words that glue onto a typo word inside a + # whitespace chunk. A chunk containing one of these as a segmenter + # token is a syntactic phrase, not a lexical compound, so the + # whole-chunk emission granularity is suppressed for it. + # SKIPPED_CONTEXT_WORDS (the context-validator particle set) plus glue + # particles it lacks (colloquial plural/topic/attributive/directional). + _AW_CHUNK_PARTICLES: frozenset[str] = frozenset(SKIPPED_CONTEXT_WORDS) | frozenset( + { + "တွေ", # plural (colloquial) + "ဟာ", # topic marker (colloquial) + "တဲ့", # attributive (colloquial) + "သို့", # directional (formal) + "မယ့်", # future attributive (colloquial) + "ပေါ်", # locative postposition ("on") + "အပေါ်", # locative postposition ("upon/towards") + } + ) + # Vowel signs that may appear wrongly before a medial sign. # Canonical Myanmar order: consonant + medials + vowels, so vowel-before-medial # is always wrong and normalization silently fixes it. @@ -525,6 +572,222 @@ def _detect_vowel_medial_reorder(self, text: str) -> list[SyllableError]: return errors + def _detect_aw_vowel_unmask_errors(self, text: str) -> list[WordError]: + """Detect aw-vowel (ော ↔ ေါ) typos that normalization silently repairs. 
+ + ``normalize_e_vowel_tall_aa`` canonicalizes the aw-vowel by the shape + of the preceding base character ({ပ,ခ,ဒ} → tall ါ, everything else → + flat ာ) BEFORE the validator ever judges the token, so a genuine + spelling error like ``ခော်`` (flat AA after round-bottom ခ) is + rewritten to the dictionary-valid ``ခေါ်`` and never flagged. This + detector runs on the raw text and emits the silently-applied repair + as an explicit confusable error. + + Gates (all must hold per emitted span): + - every raw↔canonical difference inside the span is a guarded + aw-vowel substitution (any other normalization diff → skip); + - flat→tall direction requires the base in the round-bottom + whitelist {ပ,ခ,ဒ}; tall→flat requires the base OUTSIDE + ``_AW_TALL_AA_AMBIGUOUS_BASES`` ({ဂ,င,ဝ} — loanword guard) and a + Myanmar consonant or medial sign; + - the canonical form is dictionary-valid (it IS the suggestion); + - the raw form is dictionary-OOV. + + Span policy (one emission per typo, see _aw_vowel_candidate_spans): + the covering segmenter word by default; the whole whitespace chunk + when it is a multi-word compound of CONTENT words (no glued + particles/postpositions — ကျောင်းမှာ emits ကျောင်း, but the lexical + compound စိတ်ပေါက် emits whole); the violated syllable only as a + fallback when the covering word fails the dictionary gates. + + Chunks containing a ``_VOWEL_REORDER_ERRORS`` key are skipped — that + detector owns those forms and emits an ambiguity-aware suggestion + list (e.g. ကေါင်း may be a ခ→က consonant typo whose correction is + NOT the aw-canonical form). + + The suggestion is constructed deterministically from the normalizer + (not SymSpell ranking), so top-1 is the canonical form by design. + Emitted errors carry ``_structural_early_exit`` so downstream + dedup/meta filters preserve them. + + Must run BEFORE normalization, which destroys the evidence. + Gated by ``config.validation.detect_aw_vowel_unmask`` (default off) + at the ``_prepare_text`` call site. 
+ """ + if not text or self._AW_VOWEL_VIOLATION_RE.search(text) is None: + return [] + + errors: list[WordError] = [] + confidence = TEXT_DETECTOR_CONFIDENCES.get("aw_vowel_unmask", 0.95) + current_idx = 0 + for chunk in text.split(): + idx = text.find(chunk, current_idx) + if idx >= 0: + current_idx = idx + len(chunk) + else: + continue + if self._AW_VOWEL_VIOLATION_RE.search(chunk) is None: + continue + # Defer to the dedicated vowel-reorder detector for its known + # ambiguous forms (ကေါင်း can be a ခ→က consonant typo): emitting + # here would displace its multi-candidate suggestion list. + if any(key in chunk for key in self._VOWEL_REORDER_ERRORS): + continue + canon_chunk = normalize(chunk) + # Positions must map 1:1 onto the raw chunk; any length-changing + # normalization step (Zawgyi conversion, char dedup) voids that, + # so skip the chunk entirely — other detectors own those cases. + if len(canon_chunk) != len(chunk) or canon_chunk == chunk: + continue + # Cheap chunk-level guard BEFORE any provider/segmenter work: + # every diff must be a guarded aw-vowel swap. This also rejects + # the legitimately-tall ဂ/ဝ/င words (ဝေါဟာရ, ဂေါက်) that the + # complement flattening rewrites, avoiding a wasted + # segmentation pass on clean text. 
+ if not self._aw_vowel_diffs_guarded(chunk, canon_chunk): + continue + diff_pos = [ + i for i, (a, b) in enumerate(zip(chunk, canon_chunk, strict=True)) if a != b + ] + + for start, end in self._aw_vowel_candidate_spans(chunk, canon_chunk, diff_pos): + error = WordError( + text=chunk[start:end], + position=idx + start, + suggestions=[ + Suggestion( + canon_chunk[start:end], + confidence=confidence, + source="aw_vowel_unmask", + ) + ], + confidence=confidence, + error_type=ET_CONFUSABLE_ERROR, + ) + error._structural_early_exit = True + errors.append(error) + return errors + + def _aw_vowel_candidate_spans( + self, chunk: str, canon_chunk: str, diff_pos: list[int] + ) -> list[tuple[int, int]]: + """Choose ONE gated emission span per aw-vowel violation. + + Spans derive from the CANONICAL chunk (where the repaired word + segments exactly as the pipeline will later see it). Policy: + + 1. If the chunk is a multi-word compound of CONTENT words (no + particles/postpositions) whose canonical whole form passes the + dictionary gates, the typo unit is the whole compound + (စိတ်ပေါက ်-class) — emit the chunk. + 2. Otherwise emit each diff-covering segmenter word that passes the + gates (particle glue stays out of the span: ကျောင်းမှာ → ကျောင်း). + 3. If a covering word fails the gates (noisy segmentation, OOV + canonical), fall back to the violated syllable(s) inside it. + + A span passes the gates iff its raw and canonical slices differ, all + differences are guarded aw-vowel swaps, the canonical slice is + dictionary-valid and the raw slice is dictionary-OOV. 
+ """ + + def _walk(parts: list[str]) -> list[tuple[int, int]]: + out: list[tuple[int, int]] = [] + cursor = 0 + for part in parts: + start = canon_chunk.find(part, cursor) + if start < 0: + continue + cursor = start + len(part) + out.append((start, cursor)) + return out + + def _gated(start: int, end: int) -> bool: + raw_tok = chunk[start:end] + canon_tok = canon_chunk[start:end] + if raw_tok == canon_tok: + return False + if not self._aw_vowel_diffs_guarded(raw_tok, canon_tok): + return False + if not self.provider.is_valid_word(canon_tok): + return False + return not self.provider.is_valid_word(raw_tok) + + try: + tok_spans = _walk(self.segmenter.segment_words(canon_chunk)) + except Exception: # pragma: no cover - segmenter backend unavailable + tok_spans = [] + + if not tok_spans: + # Segmenter unavailable/empty — fall back to the whole chunk. + return [(0, len(chunk))] if _gated(0, len(chunk)) else [] + + whole = (0, len(chunk)) + if ( + len(tok_spans) > 1 + and tok_spans[0][0] == 0 + and tok_spans[-1][1] == len(canon_chunk) + and all(canon_chunk[s:e] not in self._AW_CHUNK_PARTICLES for s, e in tok_spans) + and _gated(*whole) + ): + # All-content multi-word compound (e.g. စိတ်ပေါက်): the + # orthographic typo unit is the whole compound. + return [whole] + + spans: list[tuple[int, int]] = [] + syl_spans: list[tuple[int, int]] | None = None + for start, end in tok_spans: + if not any(start <= i < end for i in diff_pos): + continue + if _gated(start, end): + spans.append((start, end)) + continue + # Covering word failed the gates — violated-syllable fallback. 
+ if syl_spans is None: + try: + syl_spans = _walk(self.segmenter.segment_syllables(canon_chunk)) + except Exception: # pragma: no cover - segmenter backend unavailable + syl_spans = [] + for s, e in syl_spans: + if s >= start and e <= end and any(s <= i < e for i in diff_pos) and _gated(s, e): + spans.append((s, e)) + return spans + + def _aw_vowel_diffs_guarded(self, raw_tok: str, canon_tok: str) -> bool: + """True iff every raw↔canonical difference is a guarded aw-vowel swap. + + Each differing position must be an ာ/ါ ↔ ါ/ာ substitution in the + aw-vowel rime (preceded by ေ), with the direction allowed by the + base character one position earlier: + + - flat ာ → tall ါ: base must be in the round-bottom whitelist + ({ပ,ခ,ဒ}) — the only direction the normalizer repairs that way; + - tall ါ → flat ာ: base must NOT be in the loanword-ambiguous set + ({ဂ,င,ဝ}) and must be a Myanmar consonant (U+1000–U+1021) or + medial sign (U+103B–U+103E). + """ + aw_pair = ("ာ", "ါ") # ာ AA, ါ TALL AA + diffs = [i for i, (a, b) in enumerate(zip(raw_tok, canon_tok, strict=True)) if a != b] + if not diffs: + return False + for i in diffs: + raw_ch, canon_ch = raw_tok[i], canon_tok[i] + if raw_ch not in aw_pair or canon_ch not in aw_pair: + return False + if i < 2 or raw_tok[i - 1] != "ေ": # ေ + return False + base = raw_tok[i - 2] + if raw_ch == "ာ": + # flat → tall: only valid after the round-bottom whitelist. + if base not in _ROUND_BOTTOM_CONSONANTS_FOR_TALL_AA: + return False + else: + # tall → flat: loanword guard + structural base check. + if base in self._AW_TALL_AA_AMBIGUOUS_BASES: + return False + if not ("က" <= base <= "အ" or "ျ" <= base <= "ှ"): + return False + return True + def _detect_incomplete_stacking(self, text: str) -> list[SyllableError]: """Detect incomplete Pali/Sanskrit stacking (virama present, stacked consonant missing). 
diff --git a/src/myspellchecker/core/spellchecker.py b/src/myspellchecker/core/spellchecker.py index 747bcdf..b549c0f 100644 --- a/src/myspellchecker/core/spellchecker.py +++ b/src/myspellchecker/core/spellchecker.py @@ -754,7 +754,8 @@ def _prepare_text(self, text: str, level: ValidationLevel) -> dict[str, Any]: medial_order_errors, duplicate_diacritic_errors, leading_vowel_e_errors, vowel_reorder_errors, vowel_medial_reorder_errors, incomplete_stacking_errors, - vowel_after_dotbelow_errors: individual error lists for merging + vowel_after_dotbelow_errors, aw_vowel_unmask_errors: + individual error lists for merging """ self._thread_local.strategy_debug_telemetry = {} self._thread_local.last_strategy_debug_telemetry = {} @@ -800,6 +801,14 @@ def _prepare_text(self, text: str, level: ValidationLevel) -> dict[str, Any]: # Detect vowel after dot-below BEFORE normalization reorders vowels vowel_after_dotbelow_errors = self._detect_vowel_after_dotbelow(text) + # Detect aw-vowel (ော↔ေါ) typos BEFORE normalization silently repairs + # them into dictionary-valid forms (detection-side un-mask, default-off) + aw_vowel_unmask_errors = ( + self._detect_aw_vowel_unmask_errors(text) + if self.config.validation.detect_aw_vowel_unmask + else [] + ) + normalized_text = self._normalize_text(text, zawgyi_config) # Build position map from original to normalized text so that @@ -821,6 +830,7 @@ def _prepare_text(self, text: str, level: ValidationLevel) -> dict[str, Any]: + vowel_reorder_errors + vowel_medial_reorder_errors + vowel_after_dotbelow_errors + + aw_vowel_unmask_errors ) return { @@ -841,6 +851,7 @@ def _prepare_text(self, text: str, level: ValidationLevel) -> dict[str, Any]: "vowel_medial_reorder_errors": vowel_medial_reorder_errors, "incomplete_stacking_errors": incomplete_stacking_errors, "vowel_after_dotbelow_errors": vowel_after_dotbelow_errors, + "aw_vowel_unmask_errors": aw_vowel_unmask_errors, } def _run_validation( @@ -885,6 +896,7 @@ def _run_validation( 
"vowel_reorder_errors", "vowel_medial_reorder_errors", "vowel_after_dotbelow_errors", + "aw_vowel_unmask_errors", ): for err in prepared.get(key, []): remap_pre_norm_error(err, offset_map) @@ -903,6 +915,7 @@ def _run_validation( self._merge_pre_norm_errors(errors, prepared["vowel_reorder_errors"]) self._merge_pre_norm_errors(errors, prepared["vowel_medial_reorder_errors"]) self._merge_pre_norm_errors(errors, prepared["vowel_after_dotbelow_errors"]) + self._merge_pre_norm_errors(errors, prepared["aw_vowel_unmask_errors"]) # Final span-overlap dedup for pre-normalization errors errors = self._dedup_pre_norm_overlaps(errors) diff --git a/src/myspellchecker/rules/detector_confidences.yaml b/src/myspellchecker/rules/detector_confidences.yaml index 5fddb2c..8141c92 100644 --- a/src/myspellchecker/rules/detector_confidences.yaml +++ b/src/myspellchecker/rules/detector_confidences.yaml @@ -19,6 +19,7 @@ confidences: # Pre-normalization errors zawgyi_detected: 0.95 zero_width_chars: 0.95 + aw_vowel_unmask: 0.95 # Diacritic / structural errors missing_asat: 0.90 diff --git a/src/myspellchecker/text/normalize.py b/src/myspellchecker/text/normalize.py index 8bf760a..1932a05 100644 --- a/src/myspellchecker/text/normalize.py +++ b/src/myspellchecker/text/normalize.py @@ -339,13 +339,16 @@ def normalize_e_vowel_tall_aa(text: str) -> str: round-bottom set; see the ``_ROUND_BOTTOM_CONSONANTS_FOR_TALL_AA`` block comment for the rationale and the criterion to widen it. - Scope limit: the rewrite fires only on the bare pattern - ``consonant + ေ + {ာ, ါ}`` at adjacent positions. Medial or stacking - interpositions — e.g. ``ပ + ြ + ေ + ာ`` (``ပြော``) or ``ခ + ျ + ေ + ာ`` - — are *not* matched and pass through unmodified. This is deliberate - (the round-bottom / tall-AA interaction with medials is still under - benchmark validation), so expanding the whitelist or the match pattern - without per-consonant verification is a regression risk. 
+ Medial clusters: the scan keys on the character immediately before ``ေ``, which for a medial-bearing cluster is the *medial sign* (ျ ြ ွ ှ), not the base consonant. Medials are never in the round-bottom whitelist, so such clusters always take the complement branch and flatten stray ``ေါ`` to ``ော`` (e.g. ``ကျေါင်း → ကျောင်း``). This is deliberate and orthographically correct: a medial changes the glyph bottom, so medial-bearing clusters take flat AA regardless of the base consonant (``ပျော``, ``ပြော``, ``ကျော``, ...) — even when the base consonant alone would take TALL AA. Canonical forms like ``ပျော်`` already use flat AA and pass through unchanged. Args: text: Input Myanmar text. @@ -365,6 +368,10 @@ def normalize_e_vowel_tall_aa(text: str) -> str: 'ကောင်း' >>> normalize_e_vowel_tall_aa("ဖော်") # ဖ is outside whitelist 'ဖော်' + >>> normalize_e_vowel_tall_aa("ကျေါင်း") # wrongly tall after medial ျ + 'ကျောင်း' + >>> normalize_e_vowel_tall_aa("ပျော်") # medial cluster keeps flat AA + 'ပျော်' Sources: - Myanmar Language Commission, *မြန်မာ သတ်ပုံ ကျမ်း* (1978 rev. 2003). @@ -383,9 +390,20 @@ def normalize_e_vowel_tall_aa(text: str) -> str: n = len(text) while i < n: ch = text[i] - # Looking for consonant + ေ + {ာ, ါ} at positions i, i+1, i+2. + # Looking for <base> + ေ + {ာ, ါ} at positions i, i+1, i+2, where + # <base> is whatever character precedes the e-vowel: the consonant + # for a bare cluster, or the last medial sign (ျ ြ ွ ှ) for a + # medial-bearing cluster. if i + 2 < n and text[i + 1] == _E_VOWEL and text[i + 2] in (_AA, _TALL_AA): - target_aw = _TALL_AA if ch in _ROUND_BOTTOM_CONSONANTS_FOR_TALL_AA else _AA + if ch in _ROUND_BOTTOM_CONSONANTS_FOR_TALL_AA: + # Bare round-bottom consonant: TALL AA is canonical. + target_aw = _TALL_AA + else: + # Complement branch — every other base, INCLUDING medials. 
+ # Medial-bearing clusters always take flat AA (ကျော, ပြော), + # so flattening stray ေါ after a medial (ကျေါင်း → ကျောင်း) + # is the canonical repair, not an accident of the scan. + target_aw = _AA out.append(ch) out.append(_E_VOWEL) out.append(target_aw) diff --git a/tests/test_aw_vowel_unmask_detector.py b/tests/test_aw_vowel_unmask_detector.py new file mode 100644 index 0000000..e7fbdff --- /dev/null +++ b/tests/test_aw_vowel_unmask_detector.py @@ -0,0 +1,298 @@ +"""Unit tests for the pre-normalization aw-vowel un-mask detector. + +The pre-lookup normalizer silently repairs aw-vowel typos (flat ော → tall +ေါ after {ပ,ခ,ဒ}; stray ေါ → ော after other bases) BEFORE the validator +judges the token, masking genuine spelling errors like ခော်. +``_detect_aw_vowel_unmask_errors`` runs on the raw text and emits the +silently-applied repair as an explicit confusable error with the canonical +form as the single suggestion, at up to three nested span granularities +(violated syllable, segmenter word, all-content compound chunk), +narrowest first. 
+""" + +from __future__ import annotations + +import logging + +import pytest + +from myspellchecker.core.constants import ET_CONFUSABLE_ERROR +from myspellchecker.core.detectors.pre_normalization import PreNormalizationDetectorsMixin + + +class _StubProvider: + """Exact-key dictionary stub mirroring SQLiteProvider.is_valid_word.""" + + def __init__(self, words: set[str]) -> None: + self._words = set(words) + + def is_valid_word(self, word: str) -> bool: + return word in self._words + + +class _StubSegmenter: + """Returns canned segmentations; falls back to the whole text.""" + + def __init__( + self, + words: dict[str, list[str]] | None = None, + syllables: dict[str, list[str]] | None = None, + ) -> None: + self._words = words or {} + self._syllables = syllables or {} + + def segment_words(self, text: str) -> list[str]: + return self._words.get(text, [text]) + + def segment_syllables(self, text: str) -> list[str]: + return self._syllables.get(text, [text]) + + +class _Harness(PreNormalizationDetectorsMixin): + """Minimal host providing the mixin's attribute stubs.""" + + def __init__( + self, + words: set[str], + segments: dict[str, list[str]] | None = None, + syllables: dict[str, list[str]] | None = None, + ) -> None: + self.provider = _StubProvider(words) + self.segmenter = _StubSegmenter(segments, syllables) + self.logger = logging.getLogger("test_aw_vowel_unmask") + + +# Canonical dictionary forms used across tests (all benchmark golds). 
+_DICT = { + "ခေါ်", + "ခေါင်း", + "ခေါင်းကွဲ", + "ပေါ်", + "ပေါက်", + "ဒေါ်", + "စိတ်ပေါက်", + "စုစုပေါင်း", + "သန်းခေါင်စာရင်း", + "ကျောင်း", + "ကောင်း", + "ဆောင်", + "သည်", + "စိတ်", + "ဝောလ်", # loanword "Wall" — keyed FLAT post flat-AA migration + "ဂေါ", # loanword — raw tall-AA form IS the in-dict key here +} + + +class TestDirectionFlatToTall: + """Flat ော after round-bottom {ပ,ခ,ဒ} — the 28-row direction.""" + + def test_single_token_fires(self) -> None: + det = _Harness(_DICT) + errors = det._detect_aw_vowel_unmask_errors("ခော်") + assert len(errors) == 1 + err = errors[0] + assert err.text == "ခော်" + assert err.position == 0 + assert err.error_type == ET_CONFUSABLE_ERROR + assert err.suggestions[0] == "ခေါ်" + assert err._structural_early_exit is True + assert err.confidence >= 0.60 # must not downgrade to INFORM + + def test_token_inside_sentence_position(self) -> None: + det = _Harness(_DICT) + text = "ဒီနေ့ ခော် သည်" + errors = det._detect_aw_vowel_unmask_errors(text) + assert len(errors) == 1 + assert errors[0].position == text.index("ခော်") + assert errors[0].suggestions[0] == "ခေါ်" + + def test_multi_syllable_compound_whole_word(self) -> None: + det = _Harness( + _DICT, + syllables={"သန်းခေါင်စာရင်း": ["သန်း", "ခေါင်", "စာ", "ရင်း"]}, + ) + errors = det._detect_aw_vowel_unmask_errors("သန်းခောင်စာရင်း") + # Violated syllable ခေါင် is not in the stub dict, so only the + # whole compound (segmenter keeps it as one word) is emitted. + assert len(errors) == 1 + assert errors[0].text == "သန်းခောင်စာရင်း" + assert errors[0].suggestions[0] == "သန်းခေါင်စာရင်း" + + def test_deterministic_gold_for_da_row(self) -> None: + """ဒော် → ဒေါ် — the row generic SymSpell ranking missed (top-1 တော်). 
+ The deterministic canonical construction must recover it.""" + det = _Harness(_DICT) + errors = det._detect_aw_vowel_unmask_errors("ဒော်") + assert len(errors) == 1 + assert errors[0].suggestions[0] == "ဒေါ်" + + def test_particle_glued_chunk_emits_word_only(self) -> None: + """Particle-glued chunk → emission at the inner word, no whole-chunk + emission even when the glued form is a (noisy) dictionary entry.""" + det = _Harness( + _DICT | {"ခေါ်သည်"}, # noisy glued dict entry must NOT widen the span + segments={"ခေါ်သည်": ["ခေါ်", "သည်"]}, + syllables={"ခေါ်သည်": ["ခေါ်", "သည်"]}, + ) + errors = det._detect_aw_vowel_unmask_errors("ခော်သည်") + assert len(errors) == 1 + assert errors[0].text == "ခော်" + assert errors[0].position == 0 + assert errors[0].suggestions[0] == "ခေါ်" + + def test_compound_token_emits_whole_word(self) -> None: + """A violated syllable inside a lexical compound token emits at the + token span (the full word, the in-text typo unit).""" + det = _Harness( + _DICT, + segments={"ခေါင်းကွဲ": ["ခေါင်းကွဲ"]}, + syllables={"ခေါင်းကွဲ": ["ခေါင်း", "ကွဲ"]}, + ) + errors = det._detect_aw_vowel_unmask_errors("ခောင်းကွဲ") + assert [e.text for e in errors] == ["ခောင်းကွဲ"] + assert [str(e.suggestions[0]) for e in errors] == ["ခေါင်းကွဲ"] + + def test_all_content_compound_chunk_emitted(self) -> None: + """A multi-word all-content chunk (lexical compound) emits the + whole-chunk span — the natural typo unit for compounds like + စိတ်ပေါက် that the segmenter splits.""" + det = _Harness( + _DICT, + segments={"စိတ်ပေါက်": ["စိတ်", "ပေါက်"]}, + syllables={"စိတ်ပေါက်": ["စိတ်", "ပေါက်"]}, + ) + errors = det._detect_aw_vowel_unmask_errors("စိတ်ပောက်") + assert [e.text for e in errors] == ["စိတ်ပောက်"] + assert [str(e.suggestions[0]) for e in errors] == ["စိတ်ပေါက်"] + + def test_postposition_glued_chunk_emits_inner_word(self) -> None: + """A noun + locative postposition phrase is NOT a lexical compound: + the emission is the postposition word carrying the typo.""" 
+ det = _Harness( + _DICT | {"ပြဿနာ", "အပေါ်", "ပြဿနာအပေါ်"}, + segments={"ပြဿနာအပေါ်": ["ပြဿနာ", "အပေါ်"]}, + syllables={"ပြဿနာအပေါ်": ["ပြ", "ဿ", "နာ", "အ", "ပေါ်"]}, + ) + errors = det._detect_aw_vowel_unmask_errors("ပြဿနာအပော်") + assert [e.text for e in errors] == ["အပော်"] + assert [str(e.suggestions[0]) for e in errors] == ["အပေါ်"] + + def test_syllable_fallback_when_token_gate_fails(self) -> None: + """When the covering token's canonical form is dictionary-OOV + (noisy segmentation), the violated syllable is emitted instead.""" + det = _Harness( + _DICT, # ဒေါ် in dict; the glued name token is NOT + segments={"ဒေါ်အောင်ဆန်းမူ": ["ဒေါ်အောင်ဆန်းမူ"]}, + syllables={"ဒေါ်အောင်ဆန်းမူ": ["ဒေါ်", "အောင်", "ဆန်း", "မူ"]}, + ) + errors = det._detect_aw_vowel_unmask_errors("ဒော်အောင်ဆန်းမူ") + assert [e.text for e in errors] == ["ဒော်"] + assert [str(e.suggestions[0]) for e in errors] == ["ဒေါ်"] + + +class TestDirectionTallToFlat: + """Stray tall ေါ after complement bases — the 2-row reverse direction.""" + + def test_after_complement_consonant(self) -> None: + det = _Harness(_DICT) + errors = det._detect_aw_vowel_unmask_errors("ဆေါင်") + assert len(errors) == 1 + assert errors[0].suggestions[0] == "ဆောင်" + + def test_after_medial_ya(self) -> None: + """ကျေါင်း → ကျောင်း: the medial sign occupies the base slot and + medial clusters take flat AA (see TestMedialClusterContract in + test_normalize_e_vowel_tall_aa.py).""" + det = _Harness(_DICT) + errors = det._detect_aw_vowel_unmask_errors("ကျေါင်း") + assert len(errors) == 1 + assert errors[0].suggestions[0] == "ကျောင်း" + + def test_vowel_reorder_map_forms_deferred(self) -> None: + """Chunks containing a _VOWEL_REORDER_ERRORS key are owned by the + dedicated detector: ကေါင်း can be a ခ→က consonant typo (gold + ခေါင်းလောင်း), so the single-suggestion unmask emission must not + displace that detector's multi-candidate list.""" + det = _Harness(_DICT | {"ကောင်းလောင်း"}) + assert 
det._detect_aw_vowel_unmask_errors("ကေါင်းလောင်း") == [] + assert det._detect_aw_vowel_unmask_errors("ကေါင်း") == [] + + +class TestLoanwordGuard: + """The {ဂ,င,ဝ} bases — classical round-bottom consonants excluded from + the narrow whitelist — must never fire in the tall→flat direction. + These are exactly the 2 clean-text flips from the unmask-probe-01 + kill-gate (ဝေါလ်, ဂေါ).""" + + def test_wa_loanword_not_flagged(self) -> None: + # ဝေါလ် is OOV as typed (dict keys the flat form ဝောလ်) and the + # canonical form IS in-dict — only the base guard blocks the fire. + det = _Harness(_DICT) + assert det._detect_aw_vowel_unmask_errors("ဝေါလ်") == [] + + def test_ga_loanword_not_flagged(self) -> None: + det = _Harness(_DICT) + assert det._detect_aw_vowel_unmask_errors("ဂေါ") == [] + + +class TestNoFireGates: + """Each precision gate must independently block the emission.""" + + def test_canonical_text_no_fire(self) -> None: + det = _Harness(_DICT) + assert det._detect_aw_vowel_unmask_errors("ခေါ် သည် ကျောင်း") == [] + + def test_complement_flat_aa_no_fire(self) -> None: + det = _Harness(_DICT) + assert det._detect_aw_vowel_unmask_errors("ကောင်း တော သော") == [] + + def test_canonical_not_in_dict_no_fire(self) -> None: + det = _Harness(set()) + assert det._detect_aw_vowel_unmask_errors("ခော်") == [] + + def test_raw_form_in_dict_no_fire(self) -> None: + det = _Harness(_DICT | {"ခော်"}) + assert det._detect_aw_vowel_unmask_errors("ခော်") == [] + + def test_non_aw_diff_in_chunk_no_fire(self) -> None: + """A chunk whose canonicalization changes anything besides the + aw-vowel is out of scope (other detectors own those repairs).""" + det = _Harness(_DICT | {"ဆောင်း"}) + # ဆေါငး် has BOTH a stray tall AA and a visarga-asat reorder: + # normalize() fixes both, so the diff is not aw-only → skip. 
+ assert det._detect_aw_vowel_unmask_errors("ဆေါငး်") == [] + + def test_empty_and_non_myanmar(self) -> None: + det = _Harness(_DICT) + assert det._detect_aw_vowel_unmask_errors("") == [] + assert det._detect_aw_vowel_unmask_errors("hello world") == [] + + +class TestDiffGuardUnit: + """Direct unit coverage of _aw_vowel_diffs_guarded.""" + + @pytest.fixture() + def det(self) -> _Harness: + return _Harness(_DICT) + + def test_flat_to_tall_round_bottom_ok(self, det: _Harness) -> None: + assert det._aw_vowel_diffs_guarded("ခော်", "ခေါ်") is True + + def test_flat_to_tall_outside_whitelist_rejected(self, det: _Harness) -> None: + # Normalizer never repairs flat→tall outside {ပ,ခ,ဒ}; reject defensively. + assert det._aw_vowel_diffs_guarded("ကော်", "ကေါ်") is False + + def test_tall_to_flat_ambiguous_base_rejected(self, det: _Harness) -> None: + assert det._aw_vowel_diffs_guarded("ဝေါလ်", "ဝောလ်") is False + + def test_tall_to_flat_medial_base_ok(self, det: _Harness) -> None: + assert det._aw_vowel_diffs_guarded("ကျေါင်း", "ကျောင်း") is True + + def test_identical_strings_rejected(self, det: _Harness) -> None: + assert det._aw_vowel_diffs_guarded("ခေါ်", "ခေါ်") is False + + def test_non_aw_diff_rejected(self, det: _Harness) -> None: + assert det._aw_vowel_diffs_guarded("ခမ်", "ခန်") is False + + def test_multiple_aw_diffs_all_guarded(self, det: _Harness) -> None: + assert det._aw_vowel_diffs_guarded("ပော်ပောက်", "ပေါ်ပေါက်") is True diff --git a/tests/test_normalize_e_vowel_tall_aa.py b/tests/test_normalize_e_vowel_tall_aa.py index 4ce194c..8d7593a 100644 --- a/tests/test_normalize_e_vowel_tall_aa.py +++ b/tests/test_normalize_e_vowel_tall_aa.py @@ -151,6 +151,41 @@ def test_multiple_aw_vowels_in_one_word(self) -> None: assert normalize_e_vowel_tall_aa(flat) == expected +class TestMedialClusterContract: + """Medial-bearing clusters always take flat AA — the scan keys on the + medial sign in the base slot and flattens stray ေါ deliberately. 
+ + Pins the documented contract (previously the docstring claimed medial + clusters pass through unmodified, which was false): the complement + branch firing on medials is intentional and orthographically correct. + """ + + @pytest.mark.parametrize( + "wrong,expected", + [ + ("ကျေါင်း", "ကျောင်း"), # stray tall after medial ျ → flatten + ("ကြေါ", "ကြော"), # stray tall after medial ြ → flatten + ("ပျေါ်", "ပျော်"), # whitelist base ပ + medial: cluster takes flat + ], + ) + def test_stray_tall_after_medial_flattened(self, wrong: str, expected: str) -> None: + assert normalize_e_vowel_tall_aa(wrong) == expected + + @pytest.mark.parametrize( + "word", + [ + "ပျော်", # "fun" — flat AA canonical despite whitelist base ပ + "ပြော", # "speak" + "ကျော", # "back" + "ခြောက်", # "six / dry" — whitelist base ခ + medial ြ stays flat + ], + ) + def test_medial_cluster_flat_aa_preserved(self, word: str) -> None: + """The round-bottom repair must never fire through a medial: the + char before ေ is the medial, not the whitelist consonant.""" + assert normalize_e_vowel_tall_aa(word) == word + + class TestPassthrough: """Non-targeted inputs must be untouched.""" From e42c41a0dac478033a4b9e4929477199b0e71550 Mon Sep 17 00:00:00 2001 From: Thet Twe Date: Sun, 7 Jun 2026 03:28:53 +0800 Subject: [PATCH 07/13] perf(pipeline): lat-02 memoize nasal variants + probe scores p95 658 -> 401ms (gate PASS): instance-level memo of dictionary-valid nasal variants in SymSpell (was 15M is_valid_word calls on tail) plus LRU memo on the frozen probe's score_sentence (69% duplicate forwards). Byte-identical detections across all 2084 benchmark sentences. 
Workstream: v18-latency-reduction Benchmark: myspellchecker_benchmark.yaml@1.5.0-v18c-aw-unmask-annotations Metrics: composite 0.6870 -> 0.6870 (0.0000); p95 658 -> 401ms --- .../algorithms/probe/syllable_span_probe.py | 25 ++++++++- src/myspellchecker/algorithms/symspell.py | 54 +++++++++++++++---- 2 files changed, 67 insertions(+), 12 deletions(-) diff --git a/src/myspellchecker/algorithms/probe/syllable_span_probe.py b/src/myspellchecker/algorithms/probe/syllable_span_probe.py index a8cb32c..25d1fa1 100644 --- a/src/myspellchecker/algorithms/probe/syllable_span_probe.py +++ b/src/myspellchecker/algorithms/probe/syllable_span_probe.py @@ -16,6 +16,7 @@ from __future__ import annotations import json +from collections import OrderedDict from dataclasses import dataclass from pathlib import Path from typing import TYPE_CHECKING @@ -142,6 +143,12 @@ def __init__( self.model.eval() self.max_length = max_length self.segmenter = RegexSegmenter() + # Memo of score_sentence results. The probe is frozen and inference + # is deterministic per text, and the three probe strategies share + # one engine — on the p95 latency tail ~69% of score_sentence calls + # within a single check() are duplicates of the same sentence text. + self._score_cache: OrderedDict[str, tuple[list[float], list[_SyllableSpan]]] = OrderedDict() + self._SCORE_CACHE_MAX = 256 logger.info( "ProbeInferenceEngine loaded: encoder=%s head=%s device=%s", encoder_path, @@ -150,7 +157,23 @@ def __init__( ) def score_sentence(self, text: str) -> tuple[list[float], list[_SyllableSpan]]: - """Return (per-syllable probability list, syllable span list).""" + """Return (per-syllable probability list, syllable span list). + + Results are memoized per text (LRU, behavior-identical: the frozen + probe is deterministic). The outer lists are copied per call so a + caller mutating its result cannot poison the cache. 
+ """ + cached = self._score_cache.get(text) + if cached is not None: + self._score_cache.move_to_end(text) + return list(cached[0]), list(cached[1]) + probs, spans = self._score_sentence_uncached(text) + if len(self._score_cache) >= self._SCORE_CACHE_MAX: + self._score_cache.popitem(last=False) + self._score_cache[text] = (probs, spans) + return list(probs), list(spans) + + def _score_sentence_uncached(self, text: str) -> tuple[list[float], list[_SyllableSpan]]: if not text: return [], [] diff --git a/src/myspellchecker/algorithms/symspell.py b/src/myspellchecker/algorithms/symspell.py index 62bb59a..5186d2e 100644 --- a/src/myspellchecker/algorithms/symspell.py +++ b/src/myspellchecker/algorithms/symspell.py @@ -455,6 +455,17 @@ def __init__( # Track which levels have been indexed self._indexed_levels: set[str] = set() + # Memo of dictionary-VALID nasal variants per indexed term, shared + # across lookups. Validity of an in-dictionary term's nasal variants + # is stable for the provider's lifetime (the same assumption the + # long-lived delete index makes), so this is instance-level, not a + # per-check session cache. Without it, _find_similar_terms + # re-validates the same variants once per delete-bucket hit — tens + # of millions of is_valid_word calls on long sentences (p95 tail). + # Benign data race under threads: values are idempotent. + self._nasal_variant_cache: dict[tuple[str, str], frozenset[str]] = {} + self._NASAL_VARIANT_CACHE_MAX = 150_000 + # RLock for thread-safe index access # Uses RLock (reentrant) so the same thread can acquire multiple times. 
# Both reads and writes are protected to prevent race conditions between @@ -964,18 +975,12 @@ def _find_similar_terms(self, delete_var: str, level: str) -> set[str]: for original_term, _ in self._deletes[delete_var]: similar.add(original_term) # Add nasal variants for better matching — - # only expand when the term actually contains nasal endings + # only expand when the term actually contains nasal endings. + # Validity per (term, level) is memoized: the same indexed + # term reappears in many delete buckets, and re-validating + # its variants dominated the p95 latency tail. if _has_nasal_ending(original_term): - nasal_vars = get_nasal_variants(original_term) - for variant in nasal_vars: - if variant == original_term: - continue # skip self-variant (already added) - if level == ValidationLevel.SYLLABLE.value: - if self.provider.is_valid_syllable(variant): - similar.add(variant) - else: - if self.provider.is_valid_word(variant): - similar.add(variant) + similar.update(self._valid_nasal_variants(original_term, level)) # 2. Fallback for unindexed terms (e.g. if build_index wasn't called) # This handles simple deletion errors where the delete_var IS the word @@ -992,6 +997,33 @@ def _find_similar_terms(self, delete_var: str, level: str) -> set[str]: return similar + def _valid_nasal_variants(self, original_term: str, level: str) -> frozenset[str]: + """Dictionary-valid nasal variants of an indexed term, memoized. + + The enum/level dispatch and the provider validity probes are hoisted + out of the delete-bucket hot loop into this memo. The cache is + instance-level because variant validity only changes if the + underlying dictionary changes — the same lifetime assumption as the + delete index itself. 
+ """ + key = (original_term, level) + cached = self._nasal_variant_cache.get(key) + if cached is not None: + return cached + if level == ValidationLevel.SYLLABLE.value: + is_valid = self.provider.is_valid_syllable + else: + is_valid = self.provider.is_valid_word + valid = frozenset( + variant + for variant in get_nasal_variants(original_term) + if variant != original_term and is_valid(variant) + ) + if len(self._nasal_variant_cache) >= self._NASAL_VARIANT_CACHE_MAX: + self._nasal_variant_cache.clear() + self._nasal_variant_cache[key] = valid + return valid + def _collect_candidates( self, term: str, From c82876ea429200682875224dc62d3fe91e597dbc Mon Sep 17 00:00:00 2001 From: Thet Twe Date: Wed, 10 Jun 2026 08:01:06 +0800 Subject: [PATCH 08/13] fix(strategy): serialize probe score-cache against batch-async races The lat-02 score-cache (OrderedDict LRU) is built once and shared across all probe strategies, and check_batch_async hits it concurrently via asyncio.to_thread workers. An unlocked get/move_to_end racing another worker's popitem(last=False) near the 256 cap could raise KeyError and crash that check(). Guard the LRU dict ops with a threading.Lock (released around the encoder pass, so a duplicate miss merely recomputes) and re-check on insert. 
Workstream: v18-preship-fixes --- .../algorithms/probe/syllable_span_probe.py | 37 ++++++-- tests/test_probe_cache_concurrency.py | 89 +++++++++++++++++++ 2 files changed, 118 insertions(+), 8 deletions(-) create mode 100644 tests/test_probe_cache_concurrency.py diff --git a/src/myspellchecker/algorithms/probe/syllable_span_probe.py b/src/myspellchecker/algorithms/probe/syllable_span_probe.py index 25d1fa1..4ecd773 100644 --- a/src/myspellchecker/algorithms/probe/syllable_span_probe.py +++ b/src/myspellchecker/algorithms/probe/syllable_span_probe.py @@ -16,6 +16,7 @@ from __future__ import annotations import json +import threading from collections import OrderedDict from dataclasses import dataclass from pathlib import Path @@ -149,6 +150,12 @@ def __init__( # within a single check() are duplicates of the same sentence text. self._score_cache: OrderedDict[str, tuple[list[float], list[_SyllableSpan]]] = OrderedDict() self._SCORE_CACHE_MAX = 256 + # The single engine is shared across all probe strategies and is hit + # concurrently by check_batch_async (asyncio.to_thread workers), so the + # OrderedDict LRU ops must be serialized: an unlocked get/move_to_end + # racing another thread's popitem(last=False) can raise KeyError. The + # lock guards only the (microsecond) dict ops, never the encoder pass. + self._score_cache_lock = threading.Lock() logger.info( "ProbeInferenceEngine loaded: encoder=%s head=%s device=%s", encoder_path, @@ -161,16 +168,30 @@ def score_sentence(self, text: str) -> tuple[list[float], list[_SyllableSpan]]: Results are memoized per text (LRU, behavior-identical: the frozen probe is deterministic). The outer lists are copied per call so a - caller mutating its result cannot poison the cache. + caller mutating its result cannot poison the cache. 
All cache reads + and writes are serialized under ``_score_cache_lock`` so concurrent + check_batch_async workers cannot race the LRU eviction; the lock is + released around the encoder pass, so a duplicate miss merely recomputes + (idempotent) rather than blocking. """ - cached = self._score_cache.get(text) - if cached is not None: - self._score_cache.move_to_end(text) - return list(cached[0]), list(cached[1]) + with self._score_cache_lock: + cached = self._score_cache.get(text) + if cached is not None: + self._score_cache.move_to_end(text) + return list(cached[0]), list(cached[1]) probs, spans = self._score_sentence_uncached(text) - if len(self._score_cache) >= self._SCORE_CACHE_MAX: - self._score_cache.popitem(last=False) - self._score_cache[text] = (probs, spans) + with self._score_cache_lock: + # Re-check: a concurrent worker may have inserted the same text + # while we held no lock during inference. Prefer the existing entry + # so all callers share one canonical tuple. + existing = self._score_cache.get(text) + if existing is not None: + self._score_cache.move_to_end(text) + probs, spans = existing + else: + if len(self._score_cache) >= self._SCORE_CACHE_MAX: + self._score_cache.popitem(last=False) + self._score_cache[text] = (probs, spans) return list(probs), list(spans) def _score_sentence_uncached(self, text: str) -> tuple[list[float], list[_SyllableSpan]]: diff --git a/tests/test_probe_cache_concurrency.py b/tests/test_probe_cache_concurrency.py new file mode 100644 index 0000000..667ed54 --- /dev/null +++ b/tests/test_probe_cache_concurrency.py @@ -0,0 +1,89 @@ +"""Concurrency regression test for ProbeInferenceEngine.score_sentence. + +The single probe engine is shared across all probe strategies and is hit by +check_batch_async via asyncio.to_thread workers. Before the lat-02 cache was +guarded, an unsynchronized OrderedDict LRU (get/move_to_end racing another +thread's popitem(last=False)) could raise KeyError near the size cap. 
This test +drives many threads against a near-full cache with distinct per-thread texts to +force concurrent evictions, and asserts no exception plus correct results. +""" + +from __future__ import annotations + +import threading +from collections import OrderedDict + +from myspellchecker.algorithms.probe.syllable_span_probe import ( + ProbeInferenceEngine, + _SyllableSpan, +) + + +def _make_engine(cache_max: int) -> ProbeInferenceEngine: + """Build an engine without loading the model (bypass __init__). + + Only the cache machinery is needed; _score_sentence_uncached is stubbed to + a deterministic pure function so the test never touches torch / the encoder. + """ + eng = ProbeInferenceEngine.__new__(ProbeInferenceEngine) + eng._score_cache = OrderedDict() + eng._SCORE_CACHE_MAX = cache_max + eng._score_cache_lock = threading.Lock() + + def _uncached(text: str) -> tuple[list[float], list[_SyllableSpan]]: + # Deterministic per-text payload: a probe is frozen/deterministic, so a + # given text must always map to the same scores regardless of thread. + probs = [float(len(text) % 7) / 7.0] + spans = [_SyllableSpan(text=text, start=0, end=len(text))] + return probs, spans + + eng._score_sentence_uncached = _uncached # type: ignore[method-assign] + return eng + + +def test_concurrent_eviction_no_keyerror() -> None: + eng = _make_engine(cache_max=256) + # Pre-fill to the cap so every miss triggers a popitem under contention. + for i in range(256): + eng.score_sentence(f"seed-{i}") + assert len(eng._score_cache) == 256 + + errors: list[BaseException] = [] + barrier = threading.Barrier(16) + + def worker(tid: int) -> None: + barrier.wait() # maximize overlap on the LRU ops + try: + for j in range(400): + text = f"t{tid}-{j}" + probs, spans = eng.score_sentence(text) + assert spans and spans[0].text == text + assert probs == [float(len(text) % 7) / 7.0] + # Re-read a hot key to exercise the get/move_to_end path. 
+ eng.score_sentence("seed-0") + except BaseException as exc: # noqa: BLE001 - capture for assertion + errors.append(exc) + + threads = [threading.Thread(target=worker, args=(t,)) for t in range(16)] + for t in threads: + t.start() + for t in threads: + t.join() + + assert not errors, f"score_sentence raced under concurrency: {errors[:3]}" + # Invariant: the LRU never exceeds its cap despite concurrent inserts. + assert len(eng._score_cache) <= 256 + + +def test_cache_hit_returns_independent_copies() -> None: + eng = _make_engine(cache_max=8) + p1, s1 = eng.score_sentence("ကျောင်း") + p2, s2 = eng.score_sentence("ကျောင်း") + # Outer lists are fresh per call so a caller mutating one cannot poison the + # cache for the next caller. + assert p1 == p2 and s1 == s2 + assert p1 is not p2 + assert s1 is not s2 + p1.append(999.0) + p3, _ = eng.score_sentence("ကျောင်း") + assert 999.0 not in p3 From b4f85600de955cb0435daa4bbc0fde48bee2c2e6 Mon Sep 17 00:00:00 2001 From: Thet Twe Date: Wed, 10 Jun 2026 08:18:57 +0800 Subject: [PATCH 09/13] refactor(strategy): drop dead ortho-typo wrapper + correct stale comments Remove the unused _is_orthographic_insertion_typo bool wrapper (no committed callers; the wired gate is _ortho_insertion_detail) and fold its full spec into that method's docstring. Correct two misleading comments that risk steering a future edit wrong: the symspell nasal-variant cache is serialized under _index_lock (not a "benign data race"), and U+103D is MEDIAL WA, not ha-htoe. No behavior change. 
Workstream: v18-preship-fixes --- src/myspellchecker/algorithms/symspell.py | 4 ++- src/myspellchecker/core/error_suppression.py | 31 ++++++++------------ 2 files changed, 16 insertions(+), 19 deletions(-) diff --git a/src/myspellchecker/algorithms/symspell.py b/src/myspellchecker/algorithms/symspell.py index 5186d2e..bc45f5e 100644 --- a/src/myspellchecker/algorithms/symspell.py +++ b/src/myspellchecker/algorithms/symspell.py @@ -462,7 +462,9 @@ def __init__( # per-check session cache. Without it, _find_similar_terms # re-validates the same variants once per delete-bucket hit — tens # of millions of is_valid_word calls on long sentences (p95 tail). - # Benign data race under threads: values are idempotent. + # Mutated only inside _valid_nasal_variants, which is reached solely via + # _find_similar_terms under _index_lock, so all reads/writes (and the + # overflow .clear()) are serialized — there is no data race here. self._nasal_variant_cache: dict[tuple[str, str], frozenset[str]] = {} self._NASAL_VARIANT_CACHE_MAX = 150_000 diff --git a/src/myspellchecker/core/error_suppression.py b/src/myspellchecker/core/error_suppression.py index 19d4532..25321d6 100644 --- a/src/myspellchecker/core/error_suppression.py +++ b/src/myspellchecker/core/error_suppression.py @@ -184,14 +184,15 @@ # codepoint) marks an orthographic-insertion typo, distinguished from the # deletion / whole-word-swap shapes that the compound-split suppressor must # keep killing. Members: asat (U+103A), visarga (U+1038), dot-below (U+1037), -# ha-htoe pair (U+103D/U+103E), ya-medial pair (U+103B/U+103C). Used by -# ``_is_orthographic_insertion_typo`` as the SHAPE gate — never a freq/ed gate. +# medial-wa/medial-ha pair (U+103D/U+103E), medial-ya/medial-ra pair +# (U+103B/U+103C). Used by ``_ortho_insertion_detail`` as the SHAPE gate — +# never a freq/ed gate. 
_ORTHO_INSERTION_CHARS = frozenset( { chr(0x103A), # asat (killer / virama-marker) chr(0x1038), # visarga chr(0x1037), # dot below - chr(0x103D), # medial wa / ha-htoe + chr(0x103D), # medial wa chr(0x103E), # medial ha chr(0x103B), # medial ya chr(0x103C), # medial ra @@ -1164,9 +1165,14 @@ def _skip_rule_has_confident_candidate(self, word: str) -> bool: min_freq=getattr(validation, "skip_rule_gate_min_freq", 1000), ) - def _is_orthographic_insertion_typo(self, word: str) -> bool: - """SHAPE gate: True iff the token's ed=1 SymSpell top-1 is a single - in-syllable diacritic INSERTION (gold = typo + 1 codepoint). + def _ortho_insertion_detail(self, word: str) -> tuple[int, str, str] | None: + """SHAPE gate: detect a single in-syllable-diacritic INSERTION typo. + + Returns ``(insert_index, inserted_char, corrected_top1)`` when the + token's ed=1 SymSpell top-1 differs from ``word`` by exactly one + inserted in-syllable diacritic (gold = typo + 1 codepoint), else + ``None``. ``insert_index`` is the offset in ``word`` at which + ``inserted_char`` is added to yield ``corrected_top1``. Recovers orthographic-insertion typos (e.g. ``ရုပ`` → ``ရုပ်`` with an asat inserted) that ``_suppress_compound_split_valid_words`` kills. @@ -1175,7 +1181,7 @@ def _is_orthographic_insertion_typo(self, word: str) -> bool: correction shape (length-diff + identity of the inserted codepoint) separates them. - Returns True iff ALL of: + Returns non-``None`` iff ALL of: * ``word`` contains no ASCII/Latin codepoint (loanword grey-zone guard — 'algorithm'+la / 'website'+asat share this shape but are clean-text FPs); @@ -1191,17 +1197,6 @@ def _is_orthographic_insertion_typo(self, word: str) -> bool: Defensive against a missing SymSpell handle, ``None``/empty top-1, and length edge cases. """ - return self._ortho_insertion_detail(word) is not None - - def _ortho_insertion_detail(self, word: str) -> tuple[int, str, str] | None: - """Detail variant of :meth:`_is_orthographic_insertion_typo`. 
- - Returns ``(insert_index, inserted_char, corrected_top1)`` when the - token's ed=1 SymSpell top-1 is a single in-syllable-diacritic - insertion, else ``None``. ``insert_index`` is the offset in ``word`` - at which ``inserted_char`` is added to yield ``corrected_top1``. Pure - SHAPE gate — never relaxes a freq/ed threshold. - """ if not word: return None # Loanword grey-zone guard: any ASCII/Latin char in the token (or From f7b480fb42d215250bdc295c0be15607d77c9ba8 Mon Sep 17 00:00:00 2001 From: Thet Twe Date: Wed, 10 Jun 2026 08:23:44 +0800 Subject: [PATCH 10/13] test(strategy): repair ortho-rescue harness flag + add rescue/aw-vowel coverage MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit The compound-split suppression tests build a MagicMock config, so the default-off compound_split_ortho_insertion_rescue flag read as a truthy Mock and the rescue fired in-test — turning one freq-gate test red on develop. Pin the flag to its production default in _build_mixin (opt-in via a new param). Add coverage that was missing: _ortho_insertion_detail shape gate and _narrow_ortho_insertion_error narrowing, plus the aw-vowel guard direction/boundary and span-fallback / duplicate-position branches. 
Workstream: v18-preship-fixes --- tests/test_aw_vowel_unmask_detector.py | 44 ++++++++++ tests/test_error_suppression.py | 114 +++++++++++++++++++++++++ 2 files changed, 158 insertions(+) diff --git a/tests/test_aw_vowel_unmask_detector.py b/tests/test_aw_vowel_unmask_detector.py index e7fbdff..2ed504d 100644 --- a/tests/test_aw_vowel_unmask_detector.py +++ b/tests/test_aw_vowel_unmask_detector.py @@ -296,3 +296,47 @@ def test_non_aw_diff_rejected(self, det: _Harness) -> None: def test_multiple_aw_diffs_all_guarded(self, det: _Harness) -> None: assert det._aw_vowel_diffs_guarded("ပော်ပောက်", "ပေါ်ပေါက်") is True + + def test_no_base_before_aw_rejected(self, det: _Harness) -> None: + # Diff at index 1 — no base two positions back (the i < 2 guard). + assert det._aw_vowel_diffs_guarded("ေါ", "ော") is False + + def test_non_myanmar_base_rejected(self, det: _Harness) -> None: + # tall→flat requires a Myanmar consonant / medial base; Latin rejected. + assert det._aw_vowel_diffs_guarded("Aေါ", "Aော") is False + + def test_mixed_valid_and_invalid_diffs_rejected(self, det: _Harness) -> None: + # One guarded aw-swap (ပ: ော→ေါ) plus a non-aw diff (မ→န): the + # all-diffs-must-pass contract must reject the whole token. + assert det._aw_vowel_diffs_guarded("ပော်ခမ်", "ပေါ်ခန်") is False + + +class TestSpanFallbacksAndPositions: + """Defensive span-selection branches + duplicate-chunk position tracking.""" + + def test_segmenter_empty_falls_back_to_whole_chunk(self) -> None: + # segment_words returns [] for the canonical chunk → whole-chunk gate + # is the fallback emission (non-default / unavailable segmenter). 
+ det = _Harness(_DICT, segments={"ခေါ်": []}) + errors = det._detect_aw_vowel_unmask_errors("ခော်") + assert [e.text for e in errors] == ["ခော်"] + assert [str(e.suggestions[0]) for e in errors] == ["ခေါ်"] + + def test_walk_skips_untraceable_segmenter_part(self) -> None: + # A segmenter part that is not a substring of the canonical chunk + # (non-tiling backend) is skipped without derailing the emission. + det = _Harness(_DICT, segments={"ခေါ်": ["ခေါ်", "ZZZ"]}) + errors = det._detect_aw_vowel_unmask_errors("ခော်") + assert [e.text for e in errors] == ["ခော်"] + + def test_duplicate_typo_chunk_distinct_positions(self) -> None: + # The same typo chunk repeated must anchor to start-advanced offsets, + # not collapse both onto the first occurrence. + text = "ခော် ခော်" + det = _Harness(_DICT) + errors = det._detect_aw_vowel_unmask_errors(text) + assert len(errors) == 2 + assert [e.position for e in errors] == [ + text.index("ခော်"), + text.rindex("ခော်"), + ] diff --git a/tests/test_error_suppression.py b/tests/test_error_suppression.py index 552fef8..0a4e052 100644 --- a/tests/test_error_suppression.py +++ b/tests/test_error_suppression.py @@ -354,6 +354,7 @@ def _build_mixin( syllables: list[str], valid_syllables: set[str] | None = None, symspell_top1: tuple[str, float, int] | None = None, + ortho_rescue: bool = False, ): """Create a mixin with just enough wiring to exercise the suppressor. @@ -373,6 +374,10 @@ def _build_mixin( # Wire the skip-rule gate params. mixin.config.validation.skip_rule_gate_max_ed = max_ed mixin.config.validation.skip_rule_gate_min_freq = min_freq + # config is a MagicMock, so an unset boolean flag would read as a + # truthy Mock. Pin the ortho-insertion-rescue carve-out to its real + # production default (off) unless a test opts in explicitly. + mixin.config.validation.compound_split_ortho_insertion_rescue = ortho_rescue # SymSpell mock. 
if symspell_top1 is None: @@ -418,6 +423,23 @@ def test_suppresses_when_candidate_below_freq_gate(self): mixin._suppress_compound_split_valid_words(errors) assert errors == [] + def test_ortho_rescue_on_keeps_and_narrows_below_freq_gate(self): + # With the carve-out enabled, an ed=1 single-asat insertion below the + # freq gate is KEPT (not suppressed) and narrowed to the affected + # syllable carrying the corrected form as the rank-1 suggestion. + mixin = self._build_mixin( + syllables=["စွမ်း", "ဆောင်", "ရ", "ည"], + symspell_top1=("စွမ်းဆောင်ရည်", 1.0, 500), # below min_freq=1000 + ortho_rescue=True, + ) + err = _make_error(text="စွမ်းဆောင်ရည", error_type="invalid_word") + errors = [err] + mixin._suppress_compound_split_valid_words(errors) + assert errors == [err] # kept, not suppressed + assert err.text == "ည" # narrowed to the affected syllable + assert err.suggestions[0].text == "ည်" + assert err._structural_early_exit is True + def test_suppresses_when_candidate_above_ed_gate(self): mixin = self._build_mixin( syllables=["စွမ်း", "ဆောင်", "ရ", "ည"], @@ -452,6 +474,98 @@ def test_does_not_suppress_short_word_even_without_candidate(self): assert errors == [err] +# --------------------------------------------------------------------------- +# Orthographic-insertion rescue carve-out (default-off in production) +# --------------------------------------------------------------------------- + + +class TestOrthoInsertionDetail: + """Unit tests for the _ortho_insertion_detail SHAPE gate.""" + + @staticmethod + def _mixin(top1: str | None, ed: float = 1.0): + mixin = _make_mixin() + sym = MagicMock() + if top1 is None: + sym.lookup.return_value = [] + else: + cand = MagicMock() + cand.term = top1 + cand.edit_distance = ed + sym.lookup.return_value = [cand] + mixin.symspell = sym + return mixin + + def test_detects_asat_insertion(self): + # ရုပ (3 codepoints) + ် (U+103A asat) appended → single insertion. 
+ detail = self._mixin("ရုပ်")._ortho_insertion_detail("ရုပ") + assert detail is not None + i, inserted, top1 = detail + assert (i, inserted, top1) == (3, "်", "ရုပ်") + + def test_rejects_edit_distance_above_one(self): + assert self._mixin("ရုပ်", ed=2.0)._ortho_insertion_detail("ရုပ") is None + + def test_rejects_non_insertion_shape(self): + # Same length as the typo (a swap, not an insertion). + assert self._mixin("ကကခ")._ortho_insertion_detail("ကကက") is None + + def test_rejects_non_diacritic_insertion(self): + # Inserted codepoint is a consonant, not an in-syllable diacritic. + assert self._mixin("ကကက")._ortho_insertion_detail("ကက") is None + + def test_rejects_latin_token(self): + # Loanword grey-zone guard: any ASCII letter disqualifies the token. + assert self._mixin("aက်")._ortho_insertion_detail("aက") is None + + def test_tolerates_missing_symspell(self): + mixin = _make_mixin() + mixin.symspell = None + assert mixin._ortho_insertion_detail("ရုပ") is None + + def test_tolerates_empty_candidates(self): + assert self._mixin(None)._ortho_insertion_detail("ရုပ") is None + + +class TestNarrowOrthoInsertionError: + """Unit tests for the _narrow_ortho_insertion_error span-narrowing helper.""" + + @staticmethod + def _mixin(syllables: list[str]): + mixin = _make_mixin() + seg = MagicMock() + seg.segment_syllables.return_value = syllables + mixin.segmenter = seg + return mixin + + def test_narrows_to_affected_syllable(self): + mixin = self._mixin(["က", "ခ", "ဂ"]) + err = _make_error(text="ကခဂ", position=5, error_type="invalid_word") + # asat inserted at index 3 (end); affected syllable is the last, "ဂ". 
+ mixin._narrow_ortho_insertion_error(err, (3, "်", "ကခဂ်")) + assert err.text == "ဂ" + assert err.position == 5 + 2 # offset by s_start of "ဂ" + assert err.suggestions[0].text == "ဂ်" + assert err.suggestions[0].source == "ortho_insertion_rescue" + assert err._structural_early_exit is True + + def test_noop_on_segmentation_mismatch(self): + # Syllables that do not re-join to err.text → safe no-op, error kept wide. + mixin = self._mixin(["x", "y"]) + err = _make_error(text="ကခ", position=0, error_type="invalid_word") + before = (err.text, err.position, list(err.suggestions)) + mixin._narrow_ortho_insertion_error(err, (2, "်", "ကခ်")) + assert (err.text, err.position, list(err.suggestions)) == before + assert err._structural_early_exit is False + + def test_noop_when_insert_index_zero(self): + mixin = self._mixin(["က", "ခ"]) + err = _make_error(text="ကခ", position=0, error_type="invalid_word") + mixin._narrow_ortho_insertion_error(err, (0, "်", "်ကခ")) + assert err.text == "ကခ" + assert err._structural_early_exit is False + + # --------------------------------------------------------------------------- # Structural-syllable early-exit (sse-implement-01) # --------------------------------------------------------------------------- From fe9348f1c3401a49543a691ca723568719b3dba8 Mon Sep 17 00:00:00 2001 From: Thet Twe Date: Wed, 10 Jun 2026 08:36:01 +0800 Subject: [PATCH 11/13] fix(strategy): pin confidence on narrowed ortho-rescue error MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit The ortho-insertion rescue narrows a kept compound-split error and attaches a 0.9 suggestion but left error.confidence untouched. Narrowing runs in post-processing, after context fusion may have rewritten error.confidence into the [0.5, 0.60) INFORM band — which silently drops the recovered correction from corrected_text while a 0.9 suggestion still shows in the list. Pin err.confidence = 0.9 so the recovery is self-contained. 
Default-off carve-out; enabled in the v18c benchmark. Workstream: v18-preship-fixes Benchmark: myspellchecker_benchmark.yaml@1.5.0-v18c-aw-unmask-annotations Metrics: composite 0.6870 → 0.6870 (+0.0000) --- src/myspellchecker/core/error_suppression.py | 6 ++++++ tests/test_error_suppression.py | 2 ++ 2 files changed, 8 insertions(+) diff --git a/src/myspellchecker/core/error_suppression.py b/src/myspellchecker/core/error_suppression.py index 25321d6..620367e 100644 --- a/src/myspellchecker/core/error_suppression.py +++ b/src/myspellchecker/core/error_suppression.py @@ -1281,6 +1281,12 @@ def _narrow_ortho_insertion_error(self, err: "Error", detail: tuple[int, str, st corrected = syl[:local] + inserted + syl[local:] err.position = err.position + s_start err.text = syl + # Pin the error-level confidence too: narrowing runs in post-processing + # (after context fusion may have rewritten error.confidence into the + # [0.5, 0.60) INFORM band), so without this the recovered correction is + # silently dropped from corrected_text even though a 0.9 suggestion + # still appears in the error list. + err.confidence = 0.9 err.suggestions = [ Suggestion( text=corrected, diff --git a/tests/test_error_suppression.py b/tests/test_error_suppression.py index 0a4e052..16c4405 100644 --- a/tests/test_error_suppression.py +++ b/tests/test_error_suppression.py @@ -542,11 +542,13 @@ def test_narrows_to_affected_syllable(self): mixin = self._mixin(["က", "ခ", "ဂ"]) err = _make_error(text="ကခဂ", position=5, error_type="invalid_word") # asat inserted at index 3 (end); affected syllable is the last, "ဂ". 
+ err.confidence = 0.55 # simulate a fusion-clobbered INFORM-band value mixin._narrow_ortho_insertion_error(err, (3, "်", "ကခဂ်")) assert err.text == "ဂ" assert err.position == 5 + 2 # offset by s_start of "ဂ" assert err.suggestions[0].text == "ဂ်" assert err.suggestions[0].source == "ortho_insertion_rescue" + assert err.confidence == 0.9 # re-pinned out of the INFORM band assert err._structural_early_exit is True def test_noop_on_segmentation_mismatch(self): From 58efaf1835d35902107c4fc5ae86b107ef42aa21 Mon Sep 17 00:00:00 2001 From: Thet Twe Date: Wed, 10 Jun 2026 08:36:09 +0800 Subject: [PATCH 12/13] fix(strategy): scope aw-vowel reorder-defer to overlapping spans MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit The aw-vowel un-mask detector deferred the WHOLE whitespace chunk to the vowel-reorder detector when any _VOWEL_REORDER_ERRORS key appeared as a substring, silently dropping a recoverable aw-typo glued to a reorder-key form (e.g. ခော်ကေါင်း lost the ခော်→ခေါ် fix). Compute the reorder-key char ranges and defer only candidate spans that overlap them; non-overlapping spans still fire. FN-only change; never emits a wrong correction. Default-off detector; enabled in the v18c benchmark. 
Workstream: v18-preship-fixes Benchmark: myspellchecker_benchmark.yaml@1.5.0-v18c-aw-unmask-annotations Metrics: composite 0.6870 → 0.6870 (+0.0000) --- .../core/detectors/pre_normalization.py | 34 +++++++++++++++---- tests/test_aw_vowel_unmask_detector.py | 13 +++++++ 2 files changed, 40 insertions(+), 7 deletions(-) diff --git a/src/myspellchecker/core/detectors/pre_normalization.py b/src/myspellchecker/core/detectors/pre_normalization.py index e4b8a11..d0a1dda 100644 --- a/src/myspellchecker/core/detectors/pre_normalization.py +++ b/src/myspellchecker/core/detectors/pre_normalization.py @@ -600,10 +600,12 @@ def _detect_aw_vowel_unmask_errors(self, text: str) -> list[WordError]: compound စိတ်ပေါက် emits whole); the violated syllable only as a fallback when the covering word fails the dictionary gates. - Chunks containing a ``_VOWEL_REORDER_ERRORS`` key are skipped — that + Spans overlapping a ``_VOWEL_REORDER_ERRORS`` key are deferred — that detector owns those forms and emits an ambiguity-aware suggestion list (e.g. ကေါင်း may be a ခ→က consonant typo whose correction is - NOT the aw-canonical form). + NOT the aw-canonical form). The deferral is per span, so an + independent aw-typo glued to a reorder-key form in the same chunk + still fires. The suggestion is constructed deterministically from the normalizer (not SymSpell ranking), so top-1 is the canonical form by design. @@ -628,11 +630,6 @@ def _detect_aw_vowel_unmask_errors(self, text: str) -> list[WordError]: continue if self._AW_VOWEL_VIOLATION_RE.search(chunk) is None: continue - # Defer to the dedicated vowel-reorder detector for its known - # ambiguous forms (ကေါင်း can be a ခ→က consonant typo): emitting - # here would displace its multi-candidate suggestion list. 
- if any(key in chunk for key in self._VOWEL_REORDER_ERRORS): - continue canon_chunk = normalize(chunk) # Positions must map 1:1 onto the raw chunk; any length-changing # normalization step (Zawgyi conversion, char dedup) voids that, @@ -650,7 +647,16 @@ def _detect_aw_vowel_unmask_errors(self, text: str) -> list[WordError]: i for i, (a, b) in enumerate(zip(chunk, canon_chunk, strict=True)) if a != b ] + # Defer to the dedicated vowel-reorder detector ONLY for spans that + # overlap a _VOWEL_REORDER_ERRORS form (it owns those ambiguous + # forms' multi-candidate suggestion list, e.g. ကေါင်း may be a ခ→က + # consonant typo whose correction is NOT the aw-canonical form). + # Scoping per span — rather than skipping the whole chunk — lets an + # independent aw-typo glued to a reorder-key form still fire. + reorder_ranges = self._vowel_reorder_ranges(chunk) for start, end in self._aw_vowel_candidate_spans(chunk, canon_chunk, diff_pos): + if any(start < r_end and r_start < end for r_start, r_end in reorder_ranges): + continue error = WordError( text=chunk[start:end], position=idx + start, @@ -668,6 +674,20 @@ def _detect_aw_vowel_unmask_errors(self, text: str) -> list[WordError]: errors.append(error) return errors + def _vowel_reorder_ranges(self, chunk: str) -> list[tuple[int, int]]: + """Char ranges in ``chunk`` occupied by a ``_VOWEL_REORDER_ERRORS`` key. + + Aw-vowel candidate spans overlapping any of these are deferred to the + dedicated vowel-reorder detector; non-overlapping spans still fire. 
+ """ + ranges: list[tuple[int, int]] = [] + for key in self._VOWEL_REORDER_ERRORS: + start = chunk.find(key) + while start != -1: + ranges.append((start, start + len(key))) + start = chunk.find(key, start + 1) + return ranges + def _aw_vowel_candidate_spans( self, chunk: str, canon_chunk: str, diff_pos: list[int] ) -> list[tuple[int, int]]: diff --git a/tests/test_aw_vowel_unmask_detector.py b/tests/test_aw_vowel_unmask_detector.py index 2ed504d..5299e58 100644 --- a/tests/test_aw_vowel_unmask_detector.py +++ b/tests/test_aw_vowel_unmask_detector.py @@ -217,6 +217,19 @@ def test_vowel_reorder_map_forms_deferred(self) -> None: assert det._detect_aw_vowel_unmask_errors("ကေါင်းလောင်း") == [] assert det._detect_aw_vowel_unmask_errors("ကေါင်း") == [] + def test_independent_aw_typo_glued_to_reorder_key_still_fires(self) -> None: + """A recoverable aw-typo (ခော်) glued to a reorder-key form (ကေါင်း) in + the same chunk still fires for the typo; only the reorder-key span is + deferred. The old whole-chunk substring defer dropped both.""" + det = _Harness( + _DICT, + segments={"ခေါ်ကောင်း": ["ခေါ်", "ကောင်း"]}, + syllables={"ခေါ်ကောင်း": ["ခေါ်", "ကောင်း"]}, + ) + errors = det._detect_aw_vowel_unmask_errors("ခော်ကေါင်း") + assert [e.text for e in errors] == ["ခော်"] + assert [str(e.suggestions[0]) for e in errors] == ["ခေါ်"] + class TestLoanwordGuard: """The {ဂ,င,ဝ} bases — classical round-bottom consonants excluded from From e50209af9526f91c9e4d432f9e6d6bab2d4aac0f Mon Sep 17 00:00:00 2001 From: Thet Twe Date: Wed, 10 Jun 2026 10:54:09 +0800 Subject: [PATCH 13/13] =?UTF-8?q?chore:=20v1.8.0=20release=20prep=20?= =?UTF-8?q?=E2=80=94=20version=20bump,=20changelog,=20readme?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Bump version 1.7.1 → 1.8.0. Add CHANGELOG [1.8.0] entry (opt-in aw-vowel un-mask detector, ~40% hot-path latency reduction, concurrent-batch fix). Update README feature list, what's-new callout, and test badge. 
Workstream: release --- CHANGELOG.md | 19 +++++++++++++++++++ README.md | 5 +++-- pyproject.toml | 2 +- 3 files changed, 23 insertions(+), 3 deletions(-) diff --git a/CHANGELOG.md b/CHANGELOG.md index b39364e..3326a89 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -5,6 +5,25 @@ All notable changes to this project will be documented in this file. The format is based on [Keep a Changelog](https://keepachangelog.com/en/1.1.0/), and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0.html). +## [1.8.0] - 2026-06-10 + +### Added + +- **Aw-vowel un-mask detector (opt-in).** A pre-normalization detector that surfaces a class of Myanmar spelling errors the normalizer previously masked — flat/tall *aa* swaps in the aw-vowel rime (ော ↔ ေါ), e.g. `ခော်` → `ခေါ်`. Each violation emits one gated correction with a deterministic canonical suggestion. Opt-in via the `detect_aw_vowel_unmask` config flag or the `MSC_DETECT_AW_VOWEL_UNMASK` environment variable; default off. + +### Changed + +- **Hot-path latency reduced ~40% (default, behavior-identical).** Memoized SymSpell nasal-variant validation (instance-level cache keyed by term and level) and the syllable-span probe's per-sentence scoring (LRU). p95 latency drops from 658 ms to 401 ms and mean per-sentence time by ~43%, with byte-identical detections. + +### Fixed + +- **Concurrent batch checking.** The shared probe score cache is now lock-guarded, so concurrent `check_batch_async` workers can no longer raise a `KeyError` under load. +- Orthographic-insertion-rescue corrections now carry an explicit confidence so a recovered correction is not silently withheld, and the aw-vowel detector defers only the overlapping span (not the whole token) to the vowel-reorder detector. Plus internal comment, dead-code, and test-coverage cleanups. 
+ +### Benchmark + +- With the aw-vowel detector enabled, spelling composite improves `0.6520` → `0.6870` (**+0.0350**) on the v1.8.0 benchmark: +98 true positives at zero added false positives, top-1 accuracy 92% on the un-masked corrections, clean false-positive sentences within cap (91/779), p95 382 ms. The benchmark's clean/error annotations were also corrected this release (67 previously-unannotated planted typos). Default behavior (detector off) is unchanged. + ## [1.7.1] - 2026-06-02 ### Added diff --git a/README.md b/README.md index 0ab7d30..c38e4db 100644 --- a/README.md +++ b/README.md @@ -6,7 +6,7 @@ [![Python Version](https://img.shields.io/badge/python-3.10%2B-blue.svg)](https://www.python.org/downloads/) [![License: MIT](https://img.shields.io/badge/License-MIT-yellow.svg)](https://opensource.org/licenses/MIT) [![Coverage](https://img.shields.io/badge/coverage-75%25-green)](tests/) -[![Tests](https://img.shields.io/badge/tests-4%2C940_passed-brightgreen)](tests/) +[![Tests](https://img.shields.io/badge/tests-5%2C182_passed-brightgreen)](tests/) ## Overview @@ -34,6 +34,7 @@ * **Compound & Morpheme Handling**: DP-based compound resolution, ternary compound splits in morpheme correction, productive reduplication validation. * **AI Semantic Checking (Optional)**: ONNX masked language model for context-aware validation. * **Syllable-Span Probe (opt-in, v1.7.1)**: A frozen-encoder neural probe that improves recall on broken-compound, over-segmentation, and consonant-substitution errors. Three strategies share one small model; default-off, enabled via `use_probe_*` config flags or `MSC_USE_PROBE_*` environment variables. +* **Aw-Vowel Un-Mask Detector (opt-in, v1.8.0)**: Surfaces a class of Myanmar aw-vowel spelling errors — flat/tall *aa* swaps in the aw-vowel rime (ော ↔ ေါ, e.g. `ခော်` → `ခေါ်`) — that pre-normalization would otherwise silently repair before validation. 
Default-off, enabled via the `detect_aw_vowel_unmask` config flag or `MSC_DETECT_AW_VOWEL_UNMASK`. * **Named Entity Recognition**: Heuristic and Transformer-based NER to reduce false positives on names and places. ### Dictionary Building Pipeline @@ -65,7 +66,7 @@ Full documentation is available at **[docs.myspellchecker.com](https://docs.myspellchecker.com/)**. -> **What's new in v1.7.1?** See the **[Release Notes](https://docs.myspellchecker.com/reference/release-notes)** for the opt-in **syllable-span probe** — a frozen-encoder neural enhancement (three default-off strategies sharing one small model) that improves recall on broken-compound, over-segmentation, and consonant-substitution errors (+0.0125 composite when enabled). Earlier v1.7.x work added mined-confusable detection, the cross-whitespace and compound-merge probes, the skip-rule confidence gate, and benchmark-hygiene reclassification. +> **What's new in v1.8.0?** See the **[Release Notes](https://docs.myspellchecker.com/reference/release-notes)** for the opt-in **aw-vowel un-mask detector** — it surfaces Myanmar aw-vowel spelling errors (ော ↔ ေါ) the normalizer previously masked (+0.0350 spelling composite when enabled, at zero added false positives). This release also cuts hot-path latency ~40% (p95 658 → 401 ms, with identical results) and hardens concurrent batch checking. The v1.7.1 syllable-span probe remains available behind its opt-in flags. ### Getting Started * **[Introduction](https://docs.myspellchecker.com/introduction)**: Overview of the library and its architecture. 
diff --git a/pyproject.toml b/pyproject.toml index 2b9d980..78951fa 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -4,7 +4,7 @@ build-backend = "setuptools.build_meta" [project] name = "myspellchecker" -version = "1.7.1" +version = "1.8.0" description = "Myanmar (Burmese) text intelligence library — spell checking, grammar validation, dictionary building, and AI model training" readme = "README.md" # NOTE: Codebase uses Python 3.10+ syntax (PEP 604 unions like `str | None`).