From cb319f8b696394a25c8cee4902f76dbb874acb5c Mon Sep 17 00:00:00 2001 From: Simon Date: Mon, 11 May 2026 15:14:48 -0700 Subject: [PATCH] fix(types): prevent false-positive lexical substitution of ambiguous slang terms MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit The cop_informal dictionary concept listed "botón" (button), "agente" (agent), and "cuero" (leather) as avoid-terms because they're regional slang for "police officer" in some dialects. The lexical substitution engine then swapped these common words to dialect-specific cop slang (e.g. "el botón" → "la tomba" for es-CO) even when used in their primary meaning. Add `ambiguous` flag to Variant interface so dictionary entries can mark variants that should be excluded from avoid-term generation. Co-Authored-By: Claude Opus 4.7 --- .../__tests__/lexical-substitution.test.ts | 21 +++++++++++++++++++ packages/types/src/dialectal-dictionary.json | 12 +++++++---- packages/types/src/dialectal-dictionary.ts | 4 ++++ packages/types/src/dialectal-vocabulary.ts | 8 ++++++- 4 files changed, 40 insertions(+), 5 deletions(-) diff --git a/packages/providers/src/__tests__/lexical-substitution.test.ts b/packages/providers/src/__tests__/lexical-substitution.test.ts index b0ef812..25674c8 100644 --- a/packages/providers/src/__tests__/lexical-substitution.test.ts +++ b/packages/providers/src/__tests__/lexical-substitution.test.ts @@ -74,4 +74,25 @@ describe("applyLexicalSubstitution", () => { expect(result).toContain("zumo"); expect(result).not.toContain("jugo"); }); + + it("does not swap ambiguous slang terms with common non-slang meanings", () => { + // "botón" = button (UI) but also es-UY slang for "cop". Must NOT swap to "tomba". + expect(applyLexicalSubstitution("Haga clic en el botón para continuar.", "es-CO")) + .toContain("botón"); + + // "agente" = agent (generic) but also es-EC variant for "cop". Must NOT swap. + expect(applyLexicalSubstitution("El agente secreto fue descubierto.", "es-CO")) + .toContain("agente"); + + // "cuero" = leather but also es-DO slang for "money" and "cop". Must NOT swap. + expect(applyLexicalSubstitution("El cuero del zapato está dañado.", "es-CO")) + .toContain("cuero"); + }); + + it("still swaps unambiguous slang terms correctly", () => { + // "paco" (es-CL cop slang) is unambiguous — should still swap + const result = applyLexicalSubstitution("El paco está en la esquina.", "es-CO"); + expect(result).toContain("tomba"); + expect(result).not.toContain("paco"); + }); }); diff --git a/packages/types/src/dialectal-dictionary.json b/packages/types/src/dialectal-dictionary.json index 2197b00..83360bc 100644 --- a/packages/types/src/dialectal-dictionary.json +++ b/packages/types/src/dialectal-dictionary.json @@ -44897,7 +44897,8 @@ "es-UY": { "term": "botón", "frequency": 1, - "register": "informal" + "register": "informal", + "ambiguous": true }, "es-PY": { "term": "mbae", @@ -44942,7 +44943,8 @@ "es-EC": { "term": "agente", "frequency": 1, - "register": "universal" + "register": "universal", + "ambiguous": true }, "es-BO": { "term": "choco", @@ -44952,7 +44954,8 @@ "es-DO": { "term": "cuero", "frequency": 1, - "register": "informal" + "register": "informal", + "ambiguous": true }, "es-PR": { "term": "policía", @@ -46001,7 +46004,8 @@ "es-DO": { "term": "cuero", "frequency": 2, - "register": "informal" + "register": "informal", + "ambiguous": true }, "es-PR": { "term": "chavos", diff --git a/packages/types/src/dialectal-dictionary.ts b/packages/types/src/dialectal-dictionary.ts index 49521c8..c84264f 100644 --- a/packages/types/src/dialectal-dictionary.ts +++ b/packages/types/src/dialectal-dictionary.ts @@ -19,6 +19,10 @@ export interface Variant { frequency: 1 | 2 | 3; register: "formal" | "informal" | "universal"; notes?: string; + /** When true, this variant has a common non-slang meaning and should not + * be used as an avoid-term for lexical substitution. Prevents false + * positives like "botón" (button) being swapped to "tomba" (cop slang). */ + ambiguous?: boolean; } export interface DictionaryEntry { diff --git a/packages/types/src/dialectal-vocabulary.ts b/packages/types/src/dialectal-vocabulary.ts index b556f92..e217946 100644 --- a/packages/types/src/dialectal-vocabulary.ts +++ b/packages/types/src/dialectal-vocabulary.ts @@ -73,7 +73,13 @@ export function getVocabularyForDialect(dialect: SpanishDialect): VocabularySwap const variant = resolveVariant(entry, dialect); if (!variant) continue; const allTerms = getAllTerms(entry); - const avoidTerms = allTerms.filter(t => t !== variant.term); + // Build avoid-terms, excluding ambiguous variants that have common + // non-slang meanings (e.g. "botón" = button, not just es-UY cop slang). + const ambiguousTerms = new Set(); + for (const v of Object.values(entry.variants ?? {})) { + if (v?.ambiguous) ambiguousTerms.add(v.term); + } + const avoidTerms = allTerms.filter(t => t !== variant.term && !ambiguousTerms.has(t)); swaps.push({ concept: entry.concept, englishGloss: entry.englishGloss,