From 3f9e04ff4f25b19bf8e06dba760baa77633d8110 Mon Sep 17 00:00:00 2001 From: "Shane F. Carr" Date: Wed, 12 Mar 2025 19:23:28 -0700 Subject: [PATCH 1/5] Add variant selector support to library java --- .../ZawgyiUnicodeMarkovModel.java | 28 ++++++++++++++++--- 1 file changed, 24 insertions(+), 4 deletions(-) diff --git a/clients/java/src/main/java/com/google/myanmartools/ZawgyiUnicodeMarkovModel.java b/clients/java/src/main/java/com/google/myanmartools/ZawgyiUnicodeMarkovModel.java index 02eef960..a741ef4f 100644 --- a/clients/java/src/main/java/com/google/myanmartools/ZawgyiUnicodeMarkovModel.java +++ b/clients/java/src/main/java/com/google/myanmartools/ZawgyiUnicodeMarkovModel.java @@ -63,13 +63,21 @@ class ZawgyiUnicodeMarkovModel { private static final int SPC_CP0 = '\u2000'; private static final int SPC_CP1 = '\u200B'; + // Variation selectors + private static final int VAR_CP0 = '\uFE00'; + private static final int VAR_CP1 = '\uFE0D'; + // Indices into Markov nodes private static final short STD_OFFSET = 1; private static final short AFT_OFFSET = STD_OFFSET + STD_CP1 - STD_CP0 + 1; private static final short EXA_OFFSET = AFT_OFFSET + AFT_CP1 - AFT_CP0 + 1; private static final short EXB_OFFSET = EXA_OFFSET + EXA_CP1 - EXA_CP0 + 1; private static final short SPC_OFFSET = EXB_OFFSET + EXB_CP1 - EXB_CP0 + 1; - private static final short END_OFFSET = SPC_OFFSET + SPC_CP1 - SPC_CP0 + 1; + + // Number of code points, which depends on the SSV + private static final short END_OFFSET_0 = SPC_OFFSET + SPC_CP1 - SPC_CP0 + 1; + private static final short END_OFFSET_1 = SPC_OFFSET; + private static final short END_OFFSET_2 = SPC_OFFSET + VAR_CP1 - VAR_CP0 + 1; /** * SSV: An ID representing which Unicode code points to include in the model: @@ -81,7 +89,8 @@ class ZawgyiUnicodeMarkovModel { */ static final int SSV_STD_EXA_EXB_SPC = 0; static final int SSV_STD_EXA_EXB = 1; - static final int SSV_COUNT = 2; + static final int SSV_STD_EXA_EXB_VAR = 2; + static final int SSV_COUNT = 3; /** * Returns the index of the state in the Markov chain corresponding to the given code point. @@ -112,12 +121,23 @@ static int getIndexForCodePoint(int cp, int ssv) { if (ssv == SSV_STD_EXA_EXB_SPC && SPC_CP0 <= cp && cp <= SPC_CP1) { return cp - SPC_CP0 + SPC_OFFSET; } + if (ssv == SSV_STD_EXA_EXB_VAR && VAR_CP0 <= cp && cp <= VAR_CP1) { + return cp - VAR_CP0 + SPC_OFFSET; + } return 0; } /** The number of states in the Markov chain. */ static short getSize(int ssv) { - return ssv == SSV_STD_EXA_EXB_SPC ? END_OFFSET : SPC_OFFSET; + if (ssv == 0) { + return END_OFFSET_0; + } else if (ssv == 1) { + return END_OFFSET_1; + } else if (ssv == 2) { + return END_OFFSET_2; + } else { + throw new AssertionError(); + } } final BinaryMarkov classifier; @@ -159,7 +179,7 @@ public ZawgyiUnicodeMarkovModel(InputStream stream) throws IOException { if (ssv < 0 || ssv >= SSV_COUNT) { throw new IOException( String.format( - "Unexpected value in ssv position; expected 0 or 1 but got %08X", + "Unexpected value in ssv position; expected 0-2 but got %08X", ssv)); } classifier = new BinaryMarkov(stream); From dc169dede58c0abbba8bbff3500d13fe76d96717 Mon Sep 17 00:00:00 2001 From: "Shane F. Carr" Date: Wed, 12 Mar 2025 19:26:01 -0700 Subject: [PATCH 2/5] Use trainedModel.dat as intermediate file name (increased confidence only) --- Makefile | 18 +++++++++--------- .../GenerateCompatibilityTSV.java | 2 +- ...awgyiUnicodeModel.dat => trainedModel.dat} | Bin 3 files changed, 10 insertions(+), 10 deletions(-) rename training/src/main/resources/com/google/myanmartools/{zawgyiUnicodeModel.dat => trainedModel.dat} (100%) diff --git a/Makefile b/Makefile index 6cafdbb3..021492a3 100644 --- a/Makefile +++ b/Makefile @@ -54,15 +54,15 @@ testData.tsv: training/target $(MVN) -f training/pom.xml -q process-resources copy-resources: - cp training/src/main/resources/com/google/myanmartools/zawgyiUnicodeModel.dat clients/java/src/main/resources/com/google/myanmartools - cp training/src/main/resources/com/google/myanmartools/zawgyiUnicodeModel.dat clients/cpp/resources - cp training/src/main/resources/com/google/myanmartools/zawgyiUnicodeModel.dat clients/js/resources - cp training/src/main/resources/com/google/myanmartools/zawgyiUnicodeModel.dat clients/ruby/lib/myanmar-tools/resources - cp training/src/main/resources/com/google/myanmartools/zawgyiUnicodeModel.dat clients/php/resources - cp training/src/main/resources/com/google/myanmartools/zawgyiUnicodeModel.dat clients/go/resources - cp training/src/main/resources/com/google/myanmartools/zawgyiUnicodeModel.dat clients/python/src/myanmartools/resources - cp training/src/main/resources/com/google/myanmartools/zawgyiUnicodeModel.dat clients/dart/resources - cp training/src/main/resources/com/google/myanmartools/zawgyiUnicodeModel.dat clients/c#/Resources + cp training/src/main/resources/com/google/myanmartools/trainedModel.dat clients/java/src/main/resources/com/google/myanmartools/zawgyiUnicodeModel.dat + cp training/src/main/resources/com/google/myanmartools/trainedModel.dat clients/cpp/resources/zawgyiUnicodeModel.dat + cp training/src/main/resources/com/google/myanmartools/trainedModel.dat clients/js/resources/zawgyiUnicodeModel.dat + cp training/src/main/resources/com/google/myanmartools/trainedModel.dat clients/ruby/lib/myanmar-tools/resources/zawgyiUnicodeModel.dat + cp training/src/main/resources/com/google/myanmartools/trainedModel.dat clients/php/resources/zawgyiUnicodeModel.dat + cp training/src/main/resources/com/google/myanmartools/trainedModel.dat clients/go/resources/zawgyiUnicodeModel.dat + cp training/src/main/resources/com/google/myanmartools/trainedModel.dat clients/python/src/myanmartools/resources/zawgyiUnicodeModel.dat + cp training/src/main/resources/com/google/myanmartools/trainedModel.dat clients/dart/resources/zawgyiUnicodeModel.dat + cp training/src/main/resources/com/google/myanmartools/trainedModel.dat clients/c#/Resources/zawgyiUnicodeModel.dat cp training/src/main/resources/com/google/myanmartools/compatibility.tsv clients/java/src/test/resources/com/google/myanmartools cp training/src/main/resources/com/google/myanmartools/compatibility.tsv clients/cpp/resources cp training/src/main/resources/com/google/myanmartools/compatibility.tsv clients/js/resources diff --git a/training/src/main/java/com/google/myanmartools/GenerateCompatibilityTSV.java b/training/src/main/java/com/google/myanmartools/GenerateCompatibilityTSV.java index 3a620ab9..f75d10ff 100644 --- a/training/src/main/java/com/google/myanmartools/GenerateCompatibilityTSV.java +++ b/training/src/main/java/com/google/myanmartools/GenerateCompatibilityTSV.java @@ -38,7 +38,7 @@ private static InputStream getResourceAsStream(String path) throws IOException { public static void main(String[] args) throws IOException { BufferedReader tsvReader = new BufferedReader(new InputStreamReader(getResourceAsStream("com/google/myanmartools/compatibility.tsv"), UTF_8)); - ZawgyiDetector detector = new ZawgyiDetector(getResourceAsStream("com/google/myanmartools/zawgyiUnicodeModel.dat")); + ZawgyiDetector detector = new ZawgyiDetector(getResourceAsStream("com/google/myanmartools/trainedModel.dat")); String line; while ((line = tsvReader.readLine()) != null) { String input = line.split("\t", -1)[1].trim(); diff --git a/training/src/main/resources/com/google/myanmartools/zawgyiUnicodeModel.dat b/training/src/main/resources/com/google/myanmartools/trainedModel.dat similarity index 100% rename from training/src/main/resources/com/google/myanmartools/zawgyiUnicodeModel.dat rename to training/src/main/resources/com/google/myanmartools/trainedModel.dat From e2cde513b981c0a3a3f398866f22e3d8abeb7d19 Mon Sep 17 00:00:00 2001 From: "Shane F. Carr" Date: Wed, 12 Mar 2025 19:26:28 -0700 Subject: [PATCH 3/5] Improve SSV option parsing --- .../google/myanmartools/GenerateZawgyiUnicodeModelDAT.java | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/training/src/main/java/com/google/myanmartools/GenerateZawgyiUnicodeModelDAT.java b/training/src/main/java/com/google/myanmartools/GenerateZawgyiUnicodeModelDAT.java index c9a6c1a6..c5985fa1 100644 --- a/training/src/main/java/com/google/myanmartools/GenerateZawgyiUnicodeModelDAT.java +++ b/training/src/main/java/com/google/myanmartools/GenerateZawgyiUnicodeModelDAT.java @@ -40,8 +40,8 @@ public static void main(String[] args) throws IOException { BurmeseData.DATA_DIRECTORY = args[0]; int ssv = 0; - if (args.length >= 2 && args[1].equals("1")) { - ssv = 1; + if (args.length >= 2) { + ssv = Integer.parseInt(args[1]); } ZawgyiUnicodeMarkovModelBuilder builder = new ZawgyiUnicodeMarkovModelBuilder(ssv); From d64b1898cf01505276f5a2d149dbb3b982d3fd00 Mon Sep 17 00:00:00 2001 From: "Shane F. Carr" Date: Wed, 12 Mar 2025 19:28:02 -0700 Subject: [PATCH 4/5] Explicitly set SSV in Makefile --- Makefile | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/Makefile b/Makefile index 021492a3..1a41568a 100644 --- a/Makefile +++ b/Makefile @@ -29,6 +29,7 @@ PHPUNIT=./vendor/bin/phpunit PYTHON=python FLUTTER=flutter DOTNET=dotnet +SSV=0 # /usr/bin/swift is under macOS System Integrity Protection, which # filters out environment variables like DYLD_LIBRARY_PATH which let @@ -42,7 +43,7 @@ training/target: $(wildcard training/src/**/*) $(MVN) -f training/pom.xml -q compile zawgyiUnicodeModel.dat: training/target - TMP=`mktemp`; $(MVN) -f training/pom.xml -q -e exec:java -Dexec.args="'$(CORPUS)'" > $$TMP; if [ $$? -ne 0 ]; then cat $$TMP; rm $$TMP; exit 1; else mv $$TMP training/src/main/resources/com/google/myanmartools/zawgyiUnicodeModel.dat; exit 0; fi + TMP=`mktemp`; $(MVN) -f training/pom.xml -q -e exec:java -Dexec.args="'$(CORPUS)' $(SSV)" > $$TMP; if [ $$? -ne 0 ]; then cat $$TMP; rm $$TMP; exit 1; else mv $$TMP training/src/main/resources/com/google/myanmartools/trainedModel.dat; exit 0; fi $(MVN) -f training/pom.xml -q process-resources compatibility.tsv: zawgyiUnicodeModel.dat training/target From c47fbfdc148ba2f243a4f9df9347eb4927dc1d0b Mon Sep 17 00:00:00 2001 From: "Shane F. Carr" Date: Wed, 12 Mar 2025 19:36:37 -0700 Subject: [PATCH 5/5] Add Phake language test --- .../test/java/com/google/myanmartools/ZawgyiDetectorTest.java | 1 + 1 file changed, 1 insertion(+) diff --git a/clients/java/src/test/java/com/google/myanmartools/ZawgyiDetectorTest.java b/clients/java/src/test/java/com/google/myanmartools/ZawgyiDetectorTest.java index 1ce76f77..f0831768 100644 --- a/clients/java/src/test/java/com/google/myanmartools/ZawgyiDetectorTest.java +++ b/clients/java/src/test/java/com/google/myanmartools/ZawgyiDetectorTest.java @@ -94,6 +94,7 @@ public void testDifficult() { {"အစားထိုး အထူးအက္ခရာ", 0.995}, // Truth: Unicode (confirmed by yinmay@) {"ယခု မိုးရွာနေပါသလား။", 0.995}, // Truth: Unicode (confirmed by yinmay@) {"အခြား", 0.74}, // Truth: Unicode (confirmed by yinmay@) + {"ၸ︀ၞ်ꩭူမ︀ႃꩭေ︀ႃၺꩫ︀်ၸ︀ြႃကꩭၞ်ꩫ︀ႝမ︀ွက︀်လ︀ွ် ꩡ︀ွ်တ︀ႃ ။", 0.995}, // Truth: Unicode (Phake language, #119) // DIFFICULT STRINGS THAT DETECT CORRECTLY // Changes to the detector should not significantly change these scores.