Skip to content
Draft
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
21 changes: 11 additions & 10 deletions Makefile
Original file line number Diff line number Diff line change
Expand Up @@ -29,6 +29,7 @@ PHPUNIT=./vendor/bin/phpunit
PYTHON=python
FLUTTER=flutter
DOTNET=dotnet
SSV=0

# /usr/bin/swift is under macOS System Integrity Protection, which
# filters out environment variables like DYLD_LIBRARY_PATH which let
Expand All @@ -42,7 +43,7 @@ training/target: $(wildcard training/src/**/*)
$(MVN) -f training/pom.xml -q compile

# Trains the detection model. Each TMP=... line runs the training module's
# exec:java goal and captures its standard output in a mktemp file; the first
# passes only $(CORPUS), the second passes $(CORPUS) and $(SSV). If Maven exits
# non-zero, the captured output is printed, the temp file is removed, and the
# recipe exits 1. Otherwise the temp file is moved into
# training/src/main/resources/com/google/myanmartools/.
# A process-resources run follows.
# NOTE(review): the two TMP=... lines look like the before/after sides of a
# diff (output renamed from zawgyiUnicodeModel.dat to trainedModel.dat) --
# confirm which one the real Makefile keeps.
# NOTE(review): neither line writes a file at the target's own path
# (./zawgyiUnicodeModel.dat); unless such a file exists, make re-runs this
# recipe every time the target is requested.
zawgyiUnicodeModel.dat: training/target
TMP=`mktemp`; $(MVN) -f training/pom.xml -q -e exec:java -Dexec.args="'$(CORPUS)'" > $$TMP; if [ $$? -ne 0 ]; then cat $$TMP; rm $$TMP; exit 1; else mv $$TMP training/src/main/resources/com/google/myanmartools/zawgyiUnicodeModel.dat; exit 0; fi
TMP=`mktemp`; $(MVN) -f training/pom.xml -q -e exec:java -Dexec.args="'$(CORPUS)' $(SSV)" > $$TMP; if [ $$? -ne 0 ]; then cat $$TMP; rm $$TMP; exit 1; else mv $$TMP training/src/main/resources/com/google/myanmartools/trainedModel.dat; exit 0; fi
$(MVN) -f training/pom.xml -q process-resources

compatibility.tsv: zawgyiUnicodeModel.dat training/target
Expand All @@ -54,15 +55,15 @@ testData.tsv: training/target
$(MVN) -f training/pom.xml -q process-resources

copy-resources:
cp training/src/main/resources/com/google/myanmartools/zawgyiUnicodeModel.dat clients/java/src/main/resources/com/google/myanmartools
cp training/src/main/resources/com/google/myanmartools/zawgyiUnicodeModel.dat clients/cpp/resources
cp training/src/main/resources/com/google/myanmartools/zawgyiUnicodeModel.dat clients/js/resources
cp training/src/main/resources/com/google/myanmartools/zawgyiUnicodeModel.dat clients/ruby/lib/myanmar-tools/resources
cp training/src/main/resources/com/google/myanmartools/zawgyiUnicodeModel.dat clients/php/resources
cp training/src/main/resources/com/google/myanmartools/zawgyiUnicodeModel.dat clients/go/resources
cp training/src/main/resources/com/google/myanmartools/zawgyiUnicodeModel.dat clients/python/src/myanmartools/resources
cp training/src/main/resources/com/google/myanmartools/zawgyiUnicodeModel.dat clients/dart/resources
cp training/src/main/resources/com/google/myanmartools/zawgyiUnicodeModel.dat clients/c#/Resources
cp training/src/main/resources/com/google/myanmartools/trainedModel.dat clients/java/src/main/resources/com/google/myanmartools/zawgyiUnicodeModel.dat
cp training/src/main/resources/com/google/myanmartools/trainedModel.dat clients/cpp/resources/zawgyiUnicodeModel.dat
cp training/src/main/resources/com/google/myanmartools/trainedModel.dat clients/js/resources/zawgyiUnicodeModel.dat
cp training/src/main/resources/com/google/myanmartools/trainedModel.dat clients/ruby/lib/myanmar-tools/resources/zawgyiUnicodeModel.dat
cp training/src/main/resources/com/google/myanmartools/trainedModel.dat clients/php/resources/zawgyiUnicodeModel.dat
cp training/src/main/resources/com/google/myanmartools/trainedModel.dat clients/go/resources/zawgyiUnicodeModel.dat
cp training/src/main/resources/com/google/myanmartools/trainedModel.dat clients/python/src/myanmartools/resources/zawgyiUnicodeModel.dat
cp training/src/main/resources/com/google/myanmartools/trainedModel.dat clients/dart/resources/zawgyiUnicodeModel.dat
cp training/src/main/resources/com/google/myanmartools/trainedModel.dat clients/c#/Resources/zawgyiUnicodeModel.dat
cp training/src/main/resources/com/google/myanmartools/compatibility.tsv clients/java/src/test/resources/com/google/myanmartools
cp training/src/main/resources/com/google/myanmartools/compatibility.tsv clients/cpp/resources
cp training/src/main/resources/com/google/myanmartools/compatibility.tsv clients/js/resources
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -63,13 +63,21 @@ class ZawgyiUnicodeMarkovModel {
private static final int SPC_CP0 = '\u2000';
private static final int SPC_CP1 = '\u200B';

// Variation selectors
private static final int VAR_CP0 = '\uFE00';
private static final int VAR_CP1 = '\uFE0D';

// Indices into Markov nodes
private static final short STD_OFFSET = 1;
private static final short AFT_OFFSET = STD_OFFSET + STD_CP1 - STD_CP0 + 1;
private static final short EXA_OFFSET = AFT_OFFSET + AFT_CP1 - AFT_CP0 + 1;
private static final short EXB_OFFSET = EXA_OFFSET + EXA_CP1 - EXA_CP0 + 1;
private static final short SPC_OFFSET = EXB_OFFSET + EXB_CP1 - EXB_CP0 + 1;
private static final short END_OFFSET = SPC_OFFSET + SPC_CP1 - SPC_CP0 + 1;

// Number of Markov states (the value returned by getSize), which depends on the SSV
private static final short END_OFFSET_0 = SPC_OFFSET + SPC_CP1 - SPC_CP0 + 1;
private static final short END_OFFSET_1 = SPC_OFFSET;
private static final short END_OFFSET_2 = SPC_OFFSET + VAR_CP1 - VAR_CP0 + 1;

/**
* SSV: An ID representing which Unicode code points to include in the model:
Expand All @@ -81,7 +89,8 @@ class ZawgyiUnicodeMarkovModel {
*/
static final int SSV_STD_EXA_EXB_SPC = 0;
static final int SSV_STD_EXA_EXB = 1;
static final int SSV_COUNT = 2;
static final int SSV_STD_EXA_EXB_VAR = 2;
static final int SSV_COUNT = 3;

/**
* Returns the index of the state in the Markov chain corresponding to the given code point.
Expand Down Expand Up @@ -112,12 +121,23 @@ static int getIndexForCodePoint(int cp, int ssv) {
if (ssv == SSV_STD_EXA_EXB_SPC && SPC_CP0 <= cp && cp <= SPC_CP1) {
return cp - SPC_CP0 + SPC_OFFSET;
}
if (ssv == SSV_STD_EXA_EXB_VAR && VAR_CP0 <= cp && cp <= VAR_CP1) {
return cp - VAR_CP0 + SPC_OFFSET;
}
return 0;
}

/**
 * Returns the number of states in the Markov chain for the given SSV.
 *
 * @param ssv one of SSV_STD_EXA_EXB_SPC, SSV_STD_EXA_EXB, or SSV_STD_EXA_EXB_VAR
 * @return END_OFFSET_0, END_OFFSET_1, or END_OFFSET_2, matching the SSV
 * @throws AssertionError if ssv is not one of the known SSV values
 */
static short getSize(int ssv) {
  // Dispatch on the named SSV constants rather than bare 0/1/2 so this method
  // stays in step with the SSV_* declarations.
  switch (ssv) {
    case SSV_STD_EXA_EXB_SPC:
      return END_OFFSET_0;
    case SSV_STD_EXA_EXB:
      return END_OFFSET_1;
    case SSV_STD_EXA_EXB_VAR:
      return END_OFFSET_2;
    default:
      // Same exception type as before; the message names the offending value.
      throw new AssertionError("Unknown ssv: " + ssv);
  }
}

final BinaryMarkov classifier;
Expand Down Expand Up @@ -159,7 +179,7 @@ public ZawgyiUnicodeMarkovModel(InputStream stream) throws IOException {
if (ssv < 0 || ssv >= SSV_COUNT) {
throw new IOException(
String.format(
"Unexpected value in ssv position; expected 0 or 1 but got %08X",
"Unexpected value in ssv position; expected 0-2 but got %08X",
ssv));
}
classifier = new BinaryMarkov(stream);
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -94,6 +94,7 @@ public void testDifficult() {
{"အစားထိုး အထူးအက္ခရာ", 0.995}, // Truth: Unicode (confirmed by yinmay@)
{"ယခု မိုးရွာနေပါသလား။", 0.995}, // Truth: Unicode (confirmed by yinmay@)
{"အခြား", 0.74}, // Truth: Unicode (confirmed by yinmay@)
{"ၸ︀ၞ်ꩭူမ︀ႃꩭေ︀ႃၺꩫ︀်ၸ︀ြႃကꩭၞ်ꩫ︀ႝမ︀ွက︀်လ︀ွ် ꩡ︀ွ်တ︀ႃ ။", 0.995}, // Truth: Unicode (Phake language, #119)

// DIFFICULT STRINGS THAT DETECT CORRECTLY
// Changes to the detector should not significantly change these scores.
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -38,7 +38,7 @@ private static InputStream getResourceAsStream(String path) throws IOException {
public static void main(String[] args) throws IOException {
BufferedReader tsvReader =
new BufferedReader(new InputStreamReader(getResourceAsStream("com/google/myanmartools/compatibility.tsv"), UTF_8));
ZawgyiDetector detector = new ZawgyiDetector(getResourceAsStream("com/google/myanmartools/zawgyiUnicodeModel.dat"));
ZawgyiDetector detector = new ZawgyiDetector(getResourceAsStream("com/google/myanmartools/trainedModel.dat"));
String line;
while ((line = tsvReader.readLine()) != null) {
String input = line.split("\t", -1)[1].trim();
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -40,8 +40,8 @@ public static void main(String[] args) throws IOException {
BurmeseData.DATA_DIRECTORY = args[0];

int ssv = 0;
if (args.length >= 2 && args[1].equals("1")) {
ssv = 1;
if (args.length >= 2) {
ssv = Integer.parseInt(args[1]);
}

ZawgyiUnicodeMarkovModelBuilder builder = new ZawgyiUnicodeMarkovModelBuilder(ssv);
Expand Down