From e355061ffd32b4a442051681a98e06a3844aa08e Mon Sep 17 00:00:00 2001 From: Sylvain Jermini Date: Fri, 19 Jun 2026 23:43:00 +0200 Subject: [PATCH 1/9] test with double array trie --- .../jfiveparse/DoubleArrayTrie.java | 209 ++++++++++++++++++ .../jfiveparse/GenerateEntities.java | 20 ++ 2 files changed, 229 insertions(+) create mode 100644 src/test/java/ch/digitalfondue/jfiveparse/DoubleArrayTrie.java diff --git a/src/test/java/ch/digitalfondue/jfiveparse/DoubleArrayTrie.java b/src/test/java/ch/digitalfondue/jfiveparse/DoubleArrayTrie.java new file mode 100644 index 0000000..63167b3 --- /dev/null +++ b/src/test/java/ch/digitalfondue/jfiveparse/DoubleArrayTrie.java @@ -0,0 +1,209 @@ +package ch.digitalfondue.jfiveparse; + +import org.junit.jupiter.api.Test; + +import java.util.ArrayList; +import java.util.Arrays; +import java.util.List; + +// from https://medium.com/@lchang1994/deep-dive-dat-double-array-trie-f51e5e5f006c +// TODO: extract all letters in the key of entities -> generate a mapping with a 1 index +public class DoubleArrayTrie { + private int[] check; + private int[] base; + + public DoubleArrayTrie() { + check = new int[128]; + base = new int[128]; + + Arrays.fill(check, -1); // -1 means empty + Arrays.fill(base, 0); + + base[1] = 1; // Root is 1 + check[0] = -1; // Root has no parent + } + + private void expand(int index) { + if (index < base.length) { + return; + } + int oldSize = base.length; + int newSize = Math.max(index + 1, oldSize * 2); + + // Resize check array and fill new slots with -1 + check = Arrays.copyOf(check, newSize); + Arrays.fill(check, oldSize, newSize, -1); + + // Resize base array (new slots automatically defaults to 0) + base = Arrays.copyOf(base, newSize); + } + + public int charId(char c) { + // 1-based index for characters 'a'-'z'. + return c - 'a' + 1; + } + + public void insert(String word) { + int node = 1; + for (int i = 0; i < word.length(); i++) { + int code = charId(word.charAt(i)); + + // Ensure base has a valid offset (0 means no children yet) + if (base[node] == 0) { + base[node] = 1; // Default offset 1 + } + + // Use Math.abs() because base might be negative if it's a terminal node + int offset = Math.abs(base[node]); + int nextNode = offset + code; + expand(nextNode); + + if (check[nextNode] == -1) { + // Free spot + check[nextNode] = node; + } else if (check[nextNode] != node) { + // Collision + resolveCollision(node, code); + // Re-calculate after resolution + offset = Math.abs(base[node]); + nextNode = offset + code; + expand(nextNode); + check[nextNode] = node; + } + + node = nextNode; + } + + setTerminal(node); + } + + private void setTerminal(int node) { + if (base[node] == 0) { + base[node] = -1; // Terminal, offset 1 + } else if (base[node] > 0) { + base[node] = -base[node]; + } + } + + private int isTerminal(int node) { + if (node >= base.length) { + return 0; + } + return base[node] < 0 ? 1 : 0; + } + + private void resolveCollision(int node, int newCharCode) { + List children = getChildren(node); + children.add(newCharCode); + + int newBaseOffset = findValidBase(children); + int oldBaseOffset = Math.abs(base[node]); + + // Apply new base, maintaining terminal status + if (base[node] < 0) { + base[node] = -newBaseOffset; + } else { + base[node] = newBaseOffset; + } + + for (int c : children) { + if (c == newCharCode) { + continue; + } + + int oldIdx = oldBaseOffset + c; + int newIdx = newBaseOffset + c; + expand(newIdx); + + // Move node info: base, check + base[newIdx] = base[oldIdx]; + check[newIdx] = node; // Parent is still 'node' + + // If the moved child has children, update their parent pointers (check) + if (base[oldIdx] != 0) { + int childBaseOffset = Math.abs(base[oldIdx]); + List grandchildren = getChildrenOfOffset(oldIdx, childBaseOffset); + for (int gcCode : grandchildren) { + int gcIdx = childBaseOffset + gcCode; + check[gcIdx] = newIdx; + } + } + + // Clear old spot + check[oldIdx] = -1; + base[oldIdx] = 0; + } + } + + private int findValidBase(List children) { + int q = 1; + while (true) { + boolean ok = true; + for (int c : children) { + int idx = q + c; + if (idx < check.length && check[idx] != -1) { + ok = false; + break; + } + } + if (ok) { + return q; + } + q++; + } + } + + private List getChildren(int node) { + return getChildrenOfOffset(node, Math.abs(base[node])); + } + + private List getChildrenOfOffset(int nodeIdx, int baseOffset) { + List children = new ArrayList<>(); + if (baseOffset == 0) { + return children; + } + // 'a'-'z' maps to 1..26 + for (int c = 1; c <= 26; c++) { + int idx = baseOffset + c; + if (idx < check.length && check[idx] == nodeIdx) { + children.add(c); + } + } + return children; + } + + // -1 don't exists, 0 is a sub element, 1 is terminal + public int lookup(String word) { + int node = 1; + for (int i = 0; i < word.length(); i++) { + int code = charId(word.charAt(i)); + int offset = Math.abs(base[node]); + if (offset == 0) { + return -1; + } + + int nextNode = offset + code; + if (nextNode >= check.length || check[nextNode] != node) { + return -1; + } + node = nextNode; + } + return isTerminal(node); + } + + + @Test + void check() { + var dat = new DoubleArrayTrie(); + + var words = List.of("dog", "apple", "app", "banana", "band", "b"); + for (var v : words) { + dat.insert(v); + } + var testWords = List.of("dog", "apple", "app", "banana", "band", "b", "appl", "ban", "c", "", "&", "."); + for (var w : testWords) { + var found = dat.lookup(w); + System.err.println(w + " " + found); + } + } +} diff --git a/src/test/java/ch/digitalfondue/jfiveparse/GenerateEntities.java b/src/test/java/ch/digitalfondue/jfiveparse/GenerateEntities.java index 77711ef..11898c5 100644 --- a/src/test/java/ch/digitalfondue/jfiveparse/GenerateEntities.java +++ b/src/test/java/ch/digitalfondue/jfiveparse/GenerateEntities.java @@ -24,7 +24,10 @@ import java.lang.reflect.Type; import java.nio.file.Files; import java.nio.file.Paths; +import java.util.ArrayList; import java.util.Map; +import java.util.TreeMap; +import java.util.TreeSet; import java.util.zip.GZIPOutputStream; /** @@ -43,7 +46,23 @@ public static void main(String[] args) throws IOException { String json = Files.readString(Paths.get("src/test/resources/entities.json")); Map m = new GsonBuilder().create().fromJson(json, type); + var allUsedChars = new TreeSet(); + for (String key: m.keySet()) { + key.chars().forEach(allUsedChars::add); + } + var allChars = new ArrayList<>(allUsedChars); + + for (int i = 0; i < allChars.size(); i++) { + System.err.println((i+1) + ":" + allChars.get(i)); + } + + System.err.println(allUsedChars.first() + " " + allUsedChars.last()); + // we can iterate for making a and array for mapping char -> id! + // and also generate the range! + + EntitiesPrefix p = new EntitiesPrefix(null); + //DoubleArrayTrie dat = new DoubleArrayTrie(); ByteArrayOutputStream baosOneCodePoint = new ByteArrayOutputStream(); GZIPOutputStream osOneCodePoint = new GZIPOutputStream(baosOneCodePoint); @@ -53,6 +72,7 @@ public static void main(String[] args) throws IOException { int twoCodePointLength = 0; for (String key : m.keySet()) { + //dat.insert(key); p.addWord(key, m.get(key).codepoints); if (m.get(key).codepoints.length == 1) { From 5f47bca4b4cc3246913477e6f6b09d573db798c7 Mon Sep 17 00:00:00 2001 From: Sylvain Jermini Date: Sat, 20 Jun 2026 00:11:17 +0200 Subject: [PATCH 2/9] tmp --- .../java/ch/digitalfondue/jfiveparse/GenerateEntities.java | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/src/test/java/ch/digitalfondue/jfiveparse/GenerateEntities.java b/src/test/java/ch/digitalfondue/jfiveparse/GenerateEntities.java index 11898c5..9836a34 100644 --- a/src/test/java/ch/digitalfondue/jfiveparse/GenerateEntities.java +++ b/src/test/java/ch/digitalfondue/jfiveparse/GenerateEntities.java @@ -52,9 +52,11 @@ public static void main(String[] args) throws IOException { } var allChars = new ArrayList<>(allUsedChars); + // create support array for (int i = 0; i < allChars.size(); i++) { - System.err.println((i+1) + ":" + allChars.get(i)); + System.err.println("entitiesCharIdx[" + (allChars.get(i) -38)+"] = "+ (i+1)); } + // System.err.println(allUsedChars.first() + " " + allUsedChars.last()); // we can iterate for making a and array for mapping char -> id! From 578121c30440a480c0e3a8f245951a4de484b723 Mon Sep 17 00:00:00 2001 From: Sylvain Jermini Date: Sat, 20 Jun 2026 10:26:20 +0200 Subject: [PATCH 3/9] add char mapping --- .../jfiveparse/DoubleArrayTrie.java | 80 ++++++++++++++++++- .../jfiveparse/GenerateEntities.java | 35 ++++++-- 2 files changed, 105 insertions(+), 10 deletions(-) diff --git a/src/test/java/ch/digitalfondue/jfiveparse/DoubleArrayTrie.java b/src/test/java/ch/digitalfondue/jfiveparse/DoubleArrayTrie.java index 63167b3..caa56c8 100644 --- a/src/test/java/ch/digitalfondue/jfiveparse/DoubleArrayTrie.java +++ b/src/test/java/ch/digitalfondue/jfiveparse/DoubleArrayTrie.java @@ -7,7 +7,6 @@ import java.util.List; // from https://medium.com/@lchang1994/deep-dive-dat-double-array-trie-f51e5e5f006c -// TODO: extract all letters in the key of entities -> generate a mapping with a 1 index public class DoubleArrayTrie { private int[] check; private int[] base; @@ -39,14 +38,16 @@ private void expand(int index) { } public int charId(char c) { - // 1-based index for characters 'a'-'z'. - return c - 'a' + 1; + return c >= 38 && c <= 122 ? entitiesCharIdx[c - 38] : -1; } public void insert(String word) { int node = 1; for (int i = 0; i < word.length(); i++) { int code = charId(word.charAt(i)); + if (code == -1) { + throw new IllegalStateException("Character " + word.charAt(i) + " is not mapped"); + } // Ensure base has a valid offset (0 means no children yet) if (base[node] == 0) { @@ -177,6 +178,9 @@ public int lookup(String word) { int node = 1; for (int i = 0; i < word.length(); i++) { int code = charId(word.charAt(i)); + if (code == -1) { + return -1; + } int offset = Math.abs(base[node]); if (offset == 0) { return -1; @@ -206,4 +210,74 @@ void check() { System.err.println(w + " " + found); } } + + + + + private static final int[] entitiesCharIdx = new int[85]; + static { + Arrays.fill(entitiesCharIdx, -1); + entitiesCharIdx[0] = 1; + entitiesCharIdx[11] = 2; + entitiesCharIdx[12] = 3; + entitiesCharIdx[13] = 4; + entitiesCharIdx[14] = 5; + entitiesCharIdx[15] = 6; + entitiesCharIdx[16] = 7; + entitiesCharIdx[17] = 8; + entitiesCharIdx[18] = 9; + entitiesCharIdx[21] = 10; + entitiesCharIdx[27] = 11; + entitiesCharIdx[28] = 12; + entitiesCharIdx[29] = 13; + entitiesCharIdx[30] = 14; + entitiesCharIdx[31] = 15; + entitiesCharIdx[32] = 16; + entitiesCharIdx[33] = 17; + entitiesCharIdx[34] = 18; + entitiesCharIdx[35] = 19; + entitiesCharIdx[36] = 20; + entitiesCharIdx[37] = 21; + entitiesCharIdx[38] = 22; + entitiesCharIdx[39] = 23; + entitiesCharIdx[40] = 24; + entitiesCharIdx[41] = 25; + entitiesCharIdx[42] = 26; + entitiesCharIdx[43] = 27; + entitiesCharIdx[44] = 28; + entitiesCharIdx[45] = 29; + entitiesCharIdx[46] = 30; + entitiesCharIdx[47] = 31; + entitiesCharIdx[48] = 32; + entitiesCharIdx[49] = 33; + entitiesCharIdx[50] = 34; + entitiesCharIdx[51] = 35; + entitiesCharIdx[52] = 36; + entitiesCharIdx[59] = 37; + entitiesCharIdx[60] = 38; + entitiesCharIdx[61] = 39; + entitiesCharIdx[62] = 40; + entitiesCharIdx[63] = 41; + entitiesCharIdx[64] = 42; + entitiesCharIdx[65] = 43; + entitiesCharIdx[66] = 44; + entitiesCharIdx[67] = 45; + entitiesCharIdx[68] = 46; + entitiesCharIdx[69] = 47; + entitiesCharIdx[70] = 48; + entitiesCharIdx[71] = 49; + entitiesCharIdx[72] = 50; + entitiesCharIdx[73] = 51; + entitiesCharIdx[74] = 52; + entitiesCharIdx[75] = 53; + entitiesCharIdx[76] = 54; + entitiesCharIdx[77] = 55; + entitiesCharIdx[78] = 56; + entitiesCharIdx[79] = 57; + entitiesCharIdx[80] = 58; + entitiesCharIdx[81] = 59; + entitiesCharIdx[82] = 60; + entitiesCharIdx[83] = 61; + entitiesCharIdx[84] = 62; + } } diff --git a/src/test/java/ch/digitalfondue/jfiveparse/GenerateEntities.java b/src/test/java/ch/digitalfondue/jfiveparse/GenerateEntities.java index 9836a34..0e35a89 100644 --- a/src/test/java/ch/digitalfondue/jfiveparse/GenerateEntities.java +++ b/src/test/java/ch/digitalfondue/jfiveparse/GenerateEntities.java @@ -17,6 +17,7 @@ import com.google.gson.GsonBuilder; import com.google.gson.reflect.TypeToken; +import org.junit.jupiter.api.Test; import java.io.ByteArrayOutputStream; import java.io.DataOutputStream; @@ -24,10 +25,7 @@ import java.lang.reflect.Type; import java.nio.file.Files; import java.nio.file.Paths; -import java.util.ArrayList; -import java.util.Map; -import java.util.TreeMap; -import java.util.TreeSet; +import java.util.*; import java.util.zip.GZIPOutputStream; /** @@ -40,7 +38,11 @@ private static class EntityValues { int[] codepoints; } - public static void main(String[] args) throws IOException { + + // this generates the supporting array for mapping a character to a unique id + // for the double array trie + @Test + void generateCharactersMappingArray() throws IOException { Type type = (new TypeToken>() { }).getType(); String json = Files.readString(Paths.get("src/test/resources/entities.json")); @@ -51,14 +53,33 @@ public static void main(String[] args) throws IOException { key.chars().forEach(allUsedChars::add); } var allChars = new ArrayList<>(allUsedChars); + var firstChar = allChars.get(0); + var lastChar = allChars.get(allChars.size() - 1); + // mapping function + System.out.println("public int charId(char c) {"); + System.out.printf(" return c >= %d && c <= %d ? entitiesCharIdx[c - %d] : -1;\n", firstChar, lastChar, firstChar); + System.out.println("}"); + // + System.out.println("private static final int[] entitiesCharIdx = new int[" + (lastChar-firstChar+1) + "];"); + System.out.println("static {"); // create support array + System.out.println("Arrays.fill(entitiesCharIdx, -1);"); for (int i = 0; i < allChars.size(); i++) { - System.err.println("entitiesCharIdx[" + (allChars.get(i) -38)+"] = "+ (i+1)); + System.out.println("entitiesCharIdx[" + (allChars.get(i) - firstChar)+"] = "+ (i+1) + ";"); } + System.out.println("}"); // + System.out.println(allUsedChars.first() + " " + allUsedChars.last() + ";"); + } + + public static void main(String[] args) throws IOException { + Type type = (new TypeToken>() { + }).getType(); + String json = Files.readString(Paths.get("src/test/resources/entities.json")); + Map m = new GsonBuilder().create().fromJson(json, type); + - System.err.println(allUsedChars.first() + " " + allUsedChars.last()); // we can iterate for making a and array for mapping char -> id! // and also generate the range! From 7c2d8fd51e2ae7f43830b9c9323ecff0099219fa Mon Sep 17 00:00:00 2001 From: Sylvain Jermini Date: Sat, 20 Jun 2026 10:48:08 +0200 Subject: [PATCH 4/9] more generated code --- .../jfiveparse/DoubleArrayTrie.java | 18 +++++------ .../jfiveparse/GenerateEntities.java | 30 +++++++++++++++---- 2 files changed, 33 insertions(+), 15 deletions(-) diff --git a/src/test/java/ch/digitalfondue/jfiveparse/DoubleArrayTrie.java b/src/test/java/ch/digitalfondue/jfiveparse/DoubleArrayTrie.java index caa56c8..7698e6b 100644 --- a/src/test/java/ch/digitalfondue/jfiveparse/DoubleArrayTrie.java +++ b/src/test/java/ch/digitalfondue/jfiveparse/DoubleArrayTrie.java @@ -8,8 +8,8 @@ // from https://medium.com/@lchang1994/deep-dive-dat-double-array-trie-f51e5e5f006c public class DoubleArrayTrie { - private int[] check; - private int[] base; + int[] check; + int[] base; public DoubleArrayTrie() { check = new int[128]; @@ -37,10 +37,6 @@ private void expand(int index) { base = Arrays.copyOf(base, newSize); } - public int charId(char c) { - return c >= 38 && c <= 122 ? entitiesCharIdx[c - 38] : -1; - } - public void insert(String word) { int node = 1; for (int i = 0; i < word.length(); i++) { @@ -163,8 +159,8 @@ private List getChildrenOfOffset(int nodeIdx, int baseOffset) { if (baseOffset == 0) { return children; } - // 'a'-'z' maps to 1..26 - for (int c = 1; c <= 26; c++) { + // the chars map to 1 up to MAX_CHAR_ID + for (int c = 1; c <= MAX_CHAR_ID; c++) { int idx = baseOffset + c; if (idx < check.length && check[idx] == nodeIdx) { children.add(c); @@ -212,8 +208,12 @@ void check() { } + // generated code - + private static final int MAX_CHAR_ID = 62; + public int charId(char c) { + return c >= 38 && c <= 122 ? entitiesCharIdx[c - 38] : -1; + } private static final int[] entitiesCharIdx = new int[85]; static { Arrays.fill(entitiesCharIdx, -1); diff --git a/src/test/java/ch/digitalfondue/jfiveparse/GenerateEntities.java b/src/test/java/ch/digitalfondue/jfiveparse/GenerateEntities.java index 0e35a89..1348f91 100644 --- a/src/test/java/ch/digitalfondue/jfiveparse/GenerateEntities.java +++ b/src/test/java/ch/digitalfondue/jfiveparse/GenerateEntities.java @@ -17,6 +17,7 @@ import com.google.gson.GsonBuilder; import com.google.gson.reflect.TypeToken; +import org.junit.jupiter.api.Disabled; import org.junit.jupiter.api.Test; import java.io.ByteArrayOutputStream; @@ -41,12 +42,11 @@ private static class EntityValues { // this generates the supporting array for mapping a character to a unique id // for the double array trie + // step 1 @Test + @Disabled void generateCharactersMappingArray() throws IOException { - Type type = (new TypeToken>() { - }).getType(); - String json = Files.readString(Paths.get("src/test/resources/entities.json")); - Map m = new GsonBuilder().create().fromJson(json, type); + Map m = getEntitiesMap(); var allUsedChars = new TreeSet(); for (String key: m.keySet()) { @@ -56,6 +56,7 @@ void generateCharactersMappingArray() throws IOException { var firstChar = allChars.get(0); var lastChar = allChars.get(allChars.size() - 1); // mapping function + System.out.println("private static final int MAX_CHAR_ID = " + allChars.size() + ";"); System.out.println("public int charId(char c) {"); System.out.printf(" return c >= %d && c <= %d ? entitiesCharIdx[c - %d] : -1;\n", firstChar, lastChar, firstChar); System.out.println("}"); @@ -70,14 +71,31 @@ void generateCharactersMappingArray() throws IOException { } System.out.println("}"); // - System.out.println(allUsedChars.first() + " " + allUsedChars.last() + ";"); } - public static void main(String[] args) throws IOException { + private static Map getEntitiesMap() throws IOException { Type type = (new TypeToken>() { }).getType(); String json = Files.readString(Paths.get("src/test/resources/entities.json")); Map m = new GsonBuilder().create().fromJson(json, type); + return m; + } + + // step 2 + @Test + @Disabled + void pregenerateSupportArray() throws IOException { + var entities = getEntitiesMap(); + var dat = new DoubleArrayTrie(); + for(String key : entities.keySet()) { + dat.insert(key); + } + System.out.println(Arrays.toString(dat.base)); + System.out.println(Arrays.toString(dat.check)); + } + + public static void main(String[] args) throws IOException { + Map m = getEntitiesMap(); // we can iterate for making a and array for mapping char -> id! From dd59886381c47cc3b37cd7233c2a936b07971e6c Mon Sep 17 00:00:00 2001 From: Sylvain Jermini Date: Sat, 20 Jun 2026 11:02:50 +0200 Subject: [PATCH 5/9] muh --- .../jfiveparse/DoubleArrayTrie.java | 2 ++ .../jfiveparse/GenerateEntities.java | 23 +++++++++++++++++++ 2 files changed, 25 insertions(+) diff --git a/src/test/java/ch/digitalfondue/jfiveparse/DoubleArrayTrie.java b/src/test/java/ch/digitalfondue/jfiveparse/DoubleArrayTrie.java index 7698e6b..7ba3218 100644 --- a/src/test/java/ch/digitalfondue/jfiveparse/DoubleArrayTrie.java +++ b/src/test/java/ch/digitalfondue/jfiveparse/DoubleArrayTrie.java @@ -1,5 +1,6 @@ package ch.digitalfondue.jfiveparse; +import org.junit.jupiter.api.Disabled; import org.junit.jupiter.api.Test; import java.util.ArrayList; @@ -193,6 +194,7 @@ public int lookup(String word) { @Test + @Disabled void check() { var dat = new DoubleArrayTrie(); diff --git a/src/test/java/ch/digitalfondue/jfiveparse/GenerateEntities.java b/src/test/java/ch/digitalfondue/jfiveparse/GenerateEntities.java index 1348f91..3fed667 100644 --- a/src/test/java/ch/digitalfondue/jfiveparse/GenerateEntities.java +++ b/src/test/java/ch/digitalfondue/jfiveparse/GenerateEntities.java @@ -17,6 +17,7 @@ import com.google.gson.GsonBuilder; import com.google.gson.reflect.TypeToken; +import org.junit.jupiter.api.Assertions; import org.junit.jupiter.api.Disabled; import org.junit.jupiter.api.Test; @@ -92,6 +93,28 @@ void pregenerateSupportArray() throws IOException { } System.out.println(Arrays.toString(dat.base)); System.out.println(Arrays.toString(dat.check)); + + // right size base and check array + for (int i = dat.base.length - 1; i >= 0; i--) { + if (dat.base[i] != 0) { + System.out.println("base length should be " + (i+1)); + break; + } + } + + for (int i = dat.check.length - 1; i >= 0; i--) { + if (dat.check[i] != -1) { + System.out.println("check length should be " + (i+1)); + break; + } + } + + // all entities must be accounted for + for(String key : entities.keySet()) { + Assertions.assertEquals(1, dat.lookup(key)); + } + Assertions.assertEquals(-1, dat.lookup("&lol;")); + Assertions.assertEquals(0, dat.lookup("&am")); } public static void main(String[] args) throws IOException { From 21f681d5a9a588bb027317c630319eaf39416cdd Mon Sep 17 00:00:00 2001 From: Sylvain Jermini Date: Sun, 21 Jun 2026 14:20:58 +0200 Subject: [PATCH 6/9] note --- .../jfiveparse/GenerateEntities.java | 17 ++++++++++++++++- 1 file changed, 16 insertions(+), 1 deletion(-) diff --git a/src/test/java/ch/digitalfondue/jfiveparse/GenerateEntities.java b/src/test/java/ch/digitalfondue/jfiveparse/GenerateEntities.java index 3fed667..23cbef4 100644 --- a/src/test/java/ch/digitalfondue/jfiveparse/GenerateEntities.java +++ b/src/test/java/ch/digitalfondue/jfiveparse/GenerateEntities.java @@ -82,13 +82,28 @@ private static Map getEntitiesMap() throws IOException { return m; } + // long l = (((long)x) << 32) | (y & 0xffffffffL); + // int x = (int)(l >> 32); + // int y = (int)l; + + private static long pack1Or2IntToLong(int[] codepoint) { + if (codepoint.length == 1) { + return ((long) codepoint[0] << 32); + } else if (codepoint.length == 2) { + return ((long) codepoint[0] << 32) | (codepoint[1] & 0xFFFFFFFFL); + } + throw new IllegalStateException("1 or 2 codepoint"); + } + // step 2 @Test @Disabled void pregenerateSupportArray() throws IOException { var entities = getEntitiesMap(); var dat = new DoubleArrayTrie(); - for(String key : entities.keySet()) { + for (String key : entities.keySet()) { + var codepoints = entities.get(key).codepoints; + pack1Or2IntToLong(codepoints); dat.insert(key); } System.out.println(Arrays.toString(dat.base)); From 1d040e8cf99ecd5f3ce7cfd00890b51717c5f386 Mon Sep 17 00:00:00 2001 From: Sylvain Jermini Date: Sun, 21 Jun 2026 21:26:54 +0200 Subject: [PATCH 7/9] add codepoints in trie --- .../jfiveparse/DoubleArrayTrie.java | 117 +++++++++++++----- .../jfiveparse/GenerateEntities.java | 10 +- 2 files changed, 96 insertions(+), 31 deletions(-) diff --git a/src/test/java/ch/digitalfondue/jfiveparse/DoubleArrayTrie.java b/src/test/java/ch/digitalfondue/jfiveparse/DoubleArrayTrie.java index 7ba3218..77f7c65 100644 --- a/src/test/java/ch/digitalfondue/jfiveparse/DoubleArrayTrie.java +++ b/src/test/java/ch/digitalfondue/jfiveparse/DoubleArrayTrie.java @@ -6,23 +6,45 @@ import java.util.ArrayList; import java.util.Arrays; import java.util.List; +import java.util.function.IntFunction; +import java.util.function.IntUnaryOperator; // from https://medium.com/@lchang1994/deep-dive-dat-double-array-trie-f51e5e5f006c public class DoubleArrayTrie { int[] check; - int[] base; + long[] base; + long[] values; + int valuesCount; public DoubleArrayTrie() { check = new int[128]; - base = new int[128]; + base = new long[128]; + values = new long[128]; Arrays.fill(check, -1); // -1 means empty - Arrays.fill(base, 0); + Arrays.fill(base, 0L); - base[1] = 1; // Root is 1 + setBaseAt(1, 1); // Root is 1 + // base[1] = 1; check[0] = -1; // Root has no parent } + private void setBaseAt(int idx, int offset) { + setBaseAt(idx, offset, -1); + } + + private void setBaseAt(int idx, int offset, int idxValue) { + base[idx] = pack1Or2IntToLong(offset, idxValue); + } + + private int getBaseAt(int idx) { + return extractHigh(base[idx]); + } + + private int getValueIdxAt(int idx) { + return extractLow(base[idx]); + } + private void expand(int index) { if (index < base.length) { return; @@ -38,7 +60,7 @@ private void expand(int index) { base = Arrays.copyOf(base, newSize); } - public void insert(String word) { + public void insert(String word, long value) { int node = 1; for (int i = 0; i < word.length(); i++) { int code = charId(word.charAt(i)); @@ -47,12 +69,13 @@ public void insert(String word) { } // Ensure base has a valid offset (0 means no children yet) - if (base[node] == 0) { - base[node] = 1; // Default offset 1 + if (getBaseAt(node) == 0) { + // base[node] = 1; // Default offset 1 + setBaseAt(node, 1); } // Use Math.abs() because base might be negative if it's a terminal node - int offset = Math.abs(base[node]); + int offset = Math.abs(getBaseAt(node)); int nextNode = offset + code; expand(nextNode); @@ -63,7 +86,7 @@ public void insert(String word) { // Collision resolveCollision(node, code); // Re-calculate after resolution - offset = Math.abs(base[node]); + offset = Math.abs(getBaseAt(node)); nextNode = offset + code; expand(nextNode); check[nextNode] = node; @@ -71,23 +94,29 @@ public void insert(String word) { node = nextNode; } - - setTerminal(node); + setTerminal(node, value); } - private void setTerminal(int node) { - if (base[node] == 0) { - base[node] = -1; // Terminal, offset 1 - } else if (base[node] > 0) { - base[node] = -base[node]; + private void setTerminal(int node, long value) { + if (getBaseAt(node) == 0) { + // base[node] = -1; // Terminal, offset 1 + setBaseAt(node, -1, valuesCount); + } else if (getBaseAt(node) > 0) { + // base[node] = -base[node]; + setBaseAt(node, getBaseAt(node), valuesCount); + } + if (valuesCount >= values.length) { + values = Arrays.copyOf(values, values.length * 2); } + values[valuesCount] = value; + valuesCount++; } private int isTerminal(int node) { if (node >= base.length) { return 0; } - return base[node] < 0 ? 1 : 0; + return getBaseAt(node) < 0 ? 1 : 0; } private void resolveCollision(int node, int newCharCode) { @@ -95,13 +124,15 @@ private void resolveCollision(int node, int newCharCode) { children.add(newCharCode); int newBaseOffset = findValidBase(children); - int oldBaseOffset = Math.abs(base[node]); + int oldBaseOffset = Math.abs(getBaseAt(node)); // Apply new base, maintaining terminal status - if (base[node] < 0) { - base[node] = -newBaseOffset; + if (getBaseAt(node) < 0) { + setBaseAt(node, -newBaseOffset); + //base[node] = -newBaseOffset; } else { - base[node] = newBaseOffset; + setBaseAt(node, newBaseOffset); + // base[node] = newBaseOffset; } for (int c : children) { @@ -114,12 +145,13 @@ private void resolveCollision(int node, int newCharCode) { expand(newIdx); // Move node info: base, check + // CHECK, this should be the only place where we relocate the whole info (high/low) base[newIdx] = base[oldIdx]; check[newIdx] = node; // Parent is still 'node' // If the moved child has children, update their parent pointers (check) - if (base[oldIdx] != 0) { - int childBaseOffset = Math.abs(base[oldIdx]); + if (getBaseAt(oldIdx) != 0) { + int childBaseOffset = Math.abs(getBaseAt(oldIdx)); List grandchildren = getChildrenOfOffset(oldIdx, childBaseOffset); for (int gcCode : grandchildren) { int gcIdx = childBaseOffset + gcCode; @@ -129,7 +161,8 @@ private void resolveCollision(int node, int newCharCode) { // Clear old spot check[oldIdx] = -1; - base[oldIdx] = 0; + // base[oldIdx] = 0; + setBaseAt(oldIdx, 0); } } @@ -152,7 +185,7 @@ private int findValidBase(List children) { } private List getChildren(int node) { - return getChildrenOfOffset(node, Math.abs(base[node])); + return getChildrenOfOffset(node, Math.abs(getBaseAt(node))); } private List getChildrenOfOffset(int nodeIdx, int baseOffset) { @@ -170,15 +203,27 @@ private List getChildrenOfOffset(int nodeIdx, int baseOffset) { return children; } - // -1 don't exists, 0 is a sub element, 1 is terminal + // return: + // -1 don't exist + // 0 is a sub element (e.g "ab" is a subelement of "abc") + // 1 is terminal public int lookup(String word) { + return lookup(word, this::isTerminal); + } + + // can only be called when lookupp as returned a terminal node + public long lookupValue(String word) { + return values[lookup(word, this::getValueIdxAt)]; + } + + private int lookup(String word, IntUnaryOperator op) { int node = 1; for (int i = 0; i < word.length(); i++) { int code = charId(word.charAt(i)); if (code == -1) { return -1; } - int offset = Math.abs(base[node]); + int offset = Math.abs(getBaseAt(node)); if (offset == 0) { return -1; } @@ -189,7 +234,21 @@ public int lookup(String word) { } node = nextNode; } - return isTerminal(node); + return op.applyAsInt(node); + } + + private static long pack1Or2IntToLong(int high, int low) { + return ((long) high << 32) | (low & 0xFFFFFFFFL); + } + + // extract from a long v the high value part + static int extractHigh(long v) { + return (int) (v >> 32); + } + + // extract from a long v the low value part + static int extractLow(long v) { + return (int) v; } @@ -200,7 +259,7 @@ void check() { var words = List.of("dog", "apple", "app", "banana", "band", "b"); for (var v : words) { - dat.insert(v); + dat.insert(v, 0); } var testWords = List.of("dog", "apple", "app", "banana", "band", "b", "appl", "ban", "c", "", "&", "."); for (var w : testWords) { diff --git a/src/test/java/ch/digitalfondue/jfiveparse/GenerateEntities.java b/src/test/java/ch/digitalfondue/jfiveparse/GenerateEntities.java index 23cbef4..3cd7661 100644 --- a/src/test/java/ch/digitalfondue/jfiveparse/GenerateEntities.java +++ b/src/test/java/ch/digitalfondue/jfiveparse/GenerateEntities.java @@ -95,6 +95,7 @@ private static long pack1Or2IntToLong(int[] codepoint) { throw new IllegalStateException("1 or 2 codepoint"); } + // step 2 @Test @Disabled @@ -103,8 +104,7 @@ void pregenerateSupportArray() throws IOException { var dat = new DoubleArrayTrie(); for (String key : entities.keySet()) { var codepoints = entities.get(key).codepoints; - pack1Or2IntToLong(codepoints); - dat.insert(key); + dat.insert(key, pack1Or2IntToLong(codepoints)); } System.out.println(Arrays.toString(dat.base)); System.out.println(Arrays.toString(dat.check)); @@ -127,6 +127,12 @@ void pregenerateSupportArray() throws IOException { // all entities must be accounted for for(String key : entities.keySet()) { Assertions.assertEquals(1, dat.lookup(key)); + var k = dat.lookupValue(key); + var h = DoubleArrayTrie.extractHigh(k); + var l = DoubleArrayTrie.extractLow(k); + int[] codePoints = l == 0 ? new int[] {h} : new int[] {h, l}; + Assertions.assertArrayEquals(entities.get(key).codepoints, codePoints); + } Assertions.assertEquals(-1, dat.lookup("&lol;")); Assertions.assertEquals(0, dat.lookup("&am")); From 0f2c89f3db8acd7a5c3fc37263a6a0804d35e230 Mon Sep 17 00:00:00 2001 From: Sylvain Jermini Date: Tue, 23 Jun 2026 00:25:59 +0200 Subject: [PATCH 8/9] fix+advance --- .../jfiveparse/DoubleArrayTrie.java | 7 ++++--- .../jfiveparse/GenerateEntities.java | 19 ++++++++++++++++--- 2 files changed, 20 insertions(+), 6 deletions(-) diff --git a/src/test/java/ch/digitalfondue/jfiveparse/DoubleArrayTrie.java b/src/test/java/ch/digitalfondue/jfiveparse/DoubleArrayTrie.java index 77f7c65..b26362c 100644 --- a/src/test/java/ch/digitalfondue/jfiveparse/DoubleArrayTrie.java +++ b/src/test/java/ch/digitalfondue/jfiveparse/DoubleArrayTrie.java @@ -6,10 +6,10 @@ import java.util.ArrayList; import java.util.Arrays; import java.util.List; -import java.util.function.IntFunction; import java.util.function.IntUnaryOperator; // from https://medium.com/@lchang1994/deep-dive-dat-double-array-trie-f51e5e5f006c +// FIXME deduplicate the inserted value, currently at each insert, we reserve a new spot for the value, do a linear probe. public class DoubleArrayTrie { int[] check; long[] base; @@ -125,13 +125,14 @@ private void resolveCollision(int node, int newCharCode) { int newBaseOffset = findValidBase(children); int oldBaseOffset = Math.abs(getBaseAt(node)); + int baseValue = getValueIdxAt(node); // Apply new base, maintaining terminal status if (getBaseAt(node) < 0) { - setBaseAt(node, -newBaseOffset); + setBaseAt(node, -newBaseOffset, baseValue); //base[node] = -newBaseOffset; } else { - setBaseAt(node, newBaseOffset); + setBaseAt(node, newBaseOffset, baseValue); // base[node] = newBaseOffset; } diff --git a/src/test/java/ch/digitalfondue/jfiveparse/GenerateEntities.java b/src/test/java/ch/digitalfondue/jfiveparse/GenerateEntities.java index 3cd7661..9c53b79 100644 --- a/src/test/java/ch/digitalfondue/jfiveparse/GenerateEntities.java +++ b/src/test/java/ch/digitalfondue/jfiveparse/GenerateEntities.java @@ -95,6 +95,16 @@ private static long pack1Or2IntToLong(int[] codepoint) { throw new IllegalStateException("1 or 2 codepoint"); } + @Test + void checkValueSub() { + var dat = new DoubleArrayTrie(); + dat.insert("Æ", 40); + dat.insert("Æ", 41); + dat.insert("&", 42); + dat.insert("&", 43); + dat.lookupValue("&"); + } + // step 2 @Test @@ -106,8 +116,8 @@ void pregenerateSupportArray() throws IOException { var codepoints = entities.get(key).codepoints; dat.insert(key, pack1Or2IntToLong(codepoints)); } - System.out.println(Arrays.toString(dat.base)); - System.out.println(Arrays.toString(dat.check)); + //System.out.println(Arrays.toString(dat.base)); + //System.out.println(Arrays.toString(dat.check)); // right size base and check array for (int i = dat.base.length - 1; i >= 0; i--) { @@ -124,15 +134,18 @@ void pregenerateSupportArray() throws IOException { } } + //dat.lookup("&"); + //dat.lookupValue("&"); + // all entities must be accounted for for(String key : entities.keySet()) { Assertions.assertEquals(1, dat.lookup(key)); + System.err.println("key " + key); var k = dat.lookupValue(key); var h = DoubleArrayTrie.extractHigh(k); var l = DoubleArrayTrie.extractLow(k); int[] codePoints = l == 0 ? new int[] {h} : new int[] {h, l}; Assertions.assertArrayEquals(entities.get(key).codepoints, codePoints); - } Assertions.assertEquals(-1, dat.lookup("&lol;")); Assertions.assertEquals(0, dat.lookup("&am")); From f82cda1d2e9150a65cfbacf354b70773e8833b71 Mon Sep 17 00:00:00 2001 From: Sylvain Jermini Date: Tue, 23 Jun 2026 22:27:20 +0200 Subject: [PATCH 9/9] tmp, need to find a way to compact, currently the generated is 50k! vs 12k --- .../jfiveparse/DoubleArrayTrie.java | 20 ++++++++--- .../jfiveparse/GenerateEntities.java | 35 +++++++++++++++++++ 2 files changed, 50 insertions(+), 5 deletions(-) diff --git a/src/test/java/ch/digitalfondue/jfiveparse/DoubleArrayTrie.java b/src/test/java/ch/digitalfondue/jfiveparse/DoubleArrayTrie.java index b26362c..21d3837 100644 --- a/src/test/java/ch/digitalfondue/jfiveparse/DoubleArrayTrie.java +++ b/src/test/java/ch/digitalfondue/jfiveparse/DoubleArrayTrie.java @@ -98,18 +98,28 @@ public void insert(String word, long value) { } private void setTerminal(int node, long value) { + int idxValue = valuesCount; + for (int i = 0; i < valuesCount; i++) { + if (values[i] == value) { + idxValue = i; + break; + } + } + if (getBaseAt(node) == 0) { // base[node] = -1; // Terminal, offset 1 - setBaseAt(node, -1, valuesCount); + setBaseAt(node, -1, idxValue); } else if (getBaseAt(node) > 0) { // base[node] = -base[node]; - setBaseAt(node, getBaseAt(node), valuesCount); + setBaseAt(node, getBaseAt(node), idxValue); } - if (valuesCount >= values.length) { + if (idxValue >= values.length) { values = Arrays.copyOf(values, values.length * 2); } - values[valuesCount] = value; - valuesCount++; + values[idxValue] = value; + if (idxValue == valuesCount) { + valuesCount++; + } } private int isTerminal(int node) { diff --git a/src/test/java/ch/digitalfondue/jfiveparse/GenerateEntities.java b/src/test/java/ch/digitalfondue/jfiveparse/GenerateEntities.java index 9c53b79..ee1101f 100644 --- a/src/test/java/ch/digitalfondue/jfiveparse/GenerateEntities.java +++ b/src/test/java/ch/digitalfondue/jfiveparse/GenerateEntities.java @@ -120,8 +120,10 @@ void pregenerateSupportArray() throws IOException { //System.out.println(Arrays.toString(dat.check)); // right size base and check array + int base_check = 0; for (int i = dat.base.length - 1; i >= 0; i--) { if (dat.base[i] != 0) { + base_check = i+1; System.out.println("base length should be " + (i+1)); break; } @@ -134,6 +136,39 @@ void pregenerateSupportArray() throws IOException { } } + ByteArrayOutputStream baosOneCodePoint = new ByteArrayOutputStream(); + GZIPOutputStream osOneCodePoint = new GZIPOutputStream(baosOneCodePoint); + DataOutputStream daos = new DataOutputStream(osOneCodePoint); + daos.writeInt(base_check); + for(int i = 0; i < base_check; i++) { + daos.writeLong(dat.base[i]); + } + for(int i = 0; i < base_check; i++) { + daos.writeInt(dat.check[i]); + } + daos.writeInt(dat.valuesCount); + for(int i = 0; i < dat.valuesCount; i++) { + daos.writeLong(dat.values[i]); + } + daos.flush(); + daos.close(); + Files.write(Paths.get("double-trie-array"), baosOneCodePoint.toByteArray()); + + + + /*System.out.println("base"); + System.out.println(Arrays.stream(Arrays.copyOfRange(dat.base, 0, base_check)) + .mapToObj(v -> ""+v + "L") + .toList()); + System.out.println("check"); + System.out.println(Arrays.toString(Arrays.copyOfRange(dat.check, 0, base_check))); + System.out.println("values"); + System.out.println(Arrays.stream(Arrays.copyOfRange(dat.values, 0, dat.valuesCount)) + .mapToObj(v -> ""+v + "L") + .toList()); + + System.out.println("values length is " + dat.valuesCount); + */ //dat.lookup("&"); //dat.lookupValue("&");