GrammarSoft · TinoDidriksen · Sep 8, 2025 · Jun 23, 2025 · Jun 24, 2025 · Jun 24, 2025
diff --git a/.gitignore b/.gitignore
@@ -32,6 +32,8 @@ Makefile
 /test/**/output*.txt
 /test/**/untraced*.txt
 /test/**/std*.txt
+/test/**/*.bsf*.txt
+/test/**/*.bsf.cg3
 /test/**/*.out.cg3
 /test/**/*.cg3b
 /test/**/*.bin

diff --git a/CMakeLists.txt b/CMakeLists.txt
@@ -149,7 +149,7 @@ if(EMSCRIPTEN)
 	endif()
 endif()
 
-add_definitions(-DUNISTR_FROM_CHAR_EXPLICIT=explicit -DUNISTR_FROM_STRING_EXPLICIT=explicit)
+add_definitions(-DUNISTR_FROM_CHAR_EXPLICIT=explicit -DUNISTR_FROM_STRING_EXPLICIT=explicit -DU_CHARSET_IS_UTF8=1)
 include_directories("include")
 include_directories("src")
 
@@ -165,6 +165,7 @@ configure_file(scripts/cg3-autobin.pl.in scripts/cg3-autobin.pl @ONLY)
 install(PROGRAMS
 	"${CMAKE_CURRENT_BINARY_DIR}/scripts/cg3-autobin.pl"
 	"${CMAKE_CURRENT_SOURCE_DIR}/scripts/cg-sort"
+	"${CMAKE_CURRENT_SOURCE_DIR}/scripts/cg-stabilize-relations"
 	"${CMAKE_CURRENT_SOURCE_DIR}/scripts/cg-strictify"
 	"${CMAKE_CURRENT_SOURCE_DIR}/scripts/cg-untrace"
 	DESTINATION ${CMAKE_INSTALL_BINDIR})

diff --git a/include/getopt/getopt.cpp b/include/getopt/getopt.cpp
@@ -37,14 +37,14 @@ int getopt(int argc, char **argv, const char *opts) {
 		   argv[optind][0] != '-' || argv[optind][1] == '\0')
 			return(EOF);
 		else if (strcmp(argv[optind], "--") == 0) {
-			optind++;
+			++optind;
 			return(EOF);
 		}
 	optopt = c = argv[optind][sp];
 	if (c == ':' || (cp=strchr(opts, c)) == 0) {
 		ERR(": illegal option -- ", (char)c);
 		if (argv[optind][++sp] == '\0') {
-			optind++;
+			++optind;
 			sp = 1;
 		}
 		return('?');
@@ -65,7 +65,7 @@ int getopt(int argc, char **argv, const char *opts) {
 	else {
 		if (argv[optind][++sp] == '\0') {
 			sp = 1;
-			optind++;
+			++optind;
 		}
 		optarg = nullptr;
 	}

diff --git a/manual/streamformats.xml b/manual/streamformats.xml
@@ -183,4 +183,65 @@
     </para>
   </section>
 
+  <section id="stream-binary">
+    <title>Binary Format</title>
+    <indexterm>
+      <primary>Binary Stream Format</primary>
+    </indexterm>
+    <para>
+      The binary format can be generated by <link linkend="cg-conv">cg-conv</link> and can be parsed either by cg-conv or by the Python bindings.
+      It is designed for faster parsing than the textual formats.
+      The intended use case is processing the same input multiple times (such as when testing several grammars).
+    </para>
+    <para>
+      The stream begins with a header containing <code>CGBF</code> followed by a 4-byte version number (currently <code>1</code>).
+      After that, each packet begins with 1 byte indicating its contents.
+      <code>1</code> is a window, <code>2</code> is a command, and <code>3</code> is text.
+    </para>
+    <para>
+      Command packets have a second byte identifying the command: <code>1</code> for <code>FLUSH</code>, <code>2</code> for <code>EXIT</code>, <code>3</code> for <code>IGNORE</code>, and <code>4</code> for <code>RESUME</code>.
+      Commands which manipulate variables are represented in window packets.
+    </para>
+    <para>
+      Text packets consist of a 2-byte length followed by the contents in UTF-8.
+    </para>
+    <para>
+      Each window packet begins with 4 bytes specifying the length of the block and then the following structure:
+      <screen>
+        window flags [2]
+          &gt; 1 = has multi-window dependencies
+        tags [array of str]
+        variables [array]
+          mode
+            &gt; 1 = SETVAR (var = val)
+            &gt; 2 = SETVAR (var = *)
+            &gt; 3 = REMVAR
+          var [tag]
+          val or 0 [tag]
+        text [str]
+        text_post [str]
+        cohorts [array]
+          flags [2]
+            &gt; 1 = is target of a relation
+          wordform [tag]
+          static_tags [array of tag]
+          dep_self [4]
+          dep_parent or 0xFFFFFFFF [4]
+          relations [array]
+            tag [tag]
+            head [4]
+          text [str]
+          wblank [str]
+          readings [array]
+            flags [2]
+              &gt; 1 = is subreading of predecessor
+              &gt; 2 = deleted
+            baseform [tag]
+            tags [array of tag]
+      </screen>
+      Where arrays and strings are both encoded with a 2-byte length followed by the specified number of objects or UTF-8 bytes.
+      Each item of type <code>[tag]</code> is a 2-byte index into the window-wide <code>tags</code> array.
+    </para>
+  </section>
+
 </chapter>
diff --git a/python/CMakeLists.txt b/python/CMakeLists.txt
@@ -4,7 +4,7 @@ set(PYTHON_EXECUTABLE ${Python_EXECUTABLE})
 
 set(PYTHON_FILE "constraint_grammar.py")
 set(CPP_WRAP_FILE "constraint_grammar_wrap.cpp")
-file(WRITE "${CMAKE_CURRENT_BINARY_DIR}/cg3.py" "from constraint_grammar import *\n")
+set(PYTHON_LIBRARY_FILE "cg3.py")
 
 set(BUILD_DEFS "")
 get_directory_property(_defs DIRECTORY ${CMAKE_CURRENT_SOURCE_DIR} COMPILE_DEFINITIONS)
@@ -21,7 +21,7 @@ add_custom_command(OUTPUT ${CPP_WRAP_FILE} ${PYTHON_FILE}
 )
 
 add_custom_target(wrapper ALL
-	DEPENDS ${CPP_WRAP_FILE} ${PYTHON_FILE}
+	DEPENDS ${CPP_WRAP_FILE} ${PYTHON_FILE} ${PYTHON_LIBRARY_FILE}
 	VERBATIM
 )
 

diff --git a/python/cg3.py b/python/cg3.py
@@ -0,0 +1,177 @@
+from constraint_grammar import *
+
+from collections import defaultdict
+from dataclasses import dataclass, field
+import struct
+from typing import DefaultDict, Dict, List, Optional
+
+@dataclass
+class Reading:  # One reading of a cohort as decoded from the binary stream format
+	lemma: str = ''  # baseform tag; for Cohort.static this holds the wordform tag
+	tags: List[str] = field(default_factory=list)  # tag strings, in stream order
+	subreading: Optional['Reading'] = None  # following reading that was flagged as a subreading of this one
+	deleted: bool = False  # True when reading flag bit 2 (deleted) was set
+
+@dataclass
+class Cohort:  # One cohort of a window: static reading plus its list of readings
+	static: Reading = field(default_factory=Reading)  # wordform in .lemma, static tags in .tags
+	readings: List[Reading] = field(default_factory=list)  # top-level readings; subreadings hang off these
+	dep_self: int = 0  # 4-byte dep_self value from the stream
+	dep_parent: Optional[int] = None  # 4-byte dep_parent value; None when the stream stores 0xFFFFFFFF
+	relations: DefaultDict[str, List[int]] = field(  # relation tag -> list of head numbers
+		default_factory=lambda: defaultdict(list))
+	text: str = ''  # cohort-level text string from the stream
+	wblank: str = ''  # wblank string from the stream
+
+@dataclass
+class Window:  # One window packet of the binary stream format
+	cohorts: List[Cohort] = field(default_factory=list)  # cohorts in stream order
+	set_vars: Dict[str, Optional[str]] = field(default_factory=dict)  # SETVAR entries; value None means "var = *"
+	rem_vars: List[str] = field(default_factory=list)  # variables named by REMVAR entries
+	text: str = ''  # window-level text string from the stream
+	text_post: str = ''  # window-level text_post string from the stream
+	dep_has_spanned: bool = False  # True when window flag bit 1 (multi-window dependencies) is set
+
+@dataclass
+class Packet:  # One stream item as constructed by parse_binary_stream()
+	type: str = ''  # 'window', 'command', or 'text'
+	window: Optional[Window] = None  # set for 'window' packets
+	command: str = ''  # 'FLUSH', 'EXIT', 'IGNORE', or 'RESUME' for 'command' packets
+	text: str = ''  # decoded UTF-8 contents for 'text' packets
+
+def parse_binary_window(buf):
+	'''Decode one window from the bytestring `buf` and return it as a
+	Window() object. `buf` must hold exactly the window block, without
+	the 4-byte length header that precedes it in the stream. Most callers
+	will want parse_binary_stream(), which handles that framing.'''
+
+	offset = 0
+	def take(fmt):
+		# Unpack little-endian `fmt` at the cursor and advance past it
+		nonlocal offset
+		values = struct.unpack_from('<' + fmt, buf, offset)
+		offset += struct.calcsize('<' + fmt)
+		return values
+	def u16():
+		return take('H')[0]
+	def u32():
+		return take('I')[0]
+	def string():
+		# Strings are a 2-byte length followed by that many UTF-8 bytes
+		size = u16()
+		return take(f'{size}s')[0].decode('utf-8') if size else ''
+	def tag():
+		# A tag is a 2-byte index into the window-wide tag table
+		return table[u16()]
+	def tag_list():
+		# Tag arrays are a 2-byte count followed by that many tag indexes
+		count = u16()
+		return [table[n] for n in take(f'{count}H')] if count else []
+
+	win = Window()
+	if u16() & 1:
+		# Window flag bit 1: has multi-window dependencies
+		win.dep_has_spanned = True
+	table = [string() for _ in range(u16())]
+
+	# Variable entries: SETVAR with value, SETVAR without value, REMVAR
+	for _ in range(u16()):
+		mode, var, val = take('BHH')
+		if mode == 1:
+			win.set_vars[table[var]] = table[val]
+		elif mode == 2:
+			win.set_vars[table[var]] = None
+		elif mode == 3:
+			win.rem_vars.append(table[var])
+	win.text = string()
+	win.text_post = string()
+
+	for _ in range(u16()):
+		unit = Cohort()
+		u16()  # cohort flags are read to advance the cursor but not stored
+		unit.static.lemma = tag()
+		unit.static.tags = tag_list()
+		unit.dep_self = u32()
+		parent = u32()
+		# 0xffffffff is the stream's marker for "no dependency parent"
+		unit.dep_parent = None if parent == 0xffffffff else parent
+		for _ in range(u16()):
+			label = tag()
+			head = u32()
+			unit.relations[label].append(head)
+		unit.text = string()
+		unit.wblank = string()
+
+		last = None
+		for _ in range(u16()):
+			flags = u16()
+			item = Reading()
+			item.lemma = tag()
+			item.tags = tag_list()
+			# Bit 1 attaches this reading to its predecessor as a subreading;
+			# without a predecessor it is kept as a top-level reading
+			if last is None or not flags & 1:
+				unit.readings.append(item)
+			else:
+				last.subreading = item
+			if flags & 2:
+				item.deleted = True
+			last = item
+		win.cohorts.append(unit)
+	return win
+
+def parse_binary_stream(fin, windows_only=False):
+	'''Given a binary file object `fin`, yield a series of Packet() objects.
+	Raises ValueError if the stream header is missing, truncated, or invalid.
+	If `windows_only` is True, packets containing commands or text
+	are skipped and Window() objects are yielded instead.
+	A packet that is cut short by end of input silently ends the stream.'''
+
+	header = fin.read(8)
+	if len(header) != 8:
+		# struct.unpack() would raise struct.error, not the documented ValueError
+		raise ValueError('Binary format header not found!')
+	label, version = struct.unpack('<4sI', header)
+	if label != b'CGBF':
+		raise ValueError('Binary format header not found!')
+	if version != 1:
+		raise ValueError('Unknown binary format version!')
+	commands = {1: 'FLUSH', 2: 'EXIT', 3: 'IGNORE', 4: 'RESUME'}
+	while True:
+		ptype = fin.read(1)
+		if len(ptype) != 1:
+			break
+		if ptype[0] == 1:
+			# Window packet: 4-byte little-endian length, then the block
+			spec = fin.read(4)
+			if len(spec) != 4:
+				break
+			block_len = struct.unpack('<I', spec)[0]
+			block = fin.read(block_len)
+			if len(block) != block_len:
+				break
+			window = parse_binary_window(block)
+			if windows_only:
+				yield window
+			else:
+				yield Packet(type='window', window=window)
+		elif ptype[0] == 2:
+			# Command packet: a single byte identifying the command
+			cmd = fin.read(1)
+			if len(cmd) != 1:
+				break
+			# Unknown command ids are skipped rather than treated as errors
+			if not windows_only and cmd[0] in commands:
+				yield Packet(type='command', command=commands[cmd[0]])
+		elif ptype[0] == 3:
+			# Text packet: 2-byte length ('<H', not '<I'), then UTF-8 bytes
+			lbuf = fin.read(2)
+			if len(lbuf) != 2:
+				break
+			ln = struct.unpack('<H', lbuf)[0]
+			raw = fin.read(ln)
+			if len(raw) != ln:
+				break
+			if not windows_only:
+				yield Packet(type='text', text=raw.decode('utf-8'))
+		# Any other packet type byte is unknown and is skipped
diff --git a/scripts/cg-sort b/scripts/cg-sort
@@ -14,7 +14,7 @@ use Getopt::Long;
 Getopt::Long::Configure('bundling');
 Getopt::Long::Configure('no_ignore_case');
 my %opts = ();
-GetOptions(\%opts, ('weight|w:s', 'reverse|r', 'first|1', 'help|?'));
+GetOptions(\%opts, ('weight|w:s', 'mapping|m:s', 'reverse|r', 'first|1', 'help|?'));
 
 sub print_help {
    print <<'XOUT';
@@ -25,6 +25,7 @@ Pipe a CG stream through this to sort and unique the readings of each cohort.
 Options:
  -?, --help       outputs this help
  -w, --weight     sorts by a numeric tag; defaults to W
+ -m, --mapping    sorts mapping tags with given prefix; defaults to @
  -r, --reverse    reverses the sort order
  -1, --first      only keep the first reading
 
@@ -41,6 +42,11 @@ if (exists($opts{weight}) && length($opts{weight})) {
    $W = $opts{weight};
 }
 
+my $M = '@';
+if (exists($opts{mapping}) && length($opts{mapping})) {
+   $M = $opts{mapping};
+}
+
 my $in_cohort = 0;
 my %readings = ();
 my %deleted = ();
@@ -66,6 +72,14 @@ sub print_sorted_readings {
    if (!@_) {
       return;
    }
+   if (exists($opts{mapping})) {
+      foreach (@_) {
+         my @tags = ($_ =~ m@ ($M\S+)@g);
+         @tags = sort @tags;
+         my $t = join(' ', @tags);
+         $_ =~ s@( $M\S+)+@ $t@;
+      }
+   }
    if (exists($opts{weight})) {
       @_ = sort sort_weight @_;
    }

diff --git a/scripts/cg-stabilize-relations b/scripts/cg-stabilize-relations
@@ -0,0 +1,21 @@
+#!/usr/bin/env python3
+# Renumbers ID:n and R:name:n tags in order of first appearance (1, 2, ...).
+import argparse
+import re
+import sys
+
+# The help text must be passed as description=; the first positional parameter is prog
+parser = argparse.ArgumentParser(description='Pipe a CG stream through this to stabilize IDs and relations so they have consistent numbers')
+args = parser.parse_args()
+
+id_map = {}  # original number (as string) -> stable replacement number
+
+tag = re.compile(r'\b(ID:|R:[^:\s]+:)(\d+)\b')
+def repl(matchobj):
+    n = matchobj.group(2)
+    if n not in id_map:
+        id_map[n] = str(len(id_map) + 1)
+    return matchobj.group(1) + id_map[n]
+
+for line in sys.stdin:
+    sys.stdout.write(tag.sub(repl, line))
diff --git a/src/ApertiumApplicator.cpp b/src/ApertiumApplicator.cpp
@@ -382,7 +382,7 @@ void ApertiumApplicator::runGrammarOnText(std::istream& input, std::ostream& out
 
 			lCohort = cCohort = alloc_cohort(cSWindow);
 			cCohort->global_number = gWindow->cohort_counter++;
-			numCohorts++;
+			++numCohorts;
 
 			cCohort->text = blank;
 			blank.clear();
@@ -782,8 +782,8 @@ void ApertiumApplicator::printReading(const Reading* reading, std::ostream& outp
 			if (reading->parent->dep_parent == 0) {
 				pr = reading->parent->parent->cohorts[0];
 			}
-			else if (reading->parent->parent->parent->cohort_map.find(reading->parent->dep_parent) != reading->parent->parent->parent->cohort_map.end()) {
-				pr = reading->parent->parent->parent->cohort_map[reading->parent->dep_parent];
+			else if (gWindow->cohort_map.find(reading->parent->dep_parent) != gWindow->cohort_map.end()) {
+				pr = gWindow->cohort_map[reading->parent->dep_parent];
 			}
 		}