Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
Show all changes
42 commits
Select commit Hold shift + click to select a range
a2e0192
start on binary format
mr-martian Jun 23, 2025
db2ac15
assorted fixes
mr-martian Jun 24, 2025
ff5b438
stream header and detection
mr-martian Jun 24, 2025
56d68e6
variables; account for ID updates
mr-martian Jun 27, 2025
60a5e3d
start on relations
mr-martian Jun 27, 2025
a5fd974
write global_number rather than dep_self
mr-martian Jun 27, 2025
9f114b4
relations
mr-martian Jun 27, 2025
9fae2fc
minor optimizations - don't rehash if we don't need to
mr-martian Jul 2, 2025
ef63dce
python binary parser
mr-martian Jul 2, 2025
f890e75
fix indentation
mr-martian Jul 2, 2025
ad0279f
stop using CI as compiler
mr-martian Jul 2, 2025
9ea2ea7
add docs
mr-martian Jul 2, 2025
03fe90d
typo in python; --dep-delimit for conv
mr-martian Jul 17, 2025
1670d65
add baseform properly
mr-martian Jul 26, 2025
c149504
also for static tags
mr-martian Jul 26, 2025
f17ac62
Add format conversion in main; Add tests for binary format, currently…
TinoDidriksen Aug 20, 2025
373bbcd
handle empty cohorts (41/69)
mr-martian Aug 20, 2025
151c291
add endtag to last cohort (42/69)
mr-martian Aug 20, 2025
5bd496c
args to cg-conv (45/69)
mr-martian Aug 20, 2025
329b515
unique_tags (46/69)
mr-martian Aug 20, 2025
628168b
delimiters in tests (49/69)
mr-martian Aug 21, 2025
daf2d86
print end tag
mr-martian Aug 21, 2025
e192cbf
dep_has_spanned (Omniscan)
mr-martian Aug 21, 2025
c6e610f
Include Static (57/69)
TinoDidriksen Aug 22, 2025
c6155cc
fix flag offsets
mr-martian Aug 22, 2025
ad8e641
split mappings
mr-martian Aug 22, 2025
32d3b32
ensure tags are mapping tags
mr-martian Aug 22, 2025
e70fd68
stabilize-relations (#142)
mr-martian Aug 22, 2025
389cdd4
some tests have FLUSH in them
mr-martian Aug 22, 2025
22a6be8
parent.local_number == 0 -> parent = 0
mr-martian Aug 22, 2025
21f8c02
dep_window happens in appendCohort (62/69 passing tests); Minor other…
TinoDidriksen Aug 29, 2025
9ddc564
Move text belonging to removed cohorts to prior not-removed cohorts, …
TinoDidriksen Aug 29, 2025
7a3bdb4
Baseform/wordform type isn't enough to exclude (64/69)
TinoDidriksen Aug 29, 2025
6c82135
Create window for trailing vars (65/69)
TinoDidriksen Aug 29, 2025
86a856c
3 distinct packet types (67/69)
TinoDidriksen Aug 29, 2025
4acac69
Ensure binary stream is little endian (still 67/69)
TinoDidriksen Aug 29, 2025
2450df0
multiple packet types python
mr-martian Sep 5, 2025
21d7940
update docs
mr-martian Sep 5, 2025
ea29654
Move mappings to the end; Tell cg-sort which prefix
TinoDidriksen Sep 8, 2025
7aa0008
Force PERL_UNICODE=SDA in the test runner
TinoDidriksen Sep 8, 2025
7838dae
Version
TinoDidriksen Sep 8, 2025
f4f812f
Install cg-stabilize-relations
TinoDidriksen Sep 8, 2025
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
2 changes: 2 additions & 0 deletions .gitignore
Original file line number Diff line number Diff line change
Expand Up @@ -32,6 +32,8 @@ Makefile
/test/**/output*.txt
/test/**/untraced*.txt
/test/**/std*.txt
/test/**/*.bsf*.txt
/test/**/*.bsf.cg3
/test/**/*.out.cg3
/test/**/*.cg3b
/test/**/*.bin
Expand Down
3 changes: 2 additions & 1 deletion CMakeLists.txt
Original file line number Diff line number Diff line change
Expand Up @@ -149,7 +149,7 @@ if(EMSCRIPTEN)
endif()
endif()

add_definitions(-DUNISTR_FROM_CHAR_EXPLICIT=explicit -DUNISTR_FROM_STRING_EXPLICIT=explicit)
add_definitions(-DUNISTR_FROM_CHAR_EXPLICIT=explicit -DUNISTR_FROM_STRING_EXPLICIT=explicit -DU_CHARSET_IS_UTF8=1)
include_directories("include")
include_directories("src")

Expand All @@ -165,6 +165,7 @@ configure_file(scripts/cg3-autobin.pl.in scripts/cg3-autobin.pl @ONLY)
install(PROGRAMS
"${CMAKE_CURRENT_BINARY_DIR}/scripts/cg3-autobin.pl"
"${CMAKE_CURRENT_SOURCE_DIR}/scripts/cg-sort"
"${CMAKE_CURRENT_SOURCE_DIR}/scripts/cg-stabilize-relations"
"${CMAKE_CURRENT_SOURCE_DIR}/scripts/cg-strictify"
"${CMAKE_CURRENT_SOURCE_DIR}/scripts/cg-untrace"
DESTINATION ${CMAKE_INSTALL_BINDIR})
Expand Down
6 changes: 3 additions & 3 deletions include/getopt/getopt.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -37,14 +37,14 @@ int getopt(int argc, char **argv, const char *opts) {
argv[optind][0] != '-' || argv[optind][1] == '\0')
return(EOF);
else if (strcmp(argv[optind], "--") == 0) {
optind++;
++optind;
return(EOF);
}
optopt = c = argv[optind][sp];
if (c == ':' || (cp=strchr(opts, c)) == 0) {
ERR(": illegal option -- ", (char)c);
if (argv[optind][++sp] == '\0') {
optind++;
++optind;
sp = 1;
}
return('?');
Expand All @@ -65,7 +65,7 @@ int getopt(int argc, char **argv, const char *opts) {
else {
if (argv[optind][++sp] == '\0') {
sp = 1;
optind++;
++optind;
}
optarg = nullptr;
}
Expand Down
61 changes: 61 additions & 0 deletions manual/streamformats.xml
Original file line number Diff line number Diff line change
Expand Up @@ -183,4 +183,65 @@
</para>
</section>

<section id="stream-binary">
<title>Binary Format</title>
<indexterm>
<primary>Binary Stream Format</primary>
</indexterm>
<para>
The binary format can be generated by <link linkend="cg-conv">cg-conv</link> and can be parsed either by cg-conv or by the Python bindings.
It is designed for faster parsing than the textual formats.
The intended use case is when the same input needs to be processed multiple times (such as when testing several grammars).
</para>
<para>
The stream begins with a header containing <code>CGBF</code> followed by a 4-byte version number (currently <code>1</code>).
After that, each packet begins with 1 byte indicating its contents.
<code>1</code> is a window, <code>2</code> is a command, and <code>3</code> is text.
</para>
<para>
Command packets have a second byte identifying the command: <code>1</code> for <code>FLUSH</code>, <code>2</code> for <code>EXIT</code>, <code>3</code> for <code>IGNORE</code>, and <code>4</code> for <code>RESUME</code>.
Commands which manipulate variables are represented in window packets.
</para>
<para>
Text packets consist of a 2-byte length followed by the contents in UTF-8.
</para>
<para>
Each window packet begins with 4 bytes specifying the length of the block and then the following structure:
<screen>
window flags [2]
&gt; 1 = has multi-window dependencies
tags [array of str]
variables [array]
mode [1]
&gt; 1 = SETVAR (var = val)
&gt; 2 = SETVAR (var = *)
&gt; 3 = REMVAR
var [tag]
val or 0 [tag]
text [str]
text_post [str]
cohorts [array]
flags [2]
&gt; 1 = is target of a relation
wordform [tag]
static_tags [array of tag]
dep_self [4]
dep_parent or 0xFFFFFFFF [4]
relations [array]
tag [tag]
head [4]
text [str]
wblank [str]
readings [array]
flags [2]
&gt; 1 = is subreading of predecessor
&gt; 2 = deleted
baseform [tag]
tags [array of tag]
</screen>
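Arrays and strings are both encoded with a 2-byte length followed by the specified number of objects or UTF-8 bytes. A bracketed number such as <code>[2]</code> is a field size in bytes, and all multi-byte integers are little-endian.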
Each item of type <code>[tag]</code> is a 2-byte index into the window-wide <code>tags</code> array.
</para>
</section>

</chapter>
4 changes: 2 additions & 2 deletions python/CMakeLists.txt
Original file line number Diff line number Diff line change
Expand Up @@ -4,7 +4,7 @@ set(PYTHON_EXECUTABLE ${Python_EXECUTABLE})

set(PYTHON_FILE "constraint_grammar.py")
set(CPP_WRAP_FILE "constraint_grammar_wrap.cpp")
file(WRITE "${CMAKE_CURRENT_BINARY_DIR}/cg3.py" "from constraint_grammar import *\n")
set(PYTHON_LIBRARY_FILE "cg3.py")

set(BUILD_DEFS "")
get_directory_property(_defs DIRECTORY ${CMAKE_CURRENT_SOURCE_DIR} COMPILE_DEFINITIONS)
Expand All @@ -21,7 +21,7 @@ add_custom_command(OUTPUT ${CPP_WRAP_FILE} ${PYTHON_FILE}
)

add_custom_target(wrapper ALL
DEPENDS ${CPP_WRAP_FILE} ${PYTHON_FILE}
DEPENDS ${CPP_WRAP_FILE} ${PYTHON_FILE} ${PYTHON_LIBRARY_FILE}
VERBATIM
)

Expand Down
177 changes: 177 additions & 0 deletions python/cg3.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,177 @@
from constraint_grammar import *

from collections import defaultdict
from dataclasses import dataclass, field
import struct
from typing import DefaultDict, Dict, List, Optional

@dataclass
class Reading:
    """One reading parsed from a binary-format window.

    Also used for a cohort's static part, in which case `lemma` carries
    the wordform and `tags` the static tags.
    """

    # Baseform string (or wordform when used as Cohort.static).
    lemma: str = ""
    # Tag strings, in stream order.
    tags: List[str] = field(default_factory=list)
    # Next reading in the subreading chain, if any.
    subreading: Optional["Reading"] = None
    # True when the stream flagged this reading as deleted.
    deleted: bool = False

@dataclass
class Cohort:
    """One cohort parsed from a binary-format window."""

    # Wordform (stored in `static.lemma`) and static tags.
    static: Reading = field(default_factory=Reading)
    # Top-level readings; subreadings hang off Reading.subreading.
    readings: List[Reading] = field(default_factory=list)
    # Dependency ID of this cohort.
    dep_self: int = 0
    # Dependency ID of the parent; None when the stream holds 0xFFFFFFFF.
    dep_parent: Optional[int] = None
    # Relation name -> list of head IDs.
    relations: DefaultDict[str, List[int]] = field(
        default_factory=lambda: defaultdict(list))
    # Text attached to the cohort.
    text: str = ""
    # The cohort's wblank string.
    wblank: str = ""

@dataclass
class Window:
    """One window parsed from the binary stream format."""

    # Cohorts in stream order.
    cohorts: List[Cohort] = field(default_factory=list)
    # Variables set in this window; a value of None means `var = *`.
    set_vars: Dict[str, Optional[str]] = field(default_factory=dict)
    # Variables removed in this window.
    rem_vars: List[str] = field(default_factory=list)
    # The window's `text` string from the stream.
    text: str = ""
    # The window's `text_post` string from the stream.
    text_post: str = ""
    # True when window flag bit 1 (multi-window dependencies) is set.
    dep_has_spanned: bool = False

@dataclass
class Packet:
    """One packet from a binary stream: a window, a command, or text."""

    # Packet kind: 'window', 'command' or 'text'.
    type: str = ""
    # Parsed window, for window packets.
    window: Optional[Window] = None
    # Command name (e.g. 'FLUSH'), for command packets.
    command: str = ""
    # Decoded text, for text packets.
    text: str = ""

def parse_binary_window(buf):
    '''Parse the bytestring `buf`, which holds exactly one window
    (without its length header), and return it as a Window() object.
    Most applications probably want parse_binary_stream() instead.

    All integers are read as little-endian. A truncated buffer raises
    struct.error; an out-of-range tag index raises IndexError.'''

    offset = 0

    def unpack(fmt):
        # Decode `fmt` at the cursor and advance past it.
        nonlocal offset
        layout = '<' + fmt
        values = struct.unpack_from(layout, buf, offset)
        offset += struct.calcsize(layout)
        return values

    def read_u16():
        (value,) = unpack('H')
        return value

    def read_u32():
        (value,) = unpack('I')
        return value

    def read_str():
        # Strings are a 2-byte length followed by that many UTF-8 bytes.
        size = read_u16()
        if size == 0:
            return ''
        (raw,) = unpack(f'{size}s')
        return raw.decode('utf-8')

    window = Window()

    # Window flag bit 1: has multi-window dependencies.
    if read_u16() & 1:
        window.dep_has_spanned = True

    # Window-wide tag table; every [tag] below is an index into it.
    tag_count = read_u16()
    tags = [read_str() for _ in range(tag_count)]

    def read_tags():
        # An array of tag indices, resolved to their strings.
        count = read_u16()
        if count == 0:
            return []
        return [tags[index] for index in unpack(f'{count}H')]

    var_count = read_u16()
    for _ in range(var_count):
        (mode,) = unpack('B')
        var_idx = read_u16()
        val_idx = read_u16()
        if mode == 1:
            # SETVAR (var = val)
            window.set_vars[tags[var_idx]] = tags[val_idx]
        elif mode == 2:
            # SETVAR (var = *)
            window.set_vars[tags[var_idx]] = None
        elif mode == 3:
            # REMVAR
            window.rem_vars.append(tags[var_idx])

    window.text = read_str()
    window.text_post = read_str()

    cohort_count = read_u16()
    for _ in range(cohort_count):
        cohort = Cohort()
        read_u16()  # cohort flags: consumed but not stored
        cohort.static.lemma = tags[read_u16()]
        cohort.static.tags = read_tags()
        cohort.dep_self = read_u32()
        parent = read_u32()
        # 0xFFFFFFFF is the "no parent" sentinel.
        cohort.dep_parent = None if parent == 0xffffffff else parent

        rel_count = read_u16()
        for _ in range(rel_count):
            rel_name = tags[read_u16()]
            rel_head = read_u32()
            cohort.relations[rel_name].append(rel_head)

        cohort.text = read_str()
        cohort.wblank = read_str()

        reading_count = read_u16()
        previous = None
        for _ in range(reading_count):
            flags = read_u16()
            reading = Reading()
            reading.lemma = tags[read_u16()]
            reading.tags = read_tags()
            # Flag bit 1: subreading of the predecessor; chain it there
            # rather than listing it as a top-level reading.
            if flags & 1 and previous is not None:
                previous.subreading = reading
            else:
                cohort.readings.append(reading)
            # Flag bit 2: deleted.
            if flags & 2:
                reading.deleted = True
            previous = reading

        window.cohorts.append(cohort)

    return window

def parse_binary_stream(fin, windows_only=False):
    '''Given a binary file `fin`, yield a series of Packet() objects.
    Raises ValueError if the stream header is missing, truncated or
    invalid. If `windows_only` is True, packets containing commands or
    text are skipped and Window() objects are yielded instead.

    Iteration ends silently at end of input or at a truncated packet.
    Unknown packet types and unknown command codes are ignored.'''

    header = fin.read(8)
    # A short read means there is no complete header; report that as the
    # documented ValueError rather than letting struct.error escape.
    if len(header) != 8:
        raise ValueError('Binary format header not found!')
    label, version = struct.unpack('<4sI', header)
    if label != b'CGBF':
        raise ValueError('Binary format header not found!')
    if version != 1:
        raise ValueError('Unknown binary format version!')

    commands = {1: 'FLUSH', 2: 'EXIT', 3: 'IGNORE', 4: 'RESUME'}

    while True:
        ptype = fin.read(1)
        if len(ptype) != 1:
            break
        if ptype[0] == 1:
            # Window packet: 4-byte length, then the window block.
            spec = fin.read(4)
            if len(spec) != 4:
                break
            block_len = struct.unpack('<I', spec)[0]
            block = fin.read(block_len)
            if len(block) != block_len:
                break
            window = parse_binary_window(block)
            if windows_only:
                yield window
            else:
                yield Packet(type='window', window=window)
        elif ptype[0] == 2:
            # Command packet: a single byte identifying the command.
            cmd = fin.read(1)
            if len(cmd) != 1:
                break
            name = commands.get(cmd[0])
            if windows_only or name is None:
                continue
            yield Packet(type='command', command=name)
        elif ptype[0] == 3:
            # Text packet: 2-byte length, then that many UTF-8 bytes.
            lbuf = fin.read(2)
            if len(lbuf) != 2:
                break
            ln = struct.unpack('<H', lbuf)[0]
            raw = fin.read(ln)
            if len(raw) != ln:
                break
            if not windows_only:
                yield Packet(type='text', text=raw.decode('utf-8'))
        # Any other packet type is ignored.
16 changes: 15 additions & 1 deletion scripts/cg-sort
Original file line number Diff line number Diff line change
Expand Up @@ -14,7 +14,7 @@ use Getopt::Long;
Getopt::Long::Configure('bundling');
Getopt::Long::Configure('no_ignore_case');
my %opts = ();
GetOptions(\%opts, ('weight|w:s', 'reverse|r', 'first|1', 'help|?'));
GetOptions(\%opts, ('weight|w:s', 'mapping|m:s', 'reverse|r', 'first|1', 'help|?'));

sub print_help {
print <<'XOUT';
Expand All @@ -25,6 +25,7 @@ Pipe a CG stream through this to sort and unique the readings of each cohort.
Options:
-?, --help outputs this help
-w, --weight sorts by a numeric tag; defaults to W
-m, --mapping sorts mapping tags with given prefix; defaults to @
-r, --reverse reverses the sort order
-1, --first only keep the first reading

Expand All @@ -41,6 +42,11 @@ if (exists($opts{weight}) && length($opts{weight})) {
$W = $opts{weight};
}

my $M = '@';
if (exists($opts{mapping}) && length($opts{mapping})) {
$M = $opts{mapping};
}

my $in_cohort = 0;
my %readings = ();
my %deleted = ();
Expand All @@ -66,6 +72,14 @@ sub print_sorted_readings {
if (!@_) {
return;
}
if (exists($opts{mapping})) {
foreach (@_) {
my @tags = ($_ =~ m@ ($M\S+)@g);
@tags = sort @tags;
my $t = join(' ', @tags);
$_ =~ s@( $M\S+)+@ $t@;
}
}
if (exists($opts{weight})) {
@_ = sort sort_weight @_;
}
Expand Down
21 changes: 21 additions & 0 deletions scripts/cg-stabilize-relations
Original file line number Diff line number Diff line change
@@ -0,0 +1,21 @@
#!/usr/bin/env python3

import argparse
import re
import sys

# Maps each original numeric ID (as a string) to its stable replacement,
# assigned 1, 2, 3, ... in order of first appearance in the stream.
id_map = {}

# Matches ID:<n> and R:<name>:<n> tags; group 1 is the prefix, group 2 the number.
tag = re.compile(r'\b(ID:|R:[^:\s]+:)(\d+)\b')


def repl(matchobj):
    """Return the matched tag with its number replaced by the stable one.

    A number seen for the first time is assigned the next free value in
    `id_map`; later occurrences reuse it.
    """
    n = matchobj.group(2)
    if n not in id_map:
        id_map[n] = str(len(id_map) + 1)
    return matchobj.group(1) + id_map[n]


def main():
    """Renumber IDs and relation targets from stdin to stdout."""
    # The text is the description; passed positionally it would become `prog`.
    parser = argparse.ArgumentParser(
        description='Pipe a CG stream through this to stabilize IDs and relations so they have consistent numbers')
    parser.parse_args()

    for line in sys.stdin:
        sys.stdout.write(tag.sub(repl, line))


if __name__ == '__main__':
    main()
6 changes: 3 additions & 3 deletions src/ApertiumApplicator.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -382,7 +382,7 @@ void ApertiumApplicator::runGrammarOnText(std::istream& input, std::ostream& out

lCohort = cCohort = alloc_cohort(cSWindow);
cCohort->global_number = gWindow->cohort_counter++;
numCohorts++;
++numCohorts;

cCohort->text = blank;
blank.clear();
Expand Down Expand Up @@ -782,8 +782,8 @@ void ApertiumApplicator::printReading(const Reading* reading, std::ostream& outp
if (reading->parent->dep_parent == 0) {
pr = reading->parent->parent->cohorts[0];
}
else if (reading->parent->parent->parent->cohort_map.find(reading->parent->dep_parent) != reading->parent->parent->parent->cohort_map.end()) {
pr = reading->parent->parent->parent->cohort_map[reading->parent->dep_parent];
else if (gWindow->cohort_map.find(reading->parent->dep_parent) != gWindow->cohort_map.end()) {
pr = gWindow->cohort_map[reading->parent->dep_parent];
}
}

Expand Down
Loading