From a2e01921f9c5443ea89ba21414f47ff158813a28 Mon Sep 17 00:00:00 2001 From: Daniel Swanson Date: Mon, 23 Jun 2025 14:35:16 -0400 Subject: [PATCH 01/42] start on binary format --- src/BinaryApplicator.cpp | 347 +++++++++++++++++++++++++++++++++++++++ src/BinaryApplicator.hpp | 52 ++++++ src/CMakeLists.txt | 2 + src/FormatConverter.cpp | 15 ++ src/FormatConverter.hpp | 3 +- src/cg-conv.cpp | 6 + src/cg3.h | 1 + src/options_conv.hpp | 4 + 8 files changed, 429 insertions(+), 1 deletion(-) create mode 100644 src/BinaryApplicator.cpp create mode 100644 src/BinaryApplicator.hpp diff --git a/src/BinaryApplicator.cpp b/src/BinaryApplicator.cpp new file mode 100644 index 00000000..65d2f278 --- /dev/null +++ b/src/BinaryApplicator.cpp @@ -0,0 +1,347 @@ +/* +* Copyright (C) 2007-2025, GrammarSoft ApS +* Developed by Tino Didriksen +* Design by Eckhard Bick , Tino Didriksen +* +* This program is free software: you can redistribute it and/or modify +* it under the terms of the GNU General Public License as published by +* the Free Software Foundation, either version 3 of the License, or +* (at your option) any later version. +* +* This program is distributed in the hope that it will be useful, +* but WITHOUT ANY WARRANTY; without even the implied warranty of +* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the +* GNU General Public License for more details. +* +* You should have received a copy of the GNU General Public License +* along with this program. If not, see <https://www.gnu.org/licenses/>.
+*/ + +#include "BinaryApplicator.hpp" +#include "Grammar.hpp" + +namespace CG3 { + +BinaryApplicator::BinaryApplicator(std::ostream& ux_err) + : GrammarApplicator(ux_err) +{ +} + +void BinaryApplicator::runGrammarOnText(std::istream& input, std::ostream& output) { + ux_stdin = &input; + ux_stdout = &output; + + if (!input.good()) { + u_fprintf(ux_stderr, "Error: Input is null - nothing to parse!\n"); + CG3Quit(1); + } + if (input.eof()) { + u_fprintf(ux_stderr, "Error: Input is empty - nothing to parse!\n"); + CG3Quit(1); + } + if (!output) { + u_fprintf(ux_stderr, "Error: Output is null - cannot write to nothing!\n"); + CG3Quit(1); + } + + if (!grammar) { + u_fprintf(ux_stderr, "Error: No grammar provided - cannot continue! Hint: call setGrammar() first.\n"); + CG3Quit(1); + } + + index(); + + uint32_t resetAfter = ((num_windows + 4) * 2 + 1); + bool flushAfter = false; + + gWindow->window_span = num_windows; + + while (!input.eof()) { + flushAfter = readWindow(); + gWindow->shuffleWindowsDown(); + runGrammarOnWindow(); + ++numWindows; + if (numWindows % resetAfter == 0) { + resetIndexes(); + } + if (flushAfter) { + while (!gWindow->next.empty()) { + gWindow->shuffleWindowsDown(); + runGrammarOnWindow(); + } + gWindow->shuffleWindowsDown(); + while (!gWindow->previous.empty()) { + SingleWindow* tmp = gWindow->previous.front(); + printSingleWindow(tmp, output); + free_swindow(tmp); + gWindow->previous.erase(gWindow->previous.begin()); + } + } + } +} + +#define READ_U16_INTO(dest) \ + do { \ + (dest) = reinterpret_cast(&buf[pos])[0]; \ + pos += 2; \ + } while (false) + +#define READ_U32_INTO(dest) \ + do { \ + (dest) = reinterpret_cast(&buf[pos])[0]; \ + pos += 4; \ + } while (false) + +#define READ_STR_INTO(dest) \ + do { \ + uint16_t tl = reinterpret_cast(&buf[pos])[0]; \ + pos += 2; \ + (dest).clear(); \ + (dest).resize(tl, 0); \ + int32_t olen = 0; \ + UErrorCode status = U_ZERO_ERROR; \ + u_strFromUTF8(&(dest)[0], tl, &olen, &buf[pos], tl, &status); \ + 
(dest).resize(olen); \ + pos += tl; \ + } while (false) + +bool BinaryApplicator::readWindow() { + SingleWindow* cSWindow = gWindow->allocAppendSingleWindow(); + + uint32_t cs = 0; + readRaw(*ux_stdin, cs); + + if (ux_stdin->eof()) { + return true; + } + + std::string buf(cs, 0); + ux_stdin->read(&buf[0], cs); + uint32_t pos = 0; + + // TODO: flags + uint16_t flags; + READ_U16_INTO(flags); + if (flags & BFW_FLUSH) { + cSWindow->flush_after = true; + } + + TagVector window_tags; + uint16_t tag_count; + READ_U16_INTO(tag_count); + for (uint16_t i = 0; i < tag_count; i++) { + UString tg; + READ_STR_INTO(tg); + u_fprintf(ux_stderr, "pos = %u, tg = %S, i = %u / %u\n", pos, tg.data(), i, tag_count); + window_tags.push_back(addTag(tg)); + } + + uint16_t var_count; + READ_U16_INTO(var_count); + // TODO + + READ_STR_INTO(cSWindow->text); + READ_STR_INTO(cSWindow->text_post); + + uint16_t cohort_count; + READ_U16_INTO(cohort_count); + uint16_t tag; + for (uint16_t cn = 0; cn < cohort_count; cn++) { + Cohort* cCohort = alloc_cohort(cSWindow); + cCohort->global_number = gWindow->cohort_counter++; + + READ_U16_INTO(flags); + /*if (flags & BFC_DELETED) { + cCohort->type |= CT_DELETED; + }*/ + + READ_U16_INTO(tag); + cCohort->wordform = window_tags[tag]; + + READ_U16_INTO(tag_count); + if (tag_count) { + cCohort->wread = alloc_reading(cCohort); + for (uint16_t tn = 0; tn < tag_count; tn++) { + READ_U16_INTO(tag); + addTagToReading(*cCohort->wread, window_tags[tag]); + } + } + + READ_U32_INTO(cCohort->dep_self); + READ_U32_INTO(cCohort->dep_parent); + + READ_STR_INTO(cCohort->text); + READ_STR_INTO(cCohort->wblank); + + uint16_t reading_count; + READ_U16_INTO(reading_count); + Reading* prev = nullptr; + for (uint16_t rn = 0; rn < reading_count; rn++) { + Reading* cReading = alloc_reading(cCohort); + addTagToReading(*cReading, cCohort->wordform); + + READ_U16_INTO(flags); + if (flags & BFR_DELETED) { + cReading->deleted = 1; + } + + READ_U16_INTO(tag_count); + for (uint16_t tn = 0; 
tn < tag_count; tn++) { + READ_U16_INTO(tag); + addTagToReading(*cReading, window_tags[tag]); + } + + if (prev && flags & BFR_SUBREADING) { + prev->next = cReading; + } + else { + cCohort->appendReading(cReading); + } + prev = cReading; + } + } + + return cSWindow->flush_after; +} + +#define WRITE_U16_INTO(n, buffer) \ + do { \ + std::string tmp(2, 0); \ + uint16_t tmp_n = (n); \ + tmp.assign(reinterpret_cast(&tmp_n), 2); \ + (buffer) += tmp; \ + } while (false) + +#define WRITE_U32_INTO(n, buffer) \ + do { \ + std::string tmp(4, 0); \ + uint32_t tmp_n = (n); \ + tmp.assign(reinterpret_cast(&tmp_n), 4); \ + (buffer) += tmp; \ + } while (false) + +#define WRITE_TAG_INTO(tag_, buffer) \ + do { \ + if (tag_index.find((tag_)) == tag_index.end()) { \ + tag_index[(tag_)] = tags_to_write.size(); \ + tags_to_write.push_back((tag_)); \ + u_fprintf(ux_stderr, "adding tag %S\n", (tag_)->tag.data()); \ + } \ + WRITE_U16_INTO(tag_index[(tag_)], buffer); \ + } while (false) + +#define WRITE_STR_INTO(s, buffer) \ + do { \ + std::string tmp((s).size() * 4, 0); \ + int32_t olen = 0; \ + UErrorCode status = U_ZERO_ERROR; \ + u_strToUTF8(&tmp[0], SI32((s).size() * 4 - 1), &olen, (s).data(), SI32((s).size()), &status); \ + tmp.resize(olen); \ + WRITE_U16_INTO(UI16(olen), (buffer)); \ + (buffer) += tmp; \ + } while (false) + +void BinaryApplicator::printSingleWindow(SingleWindow* window, std::ostream& output, bool profiling) { + TagVector tags_to_write; + std::map tag_index; + + std::string cohort_buffer; + uint16_t cohort_count = 0; + for (auto& cohort : window->all_cohorts) { + if (cohort->local_number == 0 || (cohort->type & CT_REMOVED)) { + continue; + } + cohort_count++; + + uint16_t flags = 0; + WRITE_U16_INTO(flags, cohort_buffer); + + WRITE_TAG_INTO(cohort->wordform, cohort_buffer); + if (cohort->wread) { + std::string tag_buffer; + uint16_t tag_count = 0; + for (auto tter : cohort->wread->tags_list) { + if (tter == cohort->wordform->hash) { + continue; + } + 
WRITE_TAG_INTO(grammar->single_tags[tter], tag_buffer); + tag_count++; + } + WRITE_U16_INTO(tag_count, cohort_buffer); + cohort_buffer += tag_buffer; + } + else { + WRITE_U16_INTO(0, cohort_buffer); + } + + WRITE_U32_INTO(cohort->dep_self, cohort_buffer); + WRITE_U32_INTO(cohort->dep_parent, cohort_buffer); + + WRITE_STR_INTO(cohort->text, cohort_buffer); + WRITE_STR_INTO(cohort->wblank, cohort_buffer); + + std::string reading_buffer; + uint16_t reading_count = 0; + std::sort(cohort->readings.begin(), cohort->readings.end(), Reading::cmp_number); + for (auto top_reading : cohort->readings) { + if (top_reading->noprint) { + continue; + } + auto reading = top_reading; + while (reading) { + reading_count++; + uint16_t flags = 0; + if (reading != top_reading) { + flags |= BFR_SUBREADING; + } + std::string tag_buffer; + uint16_t tag_count = 0; + if (reading->baseform) { + WRITE_TAG_INTO(grammar->single_tags[reading->baseform], tag_buffer); + tag_count++; + } + for (auto& tter : reading->tags_list) { + auto tag = grammar->single_tags[tter]; + if (tag->type & T_BASEFORM) { + continue; + } + WRITE_TAG_INTO(tag, tag_buffer); + tag_count++; + } + WRITE_U16_INTO(tag_count, reading_buffer); + reading_buffer += tag_buffer; + reading = reading->next; + } + } + WRITE_U16_INTO(reading_count, cohort_buffer); + cohort_buffer += reading_buffer; + } + + std::string header_buffer; + + uint16_t flags = 0; + if (window->flush_after) { + flags |= BFW_FLUSH; + } + WRITE_U16_INTO(flags, header_buffer); + + WRITE_U16_INTO(tags_to_write.size(), header_buffer); + for (auto& tag : tags_to_write) { + WRITE_STR_INTO(tag->tag, header_buffer); + } + + // TODO: variables + WRITE_U16_INTO(0, header_buffer); + + WRITE_STR_INTO(window->text, header_buffer); + WRITE_STR_INTO(window->text_post, header_buffer); + + WRITE_U16_INTO(cohort_count, header_buffer); + + uint32_t total_size = header_buffer.size() + cohort_buffer.size(); + writeRaw(output, total_size); + output.write(header_buffer.data(), 
header_buffer.size()); + output.write(cohort_buffer.data(), cohort_buffer.size()); + output.flush(); +} +} diff --git a/src/BinaryApplicator.hpp b/src/BinaryApplicator.hpp new file mode 100644 index 00000000..1fcef4a9 --- /dev/null +++ b/src/BinaryApplicator.hpp @@ -0,0 +1,52 @@ +/* +* Copyright (C) 2007-2025, GrammarSoft ApS +* Developed by Tino Didriksen +* Design by Eckhard Bick , Tino Didriksen +* +* This program is free software: you can redistribute it and/or modify +* it under the terms of the GNU General Public License as published by +* the Free Software Foundation, either version 3 of the License, or +* (at your option) any later version. +* +* This program is distributed in the hope that it will be useful, +* but WITHOUT ANY WARRANTY; without even the implied warranty of +* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the +* GNU General Public License for more details. +* +* You should have received a copy of the GNU General Public License +* along with this program. If not, see <https://www.gnu.org/licenses/>.
+*/ + +#pragma once +#ifndef GRAMMARAPPLICATORBINARY_H +#define GRAMMARAPPLICATORBINARY_H + +#include "GrammarApplicator.hpp" + +namespace CG3 { + +enum BinaryFormatFlags { + // Window + BFW_FLUSH = (1 << 1), + // Cohort + BFC_DELETED = (1 << 1), + // Reading + BFR_SUBREADING = (1 << 1), + BFR_DELETED = (1 << 2), +}; + +class BinaryApplicator : public virtual GrammarApplicator { +public: + BinaryApplicator(std::ostream& ux_err); + + void runGrammarOnText(std::istream& input, std::ostream& output); + +protected: + void printSingleWindow(SingleWindow* window, std::ostream& output, bool profiling = false) override; + +private: + bool readWindow(); +}; +} + +#endif diff --git a/src/CMakeLists.txt b/src/CMakeLists.txt index c3466559..cd4b2256 100644 --- a/src/CMakeLists.txt +++ b/src/CMakeLists.txt @@ -80,6 +80,8 @@ set(LIBCG3_SOURCES AST.hpp ApertiumApplicator.cpp ApertiumApplicator.hpp + BinaryApplicator.cpp + BinaryApplicator.hpp BinaryGrammar.cpp BinaryGrammar.hpp BinaryGrammar_read.cpp diff --git a/src/FormatConverter.cpp b/src/FormatConverter.cpp index f88f07b6..62fae5cc 100644 --- a/src/FormatConverter.cpp +++ b/src/FormatConverter.cpp @@ -96,6 +96,7 @@ cg3_sformat detectFormat(std::string_view buf8) { FormatConverter::FormatConverter(std::ostream& ux_err) : GrammarApplicator(ux_err) , ApertiumApplicator(ux_err) + , BinaryApplicator(ux_err) , FSTApplicator(ux_err) , JsonlApplicator(ux_err) , MatxinApplicator(ux_err) @@ -151,6 +152,10 @@ void FormatConverter::runGrammarOnText(std::istream& input, std::ostream& output JsonlApplicator::runGrammarOnText(input, output); break; } + case CG3SF_BINARY: { + BinaryApplicator::runGrammarOnText(input, output); + break; + } default: CG3Quit(); } @@ -182,6 +187,8 @@ void FormatConverter::printCohort(Cohort* cohort, std::ostream& output, bool pro JsonlApplicator::printCohort(cohort, output, profiling); break; } + case CG3SF_BINARY: + break; default: CG3Quit(); } @@ -213,6 +220,10 @@ void 
FormatConverter::printSingleWindow(SingleWindow* window, std::ostream& outp JsonlApplicator::printSingleWindow(window, output, profiling); break; } + case CG3SF_BINARY: { + BinaryApplicator::printSingleWindow(window, output, profiling); + break; + } default: CG3Quit(); } @@ -224,6 +235,8 @@ void FormatConverter::printStreamCommand(UStringView cmd, std::ostream& output) JsonlApplicator::printStreamCommand(cmd, output); break; } + case CG3SF_BINARY: + break; case CG3SF_CG: case CG3SF_APERTIUM: case CG3SF_FST: @@ -242,6 +255,8 @@ void FormatConverter::printPlainTextLine(UStringView line, std::ostream& output) JsonlApplicator::printPlainTextLine(line, output); break; } + case CG3SF_BINARY: + break; case CG3SF_CG: case CG3SF_APERTIUM: case CG3SF_FST: diff --git a/src/FormatConverter.hpp b/src/FormatConverter.hpp index 61c93730..16f6cff6 100644 --- a/src/FormatConverter.hpp +++ b/src/FormatConverter.hpp @@ -22,6 +22,7 @@ #define c6d28b7452ec699b_FORMATCONVERTER_H #include "ApertiumApplicator.hpp" +#include "BinaryApplicator.hpp" #include "FSTApplicator.hpp" #include "JsonlApplicator.hpp" #include "MatxinApplicator.hpp" @@ -34,7 +35,7 @@ namespace CG3 { cg3_sformat detectFormat(std::string_view str); -class FormatConverter : public ApertiumApplicator, public FSTApplicator, public JsonlApplicator, public MatxinApplicator, public NicelineApplicator, public PlaintextApplicator { +class FormatConverter : public ApertiumApplicator, public BinaryApplicator, public FSTApplicator, public JsonlApplicator, public MatxinApplicator, public NicelineApplicator, public PlaintextApplicator { public: FormatConverter(std::ostream& ux_err); diff --git a/src/cg-conv.cpp b/src/cg-conv.cpp index d136ff16..e88bb122 100644 --- a/src/cg-conv.cpp +++ b/src/cg-conv.cpp @@ -151,6 +151,9 @@ int main(int argc, char* argv[]) { else if (options_conv[IN_JSONL].doesOccur) { fmt = CG3SF_JSONL; } + else if (options_conv[IN_BINARY].doesOccur) { + fmt = CG3SF_BINARY; + } if (options_conv[IN_AUTO].doesOccur || 
fmt == CG3SF_INVALID) { _instream = applicator.detectFormat(std::cin); @@ -210,6 +213,9 @@ int main(int argc, char* argv[]) { else if (options_conv[OUT_JSONL].doesOccur) { applicator.fmt_output = CG3SF_JSONL; } + else if (options_conv[OUT_BINARY].doesOccur) { + applicator.fmt_output = CG3SF_BINARY; + } if (options_conv[UNICODE_TAGS].doesOccur) { applicator.unicode_tags = true; diff --git a/src/cg3.h b/src/cg3.h index abdc0863..27ee497a 100644 --- a/src/cg3.h +++ b/src/cg3.h @@ -80,6 +80,7 @@ typedef enum { CG3SF_FST, CG3SF_PLAIN, CG3SF_JSONL, + CG3SF_BINARY, } cg3_sformat; // Default usage: if (!cg3_init(stdin, stdout, stderr)) { exit(1); } diff --git a/src/options_conv.hpp b/src/options_conv.hpp index da1bef57..c56e9f7f 100644 --- a/src/options_conv.hpp +++ b/src/options_conv.hpp @@ -41,6 +41,7 @@ enum OPTIONS { IN_FST, IN_PLAIN, IN_JSONL, + IN_BINARY, ADD_TAGS, OUT_CG, OUT_CG2, @@ -50,6 +51,7 @@ enum OPTIONS { OUT_NICELINE, OUT_PLAIN, OUT_JSONL, + OUT_BINARY, FST_WFACTOR, FST_WTAG, SUB_DELIMITER, @@ -75,6 +77,7 @@ std::array options_conv{ UOption{"in-fst", 'f', UOPT_NO_ARG, "sets input format to HFST/XFST"}, UOption{"in-plain", 'x', UOPT_NO_ARG, "sets input format to plain text"}, UOption{"in-jsonl", 'j', UOPT_NO_ARG, "sets input format to JSONL (experimental, specs below)"}, + UOption{"in-binary", 'z', UOPT_NO_ARG, "sets input format to binary (experimental)"}, UOption{"add-tags", 0, UOPT_NO_ARG, "adds minimal analysis to readings (implies -x)"}, UOption{"out-cg", 'C', UOPT_NO_ARG, "sets output format to CG (default)"}, UOption{"V", 'V', UOPT_NO_ARG}, @@ -84,6 +87,7 @@ std::array options_conv{ UOption{"out-niceline", 'N', UOPT_NO_ARG, "sets output format to Niceline CG"}, UOption{"out-plain", 'X', UOPT_NO_ARG, "sets output format to plain text"}, UOption{"out-jsonl", 'J', UOPT_NO_ARG, "sets output format to JSONL (experimental, specs below)"}, + UOption{"out-binary", 'Z', UOPT_NO_ARG, "sets output format to binary (experimental)"}, UOption{"wfactor", 'W', 
UOPT_REQUIRES_ARG, "FST weight factor (defaults to 1.0)"}, UOption{"wtag", 0, UOPT_REQUIRES_ARG, "FST weight tag prefix (defaults to W)"}, UOption{"sub-delim", 'S', UOPT_REQUIRES_ARG, "FST sub-reading delimiters (defaults to #)"}, From db2ac151335b74c09c3b38fa79cf9ef0d85207fb Mon Sep 17 00:00:00 2001 From: Daniel Swanson Date: Mon, 23 Jun 2025 21:52:20 -0400 Subject: [PATCH 02/42] assorted fixes --- src/BinaryApplicator.cpp | 80 ++++++++++++++++++++++++---------------- 1 file changed, 49 insertions(+), 31 deletions(-) diff --git a/src/BinaryApplicator.cpp b/src/BinaryApplicator.cpp index 65d2f278..4ff5df07 100644 --- a/src/BinaryApplicator.cpp +++ b/src/BinaryApplicator.cpp @@ -56,28 +56,41 @@ void BinaryApplicator::runGrammarOnText(std::istream& input, std::ostream& outpu gWindow->window_span = num_windows; + auto flush = [&]() { + if (gWindow->back()) { + gWindow->back()->flush_after = true; + } + + while (!gWindow->next.empty()) { + gWindow->shuffleWindowsDown(); + runGrammarOnWindow(); + } + + gWindow->shuffleWindowsDown(); + while (!gWindow->previous.empty()) { + SingleWindow* tmp = gWindow->previous.front(); + printSingleWindow(tmp, output); + free_swindow(tmp); + gWindow->previous.erase(gWindow->previous.begin()); + } + flushAfter = false; + }; + while (!input.eof()) { flushAfter = readWindow(); - gWindow->shuffleWindowsDown(); - runGrammarOnWindow(); ++numWindows; - if (numWindows % resetAfter == 0) { - resetIndexes(); - } - if (flushAfter) { - while (!gWindow->next.empty()) { - gWindow->shuffleWindowsDown(); - runGrammarOnWindow(); - } + if (gWindow->next.size() > num_windows) { gWindow->shuffleWindowsDown(); - while (!gWindow->previous.empty()) { - SingleWindow* tmp = gWindow->previous.front(); - printSingleWindow(tmp, output); - free_swindow(tmp); - gWindow->previous.erase(gWindow->previous.begin()); + runGrammarOnWindow(); + if (numWindows % resetAfter == 0) { + resetIndexes(); } } + if (flushAfter) { + flush(); + } } + flush(); } #define 
READ_U16_INTO(dest) \ @@ -107,6 +120,7 @@ void BinaryApplicator::runGrammarOnText(std::istream& input, std::ostream& outpu bool BinaryApplicator::readWindow() { SingleWindow* cSWindow = gWindow->allocAppendSingleWindow(); + initEmptySingleWindow(cSWindow); uint32_t cs = 0; readRaw(*ux_stdin, cs); @@ -132,7 +146,6 @@ bool BinaryApplicator::readWindow() { for (uint16_t i = 0; i < tag_count; i++) { UString tg; READ_STR_INTO(tg); - u_fprintf(ux_stderr, "pos = %u, tg = %S, i = %u / %u\n", pos, tg.data(), i, tag_count); window_tags.push_back(addTag(tg)); } @@ -149,6 +162,7 @@ bool BinaryApplicator::readWindow() { for (uint16_t cn = 0; cn < cohort_count; cn++) { Cohort* cCohort = alloc_cohort(cSWindow); cCohort->global_number = gWindow->cohort_counter++; + numCohorts++; READ_U16_INTO(flags); /*if (flags & BFC_DELETED) { @@ -181,9 +195,9 @@ bool BinaryApplicator::readWindow() { addTagToReading(*cReading, cCohort->wordform); READ_U16_INTO(flags); - if (flags & BFR_DELETED) { - cReading->deleted = 1; - } + + READ_U16_INTO(tag); + cReading->baseform = window_tags[tag]->hash; READ_U16_INTO(tag_count); for (uint16_t tn = 0; tn < tag_count; tn++) { @@ -191,14 +205,21 @@ bool BinaryApplicator::readWindow() { addTagToReading(*cReading, window_tags[tag]); } - if (prev && flags & BFR_SUBREADING) { + if (prev && (flags & BFR_SUBREADING)) { prev->next = cReading; } + else if (flags & BFR_DELETED) { + cCohort->deleted.push_back(cReading); + } else { cCohort->appendReading(cReading); } prev = cReading; + ++numReadings; } + + insert_if_exists(cCohort->possible_sets, grammar->sets_any); + cSWindow->appendCohort(cCohort); } return cSWindow->flush_after; @@ -220,14 +241,13 @@ bool BinaryApplicator::readWindow() { (buffer) += tmp; \ } while (false) -#define WRITE_TAG_INTO(tag_, buffer) \ +#define WRITE_TAG_INTO(tag, buffer) \ do { \ - if (tag_index.find((tag_)) == tag_index.end()) { \ - tag_index[(tag_)] = tags_to_write.size(); \ - tags_to_write.push_back((tag_)); \ - u_fprintf(ux_stderr, 
"adding tag %S\n", (tag_)->tag.data()); \ + if (tag_index.find((tag)) == tag_index.end()) { \ + tag_index[(tag)] = tags_to_write.size(); \ + tags_to_write.push_back((tag)); \ } \ - WRITE_U16_INTO(tag_index[(tag_)], buffer); \ + WRITE_U16_INTO(tag_index[(tag)], buffer); \ } while (false) #define WRITE_STR_INTO(s, buffer) \ @@ -294,15 +314,13 @@ void BinaryApplicator::printSingleWindow(SingleWindow* window, std::ostream& out if (reading != top_reading) { flags |= BFR_SUBREADING; } + WRITE_U16_INTO(flags, reading_buffer); + WRITE_TAG_INTO(grammar->single_tags[reading->baseform], reading_buffer); std::string tag_buffer; uint16_t tag_count = 0; - if (reading->baseform) { - WRITE_TAG_INTO(grammar->single_tags[reading->baseform], tag_buffer); - tag_count++; - } for (auto& tter : reading->tags_list) { auto tag = grammar->single_tags[tter]; - if (tag->type & T_BASEFORM) { + if ((tag->type & T_WORDFORM) || (tag->type & T_BASEFORM)) { continue; } WRITE_TAG_INTO(tag, tag_buffer); From ff5b4385d7a1921f6a306a6b3795bc35d3fad6d5 Mon Sep 17 00:00:00 2001 From: Daniel Swanson Date: Tue, 24 Jun 2025 09:58:03 -0400 Subject: [PATCH 03/42] stream header and detection --- src/BinaryApplicator.cpp | 37 +++++++++++++++++++++++++++++++------ src/FormatConverter.cpp | 5 +++++ src/inlines.hpp | 5 +++++ src/version.hpp | 1 + 4 files changed, 42 insertions(+), 6 deletions(-) diff --git a/src/BinaryApplicator.cpp b/src/BinaryApplicator.cpp index 4ff5df07..1f550ec8 100644 --- a/src/BinaryApplicator.cpp +++ b/src/BinaryApplicator.cpp @@ -19,6 +19,7 @@ #include "BinaryApplicator.hpp" #include "Grammar.hpp" +#include "version.hpp" namespace CG3 { @@ -49,6 +50,23 @@ void BinaryApplicator::runGrammarOnText(std::istream& input, std::ostream& outpu CG3Quit(1); } + { + std::string header(8, 0); + if (!input.read(&header[0], 8)) { + u_fprintf(ux_stderr, "Error: Could not read stream header!\n"); + CG3Quit(1); + } + if (!is_cg3bsf(header)) { + u_fprintf(ux_stderr, "Error: Stream does not start with magic 
bytes - cannot read as binary!\n"); + CG3Quit(1); + } + uint32_t version = reinterpret_cast(&header[4])[0]; + if (version != CG3_BINARY_STREAM) { + u_fprintf(ux_stderr, "Error: Stream is version %u but this reader only knows version %u!\n", version, CG3_BINARY_STREAM); + CG3Quit(1); + } + } + index(); uint32_t resetAfter = ((num_windows + 4) * 2 + 1); @@ -119,9 +137,6 @@ void BinaryApplicator::runGrammarOnText(std::istream& input, std::ostream& outpu } while (false) bool BinaryApplicator::readWindow() { - SingleWindow* cSWindow = gWindow->allocAppendSingleWindow(); - initEmptySingleWindow(cSWindow); - uint32_t cs = 0; readRaw(*ux_stdin, cs); @@ -129,6 +144,9 @@ bool BinaryApplicator::readWindow() { return true; } + SingleWindow* cSWindow = gWindow->allocAppendSingleWindow(); + initEmptySingleWindow(cSWindow); + std::string buf(cs, 0); ux_stdin->read(&buf[0], cs); uint32_t pos = 0; @@ -204,7 +222,7 @@ bool BinaryApplicator::readWindow() { READ_U16_INTO(tag); addTagToReading(*cReading, window_tags[tag]); } - + if (prev && (flags & BFR_SUBREADING)) { prev->next = cReading; } @@ -262,6 +280,13 @@ bool BinaryApplicator::readWindow() { } while (false) void BinaryApplicator::printSingleWindow(SingleWindow* window, std::ostream& output, bool profiling) { + if (window->number == 1) { + output.write("CGBF", 4); + std::string version; + WRITE_U32_INTO(CG3_BINARY_STREAM, version); + output.write(version.data(), 4); + } + TagVector tags_to_write; std::map tag_index; @@ -299,7 +324,7 @@ void BinaryApplicator::printSingleWindow(SingleWindow* window, std::ostream& out WRITE_STR_INTO(cohort->text, cohort_buffer); WRITE_STR_INTO(cohort->wblank, cohort_buffer); - + std::string reading_buffer; uint16_t reading_count = 0; std::sort(cohort->readings.begin(), cohort->readings.end(), Reading::cmp_number); @@ -334,7 +359,7 @@ void BinaryApplicator::printSingleWindow(SingleWindow* window, std::ostream& out WRITE_U16_INTO(reading_count, cohort_buffer); cohort_buffer += reading_buffer; } - + 
std::string header_buffer; uint16_t flags = 0; diff --git a/src/FormatConverter.cpp b/src/FormatConverter.cpp index 62fae5cc..c6715aa1 100644 --- a/src/FormatConverter.cpp +++ b/src/FormatConverter.cpp @@ -28,6 +28,11 @@ cg3_sformat detectFormat(std::string_view buf8) { cg3_sformat fmt = CG3SF_INVALID; UErrorCode status = U_ZERO_ERROR; + if (is_cg3bsf(buf8)) { + fmt = CG3SF_BINARY; + return fmt; + } + UString buffer(BUF_SIZE, 0); int32_t nr = 0; u_strFromUTF8(&buffer[0], BUF_SIZE, &nr, buf8.data(), SI32(buf8.size()), &status); diff --git a/src/inlines.hpp b/src/inlines.hpp index 66a890bc..7db76b32 100644 --- a/src/inlines.hpp +++ b/src/inlines.hpp @@ -475,6 +475,11 @@ inline bool is_cg3b(const S& s) { return (s[0] == 'C' && s[1] == 'G' && s[2] == '3' && s[3] == 'B'); } +template +inline bool is_cg3bsf(const S& s) { + return (s[0] == 'C' && s[1] == 'G' && s[2] == 'B' && s[3] == 'F'); +} + inline void insert_if_exists(boost::dynamic_bitset<>& cont, const boost::dynamic_bitset<>* other) { if (other && !other->empty()) { cont.resize(std::max(cont.size(), other->size())); diff --git a/src/version.hpp b/src/version.hpp index f16b307c..ced0348e 100644 --- a/src/version.hpp +++ b/src/version.hpp @@ -32,5 +32,6 @@ constexpr uint32_t CG3_REVISION = 13898; constexpr uint32_t CG3_FEATURE_REV = 13898; constexpr uint32_t CG3_TOO_OLD = 10373; constexpr uint32_t CG3_EXTERNAL_PROTOCOL = 7226; +constexpr uint32_t CG3_BINARY_STREAM = 1; #endif From 56d68e635a10aa16b0fdcc521aa3c98ed2f212cd Mon Sep 17 00:00:00 2001 From: Daniel Swanson Date: Thu, 26 Jun 2025 21:02:52 -0400 Subject: [PATCH 04/42] variables; account for ID updates --- src/BinaryApplicator.cpp | 81 +++++++++++++++++++++++++++++++++++----- src/BinaryApplicator.hpp | 23 ++++++++---- 2 files changed, 86 insertions(+), 18 deletions(-) diff --git a/src/BinaryApplicator.cpp b/src/BinaryApplicator.cpp index 1f550ec8..e7391dfe 100644 --- a/src/BinaryApplicator.cpp +++ b/src/BinaryApplicator.cpp @@ -92,6 +92,7 @@ void 
BinaryApplicator::runGrammarOnText(std::istream& input, std::ostream& outpu gWindow->previous.erase(gWindow->previous.begin()); } flushAfter = false; + id_updates.clear(); }; while (!input.eof()) { @@ -169,11 +170,36 @@ bool BinaryApplicator::readWindow() { uint16_t var_count; READ_U16_INTO(var_count); - // TODO + for (uint16_t vn = 0; vn < var_count; vn++) { + char mode = buf[pos]; + pos++; + uint16_t tag1, tag2; + READ_U16_INTO(tag1); + READ_U16_INTO(tag2); + auto hash1 = window_tags[tag1]->hash; + if (mode == BFV_SETVAR) { + cSWindow->variables_set[hash1] = window_tags[tag2]->hash; + cSWindow->variables_rem.erase(hash1); + cSWindow->variables_output.insert(hash1); + } + else if (mode == BFV_SETVAR_ANY) { + cSWindow->variables_set[hash1] = grammar->tag_any; + cSWindow->variables_rem.erase(hash1); + cSWindow->variables_output.insert(hash1); + } + else if (mode == BFV_REMVAR) { + cSWindow->variables_set.erase(hash1); + cSWindow->variables_rem.insert(hash1); + cSWindow->variables_output.insert(hash1); + } + } READ_STR_INTO(cSWindow->text); READ_STR_INTO(cSWindow->text_post); + uint32_t id_start = max_input_id; + uint32_t offset = gWindow->cohort_counter - max_input_id; + uint16_t cohort_count; READ_U16_INTO(cohort_count); uint16_t tag; @@ -192,15 +218,25 @@ bool BinaryApplicator::readWindow() { READ_U16_INTO(tag_count); if (tag_count) { - cCohort->wread = alloc_reading(cCohort); - for (uint16_t tn = 0; tn < tag_count; tn++) { - READ_U16_INTO(tag); - addTagToReading(*cCohort->wread, window_tags[tag]); - } + cCohort->wread = alloc_reading(cCohort); + for (uint16_t tn = 0; tn < tag_count; tn++) { + READ_U16_INTO(tag); + addTagToReading(*cCohort->wread, window_tags[tag]); + } } - READ_U32_INTO(cCohort->dep_self); - READ_U32_INTO(cCohort->dep_parent); + uint32_t self, parent; + READ_U32_INTO(self); + READ_U32_INTO(parent); + if (self > max_input_id) { + max_input_id = self; + } + if (parent != DEP_NO_PARENT) { + cCohort->dep_parent = parent + offset; + } + if (flags & 
BFC_RIGHTWARD_REL) { + id_updates[self] = cCohort->global_number; + } READ_STR_INTO(cCohort->text); READ_STR_INTO(cCohort->wblank); @@ -290,6 +326,31 @@ void BinaryApplicator::printSingleWindow(SingleWindow* window, std::ostream& out TagVector tags_to_write; std::map tag_index; + uint16_t var_count = 0; + std::string var_buffer; + for (auto var : window->variables_output) { + var_count++; + Tag* key = grammar->single_tags[var]; + auto iter = window->variables_set.find(var); + if (iter != window->variables_set.end()) { + if (iter->second != grammar->tag_any) { + var_buffer += static_cast(BFV_SETVAR); + WRITE_TAG_INTO(key, var_buffer); + WRITE_TAG_INTO(grammar->single_tags[iter->second], var_buffer); + } + else { + var_buffer += static_cast(BFV_SETVAR_ANY); + WRITE_TAG_INTO(key, var_buffer); + WRITE_U16_INTO(0, var_buffer); + } + } + else { + var_buffer += static_cast(BFV_REMVAR); + WRITE_TAG_INTO(key, var_buffer); + WRITE_U16_INTO(0, var_buffer); + } + } + std::string cohort_buffer; uint16_t cohort_count = 0; for (auto& cohort : window->all_cohorts) { @@ -373,8 +434,8 @@ void BinaryApplicator::printSingleWindow(SingleWindow* window, std::ostream& out WRITE_STR_INTO(tag->tag, header_buffer); } - // TODO: variables - WRITE_U16_INTO(0, header_buffer); + WRITE_U16_INTO(var_count, header_buffer); + header_buffer += var_buffer; WRITE_STR_INTO(window->text, header_buffer); WRITE_STR_INTO(window->text_post, header_buffer); diff --git a/src/BinaryApplicator.hpp b/src/BinaryApplicator.hpp index 1fcef4a9..35d0e691 100644 --- a/src/BinaryApplicator.hpp +++ b/src/BinaryApplicator.hpp @@ -26,13 +26,18 @@ namespace CG3 { enum BinaryFormatFlags { - // Window - BFW_FLUSH = (1 << 1), - // Cohort - BFC_DELETED = (1 << 1), - // Reading - BFR_SUBREADING = (1 << 1), - BFR_DELETED = (1 << 2), + // Window + BFW_FLUSH = (1 << 1), + // Cohort + BFC_DELETED = (1 << 1), + BFC_RIGHTWARD_REL = (1 << 2), + // Reading + BFR_SUBREADING = (1 << 1), + BFR_DELETED = (1 << 2), + // Variables + 
BFV_SETVAR = 1, + BFV_SETVAR_ANY = 2, + BFV_REMVAR = 3, }; class BinaryApplicator : public virtual GrammarApplicator { @@ -45,7 +50,9 @@ class BinaryApplicator : public virtual GrammarApplicator { void printSingleWindow(SingleWindow* window, std::ostream& output, bool profiling = false) override; private: - bool readWindow(); + bool readWindow(); + uint32_t max_input_id = 0; + std::map id_updates; }; } From 60a5e3d4a7aa117dab361ed497cacf3ff3902565 Mon Sep 17 00:00:00 2001 From: Daniel Swanson Date: Fri, 27 Jun 2025 11:03:16 -0400 Subject: [PATCH 05/42] start on relations --- src/BinaryApplicator.cpp | 132 +++++++++++++++++++++++---------------- src/BinaryApplicator.hpp | 5 +- src/FormatConverter.cpp | 4 ++ src/cg-proc.cpp | 11 +++- 4 files changed, 92 insertions(+), 60 deletions(-) diff --git a/src/BinaryApplicator.cpp b/src/BinaryApplicator.cpp index e7391dfe..7b7045b2 100644 --- a/src/BinaryApplicator.cpp +++ b/src/BinaryApplicator.cpp @@ -92,7 +92,6 @@ void BinaryApplicator::runGrammarOnText(std::istream& input, std::ostream& outpu gWindow->previous.erase(gWindow->previous.begin()); } flushAfter = false; - id_updates.clear(); }; while (!input.eof()) { @@ -197,9 +196,6 @@ bool BinaryApplicator::readWindow() { READ_STR_INTO(cSWindow->text); READ_STR_INTO(cSWindow->text_post); - uint32_t id_start = max_input_id; - uint32_t offset = gWindow->cohort_counter - max_input_id; - uint16_t cohort_count; READ_U16_INTO(cohort_count); uint16_t tag; @@ -209,9 +205,10 @@ bool BinaryApplicator::readWindow() { numCohorts++; READ_U16_INTO(flags); - /*if (flags & BFC_DELETED) { - cCohort->type |= CT_DELETED; - }*/ + if (flags & BFC_RELATED) { + cCohort->type |= CT_RELATED; + has_relations = true; + } READ_U16_INTO(tag); cCohort->wordform = window_tags[tag]; @@ -225,17 +222,25 @@ bool BinaryApplicator::readWindow() { } } - uint32_t self, parent; - READ_U32_INTO(self); - READ_U32_INTO(parent); - if (self > max_input_id) { - max_input_id = self; + READ_U32_INTO(cCohort->dep_self); + 
READ_U32_INTO(cCohort->dep_parent); + + if (cCohort->dep_parent != DEP_NO_PARENT) { + has_dep = true; } - if (parent != DEP_NO_PARENT) { - cCohort->dep_parent = parent + offset; + + uint16_t rel_count; + READ_U16_INTO(rel_count); + for (uint16_t rn = 0; rn < rel_count; rn++) { + READ_U16_INTO(tag); + uint32_t head; + READ_U32_INTO(head); + cCohort->relations_input[window_tags[tag]->comparison_hash].insert(head); } - if (flags & BFC_RIGHTWARD_REL) { - id_updates[self] = cCohort->global_number; + if (rel_count) { + has_relations = true; + gWindow->relation_map[cCohort->dep_self] = cCohort->global_number; + cCohort->type |= CT_RELATED; } READ_STR_INTO(cCohort->text); @@ -360,29 +365,48 @@ void BinaryApplicator::printSingleWindow(SingleWindow* window, std::ostream& out cohort_count++; uint16_t flags = 0; + if (cohort->type & CT_RELATED) { + flags |= BFC_RELATED; + } WRITE_U16_INTO(flags, cohort_buffer); WRITE_TAG_INTO(cohort->wordform, cohort_buffer); if (cohort->wread) { - std::string tag_buffer; - uint16_t tag_count = 0; - for (auto tter : cohort->wread->tags_list) { - if (tter == cohort->wordform->hash) { - continue; - } - WRITE_TAG_INTO(grammar->single_tags[tter], tag_buffer); - tag_count++; - } - WRITE_U16_INTO(tag_count, cohort_buffer); - cohort_buffer += tag_buffer; + std::string tag_buffer; + uint16_t tag_count = 0; + for (auto tter : cohort->wread->tags_list) { + if (tter == cohort->wordform->hash) { + continue; + } + WRITE_TAG_INTO(grammar->single_tags[tter], tag_buffer); + tag_count++; + } + WRITE_U16_INTO(tag_count, cohort_buffer); + cohort_buffer += tag_buffer; } else { - WRITE_U16_INTO(0, cohort_buffer); + WRITE_U16_INTO(0, cohort_buffer); } WRITE_U32_INTO(cohort->dep_self, cohort_buffer); WRITE_U32_INTO(cohort->dep_parent, cohort_buffer); + std::string rel_buffer; + uint16_t rel_count = 0; + for (const auto& miter : cohort->relations) { + auto it = grammar->single_tags.find(miter.first); + if (it == grammar->single_tags.end()) { + it = 
grammar->single_tags.find(miter.first); + } + for (auto siter : miter.second) { + rel_count += 1; + WRITE_TAG_INTO(it->second, rel_buffer); + WRITE_U32_INTO(siter, rel_buffer); + } + } + WRITE_U16_INTO(rel_count, cohort_buffer); + cohort_buffer += rel_buffer; + WRITE_STR_INTO(cohort->text, cohort_buffer); WRITE_STR_INTO(cohort->wblank, cohort_buffer); @@ -390,32 +414,32 @@ void BinaryApplicator::printSingleWindow(SingleWindow* window, std::ostream& out uint16_t reading_count = 0; std::sort(cohort->readings.begin(), cohort->readings.end(), Reading::cmp_number); for (auto top_reading : cohort->readings) { - if (top_reading->noprint) { - continue; - } - auto reading = top_reading; - while (reading) { - reading_count++; - uint16_t flags = 0; - if (reading != top_reading) { - flags |= BFR_SUBREADING; - } - WRITE_U16_INTO(flags, reading_buffer); - WRITE_TAG_INTO(grammar->single_tags[reading->baseform], reading_buffer); - std::string tag_buffer; - uint16_t tag_count = 0; - for (auto& tter : reading->tags_list) { - auto tag = grammar->single_tags[tter]; - if ((tag->type & T_WORDFORM) || (tag->type & T_BASEFORM)) { - continue; - } - WRITE_TAG_INTO(tag, tag_buffer); - tag_count++; - } - WRITE_U16_INTO(tag_count, reading_buffer); - reading_buffer += tag_buffer; - reading = reading->next; - } + if (top_reading->noprint) { + continue; + } + auto reading = top_reading; + while (reading) { + reading_count++; + uint16_t flags = 0; + if (reading != top_reading) { + flags |= BFR_SUBREADING; + } + WRITE_U16_INTO(flags, reading_buffer); + WRITE_TAG_INTO(grammar->single_tags[reading->baseform], reading_buffer); + std::string tag_buffer; + uint16_t tag_count = 0; + for (auto& tter : reading->tags_list) { + auto tag = grammar->single_tags[tter]; + if (tag->type & (T_WORDFORM | T_BASEFORM | T_DEPENDENCY | T_RELATION)) { + continue; + } + WRITE_TAG_INTO(tag, tag_buffer); + tag_count++; + } + WRITE_U16_INTO(tag_count, reading_buffer); + reading_buffer += tag_buffer; + reading = 
reading->next; + } } WRITE_U16_INTO(reading_count, cohort_buffer); cohort_buffer += reading_buffer; diff --git a/src/BinaryApplicator.hpp b/src/BinaryApplicator.hpp index 35d0e691..be9819f2 100644 --- a/src/BinaryApplicator.hpp +++ b/src/BinaryApplicator.hpp @@ -29,8 +29,7 @@ enum BinaryFormatFlags { // Window BFW_FLUSH = (1 << 1), // Cohort - BFC_DELETED = (1 << 1), - BFC_RIGHTWARD_REL = (1 << 2), + BFC_RELATED = (1 << 1), // Reading BFR_SUBREADING = (1 << 1), BFR_DELETED = (1 << 2), @@ -51,8 +50,6 @@ class BinaryApplicator : public virtual GrammarApplicator { private: bool readWindow(); - uint32_t max_input_id = 0; - std::map id_updates; }; } diff --git a/src/FormatConverter.cpp b/src/FormatConverter.cpp index c6715aa1..bd0e72d2 100644 --- a/src/FormatConverter.cpp +++ b/src/FormatConverter.cpp @@ -132,6 +132,10 @@ void FormatConverter::runGrammarOnText(std::istream& input, std::ostream& output ux_stdin = &input; ux_stdout = &output; + if (fmt_output == CG3SF_BINARY || fmt_input == CG3SF_BINARY) { + grammar->has_relations = true; + } + switch (fmt_input) { case CG3SF_CG: { GrammarApplicator::runGrammarOnText(input, output); diff --git a/src/cg-proc.cpp b/src/cg-proc.cpp index 3f58c3aa..f39cb20a 100644 --- a/src/cg-proc.cpp +++ b/src/cg-proc.cpp @@ -22,6 +22,7 @@ #include "TextualParser.hpp" #include "BinaryGrammar.hpp" #include "ApertiumApplicator.hpp" +#include "BinaryApplicator.hpp" #include "MatxinApplicator.hpp" #include "GrammarApplicator.hpp" @@ -48,7 +49,8 @@ void endProgram(char* name) { cout << " -s, --sections=NUM: specify number of sections to process" << endl; cout << " -f, --stream-format=NUM: set the format of the I/O stream to NUM," << endl; cout << " where `0' is VISL format, `1' is Apertium" << endl; - cout << " format (default: 1)" << endl; + cout << " format, `2` is Matxin, and `3` is binary" << endl; + cout << " (default: 1)" << endl; cout << " -r, --rule=NAME: run only the named rule" << endl; cout << " -t, --trace: print debug output on 
stderr" << endl; cout << " -w, --wordform-case: enforce surface case on lemma/baseform " << endl; @@ -65,7 +67,8 @@ void endProgram(char* name) { cout << " -s: specify number of sections to process" << endl; cout << " -f: set the format of the I/O stream to NUM," << endl; cout << " where `0' is VISL format, `1' is " << endl; - cout << " Apertium format and `2' is Matxin (default: 1)" << endl; + cout << " Apertium format, `2' is Matxin," << endl; + cout << " and `3` is binary (default: 1)" << endl; cout << " -r: run only the named rule" << endl; cout << " -t: print debug output on stderr" << endl; cout << " -w: enforce surface case on lemma/baseform " << endl; @@ -308,6 +311,10 @@ int main(int argc, char* argv[]) { matxinApplicator->print_only_first = only_first; applicator.reset(matxinApplicator); } + else if (stream_format == 3) { + BinaryApplicator* binaryApplicator = new BinaryApplicator(std::cerr); + applicator.reset(binaryApplicator); + } else { ApertiumApplicator* apertiumApplicator = new ApertiumApplicator(std::cerr); apertiumApplicator->wordform_case = wordform_case; From a5fd9748e3439b55f7d33df3e8ebe9ed21334699 Mon Sep 17 00:00:00 2001 From: Daniel Swanson Date: Fri, 27 Jun 2025 11:30:31 -0400 Subject: [PATCH 06/42] write global_number rather than dep_self --- src/BinaryApplicator.cpp | 16 ++++++++++++++-- 1 file changed, 14 insertions(+), 2 deletions(-) diff --git a/src/BinaryApplicator.cpp b/src/BinaryApplicator.cpp index 7b7045b2..8ac6dca8 100644 --- a/src/BinaryApplicator.cpp +++ b/src/BinaryApplicator.cpp @@ -388,8 +388,20 @@ void BinaryApplicator::printSingleWindow(SingleWindow* window, std::ostream& out WRITE_U16_INTO(0, cohort_buffer); } - WRITE_U32_INTO(cohort->dep_self, cohort_buffer); - WRITE_U32_INTO(cohort->dep_parent, cohort_buffer); + WRITE_U32_INTO(cohort->global_number, cohort_buffer); + if (cohort->dep_parent == 0 || cohort->dep_parent == DEP_NO_PARENT) { + WRITE_U32_INTO(cohort->dep_parent, cohort_buffer); + } + else { + const Cohort* pr 
= nullptr; + if (gWindow->cohort_map.find(cohort->dep_parent) != gWindow->cohort_map.end()) { + const Cohort* pr = gWindow->cohort_map[cohort->dep_parent]; + WRITE_U32_INTO(pr->global_number, cohort_buffer); + } + else { + WRITE_U32_INTO(DEP_NO_PARENT, cohort_buffer); + } + } std::string rel_buffer; uint16_t rel_count = 0; From 9f114b4157f84804374581168f0f4d3da1c4ee3d Mon Sep 17 00:00:00 2001 From: Daniel Swanson Date: Fri, 27 Jun 2025 11:45:51 -0400 Subject: [PATCH 07/42] relations --- src/BinaryApplicator.cpp | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/src/BinaryApplicator.cpp b/src/BinaryApplicator.cpp index 8ac6dca8..5054303c 100644 --- a/src/BinaryApplicator.cpp +++ b/src/BinaryApplicator.cpp @@ -224,6 +224,8 @@ bool BinaryApplicator::readWindow() { READ_U32_INTO(cCohort->dep_self); READ_U32_INTO(cCohort->dep_parent); + gWindow->dep_window[cCohort->dep_self] = cCohort; + gWindow->relation_map[cCohort->dep_self] = cCohort->global_number; if (cCohort->dep_parent != DEP_NO_PARENT) { has_dep = true; @@ -235,7 +237,7 @@ bool BinaryApplicator::readWindow() { READ_U16_INTO(tag); uint32_t head; READ_U32_INTO(head); - cCohort->relations_input[window_tags[tag]->comparison_hash].insert(head); + cCohort->relations_input[window_tags[tag]->hash].insert(head); } if (rel_count) { has_relations = true; From 9fae2fc5bbdc1cdae4ccb6360e86f30fb05af18b Mon Sep 17 00:00:00 2001 From: Daniel Swanson Date: Wed, 2 Jul 2025 13:51:17 -0400 Subject: [PATCH 08/42] minor optimizations - don't rehash if we don't need to --- src/BinaryApplicator.cpp | 15 ++++++++------- 1 file changed, 8 insertions(+), 7 deletions(-) diff --git a/src/BinaryApplicator.cpp b/src/BinaryApplicator.cpp index 5054303c..8eb6310d 100644 --- a/src/BinaryApplicator.cpp +++ b/src/BinaryApplicator.cpp @@ -161,6 +161,7 @@ bool BinaryApplicator::readWindow() { TagVector window_tags; uint16_t tag_count; READ_U16_INTO(tag_count); + window_tags.reserve(tag_count); for (uint16_t i = 0; i < tag_count; 
i++) { UString tg; READ_STR_INTO(tg); @@ -218,7 +219,8 @@ bool BinaryApplicator::readWindow() { cCohort->wread = alloc_reading(cCohort); for (uint16_t tn = 0; tn < tag_count; tn++) { READ_U16_INTO(tag); - addTagToReading(*cCohort->wread, window_tags[tag]); + addTagToReading(*cCohort->wread, window_tags[tag], + (tn + 1 == tag_count)); } } @@ -262,18 +264,18 @@ bool BinaryApplicator::readWindow() { READ_U16_INTO(tag_count); for (uint16_t tn = 0; tn < tag_count; tn++) { - READ_U16_INTO(tag); - addTagToReading(*cReading, window_tags[tag]); + READ_U16_INTO(tag); + addTagToReading(*cReading, window_tags[tag], (tn+1 == tag_count)); } if (prev && (flags & BFR_SUBREADING)) { - prev->next = cReading; + prev->next = cReading; } else if (flags & BFR_DELETED) { - cCohort->deleted.push_back(cReading); + cCohort->deleted.push_back(cReading); } else { - cCohort->appendReading(cReading); + cCohort->appendReading(cReading); } prev = cReading; ++numReadings; @@ -395,7 +397,6 @@ void BinaryApplicator::printSingleWindow(SingleWindow* window, std::ostream& out WRITE_U32_INTO(cohort->dep_parent, cohort_buffer); } else { - const Cohort* pr = nullptr; if (gWindow->cohort_map.find(cohort->dep_parent) != gWindow->cohort_map.end()) { const Cohort* pr = gWindow->cohort_map[cohort->dep_parent]; WRITE_U32_INTO(pr->global_number, cohort_buffer); From ef63dce2772177a291f0435d17bb57aad3bc26e9 Mon Sep 17 00:00:00 2001 From: Daniel Swanson Date: Wed, 2 Jul 2025 18:40:14 -0400 Subject: [PATCH 09/42] python binary parser --- python/CMakeLists.txt | 4 +- python/cg3.py | 124 ++++++++++++++++++++++++++++++++++++++++++ 2 files changed, 126 insertions(+), 2 deletions(-) create mode 100644 python/cg3.py diff --git a/python/CMakeLists.txt b/python/CMakeLists.txt index 479d6606..3529cf89 100644 --- a/python/CMakeLists.txt +++ b/python/CMakeLists.txt @@ -4,7 +4,7 @@ set(PYTHON_EXECUTABLE ${Python_EXECUTABLE}) set(PYTHON_FILE "constraint_grammar.py") set(CPP_WRAP_FILE "constraint_grammar_wrap.cpp") -file(WRITE 
"${CMAKE_CURRENT_BINARY_DIR}/cg3.py" "from constraint_grammar import *\n") +set(PYTHON_LIBRARY_FILE "cg3.py") set(BUILD_DEFS "") get_directory_property(_defs DIRECTORY ${CMAKE_CURRENT_SOURCE_DIR} COMPILE_DEFINITIONS) @@ -21,7 +21,7 @@ add_custom_command(OUTPUT ${CPP_WRAP_FILE} ${PYTHON_FILE} ) add_custom_target(wrapper ALL - DEPENDS ${CPP_WRAP_FILE} ${PYTHON_FILE} + DEPENDS ${CPP_WRAP_FILE} ${PYTHON_FILE} ${PYTHON_LIBRARY_FILE} VERBATIM ) diff --git a/python/cg3.py b/python/cg3.py new file mode 100644 index 00000000..1f128280 --- /dev/null +++ b/python/cg3.py @@ -0,0 +1,124 @@ +from constraint_grammar import * + +from collections import defaultdict +from dataclasses import dataclass, field +import struct +from typing import DefaultDict, Dict, List, Optional + +@dataclass +class Reading: + lemma: str = '' + tags: List[str] = field(default_factory=list) + subreading: Optional[Reading] = None + +@dataclass +class Cohort: + static: Reading = field(default_factory=Reading) + readings: List[Reading] = field(default_factory=list) + dep_self: int = 0 + dep_head: Optional[int] = None + relations: DefalutDict[str, List[int]] = field( + default_factory=lambda: defaultdict(list)) + text: str = '' + wblank: str = '' + +@dataclass +class Window: + cohorts: List[Cohort] = field(default_factory=list) + set_vars: Dict[str, Optional[str]] = field(default_factory=dict) + rem_vars: List[str] = field(default_factory=list) + text: str = '' + text_post: str = '' + flush_after: bool = False + +def parse_binary_window(buf): + pos = 0 + def read_pat(pat): + nonlocal pos, buf + ret = struct.unpack_from('<'+pat, buf, pos) + pos += struct.calcsize('<'+pat) + return ret + def read_u16(): + return read_pat('H')[0] + def read_u32(): + return read_pat('I')[0] + def read_str(): + l = read_u16() + if l == 0: + return b'' + return read_pat(f'{l}s')[0] + return s.decode('utf-8') + window = Window() + window_flags = read_u16() + if window_flags & 1: + window.flush_after = True + tag_count = read_u16() 
+ tags = [read_str() for i in range(tag_count)] + def read_tags(): + nonlocal tags + ct = read_u16() + if ct == 0: + return [] + idx = read_pat(f'{ct}H') + return [tags[t] for t in idx] + var_count = read_u16() + for i in range(var_count): + mode = read_pat('B')[0] + t1 = read_u16() + t2 = read_u16() + if mode == 1: + window.set_vars[tags[t1]] = tags[t2] + elif mode == 2: + window.set_vars[tags[t1]] = None + elif mode == 3: + window.rem_vars.append(tags[t1]) + window.text = read_str() + window.text_post = read_str() + cohort_count = read_u16() + for i in range(cohort_count): + cohort = Cohort() + cohort_flags = read_u16() + cohort.static.lemma = tags[read_u16()] + cohort.static.tags = read_tags() + cohort.dep_self = read_u32() + cohort.dep_parent = read_u32() + if cohort.dep_parent == 0xffffffff: + cohort.dep_parent = None + rel_count = read_u16() + for i in range(rel_count): + tag = tags[read_u16()] + head = read_u32() + cohort.relations[tag].append(head) + cohort.text = read_str() + cohort.wblank = read_str() + reading_count = read_u16() + prev = None + for i in range(reading_count): + reading_flags = read_u16() + reading = Reading() + reading.lemma = tags[read_u16()] + reading.tags = read_tags() + if reading_flags & 1 and prev is not None: + prev.subreading = reading + else: + cohort.readings.append(reading) + prev = reading + window.cohorts.append(cohort) + return window + +def parse_binary_stream(fin): + header = fin.read(8) + label, version = struct.unpack('<4sI', header) + if label != b'CGBF': + raise ValueError('Binary format header not found!') + if version != 1: + raise ValueError('Unknown binary format version!') + while True: + spec = fin.read(4) + if len(spec) != 4: + break; + block_len = struct.unpack(' Date: Wed, 2 Jul 2025 18:44:24 -0400 Subject: [PATCH 10/42] fix indentation --- python/cg3.py | 96 +++++++++++++++++++++++++-------------------------- 1 file changed, 48 insertions(+), 48 deletions(-) diff --git a/python/cg3.py b/python/cg3.py index 
1f128280..a843a152 100644 --- a/python/cg3.py +++ b/python/cg3.py @@ -7,16 +7,16 @@ @dataclass class Reading: - lemma: str = '' - tags: List[str] = field(default_factory=list) + lemma: str = '' + tags: List[str] = field(default_factory=list) subreading: Optional[Reading] = None @dataclass class Cohort: - static: Reading = field(default_factory=Reading) - readings: List[Reading] = field(default_factory=list) - dep_self: int = 0 - dep_head: Optional[int] = None + static: Reading = field(default_factory=Reading) + readings: List[Reading] = field(default_factory=list) + dep_self: int = 0 + dep_head: Optional[int] = None relations: DefalutDict[str, List[int]] = field( default_factory=lambda: defaultdict(list)) text: str = '' @@ -24,7 +24,7 @@ class Cohort: @dataclass class Window: - cohorts: List[Cohort] = field(default_factory=list) + cohorts: List[Cohort] = field(default_factory=list) set_vars: Dict[str, Optional[str]] = field(default_factory=dict) rem_vars: List[str] = field(default_factory=list) text: str = '' @@ -33,34 +33,34 @@ class Window: def parse_binary_window(buf): pos = 0 - def read_pat(pat): - nonlocal pos, buf - ret = struct.unpack_from('<'+pat, buf, pos) - pos += struct.calcsize('<'+pat) - return ret - def read_u16(): - return read_pat('H')[0] - def read_u32(): - return read_pat('I')[0] - def read_str(): - l = read_u16() - if l == 0: - return b'' - return read_pat(f'{l}s')[0] - return s.decode('utf-8') + def read_pat(pat): + nonlocal pos, buf + ret = struct.unpack_from('<'+pat, buf, pos) + pos += struct.calcsize('<'+pat) + return ret + def read_u16(): + return read_pat('H')[0] + def read_u32(): + return read_pat('I')[0] + def read_str(): + l = read_u16() + if l == 0: + return b'' + return read_pat(f'{l}s')[0] + return s.decode('utf-8') window = Window() - window_flags = read_u16() + window_flags = read_u16() if window_flags & 1: window.flush_after = True - tag_count = read_u16() - tags = [read_str() for i in range(tag_count)] - def read_tags(): - 
nonlocal tags - ct = read_u16() - if ct == 0: - return [] - idx = read_pat(f'{ct}H') - return [tags[t] for t in idx] + tag_count = read_u16() + tags = [read_str() for i in range(tag_count)] + def read_tags(): + nonlocal tags + ct = read_u16() + if ct == 0: + return [] + idx = read_pat(f'{ct}H') + return [tags[t] for t in idx] var_count = read_u16() for i in range(var_count): mode = read_pat('B')[0] @@ -72,31 +72,31 @@ def read_tags(): window.set_vars[tags[t1]] = None elif mode == 3: window.rem_vars.append(tags[t1]) - window.text = read_str() - window.text_post = read_str() - cohort_count = read_u16() - for i in range(cohort_count): + window.text = read_str() + window.text_post = read_str() + cohort_count = read_u16() + for i in range(cohort_count): cohort = Cohort() - cohort_flags = read_u16() + cohort_flags = read_u16() cohort.static.lemma = tags[read_u16()] cohort.static.tags = read_tags() - cohort.dep_self = read_u32() - cohort.dep_parent = read_u32() - if cohort.dep_parent == 0xffffffff: - cohort.dep_parent = None - rel_count = read_u16() - for i in range(rel_count): + cohort.dep_self = read_u32() + cohort.dep_parent = read_u32() + if cohort.dep_parent == 0xffffffff: + cohort.dep_parent = None + rel_count = read_u16() + for i in range(rel_count): tag = tags[read_u16()] head = read_u32() cohort.relations[tag].append(head) cohort.text = read_str() cohort.wblank = read_str() - reading_count = read_u16() + reading_count = read_u16() prev = None - for i in range(reading_count): - reading_flags = read_u16() + for i in range(reading_count): + reading_flags = read_u16() reading = Reading() - reading.lemma = tags[read_u16()] + reading.lemma = tags[read_u16()] reading.tags = read_tags() if reading_flags & 1 and prev is not None: prev.subreading = reading @@ -104,7 +104,7 @@ def read_tags(): cohort.readings.append(reading) prev = reading window.cohorts.append(cohort) - return window + return window def parse_binary_stream(fin): header = fin.read(8) From 
ad0279faa802fbb0cab9f76413f806895d4b42ed Mon Sep 17 00:00:00 2001 From: Daniel Swanson Date: Wed, 2 Jul 2025 18:57:19 -0400 Subject: [PATCH 11/42] stop using CI as compiler --- python/cg3.py | 9 ++++----- 1 file changed, 4 insertions(+), 5 deletions(-) diff --git a/python/cg3.py b/python/cg3.py index a843a152..35ae217b 100644 --- a/python/cg3.py +++ b/python/cg3.py @@ -9,7 +9,7 @@ class Reading: lemma: str = '' tags: List[str] = field(default_factory=list) - subreading: Optional[Reading] = None + subreading: Optional['Reading'] = None @dataclass class Cohort: @@ -17,7 +17,7 @@ class Cohort: readings: List[Reading] = field(default_factory=list) dep_self: int = 0 dep_head: Optional[int] = None - relations: DefalutDict[str, List[int]] = field( + relations: DefaultDict[str, List[int]] = field( default_factory=lambda: defaultdict(list)) text: str = '' wblank: str = '' @@ -45,9 +45,8 @@ def read_u32(): def read_str(): l = read_u16() if l == 0: - return b'' - return read_pat(f'{l}s')[0] - return s.decode('utf-8') + return '' + return read_pat(f'{l}s')[0].decode('utf-8') window = Window() window_flags = read_u16() if window_flags & 1: From 9ea2ea76ca3fb753d4caaf6ab2a31a07321fd611 Mon Sep 17 00:00:00 2001 From: Daniel Swanson Date: Wed, 2 Jul 2025 19:49:04 -0400 Subject: [PATCH 12/42] add docs --- manual/streamformats.xml | 48 ++++++++++++++++++++++++++++++++++++++++ python/cg3.py | 8 +++++++ 2 files changed, 56 insertions(+) diff --git a/manual/streamformats.xml b/manual/streamformats.xml index 5038aa53..c988c762 100644 --- a/manual/streamformats.xml +++ b/manual/streamformats.xml @@ -183,4 +183,52 @@ +
+ Binary Format + + Binary Stream Format + + + The binary format can be generated by cg-conv and can be parsed either by cg-conv or by the Python bindings. + It is designed for faster parsing than the textual formats. + The intended use case is situations where the same input needs to be processed multiple times (such as when testing several grammars). + + + The stream begins with a header containing CGBF followed by a 4-byte version number (currently 1). + After that, each window begins with 4 bytes specifying the length of the block and then the following structure: + + window flags [2] + > 1 = flush_after + tags [array of str] + variables [array] + mode [1] + > 1 = SETVAR (var = val) + > 2 = SETVAR (var = *) + > 3 = REMVAR + var [tag] + val or 0 [tag] + text [str] + text_post [str] + cohorts [array] + flags [2] + wordform [tag] + static_tags [array of tag] + dep_self [4] + dep_parent or 0xFFFFFFFF [4] + relations [array] + tag [tag] + head [4] + text [str] + wblank [str] + readings [array] + flags [2] + > 1 = is subreading of predecessor + baseform [tag] + tags [array of tag] + + Where arrays and strings are both encoded with a 2-byte length followed by the specified number of objects or UTF-8 bytes. + Each item of type [tag] is a 2-byte index into the window-wide tags array. + +
+ diff --git a/python/cg3.py b/python/cg3.py index 35ae217b..4b003f80 100644 --- a/python/cg3.py +++ b/python/cg3.py @@ -32,6 +32,11 @@ class Window: flush_after: bool = False def parse_binary_window(buf): + '''Given a bytestring `buf` containing a single window + (not including the length header), parse and return a Window() + object. For most applications you probbaly want parse_binary_stream() + instead.''' + pos = 0 def read_pat(pat): nonlocal pos, buf @@ -106,6 +111,9 @@ def read_tags(): return window def parse_binary_stream(fin): + '''Given a file `fin`, yield a series of Window() objects. + raises ValueError if stream header is missing or invalid.''' + header = fin.read(8) label, version = struct.unpack('<4sI', header) if label != b'CGBF': From 03fe90d88108c1f7e1a2c282619028564979dd58 Mon Sep 17 00:00:00 2001 From: Daniel Swanson Date: Thu, 17 Jul 2025 14:50:40 -0400 Subject: [PATCH 13/42] typo in python; --dep-delimit for conv --- python/cg3.py | 2 +- src/GrammarApplicator.cpp | 3 +++ src/cg-conv.cpp | 9 +++++++++ src/options.cpp | 1 + src/options.hpp | 1 + src/options_conv.hpp | 3 +++ 6 files changed, 18 insertions(+), 1 deletion(-) diff --git a/python/cg3.py b/python/cg3.py index 4b003f80..979a5db9 100644 --- a/python/cg3.py +++ b/python/cg3.py @@ -16,7 +16,7 @@ class Cohort: static: Reading = field(default_factory=Reading) readings: List[Reading] = field(default_factory=list) dep_self: int = 0 - dep_head: Optional[int] = None + dep_parent: Optional[int] = None relations: DefaultDict[str, List[int]] = field( default_factory=lambda: defaultdict(list)) text: str = '' diff --git a/src/GrammarApplicator.cpp b/src/GrammarApplicator.cpp index 786f19fd..7155d376 100644 --- a/src/GrammarApplicator.cpp +++ b/src/GrammarApplicator.cpp @@ -1009,6 +1009,9 @@ void GrammarApplicator::setOptions(UConverter* conv) { if (options[PRINT_IDS].doesOccur) { print_ids = true; } + if (options[PRINT_DEP].doesOccur) { + has_dep = true; + } if (options[NUM_WINDOWS].doesOccur) { 
num_windows = std::stoul(options[NUM_WINDOWS].value); } diff --git a/src/cg-conv.cpp b/src/cg-conv.cpp index e88bb122..719d29ad 100644 --- a/src/cg-conv.cpp +++ b/src/cg-conv.cpp @@ -232,6 +232,15 @@ int main(int argc, char* argv[]) { applicator.parse_dep = true; applicator.has_dep = true; } + if (options_conv[DEP_DELIMIT].doesOccur) { + if (!options_conv[DEP_DELIMIT].value.empty()) { + applicator.dep_delimit = std::stoul(options_conv[DEP_DELIMIT].value); + } + else { + applicator.dep_delimit = 10; + } + applicator.parse_dep = true; + } applicator.is_conv = true; applicator.trace = true; applicator.verbosity_level = 0; diff --git a/src/options.cpp b/src/options.cpp index 0bb10e38..70049057 100644 --- a/src/options.cpp +++ b/src/options.cpp @@ -73,6 +73,7 @@ options_t options{ UOption{"unicode-tags", 0, UOPT_NO_ARG, "outputs Unicode code points for things like ->"}, UOption{"unique-tags", 0, UOPT_NO_ARG, "outputs unique tags only once per reading"}, UOption{"print-ids", 0, UOPT_NO_ARG, "always output IDs"}, + UOption{"print-dep", 0, UOPT_NO_ARG, "always output dependencies"}, UOption{"num-windows", 0, UOPT_REQUIRES_ARG, "number of windows to keep in before/ahead buffers; defaults to 2"}, UOption{"always-span", 0, UOPT_NO_ARG, "forces scanning tests to always span across window boundaries"}, diff --git a/src/options.hpp b/src/options.hpp index 9bdef8a4..39a112fc 100644 --- a/src/options.hpp +++ b/src/options.hpp @@ -71,6 +71,7 @@ enum OPTIONS { UNICODE_TAGS, UNIQUE_TAGS, PRINT_IDS, + PRINT_DEP, NUM_WINDOWS, ALWAYS_SPAN, SOFT_LIMIT, diff --git a/src/options_conv.hpp b/src/options_conv.hpp index c56e9f7f..66e097da 100644 --- a/src/options_conv.hpp +++ b/src/options_conv.hpp @@ -27,6 +27,7 @@ namespace OptionsConv { using ::Options::UOption; using ::Options::UOPT_NO_ARG; +using ::Options::UOPT_OPTIONAL_ARG; using ::Options::UOPT_REQUIRES_ARG; enum OPTIONS { @@ -59,6 +60,7 @@ enum OPTIONS { SUB_LTR, ORDERED, PARSE_DEP, + DEP_DELIMIT, UNICODE_TAGS, PIPE_DELETED, NO_BREAK, 
@@ -95,6 +97,7 @@ std::array options_conv{ UOption{"ltr", 'l', UOPT_NO_ARG, "sets sub-reading direction to LTR"}, UOption{"ordered", 'o', UOPT_NO_ARG, "tag order matters mode"}, UOption{"parse-dep", 'D', UOPT_NO_ARG, "parse dependency (defaults to treating as normal tags)"}, + UOption{"dep-delimit", 0, UOPT_OPTIONAL_ARG, "delimit windows based on dependency; defaults to 10"}, UOption{"unicode-tags", 0, UOPT_NO_ARG, "outputs Unicode code points for things like ->"}, UOption{"deleted", 0, UOPT_NO_ARG, "read deleted readings as such, instead of as text"}, UOption{"no-break", 'B', UOPT_NO_ARG, "inhibits any extra whitespace in output"}, From 1670d657d59e60451bbe50dc0a22c09623e380f6 Mon Sep 17 00:00:00 2001 From: Daniel Swanson Date: Sat, 26 Jul 2025 13:15:13 -0400 Subject: [PATCH 14/42] add baseform properly --- src/BinaryApplicator.cpp | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/BinaryApplicator.cpp b/src/BinaryApplicator.cpp index 8eb6310d..f58d7f89 100644 --- a/src/BinaryApplicator.cpp +++ b/src/BinaryApplicator.cpp @@ -260,7 +260,7 @@ bool BinaryApplicator::readWindow() { READ_U16_INTO(flags); READ_U16_INTO(tag); - cReading->baseform = window_tags[tag]->hash; + addTagToReading(*cReading, window_tags[tag]); READ_U16_INTO(tag_count); for (uint16_t tn = 0; tn < tag_count; tn++) { From c149504388ffe9a90efe712f0b0a10949b756c0c Mon Sep 17 00:00:00 2001 From: Daniel Swanson Date: Sat, 26 Jul 2025 13:23:16 -0400 Subject: [PATCH 15/42] also for static tags --- src/BinaryApplicator.cpp | 1 + 1 file changed, 1 insertion(+) diff --git a/src/BinaryApplicator.cpp b/src/BinaryApplicator.cpp index f58d7f89..ab7e0bd3 100644 --- a/src/BinaryApplicator.cpp +++ b/src/BinaryApplicator.cpp @@ -217,6 +217,7 @@ bool BinaryApplicator::readWindow() { READ_U16_INTO(tag_count); if (tag_count) { cCohort->wread = alloc_reading(cCohort); + addTagToReading(*cCohort->wread, cCohort->wordform); for (uint16_t tn = 0; tn < tag_count; tn++) { READ_U16_INTO(tag); 
addTagToReading(*cCohort->wread, window_tags[tag], From f17ac6288fc89760317a0100643b7bac33d1cf92 Mon Sep 17 00:00:00 2001 From: Tino Didriksen Date: Wed, 20 Aug 2025 18:55:23 +0200 Subject: [PATCH 16/42] Add format conversion in main; Add tests for binary format, currently 39 / 69 tests passing --- .gitignore | 1 + scripts/cg-sort | 16 +++++++++++++++- src/main.cpp | 50 +++++++++++++++++++++++++++++++++++++++++++++++-- src/options.cpp | 17 ++++++++++++++++- src/options.hpp | 15 +++++++++++++++ test/runall.pl | 26 +++++++++++++++++++++---- 6 files changed, 117 insertions(+), 8 deletions(-) diff --git a/.gitignore b/.gitignore index 96b6ff81..af4fd7cb 100644 --- a/.gitignore +++ b/.gitignore @@ -32,6 +32,7 @@ Makefile /test/**/output*.txt /test/**/untraced*.txt /test/**/std*.txt +/test/**/*.bsf*.txt /test/**/*.out.cg3 /test/**/*.cg3b /test/**/*.bin diff --git a/scripts/cg-sort b/scripts/cg-sort index d12c0e55..05d7f106 100755 --- a/scripts/cg-sort +++ b/scripts/cg-sort @@ -14,7 +14,7 @@ use Getopt::Long; Getopt::Long::Configure('bundling'); Getopt::Long::Configure('no_ignore_case'); my %opts = (); -GetOptions(\%opts, ('weight|w:s', 'reverse|r', 'first|1', 'help|?')); +GetOptions(\%opts, ('weight|w:s', 'mapping|m:s', 'reverse|r', 'first|1', 'help|?')); sub print_help { print <<'XOUT'; @@ -25,6 +25,7 @@ Pipe a CG stream through this to sort and unique the readings of each cohort. 
Options: -?, --help outputs this help -w, --weight sorts by a numeric tag; defaults to W + -m, --mapping sorts mapping tags with given prefix; defaults to @ -r, --reverse reverses the sort order -1, --first only keep the first reading @@ -41,6 +42,11 @@ if (exists($opts{weight}) && length($opts{weight})) { $W = $opts{weight}; } +my $M = '@'; +if (exists($opts{mapping}) && length($opts{mapping})) { + $M = $opts{mapping}; +} + my $in_cohort = 0; my %readings = (); my %deleted = (); @@ -66,6 +72,14 @@ sub print_sorted_readings { if (!@_) { return; } + if (exists($opts{mapping})) { + foreach (@_) { + my @tags = ($_ =~ m@ ($M\S+)@g); + @tags = sort @tags; + my $t = join(' ', @tags); + $_ =~ s@( $M\S+)+@ $t@; + } + } if (exists($opts{weight})) { @_ = sort sort_weight @_; } diff --git a/src/main.cpp b/src/main.cpp index 8980723f..744488aa 100644 --- a/src/main.cpp +++ b/src/main.cpp @@ -22,7 +22,7 @@ #include "TextualParser.hpp" #include "GrammarWriter.hpp" #include "BinaryGrammar.hpp" -#include "GrammarApplicator.hpp" +#include "FormatConverter.hpp" #include "version.hpp" #include "options.hpp" @@ -362,9 +362,55 @@ int main(int argc, char* argv[]) { } if (!options[GRAMMAR_ONLY].doesOccur) { - GrammarApplicator applicator(*ux_stderr); + FormatConverter applicator(*ux_stderr); + applicator.fmt_input = CG3SF_CG; + + if (options[IN_CG].doesOccur) { + applicator.fmt_input = CG3SF_CG; + } + else if (options[IN_NICELINE].doesOccur) { + applicator.fmt_input = CG3SF_NICELINE; + } + else if (options[IN_APERTIUM].doesOccur) { + applicator.fmt_input = CG3SF_APERTIUM; + } + else if (options[IN_FST].doesOccur) { + applicator.fmt_input = CG3SF_FST; + } + else if (options[IN_PLAIN].doesOccur) { + applicator.fmt_input = CG3SF_PLAIN; + } + else if (options[IN_JSONL].doesOccur) { + applicator.fmt_input = CG3SF_JSONL; + } + else if (options[IN_BINARY].doesOccur) { + applicator.fmt_input = CG3SF_BINARY; + } + applicator.setGrammar(&grammar); applicator.setOptions(conv); + + 
applicator.fmt_output = CG3SF_CG; + if (options[OUT_APERTIUM].doesOccur) { + applicator.fmt_output = CG3SF_APERTIUM; + applicator.unicode_tags = true; + } + else if (options[OUT_FST].doesOccur) { + applicator.fmt_output = CG3SF_FST; + } + else if (options[OUT_NICELINE].doesOccur) { + applicator.fmt_output = CG3SF_NICELINE; + } + else if (options[OUT_PLAIN].doesOccur) { + applicator.fmt_output = CG3SF_PLAIN; + } + else if (options[OUT_JSONL].doesOccur) { + applicator.fmt_output = CG3SF_JSONL; + } + else if (options[OUT_BINARY].doesOccur) { + applicator.fmt_output = CG3SF_BINARY; + } + if (options[PROFILING].doesOccur) { applicator.profiler = profiler.get(); } diff --git a/src/options.cpp b/src/options.cpp index 70049057..124fded3 100644 --- a/src/options.cpp +++ b/src/options.cpp @@ -95,7 +95,22 @@ options_t options{ UOption{"show-tag-hashes", 0, UOPT_NO_ARG, "prints a list of tags and their hashes as they are parsed during the run"}, UOption{"show-set-hashes", 0, UOPT_NO_ARG, "prints a list of sets and their hashes; implies --grammar-only"}, UOption{"dump-ast", 0, UOPT_NO_ARG, "prints the grammar parse tree; implies --grammar-only"}, - UOption{"no-break", 'B', UOPT_NO_ARG, "inhibits any extra whitespace in output"}, + UOption{"no-break", 0, UOPT_NO_ARG, "inhibits any extra whitespace in output"}, + UOption{"in-cg", 0, UOPT_NO_ARG, "sets input format to CG (default)"}, + UOption{"in-niceline", 0, UOPT_NO_ARG, "sets input format to Niceline CG"}, + UOption{"in-apertium", 0, UOPT_NO_ARG, "sets input format to Apertium"}, + UOption{"in-fst", 0, UOPT_NO_ARG, "sets input format to HFST/XFST"}, + UOption{"in-plain", 0, UOPT_NO_ARG, "sets input format to plain text"}, + UOption{"in-jsonl", 0, UOPT_NO_ARG, "sets input format to JSONL (experimental)"}, + UOption{"in-binary", 0, UOPT_NO_ARG, "sets input format to binary (experimental)"}, + UOption{"out-cg", 0, UOPT_NO_ARG, "sets output format to CG (default)"}, + UOption{"out-apertium", 0, UOPT_NO_ARG, "sets output format to 
Apertium"}, + UOption{"out-fst", 0, UOPT_NO_ARG, "sets output format to HFST/XFST"}, + UOption{"out-matxin", 0, UOPT_NO_ARG, "sets output format to Matxin"}, + UOption{"out-niceline", 0, UOPT_NO_ARG, "sets output format to Niceline CG"}, + UOption{"out-plain", 0, UOPT_NO_ARG, "sets output format to plain text"}, + UOption{"out-jsonl", 0, UOPT_NO_ARG, "sets output format to JSONL (experimental)"}, + UOption{"out-binary", 0, UOPT_NO_ARG, "sets output format to binary (experimental)"}, }; options_t options_default = options; diff --git a/src/options.hpp b/src/options.hpp index 39a112fc..82453656 100644 --- a/src/options.hpp +++ b/src/options.hpp @@ -92,6 +92,21 @@ enum OPTIONS { SHOW_SET_HASHES, DUMP_AST, NO_BREAK, + IN_CG, + IN_NICELINE, + IN_APERTIUM, + IN_FST, + IN_PLAIN, + IN_JSONL, + IN_BINARY, + OUT_CG, + OUT_APERTIUM, + OUT_FST, + OUT_MATXIN, + OUT_NICELINE, + OUT_PLAIN, + OUT_JSONL, + OUT_BINARY, NUM_OPTIONS, }; diff --git a/test/runall.pl b/test/runall.pl index 609af943..df4341f5 100755 --- a/test/runall.pl +++ b/test/runall.pl @@ -2,9 +2,9 @@ use strict; use warnings; use Cwd qw(realpath); +use FindBin qw($Bin); -my ($bindir, $sep) = $0 =~ /^(.*)(\\|\/).*/; -$bindir = realpath $bindir; +my $bindir = realpath $Bin; chdir $bindir or die("Error: Could not change directory to $bindir !"); # Search paths for the binary @@ -25,6 +25,9 @@ 'grammar.cg3b', 'diff.bin.txt', 'output.bin.txt', + 'diff.bsf.txt', + 'expected.bsf.txt', + 'output.bsf.txt', 'grammar.out.cg3', 'diff.out.txt', 'output.out.txt', @@ -75,12 +78,27 @@ sub run_pl { } if (-s "diff.bin.txt") { - print STDERR "Fail ($gf)\n"; + print STDERR "Fail ($gf) "; $good = 0; } else { - print STDERR "Success\n"; + print STDERR "Success "; + } + + # Normal run, but with binary I/O + my $conv = $binary; + $conv =~ s@vislcg3(\.exe)?$@cg-conv@g; + `cat input.txt | "$conv" --in-cg --out-binary 2>stderr.bsf.conv1.txt | "$binary" $args $override -g grammar.cg3 --in-binary --out-binary 2>stderr.bsf.vislcg3.txt | "$conv" 
--in-binary --out-cg 2>stderr.bsf.conv2.txt | "$bindir/../scripts/cg-sort" -m | grep -v '' >output.bsf.txt`; + `cat expected.txt | $bindir/../scripts/cg-untrace | "$bindir/../scripts/cg-sort" -m > expected.bsf.txt`; + `diff -B expected.bsf.txt output.bsf.txt >diff.bsf.txt`; + + if (-s "diff.bsf.txt") { + print STDERR "Fail"; + $good = 0; + } else { + print STDERR "Success"; } + print STDERR "\n"; return $good; } From 373bbcdab08a3ef705ad7b7e6b1b9bb3797c6b33 Mon Sep 17 00:00:00 2001 From: Daniel Swanson Date: Wed, 20 Aug 2025 16:09:17 -0400 Subject: [PATCH 17/42] handle empty cohorts (41/69) --- src/BinaryApplicator.cpp | 1 + 1 file changed, 1 insertion(+) diff --git a/src/BinaryApplicator.cpp b/src/BinaryApplicator.cpp index ab7e0bd3..3747f1a8 100644 --- a/src/BinaryApplicator.cpp +++ b/src/BinaryApplicator.cpp @@ -253,6 +253,7 @@ bool BinaryApplicator::readWindow() { uint16_t reading_count; READ_U16_INTO(reading_count); + if (!reading_count) initEmptyCohort(*cCohort); Reading* prev = nullptr; for (uint16_t rn = 0; rn < reading_count; rn++) { Reading* cReading = alloc_reading(cCohort); From 151c291b470c885b2f1d7528dfd76ebdda5e6c12 Mon Sep 17 00:00:00 2001 From: Daniel Swanson Date: Wed, 20 Aug 2025 16:13:01 -0400 Subject: [PATCH 18/42] add endtag to last cohort (42/69) --- src/BinaryApplicator.cpp | 6 ++++++ 1 file changed, 6 insertions(+) diff --git a/src/BinaryApplicator.cpp b/src/BinaryApplicator.cpp index 3747f1a8..f276d9ee 100644 --- a/src/BinaryApplicator.cpp +++ b/src/BinaryApplicator.cpp @@ -283,6 +283,12 @@ bool BinaryApplicator::readWindow() { ++numReadings; } + if (cn+1 == cohort_count) { + for (auto iter : cCohort->readings) { + addTagToReading(*iter, endtag); + } + } + insert_if_exists(cCohort->possible_sets, grammar->sets_any); cSWindow->appendCohort(cCohort); } From 5bd496c612b25a28b5a37620b437a6b5fd909918 Mon Sep 17 00:00:00 2001 From: Daniel Swanson Date: Wed, 20 Aug 2025 16:38:15 -0400 Subject: [PATCH 19/42] args to cg-conv (45/69) --- 
test/T_CopyCohort/bsfargs.txt | 1 + test/T_Dependency/bsfargs.txt | 1 + test/T_Dependency_Loops/bsfargs.txt | 1 + test/T_Dependency_OutOfRange/bsfargs.txt | 1 + test/T_MergeCohorts/bsfargs.txt | 1 + test/T_Movement/bsfargs.txt | 1 + test/T_RemCohort/bsfargs.txt | 1 + test/T_SplitCohort/bsfargs.txt | 1 + test/T_SwitchParent/bsfargs.txt | 1 + test/runall.pl | 10 +++++++--- 10 files changed, 16 insertions(+), 3 deletions(-) create mode 100644 test/T_CopyCohort/bsfargs.txt create mode 100644 test/T_Dependency/bsfargs.txt create mode 100644 test/T_Dependency_Loops/bsfargs.txt create mode 100644 test/T_Dependency_OutOfRange/bsfargs.txt create mode 100644 test/T_MergeCohorts/bsfargs.txt create mode 100644 test/T_Movement/bsfargs.txt create mode 100644 test/T_RemCohort/bsfargs.txt create mode 100644 test/T_SplitCohort/bsfargs.txt create mode 100644 test/T_SwitchParent/bsfargs.txt diff --git a/test/T_CopyCohort/bsfargs.txt b/test/T_CopyCohort/bsfargs.txt new file mode 100644 index 00000000..a288fd24 --- /dev/null +++ b/test/T_CopyCohort/bsfargs.txt @@ -0,0 +1 @@ +--parse-dep \ No newline at end of file diff --git a/test/T_Dependency/bsfargs.txt b/test/T_Dependency/bsfargs.txt new file mode 100644 index 00000000..f5db7bec --- /dev/null +++ b/test/T_Dependency/bsfargs.txt @@ -0,0 +1 @@ +--dep-delimit \ No newline at end of file diff --git a/test/T_Dependency_Loops/bsfargs.txt b/test/T_Dependency_Loops/bsfargs.txt new file mode 100644 index 00000000..a288fd24 --- /dev/null +++ b/test/T_Dependency_Loops/bsfargs.txt @@ -0,0 +1 @@ +--parse-dep \ No newline at end of file diff --git a/test/T_Dependency_OutOfRange/bsfargs.txt b/test/T_Dependency_OutOfRange/bsfargs.txt new file mode 100644 index 00000000..a288fd24 --- /dev/null +++ b/test/T_Dependency_OutOfRange/bsfargs.txt @@ -0,0 +1 @@ +--parse-dep \ No newline at end of file diff --git a/test/T_MergeCohorts/bsfargs.txt b/test/T_MergeCohorts/bsfargs.txt new file mode 100644 index 00000000..f5db7bec --- /dev/null +++ 
b/test/T_MergeCohorts/bsfargs.txt @@ -0,0 +1 @@ +--dep-delimit \ No newline at end of file diff --git a/test/T_Movement/bsfargs.txt b/test/T_Movement/bsfargs.txt new file mode 100644 index 00000000..f5db7bec --- /dev/null +++ b/test/T_Movement/bsfargs.txt @@ -0,0 +1 @@ +--dep-delimit \ No newline at end of file diff --git a/test/T_RemCohort/bsfargs.txt b/test/T_RemCohort/bsfargs.txt new file mode 100644 index 00000000..a288fd24 --- /dev/null +++ b/test/T_RemCohort/bsfargs.txt @@ -0,0 +1 @@ +--parse-dep \ No newline at end of file diff --git a/test/T_SplitCohort/bsfargs.txt b/test/T_SplitCohort/bsfargs.txt new file mode 100644 index 00000000..a288fd24 --- /dev/null +++ b/test/T_SplitCohort/bsfargs.txt @@ -0,0 +1 @@ +--parse-dep \ No newline at end of file diff --git a/test/T_SwitchParent/bsfargs.txt b/test/T_SwitchParent/bsfargs.txt new file mode 100644 index 00000000..f5db7bec --- /dev/null +++ b/test/T_SwitchParent/bsfargs.txt @@ -0,0 +1 @@ +--dep-delimit \ No newline at end of file diff --git a/test/runall.pl b/test/runall.pl index df4341f5..c991c9e9 100755 --- a/test/runall.pl +++ b/test/runall.pl @@ -37,7 +37,7 @@ my $binary = "vislcg3"; sub run_pl { - my ($binary,$override,$args) = @_; + my ($binary,$override,$args,$bsfargs) = @_; my $good = 1; # Normal run @@ -87,7 +87,7 @@ sub run_pl { # Normal run, but with binary I/O my $conv = $binary; $conv =~ s@vislcg3(\.exe)?$@cg-conv@g; - `cat input.txt | "$conv" --in-cg --out-binary 2>stderr.bsf.conv1.txt | "$binary" $args $override -g grammar.cg3 --in-binary --out-binary 2>stderr.bsf.vislcg3.txt | "$conv" --in-binary --out-cg 2>stderr.bsf.conv2.txt | "$bindir/../scripts/cg-sort" -m | grep -v '' >output.bsf.txt`; + `cat input.txt | "$conv" --in-cg --out-binary $bsfargs 2>stderr.bsf.conv1.txt | "$binary" $args $override -g grammar.cg3 --in-binary --out-binary 2>stderr.bsf.vislcg3.txt | "$conv" --in-binary --out-cg 2>stderr.bsf.conv2.txt | "$bindir/../scripts/cg-sort" -m | grep -v '' >output.bsf.txt`; `cat expected.txt 
| $bindir/../scripts/cg-untrace | "$bindir/../scripts/cg-sort" -m > expected.bsf.txt`; `diff -B expected.bsf.txt output.bsf.txt >diff.bsf.txt`; @@ -159,6 +159,10 @@ sub run_pl { if (-s 'args.txt') { $args = `cat args.txt`; } + my $bsfargs = ''; + if (-s 'bsfargs.txt') { + $bsfargs = `cat bsfargs.txt`; + } if (-x 'run.pl') { `./run.pl "$binary" \Q$c\E $args`; if ($?) { @@ -167,7 +171,7 @@ sub run_pl { } } else { - if (!run_pl($binary, $c, $args)) { + if (!run_pl($binary, $c, $args, $bsfargs)) { $bad = 1; $failed += 1; } From 329b5159ba33005bf0863e5212989a651d5fd9d6 Mon Sep 17 00:00:00 2001 From: Daniel Swanson Date: Wed, 20 Aug 2025 16:44:42 -0400 Subject: [PATCH 20/42] unique_tags (46/69) --- src/BinaryApplicator.cpp | 7 +++++++ 1 file changed, 7 insertions(+) diff --git a/src/BinaryApplicator.cpp b/src/BinaryApplicator.cpp index f276d9ee..15ef82c8 100644 --- a/src/BinaryApplicator.cpp +++ b/src/BinaryApplicator.cpp @@ -451,11 +451,18 @@ void BinaryApplicator::printSingleWindow(SingleWindow* window, std::ostream& out WRITE_TAG_INTO(grammar->single_tags[reading->baseform], reading_buffer); std::string tag_buffer; uint16_t tag_count = 0; + uint32SortedVector unique; for (auto& tter : reading->tags_list) { auto tag = grammar->single_tags[tter]; if (tag->type & (T_WORDFORM | T_BASEFORM | T_DEPENDENCY | T_RELATION)) { continue; } + if (unique_tags) { + if (unique.find(tter) != unique.end()) { + continue; + } + unique.insert(tter); + } WRITE_TAG_INTO(tag, tag_buffer); tag_count++; } From 628168b2075cb66f4a217ea5f6479f69950d1426 Mon Sep 17 00:00:00 2001 From: Daniel Swanson Date: Thu, 21 Aug 2025 15:41:14 -0400 Subject: [PATCH 21/42] delimiters in tests (49/69) --- test/T_CmdArgs/bsfgrammar.cg3 | 1 + test/T_ContextTestJump/bsfgrammar.cg3 | 1 + test/T_With/bsfgrammar.cg3 | 1 + test/runall.pl | 6 +++++- 4 files changed, 8 insertions(+), 1 deletion(-) create mode 100644 test/T_CmdArgs/bsfgrammar.cg3 create mode 100644 test/T_ContextTestJump/bsfgrammar.cg3 create mode 100644 
test/T_With/bsfgrammar.cg3 diff --git a/test/T_CmdArgs/bsfgrammar.cg3 b/test/T_CmdArgs/bsfgrammar.cg3 new file mode 100644 index 00000000..3b0c07fe --- /dev/null +++ b/test/T_CmdArgs/bsfgrammar.cg3 @@ -0,0 +1 @@ +DELIMITERS = "<.>" ; \ No newline at end of file diff --git a/test/T_ContextTestJump/bsfgrammar.cg3 b/test/T_ContextTestJump/bsfgrammar.cg3 new file mode 100644 index 00000000..3b0c07fe --- /dev/null +++ b/test/T_ContextTestJump/bsfgrammar.cg3 @@ -0,0 +1 @@ +DELIMITERS = "<.>" ; \ No newline at end of file diff --git a/test/T_With/bsfgrammar.cg3 b/test/T_With/bsfgrammar.cg3 new file mode 100644 index 00000000..e393de09 --- /dev/null +++ b/test/T_With/bsfgrammar.cg3 @@ -0,0 +1 @@ +DELIMITERS = "<.>" ; diff --git a/test/runall.pl b/test/runall.pl index c991c9e9..c5b783da 100755 --- a/test/runall.pl +++ b/test/runall.pl @@ -87,7 +87,11 @@ sub run_pl { # Normal run, but with binary I/O my $conv = $binary; $conv =~ s@vislcg3(\.exe)?$@cg-conv@g; - `cat input.txt | "$conv" --in-cg --out-binary $bsfargs 2>stderr.bsf.conv1.txt | "$binary" $args $override -g grammar.cg3 --in-binary --out-binary 2>stderr.bsf.vislcg3.txt | "$conv" --in-binary --out-cg 2>stderr.bsf.conv2.txt | "$bindir/../scripts/cg-sort" -m | grep -v '' >output.bsf.txt`; + if (-s "bsfgrammar.cg3") { + `cat input.txt | "$binary" --in-cg --out-binary -g bsfgrammar.cg3 2>stderr.bsf.conv1.txt | "$binary" $args $override -g grammar.cg3 --in-binary --out-binary 2>stderr.bsf.vislcg3.txt | "$conv" --in-binary --out-cg 2>stderr.bsf.conv2.txt | "$bindir/../scripts/cg-sort" -m | grep -v '' >output.bsf.txt`; + } else { + `cat input.txt | "$conv" --in-cg --out-binary $bsfargs 2>stderr.bsf.conv1.txt | "$binary" $args $override -g grammar.cg3 --in-binary --out-binary 2>stderr.bsf.vislcg3.txt | "$conv" --in-binary --out-cg 2>stderr.bsf.conv2.txt | "$bindir/../scripts/cg-sort" -m | grep -v '' >output.bsf.txt`; + } `cat expected.txt | $bindir/../scripts/cg-untrace | "$bindir/../scripts/cg-sort" -m > expected.bsf.txt`; 
`diff -B expected.bsf.txt output.bsf.txt >diff.bsf.txt`; From daf2d86354f9b1a5f2df1316ad9290aac70c6975 Mon Sep 17 00:00:00 2001 From: Daniel Swanson Date: Thu, 21 Aug 2025 16:15:41 -0400 Subject: [PATCH 22/42] print end tag --- src/BinaryApplicator.cpp | 4 +++- test/T_CmdArgs/{bsfgrammar.cg3 => conv1.cg3} | 0 .../{bsfgrammar.cg3 => conv1.cg3} | 0 test/T_CopyCohort/bsfargs.txt | 1 - test/T_CopyCohort/conv1.cg3 | 1 + test/T_Delimit/conv2.cg3 | 1 + test/T_Dependency/bsfargs.txt | 1 - test/T_Dependency/conv1.cg3 | 1 + test/T_Dependency_Loops/bsfargs.txt | 1 - test/T_Dependency_Loops/conv1.cg3 | 1 + test/T_Dependency_OutOfRange/bsfargs.txt | 1 - test/T_Dependency_OutOfRange/conv1.cg3 | 1 + test/T_JumpExecute/conv2.cg3 | 1 + test/T_MergeCohorts/bsfargs.txt | 1 - test/T_MergeCohorts/conv1.cg3 | 1 + test/T_Movement/bsfargs.txt | 1 - test/T_Movement/conv1.cg3 | 1 + test/T_RemCohort/bsfargs.txt | 1 - test/T_RemCohort/conv1.cg3 | 1 + test/T_Select/conv2.cg3 | 1 + test/T_SplitCohort/bsfargs.txt | 1 - test/T_SplitCohort/conv1.cg3 | 1 + test/T_SwitchParent/bsfargs.txt | 1 - test/T_SwitchParent/conv1.cg3 | 1 + test/T_With/{bsfgrammar.cg3 => conv1.cg3} | 0 test/runall.pl | 20 ++++++++++--------- 26 files changed, 26 insertions(+), 19 deletions(-) rename test/T_CmdArgs/{bsfgrammar.cg3 => conv1.cg3} (100%) rename test/T_ContextTestJump/{bsfgrammar.cg3 => conv1.cg3} (100%) delete mode 100644 test/T_CopyCohort/bsfargs.txt create mode 100644 test/T_CopyCohort/conv1.cg3 create mode 100644 test/T_Delimit/conv2.cg3 delete mode 100644 test/T_Dependency/bsfargs.txt create mode 100644 test/T_Dependency/conv1.cg3 delete mode 100644 test/T_Dependency_Loops/bsfargs.txt create mode 100644 test/T_Dependency_Loops/conv1.cg3 delete mode 100644 test/T_Dependency_OutOfRange/bsfargs.txt create mode 100644 test/T_Dependency_OutOfRange/conv1.cg3 create mode 100644 test/T_JumpExecute/conv2.cg3 delete mode 100644 test/T_MergeCohorts/bsfargs.txt create mode 100644 test/T_MergeCohorts/conv1.cg3 delete mode 
100644 test/T_Movement/bsfargs.txt create mode 100644 test/T_Movement/conv1.cg3 delete mode 100644 test/T_RemCohort/bsfargs.txt create mode 100644 test/T_RemCohort/conv1.cg3 create mode 100644 test/T_Select/conv2.cg3 delete mode 100644 test/T_SplitCohort/bsfargs.txt create mode 100644 test/T_SplitCohort/conv1.cg3 delete mode 100644 test/T_SwitchParent/bsfargs.txt create mode 100644 test/T_SwitchParent/conv1.cg3 rename test/T_With/{bsfgrammar.cg3 => conv1.cg3} (100%) diff --git a/src/BinaryApplicator.cpp b/src/BinaryApplicator.cpp index 15ef82c8..db0cbcce 100644 --- a/src/BinaryApplicator.cpp +++ b/src/BinaryApplicator.cpp @@ -285,7 +285,9 @@ bool BinaryApplicator::readWindow() { if (cn+1 == cohort_count) { for (auto iter : cCohort->readings) { - addTagToReading(*iter, endtag); + if (iter->tags.find(endtag) == iter->tags.end()) { + addTagToReading(*iter, endtag); + } } } diff --git a/test/T_CmdArgs/bsfgrammar.cg3 b/test/T_CmdArgs/conv1.cg3 similarity index 100% rename from test/T_CmdArgs/bsfgrammar.cg3 rename to test/T_CmdArgs/conv1.cg3 diff --git a/test/T_ContextTestJump/bsfgrammar.cg3 b/test/T_ContextTestJump/conv1.cg3 similarity index 100% rename from test/T_ContextTestJump/bsfgrammar.cg3 rename to test/T_ContextTestJump/conv1.cg3 diff --git a/test/T_CopyCohort/bsfargs.txt b/test/T_CopyCohort/bsfargs.txt deleted file mode 100644 index a288fd24..00000000 --- a/test/T_CopyCohort/bsfargs.txt +++ /dev/null @@ -1 +0,0 @@ ---parse-dep \ No newline at end of file diff --git a/test/T_CopyCohort/conv1.cg3 b/test/T_CopyCohort/conv1.cg3 new file mode 100644 index 00000000..d167d8b3 --- /dev/null +++ b/test/T_CopyCohort/conv1.cg3 @@ -0,0 +1 @@ +CmdArgs += --parse-dep ; diff --git a/test/T_Delimit/conv2.cg3 b/test/T_Delimit/conv2.cg3 new file mode 100644 index 00000000..f15e28c3 --- /dev/null +++ b/test/T_Delimit/conv2.cg3 @@ -0,0 +1 @@ +CmdArgs += -e ; diff --git a/test/T_Dependency/bsfargs.txt b/test/T_Dependency/bsfargs.txt deleted file mode 100644 index f5db7bec..00000000 
--- a/test/T_Dependency/bsfargs.txt +++ /dev/null @@ -1 +0,0 @@ ---dep-delimit \ No newline at end of file diff --git a/test/T_Dependency/conv1.cg3 b/test/T_Dependency/conv1.cg3 new file mode 100644 index 00000000..0f36431e --- /dev/null +++ b/test/T_Dependency/conv1.cg3 @@ -0,0 +1 @@ +CmdArgs += --dep-delimit ; diff --git a/test/T_Dependency_Loops/bsfargs.txt b/test/T_Dependency_Loops/bsfargs.txt deleted file mode 100644 index a288fd24..00000000 --- a/test/T_Dependency_Loops/bsfargs.txt +++ /dev/null @@ -1 +0,0 @@ ---parse-dep \ No newline at end of file diff --git a/test/T_Dependency_Loops/conv1.cg3 b/test/T_Dependency_Loops/conv1.cg3 new file mode 100644 index 00000000..d167d8b3 --- /dev/null +++ b/test/T_Dependency_Loops/conv1.cg3 @@ -0,0 +1 @@ +CmdArgs += --parse-dep ; diff --git a/test/T_Dependency_OutOfRange/bsfargs.txt b/test/T_Dependency_OutOfRange/bsfargs.txt deleted file mode 100644 index a288fd24..00000000 --- a/test/T_Dependency_OutOfRange/bsfargs.txt +++ /dev/null @@ -1 +0,0 @@ ---parse-dep \ No newline at end of file diff --git a/test/T_Dependency_OutOfRange/conv1.cg3 b/test/T_Dependency_OutOfRange/conv1.cg3 new file mode 100644 index 00000000..d167d8b3 --- /dev/null +++ b/test/T_Dependency_OutOfRange/conv1.cg3 @@ -0,0 +1 @@ +CmdArgs += --parse-dep ; diff --git a/test/T_JumpExecute/conv2.cg3 b/test/T_JumpExecute/conv2.cg3 new file mode 100644 index 00000000..f15e28c3 --- /dev/null +++ b/test/T_JumpExecute/conv2.cg3 @@ -0,0 +1 @@ +CmdArgs += -e ; diff --git a/test/T_MergeCohorts/bsfargs.txt b/test/T_MergeCohorts/bsfargs.txt deleted file mode 100644 index f5db7bec..00000000 --- a/test/T_MergeCohorts/bsfargs.txt +++ /dev/null @@ -1 +0,0 @@ ---dep-delimit \ No newline at end of file diff --git a/test/T_MergeCohorts/conv1.cg3 b/test/T_MergeCohorts/conv1.cg3 new file mode 100644 index 00000000..0f36431e --- /dev/null +++ b/test/T_MergeCohorts/conv1.cg3 @@ -0,0 +1 @@ +CmdArgs += --dep-delimit ; diff --git a/test/T_Movement/bsfargs.txt 
b/test/T_Movement/bsfargs.txt deleted file mode 100644 index f5db7bec..00000000 --- a/test/T_Movement/bsfargs.txt +++ /dev/null @@ -1 +0,0 @@ ---dep-delimit \ No newline at end of file diff --git a/test/T_Movement/conv1.cg3 b/test/T_Movement/conv1.cg3 new file mode 100644 index 00000000..0f36431e --- /dev/null +++ b/test/T_Movement/conv1.cg3 @@ -0,0 +1 @@ +CmdArgs += --dep-delimit ; diff --git a/test/T_RemCohort/bsfargs.txt b/test/T_RemCohort/bsfargs.txt deleted file mode 100644 index a288fd24..00000000 --- a/test/T_RemCohort/bsfargs.txt +++ /dev/null @@ -1 +0,0 @@ ---parse-dep \ No newline at end of file diff --git a/test/T_RemCohort/conv1.cg3 b/test/T_RemCohort/conv1.cg3 new file mode 100644 index 00000000..d167d8b3 --- /dev/null +++ b/test/T_RemCohort/conv1.cg3 @@ -0,0 +1 @@ +CmdArgs += --parse-dep ; diff --git a/test/T_Select/conv2.cg3 b/test/T_Select/conv2.cg3 new file mode 100644 index 00000000..f15e28c3 --- /dev/null +++ b/test/T_Select/conv2.cg3 @@ -0,0 +1 @@ +CmdArgs += -e ; diff --git a/test/T_SplitCohort/bsfargs.txt b/test/T_SplitCohort/bsfargs.txt deleted file mode 100644 index a288fd24..00000000 --- a/test/T_SplitCohort/bsfargs.txt +++ /dev/null @@ -1 +0,0 @@ ---parse-dep \ No newline at end of file diff --git a/test/T_SplitCohort/conv1.cg3 b/test/T_SplitCohort/conv1.cg3 new file mode 100644 index 00000000..d167d8b3 --- /dev/null +++ b/test/T_SplitCohort/conv1.cg3 @@ -0,0 +1 @@ +CmdArgs += --parse-dep ; diff --git a/test/T_SwitchParent/bsfargs.txt b/test/T_SwitchParent/bsfargs.txt deleted file mode 100644 index f5db7bec..00000000 --- a/test/T_SwitchParent/bsfargs.txt +++ /dev/null @@ -1 +0,0 @@ ---dep-delimit \ No newline at end of file diff --git a/test/T_SwitchParent/conv1.cg3 b/test/T_SwitchParent/conv1.cg3 new file mode 100644 index 00000000..0f36431e --- /dev/null +++ b/test/T_SwitchParent/conv1.cg3 @@ -0,0 +1 @@ +CmdArgs += --dep-delimit ; diff --git a/test/T_With/bsfgrammar.cg3 b/test/T_With/conv1.cg3 similarity index 100% rename from 
test/T_With/bsfgrammar.cg3 rename to test/T_With/conv1.cg3 diff --git a/test/runall.pl b/test/runall.pl index c5b783da..06139155 100755 --- a/test/runall.pl +++ b/test/runall.pl @@ -37,7 +37,7 @@ my $binary = "vislcg3"; sub run_pl { - my ($binary,$override,$args,$bsfargs) = @_; + my ($binary,$override,$args) = @_; my $good = 1; # Normal run @@ -87,10 +87,16 @@ sub run_pl { # Normal run, but with binary I/O my $conv = $binary; $conv =~ s@vislcg3(\.exe)?$@cg-conv@g; - if (-s "bsfgrammar.cg3") { - `cat input.txt | "$binary" --in-cg --out-binary -g bsfgrammar.cg3 2>stderr.bsf.conv1.txt | "$binary" $args $override -g grammar.cg3 --in-binary --out-binary 2>stderr.bsf.vislcg3.txt | "$conv" --in-binary --out-cg 2>stderr.bsf.conv2.txt | "$bindir/../scripts/cg-sort" -m | grep -v '' >output.bsf.txt`; + if (-s "conv1.cg3") { + `cat input.txt | "$binary" --in-cg --out-binary -g conv1.cg3 2>stderr.bsf.conv1.txt >stdout.bsf.conv1.bin`; } else { - `cat input.txt | "$conv" --in-cg --out-binary $bsfargs 2>stderr.bsf.conv1.txt | "$binary" $args $override -g grammar.cg3 --in-binary --out-binary 2>stderr.bsf.vislcg3.txt | "$conv" --in-binary --out-cg 2>stderr.bsf.conv2.txt | "$bindir/../scripts/cg-sort" -m | grep -v '' >output.bsf.txt`; + `cat input.txt | "$conv" --in-cg --out-binary 2>stderr.bsf.conv1.txt >stdout.bsf.conv1.bin`; + } + `cat stdout.bsf.conv1.bin | "$binary" $args $override -g grammar.cg3 --in-binary --out-binary 2>stderr.bsf.vislcg3.txt >stdout.bsf.vislcg3.bin`; + if (-s "conv2.cg3") { + `cat stdout.bsf.vislcg3.bin | "$binary" --in-binary --out-cg -g conv2.cg3 2>stderr.bsf.conv2.txt | "$bindir/../scripts/cg-sort" -m | grep -v '' >output.bsf.txt`; + } else { + `cat stdout.bsf.vislcg3.bin | "$conv" --in-binary --out-cg 2>stderr.bsf.conv2.txt | "$bindir/../scripts/cg-sort" -m | grep -v '' >output.bsf.txt`; } `cat expected.txt | $bindir/../scripts/cg-untrace | "$bindir/../scripts/cg-sort" -m > expected.bsf.txt`; `diff -B expected.bsf.txt output.bsf.txt >diff.bsf.txt`; @@ 
-163,10 +169,6 @@ sub run_pl { if (-s 'args.txt') { $args = `cat args.txt`; } - my $bsfargs = ''; - if (-s 'bsfargs.txt') { - $bsfargs = `cat bsfargs.txt`; - } if (-x 'run.pl') { `./run.pl "$binary" \Q$c\E $args`; if ($?) { @@ -175,7 +177,7 @@ sub run_pl { } } else { - if (!run_pl($binary, $c, $args, $bsfargs)) { + if (!run_pl($binary, $c, $args)) { $bad = 1; $failed += 1; } From e192cbfecdfd24f44631bbf7f47a1eae6d2b6241 Mon Sep 17 00:00:00 2001 From: Daniel Swanson Date: Thu, 21 Aug 2025 16:23:30 -0400 Subject: [PATCH 23/42] dep_has_spanned (Omniscan) --- python/cg3.py | 3 +++ src/BinaryApplicator.cpp | 6 ++++++ src/BinaryApplicator.hpp | 1 + test/T_Omniscan/conv1.cg3 | 1 + 4 files changed, 11 insertions(+) create mode 100644 test/T_Omniscan/conv1.cg3 diff --git a/python/cg3.py b/python/cg3.py index 979a5db9..9c136ff5 100644 --- a/python/cg3.py +++ b/python/cg3.py @@ -30,6 +30,7 @@ class Window: text: str = '' text_post: str = '' flush_after: bool = False + dep_has_spanned: bool = False def parse_binary_window(buf): '''Given a bytestring `buf` containing a single window @@ -56,6 +57,8 @@ def read_str(): window_flags = read_u16() if window_flags & 1: window.flush_after = True + if window_flags & 2: + window.dep_has_spanned = True tag_count = read_u16() tags = [read_str() for i in range(tag_count)] def read_tags(): diff --git a/src/BinaryApplicator.cpp b/src/BinaryApplicator.cpp index db0cbcce..7ad040eb 100644 --- a/src/BinaryApplicator.cpp +++ b/src/BinaryApplicator.cpp @@ -157,6 +157,9 @@ bool BinaryApplicator::readWindow() { if (flags & BFW_FLUSH) { cSWindow->flush_after = true; } + if (flags & BFW_DEP_SPAN) { + dep_has_spanned = true; + } TagVector window_tags; uint16_t tag_count; @@ -483,6 +486,9 @@ void BinaryApplicator::printSingleWindow(SingleWindow* window, std::ostream& out if (window->flush_after) { flags |= BFW_FLUSH; } + if (dep_has_spanned) { + flags |= BFW_DEP_SPAN; + } WRITE_U16_INTO(flags, header_buffer); WRITE_U16_INTO(tags_to_write.size(), 
header_buffer); diff --git a/src/BinaryApplicator.hpp b/src/BinaryApplicator.hpp index be9819f2..ad3859cc 100644 --- a/src/BinaryApplicator.hpp +++ b/src/BinaryApplicator.hpp @@ -28,6 +28,7 @@ namespace CG3 { enum BinaryFormatFlags { // Window BFW_FLUSH = (1 << 1), + BFW_DEP_SPAN = (1 << 2), // Cohort BFC_RELATED = (1 << 1), // Reading diff --git a/test/T_Omniscan/conv1.cg3 b/test/T_Omniscan/conv1.cg3 new file mode 100644 index 00000000..be353a43 --- /dev/null +++ b/test/T_Omniscan/conv1.cg3 @@ -0,0 +1 @@ +Delimiters = "<$.>" ; \ No newline at end of file From c6e610ff52cddf35a1bd0da8cc06da8c16868a64 Mon Sep 17 00:00:00 2001 From: Tino Didriksen Date: Fri, 22 Aug 2025 13:04:32 +0200 Subject: [PATCH 24/42] Include Static (57/69) --- .gitignore | 1 + test/T_CmdArgs/conv1.cg3 | 1 - test/T_ContextTestJump/conv1.cg3 | 1 - test/T_CopyCohort/conv1.cg3 | 1 - test/T_Delimit/conv2.cg3 | 1 - test/T_Dependency/conv1.cg3 | 1 - test/T_Dependency_Loops/conv1.cg3 | 1 - test/T_Dependency_OutOfRange/conv1.cg3 | 1 - test/T_JumpExecute/conv2.cg3 | 1 - test/T_MergeCohorts/conv1.cg3 | 1 - test/T_Movement/conv1.cg3 | 1 - test/T_Omniscan/conv1.cg3 | 1 - test/T_RemCohort/conv1.cg3 | 1 - test/T_Select/conv2.cg3 | 1 - test/T_SplitCohort/conv1.cg3 | 1 - test/T_SwitchParent/conv1.cg3 | 1 - test/T_With/conv1.cg3 | 1 - test/runall.pl | 15 +++------------ 18 files changed, 4 insertions(+), 28 deletions(-) delete mode 100644 test/T_CmdArgs/conv1.cg3 delete mode 100644 test/T_ContextTestJump/conv1.cg3 delete mode 100644 test/T_CopyCohort/conv1.cg3 delete mode 100644 test/T_Delimit/conv2.cg3 delete mode 100644 test/T_Dependency/conv1.cg3 delete mode 100644 test/T_Dependency_Loops/conv1.cg3 delete mode 100644 test/T_Dependency_OutOfRange/conv1.cg3 delete mode 100644 test/T_JumpExecute/conv2.cg3 delete mode 100644 test/T_MergeCohorts/conv1.cg3 delete mode 100644 test/T_Movement/conv1.cg3 delete mode 100644 test/T_Omniscan/conv1.cg3 delete mode 100644 test/T_RemCohort/conv1.cg3 delete mode 100644 
test/T_Select/conv2.cg3 delete mode 100644 test/T_SplitCohort/conv1.cg3 delete mode 100644 test/T_SwitchParent/conv1.cg3 delete mode 100644 test/T_With/conv1.cg3 diff --git a/.gitignore b/.gitignore index af4fd7cb..f930269e 100644 --- a/.gitignore +++ b/.gitignore @@ -33,6 +33,7 @@ Makefile /test/**/untraced*.txt /test/**/std*.txt /test/**/*.bsf*.txt +/test/**/*.bsf.cg3 /test/**/*.out.cg3 /test/**/*.cg3b /test/**/*.bin diff --git a/test/T_CmdArgs/conv1.cg3 b/test/T_CmdArgs/conv1.cg3 deleted file mode 100644 index 3b0c07fe..00000000 --- a/test/T_CmdArgs/conv1.cg3 +++ /dev/null @@ -1 +0,0 @@ -DELIMITERS = "<.>" ; \ No newline at end of file diff --git a/test/T_ContextTestJump/conv1.cg3 b/test/T_ContextTestJump/conv1.cg3 deleted file mode 100644 index 3b0c07fe..00000000 --- a/test/T_ContextTestJump/conv1.cg3 +++ /dev/null @@ -1 +0,0 @@ -DELIMITERS = "<.>" ; \ No newline at end of file diff --git a/test/T_CopyCohort/conv1.cg3 b/test/T_CopyCohort/conv1.cg3 deleted file mode 100644 index d167d8b3..00000000 --- a/test/T_CopyCohort/conv1.cg3 +++ /dev/null @@ -1 +0,0 @@ -CmdArgs += --parse-dep ; diff --git a/test/T_Delimit/conv2.cg3 b/test/T_Delimit/conv2.cg3 deleted file mode 100644 index f15e28c3..00000000 --- a/test/T_Delimit/conv2.cg3 +++ /dev/null @@ -1 +0,0 @@ -CmdArgs += -e ; diff --git a/test/T_Dependency/conv1.cg3 b/test/T_Dependency/conv1.cg3 deleted file mode 100644 index 0f36431e..00000000 --- a/test/T_Dependency/conv1.cg3 +++ /dev/null @@ -1 +0,0 @@ -CmdArgs += --dep-delimit ; diff --git a/test/T_Dependency_Loops/conv1.cg3 b/test/T_Dependency_Loops/conv1.cg3 deleted file mode 100644 index d167d8b3..00000000 --- a/test/T_Dependency_Loops/conv1.cg3 +++ /dev/null @@ -1 +0,0 @@ -CmdArgs += --parse-dep ; diff --git a/test/T_Dependency_OutOfRange/conv1.cg3 b/test/T_Dependency_OutOfRange/conv1.cg3 deleted file mode 100644 index d167d8b3..00000000 --- a/test/T_Dependency_OutOfRange/conv1.cg3 +++ /dev/null @@ -1 +0,0 @@ -CmdArgs += --parse-dep ; diff --git 
a/test/T_JumpExecute/conv2.cg3 b/test/T_JumpExecute/conv2.cg3 deleted file mode 100644 index f15e28c3..00000000 --- a/test/T_JumpExecute/conv2.cg3 +++ /dev/null @@ -1 +0,0 @@ -CmdArgs += -e ; diff --git a/test/T_MergeCohorts/conv1.cg3 b/test/T_MergeCohorts/conv1.cg3 deleted file mode 100644 index 0f36431e..00000000 --- a/test/T_MergeCohorts/conv1.cg3 +++ /dev/null @@ -1 +0,0 @@ -CmdArgs += --dep-delimit ; diff --git a/test/T_Movement/conv1.cg3 b/test/T_Movement/conv1.cg3 deleted file mode 100644 index 0f36431e..00000000 --- a/test/T_Movement/conv1.cg3 +++ /dev/null @@ -1 +0,0 @@ -CmdArgs += --dep-delimit ; diff --git a/test/T_Omniscan/conv1.cg3 b/test/T_Omniscan/conv1.cg3 deleted file mode 100644 index be353a43..00000000 --- a/test/T_Omniscan/conv1.cg3 +++ /dev/null @@ -1 +0,0 @@ -Delimiters = "<$.>" ; \ No newline at end of file diff --git a/test/T_RemCohort/conv1.cg3 b/test/T_RemCohort/conv1.cg3 deleted file mode 100644 index d167d8b3..00000000 --- a/test/T_RemCohort/conv1.cg3 +++ /dev/null @@ -1 +0,0 @@ -CmdArgs += --parse-dep ; diff --git a/test/T_Select/conv2.cg3 b/test/T_Select/conv2.cg3 deleted file mode 100644 index f15e28c3..00000000 --- a/test/T_Select/conv2.cg3 +++ /dev/null @@ -1 +0,0 @@ -CmdArgs += -e ; diff --git a/test/T_SplitCohort/conv1.cg3 b/test/T_SplitCohort/conv1.cg3 deleted file mode 100644 index d167d8b3..00000000 --- a/test/T_SplitCohort/conv1.cg3 +++ /dev/null @@ -1 +0,0 @@ -CmdArgs += --parse-dep ; diff --git a/test/T_SwitchParent/conv1.cg3 b/test/T_SwitchParent/conv1.cg3 deleted file mode 100644 index 0f36431e..00000000 --- a/test/T_SwitchParent/conv1.cg3 +++ /dev/null @@ -1 +0,0 @@ -CmdArgs += --dep-delimit ; diff --git a/test/T_With/conv1.cg3 b/test/T_With/conv1.cg3 deleted file mode 100644 index e393de09..00000000 --- a/test/T_With/conv1.cg3 +++ /dev/null @@ -1 +0,0 @@ -DELIMITERS = "<.>" ; diff --git a/test/runall.pl b/test/runall.pl index 06139155..f504f367 100755 --- a/test/runall.pl +++ b/test/runall.pl @@ -87,18 +87,9 @@ sub 
run_pl { # Normal run, but with binary I/O my $conv = $binary; $conv =~ s@vislcg3(\.exe)?$@cg-conv@g; - if (-s "conv1.cg3") { - `cat input.txt | "$binary" --in-cg --out-binary -g conv1.cg3 2>stderr.bsf.conv1.txt >stdout.bsf.conv1.bin`; - } else { - `cat input.txt | "$conv" --in-cg --out-binary 2>stderr.bsf.conv1.txt >stdout.bsf.conv1.bin`; - } - `cat stdout.bsf.conv1.bin | "$binary" $args $override -g grammar.cg3 --in-binary --out-binary 2>stderr.bsf.vislcg3.txt >stdout.bsf.vislcg3.bin`; - if (-s "conv2.cg3") { - `cat stdout.bsf.vislcg3.bin | "$binary" --in-binary --out-cg -g conv2.cg3 2>stderr.bsf.conv2.txt | "$bindir/../scripts/cg-sort" -m | grep -v '' >output.bsf.txt`; - } else { - `cat stdout.bsf.vislcg3.bin | "$conv" --in-binary --out-cg 2>stderr.bsf.conv2.txt | "$bindir/../scripts/cg-sort" -m | grep -v '' >output.bsf.txt`; - } - `cat expected.txt | $bindir/../scripts/cg-untrace | "$bindir/../scripts/cg-sort" -m > expected.bsf.txt`; + `echo "Include Static grammar.cg3 ;" > grammar.bsf.cg3`; + `cat input.txt | "$binary" $args --in-cg --out-binary -g grammar.bsf.cg3 2>stderr.bsf.conv1.txt | "$binary" $args $override -g grammar.cg3 --in-binary --out-binary 2>stderr.bsf.vislcg3.txt | "$binary" $args --in-binary --out-cg -g grammar.bsf.cg3 2>stderr.bsf.conv2.txt | "$bindir/../scripts/cg-untrace" | "$bindir/../scripts/cg-sort" -m | grep -v '' >output.bsf.txt`; + `cat expected.txt | "$bindir/../scripts/cg-untrace" | "$bindir/../scripts/cg-sort" -m > expected.bsf.txt`; `diff -B expected.bsf.txt output.bsf.txt >diff.bsf.txt`; if (-s "diff.bsf.txt") { From c6155ccca306c34445ebcfc5ca3769f3babf1bba Mon Sep 17 00:00:00 2001 From: Daniel Swanson Date: Fri, 22 Aug 2025 09:56:35 -0400 Subject: [PATCH 25/42] fix flag offsets --- src/BinaryApplicator.hpp | 10 +++++----- 1 file changed, 5 insertions(+), 5 deletions(-) diff --git a/src/BinaryApplicator.hpp b/src/BinaryApplicator.hpp index ad3859cc..edcace99 100644 --- a/src/BinaryApplicator.hpp +++ b/src/BinaryApplicator.hpp @@ 
-27,13 +27,13 @@ namespace CG3 { enum BinaryFormatFlags { // Window - BFW_FLUSH = (1 << 1), - BFW_DEP_SPAN = (1 << 2), + BFW_FLUSH = (1 << 0), + BFW_DEP_SPAN = (1 << 1), // Cohort - BFC_RELATED = (1 << 1), + BFC_RELATED = (1 << 0), // Reading - BFR_SUBREADING = (1 << 1), - BFR_DELETED = (1 << 2), + BFR_SUBREADING = (1 << 0), + BFR_DELETED = (1 << 1), // Variables BFV_SETVAR = 1, BFV_SETVAR_ANY = 2, From ad8e6414a85152e5aa4cd3ff234780df581682f3 Mon Sep 17 00:00:00 2001 From: Daniel Swanson Date: Fri, 22 Aug 2025 10:30:26 -0400 Subject: [PATCH 26/42] split mappings --- src/BinaryApplicator.cpp | 11 ++++++++++- 1 file changed, 10 insertions(+), 1 deletion(-) diff --git a/src/BinaryApplicator.cpp b/src/BinaryApplicator.cpp index 7ad040eb..3f23b3b3 100644 --- a/src/BinaryApplicator.cpp +++ b/src/BinaryApplicator.cpp @@ -268,10 +268,19 @@ bool BinaryApplicator::readWindow() { addTagToReading(*cReading, window_tags[tag]); READ_U16_INTO(tag_count); + TagList mappings; for (uint16_t tn = 0; tn < tag_count; tn++) { READ_U16_INTO(tag); - addTagToReading(*cReading, window_tags[tag], (tn+1 == tag_count)); + if (window_tags[tag]->type & T_MAPPING) { + mappings.push_back(window_tags[tag]); + } + else { + addTagToReading(*cReading, window_tags[tag]); + } } + if (!mappings.empty()) { + splitMappings(mappings, *cCohort, *cReading, true); + } if (prev && (flags & BFR_SUBREADING)) { prev->next = cReading; From 32d3b3272d47cf5b529ba092cd07748b2bc001af Mon Sep 17 00:00:00 2001 From: Daniel Swanson Date: Fri, 22 Aug 2025 11:43:16 -0400 Subject: [PATCH 27/42] ensure tags are mapping tags --- src/BinaryApplicator.cpp | 6 ++++++ 1 file changed, 6 insertions(+) diff --git a/src/BinaryApplicator.cpp b/src/BinaryApplicator.cpp index 3f23b3b3..de1ba341 100644 --- a/src/BinaryApplicator.cpp +++ b/src/BinaryApplicator.cpp @@ -169,6 +169,12 @@ bool BinaryApplicator::readWindow() { UString tg; READ_STR_INTO(tg); window_tags.push_back(addTag(tg)); + if (tg[0] == grammar->mapping_prefix) { + 
window_tags.back()->type |= T_MAPPING; + } + else { + window_tags.back()->type &= ~T_MAPPING; + } } uint16_t var_count; From e70fd683d61696edc6fccadf2f529b3d9dc18c36 Mon Sep 17 00:00:00 2001 From: Daniel Swanson Date: Fri, 22 Aug 2025 12:13:36 -0400 Subject: [PATCH 28/42] stabilize-relations (#142) --- scripts/cg-stabilize-relations | 22 ++++++++++++++++++++++ test/runall.pl | 4 ++-- 2 files changed, 24 insertions(+), 2 deletions(-) create mode 100755 scripts/cg-stabilize-relations diff --git a/scripts/cg-stabilize-relations b/scripts/cg-stabilize-relations new file mode 100755 index 00000000..d7bdb805 --- /dev/null +++ b/scripts/cg-stabilize-relations @@ -0,0 +1,22 @@ +#!/usr/bin/env python3 + +import argparse +import re +import sys + +parser = argparse.ArgumentParser('Pipe a CG stream through this to stabilize IDs and relations so they have consistent numbers') +args = parser.parse_args() + +id_map = {} + +tag = re.compile(r'\b(ID:|R:\w+:)(\d+)\b') +def repl(matchobj): + global id_map + n = matchobj.group(2) + if n not in id_map: + id_map[n] = str(len(id_map) + 1) + return matchobj.group(1) + id_map[n] + +for line in sys.stdin: + sys.stdout.write(tag.sub(repl, line)) + diff --git a/test/runall.pl b/test/runall.pl index f504f367..08029204 100755 --- a/test/runall.pl +++ b/test/runall.pl @@ -88,8 +88,8 @@ sub run_pl { my $conv = $binary; $conv =~ s@vislcg3(\.exe)?$@cg-conv@g; `echo "Include Static grammar.cg3 ;" > grammar.bsf.cg3`; - `cat input.txt | "$binary" $args --in-cg --out-binary -g grammar.bsf.cg3 2>stderr.bsf.conv1.txt | "$binary" $args $override -g grammar.cg3 --in-binary --out-binary 2>stderr.bsf.vislcg3.txt | "$binary" $args --in-binary --out-cg -g grammar.bsf.cg3 2>stderr.bsf.conv2.txt | "$bindir/../scripts/cg-untrace" | "$bindir/../scripts/cg-sort" -m | grep -v '' >output.bsf.txt`; - `cat expected.txt | "$bindir/../scripts/cg-untrace" | "$bindir/../scripts/cg-sort" -m > expected.bsf.txt`; + `cat input.txt | "$binary" $args --in-cg --out-binary -g 
grammar.bsf.cg3 2>stderr.bsf.conv1.txt | "$binary" $args $override -g grammar.cg3 --in-binary --out-binary 2>stderr.bsf.vislcg3.txt | "$binary" $args --in-binary --out-cg -g grammar.bsf.cg3 2>stderr.bsf.conv2.txt | "$bindir/../scripts/cg-untrace" | "$bindir/../scripts/cg-sort" -m | "$bindir/../scripts/cg-stabilize-relations" | grep -v '' >output.bsf.txt`; + `cat expected.txt | "$bindir/../scripts/cg-untrace" | "$bindir/../scripts/cg-sort" -m | "$bindir/../scripts/cg-stabilize-relations" > expected.bsf.txt`; `diff -B expected.bsf.txt output.bsf.txt >diff.bsf.txt`; if (-s "diff.bsf.txt") { From 389cdd4d5d6946333ab508d7c2e55dd77599f702 Mon Sep 17 00:00:00 2001 From: Daniel Swanson Date: Fri, 22 Aug 2025 12:16:04 -0400 Subject: [PATCH 29/42] some tests have FLUSH in them --- test/runall.pl | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/test/runall.pl b/test/runall.pl index 08029204..e9c4b157 100755 --- a/test/runall.pl +++ b/test/runall.pl @@ -88,8 +88,9 @@ sub run_pl { my $conv = $binary; $conv =~ s@vislcg3(\.exe)?$@cg-conv@g; `echo "Include Static grammar.cg3 ;" > grammar.bsf.cg3`; - `cat input.txt | "$binary" $args --in-cg --out-binary -g grammar.bsf.cg3 2>stderr.bsf.conv1.txt | "$binary" $args $override -g grammar.cg3 --in-binary --out-binary 2>stderr.bsf.vislcg3.txt | "$binary" $args --in-binary --out-cg -g grammar.bsf.cg3 2>stderr.bsf.conv2.txt | "$bindir/../scripts/cg-untrace" | "$bindir/../scripts/cg-sort" -m | "$bindir/../scripts/cg-stabilize-relations" | grep -v '' >output.bsf.txt`; + `cat input.txt | "$binary" $args --in-cg --out-binary -g grammar.bsf.cg3 2>stderr.bsf.conv1.txt | "$binary" $args $override -g grammar.cg3 --in-binary --out-binary 2>stderr.bsf.vislcg3.txt | "$binary" $args --in-binary --out-cg -g grammar.bsf.cg3 2>stderr.bsf.conv2.txt | "$bindir/../scripts/cg-untrace" | "$bindir/../scripts/cg-sort" -m | "$bindir/../scripts/cg-stabilize-relations" >output.bsf.txt`; `cat expected.txt | "$bindir/../scripts/cg-untrace" | 
"$bindir/../scripts/cg-sort" -m | "$bindir/../scripts/cg-stabilize-relations" > expected.bsf.txt`; + `echo '' >> expected.bsf.txt`; `diff -B expected.bsf.txt output.bsf.txt >diff.bsf.txt`; if (-s "diff.bsf.txt") { From 22a6be86c6e46348e5b682a472052643ec1a66f6 Mon Sep 17 00:00:00 2001 From: Daniel Swanson Date: Fri, 22 Aug 2025 12:41:14 -0400 Subject: [PATCH 30/42] parent.local_number == 0 -> parent = 0 --- src/BinaryApplicator.cpp | 7 ++++++- 1 file changed, 6 insertions(+), 1 deletion(-) diff --git a/src/BinaryApplicator.cpp b/src/BinaryApplicator.cpp index de1ba341..09646a4a 100644 --- a/src/BinaryApplicator.cpp +++ b/src/BinaryApplicator.cpp @@ -427,7 +427,12 @@ void BinaryApplicator::printSingleWindow(SingleWindow* window, std::ostream& out else { if (gWindow->cohort_map.find(cohort->dep_parent) != gWindow->cohort_map.end()) { const Cohort* pr = gWindow->cohort_map[cohort->dep_parent]; - WRITE_U32_INTO(pr->global_number, cohort_buffer); + if (pr->local_number == 0) { + WRITE_U32_INTO(0, cohort_buffer); + } + else { + WRITE_U32_INTO(pr->global_number, cohort_buffer); + } } else { WRITE_U32_INTO(DEP_NO_PARENT, cohort_buffer); From 21f8c02eff5f1e2d2cb9bc494e3e1dffe1e5468b Mon Sep 17 00:00:00 2001 From: Tino Didriksen Date: Fri, 29 Aug 2025 11:54:06 +0200 Subject: [PATCH 31/42] dep_window happens in appendCohort (62/69 passing tests); Minor other fixes and cleanup --- CMakeLists.txt | 2 +- include/getopt/getopt.cpp | 6 +- scripts/cg-stabilize-relations | 1 - src/ApertiumApplicator.cpp | 6 +- src/BinaryApplicator.cpp | 41 ++++++------ src/BinaryGrammar_read.cpp | 24 +++---- src/BinaryGrammar_read_10043.cpp | 22 +++--- src/Grammar.cpp | 2 +- src/GrammarApplicator.cpp | 19 +++--- src/GrammarApplicator_matchSet.cpp | 4 +- src/GrammarApplicator_reflow.cpp | 2 +- src/GrammarApplicator_runContextualTest.cpp | 2 +- src/GrammarApplicator_runGrammar.cpp | 14 ++-- src/GrammarApplicator_runRules.cpp | 4 +- src/GrammarWriter.cpp | 6 +- src/MatxinApplicator.cpp | 16 ++--- 
src/MweSplitApplicator.cpp | 2 +- src/NicelineApplicator.cpp | 14 ++-- src/PlaintextApplicator.cpp | 8 +-- src/TextualParser.cpp | 74 ++++++++++++--------- src/Window.cpp | 6 +- src/cg-conv.cpp | 4 +- src/cg-mwesplit.cpp | 4 +- src/cg-proc.cpp | 2 +- src/inlines.hpp | 2 +- src/main.cpp | 4 +- src/parser_helpers.hpp | 2 +- src/uextras.hpp | 2 +- 28 files changed, 153 insertions(+), 142 deletions(-) diff --git a/CMakeLists.txt b/CMakeLists.txt index deccf986..2eeaca5e 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -149,7 +149,7 @@ if(EMSCRIPTEN) endif() endif() -add_definitions(-DUNISTR_FROM_CHAR_EXPLICIT=explicit -DUNISTR_FROM_STRING_EXPLICIT=explicit) +add_definitions(-DUNISTR_FROM_CHAR_EXPLICIT=explicit -DUNISTR_FROM_STRING_EXPLICIT=explicit -DU_CHARSET_IS_UTF8=1) include_directories("include") include_directories("src") diff --git a/include/getopt/getopt.cpp b/include/getopt/getopt.cpp index 83e71dc2..8bb0ac3e 100644 --- a/include/getopt/getopt.cpp +++ b/include/getopt/getopt.cpp @@ -37,14 +37,14 @@ int getopt(int argc, char **argv, const char *opts) { argv[optind][0] != '-' || argv[optind][1] == '\0') return(EOF); else if (strcmp(argv[optind], "--") == 0) { - optind++; + ++optind; return(EOF); } optopt = c = argv[optind][sp]; if (c == ':' || (cp=strchr(opts, c)) == 0) { ERR(": illegal option -- ", (char)c); if (argv[optind][++sp] == '\0') { - optind++; + ++optind; sp = 1; } return('?'); @@ -65,7 +65,7 @@ int getopt(int argc, char **argv, const char *opts) { else { if (argv[optind][++sp] == '\0') { sp = 1; - optind++; + ++optind; } optarg = nullptr; } diff --git a/scripts/cg-stabilize-relations b/scripts/cg-stabilize-relations index d7bdb805..30f93e87 100755 --- a/scripts/cg-stabilize-relations +++ b/scripts/cg-stabilize-relations @@ -19,4 +19,3 @@ def repl(matchobj): for line in sys.stdin: sys.stdout.write(tag.sub(repl, line)) - diff --git a/src/ApertiumApplicator.cpp b/src/ApertiumApplicator.cpp index 8c89187c..a8a76646 100644 --- 
a/src/ApertiumApplicator.cpp +++ b/src/ApertiumApplicator.cpp @@ -382,7 +382,7 @@ void ApertiumApplicator::runGrammarOnText(std::istream& input, std::ostream& out lCohort = cCohort = alloc_cohort(cSWindow); cCohort->global_number = gWindow->cohort_counter++; - numCohorts++; + ++numCohorts; cCohort->text = blank; blank.clear(); @@ -782,8 +782,8 @@ void ApertiumApplicator::printReading(const Reading* reading, std::ostream& outp if (reading->parent->dep_parent == 0) { pr = reading->parent->parent->cohorts[0]; } - else if (reading->parent->parent->parent->cohort_map.find(reading->parent->dep_parent) != reading->parent->parent->parent->cohort_map.end()) { - pr = reading->parent->parent->parent->cohort_map[reading->parent->dep_parent]; + else if (gWindow->cohort_map.find(reading->parent->dep_parent) != gWindow->cohort_map.end()) { + pr = gWindow->cohort_map[reading->parent->dep_parent]; } } diff --git a/src/BinaryApplicator.cpp b/src/BinaryApplicator.cpp index 09646a4a..2cb46a04 100644 --- a/src/BinaryApplicator.cpp +++ b/src/BinaryApplicator.cpp @@ -165,7 +165,7 @@ bool BinaryApplicator::readWindow() { uint16_t tag_count; READ_U16_INTO(tag_count); window_tags.reserve(tag_count); - for (uint16_t i = 0; i < tag_count; i++) { + for (uint16_t i = 0; i < tag_count; ++i) { UString tg; READ_STR_INTO(tg); window_tags.push_back(addTag(tg)); @@ -179,9 +179,9 @@ bool BinaryApplicator::readWindow() { uint16_t var_count; READ_U16_INTO(var_count); - for (uint16_t vn = 0; vn < var_count; vn++) { + for (uint16_t vn = 0; vn < var_count; ++vn) { char mode = buf[pos]; - pos++; + ++pos; uint16_t tag1, tag2; READ_U16_INTO(tag1); READ_U16_INTO(tag2); @@ -209,10 +209,10 @@ bool BinaryApplicator::readWindow() { uint16_t cohort_count; READ_U16_INTO(cohort_count); uint16_t tag; - for (uint16_t cn = 0; cn < cohort_count; cn++) { + for (uint16_t cn = 0; cn < cohort_count; ++cn) { Cohort* cCohort = alloc_cohort(cSWindow); cCohort->global_number = gWindow->cohort_counter++; - numCohorts++; + 
++numCohorts; READ_U16_INTO(flags); if (flags & BFC_RELATED) { @@ -227,7 +227,7 @@ bool BinaryApplicator::readWindow() { if (tag_count) { cCohort->wread = alloc_reading(cCohort); addTagToReading(*cCohort->wread, cCohort->wordform); - for (uint16_t tn = 0; tn < tag_count; tn++) { + for (uint16_t tn = 0; tn < tag_count; ++tn) { READ_U16_INTO(tag); addTagToReading(*cCohort->wread, window_tags[tag], (tn + 1 == tag_count)); @@ -236,7 +236,6 @@ bool BinaryApplicator::readWindow() { READ_U32_INTO(cCohort->dep_self); READ_U32_INTO(cCohort->dep_parent); - gWindow->dep_window[cCohort->dep_self] = cCohort; gWindow->relation_map[cCohort->dep_self] = cCohort->global_number; if (cCohort->dep_parent != DEP_NO_PARENT) { @@ -245,7 +244,7 @@ bool BinaryApplicator::readWindow() { uint16_t rel_count; READ_U16_INTO(rel_count); - for (uint16_t rn = 0; rn < rel_count; rn++) { + for (uint16_t rn = 0; rn < rel_count; ++rn) { READ_U16_INTO(tag); uint32_t head; READ_U32_INTO(head); @@ -264,7 +263,7 @@ bool BinaryApplicator::readWindow() { READ_U16_INTO(reading_count); if (!reading_count) initEmptyCohort(*cCohort); Reading* prev = nullptr; - for (uint16_t rn = 0; rn < reading_count; rn++) { + for (uint16_t rn = 0; rn < reading_count; ++rn) { Reading* cReading = alloc_reading(cCohort); addTagToReading(*cReading, cCohort->wordform); @@ -275,7 +274,7 @@ bool BinaryApplicator::readWindow() { READ_U16_INTO(tag_count); TagList mappings; - for (uint16_t tn = 0; tn < tag_count; tn++) { + for (uint16_t tn = 0; tn < tag_count; ++tn) { READ_U16_INTO(tag); if (window_tags[tag]->type & T_MAPPING) { mappings.push_back(window_tags[tag]); @@ -319,7 +318,7 @@ bool BinaryApplicator::readWindow() { #define WRITE_U16_INTO(n, buffer) \ do { \ std::string tmp(2, 0); \ - uint16_t tmp_n = (n); \ + auto tmp_n = static_cast(n); \ tmp.assign(reinterpret_cast(&tmp_n), 2); \ (buffer) += tmp; \ } while (false) @@ -327,7 +326,7 @@ bool BinaryApplicator::readWindow() { #define WRITE_U32_INTO(n, buffer) \ do { \ std::string 
tmp(4, 0); \ - uint32_t tmp_n = (n); \ + auto tmp_n = static_cast(n); \ tmp.assign(reinterpret_cast(&tmp_n), 4); \ (buffer) += tmp; \ } while (false) @@ -335,7 +334,7 @@ bool BinaryApplicator::readWindow() { #define WRITE_TAG_INTO(tag, buffer) \ do { \ if (tag_index.find((tag)) == tag_index.end()) { \ - tag_index[(tag)] = tags_to_write.size(); \ + tag_index[(tag)] = UI32(tags_to_write.size()); \ tags_to_write.push_back((tag)); \ } \ WRITE_U16_INTO(tag_index[(tag)], buffer); \ @@ -366,7 +365,7 @@ void BinaryApplicator::printSingleWindow(SingleWindow* window, std::ostream& out uint16_t var_count = 0; std::string var_buffer; for (auto var : window->variables_output) { - var_count++; + ++var_count; Tag* key = grammar->single_tags[var]; auto iter = window->variables_set.find(var); if (iter != window->variables_set.end()) { @@ -394,7 +393,7 @@ void BinaryApplicator::printSingleWindow(SingleWindow* window, std::ostream& out if (cohort->local_number == 0 || (cohort->type & CT_REMOVED)) { continue; } - cohort_count++; + ++cohort_count; uint16_t flags = 0; if (cohort->type & CT_RELATED) { @@ -411,7 +410,7 @@ void BinaryApplicator::printSingleWindow(SingleWindow* window, std::ostream& out continue; } WRITE_TAG_INTO(grammar->single_tags[tter], tag_buffer); - tag_count++; + ++tag_count; } WRITE_U16_INTO(tag_count, cohort_buffer); cohort_buffer += tag_buffer; @@ -426,7 +425,7 @@ void BinaryApplicator::printSingleWindow(SingleWindow* window, std::ostream& out } else { if (gWindow->cohort_map.find(cohort->dep_parent) != gWindow->cohort_map.end()) { - const Cohort* pr = gWindow->cohort_map[cohort->dep_parent]; + auto pr = gWindow->cohort_map[cohort->dep_parent]; if (pr->local_number == 0) { WRITE_U32_INTO(0, cohort_buffer); } @@ -447,7 +446,7 @@ void BinaryApplicator::printSingleWindow(SingleWindow* window, std::ostream& out it = grammar->single_tags.find(miter.first); } for (auto siter : miter.second) { - rel_count += 1; + ++rel_count; WRITE_TAG_INTO(it->second, rel_buffer); 
WRITE_U32_INTO(siter, rel_buffer); } @@ -467,7 +466,7 @@ void BinaryApplicator::printSingleWindow(SingleWindow* window, std::ostream& out } auto reading = top_reading; while (reading) { - reading_count++; + ++reading_count; uint16_t flags = 0; if (reading != top_reading) { flags |= BFR_SUBREADING; @@ -489,7 +488,7 @@ void BinaryApplicator::printSingleWindow(SingleWindow* window, std::ostream& out unique.insert(tter); } WRITE_TAG_INTO(tag, tag_buffer); - tag_count++; + ++tag_count; } WRITE_U16_INTO(tag_count, reading_buffer); reading_buffer += tag_buffer; @@ -524,7 +523,7 @@ void BinaryApplicator::printSingleWindow(SingleWindow* window, std::ostream& out WRITE_U16_INTO(cohort_count, header_buffer); - uint32_t total_size = header_buffer.size() + cohort_buffer.size(); + auto total_size = UI32(header_buffer.size() + cohort_buffer.size()); writeRaw(output, total_size); output.write(header_buffer.data(), header_buffer.size()); output.write(cohort_buffer.data(), cohort_buffer.size()); diff --git a/src/BinaryGrammar_read.cpp b/src/BinaryGrammar_read.cpp index a006b114..9d37e63a 100644 --- a/src/BinaryGrammar_read.cpp +++ b/src/BinaryGrammar_read.cpp @@ -122,7 +122,7 @@ int BinaryGrammar::parse_grammar(std::istream& input) { auto num_single_tags = u32tmp; grammar->num_tags = num_single_tags; grammar->single_tags_list.resize(num_single_tags); - for (uint32_t i = 0; i < num_single_tags; i++) { + for (uint32_t i = 0; i < num_single_tags; ++i) { Tag* t = grammar->allocateTag(); auto fields = readBE(input); @@ -254,7 +254,7 @@ int BinaryGrammar::parse_grammar(std::istream& input) { u32tmp = readBE(input); } auto num_pref_targets = u32tmp; - for (uint32_t i = 0; i < num_pref_targets; i++) { + for (uint32_t i = 0; i < num_pref_targets; ++i) { u32tmp = readBE(input); grammar->preferred_targets.push_back(u32tmp); } @@ -264,7 +264,7 @@ int BinaryGrammar::parse_grammar(std::istream& input) { u32tmp = readBE(input); } auto num_par_pairs = u32tmp; - for (uint32_t i = 0; i < 
num_par_pairs; i++) { + for (uint32_t i = 0; i < num_par_pairs; ++i) { auto left = readBE(input); auto right = readBE(input); grammar->parentheses[left] = right; @@ -276,7 +276,7 @@ int BinaryGrammar::parse_grammar(std::istream& input) { u32tmp = readBE(input); } auto num_par_anchors = u32tmp; - for (uint32_t i = 0; i < num_par_anchors; i++) { + for (uint32_t i = 0; i < num_par_anchors; ++i) { auto left = readBE(input); auto right = readBE(input); grammar->anchors[left] = right; @@ -288,7 +288,7 @@ int BinaryGrammar::parse_grammar(std::istream& input) { } auto num_sets = u32tmp; grammar->sets_list.resize(num_sets); - for (uint32_t i = 0; i < num_sets; i++) { + for (uint32_t i = 0; i < num_sets; ++i) { Set* s = grammar->allocateSet(); auto fields = readBE(input); @@ -316,7 +316,7 @@ int BinaryGrammar::parse_grammar(std::istream& input) { if (fields & (1 << 4)) { u32tmp = readBE(input); auto num_set_ops = u32tmp; - for (uint32_t j = 0; j < num_set_ops; j++) { + for (uint32_t j = 0; j < num_set_ops; ++j) { u32tmp = readBE(input); s->set_ops.push_back(u32tmp); } @@ -324,7 +324,7 @@ int BinaryGrammar::parse_grammar(std::istream& input) { if (fields & (1 << 5)) { u32tmp = readBE(input); auto num_sets = u32tmp; - for (uint32_t j = 0; j < num_sets; j++) { + for (uint32_t j = 0; j < num_sets; ++j) { u32tmp = readBE(input); s->sets.push_back(u32tmp); } @@ -370,7 +370,7 @@ int BinaryGrammar::parse_grammar(std::istream& input) { u32tmp = readBE(input); } auto num_contexts = u32tmp; - for (uint32_t i = 0; i < num_contexts; i++) { + for (uint32_t i = 0; i < num_contexts; ++i) { ContextualTest* t = readContextualTest(input); grammar->contexts[t->hash] = t; } @@ -381,7 +381,7 @@ int BinaryGrammar::parse_grammar(std::istream& input) { } auto num_rules = u32tmp; grammar->rule_by_number.resize(num_rules); - for (uint32_t i = 0; i < num_rules; i++) { + for (uint32_t i = 0; i < num_rules; ++i) { Rule* r = grammar->allocateRule(); auto fields = readBE(input); @@ -460,7 +460,7 @@ int 
BinaryGrammar::parse_grammar(std::istream& input) { u32tmp = readBE(input); auto num_dep_tests = u32tmp; - for (uint32_t j = 0; j < num_dep_tests; j++) { + for (uint32_t j = 0; j < num_dep_tests; ++j) { u32tmp = readBE(input); ContextualTest* t = grammar->contexts[u32tmp]; r->addContextualTest(t, r->dep_tests); @@ -468,7 +468,7 @@ int BinaryGrammar::parse_grammar(std::istream& input) { u32tmp = readBE(input); auto num_tests = u32tmp; - for (uint32_t j = 0; j < num_tests; j++) { + for (uint32_t j = 0; j < num_tests; ++j) { u32tmp = readBE(input); ContextualTest* t = grammar->contexts[u32tmp]; r->addContextualTest(t, r->tests); @@ -477,7 +477,7 @@ int BinaryGrammar::parse_grammar(std::istream& input) { if (fields & (1 << 15)) { u32tmp = readBE(input); auto num_sub_rules = u32tmp; - for (uint32_t j = 0; j < num_sub_rules; j++) { + for (uint32_t j = 0; j < num_sub_rules; ++j) { u32tmp = readBE(input); r->sub_rules.push_back(grammar->rule_by_number[u32tmp]); } diff --git a/src/BinaryGrammar_read_10043.cpp b/src/BinaryGrammar_read_10043.cpp index b3c6b3e6..906ccb43 100644 --- a/src/BinaryGrammar_read_10043.cpp +++ b/src/BinaryGrammar_read_10043.cpp @@ -83,7 +83,7 @@ int BinaryGrammar::readBinaryGrammar_10043(std::istream& input) { auto num_single_tags = u32tmp; grammar->num_tags = num_single_tags; grammar->single_tags_list.resize(num_single_tags); - for (uint32_t i = 0; i < num_single_tags; i++) { + for (uint32_t i = 0; i < num_single_tags; ++i) { Tag* t = grammar->allocateTag(); auto fields = readBE(input); @@ -190,7 +190,7 @@ int BinaryGrammar::readBinaryGrammar_10043(std::istream& input) { u32tmp = readBE(input); } auto num_pref_targets = u32tmp; - for (uint32_t i = 0; i < num_pref_targets; i++) { + for (uint32_t i = 0; i < num_pref_targets; ++i) { u32tmp = readBE(input); grammar->preferred_targets.push_back(u32tmp); } @@ -200,7 +200,7 @@ int BinaryGrammar::readBinaryGrammar_10043(std::istream& input) { u32tmp = readBE(input); } auto num_par_pairs = u32tmp; - for 
(uint32_t i = 0; i < num_par_pairs; i++) { + for (uint32_t i = 0; i < num_par_pairs; ++i) { auto left = readBE(input); auto right = readBE(input); grammar->parentheses[left] = right; @@ -212,7 +212,7 @@ int BinaryGrammar::readBinaryGrammar_10043(std::istream& input) { u32tmp = readBE(input); } uint32_t num_par_anchors = u32tmp; - for (uint32_t i = 0; i < num_par_anchors; i++) { + for (uint32_t i = 0; i < num_par_anchors; ++i) { auto left = readBE(input); auto right = readBE(input); grammar->anchors[left] = right; @@ -224,7 +224,7 @@ int BinaryGrammar::readBinaryGrammar_10043(std::istream& input) { } auto num_sets = u32tmp; grammar->sets_list.resize(num_sets); - for (uint32_t i = 0; i < num_sets; i++) { + for (uint32_t i = 0; i < num_sets; ++i) { Set* s = grammar->allocateSet(); auto fields = readBE(input); @@ -252,14 +252,14 @@ int BinaryGrammar::readBinaryGrammar_10043(std::istream& input) { if (fields & (1 << 4)) { u32tmp = readBE(input); auto num_set_ops = u32tmp; - for (uint32_t j = 0; j < num_set_ops; j++) { + for (uint32_t j = 0; j < num_set_ops; ++j) { u32tmp = readBE(input); s->set_ops.push_back(u32tmp); } } if (fields & (1 << 5)) { auto num_sets = readBE(input); - for (uint32_t j = 0; j < num_sets; j++) { + for (uint32_t j = 0; j < num_sets; ++j) { u32tmp = readBE(input); s->sets.push_back(u32tmp); } @@ -302,7 +302,7 @@ int BinaryGrammar::readBinaryGrammar_10043(std::istream& input) { } auto num_contexts = u32tmp; contexts_list.resize(num_contexts); - for (uint32_t i = 0; i < num_contexts; i++) { + for (uint32_t i = 0; i < num_contexts; ++i) { ContextualTest* t = readContextualTest_10043(input); grammar->contexts[t->hash] = t; contexts_list[i] = t; @@ -314,7 +314,7 @@ int BinaryGrammar::readBinaryGrammar_10043(std::istream& input) { } auto num_rules = u32tmp; grammar->rule_by_number.resize(num_rules); - for (uint32_t i = 0; i < num_rules; i++) { + for (uint32_t i = 0; i < num_rules; ++i) { Rule* r = grammar->allocateRule(); auto fields = readBE(input); @@ 
-387,14 +387,14 @@ int BinaryGrammar::readBinaryGrammar_10043(std::istream& input) { } auto num_dep_tests = readBE(input); - for (uint32_t j = 0; j < num_dep_tests; j++) { + for (uint32_t j = 0; j < num_dep_tests; ++j) { u32tmp = readBE(input); ContextualTest* t = contexts_list[u32tmp - 1]; r->addContextualTest(t, r->dep_tests); } auto num_tests = readBE(input); - for (uint32_t j = 0; j < num_tests; j++) { + for (uint32_t j = 0; j < num_tests; ++j) { u32tmp = readBE(input); ContextualTest* t = contexts_list[u32tmp - 1]; r->addContextualTest(t, r->tests); diff --git a/src/Grammar.cpp b/src/Grammar.cpp index c0785472..1dc50f41 100644 --- a/src/Grammar.cpp +++ b/src/Grammar.cpp @@ -552,7 +552,7 @@ Tag* Grammar::allocateTag(const UChar* txt) { Tag* Grammar::addTag(Tag* tag) { uint32_t hash = tag->rehash(); - for (uint32_t seed = 0; seed < 10000; seed++) { + for (uint32_t seed = 0; seed < 10000; ++seed) { uint32_t ih = hash + seed; Taguint32HashMap::iterator it; if ((it = single_tags.find(ih)) != single_tags.end()) { diff --git a/src/GrammarApplicator.cpp b/src/GrammarApplicator.cpp index 7155d376..1628956f 100644 --- a/src/GrammarApplicator.cpp +++ b/src/GrammarApplicator.cpp @@ -178,7 +178,7 @@ void GrammarApplicator::index() { if (sections.empty()) { int32_t smax = SI32(grammar->sections.size()); - for (int32_t i = 0; i < smax; i++) { + for (int32_t i = 0; i < smax; ++i) { for (auto r : grammar->rules) { if (r->section < 0 || r->section > i) { continue; @@ -190,8 +190,8 @@ void GrammarApplicator::index() { } else { numsections = UI32(sections.size()); - for (uint32_t n = 0; n < numsections; n++) { - for (uint32_t e = 0; e <= n; e++) { + for (uint32_t n = 0; n < numsections; ++n) { + for (uint32_t e = 0; e <= n; ++e) { for (auto r : grammar->rules) { if (r->section != SI32(sections[e]) - 1) { continue; @@ -229,7 +229,7 @@ void GrammarApplicator::index() { Tag* GrammarApplicator::addTag(Tag* tag) { uint32_t hash = tag->rehash(); uint32_t seed = 0; - for (; seed < 
10000; seed++) { + for (; seed < 10000; ++seed) { uint32_t ih = hash + seed; Taguint32HashMap::iterator it; if ((it = grammar->single_tags.find(ih)) != grammar->single_tags.end()) { @@ -387,7 +387,7 @@ void GrammarApplicator::printReading(const Reading* reading, std::ostream& outpu } unique.insert(tter); } - const Tag* tag = grammar->single_tags[tter]; + auto tag = grammar->single_tags[tter]; if (tag->type & T_DEPENDENCY && has_dep && !dep_original) { continue; } @@ -401,14 +401,13 @@ void GrammarApplicator::printReading(const Reading* reading, std::ostream& outpu if (!reading->parent->dep_self) { reading->parent->dep_self = reading->parent->global_number; } - const Cohort* pr = nullptr; - pr = reading->parent; + auto pr = reading->parent; if (reading->parent->dep_parent != DEP_NO_PARENT) { if (reading->parent->dep_parent == 0) { pr = reading->parent->parent->cohorts[0]; } - else if (reading->parent->parent->parent->cohort_map.find(reading->parent->dep_parent) != reading->parent->parent->parent->cohort_map.end()) { - pr = reading->parent->parent->parent->cohort_map[reading->parent->dep_parent]; + else if (gWindow->cohort_map.find(reading->parent->dep_parent) != gWindow->cohort_map.end()) { + pr = gWindow->cohort_map[reading->parent->dep_parent]; } } @@ -681,7 +680,7 @@ void GrammarApplicator::pipeOutSingleWindow(const SingleWindow& window, Process& auto cs = UI32(window.cohorts.size()) - 1; writeRaw(ss, cs); - for (uint32_t c = 1; c < cs + 1; c++) { + for (uint32_t c = 1; c < cs + 1; ++c) { pipeOutCohort(window.cohorts[c], ss); } diff --git a/src/GrammarApplicator_matchSet.cpp b/src/GrammarApplicator_matchSet.cpp index 8a5e2834..7b6e5d9e 100644 --- a/src/GrammarApplicator_matchSet.cpp +++ b/src/GrammarApplicator_matchSet.cpp @@ -803,12 +803,12 @@ bool GrammarApplicator::doesSetMatchReading(const Reading& reading, const uint32 ++i; } if (match) { - match_sub++; + ++match_sub; retval = true; break; } if (failfast) { - match_sub++; + ++match_sub; retval = false; 
break; } diff --git a/src/GrammarApplicator_reflow.cpp b/src/GrammarApplicator_reflow.cpp index 644701fa..5301c335 100644 --- a/src/GrammarApplicator_reflow.cpp +++ b/src/GrammarApplicator_reflow.cpp @@ -645,7 +645,7 @@ void GrammarApplicator::splitMappings(TagList& mappings, Cohort& cohort, Reading nr->mapping = ttag; } cohort.appendReading(nr); - numReadings++; + ++numReadings; } reading.mapped = mapped; diff --git a/src/GrammarApplicator_runContextualTest.cpp b/src/GrammarApplicator_runContextualTest.cpp index e519f97d..55e0417f 100644 --- a/src/GrammarApplicator_runContextualTest.cpp +++ b/src/GrammarApplicator_runContextualTest.cpp @@ -477,7 +477,7 @@ Cohort* GrammarApplicator::runContextualTest(SingleWindow* sWindow, size_t posit goto label_gotACohort; } - for (uint32_t i = 1; left || right; i++) { + for (uint32_t i = 1; left || right; ++i) { if (left) { rvs = 0; cohort = runSingleTest(left, lpos - i, test, rvs, &retval, deep, origin); diff --git a/src/GrammarApplicator_runGrammar.cpp b/src/GrammarApplicator_runGrammar.cpp index 69db28e8..033fc156 100644 --- a/src/GrammarApplicator_runGrammar.cpp +++ b/src/GrammarApplicator_runGrammar.cpp @@ -80,7 +80,7 @@ Reading* GrammarApplicator::initEmptyCohort(Cohort& cCohort) { addTagToReading(*cReading, cCohort.wordform); cReading->noprint = true; cCohort.appendReading(cReading); - numReadings++; + ++numReadings; return cReading; } @@ -212,7 +212,7 @@ void GrammarApplicator::runGrammarOnText(std::istream& input, std::ostream& outp lSWindow = cSWindow; cSWindow = nullptr; cCohort = nullptr; - numCohorts++; + ++numCohorts; did_soft_lookback = false; } if (cCohort && (cSWindow->cohorts.size() >= hard_limit || (!dep_delimit && grammar->delimiters && doesSetMatchCohortNormal(*cCohort, grammar->delimiters->number)))) { @@ -230,7 +230,7 @@ void GrammarApplicator::runGrammarOnText(std::istream& input, std::ostream& outp lSWindow = cSWindow; cSWindow = nullptr; cCohort = nullptr; - numCohorts++; + ++numCohorts; 
did_soft_lookback = false; } if (!cSWindow) { @@ -247,7 +247,7 @@ void GrammarApplicator::runGrammarOnText(std::istream& input, std::ostream& outp lSWindow = cSWindow; cCohort = nullptr; - numWindows++; + ++numWindows; did_soft_lookback = false; } if (cCohort && cSWindow) { @@ -271,7 +271,7 @@ void GrammarApplicator::runGrammarOnText(std::istream& input, std::ostream& outp lCohort = cCohort; lReading = nullptr; indents.clear(); - numCohorts++; + ++numCohorts; cCohort->line_number = numLines; space += 2; @@ -396,7 +396,7 @@ void GrammarApplicator::runGrammarOnText(std::istream& input, std::ostream& outp readings->back()->rehash(); } indents.push_back(std::make_pair(indent, cReading)); - numReadings++; + ++numReadings; // Check whether the cohort still belongs to the window, as per --dep-delimit if (!is_deleted && dep_delimit && dep_highest_seen && (cCohort->dep_self <= dep_highest_seen || cCohort->dep_self - dep_highest_seen > dep_delimit)) { @@ -645,7 +645,7 @@ void GrammarApplicator::runGrammarOnText(std::istream& input, std::ostream& outp lSWindow = cSWindow; cSWindow = nullptr; cCohort = nullptr; - numCohorts++; + ++numCohorts; did_soft_lookback = false; } else if (lCohort) { diff --git a/src/GrammarApplicator_runRules.cpp b/src/GrammarApplicator_runRules.cpp index 95a31058..fad57670 100644 --- a/src/GrammarApplicator_runRules.cpp +++ b/src/GrammarApplicator_runRules.cpp @@ -1332,7 +1332,7 @@ uint32_t GrammarApplicator::runRulesOnSingleWindow(SingleWindow& current, const if (selected.size() < target->readings.size() && !selected.empty()) { ReadingList drop; size_t si = 0; - for (size_t ri = 0; ri < target->readings.size(); ri++) { + for (size_t ri = 0; ri < target->readings.size(); ++ri) { // Manually trace, since reading_cb doesn't get called on non-matching readings Reading* rd = target->readings[ri]; if (rule->sub_reading != 32767) { @@ -1342,7 +1342,7 @@ uint32_t GrammarApplicator::runRulesOnSingleWindow(SingleWindow& current, const 
rd->hit_by.push_back(rule->number); } if (si < selected.size() && target->readings[ri] == selected[si]) { - si++; + ++si; } else { target->readings[ri]->deleted = true; diff --git a/src/GrammarWriter.cpp b/src/GrammarWriter.cpp index 53aff716..e58b5be3 100644 --- a/src/GrammarWriter.cpp +++ b/src/GrammarWriter.cpp @@ -79,7 +79,7 @@ void GrammarWriter::printSet(std::ostream& output, const Set& curset) { } u_fprintf(output, "SET %S = ", n); u_fprintf(output, "%S ", grammar->sets_list[curset.sets[0]]->name.data()); - for (uint32_t i = 0; i < curset.sets.size() - 1; i++) { + for (uint32_t i = 0; i < curset.sets.size() - 1; ++i) { u_fprintf(output, "%S %S ", stringbits[curset.set_ops[i]].data(), grammar->sets_list[curset.sets[i + 1]]->name.data()); } u_fprintf(output, " ;\n\n"); @@ -287,7 +287,7 @@ void GrammarWriter::printRule(std::ostream& to, const Rule& rule) { } u_fprintf(to, " "); - for (uint32_t i = 0; i < FLAGS_COUNT; i++) { + for (uint32_t i = 0; i < FLAGS_COUNT; ++i) { if (i == FL_BEFORE || i == FL_AFTER || i == FL_WITHCHILD) { continue; } @@ -544,7 +544,7 @@ void GrammarWriter::printContextualTest(std::ostream& to, const ContextualTest& u_fprintf(to, "("); printContextualTest(to, **iter); u_fprintf(to, ")"); - iter++; + ++iter; if (iter != test.ors.end()) { u_fprintf(to, " OR "); } diff --git a/src/MatxinApplicator.cpp b/src/MatxinApplicator.cpp index 0f9acf2e..bbd338bf 100644 --- a/src/MatxinApplicator.cpp +++ b/src/MatxinApplicator.cpp @@ -177,7 +177,7 @@ void MatxinApplicator::runGrammarOnText(std::istream& input, std::ostream& outpu lSWindow = cSWindow; cSWindow = nullptr; cCohort = nullptr; - numCohorts++; + ++numCohorts; } // end >= soft_limit if (cCohort && (cSWindow->cohorts.size() >= hard_limit || (grammar->delimiters && doesSetMatchCohortNormal(*cCohort, grammar->delimiters->number)))) { if (!is_conv && cSWindow->cohorts.size() >= hard_limit) { @@ -192,7 +192,7 @@ void MatxinApplicator::runGrammarOnText(std::istream& input, std::ostream& outpu 
lSWindow = cSWindow; cSWindow = nullptr; cCohort = nullptr; - numCohorts++; + ++numCohorts; } // end >= hard_limit // If we don't have a current window, create one if (!cSWindow) { @@ -216,7 +216,7 @@ void MatxinApplicator::runGrammarOnText(std::istream& input, std::ostream& outpu lSWindow->text = firstblank; firstblank.clear(); cCohort = nullptr; - numWindows++; + ++numWindows; } // created at least one cSWindow by now // If the current cohort is looking ok, and we have an available @@ -258,7 +258,7 @@ void MatxinApplicator::runGrammarOnText(std::istream& input, std::ostream& outpu //u_fprintf(output, "# %S\n", wordform); cCohort->wordform = addTag(wordform); - numCohorts++; + ++numCohorts; // We're now at the beginning of the readings UString current_reading; @@ -319,7 +319,7 @@ void MatxinApplicator::runGrammarOnText(std::istream& input, std::ostream& outpu } cCohort->appendReading(cReading); - numReadings++; + ++numReadings; current_reading.clear(); @@ -339,7 +339,7 @@ void MatxinApplicator::runGrammarOnText(std::istream& input, std::ostream& outpu } cCohort->appendReading(cReading); - numReadings++; + ++numReadings; current_reading.clear(); continue; // while not $ @@ -790,7 +790,7 @@ void MatxinApplicator::procNode(int& depth, std::map& nodes, std::map const UChar* si = node.si.data() + !node.si.empty(); if (n != 0) { - for (int i = 0; i < depth * 2; i++) { + for (int i = 0; i < depth * 2; ++i) { u_fprintf(output, " "); } @@ -818,7 +818,7 @@ void MatxinApplicator::procNode(int& depth, std::map& nodes, std::map } if (n != 0) { - for (int i = 0; i < depth * 2; i++) { + for (int i = 0; i < depth * 2; ++i) { u_fprintf(output, " "); } diff --git a/src/MweSplitApplicator.cpp b/src/MweSplitApplicator.cpp index 1869deb4..585fe568 100644 --- a/src/MweSplitApplicator.cpp +++ b/src/MweSplitApplicator.cpp @@ -178,7 +178,7 @@ void MweSplitApplicator::printSingleWindow(SingleWindow* window, std::ostream& o } auto cs = UI32(window->cohorts.size()); - for (uint32_t c = 0; c 
< cs; c++) { + for (uint32_t c = 0; c < cs; ++c) { Cohort* cohort = window->cohorts[c]; std::vector cs = splitMwe(cohort); for (auto& iter : cs) { diff --git a/src/NicelineApplicator.cpp b/src/NicelineApplicator.cpp index 24537121..9536c5ac 100644 --- a/src/NicelineApplicator.cpp +++ b/src/NicelineApplicator.cpp @@ -137,7 +137,7 @@ void NicelineApplicator::runGrammarOnText(std::istream& input, std::ostream& out lSWindow = cSWindow; cSWindow = nullptr; cCohort = nullptr; - numCohorts++; + ++numCohorts; did_soft_lookback = false; } if (cCohort && (cSWindow->cohorts.size() >= hard_limit || (!dep_delimit && grammar->delimiters && doesSetMatchCohortNormal(*cCohort, grammar->delimiters->number)))) { @@ -153,7 +153,7 @@ void NicelineApplicator::runGrammarOnText(std::istream& input, std::ostream& out lSWindow = cSWindow; cSWindow = nullptr; cCohort = nullptr; - numCohorts++; + ++numCohorts; did_soft_lookback = false; } if (!cSWindow) { @@ -163,7 +163,7 @@ void NicelineApplicator::runGrammarOnText(std::istream& input, std::ostream& out lSWindow = cSWindow; cCohort = nullptr; - numWindows++; + ++numWindows; did_soft_lookback = false; } if (cCohort && cSWindow) { @@ -190,7 +190,7 @@ void NicelineApplicator::runGrammarOnText(std::istream& input, std::ostream& out cCohort->global_number = gWindow->cohort_counter++; cCohort->wordform = addTag(tag); lCohort = cCohort; - numCohorts++; + ++numCohorts; ++space; while (space && space[0]) { @@ -257,7 +257,7 @@ void NicelineApplicator::runGrammarOnText(std::istream& input, std::ostream& out splitMappings(mappings, *cCohort, *cReading, true); } cCohort->appendReading(cReading); - numReadings++; + ++numReadings; if (tab) { space = ++tab; @@ -359,8 +359,8 @@ void NicelineApplicator::printReading(const Reading* reading, std::ostream& outp if (reading->parent->dep_parent == 0) { pr = reading->parent->parent->cohorts[0]; } - else if (reading->parent->parent->parent->cohort_map.find(reading->parent->dep_parent) != 
reading->parent->parent->parent->cohort_map.end()) { - pr = reading->parent->parent->parent->cohort_map[reading->parent->dep_parent]; + else if (gWindow->cohort_map.find(reading->parent->dep_parent) != gWindow->cohort_map.end()) { + pr = gWindow->cohort_map[reading->parent->dep_parent]; } } diff --git a/src/PlaintextApplicator.cpp b/src/PlaintextApplicator.cpp index 10f2370e..0c9d37fb 100644 --- a/src/PlaintextApplicator.cpp +++ b/src/PlaintextApplicator.cpp @@ -129,7 +129,7 @@ void PlaintextApplicator::runGrammarOnText(std::istream& input, std::ostream& ou lCohort = cCohort; cSWindow = nullptr; cCohort = nullptr; - numCohorts++; + ++numCohorts; did_soft_lookback = false; } if (cCohort && (cSWindow->cohorts.size() >= hard_limit || (!dep_delimit && grammar->delimiters && doesSetMatchCohortNormal(*cCohort, grammar->delimiters->number)))) { @@ -146,7 +146,7 @@ void PlaintextApplicator::runGrammarOnText(std::istream& input, std::ostream& ou lCohort = cCohort; cSWindow = nullptr; cCohort = nullptr; - numCohorts++; + ++numCohorts; did_soft_lookback = false; } if (!cSWindow) { @@ -157,7 +157,7 @@ void PlaintextApplicator::runGrammarOnText(std::istream& input, std::ostream& ou lSWindow = cSWindow; lCohort = cSWindow->cohorts[0]; cCohort = nullptr; - numWindows++; + ++numWindows; did_soft_lookback = false; } if (gWindow->next.size() > num_windows) { @@ -227,7 +227,7 @@ void PlaintextApplicator::runGrammarOnText(std::istream& input, std::ostream& ou tag.append(u">\""); cCohort->wordform = addTag(tag); lCohort = cCohort; - numCohorts++; + ++numCohorts; cReading = initEmptyCohort(*cCohort); cReading->noprint = !add_tags; if (add_tags) { diff --git a/src/TextualParser.cpp b/src/TextualParser.cpp index 8c7c6cad..96adfadc 100644 --- a/src/TextualParser.cpp +++ b/src/TextualParser.cpp @@ -2678,25 +2678,32 @@ void TextualParser::parseFromUChar(UChar* input, const char* fname) { grammar_size = static_cast(_stat.st_size); } - UFILE* grammar = u_fopen(abspath.data(), "rb", nullptr, 
nullptr); - if (!grammar) { - u_fprintf(ux_stderr, "%s: Error: Error opening %s for reading!\n", filebase, abspath.data()); - CG3Quit(1); - } - UChar32 bom = u_fgetcx(grammar); - if (bom != 0xfeff && bom != static_cast(0xffffffff)) { - u_fungetc(bom, grammar); + std::string buf; + buf.resize(grammar_size); + { + std::ifstream grammar(abspath.data(), std::ios::binary); + if (!grammar) { + u_fprintf(ux_stderr, "%s: Error: Error opening %s for reading!\n", filebase, abspath.data()); + CG3Quit(1); + } + if (!grammar.read(&buf[0], grammar_size)) { + u_fprintf(ux_stderr, "%s: Error: Error reading %s!\n", filebase, abspath.data()); + CG3Quit(1); + } + if (buf[0] == '\xEF' && buf[1] == '\xBB' && buf[2] == '\xBF') { + buf.erase(0, 3); + } } grammarbufs.emplace_back(new UString(grammar_size * 2, 0)); auto& data = *grammarbufs.back().get(); - uint32_t read = u_file_read(&data[4], SI32(grammar_size * 2), grammar); - u_fclose(grammar); - if (read >= grammar_size * 2 - 1) { + int32_t size = 0; + u_strFromUTF8(&data[4], SI32(grammar_size * 2), &size, buf.data(), SI32(buf.size()), &err); + if (size >= SI32(grammar_size * 2 - 1)) { u_fprintf(ux_stderr, "%s: Error: Converting from underlying codepage to UTF-16 exceeded factor 2 buffer.\n", filebase); CG3Quit(1); } - data.resize(read + 4 + 1); + data.resize(size + 4 + 1); uint32_t olines = 0; swapper oswap(true, olines, result->lines); @@ -2877,26 +2884,34 @@ int TextualParser::parse_grammar(const char* fname) { result->grammar_size = static_cast(_stat.st_size); } - UFILE* grammar = u_fopen(filename, "rb", nullptr, nullptr); - if (!grammar) { - u_fprintf(ux_stderr, "%s: Error: Error opening %s for reading!\n", filebase, filename); - CG3Quit(1); - } - UChar32 bom = u_fgetcx(grammar); - if (bom != 0xfeff && bom != static_cast(0xffffffff)) { - u_fungetc(bom, grammar); + std::string buf; + buf.resize(result->grammar_size); + { + std::ifstream grammar(filename, std::ios::binary); + if (!grammar) { + u_fprintf(ux_stderr, "%s: Error: Error 
opening %s for reading!\n", filebase, filename); + CG3Quit(1); + } + if (!grammar.read(&buf[0], result->grammar_size)) { + u_fprintf(ux_stderr, "%s: Error: Error reading %s!\n", filebase, filename); + CG3Quit(1); + } + if (buf[0] == '\xEF' && buf[1] == '\xBB' && buf[2] == '\xBF') { + buf.erase(0, 3); + } } // It reads into the buffer at offset 4 because certain functions may look back, so we need some nulls in front. grammarbufs.emplace_back(new UString(result->grammar_size * 2, 0)); auto& data = *grammarbufs.back().get(); - uint32_t read = u_file_read(&data[4], SI32(result->grammar_size * 2), grammar); - u_fclose(grammar); - if (read >= result->grammar_size * 2 - 1) { + int32_t size = 0; + UErrorCode err = U_ZERO_ERROR; + u_strFromUTF8(&data[4], SI32(result->grammar_size * 2), &size, buf.data(), SI32(buf.size()), &err); + if (size >= SI32(result->grammar_size * 2 - 1)) { u_fprintf(ux_stderr, "%s: Error: Converting from underlying codepage to UTF-16 exceeded factor 2 buffer.\n", filebase); CG3Quit(1); } - data.resize(read + 4 + 1); + data.resize(size + 4 + 1); return parse_grammar(data); } @@ -2909,12 +2924,11 @@ int TextualParser::parse_grammar(const char* buffer, size_t length) { grammarbufs.emplace_back(new UString(length * 2, 0)); auto& data = *grammarbufs.back().get(); + int32_t size = 0; UErrorCode err = U_ZERO_ERROR; - UConverter* conv = ucnv_open("UTF-8", &err); - auto tmp = ucnv_toUChars(conv, &data[4], SI32(length * 2), buffer, SI32(length), &err); - - if (static_cast(tmp) >= length * 2 - 1) { - u_fprintf(ux_stderr, "%s: Error: Converting from underlying codepage to UTF-16 exceeded factor 2 buffer!\n", filebase); + u_strFromUTF8(&data[4], SI32(result->grammar_size * 2), &size, buffer, SI32(length), &err); + if (size >= SI32(result->grammar_size * 2 - 1)) { + u_fprintf(ux_stderr, "%s: Error: Converting from underlying codepage to UTF-16 exceeded factor 2 buffer.\n", filebase); CG3Quit(1); } @@ -3029,7 +3043,7 @@ int TextualParser::parse_grammar(UString& 
data) { // Create context sets for nested rules { constexpr UStringView grp[] = { STR_UU_C1, STR_UU_C2, STR_UU_C3, STR_UU_C4, STR_UU_C5, STR_UU_C6, STR_UU_C7, STR_UU_C8, STR_UU_C9 }; - for (size_t i = 0; i < 9; i++) { + for (size_t i = 0; i < 9; ++i) { Set* set_c = result->allocateSet(); set_c->line = 0; set_c->setName(grp[i]); diff --git a/src/Window.cpp b/src/Window.cpp index 0f6395a0..52177ce6 100644 --- a/src/Window.cpp +++ b/src/Window.cpp @@ -43,14 +43,14 @@ Window::~Window() { SingleWindow* Window::allocSingleWindow() { SingleWindow* swindow = alloc_swindow(this); - window_counter++; + ++window_counter; swindow->number = window_counter; return swindow; } SingleWindow* Window::allocPushSingleWindow() { SingleWindow* swindow = alloc_swindow(this); - window_counter++; + ++window_counter; swindow->number = window_counter; if (!next.empty()) { swindow->next = next.front(); @@ -66,7 +66,7 @@ SingleWindow* Window::allocPushSingleWindow() { SingleWindow* Window::allocAppendSingleWindow() { SingleWindow* swindow = alloc_swindow(this); - window_counter++; + ++window_counter; swindow->number = window_counter; if (!next.empty()) { swindow->previous = next.back(); diff --git a/src/cg-conv.cpp b/src/cg-conv.cpp index 719d29ad..7c9f593d 100644 --- a/src/cg-conv.cpp +++ b/src/cg-conv.cpp @@ -63,13 +63,13 @@ int main(int argc, char* argv[]) { fprintf(out, "Options:\n"); size_t longest = 0; - for (uint32_t i = 0; i < options_conv.size(); i++) { + for (uint32_t i = 0; i < options_conv.size(); ++i) { if (!options_conv[i].description.empty()) { size_t len = strlen(options_conv[i].longName); longest = std::max(longest, len); } } - for (uint32_t i = 0; i < options_conv.size(); i++) { + for (uint32_t i = 0; i < options_conv.size(); ++i) { if (!options_conv[i].description.empty() && options_conv[i].description[0] != '!') { fprintf(out, " "); if (options_conv[i].shortName) { diff --git a/src/cg-mwesplit.cpp b/src/cg-mwesplit.cpp index e13581ae..4ac8dc37 100644 --- 
a/src/cg-mwesplit.cpp +++ b/src/cg-mwesplit.cpp @@ -63,13 +63,13 @@ int main(int argc, char** argv) { fprintf(out, "Options:\n"); size_t longest = 0; - for (uint32_t i = 0; i < NUM_OPTIONS_MWE; i++) { + for (uint32_t i = 0; i < NUM_OPTIONS_MWE; ++i) { if (!options_mwe[i].description.empty()) { size_t len = strlen(options_mwe[i].longName); longest = std::max(longest, len); } } - for (uint32_t i = 0; i < NUM_OPTIONS_MWE; i++) { + for (uint32_t i = 0; i < NUM_OPTIONS_MWE; ++i) { if (!options_mwe[i].description.empty() && options_mwe[i].description[0] != '!') { fprintf(out, " "); if (options_mwe[i].shortName) { diff --git a/src/cg-proc.cpp b/src/cg-proc.cpp index f39cb20a..1f7e992b 100644 --- a/src/cg-proc.cpp +++ b/src/cg-proc.cpp @@ -327,7 +327,7 @@ int main(int argc, char* argv[]) { applicator->setGrammar(&grammar); applicator->setOptions(); - for (int32_t i = 1; i <= sections; i++) { + for (int32_t i = 1; i <= sections; ++i) { applicator->sections.push_back(i); } diff --git a/src/inlines.hpp b/src/inlines.hpp index 7db76b32..48c53f91 100644 --- a/src/inlines.hpp +++ b/src/inlines.hpp @@ -305,7 +305,7 @@ template inline bool ISESC(const Char* p) { uint32_t a = 1; while (*(p - a) == '\\') { - a++; + ++a; } return (a % 2 == 0); } diff --git a/src/main.cpp b/src/main.cpp index 744488aa..669ef612 100644 --- a/src/main.cpp +++ b/src/main.cpp @@ -86,13 +86,13 @@ int main(int argc, char* argv[]) { fprintf(out, "Options:\n"); size_t longest = 0; - for (uint32_t i = 0; i < options.size(); i++) { + for (uint32_t i = 0; i < options.size(); ++i) { if (!options[i].description.empty()) { size_t len = strlen(options[i].longName); longest = std::max(longest, len); } } - for (uint32_t i = 0; i < options.size(); i++) { + for (uint32_t i = 0; i < options.size(); ++i) { if (!options[i].description.empty()) { fprintf(out, " "); if (options[i].shortName) { diff --git a/src/parser_helpers.hpp b/src/parser_helpers.hpp index f5975af9..825a11e3 100644 --- a/src/parser_helpers.hpp +++ 
b/src/parser_helpers.hpp @@ -54,7 +54,7 @@ Tag* parseTag(const UChar* to, const UChar* p, State& state, bool unescape=true) const UChar* tmp = to; while (tmp[0] && tmp[0] == '^') { tag->type |= T_FAILFAST; - tmp++; + ++tmp; } size_t length = u_strlen(tmp); diff --git a/src/uextras.hpp b/src/uextras.hpp index 0d77c15e..3062c74e 100644 --- a/src/uextras.hpp +++ b/src/uextras.hpp @@ -150,7 +150,7 @@ inline int ux_isSetOp(const UChar* it) { inline bool ux_isEmpty(const UChar* text) { size_t length = u_strlen(text); if (length > 0) { - for (size_t i = 0; i < length; i++) { + for (size_t i = 0; i < length; ++i) { if (!ISSPACE(text[i])) { return false; } From 9ddc564947bcba9ab4da605136a4f797312c0787 Mon Sep 17 00:00:00 2001 From: Tino Didriksen Date: Fri, 29 Aug 2025 13:22:25 +0200 Subject: [PATCH 32/42] Move text belonging to removed cohorts to prior not-removed cohorts, or the containing window (63/69) --- src/BinaryApplicator.cpp | 18 ++++++++++++++++++ src/FormatConverter.hpp | 3 --- src/GrammarApplicator.hpp | 4 ++++ src/GrammarApplicator_runGrammar.cpp | 6 ++++++ 4 files changed, 28 insertions(+), 3 deletions(-) diff --git a/src/BinaryApplicator.cpp b/src/BinaryApplicator.cpp index 2cb46a04..886540a8 100644 --- a/src/BinaryApplicator.cpp +++ b/src/BinaryApplicator.cpp @@ -387,6 +387,24 @@ void BinaryApplicator::printSingleWindow(SingleWindow* window, std::ostream& out } } + // Move text belonging to removed cohorts to prior not-removed cohorts, or the containing window + for (size_t i = 0; i < window->all_cohorts.size(); ++i) { + auto cohort = window->all_cohorts[i]; + if (cohort->local_number == 0 || (cohort->type & CT_REMOVED)) { + if (!cohort->text.empty()) { + for (size_t j = i; j > 0; --j) { + if (window->all_cohorts[j - 1]->local_number == 0 || (window->all_cohorts[j - 1]->type & CT_REMOVED)) { + continue; + } + window->all_cohorts[j-1]->text += cohort->text; + cohort->text.clear(); + } + window->text += cohort->text; + cohort->text.clear(); + } + } + } + 
std::string cohort_buffer; uint16_t cohort_count = 0; for (auto& cohort : window->all_cohorts) { diff --git a/src/FormatConverter.hpp b/src/FormatConverter.hpp index 16f6cff6..a9cb1449 100644 --- a/src/FormatConverter.hpp +++ b/src/FormatConverter.hpp @@ -29,7 +29,6 @@ #include "NicelineApplicator.hpp" #include "PlaintextApplicator.hpp" #include "Grammar.hpp" -#include "cg3.h" namespace CG3 { @@ -42,8 +41,6 @@ class FormatConverter : public ApertiumApplicator, public BinaryApplicator, publ void runGrammarOnText(std::istream& input, std::ostream& output); std::unique_ptr detectFormat(std::istream& in); - cg3_sformat fmt_input = CG3SF_CG; - cg3_sformat fmt_output = CG3SF_CG; Grammar conv_grammar; diff --git a/src/GrammarApplicator.hpp b/src/GrammarApplicator.hpp index 8f03d5bb..c10b6721 100644 --- a/src/GrammarApplicator.hpp +++ b/src/GrammarApplicator.hpp @@ -31,6 +31,7 @@ #include "interval_vector.hpp" #include "flat_unordered_set.hpp" #include "scoped_stack.hpp" +#include "cg3.h" #include class Process; @@ -126,6 +127,9 @@ class GrammarApplicator { bool add_spacing = true; bool print_ids = false; + cg3_sformat fmt_input = CG3SF_CG; + cg3_sformat fmt_output = CG3SF_CG; + bool dep_has_spanned = false; uint32_t dep_delimit = 0; bool dep_absolute = false; diff --git a/src/GrammarApplicator_runGrammar.cpp b/src/GrammarApplicator_runGrammar.cpp index 033fc156..81ace7fa 100644 --- a/src/GrammarApplicator_runGrammar.cpp +++ b/src/GrammarApplicator_runGrammar.cpp @@ -145,6 +145,12 @@ void GrammarApplicator::runGrammarOnText(std::istream& input, std::ostream& outp ux_stripBOM(input); + if (fmt_output == CG3SF_BINARY) { + cSWindow = gWindow->allocAppendSingleWindow(); + initEmptySingleWindow(cSWindow); + lSWindow = cSWindow; + } + while (!input.eof()) { ++lines; auto packoff = get_line_clean(line, cleaned, input); From 7a3bdb47d7884e6a7ed828c3db7fd6790a462e02 Mon Sep 17 00:00:00 2001 From: Tino Didriksen Date: Fri, 29 Aug 2025 13:27:20 +0200 Subject: [PATCH 33/42] 
Baseform/wordform type isn't enough to exclude (64/69) --- src/BinaryApplicator.cpp | 8 +++++--- 1 file changed, 5 insertions(+), 3 deletions(-) diff --git a/src/BinaryApplicator.cpp b/src/BinaryApplicator.cpp index 886540a8..f9de5528 100644 --- a/src/BinaryApplicator.cpp +++ b/src/BinaryApplicator.cpp @@ -229,8 +229,7 @@ bool BinaryApplicator::readWindow() { addTagToReading(*cCohort->wread, cCohort->wordform); for (uint16_t tn = 0; tn < tag_count; ++tn) { READ_U16_INTO(tag); - addTagToReading(*cCohort->wread, window_tags[tag], - (tn + 1 == tag_count)); + addTagToReading(*cCohort->wread, window_tags[tag], (tn + 1 == tag_count)); } } @@ -496,7 +495,10 @@ void BinaryApplicator::printSingleWindow(SingleWindow* window, std::ostream& out uint32SortedVector unique; for (auto& tter : reading->tags_list) { auto tag = grammar->single_tags[tter]; - if (tag->type & (T_WORDFORM | T_BASEFORM | T_DEPENDENCY | T_RELATION)) { + if (tter == reading->baseform || tter == reading->parent->wordform->hash) { + continue; + } + if (tag->type & (T_DEPENDENCY | T_RELATION)) { continue; } if (unique_tags) { From 6c82135e18f220ecd46e76a211b05b796f3e69e3 Mon Sep 17 00:00:00 2001 From: Tino Didriksen Date: Fri, 29 Aug 2025 13:47:00 +0200 Subject: [PATCH 34/42] Create window for trailing vars (65/69) --- src/GrammarApplicator_runGrammar.cpp | 24 +++++++++++++++++------- 1 file changed, 17 insertions(+), 7 deletions(-) diff --git a/src/GrammarApplicator_runGrammar.cpp b/src/GrammarApplicator_runGrammar.cpp index 81ace7fa..3171fb07 100644 --- a/src/GrammarApplicator_runGrammar.cpp +++ b/src/GrammarApplicator_runGrammar.cpp @@ -145,6 +145,15 @@ void GrammarApplicator::runGrammarOnText(std::istream& input, std::ostream& outp ux_stripBOM(input); + auto adopt_variables = [&]() { + cSWindow->variables_set.insert(variables_set.begin(), variables_set.end()); + variables_set.clear(); + cSWindow->variables_rem.insert(variables_rem.begin(), variables_rem.end()); + variables_rem.clear(); + 
cSWindow->variables_output.insert(variables_output.begin(), variables_output.end()); + variables_output.clear(); + }; + if (fmt_output == CG3SF_BINARY) { cSWindow = gWindow->allocAppendSingleWindow(); initEmptySingleWindow(cSWindow); @@ -244,13 +253,6 @@ void GrammarApplicator::runGrammarOnText(std::istream& input, std::ostream& outp cSWindow = gWindow->allocAppendSingleWindow(); initEmptySingleWindow(cSWindow); - cSWindow->variables_set = variables_set; - variables_set.clear(); - cSWindow->variables_rem = variables_rem; - variables_rem.clear(); - cSWindow->variables_output = variables_output; - variables_output.clear(); - lSWindow = cSWindow; cCohort = nullptr; ++numWindows; @@ -271,6 +273,9 @@ void GrammarApplicator::runGrammarOnText(std::istream& input, std::ostream& outp u_fflush(ux_stderr); } } + if (cSWindow->all_cohorts.size() == 1) { + adopt_variables(); + } cCohort = alloc_cohort(cSWindow); cCohort->global_number = gWindow->cohort_counter++; cCohort->wordform = addTag(&cleaned[0]); @@ -685,6 +690,11 @@ void GrammarApplicator::runGrammarOnText(std::istream& input, std::ostream& outp cCohort = nullptr; cSWindow = nullptr; } + if (fmt_output == CG3SF_BINARY && !variables_output.empty()) { + cSWindow = gWindow->allocAppendSingleWindow(); + initEmptySingleWindow(cSWindow); + adopt_variables(); + } while (!gWindow->next.empty()) { gWindow->shuffleWindowsDown(); runGrammarOnWindow(); From 86a856cda1aeb4e4fc3f71bee332309eadc3612e Mon Sep 17 00:00:00 2001 From: Tino Didriksen Date: Fri, 29 Aug 2025 17:01:11 +0200 Subject: [PATCH 35/42] 3 distinct packet types (67/69) --- src/BinaryApplicator.cpp | 1031 ++++++++++++++------------ src/BinaryApplicator.hpp | 41 +- src/FormatConverter.cpp | 8 +- src/GrammarApplicator.cpp | 18 +- src/GrammarApplicator_runGrammar.cpp | 24 +- src/JsonlApplicator.cpp | 2 +- src/inlines.hpp | 140 ++-- test/runall.pl | 1 - 8 files changed, 715 insertions(+), 550 deletions(-) diff --git a/src/BinaryApplicator.cpp b/src/BinaryApplicator.cpp 
index f9de5528..ac0f19ed 100644 --- a/src/BinaryApplicator.cpp +++ b/src/BinaryApplicator.cpp @@ -29,524 +29,609 @@ BinaryApplicator::BinaryApplicator(std::ostream& ux_err) } void BinaryApplicator::runGrammarOnText(std::istream& input, std::ostream& output) { - ux_stdin = &input; - ux_stdout = &output; - - if (!input.good()) { - u_fprintf(ux_stderr, "Error: Input is null - nothing to parse!\n"); - CG3Quit(1); - } - if (input.eof()) { - u_fprintf(ux_stderr, "Error: Input is empty - nothing to parse!\n"); - CG3Quit(1); - } - if (!output) { - u_fprintf(ux_stderr, "Error: Output is null - cannot write to nothing!\n"); - CG3Quit(1); - } - - if (!grammar) { - u_fprintf(ux_stderr, "Error: No grammar provided - cannot continue! Hint: call setGrammar() first.\n"); - CG3Quit(1); - } - - { - std::string header(8, 0); - if (!input.read(&header[0], 8)) { - u_fprintf(ux_stderr, "Error: Could not read stream header!\n"); - CG3Quit(1); - } - if (!is_cg3bsf(header)) { - u_fprintf(ux_stderr, "Error: Stream does not start with magic bytes - cannot read as binary!\n"); - CG3Quit(1); - } - uint32_t version = reinterpret_cast(&header[4])[0]; - if (version != CG3_BINARY_STREAM) { - u_fprintf(ux_stderr, "Error: Stream is version %u but this reader only knows version %u!\n", version, CG3_BINARY_STREAM); - CG3Quit(1); - } - } - - index(); - - uint32_t resetAfter = ((num_windows + 4) * 2 + 1); - bool flushAfter = false; - - gWindow->window_span = num_windows; - - auto flush = [&]() { - if (gWindow->back()) { - gWindow->back()->flush_after = true; - } - - while (!gWindow->next.empty()) { - gWindow->shuffleWindowsDown(); - runGrammarOnWindow(); - } - - gWindow->shuffleWindowsDown(); - while (!gWindow->previous.empty()) { - SingleWindow* tmp = gWindow->previous.front(); - printSingleWindow(tmp, output); - free_swindow(tmp); - gWindow->previous.erase(gWindow->previous.begin()); - } - flushAfter = false; - }; - - while (!input.eof()) { - flushAfter = readWindow(); - ++numWindows; - if 
(gWindow->next.size() > num_windows) { - gWindow->shuffleWindowsDown(); - runGrammarOnWindow(); - if (numWindows % resetAfter == 0) { - resetIndexes(); - } - } - if (flushAfter) { - flush(); - } - } - flush(); -} + ux_stdin = &input; + ux_stdout = &output; -#define READ_U16_INTO(dest) \ - do { \ - (dest) = reinterpret_cast(&buf[pos])[0]; \ - pos += 2; \ - } while (false) - -#define READ_U32_INTO(dest) \ - do { \ - (dest) = reinterpret_cast(&buf[pos])[0]; \ - pos += 4; \ - } while (false) - -#define READ_STR_INTO(dest) \ - do { \ - uint16_t tl = reinterpret_cast(&buf[pos])[0]; \ - pos += 2; \ - (dest).clear(); \ - (dest).resize(tl, 0); \ - int32_t olen = 0; \ - UErrorCode status = U_ZERO_ERROR; \ - u_strFromUTF8(&(dest)[0], tl, &olen, &buf[pos], tl, &status); \ - (dest).resize(olen); \ - pos += tl; \ - } while (false) - -bool BinaryApplicator::readWindow() { - uint32_t cs = 0; - readRaw(*ux_stdin, cs); - - if (ux_stdin->eof()) { - return true; - } - - SingleWindow* cSWindow = gWindow->allocAppendSingleWindow(); - initEmptySingleWindow(cSWindow); - - std::string buf(cs, 0); - ux_stdin->read(&buf[0], cs); - uint32_t pos = 0; - - // TODO: flags - uint16_t flags; - READ_U16_INTO(flags); - if (flags & BFW_FLUSH) { - cSWindow->flush_after = true; - } - if (flags & BFW_DEP_SPAN) { - dep_has_spanned = true; - } - - TagVector window_tags; - uint16_t tag_count; - READ_U16_INTO(tag_count); - window_tags.reserve(tag_count); - for (uint16_t i = 0; i < tag_count; ++i) { - UString tg; - READ_STR_INTO(tg); - window_tags.push_back(addTag(tg)); - if (tg[0] == grammar->mapping_prefix) { - window_tags.back()->type |= T_MAPPING; + if (!input.good()) { + u_fprintf(ux_stderr, "Error: Input is null - nothing to parse!\n"); + CG3Quit(1); } - else { - window_tags.back()->type &= ~T_MAPPING; + if (input.eof()) { + u_fprintf(ux_stderr, "Error: Input is empty - nothing to parse!\n"); + CG3Quit(1); } - } - - uint16_t var_count; - READ_U16_INTO(var_count); - for (uint16_t vn = 0; vn < var_count; 
++vn) { - char mode = buf[pos]; - ++pos; - uint16_t tag1, tag2; - READ_U16_INTO(tag1); - READ_U16_INTO(tag2); - auto hash1 = window_tags[tag1]->hash; - if (mode == BFV_SETVAR) { - cSWindow->variables_set[hash1] = window_tags[tag2]->hash; - cSWindow->variables_rem.erase(hash1); - cSWindow->variables_output.insert(hash1); - } - else if (mode == BFV_SETVAR_ANY) { - cSWindow->variables_set[hash1] = grammar->tag_any; - cSWindow->variables_rem.erase(hash1); - cSWindow->variables_output.insert(hash1); - } - else if (mode == BFV_REMVAR) { - cSWindow->variables_set.erase(hash1); - cSWindow->variables_rem.insert(hash1); - cSWindow->variables_output.insert(hash1); - } - } - - READ_STR_INTO(cSWindow->text); - READ_STR_INTO(cSWindow->text_post); - - uint16_t cohort_count; - READ_U16_INTO(cohort_count); - uint16_t tag; - for (uint16_t cn = 0; cn < cohort_count; ++cn) { - Cohort* cCohort = alloc_cohort(cSWindow); - cCohort->global_number = gWindow->cohort_counter++; - ++numCohorts; - - READ_U16_INTO(flags); - if (flags & BFC_RELATED) { - cCohort->type |= CT_RELATED; - has_relations = true; + if (!output) { + u_fprintf(ux_stderr, "Error: Output is null - cannot write to nothing!\n"); + CG3Quit(1); } - READ_U16_INTO(tag); - cCohort->wordform = window_tags[tag]; + if (!grammar) { + u_fprintf(ux_stderr, "Error: No grammar provided - cannot continue! 
Hint: call setGrammar() first.\n"); + CG3Quit(1); + } - READ_U16_INTO(tag_count); - if (tag_count) { - cCohort->wread = alloc_reading(cCohort); - addTagToReading(*cCohort->wread, cCohort->wordform); - for (uint16_t tn = 0; tn < tag_count; ++tn) { - READ_U16_INTO(tag); - addTagToReading(*cCohort->wread, window_tags[tag], (tn + 1 == tag_count)); + { + std::string header(8, 0); + if (!input.read(&header[0], 8)) { + u_fprintf(ux_stderr, "Error: Could not read stream header!\n"); + CG3Quit(1); + } + if (!is_cg3bsf(header)) { + u_fprintf(ux_stderr, "Error: Stream does not start with magic bytes - cannot read as binary!\n"); + CG3Quit(1); + } + uint32_t version = reinterpret_cast(&header[4])[0]; + if (version != CG3_BINARY_STREAM) { + u_fprintf(ux_stderr, "Error: Stream is version %u but this reader only knows version %u!\n", version, CG3_BINARY_STREAM); + CG3Quit(1); } - } + } + + index(); + + uint32_t resetAfter = ((num_windows + 4) * 2 + 1); - READ_U32_INTO(cCohort->dep_self); - READ_U32_INTO(cCohort->dep_parent); - gWindow->relation_map[cCohort->dep_self] = cCohort->global_number; + gWindow->window_span = num_windows; + + auto flush = [&](bool flush_after = false) { + auto backSWindow = gWindow->back(); + if (backSWindow) { + backSWindow->flush_after = flush_after; + } - if (cCohort->dep_parent != DEP_NO_PARENT) { - has_dep = true; + while (!gWindow->next.empty()) { + gWindow->shuffleWindowsDown(); + runGrammarOnWindow(); + } + + gWindow->shuffleWindowsDown(); + while (!gWindow->previous.empty()) { + SingleWindow* tmp = gWindow->previous.front(); + printSingleWindow(tmp, output); + free_swindow(tmp); + gWindow->previous.erase(gWindow->previous.begin()); + } + + return backSWindow; + }; + + while (!input.eof()) { + auto packet = readPacket(); + if (packet.type == BFP_WINDOW) { + //auto cSWindow = static_cast(packet.payload); + ++numWindows; + if (gWindow->next.size() > num_windows) { + gWindow->shuffleWindowsDown(); + runGrammarOnWindow(); + if (numWindows % resetAfter 
== 0) { + resetIndexes(); + } + } + } + else if (packet.type == BFP_COMMAND) { + auto cmd = static_cast(reinterpret_cast(packet.payload)); + if (cmd == BFC_FLUSH) { + if (!flush(true)) { + printStreamCommand(STR_CMD_FLUSH, *ux_stdout); + } + } + else if (cmd == BFC_EXIT) { + printStreamCommand(STR_CMD_EXIT, *ux_stdout); + return; + } + else if (cmd == BFC_IGNORE) { + printStreamCommand(STR_CMD_IGNORE, *ux_stdout); + } + else if (cmd == BFC_RESUME) { + printStreamCommand(STR_CMD_RESUME, *ux_stdout); + } + } + else if (packet.type == BFP_TEXT) { + auto& text = *static_cast(packet.payload); + printPlainTextLine(text, *ux_stdout); + } } + flush(false); +} - uint16_t rel_count; - READ_U16_INTO(rel_count); - for (uint16_t rn = 0; rn < rel_count; ++rn) { - READ_U16_INTO(tag); - uint32_t head; - READ_U32_INTO(head); - cCohort->relations_input[window_tags[tag]->hash].insert(head); +#define READ_U16_INTO(dest) \ + do { \ + (dest) = reinterpret_cast(&buf[pos])[0]; \ + pos += 2; \ + } while (false) + +#define READ_U32_INTO(dest) \ + do { \ + (dest) = reinterpret_cast(&buf[pos])[0]; \ + pos += 4; \ + } while (false) + +#define READ_STR_INTO(dest) \ + do { \ + uint16_t tl = reinterpret_cast(&buf[pos])[0]; \ + pos += 2; \ + (dest).clear(); \ + (dest).resize(tl, 0); \ + int32_t olen = 0; \ + UErrorCode status = U_ZERO_ERROR; \ + u_strFromUTF8(&(dest)[0], tl, &olen, &buf[pos], tl, &status); \ + (dest).resize(olen); \ + pos += tl; \ + } while (false) + +BinaryPacket BinaryApplicator::readPacket() { + BinaryPacket packet; + readLE(*ux_stdin, packet.type); + if (packet.type == BFP_WINDOW) { + readWindow(packet.payload); } - if (rel_count) { - has_relations = true; - gWindow->relation_map[cCohort->dep_self] = cCohort->global_number; - cCohort->type |= CT_RELATED; + else if (packet.type == BFP_COMMAND) { + readCommand(packet.payload); + } + if (packet.type == BFP_TEXT) { + readText(packet.payload); + } + return packet; +} + +void BinaryApplicator::readWindow(void*& payload) { + uint32_t 
cs = 0; + readLE(*ux_stdin, cs); + + if (ux_stdin->eof()) { + payload = nullptr; + return; + } + + SingleWindow* cSWindow = gWindow->allocAppendSingleWindow(); + initEmptySingleWindow(cSWindow); + + std::string buf(cs, 0); + ux_stdin->read(&buf[0], cs); + uint32_t pos = 0; + + // TODO: flags + uint16_t flags; + READ_U16_INTO(flags); + if (flags & BFW_DEP_SPAN) { + dep_has_spanned = true; + } + + TagVector window_tags; + uint16_t tag_count; + READ_U16_INTO(tag_count); + window_tags.reserve(tag_count); + for (uint16_t i = 0; i < tag_count; ++i) { + UString tg; + READ_STR_INTO(tg); + window_tags.push_back(addTag(tg)); + if (tg[0] == grammar->mapping_prefix) { + window_tags.back()->type |= T_MAPPING; + } + else { + window_tags.back()->type &= ~T_MAPPING; + } + } + + uint16_t var_count; + READ_U16_INTO(var_count); + for (uint16_t vn = 0; vn < var_count; ++vn) { + char mode = buf[pos]; + ++pos; + uint16_t tag1, tag2; + READ_U16_INTO(tag1); + READ_U16_INTO(tag2); + auto hash1 = window_tags[tag1]->hash; + if (mode == BFV_SETVAR) { + cSWindow->variables_set[hash1] = window_tags[tag2]->hash; + cSWindow->variables_rem.erase(hash1); + cSWindow->variables_output.insert(hash1); + } + else if (mode == BFV_SETVAR_ANY) { + cSWindow->variables_set[hash1] = grammar->tag_any; + cSWindow->variables_rem.erase(hash1); + cSWindow->variables_output.insert(hash1); + } + else if (mode == BFV_REMVAR) { + cSWindow->variables_set.erase(hash1); + cSWindow->variables_rem.insert(hash1); + cSWindow->variables_output.insert(hash1); + } } - READ_STR_INTO(cCohort->text); - READ_STR_INTO(cCohort->wblank); - - uint16_t reading_count; - READ_U16_INTO(reading_count); - if (!reading_count) initEmptyCohort(*cCohort); - Reading* prev = nullptr; - for (uint16_t rn = 0; rn < reading_count; ++rn) { - Reading* cReading = alloc_reading(cCohort); - addTagToReading(*cReading, cCohort->wordform); - - READ_U16_INTO(flags); - - READ_U16_INTO(tag); - addTagToReading(*cReading, window_tags[tag]); - - 
READ_U16_INTO(tag_count); - TagList mappings; - for (uint16_t tn = 0; tn < tag_count; ++tn) { - READ_U16_INTO(tag); - if (window_tags[tag]->type & T_MAPPING) { - mappings.push_back(window_tags[tag]); - } - else { - addTagToReading(*cReading, window_tags[tag]); - } - } - if (!mappings.empty()) { - splitMappings(mappings, *cCohort, *cReading, true); - } - - if (prev && (flags & BFR_SUBREADING)) { - prev->next = cReading; - } - else if (flags & BFR_DELETED) { - cCohort->deleted.push_back(cReading); - } - else { - cCohort->appendReading(cReading); - } - prev = cReading; - ++numReadings; - } - - if (cn+1 == cohort_count) { - for (auto iter : cCohort->readings) { - if (iter->tags.find(endtag) == iter->tags.end()) { - addTagToReading(*iter, endtag); + READ_STR_INTO(cSWindow->text); + READ_STR_INTO(cSWindow->text_post); + + uint16_t cohort_count; + READ_U16_INTO(cohort_count); + uint16_t tag; + for (uint16_t cn = 0; cn < cohort_count; ++cn) { + Cohort* cCohort = alloc_cohort(cSWindow); + cCohort->global_number = gWindow->cohort_counter++; + ++numCohorts; + + READ_U16_INTO(flags); + if (flags & BFC_RELATED) { + cCohort->type |= CT_RELATED; + has_relations = true; + } + + READ_U16_INTO(tag); + cCohort->wordform = window_tags[tag]; + + READ_U16_INTO(tag_count); + if (tag_count) { + cCohort->wread = alloc_reading(cCohort); + addTagToReading(*cCohort->wread, cCohort->wordform); + for (uint16_t tn = 0; tn < tag_count; ++tn) { + READ_U16_INTO(tag); + addTagToReading(*cCohort->wread, window_tags[tag], (tn + 1 == tag_count)); + } + } + + READ_U32_INTO(cCohort->dep_self); + READ_U32_INTO(cCohort->dep_parent); + gWindow->relation_map[cCohort->dep_self] = cCohort->global_number; + + if (cCohort->dep_parent != DEP_NO_PARENT) { + has_dep = true; + } + + uint16_t rel_count; + READ_U16_INTO(rel_count); + for (uint16_t rn = 0; rn < rel_count; ++rn) { + READ_U16_INTO(tag); + uint32_t head; + READ_U32_INTO(head); + cCohort->relations_input[window_tags[tag]->hash].insert(head); + } + if 
(rel_count) { + has_relations = true; + gWindow->relation_map[cCohort->dep_self] = cCohort->global_number; + cCohort->type |= CT_RELATED; + } + + READ_STR_INTO(cCohort->text); + READ_STR_INTO(cCohort->wblank); + + uint16_t reading_count; + READ_U16_INTO(reading_count); + if (!reading_count) + initEmptyCohort(*cCohort); + Reading* prev = nullptr; + for (uint16_t rn = 0; rn < reading_count; ++rn) { + Reading* cReading = alloc_reading(cCohort); + addTagToReading(*cReading, cCohort->wordform); + + READ_U16_INTO(flags); + + READ_U16_INTO(tag); + addTagToReading(*cReading, window_tags[tag]); + + READ_U16_INTO(tag_count); + TagList mappings; + for (uint16_t tn = 0; tn < tag_count; ++tn) { + READ_U16_INTO(tag); + if (window_tags[tag]->type & T_MAPPING) { + mappings.push_back(window_tags[tag]); + } + else { + addTagToReading(*cReading, window_tags[tag]); + } + } + if (!mappings.empty()) { + splitMappings(mappings, *cCohort, *cReading, true); + } + + if (prev && (flags & BFR_SUBREADING)) { + prev->next = cReading; + } + else if (flags & BFR_DELETED) { + cCohort->deleted.push_back(cReading); + } + else { + cCohort->appendReading(cReading); + } + prev = cReading; + ++numReadings; + } + + if (cn + 1 == cohort_count) { + for (auto iter : cCohort->readings) { + if (iter->tags.find(endtag) == iter->tags.end()) { + addTagToReading(*iter, endtag); + } } } + + insert_if_exists(cCohort->possible_sets, grammar->sets_any); + cSWindow->appendCohort(cCohort); } - insert_if_exists(cCohort->possible_sets, grammar->sets_any); - cSWindow->appendCohort(cCohort); - } + payload = cSWindow; +} + +void BinaryApplicator::readCommand(void*& payload) { + auto cmd = readLE(*ux_stdin); + payload = reinterpret_cast(static_cast(cmd)); +} - return cSWindow->flush_after; +void BinaryApplicator::readText(void*& payload) { + readUTF8_LE(*ux_stdin, text); + payload = &text; } -#define WRITE_U16_INTO(n, buffer) \ - do { \ - std::string tmp(2, 0); \ - auto tmp_n = static_cast(n); \ - 
tmp.assign(reinterpret_cast(&tmp_n), 2); \ - (buffer) += tmp; \ - } while (false) - -#define WRITE_U32_INTO(n, buffer) \ - do { \ - std::string tmp(4, 0); \ - auto tmp_n = static_cast(n); \ - tmp.assign(reinterpret_cast(&tmp_n), 4); \ - (buffer) += tmp; \ - } while (false) - -#define WRITE_TAG_INTO(tag, buffer) \ - do { \ - if (tag_index.find((tag)) == tag_index.end()) { \ - tag_index[(tag)] = UI32(tags_to_write.size()); \ - tags_to_write.push_back((tag)); \ - } \ - WRITE_U16_INTO(tag_index[(tag)], buffer); \ - } while (false) - -#define WRITE_STR_INTO(s, buffer) \ - do { \ - std::string tmp((s).size() * 4, 0); \ - int32_t olen = 0; \ - UErrorCode status = U_ZERO_ERROR; \ - u_strToUTF8(&tmp[0], SI32((s).size() * 4 - 1), &olen, (s).data(), SI32((s).size()), &status); \ - tmp.resize(olen); \ - WRITE_U16_INTO(UI16(olen), (buffer)); \ - (buffer) += tmp; \ - } while (false) +#define WRITE_U16_INTO(n, buffer) \ + do { \ + std::string tmp(2, 0); \ + auto tmp_n = static_cast(n); \ + tmp.assign(reinterpret_cast(&tmp_n), 2); \ + (buffer) += tmp; \ + } while (false) + +#define WRITE_U32_INTO(n, buffer) \ + do { \ + std::string tmp(4, 0); \ + auto tmp_n = static_cast(n); \ + tmp.assign(reinterpret_cast(&tmp_n), 4); \ + (buffer) += tmp; \ + } while (false) + +#define WRITE_TAG_INTO(tag, buffer) \ + do { \ + if (tag_index.find((tag)) == tag_index.end()) { \ + tag_index[(tag)] = UI32(tags_to_write.size()); \ + tags_to_write.push_back((tag)); \ + } \ + WRITE_U16_INTO(tag_index[(tag)], buffer); \ + } while (false) + +#define WRITE_STR_INTO(s, buffer) \ + do { \ + std::string tmp((s).size() * 4, 0); \ + int32_t olen = 0; \ + UErrorCode status = U_ZERO_ERROR; \ + u_strToUTF8(&tmp[0], SI32((s).size() * 4 - 1), &olen, (s).data(), SI32((s).size()), &status); \ + tmp.resize(olen); \ + WRITE_U16_INTO(UI16(olen), (buffer)); \ + (buffer) += tmp; \ + } while (false) void BinaryApplicator::printSingleWindow(SingleWindow* window, std::ostream& output, bool profiling) { - if (window->number == 
1) { - output.write("CGBF", 4); - std::string version; - WRITE_U32_INTO(CG3_BINARY_STREAM, version); - output.write(version.data(), 4); - } - - TagVector tags_to_write; - std::map tag_index; - - uint16_t var_count = 0; - std::string var_buffer; - for (auto var : window->variables_output) { - ++var_count; - Tag* key = grammar->single_tags[var]; - auto iter = window->variables_set.find(var); - if (iter != window->variables_set.end()) { - if (iter->second != grammar->tag_any) { - var_buffer += static_cast(BFV_SETVAR); - WRITE_TAG_INTO(key, var_buffer); - WRITE_TAG_INTO(grammar->single_tags[iter->second], var_buffer); - } - else { - var_buffer += static_cast(BFV_SETVAR_ANY); - WRITE_TAG_INTO(key, var_buffer); - WRITE_U16_INTO(0, var_buffer); - } - } - else { - var_buffer += static_cast(BFV_REMVAR); - WRITE_TAG_INTO(key, var_buffer); - WRITE_U16_INTO(0, var_buffer); - } - } - - // Move text belonging to removed cohorts to prior not-removed cohorts, or the containing window - for (size_t i = 0; i < window->all_cohorts.size(); ++i) { - auto cohort = window->all_cohorts[i]; - if (cohort->local_number == 0 || (cohort->type & CT_REMOVED)) { - if (!cohort->text.empty()) { - for (size_t j = i; j > 0; --j) { - if (window->all_cohorts[j - 1]->local_number == 0 || (window->all_cohorts[j - 1]->type & CT_REMOVED)) { - continue; - } - window->all_cohorts[j-1]->text += cohort->text; - cohort->text.clear(); - } - window->text += cohort->text; - cohort->text.clear(); - } - } - } - - std::string cohort_buffer; - uint16_t cohort_count = 0; - for (auto& cohort : window->all_cohorts) { - if (cohort->local_number == 0 || (cohort->type & CT_REMOVED)) { - continue; - } - ++cohort_count; - - uint16_t flags = 0; - if (cohort->type & CT_RELATED) { - flags |= BFC_RELATED; + if (!header_done) { + output.write("CGBF", 4); + writeLE(output, CG3_BINARY_STREAM); + header_done = true; } - WRITE_U16_INTO(flags, cohort_buffer); - - WRITE_TAG_INTO(cohort->wordform, cohort_buffer); - if (cohort->wread) { - 
std::string tag_buffer; - uint16_t tag_count = 0; - for (auto tter : cohort->wread->tags_list) { - if (tter == cohort->wordform->hash) { - continue; - } - WRITE_TAG_INTO(grammar->single_tags[tter], tag_buffer); - ++tag_count; - } - WRITE_U16_INTO(tag_count, cohort_buffer); - cohort_buffer += tag_buffer; - } - else { - WRITE_U16_INTO(0, cohort_buffer); - } - - WRITE_U32_INTO(cohort->global_number, cohort_buffer); - if (cohort->dep_parent == 0 || cohort->dep_parent == DEP_NO_PARENT) { - WRITE_U32_INTO(cohort->dep_parent, cohort_buffer); - } - else { - if (gWindow->cohort_map.find(cohort->dep_parent) != gWindow->cohort_map.end()) { - auto pr = gWindow->cohort_map[cohort->dep_parent]; - if (pr->local_number == 0) { - WRITE_U32_INTO(0, cohort_buffer); + + writeLE(output, UI8(BFP_WINDOW)); + + TagVector tags_to_write; + std::map tag_index; + + uint16_t var_count = 0; + std::string var_buffer; + for (auto var : window->variables_output) { + ++var_count; + Tag* key = grammar->single_tags[var]; + auto iter = window->variables_set.find(var); + if (iter != window->variables_set.end()) { + if (iter->second != grammar->tag_any) { + var_buffer += static_cast(BFV_SETVAR); + WRITE_TAG_INTO(key, var_buffer); + WRITE_TAG_INTO(grammar->single_tags[iter->second], var_buffer); } else { - WRITE_U32_INTO(pr->global_number, cohort_buffer); + var_buffer += static_cast(BFV_SETVAR_ANY); + WRITE_TAG_INTO(key, var_buffer); + WRITE_U16_INTO(0, var_buffer); } } else { - WRITE_U32_INTO(DEP_NO_PARENT, cohort_buffer); + var_buffer += static_cast(BFV_REMVAR); + WRITE_TAG_INTO(key, var_buffer); + WRITE_U16_INTO(0, var_buffer); } } - std::string rel_buffer; - uint16_t rel_count = 0; - for (const auto& miter : cohort->relations) { - auto it = grammar->single_tags.find(miter.first); - if (it == grammar->single_tags.end()) { - it = grammar->single_tags.find(miter.first); - } - for (auto siter : miter.second) { - ++rel_count; - WRITE_TAG_INTO(it->second, rel_buffer); - WRITE_U32_INTO(siter, rel_buffer); + 
// Move text belonging to removed cohorts to prior not-removed cohorts, or the containing window + for (size_t i = 0; i < window->all_cohorts.size(); ++i) { + auto cohort = window->all_cohorts[i]; + if (cohort->local_number == 0 || (cohort->type & CT_REMOVED)) { + if (!cohort->text.empty()) { + for (size_t j = i; j > 0; --j) { + if (window->all_cohorts[j - 1]->local_number == 0 || (window->all_cohorts[j - 1]->type & CT_REMOVED)) { + continue; + } + window->all_cohorts[j - 1]->text += cohort->text; + cohort->text.clear(); + } + window->text += cohort->text; + cohort->text.clear(); + } } } - WRITE_U16_INTO(rel_count, cohort_buffer); - cohort_buffer += rel_buffer; - WRITE_STR_INTO(cohort->text, cohort_buffer); - WRITE_STR_INTO(cohort->wblank, cohort_buffer); - - std::string reading_buffer; - uint16_t reading_count = 0; - std::sort(cohort->readings.begin(), cohort->readings.end(), Reading::cmp_number); - for (auto top_reading : cohort->readings) { - if (top_reading->noprint) { + std::string cohort_buffer; + uint16_t cohort_count = 0; + for (auto& cohort : window->all_cohorts) { + if (cohort->local_number == 0 || (cohort->type & CT_REMOVED)) { continue; } - auto reading = top_reading; - while (reading) { - ++reading_count; - uint16_t flags = 0; - if (reading != top_reading) { - flags |= BFR_SUBREADING; - } - WRITE_U16_INTO(flags, reading_buffer); - WRITE_TAG_INTO(grammar->single_tags[reading->baseform], reading_buffer); + cohort->unignoreAll(); + ++cohort_count; + + uint16_t flags = 0; + if (cohort->type & CT_RELATED) { + flags |= BFC_RELATED; + } + WRITE_U16_INTO(flags, cohort_buffer); + + WRITE_TAG_INTO(cohort->wordform, cohort_buffer); + if (cohort->wread) { std::string tag_buffer; uint16_t tag_count = 0; - uint32SortedVector unique; - for (auto& tter : reading->tags_list) { - auto tag = grammar->single_tags[tter]; - if (tter == reading->baseform || tter == reading->parent->wordform->hash) { + for (auto tter : cohort->wread->tags_list) { + if (tter == 
cohort->wordform->hash) { continue; } - if (tag->type & (T_DEPENDENCY | T_RELATION)) { - continue; + WRITE_TAG_INTO(grammar->single_tags[tter], tag_buffer); + ++tag_count; + } + WRITE_U16_INTO(tag_count, cohort_buffer); + cohort_buffer += tag_buffer; + } + else { + WRITE_U16_INTO(0, cohort_buffer); + } + + WRITE_U32_INTO(cohort->global_number, cohort_buffer); + if (cohort->dep_parent == 0 || cohort->dep_parent == DEP_NO_PARENT) { + WRITE_U32_INTO(cohort->dep_parent, cohort_buffer); + } + else { + if (gWindow->cohort_map.find(cohort->dep_parent) != gWindow->cohort_map.end()) { + auto pr = gWindow->cohort_map[cohort->dep_parent]; + if (pr->local_number == 0) { + WRITE_U32_INTO(0, cohort_buffer); + } + else { + WRITE_U32_INTO(pr->global_number, cohort_buffer); + } + } + else { + WRITE_U32_INTO(DEP_NO_PARENT, cohort_buffer); + } + } + + std::string rel_buffer; + uint16_t rel_count = 0; + for (const auto& miter : cohort->relations) { + auto it = grammar->single_tags.find(miter.first); + if (it == grammar->single_tags.end()) { + it = grammar->single_tags.find(miter.first); + } + for (auto siter : miter.second) { + ++rel_count; + WRITE_TAG_INTO(it->second, rel_buffer); + WRITE_U32_INTO(siter, rel_buffer); + } + } + WRITE_U16_INTO(rel_count, cohort_buffer); + cohort_buffer += rel_buffer; + + WRITE_STR_INTO(cohort->text, cohort_buffer); + WRITE_STR_INTO(cohort->wblank, cohort_buffer); + + std::string reading_buffer; + uint16_t reading_count = 0; + std::sort(cohort->readings.begin(), cohort->readings.end(), Reading::cmp_number); + for (auto top_reading : cohort->readings) { + if (top_reading->noprint) { + continue; + } + auto reading = top_reading; + while (reading) { + ++reading_count; + uint16_t flags = 0; + if (reading != top_reading) { + flags |= BFR_SUBREADING; } - if (unique_tags) { - if (unique.find(tter) != unique.end()) { + WRITE_U16_INTO(flags, reading_buffer); + WRITE_TAG_INTO(grammar->single_tags[reading->baseform], reading_buffer); + std::string tag_buffer; + 
uint16_t tag_count = 0; + uint32SortedVector unique; + for (auto& tter : reading->tags_list) { + auto tag = grammar->single_tags[tter]; + if (tter == reading->baseform || tter == reading->parent->wordform->hash) { + continue; + } + if (tag->type & (T_DEPENDENCY | T_RELATION)) { continue; } - unique.insert(tter); + if (unique_tags) { + if (unique.find(tter) != unique.end()) { + continue; + } + unique.insert(tter); + } + WRITE_TAG_INTO(tag, tag_buffer); + ++tag_count; } - WRITE_TAG_INTO(tag, tag_buffer); - ++tag_count; + WRITE_U16_INTO(tag_count, reading_buffer); + reading_buffer += tag_buffer; + reading = reading->next; } - WRITE_U16_INTO(tag_count, reading_buffer); - reading_buffer += tag_buffer; - reading = reading->next; - } - } - WRITE_U16_INTO(reading_count, cohort_buffer); - cohort_buffer += reading_buffer; - } - - std::string header_buffer; - - uint16_t flags = 0; - if (window->flush_after) { - flags |= BFW_FLUSH; - } - if (dep_has_spanned) { - flags |= BFW_DEP_SPAN; - } - WRITE_U16_INTO(flags, header_buffer); - - WRITE_U16_INTO(tags_to_write.size(), header_buffer); - for (auto& tag : tags_to_write) { - WRITE_STR_INTO(tag->tag, header_buffer); - } - - WRITE_U16_INTO(var_count, header_buffer); - header_buffer += var_buffer; - - WRITE_STR_INTO(window->text, header_buffer); - WRITE_STR_INTO(window->text_post, header_buffer); - - WRITE_U16_INTO(cohort_count, header_buffer); - - auto total_size = UI32(header_buffer.size() + cohort_buffer.size()); - writeRaw(output, total_size); - output.write(header_buffer.data(), header_buffer.size()); - output.write(cohort_buffer.data(), cohort_buffer.size()); - output.flush(); + } + WRITE_U16_INTO(reading_count, cohort_buffer); + cohort_buffer += reading_buffer; + } + + std::string header_buffer; + + uint16_t flags = 0; + if (dep_has_spanned) { + flags |= BFW_DEP_SPAN; + } + WRITE_U16_INTO(flags, header_buffer); + + WRITE_U16_INTO(tags_to_write.size(), header_buffer); + for (auto& tag : tags_to_write) { + 
WRITE_STR_INTO(tag->tag, header_buffer); + } + + WRITE_U16_INTO(var_count, header_buffer); + header_buffer += var_buffer; + + WRITE_STR_INTO(window->text, header_buffer); + WRITE_STR_INTO(window->text_post, header_buffer); + + WRITE_U16_INTO(cohort_count, header_buffer); + + auto total_size = UI32(header_buffer.size() + cohort_buffer.size()); + writeLE(output, total_size); + output.write(header_buffer.data(), header_buffer.size()); + output.write(cohort_buffer.data(), cohort_buffer.size()); + + if (window->flush_after) { + printStreamCommand(STR_CMD_FLUSH, output); + } + + output.flush(); +} + +void BinaryApplicator::printStreamCommand(UStringView cmd, std::ostream& output) { + if (!header_done) { + output.write("CGBF", 4); + writeLE(output, CG3_BINARY_STREAM); + header_done = true; + } + + writeLE(output, UI8(BFP_COMMAND)); + if (cmd == STR_CMD_FLUSH) { + writeLE(output, UI8(BFC_FLUSH)); + } + else if (cmd == STR_CMD_EXIT) { + writeLE(output, UI8(BFC_EXIT)); + } + else if (cmd == STR_CMD_IGNORE) { + writeLE(output, UI8(BFC_IGNORE)); + } + else if (cmd == STR_CMD_RESUME) { + writeLE(output, UI8(BFC_RESUME)); + } } + +void BinaryApplicator::printPlainTextLine(UStringView line, std::ostream& output) { + if (!header_done) { + output.write("CGBF", 4); + writeLE(output, CG3_BINARY_STREAM); + header_done = true; + } + + writeLE(output, UI8(BFP_TEXT)); + writeUTF8_LE(output, line); +} + } diff --git a/src/BinaryApplicator.hpp b/src/BinaryApplicator.hpp index edcace99..e6cb3f7a 100644 --- a/src/BinaryApplicator.hpp +++ b/src/BinaryApplicator.hpp @@ -18,8 +18,8 @@ */ #pragma once -#ifndef GRAMMARAPPLICATORBINARY_H -#define GRAMMARAPPLICATORBINARY_H +#ifndef c6d28b7452ec699b_GRAMMARAPPLICATORBINARY_H +#define c6d28b7452ec699b_GRAMMARAPPLICATORBINARY_H #include "GrammarApplicator.hpp" @@ -27,8 +27,7 @@ namespace CG3 { enum BinaryFormatFlags { // Window - BFW_FLUSH = (1 << 0), - BFW_DEP_SPAN = (1 << 1), + BFW_DEP_SPAN = (1 << 0), // Cohort BFC_RELATED = (1 << 0), // Reading @@ 
-40,17 +39,43 @@ enum BinaryFormatFlags { BFV_REMVAR = 3, }; +enum BinaryPacketType : uint8_t { + BFP_INVALID = 0, + BFP_WINDOW = 1, + BFP_COMMAND = 2, + BFP_TEXT = 3, +}; + +enum BinaryCommandType : uint8_t { + BFC_FLUSH = 1, + BFC_EXIT = 2, + BFC_IGNORE = 3, + BFC_RESUME = 4, +}; + +struct BinaryPacket { + BinaryPacketType type = BFP_INVALID; + void* payload = nullptr; +}; + class BinaryApplicator : public virtual GrammarApplicator { public: - BinaryApplicator(std::ostream& ux_err); + BinaryApplicator(std::ostream& ux_err); - void runGrammarOnText(std::istream& input, std::ostream& output); + void runGrammarOnText(std::istream& input, std::ostream& output); protected: - void printSingleWindow(SingleWindow* window, std::ostream& output, bool profiling = false) override; + void printSingleWindow(SingleWindow* window, std::ostream& output, bool profiling = false) override; + void printStreamCommand(UStringView cmd, std::ostream& output) override; + void printPlainTextLine(UStringView line, std::ostream& output) override; private: - bool readWindow(); + bool header_done = false; + UString text; + BinaryPacket readPacket(); + void readWindow(void*& payload); + void readCommand(void*& payload); + void readText(void*& payload); }; } diff --git a/src/FormatConverter.cpp b/src/FormatConverter.cpp index bd0e72d2..a38094e1 100644 --- a/src/FormatConverter.cpp +++ b/src/FormatConverter.cpp @@ -244,8 +244,10 @@ void FormatConverter::printStreamCommand(UStringView cmd, std::ostream& output) JsonlApplicator::printStreamCommand(cmd, output); break; } - case CG3SF_BINARY: + case CG3SF_BINARY: { + BinaryApplicator::printStreamCommand(cmd, output); break; + } case CG3SF_CG: case CG3SF_APERTIUM: case CG3SF_FST: @@ -264,8 +266,10 @@ void FormatConverter::printPlainTextLine(UStringView line, std::ostream& output) JsonlApplicator::printPlainTextLine(line, output); break; } - case CG3SF_BINARY: + case CG3SF_BINARY: { + BinaryApplicator::printPlainTextLine(line, output); break; + } case 
CG3SF_CG: case CG3SF_APERTIUM: case CG3SF_FST: diff --git a/src/GrammarApplicator.cpp b/src/GrammarApplicator.cpp index 1628956f..d9e84e1f 100644 --- a/src/GrammarApplicator.cpp +++ b/src/GrammarApplicator.cpp @@ -581,7 +581,7 @@ void GrammarApplicator::printSingleWindow(SingleWindow* window, std::ostream& ou } if (window->flush_after) { - printStreamCommand(UString(STR_CMD_FLUSH), output); + printStreamCommand(STR_CMD_FLUSH, output); } u_fflush(output); } @@ -604,7 +604,7 @@ void GrammarApplicator::pipeOutReading(const Reading* reading, std::ostream& out writeRaw(ss, flags); if (reading->baseform) { - writeUTF8String(ss, grammar->single_tags.find(reading->baseform)->second->tag); + writeUTF8_Raw(ss, grammar->single_tags.find(reading->baseform)->second->tag); } uint32_t cs = 0; @@ -628,7 +628,7 @@ void GrammarApplicator::pipeOutReading(const Reading* reading, std::ostream& out if (tag->type & T_DEPENDENCY && has_dep) { continue; } - writeUTF8String(ss, tag->tag); + writeUTF8_Raw(ss, tag->tag); } const auto& str = ss.str(); @@ -655,7 +655,7 @@ void GrammarApplicator::pipeOutCohort(const Cohort* cohort, std::ostream& output writeRaw(ss, cohort->dep_parent); } - writeUTF8String(ss, cohort->wordform->tag); + writeUTF8_Raw(ss, cohort->wordform->tag); uint32_t cs = UI32(cohort->readings.size()); writeRaw(ss, cs); @@ -663,7 +663,7 @@ void GrammarApplicator::pipeOutCohort(const Cohort* cohort, std::ostream& output pipeOutReading(rter1, ss); } if (!cohort->text.empty()) { - writeUTF8String(ss, cohort->text); + writeUTF8_Raw(ss, cohort->text); } const auto& str = ss.str(); @@ -718,7 +718,7 @@ void GrammarApplicator::pipeInReading(Reading* reading, Process& input, bool for reading->deleted = (flags & (1 << 2)) != 0; if (flags & (1 << 3)) { - UString str = readUTF8String(ss); + UString str = readUTF8_Raw(ss); if (str != grammar->single_tags.find(reading->baseform)->second->tag) { Tag* tag = addTag(str); reading->baseform = tag->hash; @@ -743,7 +743,7 @@ void 
GrammarApplicator::pipeInReading(Reading* reading, Process& input, bool for } for (size_t i = 0; i < cs; ++i) { - UString str = readUTF8String(ss); + UString str = readUTF8_Raw(ss); Tag* tag = addTag(str); reading->tags_list.push_back(tag->hash); if (debug_level > 1) { @@ -784,7 +784,7 @@ void GrammarApplicator::pipeInCohort(Cohort* cohort, Process& input) { } bool force_readings = false; - UString str = readUTF8String(input); + UString str = readUTF8_Raw(input); if (str != cohort->wordform->tag) { Tag* tag = addTag(str); cohort->wordform = tag; @@ -803,7 +803,7 @@ void GrammarApplicator::pipeInCohort(Cohort* cohort, Process& input) { } if (flags & (1 << 0)) { - cohort->text = readUTF8String(input); + cohort->text = readUTF8_Raw(input); if (debug_level > 1) { u_fprintf(ux_stderr, "DEBUG: cohort text %S\n", cohort->text.data()); } diff --git a/src/GrammarApplicator_runGrammar.cpp b/src/GrammarApplicator_runGrammar.cpp index 3171fb07..f23650fb 100644 --- a/src/GrammarApplicator_runGrammar.cpp +++ b/src/GrammarApplicator_runGrammar.cpp @@ -154,11 +154,14 @@ void GrammarApplicator::runGrammarOnText(std::istream& input, std::ostream& outp variables_output.clear(); }; - if (fmt_output == CG3SF_BINARY) { - cSWindow = gWindow->allocAppendSingleWindow(); - initEmptySingleWindow(cSWindow); - lSWindow = cSWindow; - } + auto binary_maybe_window = [&]() { + if (fmt_output == CG3SF_BINARY) { + cSWindow = gWindow->allocAppendSingleWindow(); + initEmptySingleWindow(cSWindow); + lSWindow = cSWindow; + } + }; + binary_maybe_window(); while (!input.eof()) { ++lines; @@ -502,7 +505,7 @@ void GrammarApplicator::runGrammarOnText(std::istream& input, std::ostream& outp } if (!backSWindow) { - printStreamCommand(UString(STR_CMD_FLUSH), output); + printStreamCommand(STR_CMD_FLUSH, output); } line[0] = 0; variables.clear(); @@ -517,7 +520,7 @@ void GrammarApplicator::runGrammarOnText(std::istream& input, std::ostream& outp } is_cmd = true; ignoreinput = true; - 
printStreamCommand(UString(STR_CMD_IGNORE), output); + printStreamCommand(STR_CMD_IGNORE, output); line[0] = 0; } else if (&cleaned[0] == STR_CMD_RESUME) { @@ -526,7 +529,7 @@ void GrammarApplicator::runGrammarOnText(std::istream& input, std::ostream& outp } is_cmd = true; ignoreinput = false; - printStreamCommand(UString(STR_CMD_RESUME), output); + printStreamCommand(STR_CMD_RESUME, output); line[0] = 0; } else if (&cleaned[0] == STR_CMD_EXIT) { @@ -534,7 +537,7 @@ void GrammarApplicator::runGrammarOnText(std::istream& input, std::ostream& outp u_fprintf(ux_stderr, "Info: EXIT encountered on line %u. Exiting...\n", numLines); } is_cmd = true; - printStreamCommand(UString(STR_CMD_EXIT), output); + printStreamCommand(STR_CMD_EXIT, output); goto CGCMD_EXIT; } else if (u_strncmp(&cleaned[0], STR_CMD_SETVAR.data(), SI32(STR_CMD_SETVAR.size())) == 0) { @@ -691,8 +694,7 @@ void GrammarApplicator::runGrammarOnText(std::istream& input, std::ostream& outp cSWindow = nullptr; } if (fmt_output == CG3SF_BINARY && !variables_output.empty()) { - cSWindow = gWindow->allocAppendSingleWindow(); - initEmptySingleWindow(cSWindow); + binary_maybe_window(); adopt_variables(); } while (!gWindow->next.empty()) { diff --git a/src/JsonlApplicator.cpp b/src/JsonlApplicator.cpp index 6d291aea..5f139bf2 100644 --- a/src/JsonlApplicator.cpp +++ b/src/JsonlApplicator.cpp @@ -778,7 +778,7 @@ void JsonlApplicator::printSingleWindow(SingleWindow* window, std::ostream& outp // Print flush command if needed if (window->flush_after) { - printStreamCommand(UString(STR_CMD_FLUSH), output); + printStreamCommand(STR_CMD_FLUSH, output); } } diff --git a/src/inlines.hpp b/src/inlines.hpp index 48c53f91..a693e90f 100644 --- a/src/inlines.hpp +++ b/src/inlines.hpp @@ -497,7 +497,58 @@ inline void readRaw(S& stream, T& value) { stream.read(reinterpret_cast(&value), sizeof(T)); } -inline void writeUTF8String(std::ostream& output, const UChar* str, size_t len = 0) { +template +inline void writeBE(std::ostream& 
stream, T value) { + value = be::native_to_big(value); + writeRaw(stream, value); +} + +template<> +inline void writeBE(std::ostream& stream, double value) { + int exp = 0; + auto mant64 = UI64(SI64(DBL(std::numeric_limits::max()) * frexp(value, &exp))); + auto exp32 = UI32(exp); + writeBE(stream, mant64); + writeBE(stream, exp32); +} + +template +inline void writeLE(S& stream, T value) { + value = be::native_to_little(value); + writeRaw(stream, value); +} + +template +inline T readBE(std::istream& stream) { + T value; + readRaw(stream, value); + return be::big_to_native(value); +} + +template<> +inline double readBE(std::istream& stream) { + auto mant64 = readBE(stream); + auto exp = static_cast(readBE(stream)); + + auto value = DBL(SI64(mant64)) / DBL(std::numeric_limits::max()); + + return ldexp(value, exp); +} + +template +inline void readLE(S& stream, T& value) { + readRaw(stream, value); + be::little_to_native_inplace(value); +} + +template +inline T readLE(std::istream& stream) { + T value; + readRaw(stream, value); + return be::little_to_native(value); +} + +inline void writeUTF8_Raw(std::ostream& output, const UChar* str, size_t len = 0) { if (len == 0) { len = u_strlen(str); } @@ -512,12 +563,35 @@ inline void writeUTF8String(std::ostream& output, const UChar* str, size_t len = output.write(&buffer[0], cs); } -inline void writeUTF8String(std::ostream& output, const UString& str) { - writeUTF8String(output, str.data(), str.size()); +inline void writeUTF8_Raw(std::ostream& output, const UString& str) { + writeUTF8_Raw(output, str.data(), str.size()); +} + +inline void writeUTF8_LE(std::ostream& output, const UChar* str, size_t len = 0) { + if (len == 0) { + len = u_strlen(str); + } + + std::vector buffer(len * 4); + int32_t olen = 0; + UErrorCode status = U_ZERO_ERROR; + u_strToUTF8(&buffer[0], SI32(len * 4 - 1), &olen, str, SI32(len), &status); + + auto cs = UI16(olen); + writeLE(output, cs); + output.write(&buffer[0], cs); +} + +inline void 
writeUTF8_LE(std::ostream& output, const UString& str) { + writeUTF8_LE(output, str.data(), str.size()); +} + +inline void writeUTF8_LE(std::ostream& output, const UStringView& str) { + writeUTF8_LE(output, str.data(), str.size()); } template -inline UString readUTF8String(S& input) { +inline UString readUTF8_Raw(S& input) { uint16_t len = 0; readRaw(input, len); @@ -534,54 +608,30 @@ inline UString readUTF8String(S& input) { return rv; } -#ifdef _MSC_VER - // warning C4127: conditional expression is constant - #pragma warning (disable: 4127) -#endif +template +inline void readUTF8_LE(S& input, Str& rv) { + uint16_t len = 0; + readLE(input, len); -template -inline void writeBE(std::ostream& stream, T value) { - value = be::native_to_big(value); - stream.write(reinterpret_cast(&value), sizeof(value)); - if (!stream) { - throw std::runtime_error("Stream was in bad state in writeBE()"); - } -} + rv.clear(); + rv.resize(len); + std::vector buffer(len); + input.read(&buffer[0], len); -template<> -inline void writeBE(std::ostream& stream, double value) { - int exp = 0; - auto mant64 = UI64(SI64(DBL(std::numeric_limits::max()) * frexp(value, &exp))); - auto exp32 = UI32(exp); - writeBE(stream, mant64); - writeBE(stream, exp32); -} + int32_t olen = 0; + UErrorCode status = U_ZERO_ERROR; + u_strFromUTF8(&rv[0], len, &olen, &buffer[0], len, &status); -template -inline T readBE(std::istream& stream) { - if (!stream) { - throw std::runtime_error("Stream was in bad state in readBE()"); - } - T tmp; - stream.read(reinterpret_cast(&tmp), sizeof(tmp)); - return be::big_to_native(tmp); + rv.resize(olen); } -template<> -inline double readBE(std::istream& stream) { - auto mant64 = readBE(stream); - auto exp = static_cast(readBE(stream)); - - auto value = DBL(SI64(mant64)) / DBL(std::numeric_limits::max()); - - return ldexp(value, exp); +template +inline UString readUTF8_LE(S& input) { + UString rv; + readUTF8_LE(input, rv); + return rv; } -#ifdef _MSC_VER - // warning C4127: 
conditional expression is constant - #pragma warning (default: 4127) -#endif - template inline void GAppSetOpts_ranged(const char* value, Cont& cont, bool fill = true) { cont.clear(); diff --git a/test/runall.pl b/test/runall.pl index e9c4b157..a14336d6 100755 --- a/test/runall.pl +++ b/test/runall.pl @@ -90,7 +90,6 @@ sub run_pl { `echo "Include Static grammar.cg3 ;" > grammar.bsf.cg3`; `cat input.txt | "$binary" $args --in-cg --out-binary -g grammar.bsf.cg3 2>stderr.bsf.conv1.txt | "$binary" $args $override -g grammar.cg3 --in-binary --out-binary 2>stderr.bsf.vislcg3.txt | "$binary" $args --in-binary --out-cg -g grammar.bsf.cg3 2>stderr.bsf.conv2.txt | "$bindir/../scripts/cg-untrace" | "$bindir/../scripts/cg-sort" -m | "$bindir/../scripts/cg-stabilize-relations" >output.bsf.txt`; `cat expected.txt | "$bindir/../scripts/cg-untrace" | "$bindir/../scripts/cg-sort" -m | "$bindir/../scripts/cg-stabilize-relations" > expected.bsf.txt`; - `echo '' >> expected.bsf.txt`; `diff -B expected.bsf.txt output.bsf.txt >diff.bsf.txt`; if (-s "diff.bsf.txt") { From 4acac69ee1286a87d71f0c3d3e312a48053c7d9e Mon Sep 17 00:00:00 2001 From: Tino Didriksen Date: Fri, 29 Aug 2025 17:42:50 +0200 Subject: [PATCH 36/42] Ensure binary stream is little endian (still 67/69) --- src/BinaryApplicator.cpp | 163 ++++++++++++++++++++------------------- 1 file changed, 82 insertions(+), 81 deletions(-) diff --git a/src/BinaryApplicator.cpp b/src/BinaryApplicator.cpp index ac0f19ed..db558e4d 100644 --- a/src/BinaryApplicator.cpp +++ b/src/BinaryApplicator.cpp @@ -134,31 +134,6 @@ void BinaryApplicator::runGrammarOnText(std::istream& input, std::ostream& outpu flush(false); } -#define READ_U16_INTO(dest) \ - do { \ - (dest) = reinterpret_cast(&buf[pos])[0]; \ - pos += 2; \ - } while (false) - -#define READ_U32_INTO(dest) \ - do { \ - (dest) = reinterpret_cast(&buf[pos])[0]; \ - pos += 4; \ - } while (false) - -#define READ_STR_INTO(dest) \ - do { \ - uint16_t tl = reinterpret_cast(&buf[pos])[0]; \ - 
pos += 2; \ - (dest).clear(); \ - (dest).resize(tl, 0); \ - int32_t olen = 0; \ - UErrorCode status = U_ZERO_ERROR; \ - u_strFromUTF8(&(dest)[0], tl, &olen, &buf[pos], tl, &status); \ - (dest).resize(olen); \ - pos += tl; \ - } while (false) - BinaryPacket BinaryApplicator::readPacket() { BinaryPacket packet; readLE(*ux_stdin, packet.type); @@ -190,16 +165,49 @@ void BinaryApplicator::readWindow(void*& payload) { ux_stdin->read(&buf[0], cs); uint32_t pos = 0; + auto READ_U16_INTO = [&](uint16_t& dest) { + dest = *reinterpret_cast(&buf[pos]); + be::little_to_native_inplace(dest); + pos += sizeof(dest); + }; + + auto READ_U16 = [&]() { + uint16_t dest; + READ_U16_INTO(dest); + return dest; + }; + + auto READ_U32_INTO = [&](uint32_t& dest) { + dest = *reinterpret_cast(&buf[pos]); + be::little_to_native_inplace(dest); + pos += sizeof(dest); + }; + + auto READ_U32 = [&]() { + uint32_t dest; + READ_U32_INTO(dest); + return dest; + }; + + auto READ_STR_INTO = [&](UString& dest) { + auto tl = READ_U16(); + dest.clear(); + dest.resize(tl); + int32_t olen = 0; + UErrorCode status = U_ZERO_ERROR; + u_strFromUTF8(&(dest)[0], tl, &olen, &buf[pos], tl, &status); + dest.resize(olen); + pos += tl; + }; + // TODO: flags - uint16_t flags; - READ_U16_INTO(flags); + auto flags = READ_U16(); if (flags & BFW_DEP_SPAN) { dep_has_spanned = true; } TagVector window_tags; - uint16_t tag_count; - READ_U16_INTO(tag_count); + auto tag_count = READ_U16(); window_tags.reserve(tag_count); for (uint16_t i = 0; i < tag_count; ++i) { UString tg; @@ -213,14 +221,12 @@ void BinaryApplicator::readWindow(void*& payload) { } } - uint16_t var_count; - READ_U16_INTO(var_count); + auto var_count = READ_U16(); for (uint16_t vn = 0; vn < var_count; ++vn) { char mode = buf[pos]; ++pos; - uint16_t tag1, tag2; - READ_U16_INTO(tag1); - READ_U16_INTO(tag2); + auto tag1 = READ_U16(); + auto tag2 = READ_U16(); auto hash1 = window_tags[tag1]->hash; if (mode == BFV_SETVAR) { cSWindow->variables_set[hash1] = 
window_tags[tag2]->hash; @@ -242,8 +248,7 @@ void BinaryApplicator::readWindow(void*& payload) { READ_STR_INTO(cSWindow->text); READ_STR_INTO(cSWindow->text_post); - uint16_t cohort_count; - READ_U16_INTO(cohort_count); + auto cohort_count = READ_U16(); uint16_t tag; for (uint16_t cn = 0; cn < cohort_count; ++cn) { Cohort* cCohort = alloc_cohort(cSWindow); @@ -277,12 +282,10 @@ void BinaryApplicator::readWindow(void*& payload) { has_dep = true; } - uint16_t rel_count; - READ_U16_INTO(rel_count); + auto rel_count = READ_U16(); for (uint16_t rn = 0; rn < rel_count; ++rn) { READ_U16_INTO(tag); - uint32_t head; - READ_U32_INTO(head); + auto head = READ_U32(); cCohort->relations_input[window_tags[tag]->hash].insert(head); } if (rel_count) { @@ -294,10 +297,10 @@ void BinaryApplicator::readWindow(void*& payload) { READ_STR_INTO(cCohort->text); READ_STR_INTO(cCohort->wblank); - uint16_t reading_count; - READ_U16_INTO(reading_count); - if (!reading_count) + auto reading_count = READ_U16(); + if (!reading_count) { initEmptyCohort(*cCohort); + } Reading* prev = nullptr; for (uint16_t rn = 0; rn < reading_count; ++rn) { Reading* cReading = alloc_reading(cCohort); @@ -361,42 +364,6 @@ void BinaryApplicator::readText(void*& payload) { payload = &text; } -#define WRITE_U16_INTO(n, buffer) \ - do { \ - std::string tmp(2, 0); \ - auto tmp_n = static_cast(n); \ - tmp.assign(reinterpret_cast(&tmp_n), 2); \ - (buffer) += tmp; \ - } while (false) - -#define WRITE_U32_INTO(n, buffer) \ - do { \ - std::string tmp(4, 0); \ - auto tmp_n = static_cast(n); \ - tmp.assign(reinterpret_cast(&tmp_n), 4); \ - (buffer) += tmp; \ - } while (false) - -#define WRITE_TAG_INTO(tag, buffer) \ - do { \ - if (tag_index.find((tag)) == tag_index.end()) { \ - tag_index[(tag)] = UI32(tags_to_write.size()); \ - tags_to_write.push_back((tag)); \ - } \ - WRITE_U16_INTO(tag_index[(tag)], buffer); \ - } while (false) - -#define WRITE_STR_INTO(s, buffer) \ - do { \ - std::string tmp((s).size() * 4, 0); \ - int32_t 
olen = 0; \ - UErrorCode status = U_ZERO_ERROR; \ - u_strToUTF8(&tmp[0], SI32((s).size() * 4 - 1), &olen, (s).data(), SI32((s).size()), &status); \ - tmp.resize(olen); \ - WRITE_U16_INTO(UI16(olen), (buffer)); \ - (buffer) += tmp; \ - } while (false) - void BinaryApplicator::printSingleWindow(SingleWindow* window, std::ostream& output, bool profiling) { if (!header_done) { output.write("CGBF", 4); @@ -407,7 +374,41 @@ void BinaryApplicator::printSingleWindow(SingleWindow* window, std::ostream& out writeLE(output, UI8(BFP_WINDOW)); TagVector tags_to_write; - std::map tag_index; + std::map tag_index; + + auto WRITE_U16_INTO = [&](uint16_t n, std::string& buffer) { + be::native_to_little_inplace(n); + auto chr = reinterpret_cast(&n); + buffer += chr[0]; + buffer += chr[1]; + }; + + auto WRITE_U32_INTO = [&](uint32_t n, std::string& buffer) { + be::native_to_little_inplace(n); + auto chr = reinterpret_cast(&n); + buffer += chr[0]; + buffer += chr[1]; + buffer += chr[2]; + buffer += chr[3]; + }; + + auto WRITE_TAG_INTO = [&](Tag* tag, std::string& buffer) { + if (tag_index.find(tag) == tag_index.end()) { + tag_index[tag] = UI16(tags_to_write.size()); + tags_to_write.push_back(tag); + } + WRITE_U16_INTO(tag_index[tag], buffer); + }; + + auto WRITE_STR_INTO = [&](const UString& s, std::string& buffer) { + std::string tmp(s.size() * 4, 0); + int32_t olen = 0; + UErrorCode status = U_ZERO_ERROR; + u_strToUTF8(&tmp[0], SI32(s.size() * 4 - 1), &olen, s.data(), SI32(s.size()), &status); + tmp.resize(olen); + WRITE_U16_INTO(UI16(olen), buffer); + buffer += tmp; + }; uint16_t var_count = 0; std::string var_buffer; @@ -576,7 +577,7 @@ void BinaryApplicator::printSingleWindow(SingleWindow* window, std::ostream& out } WRITE_U16_INTO(flags, header_buffer); - WRITE_U16_INTO(tags_to_write.size(), header_buffer); + WRITE_U16_INTO(UI16(tags_to_write.size()), header_buffer); for (auto& tag : tags_to_write) { WRITE_STR_INTO(tag->tag, header_buffer); } From 
2450df02ed09b4b5804ff14e60a59116a5f32a63 Mon Sep 17 00:00:00 2001 From: Daniel Swanson Date: Fri, 5 Sep 2025 10:19:05 -0400 Subject: [PATCH 37/42] multiple packet types python --- python/cg3.py | 69 +++++++++++++++++++++++++++++++++++++++++---------- 1 file changed, 56 insertions(+), 13 deletions(-) diff --git a/python/cg3.py b/python/cg3.py index 9c136ff5..064b9411 100644 --- a/python/cg3.py +++ b/python/cg3.py @@ -10,6 +10,7 @@ class Reading: lemma: str = '' tags: List[str] = field(default_factory=list) subreading: Optional['Reading'] = None + deleted: bool = False @dataclass class Cohort: @@ -29,9 +30,15 @@ class Window: rem_vars: List[str] = field(default_factory=list) text: str = '' text_post: str = '' - flush_after: bool = False dep_has_spanned: bool = False +@dataclass +class Packet: + type: str = '' + window: Optional[Window] = None + command: str = '' + text: str = '' + def parse_binary_window(buf): '''Given a bytestring `buf` containing a single window (not including the length header), parse and return a Window() @@ -56,8 +63,6 @@ def read_str(): window = Window() window_flags = read_u16() if window_flags & 1: - window.flush_after = True - if window_flags & 2: window.dep_has_spanned = True tag_count = read_u16() tags = [read_str() for i in range(tag_count)] @@ -109,13 +114,17 @@ def read_tags(): prev.subreading = reading else: cohort.readings.append(reading) + if reading_flags & 2: + reading.deleted = True prev = reading window.cohorts.append(cohort) return window -def parse_binary_stream(fin): - '''Given a file `fin`, yield a series of Window() objects. - raises ValueError if stream header is missing or invalid.''' +def parse_binary_stream(fin, windows_only=False): + '''Given a file `fin`, yield a series of Packet() objects. + raises ValueError if stream header is missing or invalid. 
+ If `windows_only` is True, packets containing commands or text + are skipped and Window() objects are returned instead.''' header = fin.read(8) label, version = struct.unpack('<4sI', header) @@ -124,11 +133,45 @@ def parse_binary_stream(fin): if version != 1: raise ValueError('Unknown binary format version!') while True: - spec = fin.read(4) - if len(spec) != 4: - break; - block_len = struct.unpack(' Date: Fri, 5 Sep 2025 10:28:48 -0400 Subject: [PATCH 38/42] update docs --- manual/streamformats.xml | 17 +++++++++++++++-- 1 file changed, 15 insertions(+), 2 deletions(-) diff --git a/manual/streamformats.xml b/manual/streamformats.xml index c988c762..b1d36a0d 100644 --- a/manual/streamformats.xml +++ b/manual/streamformats.xml @@ -195,10 +195,21 @@ The stream begins with a header containing CGBF followed by a 4-byte version number (currently 1). - After that, each window begins with 4 bytes specifying the length of the block and then the following structure: + After that, each packet begins with 1 byte indicating its contents. + 1 is a window, 2> is a command, and 3> is text. + + + Command packets have a second byte identifying the command: 1 for FLUSH, 2 for EXIT, 3 for IGNORE, and 4 for RESUME. + Commands which manipulate variables are represented in window packets. + + + Text packets consist of a 2-byte length followed by the contents in UTF-8. 
+ + + Each window packet begins with 4 bytes specifying the length of the block and then the following structure: window flags [2] - > 1 = flush_after + > 1 = has multi-window dependencies tags [array of str] variables [array] mode @@ -211,6 +222,7 @@ text_post [str] cohorts [array] flags [2] + > 1 = is target of a relation wordform [tag] static_tags [array of tag] dep_self [4] @@ -223,6 +235,7 @@ readings [array] flags [2] > 1 = is subreading of predecessor + > 2 = deleted baseform [tag] tags [array of tag] From ea296543ad64dacf1fa89e29c763fefd8c546a7f Mon Sep 17 00:00:00 2001 From: Tino Didriksen Date: Mon, 8 Sep 2025 18:47:06 +0200 Subject: [PATCH 39/42] Move mappings to the end; Tell cg-sort which prefix --- scripts/cg-stabilize-relations | 2 +- src/GrammarApplicator.cpp | 11 +++++++++++ src/GrammarApplicator_reflow.cpp | 1 + src/GrammarApplicator_runGrammar.cpp | 2 ++ test/T_Dependency/prefix.txt | 1 + test/T_MappingPrefix/expected.txt | 7 +++---- test/T_MappingPrefix/prefix.txt | 1 + test/runall.pl | 13 +++++++++---- 8 files changed, 29 insertions(+), 9 deletions(-) create mode 100644 test/T_Dependency/prefix.txt create mode 100644 test/T_MappingPrefix/prefix.txt diff --git a/scripts/cg-stabilize-relations b/scripts/cg-stabilize-relations index 30f93e87..b23d9736 100755 --- a/scripts/cg-stabilize-relations +++ b/scripts/cg-stabilize-relations @@ -9,7 +9,7 @@ args = parser.parse_args() id_map = {} -tag = re.compile(r'\b(ID:|R:\w+:)(\d+)\b') +tag = re.compile(r'\b(ID:|R:[^:\s]+:)(\d+)\b') def repl(matchobj): global id_map n = matchobj.group(2) diff --git a/src/GrammarApplicator.cpp b/src/GrammarApplicator.cpp index d9e84e1f..f3d309c0 100644 --- a/src/GrammarApplicator.cpp +++ b/src/GrammarApplicator.cpp @@ -374,6 +374,9 @@ void GrammarApplicator::printReading(const Reading* reading, std::ostream& outpu } uint32SortedVector unique; + static thread_local TagList mappings; + mappings.clear(); + for (auto tter : reading->tags_list) { if ((!show_end_tags && tter == 
endtag) || tter == begintag) { continue; @@ -394,6 +397,14 @@ void GrammarApplicator::printReading(const Reading* reading, std::ostream& outpu if (tag->type & T_RELATION && has_relations) { continue; } + if (tag->type & T_MAPPING) { + // Move mappings to the end + mappings.push_back(tag); + continue; + } + u_fprintf(output, " %S", tag->tag.data()); + } + for (auto tag : mappings) { u_fprintf(output, " %S", tag->tag.data()); } diff --git a/src/GrammarApplicator_reflow.cpp b/src/GrammarApplicator_reflow.cpp index 5301c335..a7604770 100644 --- a/src/GrammarApplicator_reflow.cpp +++ b/src/GrammarApplicator_reflow.cpp @@ -491,6 +491,7 @@ uint32_t GrammarApplicator::addTagToReading(Reading& reading, Tag* tag, bool reh } if (tag->type & T_MAPPING || tag->tag[0] == grammar->mapping_prefix) { + tag->type |= T_MAPPING; if (reading.mapping && reading.mapping != tag) { u_fprintf(ux_stderr, "Error: addTagToReading() cannot add a mapping tag to a reading which already is mapped!\n"); CG3Quit(1); diff --git a/src/GrammarApplicator_runGrammar.cpp b/src/GrammarApplicator_runGrammar.cpp index f23650fb..b8da9fd7 100644 --- a/src/GrammarApplicator_runGrammar.cpp +++ b/src/GrammarApplicator_runGrammar.cpp @@ -368,6 +368,7 @@ void GrammarApplicator::runGrammarOnText(std::istream& input, std::ostream& outp if (base && base[0]) { Tag* tag = addTag(base); if (tag->type & T_MAPPING || tag->tag[0] == grammar->mapping_prefix) { + tag->type |= T_MAPPING; all_mappings[cReading].push_back(tag); } else { @@ -383,6 +384,7 @@ void GrammarApplicator::runGrammarOnText(std::istream& input, std::ostream& outp if (base && base[0]) { Tag* tag = addTag(base); if (tag->type & T_MAPPING || tag->tag[0] == grammar->mapping_prefix) { + tag->type |= T_MAPPING; all_mappings[cReading].push_back(tag); } else { diff --git a/test/T_Dependency/prefix.txt b/test/T_Dependency/prefix.txt new file mode 100644 index 00000000..3038d22f --- /dev/null +++ b/test/T_Dependency/prefix.txt @@ -0,0 +1 @@ +§ diff --git 
a/test/T_MappingPrefix/expected.txt b/test/T_MappingPrefix/expected.txt index 94b32f8f..e1bb5351 100644 --- a/test/T_MappingPrefix/expected.txt +++ b/test/T_MappingPrefix/expected.txt @@ -1,8 +1,8 @@ "" - "word" @mapped $tag §tag £tag @tag ADD:4 ADD:5 ADD:6 ADD:7 + "word" @mapped $tag §tag @tag £tag ADD:4 ADD:5 ADD:6 ADD:7 "word" £mapped - "word" §mapped $tag §tag £tag @tag ADD:4 ADD:5 ADD:6 ADD:7 - "word" $mapped $tag §tag £tag @tag ADD:4 ADD:5 ADD:6 ADD:7 + "word" §mapped $tag §tag @tag £tag ADD:4 ADD:5 ADD:6 ADD:7 + "word" $mapped $tag §tag @tag £tag ADD:4 ADD:5 ADD:6 ADD:7 "" "word2" a £re-mapped UNMAP:9:normal MAP:12 "" @@ -16,4 +16,3 @@ "" "word3" a b c REMOVE:15 ; "word3" a b c £a £b £c REMOVE:15 - diff --git a/test/T_MappingPrefix/prefix.txt b/test/T_MappingPrefix/prefix.txt new file mode 100644 index 00000000..93660ce9 --- /dev/null +++ b/test/T_MappingPrefix/prefix.txt @@ -0,0 +1 @@ +£ diff --git a/test/runall.pl b/test/runall.pl index a14336d6..ee27b5b4 100755 --- a/test/runall.pl +++ b/test/runall.pl @@ -40,6 +40,12 @@ sub run_pl { my ($binary,$override,$args) = @_; my $good = 1; + my $prefix = '@'; + if (-s 'prefix.txt') { + $prefix = `cat prefix.txt`; + chomp($prefix); + } + # Normal run `"$binary" $args $override -g grammar.cg3 -I input.txt -O output.txt >stdout.txt 2>stderr.txt`; `diff -B expected.txt output.txt >diff.txt`; @@ -85,11 +91,9 @@ sub run_pl { } # Normal run, but with binary I/O - my $conv = $binary; - $conv =~ s@vislcg3(\.exe)?$@cg-conv@g; `echo "Include Static grammar.cg3 ;" > grammar.bsf.cg3`; - `cat input.txt | "$binary" $args --in-cg --out-binary -g grammar.bsf.cg3 2>stderr.bsf.conv1.txt | "$binary" $args $override -g grammar.cg3 --in-binary --out-binary 2>stderr.bsf.vislcg3.txt | "$binary" $args --in-binary --out-cg -g grammar.bsf.cg3 2>stderr.bsf.conv2.txt | "$bindir/../scripts/cg-untrace" | "$bindir/../scripts/cg-sort" -m | "$bindir/../scripts/cg-stabilize-relations" >output.bsf.txt`; - `cat expected.txt | 
"$bindir/../scripts/cg-untrace" | "$bindir/../scripts/cg-sort" -m | "$bindir/../scripts/cg-stabilize-relations" > expected.bsf.txt`; + `cat input.txt | "$binary" $args --in-cg --out-binary -g grammar.bsf.cg3 2>stderr.bsf.conv1.txt | "$binary" $args $override -g grammar.cg3 --in-binary --out-binary 2>stderr.bsf.vislcg3.txt | "$binary" $args --in-binary --out-cg -g grammar.bsf.cg3 2>stderr.bsf.conv2.txt | "$bindir/../scripts/cg-untrace" | "$bindir/../scripts/cg-sort" -m '$prefix' | "$bindir/../scripts/cg-stabilize-relations" >output.bsf.txt`; + `cat expected.txt | "$bindir/../scripts/cg-untrace" | "$bindir/../scripts/cg-sort" -m '$prefix' | "$bindir/../scripts/cg-stabilize-relations" > expected.bsf.txt`; `diff -B expected.bsf.txt output.bsf.txt >diff.bsf.txt`; if (-s "diff.bsf.txt") { @@ -159,6 +163,7 @@ sub run_pl { my $args = ''; if (-s 'args.txt') { $args = `cat args.txt`; + chomp($args); } if (-x 'run.pl') { `./run.pl "$binary" \Q$c\E $args`; From 7aa00086d442dafb17e029b0e6d64c6c28ff556c Mon Sep 17 00:00:00 2001 From: Tino Didriksen Date: Mon, 8 Sep 2025 19:08:52 +0200 Subject: [PATCH 40/42] Force PERL_UNICODE=SDA in the test runner --- test/runall.pl | 2 ++ 1 file changed, 2 insertions(+) diff --git a/test/runall.pl b/test/runall.pl index ee27b5b4..2870edce 100755 --- a/test/runall.pl +++ b/test/runall.pl @@ -7,6 +7,8 @@ my $bindir = realpath $Bin; chdir $bindir or die("Error: Could not change directory to $bindir !"); +$ENV{PERL_UNICODE} = 'SDA'; + # Search paths for the binary my @binlist = ( "../build/src/vislcg3", From 7838daee40ad3f51b10d69058c233d0ca2fb751b Mon Sep 17 00:00:00 2001 From: Tino Didriksen Date: Mon, 8 Sep 2025 19:25:57 +0200 Subject: [PATCH 41/42] Version --- src/version.hpp | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/src/version.hpp b/src/version.hpp index ced0348e..abc2f46d 100644 --- a/src/version.hpp +++ b/src/version.hpp @@ -26,8 +26,8 @@ constexpr auto CG3_COPYRIGHT_STRING = "Copyright (C) 2007-2025 GrammarSoft 
ApS. Licensed under GPLv3+"; constexpr uint32_t CG3_VERSION_MAJOR = 1; -constexpr uint32_t CG3_VERSION_MINOR = 5; -constexpr uint32_t CG3_VERSION_PATCH = 7; +constexpr uint32_t CG3_VERSION_MINOR = 6; +constexpr uint32_t CG3_VERSION_PATCH = 0; constexpr uint32_t CG3_REVISION = 13898; constexpr uint32_t CG3_FEATURE_REV = 13898; constexpr uint32_t CG3_TOO_OLD = 10373; From f4f812f5611af249f15dd4f0a29abb6b7f68bab9 Mon Sep 17 00:00:00 2001 From: Tino Didriksen Date: Mon, 8 Sep 2025 19:31:01 +0200 Subject: [PATCH 42/42] Install cg-stabilize-relations --- CMakeLists.txt | 1 + 1 file changed, 1 insertion(+) diff --git a/CMakeLists.txt b/CMakeLists.txt index 2eeaca5e..b4588a95 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -165,6 +165,7 @@ configure_file(scripts/cg3-autobin.pl.in scripts/cg3-autobin.pl @ONLY) install(PROGRAMS "${CMAKE_CURRENT_BINARY_DIR}/scripts/cg3-autobin.pl" "${CMAKE_CURRENT_SOURCE_DIR}/scripts/cg-sort" + "${CMAKE_CURRENT_SOURCE_DIR}/scripts/cg-stabilize-relations" "${CMAKE_CURRENT_SOURCE_DIR}/scripts/cg-strictify" "${CMAKE_CURRENT_SOURCE_DIR}/scripts/cg-untrace" DESTINATION ${CMAKE_INSTALL_BINDIR})