diff --git a/configure.ac b/configure.ac index cc16a65..d996478 100644 --- a/configure.ac +++ b/configure.ac @@ -40,6 +40,8 @@ AC_SUBST(ICU_LIBS) # Checks for libraries. AC_CHECK_LIB(xml2, xmlReaderForFile) +AC_CHECK_HEADER([utf8.h], [], [AC_MSG_ERROR([You don't have utfcpp installed.])]) + CPPFLAGS="$CPPFLAGS $CFLAGS $LTTOOLBOX_CFLAGS $LIBXML_CFLAGS $ICU_CFLAGS" LIBS="$LIBS $LTTOOLBOX_LIBS $LIBXML_LIBS $ICU_LIBS" diff --git a/src/lsx_processor.cc b/src/lsx_processor.cc index 44d0bc2..7721523 100644 --- a/src/lsx_processor.cc +++ b/src/lsx_processor.cc @@ -1,9 +1,9 @@ #include "lsx_processor.h" #include -#include LSXProcessor::LSXProcessor() + : alphabet(AlphabetExe(&str_write)) { escaped_chars.insert('['); escaped_chars.insert(']'); @@ -21,7 +21,12 @@ LSXProcessor::LSXProcessor() void LSXProcessor::load(FILE *input) { - readTransducerSet(input, alphabetic_chars, alphabet, trans); + readTransducerSet(input, mmapping, mmap_pointer, mmap_len, + str_write, &alphabetic_chars, alphabet, transducers); + for (auto& it : transducers) { + all_finals.insert(&it.second); + } + initial_state.init(all_finals); // symbols word_boundary = alphabet("<$>"_u); @@ -29,13 +34,6 @@ LSXProcessor::load(FILE *input) word_boundary_ns = alphabet("<$->"_u); any_char = alphabet(""_u); any_tag = alphabet(""_u); - - for (auto& it : trans) { - root.addTransition(0, 0, it.second.getInitial(), 0.0); - all_finals.insert(it.second.getFinals().begin(), - it.second.getFinals().end()); - } - initial_state.init(&root); } void @@ -86,7 +84,8 @@ LSXProcessor::processWord(InputFile& input, UFILE* output) } size_t last_final = 0; UString last_final_out; - State s = initial_state; + State s; + s.init(all_finals); size_t idx = 0; bool firstupper = false; bool uppercase = false; @@ -126,13 +125,9 @@ LSXProcessor::processWord(InputFile& input, UFILE* output) break; } } - UString tag = lu.substr(i, j-i); + int32_t tag = alphabet.lookupDynamic(lu.substr(i, j-i)); i = j-1; - if(!alphabet.isSymbolDefined(tag)) - { - alphabet.includeSymbol(tag); - } - s.step_override(alphabet(tag), any_tag, alphabet(tag)); + s.step_override(tag, any_tag, tag); } else { @@ -140,7 +135,7 @@ LSXProcessor::processWord(InputFile& input, UFILE* output) { i++; } - s.step_override(lu[i], towlower(lu[i]), any_char, lu[i]); + s.step_override(lu[i], u_tolower(lu[i]), any_char, lu[i]); } } s.step(word_boundary, word_boundary_s, word_boundary_ns); diff --git a/src/lsx_processor.h b/src/lsx_processor.h index 7861cf0..7143f71 100644 --- a/src/lsx_processor.h +++ b/src/lsx_processor.h @@ -1,24 +1,30 @@ #ifndef _LSX_PROCESSOR_H_ #define _LSX_PROCESSOR_H_ -#include +#include #include #include #include -#include +#include +#include #include #include class LSXProcessor { private: - Node root; - std::map trans; + StringWriter str_write; + std::map transducers; State initial_state; + std::set all_finals; + AlphabetExe alphabet; + + bool mmapping = false; + void* mmap_pointer; + int mmap_len; + std::set escaped_chars; std::set alphabetic_chars; - std::map all_finals; - Alphabet alphabet; bool null_flush = true; bool dictionary_case = false; bool at_end = false;