14,205 changes: 0 additions & 14,205 deletions machine/corpora/BiblicalTermsPt.xml

Large diffs are not rendered by default.

7 changes: 6 additions & 1 deletion machine/corpora/__init__.py
@@ -9,6 +9,7 @@
from .dictionary_text_corpus import DictionaryTextCorpus
from .file_paratext_project_file_handler import FileParatextProjectFileHandler
from .file_paratext_project_settings_parser import FileParatextProjectSettingsParser
from .file_paratext_project_terms_parser import FileParatextProjectTermsParser
from .file_paratext_project_text_updater import FileParatextProjectTextUpdater
from .file_paratext_project_versification_error_detector import FileParatextProjectVersificationErrorDetector
from .flatten import flatten
@@ -25,7 +26,7 @@
from .paratext_project_file_handler import ParatextProjectFileHandler
from .paratext_project_settings import ParatextProjectSettings
from .paratext_project_settings_parser_base import ParatextProjectSettingsParserBase
from .paratext_project_terms_parser_base import ParatextProjectTermsParserBase
from .paratext_project_terms_parser_base import KeyTerm, ParatextProjectTermsParserBase
from .paratext_project_text_updater_base import ParatextProjectTextUpdaterBase
from .paratext_project_versification_error_detector import ParatextProjectVersificationErrorDetector
from .paratext_text_corpus import ParatextTextCorpus
@@ -47,6 +48,7 @@
from .text_file_text import TextFileText
from .text_file_text_corpus import TextFileTextCorpus
from .text_row import TextRow, TextRowFlags
from .text_row_content_type import TextRowContentType
from .token_processors import (
escape_spaces,
lowercase,
@@ -101,6 +103,7 @@
"batch",
"Corpus",
"create_versification_ref_corpus",
"TextRowContentType",
"DblBundleTextCorpus",
"DictionaryAlignmentCorpus",
"DictionaryTextCorpus",
@@ -109,10 +112,12 @@
"extract_scripture_corpus",
"FileParatextProjectFileHandler",
"FileParatextProjectSettingsParser",
"FileParatextProjectTermsParser",
"FileParatextProjectTextUpdater",
"FileParatextProjectVersificationErrorDetector",
"flatten",
"is_scripture",
"KeyTerm",
"lowercase",
"MemoryAlignmentCollection",
"MemoryStreamContainer",
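Note: the three names added to machine/corpora/__init__.py above become part of the package's public surface; a quick sanity-check sketch (no other assumptions):

from machine.corpora import FileParatextProjectTermsParser, KeyTerm, TextRowContentType

# SEGMENT is the default content type used throughout this PR; WORD is used for
# the biblical-terms rows further down. Listing the members shows what is available.
print([member.name for member in TextRowContentType])
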
8 changes: 4 additions & 4 deletions machine/corpora/corpora_utils.py
@@ -49,14 +49,14 @@ def get_split_indices(
return set(rand.sample(range(corpus_size), min(split_size, corpus_size)))


def get_files(file_patterns: Iterable[str]) -> Iterable[Tuple[str, str]]:
def get_files(file_patterns: Iterable[str]) -> Iterable[Tuple[str, str, int]]:
file_patterns = list(file_patterns)
if len(file_patterns) == 1 and os.path.isfile(file_patterns[0]):
yield ("*all*", file_patterns[0])
yield ("*all*", file_patterns[0], 0)
else:
for i, file_pattern in enumerate(file_patterns):
if os.path.isfile(file_pattern):
yield (str(i), file_pattern)
yield (str(i), file_pattern, i)
continue

if "*" not in file_pattern and "?" not in file_pattern and not os.path.exists(file_pattern):
@@ -89,7 +89,7 @@ def get_files(file_patterns: Iterable[str]) -> Iterable[Tuple[str, str]]:
updated_id += group
if len(updated_id) > 0:
id = updated_id
yield (id, filename)
yield (id, filename, i)


def gen(iterable: Iterable[T] = []) -> Generator[T, None, None]:
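Note: get_files now yields three-element tuples, the third element being the index of the input pattern that produced the file. A minimal sketch of consuming it (the glob patterns are placeholders, and the helper is imported from its own module):

from machine.corpora.corpora_utils import get_files

# Each yielded tuple is (text_id, filename, pattern_index).
for text_id, filename, pattern_index in get_files(["source/*.txt", "target/*.txt"]):
    print(f"{text_id}: {filename} (from pattern #{pattern_index})")
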
11 changes: 11 additions & 0 deletions machine/corpora/file_paratext_project_terms_parser.py
@@ -0,0 +1,11 @@
from ..utils.typeshed import StrPath
from .file_paratext_project_file_handler import FileParatextProjectFileHandler
from .file_paratext_project_settings_parser import FileParatextProjectSettingsParser
from .paratext_project_terms_parser_base import ParatextProjectTermsParserBase


class FileParatextProjectTermsParser(ParatextProjectTermsParserBase):
def __init__(self, project_dir: StrPath) -> None:
super().__init__(
FileParatextProjectFileHandler(project_dir), FileParatextProjectSettingsParser(project_dir).parse()
)
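Note: a usage sketch for the new file-based parser. The parse arguments mirror the ZipParatextProjectTermsParser.parse call in paratext_backup_terms_corpus.py below; the project directory and the "PN" category are placeholders.

from machine.corpora import FileParatextProjectTermsParser

# "my_paratext_project" is a placeholder Paratext project directory containing
# project settings and a biblical terms file.
parser = FileParatextProjectTermsParser("my_paratext_project")
key_terms = parser.parse(["PN"], True)  # term categories, use_term_glosses
for term in key_terms:
    print(term.id, term.renderings)
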
14 changes: 14 additions & 0 deletions machine/corpora/key_term.py
@@ -0,0 +1,14 @@
from dataclasses import dataclass
from typing import List

from ..scripture.verse_ref import VerseRef


@dataclass(frozen=True)
class KeyTerm:
id: str
category: str
domain: str
renderings: List[str]
references: List[VerseRef]
renderings_patterns: List[str]
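Note: KeyTerm is a frozen dataclass, so a value can be built directly; the field values below are placeholders, and the empty references list avoids constructing VerseRef objects here.

from machine.corpora import KeyTerm

term = KeyTerm(
    id="abraham",            # placeholder term id
    category="PN",           # placeholder category
    domain="person",         # placeholder semantic domain
    renderings=["Abraham"],
    references=[],           # would normally hold VerseRef objects
    renderings_patterns=[],
)
print(term.renderings)
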
11 changes: 9 additions & 2 deletions machine/corpora/n_parallel_text_corpus.py
@@ -8,12 +8,14 @@
from .text_corpus import TextCorpus
from .text_corpus_enumerator import TextCorpusEnumerator
from .text_row import TextRow, TextRowFlags
from .text_row_content_type import TextRowContentType


class _RangeRow:
refs: List[Any]
segment: List[str]
is_sentence_start: bool = False
content_type: TextRowContentType = TextRowContentType.SEGMENT

@property
def is_in_range(self):
@@ -36,6 +38,7 @@ def __init__(self, n: int):
self.text_id = ""
self.versifications: Optional[List[Versification]] = None
self.row_ref_comparer = None
self.content_type = TextRowContentType.SEGMENT

@property
def is_in_range(self) -> bool:
@@ -44,6 +47,7 @@ def is_in_range(self) -> bool:
def add_text_row(self, row: TextRow, index: int):
self.text_id = row.text_id
self.rows[index].refs.append(row.ref)
self.rows[index].content_type = row.content_type
if self.rows[index].is_empty:
self.rows[index].is_sentence_start = row.is_sentence_start
self.rows[index].segment.extend(row.segment)
@@ -53,6 +57,7 @@ def create_row(self) -> NParallelTextRow:
reference_refs: List[Any] = [r.refs[0] if len(r.refs) > 0 else None for r in self.rows if len(r.refs) > 0]
for i in range(len(self.rows)):
row = self.rows[i]
self.content_type = row.content_type

if (
self.versifications is not None
@@ -62,7 +67,7 @@
refs[i] = [cast(ScriptureRef, r).change_versification(self.versifications[i]) for r in reference_refs]
else:
refs[i] = row.refs.copy()
n_parallel_text_row = NParallelTextRow(self.text_id, refs)
n_parallel_text_row = NParallelTextRow(self.text_id, refs, self.content_type)
n_parallel_text_row.n_segments = [r.segment.copy() for r in self.rows]
n_parallel_text_row.n_flags = [
TextRowFlags.SENTENCE_START if r.is_sentence_start else TextRowFlags.NONE for r in self.rows
@@ -288,6 +293,7 @@ def _create_rows(
yield range_info.create_row()

default_refs = [[r.ref for r in rows if r is not None][0]]
content_type = TextRowContentType.SEGMENT

text_id: Optional[str] = None
refs: List[List[Any]] = []
@@ -298,6 +304,7 @@
for i in range(len(rows)):
row = rows[i]
if row is not None:
content_type = row.content_type
text_id = text_id or row.text_id
if self.corpora[i].is_scripture:
refs[i] = self._correct_versification([row.ref] if row.ref is None else default_refs, i)
@@ -314,7 +321,7 @@
)
refs = [r or default_refs for r in refs]

new_row = NParallelTextRow(cast(str, text_id), refs)
new_row = NParallelTextRow(cast(str, text_id), refs, content_type)
new_row.n_segments = [r.segment if r is not None else [] for r in rows]
new_row.n_flags = flags
yield new_row
15 changes: 13 additions & 2 deletions machine/corpora/n_parallel_text_row.py
@@ -1,17 +1,24 @@
from typing import Any, Sequence

from .text_row import TextRowFlags
from .text_row_content_type import TextRowContentType


class NParallelTextRow:
def __init__(self, text_id: str, n_refs: Sequence[Sequence[Any]]):
def __init__(
self,
text_id: str,
n_refs: Sequence[Sequence[Any]],
content_type: TextRowContentType = TextRowContentType.SEGMENT,
):
if len([n_ref for n_ref in n_refs if n_ref is not None and len(n_ref) > 0]) == 0:
raise ValueError(f"Refs must be provided but n_refs={n_refs}")
self._text_id = text_id
self._n_refs = n_refs
self._n = len(n_refs)
self.n_segments: Sequence[Sequence[str]] = [[] for _ in range(0, self._n)]
self.n_flags: Sequence[TextRowFlags] = [TextRowFlags.SENTENCE_START for _ in range(0, self._n)]
self._content_type = content_type

@property
def text_id(self) -> str:
@@ -21,6 +28,10 @@ def text_id(self) -> str:
def ref(self) -> Any:
return self._n_refs[0][0]

@property
def content_type(self) -> TextRowContentType:
return self._content_type

@property
def n_refs(self) -> Sequence[Sequence[Any]]:
return self._n_refs
@@ -42,6 +53,6 @@ def text(self, i: int) -> str:
return " ".join(self.n_segments[i])

def invert(self) -> "NParallelTextRow":
inverted_row = NParallelTextRow(self._text_id, list(reversed(self._n_refs)))
inverted_row = NParallelTextRow(self._text_id, list(reversed(self._n_refs)), content_type=self.content_type)
inverted_row.n_flags = list(reversed(self.n_flags))
return inverted_row
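Note: a sketch of the new content_type argument on NParallelTextRow (imported from its module); the refs and segments are placeholders, and at least one ref list must be non-empty per the constructor check.

from machine.corpora import TextRowContentType
from machine.corpora.n_parallel_text_row import NParallelTextRow

row = NParallelTextRow("text1", [[1], [1]], content_type=TextRowContentType.WORD)
row.n_segments = [["casa"], ["house"]]
print(row.content_type)           # TextRowContentType.WORD
print(row.invert().content_type)  # invert() now carries the content type across
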
8 changes: 7 additions & 1 deletion machine/corpora/parallel_text_corpus.py
@@ -27,6 +27,7 @@
from .corpora_utils import get_split_indices
from .corpus import Corpus
from .parallel_text_row import ParallelTextRow
from .text_row_content_type import TextRowContentType
from .token_processors import escape_spaces, lowercase, normalize, unescape_spaces

if TYPE_CHECKING:
@@ -401,10 +402,11 @@ def to_hf_dataset(
ref_column: Optional[str] = "ref",
translation_column: str = "translation",
alignment_column: Optional[str] = "alignment",
content_type_column: Optional[str] = "content_type",
) -> Dataset:
try:
from datasets.arrow_dataset import Dataset
from datasets.features.features import Features, FeatureType, Sequence, Value
from datasets.features.features import ClassLabel, Features, FeatureType, Sequence, Value
from datasets.features.translation import Translation
except ImportError:
raise RuntimeError("datasets is not installed.")
@@ -416,6 +418,8 @@
features_dict[ref_column] = Sequence(Value("string"))
if alignment_column is not None:
features_dict[alignment_column] = Sequence({source_lang: Value("int32"), target_lang: Value("int32")})
if content_type_column is not None:
features_dict[content_type_column] = ClassLabel(names=[e.name for e in TextRowContentType])
features = Features(features_dict)

def iterable() -> Iterable[dict]:
Expand All @@ -426,6 +430,8 @@ def iterable() -> Iterable[dict]:
example[text_id_column] = row.text_id
if ref_column is not None:
example[ref_column] = row.refs
if content_type_column is not None:
example[content_type_column] = row.content_type.name
example[translation_column] = {source_lang: row.source_text, target_lang: row.target_text}
if alignment_column is not None:
src_indices: List[int] = []
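Note: a hedged sketch of reading the new content_type column out of the exported dataset. It assumes the usual TextFileTextCorpus/align_rows pattern for pairing two line-aligned text files, and the source_lang/target_lang parameter names are taken from the method body above (their declarations fall in the elided part of the signature); the file paths are placeholders, and the optional datasets package must be installed.

from machine.corpora import TextFileTextCorpus

# Placeholder one-sentence-per-line files.
source_corpus = TextFileTextCorpus("source.txt")
target_corpus = TextFileTextCorpus("target.txt")
parallel_corpus = source_corpus.align_rows(target_corpus)

ds = parallel_corpus.to_hf_dataset(source_lang="en", target_lang="es")
print(ds.features["content_type"])  # ClassLabel over the TextRowContentType names
print(ds[0]["content_type"])        # encoded content-type label of the first row
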
8 changes: 8 additions & 0 deletions machine/corpora/parallel_text_row.py
@@ -4,6 +4,7 @@

from .aligned_word_pair import AlignedWordPair
from .text_row import TextRowFlags
from .text_row_content_type import TextRowContentType


class ParallelTextRow(Sequence[Sequence[str]]):
@@ -17,6 +18,7 @@ def __init__(
aligned_word_pairs: Optional[Collection[AlignedWordPair]] = None,
source_flags: TextRowFlags = TextRowFlags.SENTENCE_START,
target_flags: TextRowFlags = TextRowFlags.SENTENCE_START,
content_type: TextRowContentType = TextRowContentType.SEGMENT,
) -> None:
if not text_id:
raise ValueError("A text_id must be set.")
@@ -25,6 +27,7 @@ def __init__(
self._text_id = text_id
self._source_refs = source_refs
self._target_refs = target_refs
self._content_type = content_type
self.source_segment = source_segment
self.target_segment = target_segment
self.aligned_word_pairs = aligned_word_pairs
@@ -51,6 +54,10 @@ def ref(self) -> Any:
def refs(self) -> Sequence[Any]:
return self.target_refs if len(self.source_refs) == 0 else self.source_refs

@property
def content_type(self) -> TextRowContentType:
return self._content_type

@property
def is_source_sentence_start(self) -> bool:
return TextRowFlags.SENTENCE_START in self.source_flags
@@ -107,4 +114,5 @@ def invert(self) -> ParallelTextRow:
None if self.aligned_word_pairs is None else [wp.invert() for wp in self.aligned_word_pairs],
self.target_flags,
self.source_flags,
self.content_type,
)
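Note: a sketch of the new content_type parameter on ParallelTextRow (imported from its module); the refs and segments are placeholders, passed positionally in the same order invert() uses above.

from machine.corpora import TextRowContentType
from machine.corpora.parallel_text_row import ParallelTextRow

row = ParallelTextRow("text1", [1], [1], ["casa"], ["house"], content_type=TextRowContentType.WORD)
print(row.content_type)           # TextRowContentType.WORD
print(row.invert().content_type)  # invert() now forwards the content type
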
14 changes: 11 additions & 3 deletions machine/corpora/paratext_backup_terms_corpus.py
@@ -1,10 +1,12 @@
from typing import List, Sequence, Tuple
from typing import Sequence
from zipfile import ZipFile

from ..utils.typeshed import StrPath
from .dictionary_text_corpus import DictionaryTextCorpus
from .key_term import KeyTerm
from .memory_text import MemoryText
from .text_row import TextRow
from .text_row_content_type import TextRowContentType
from .zip_paratext_project_settings_parser import ZipParatextProjectSettingsParser
from .zip_paratext_project_terms_parser import ZipParatextProjectTermsParser

@@ -15,7 +17,7 @@ def __init__(self, filename: StrPath, term_categories: Sequence[str], use_term_g

with ZipFile(filename, "r") as archive:
settings = ZipParatextProjectSettingsParser(archive).parse()
glosses: List[Tuple[str, List[str]]] = ZipParatextProjectTermsParser(archive, settings).parse(
key_terms: Sequence[KeyTerm] = ZipParatextProjectTermsParser(archive, settings).parse(
term_categories, use_term_glosses
)
text_id = (
@@ -24,5 +26,11 @@ def __init__(self, filename: StrPath, term_categories: Sequence[str], use_term_g
f"{settings.biblical_terms_file_name}"
)

text = MemoryText(text_id, [TextRow(text_id, kvp[0], kvp[1]) for kvp in glosses])
text = MemoryText(
text_id,
[
TextRow(text_id, key_term.id, key_term.renderings, content_type=TextRowContentType.WORD)
for key_term in key_terms
],
)
self._add_text(text)
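Note: a sketch of the end-to-end effect, assuming the class defined in this file keeps its ParatextBackupTermsCorpus name and that TextRow exposes the passed content type as row.content_type (mirroring the ParallelTextRow property above); "backup.zip" and the "PN" category are placeholders.

from machine.corpora import TextRowContentType
from machine.corpora.paratext_backup_terms_corpus import ParatextBackupTermsCorpus

corpus = ParatextBackupTermsCorpus("backup.zip", ["PN"], use_term_glosses=True)
for row in corpus.get_rows():
    # One row per key term: ref is the term id, the segment holds its renderings,
    # and the row is now marked as WORD content instead of the default SEGMENT.
    print(row.ref, row.text, row.content_type == TextRowContentType.WORD)
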