From e626582ff43ffb52d7507693d3214fce440736a6 Mon Sep 17 00:00:00 2001 From: Arne Binder Date: Thu, 11 Jan 2024 17:49:14 +0100 Subject: [PATCH 1/2] add test_tokenize_document_with_slow_tokenizer() --- tests/document/processing/test_tokenization.py | 9 +++++++++ 1 file changed, 9 insertions(+) diff --git a/tests/document/processing/test_tokenization.py b/tests/document/processing/test_tokenization.py index 5b043490a..7762b0b90 100644 --- a/tests/document/processing/test_tokenization.py +++ b/tests/document/processing/test_tokenization.py @@ -643,3 +643,12 @@ def test_tokenize_document_partition(text_document, tokenizer): (str(rel.head), rel.label, str(rel.tail)) for rel in tokenized_doc.relations ] assert relation_tuples == [("('it',)", "per:founder", "('O',)")] + + +def test_tokenize_document_with_slow_tokenizer(): + tokenizer = AutoTokenizer.from_pretrained("bert-base-cased", use_fast=False) + text_document = TextBasedDocument(text="Hello World") + + tokenized_docs = tokenize_document( + text_document, tokenizer=tokenizer, result_document_type=TokenBasedDocument + ) From dcb733015ad104cb7cc14c85b882b1a519d9d18b Mon Sep 17 00:00:00 2001 From: Arne Binder Date: Thu, 11 Jan 2024 18:07:11 +0100 Subject: [PATCH 2/2] add test_tokenize_document_with_slow_tokenizer_and_windowing() --- .../document/processing/test_tokenization.py | 19 ++++++++++++++++++- 1 file changed, 18 insertions(+), 1 deletion(-) diff --git a/tests/document/processing/test_tokenization.py b/tests/document/processing/test_tokenization.py index 7762b0b90..9a82046a3 100644 --- a/tests/document/processing/test_tokenization.py +++ b/tests/document/processing/test_tokenization.py @@ -647,8 +647,25 @@ def test_tokenize_document_partition(text_document, tokenizer): def test_tokenize_document_with_slow_tokenizer(): tokenizer = AutoTokenizer.from_pretrained("bert-base-cased", use_fast=False) - text_document = TextBasedDocument(text="Hello World") + text_document = TextBasedDocument(text="Alice has a cat. Bob has a dog.") tokenized_docs = tokenize_document( text_document, tokenizer=tokenizer, result_document_type=TokenBasedDocument ) + assert len(tokenized_docs) == 1 + + +def test_tokenize_document_with_slow_tokenizer_and_windowing(): + tokenizer = AutoTokenizer.from_pretrained("bert-base-cased", use_fast=False) + text_document = TextBasedDocument(text="Alice has a cat. Bob has a dog.") + + tokenized_docs = tokenize_document( + text_document, + tokenizer=tokenizer, + result_document_type=TokenBasedDocument, + max_length=5, + return_overflowing_tokens=True, + ) + assert ( + len(tokenized_docs) == 3 + ) # the input text gets tokenized into 12 tokens and max_length is 5