From e626582ff43ffb52d7507693d3214fce440736a6 Mon Sep 17 00:00:00 2001
From: Arne Binder <arne.binder@dfki.de>
Date: Thu, 11 Jan 2024 17:49:14 +0100
Subject: [PATCH 1/2] add test_tokenize_document_with_slow_tokenizer()

---
 tests/document/processing/test_tokenization.py | 9 +++++++++
 1 file changed, 9 insertions(+)

diff --git a/tests/document/processing/test_tokenization.py b/tests/document/processing/test_tokenization.py
index 5b043490a..7762b0b90 100644
--- a/tests/document/processing/test_tokenization.py
+++ b/tests/document/processing/test_tokenization.py
@@ -643,3 +643,12 @@ def test_tokenize_document_partition(text_document, tokenizer):
         (str(rel.head), rel.label, str(rel.tail)) for rel in tokenized_doc.relations
     ]
     assert relation_tuples == [("('it',)", "per:founder", "('O',)")]
+
+
+def test_tokenize_document_with_slow_tokenizer():
+    tokenizer = AutoTokenizer.from_pretrained("bert-base-cased", use_fast=False)
+    text_document = TextBasedDocument(text="Hello World")
+
+    tokenized_docs = tokenize_document(
+        text_document, tokenizer=tokenizer, result_document_type=TokenBasedDocument
+    )

From dcb733015ad104cb7cc14c85b882b1a519d9d18b Mon Sep 17 00:00:00 2001
From: Arne Binder <arne.binder@dfki.de>
Date: Thu, 11 Jan 2024 18:07:11 +0100
Subject: [PATCH 2/2] add
 test_tokenize_document_with_slow_tokenizer_and_windowing()

---
 .../document/processing/test_tokenization.py  | 19 ++++++++++++++++++-
 1 file changed, 18 insertions(+), 1 deletion(-)

diff --git a/tests/document/processing/test_tokenization.py b/tests/document/processing/test_tokenization.py
index 7762b0b90..9a82046a3 100644
--- a/tests/document/processing/test_tokenization.py
+++ b/tests/document/processing/test_tokenization.py
@@ -647,8 +647,25 @@ def test_tokenize_document_partition(text_document, tokenizer):
 
 def test_tokenize_document_with_slow_tokenizer():
     tokenizer = AutoTokenizer.from_pretrained("bert-base-cased", use_fast=False)
-    text_document = TextBasedDocument(text="Hello World")
+    text_document = TextBasedDocument(text="Alice has a cat. Bob has a dog.")
 
     tokenized_docs = tokenize_document(
         text_document, tokenizer=tokenizer, result_document_type=TokenBasedDocument
     )
+    assert len(tokenized_docs) == 1
+
+
+def test_tokenize_document_with_slow_tokenizer_and_windowing():
+    tokenizer = AutoTokenizer.from_pretrained("bert-base-cased", use_fast=False)
+    text_document = TextBasedDocument(text="Alice has a cat. Bob has a dog.")
+
+    tokenized_docs = tokenize_document(
+        text_document,
+        tokenizer=tokenizer,
+        result_document_type=TokenBasedDocument,
+        max_length=5,
+        return_overflowing_tokens=True,
+    )
+    assert (
+        len(tokenized_docs) == 3
+    )  # the text yields 12 tokens; max_length=5 splits them into 3 windows