Skip to content

Commit 21e2b81

Browse files
Test PDF file conversion
1 parent 04b0c37 commit 21e2b81

1 file changed

Lines changed: 52 additions & 13 deletions

File tree

cardinal_pythonlib/tests/extract_text_tests.py

Lines changed: 52 additions & 13 deletions
Original file line number | Diff line number | Diff line change
@@ -27,12 +27,13 @@
2727

2828
import os
2929
import subprocess
30-
from tempfile import TemporaryDirectory, NamedTemporaryFile
30+
from tempfile import mkdtemp, NamedTemporaryFile
3131
from unittest import mock, TestCase
3232

3333
from faker import Faker
3434
from faker_file.providers.docx_file import DocxFileProvider
3535
from faker_file.providers.odt_file import OdtFileProvider
36+
from faker_file.providers.pdf_file import PdfFileProvider
3637

3738
from cardinal_pythonlib.extract_text import (
3839
document_to_text,
@@ -42,17 +43,15 @@
4243

4344

4445
class DocumentToTextTests(TestCase):
45-
# For external tools we assume the tools are running correctly
46-
# and we just check that they are invoked with the correct arguments.
4746
def setUp(self) -> None:
48-
update_external_tools(
49-
{
50-
"antiword": "/path/to/antiword",
51-
}
52-
)
47+
self.empty_dir = mkdtemp()
5348

49+
self._replace_external_tools_with_fakes()
5450
self.config = TextProcessingConfig()
51+
self._create_mock_objects()
52+
self._register_faker_providers()
5553

54+
def _create_mock_objects(self) -> None:
5655
# Some mock empty output that we don't check
5756
mock_decode = mock.Mock(return_value="")
5857
mock_stdout = mock.Mock(decode=mock_decode)
@@ -61,9 +60,29 @@ def setUp(self) -> None:
6160
return_value=mock.Mock(communicate=mock_communicate)
6261
)
6362

63+
def _register_faker_providers(self) -> None:
    """Create ``self.fake`` and attach the document file providers to it."""
    faker = Faker()
    # Providers are attached in the same order as before: DOCX, ODT, PDF.
    for provider in (DocxFileProvider, OdtFileProvider, PdfFileProvider):
        faker.add_provider(provider)
    self.fake = faker
68+
69+
def _replace_external_tools_with_fakes(self) -> None:
    """Register a path inside ``self.empty_dir`` for each external tool.

    For external tools we assume the tools are running correctly and we
    just check that they are invoked with the correct arguments.
    """
    fake_tool_paths = {}
    for tool_name in (
        "antiword",
        "pdftotext",
        "strings",
        "strings2",
        "unrtf",
    ):
        fake_tool_paths[tool_name] = os.path.join(self.empty_dir, tool_name)

    update_external_tools(fake_tool_paths)
83+
84+
def tearDown(self) -> None:
    """Remove the scratch directory stored in ``self.empty_dir``.

    The whole tree is removed rather than calling ``os.rmdir``, which
    raises ``OSError`` on a non-empty directory: a stray file written there
    during a test would otherwise fail the teardown and leave the
    directory behind.
    """
    # Local import so this method does not depend on the module's import
    # block being changed.
    import shutil

    shutil.rmtree(self.empty_dir)
6786

6887
def test_raises_when_no_filename_or_blob(self) -> None:
6988
with self.assertRaises(ValueError) as cm:
@@ -85,9 +104,8 @@ def test_raises_when_blob_but_no_extension(self) -> None:
85104

86105
def test_raises_when_not_a_file(self) -> None:
87106
with self.assertRaises(ValueError) as cm:
88-
with TemporaryDirectory() as temp_dir_name:
89-
filename = os.path.join(temp_dir_name, "foo")
90-
document_to_text(filename=filename)
107+
filename = os.path.join(self.empty_dir, "foo")
108+
document_to_text(filename=filename)
91109

92110
self.assertIn("no such file", str(cm.exception))
93111

@@ -113,7 +131,7 @@ def test_doc_converted_with_antiword(self) -> None:
113131
expected_calls = [
114132
mock.call(
115133
(
116-
"/path/to/antiword",
134+
f"{self.empty_dir}/antiword",
117135
"-w",
118136
str(self.config.width),
119137
temp_file.name,
@@ -135,7 +153,7 @@ def test_dot_converted_with_antiword(self) -> None:
135153
expected_calls = [
136154
mock.call(
137155
(
138-
"/path/to/antiword",
156+
f"{self.empty_dir}/antiword",
139157
"-w",
140158
str(self.config.width),
141159
temp_file.name,
@@ -193,3 +211,24 @@ def test_odt_converted(self) -> None:
193211
text = document_to_text(odt.data["filename"], config=self.config)
194212

195213
self.assertEqual(text.strip(), content)
214+
215+
def test_pdf_converted(self) -> None:
    """Check that a ``.pdf`` file is passed to the registered ``pdftotext``.

    ``subprocess.Popen`` in ``cardinal_pythonlib.extract_text`` is replaced
    by ``self.mock_popen``, so no real tool runs; only the command line
    and the ``stdout`` argument are verified.
    """
    # The file is created closed and kept on disk (delete=False) so that
    # it still exists when document_to_text() is called with its name.
    with NamedTemporaryFile(suffix=".pdf", delete=False) as temp_file:
        temp_file.close()
    # delete=False means nothing else removes the file, so register the
    # removal here; it runs even if the assertions below fail.
    self.addCleanup(os.remove, temp_file.name)

    with mock.patch.multiple(
        "cardinal_pythonlib.extract_text.subprocess",
        Popen=self.mock_popen,
    ):
        document_to_text(filename=temp_file.name, config=self.config)

    expected_calls = [
        mock.call(
            (
                # Built with os.path.join, exactly as the fake tool paths
                # are registered, so the comparison does not depend on
                # the platform's path separator.
                os.path.join(self.empty_dir, "pdftotext"),
                temp_file.name,
                "-",
            ),
            stdout=subprocess.PIPE,
        ),
    ]
    self.mock_popen.assert_has_calls(expected_calls)

0 commit comments

Comments
 (0)