2727
2828import os
2929import subprocess
30- from tempfile import TemporaryDirectory , NamedTemporaryFile
30+ from tempfile import mkdtemp , NamedTemporaryFile
3131from unittest import mock , TestCase
3232
3333from faker import Faker
3434from faker_file .providers .docx_file import DocxFileProvider
3535from faker_file .providers .odt_file import OdtFileProvider
36+ from faker_file .providers .pdf_file import PdfFileProvider
3637
3738from cardinal_pythonlib .extract_text import (
3839 document_to_text ,
4243
4344
4445class DocumentToTextTests (TestCase ):
45- # For external tools we assume the tools are running correctly
46- # and we just check that they are invoked with the correct arguments.
def setUp(self) -> None:
    """Prepare the scratch directory, fake tools, config, mocks and Faker."""
    # Scratch directory used as the parent of the fake tool paths; it is
    # removed again in tearDown().
    self.empty_dir = mkdtemp()

    self._replace_external_tools_with_fakes()
    self.config = TextProcessingConfig()
    self._create_mock_objects()
    self._register_faker_providers()
5553
54+ def _create_mock_objects (self ) -> None :
5655 # Some mock empty output that we don't check
5756 mock_decode = mock .Mock (return_value = "" )
5857 mock_stdout = mock .Mock (decode = mock_decode )
@@ -61,9 +60,29 @@ def setUp(self) -> None:
6160 return_value = mock .Mock (communicate = mock_communicate )
6261 )
6362
def _register_faker_providers(self) -> None:
    """Create ``self.fake`` with the docx, odt and pdf file providers."""
    self.fake = fake = Faker()
    for provider in (DocxFileProvider, OdtFileProvider, PdfFileProvider):
        fake.add_provider(provider)
68+
def _replace_external_tools_with_fakes(self) -> None:
    """Register placeholder paths for the external conversion tools.

    For external tools we assume the tools are running correctly and we
    just check that they are invoked with the correct arguments, so each
    tool name is mapped to a path inside ``self.empty_dir``.
    """
    fake_tool_paths = {}
    for name in ("antiword", "pdftotext", "strings", "strings2", "unrtf"):
        fake_tool_paths[name] = os.path.join(self.empty_dir, name)

    update_external_tools(fake_tool_paths)
83+
def tearDown(self) -> None:
    """Remove the scratch directory stored in ``self.empty_dir``."""
    scratch_dir = self.empty_dir
    # os.rmdir only succeeds on an empty directory.
    os.rmdir(scratch_dir)
6786
6887 def test_raises_when_no_filename_or_blob (self ) -> None :
6988 with self .assertRaises (ValueError ) as cm :
@@ -85,9 +104,8 @@ def test_raises_when_blob_but_no_extension(self) -> None:
85104
def test_raises_when_not_a_file(self) -> None:
    """A filename that does not exist on disk raises ``ValueError``."""
    # This test never creates "foo", so the path does not refer to a file.
    filename = os.path.join(self.empty_dir, "foo")

    with self.assertRaises(ValueError) as cm:
        document_to_text(filename=filename)

    self.assertIn("no such file", str(cm.exception))
93111
@@ -113,7 +131,7 @@ def test_doc_converted_with_antiword(self) -> None:
113131 expected_calls = [
114132 mock .call (
115133 (
116- "/path/to /antiword" ,
134+ f" { self . empty_dir } /antiword" ,
117135 "-w" ,
118136 str (self .config .width ),
119137 temp_file .name ,
@@ -135,7 +153,7 @@ def test_dot_converted_with_antiword(self) -> None:
135153 expected_calls = [
136154 mock .call (
137155 (
138- "/path/to /antiword" ,
156+ f" { self . empty_dir } /antiword" ,
139157 "-w" ,
140158 str (self .config .width ),
141159 temp_file .name ,
@@ -193,3 +211,24 @@ def test_odt_converted(self) -> None:
193211 text = document_to_text (odt .data ["filename" ], config = self .config )
194212
195213 self .assertEqual (text .strip (), content )
214+
def test_pdf_converted(self) -> None:
    """A ``.pdf`` file is passed to the registered ``pdftotext`` tool.

    The tool itself is not run: ``subprocess.Popen`` is replaced with
    ``self.mock_popen`` and only the command line it receives is checked.
    """
    # Create an empty .pdf and close it straight away so that the code
    # under test can open it by name.
    with NamedTemporaryFile(suffix=".pdf", delete=False) as temp_file:
        pass
    # delete=False leaves the file on disk after closing; remove it once
    # the test has finished so runs do not accumulate temp files.
    self.addCleanup(os.remove, temp_file.name)

    with mock.patch.multiple(
        "cardinal_pythonlib.extract_text.subprocess",
        Popen=self.mock_popen,
    ):
        document_to_text(filename=temp_file.name, config=self.config)

    expected_calls = [
        mock.call(
            (
                # Built with os.path.join, like the registered fake path.
                os.path.join(self.empty_dir, "pdftotext"),
                temp_file.name,
                "-",
            ),
            stdout=subprocess.PIPE,
        ),
    ]
    self.mock_popen.assert_has_calls(expected_calls)
0 commit comments