Hi,
I set up OpenChemIE by:
conda create -n openchemie python=3.9
conda activate openchemie
git clone https://github.com/CrystalEye42/OpenChemIE.git
cd OpenChemIE
pip install --editable .
pip install pdftotext
conda install -c conda-forge poppler
conda install jupyter
jupyter notebook
Then I created a Jupyter notebook in the folder that contains the example PDF (acs.joc.2c00749.pdf).
Then I do:
import torch
from openchemie import OpenChemIE
model = OpenChemIE()
pdf_path = 'acs.joc.2c00749.pdf'
text_results = model.extract_reactions_from_text_in_pdf(pdf_path)
text_results
Which returns:
[{'page': 1, 'reactions': []},
{'page': 2, 'reactions': []},
{'page': 3, 'reactions': []},
{'page': 4, 'reactions': []},
{'page': 5, 'reactions': []},
{'page': 6, 'reactions': []},
{'page': 7, 'reactions': []},
{'page': 8, 'reactions': []},
{'page': 9, 'reactions': []},
{'page': 10, 'reactions': []}]
When I do:
figure_results = model.extract_reactions_from_figures_in_pdf(pdf_path)
I get:
OSError Traceback (most recent call last)
Cell In[4], line 1
----> 1 figure_results = model.extract_reactions_from_figures_in_pdf(pdf_path)
File c:\users\lyubomir\openchemie\openchemie\interface.py:450, in OpenChemIE.extract_reactions_from_figures_in_pdf(self, pdf, batch_size, num_pages, molscribe, ocr)
404 def extract_reactions_from_figures_in_pdf(self, pdf, batch_size=16, num_pages=None, molscribe=True, ocr=True):
405 """
406 Get reaction information from figures in pdf
407 Parameters:
(...)
448 ]
449 """
--> 450 figures = self.extract_figures_from_pdf(pdf, num_pages=num_pages, output_bbox=True)
451 images = [figure['figure']['image'] for figure in figures]
452 results = self.extract_reactions_from_figures(images, batch_size=batch_size, molscribe=molscribe, ocr=ocr)
File c:\users\lyubomir\openchemie\openchemie\interface.py:203, in OpenChemIE.extract_figures_from_pdf(self, pdf, num_pages, output_bbox, output_image)
199 table_ext.set_output_image(output_image)
201 table_ext.set_output_bbox(output_bbox)
--> 203 return table_ext.extract_all_tables_and_figures(pages, self.pdfparser, content='figures')
File c:\users\lyubomir\openchemie\openchemie\interface.py:74, in OpenChemIE.pdfparser(self)
71 @property
72 def pdfparser(self):
73 if self._pdfparser is None:
---> 74 self.init_pdfparser()
75 return self._pdfparser
File c:\users\lyubomir\openchemie\openchemie\interface.py:85, in OpenChemIE.init_pdfparser(self, ckpt_path)
79 """
80 Set model to custom checkpoint
81 Parameters:
82 ckpt_path: path to checkpoint to use, if None then will use default
83 """
84 config_path = "lp://efficientdet/PubLayNet/tf_efficientdet_d1"
---> 85 self._pdfparser = lp.AutoLayoutModel(config_path, model_path=ckpt_path, device=self.device.type)
File ~\anaconda3\envs\open_chemie\lib\site-packages\layoutparser\models\auto_layoutmodel.py:64, in AutoLayoutModel(config_path, model_path, label_map, device, extra_config)
62 for backend_name in ALL_AVAILABLE_BACKENDS:
63 if backend_name in config_path:
---> 64 return ALL_AVAILABLE_BACKENDS[backend_name](
65 config_path,
66 model_path=model_path,
67 label_map=label_map,
68 extra_config=extra_config,
69 device=device,
70 )
File ~\anaconda3\envs\open_chemie\lib\site-packages\layoutparser\models\effdet\layoutmodel.py:138, in EfficientDetLayoutModel.__init__(self, config_path, model_path, label_map, extra_config, enforce_cpu, device)
134 self.device = device
136 extra_config = extra_config if extra_config is not None else {}
--> 138 self._initialize_model(config_path, model_path, label_map, extra_config)
140 self.output_confidence_threshold = extra_config.get(
141 "output_confidence_threshold", self.DEFAULT_OUTPUT_CONFIDENCE_THRESHOLD
142 )
144 self.preprocessor = InputTransform(self.config.image_size)
File ~\anaconda3\envs\open_chemie\lib\site-packages\layoutparser\models\effdet\layoutmodel.py:164, in EfficientDetLayoutModel._initialize_model(self, config_path, model_path, label_map, extra_config)
161 label_map = LABEL_MAP_CATALOG[dataset_name]
162 num_classes = len(label_map)
--> 164 model_path = PathManager.get_local_path(model_path)
166 self.model = create_model(
167 model_name,
168 num_classes=num_classes,
(...)
171 checkpoint_path=model_path,
172 )
173 else:
File ~\anaconda3\envs\open_chemie\lib\site-packages\iopath\common\file_io.py:1251, in PathManager.get_local_path(self, path, force, **kwargs)
1249 handler = self.__get_path_handler(path) # type: ignore
1250 try:
-> 1251 bret = handler._get_local_path(path, force=force, **kwargs)
1252 except TypeError:
1253 bret = handler._get_local_path(path, **kwargs)
File ~\anaconda3\envs\open_chemie\lib\site-packages\layoutparser\models\effdet\catalog.py:64, in LayoutParserEfficientDetModelHandler._get_local_path(self, path, **kwargs)
62 else:
63 raise ValueError(f"Unknown data_type {data_type}")
---> 64 return PathManager.get_local_path(model_url, **kwargs)
File ~\anaconda3\envs\open_chemie\lib\site-packages\iopath\common\file_io.py:1251, in PathManager.get_local_path(self, path, force, **kwargs)
1249 handler = self.__get_path_handler(path) # type: ignore
1250 try:
-> 1251 bret = handler._get_local_path(path, force=force, **kwargs)
1252 except TypeError:
1253 bret = handler._get_local_path(path, **kwargs)
File ~\anaconda3\envs\open_chemie\lib\site-packages\iopath\common\file_io.py:835, in HTTPURLHandler._get_local_path(self, path, force, cache_dir, **kwargs)
832 filename = filename[:100] + "_" + uuid.uuid4().hex
834 cached = os.path.join(dirname, filename)
--> 835 with file_lock(cached):
836 if not os.path.isfile(cached):
837 logger.info("Downloading {} ...".format(path))
File ~\anaconda3\envs\open_chemie\lib\site-packages\portalocker\utils.py:302, in Lock.__enter__(self)
301 def __enter__(self) -> typing.IO[typing.AnyStr]:
--> 302 return self.acquire()
File ~\anaconda3\envs\open_chemie\lib\site-packages\portalocker\utils.py:256, in Lock.acquire(self, timeout, check_interval, fail_when_locked)
253 return fh
255 # Get a new filehandler
--> 256 fh = self._get_fh()
258 def try_close(): # pragma: no cover
259 # Silently try to close the handle if possible, ignore all issues
260 if fh is not None:
File ~\anaconda3\envs\open_chemie\lib\site-packages\portalocker\utils.py:313, in Lock._get_fh(self)
311 def _get_fh(self) -> typing.IO:
312 '''Get a new filehandle'''
--> 313 return open( # noqa: SIM115
314 self.filename,
315 self.mode,
316 **self.file_open_kwargs,
317 )
OSError: [Errno 22] Invalid argument: 'C:\Users\Lyubomir/.torch/iopath_cache\s/gxy11xkkiwnpgog\publaynet-tf_efficientdet_d1.pth.tar?dl=1.lock'
Any idea why extract_reactions_from_text_in_pdf returns empty lists and extract_reactions_from_figures_in_pdf throws an error?
Thank you in advance!
Hi,
I set up OpenChemIE by:
conda create -n openchemie python=3.9
conda activate openchemie
git clone https://github.com/CrystalEye42/OpenChemIE.git
cd OpenChemIE
pip install --editable .
pip install pdftotext
conda install -c conda-forge poppler
conda install jupyter
jupyter notebook
Then I created a Jupyter notebook in the folder that contains the example PDF (acs.joc.2c00749.pdf).
Then I do:
import torch
from openchemie import OpenChemIE
model = OpenChemIE()
pdf_path = 'acs.joc.2c00749.pdf'
text_results = model.extract_reactions_from_text_in_pdf(pdf_path)
text_results
Which returns:
[{'page': 1, 'reactions': []},
{'page': 2, 'reactions': []},
{'page': 3, 'reactions': []},
{'page': 4, 'reactions': []},
{'page': 5, 'reactions': []},
{'page': 6, 'reactions': []},
{'page': 7, 'reactions': []},
{'page': 8, 'reactions': []},
{'page': 9, 'reactions': []},
{'page': 10, 'reactions': []}]
When I do:
figure_results = model.extract_reactions_from_figures_in_pdf(pdf_path)
I get:
OSError Traceback (most recent call last)
Cell In[4], line 1
----> 1 figure_results = model.extract_reactions_from_figures_in_pdf(pdf_path)
File c:\users\lyubomir\openchemie\openchemie\interface.py:450, in OpenChemIE.extract_reactions_from_figures_in_pdf(self, pdf, batch_size, num_pages, molscribe, ocr)
404 def extract_reactions_from_figures_in_pdf(self, pdf, batch_size=16, num_pages=None, molscribe=True, ocr=True):
405 """
406 Get reaction information from figures in pdf
407 Parameters:
(...)
448 ]
449 """
--> 450 figures = self.extract_figures_from_pdf(pdf, num_pages=num_pages, output_bbox=True)
451 images = [figure['figure']['image'] for figure in figures]
452 results = self.extract_reactions_from_figures(images, batch_size=batch_size, molscribe=molscribe, ocr=ocr)
File c:\users\lyubomir\openchemie\openchemie\interface.py:203, in OpenChemIE.extract_figures_from_pdf(self, pdf, num_pages, output_bbox, output_image)
199 table_ext.set_output_image(output_image)
201 table_ext.set_output_bbox(output_bbox)
--> 203 return table_ext.extract_all_tables_and_figures(pages, self.pdfparser, content='figures')
File c:\users\lyubomir\openchemie\openchemie\interface.py:74, in OpenChemIE.pdfparser(self)
71 @property
72 def pdfparser(self):
73 if self._pdfparser is None:
---> 74 self.init_pdfparser()
75 return self._pdfparser
File c:\users\lyubomir\openchemie\openchemie\interface.py:85, in OpenChemIE.init_pdfparser(self, ckpt_path)
79 """
80 Set model to custom checkpoint
81 Parameters:
82 ckpt_path: path to checkpoint to use, if None then will use default
83 """
84 config_path = "lp://efficientdet/PubLayNet/tf_efficientdet_d1"
---> 85 self._pdfparser = lp.AutoLayoutModel(config_path, model_path=ckpt_path, device=self.device.type)
File ~\anaconda3\envs\open_chemie\lib\site-packages\layoutparser\models\auto_layoutmodel.py:64, in AutoLayoutModel(config_path, model_path, label_map, device, extra_config)
62 for backend_name in ALL_AVAILABLE_BACKENDS:
63 if backend_name in config_path:
---> 64 return ALL_AVAILABLE_BACKENDS[backend_name](
65 config_path,
66 model_path=model_path,
67 label_map=label_map,
68 extra_config=extra_config,
69 device=device,
70 )
File ~\anaconda3\envs\open_chemie\lib\site-packages\layoutparser\models\effdet\layoutmodel.py:138, in EfficientDetLayoutModel.__init__(self, config_path, model_path, label_map, extra_config, enforce_cpu, device)
134 self.device = device
136 extra_config = extra_config if extra_config is not None else {}
--> 138 self._initialize_model(config_path, model_path, label_map, extra_config)
140 self.output_confidence_threshold = extra_config.get(
141 "output_confidence_threshold", self.DEFAULT_OUTPUT_CONFIDENCE_THRESHOLD
142 )
144 self.preprocessor = InputTransform(self.config.image_size)
File ~\anaconda3\envs\open_chemie\lib\site-packages\layoutparser\models\effdet\layoutmodel.py:164, in EfficientDetLayoutModel._initialize_model(self, config_path, model_path, label_map, extra_config)
161 label_map = LABEL_MAP_CATALOG[dataset_name]
162 num_classes = len(label_map)
--> 164 model_path = PathManager.get_local_path(model_path)
166 self.model = create_model(
167 model_name,
168 num_classes=num_classes,
(...)
171 checkpoint_path=model_path,
172 )
173 else:
File ~\anaconda3\envs\open_chemie\lib\site-packages\iopath\common\file_io.py:1251, in PathManager.get_local_path(self, path, force, **kwargs)
1249 handler = self.__get_path_handler(path) # type: ignore
1250 try:
-> 1251 bret = handler._get_local_path(path, force=force, **kwargs)
1252 except TypeError:
1253 bret = handler._get_local_path(path, **kwargs)
File ~\anaconda3\envs\open_chemie\lib\site-packages\layoutparser\models\effdet\catalog.py:64, in LayoutParserEfficientDetModelHandler._get_local_path(self, path, **kwargs)
62 else:
63 raise ValueError(f"Unknown data_type {data_type}")
---> 64 return PathManager.get_local_path(model_url, **kwargs)
File ~\anaconda3\envs\open_chemie\lib\site-packages\iopath\common\file_io.py:1251, in PathManager.get_local_path(self, path, force, **kwargs)
1249 handler = self.__get_path_handler(path) # type: ignore
1250 try:
-> 1251 bret = handler._get_local_path(path, force=force, **kwargs)
1252 except TypeError:
1253 bret = handler._get_local_path(path, **kwargs)
File ~\anaconda3\envs\open_chemie\lib\site-packages\iopath\common\file_io.py:835, in HTTPURLHandler._get_local_path(self, path, force, cache_dir, **kwargs)
832 filename = filename[:100] + "_" + uuid.uuid4().hex
834 cached = os.path.join(dirname, filename)
--> 835 with file_lock(cached):
836 if not os.path.isfile(cached):
837 logger.info("Downloading {} ...".format(path))
File ~\anaconda3\envs\open_chemie\lib\site-packages\portalocker\utils.py:302, in Lock.__enter__(self)
301 def __enter__(self) -> typing.IO[typing.AnyStr]:
--> 302 return self.acquire()
File ~\anaconda3\envs\open_chemie\lib\site-packages\portalocker\utils.py:256, in Lock.acquire(self, timeout, check_interval, fail_when_locked)
253 return fh
255 # Get a new filehandler
--> 256 fh = self._get_fh()
258 def try_close(): # pragma: no cover
259 # Silently try to close the handle if possible, ignore all issues
260 if fh is not None:
File ~\anaconda3\envs\open_chemie\lib\site-packages\portalocker\utils.py:313, in Lock._get_fh(self)
311 def _get_fh(self) -> typing.IO:
312 '''Get a new filehandle'''
--> 313 return open( # noqa: SIM115
314 self.filename,
315 self.mode,
316 **self.file_open_kwargs,
317 )
OSError: [Errno 22] Invalid argument: 'C:\Users\Lyubomir/.torch/iopath_cache\s/gxy11xkkiwnpgog\publaynet-tf_efficientdet_d1.pth.tar?dl=1.lock'
Any idea why extract_reactions_from_text_in_pdf returns empty lists and extract_reactions_from_figures_in_pdf throws an error?
Thank you in advance!