diff --git a/.flake8 b/.flake8 index c0511db7..6ceb856f 100644 --- a/.flake8 +++ b/.flake8 @@ -30,6 +30,7 @@ exclude = venv, .venv, build, + etc, dedoc.egg-info, docs/_build, scripts/fintoc2022/metric.py diff --git a/Dockerfile b/Dockerfile index d15e3421..463f3037 100644 --- a/Dockerfile +++ b/Dockerfile @@ -8,6 +8,7 @@ ENV RESOURCES_PATH "/dedoc_root/resources" COPY requirements.txt . RUN pip3 install --no-cache-dir -r requirements.txt +RUN pip3 install --no-cache-dir transformers~=4.49.0 RUN apt-get update && apt-get install -y --fix-missing --no-install-recommends fontforge RUN apt install -y libutf8proc-dev RUN ln -s /usr/lib/x86_64-linux-gnu/libutf8proc.so /usr/lib/libutf8proc.so.1 diff --git a/dedoc/attachments_extractors/__init__.py b/dedoc/attachments_extractors/__init__.py index 5fca697e..6c2fca6c 100644 --- a/dedoc/attachments_extractors/__init__.py +++ b/dedoc/attachments_extractors/__init__.py @@ -2,9 +2,10 @@ from .concrete_attachments_extractors.abstract_office_attachments_extractor import AbstractOfficeAttachmentsExtractor from .concrete_attachments_extractors.docx_attachments_extractor import DocxAttachmentsExtractor from .concrete_attachments_extractors.excel_attachments_extractor import ExcelAttachmentsExtractor +from .concrete_attachments_extractors.image_attachments_extractor import ImageAttachmentsExtractor from .concrete_attachments_extractors.json_attachment_extractor import JsonAttachmentsExtractor from .concrete_attachments_extractors.pdf_attachments_extractor import PDFAttachmentsExtractor from .concrete_attachments_extractors.pptx_attachments_extractor import PptxAttachmentsExtractor __all__ = ['AbstractAttachmentsExtractor', 'AbstractOfficeAttachmentsExtractor', 'DocxAttachmentsExtractor', 'ExcelAttachmentsExtractor', - 'JsonAttachmentsExtractor', 'PDFAttachmentsExtractor', 'PptxAttachmentsExtractor'] + 'ImageAttachmentsExtractor', 'JsonAttachmentsExtractor', 'PDFAttachmentsExtractor', 'PptxAttachmentsExtractor'] diff --git a/dedoc/attachments_extractors/concrete_attachments_extractors/image_attachments_extractor.py b/dedoc/attachments_extractors/concrete_attachments_extractors/image_attachments_extractor.py new file mode 100644 index 00000000..2d423445 --- /dev/null +++ b/dedoc/attachments_extractors/concrete_attachments_extractors/image_attachments_extractor.py @@ -0,0 +1,100 @@ +import os +import uuid +from typing import Dict, Iterable, List, Optional + +from dedocutils.data_structures.bbox import BBox +from numpy import ndarray +from torch import Tensor + +from dedoc.attachments_extractors.abstract_attachment_extractor import AbstractAttachmentsExtractor +from dedoc.data_structures.attached_file import AttachedFile + + +class ImageAttachmentsExtractor(AbstractAttachmentsExtractor): + """ + Extract attachments from image files. + """ + def __init__(self, *, config: Optional[dict] = None) -> None: + from dedoc.extensions import recognized_extensions, recognized_mimes + from dedoc.config import get_config + super().__init__(config=config, recognized_extensions=recognized_extensions.image_like_format, recognized_mimes=recognized_mimes.image_like_format) + self._classes = { + 2, # Formula + 6 # Picture + } + self._image_processor = None + self._model = None + + model_path = os.path.join(get_config()["resources_path"], "layout_model") + if os.path.exists(model_path): + self._model_name = model_path + self.logger.info("Using locally saved layout analysis model") + else: + self._model_name = "docling-project/docling-layout-heron" + self.logger.info("Layout analysis model will be loaded from huggingface") + self._threshold = self.config.get("image_detection_threshold", 0.7) + + def _predict(self, image: ndarray) -> Iterable[Dict[str, Tensor]]: + import torch + from transformers import RTDetrImageProcessor, RTDetrV2ForObjectDetection + + if self._image_processor is None: + self._image_processor = RTDetrImageProcessor.from_pretrained(self._model_name) + + if self._model is None: + self._model = RTDetrV2ForObjectDetection.from_pretrained(self._model_name) + + inputs = self._image_processor(images=[image], return_tensors="pt") + with torch.no_grad(): + outputs = self._model(**inputs) + + results = self._image_processor.post_process_object_detection(outputs, target_sizes=torch.tensor([image.shape[:-1]]), threshold=self._threshold) + return results + + def extract(self, file_path: str, parameters: Optional[dict] = None) -> List[AttachedFile]: + """ + Get attachments from the given image using a document layout analysis method https://huggingface.co/docling-project/docling-layout-heron. + + Look to the :class:`~dedoc.attachments_extractors.AbstractAttachmentsExtractor` documentation to get the information about the methods' parameters. + """ + import cv2 + import os + from dedoc.utils.parameter_utils import get_param_need_content_analysis, get_param_attachments_dir + from dedoc.utils.utils import get_unique_name + from dedoc.readers.pdf_reader.data_classes.tables.location import Location + from dedoc.readers.pdf_reader.data_classes.pdf_image_attachment import PdfImageAttachment + + parameters = {} if parameters is None else parameters + tmpdir, filename = os.path.split(file_path) + attachments_dir = get_param_attachments_dir(parameters, tmpdir) + attachments = [] + + image = cv2.imread(file_path) + image = cv2.cvtColor(image, cv2.COLOR_BGR2RGB) + predictions = self._predict(image) + + for prediction in predictions: + for label_id, box in zip(prediction["labels"], prediction["boxes"]): + if label_id.item() not in self._classes: + continue + + box = [round(i) for i in box.tolist()] + x_top_left, x_bottom_right = max(0, box[0]), min(box[2], image.shape[1]) + y_top_left, y_bottom_right = max(0, box[1]), min(box[3], image.shape[0]) + part = image[y_top_left:y_bottom_right, x_top_left:x_bottom_right] + image_location = Location(page_number=0, bbox=BBox.from_two_points((x_top_left, y_top_left), (x_bottom_right, y_bottom_right))) + + tmp_file_name = get_unique_name(filename) + tmp_file_path = os.path.join(attachments_dir, tmp_file_name) + cv2.imwrite(tmp_file_path, cv2.cvtColor(part, cv2.COLOR_RGB2BGR)) + + image_attachment = PdfImageAttachment( + original_name=tmp_file_name, + tmp_file_path=tmp_file_path, + need_content_analysis=get_param_need_content_analysis(parameters), + uid=f"attach_{uuid.uuid4()}", + location=image_location + ) + attachments.append(image_attachment) + + return attachments diff --git a/dedoc/download_models.py b/dedoc/download_models.py index cdc02d15..7dd4b2ad 100644 --- a/dedoc/download_models.py +++ b/dedoc/download_models.py @@ -1,18 +1,19 @@ """Downloading models in advance inside the docker container.""" """ -These are versions of the models that are used at the current moment - hashes of commits from https://huggingface.co/dedoc. +These are versions of the models that are used at the current moment - hashes of commits from https://huggingface.co/dedoc and other users. Keys are the names of repositories with models. """ -model_hash_dict = dict( - txtlayer_classifier="9ca1de749d8d37147b00a3a228e03ee1776c695f", - scan_orientation_efficient_net_b0="c60812552a1be624476c1e5b58599867b36f8d4e", - font_classifier="db4481ad60ab050cbb42079b64f97f9e431feb07", - paragraph_classifier="97c4b78bc20d87ec7d53389e09f1ca35c6ade067", - line_type_classifiers="6ad0eacbfdea065b658cb6f039d13f75245d51ae", - fintoc_classifiers="6a907b7d2437c3f61ac9c506f67175207982fae8", - torch_cnn="5333909f858f5f632df478ef5a53af6dfd26f2e1" -) +model_hash_dict = { + "txtlayer_classifier": "9ca1de749d8d37147b00a3a228e03ee1776c695f", + "scan_orientation_efficient_net_b0": "c60812552a1be624476c1e5b58599867b36f8d4e", + "font_classifier": "db4481ad60ab050cbb42079b64f97f9e431feb07", + "paragraph_classifier": "97c4b78bc20d87ec7d53389e09f1ca35c6ade067", + "line_type_classifiers": "6ad0eacbfdea065b658cb6f039d13f75245d51ae", + "fintoc_classifiers": "6a907b7d2437c3f61ac9c506f67175207982fae8", + "torch_cnn": "5333909f858f5f632df478ef5a53af6dfd26f2e1", + "docling-layout-heron": "8f39ad3c0b4c58e9c2d2c84a38465abf757272d8" +} def download_from_hub(out_dir: str, out_name: str, repo_name: str, hub_name: str, user_name: str = "dedoc") -> None: @@ -54,6 +55,16 @@ def download(resources_path: str) -> None: download_from_hub(out_dir=resources_path, out_name="glyph_recognizer.pt", repo_name="torch_cnn", hub_name="rus_eng.pt", user_name="sinkudo") + layout_dir_path = os.path.join(resources_path, "layout_model") + download_from_hub(out_dir=layout_dir_path, out_name="config.json", repo_name="docling-layout-heron", hub_name="config.json", user_name="docling-project") + download_from_hub( + out_dir=layout_dir_path, out_name="model.safetensors", repo_name="docling-layout-heron", hub_name="model.safetensors", user_name="docling-project" + ) + download_from_hub( + out_dir=layout_dir_path, out_name="preprocessor_config.json", repo_name="docling-layout-heron", hub_name="preprocessor_config.json", + user_name="docling-project" + ) + if __name__ == "__main__": from dedoc.config import get_config diff --git a/dedoc/readers/pdf_reader/data_classes/tables/location.py b/dedoc/readers/pdf_reader/data_classes/tables/location.py index 3ee49944..250f5a2c 100644 --- a/dedoc/readers/pdf_reader/data_classes/tables/location.py +++ b/dedoc/readers/pdf_reader/data_classes/tables/location.py @@ -34,7 +34,7 @@ def to_dict(self) -> Dict[str, Any]: return res def __eq__(self, other: "Location") -> bool: - return (self.page_number, self.bbox.y_bottom_right) == (other.page_number, other.bbox.y_bottom_right) + return (self.page_number, self.bbox.y_top_left, self.bbox.x_top_left) == (other.page_number, other.bbox.y_top_left, other.bbox.x_top_left) def __lt__(self, other: "Location") -> bool: - return (self.page_number, self.bbox.y_bottom_right) < (other.page_number, other.bbox.y_bottom_right) + return (self.page_number, self.bbox.y_top_left, self.bbox.x_top_left) < (other.page_number, other.bbox.y_top_left, other.bbox.x_top_left) diff --git a/dedoc/readers/pdf_reader/pdf_image_reader/pdf_image_reader.py b/dedoc/readers/pdf_reader/pdf_image_reader/pdf_image_reader.py index 1c759033..fc91c968 100644 --- a/dedoc/readers/pdf_reader/pdf_image_reader/pdf_image_reader.py +++ b/dedoc/readers/pdf_reader/pdf_image_reader/pdf_image_reader.py @@ -21,6 +21,8 @@ class PdfImageReader(PdfBaseReader): * table detection and recognition; + * image detection; + * document binarization (configure via `need_binarization` parameter); * document orientation correction (automatically rotate on 90, 180, 270 degrees if it's needed); @@ -34,6 +36,7 @@ class PdfImageReader(PdfBaseReader): def __init__(self, *, config: Optional[dict] = None) -> None: from dedocutils.preprocessing import AdaptiveBinarizer, SkewCorrector + from dedoc.attachments_extractors.concrete_attachments_extractors.image_attachments_extractor import ImageAttachmentsExtractor from dedoc.readers.pdf_reader.pdf_image_reader.columns_orientation_classifier.columns_orientation_classifier import ColumnsOrientationClassifier from dedoc.readers.pdf_reader.pdf_image_reader.ocr.ocr_line_extractor import OCRLineExtractor from dedoc.config import get_config @@ -53,6 +56,7 @@ def __init__(self, *, config: Optional[dict] = None) -> None: config=self.config) self.binarizer = AdaptiveBinarizer() self.ocr = OCRLineExtractor(config=self.config) + self.attachments_extractor = ImageAttachmentsExtractor(config=self.config) self.page_number = None def read(self, file_path: str, parameters: Optional[dict] = None) -> UnstructuredDocument: @@ -66,21 +70,25 @@ def _process_one_page(self, import os from datetime import datetime import cv2 + from dedocutils.utils import rotate_image + from dedoc.utils.image_utils import fill_bbox_on_image from dedoc.utils.parameter_utils import get_path_param + from dedoc.utils.utils import get_unique_name + + initial_image = image + # --- Step 1: do binarization --- + if parameters.need_binarization: + image, _ = self.binarizer.preprocess(image) + if self.config.get("debug_mode", False): + debug_dir = get_path_param(self.config, "path_debug") + cv2.imwrite(os.path.join(debug_dir, f"{datetime.now().strftime('%H-%M-%S')}_result_binarization.jpg"), image) - # --- Step 1: correct orientation and detect column count --- + # --- Step 2: correct orientation and detect column count --- self.page_number = page_number rotated_image, is_one_column_document, angle = self._detect_column_count_and_orientation(image, parameters) if self.config.get("debug_mode", False): self.logger.info(f"Angle page rotation = {angle}") - # --- Step 2: do binarization --- - if parameters.need_binarization: - rotated_image, _ = self.binarizer.preprocess(rotated_image) - if self.config.get("debug_mode", False): - debug_dir = get_path_param(self.config, "path_debug") - cv2.imwrite(os.path.join(debug_dir, f"{datetime.now().strftime('%H-%M-%S')}_result_binarization.jpg"), rotated_image) - # --- Step 3: table detection and recognition --- if parameters.need_pdf_table_analysis: clean_image, tables = self.table_recognizer.recognize_tables_from_image( @@ -92,10 +100,26 @@ def _process_one_page(self, else: clean_image, tables = rotated_image, [] - # --- Step 4: plain text recognition and text style detection --- + # --- Step 4: image detection --- + attached_images = [] + if parameters.with_attachments: + tmpdir = os.path.split(path)[0] + tmp_file_path = os.path.join(tmpdir, get_unique_name("rotated.png")) + non_binarized_rotated_image = rotate_image(initial_image, angle) + cv2.imwrite(tmp_file_path, non_binarized_rotated_image) + attached_images = [] + + for attach in self.attachments_extractor.extract(file_path=tmp_file_path, parameters=dict(zip(parameters._fields, parameters))): + attach.location.page_number = page_number + attached_images.append(attach) + clean_image = fill_bbox_on_image(clean_image, attach.location.bbox) + + # --- Step 5: plain text recognition and text style detection --- page = self.ocr.split_image2lines(image=clean_image, language=parameters.language, is_one_column_document=is_one_column_document, page_num=page_number) - lines = self.metadata_extractor.extract_metadata_and_set_annotations(page_with_lines=page) + if parameters.with_attachments: + page.attachments.extend(attached_images) + return lines, tables, page.attachments, [angle] def _detect_column_count_and_orientation(self, image: ndarray, parameters: ParametersForParseDoc) -> Tuple[ndarray, bool, float]: diff --git a/dedoc/readers/pdf_reader/pdf_image_reader/table_recognizer/table_recognizer.py b/dedoc/readers/pdf_reader/pdf_image_reader/table_recognizer/table_recognizer.py index 26f153df..d6038f13 100644 --- a/dedoc/readers/pdf_reader/pdf_image_reader/table_recognizer/table_recognizer.py +++ b/dedoc/readers/pdf_reader/pdf_image_reader/table_recognizer/table_recognizer.py @@ -8,13 +8,13 @@ import cv2 import numpy as np from PIL import Image -from dedocutils.data_structures import BBox from dedoc.data_structures.line_with_meta import LineWithMeta from dedoc.readers.pdf_reader.data_classes.tables.scantable import ScanTable from dedoc.readers.pdf_reader.data_classes.tables.table_type import TableTypeAdditionalOptions from dedoc.readers.pdf_reader.pdf_image_reader.table_recognizer.table_extractors.concrete_extractors.multipage_table_extractor import MultiPageTableExtractor from dedoc.readers.pdf_reader.pdf_image_reader.table_recognizer.table_extractors.concrete_extractors.onepage_table_extractor import OnePageTableExtractor +from dedoc.utils.image_utils import fill_bbox_on_image """-------------------------------------entry class of Table Recognizer Module---------------------------------------""" @@ -87,31 +87,9 @@ def __clean_image_from_table(image: np.ndarray, tables: List[ScanTable]) -> np.n image_copy = np.copy(image) for table in tables: for location in table.locations: - image_copy = TableRecognizer.__clean_image(image_copy, location.bbox) + image_copy = fill_bbox_on_image(image_copy, location.bbox) return image_copy - @staticmethod - def __clean_image(image: np.ndarray, bbox: BBox, color: int = 255) -> np.ndarray: - """ - replace bboxes with given color (for example to remove tables from images) - @param image: original image - @param bbox: bbox to clear from image - @param color: color to replace bboxes - @return: image without given bboxes - """ - x_min = bbox.x_top_left - x_max = x_min + bbox.width - - y_min = bbox.y_top_left - y_max = y_min + bbox.height - - if len(image.shape) == 3: - image[y_min: y_max, x_min: x_max, :] = color - else: - image[y_min: y_max, x_min: x_max] = color - - return image - def __filter_bad_tables(self, tables: List[ScanTable], image: np.ndarray) -> List[ScanTable]: filtered = [] for table in tables: diff --git a/dedoc/readers/pdf_reader/utils/line_object_linker.py b/dedoc/readers/pdf_reader/utils/line_object_linker.py index 45ddaa60..a458cf5f 100644 --- a/dedoc/readers/pdf_reader/utils/line_object_linker.py +++ b/dedoc/readers/pdf_reader/utils/line_object_linker.py @@ -1,6 +1,6 @@ import logging from collections import defaultdict, deque -from typing import Dict, List, Union +from typing import Dict, List, Optional, Union from dedocutils.data_structures import BBox @@ -82,20 +82,30 @@ def _find_closest_line(self, @param lines_after: self.n_lines after object @return: best line to link with object """ + best_line = self._get_closest_line_on_same_page(page_object, lines_before) + if not best_line: + best_line = self._get_closest_line_on_same_page(page_object, lines_after) + if best_line: + return best_line + all_lines = lines_before + lines_after - line_on_same_page = [line for line in all_lines if line.location.page_number == page_object.location.page_number] # no one line on the same page - if len(line_on_same_page) == 0: - previous_page_id = page_object.location.page_number - 1 - if previous_page_id in last_page_line: - return last_page_line[previous_page_id] - lines_prev_page = [line for line in all_lines if line.location < page_object.location] - if len(lines_prev_page) > 0: - return max(lines_prev_page, key=lambda line: line.location) - else: - return min(all_lines, key=lambda line: line.location) - line_with_distance = [(self._distance_bboxes(line, page_object.location.bbox), line) for line in line_on_same_page] - return min(line_with_distance, key=lambda t: t[0])[1] + previous_page_id = page_object.location.page_number - 1 + if previous_page_id in last_page_line: + return last_page_line[previous_page_id] + lines_prev_page = [line for line in all_lines if line.location < page_object.location] + if len(lines_prev_page) > 0: + return max(lines_prev_page, key=lambda line: line.location) + else: + return min(all_lines, key=lambda line: line.location) + + @staticmethod + def _get_closest_line_on_same_page(page_obj: Union[ScanTable, PdfImageAttachment], lines: List[LineWithLocation]) -> Optional[LineWithLocation]: + lines_on_same_page = [line for line in lines if line.location.page_number == page_obj.location.page_number] + if not lines_on_same_page: + return None + lines_with_distance = [(LineObjectLinker._distance_bboxes(line, page_obj.location.bbox), line) for line in lines_on_same_page] + return min(lines_with_distance, key=lambda t: t[0])[1] @staticmethod def _distance_bboxes(line: LineWithLocation, object_bbox: BBox) -> float: diff --git a/dedoc/utils/image_utils.py b/dedoc/utils/image_utils.py index ea7d963a..f762581f 100644 --- a/dedoc/utils/image_utils.py +++ b/dedoc/utils/image_utils.py @@ -107,3 +107,25 @@ def get_concat_v(images: List[Image.Image]) -> Image.Image: dst.paste(image, (0, height)) height += image.height return dst + + +def fill_bbox_on_image(image: np.ndarray, bbox: BBox, color: int = 255) -> np.ndarray: + """ + replace bboxes with given color (for example to remove tables from images) + @param image: original image + @param bbox: bbox to clear from image + @param color: color to replace bboxes + @return: image without given bboxes + """ + x_min = bbox.x_top_left + x_max = x_min + bbox.width + + y_min = bbox.y_top_left + y_max = y_min + bbox.height + + if len(image.shape) == 3: + image[y_min: y_max, x_min: x_max, :] = color + else: + image[y_min: y_max, x_min: x_max] = color + + return image diff --git a/docker_gpu/Dockerfile b/docker_gpu/Dockerfile index e3fa713b..44e374cb 100644 --- a/docker_gpu/Dockerfile +++ b/docker_gpu/Dockerfile @@ -1,5 +1,5 @@ ARG REPOSITORY="docker.io" -FROM dedocproject/dedoc_p3.9_base:version_2023_08_28 +FROM dedocproject/dedoc_jammy_p3.10_base:version_2025_09_11 ENV PYTHONPATH "${PYTHONPATH}:/dedoc_root" ENV RESOURCES_PATH "/dedoc_root/resources" @@ -7,6 +7,7 @@ ENV RESOURCES_PATH "/dedoc_root/resources" ADD requirements.txt . RUN pip3 install --no-cache-dir -r requirements.txt RUN pip install torch==1.11.0+cu113 torchvision==0.12.0+cu113 -f https://download.pytorch.org/whl/torch_stable.html +RUN pip3 install --no-cache-dir transformers~=4.49.0 RUN mkdir /dedoc_root RUN mkdir /dedoc_root/dedoc diff --git a/docs/source/modules/attachments_extractors.rst b/docs/source/modules/attachments_extractors.rst index b5d55e17..fef8dba0 100644 --- a/docs/source/modules/attachments_extractors.rst +++ b/docs/source/modules/attachments_extractors.rst @@ -30,3 +30,7 @@ dedoc.attachments_extractors .. autoclass:: dedoc.attachments_extractors.PDFAttachmentsExtractor :show-inheritance: :members: + +.. autoclass:: dedoc.attachments_extractors.ImageAttachmentsExtractor + :show-inheritance: + :members: diff --git a/docs/source/readers_output/annotations.rst b/docs/source/readers_output/annotations.rst index 2a13989d..eb9b70b8 100644 --- a/docs/source/readers_output/annotations.rst +++ b/docs/source/readers_output/annotations.rst @@ -29,8 +29,8 @@ Below the readers are enlisted that can return non-empty list of annotations for - `+` - `-` - `-` - - `-` - - `-` + - `+` + - `+` - `+` - `+` diff --git a/pyproject.toml b/pyproject.toml index 46869cf1..9badac73 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -37,7 +37,7 @@ where = ["."] include = ["dedoc*"] [project.optional-dependencies] -torch = ["torch~=1.11.0", "torchvision~=0.12.0"] +torch = ["torch~=1.11.0", "torchvision~=0.12.0", "transformers~=4.49.0"] docs = [ "docutils==0.18.1", "Sphinx==6.2.1", diff --git a/tests/Dockerfile b/tests/Dockerfile index 4d47c8db..6072c249 100644 --- a/tests/Dockerfile +++ b/tests/Dockerfile @@ -4,8 +4,9 @@ FROM dedocproject/dedoc_jammy_p3.10_base:version_2025_09_11 ENV PYTHONPATH "${PYTHONPATH}:/dedoc_root:/dedoc_root/tests:/dedoc_root/langchain" COPY requirements.txt . -RUN pip3 install "langchain-community<1.0" +RUN pip3 install --no-cache-dir "langchain-community<1.0" RUN pip3 install --no-cache-dir -r requirements.txt +RUN pip3 install --no-cache-dir transformers~=4.49.0 RUN apt-get update && apt-get install -y --fix-missing --no-install-recommends fontforge RUN mkdir /dedoc_root diff --git a/tests/api_tests/test_api_misc_with_images_refs.py b/tests/api_tests/test_api_misc_with_images_refs.py index abad8115..bd15339a 100644 --- a/tests/api_tests/test_api_misc_with_images_refs.py +++ b/tests/api_tests/test_api_misc_with_images_refs.py @@ -71,7 +71,7 @@ def test_pdf_pdfminer_images_refs(self) -> None: self.assertEqual(attach_annotation["name"], "attachment") self.assertIn(attach_annotation["value"], attachment_uids) - attach_annotation = structure["subparagraphs"][2]["annotations"][-2] + attach_annotation = structure["subparagraphs"][1]["annotations"][-1] self.assertEqual(attach_annotation["name"], "attachment") self.assertIn(attach_annotation["value"], attachment_uids) @@ -99,6 +99,19 @@ def test_pdf_tabby_images_refs(self) -> None: self.assertEqual(attach_annotation["name"], "attachment") self.assertIn(attach_annotation["value"], attachment_uids) + def test_images_refs_from_image(self) -> None: + file_name = "with_images.png" + result = self._send_request(file_name, dict(with_attachments=True, structure_type="linear")) + + attachment_uids = {attachment["metadata"]["uid"] for attachment in result["attachments"]} + self.assertEqual(len(attachment_uids), 2) + subparagraphs = result["content"]["structure"]["subparagraphs"] + + for i in (1, 43): + attach_annotations = [ann for ann in subparagraphs[i]["annotations"] if ann["name"] == AttachAnnotation.name] + self.assertEqual(len(attach_annotations), 1, f'Wrong node for attachment link: {subparagraphs[i]["text"]}') + self.assertIn(attach_annotations[0]["value"], attachment_uids) + def test_pptx_images_refs(self) -> None: file_name = "with_attachments_1.pptx" result = self._send_request(file_name, dict(with_attachments=True, structure_type="linear")) diff --git a/tests/data/with_attachments/with_images.png b/tests/data/with_attachments/with_images.png new file mode 100644 index 00000000..df48d1f5 Binary files /dev/null and b/tests/data/with_attachments/with_images.png differ diff --git a/tests/unit_tests/test_misc_langchain_document_loader.py b/tests/unit_tests/test_misc_langchain_document_loader.py index 50035c9f..55b7a895 100644 --- a/tests/unit_tests/test_misc_langchain_document_loader.py +++ b/tests/unit_tests/test_misc_langchain_document_loader.py @@ -72,7 +72,8 @@ def test_dedoc_base_loader(self) -> None: self.assertGreater(len(docs), 1) loader = DedocFileLoader( - file_path, split="document", with_tables=True, with_attachments=True, need_content_analysis=True, need_pdf_table_analysis=False + file_path, split="document", with_tables=True, with_attachments=True, need_content_analysis=True, need_pdf_table_analysis=False, + recursion_deep_attachments=1 ) text_docs, table_docs, attachment_docs = [], [], [] for doc in loader.load():