ispras · NastyBoget · Mar 18, 2026 · Mar 12, 2026 · Mar 12, 2026 · Mar 13, 2026
diff --git a/.flake8 b/.flake8
@@ -30,6 +30,7 @@ exclude =
     venv,
     .venv,
     build,
+    etc,
     dedoc.egg-info,
     docs/_build,
     scripts/fintoc2022/metric.py

diff --git a/Dockerfile b/Dockerfile
@@ -8,6 +8,7 @@ ENV RESOURCES_PATH "/dedoc_root/resources"
 
 COPY requirements.txt .
 RUN pip3 install --no-cache-dir -r requirements.txt
+RUN pip3 install --no-cache-dir transformers~=4.49.0
 RUN apt-get update && apt-get install -y --fix-missing --no-install-recommends fontforge
 RUN apt install -y libutf8proc-dev
 RUN ln -s /usr/lib/x86_64-linux-gnu/libutf8proc.so /usr/lib/libutf8proc.so.1

diff --git a/dedoc/attachments_extractors/__init__.py b/dedoc/attachments_extractors/__init__.py
@@ -2,9 +2,10 @@
 from .concrete_attachments_extractors.abstract_office_attachments_extractor import AbstractOfficeAttachmentsExtractor
 from .concrete_attachments_extractors.docx_attachments_extractor import DocxAttachmentsExtractor
 from .concrete_attachments_extractors.excel_attachments_extractor import ExcelAttachmentsExtractor
+from .concrete_attachments_extractors.image_attachments_extractor import ImageAttachmentsExtractor
 from .concrete_attachments_extractors.json_attachment_extractor import JsonAttachmentsExtractor
 from .concrete_attachments_extractors.pdf_attachments_extractor import PDFAttachmentsExtractor
 from .concrete_attachments_extractors.pptx_attachments_extractor import PptxAttachmentsExtractor
 
 __all__ = ['AbstractAttachmentsExtractor', 'AbstractOfficeAttachmentsExtractor', 'DocxAttachmentsExtractor', 'ExcelAttachmentsExtractor',
-           'JsonAttachmentsExtractor', 'PDFAttachmentsExtractor', 'PptxAttachmentsExtractor']
+           'ImageAttachmentsExtractor', 'JsonAttachmentsExtractor', 'PDFAttachmentsExtractor', 'PptxAttachmentsExtractor']
diff --git a/dedoc/attachments_extractors/concrete_attachments_extractors/image_attachments_extractor.py b/dedoc/attachments_extractors/concrete_attachments_extractors/image_attachments_extractor.py
@@ -0,0 +1,100 @@
+import os
+import uuid
+from typing import Dict, Iterable, List, Optional
+
+from dedocutils.data_structures.bbox import BBox
+from numpy import ndarray
+from torch import Tensor
+
+from dedoc.attachments_extractors.abstract_attachment_extractor import AbstractAttachmentsExtractor
+from dedoc.data_structures.attached_file import AttachedFile
+
+
+class ImageAttachmentsExtractor(AbstractAttachmentsExtractor):
+    """
+    Extract attachments (pictures and formulas) from image files.
+
+    The regions are found by the RT-DETR v2 layout detection model https://huggingface.co/docling-project/docling-layout-heron,
+    the model is loaded lazily during the first prediction.
+    """
+    def __init__(self, *, config: Optional[dict] = None) -> None:
+        """
+        :param config: configuration of the extractor, the optional `image_detection_threshold` key sets the minimal score of a detection (0.7 by default)
+        """
+        from dedoc.extensions import recognized_extensions, recognized_mimes
+        from dedoc.config import get_config
+        super().__init__(config=config, recognized_extensions=recognized_extensions.image_like_format, recognized_mimes=recognized_mimes.image_like_format)
+        # identifiers of the detected classes that are saved as attachments, other detections are ignored
+        self._classes = {
+            2,  # Formula
+            6  # Picture
+        }
+        # the image processor and the model are created in the _predict method when they are needed for the first time
+        self._image_processor = None
+        self._model = None
+
+        # the model saved in the resources directory is preferred, otherwise the huggingface repository name is used
+        model_path = os.path.join(get_config()["resources_path"], "layout_model")
+        if os.path.exists(model_path):
+            self._model_name = model_path
+            self.logger.info("Using locally saved layout analysis model")
+        else:
+            self._model_name = "docling-project/docling-layout-heron"
+            self.logger.info("Layout analysis model will be loaded from huggingface")
+        self._threshold = self.config.get("image_detection_threshold", 0.7)
+
+    def _predict(self, image: ndarray) -> Iterable[Dict[str, Tensor]]:
+        """
+        Run the layout detection model on one image.
+
+        :param image: image as a numpy array, all its axes except the last one are used as the target size (the caller passes an RGB image)
+        :return: post-processed detections with the score not less than the threshold, the `extract` method reads their "labels" and "boxes" items
+        """
+        import torch
+        from transformers import RTDetrImageProcessor, RTDetrV2ForObjectDetection
+
+        if self._image_processor is None:
+            self._image_processor = RTDetrImageProcessor.from_pretrained(self._model_name)
+
+        if self._model is None:
+            self._model = RTDetrV2ForObjectDetection.from_pretrained(self._model_name)
+
+        inputs = self._image_processor(images=[image], return_tensors="pt")
+        with torch.no_grad():  # inference only, gradients aren't needed
+            outputs = self._model(**inputs)
+
+        results = self._image_processor.post_process_object_detection(outputs, target_sizes=torch.tensor([image.shape[:-1]]), threshold=self._threshold)
+        return results
+
+    def extract(self, file_path: str, parameters: Optional[dict] = None) -> List[AttachedFile]:
+        """
+        Get attachments from the given image using a document layout analysis method https://huggingface.co/docling-project/docling-layout-heron.
+        If the image can't be read, a warning is logged and an empty list is returned.
+        Detected regions that become empty after clipping to the image borders are skipped.
+
+        Look to the :class:`~dedoc.attachments_extractors.AbstractAttachmentsExtractor` documentation to get the information about the methods' parameters.
+        """
+        import cv2
+        from dedoc.utils.parameter_utils import get_param_need_content_analysis, get_param_attachments_dir
+        from dedoc.utils.utils import get_unique_name
+        from dedoc.readers.pdf_reader.data_classes.tables.location import Location
+        from dedoc.readers.pdf_reader.data_classes.pdf_image_attachment import PdfImageAttachment
+
+        parameters = {} if parameters is None else parameters
+        tmpdir, filename = os.path.split(file_path)
+        attachments_dir = get_param_attachments_dir(parameters, tmpdir)
+        attachments = []
+
+        image = cv2.imread(file_path)
+        if image is None:
+            # cv2.imread doesn't raise an exception for a missing or broken file, it returns None
+            self.logger.warning(f"Can't read the image {file_path}, attachments aren't extracted")
+            return attachments
+
+        image = cv2.cvtColor(image, cv2.COLOR_BGR2RGB)
+        height, width = image.shape[:2]
+        predictions = self._predict(image)
+
+        for prediction in predictions:
+            for label_id, box in zip(prediction["labels"], prediction["boxes"]):
+                if label_id.item() not in self._classes:
+                    continue
+
+                # the box is (x_top_left, y_top_left, x_bottom_right, y_bottom_right), it is clipped to the image borders
+                box = [round(i) for i in box.tolist()]
+                x_top_left, x_bottom_right = max(0, box[0]), min(box[2], width)
+                y_top_left, y_bottom_right = max(0, box[1]), min(box[3], height)
+                if x_top_left >= x_bottom_right or y_top_left >= y_bottom_right:
+                    # an empty crop can't be converted and saved by OpenCV
+                    continue
+
+                part = image[y_top_left:y_bottom_right, x_top_left:x_bottom_right]
+                image_location = Location(page_number=0, bbox=BBox.from_two_points((x_top_left, y_top_left), (x_bottom_right, y_bottom_right)))
+
+                tmp_file_name = get_unique_name(filename)
+                tmp_file_path = os.path.join(attachments_dir, tmp_file_name)
+                # the image was converted to RGB for the model, OpenCV saves images in BGR
+                cv2.imwrite(tmp_file_path, cv2.cvtColor(part, cv2.COLOR_RGB2BGR))
+
+                image_attachment = PdfImageAttachment(
+                    original_name=tmp_file_name,
+                    tmp_file_path=tmp_file_path,
+                    need_content_analysis=get_param_need_content_analysis(parameters),
+                    uid=f"attach_{uuid.uuid4()}",
+                    location=image_location
+                )
+                attachments.append(image_attachment)
+
+        return attachments
diff --git a/dedoc/download_models.py b/dedoc/download_models.py
@@ -1,18 +1,19 @@
 """Downloading models in advance inside the docker container."""
 
 """
-These are versions of the models that are used at the current moment - hashes of commits from https://huggingface.co/dedoc.
+These are versions of the models that are used at the current moment - hashes of commits from https://huggingface.co/dedoc and other users.
 Keys are the names of repositories with models.
 """
-model_hash_dict = dict(
-    txtlayer_classifier="9ca1de749d8d37147b00a3a228e03ee1776c695f",
-    scan_orientation_efficient_net_b0="c60812552a1be624476c1e5b58599867b36f8d4e",
-    font_classifier="db4481ad60ab050cbb42079b64f97f9e431feb07",
-    paragraph_classifier="97c4b78bc20d87ec7d53389e09f1ca35c6ade067",
-    line_type_classifiers="6ad0eacbfdea065b658cb6f039d13f75245d51ae",
-    fintoc_classifiers="6a907b7d2437c3f61ac9c506f67175207982fae8",
-    torch_cnn="5333909f858f5f632df478ef5a53af6dfd26f2e1"
-)
+model_hash_dict = {
+    "txtlayer_classifier": "9ca1de749d8d37147b00a3a228e03ee1776c695f",
+    "scan_orientation_efficient_net_b0": "c60812552a1be624476c1e5b58599867b36f8d4e",
+    "font_classifier": "db4481ad60ab050cbb42079b64f97f9e431feb07",
+    "paragraph_classifier": "97c4b78bc20d87ec7d53389e09f1ca35c6ade067",
+    "line_type_classifiers": "6ad0eacbfdea065b658cb6f039d13f75245d51ae",
+    "fintoc_classifiers": "6a907b7d2437c3f61ac9c506f67175207982fae8",
+    "torch_cnn": "5333909f858f5f632df478ef5a53af6dfd26f2e1",
+    # layout analysis model, its repository belongs to the docling-project user
+    "docling-layout-heron": "8f39ad3c0b4c58e9c2d2c84a38465abf757272d8"
+}
 
 
 def download_from_hub(out_dir: str, out_name: str, repo_name: str, hub_name: str, user_name: str = "dedoc") -> None:
@@ -54,6 +55,16 @@ def download(resources_path: str) -> None:
 
     download_from_hub(out_dir=resources_path, out_name="glyph_recognizer.pt", repo_name="torch_cnn", hub_name="rus_eng.pt", user_name="sinkudo")
 
+    layout_dir_path = os.path.join(resources_path, "layout_model")
+    download_from_hub(out_dir=layout_dir_path, out_name="config.json", repo_name="docling-layout-heron", hub_name="config.json", user_name="docling-project")
+    download_from_hub(
+        out_dir=layout_dir_path, out_name="model.safetensors", repo_name="docling-layout-heron", hub_name="model.safetensors", user_name="docling-project"
+    )
+    download_from_hub(
+        out_dir=layout_dir_path, out_name="preprocessor_config.json", repo_name="docling-layout-heron", hub_name="preprocessor_config.json",
+        user_name="docling-project"
+    )
+
 
 if __name__ == "__main__":
     from dedoc.config import get_config

diff --git a/dedoc/readers/pdf_reader/data_classes/tables/location.py b/dedoc/readers/pdf_reader/data_classes/tables/location.py
@@ -34,7 +34,7 @@ def to_dict(self) -> Dict[str, Any]:
         return res
 
     def __eq__(self, other: "Location") -> bool:
-        return (self.page_number, self.bbox.y_bottom_right) == (other.page_number, other.bbox.y_bottom_right)
+        # locations are equal when the page number and the top-left corner of the bbox coincide, the bbox size isn't compared
+        return (self.page_number, self.bbox.y_top_left, self.bbox.x_top_left) == (other.page_number, other.bbox.y_top_left, other.bbox.x_top_left)
 
     def __lt__(self, other: "Location") -> bool:
-        return (self.page_number, self.bbox.y_bottom_right) < (other.page_number, other.bbox.y_bottom_right)
+        # lexicographic order: the page number, then the top coordinate of the bbox, then its left coordinate
+        return (self.page_number, self.bbox.y_top_left, self.bbox.x_top_left) < (other.page_number, other.bbox.y_top_left, other.bbox.x_top_left)
diff --git a/dedoc/readers/pdf_reader/pdf_image_reader/pdf_image_reader.py b/dedoc/readers/pdf_reader/pdf_image_reader/pdf_image_reader.py
@@ -21,6 +21,8 @@ class PdfImageReader(PdfBaseReader):
 
     * table detection and recognition;
 
+    * image detection;
+
     * document binarization (configure via `need_binarization` parameter);
 
     * document orientation correction (automatically rotate on 90, 180, 270 degrees if it's needed);
@@ -34,6 +36,7 @@ class PdfImageReader(PdfBaseReader):
 
     def __init__(self, *, config: Optional[dict] = None) -> None:
         from dedocutils.preprocessing import AdaptiveBinarizer, SkewCorrector
+        from dedoc.attachments_extractors.concrete_attachments_extractors.image_attachments_extractor import ImageAttachmentsExtractor
         from dedoc.readers.pdf_reader.pdf_image_reader.columns_orientation_classifier.columns_orientation_classifier import ColumnsOrientationClassifier
         from dedoc.readers.pdf_reader.pdf_image_reader.ocr.ocr_line_extractor import OCRLineExtractor
         from dedoc.config import get_config
@@ -53,6 +56,7 @@ def __init__(self, *, config: Optional[dict] = None) -> None:
                                                                           config=self.config)
         self.binarizer = AdaptiveBinarizer()
         self.ocr = OCRLineExtractor(config=self.config)
+        self.attachments_extractor = ImageAttachmentsExtractor(config=self.config)
         self.page_number = None
 
     def read(self, file_path: str, parameters: Optional[dict] = None) -> UnstructuredDocument:
@@ -66,21 +70,25 @@ def _process_one_page(self,
         import os
         from datetime import datetime
         import cv2
+        from dedocutils.utils import rotate_image
+        from dedoc.utils.image_utils import fill_bbox_on_image
         from dedoc.utils.parameter_utils import get_path_param
+        from dedoc.utils.utils import get_unique_name
+
+        initial_image = image
+        #  --- Step 1: do binarization ---
+        if parameters.need_binarization:
+            image, _ = self.binarizer.preprocess(image)
+            if self.config.get("debug_mode", False):
+                debug_dir = get_path_param(self.config, "path_debug")
+                cv2.imwrite(os.path.join(debug_dir, f"{datetime.now().strftime('%H-%M-%S')}_result_binarization.jpg"), image)
 
-        #  --- Step 1: correct orientation and detect column count ---
+        #  --- Step 2: correct orientation and detect column count ---
         self.page_number = page_number
         rotated_image, is_one_column_document, angle = self._detect_column_count_and_orientation(image, parameters)
         if self.config.get("debug_mode", False):
             self.logger.info(f"Angle page rotation = {angle}")
 
-        #  --- Step 2: do binarization ---
-        if parameters.need_binarization:
-            rotated_image, _ = self.binarizer.preprocess(rotated_image)
-            if self.config.get("debug_mode", False):
-                debug_dir = get_path_param(self.config, "path_debug")
-                cv2.imwrite(os.path.join(debug_dir, f"{datetime.now().strftime('%H-%M-%S')}_result_binarization.jpg"), rotated_image)
-
         #  --- Step 3: table detection and recognition ---
         if parameters.need_pdf_table_analysis:
             clean_image, tables = self.table_recognizer.recognize_tables_from_image(
@@ -92,10 +100,26 @@ def _process_one_page(self,
         else:
             clean_image, tables = rotated_image, []
 
-        # --- Step 4: plain text recognition and text style detection ---
+        # --- Step 4: image detection ---
+        attached_images = []
+        if parameters.with_attachments:
+            tmpdir = os.path.split(path)[0]
+            tmp_file_path = os.path.join(tmpdir, get_unique_name("rotated.png"))
+            non_binarized_rotated_image = rotate_image(initial_image, angle)
+            cv2.imwrite(tmp_file_path, non_binarized_rotated_image)
+            attached_images = []
+
+            for attach in self.attachments_extractor.extract(file_path=tmp_file_path, parameters=dict(zip(parameters._fields, parameters))):
+                attach.location.page_number = page_number
+                attached_images.append(attach)
+                clean_image = fill_bbox_on_image(clean_image, attach.location.bbox)
+
+        # --- Step 5: plain text recognition and text style detection ---
         page = self.ocr.split_image2lines(image=clean_image, language=parameters.language, is_one_column_document=is_one_column_document, page_num=page_number)
-
         lines = self.metadata_extractor.extract_metadata_and_set_annotations(page_with_lines=page)
+        if parameters.with_attachments:
+            page.attachments.extend(attached_images)
+
         return lines, tables, page.attachments, [angle]
 
     def _detect_column_count_and_orientation(self, image: ndarray, parameters: ParametersForParseDoc) -> Tuple[ndarray, bool, float]:

diff --git a/dedoc/readers/pdf_reader/pdf_image_reader/table_recognizer/table_recognizer.py b/dedoc/readers/pdf_reader/pdf_image_reader/table_recognizer/table_recognizer.py
@@ -8,13 +8,13 @@
 import cv2
 import numpy as np
 from PIL import Image
-from dedocutils.data_structures import BBox
 
 from dedoc.data_structures.line_with_meta import LineWithMeta
 from dedoc.readers.pdf_reader.data_classes.tables.scantable import ScanTable
 from dedoc.readers.pdf_reader.data_classes.tables.table_type import TableTypeAdditionalOptions
 from dedoc.readers.pdf_reader.pdf_image_reader.table_recognizer.table_extractors.concrete_extractors.multipage_table_extractor import MultiPageTableExtractor
 from dedoc.readers.pdf_reader.pdf_image_reader.table_recognizer.table_extractors.concrete_extractors.onepage_table_extractor import OnePageTableExtractor
+from dedoc.utils.image_utils import fill_bbox_on_image
 
 """-------------------------------------entry class of Table Recognizer Module---------------------------------------"""
 
@@ -87,31 +87,9 @@ def __clean_image_from_table(image: np.ndarray, tables: List[ScanTable]) -> np.n
         image_copy = np.copy(image)
         for table in tables:
             for location in table.locations:
-                image_copy = TableRecognizer.__clean_image(image_copy, location.bbox)
+                image_copy = fill_bbox_on_image(image_copy, location.bbox)
         return image_copy
 
-    @staticmethod
-    def __clean_image(image: np.ndarray, bbox: BBox, color: int = 255) -> np.ndarray:
-        """
-        replace bboxes with given color (for example to remove tables from images)
-        @param image: original image
-        @param bbox: bbox to clear from image
-        @param color: color to replace bboxes
-        @return: image without given bboxes
-        """
-        x_min = bbox.x_top_left
-        x_max = x_min + bbox.width
-
-        y_min = bbox.y_top_left
-        y_max = y_min + bbox.height
-
-        if len(image.shape) == 3:
-            image[y_min: y_max, x_min: x_max, :] = color
-        else:
-            image[y_min: y_max, x_min: x_max] = color
-
-        return image
-
     def __filter_bad_tables(self, tables: List[ScanTable], image: np.ndarray) -> List[ScanTable]:
         filtered = []
         for table in tables:

diff --git a/dedoc/readers/pdf_reader/utils/line_object_linker.py b/dedoc/readers/pdf_reader/utils/line_object_linker.py
@@ -1,6 +1,6 @@
 import logging
 from collections import defaultdict, deque
-from typing import Dict, List, Union
+from typing import Dict, List, Optional, Union
 
 from dedocutils.data_structures import BBox
 
@@ -82,20 +82,30 @@ def _find_closest_line(self,
         @param lines_after: self.n_lines after object
         @return: best line to link with object
         """
+        best_line = self._get_closest_line_on_same_page(page_object, lines_before)
+        if not best_line:
+            best_line = self._get_closest_line_on_same_page(page_object, lines_after)
+        if best_line:
+            return best_line
+
         all_lines = lines_before + lines_after
-        line_on_same_page = [line for line in all_lines if line.location.page_number == page_object.location.page_number]
         # no one line on the same page
-        if len(line_on_same_page) == 0:
-            previous_page_id = page_object.location.page_number - 1
-            if previous_page_id in last_page_line:
-                return last_page_line[previous_page_id]
-            lines_prev_page = [line for line in all_lines if line.location < page_object.location]
-            if len(lines_prev_page) > 0:
-                return max(lines_prev_page, key=lambda line: line.location)
-            else:
-                return min(all_lines, key=lambda line: line.location)
-        line_with_distance = [(self._distance_bboxes(line, page_object.location.bbox), line) for line in line_on_same_page]
-        return min(line_with_distance, key=lambda t: t[0])[1]
+        previous_page_id = page_object.location.page_number - 1
+        if previous_page_id in last_page_line:
+            return last_page_line[previous_page_id]
+        lines_prev_page = [line for line in all_lines if line.location < page_object.location]
+        if len(lines_prev_page) > 0:
+            return max(lines_prev_page, key=lambda line: line.location)
+        else:
+            return min(all_lines, key=lambda line: line.location)
+
+    @staticmethod
+    def _get_closest_line_on_same_page(page_obj: Union[ScanTable, PdfImageAttachment], lines: List[LineWithLocation]) -> Optional[LineWithLocation]:
+        """
+        Find the line closest to the object among the lines located on the object's page.
+
+        @param page_obj: table or image attachment with a location
+        @param lines: candidate lines, they may belong to different pages
+        @return: line with the minimal bbox distance to the object (the first one if there are several), None if there are no lines on the object's page
+        """
+        candidates = (line for line in lines if line.location.page_number == page_obj.location.page_number)
+        return min(candidates, key=lambda line: LineObjectLinker._distance_bboxes(line, page_obj.location.bbox), default=None)
 
     @staticmethod
     def _distance_bboxes(line: LineWithLocation, object_bbox: BBox) -> float:

diff --git a/dedoc/utils/image_utils.py b/dedoc/utils/image_utils.py
@@ -107,3 +107,25 @@ def get_concat_v(images: List[Image.Image]) -> Image.Image:
         dst.paste(image, (0, height))
         height += image.height
     return dst
+
+
+def fill_bbox_on_image(image: np.ndarray, bbox: BBox, color: int = 255) -> np.ndarray:
+    """
+    replace the bbox area with given color (for example to remove tables from images)
+    the image is modified in place, the part of the bbox that lies outside the image is ignored
+    @param image: original image (with or without the channels axis), it is changed by this function
+    @param bbox: bbox to clear from image
+    @param color: color to replace bboxes
+    @return: the same image object without the content of the given bbox
+    """
+    # negative bounds are clamped to zero because numpy counts a negative slice bound from the end of the axis,
+    # bounds greater than the image size are truncated by the slicing itself
+    x_min = max(0, bbox.x_top_left)
+    x_max = max(0, bbox.x_top_left + bbox.width)
+
+    y_min = max(0, bbox.y_top_left)
+    y_max = max(0, bbox.y_top_left + bbox.height)
+
+    # slicing of the first two axes suits both 2D and 3D images, the color is broadcast over the channels
+    image[y_min:y_max, x_min:x_max] = color
+
+    return image
-Original file line number
+Diff line change
@@ Expand Up / @@ -30,6 +30,7 @@ exclude = @@
         venv,
         .venv,
         build,
+        etc,
         dedoc.egg-info,
         docs/_build,
         scripts/fintoc2022/metric.py
@@ Expand Down @@