Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
1 change: 1 addition & 0 deletions .flake8
Original file line number Diff line number Diff line change
Expand Up @@ -30,6 +30,7 @@ exclude =
venv,
.venv,
build,
etc,
dedoc.egg-info,
docs/_build,
scripts/fintoc2022/metric.py
Expand Down
1 change: 1 addition & 0 deletions Dockerfile
Original file line number Diff line number Diff line change
Expand Up @@ -8,6 +8,7 @@ ENV RESOURCES_PATH "/dedoc_root/resources"

COPY requirements.txt .
RUN pip3 install --no-cache-dir -r requirements.txt
RUN pip3 install --no-cache-dir transformers~=4.49.0
RUN apt-get update && apt-get install -y --fix-missing --no-install-recommends fontforge
RUN apt install -y libutf8proc-dev
RUN ln -s /usr/lib/x86_64-linux-gnu/libutf8proc.so /usr/lib/libutf8proc.so.1
Expand Down
3 changes: 2 additions & 1 deletion dedoc/attachments_extractors/__init__.py
Original file line number Diff line number Diff line change
Expand Up @@ -2,9 +2,10 @@
from .concrete_attachments_extractors.abstract_office_attachments_extractor import AbstractOfficeAttachmentsExtractor
from .concrete_attachments_extractors.docx_attachments_extractor import DocxAttachmentsExtractor
from .concrete_attachments_extractors.excel_attachments_extractor import ExcelAttachmentsExtractor
from .concrete_attachments_extractors.image_attachments_extractor import ImageAttachmentsExtractor
from .concrete_attachments_extractors.json_attachment_extractor import JsonAttachmentsExtractor
from .concrete_attachments_extractors.pdf_attachments_extractor import PDFAttachmentsExtractor
from .concrete_attachments_extractors.pptx_attachments_extractor import PptxAttachmentsExtractor

__all__ = ['AbstractAttachmentsExtractor', 'AbstractOfficeAttachmentsExtractor', 'DocxAttachmentsExtractor', 'ExcelAttachmentsExtractor',
'JsonAttachmentsExtractor', 'PDFAttachmentsExtractor', 'PptxAttachmentsExtractor']
'ImageAttachmentsExtractor', 'JsonAttachmentsExtractor', 'PDFAttachmentsExtractor', 'PptxAttachmentsExtractor']
Original file line number Diff line number Diff line change
@@ -0,0 +1,100 @@
import os
import uuid
from typing import Dict, Iterable, List, Optional

from dedocutils.data_structures.bbox import BBox
from numpy import ndarray
from torch import Tensor

from dedoc.attachments_extractors.abstract_attachment_extractor import AbstractAttachmentsExtractor
from dedoc.data_structures.attached_file import AttachedFile


class ImageAttachmentsExtractor(AbstractAttachmentsExtractor):
    """
    Extract attachments from image files.

    Regions are detected by an RT-DETR v2 layout analysis model, the regions of the supported classes
    (formulas and pictures) are cropped from the image and saved as separate attachments.
    """

    def __init__(self, *, config: Optional[dict] = None) -> None:
        """
        :param config: configuration of the extractor; the optional key "image_detection_threshold" (0.7 by default)
            is passed as the threshold to the post-processing of the detections
        """
        from dedoc.extensions import recognized_extensions, recognized_mimes
        from dedoc.config import get_config
        super().__init__(config=config, recognized_extensions=recognized_extensions.image_like_format, recognized_mimes=recognized_mimes.image_like_format)
        # identifiers of the model classes that are saved as attachments
        self._classes = {
            2,  # Formula
            6  # Picture
        }
        # the image processor and the model are created lazily on the first call of _predict
        self._image_processor = None
        self._model = None

        # a model saved in the resources directory is preferred to the one from huggingface
        model_path = os.path.join(get_config()["resources_path"], "layout_model")
        if os.path.exists(model_path):
            self._model_name = model_path
            self.logger.info("Using locally saved layout analysis model")
        else:
            self._model_name = "docling-project/docling-layout-heron"
            self.logger.info("Layout analysis model will be loaded from huggingface")
        self._threshold = self.config.get("image_detection_threshold", 0.7)

    def _predict(self, image: ndarray) -> Iterable[Dict[str, Tensor]]:
        """
        Run the layout analysis model on one image.

        :param image: image array whose first two dimensions are height and width (``extract`` passes an RGB image)
        :return: post-processed detections, ``extract`` reads the "labels" and "boxes" entries of each item
        """
        import torch
        from transformers import RTDetrImageProcessor, RTDetrV2ForObjectDetection

        if self._image_processor is None:
            self._image_processor = RTDetrImageProcessor.from_pretrained(self._model_name)

        if self._model is None:
            self._model = RTDetrV2ForObjectDetection.from_pretrained(self._model_name)

        inputs = self._image_processor(images=[image], return_tensors="pt")
        # gradients are not needed for inference
        with torch.no_grad():
            outputs = self._model(**inputs)

        # target size is (height, width) of the image
        target_sizes = torch.tensor([image.shape[:2]])
        return self._image_processor.post_process_object_detection(outputs, target_sizes=target_sizes, threshold=self._threshold)

    def extract(self, file_path: str, parameters: Optional[dict] = None) -> List[AttachedFile]:
        """
        Get attachments from the given image using a document layout analysis method https://huggingface.co/docling-project/docling-layout-heron.

        Look to the :class:`~dedoc.attachments_extractors.AbstractAttachmentsExtractor` documentation to get the information about the methods' parameters.

        If the file cannot be read as an image, a warning is logged and an empty list is returned.
        Detected regions that have no area after clipping to the image borders are skipped.
        """
        import cv2
        from dedoc.utils.parameter_utils import get_param_need_content_analysis, get_param_attachments_dir
        from dedoc.utils.utils import get_unique_name
        from dedoc.readers.pdf_reader.data_classes.tables.location import Location
        from dedoc.readers.pdf_reader.data_classes.pdf_image_attachment import PdfImageAttachment

        parameters = {} if parameters is None else parameters
        tmpdir, filename = os.path.split(file_path)
        attachments_dir = get_param_attachments_dir(parameters, tmpdir)
        need_content_analysis = get_param_need_content_analysis(parameters)
        attachments = []

        image = cv2.imread(file_path)
        if image is None:
            # cv2.imread returns None instead of raising when the file is missing or cannot be decoded
            self.logger.warning(f"Cannot read image {file_path}, extraction of attachments is skipped")
            return attachments
        image = cv2.cvtColor(image, cv2.COLOR_BGR2RGB)
        image_height, image_width = image.shape[:2]

        for prediction in self._predict(image):
            for label_id, box in zip(prediction["labels"], prediction["boxes"]):
                if label_id.item() not in self._classes:
                    continue

                # box is (x_top_left, y_top_left, x_bottom_right, y_bottom_right), it is clipped to the image borders
                box = [round(i) for i in box.tolist()]
                x_top_left, x_bottom_right = max(0, box[0]), min(box[2], image_width)
                y_top_left, y_bottom_right = max(0, box[1]), min(box[3], image_height)
                if x_top_left >= x_bottom_right or y_top_left >= y_bottom_right:
                    # the crop would be empty and cv2.imwrite fails on empty images
                    continue

                part = image[y_top_left:y_bottom_right, x_top_left:x_bottom_right]
                image_location = Location(page_number=0, bbox=BBox.from_two_points((x_top_left, y_top_left), (x_bottom_right, y_bottom_right)))

                tmp_file_name = get_unique_name(filename)
                tmp_file_path = os.path.join(attachments_dir, tmp_file_name)
                # the image was converted to RGB for the model, cv2 writes BGR
                cv2.imwrite(tmp_file_path, cv2.cvtColor(part, cv2.COLOR_RGB2BGR))

                image_attachment = PdfImageAttachment(
                    original_name=tmp_file_name,
                    tmp_file_path=tmp_file_path,
                    need_content_analysis=need_content_analysis,
                    uid=f"attach_{uuid.uuid4()}",
                    location=image_location
                )
                attachments.append(image_attachment)

        return attachments
31 changes: 21 additions & 10 deletions dedoc/download_models.py
Original file line number Diff line number Diff line change
@@ -1,18 +1,19 @@
"""Downloading models in advance inside the docker container."""

"""
These are versions of the models that are used at the current moment - hashes of commits from https://huggingface.co/dedoc.
These are versions of the models that are used at the current moment - hashes of commits from https://huggingface.co/dedoc and other users.
Keys are the names of repositories with models.
"""
model_hash_dict = dict(
txtlayer_classifier="9ca1de749d8d37147b00a3a228e03ee1776c695f",
scan_orientation_efficient_net_b0="c60812552a1be624476c1e5b58599867b36f8d4e",
font_classifier="db4481ad60ab050cbb42079b64f97f9e431feb07",
paragraph_classifier="97c4b78bc20d87ec7d53389e09f1ca35c6ade067",
line_type_classifiers="6ad0eacbfdea065b658cb6f039d13f75245d51ae",
fintoc_classifiers="6a907b7d2437c3f61ac9c506f67175207982fae8",
torch_cnn="5333909f858f5f632df478ef5a53af6dfd26f2e1"
)
model_hash_dict = {
"txtlayer_classifier": "9ca1de749d8d37147b00a3a228e03ee1776c695f",
"scan_orientation_efficient_net_b0": "c60812552a1be624476c1e5b58599867b36f8d4e",
"font_classifier": "db4481ad60ab050cbb42079b64f97f9e431feb07",
"paragraph_classifier": "97c4b78bc20d87ec7d53389e09f1ca35c6ade067",
"line_type_classifiers": "6ad0eacbfdea065b658cb6f039d13f75245d51ae",
"fintoc_classifiers": "6a907b7d2437c3f61ac9c506f67175207982fae8",
"torch_cnn": "5333909f858f5f632df478ef5a53af6dfd26f2e1",
"docling-layout-heron": "8f39ad3c0b4c58e9c2d2c84a38465abf757272d8"
}


def download_from_hub(out_dir: str, out_name: str, repo_name: str, hub_name: str, user_name: str = "dedoc") -> None:
Expand Down Expand Up @@ -54,6 +55,16 @@ def download(resources_path: str) -> None:

download_from_hub(out_dir=resources_path, out_name="glyph_recognizer.pt", repo_name="torch_cnn", hub_name="rus_eng.pt", user_name="sinkudo")

layout_dir_path = os.path.join(resources_path, "layout_model")
download_from_hub(out_dir=layout_dir_path, out_name="config.json", repo_name="docling-layout-heron", hub_name="config.json", user_name="docling-project")
download_from_hub(
out_dir=layout_dir_path, out_name="model.safetensors", repo_name="docling-layout-heron", hub_name="model.safetensors", user_name="docling-project"
)
download_from_hub(
out_dir=layout_dir_path, out_name="preprocessor_config.json", repo_name="docling-layout-heron", hub_name="preprocessor_config.json",
user_name="docling-project"
)


if __name__ == "__main__":
from dedoc.config import get_config
Expand Down
4 changes: 2 additions & 2 deletions dedoc/readers/pdf_reader/data_classes/tables/location.py
Original file line number Diff line number Diff line change
Expand Up @@ -34,7 +34,7 @@ def to_dict(self) -> Dict[str, Any]:
return res

def __eq__(self, other: "Location") -> bool:
return (self.page_number, self.bbox.y_bottom_right) == (other.page_number, other.bbox.y_bottom_right)
return (self.page_number, self.bbox.y_top_left, self.bbox.x_top_left) == (other.page_number, other.bbox.y_top_left, other.bbox.x_top_left)

def __lt__(self, other: "Location") -> bool:
return (self.page_number, self.bbox.y_bottom_right) < (other.page_number, other.bbox.y_bottom_right)
return (self.page_number, self.bbox.y_top_left, self.bbox.x_top_left) < (other.page_number, other.bbox.y_top_left, other.bbox.x_top_left)
44 changes: 34 additions & 10 deletions dedoc/readers/pdf_reader/pdf_image_reader/pdf_image_reader.py
Original file line number Diff line number Diff line change
Expand Up @@ -21,6 +21,8 @@ class PdfImageReader(PdfBaseReader):

* table detection and recognition;

* image detection;

* document binarization (configure via `need_binarization` parameter);

* document orientation correction (automatically rotate on 90, 180, 270 degrees if it's needed);
Expand All @@ -34,6 +36,7 @@ class PdfImageReader(PdfBaseReader):

def __init__(self, *, config: Optional[dict] = None) -> None:
from dedocutils.preprocessing import AdaptiveBinarizer, SkewCorrector
from dedoc.attachments_extractors.concrete_attachments_extractors.image_attachments_extractor import ImageAttachmentsExtractor
from dedoc.readers.pdf_reader.pdf_image_reader.columns_orientation_classifier.columns_orientation_classifier import ColumnsOrientationClassifier
from dedoc.readers.pdf_reader.pdf_image_reader.ocr.ocr_line_extractor import OCRLineExtractor
from dedoc.config import get_config
Expand All @@ -53,6 +56,7 @@ def __init__(self, *, config: Optional[dict] = None) -> None:
config=self.config)
self.binarizer = AdaptiveBinarizer()
self.ocr = OCRLineExtractor(config=self.config)
self.attachments_extractor = ImageAttachmentsExtractor(config=self.config)
self.page_number = None

def read(self, file_path: str, parameters: Optional[dict] = None) -> UnstructuredDocument:
Expand All @@ -66,21 +70,25 @@ def _process_one_page(self,
import os
from datetime import datetime
import cv2
from dedocutils.utils import rotate_image
from dedoc.utils.image_utils import fill_bbox_on_image
from dedoc.utils.parameter_utils import get_path_param
from dedoc.utils.utils import get_unique_name

initial_image = image
# --- Step 1: do binarization ---
if parameters.need_binarization:
image, _ = self.binarizer.preprocess(image)
if self.config.get("debug_mode", False):
debug_dir = get_path_param(self.config, "path_debug")
cv2.imwrite(os.path.join(debug_dir, f"{datetime.now().strftime('%H-%M-%S')}_result_binarization.jpg"), image)

# --- Step 1: correct orientation and detect column count ---
# --- Step 2: correct orientation and detect column count ---
self.page_number = page_number
rotated_image, is_one_column_document, angle = self._detect_column_count_and_orientation(image, parameters)
if self.config.get("debug_mode", False):
self.logger.info(f"Angle page rotation = {angle}")

# --- Step 2: do binarization ---
if parameters.need_binarization:
rotated_image, _ = self.binarizer.preprocess(rotated_image)
if self.config.get("debug_mode", False):
debug_dir = get_path_param(self.config, "path_debug")
cv2.imwrite(os.path.join(debug_dir, f"{datetime.now().strftime('%H-%M-%S')}_result_binarization.jpg"), rotated_image)

# --- Step 3: table detection and recognition ---
if parameters.need_pdf_table_analysis:
clean_image, tables = self.table_recognizer.recognize_tables_from_image(
Expand All @@ -92,10 +100,26 @@ def _process_one_page(self,
else:
clean_image, tables = rotated_image, []

# --- Step 4: plain text recognition and text style detection ---
# --- Step 4: image detection ---
attached_images = []
if parameters.with_attachments:
tmpdir = os.path.split(path)[0]
tmp_file_path = os.path.join(tmpdir, get_unique_name("rotated.png"))
non_binarized_rotated_image = rotate_image(initial_image, angle)
cv2.imwrite(tmp_file_path, non_binarized_rotated_image)
attached_images = []

for attach in self.attachments_extractor.extract(file_path=tmp_file_path, parameters=dict(zip(parameters._fields, parameters))):
attach.location.page_number = page_number
attached_images.append(attach)
clean_image = fill_bbox_on_image(clean_image, attach.location.bbox)

# --- Step 5: plain text recognition and text style detection ---
page = self.ocr.split_image2lines(image=clean_image, language=parameters.language, is_one_column_document=is_one_column_document, page_num=page_number)

lines = self.metadata_extractor.extract_metadata_and_set_annotations(page_with_lines=page)
if parameters.with_attachments:
page.attachments.extend(attached_images)

return lines, tables, page.attachments, [angle]

def _detect_column_count_and_orientation(self, image: ndarray, parameters: ParametersForParseDoc) -> Tuple[ndarray, bool, float]:
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -8,13 +8,13 @@
import cv2
import numpy as np
from PIL import Image
from dedocutils.data_structures import BBox

from dedoc.data_structures.line_with_meta import LineWithMeta
from dedoc.readers.pdf_reader.data_classes.tables.scantable import ScanTable
from dedoc.readers.pdf_reader.data_classes.tables.table_type import TableTypeAdditionalOptions
from dedoc.readers.pdf_reader.pdf_image_reader.table_recognizer.table_extractors.concrete_extractors.multipage_table_extractor import MultiPageTableExtractor
from dedoc.readers.pdf_reader.pdf_image_reader.table_recognizer.table_extractors.concrete_extractors.onepage_table_extractor import OnePageTableExtractor
from dedoc.utils.image_utils import fill_bbox_on_image

"""-------------------------------------entry class of Table Recognizer Module---------------------------------------"""

Expand Down Expand Up @@ -87,31 +87,9 @@ def __clean_image_from_table(image: np.ndarray, tables: List[ScanTable]) -> np.n
image_copy = np.copy(image)
for table in tables:
for location in table.locations:
image_copy = TableRecognizer.__clean_image(image_copy, location.bbox)
image_copy = fill_bbox_on_image(image_copy, location.bbox)
return image_copy

@staticmethod
def __clean_image(image: np.ndarray, bbox: BBox, color: int = 255) -> np.ndarray:
"""
replace bboxes with given color (for example to remove tables from images)
@param image: original image
@param bbox: bbox to clear from image
@param color: color to replace bboxes
@return: image without given bboxes
"""
x_min = bbox.x_top_left
x_max = x_min + bbox.width

y_min = bbox.y_top_left
y_max = y_min + bbox.height

if len(image.shape) == 3:
image[y_min: y_max, x_min: x_max, :] = color
else:
image[y_min: y_max, x_min: x_max] = color

return image

def __filter_bad_tables(self, tables: List[ScanTable], image: np.ndarray) -> List[ScanTable]:
filtered = []
for table in tables:
Expand Down
36 changes: 23 additions & 13 deletions dedoc/readers/pdf_reader/utils/line_object_linker.py
Original file line number Diff line number Diff line change
@@ -1,6 +1,6 @@
import logging
from collections import defaultdict, deque
from typing import Dict, List, Union
from typing import Dict, List, Optional, Union

from dedocutils.data_structures import BBox

Expand Down Expand Up @@ -82,20 +82,30 @@ def _find_closest_line(self,
@param lines_after: self.n_lines after object
@return: best line to link with object
"""
best_line = self._get_closest_line_on_same_page(page_object, lines_before)
if not best_line:
best_line = self._get_closest_line_on_same_page(page_object, lines_after)
if best_line:
return best_line

all_lines = lines_before + lines_after
line_on_same_page = [line for line in all_lines if line.location.page_number == page_object.location.page_number]
# no one line on the same page
if len(line_on_same_page) == 0:
previous_page_id = page_object.location.page_number - 1
if previous_page_id in last_page_line:
return last_page_line[previous_page_id]
lines_prev_page = [line for line in all_lines if line.location < page_object.location]
if len(lines_prev_page) > 0:
return max(lines_prev_page, key=lambda line: line.location)
else:
return min(all_lines, key=lambda line: line.location)
line_with_distance = [(self._distance_bboxes(line, page_object.location.bbox), line) for line in line_on_same_page]
return min(line_with_distance, key=lambda t: t[0])[1]
previous_page_id = page_object.location.page_number - 1
if previous_page_id in last_page_line:
return last_page_line[previous_page_id]
lines_prev_page = [line for line in all_lines if line.location < page_object.location]
if len(lines_prev_page) > 0:
return max(lines_prev_page, key=lambda line: line.location)
else:
return min(all_lines, key=lambda line: line.location)

@staticmethod
def _get_closest_line_on_same_page(page_obj: Union[ScanTable, PdfImageAttachment], lines: List[LineWithLocation]) -> Optional[LineWithLocation]:
    """
    Choose the line nearest to the given object among the lines that are located on the object's page.

    @param page_obj: table or image attachment to find a line for
    @param lines: candidate lines, the lines from other pages are ignored
    @return: the line with the smallest distance to the object's bbox (the first one if there are several), None if there are no lines on the object's page
    """
    candidates = (line for line in lines if line.location.page_number == page_obj.location.page_number)
    return min(candidates, key=lambda line: LineObjectLinker._distance_bboxes(line, page_obj.location.bbox), default=None)

@staticmethod
def _distance_bboxes(line: LineWithLocation, object_bbox: BBox) -> float:
Expand Down
22 changes: 22 additions & 0 deletions dedoc/utils/image_utils.py
Original file line number Diff line number Diff line change
Expand Up @@ -107,3 +107,25 @@ def get_concat_v(images: List[Image.Image]) -> Image.Image:
dst.paste(image, (0, height))
height += image.height
return dst


def fill_bbox_on_image(image: np.ndarray, bbox: BBox, color: int = 255) -> np.ndarray:
    """
    Fill the bbox area of the image with the given color (for example to remove tables from images).
    The image is modified in place, the part of the bbox that lies outside the image is ignored.

    @param image: original image (2D, or 3D with the first two axes being rows and columns), it is changed in place
    @param bbox: bbox to clear from image
    @param color: color to fill the bbox area with
    @return: the same image object with the bbox area filled
    """
    # numpy counts negative slice bounds from the end of the axis, so they are clamped to zero;
    # bounds that exceed the image size are truncated by numpy itself
    x_min = max(bbox.x_top_left, 0)
    x_max = max(bbox.x_top_left + bbox.width, 0)

    y_min = max(bbox.y_top_left, 0)
    y_max = max(bbox.y_top_left + bbox.height, 0)

    # slicing the first two axes handles both 2D and 3D images
    image[y_min:y_max, x_min:x_max] = color

    return image
Loading
Loading