From c256a332e62700e00661badb69e6343d3d99bbe7 Mon Sep 17 00:00:00 2001 From: Nasty Date: Thu, 19 Mar 2026 11:48:27 +0300 Subject: [PATCH 1/3] TALIE-1519: unify colors for images and pdfs --- .../image_attachments_extractor.py | 3 +-- dedoc/readers/pdf_reader/pdf_base_reader.py | 4 +++- 2 files changed, 4 insertions(+), 3 deletions(-) diff --git a/dedoc/attachments_extractors/concrete_attachments_extractors/image_attachments_extractor.py b/dedoc/attachments_extractors/concrete_attachments_extractors/image_attachments_extractor.py index 2d423445..25d04abb 100644 --- a/dedoc/attachments_extractors/concrete_attachments_extractors/image_attachments_extractor.py +++ b/dedoc/attachments_extractors/concrete_attachments_extractors/image_attachments_extractor.py @@ -70,7 +70,6 @@ def extract(self, file_path: str, parameters: Optional[dict] = None) -> List[Att attachments = [] image = cv2.imread(file_path) - image = cv2.cvtColor(image, cv2.COLOR_BGR2RGB) predictions = self._predict(image) for prediction in predictions: @@ -86,7 +85,7 @@ def extract(self, file_path: str, parameters: Optional[dict] = None) -> List[Att tmp_file_name = get_unique_name(filename) tmp_file_path = os.path.join(attachments_dir, tmp_file_name) - cv2.imwrite(tmp_file_path, cv2.cvtColor(part, cv2.COLOR_RGB2BGR)) + cv2.imwrite(tmp_file_path, part) image_attachment = PdfImageAttachment( original_name=tmp_file_name, diff --git a/dedoc/readers/pdf_reader/pdf_base_reader.py b/dedoc/readers/pdf_reader/pdf_base_reader.py index d59c114f..ea510c1a 100644 --- a/dedoc/readers/pdf_reader/pdf_base_reader.py +++ b/dedoc/readers/pdf_reader/pdf_base_reader.py @@ -240,6 +240,7 @@ def _split_pdf2image(self, path: str, page_from: int, page_to: int) -> Iterator[ if page_from >= page_to: return + import cv2 import math import os import numpy as np @@ -263,7 +264,8 @@ def _split_pdf2image(self, path: str, page_from: int, page_to: int) -> Iterator[ left += 1 if left > page_to + 1: break - yield np.array(image) + image = cv2.cvtColor(np.array(image), cv2.COLOR_BGR2RGB) + yield image except (PDFPageCountError, PDFSyntaxError) as error: raise BadFileFormatError(f"Bad pdf file:\n file_name = {os.path.basename(path)} \n exception = {error.args}") From 38ae5ed0013b42648b28ef33f6ac1f87bc6f3b48 Mon Sep 17 00:00:00 2001 From: Nasty Date: Thu, 19 Mar 2026 14:19:34 +0300 Subject: [PATCH 2/3] TALIE-1519: tests fix --- .../images_creators/concrete_creators/docx_images_creator.py | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/labeling/train_dataset/taskers/images_creators/concrete_creators/docx_images_creator.py b/labeling/train_dataset/taskers/images_creators/concrete_creators/docx_images_creator.py index 5df1da8b..625fa5e2 100644 --- a/labeling/train_dataset/taskers/images_creators/concrete_creators/docx_images_creator.py +++ b/labeling/train_dataset/taskers/images_creators/concrete_creators/docx_images_creator.py @@ -11,6 +11,7 @@ from typing import Dict, Iterable, Iterator, Optional, Tuple from typing import List +import cv2 import numpy as np from PIL import Image from PIL import ImageColor @@ -214,7 +215,8 @@ def _create_images_from_pdf(self, pdfs: PairedPdf, page: List[dict], tmp_dir: st uid2path = defaultdict(list) n = 0 for two_color, many_color in zip(two_color_images, many_color_images): - + two_color = cv2.cvtColor(two_color, cv2.COLOR_RGB2BGR) + many_color = cv2.cvtColor(many_color, cv2.COLOR_RGB2BGR) diff = many_color - two_color all_masks = np.abs(diff) > 0 many_color[all_masks] = 255 From 6afae1d28dc73739a336cd881dad70eb09afda32 Mon Sep 17 00:00:00 2001 From: Nasty Date: Thu, 19 Mar 2026 15:16:02 +0300 Subject: [PATCH 3/3] TALIE-1520: fix bugs in pdf parsing --- .../pdf_attachments_extractor.py | 5 ++++- .../pdf_auto_reader/txtlayer_detector.py | 22 ++++++++++--------- 2 files changed, 16 insertions(+), 11 deletions(-) diff --git a/dedoc/attachments_extractors/concrete_attachments_extractors/pdf_attachments_extractor.py b/dedoc/attachments_extractors/concrete_attachments_extractors/pdf_attachments_extractor.py index bd6aea2e..e82a68d1 100644 --- a/dedoc/attachments_extractors/concrete_attachments_extractors/pdf_attachments_extractor.py +++ b/dedoc/attachments_extractors/concrete_attachments_extractors/pdf_attachments_extractor.py @@ -88,7 +88,10 @@ def __get_root_attachments(self, reader: PdfReader) -> List[Tuple[str, bytes]]: import uuid attachments = [] - catalog = reader.trailer["/Root"] + catalog = reader.trailer.get("/Root") + if catalog is None: + return attachments + if "/Names" in catalog.keys() and "/EmbeddedFiles" in catalog["/Names"].keys() and "/Names" in catalog["/Names"]["/EmbeddedFiles"].keys(): file_names = catalog["/Names"]["/EmbeddedFiles"]["/Names"] for f in file_names: diff --git a/dedoc/readers/pdf_reader/pdf_auto_reader/txtlayer_detector.py b/dedoc/readers/pdf_reader/pdf_auto_reader/txtlayer_detector.py index de92dfc9..383623f6 100644 --- a/dedoc/readers/pdf_reader/pdf_auto_reader/txtlayer_detector.py +++ b/dedoc/readers/pdf_reader/pdf_auto_reader/txtlayer_detector.py @@ -2,7 +2,7 @@ import math from copy import deepcopy from itertools import chain -from typing import List +from typing import List, Optional import numpy as np @@ -37,22 +37,24 @@ def detect_txtlayer(self, path: str, parameters: dict) -> List[TxtLayerResult]: if txtlayer_classifier is None: raise ValueError(f"Unknown textual layer classifier `{classifier_name}`") + start, end = get_param_page_slice(parameters) + start = 1 if start is None else start + 1 + classify_each_page = get_bool_parameter(parameters, "each_page_textual_layer_detection", False) detect_function = self.__classify_each_page if classify_each_page else self.__classify_all_pages try: - return detect_function(path, parameters, txtlayer_classifier) + return detect_function(path, parameters, txtlayer_classifier, start, end) except Exception as e: self.logger.debug(f"Error occurred white detecting PDF textual layer ({e})") - return [TxtLayerResult(correct=False, start=1, end=None)] + return [TxtLayerResult(correct=False, start=start, end=end)] - def __classify_all_pages(self, path: str, parameters: dict, txtlayer_classifier: AbstractTxtlayerClassifier) -> List[TxtLayerResult]: + def __classify_all_pages( + self, path: str, parameters: dict, txtlayer_classifier: AbstractTxtlayerClassifier, start: int, end: Optional[int] + ) -> List[TxtLayerResult]: """ Check only first 8 pages of the document, use classification results for the entire document. Separately handle the first page (it's common that only first page doesn't have a textual layer). """ - start, end = get_param_page_slice(parameters) - start = 1 if start is None else start + 1 - parameters_copy = deepcopy(parameters) parameters_copy["pages"] = "1:8" # two batches for pdf_txtlayer_reader parameters_copy["need_pdf_table_analysis"] = "false" @@ -72,13 +74,13 @@ def __classify_all_pages(self, path: str, parameters: dict, txtlayer_classifier: else: return [TxtLayerResult(correct=False, start=start, end=start), TxtLayerResult(correct=True, start=start + 1, end=end)] - def __classify_each_page(self, path: str, parameters: dict, txtlayer_classifier: AbstractTxtlayerClassifier) -> List[TxtLayerResult]: + def __classify_each_page( + self, path: str, parameters: dict, txtlayer_classifier: AbstractTxtlayerClassifier, start: int, end: Optional[int] + ) -> List[TxtLayerResult]: """ Classify each page of the document correct/not correct textual layer. """ document = self.pdf_reader.read(path, parameters=parameters) - start, end = get_param_page_slice(parameters) - start = 1 if start is None else start + 1 if not document.lines: return [TxtLayerResult(correct=False, start=start, end=end)]