diff --git a/normcap/detection/codes/detector.py b/normcap/detection/codes/detector.py index b8c93d2b5..599703d68 100644 --- a/normcap/detection/codes/detector.py +++ b/normcap/detection/codes/detector.py @@ -105,7 +105,8 @@ def _detect_codes_via_zxing( text, text_type = _get_text_type_and_transform(text) return text, text_type, code_type - return os.linesep.join(codes), TextType.MULTI_LINE, code_type + result_text = os.linesep.join(codes) + return result_text, TextType.MULTI_LINE, code_type def detect_codes(image: QtGui.QImage) -> DetectionResult | None: diff --git a/normcap/detection/detector.py b/normcap/detection/detector.py index 79ff8b9dd..3199a82a4 100644 --- a/normcap/detection/detector.py +++ b/normcap/detection/detector.py @@ -22,6 +22,7 @@ def detect( language: str, detect_mode: DetectionMode, parse_text: bool, + strip_whitespaces: bool, ) -> DetectionResult: ocr_result = None codes_result = None @@ -44,6 +45,7 @@ def detect( tessdata_path=tessdata_path, parse=parse_text, resize_factor=2, + strip_whitespaces=strip_whitespaces, padding_size=80, ) logger.debug("OCR detection took %s s", f"{time.time() - start_time:.4f}.") diff --git a/normcap/detection/ocr/recognize.py b/normcap/detection/ocr/recognize.py index 09ea08c41..4ee35f291 100644 --- a/normcap/detection/ocr/recognize.py +++ b/normcap/detection/ocr/recognize.py @@ -38,6 +38,7 @@ def get_text_from_image( tesseract_bin_path: PathLike, tessdata_path: PathLike | str | None = None, parse: bool = True, + strip_whitespaces: bool = False, resize_factor: float | None = None, padding_size: int | None = None, ) -> DetectionResult: @@ -67,13 +68,17 @@ def get_text_from_image( logger.debug("OCR detections:\n%s", ",\n".join(str(w) for w in result.words)) if not parse: + # Even without parsing, apply smart whitespace stripping if enabled + raw_text = result.text + if strip_whitespaces and transformer._should_strip_whitespaces(tess_args.lang): + raw_text = transformer._smart_strip_cjk_whitespaces(raw_text) return 
DetectionResult( - text=result.text, + text=raw_text, text_type=TextType.SINGLE_LINE, detector=TextDetector.OCR_RAW, ) - result = transformer.apply(result) + result = transformer.apply(result, strip_whitespaces=strip_whitespaces) logger.debug("Parsed text:\n%s", result.parsed) text_type = ( TextType[result.best_scored_transformer.value] diff --git a/normcap/detection/ocr/transformer.py b/normcap/detection/ocr/transformer.py index d7729e09d..947f3178d 100644 --- a/normcap/detection/ocr/transformer.py +++ b/normcap/detection/ocr/transformer.py @@ -17,7 +17,7 @@ } -def apply(ocr_result: OcrResult) -> OcrResult: +def apply(ocr_result: OcrResult, strip_whitespaces: bool = False) -> OcrResult: """Load transformers, calculate score, execute transformer with highest score. Args: @@ -33,8 +33,11 @@ def apply(ocr_result: OcrResult) -> OcrResult: if best_transformer_name := ocr_result.best_scored_transformer: best_transformer = _transformers[best_transformer_name] ocr_result.parsed = best_transformer.transform(ocr_result) + else: + # No transformer matched, use raw OCR text + ocr_result.parsed = ocr_result.add_linebreaks() - ocr_result.parsed = _post_process(ocr_result) + ocr_result.parsed = _post_process(ocr_result, strip_whitespaces) return ocr_result @@ -47,17 +50,82 @@ def _clean(text: str) -> str: return text # unnecessary return for clarity -def _post_process(ocr_result: OcrResult) -> str: +def _post_process(ocr_result: OcrResult, strip_whitespaces: bool = False) -> str: """Apply postprocessing to transformed output.""" text = ocr_result.parsed text = _clean(text) - # ONHOLD: Check tesseract issue if whitespace workaround still necessary: - # https://github.com/tesseract-ocr/tesseract/issues/2702 - if ocr_result.tess_args.is_language_without_spaces(): - text = text.replace(" ", "") + # Smart whitespace stripping for CJK text + logger.debug( + "Whitespace stripping: enabled=%s, should_strip=%s, lang=%s", + strip_whitespaces, + 
_should_strip_whitespaces(ocr_result.tess_args.lang) if strip_whitespaces else "N/A", + ocr_result.tess_args.lang, + ) + if strip_whitespaces and _should_strip_whitespaces(ocr_result.tess_args.lang): + logger.debug("Before smart stripping: %s", repr(text[:100])) + text = _smart_strip_cjk_whitespaces(text) + logger.debug("After smart stripping: %s", repr(text[:100])) return text +def _should_strip_whitespaces(lang: str) -> bool: + """Check if language contains CJK characters that benefit from smart stripping. + + Now checks for Chinese, Japanese, or Korean languages. + """ + selected_languages = lang.split("+") + cjk_langs = {"chi_", "jpn", "kor"} + return any( + any(sel_lang.startswith(cjk_prefix) for cjk_prefix in cjk_langs) + for sel_lang in selected_languages + ) + + +def _smart_strip_cjk_whitespaces(text: str) -> str: + """Strip whitespaces from CJK text using smart algorithm. + + Rules: + - Remove spaces between CJK characters only (keep spaces for English words) + - Remove soft line breaks (after non-punctuation characters) + - Keep hard line breaks (after end punctuation like 。!?;:) + - Keep paragraph breaks (double newlines -> single newline) + + This smart algorithm works well for mixed CJK-Latin text. 
+ """ + # Define CJK character range (Chinese, Japanese, Korean) + cjk_pattern = r'[\u4e00-\u9fff\u3400-\u4dbf\u3000-\u303f\uff00-\uffef]' + # End punctuation that indicates sentence end + end_punct_pattern = r'[。!?;:]' + + # Step 1: Handle paragraph breaks (double newlines) + text = re.sub(r'\n\n+', '<<>>', text) + + # Step 2: Handle line breaks intelligently + lines = text.split('\n') + result = [] + for i, line in enumerate(lines): + if i < len(lines) - 1: # Not the last line + # Check if line ends with end punctuation + if re.search(end_punct_pattern + r'$', line.rstrip()): + # Hard break after punctuation - keep newline + result.append(line.rstrip() + '\n') + else: + # Soft break - remove newline + result.append(line.rstrip()) + else: + result.append(line.rstrip()) + text = ''.join(result) + + # Step 3: Remove spaces adjacent to CJK characters (but preserve ASCII word spacing) + # Remove: CJK + space + non-letter + text = re.sub(f'({cjk_pattern})[ \t]+(?![a-zA-Z])', r'\1', text) + # Remove: non-letter + space + CJK + text = re.sub(f'(?>>', '\n') + + return text def _calc_scores(ocr_result: OcrResult) -> dict[Transformer, float]: """Calculate score for every loaded transformer. diff --git a/normcap/gui/menu_button.py b/normcap/gui/menu_button.py index bce50708e..bd432698d 100644 --- a/normcap/gui/menu_button.py +++ b/normcap/gui/menu_button.py @@ -300,6 +300,25 @@ def _add_postprocessing_section(self, menu: QtWidgets.QMenu) -> None: ) menu.addAction(action) + + # Fix OCR spacing artifacts in CJK text + # L10N: Entry in main menu's 'Post-processing' section + action = QtGui.QAction(_("Fix OCR spacing"), postprocessing_group) + action.setObjectName("fix-ocr-spacing") + action.setCheckable(True) + action.setChecked(bool(self.settings.value("fix-ocr-spacing", type=bool))) + # L10N: Tooltip of main menu's 'Fix OCR spacing' entry. Use <56 chars p. line. 
+ action.setToolTip( + _( + "Removes spacing artifacts in CJK text:\n" + "• Remove spaces between CJK characters\n" + "• Keep spaces between English words\n" + "• Remove soft line breaks\n" + "• Keep hard line breaks after punctuation" + ) + ) + menu.addAction(action) + def _add_detection_section(self, menu: QtWidgets.QMenu) -> None: detection_group = QtGui.QActionGroup(menu) detection_group.setObjectName("detection_group") diff --git a/normcap/gui/settings.py b/normcap/gui/settings.py index 89ed6a2ce..aa35c9767 100644 --- a/normcap/gui/settings.py +++ b/normcap/gui/settings.py @@ -103,6 +103,16 @@ def _parse_str_to_bool(string: str) -> bool: cli_arg=True, nargs=None, ), + Setting( + key="fix-ocr-spacing", + flag="", + type_=_parse_str_to_bool, + value=False, + help_="Fix spacing artifacts in CJK text recognition (removes extra spaces between characters).", + choices=(True, False), + cli_arg=True, + nargs=None, + ), Setting( key="current-version", flag="_c", diff --git a/normcap/gui/tray.py b/normcap/gui/tray.py index 789568f04..9ea38b885 100644 --- a/normcap/gui/tray.py +++ b/normcap/gui/tray.py @@ -328,8 +328,10 @@ def _trigger_detect(self, rect: Rect, screen_idx: int) -> None: language=self.settings.value("language"), detect_mode=detection_mode, parse_text=bool(self.settings.value("parse-text", type=bool)), + strip_whitespaces=bool(self.settings.value("fix-ocr-spacing", type=bool)), ) + if result.text and self.cli_mode: self._print_to_stdout_and_exit(text=result.text) elif result.text: diff --git a/tests/tests_detection/tests_ocr/test_transformer.py b/tests/tests_detection/tests_ocr/test_transformer.py index a10fdded3..480d59e62 100644 --- a/tests/tests_detection/tests_ocr/test_transformer.py +++ b/tests/tests_detection/tests_ocr/test_transformer.py @@ -51,3 +51,58 @@ def test_transformer_apply_scores(ocr_result, words, scores_expected): assert scores[transformer_name] == pytest.approx( scores_expected[transformer_name], abs=3 ), transformer_name + + 
# (tesseract language string, whether smart whitespace stripping applies)
_LANGUAGE_CASES = [
    ("chi_sim", True),
    ("chi_tra", True),
    ("chi_sim_vert", True),
    ("jpn", True),
    ("kor", True),
    ("eng", False),
    ("deu", False),
    # One CJK language in a combination is enough
    ("chi_sim+eng", True),
    # Combination without any CJK language
    ("eng+deu", False),
]

# (raw OCR text, text expected after smart whitespace stripping)
_STRIPPING_CASES = [
    # Line breaks without preceding end punctuation are soft and get removed
    ("这是第一行\n这是第二行", "这是第一行这是第二行"),
    ("多行文本\n继续\n还继续", "多行文本继续还继续"),
    # Line breaks following end punctuation are hard and survive
    ("这是句子。\n这是新句子。", "这是句子。\n这是新句子。"),
    ("句子一。\n句子二!\n句子三?", "句子一。\n句子二!\n句子三?"),
    # Spaces separating CJK characters are dropped
    ("这是 中文 文本", "这是中文文本"),
    ("中 文 字 符", "中文字符"),
    # Spaces next to English words are preserved
    ("这是 English word 混排", "这是 English word 混排"),
    ("PDF 文档识别", "PDF 文档识别"),
    # Double newlines (paragraph breaks) collapse to one newline
    ("第一段。\n\n第二段。", "第一段。\n第二段。"),
    ("段落一\n\n段落二\n\n段落三", "段落一\n段落二\n段落三"),
    # Combinations of the rules above
    ("中文English中文", "中文English中文"),
    ("行尾\n继续行", "行尾继续行"),
    ("句子结束。\n\n新段落开始", "句子结束。\n新段落开始"),
]


@pytest.mark.parametrize(("lang", "expected"), _LANGUAGE_CASES)
def test_should_strip_whitespaces(lang, expected):
    """CJK languages, alone or combined, must enable whitespace stripping."""
    assert transformer._should_strip_whitespaces(lang) == expected


@pytest.mark.parametrize(("input_text", "expected"), _STRIPPING_CASES)
def test_strip_chinese_whitespaces(input_text, expected):
    """Smart stripping must clean CJK spacing but leave Latin spacing alone."""
    assert transformer._smart_strip_cjk_whitespaces(input_text) == expected