dynobo · shukebeta · Oct 14, 2025 · Oct 14, 2025 · Oct 14, 2025 · Oct 14, 2025
diff --git a/normcap/detection/codes/detector.py b/normcap/detection/codes/detector.py
@@ -105,7 +105,8 @@ def _detect_codes_via_zxing(
         text, text_type = _get_text_type_and_transform(text)
         return text, text_type, code_type
 
-    return os.linesep.join(codes), TextType.MULTI_LINE, code_type
+    result_text = os.linesep.join(codes)
+    return result_text, TextType.MULTI_LINE, code_type
 
 
 def detect_codes(image: QtGui.QImage) -> DetectionResult | None:

diff --git a/normcap/detection/detector.py b/normcap/detection/detector.py
@@ -22,6 +22,7 @@ def detect(
     language: str,
     detect_mode: DetectionMode,
     parse_text: bool,
+    strip_whitespaces: bool,
 ) -> DetectionResult:
     ocr_result = None
     codes_result = None
@@ -44,6 +45,7 @@ def detect(
             tessdata_path=tessdata_path,
             parse=parse_text,
             resize_factor=2,
+            strip_whitespaces=strip_whitespaces,
             padding_size=80,
         )
         logger.debug("OCR detection took %s s", f"{time.time() - start_time:.4f}.")

diff --git a/normcap/detection/ocr/recognize.py b/normcap/detection/ocr/recognize.py
@@ -38,6 +38,7 @@ def get_text_from_image(
     tesseract_bin_path: PathLike,
     tessdata_path: PathLike | str | None = None,
     parse: bool = True,
+    strip_whitespaces: bool = False,
     resize_factor: float | None = None,
     padding_size: int | None = None,
 ) -> DetectionResult:
@@ -67,13 +68,17 @@ def get_text_from_image(
     logger.debug("OCR detections:\n%s", ",\n".join(str(w) for w in result.words))
 
     if not parse:
+        # Even without parsing, apply smart whitespace stripping if enabled
+        raw_text = result.text
+        if strip_whitespaces and transformer._should_strip_whitespaces(tess_args.lang):
+            raw_text = transformer._smart_strip_cjk_whitespaces(raw_text)
         return DetectionResult(
-            text=result.text,
+            text=raw_text,
             text_type=TextType.SINGLE_LINE,
             detector=TextDetector.OCR_RAW,
         )
 
-    result = transformer.apply(result)
+    result = transformer.apply(result, strip_whitespaces=strip_whitespaces)
     logger.debug("Parsed text:\n%s", result.parsed)
     text_type = (
         TextType[result.best_scored_transformer.value]

diff --git a/normcap/detection/ocr/transformer.py b/normcap/detection/ocr/transformer.py
@@ -17,7 +17,7 @@
 }
 
 
-def apply(ocr_result: OcrResult) -> OcrResult:
+def apply(ocr_result: OcrResult, strip_whitespaces: bool = False) -> OcrResult:
     """Load transformers, calculate score, execute transformer with highest score.
 
     Args:
@@ -33,8 +33,11 @@ def apply(ocr_result: OcrResult) -> OcrResult:
     if best_transformer_name := ocr_result.best_scored_transformer:
         best_transformer = _transformers[best_transformer_name]
         ocr_result.parsed = best_transformer.transform(ocr_result)
+    else:
+        # No transformer matched, use raw OCR text
+        ocr_result.parsed = ocr_result.add_linebreaks()
 
-    ocr_result.parsed = _post_process(ocr_result)
+    ocr_result.parsed = _post_process(ocr_result, strip_whitespaces)
     return ocr_result
 
 
@@ -47,17 +50,82 @@ def _clean(text: str) -> str:
     return text  # unnecessary return for clarity
 
 
-def _post_process(ocr_result: OcrResult) -> str:
+def _post_process(ocr_result: OcrResult, strip_whitespaces: bool = False) -> str:
     """Apply postprocessing to transformed output."""
     text = ocr_result.parsed
     text = _clean(text)
-    # ONHOLD: Check tesseract issue if whitespace workaround still necessary:
-    # https://github.com/tesseract-ocr/tesseract/issues/2702
-    if ocr_result.tess_args.is_language_without_spaces():
-        text = text.replace(" ", "")
+    # Decide once, so the log line and the actual behaviour cannot disagree.
+    lang = ocr_result.tess_args.lang
+    should_strip = strip_whitespaces and _should_strip_whitespaces(lang)
+    log_state = should_strip if strip_whitespaces else "N/A"
+    log_msg = "Whitespace stripping: enabled=%s, should_strip=%s, lang=%s"
+    logger.debug(log_msg, strip_whitespaces, log_state, lang)
+    if should_strip:
+        # %r defers repr() until the log record is actually emitted.
+        logger.debug("Before smart stripping: %r", text[:100])
+        text = _smart_strip_cjk_whitespaces(text)
+        logger.debug("After smart stripping: %r", text[:100])
     return text
 
 
+def _should_strip_whitespaces(lang: str) -> bool:
+    """Tell whether smart whitespace stripping applies to the given languages.
+
+    Args:
+        lang: Language string; several language codes are joined by "+".
+
+    Returns:
+        True if any code starts with a Chinese, Japanese or Korean prefix.
+    """
+    cjk_prefixes = ("chi_", "jpn", "kor")
+    return any(code.startswith(cjk_prefixes) for code in lang.split("+"))
+
+
+def _smart_strip_cjk_whitespaces(text: str) -> str:
+    """Remove OCR spacing artifacts from CJK text, keeping Latin word spacing.
+
+    Rules:
+    - Blank lines are paragraph breaks and collapse to a single newline.
+    - A line break after end punctuation (。！？；：) is hard and is kept.
+    - Any other line break is soft: it is dropped between two CJK characters
+      and replaced by one space otherwise, so Latin/Hangul words do not fuse.
+    - Spaces/tabs next to a CJK character are removed, unless an ASCII letter
+      is on their other side.
+
+    Hangul is deliberately not part of the CJK class: Korean separates words
+    with spaces, so those are not artifacts. Kana is included for Japanese.
+
+    Args:
+        text: OCR output whose lines are separated by newline characters.
+
+    Returns:
+        The text with artifact whitespace removed.
+    """
+    import re  # NOTE(review): local because no module-level import is visible here
+
+    # Han, CJK punctuation, hiragana/katakana and fullwidth forms.
+    cjk = r"[\u3000-\u303f\u3040-\u30ff\u3400-\u4dbf\u4e00-\u9fff\uff00-\uffef]"
+    hard_break_chars = tuple("。！？；：")
+    paragraphs = []
+    for block in re.split(r"\n\s*\n", text):
+        merged = ""
+        for line in (raw.rstrip() for raw in block.split("\n")):
+            if merged.endswith(hard_break_chars):
+                merged += "\n" + line
+            elif merged and line and not (  # only CJK-to-CJK joins are fused
+                re.match(cjk, merged[-1]) and re.match(cjk, line.lstrip())
+            ):
+                merged += " " + line
+            else:
+                merged += line
+        paragraphs.append(merged)
+    text = "\n".join(paragraphs)
+
+    # Drop spaces between a CJK character and anything but an ASCII letter.
+    text = re.sub(rf"({cjk})[ \t]+(?![a-zA-Z])", r"\1", text)
+    return re.sub(rf"(?<![a-zA-Z])[ \t]+({cjk})", r"\1", text)
+
+
 def _calc_scores(ocr_result: OcrResult) -> dict[Transformer, float]:
     """Calculate score for every loaded transformer.
 

diff --git a/normcap/gui/menu_button.py b/normcap/gui/menu_button.py
@@ -300,6 +300,25 @@ def _add_postprocessing_section(self, menu: QtWidgets.QMenu) -> None:
         )
         menu.addAction(action)
 
+
+        # Fix OCR spacing artifacts in CJK text
+        # L10N: Entry in main menu's 'Post-processing' section
+        action = QtGui.QAction(_("Fix OCR spacing"), postprocessing_group)
+        action.setObjectName("fix-ocr-spacing")
+        action.setCheckable(True)
+        action.setChecked(bool(self.settings.value("fix-ocr-spacing", type=bool)))
+        # L10N: Tooltip of main menu's 'Fix OCR spacing' entry. Use <56 chars p. line.
+        action.setToolTip(
+            _(
+                "Removes spacing artifacts in CJK text:\n"
+                "• Remove spaces between CJK characters\n"
+                "• Keep spaces between English words\n"
+                "• Remove soft line breaks\n"
+                "• Keep hard line breaks after punctuation"
+            )
+        )
+        menu.addAction(action)
+
     def _add_detection_section(self, menu: QtWidgets.QMenu) -> None:
         detection_group = QtGui.QActionGroup(menu)
         detection_group.setObjectName("detection_group")

diff --git a/normcap/gui/settings.py b/normcap/gui/settings.py
@@ -103,6 +103,16 @@ def _parse_str_to_bool(string: str) -> bool:
         cli_arg=True,
         nargs=None,
     ),
+    Setting(
+        key="fix-ocr-spacing",
+        flag="",
+        type_=_parse_str_to_bool,
+        value=False,
+        help_="Fix spacing artifacts in CJK text recognition (removes extra spaces between characters).",
+        choices=(True, False),
+        cli_arg=True,
+        nargs=None,
+    ),
     Setting(
         key="current-version",
         flag="_c",

diff --git a/normcap/gui/tray.py b/normcap/gui/tray.py
@@ -328,8 +328,10 @@ def _trigger_detect(self, rect: Rect, screen_idx: int) -> None:
             language=self.settings.value("language"),
             detect_mode=detection_mode,
             parse_text=bool(self.settings.value("parse-text", type=bool)),
+            strip_whitespaces=bool(self.settings.value("fix-ocr-spacing", type=bool)),
         )
 
+
         if result.text and self.cli_mode:
             self._print_to_stdout_and_exit(text=result.text)
         elif result.text:

diff --git a/tests/tests_detection/tests_ocr/test_transformer.py b/tests/tests_detection/tests_ocr/test_transformer.py
@@ -51,3 +51,58 @@ def test_transformer_apply_scores(ocr_result, words, scores_expected):
         assert scores[transformer_name] == pytest.approx(
             scores_expected[transformer_name], abs=3
         ), transformer_name
+
+
+@pytest.mark.parametrize(
+    ("tess_lang", "is_cjk"),
+    [
+        ("chi_sim", True),
+        ("chi_tra", True),
+        ("chi_sim_vert", True),
+        ("jpn", True),
+        ("kor", True),
+        ("eng", False),
+        ("deu", False),
+        ("chi_sim+eng", True),  # one CJK language among several is enough
+        ("eng+deu", False),  # several languages, none of them CJK
+    ],
+)
+def test_should_strip_whitespaces(tess_lang, is_cjk):
+    """CJK language codes enable whitespace stripping, all others do not."""
+    # Call the private helper directly; it only inspects the language string.
+    assert transformer._should_strip_whitespaces(tess_lang) == is_cjk
+
+
+@pytest.mark.parametrize(
+    ("input_text", "expected"),
+    [
+        # Soft line breaks (no punctuation) should be removed
+        ("这是第一行\n这是第二行", "这是第一行这是第二行"),
+        ("多行文本\n继续\n还继续", "多行文本继续还继续"),
+
+        # Hard line breaks (after punctuation) should be kept
+        ("这是句子。\n这是新句子。", "这是句子。\n这是新句子。"),
+        ("句子一。\n句子二！\n句子三？", "句子一。\n句子二！\n句子三？"),
+
+        # Spaces between CJK characters should be removed
+        ("这是 中文 文本", "这是中文文本"),
+        ("中 文 字 符", "中文字符"),
+
+        # Spaces around English words should be kept
+        ("这是 English word 混排", "这是 English word 混排"),
+        ("PDF 文档识别", "PDF 文档识别"),
+
+        # Paragraph breaks (double newlines) should become single newline
+        ("第一段。\n\n第二段。", "第一段。\n第二段。"),
+        ("段落一\n\n段落二\n\n段落三", "段落一\n段落二\n段落三"),
+
+        # Mixed scenarios
+        ("中文English中文", "中文English中文"),
+        ("行尾\n继续行", "行尾继续行"),
+        ("句子结束。\n\n新段落开始", "句子结束。\n新段落开始"),
+    ],
+)
+def test_strip_chinese_whitespaces(input_text, expected):
+    """Check each stripping rule of `_smart_strip_cjk_whitespaces` on its own."""
+    result = transformer._smart_strip_cjk_whitespaces(input_text)
+    assert result == expected