Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
3 changes: 2 additions & 1 deletion normcap/detection/codes/detector.py
Original file line number Diff line number Diff line change
Expand Up @@ -105,7 +105,8 @@ def _detect_codes_via_zxing(
text, text_type = _get_text_type_and_transform(text)
return text, text_type, code_type

return os.linesep.join(codes), TextType.MULTI_LINE, code_type
result_text = os.linesep.join(codes)
return result_text, TextType.MULTI_LINE, code_type


def detect_codes(image: QtGui.QImage) -> DetectionResult | None:
Expand Down
2 changes: 2 additions & 0 deletions normcap/detection/detector.py
Original file line number Diff line number Diff line change
Expand Up @@ -22,6 +22,7 @@ def detect(
language: str,
detect_mode: DetectionMode,
parse_text: bool,
strip_whitespaces: bool,
) -> DetectionResult:
ocr_result = None
codes_result = None
Expand All @@ -44,6 +45,7 @@ def detect(
tessdata_path=tessdata_path,
parse=parse_text,
resize_factor=2,
strip_whitespaces=strip_whitespaces,
padding_size=80,
)
logger.debug("OCR detection took %s s", f"{time.time() - start_time:.4f}.")
Expand Down
9 changes: 7 additions & 2 deletions normcap/detection/ocr/recognize.py
Original file line number Diff line number Diff line change
Expand Up @@ -38,6 +38,7 @@ def get_text_from_image(
tesseract_bin_path: PathLike,
tessdata_path: PathLike | str | None = None,
parse: bool = True,
strip_whitespaces: bool = False,
resize_factor: float | None = None,
padding_size: int | None = None,
) -> DetectionResult:
Expand Down Expand Up @@ -67,13 +68,17 @@ def get_text_from_image(
logger.debug("OCR detections:\n%s", ",\n".join(str(w) for w in result.words))

if not parse:
# Even without parsing, apply smart whitespace stripping if enabled
raw_text = result.text
if strip_whitespaces and transformer._should_strip_whitespaces(tess_args.lang):
raw_text = transformer._smart_strip_cjk_whitespaces(raw_text)
return DetectionResult(
text=result.text,
text=raw_text,
text_type=TextType.SINGLE_LINE,
detector=TextDetector.OCR_RAW,
)

result = transformer.apply(result)
result = transformer.apply(result, strip_whitespaces=strip_whitespaces)
logger.debug("Parsed text:\n%s", result.parsed)
text_type = (
TextType[result.best_scored_transformer.value]
Expand Down
82 changes: 75 additions & 7 deletions normcap/detection/ocr/transformer.py
Original file line number Diff line number Diff line change
Expand Up @@ -17,7 +17,7 @@
}


def apply(ocr_result: OcrResult) -> OcrResult:
def apply(ocr_result: OcrResult, strip_whitespaces: bool = False) -> OcrResult:
"""Load transformers, calculate score, execute transformer with highest score.

Args:
Expand All @@ -33,8 +33,11 @@ def apply(ocr_result: OcrResult) -> OcrResult:
if best_transformer_name := ocr_result.best_scored_transformer:
best_transformer = _transformers[best_transformer_name]
ocr_result.parsed = best_transformer.transform(ocr_result)
else:
# No transformer matched, use raw OCR text
ocr_result.parsed = ocr_result.add_linebreaks()

ocr_result.parsed = _post_process(ocr_result)
ocr_result.parsed = _post_process(ocr_result, strip_whitespaces)
return ocr_result


Expand All @@ -47,17 +50,82 @@ def _clean(text: str) -> str:
return text # unnecessary return for clarity


def _post_process(ocr_result: OcrResult) -> str:
def _post_process(ocr_result: OcrResult, strip_whitespaces: bool = False) -> str:
"""Apply postprocessing to transformed output.

Takes ``ocr_result.parsed``, passes it through ``_clean`` and then applies the
whitespace handling below. The processed text is returned; ``ocr_result`` itself
is not modified here.

Args:
ocr_result: OCR result whose ``parsed`` text and ``tess_args`` are read.
strip_whitespaces: When true and ``tess_args.lang`` contains a CJK language
(see ``_should_strip_whitespaces``), run ``_smart_strip_cjk_whitespaces``.

Returns:
The post-processed text.
"""
text = ocr_result.parsed
text = _clean(text)
# ONHOLD: Check tesseract issue if whitespace workaround still necessary:
# https://github.com/tesseract-ocr/tesseract/issues/2702
# NOTE(review): when this branch runs, every space is removed before the smart
# stripping below, so no Latin word spacing is left for it to preserve. Confirm
# whether this workaround is still meant to run together with the smart stripping.
if ocr_result.tess_args.is_language_without_spaces():
text = text.replace(" ", "")
# Smart whitespace stripping for CJK text
# The language check is only evaluated for the log message when the option is
# enabled; otherwise "N/A" is logged instead.
logger.debug(
"Whitespace stripping: enabled=%s, should_strip=%s, lang=%s",
strip_whitespaces,
_should_strip_whitespaces(ocr_result.tess_args.lang) if strip_whitespaces else "N/A",
ocr_result.tess_args.lang,
)
if strip_whitespaces and _should_strip_whitespaces(ocr_result.tess_args.lang):
# Only the first 100 characters are logged to keep debug output short.
logger.debug("Before smart stripping: %s", repr(text[:100]))
text = _smart_strip_cjk_whitespaces(text)
logger.debug("After smart stripping: %s", repr(text[:100]))
return text


def _should_strip_whitespaces(lang: str) -> bool:
    """Return whether any selected language is Chinese, Japanese or Korean.

    Args:
        lang: Language specifier; several languages are joined with ``+``
            (e.g. ``"chi_sim+eng"``).

    Returns:
        True if at least one of the selected languages starts with ``chi_``,
        ``jpn`` or ``kor``. Matching is by prefix, so variants such as
        ``chi_sim_vert`` are covered as well.
    """
    # str.startswith accepts a tuple, so one call tests all prefixes at once.
    cjk_prefixes = ("chi_", "jpn", "kor")
    return any(name.startswith(cjk_prefixes) for name in lang.split("+"))


# Characters treated as CJK when deciding whether a neighbouring space is an
# OCR artifact. Hangul is deliberately not included: Korean separates words
# with spaces, so those spaces carry meaning.
_CJK_CHAR = (
    r"["
    r"\u4e00-\u9fff"  # CJK Unified Ideographs
    r"\u3400-\u4dbf"  # CJK Unified Ideographs Extension A
    r"\u3000-\u303f"  # CJK Symbols and Punctuation
    r"\u3040-\u30ff"  # Hiragana and Katakana
    r"\uff00-\uffef"  # Halfwidth and Fullwidth Forms
    r"]"
)
# Two or more consecutive newlines separate paragraphs.
_PARAGRAPH_BREAK = re.compile(r"\n\n+")
# Sentence-ending punctuation at the end of a line, in both the ASCII and the
# fullwidth forms of "!?;:".
_HARD_BREAK = re.compile(r"[。!?;:\uff01\uff1f\uff1b\uff1a]$")
_SPACE_AFTER_CJK = re.compile(f"({_CJK_CHAR})[ \t]+(?![a-zA-Z])")
_SPACE_BEFORE_CJK = re.compile(f"(?<![a-zA-Z])[ \t]+({_CJK_CHAR})")
_ASCII_WORD_END = re.compile(r"[A-Za-z0-9]$")
_ASCII_WORD_START = re.compile(r"[A-Za-z0-9]")


def _line_separator(current: str, following: str) -> str:
    """Return the text to put between two consecutive lines of one paragraph."""
    if _HARD_BREAK.search(current):
        # Sentence ended on this line: the line break is intentional.
        return "\n"
    if _ASCII_WORD_END.search(current) and _ASCII_WORD_START.match(following):
        # A wrapped Latin word boundary: the newline was the only separator,
        # so replace it with a space instead of gluing the words together.
        return " "
    return ""


def _unwrap_paragraph(paragraph: str) -> str:
    """Join the lines of a single paragraph, dropping soft line breaks."""
    lines = [line.rstrip() for line in paragraph.split("\n")]
    pieces = [lines[0]]
    for previous, line in zip(lines, lines[1:]):
        pieces.append(_line_separator(previous, line))
        pieces.append(line)
    return "".join(pieces)


def _smart_strip_cjk_whitespaces(text: str) -> str:
    """Strip OCR whitespace artifacts from CJK text, keeping Latin word spacing.

    Rules:
    - Paragraph breaks (two or more newlines) become a single newline.
    - Hard line breaks (line ends with sentence punctuation such as
      ``。!?;:`` or their fullwidth forms) are kept.
    - Soft line breaks (any other line end) are removed; if the break separates
      two ASCII letters/digits it is replaced by a space so that Latin words
      are not merged.
    - Spaces and tabs next to a CJK character are removed unless an ASCII
      letter sits on the other side of them.
    - Trailing whitespace of every line is dropped.

    Args:
        text: Text to clean up.

    Returns:
        The text with whitespace artifacts removed.
    """
    # Paragraphs are processed separately instead of marking the breaks with an
    # in-band placeholder, so input that happens to contain such a placeholder
    # string passes through untouched.
    paragraphs = [_unwrap_paragraph(p) for p in _PARAGRAPH_BREAK.split(text)]
    text = "\n".join(paragraphs)

    # CJK + whitespace + anything but an ASCII letter.
    text = _SPACE_AFTER_CJK.sub(r"\1", text)
    # Anything but an ASCII letter + whitespace + CJK.
    return _SPACE_BEFORE_CJK.sub(r"\1", text)


def _calc_scores(ocr_result: OcrResult) -> dict[Transformer, float]:
"""Calculate score for every loaded transformer.

Expand Down
19 changes: 19 additions & 0 deletions normcap/gui/menu_button.py
Original file line number Diff line number Diff line change
Expand Up @@ -300,6 +300,25 @@ def _add_postprocessing_section(self, menu: QtWidgets.QMenu) -> None:
)
menu.addAction(action)


# Fix OCR spacing artifacts in CJK text
# L10N: Entry in main menu's 'Post-processing' section
action = QtGui.QAction(_("Fix OCR spacing"), postprocessing_group)
action.setObjectName("fix-ocr-spacing")
action.setCheckable(True)
action.setChecked(bool(self.settings.value("fix-ocr-spacing", type=bool)))
# L10N: Tooltip of main menu's 'Fix OCR spacing' entry. Use <56 chars p. line.
action.setToolTip(
_(
"Removes spacing artifacts in CJK text:\n"
"• Remove spaces between CJK characters\n"
"• Keep spaces between English words\n"
"• Remove soft line breaks\n"
"• Keep hard line breaks after punctuation"
)
)
menu.addAction(action)

def _add_detection_section(self, menu: QtWidgets.QMenu) -> None:
detection_group = QtGui.QActionGroup(menu)
detection_group.setObjectName("detection_group")
Expand Down
10 changes: 10 additions & 0 deletions normcap/gui/settings.py
Original file line number Diff line number Diff line change
Expand Up @@ -103,6 +103,16 @@ def _parse_str_to_bool(string: str) -> bool:
cli_arg=True,
nargs=None,
),
Setting(
key="fix-ocr-spacing",
flag="",
type_=_parse_str_to_bool,
value=False,
help_="Fix spacing artifacts in CJK text recognition (removes extra spaces between characters).",
choices=(True, False),
cli_arg=True,
nargs=None,
),
Setting(
key="current-version",
flag="_c",
Expand Down
2 changes: 2 additions & 0 deletions normcap/gui/tray.py
Original file line number Diff line number Diff line change
Expand Up @@ -328,8 +328,10 @@ def _trigger_detect(self, rect: Rect, screen_idx: int) -> None:
language=self.settings.value("language"),
detect_mode=detection_mode,
parse_text=bool(self.settings.value("parse-text", type=bool)),
strip_whitespaces=bool(self.settings.value("fix-ocr-spacing", type=bool)),
)


if result.text and self.cli_mode:
self._print_to_stdout_and_exit(text=result.text)
elif result.text:
Expand Down
55 changes: 55 additions & 0 deletions tests/tests_detection/tests_ocr/test_transformer.py
Original file line number Diff line number Diff line change
Expand Up @@ -51,3 +51,58 @@ def test_transformer_apply_scores(ocr_result, words, scores_expected):
assert scores[transformer_name] == pytest.approx(
scores_expected[transformer_name], abs=3
), transformer_name


@pytest.mark.parametrize(
    ("lang", "expected"),
    [
        # Single CJK languages, including a vertical-script variant
        ("chi_sim", True),
        ("chi_tra", True),
        ("chi_sim_vert", True),
        ("jpn", True),
        ("kor", True),
        # Single non-CJK languages
        ("eng", False),
        ("deu", False),
        # Combined language specifiers
        ("chi_sim+eng", True),
        ("eng+deu", False),
    ],
)
def test_should_strip_whitespaces(lang, expected):
    """CJK language specifiers are detected, alone or combined with others."""
    assert transformer._should_strip_whitespaces(lang) == expected


@pytest.mark.parametrize(
    ("input_text", "expected"),
    [
        # A line break without preceding end punctuation is a soft break: drop it
        ("这是第一行\n这是第二行", "这是第一行这是第二行"),
        ("多行文本\n继续\n还继续", "多行文本继续还继续"),
        # A line break right after end punctuation is a hard break: keep it
        ("这是句子。\n这是新句子。", "这是句子。\n这是新句子。"),
        ("句子一。\n句子二!\n句子三?", "句子一。\n句子二!\n句子三?"),
        # Spaces separating CJK characters are dropped
        ("这是 中文 文本", "这是中文文本"),
        ("中 文 字 符", "中文字符"),
        # Spaces next to English words survive
        ("这是 English word 混排", "这是 English word 混排"),
        ("PDF 文档识别", "PDF 文档识别"),
        # Double newlines (paragraph breaks) collapse into one newline
        ("第一段。\n\n第二段。", "第一段。\n第二段。"),
        ("段落一\n\n段落二\n\n段落三", "段落一\n段落二\n段落三"),
        # Combinations of the rules above
        ("中文English中文", "中文English中文"),
        ("行尾\n继续行", "行尾继续行"),
        ("句子结束。\n\n新段落开始", "句子结束。\n新段落开始"),
    ],
)
def test_strip_chinese_whitespaces(input_text, expected):
    """Smart stripping yields the expected text for CJK and mixed CJK/Latin input."""
    stripped = transformer._smart_strip_cjk_whitespaces(input_text)
    assert stripped == expected