diff --git a/confluence-mdx/bin/reverse_sync/patch_builder.py b/confluence-mdx/bin/reverse_sync/patch_builder.py index 252e42cf4..558eda67d 100644 --- a/confluence-mdx/bin/reverse_sync/patch_builder.py +++ b/confluence-mdx/bin/reverse_sync/patch_builder.py @@ -28,6 +28,10 @@ reconstruct_fragment_with_sidecar, rewrite_on_stored_template, ) +from reverse_sync.visible_segments import ( + extract_list_model_from_mdx, + extract_list_model_from_xhtml, +) def is_markdown_table(content: str) -> bool: @@ -183,53 +187,6 @@ def _detect_list_item_space_change(old_content: str, new_content: str) -> bool: return has_space_change -def _normalize_list_for_content_compare(content: str) -> str: - """리스트 변경 비교용 plain text를 생성한다. - - 리스트 항목 내부의 continuation line 줄바꿈은 emitter에서 공백 하나로 합쳐지므로 - 내용 변경 판정에서는 무시한다. 대신 항목 경계와 항목 내부의 실제 공백 수 차이는 - 그대로 보존해 no-op reflow와 가시 공백 변경을 구분한다. - - 마커 뒤 공백 수도 보존한다. normalize_mdx_to_plain이 마커를 제거하므로 - 마커 뒤 공백 수를 별도로 접두어에 기록하여 ``* text``와 ``* text``를 구분한다. - """ - marker_re = re.compile(r'^(\s*(?:\d+\.|[-*+]))(\s+)') - lines = content.strip().split('\n') - item_chunks: List[str] = [] - current_chunk: List[str] = [] - current_marker_ws: str = '' - - def _flush_current() -> None: - if not current_chunk: - return - plain = normalize_mdx_to_plain('\n'.join(current_chunk), 'list') - if plain: - item_chunks.append(current_marker_ws + plain.replace('\n', ' ')) - - for line in lines: - if not line.strip(): - continue - m = marker_re.match(line) - if m: - _flush_current() - current_chunk = [line] - current_marker_ws = m.group(2) - continue - if re.match(r'^\s*(?:\d+\.(?:\s+|$)|[-*+]\s+)', line): - _flush_current() - current_chunk = [line] - current_marker_ws = '' - continue - if current_chunk: - current_chunk.append(line) - else: - current_chunk = [line] - current_marker_ws = '' - - _flush_current() - return '\n'.join(item_chunks) - - def _build_inline_fixups( old_content: str, new_content: str, @@ -1106,17 +1063,16 @@ def _mark_used(block_id: str, m: BlockMapping): list_sidecar = _find_roundtrip_sidecar_block( change, mapping, roundtrip_sidecar, xpath_to_sidecar_block, ) - # v3 fallback, sidecar 없음, 또는 실제 텍스트 변경이 있는 경우 whole-fragment 재생성 - # (Phase 5 Axis 3: build_list_item_patches fallback 제거) - # 내용 비교는 가시 공백 수 변화는 보존하되, continuation line reflow처럼 - # emitter 결과가 동일한 줄바꿈 정리는 무시한다. - _old_plain_raw = _normalize_list_for_content_compare(change.old_block.content) - _new_plain_raw = _normalize_list_for_content_compare(change.new_block.content) - has_content_change = _old_plain_raw != _new_plain_raw - # _apply_mdx_diff_to_xhtml에 전달할 기본값은 collapse_ws 적용: - # XHTML plain text에는 줄바꿈이 없으므로 clean list 정렬에는 공백 축약본이 맞다. - _old_plain = collapse_ws(_old_plain_raw) - _new_plain = collapse_ws(_new_plain_raw) + old_list_model = extract_list_model_from_mdx(change.old_block.content) + new_list_model = extract_list_model_from_mdx(change.new_block.content) + xhtml_list_model = extract_list_model_from_xhtml(mapping.xhtml_text) + has_content_change = old_list_model.visible_text != new_list_model.visible_text + has_structure_change = ( + old_list_model.structural_fingerprint + != new_list_model.structural_fingerprint + ) + old_visible = old_list_model.visible_text + new_visible = new_list_model.visible_text # ol start 변경 감지: 숫자 목록의 시작 번호가 달라진 경우 _old_start = re.match(r'^\s*(\d+)\.', change.old_block.content) _new_start = re.match(r'^\s*(\d+)\.', change.new_block.content) @@ -1132,18 +1088,39 @@ def _mark_used(block_id: str, m: BlockMapping): block_type=change.old_block.type, ) has_inline_boundary = bool(inline_fixups) - has_any_change = has_content_change or has_ol_start_change or has_inline_boundary + has_patchable_text_change = ( + has_content_change or has_ol_start_change or has_inline_boundary + ) + has_rebuild_change = has_patchable_text_change or has_structure_change + requires_anchor_rebuild = sidecar_block_requires_reconstruction( + list_sidecar, + ) should_replace_clean_list = ( mapping is not None and not _contains_preserved_anchor_markup(mapping.xhtml_text) # sidecar 있으면 항상 허용; 없으면 실제 변경(텍스트 또는 번호 시작값)이 있을 때만 허용 - and (roundtrip_sidecar is not None or has_any_change) - and (list_sidecar is None or mapping_via_v3_fallback or has_any_change) + and (roundtrip_sidecar is not None or has_rebuild_change) + and (list_sidecar is None or mapping_via_v3_fallback or has_rebuild_change) ) + # preserved anchor list의 구조 변경은 item merge를 우선 시도하고, + # 실패한 경우에만 fragment 재구성으로 내려간다. + if (mapping is not None + and _contains_preserved_anchor_markup(mapping.xhtml_text)): + merge_patch = _build_list_item_merge_patch( + mapping, + change.old_block.content, + change.new_block.content, + old_visible, + new_visible, + ) + if merge_patch is not None: + _mark_used(mapping.block_id, mapping) + patches.append(merge_patch) + continue if (mapping is not None and ( - # anchor case: sidecar anchor metadata가 있으면 ac: 포함 여부 무관 - sidecar_block_requires_reconstruction(list_sidecar) + # anchor case: text-only preserved anchor list는 modify로 처리한다. + requires_anchor_rebuild # clean case: preserved anchor 없는 clean list or should_replace_clean_list )): @@ -1157,49 +1134,25 @@ def _mark_used(block_id: str, m: BlockMapping): ) ) continue - # preserved anchor list + 아이템 수 변경: DOM 직접 조작으로
  • 병합/제거 - if (mapping is not None - and _contains_preserved_anchor_markup(mapping.xhtml_text) - and has_content_change): - merge_patch = _build_list_item_merge_patch( - mapping, - change.old_block.content, - change.new_block.content, - _old_plain, - _new_plain, - ) - if merge_patch is not None: - _mark_used(mapping.block_id, mapping) - patches.append(merge_patch) - continue # preserved anchor list: text-level 패치로 ac:/ri: XHTML 구조 보존 # (_apply_mdx_diff_to_xhtml 경로) # 같은 부모의 다중 변경은 순차 집계한다 (이전 결과에 누적 적용) # inline_fixups, has_inline_boundary는 상단에서 이미 계산됨 - if mapping is not None and (has_any_change or has_inline_boundary): + if mapping is not None and has_patchable_text_change: bid = mapping.block_id if bid not in _text_change_patches: patch_entry: Dict[str, Any] = { 'xhtml_xpath': mapping.xhtml_xpath, - 'old_plain_text': mapping.xhtml_plain_text, - 'new_plain_text': mapping.xhtml_plain_text, + 'old_plain_text': xhtml_list_model.visible_text, + 'new_plain_text': xhtml_list_model.visible_text, } patches.append(patch_entry) _text_change_patches[bid] = patch_entry if has_content_change: - preserve_visible_ws = _contains_preserved_link_markup( - mapping.xhtml_text - ) - transfer_old_plain = _old_plain_raw if preserve_visible_ws else _old_plain - transfer_new_plain = _new_plain_raw if preserve_visible_ws else _new_plain transfer_xhtml_plain = _text_change_patches[bid]['new_plain_text'] - if not preserve_visible_ws: - # XHTML text를 정규화하여 MDX와 공백 1:1 매핑 보장 - # (strong trailing space 등으로 인한 이중 공백 문제 방지) - transfer_xhtml_plain = collapse_ws(transfer_xhtml_plain) _text_change_patches[bid]['new_plain_text'] = _apply_mdx_diff_to_xhtml( - transfer_old_plain, - transfer_new_plain, + old_visible, + new_visible, transfer_xhtml_plain, ) if has_ol_start_change: diff --git a/confluence-mdx/bin/reverse_sync/visible_segments.py b/confluence-mdx/bin/reverse_sync/visible_segments.py new file mode 100644 index 000000000..3509926c1 --- /dev/null +++ b/confluence-mdx/bin/reverse_sync/visible_segments.py @@ -0,0 +1,390 @@ +"""Lossless visible segment extraction for reverse sync. + +Phase 1 migrates list handling first. The abstraction is intentionally small: +- keep visible whitespace as explicit segments +- keep list/item structure in a fingerprint for rebuild decisions +- expose actual XHTML-visible text without the lossy normalize_mdx_to_plain step +""" + +from __future__ import annotations + +from dataclasses import dataclass, field +import html as html_module +import re +from typing import Any, Iterable, List, Literal, Tuple + +SegmentKind = Literal["list_marker", "ws", "text", "item_boundary", "anchor"] + +from bs4 import BeautifulSoup, Tag +from reverse_sync.mapping_recorder import get_text_with_emoticons + + +@dataclass(frozen=True) +class VisibleSegment: + kind: SegmentKind + text: str + visible: bool + structural: bool + meta: dict[str, Any] = field(default_factory=dict) + + +@dataclass(frozen=True) +class VisibleContentModel: + segments: List[VisibleSegment] + visible_text: str + structural_fingerprint: Tuple[Any, ...] + + +_MDX_LIST_ITEM_RE = re.compile(r'^(\s*)(\d+\.|[-*+])(\s*)(.*)$') +_MDX_TABLE_SEPARATOR_RE = re.compile(r'^\|[\s\-:|]+\|$') + + +@dataclass(frozen=True) +class _MdxListEntry: + path: Tuple[int, ...] + marker: str + marker_ws: str + body: str + continuation_lines: Tuple[str, ...] + + +def extract_list_model_from_mdx(content: str) -> VisibleContentModel: + """Build a lossless visible-content model from MDX list content.""" + entries = _parse_mdx_list_entries(content) + root_kind = _detect_root_list_kind(entries) + ol_start = _detect_ordered_start(entries) + + segments: List[VisibleSegment] = [] + visible_parts: List[str] = [] + fingerprint_items: List[Tuple[Any, ...]] = [] + + for index, entry in enumerate(entries): + rendered = _render_mdx_list_entry(entry) + continuation_target = _find_continuation_target_path(entries, index) + if continuation_target is not None and not rendered: + continue + + segment_path = continuation_target or entry.path + if (continuation_target is not None and rendered and visible_parts + and not visible_parts[-1].endswith((' ', '\t')) + and not rendered.startswith((' ', '\t'))): + rendered = f" {rendered}" + if continuation_target is None: + segments.append(VisibleSegment( + kind="list_marker", + text=entry.marker, + visible=False, + structural=True, + meta={"path": entry.path}, + )) + if entry.marker_ws: + segments.append(VisibleSegment( + kind="ws", + text=entry.marker_ws, + visible=False, + structural=True, + meta={"path": entry.path, "role": "marker_gap"}, + )) + + for segment in _tokenize_visible_text(rendered, path=segment_path): + segments.append(segment) + if segment.visible: + visible_parts.append(segment.text) + + if continuation_target is None: + segments.append(VisibleSegment( + kind="item_boundary", + text="", + visible=False, + structural=True, + meta={"path": entry.path}, + )) + marker_kind = "ol" if entry.marker.endswith('.') else "ul" + fingerprint_items.append((entry.path, marker_kind)) + + return VisibleContentModel( + segments=segments, + visible_text=''.join(visible_parts), + structural_fingerprint=(root_kind, ol_start, tuple(fingerprint_items)), + ) + + +def extract_list_model_from_xhtml(fragment: str) -> VisibleContentModel: + """Build a lossless visible-content model from XHTML list content.""" + soup = BeautifulSoup(fragment, "html.parser") + root = soup.find(["ul", "ol"]) + if root is None: + return VisibleContentModel([], "", ("", None, (), ())) + + # Use the same DOM text basis that patch_xhtml validates against. + # Only trim non-visible trailing whitespace from whitespace-only tail blocks. + visible_text = get_text_with_emoticons(root).rstrip() + segments: List[VisibleSegment] = [] + for segment in _tokenize_visible_text(visible_text): + segments.append(segment) + + item_paths, anchor_paths = _collect_xhtml_list_structure(root) + for anchor_path in anchor_paths: + segments.append(VisibleSegment( + kind="anchor", + text="", + visible=False, + structural=True, + meta={"path": anchor_path}, + )) + + start = int(root.get("start", "1")) if root.name == "ol" and root.get("start") else 1 + return VisibleContentModel( + segments=segments, + visible_text=visible_text, + structural_fingerprint=(root.name, start if root.name == "ol" else None, + tuple(item_paths), tuple(anchor_paths)), + ) + + +def model_has_anchor_segments(model: VisibleContentModel) -> bool: + return any(segment.kind == "anchor" for segment in model.segments) + + +def _parse_mdx_list_entries(content: str) -> List[_MdxListEntry]: + entries: List[_MdxListEntry] = [] + stack: List[Tuple[int, Tuple[int, ...]]] = [] + current: dict[str, Any] | None = None + + for raw_line in content.split('\n'): + match = _MDX_LIST_ITEM_RE.match(raw_line) + if match is None: + if current is not None: + current["continuation_lines"].append(raw_line) + continue + + if current is not None: + entries.append(_MdxListEntry( + path=current["path"], + marker=current["marker"], + marker_ws=current["marker_ws"], + body=current["body"], + continuation_lines=tuple(current["continuation_lines"]), + )) + + indent = len(match.group(1)) + marker = match.group(2) + marker_ws = match.group(3) + body = match.group(4) + + while stack and indent < stack[-1][0]: + stack.pop() + + if stack and indent == stack[-1][0]: + parent_path = stack[-2][1] if len(stack) >= 2 else () + index = stack[-1][1][-1] + 1 + stack.pop() + elif stack and indent > stack[-1][0]: + parent_path = stack[-1][1] + index = 0 + else: + parent_path = () + index = 0 + + path = parent_path + (index,) + current = { + "path": path, + "marker": marker, + "marker_ws": marker_ws, + "body": body, + "continuation_lines": [], + } + stack.append((indent, path)) + + if current is not None: + entries.append(_MdxListEntry( + path=current["path"], + marker=current["marker"], + marker_ws=current["marker_ws"], + body=current["body"], + continuation_lines=tuple(current["continuation_lines"]), + )) + + return entries + + +def _render_mdx_list_entry(entry: _MdxListEntry) -> str: + pieces: List[str] = [] + if entry.body: + rendered = _render_mdx_line(entry.body, preserve_leading=True) + if rendered: + pieces.append(rendered) + + for line in entry.continuation_lines: + rendered = _render_mdx_line(line.lstrip(), preserve_leading=False) + if rendered: + pieces.append(rendered) + + return _join_rendered_pieces(pieces) + + +def _render_mdx_line(line: str, *, preserve_leading: bool) -> str: + if not line: + return "" + + s = line if preserve_leading else line.lstrip() + stripped = s.strip() + if not stripped: + return "" + if stripped.startswith(("(.*?)', + lambda m: m.group(2) + m.group(1).capitalize(), + s, + ) + # Terminal
    before a non-visible continuation (for example a figure) + # does not contribute visible text and should not leave a trailing space. + s = re.sub(r'\s*\s*$', '', s) + s = re.sub(r'<[^>]+/?>', '', s) + return html_module.unescape(s) + + +def _join_rendered_pieces(pieces: Iterable[str]) -> str: + result = "" + for piece in pieces: + if not piece: + continue + if not result: + result = piece + continue + joiner = "" + if not result.endswith((' ', '\t')) and not piece.startswith((' ', '\t')): + joiner = " " + result = result + joiner + piece + return result + + +def _find_continuation_target_path( + entries: List[_MdxListEntry], + index: int, +) -> Tuple[int, ...] | None: + entry = entries[index] + if entry.body.strip() or not entry.continuation_lines or index == 0: + return None + + previous = entries[index - 1] + if previous.path[:-1] != entry.path[:-1]: + return None + if previous.path[-1] + 1 != entry.path[-1]: + return None + + if all( + not _render_mdx_line(line.lstrip(), preserve_leading=False) + for line in entry.continuation_lines + ): + return previous.path + + if _is_figure_only_continuation_lines(entry.continuation_lines): + return previous.path + + return None + + +def _is_figure_only_continuation_lines(lines: Tuple[str, ...]) -> bool: + in_figure = False + in_figcaption = False + saw_figure = False + + for line in lines: + stripped = line.strip() + if not stripped: + continue + if stripped.startswith("" and in_figure: + in_figcaption = True + continue + if stripped == "" and in_figcaption: + in_figcaption = False + continue + if stripped == "" and in_figure and not in_figcaption: + in_figure = False + continue + if in_figcaption: + continue + return False + + return saw_figure and not in_figure and not in_figcaption + + +def _tokenize_visible_text(text: str, *, path: Tuple[int, ...] | None = None) -> List[VisibleSegment]: + segments: List[VisibleSegment] = [] + if not text: + return segments + + for match in re.finditer(r'\s+|[^\s]+', text): + token = match.group(0) + segments.append(VisibleSegment( + kind="ws" if token.isspace() else "text", + text=token, + visible=True, + structural=False, + meta={"path": path} if path is not None else {}, + )) + return segments + + +def _detect_root_list_kind(entries: List[_MdxListEntry]) -> str: + if not entries: + return "ul" + return "ol" if entries[0].marker.endswith('.') else "ul" + + +def _detect_ordered_start(entries: List[_MdxListEntry]) -> int | None: + if not entries: + return None + first = entries[0].marker + if not first.endswith('.'): + return None + try: + return int(first[:-1]) + except ValueError: + return None + + +def _collect_xhtml_list_structure(root: Tag) -> Tuple[List[Tuple[int, ...]], List[Tuple[int, ...]]]: + item_paths: List[Tuple[int, ...]] = [] + anchor_paths: List[Tuple[int, ...]] = [] + + def walk_list(list_tag: Tag, parent_path: Tuple[int, ...]) -> None: + items = [child for child in list_tag.children if isinstance(child, Tag) and child.name == 'li'] + for index, li in enumerate(items): + path = parent_path + (index,) + item_paths.append(path) + if li.find(['ac:link', 'ac:image']) is not None: + anchor_paths.append(path) + nested_lists = [ + child for child in li.children + if isinstance(child, Tag) and child.name in ('ul', 'ol') + ] + for nested in nested_lists: + walk_list(nested, path) + + walk_list(root, ()) + return item_paths, anchor_paths diff --git a/confluence-mdx/tests/test_reverse_sync_patch_builder.py b/confluence-mdx/tests/test_reverse_sync_patch_builder.py index ef32f8ac6..ef0efa91c 100644 --- a/confluence-mdx/tests/test_reverse_sync_patch_builder.py +++ b/confluence-mdx/tests/test_reverse_sync_patch_builder.py @@ -3,14 +3,25 @@ build_patches 분기 경로 + helper 함수 (is_markdown_table) 테스트. """ +from pathlib import Path +from bs4 import BeautifulSoup + from reverse_sync.block_diff import BlockChange +from reverse_sync.block_diff import diff_blocks from reverse_sync.mapping_recorder import BlockMapping +from reverse_sync.mapping_recorder import record_mapping +from mdx_to_storage.parser import parse_mdx_blocks from reverse_sync.mdx_block_parser import MdxBlock from reverse_sync.sidecar import ( DocumentEnvelope, RoundtripSidecar, SidecarBlock, SidecarEntry, + build_sidecar, + build_mdx_to_sidecar_index, + build_xpath_to_mapping, + generate_sidecar_mapping, + load_sidecar_mapping, sha256_text, ) from text_utils import normalize_mdx_to_plain @@ -21,13 +32,14 @@ _extract_inline_markers, _find_roundtrip_sidecar_block, _has_inline_boundary_change, - _normalize_list_for_content_compare, _resolve_mapping_for_change, build_patches, is_markdown_table, ) from reverse_sync.xhtml_patcher import _apply_inline_fixups, patch_xhtml +_REVERSE_SYNC_FIXTURE_ROOT = Path(__file__).parent / "reverse-sync" + # ── 헬퍼 팩토리 ── @@ -2967,45 +2979,160 @@ def test_image_anchor_list_keeps_collapsed_text_diff(self): ) assert patches[0]['new_plain_text'] == '목록 좌측 상단에서 Delete 버튼을 클릭합니다.' + def test_single_item_link_trailing_space_change_generates_patch(self): + xhtml = ( + '
    • ' + '' + 'Okta 연동하기 ' + '

    ' + ) + old_content = '* [Okta 연동하기 ](general/okta)\n' + new_content = '* [Okta 연동하기](general/okta)\n' + change = _make_change(0, old_content, new_content, type_='list') + mapping = BlockMapping( + block_id='list-anchor-trailing-1', + type='list', + xhtml_xpath='ul[1]', + xhtml_text=xhtml, + xhtml_plain_text='Okta 연동하기', + xhtml_element_index=0, + children=[], + ) + roundtrip_sidecar = _make_roundtrip_sidecar([ + SidecarBlock(0, 'ul[1]', xhtml, sha256_text(old_content), (1, 1)) + ]) + + patches, _, skipped = build_patches( + [change], [change.old_block], [change.new_block], + mappings=[mapping], + roundtrip_sidecar=roundtrip_sidecar, + ) + + assert len(patches) == 1, ( + f"단일 preserved anchor 리스트의 trailing space 제거도 패치를 생성해야 합니다. " + f"skipped={skipped}" + ) + assert patches[0]['new_plain_text'] == 'Okta 연동하기' + + def test_marker_space_only_change_on_preserved_anchor_is_noop(self): + xhtml = ( + '
    • ' + '' + '링크' + '

    ' + ) + old_content = '* [링크](url)\n' + new_content = '* [링크](url)\n' + change = _make_change(0, old_content, new_content, type_='list') + mapping = BlockMapping( + block_id='list-anchor-marker-noop-1', + type='list', + xhtml_xpath='ul[1]', + xhtml_text=xhtml, + xhtml_plain_text='링크', + xhtml_element_index=0, + children=[], + ) + roundtrip_sidecar = _make_roundtrip_sidecar([ + SidecarBlock(0, 'ul[1]', xhtml, sha256_text(old_content), (1, 1)) + ]) -# ── _normalize_list_for_content_compare 마커 공백 보존 테스트 ── + patches, _, skipped = build_patches( + [change], [change.old_block], [change.new_block], + mappings=[mapping], + roundtrip_sidecar=roundtrip_sidecar, + ) + assert patches == [], ( + f"marker 뒤 공백만 바뀐 preserved anchor 리스트는 no-op 이어야 합니다. " + f"patches={patches}, skipped={skipped}" + ) -class TestNormalizeListMarkerWhitespace: - """_normalize_list_for_content_compare: 마커 뒤 공백 차이를 보존하여 변경 감지.""" - def test_marker_ws_difference_detected(self): - """마커 뒤 공백 수가 다르면 정규화 결과가 다르다.""" - old = _normalize_list_for_content_compare("* 항목") - new = _normalize_list_for_content_compare("* 항목") - assert old != new +class TestCleanListWhitespaceNoop: + def test_marker_space_only_change_on_clean_list_is_noop(self): + xhtml = '
    • 항목

    ' + change = _make_change(0, '* 항목\n', '* 항목\n', type_='list') + mapping = BlockMapping( + block_id='list-clean-marker-noop-1', + type='list', + xhtml_xpath='ul[1]', + xhtml_text=xhtml, + xhtml_plain_text='항목', + xhtml_element_index=0, + children=[], + ) - def test_same_content_same_result(self): - """마커 공백이 같으면 정규화 결과도 같다.""" - old = _normalize_list_for_content_compare("* 항목") - new = _normalize_list_for_content_compare("* 항목") - assert old == new + patches, _, skipped = build_patches( + [change], [change.old_block], [change.new_block], + mappings=[mapping], + ) - def test_text_only_change_detected(self): - """텍스트만 변경되어도 감지한다.""" - old = _normalize_list_for_content_compare("* 원래") - new = _normalize_list_for_content_compare("* 새것") - assert old != new + assert patches == [], ( + f"marker 뒤 공백만 바뀐 clean list는 no-op 이어야 합니다. " + f"patches={patches}, skipped={skipped}" + ) - def test_numbered_list_marker_ws(self): - """번호 리스트 마커 뒤 공백 차이.""" - old = _normalize_list_for_content_compare("7. 생성이") - new = _normalize_list_for_content_compare("7. 생성이") - assert old != new + def test_fixture_preserved_anchor_list_applies_text_changes_across_image_boundary( + self, tmp_path, + ): + case_dir = _REVERSE_SYNC_FIXTURE_ROOT / "544243925" + xhtml = (case_dir / "page.xhtml").read_text(encoding="utf-8") + original_mdx = (case_dir / "original.mdx").read_text(encoding="utf-8") + improved_mdx = (case_dir / "improved.mdx").read_text(encoding="utf-8") + original_blocks = list(parse_mdx_blocks(original_mdx)) + improved_blocks = list(parse_mdx_blocks(improved_mdx)) + changes, alignment = diff_blocks(original_blocks, improved_blocks) + mappings = record_mapping(xhtml) + sidecar_yaml = generate_sidecar_mapping(xhtml, original_mdx, "544243925") + mapping_path = tmp_path / "544243925.mapping.yaml" + mapping_path.write_text(sidecar_yaml, encoding="utf-8") + sidecar_entries = load_sidecar_mapping(str(mapping_path)) + + patches, _, _ = build_patches( + changes, + original_blocks, + improved_blocks, + mappings=mappings, + mdx_to_sidecar=build_mdx_to_sidecar_index(sidecar_entries), + xpath_to_mapping=build_xpath_to_mapping(mappings), + alignment=alignment, + ) - def test_nested_list_marker_ws(self): - """중첩 리스트에서 하위 항목 마커 공백 차이.""" - old = _normalize_list_for_content_compare("1. 상위\n * 하위") - new = _normalize_list_for_content_compare("1. 상위\n * 하위") - assert old != new + patched = patch_xhtml(xhtml, patches) + patched_plain = BeautifulSoup(patched, "html.parser").get_text() + assert "알림 바에서" in patched_plain + assert "users:read.email을 추가한 뒤 저장합니다" in patched_plain + assert "Bot User OAuth Token을 복사하고" in patched_plain + + def test_fixture_preserved_anchor_list_merges_figure_only_pseudo_item( + self, tmp_path, + ): + case_dir = _REVERSE_SYNC_FIXTURE_ROOT / "798064641" + xhtml = (case_dir / "page.xhtml").read_text(encoding="utf-8") + original_mdx = (case_dir / "original.mdx").read_text(encoding="utf-8") + improved_mdx = (case_dir / "improved.mdx").read_text(encoding="utf-8") + original_blocks = list(parse_mdx_blocks(original_mdx)) + improved_blocks = list(parse_mdx_blocks(improved_mdx)) + changes, alignment = diff_blocks(original_blocks, improved_blocks) + mappings = record_mapping(xhtml) + sidecar_yaml = generate_sidecar_mapping(xhtml, original_mdx, "798064641") + mapping_path = tmp_path / "798064641.mapping.yaml" + mapping_path.write_text(sidecar_yaml, encoding="utf-8") + sidecar_entries = load_sidecar_mapping(str(mapping_path)) + roundtrip_sidecar = build_sidecar(xhtml, original_mdx, page_id="798064641") + + patches, _, _ = build_patches( + changes, + original_blocks, + improved_blocks, + mappings=mappings, + mdx_to_sidecar=build_mdx_to_sidecar_index(sidecar_entries), + xpath_to_mapping=build_xpath_to_mapping(mappings), + alignment=alignment, + roundtrip_sidecar=roundtrip_sidecar, + ) - def test_text_and_marker_ws_change(self): - """텍스트와 마커 공백이 동시에 변경.""" - old = _normalize_list_for_content_compare("* 원래 텍스트") - new = _normalize_list_for_content_compare("* 새 텍스트") - assert old != new + target = next(p for p in patches if p.get("xhtml_xpath") == "ol[1]") + assert target.get("action") == "replace_fragment" + assert "
  • \n" + "
    \n" + " \"img\"\n" + "
    \n" + "5. 확인 창이 나타나면 삭제하여 설정을 제거합니다.\n" + ) + + assert "클릭합니다 확인" not in model.visible_text + assert "클릭합니다확인" in model.visible_text + + def test_canonicalizes_figure_only_pseudo_item_into_previous_item(self): + old = extract_list_model_from_mdx( + "4. SMTP 설정을 생성합니다.\n" + " 11. **Test 버튼** : SMTP 설정이 접속에 문제 없는지 확인합니다.
    \n" + " 12.\n" + "
    \n" + " \"SMTP\n" + "
    \n" + " SMTP 설정 팝업 다이얼로그\n" + "
    \n" + "
    \n" + "5. `OK` 버튼을 누르고 설정을 저장합니다.\n" + ) + new = extract_list_model_from_mdx( + "4. SMTP 설정을 생성합니다.\n" + " 11. **Test 버튼** : SMTP 설정이 접속에 문제 없는지 확인합니다.
    \n" + "
    \n" + " \"SMTP\n" + "
    \n" + " SMTP 설정 팝업 다이얼로그\n" + "
    \n" + "
    \n" + "5. `OK` 버튼을 누르고 설정을 저장합니다.\n" + ) + + assert old.visible_text == new.visible_text + assert old.structural_fingerprint == new.structural_fingerprint + + +class TestExtractListModelFromXhtml: + def test_preserves_dom_whitespace_and_tracks_structure(self): + model = extract_list_model_from_xhtml( + "" + ) + + assert model.visible_text == "앞 링크 뒤" + assert any(segment.kind == "anchor" for segment in model.segments) + assert model.structural_fingerprint[0] == "ul" + + def test_includes_image_caption_text_in_visible_text(self): + model = extract_list_model_from_xhtml( + "" + ) + + assert model.visible_text == "항목캡션 텍스트" + + def test_ignores_whitespace_only_paragraph_after_image(self): + model = extract_list_model_from_xhtml( + "
    1. 목록 좌측 상단에서 Delete버튼을 클릭합니다

      " + "" + "" + "

    " + ) + + assert model.visible_text == "목록 좌측 상단에서 Delete버튼을 클릭합니다" + + def test_preserves_separator_space_from_whitespace_only_paragraph_between_items(self): + model = extract_list_model_from_xhtml( + "
    1. before old

      " + "" + "" + "

      Cap

      " + "

    2. next

    " + ) + + assert model.visible_text == "before oldCap next" + + +class TestEdgeCases: + def test_empty_string_returns_empty_model(self): + model = extract_list_model_from_mdx("") + assert model.visible_text == "" + assert model.segments == [] + assert model.structural_fingerprint == ("ul", None, ()) + + def test_single_empty_item(self): + model = extract_list_model_from_mdx("* \n") + assert model.visible_text == "" + assert model.structural_fingerprint[0] == "ul" + + def test_deeply_nested_list_preserves_structure(self): + model = extract_list_model_from_mdx( + "* L1\n" + " * L2\n" + " * L3\n" + ) + assert model.visible_text == "L1L2L3" + fp_items = model.structural_fingerprint[2] + assert len(fp_items) == 3 + assert fp_items[0][0] == (0,) + assert fp_items[1][0] == (0, 0) + assert fp_items[2][0] == (0, 0, 0) + + def test_ordered_list_start_reflected_in_fingerprint(self): + model_1 = extract_list_model_from_mdx("1. A\n2. B\n") + model_3 = extract_list_model_from_mdx("3. A\n4. B\n") + assert model_1.visible_text == model_3.visible_text + assert model_1.structural_fingerprint[1] == 1 + assert model_3.structural_fingerprint[1] == 3 + assert model_1.structural_fingerprint != model_3.structural_fingerprint + + def test_empty_xhtml_returns_empty_model(self): + model = extract_list_model_from_xhtml("") + assert model.visible_text == "" + assert model.segments == [] diff --git a/docs/superpowers/plans/2026-04-10-list-visible-segments-phase1.md b/docs/superpowers/plans/2026-04-10-list-visible-segments-phase1.md new file mode 100644 index 000000000..1e9051f0b --- /dev/null +++ b/docs/superpowers/plans/2026-04-10-list-visible-segments-phase1.md @@ -0,0 +1,164 @@ +# Reverse Sync Visible Segments Phase 1 Implementation Plan + +> **For agentic workers:** REQUIRED: Use superpowers:subagent-driven-development (if subagents available) or superpowers:executing-plans to implement this plan. Steps use checkbox (`- [ ]`) syntax for tracking. + +**Goal:** Replace lossy list-only normalization with a lossless visible-segment abstraction and migrate the reverse-sync list strategy to use it first. + +**Architecture:** Add a new `visible_segments` module that extracts lossless visible/structural segment models from MDX lists and XHTML lists. Update the list branch in `patch_builder.py` to base patch decisions on `visible_text` and structural fingerprints instead of list-specific whitespace helpers. + +**Tech Stack:** Python, pytest, BeautifulSoup, existing reverse-sync patch builder utilities + +--- + +## Chunk 1: Spec-Backed Test Scaffolding + +### Task 1: Add failing tests for visible-segment extraction and list decisions + +**Files:** +- Modify: `confluence-mdx/tests/test_reverse_sync_patch_builder.py` +- Create: `confluence-mdx/tests/test_reverse_sync_visible_segments.py` +- Reference: `docs/superpowers/specs/2026-04-10-list-visible-segments-phase1-design.md` + +- [ ] **Step 1: Write failing extraction tests** + +Add tests covering: + +- MDX list extraction preserves marker-post whitespace +- MDX list extraction preserves link trailing space at item edges +- XHTML list extraction preserves DOM whitespace and list structure + +- [ ] **Step 2: Run extraction tests to verify they fail** + +Run: `pytest -q tests/test_reverse_sync_visible_segments.py` +Expected: FAIL because the new module/functions do not exist yet + +- [ ] **Step 3: Write failing list-strategy regression tests** + +Add or update tests covering: + +- clean list trailing-space change must emit a patch +- preserved-anchor no-op marker-space change must emit no patch +- continuation-line reflow remains no-op + +- [ ] **Step 4: Run targeted builder tests to verify they fail** + +Run: `pytest -q tests/test_reverse_sync_patch_builder.py -k 'visible_segments or trailing_space or marker_space or continuation_line_reflow'` +Expected: FAIL with current list-path behavior + +## Chunk 2: Visible Segment Module + +### Task 2: Implement the new visible-segment extraction module + +**Files:** +- Create: `confluence-mdx/bin/reverse_sync/visible_segments.py` +- Reference: `confluence-mdx/bin/reverse_sync/xhtml_normalizer.py` +- Test: `confluence-mdx/tests/test_reverse_sync_visible_segments.py` + +- [ ] **Step 1: Add the minimal dataclasses and extraction entry points** + +Implement: + +- `VisibleSegment` +- `VisibleContentModel` +- `extract_list_model_from_mdx(content: str)` +- `extract_list_model_from_xhtml(fragment: str)` + +- [ ] **Step 2: Run extraction tests** + +Run: `pytest -q tests/test_reverse_sync_visible_segments.py` +Expected: still failing on incomplete extraction behavior + +- [ ] **Step 3: Implement minimal lossless segment extraction** + +Support: + +- marker tokenization +- whitespace tokenization +- item boundaries +- structural fingerprint for ordered start and item path +- XHTML text-node extraction with preserved anchors skipped from visible text but tracked structurally + +- [ ] **Step 4: Re-run extraction tests** + +Run: `pytest -q tests/test_reverse_sync_visible_segments.py` +Expected: PASS + +## Chunk 3: Patch Builder Migration + +### Task 3: Replace list-only normalization logic with visible models + +**Files:** +- Modify: `confluence-mdx/bin/reverse_sync/patch_builder.py` +- Test: `confluence-mdx/tests/test_reverse_sync_patch_builder.py` +- Reference: `confluence-mdx/bin/reverse_sync/reconstructors.py` + +- [ ] **Step 1: Wire list strategy to visible models** + +Replace the current list-path use of: + +- `_normalize_list_for_content_compare()` +- `_has_marker_ws_change()` +- `_detect_list_item_space_change()` + +with: + +- `extract_list_model_from_mdx()` +- `extract_list_model_from_xhtml()` as needed +- `visible_text` and structural fingerprint comparisons + +- [ ] **Step 2: Run targeted list tests** + +Run: `pytest -q tests/test_reverse_sync_patch_builder.py -k 'ListWhitespace or MarkerWhitespace or CalloutChildListSpaceChange or BuildPatchesIdempotency or visible_segments'` +Expected: FAIL until all list decisions are updated + +- [ ] **Step 3: Implement minimal green-path migration** + +Behavior: + +- visible-only list diffs stay in text/template transfer path +- structural list diffs continue through existing rebuild/merge path +- no-op visible diffs emit no patch + +- [ ] **Step 4: Re-run targeted list tests** + +Run: `pytest -q tests/test_reverse_sync_patch_builder.py -k 'ListWhitespace or MarkerWhitespace or CalloutChildListSpaceChange or BuildPatchesIdempotency or visible_segments'` +Expected: PASS + +## Chunk 4: Broader Verification and Cleanup + +### Task 4: Remove obsolete helper coverage and verify reverse-sync behavior + +**Files:** +- Modify: `confluence-mdx/bin/reverse_sync/patch_builder.py` +- Modify: `confluence-mdx/tests/test_reverse_sync_patch_builder.py` +- Modify: `confluence-mdx/tests/test_reverse_sync_visible_segments.py` + +- [ ] **Step 1: Remove obsolete helper tests or rewrite them to behavior-based tests** + +Focus test names on: + +- visible diff applies +- structural diff rebuilds +- no-op emits no patch + +- [ ] **Step 2: Run the focused reverse-sync suites** + +Run: `pytest -q tests/test_reverse_sync_visible_segments.py tests/test_reverse_sync_patch_builder.py tests/test_reverse_sync_xhtml_patcher.py tests/test_reverse_sync_xhtml_normalizer.py` +Expected: PASS + +- [ ] **Step 3: Run a broader reverse-sync safety sweep** + +Run: `pytest -q tests/test_reverse_sync_patch_builder.py tests/test_reverse_sync_mapping_recorder.py tests/test_reverse_sync_sidecar.py tests/test_reverse_sync_cli.py` +Expected: PASS + +- [ ] **Step 4: Commit** + +```bash +git add docs/superpowers/specs/2026-04-10-list-visible-segments-phase1-design.md \ + docs/superpowers/plans/2026-04-10-list-visible-segments-phase1.md \ + confluence-mdx/bin/reverse_sync/visible_segments.py \ + confluence-mdx/bin/reverse_sync/patch_builder.py \ + confluence-mdx/tests/test_reverse_sync_visible_segments.py \ + confluence-mdx/tests/test_reverse_sync_patch_builder.py +git commit -m "refactor: add visible segment model for reverse sync lists" +``` diff --git a/docs/superpowers/specs/2026-04-10-list-visible-segments-phase1-design.md b/docs/superpowers/specs/2026-04-10-list-visible-segments-phase1-design.md new file mode 100644 index 000000000..c2d9ddd46 --- /dev/null +++ b/docs/superpowers/specs/2026-04-10-list-visible-segments-phase1-design.md @@ -0,0 +1,146 @@ +# Reverse Sync Visible Segments Phase 1 Design + +**Goal** + +Replace lossy list-content normalization in reverse sync with a lossless visible-segment model so visible whitespace edits are handled the same way as character edits and can be reflected back into Confluence XHTML. + +**Problem** + +The current list path in `confluence-mdx/bin/reverse_sync/patch_builder.py` depends on `normalize_mdx_to_plain()` and list-specific helpers such as `_normalize_list_for_content_compare()`, `_has_marker_ws_change()`, and `_detect_list_item_space_change()`. Those helpers exist because the plain-text normalization step drops visible information before patch strategy is selected. Once marker whitespace, edge whitespace, or link-boundary whitespace is removed, the downstream code can no longer distinguish: + +- no-op continuation-line reflow +- real visible whitespace edits +- structural list edits that require reconstruction + +That produces both missed edits and no-op patches. + +**Non-Goals** + +- Do not replace the paragraph/table/direct paths in this phase. +- Do not redesign the XHTML patcher in this phase. +- Do not eliminate `normalize_mdx_to_plain()` globally in this phase. + +**Design Summary** + +Introduce a new lossless abstraction for reverse sync list handling: + +- `VisibleSegment`: a token-level representation of visible content and structural markers +- `VisibleContentModel`: a block-level container of ordered segments, `visible_text`, and structural fingerprint + +The model is extracted directly from MDX list content and XHTML list fragments without passing through lossy plain-text normalization. The list strategy then computes one diff model from the extracted visible text and uses one structural fingerprint comparison for rebuild decisions. + +In this phase: + +- clean list visible-only edits should be handled by text transfer when safe +- preserved-anchor list visible-only edits should be handled by template-based text transfer +- structural list changes should still use existing rebuild/merge paths + +**Core Data Model** + +```python +@dataclass(frozen=True) +class VisibleSegment: + kind: Literal["text", "ws", "anchor", "list_marker", "item_boundary"] + text: str + visible: bool + structural: bool + meta: dict[str, Any] + + +@dataclass(frozen=True) +class VisibleContentModel: + segments: list[VisibleSegment] + visible_text: str + structural_fingerprint: tuple[Any, ...] +``` + +Design rules: + +- Visible whitespace is represented explicitly as `ws` segments, not inferred later. +- `visible_text` is the lossless concatenation of visible segments. +- Marker text and item boundaries may be structural even when not applied as XHTML text. +- The extractor is not allowed to erase or trim visible whitespace. + +**List Extraction** + +MDX list extraction: + +- Tokenize each list item into `list_marker`, post-marker `ws`, body `text/ws`, and `item_boundary`. +- Track ordered-list start value and nested item path in `meta`. +- Preserve link label whitespace in visible text. +- Canonicalize continuation-line reflow only when the rendered visible result is equivalent. + +XHTML list extraction: + +- Walk `