1+ import type { Token , Tokens } from "marked" ;
12import { lexer } from "marked" ;
23import { useMemo , useRef } from "react" ;
34
@@ -14,6 +15,17 @@ import { useMemo, useRef } from "react";
1415 * flag (close-time-gated work, e.g. a long-block highlight fuse, can
1516 * hang off it).
1617 *
18+ * LIST HOIST: fenced code NESTED in list items (incl. task items and
19+ * nested lists) is also broken out — enriched repeats the list marker on
20+ * every code line (software-mansion-labs/react-native-enriched-markdown
21+ * #243, open). The list's raw is split at fence boundaries; ordered-list
22+ * numbering survives because the continuation run still carries the
23+ * literal "2."/"3." markers and GFM honors a list's first number. Known
24+ * tradeoffs: hoisted code renders full-width (list indent lost), and any
25+ * post-code content of the same item becomes a plain paragraph.
26+ * Blockquotes are deliberately NOT hoisted — #243 is list-marker-specific
27+ * and a hoist would visually break the quote bar around the code.
28+ *
1729 * Stability model: segments are keyed POSITIONALLY (`i-kind`). Children
1830 * memo on `raw`, so the prefix-diff guarantee we rely on is "unchanged
1931 * raw ⇒ no re-render", not "unchanged key". Lists/setext headings can
@@ -45,12 +57,109 @@ export interface ChunkStats {
4557 * LAST segment can ever be open. */
function isFencedCodeClosed(raw: string): boolean {
  // Trailing blank lines / spaces never decide closedness.
  const text = raw.trimEnd();

  // An opening fence may be indented by up to three spaces; anything else
  // reaching here is indented code, which has no fence to wait for.
  const opener = /^ {0,3}(`{3,}|~{3,})/.exec(text);
  if (opener === null) return true; // indented code
  const openFence = opener[1] ?? "";

  const lastBreak = text.lastIndexOf("\n");
  if (lastBreak < 0) return false; // just the opening fence line

  // The final line must consist of a fence only. `text` is already
  // right-trimmed, so no trailing-whitespace allowance is needed here.
  const closer = /^[ \t]*(`{3,}|~{3,})$/.exec(text.slice(lastBreak + 1));
  if (closer === null) return false;
  const closeFence = closer[1] ?? "";

  // A closing fence uses the opener's character and is at least as long:
  // "```" cannot close "````", and "~~~" cannot close "```".
  return closeFence[0] === openFence[0] && closeFence.length >= openFence.length;
}
5364
/**
 * Reports whether a fenced code block sits anywhere in a list's item tree.
 *
 * Nested lists are searched recursively. Blockquotes are intentionally not
 * descended into (see the LIST HOIST note in the file header). Indented
 * (non-fenced) code inside an item does not count: only `code` tokens whose
 * raw text, ignoring leading whitespace, begins with three or more backticks
 * or tildes are considered.
 *
 * @param list - the list token to inspect
 * @returns `true` as soon as any item, at any nesting depth, holds a fence
 */
function listHasNestedFence(list: Tokens.List): boolean {
  const startsWithFence = /^(`{3,}|~{3,})/;
  return list.items.some((item) =>
    (item.tokens as Token[]).some((child) => {
      if (child.type === "code") {
        return startsWithFence.test(child.raw.trimStart());
      }
      return child.type === "list" && listHasNestedFence(child as Tokens.List);
    }),
  );
}
82+
/** One contiguous slice of a list token's raw text. */
type ListPiece =
  | {
      /** Ordinary list markdown that stays in the surrounding run. */
      kind: "run";
      /** The slice's exact source text. */
      raw: string;
    }
  | {
      /** A fenced code block lifted out of the list. */
      kind: "code";
      /** The slice's exact source text, fence lines included. */
      raw: string;
      /** The trimmed info string from the opening fence line. */
      lang: string;
      /** The dedented code content, without its final newline. */
      code: string;
      /** Whether a matching closing fence line was found. */
      closed: boolean;
    };
86+
/**
 * A fence-open line inside list raw. Capture groups:
 *   1. the prefix — indentation, optionally followed by the list marker
 *      itself ("1. ```ts", i.e. the fence is the item's first block). Its
 *      length is what GFM strips from the content lines (continuation
 *      indent = marker width);
 *   2. the fence (three or more backticks, or three or more tildes);
 *   3. the info string (rest of the line).
 *
 * For backtick fences the info string may not contain a backtick: a line
 * such as "- ```x``` inline" is an inline code span, not a fence opener.
 * This is enforced with a lookahead so the group numbering is unchanged
 * (lookahead only — no lookbehind). Tilde fences carry no such restriction.
 *
 * Must be applied to a single line WITHOUT its trailing newline.
 */
const LIST_FENCE_OPEN =
  /^([ \t]*(?:(?:[-*+]|\d{1,9}[.)])[ \t]+)?)(`{3,}(?=[^`\n]*$)|~{3,})([^\n]*)$/;

/** A line holding nothing but a fence (group 1), with optional surrounding
 * spaces or tabs. Whether it actually closes a given opener — same
 * character, at least as long — is for the caller to check. */
const FENCE_CLOSE = /^[ \t]*(`{3,}|~{3,})[ \t]*$/;
94+
/**
 * Removes at most `max` leading space characters from `line`.
 *
 * Only literal spaces count; a tab or any other character stops the strip.
 *
 * @param line - the text to strip (may include its trailing newline)
 * @param max - upper bound on the number of spaces removed
 * @returns `line` without the stripped prefix
 */
function dedent(line: string, max: number): string {
  let cut = 0;
  for (; cut < max; cut += 1) {
    if (line.charAt(cut) !== " ") break;
  }
  return line.substring(cut);
}
100+
/**
 * Cuts a list token's raw text into alternating "run" and "code" pieces at
 * fence boundaries.
 *
 * This is text surgery on the ORIGINAL raw: every input line lands in
 * exactly one piece, so concatenating the pieces' `raw` reproduces the input
 * and run pieces keep their literal list markers. The scanner is
 * deliberately conservative — a fence shape it does not recognize (e.g.
 * inside a nested blockquote, where the line starts with ">") is left in the
 * run, so the worst case is the pre-hoist behavior rather than corruption.
 *
 * A fence with no matching closing line runs to the end of the input and is
 * reported with `closed: false`.
 *
 * @param raw - the list token's raw markdown
 * @returns the pieces, in source order
 */
function splitListRaw(raw: string): ListPiece[] {
  const withoutNewline = (text: string): string => text.replace(/\n$/, "");

  // Lines WITH their trailing newline (Hermes-safe; no lookbehind).
  const lines = raw.match(/[^\n]*\n|[^\n]+/g) ?? [];
  const result: ListPiece[] = [];
  let pendingRun = "";
  let cursor = 0;

  while (cursor < lines.length) {
    const opener = lines[cursor] ?? "";
    cursor += 1;

    const match = LIST_FENCE_OPEN.exec(withoutNewline(opener));
    if (match === null) {
      // Ordinary list text: keep accumulating the current run.
      pendingRun += opener;
      continue;
    }
    const [, prefix = "", fence = "", info = ""] = match;

    // Consume lines up to and including the closing fence, or to the end of
    // the input when no closing fence exists.
    let blockRaw = opener;
    let body = "";
    let closed = false;
    for (; cursor < lines.length && !closed; cursor += 1) {
      const current = lines[cursor] ?? "";
      blockRaw += current;
      const closer = FENCE_CLOSE.exec(withoutNewline(current))?.[1] ?? "";
      // Same fence character, and at least as long as the opener.
      closed =
        closer !== "" &&
        closer[0] === fence[0] &&
        closer.length >= fence.length;
      if (!closed) {
        // Content line: strip the continuation indent the opener carried.
        body += dedent(current, prefix.length);
      }
    }

    // Whatever preceded the fence becomes its own run piece first.
    if (pendingRun !== "") {
      result.push({ kind: "run", raw: pendingRun });
      pendingRun = "";
    }
    result.push({
      kind: "code",
      raw: blockRaw,
      lang: info.trim(),
      code: withoutNewline(body),
      closed,
    });
  }

  if (pendingRun !== "") {
    result.push({ kind: "run", raw: pendingRun });
  }
  return result;
}
162+
54163export function chunkMarkdown (
55164 markdown : string ,
56165 streamDone : boolean ,
@@ -68,28 +177,54 @@ export function chunkMarkdown(
68177 } ) ;
69178 runBuffer = "" ;
70179 } ;
180+ const pushCode = (
181+ raw : string ,
182+ lang : string ,
183+ code : string ,
184+ closed : boolean ,
185+ ) => {
186+ flushRun ( ) ;
187+ segments . push ( {
188+ kind : "code" ,
189+ key : `${ segments . length } -code` ,
190+ raw,
191+ lang,
192+ code,
193+ closed,
194+ } ) ;
195+ } ;
71196
72197 for ( const token of tokens ) {
73198 if ( token . type === "code" ) {
74- flushRun ( ) ;
75- segments . push ( {
76- kind : "code" ,
77- key : `${ segments . length } -code` ,
78- raw : token . raw ,
79- lang : typeof token . lang === "string" ? token . lang : "" ,
80- code : typeof token . text === "string" ? token . text : "" ,
81- closed : true , // non-last segments are closed by construction
82- } ) ;
199+ pushCode (
200+ token . raw ,
201+ typeof token . lang === "string" ? token . lang : "" ,
202+ typeof token . text === "string" ? token . text : "" ,
203+ isFencedCodeClosed ( token . raw ) ,
204+ ) ;
205+ } else if (
206+ token . type === "list" &&
207+ // `type` alone doesn't narrow past marked's Tokens.Generic.
208+ listHasNestedFence ( token as Tokens . List )
209+ ) {
210+ for ( const piece of splitListRaw ( token . raw ) ) {
211+ if ( piece . kind === "code" ) {
212+ pushCode ( piece . raw , piece . lang , piece . code , piece . closed ) ;
213+ } else {
214+ runBuffer += piece . raw ;
215+ }
216+ }
83217 } else {
84218 runBuffer += token . raw ;
85219 }
86220 }
87221 flushRun ( ) ;
88222
89- // Only the tail can be open; everything before it has tokens after it.
223+ // Only the tail can be open (unclosed fences swallow to EOF); when the
224+ // stream is done, an EOF-unterminated fence counts as closed.
90225 const last = segments [ segments . length - 1 ] ;
91- if ( last ?. kind === "code" ) {
92- last . closed = streamDone || isFencedCodeClosed ( last . raw ) ;
226+ if ( last ?. kind === "code" && streamDone ) {
227+ last . closed = true ;
93228 }
94229 return segments ;
95230}
0 commit comments