jsoizo · jsoizo · May 22, 2026 · May 22, 2026
diff --git a/src/commonTest/kotlin/com/jsoizo/kotlincsv/reader/internal/SequenceParserTest.kt b/src/commonTest/kotlin/com/jsoizo/kotlincsv/reader/internal/SequenceParserTest.kt
@@ -231,50 +231,95 @@ class SequenceParserTest {
 
     @Test
     fun chunked_emptyInput_yieldsEmptySequence() {
-        // First readInto returns 0 — early-return path.
         parseChunked("") shouldBe emptyList()
     }
 
     @Test
     fun chunked_smallBuffer_forcesChunkBoundaryAcrossRows() {
-        // bufferSize=4 forces the parser to swap chunks several times mid-row
-        // and to consult the cross-buffer lookahead path.
+        // Rows straddle 4-char chunks: one LF lands mid-chunk, the other on a
+        // chunk's last slot, so a regression in the double-buffer swap or the
+        // cross-buffer next-char lookahead would drop rows or merge them.
         val rows = parseChunked("a,bbb\nccc,d\nef,gh", bufferSize = 4)
         rows shouldBe listOf(listOf("a", "bbb"), listOf("ccc", "d"), listOf("ef", "gh"))
     }
 
     @Test
     fun chunked_bufferBoundaryLandsOnCrLf() {
-        // Tail CR is the last char of one chunk; LF starts the next. The CR
-        // branch must consult lookahead via the next-buffer slot to swallow LF.
+        // CR ends one chunk and LF starts the next. The CR branch only
+        // collapses CRLF into a single terminator if lookahead reaches into
+        // nextBuffer[0]; without that the LF would surface as an extra empty
+        // row.
         val rows = parseChunked("ab\r\ncd", bufferSize = 3)
         rows shouldBe listOf(listOf("ab"), listOf("cd"))
     }
 
     @Test
     fun chunked_requireBufferSizeAtLeastTwo() {
+        // The kotlinx-io chunk reader needs room for a UTF-16 surrogate pair
+        // in one buffer; the parser enforces that contract on callers.
         shouldThrow<IllegalArgumentException> {
             parseRowsFromChunks(chunkedReader("a"), rfc4180, bufferSize = 1).toList()
         }
     }
 
     @Test
     fun chunked_stripsBomWhenRequested() {
-        // bufferSize chosen so BOM and the first field share the leading chunk.
         val rows = parseChunked("a,b", bufferSize = 4, stripBom = true)
         rows shouldBe listOf(listOf("a", "b"))
     }
 
     @Test
     fun chunked_preservesBomWhenStripDisabled() {
-        // Default stripBom = false — exercises the default-parameter call.
+        // Default-parameter stripBom: BOM must reach the parser as a regular
+        // char so I/O layers can keep ownership of the strip policy.
         val rows = parseRowsFromChunks(chunkedReader("a"), rfc4180).toList()
         rows shouldBe listOf(listOf("a"))
     }
 
     @Test
     fun chunked_unterminatedQuote_atChunkBoundary_yieldsNoFinalRow() {
-        // tail-flush path runs but getResult() returns null (QUOTED_FIELD state).
+        // QUOTED_FIELD at EOF means the row was never terminated; tail-flush
+        // must drop it rather than emit a half-parsed row.
         parseChunked("\"abc", bufferSize = 2) shouldBe emptyList()
     }
+
+    @Test
+    fun chunked_doubleQuoteEscape_splitAcrossChunkBoundary() {
+        // The doubled `""` is consumed via skipCount=1 after a lookahead that
+        // crosses chunk boundaries. Buffer size 3 places the second quote of
+        // the doubled pair at the start of nextBuffer for the `a""b` case,
+        // so a regression in cross-chunk lookahead would mis-emit the field.
+        parseChunked("\"a\"\"b\"", bufferSize = 3) shouldBe listOf(listOf("a\"b"))
+    }
+
+    @Test
+    fun chunked_explicitEscape_splitAcrossChunkBoundary() {
+        // With a backslash escapeChar inside a quoted field, the escape and
+        // its target char are consumed in one machine.read() call
+        // (skipCount=1). Place them on either side of a chunk boundary to
+        // ensure the escaped target is fetched from nextBuffer[0].
+        val dialect = CsvDialect(escapeChar = '\\')
+        parseChunked("\"a\\\"b\"", bufferSize = 3, dialect = dialect) shouldBe listOf(listOf("a\"b"))
+    }
+
+    @Test
+    fun chunked_loneCarriageReturn_atChunkBoundary_terminatesRowWithoutLookahead() {
+        // CR alone is a row terminator, so the parser must not require LF to
+        // appear next. With the CR at the end of one chunk and a non-LF char
+        // at the start of the next, the skipCount path must stay at 0 instead
+        // of consuming the following field char as part of CRLF.
+        parseChunked("ab\rcd", bufferSize = 3) shouldBe listOf(listOf("ab"), listOf("cd"))
+    }
+
+    @Test
+    fun chunked_supplementaryCodePoint_surrogatesSplitAcrossChunks() {
+        // U+1F600 (😀) is represented as a UTF-16 surrogate pair (high, low).
+        // The kotlinx-io adapter reserves the last slot of each buffer to keep
+        // pairs together, but parseRowsFromChunks itself must remain correct
+        // even when a caller (test fakes, hypothetical alt backends) hands the
+        // pair across a boundary — neither half is special to the CSV state
+        // machine, both should pass through as ordinary field characters.
+        val rows = parseChunked("a,😀,b", bufferSize = 3)
+        rows shouldBe listOf(listOf("a", "😀", "b"))
+    }
 }
diff --git a/src/jvmTest/kotlin/com/jsoizo/kotlincsv/reader/CsvReaderPathSmokeTest.kt b/src/jvmTest/kotlin/com/jsoizo/kotlincsv/reader/CsvReaderPathSmokeTest.kt
@@ -59,4 +59,25 @@ class CsvReaderPathSmokeTest {
             tmp.deleteIfExists()
         }
     }
+
+    @Test
+    fun readFromFile_supplementaryCodePoint_atChunkBoundary_keepsSurrogatePairTogether() {
+        // Source.asChunkReader reserves the last slot of the 8 KB buffer so a
+        // surrogate pair never spans two chunks. Layout: 8189 pad chars + ','
+        // fill indices 0..8189, so 😀's high surrogate lands at 8190
+        // (buffer.size - 2) and its low surrogate at 8191, the reserved slot.
+        // NOTE(review): in this layout the pair ends in-bounds; a limit
+        // regressed to buffer.size overflows (low surrogate at 8192) only if
+        // the high surrogate sits at 8191, i.e. one more pad char — confirm.
+        val tmp = Files.createTempFile("kotlin-csv-reader-smoke-supplementary", ".csv")
+        try {
+            val pad = "x".repeat(8189)
+            val csv = "$pad,😀\n"
+            Files.writeString(tmp, csv)
+            val rows = CsvReader().readFromFile(tmp.toString()) { it.toList() }
+            rows shouldBe listOf(listOf(pad, "😀"))
+        } finally {
+            tmp.deleteIfExists()
+        }
+    }
 }