diff --git a/src/commonTest/kotlin/com/jsoizo/kotlincsv/reader/internal/SequenceParserTest.kt b/src/commonTest/kotlin/com/jsoizo/kotlincsv/reader/internal/SequenceParserTest.kt index 7858b9f..5ffd3e6 100644 --- a/src/commonTest/kotlin/com/jsoizo/kotlincsv/reader/internal/SequenceParserTest.kt +++ b/src/commonTest/kotlin/com/jsoizo/kotlincsv/reader/internal/SequenceParserTest.kt @@ -231,28 +231,32 @@ class SequenceParserTest { @Test fun chunked_emptyInput_yieldsEmptySequence() { - // First readInto returns 0 — early-return path. parseChunked("") shouldBe emptyList() } @Test fun chunked_smallBuffer_forcesChunkBoundaryAcrossRows() { - // bufferSize=4 forces the parser to swap chunks several times mid-row - // and to consult the cross-buffer lookahead path. + // Several row terminators land mid-chunk so a regression in the + // double-buffer swap or the cross-buffer next-char lookahead would + // either drop rows or merge them. val rows = parseChunked("a,bbb\nccc,d\nef,gh", bufferSize = 4) rows shouldBe listOf(listOf("a", "bbb"), listOf("ccc", "d"), listOf("ef", "gh")) } @Test fun chunked_bufferBoundaryLandsOnCrLf() { - // Tail CR is the last char of one chunk; LF starts the next. The CR - // branch must consult lookahead via the next-buffer slot to swallow LF. + // CR ends one chunk and LF starts the next. The CR branch only + // collapses CRLF into a single terminator if lookahead reaches into + // nextBuffer[0]; without that the LF would surface as an extra empty + // row. val rows = parseChunked("ab\r\ncd", bufferSize = 3) rows shouldBe listOf(listOf("ab"), listOf("cd")) } @Test fun chunked_requireBufferSizeAtLeastTwo() { + // The kotlinx-io chunk reader needs room for a UTF-16 surrogate pair + // in one buffer; the parser enforces that contract on callers. 
shouldThrow { parseRowsFromChunks(chunkedReader("a"), rfc4180, bufferSize = 1).toList() } @@ -260,21 +264,62 @@ class SequenceParserTest { @Test fun chunked_stripsBomWhenRequested() { - // bufferSize chosen so BOM and the first field share the leading chunk. val rows = parseChunked("a,b", bufferSize = 4, stripBom = true) rows shouldBe listOf(listOf("a", "b")) } @Test fun chunked_preservesBomWhenStripDisabled() { - // Default stripBom = false — exercises the default-parameter call. + // Default-parameter stripBom: BOM must reach the parser as a regular + // char so I/O layers can keep ownership of the strip policy. val rows = parseRowsFromChunks(chunkedReader("a"), rfc4180).toList() rows shouldBe listOf(listOf("a")) } @Test fun chunked_unterminatedQuote_atChunkBoundary_yieldsNoFinalRow() { - // tail-flush path runs but getResult() returns null (QUOTED_FIELD state). + // QUOTED_FIELD at EOF means the row was never terminated; tail-flush + // must drop it rather than emit a half-parsed row. parseChunked("\"abc", bufferSize = 2) shouldBe emptyList() } + + @Test + fun chunked_doubleQuoteEscape_splitAcrossChunkBoundary() { + // The doubled `""` is consumed via skipCount=1 after a lookahead that + // crosses chunk boundaries. Buffer size 3 places the second quote of + // the doubled pair at the start of nextBuffer for the `a""b` case, + // so a regression in cross-chunk lookahead would mis-emit the field. + parseChunked("\"a\"\"b\"", bufferSize = 3) shouldBe listOf(listOf("a\"b")) + } + + @Test + fun chunked_explicitEscape_splitAcrossChunkBoundary() { + // With escapeChar=`\\` inside a quoted field, escape and its target + // char are consumed in one machine.read() call (skipCount=1). Place + // them on either side of a chunk boundary to ensure the escaped + // target is fetched from nextBuffer[0]. 
+ val dialect = CsvDialect(escapeChar = '\\') + parseChunked("\"a\\\"b\"", bufferSize = 3, dialect = dialect) shouldBe listOf(listOf("a\"b")) + } + + @Test + fun chunked_loneCarriageReturn_atChunkBoundary_terminatesRowWithoutLookahead() { + // CR alone is a row terminator, so the parser must not require LF to + // appear next. With the CR at the end of one chunk and a non-LF char + // at the start of the next, the skipCount path must stay at 0 instead + // of consuming the following field char as part of CRLF. + parseChunked("ab\rcd", bufferSize = 3) shouldBe listOf(listOf("ab"), listOf("cd")) + } + + @Test + fun chunked_supplementaryCodePoint_surrogatesSplitAcrossChunks() { + // U+1F600 (😀) is represented as a UTF-16 surrogate pair (high, low). + // The kotlinx-io adapter reserves the last slot of each buffer to keep + // pairs together, but parseRowsFromChunks itself must remain correct + // even when a caller (test fakes, hypothetical alt backends) hands the + // pair across a boundary — neither half is special to the CSV state + // machine, both should pass through as ordinary field characters. + val rows = parseChunked("a,😀,b", bufferSize = 3) + rows shouldBe listOf(listOf("a", "😀", "b")) + } } diff --git a/src/jvmTest/kotlin/com/jsoizo/kotlincsv/reader/CsvReaderPathSmokeTest.kt b/src/jvmTest/kotlin/com/jsoizo/kotlincsv/reader/CsvReaderPathSmokeTest.kt index c923dbd..eeee81a 100644 --- a/src/jvmTest/kotlin/com/jsoizo/kotlincsv/reader/CsvReaderPathSmokeTest.kt +++ b/src/jvmTest/kotlin/com/jsoizo/kotlincsv/reader/CsvReaderPathSmokeTest.kt @@ -59,4 +59,25 @@ class CsvReaderPathSmokeTest { tmp.deleteIfExists() } } + + @Test + fun readFromFile_supplementaryCodePoint_atChunkBoundary_keepsSurrogatePairTogether() { + // Source.asChunkReader reserves the last slot of the 8 KB buffer so a + // supplementary code point's UTF-16 surrogate pair never spans two + // chunks. 
Lay out the input so 😀's high surrogate lands at the + // last unreserved index (buffer.size - 2): 8189 pad chars + ',' fills + // indices 0..8189, the high surrogate goes to 8190, the low surrogate + // to 8191 (the reserved slot). If the limit regressed to + // buffer.size, writing the low surrogate at 8192 would overflow. + val tmp = Files.createTempFile("kotlin-csv-reader-smoke-supplementary", ".csv") + try { + val pad = "x".repeat(8189) + val csv = "$pad,😀\n" + Files.writeString(tmp, csv) + val rows = CsvReader().readFromFile(tmp.toString()) { it.toList() } + rows shouldBe listOf(listOf(pad, "😀")) + } finally { + tmp.deleteIfExists() + } + } }