Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
Original file line number Diff line number Diff line change
Expand Up @@ -231,50 +231,95 @@ class SequenceParserTest {

@Test
fun chunked_emptyInput_yieldsEmptySequence() {
    // An empty source makes the very first readInto return 0, which sends
    // the parser down its early-return path: no rows are produced.
    val rows = parseChunked("")
    rows shouldBe emptyList()
}

@Test
fun chunked_smallBuffer_forcesChunkBoundaryAcrossRows() {
    // bufferSize = 4 makes the parser swap chunks several times mid-row and
    // consult the cross-buffer lookahead path, with row terminators landing
    // mid-chunk. A regression in the double-buffer swap or in the
    // cross-buffer next-char lookahead would either drop rows or merge them.
    val expected = listOf(listOf("a", "bbb"), listOf("ccc", "d"), listOf("ef", "gh"))
    parseChunked("a,bbb\nccc,d\nef,gh", bufferSize = 4) shouldBe expected
}

@Test
fun chunked_bufferBoundaryLandsOnCrLf() {
    // bufferSize = 3 leaves the CR as the last char of one chunk while the
    // LF opens the next. The CR branch only collapses CRLF into a single
    // terminator when its lookahead reaches into nextBuffer[0]; without
    // that, the LF would surface as an extra empty row.
    val expected = listOf(listOf("ab"), listOf("cd"))
    parseChunked("ab\r\ncd", bufferSize = 3) shouldBe expected
}

@Test
fun chunked_requireBufferSizeAtLeastTwo() {
    // The kotlinx-io chunk reader needs one buffer to have room for a whole
    // UTF-16 surrogate pair, and the parser enforces that contract on its
    // callers, so bufferSize = 1 must be rejected.
    shouldThrow<IllegalArgumentException> {
        val rows = parseRowsFromChunks(chunkedReader("a"), rfc4180, bufferSize = 1)
        rows.toList()
    }
}

@Test
fun chunked_stripsBomWhenRequested() {
    // The input starts with a byte-order mark, written as an explicit
    // \uFEFF escape so the zero-width character is visible in the source
    // and cannot be lost by an editor or formatter.
    // bufferSize = 4 is chosen so the BOM and the first field share the
    // leading chunk; with stripBom = true the BOM must not leak into the
    // first field.
    val rows = parseChunked("\uFEFFa,b", bufferSize = 4, stripBom = true)
    rows shouldBe listOf(listOf("a", "b"))
}

@Test
fun chunked_preservesBomWhenStripDisabled() {
    // Exercises the default-parameter call: stripBom is left at its default
    // (false), so the BOM must reach the parser as a regular char and stay
    // in the first field. That keeps ownership of the strip policy with the
    // I/O layers. The BOM is written as an explicit \uFEFF escape on both
    // the input and the expected value so the zero-width character is
    // visible in the source.
    val rows = parseRowsFromChunks(chunkedReader("\uFEFFa"), rfc4180).toList()
    rows shouldBe listOf(listOf("\uFEFFa"))
}

@Test
fun chunked_unterminatedQuote_atChunkBoundary_yieldsNoFinalRow() {
    // Reaching EOF in the QUOTED_FIELD state means the row was never
    // terminated. The tail-flush path still runs, but getResult() returns
    // null, so the half-parsed row is dropped instead of being emitted.
    val rows = parseChunked("\"abc", bufferSize = 2)
    rows shouldBe emptyList()
}

@Test
fun chunked_doubleQuoteEscape_splitAcrossChunkBoundary() {
    // A doubled `""` is consumed via skipCount=1 after a lookahead, and here
    // that lookahead has to cross a chunk boundary: with bufferSize = 3 the
    // second quote of the pair in `a""b` sits at the start of nextBuffer.
    // A regression in cross-chunk lookahead would mis-emit the field.
    val rows = parseChunked("\"a\"\"b\"", bufferSize = 3)
    rows shouldBe listOf(listOf("a\"b"))
}

@Test
fun chunked_explicitEscape_splitAcrossChunkBoundary() {
    // Inside a quoted field with a backslash escapeChar, the escape and the
    // char it protects are consumed by a single machine.read() call
    // (skipCount=1). bufferSize = 3 puts them on opposite sides of a chunk
    // boundary, so the escaped target has to be fetched from nextBuffer[0].
    val backslashEscaped = CsvDialect(escapeChar = '\\')
    val rows = parseChunked("\"a\\\"b\"", bufferSize = 3, dialect = backslashEscaped)
    rows shouldBe listOf(listOf("a\"b"))
}

@Test
fun chunked_loneCarriageReturn_atChunkBoundary_terminatesRowWithoutLookahead() {
    // A bare CR terminates a row by itself, so the parser must not insist
    // on an LF following it. Here the CR closes one chunk and a non-LF char
    // opens the next: skipCount has to stay at 0 rather than swallowing the
    // following field char as if it were the LF half of a CRLF.
    val expected = listOf(listOf("ab"), listOf("cd"))
    parseChunked("ab\rcd", bufferSize = 3) shouldBe expected
}

@Test
fun chunked_supplementaryCodePoint_surrogatesSplitAcrossChunks() {
    // U+1F600 (😀) is encoded in UTF-16 as a surrogate pair (high, low).
    // The kotlinx-io adapter reserves the last slot of each buffer to keep
    // such pairs together, but parseRowsFromChunks itself still has to be
    // correct when a caller (test fakes, hypothetical alternative backends)
    // delivers the two halves in different chunks. Neither half is special
    // to the CSV state machine; both should pass through as ordinary field
    // characters.
    val expected = listOf(listOf("a", "😀", "b"))
    parseChunked("a,😀,b", bufferSize = 3) shouldBe expected
}
}
Original file line number Diff line number Diff line change
Expand Up @@ -59,4 +59,25 @@ class CsvReaderPathSmokeTest {
tmp.deleteIfExists()
}
}

@Test
fun readFromFile_supplementaryCodePoint_atChunkBoundary_keepsSurrogatePairTogether() {
    // Source.asChunkReader reserves the last slot of the 8 KB buffer so that
    // the UTF-16 surrogate pair of a supplementary code point never spans
    // two chunks. The input is laid out so that 😀's high surrogate falls
    // on the last reservable index (buffer.size - 2): 8189 pad chars plus
    // ',' occupy indices 0..8189, the high surrogate lands on 8190 and the
    // low surrogate on 8191 (the reserved slot). Were the limit to regress
    // to buffer.size, the low surrogate would be written at 8192 and
    // overflow.
    val padding = "x".repeat(8189)
    val csvFile = Files.createTempFile("kotlin-csv-reader-smoke-supplementary", ".csv")
    try {
        Files.writeString(csvFile, "$padding,😀\n")
        val parsed = CsvReader().readFromFile(csvFile.toString()) { rows -> rows.toList() }
        parsed shouldBe listOf(listOf(padding, "😀"))
    } finally {
        // Always remove the temp file, even when the assertion fails.
        csvFile.deleteIfExists()
    }
}
}
Loading