Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
Original file line number Diff line number Diff line change
Expand Up @@ -30,6 +30,8 @@ internal class ParseStateMachine(
ParseState.START -> {
when (ch) {
quoteChar -> state = ParseState.QUOTE_START
// When `escapeChar == quoteChar`, the quoteChar arm above wins; this arm is unreachable.
escapeChar -> state = handleUnquotedEscape(nextCh, rowNum)
delimiter -> {
flushField()
state = ParseState.DELIMITER
Expand All @@ -52,17 +54,7 @@ internal class ParseStateMachine(
}
ParseState.FIELD -> {
when (ch) {
escapeChar -> {
if (nextCh != escapeChar) throw CsvParseFormatException(
rowNum,
pos,
ch,
"must appear escapeChar($escapeChar) after escapeChar($escapeChar)"
)
field.append(nextCh)
state = ParseState.FIELD
pos += 1
}
escapeChar -> state = handleUnquotedEscape(nextCh, rowNum)
delimiter -> {
flushField()
state = ParseState.DELIMITER
Expand All @@ -86,6 +78,8 @@ internal class ParseStateMachine(
ParseState.DELIMITER -> {
when (ch) {
quoteChar -> state = ParseState.QUOTE_START
// When `escapeChar == quoteChar`, the quoteChar arm above wins; this arm is unreachable.
escapeChar -> state = handleUnquotedEscape(nextCh, rowNum)
delimiter -> {
flushField()
state = ParseState.DELIMITER
Expand Down Expand Up @@ -190,6 +184,32 @@ internal class ParseStateMachine(
fields.add(field.toString())
field.clear()
}

/**
 * Handles the character that follows an escape character found outside of
 * quotes (used by the START, DELIMITER and FIELD states).
 *
 * The escape character itself is the caller's responsibility; this method
 * only deals with [nextCh]. On success it appends [nextCh] to [field],
 * moves [pos] forward by one so the escaped character counts as consumed,
 * and yields [ParseState.FIELD].
 *
 * Only [escapeChar] and [quoteChar] may be escaped. If the dialect uses the
 * same character for both, the two alternatives coincide and the only
 * accepted input is the doubled character.
 *
 * @param nextCh the character right after the escape character, or `null`
 *   when there is none.
 * @param rowNum row number reported in the exception on failure.
 * @return always [ParseState.FIELD] when [nextCh] is accepted.
 * @throws CsvParseFormatException if [nextCh] is `null` or is neither
 *   [escapeChar] nor [quoteChar].
 */
private fun handleUnquotedEscape(nextCh: Char?, rowNum: Long): ParseState =
    when (nextCh) {
        escapeChar, quoteChar -> {
            // Keep the escaped character literally and skip over it.
            field.append(nextCh)
            pos += 1
            ParseState.FIELD
        }
        // Anything else (including a missing next character) is malformed input.
        else -> throw CsvParseFormatException(
            rowNum,
            pos,
            escapeChar,
            "escape character must be followed by escapeChar($escapeChar) or quoteChar($quoteChar)"
        )
    }
}

private enum class ParseState {
Expand Down
Original file line number Diff line number Diff line change
@@ -0,0 +1,53 @@
package com.jsoizo.kotlincsv

import io.kotest.common.ExperimentalKotest
import io.kotest.matchers.shouldBe
import io.kotest.property.Arb
import io.kotest.property.PropTestConfig
import io.kotest.property.arbitrary.filter
import io.kotest.property.arbitrary.list
import io.kotest.property.arbitrary.map
import io.kotest.property.checkAll
import kotlinx.coroutines.test.runTest
import kotlin.test.Test

/**
 * Property-based round-trip test for escape handling in unquoted fields.
 *
 * Random fields are written with a minimal escaping serializer and then read
 * back through a reader configured with a backslash escape character; the
 * parsed result must equal the original field.
 */
@OptIn(ExperimentalKotest::class)
class UnquotedEscapePropertyTest {

    // Dialect under test: backslash as the escape character, everything else left at its default.
    private val dialect = CsvDialect(escapeChar = '\\')

    // Chars that can legally appear in an unquoted field. The minimal serializer
    // below only escapes `\` and `"`, so the delimiter and line terminators must be
    // excluded from generation. `\` is kept so the escape path is exercised.
    // NOTE(review): `"` is filtered out here as well, so the `"` branch of
    // `serialize` is never reached by generated input -- confirm whether the
    // quote exclusion is intentional or should be dropped.
    // Line/paragraph separators and NEL are written as Unicode escapes so the
    // literals survive editors and tools that treat the raw characters as line breaks.
    private val unquotedSafeChar: Arb<Char> = fieldChar.filter { ch ->
        ch != ',' && ch != '"' &&
            ch != '\n' && ch != '\r' &&
            ch != '\u2028' && ch != '\u2029' && ch != '\u0085'
    }

    // Fields of 0 to 16 generated characters.
    private val unquotedFieldArb: Arb<String> =
        Arb.list(unquotedSafeChar, 0..16).map { it.joinToString("") }

    /**
     * Writes [field] as a single-field, LF-terminated row, prefixing every
     * `\` and `"` with a backslash and copying all other characters as-is.
     */
    private fun serialize(field: String): String = buildString {
        for (ch in field) {
            when (ch) {
                '\\' -> append("\\\\")
                '"' -> append("\\\"")
                else -> append(ch)
            }
        }
        append('\n')
    }

    @Test
    fun unquotedEscape_singleFieldRoundTrip() = runTest {
        checkAll(
            // Fixed seed keeps the generated cases reproducible between runs.
            PropTestConfig(seed = 0L, iterations = 500),
            unquotedFieldArb,
        ) { field ->
            val text = serialize(field)
            val parsed = csvReader { this.dialect = this@UnquotedEscapePropertyTest.dialect }.readAll(text)
            parsed shouldBe listOf(listOf(field))
        }
    }
}
59 changes: 59 additions & 0 deletions src/commonTest/kotlin/com/jsoizo/kotlincsv/reader/CsvReaderTest.kt
Original file line number Diff line number Diff line change
Expand Up @@ -2,6 +2,7 @@ package com.jsoizo.kotlincsv.reader

import com.jsoizo.kotlincsv.CsvDialect
import com.jsoizo.kotlincsv.exceptions.CsvFieldNumDifferentException
import com.jsoizo.kotlincsv.exceptions.CsvParseFormatException
import io.kotest.assertions.throwables.shouldThrow
import io.kotest.matchers.shouldBe
import kotlin.test.Test
Expand Down Expand Up @@ -134,4 +135,62 @@ class CsvReaderTest {
val seq = reader.read("a,b,c\nd,e".asSequence())
seq.take(1).toList() shouldBe listOf(listOf("a", "b", "c"))
}

// --- Unquoted-field escape (issue #168) ---

// Shared reader for the tests below: backslash is the explicit escape character.
private val explicitEscapeReader = CsvReader(
    CsvReaderConfig(
        dialect = CsvDialect(escapeChar = '\\'),
    ),
)

@Test
fun unquotedEscape_doubledEscape_inField() {
    // Raw text: x,a\\b<LF> -- a doubled backslash in the middle of the second field.
    val rows = explicitEscapeReader.readAll("x,a\\\\b\n")
    val expected = listOf(listOf("x", "a\\b"))
    rows shouldBe expected
}

@Test
fun unquotedEscape_atRowStart() {
    // Raw text: \\b<LF> -- the doubled backslash is the very first thing in the row.
    val rows = explicitEscapeReader.readAll("\\\\b\n")
    val expected = listOf(listOf("\\b"))
    rows shouldBe expected
}

@Test
fun unquotedEscape_quoteCharEscaped_atRowStart() {
    // Raw text: \"y<LF> -- an escaped quote opens the row and must be read literally.
    val rows = explicitEscapeReader.readAll("\\\"y\n")
    val expected = listOf(listOf("\"y"))
    rows shouldBe expected
}

@Test
fun unquotedEscape_quoteCharEscaped_afterDelimiter() {
    // Raw text: x,\"y<LF> -- the escaped quote comes directly after a delimiter.
    val rows = explicitEscapeReader.readAll("x,\\\"y\n")
    val expected = listOf(listOf("x", "\"y"))
    rows shouldBe expected
}

@Test
fun unquotedEscape_quoteCharEscaped_inField() {
    // Raw text: a\"b<LF> -- the escaped quote sits inside an already-started field.
    val rows = explicitEscapeReader.readAll("a\\\"b\n")
    val expected = listOf(listOf("a\"b"))
    rows shouldBe expected
}

@Test
fun unquotedEscape_escapeAtEof_throws() {
    // Raw text: a\ -- the input ends right after the escape character.
    val truncated = "a\\"
    shouldThrow<CsvParseFormatException> {
        explicitEscapeReader.readAll(truncated)
    }
}

@Test
fun unquotedEscape_followedByOrdinaryChar_throws() {
    // Raw text: a\c -- the escape character is followed by a plain letter.
    val malformed = "a\\c"
    shouldThrow<CsvParseFormatException> {
        explicitEscapeReader.readAll(malformed)
    }
}

@Test
fun unquotedEscape_followedByCr_throws() {
    // Raw text: a\<CR><LF> -- the escape character is followed by a carriage return.
    val malformed = "a\\\r\n"
    shouldThrow<CsvParseFormatException> {
        explicitEscapeReader.readAll(malformed)
    }
}

@Test
fun unquotedEscape_followedByLf_throws() {
    // Raw text: a\<LF> -- the escape character is followed by a line feed.
    val malformed = "a\\\n"
    shouldThrow<CsvParseFormatException> {
        explicitEscapeReader.readAll(malformed)
    }
}

@Test
fun unquotedEscape_multipleRows() {
    // Raw text: a\\b<LF>c\\d<LF> -- a doubled backslash on each of two consecutive rows.
    val rows = explicitEscapeReader.readAll("a\\\\b\nc\\\\d\n")
    val firstRow = listOf("a\\b")
    val secondRow = listOf("c\\d")
    rows shouldBe listOf(firstRow, secondRow)
}
}
Loading