From 6d0890df81abb3ee79d4e6e7dd7e833b2fcbcc97 Mon Sep 17 00:00:00 2001
From: Jun Sekine
Date: Sun, 17 May 2026 01:20:22 +0900
Subject: [PATCH 1/2] feat: parse escape sequences in unquoted fields for
 explicit-escape dialects

For dialects where `escapeChar != quoteChar` (e.g.
`CsvDialect(escapeChar = '\\')`), the reader previously treated the
escape character literally in the START and DELIMITER states, and
accepted only `escapeChar` (not `quoteChar`) as the escaped character in
the FIELD state. This rejected CSV produced by other libraries (Python
`csv`, Apache Commons CSV, OpenCSV) that emit unquoted escape sequences
such as `a\\b` or `a\"b`.

Introduce a `handleUnquotedEscape` helper and route START / DELIMITER /
FIELD through it so all three states share the same
`{escapeChar, quoteChar}` acceptance set. When `escapeChar == quoteChar`
(RFC 4180 default) the helper degenerates to a single-element accept
set, preserving the existing strict behaviour (e.g. `a"b` under the
default dialect still throws).

Closes #168
---
 .../reader/internal/ParseStateMachine.kt      | 42 +++++++++----
 .../jsoizo/kotlincsv/reader/CsvReaderTest.kt  | 59 +++++++++++++++++++
 2 files changed, 90 insertions(+), 11 deletions(-)

diff --git a/src/commonMain/kotlin/com/jsoizo/kotlincsv/reader/internal/ParseStateMachine.kt b/src/commonMain/kotlin/com/jsoizo/kotlincsv/reader/internal/ParseStateMachine.kt
index fd40e7e..bbb99eb 100644
--- a/src/commonMain/kotlin/com/jsoizo/kotlincsv/reader/internal/ParseStateMachine.kt
+++ b/src/commonMain/kotlin/com/jsoizo/kotlincsv/reader/internal/ParseStateMachine.kt
@@ -30,6 +30,8 @@ internal class ParseStateMachine(
             ParseState.START -> {
                 when (ch) {
                     quoteChar -> state = ParseState.QUOTE_START
+                    // When `escapeChar == quoteChar`, the quoteChar arm above wins; this arm is unreachable.
+                    escapeChar -> state = handleUnquotedEscape(nextCh, rowNum)
                     delimiter -> {
                         flushField()
                         state = ParseState.DELIMITER
@@ -52,17 +54,7 @@ internal class ParseStateMachine(
             }
             ParseState.FIELD -> {
                 when (ch) {
-                    escapeChar -> {
-                        if (nextCh != escapeChar) throw CsvParseFormatException(
-                            rowNum,
-                            pos,
-                            ch,
-                            "must appear escapeChar($escapeChar) after escapeChar($escapeChar)"
-                        )
-                        field.append(nextCh)
-                        state = ParseState.FIELD
-                        pos += 1
-                    }
+                    escapeChar -> state = handleUnquotedEscape(nextCh, rowNum)
                     delimiter -> {
                         flushField()
                         state = ParseState.DELIMITER
@@ -86,6 +78,8 @@ internal class ParseStateMachine(
             ParseState.DELIMITER -> {
                 when (ch) {
                     quoteChar -> state = ParseState.QUOTE_START
+                    // When `escapeChar == quoteChar`, the quoteChar arm above wins; this arm is unreachable.
+                    escapeChar -> state = handleUnquotedEscape(nextCh, rowNum)
                     delimiter -> {
                         flushField()
                         state = ParseState.DELIMITER
@@ -190,6 +184,32 @@ internal class ParseStateMachine(
         fields.add(field.toString())
         field.clear()
     }
+
+    /**
+     * Consume one additional character following an escape character at an
+     * unquoted position (START/DELIMITER/FIELD). The caller must already have
+     * accounted for the escape char itself in [pos]; this method advances
+     * [pos] by 1 more to consume the escaped char, appends it to [field], and
+     * returns [ParseState.FIELD].
+     *
+     * Accepted [nextCh] values are [escapeChar] and [quoteChar]. When
+     * `escapeChar == quoteChar` the accepted set degenerates to a single value,
+     * preserving the RFC 4180 strict-doubling behaviour for the default
+     * dialect.
+     */
+    private fun handleUnquotedEscape(nextCh: Char?, rowNum: Long): ParseState {
+        if (nextCh != escapeChar && nextCh != quoteChar) {
+            throw CsvParseFormatException(
+                rowNum,
+                pos,
+                escapeChar,
+                "escape character must be followed by escapeChar($escapeChar) or quoteChar($quoteChar)"
+            )
+        }
+        field.append(nextCh)
+        pos += 1
+        return ParseState.FIELD
+    }
 }
 
 private enum class ParseState {
diff --git a/src/commonTest/kotlin/com/jsoizo/kotlincsv/reader/CsvReaderTest.kt b/src/commonTest/kotlin/com/jsoizo/kotlincsv/reader/CsvReaderTest.kt
index 1611216..522f78b 100644
--- a/src/commonTest/kotlin/com/jsoizo/kotlincsv/reader/CsvReaderTest.kt
+++ b/src/commonTest/kotlin/com/jsoizo/kotlincsv/reader/CsvReaderTest.kt
@@ -2,6 +2,7 @@ package com.jsoizo.kotlincsv.reader
 
 import com.jsoizo.kotlincsv.CsvDialect
 import com.jsoizo.kotlincsv.exceptions.CsvFieldNumDifferentException
+import com.jsoizo.kotlincsv.exceptions.CsvParseFormatException
 import io.kotest.assertions.throwables.shouldThrow
 import io.kotest.matchers.shouldBe
 import kotlin.test.Test
@@ -134,4 +135,62 @@ class CsvReaderTest {
         val seq = reader.read("a,b,c\nd,e".asSequence())
         seq.take(1).toList() shouldBe listOf(listOf("a", "b", "c"))
     }
+
+    // --- Unquoted-field escape (issue #168) ---
+
+    private val explicitEscapeReader =
+        CsvReader(CsvReaderConfig(dialect = CsvDialect(escapeChar = '\\')))
+
+    @Test
+    fun unquotedEscape_doubledEscape_inField() {
+        explicitEscapeReader.readAll("x,a\\\\b\n") shouldBe listOf(listOf("x", "a\\b"))
+    }
+
+    @Test
+    fun unquotedEscape_atRowStart() {
+        explicitEscapeReader.readAll("\\\\b\n") shouldBe listOf(listOf("\\b"))
+    }
+
+    @Test
+    fun unquotedEscape_quoteCharEscaped_atRowStart() {
+        explicitEscapeReader.readAll("\\\"y\n") shouldBe listOf(listOf("\"y"))
+    }
+
+    @Test
+    fun unquotedEscape_quoteCharEscaped_afterDelimiter() {
+        explicitEscapeReader.readAll("x,\\\"y\n") shouldBe listOf(listOf("x", "\"y"))
+    }
+
+    @Test
+    fun unquotedEscape_quoteCharEscaped_inField() {
+        explicitEscapeReader.readAll("a\\\"b\n") shouldBe listOf(listOf("a\"b"))
+    }
+
+    @Test
+    fun unquotedEscape_escapeAtEof_throws() {
+        shouldThrow<CsvParseFormatException> { explicitEscapeReader.readAll("a\\") }
+    }
+
+    @Test
+    fun unquotedEscape_followedByOrdinaryChar_throws() {
+        shouldThrow<CsvParseFormatException> { explicitEscapeReader.readAll("a\\c") }
+    }
+
+    @Test
+    fun unquotedEscape_followedByCr_throws() {
+        shouldThrow<CsvParseFormatException> { explicitEscapeReader.readAll("a\\\r\n") }
+    }
+
+    @Test
+    fun unquotedEscape_followedByLf_throws() {
+        shouldThrow<CsvParseFormatException> { explicitEscapeReader.readAll("a\\\n") }
+    }
+
+    @Test
+    fun unquotedEscape_multipleRows() {
+        explicitEscapeReader.readAll("a\\\\b\nc\\\\d\n") shouldBe listOf(
+            listOf("a\\b"),
+            listOf("c\\d"),
+        )
+    }
 }
From 3ab4b2a0790039ff04bfada05f959a2b5d4e601c Mon Sep 17 00:00:00 2001
From: Jun Sekine
Date: Sun, 17 May 2026 01:20:27 +0900
Subject: [PATCH 2/2] test: PBT for unquoted-escape parsing under
 explicit-escape dialect

Add a property-based test that generates arbitrary unquoted-safe fields
under `CsvDialect(escapeChar = '\\')`, serialises them with the minimal
Python-csv style backslash escaping (`\` -> `\\`, `"` -> `\"`), and
asserts the reader round-trips them. 500 iterations, seed 0L, follows
the project PBT conventions (checkAll + runTest, generators reused from
PbtArbs).
---
 .../kotlincsv/UnquotedEscapePropertyTest.kt   | 53 +++++++++++++++++++
 1 file changed, 53 insertions(+)
 create mode 100644 src/commonTest/kotlin/com/jsoizo/kotlincsv/UnquotedEscapePropertyTest.kt

diff --git a/src/commonTest/kotlin/com/jsoizo/kotlincsv/UnquotedEscapePropertyTest.kt b/src/commonTest/kotlin/com/jsoizo/kotlincsv/UnquotedEscapePropertyTest.kt
new file mode 100644
index 0000000..e21e3b3
--- /dev/null
+++ b/src/commonTest/kotlin/com/jsoizo/kotlincsv/UnquotedEscapePropertyTest.kt
@@ -0,0 +1,53 @@
+package com.jsoizo.kotlincsv
+
+import io.kotest.common.ExperimentalKotest
+import io.kotest.matchers.shouldBe
+import io.kotest.property.Arb
+import io.kotest.property.PropTestConfig
+import io.kotest.property.arbitrary.filter
+import io.kotest.property.arbitrary.list
+import io.kotest.property.arbitrary.map
+import io.kotest.property.checkAll
+import kotlinx.coroutines.test.runTest
+import kotlin.test.Test
+
+@OptIn(ExperimentalKotest::class)
+class UnquotedEscapePropertyTest {
+
+    private val dialect = CsvDialect(escapeChar = '\\')
+
+    // Chars that can legally appear in an unquoted field. The minimal serializer
+    // below only escapes `\` and `"`, so delimiter and line terminators must be
+    // excluded from generation. `\` and `"` are kept so the escape path is exercised.
+    private val unquotedSafeChar: Arb<Char> = fieldChar.filter { ch ->
+        ch != ',' &&
+            ch != '\n' && ch != '\r' &&
+            ch != '\u2028' && ch != '\u2029' && ch != '\u0085'
+    }
+
+    private val unquotedFieldArb: Arb<String> =
+        Arb.list(unquotedSafeChar, 0..16).map { it.joinToString("") }
+
+    private fun serialize(field: String): String = buildString {
+        for (ch in field) {
+            when (ch) {
+                '\\' -> append("\\\\")
+                '"' -> append("\\\"")
+                else -> append(ch)
+            }
+        }
+        append('\n')
+    }
+
+    @Test
+    fun unquotedEscape_singleFieldRoundTrip() = runTest {
+        checkAll(
+            PropTestConfig(seed = 0L, iterations = 500),
+            unquotedFieldArb,
+        ) { field ->
+            val text = serialize(field)
+            val parsed = csvReader { this.dialect = this@UnquotedEscapePropertyTest.dialect }.readAll(text)
+            parsed shouldBe listOf(listOf(field))
+        }
+    }
+}