Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
19 changes: 19 additions & 0 deletions benchmark/parity/build.gradle.kts
Original file line number Diff line number Diff line change
@@ -0,0 +1,19 @@
// Build script for the parity module: it only declares test dependencies,
// so everything it compiles and runs lives in the test source set.
plugins {
// Kotlin JVM plugin, resolved through the `libs` catalog alias.
alias(libs.plugins.kotlinJvm)
}

kotlin {
// Compile and run with a JDK 21 toolchain.
jvmToolchain(21)
}

dependencies {
// The root project (":") — presumably the v2 library under test; confirm against settings.gradle.kts.
testImplementation(project(":"))
// Shared dataset generator and fixtures used by the benchmark modules.
testImplementation(project(":benchmark:shared"))
// NOTE(review): looks like the published v1 artifact used as the comparison baseline — confirm in the version catalog.
testImplementation(libs.kotlincsv.v1.jvm)
testImplementation(libs.bundles.kotest)
testImplementation(libs.kotlin.test.junit5)
}

// Run every Test task on the JUnit Platform.
tasks.withType<Test>().configureEach {
useJUnitPlatform()
}
Original file line number Diff line number Diff line change
@@ -0,0 +1,21 @@
package com.jsoizo.kotlincsv.bench.parity

import kotlin.test.assertEquals

/**
 * Asserts that two parsed CSV tables are identical, cell by cell.
 *
 * The row counts are compared first. Rows are then walked in order; the first
 * row whose column count differs, or the first cell whose text differs, raises
 * an [AssertionError] naming the offending row (and column) index.
 *
 * @param expected the reference rows
 * @param actual the rows under test
 * @throws AssertionError on the first difference found
 */
fun assertRowsEqual(expected: List<List<String>>, actual: List<List<String>>) {
    assertEquals(expected.size, actual.size, "row count mismatch")
    expected.forEachIndexed { rowIdx, wantRow ->
        val gotRow = actual[rowIdx]
        if (wantRow.size != gotRow.size) {
            throw AssertionError(
                "row $rowIdx col count mismatch: expected=${wantRow.size} actual=${gotRow.size}\n expected=$wantRow\n actual=$gotRow"
            )
        }
        wantRow.forEachIndexed { colIdx, want ->
            val got = gotRow[colIdx]
            if (want != got) {
                // Cells are rendered through quoted() so whitespace and quotes stay visible in the message.
                throw AssertionError(
                    "row $rowIdx col $colIdx mismatch: expected=${want.quoted()} actual=${got.quoted()}"
                )
            }
        }
    }
}

/**
 * Renders the receiver wrapped in double quotes, with every backslash and
 * double quote inside it prefixed by a backslash.
 */
private fun String.quoted(): String = buildString {
    append('"')
    for (ch in this@quoted) {
        when (ch) {
            '\\' -> append("\\\\")
            '"' -> append("\\\"")
            else -> append(ch)
        }
    }
    append('"')
}
Original file line number Diff line number Diff line change
@@ -0,0 +1,10 @@
package com.jsoizo.kotlincsv.bench.parity

import com.jsoizo.kotlincsv.bench.shared.CsvDataGen
import com.jsoizo.kotlincsv.bench.shared.DatasetSpec

/** Shared, lazily built datasets for the parity tests. */
object ParityFixtures {
    /**
     * The [DatasetSpec.HARD] dataset generated with [CsvDataGen.DEFAULT_SEED].
     * It is built on first access and the same instance is returned afterwards.
     */
    val hard: CsvDataGen.Generated by lazy {
        CsvDataGen.generate(
            spec = DatasetSpec.HARD,
            seed = CsvDataGen.DEFAULT_SEED,
        )
    }
}
Original file line number Diff line number Diff line change
@@ -0,0 +1,32 @@
package com.jsoizo.kotlincsv.bench.parity

import com.github.doyaaaaaken.kotlincsv.dsl.csvReader as v1csvReader
import com.jsoizo.kotlincsv.csvReader as v2csvReader
import com.jsoizo.kotlincsv.reader.withHeader
import org.junit.jupiter.api.Test
import kotlin.test.assertEquals

/**
 * Compares header-aware reading between the v1 and v2 readers on the HARD fixture.
 */
class ParityHeaderTest {

    /**
     * Prepends a synthetic `col0,col1,...` header line to the fixture text, reads it
     * with both readers as a list of maps, and requires the same row count, the same
     * key order per row, and the same value for every key.
     */
    @Test
    fun readAllWithHeader_string_v1_v2_parity() {
        val fixture = ParityFixtures.hard
        val headerLine = fixture.rows[0].indices.joinToString(",") { "col$it" }
        val text = headerLine + "\r\n" + fixture.csvText

        val v1: List<Map<String, String>> = v1csvReader().readAllWithHeader(text)
        val v2Rows: List<Map<String, String>> =
            v2csvReader().readAll(text).asSequence().withHeader().toList()

        assertEquals(v1.size, v2Rows.size, "row count mismatch")
        v1.forEachIndexed { rowIdx, expectedRow ->
            val actualRow = v2Rows[rowIdx]
            assertEquals(expectedRow.keys.toList(), actualRow.keys.toList(), "row $rowIdx header keys mismatch")
            for ((key, expectedValue) in expectedRow) {
                assertEquals(expectedValue, actualRow[key], "row $rowIdx key '$key' mismatch")
            }
        }
    }
}
Original file line number Diff line number Diff line change
@@ -0,0 +1,58 @@
package com.jsoizo.kotlincsv.bench.parity

import com.github.doyaaaaaken.kotlincsv.dsl.csvReader as v1csvReader
import com.jsoizo.kotlincsv.csvReader as v2csvReader
import com.jsoizo.kotlincsv.reader.readAll as v2readAll
import com.jsoizo.kotlincsv.reader.readAllFromFile as v2readAllFromFile
import org.junit.jupiter.api.AfterAll
import org.junit.jupiter.api.BeforeAll
import org.junit.jupiter.api.Test
import org.junit.jupiter.api.TestInstance
import java.io.ByteArrayInputStream
import java.io.File

/**
 * Verifies that the v1 and v2 readers return the same rows for the HARD fixture
 * when reading from a string, an input stream and a file, and that the v2 result
 * also equals the rows the fixture was generated from.
 *
 * Uses the per-class lifecycle so the non-static [setUp]/[tearDown] can share one
 * temporary file across all tests.
 */
@TestInstance(TestInstance.Lifecycle.PER_CLASS)
class ParityReadTest {

    private lateinit var tempFile: File

    /** Creates the temporary CSV file and fills it with the fixture bytes. */
    @BeforeAll
    fun setUp() {
        // Store the reference before writing: if the write fails, tearDown can
        // still find and remove the file instead of leaking it.
        tempFile = File.createTempFile("parity-hard-", ".csv")
        tempFile.writeBytes(ParityFixtures.hard.csvBytes)
    }

    /** Removes the temporary file, if one was created. */
    @AfterAll
    fun tearDown() {
        // Guard against setUp failing before the file existed; touching an
        // uninitialised lateinit property would throw and mask the real failure.
        if (::tempFile.isInitialized) {
            tempFile.delete()
        }
    }

    /** Both readers parse the fixture text identically and reproduce the source rows. */
    @Test
    fun readAll_string_v1_v2_parity() {
        val v1 = v1csvReader().readAll(ParityFixtures.hard.csvText)
        val v2 = v2csvReader().readAll(ParityFixtures.hard.csvText)
        assertRowsEqual(v1, v2)
        assertRowsEqual(ParityFixtures.hard.rows, v2)
    }

    /** Both readers parse the fixture bytes from an input stream identically. */
    @Test
    fun readAll_inputStream_v1_v2_parity() {
        val v1 = ByteArrayInputStream(ParityFixtures.hard.csvBytes).use {
            v1csvReader().readAll(it)
        }
        val v2 = ByteArrayInputStream(ParityFixtures.hard.csvBytes).use {
            v2csvReader().v2readAll(it)
        }
        assertRowsEqual(v1, v2)
        assertRowsEqual(ParityFixtures.hard.rows, v2)
    }

    /** Both readers parse the temporary file written in [setUp] identically. */
    @Test
    fun readAll_file_v1_v2_parity() {
        val v1 = v1csvReader().readAll(tempFile)
        val v2 = v2csvReader().v2readAllFromFile(tempFile)
        assertRowsEqual(v1, v2)
        assertRowsEqual(ParityFixtures.hard.rows, v2)
    }
}
Original file line number Diff line number Diff line change
@@ -0,0 +1,29 @@
package com.jsoizo.kotlincsv.bench.parity

import com.github.doyaaaaaken.kotlincsv.dsl.csvReader as v1csvReader
import com.github.doyaaaaaken.kotlincsv.dsl.csvWriter as v1csvWriter
import com.jsoizo.kotlincsv.csvReader as v2csvReader
import com.jsoizo.kotlincsv.csvWriter as v2csvWriter
import com.jsoizo.kotlincsv.writer.write as v2write
import org.junit.jupiter.api.Test
import java.io.ByteArrayOutputStream

/**
 * Round-trip check for the writers: rows written by each version and read back
 * by the same version's reader must equal the input rows and each other.
 */
class ParityWriteTest {

    /**
     * Writes the HARD fixture rows to an in-memory stream with the v1 writer and
     * then with the v2 writer, re-parses each output as UTF-8 text, and compares
     * both results with the input and with one another.
     */
    @Test
    fun writeAll_outputStream_then_v1_reparse_matches_input() {
        val source = ParityFixtures.hard.rows

        val v1Reparsed = ByteArrayOutputStream().let { sink ->
            v1csvWriter().writeAll(source, sink)
            v1csvReader().readAll(sink.toString(Charsets.UTF_8))
        }

        val v2Reparsed = ByteArrayOutputStream().let { sink ->
            v2csvWriter().v2write(source.asSequence(), sink, "UTF-8")
            v2csvReader().readAll(sink.toString(Charsets.UTF_8))
        }

        assertRowsEqual(source, v1Reparsed)
        assertRowsEqual(source, v2Reparsed)
        assertRowsEqual(v1Reparsed, v2Reparsed)
    }
}
7 changes: 7 additions & 0 deletions benchmark/shared/build.gradle.kts
Original file line number Diff line number Diff line change
@@ -0,0 +1,7 @@
// Build script for the shared benchmark module; it declares no dependencies of its own.
plugins {
// Kotlin JVM plugin, resolved through the `libs` catalog alias.
alias(libs.plugins.kotlinJvm)
}

kotlin {
// Compile with a JDK 21 toolchain.
jvmToolchain(21)
}
Original file line number Diff line number Diff line change
@@ -0,0 +1,108 @@
package com.jsoizo.kotlincsv.bench.shared

import kotlin.random.Random

/**
 * Seeded generator of synthetic CSV datasets.
 *
 * [generate] builds the cell values from a [Random] created with the given seed,
 * so the same spec and seed always yield the same rows, text and bytes.
 */
object CsvDataGen {
    /** Seed used by [generate] when the caller does not pass one. */
    const val DEFAULT_SEED: Long = 42L

    // Base alphabet: letters, digits, space and ". - _ /" — none of these
    // characters triggers quoting in encode().
    private val asciiPrintable: CharArray =
        (('a'..'z') + ('A'..'Z') + ('0'..'9') + " .-_/".toList()).toCharArray()

    // Alphabet used when a multi-byte character is picked.
    private val hiragana: CharArray = ('ぁ'..'ん').toList().toCharArray()

    /**
     * Result of one [generate] call.
     *
     * @property rows the raw (unescaped) cell values, row by row
     * @property csvText [rows] encoded as CSV text
     * @property csvBytes [csvText] encoded as UTF-8
     * @property stats summary counters collected while generating
     */
    data class Generated(
        val rows: List<List<String>>,
        val csvText: String,
        val csvBytes: ByteArray,
        val stats: DataStats,
    ) {
        // A data class compares array properties by reference; compare the byte
        // content instead so equal datasets are equal values.
        override fun equals(other: Any?): Boolean {
            if (this === other) return true
            if (other !is Generated) return false
            return stats == other.stats &&
                csvBytes.contentEquals(other.csvBytes) &&
                csvText == other.csvText &&
                rows == other.rows
        }

        // Kept consistent with equals(): hashes the byte content, not the array identity.
        override fun hashCode(): Int {
            var result = rows.hashCode()
            result = 31 * result + csvText.hashCode()
            result = 31 * result + csvBytes.contentHashCode()
            result = 31 * result + stats.hashCode()
            return result
        }
    }

    /**
     * Generates `spec.rows` x `spec.cols` cells and their CSV encoding.
     *
     * Each cell starts as 4..15 characters drawn from the base alphabet or, per
     * character, from the hiragana alphabet. Afterwards a comma, a newline and a
     * double quote may each be inserted once at a random position, according to
     * the corresponding rate in [spec]. A rate of `0.0` consumes no random draw
     * for that feature.
     *
     * @param spec dataset dimensions and feature rates
     * @param seed seed for the random source; defaults to [DEFAULT_SEED]
     * @return the rows, their CSV text and UTF-8 bytes, and the collected statistics
     */
    fun generate(spec: DatasetSpec, seed: Long = DEFAULT_SEED): Generated {
        val random = Random(seed)
        val rows = ArrayList<List<String>>(spec.rows)
        var totalLen = 0L
        var maxLen = 0
        var quoteCells = 0
        var commaCells = 0
        var newlineCells = 0
        var multiByteCells = 0
        for (r in 0 until spec.rows) {
            val row = ArrayList<String>(spec.cols)
            for (c in 0 until spec.cols) {
                val len = 4 + random.nextInt(12) // 4..15
                val sb = StringBuilder(len)
                var hasQuote = false
                var hasComma = false
                var hasNewline = false
                var hasMultiByte = false
                for (i in 0 until len) {
                    // NOTE(review): the per-character probability is half of
                    // utf8MultiByteRate — confirm the halving is intended.
                    val pickMulti = spec.utf8MultiByteRate > 0.0 && random.nextDouble() < spec.utf8MultiByteRate / 2.0
                    val ch: Char = when {
                        pickMulti -> {
                            hasMultiByte = true
                            hiragana[random.nextInt(hiragana.size)]
                        }
                        else -> asciiPrintable[random.nextInt(asciiPrintable.size)]
                    }
                    sb.append(ch)
                }
                // Special characters are inserted in a fixed order (comma, newline,
                // quote); changing it would change the data produced for a given seed.
                if (spec.embeddedCommaRate > 0.0 && random.nextDouble() < spec.embeddedCommaRate) {
                    sb.insert(random.nextInt(sb.length + 1), ',')
                    hasComma = true
                }
                if (spec.embeddedNewlineRate > 0.0 && random.nextDouble() < spec.embeddedNewlineRate) {
                    sb.insert(random.nextInt(sb.length + 1), '\n')
                    hasNewline = true
                }
                if (spec.quoteRate > 0.0 && random.nextDouble() < spec.quoteRate) {
                    sb.insert(random.nextInt(sb.length + 1), '"')
                    hasQuote = true
                }
                val cell = sb.toString()
                row.add(cell)
                totalLen += cell.length
                if (cell.length > maxLen) maxLen = cell.length
                if (hasQuote) quoteCells++
                if (hasComma) commaCells++
                if (hasNewline) newlineCells++
                if (hasMultiByte) multiByteCells++
            }
            rows.add(row)
        }
        val csvText = encode(rows)
        val csvBytes = csvText.toByteArray(Charsets.UTF_8)
        val totalCells = spec.rows.toLong() * spec.cols
        // All ratios fall back to 0.0 for an empty dataset to avoid dividing by zero.
        val stats = DataStats(
            dataset = spec.name,
            rows = spec.rows,
            cols = spec.cols,
            seed = seed,
            avgCellLen = if (totalCells == 0L) 0.0 else totalLen.toDouble() / totalCells,
            maxCellLen = maxLen,
            quoteRate = if (totalCells == 0L) 0.0 else quoteCells.toDouble() / totalCells,
            embeddedCommaRate = if (totalCells == 0L) 0.0 else commaCells.toDouble() / totalCells,
            embeddedNewlineRate = if (totalCells == 0L) 0.0 else newlineCells.toDouble() / totalCells,
            utf8MultiByteCellRate = if (totalCells == 0L) 0.0 else multiByteCells.toDouble() / totalCells,
            totalBytes = csvBytes.size.toLong(),
        )
        return Generated(rows, csvText, csvBytes, stats)
    }

    /**
     * Encodes [rows] as CSV text: cells are comma-separated and every row ends
     * with CRLF. A cell containing a double quote, comma, LF or CR is wrapped in
     * double quotes with inner quotes doubled.
     */
    private fun encode(rows: List<List<String>>): String {
        val sb = StringBuilder()
        for (row in rows) {
            var first = true
            for (cell in row) {
                if (!first) sb.append(',')
                first = false
                if (cell.contains('"') || cell.contains(',') || cell.contains('\n') || cell.contains('\r')) {
                    sb.append('"').append(cell.replace("\"", "\"\"")).append('"')
                } else {
                    sb.append(cell)
                }
            }
            sb.append("\r\n")
        }
        return sb.toString()
    }
}
Original file line number Diff line number Diff line change
@@ -0,0 +1,31 @@
package com.jsoizo.kotlincsv.bench.shared

/**
 * Summary of one generated dataset.
 *
 * @property dataset name of the dataset
 * @property rows number of rows
 * @property cols number of columns per row
 * @property seed seed the dataset was generated with
 * @property avgCellLen average cell length
 * @property maxCellLen longest cell length
 * @property quoteRate fraction of cells containing an inserted double quote
 * @property embeddedCommaRate fraction of cells containing an inserted comma
 * @property embeddedNewlineRate fraction of cells containing an inserted newline
 * @property utf8MultiByteCellRate fraction of cells containing a multi-byte character
 * @property totalBytes size of the encoded CSV in bytes
 */
data class DataStats(
    val dataset: String,
    val rows: Int,
    val cols: Int,
    val seed: Long,
    val avgCellLen: Double,
    val maxCellLen: Int,
    val quoteRate: Double,
    val embeddedCommaRate: Double,
    val embeddedNewlineRate: Double,
    val utf8MultiByteCellRate: Double,
    val totalBytes: Long,
) {
    /**
     * Serialises this record as a single-line JSON object, with keys in
     * declaration order. The [dataset] name is escaped so that quotes,
     * backslashes or control characters in it cannot break the document.
     * Numeric fields are appended with their default string form.
     */
    fun toJson(): String = buildString {
        append("{")
        append("\"dataset\":\"").append(escapeJson(dataset)).append("\",")
        append("\"rows\":").append(rows).append(",")
        append("\"cols\":").append(cols).append(",")
        append("\"seed\":").append(seed).append(",")
        append("\"avgCellLen\":").append(avgCellLen).append(",")
        append("\"maxCellLen\":").append(maxCellLen).append(",")
        append("\"quoteRate\":").append(quoteRate).append(",")
        append("\"embeddedCommaRate\":").append(embeddedCommaRate).append(",")
        append("\"embeddedNewlineRate\":").append(embeddedNewlineRate).append(",")
        append("\"utf8MultiByteCellRate\":").append(utf8MultiByteCellRate).append(",")
        append("\"totalBytes\":").append(totalBytes)
        append("}")
    }

    // Escapes backslash, double quote and control characters (below U+0020),
    // the characters JSON does not allow unescaped inside a string.
    private fun escapeJson(s: String): String = buildString(s.length) {
        for (ch in s) {
            when {
                ch == '\\' -> append("\\\\")
                ch == '"' -> append("\\\"")
                ch < ' ' -> append("\\u").append(ch.code.toString(16).padStart(4, '0'))
                else -> append(ch)
            }
        }
    }
}
Original file line number Diff line number Diff line change
@@ -0,0 +1,15 @@
package com.jsoizo.kotlincsv.bench.shared

/**
 * Predefined dataset shapes.
 *
 * @property rows number of rows to generate
 * @property cols number of columns per row
 * @property quoteRate rate for inserting a double quote into a cell
 * @property embeddedCommaRate rate for inserting a comma into a cell
 * @property embeddedNewlineRate rate for inserting a newline into a cell
 * @property utf8MultiByteRate rate controlling multi-byte characters
 */
enum class DatasetSpec(
    val rows: Int,
    val cols: Int,
    val quoteRate: Double = 0.0,
    val embeddedCommaRate: Double = 0.0,
    val embeddedNewlineRate: Double = 0.0,
    val utf8MultiByteRate: Double = 0.0,
) {
    /** 1,000 x 10 cells, all rates zero. */
    SMALL(rows = 1_000, cols = 10),

    /** 100,000 x 20 cells, all rates zero. */
    MEDIUM(rows = 100_000, cols = 20),

    /** 1,000,000 x 10 cells, all rates zero. */
    LARGE(rows = 1_000_000, cols = 10),

    /** 10,000 x 10 cells with quotes, commas, newlines and multi-byte characters enabled. */
    HARD(
        rows = 10_000,
        cols = 10,
        quoteRate = 0.30,
        embeddedCommaRate = 0.10,
        embeddedNewlineRate = 0.05,
        utf8MultiByteRate = 0.20,
    ),
}
Original file line number Diff line number Diff line change
@@ -0,0 +1,49 @@
package com.jsoizo.kotlincsv.bench.shared

/** Captures a description of the runtime environment a benchmark runs in. */
object EnvProbe {
    /**
     * Environment values recorded by [snapshot].
     *
     * @property benchSide caller-supplied label passed to [snapshot]
     * @property timestampEpochMs value of `System.currentTimeMillis()` at capture time
     */
    data class Snapshot(
        val jdkVersion: String,
        val jdkVendor: String,
        val osName: String,
        val osVersion: String,
        val osArch: String,
        val availableProcessors: Int,
        val totalMemoryMB: Long,
        val kotlinStdlibVersion: String,
        val benchSide: String,
        val timestampEpochMs: Long,
    ) {
        /**
         * Serialises the snapshot as a single-line JSON object, keys in
         * declaration order. String values are quoted and passed through
         * [escape]; numeric values are written bare.
         */
        fun toJson(): String {
            fun text(value: String): String = "\"" + escape(value) + "\""

            val fields: List<Pair<String, String>> = listOf(
                "jdkVersion" to text(jdkVersion),
                "jdkVendor" to text(jdkVendor),
                "osName" to text(osName),
                "osVersion" to text(osVersion),
                "osArch" to text(osArch),
                "availableProcessors" to availableProcessors.toString(),
                "totalMemoryMB" to totalMemoryMB.toString(),
                "kotlinStdlibVersion" to text(kotlinStdlibVersion),
                "benchSide" to text(benchSide),
                "timestampEpochMs" to timestampEpochMs.toString(),
            )
            return fields.joinToString(separator = ",", prefix = "{", postfix = "}") { (key, rendered) ->
                "\"$key\":$rendered"
            }
        }

        // Prefixes every backslash and double quote with a backslash.
        private fun escape(s: String): String {
            val out = StringBuilder(s.length)
            for (ch in s) {
                if (ch == '\\' || ch == '"') out.append('\\')
                out.append(ch)
            }
            return out.toString()
        }
    }

    /**
     * Reads the JVM system properties, processor count, total memory (in MiB)
     * and Kotlin version, and stamps the result with the current time.
     * Missing system properties are recorded as empty strings.
     *
     * @param benchSide label stored unchanged in the snapshot
     */
    fun snapshot(benchSide: String): Snapshot {
        fun property(key: String): String = System.getProperty(key).orEmpty()

        val runtime = Runtime.getRuntime()
        val bytesPerMiB = 1024L * 1024L
        return Snapshot(
            jdkVersion = property("java.version"),
            jdkVendor = property("java.vendor"),
            osName = property("os.name"),
            osVersion = property("os.version"),
            osArch = property("os.arch"),
            availableProcessors = runtime.availableProcessors(),
            totalMemoryMB = runtime.totalMemory() / bytesPerMiB,
            kotlinStdlibVersion = KotlinVersion.CURRENT.toString(),
            benchSide = benchSide,
            timestampEpochMs = System.currentTimeMillis(),
        )
    }
}
Loading
Loading