36 changes: 36 additions & 0 deletions .changeset/per-column-codec.md
@@ -0,0 +1,36 @@
---
"@chkit/core": patch
"@chkit/clickhouse": patch
"@chkit/plugin-pull": patch
"chkit": patch
---

Add support for per-column compression codecs.

Declare a codec directly on a column with a structured discriminated union:

```ts
import { codec, table } from '@chkit/core'

const events = table({
database: 'analytics',
name: 'events',
columns: [
{ name: 'id', type: 'UInt64' },
{ name: 'ts', type: 'DateTime', codec: { kind: 'ZSTD', level: 3 } },
{ name: 'delta', type: 'Int64', codec: [{ kind: 'Delta', size: 4 }, { kind: 'ZSTD' }] },
{ name: 'exp', type: 'Float32', codec: codec.raw('SomeNewCodec(42)') },
],
engine: 'MergeTree()',
primaryKey: ['id'],
orderBy: ['id'],
})
```

Highlights:
- `CREATE TABLE` and `ALTER TABLE ADD/MODIFY COLUMN` emit the `CODEC(...)` clause in the correct position (after `DEFAULT` and `COMMENT`, as required by ClickHouse).
- `chkit generate` emits `MODIFY COLUMN ... REMOVE CODEC` when a codec is dropped and no other column fields change; otherwise a single `MODIFY COLUMN` replaces the codec.
- `chkit pull` introspects `system.columns.compression_codec` and renders structured codec objects back into the schema file. Unknown codec tokens fall back to `codec.raw(...)` so new ClickHouse codecs still round-trip.
- Canonicalization fills ClickHouse defaults (`ZSTD` → level 1, `LZ4HC` → level 9, `Delta`/`DoubleDelta`/`Gorilla` → size 1), so `{kind:'ZSTD'}` and `{kind:'ZSTD', level:1}` compare equal and the diff engine stays stable across pull → plan round-trips.
- Validation rejects codec chains with more than one general codec, chains that do not end with a general codec, and the empty chain (`codec: []`); a chain of only preprocessor codecs is accepted, since ClickHouse auto-appends the default general codec.
- `parseCodec` falls back to `raw` when a known codec token has unexpected extra args (e.g. `ZSTD(3, 1)`), so future ClickHouse codec extensions round-trip cleanly through `chkit pull` instead of silently dropping arguments.
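The default-filling canonicalization described above can be sketched roughly as follows. This is a simplified, self-contained sketch: the step shapes mirror the examples earlier in this changeset, but the actual `canonicalizeCodec` in `@chkit/core` may differ in detail.

```ts
// Assumed codec step shape, based on the examples above.
type CodecStep =
  | { kind: 'ZSTD'; level?: number }
  | { kind: 'LZ4' }
  | { kind: 'LZ4HC'; level?: number }
  | { kind: 'Delta'; size?: number }
  | { kind: 'DoubleDelta'; size?: number }
  | { kind: 'Gorilla'; size?: number }
  | { kind: 'Raw'; sql: string }

// Fill ClickHouse's documented defaults so equivalent declarations
// compare equal after canonicalization.
function canonicalizeStep(step: CodecStep): CodecStep {
  switch (step.kind) {
    case 'ZSTD':
      return { kind: 'ZSTD', level: step.level ?? 1 }
    case 'LZ4HC':
      return { kind: 'LZ4HC', level: step.level ?? 9 }
    case 'Delta':
    case 'DoubleDelta':
    case 'Gorilla':
      return { kind: step.kind, size: step.size ?? 1 }
    default:
      return step
  }
}

// A single step and a chain canonicalize to the same array form.
function canonicalizeCodec(codec: CodecStep | CodecStep[]): CodecStep[] {
  return (Array.isArray(codec) ? codec : [codec]).map(canonicalizeStep)
}
```

With defaults filled, `{ kind: 'ZSTD' }` and `{ kind: 'ZSTD', level: 1 }` produce identical canonical forms, which is what keeps the diff engine stable across pull → plan round-trips.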
36 changes: 36 additions & 0 deletions .changeset/skip-index-structured-args.md
@@ -0,0 +1,36 @@
---
"@chkit/core": major
"@chkit/clickhouse": major
"@chkit/plugin-pull": major
"chkit": major
---

**Breaking:** Skip indexes now use a structured discriminated union instead of a free-form `typeArgs: string` field. Each index `type` has its own typed fields, which moves argument validation from runtime to the type system.

```ts
indexes: [
// before
{ name: 'idx_set', expression: 'source', type: 'set', typeArgs: '0', granularity: 1 },
// after
{ name: 'idx_set', expression: 'source', type: 'set', maxRows: 0, granularity: 1 },
]
```

### Migration guide

| Old (`typeArgs`) | New (structured) |
| ---------------------------------------------- | ----------------------------------------------------------------------------- |
| `type: 'minmax'` | `type: 'minmax'` |
| `type: 'set', typeArgs: '0'` | `type: 'set', maxRows: 0` |
| `type: 'set', typeArgs: '1000'` | `type: 'set', maxRows: 1000` |
| `type: 'bloom_filter'` | `type: 'bloom_filter'` |
| `type: 'bloom_filter', typeArgs: '0.01'` | `type: 'bloom_filter', falsePositiveRate: 0.01` |
| `type: 'tokenbf_v1', typeArgs: '32768, 3, 0'` | `type: 'tokenbf_v1', sizeBytes: 32768, hashFunctions: 3, randomSeed: 0` |
| `type: 'ngrambf_v1', typeArgs: '3, 256, 2, 0'` | `type: 'ngrambf_v1', ngramSize: 3, sizeBytes: 256, hashFunctions: 2, randomSeed: 0` |

Highlights:
- `set` now requires `maxRows` at the type level — forgetting it is a TypeScript error rather than a runtime validation failure.
- `tokenbf_v1` and `ngrambf_v1` have typed `sizeBytes`, `hashFunctions`, `randomSeed` (and `ngramSize` for ngram), so positional argument mistakes are caught at compile time.
- `bloom_filter` keeps `falsePositiveRate` optional — omit it to emit a bare `bloom_filter` clause.
- `chkit pull` now introspects `system.data_skipping_indices.type_full` and emits the structured fields back into schema files; unknown types still round-trip via the existing path.
- The `index_type_missing_args` validation code is removed since it is now a compile-time concern.
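The mapping from structured fields back to ClickHouse's index type syntax is mechanical. A simplified, self-contained sketch (the fingerprint helper this PR adds in `packages/cli/src/drift.ts` follows the same shape):

```ts
// Structured skip-index union, as introduced by this changeset.
type SkipIndexType =
  | { type: 'minmax' }
  | { type: 'set'; maxRows: number }
  | { type: 'bloom_filter'; falsePositiveRate?: number }
  | { type: 'tokenbf_v1'; sizeBytes: number; hashFunctions: number; randomSeed: number }
  | { type: 'ngrambf_v1'; ngramSize: number; sizeBytes: number; hashFunctions: number; randomSeed: number }

// Render the TYPE clause ClickHouse expects for each variant.
function renderIndexType(index: SkipIndexType): string {
  switch (index.type) {
    case 'minmax':
      return 'minmax'
    case 'set':
      return `set(${index.maxRows})`
    case 'bloom_filter':
      return index.falsePositiveRate !== undefined
        ? `bloom_filter(${index.falsePositiveRate})`
        : 'bloom_filter'
    case 'tokenbf_v1':
      return `tokenbf_v1(${index.sizeBytes}, ${index.hashFunctions}, ${index.randomSeed})`
    case 'ngrambf_v1':
      return `ngrambf_v1(${index.ngramSize}, ${index.sizeBytes}, ${index.hashFunctions}, ${index.randomSeed})`
  }
}
```

Each row of the migration table above corresponds to one branch here, e.g. `{ type: 'set', maxRows: 0 }` renders as `set(0)`.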
26 changes: 22 additions & 4 deletions apps/docs/src/content/docs/schema/dsl-reference.md
@@ -72,7 +72,7 @@ const events = table({
ttl: 'received_at + INTERVAL 90 DAY',
settings: { index_granularity: 8192 },
indexes: [
{ name: 'idx_source', expression: 'source', type: 'set', typeArgs: '0', granularity: 1 },
{ name: 'idx_source', expression: 'source', type: 'set', maxRows: 0, granularity: 1 },
],
projections: [
{ name: 'p_recent', query: 'SELECT id ORDER BY received_at DESC LIMIT 10' },
@@ -160,20 +160,38 @@ Previous column name for rename tracking. See [Rename support](#rename-support).

## Skip indexes

Each entry in the `indexes` array is a `SkipIndexDefinition`.
Each entry in the `indexes` array is a `SkipIndexDefinition`. The shared base fields are:

| Field | Type | Description |
|-------|------|-------------|
| `name` | `string` | Index name |
| `expression` | `string` | Indexed expression |
| `type` | `'minmax' \| 'set' \| 'bloom_filter' \| 'tokenbf_v1' \| 'ngrambf_v1'` | Index type |
| `typeArgs` | `string`, optional | Arguments for parameterized index types. **Required** for `set`, `tokenbf_v1`, `ngrambf_v1` (ClickHouse 26+) |
| `granularity` | `number` | Index granularity |

Type-specific fields:

| Type | Required fields | Optional fields | Notes |
|------|-----------------|-----------------|-------|
| `minmax` | — | — | No arguments |
| `set` | `maxRows: number` | — | `maxRows: 0` stores all unique values (ClickHouse 26+ requires `set(0)` rather than bare `set`) |
| `bloom_filter` | — | `falsePositiveRate: number` | Defaults to `0.025` when omitted |
| `tokenbf_v1` | `sizeBytes`, `hashFunctions`, `randomSeed` (all `number`) | — | Maps to `tokenbf_v1(size_bytes, n_hash, seed)` |
| `ngrambf_v1` | `ngramSize`, `sizeBytes`, `hashFunctions`, `randomSeed` (all `number`) | — | Maps to `ngrambf_v1(n, size_bytes, n_hash, seed)` |

```ts
indexes: [
{ name: 'idx_source', expression: 'source', type: 'set', typeArgs: '0', granularity: 1 },
{ name: 'idx_source', expression: 'source', type: 'set', maxRows: 0, granularity: 1 },
{ name: 'idx_ts', expression: 'received_at', type: 'minmax', granularity: 3 },
{
name: 'idx_body',
expression: 'body',
type: 'tokenbf_v1',
sizeBytes: 256,
hashFunctions: 2,
randomSeed: 0,
granularity: 1,
},
]
```

2 changes: 1 addition & 1 deletion packages/cli/src/drift.test.ts
@@ -212,7 +212,7 @@ describe('@chkit/cli drift comparer', () => {
{ name: 'source', type: 'String' },
],
settings: { index_granularity: '4096' },
indexes: [{ name: 'idx_source', expression: 'source', type: 'set', typeArgs: '0', granularity: 1 }],
indexes: [{ name: 'idx_source', expression: 'source', type: 'set', maxRows: 0, granularity: 1 }],
projections: [{ name: 'p_fresh', query: 'SELECT id ORDER BY id LIMIT 5' }],
ttl: undefined,
})
20 changes: 18 additions & 2 deletions packages/cli/src/drift.ts
@@ -201,11 +201,27 @@ function normalizeColumnShape(column: ColumnDefinition): string {
return parts.join('|')
}

function renderIndexTypeFingerprint(index: SkipIndexDefinition): string {
switch (index.type) {
case 'minmax':
return 'minmax'
case 'set':
return `set(${index.maxRows})`
case 'bloom_filter':
return index.falsePositiveRate !== undefined
? `bloom_filter(${index.falsePositiveRate})`
: 'bloom_filter'
case 'tokenbf_v1':
return `tokenbf_v1(${index.sizeBytes}, ${index.hashFunctions}, ${index.randomSeed})`
case 'ngrambf_v1':
return `ngrambf_v1(${index.ngramSize}, ${index.sizeBytes}, ${index.hashFunctions}, ${index.randomSeed})`
}
}

function normalizeIndexShape(index: SkipIndexDefinition): string {
const typeStr = index.typeArgs !== undefined ? `${index.type}(${index.typeArgs})` : index.type
return [
`expr=${normalizeSQLFragment(index.expression)}`,
`type=${typeStr}`,
`type=${renderIndexTypeFingerprint(index)}`,
`granularity=${index.granularity}`,
].join('|')
}
56 changes: 47 additions & 9 deletions packages/clickhouse/src/index.ts
@@ -1,6 +1,7 @@
import { createClient, type ClickHouseSettings } from '@clickhouse/client'
import {
normalizeSQLFragment,
parseCodec,
type ChxConfig,
type ColumnDefinition,
type ProjectionDefinition,
@@ -77,6 +78,7 @@ export interface SystemColumnRow {
default_expression?: string
comment?: string
position: number
compression_codec?: string
}

export interface SystemSkippingIndexRow {
@@ -130,31 +132,67 @@ export function normalizeColumnFromSystemRow(row: SystemColumnRow): ColumnDefini
if (row.default_expression && row.default_kind === 'DEFAULT') {
defaultValue = normalizeSQLFragment(row.default_expression)
}
const codecSteps = parseCodec(row.compression_codec)
return {
name: row.name,
type,
nullable: nullable || undefined,
default: defaultValue,
comment: row.comment?.trim() || undefined,
codec: codecSteps,
}
}

function parseIndexType(value: string): Pick<SkipIndexDefinition, 'type' | 'typeArgs'> {
type ParsedIndexShape =
| { type: 'minmax' }
| { type: 'set'; maxRows: number }
| { type: 'bloom_filter'; falsePositiveRate?: number }
| { type: 'tokenbf_v1'; sizeBytes: number; hashFunctions: number; randomSeed: number }
| {
type: 'ngrambf_v1'
ngramSize: number
sizeBytes: number
hashFunctions: number
randomSeed: number
}

function splitArgs(args: string | undefined): number[] {
if (args === undefined) return []
return args
.split(',')
.map((part) => Number(part.trim()))
.filter((value) => !Number.isNaN(value))
}

function parseIndexType(value: string): ParsedIndexShape {
const match = value.match(/^(\w+)\((.+)\)$/)
const baseName = match?.[1] ?? value
const args = match?.[2]
const args = splitArgs(match?.[2])

switch (baseName) {
case 'minmax':
return args !== undefined ? { type: 'minmax', typeArgs: args } : { type: 'minmax' }
return { type: 'minmax' }
case 'bloom_filter':
return args !== undefined ? { type: 'bloom_filter', typeArgs: args } : { type: 'bloom_filter' }
return args.length > 0
? { type: 'bloom_filter', falsePositiveRate: args[0]! }
: { type: 'bloom_filter' }
case 'tokenbf_v1':
return { type: 'tokenbf_v1', typeArgs: args ?? '0' }
return {
type: 'tokenbf_v1',
sizeBytes: args[0] ?? 0,
hashFunctions: args[1] ?? 0,
randomSeed: args[2] ?? 0,
}
case 'ngrambf_v1':
return { type: 'ngrambf_v1', typeArgs: args ?? '0' }
return {
type: 'ngrambf_v1',
ngramSize: args[0] ?? 0,
sizeBytes: args[1] ?? 0,
hashFunctions: args[2] ?? 0,
randomSeed: args[3] ?? 0,
}
default:
return { type: 'set', typeArgs: args ?? '0' }
return { type: 'set', maxRows: args[0] ?? 0 }
}
}

@@ -165,7 +203,7 @@ export function normalizeIndexFromSystemRow(row: SystemSkippingIndexRow): SkipIn
expression: normalizeSQLFragment(row.expr),
granularity: row.granularity,
...parsed,
} as SkipIndexDefinition
}
}

export function buildIntrospectedTables(
@@ -484,7 +522,7 @@ WHERE is_temporary = 0
AND database IN (${quotedDatabases})`
)
const columns = await this.query<SystemColumnRow>(
`SELECT database, table, name, type, default_kind, default_expression, comment, position
`SELECT database, table, name, type, default_kind, default_expression, comment, position, compression_codec
FROM system.columns
WHERE database IN (${quotedDatabases})`
)
2 changes: 2 additions & 0 deletions packages/core/src/canonical.ts
@@ -10,6 +10,7 @@ import type {
} from './model.js'
import { normalizeKeyColumns } from './key-clause.js'
import { isSchemaDefinition } from './model.js'
import { canonicalizeCodec } from './codec.js'
import { normalizeEngine, normalizeSQLFragment } from './sql-normalizer.js'

function sortByName<T extends { name: string }>(items: T[]): T[] {
@@ -29,6 +30,7 @@ function canonicalizeColumn(column: ColumnDefinition): ColumnDefinition {
renamedFrom: column.renamedFrom?.trim(),
type: typeof column.type === 'string' ? column.type.trim() : column.type,
comment: column.comment?.trim(),
codec: column.codec ? canonicalizeCodec(column.codec) : undefined,
}
}
