diff --git a/.changeset/per-column-codec.md b/.changeset/per-column-codec.md new file mode 100644 index 0000000..5e671e2 --- /dev/null +++ b/.changeset/per-column-codec.md @@ -0,0 +1,36 @@ +--- +"@chkit/core": patch +"@chkit/clickhouse": patch +"@chkit/plugin-pull": patch +"chkit": patch +--- + +Add support for per-column compression codecs. + +Declare a codec directly on a column with a structured discriminated union: + +```ts +import { codec, table } from '@chkit/core' + +const events = table({ + database: 'analytics', + name: 'events', + columns: [ + { name: 'id', type: 'UInt64' }, + { name: 'ts', type: 'DateTime', codec: { kind: 'ZSTD', level: 3 } }, + { name: 'delta', type: 'Int64', codec: [{ kind: 'Delta', size: 4 }, { kind: 'ZSTD' }] }, + { name: 'exp', type: 'Float32', codec: codec.raw('SomeNewCodec(42)') }, + ], + engine: 'MergeTree()', + primaryKey: ['id'], + orderBy: ['id'], +}) +``` + +Highlights: +- `CREATE TABLE` and `ALTER TABLE ADD/MODIFY COLUMN` emit the `CODEC(...)` clause in the correct position (after `DEFAULT` and `COMMENT`, as required by ClickHouse). +- `chkit generate` emits `MODIFY COLUMN ... REMOVE CODEC` when a codec is dropped and no other column fields change; otherwise a single `MODIFY COLUMN` replaces the codec. +- `chkit pull` introspects `system.columns.compression_codec` and renders structured codec objects back into the schema file. Unknown codec tokens fall back to `codec.raw(...)` so new ClickHouse codecs still round-trip. +- Canonicalization fills ClickHouse defaults (`ZSTD` → level 1, `LZ4HC` → level 9, `Delta`/`DoubleDelta`/`Gorilla` → size 1), so `{kind:'ZSTD'}` and `{kind:'ZSTD', level:1}` compare equal and the diff engine stays stable across pull → plan round-trips. +- Validation catches codec chains with more than one general codec, chains that do not end with a general codec, or an empty chain (`codec: []`) — preprocessor alone is accepted since ClickHouse auto-appends the default general codec. +- `parseCodec` falls back to `raw` when a known codec token has unexpected extra args (e.g. `ZSTD(3, 1)`), so future ClickHouse codec extensions round-trip cleanly through `chkit pull` instead of silently dropping arguments. diff --git a/.changeset/skip-index-structured-args.md b/.changeset/skip-index-structured-args.md new file mode 100644 index 0000000..dd4ed55 --- /dev/null +++ b/.changeset/skip-index-structured-args.md @@ -0,0 +1,36 @@ +--- +"@chkit/core": major +"@chkit/clickhouse": major +"@chkit/plugin-pull": major +"chkit": major +--- + +**Breaking:** Skip indexes now use a structured discriminated union instead of a free-form `typeArgs: string` field. Each index `type` has its own typed fields, which moves argument validation from runtime to the type system. + +```ts +indexes: [ + // before + { name: 'idx_set', expression: 'source', type: 'set', typeArgs: '0', granularity: 1 }, + // after + { name: 'idx_set', expression: 'source', type: 'set', maxRows: 0, granularity: 1 }, +] +``` + +### Migration guide + +| Old (`typeArgs`) | New (structured) | +| ---------------------------------------------- | ----------------------------------------------------------------------------- | +| `type: 'minmax'` | `type: 'minmax'` | +| `type: 'set', typeArgs: '0'` | `type: 'set', maxRows: 0` | +| `type: 'set', typeArgs: '1000'` | `type: 'set', maxRows: 1000` | +| `type: 'bloom_filter'` | `type: 'bloom_filter'` | +| `type: 'bloom_filter', typeArgs: '0.01'` | `type: 'bloom_filter', falsePositiveRate: 0.01` | +| `type: 'tokenbf_v1', typeArgs: '32768, 3, 0'` | `type: 'tokenbf_v1', sizeBytes: 32768, hashFunctions: 3, randomSeed: 0` | +| `type: 'ngrambf_v1', typeArgs: '3, 256, 2, 0'` | `type: 'ngrambf_v1', ngramSize: 3, sizeBytes: 256, hashFunctions: 2, randomSeed: 0` | + +Highlights: +- `set` now requires `maxRows` at the type level — forgetting it is a TypeScript error rather than a runtime validation failure. +- `tokenbf_v1` and `ngrambf_v1` have typed `sizeBytes`, `hashFunctions`, `randomSeed` (and `ngramSize` for ngram), so positional argument mistakes are caught at compile time. +- `bloom_filter` keeps `falsePositiveRate` optional — omit it to emit a bare `bloom_filter` clause. +- `chkit pull` now introspects `system.data_skipping_indices.type_full` and emits the structured fields back into schema files; unknown types still round-trip via the existing path. +- The `index_type_missing_args` validation code is removed since it is now a compile-time concern. diff --git a/apps/docs/src/content/docs/schema/dsl-reference.md b/apps/docs/src/content/docs/schema/dsl-reference.md index 0e8b327..f92f3e0 100644 --- a/apps/docs/src/content/docs/schema/dsl-reference.md +++ b/apps/docs/src/content/docs/schema/dsl-reference.md @@ -72,7 +72,7 @@ const events = table({ ttl: 'received_at + INTERVAL 90 DAY', settings: { index_granularity: 8192 }, indexes: [ - { name: 'idx_source', expression: 'source', type: 'set', typeArgs: '0', granularity: 1 }, + { name: 'idx_source', expression: 'source', type: 'set', maxRows: 0, granularity: 1 }, ], projections: [ { name: 'p_recent', query: 'SELECT id ORDER BY received_at DESC LIMIT 10' }, @@ -160,20 +160,38 @@ Previous column name for rename tracking. See [Rename support](#rename-support). ## Skip indexes -Each entry in the `indexes` array is a `SkipIndexDefinition`. +Each entry in the `indexes` array is a `SkipIndexDefinition`. The shared base fields are: | Field | Type | Description | |-------|------|-------------| | `name` | `string` | Index name | | `expression` | `string` | Indexed expression | | `type` | `'minmax' \| 'set' \| 'bloom_filter' \| 'tokenbf_v1' \| 'ngrambf_v1'` | Index type | -| `typeArgs` | `string`, optional | Arguments for parameterized index types. **Required** for `set`, `tokenbf_v1`, `ngrambf_v1` (ClickHouse 26+) | | `granularity` | `number` | Index granularity | +Type-specific fields: + +| Type | Required fields | Optional fields | Notes | +|------|-----------------|-----------------|-------| +| `minmax` | — | — | No arguments | +| `set` | `maxRows: number` | — | `maxRows: 0` stores all unique values (ClickHouse 26+ requires `set(0)` rather than bare `set`) | +| `bloom_filter` | — | `falsePositiveRate: number` | Defaults to `0.025` when omitted | +| `tokenbf_v1` | `sizeBytes`, `hashFunctions`, `randomSeed` (all `number`) | — | Maps to `tokenbf_v1(size_bytes, n_hash, seed)` | +| `ngrambf_v1` | `ngramSize`, `sizeBytes`, `hashFunctions`, `randomSeed` (all `number`) | — | Maps to `ngrambf_v1(n, size_bytes, n_hash, seed)` | + ```ts indexes: [ - { name: 'idx_source', expression: 'source', type: 'set', typeArgs: '0', granularity: 1 }, + { name: 'idx_source', expression: 'source', type: 'set', maxRows: 0, granularity: 1 }, { name: 'idx_ts', expression: 'received_at', type: 'minmax', granularity: 3 }, + { + name: 'idx_body', + expression: 'body', + type: 'tokenbf_v1', + sizeBytes: 256, + hashFunctions: 2, + randomSeed: 0, + granularity: 1, + }, ] ``` diff --git a/packages/cli/src/drift.test.ts b/packages/cli/src/drift.test.ts index 4d6cf67..6627245 100644 --- a/packages/cli/src/drift.test.ts +++ b/packages/cli/src/drift.test.ts @@ -212,7 +212,7 @@ describe('@chkit/cli drift comparer', () => { { name: 'source', type: 'String' }, ], settings: { index_granularity: '4096' }, - indexes: [{ name: 'idx_source', expression: 'source', type: 'set', typeArgs: '0', granularity: 1 }], + indexes: [{ name: 'idx_source', expression: 'source', type: 'set', maxRows: 0, granularity: 1 }], projections: [{ name: 'p_fresh', query: 'SELECT id ORDER BY id LIMIT 5' }], ttl: undefined, }) diff --git a/packages/cli/src/drift.ts b/packages/cli/src/drift.ts index c0166ef..008d08f 100644 --- a/packages/cli/src/drift.ts +++ b/packages/cli/src/drift.ts @@ -201,11 +201,27 @@ function normalizeColumnShape(column: ColumnDefinition): string { return parts.join('|') } +function renderIndexTypeFingerprint(index: SkipIndexDefinition): string { + switch (index.type) { + case 'minmax': + return 'minmax' + case 'set': + return `set(${index.maxRows})` + case 'bloom_filter': + return index.falsePositiveRate !== undefined + ? `bloom_filter(${index.falsePositiveRate})` + : 'bloom_filter' + case 'tokenbf_v1': + return `tokenbf_v1(${index.sizeBytes}, ${index.hashFunctions}, ${index.randomSeed})` + case 'ngrambf_v1': + return `ngrambf_v1(${index.ngramSize}, ${index.sizeBytes}, ${index.hashFunctions}, ${index.randomSeed})` + } +} + function normalizeIndexShape(index: SkipIndexDefinition): string { - const typeStr = index.typeArgs !== undefined ? `${index.type}(${index.typeArgs})` : index.type return [ `expr=${normalizeSQLFragment(index.expression)}`, - `type=${typeStr}`, + `type=${renderIndexTypeFingerprint(index)}`, `granularity=${index.granularity}`, ].join('|') } diff --git a/packages/clickhouse/src/index.ts b/packages/clickhouse/src/index.ts index 0daccea..2baa648 100644 --- a/packages/clickhouse/src/index.ts +++ b/packages/clickhouse/src/index.ts @@ -1,6 +1,7 @@ import { createClient, type ClickHouseSettings } from '@clickhouse/client' import { normalizeSQLFragment, + parseCodec, type ChxConfig, type ColumnDefinition, type ProjectionDefinition, @@ -77,6 +78,7 @@ export interface SystemColumnRow { default_expression?: string comment?: string position: number + compression_codec?: string } export interface SystemSkippingIndexRow { @@ -130,31 +132,67 @@ export function normalizeColumnFromSystemRow(row: SystemColumnRow): ColumnDefini if (row.default_expression && row.default_kind === 'DEFAULT') { defaultValue = normalizeSQLFragment(row.default_expression) } + const codecSteps = parseCodec(row.compression_codec) return { name: row.name, type, nullable: nullable || undefined, default: defaultValue, comment: row.comment?.trim() || undefined, + codec: codecSteps, } } -function parseIndexType(value: string): Pick { +type ParsedIndexShape = + | { type: 'minmax' } + | { type: 'set'; maxRows: number } + | { type: 'bloom_filter'; falsePositiveRate?: number } + | { type: 'tokenbf_v1'; sizeBytes: number; hashFunctions: number; randomSeed: number } + | { + type: 'ngrambf_v1' + ngramSize: number + sizeBytes: number + hashFunctions: number + randomSeed: number + } + +function splitArgs(args: string | undefined): number[] { + if (args === undefined) return [] + return args + .split(',') + .map((part) => Number(part.trim())) + .filter((value) => !Number.isNaN(value)) +} + +function parseIndexType(value: string): ParsedIndexShape { const match = value.match(/^(\w+)\((.+)\)$/) const baseName = match?.[1] ?? value - const args = match?.[2] + const args = splitArgs(match?.[2]) switch (baseName) { case 'minmax': - return args !== undefined ? { type: 'minmax', typeArgs: args } : { type: 'minmax' } + return { type: 'minmax' } case 'bloom_filter': - return args !== undefined ? { type: 'bloom_filter', typeArgs: args } : { type: 'bloom_filter' } + return args.length > 0 + ? { type: 'bloom_filter', falsePositiveRate: args[0]! } + : { type: 'bloom_filter' } case 'tokenbf_v1': - return { type: 'tokenbf_v1', typeArgs: args ?? '0' } + return { + type: 'tokenbf_v1', + sizeBytes: args[0] ?? 0, + hashFunctions: args[1] ?? 0, + randomSeed: args[2] ?? 0, + } case 'ngrambf_v1': - return { type: 'ngrambf_v1', typeArgs: args ?? '0' } + return { + type: 'ngrambf_v1', + ngramSize: args[0] ?? 0, + sizeBytes: args[1] ?? 0, + hashFunctions: args[2] ?? 0, + randomSeed: args[3] ?? 0, + } default: - return { type: 'set', typeArgs: args ?? '0' } + return { type: 'set', maxRows: args[0] ?? 0 } } } @@ -165,7 +203,7 @@ export function normalizeIndexFromSystemRow(row: SystemSkippingIndexRow): SkipIn expression: normalizeSQLFragment(row.expr), granularity: row.granularity, ...parsed, - } as SkipIndexDefinition + } } export function buildIntrospectedTables( @@ -484,7 +522,7 @@ WHERE is_temporary = 0 AND database IN (${quotedDatabases})` ) const columns = await this.query( - `SELECT database, table, name, type, default_kind, default_expression, comment, position + `SELECT database, table, name, type, default_kind, default_expression, comment, position, compression_codec FROM system.columns WHERE database IN (${quotedDatabases})` ) diff --git a/packages/core/src/canonical.ts b/packages/core/src/canonical.ts index 2c6d47f..c809133 100644 --- a/packages/core/src/canonical.ts +++ b/packages/core/src/canonical.ts @@ -10,6 +10,7 @@ import type { } from './model.js' import { normalizeKeyColumns } from './key-clause.js' import { isSchemaDefinition } from './model.js' +import { canonicalizeCodec } from './codec.js' import { normalizeEngine, normalizeSQLFragment } from './sql-normalizer.js' function sortByName(items: T[]): T[] { @@ -29,6 +30,7 @@ function canonicalizeColumn(column: ColumnDefinition): ColumnDefinition { renamedFrom: column.renamedFrom?.trim(), type: typeof column.type === 'string' ? column.type.trim() : column.type, comment: column.comment?.trim(), + codec: column.codec ? canonicalizeCodec(column.codec) : undefined, } } diff --git a/packages/core/src/codec.test.ts b/packages/core/src/codec.test.ts new file mode 100644 index 0000000..1c242e2 --- /dev/null +++ b/packages/core/src/codec.test.ts @@ -0,0 +1,190 @@ +import { describe, expect, test } from 'bun:test' + +import { + canonicalizeCodec, + codec, + codecsEqual, + parseCodec, + renderCodec, +} from './codec.js' + +describe('renderCodec', () => { + test('renders single general codec without level', () => { + expect(renderCodec({ kind: 'LZ4' })).toBe('CODEC(LZ4)') + }) + + test('renders ZSTD with explicit level', () => { + expect(renderCodec({ kind: 'ZSTD', level: 3 })).toBe('CODEC(ZSTD(3))') + }) + + test('renders ZSTD without level (bare name)', () => { + expect(renderCodec({ kind: 'ZSTD' })).toBe('CODEC(ZSTD)') + }) + + test('renders LZ4HC with level', () => { + expect(renderCodec({ kind: 'LZ4HC', level: 9 })).toBe('CODEC(LZ4HC(9))') + }) + + test('renders chain [Delta, ZSTD]', () => { + expect(renderCodec([{ kind: 'Delta', size: 4 }, { kind: 'ZSTD', level: 3 }])).toBe( + 'CODEC(Delta(4), ZSTD(3))' + ) + }) + + test('renders FPC with both args', () => { + expect(renderCodec({ kind: 'FPC', level: 10, floatSize: 4 })).toBe('CODEC(FPC(10, 4))') + }) + + test('renders NONE / T64 / GCD / ALP bare', () => { + expect(renderCodec({ kind: 'NONE' })).toBe('CODEC(NONE)') + expect(renderCodec({ kind: 'T64' })).toBe('CODEC(T64)') + expect(renderCodec({ kind: 'GCD' })).toBe('CODEC(GCD)') + expect(renderCodec({ kind: 'ALP' })).toBe('CODEC(ALP)') + }) + + test('renders raw verbatim', () => { + expect(renderCodec(codec.raw('SomeNewCodec(42)'))).toBe('CODEC(SomeNewCodec(42))') + }) + + test('renders raw embedded in chain', () => { + expect( + renderCodec([{ kind: 'Delta', size: 4 }, codec.raw('SomeNewCodec(42)')]) + ).toBe('CODEC(Delta(4), SomeNewCodec(42))') + }) +}) + +describe('parseCodec', () => { + test('parses empty / undefined / null to undefined', () => { + expect(parseCodec('')).toBeUndefined() + expect(parseCodec(undefined)).toBeUndefined() + expect(parseCodec(null)).toBeUndefined() + }) + + test('parses bare ZSTD', () => { + expect(parseCodec('CODEC(ZSTD)')).toEqual([{ kind: 'ZSTD' }]) + }) + + test('parses ZSTD with level', () => { + expect(parseCodec('CODEC(ZSTD(3))')).toEqual([{ kind: 'ZSTD', level: 3 }]) + }) + + test('parses LZ4HC with level', () => { + expect(parseCodec('CODEC(LZ4HC(9))')).toEqual([{ kind: 'LZ4HC', level: 9 }]) + }) + + test('parses Delta, ZSTD chain', () => { + expect(parseCodec('CODEC(Delta(4), ZSTD(1))')).toEqual([ + { kind: 'Delta', size: 4 }, + { kind: 'ZSTD', level: 1 }, + ]) + }) + + test('parses FPC with both args', () => { + expect(parseCodec('CODEC(FPC(10, 4))')).toEqual([ + { kind: 'FPC', level: 10, floatSize: 4 }, + ]) + }) + + test('parses general codecs NONE/T64/GCD/ALP bare', () => { + expect(parseCodec('CODEC(NONE)')).toEqual([{ kind: 'NONE' }]) + expect(parseCodec('CODEC(T64)')).toEqual([{ kind: 'T64' }]) + expect(parseCodec('CODEC(GCD)')).toEqual([{ kind: 'GCD' }]) + expect(parseCodec('CODEC(ALP)')).toEqual([{ kind: 'ALP' }]) + }) + + test('falls back to raw for unknown codec tokens', () => { + expect(parseCodec('CODEC(SomeNewCodec(42))')).toEqual([ + { kind: 'raw', expression: 'SomeNewCodec(42)' }, + ]) + }) + + test('raw fallback round-trips through renderCodec', () => { + const parsed = parseCodec('CODEC(SomeNewCodec(42))') + expect(parsed).toBeDefined() + expect(renderCodec(parsed!)).toBe('CODEC(SomeNewCodec(42))') + }) + + test('falls back to raw when known codec has unexpected extra args', () => { + expect(parseCodec('CODEC(ZSTD(3, 1))')).toEqual([ + { kind: 'raw', expression: 'ZSTD(3, 1)' }, + ]) + expect(parseCodec('CODEC(LZ4HC(9, 1))')).toEqual([ + { kind: 'raw', expression: 'LZ4HC(9, 1)' }, + ]) + expect(parseCodec('CODEC(Delta(4, 2))')).toEqual([ + { kind: 'raw', expression: 'Delta(4, 2)' }, + ]) + expect(parseCodec('CODEC(LZ4(1))')).toEqual([ + { kind: 'raw', expression: 'LZ4(1)' }, + ]) + }) +}) + +describe('canonicalizeCodec', () => { + test('fills in ZSTD default level', () => { + expect(canonicalizeCodec({ kind: 'ZSTD' })).toEqual([{ kind: 'ZSTD', level: 1 }]) + }) + + test('fills in LZ4HC default level', () => { + expect(canonicalizeCodec({ kind: 'LZ4HC' })).toEqual([{ kind: 'LZ4HC', level: 9 }]) + }) + + test('fills in Delta/DoubleDelta/Gorilla default size', () => { + expect(canonicalizeCodec({ kind: 'Delta' })).toEqual([{ kind: 'Delta', size: 1 }]) + expect(canonicalizeCodec({ kind: 'DoubleDelta' })).toEqual([ + { kind: 'DoubleDelta', size: 1 }, + ]) + expect(canonicalizeCodec({ kind: 'Gorilla' })).toEqual([{ kind: 'Gorilla', size: 1 }]) + }) + + test('trims raw expression whitespace', () => { + expect(canonicalizeCodec(codec.raw(' SomeNewCodec(42) '))).toEqual([ + { kind: 'raw', expression: 'SomeNewCodec(42)' }, + ]) + }) + + test('normalizes single-step to array form', () => { + expect(canonicalizeCodec({ kind: 'LZ4' })).toEqual([{ kind: 'LZ4' }]) + }) + + test('preserves chain order', () => { + expect( + canonicalizeCodec([{ kind: 'Delta', size: 4 }, { kind: 'ZSTD', level: 3 }]) + ).toEqual([ + { kind: 'Delta', size: 4 }, + { kind: 'ZSTD', level: 3 }, + ]) + }) +}) + +describe('codecsEqual', () => { + test('both undefined → equal', () => { + expect(codecsEqual(undefined, undefined)).toBe(true) + }) + + test('one undefined → not equal', () => { + expect(codecsEqual(undefined, { kind: 'LZ4' })).toBe(false) + expect(codecsEqual({ kind: 'LZ4' }, undefined)).toBe(false) + }) + + test('ZSTD vs ZSTD(1) compare equal after canonicalization', () => { + expect(codecsEqual({ kind: 'ZSTD' }, { kind: 'ZSTD', level: 1 })).toBe(true) + }) + + test('ZSTD vs ZSTD(3) not equal', () => { + expect(codecsEqual({ kind: 'ZSTD' }, { kind: 'ZSTD', level: 3 })).toBe(false) + }) + + test('single-step vs array form with same content are equal', () => { + expect(codecsEqual({ kind: 'LZ4' }, [{ kind: 'LZ4' }])).toBe(true) + }) + + test('chain order matters', () => { + expect( + codecsEqual( + [{ kind: 'Delta', size: 4 }, { kind: 'ZSTD' }], + [{ kind: 'ZSTD' }, { kind: 'Delta', size: 4 }] + ) + ).toBe(false) + }) +}) diff --git a/packages/core/src/codec.ts b/packages/core/src/codec.ts new file mode 100644 index 0000000..f0c5134 --- /dev/null +++ b/packages/core/src/codec.ts @@ -0,0 +1,193 @@ +import type { + ColumnCodec, + ColumnCodecSpec, + GeneralColumnCodec, + PreprocessingColumnCodec, + RawColumnCodec, +} from './model-types.js' + +const GENERAL_KINDS = new Set(['NONE', 'LZ4', 'LZ4HC', 'ZSTD', 'T64', 'GCD', 'ALP']) +const PREPROCESSOR_KINDS = new Set(['Delta', 'DoubleDelta', 'Gorilla', 'FPC']) + +function toArray(spec: ColumnCodecSpec): ColumnCodec[] { + return Array.isArray(spec) ? spec : [spec] +} + +export function isGeneralCodec(codec: ColumnCodec): codec is GeneralColumnCodec { + return GENERAL_KINDS.has(codec.kind) +} + +export function isPreprocessorCodec(codec: ColumnCodec): codec is PreprocessingColumnCodec { + return PREPROCESSOR_KINDS.has(codec.kind) +} + +export function isRawCodec(codec: ColumnCodec): codec is RawColumnCodec { + return codec.kind === 'raw' +} + +function renderStep(step: ColumnCodec): string { + switch (step.kind) { + case 'NONE': + case 'LZ4': + case 'T64': + case 'GCD': + case 'ALP': + return step.kind + case 'LZ4HC': + return step.level !== undefined ? `LZ4HC(${step.level})` : 'LZ4HC' + case 'ZSTD': + return step.level !== undefined ? `ZSTD(${step.level})` : 'ZSTD' + case 'Delta': + case 'DoubleDelta': + case 'Gorilla': + return step.size !== undefined ? `${step.kind}(${step.size})` : step.kind + case 'FPC': + return `FPC(${step.level}, ${step.floatSize})` + case 'raw': + return step.expression + } +} + +export function renderCodec(spec: ColumnCodecSpec): string { + const steps = toArray(spec) + return `CODEC(${steps.map(renderStep).join(', ')})` +} + +const ATOM_PATTERN = /^(\w+)(?:\(([^)]*)\))?$/ + +function parseAtom(raw: string): ColumnCodec | undefined { + const trimmed = raw.trim() + if (trimmed.length === 0) return undefined + const match = trimmed.match(ATOM_PATTERN) + if (!match) return undefined + const [, name, argsRaw] = match + if (!name) return undefined + const rawArgs = argsRaw?.split(',').map((value) => value.trim()) + const args = rawArgs && !(rawArgs.length === 1 && rawArgs[0] === '') ? rawArgs : undefined + + switch (name) { + case 'NONE': + case 'LZ4': + case 'T64': + case 'GCD': + case 'ALP': + if (args !== undefined) return undefined + return { kind: name } + case 'LZ4HC': { + if (args === undefined) return { kind: 'LZ4HC' } + if (args.length !== 1) return undefined + const level = Number(args[0]) + if (!Number.isFinite(level)) return undefined + return { kind: 'LZ4HC', level } + } + case 'ZSTD': { + if (args === undefined) return { kind: 'ZSTD' } + if (args.length !== 1) return undefined + const level = Number(args[0]) + if (!Number.isFinite(level)) return undefined + return { kind: 'ZSTD', level } + } + case 'Delta': + case 'DoubleDelta': + case 'Gorilla': { + if (args === undefined) return { kind: name } + if (args.length !== 1) return undefined + const size = Number(args[0]) + if (size !== 1 && size !== 2 && size !== 4 && size !== 8) return undefined + return { kind: name, size } + } + case 'FPC': { + if (!args || args.length !== 2) return undefined + const level = Number(args[0]) + const floatSize = Number(args[1]) + if (!Number.isFinite(level)) return undefined + if (floatSize !== 4 && floatSize !== 8) return undefined + return { kind: 'FPC', level, floatSize } + } + default: + return undefined + } +} + +function splitTopLevelCommas(input: string): string[] { + const out: string[] = [] + let depth = 0 + let current = '' + for (let i = 0; i < input.length; i++) { + const ch = input[i]! + if (ch === '(') depth += 1 + else if (ch === ')') depth = Math.max(0, depth - 1) + if (ch === ',' && depth === 0) { + out.push(current) + current = '' + continue + } + current += ch + } + if (current.length > 0) out.push(current) + return out +} + +/** + * Parses a ClickHouse-returned codec expression (e.g. `CODEC(Delta(4), ZSTD(1))`). + * Returns undefined for empty/missing input. Unknown or unparsable tokens fall + * back to a single `raw` atom holding the whole chain, so ClickHouse-returned + * codecs we haven't typed still round-trip cleanly. + */ +export function parseCodec(raw: string | undefined | null): ColumnCodec[] | undefined { + if (!raw) return undefined + const trimmed = raw.trim() + if (trimmed.length === 0) return undefined + + const stripped = trimmed.replace(/^CODEC\s*\(([\s\S]*)\)\s*$/i, '$1').trim() + if (stripped.length === 0) return undefined + + const atoms = splitTopLevelCommas(stripped) + .map((value) => value.trim()) + .filter((value) => value.length > 0) + const parsed: ColumnCodec[] = [] + for (const atom of atoms) { + const step = parseAtom(atom) + if (!step) return [{ kind: 'raw', expression: stripped }] + parsed.push(step) + } + return parsed +} + +function canonicalizeStep(step: ColumnCodec): ColumnCodec { + switch (step.kind) { + case 'ZSTD': + return { kind: 'ZSTD', level: step.level ?? 1 } + case 'LZ4HC': + return { kind: 'LZ4HC', level: step.level ?? 9 } + case 'Delta': + case 'DoubleDelta': + case 'Gorilla': + return { kind: step.kind, size: step.size ?? 1 } + case 'FPC': + return { kind: 'FPC', level: step.level, floatSize: step.floatSize } + case 'raw': + return { kind: 'raw', expression: step.expression.trim() } + default: + return { kind: step.kind } + } +} + +/** + * Normalizes a codec spec to array form with ClickHouse defaults filled in, + * so that `{kind:'ZSTD'}` and `{kind:'ZSTD', level:1}` compare equal. + * Does not reorder chain steps — `[Delta, ZSTD]` ≠ `[ZSTD, Delta]` semantically. + */ +export function canonicalizeCodec(spec: ColumnCodecSpec): ColumnCodec[] { + return toArray(spec).map(canonicalizeStep) +} + +export function codecsEqual(a?: ColumnCodecSpec, b?: ColumnCodecSpec): boolean { + if (a === undefined && b === undefined) return true + if (a === undefined || b === undefined) return false + return JSON.stringify(canonicalizeCodec(a)) === JSON.stringify(canonicalizeCodec(b)) +} + +export const codec = { + raw: (expression: string): RawColumnCodec => ({ kind: 'raw', expression: expression.trim() }), +} diff --git a/packages/core/src/index.test.ts b/packages/core/src/index.test.ts index df2369e..8644557 100644 --- a/packages/core/src/index.test.ts +++ b/packages/core/src/index.test.ts @@ -3,6 +3,7 @@ import { describe, expect, test } from 'bun:test' import { ChxValidationError, canonicalizeDefinitions, + codec, collectDefinitionsFromModule, materializedView, planDiff, @@ -216,7 +217,7 @@ describe('@chkit/core planner v1', () => { name: 'idx_source', expression: 'source', type: 'set', - typeArgs: '0', + maxRows: 0, granularity: 1, }, ], @@ -260,14 +261,14 @@ describe('@chkit/core planner v1', () => { name: 'idx_source', expression: 'source', type: 'set', - typeArgs: '0', + maxRows: 0, granularity: 1, }, { name: 'idx_old', expression: 'old_col', type: 'set', - typeArgs: '0', + maxRows: 0, granularity: 1, }, ], @@ -291,7 +292,7 @@ describe('@chkit/core planner v1', () => { name: 'idx_source', expression: 'lower(source)', type: 'set', - typeArgs: '0', + maxRows: 0, granularity: 2, }, ], @@ -586,8 +587,8 @@ describe('@chkit/core planner v1', () => { primaryKey: ['id', 'missing_pk_col'], orderBy: ['id', 'missing_order_col'], indexes: [ - { name: 'idx_source', expression: 'id', type: 'set', typeArgs: '0', granularity: 1 }, - { name: 'idx_source', expression: 'id', type: 'set', typeArgs: '0', granularity: 1 }, + { name: 'idx_source', expression: 'id', type: 'set', maxRows: 0, granularity: 1 }, + { name: 'idx_source', expression: 'id', type: 'set', maxRows: 0, granularity: 1 }, ], }), ] @@ -621,28 +622,22 @@ describe('@chkit/core planner v1', () => { expect(issues.map((issue) => issue.code)).toEqual(['duplicate_projection_name']) }) - test('validates set index type requires typeArgs', () => { - const defs = [ - table({ - database: 'app', - name: 'events', - columns: [ - { name: 'id', type: 'UInt64' }, - { name: 'source', type: 'String' }, - ], - engine: 'MergeTree()', - primaryKey: ['id'], - orderBy: ['id'], - indexes: [ - // @ts-expect-error — intentionally omitting required typeArgs to test runtime validation - { name: 'idx_source', expression: 'source', type: 'set', granularity: 1 }, - ], - }), - ] - - const issues = validateDefinitions(defs) - expect(issues.map((issue) => issue.code)).toEqual(['index_type_missing_args']) - expect(issues[0]?.message).toContain('typeArgs') + test('set index type requires maxRows at the type level', () => { + table({ + database: 'app', + name: 'events', + columns: [ + { name: 'id', type: 'UInt64' }, + { name: 'source', type: 'String' }, + ], + engine: 'MergeTree()', + primaryKey: ['id'], + orderBy: ['id'], + indexes: [ + // @ts-expect-error — set requires `maxRows` at compile time + { name: 'idx_source', expression: 'source', type: 'set', granularity: 1 }, + ], + }) }) test('planDiff throws typed validation error for invalid schema', () => { @@ -723,21 +718,54 @@ describe('@chkit/core planner v1', () => { expect(planA.riskSummary).toEqual(planB.riskSummary) }) - test('renders parameterized index type with typeArgs in CREATE TABLE', () => { + test('renders structured index args in CREATE TABLE', () => { const events = table({ database: 'app', name: 'events', columns: [ { name: 'id', type: 'UInt64' }, { name: 'source', type: 'String' }, + { name: 'body', type: 'String' }, + { name: 'name', type: 'String' }, ], engine: 'MergeTree()', primaryKey: ['id'], orderBy: ['id'], indexes: [ - { name: 'idx_source', expression: 'source', type: 'set', typeArgs: '0', granularity: 1 }, + { name: 'idx_source', expression: 'source', type: 'set', maxRows: 0, granularity: 1 }, { name: 'idx_id', expression: 'id', type: 'minmax', granularity: 3 }, - { name: 'idx_bloom', expression: 'source', type: 'bloom_filter', typeArgs: '0.01', granularity: 1 }, + { + name: 'idx_bloom', + expression: 'source', + type: 'bloom_filter', + falsePositiveRate: 0.01, + granularity: 1, + }, + { + name: 'idx_bloom_default', + expression: 'source', + type: 'bloom_filter', + granularity: 1, + }, + { + name: 'idx_body', + expression: 'body', + type: 'tokenbf_v1', + sizeBytes: 256, + hashFunctions: 2, + randomSeed: 0, + granularity: 1, + }, + { + name: 'idx_name', + expression: 'name', + type: 'ngrambf_v1', + ngramSize: 3, + sizeBytes: 256, + hashFunctions: 2, + randomSeed: 0, + granularity: 1, + }, ], }) @@ -745,9 +773,12 @@ describe('@chkit/core planner v1', () => { expect(sql).toContain('TYPE set(0) GRANULARITY 1') expect(sql).toContain('TYPE minmax GRANULARITY 3') expect(sql).toContain('TYPE bloom_filter(0.01) GRANULARITY 1') + expect(sql).toContain('`idx_bloom_default` (source) TYPE bloom_filter GRANULARITY 1') + expect(sql).toContain('TYPE tokenbf_v1(256, 2, 0) GRANULARITY 1') + expect(sql).toContain('TYPE ngrambf_v1(3, 256, 2, 0) GRANULARITY 1') }) - test('renders parameterized index type with typeArgs in ALTER ADD INDEX', () => { + test('renders structured index args in ALTER ADD INDEX', () => { const oldDefs = [ table({ database: 'app', @@ -768,7 +799,7 @@ describe('@chkit/core planner v1', () => { primaryKey: ['id'], orderBy: ['id'], indexes: [ - { name: 'idx_source', expression: 'source', type: 'set', typeArgs: '0', granularity: 1 }, + { name: 'idx_source', expression: 'source', type: 'set', maxRows: 0, granularity: 1 }, ], }), ] @@ -778,7 +809,7 @@ describe('@chkit/core planner v1', () => { expect(plan.operations[0]?.sql).toContain('TYPE set(0) GRANULARITY 1') }) - test('detects index change when typeArgs differs', () => { + test('detects index change when structured args differ', () => { const oldDefs = [ table({ database: 'app', @@ -788,7 +819,7 @@ describe('@chkit/core planner v1', () => { primaryKey: ['id'], orderBy: ['id'], indexes: [ - { name: 'idx_source', expression: 'source', type: 'set', typeArgs: '0', granularity: 1 }, + { name: 'idx_source', expression: 'source', type: 'set', maxRows: 0, granularity: 1 }, ], }), ] @@ -802,7 +833,7 @@ describe('@chkit/core planner v1', () => { primaryKey: ['id'], orderBy: ['id'], indexes: [ - { name: 'idx_source', expression: 'source', type: 'set', typeArgs: '100', granularity: 1 }, + { name: 'idx_source', expression: 'source', type: 'set', maxRows: 100, granularity: 1 }, ], }), ] @@ -861,6 +892,338 @@ describe('@chkit/core planner v1', () => { }) }) +describe('@chkit/core column codec', () => { + test('renders CODEC clause after DEFAULT', () => { + const events = table({ + database: 'app', + name: 'events', + columns: [ + { name: 'id', type: 'UInt64' }, + { name: 'ts', type: 'DateTime', codec: { kind: 'ZSTD', level: 3 }, default: 'fn:now()' }, + ], + engine: 'MergeTree()', + primaryKey: ['id'], + orderBy: ['id'], + }) + + const sql = toCreateSQL(events) + expect(sql).toContain('`ts` DateTime DEFAULT now() CODEC(ZSTD(3))') + }) + + test('renders CODEC chain with preprocessor + general', () => { + const events = table({ + database: 'app', + name: 'events', + columns: [ + { name: 'id', type: 'UInt64' }, + { name: 'delta', type: 'Int64', codec: [{ kind: 'Delta', size: 4 }, { kind: 'ZSTD' }] }, + ], + engine: 'MergeTree()', + primaryKey: ['id'], + orderBy: ['id'], + }) + + const sql = toCreateSQL(events) + expect(sql).toContain('`delta` Int64 CODEC(Delta(4), ZSTD)') + }) + + test('renders CODEC on nullable column', () => { + const events = table({ + database: 'app', + name: 'events', + columns: [ + { name: 'id', type: 'UInt64' }, + { name: 'note', type: 'String', nullable: true, codec: { kind: 'ZSTD', level: 3 } }, + ], + engine: 'MergeTree()', + primaryKey: ['id'], + orderBy: ['id'], + }) + + const sql = toCreateSQL(events) + expect(sql).toContain('`note` Nullable(String) CODEC(ZSTD(3))') + }) + + test('plan: add codec to column emits MODIFY COLUMN with CODEC', () => { + const oldDefs = [ + table({ + database: 'app', + name: 'events', + columns: [ + { name: 'id', type: 'UInt64' }, + { name: 'payload', type: 'String' }, + ], + engine: 'MergeTree()', + primaryKey: ['id'], + orderBy: ['id'], + }), + ] + const newDefs = [ + table({ + database: 'app', + name: 'events', + columns: [ + { name: 'id', type: 'UInt64' }, + { name: 'payload', type: 'String', codec: { kind: 'ZSTD', level: 3 } }, + ], + engine: 'MergeTree()', + primaryKey: ['id'], + orderBy: ['id'], + }), + ] + + const plan = planDiff(oldDefs, newDefs) + expect(plan.operations.map((op) => op.type)).toEqual(['alter_table_modify_column']) + expect(plan.operations[0]?.sql).toContain('MODIFY COLUMN `payload` String CODEC(ZSTD(3))') + }) + + test('plan: change codec emits single MODIFY COLUMN', () => { + const oldDefs = [ + table({ + database: 'app', + name: 'events', + columns: [ + { name: 'id', type: 'UInt64' }, + { name: 'payload', type: 'String', codec: { kind: 'ZSTD', level: 1 } }, + ], + engine: 'MergeTree()', + primaryKey: ['id'], + orderBy: ['id'], + }), + ] + const newDefs = [ + table({ + database: 'app', + name: 'events', + columns: [ + { name: 'id', type: 'UInt64' }, + { name: 'payload', type: 'String', codec: { kind: 'ZSTD', level: 6 } }, + ], + engine: 'MergeTree()', + primaryKey: ['id'], + orderBy: ['id'], + }), + ] + + const plan = planDiff(oldDefs, newDefs) + expect(plan.operations.map((op) => op.type)).toEqual(['alter_table_modify_column']) + expect(plan.operations[0]?.sql).toContain('MODIFY COLUMN `payload` String CODEC(ZSTD(6))') + expect(plan.operations[0]?.sql).not.toContain('REMOVE CODEC') + }) + + test('plan: remove codec emits REMOVE CODEC when other fields unchanged', () => { + const oldDefs = [ + table({ + database: 'app', + name: 'events', + columns: [ + { name: 'id', type: 'UInt64' }, + { name: 'payload', type: 'String', codec: { kind: 'ZSTD', level: 3 } }, + ], + engine: 'MergeTree()', + primaryKey: ['id'], + orderBy: ['id'], + }), + ] + const newDefs = [ + table({ + database: 'app', + name: 'events', + columns: [ + { name: 'id', type: 'UInt64' }, + { name: 'payload', type: 'String' }, + ], + engine: 'MergeTree()', + primaryKey: ['id'], + orderBy: ['id'], + }), + ] + + const plan = planDiff(oldDefs, newDefs) + expect(plan.operations).toHaveLength(1) + expect(plan.operations[0]?.type).toBe('alter_table_modify_column') + expect(plan.operations[0]?.sql).toBe( + 'ALTER TABLE app.events MODIFY COLUMN `payload` REMOVE CODEC;' + ) + }) + + test('plan: drop codec + other change emits single MODIFY COLUMN (no separate REMOVE)', () => { + const oldDefs = [ + table({ + database: 'app', + name: 'events', + columns: [ + { name: 'id', type: 'UInt64' }, + { name: 'payload', type: 'String', codec: { kind: 'ZSTD', level: 3 } }, + ], + engine: 'MergeTree()', + primaryKey: ['id'], + orderBy: ['id'], + }), + ] + const newDefs = [ + table({ + database: 'app', + name: 'events', + columns: [ + { name: 'id', type: 'UInt64' }, + { name: 'payload', type: 'LowCardinality(String)' }, + ], + engine: 'MergeTree()', + primaryKey: ['id'], + orderBy: ['id'], + }), + ] + + const plan = planDiff(oldDefs, newDefs) + expect(plan.operations).toHaveLength(1) + expect(plan.operations[0]?.type).toBe('alter_table_modify_column') + expect(plan.operations[0]?.sql).toContain('LowCardinality(String)') + expect(plan.operations[0]?.sql).not.toContain('REMOVE CODEC') + }) + + test('plan: equal codec across canonicalization yields no diff', () => { + const oldDefs = [ + table({ + database: 'app', + name: 'events', + columns: [ + { name: 'id', type: 'UInt64' }, + { name: 'payload', type: 'String', codec: { kind: 'ZSTD' } }, + ], + engine: 'MergeTree()', + primaryKey: ['id'], + orderBy: ['id'], + }), + ] + const newDefs = [ + table({ + database: 'app', + name: 'events', + columns: [ + { name: 'id', type: 'UInt64' }, + { name: 'payload', type: 'String', codec: { kind: 'ZSTD', level: 1 } }, + ], + engine: 'MergeTree()', + primaryKey: ['id'], + orderBy: ['id'], + }), + ] + + const plan = planDiff(oldDefs, newDefs) + expect(plan.operations).toEqual([]) + }) + + test('validates chain with multiple general codecs', () => { + const defs = [ + table({ + database: 'app', + name: 'events', + columns: [ + { name: 'id', type: 'UInt64' }, + { + name: 'payload', + type: 'String', + codec: [ + { kind: 'ZSTD', level: 3 }, + { kind: 'LZ4' }, + ], + }, + ], + engine: 'MergeTree()', + primaryKey: ['id'], + orderBy: ['id'], + }), + ] + const issues = validateDefinitions(defs) + expect(issues.map((i) => i.code)).toContain('codec_chain_multiple_general') + }) + + test('validates chain ending in preprocessor', () => { + const defs = [ + table({ + database: 'app', + name: 'events', + columns: [ + { name: 'id', type: 'UInt64' }, + { + name: 'payload', + type: 'Int64', + codec: [ + { kind: 'ZSTD' }, + { kind: 'Delta', size: 4 }, + ], + }, + ], + engine: 'MergeTree()', + primaryKey: ['id'], + orderBy: ['id'], + }), + ] + const issues = validateDefinitions(defs) + expect(issues.map((i) => i.code)).toContain('codec_chain_must_end_with_general') + }) + + test('allows standalone preprocessor codec (CH auto-appends default general)', () => { + const defs = [ + table({ + database: 'app', + name: 'events', + columns: [ + { name: 'id', type: 'UInt64' }, + { name: 'delta', type: 'Int64', codec: { kind: 'Delta', size: 4 } }, + ], + engine: 'MergeTree()', + primaryKey: ['id'], + orderBy: ['id'], + }), + ] + const issues = validateDefinitions(defs) + expect(issues.some((i) => i.code === 'codec_chain_must_end_with_general')).toBe(false) + expect(issues.some((i) => i.code === 'codec_chain_multiple_general')).toBe(false) + }) + + test('flags empty codec chain', () => { + const defs = [ + table({ + database: 'app', + name: 'events', + columns: [ + { name: 'id', type: 'UInt64' }, + { name: 'payload', type: 'Int64', codec: [] }, + ], + engine: 'MergeTree()', + primaryKey: ['id'], + orderBy: ['id'], + }), + ] + const issues = validateDefinitions(defs) + expect(issues.map((i) => i.code)).toContain('codec_chain_empty') + }) + + test('raw codec atoms satisfy any chain position', () => { + const defs = [ + table({ + database: 'app', + name: 'events', + columns: [ + { name: 'id', type: 'UInt64' }, + { + name: 'exp', + type: 'Float32', + codec: [{ kind: 'Delta', size: 4 }, codec.raw('SomeNewCodec(42)')], + }, + ], + engine: 'MergeTree()', + primaryKey: ['id'], + orderBy: ['id'], + }), + ] + const issues = validateDefinitions(defs) + expect(issues.some((i) => i.code.startsWith('codec_chain_'))).toBe(false) + }) +}) + describe('@chkit/core refreshable materialized views', () => { const baseMv = { database: 'analytics', diff --git a/packages/core/src/index.ts b/packages/core/src/index.ts index 8c728a7..f225e55 100644 --- a/packages/core/src/index.ts +++ b/packages/core/src/index.ts @@ -10,6 +10,16 @@ export { createSnapshot } from './snapshot.js' export { splitTopLevelComma } from './key-clause.js' export { normalizeEngine, normalizeSQLFragment } from './sql-normalizer.js' export { toCreateSQL } from './sql.js' +export { + canonicalizeCodec, + codec, + codecsEqual, + isGeneralCodec, + isPreprocessorCodec, + isRawCodec, + parseCodec, + renderCodec, +} from './codec.js' export { assertValidDefinitions, validateDefinitions } from './validate.js' export { wrapPluginRun } from './plugin-error.js' export { splitSqlStatements, extractExecutableStatements } from './sql-splitter.js' diff --git a/packages/core/src/model-types.ts b/packages/core/src/model-types.ts index e1ff2a0..a7909a3 100644 --- a/packages/core/src/model-types.ts +++ b/packages/core/src/model-types.ts @@ -20,6 +20,38 @@ export type PrimitiveColumnType = | 'DateTime' | 'DateTime64' +/** + * General-purpose compression codecs. Exactly one ends a codec chain. + * Level ranges: LZ4HC 1–12 (default 9), ZSTD 1–22 (default 1). + */ +export type GeneralColumnCodec = + | { kind: 'NONE' | 'LZ4' | 'T64' | 'GCD' | 'ALP' } + | { kind: 'LZ4HC'; level?: number } + | { kind: 'ZSTD'; level?: number } + +/** + * Preprocessing codecs (zero or more) placed before the general codec. + * `size` (bytes) defaults to 1 in ClickHouse for Delta / DoubleDelta / Gorilla. + */ +export type PreprocessingColumnCodec = + | { kind: 'Delta' | 'DoubleDelta' | 'Gorilla'; size?: 1 | 2 | 4 | 8 } + | { kind: 'FPC'; level: number; floatSize: 4 | 8 } + +/** + * Escape hatch — passes the raw expression through unchanged. Useful for + * codecs we haven't typed (new CH versions, experimental codecs) or unusual + * arg shapes. Canonicalization is whitespace-only; round-trip may be noisy. + */ +export interface RawColumnCodec { + kind: 'raw' + expression: string +} + +export type ColumnCodec = GeneralColumnCodec | PreprocessingColumnCodec | RawColumnCodec + +/** Single codec or a chain (preprocessors then exactly one general codec). */ +export type ColumnCodecSpec = ColumnCodec | ColumnCodec[] + export interface ColumnDefinition { name: string type: PrimitiveColumnType | string @@ -27,6 +59,7 @@ export interface ColumnDefinition { nullable?: boolean default?: string | number | boolean comment?: string + codec?: ColumnCodecSpec } interface SkipIndexBase { @@ -36,13 +69,35 @@ interface SkipIndexBase { } /** - * `set` requires a numeric argument (e.g. `typeArgs: '0'`). - * ClickHouse 26+ rejects bare `set` — use `set(0)` for unbounded. + * Skip index with structured, discriminated args per type. Arg signatures + * come from ClickHouse MergeTree docs: + * - `minmax` — no args + * - `set(max_rows)` — required int, 0 = store all unique values + * - `bloom_filter([false_positive_rate])` — optional float, default 0.025 + * - `tokenbf_v1(size_bytes, n_hash, seed)` — 3 required ints + * - `ngrambf_v1(n, size_bytes, n_hash, seed)` — 4 required ints + * + * ClickHouse 26+ requires `set(0)` not bare `set`; `maxRows` is required + * so this is encoded naturally. */ export type SkipIndexDefinition = SkipIndexBase & ( - | { type: 'minmax' | 'bloom_filter'; typeArgs?: string } - | { type: 'set' | 'tokenbf_v1' | 'ngrambf_v1'; typeArgs: string } + | { type: 'minmax' } + | { type: 'set'; maxRows: number } + | { type: 'bloom_filter'; falsePositiveRate?: number } + | { + type: 'tokenbf_v1' + sizeBytes: number + hashFunctions: number + randomSeed: number + } + | { + type: 'ngrambf_v1' + ngramSize: number + sizeBytes: number + hashFunctions: number + randomSeed: number + } ) export interface ProjectionDefinition { @@ -251,12 +306,14 @@ export type ValidationIssueCode = | 'duplicate_projection_name' | 'primary_key_missing_column' | 'order_by_missing_column' - | 'index_type_missing_args' | 'refresh_requires_every_or_after' | 'refresh_every_after_mutually_exclusive' | 'refresh_interval_format' | 'refresh_append_required_for_replicated_target' | 'refresh_depends_on_requires_every' + | 'codec_chain_must_end_with_general' + | 'codec_chain_multiple_general' + | 'codec_chain_empty' export interface ValidationIssue { code: ValidationIssueCode diff --git a/packages/core/src/planner.ts b/packages/core/src/planner.ts index 9eff2ec..ad3b144 100644 --- a/packages/core/src/planner.ts +++ b/packages/core/src/planner.ts @@ -22,6 +22,7 @@ import { renderAlterModifyRefresh, renderAlterModifySetting, renderAlterModifyTTL, + renderAlterRemoveCodec, renderAlterResetSetting, toCreateSQL, } from './sql.js' @@ -139,6 +140,21 @@ function normalizeColumn(column: ColumnDefinition): Omit { + const { codec: _codec, ...rest } = normalizeColumn(column) + return rest +} + +function isCodecRemoval(oldCol: ColumnDefinition, newCol: ColumnDefinition): boolean { + if (!oldCol.codec || newCol.codec) return false + return ( + JSON.stringify(normalizeColumnWithoutCodec(oldCol)) === + JSON.stringify(normalizeColumnWithoutCodec(newCol)) + ) +} + function renderRenameColumnSuggestionSQL(table: TableDefinition, from: string, to: string): string { return `ALTER TABLE ${table.database}.${table.name} RENAME COLUMN \`${from}\` TO \`${to}\`;` } @@ -284,12 +300,15 @@ function diffTables(oldDef: TableDefinition, newDef: TableDefinition): TableDiff sql: renderAlterAddColumn(newDef, column), }) } - for (const { name, newItem } of columnDiff.changed) { + for (const { name, oldItem, newItem } of columnDiff.changed) { + const sql = isCodecRemoval(oldItem, newItem) + ? renderAlterRemoveCodec(newDef, name) + : renderAlterModifyColumn(newDef, newItem) ops.push( { type: 'alter_table_modify_column', key: `table:${newDef.database}.${newDef.name}:column:${name}`, risk: 'caution', - sql: renderAlterModifyColumn(newDef, newItem), + sql, }) } for (const column of columnDiff.removed) { diff --git a/packages/core/src/sql-validation.e2e.test.ts b/packages/core/src/sql-validation.e2e.test.ts index 2ae3a6c..824c559 100644 --- a/packages/core/src/sql-validation.e2e.test.ts +++ b/packages/core/src/sql-validation.e2e.test.ts @@ -25,6 +25,7 @@ import { renderAlterDropProjection, renderAlterModifyRefresh, renderAlterModifySetting, + renderAlterRemoveCodec, renderAlterResetSetting, renderAlterModifyTTL, } from './sql.js' @@ -251,6 +252,79 @@ describe('SQL validation via EXPLAIN AST', () => { } }) + // ========================================================================= + // CREATE TABLE — Column CODEC + // ========================================================================= + + describe('CREATE TABLE — column codec', () => { + const codecCases: Array<{ label: string; col: ColumnDefinition }> = [ + { label: 'ZSTD(3)', col: { name: 'payload', type: 'String', codec: { kind: 'ZSTD', level: 3 } } }, + { label: 'LZ4HC(9)', col: { name: 'payload', type: 'String', codec: { kind: 'LZ4HC', level: 9 } } }, + { label: 'NONE', col: { name: 'payload', type: 'String', codec: { kind: 'NONE' } } }, + { + label: 'Delta(4) + ZSTD(3)', + col: { + name: 'payload', + type: 'Int64', + codec: [ + { kind: 'Delta', size: 4 }, + { kind: 'ZSTD', level: 3 }, + ], + }, + }, + { label: 'T64', col: { name: 'payload', type: 'Int64', codec: { kind: 'T64' } } }, + ] + + for (const { label, col } of codecCases) { + test(`codec: ${label}`, async () => { + const def = baseTable({ columns: [{ name: 'id', type: 'UInt64' }, col] }) + await assertValidSQL(client, toCreateSQL(def)) + }) + } + + test('codec + DEFAULT combined', async () => { + const def = baseTable({ + columns: [ + { name: 'id', type: 'UInt64' }, + { + name: 'ts', + type: 'DateTime', + codec: { kind: 'ZSTD', level: 3 }, + default: 'fn:now()', + }, + ], + }) + await assertValidSQL(client, toCreateSQL(def)) + }) + + test('codec on nullable column', async () => { + const def = baseTable({ + columns: [ + { name: 'id', type: 'UInt64' }, + { name: 'note', type: 'String', nullable: true, codec: { kind: 'ZSTD', level: 3 } }, + ], + }) + await assertValidSQL(client, toCreateSQL(def)) + }) + + test('ALTER MODIFY COLUMN with codec', async () => { + const def = baseTable() + await assertValidSQL( + client, + renderAlterModifyColumn(def, { + name: 'value', + type: 'String', + codec: { kind: 'ZSTD', level: 6 }, + }) + ) + }) + + test('ALTER MODIFY COLUMN REMOVE CODEC', async () => { + const def = baseTable() + await assertValidSQL(client, renderAlterRemoveCodec(def, 'payload')) + }) + }) + // ========================================================================= // CREATE TABLE — Column comments and nullable // ========================================================================= @@ -479,23 +553,62 @@ ORDER BY (\`id\`, toDate(\`created_at\`))` }, { label: 'set', - idx: { name: 'idx_status', expression: 'status', type: 'set', typeArgs: '100', granularity: 2 }, + idx: { + name: 'idx_status', + expression: 'status', + type: 'set', + maxRows: 100, + granularity: 2, + }, + }, + { + label: 'set unbounded', + idx: { + name: 'idx_status_all', + expression: 'status', + type: 'set', + maxRows: 0, + granularity: 2, + }, }, { label: 'bloom_filter', idx: { name: 'idx_email', expression: 'email', type: 'bloom_filter', granularity: 1 }, }, { - label: 'bloom_filter with args', - idx: { name: 'idx_email2', expression: 'email', type: 'bloom_filter', typeArgs: '0.01', granularity: 1 }, + label: 'bloom_filter with falsePositiveRate', + idx: { + name: 'idx_email2', + expression: 'email', + type: 'bloom_filter', + falsePositiveRate: 0.01, + granularity: 1, + }, }, { label: 'tokenbf_v1', - idx: { name: 'idx_body', expression: 'body', type: 'tokenbf_v1', typeArgs: '10240, 3, 0', granularity: 1 }, + idx: { + name: 'idx_body', + expression: 'body', + type: 'tokenbf_v1', + sizeBytes: 10240, + hashFunctions: 3, + randomSeed: 0, + granularity: 1, + }, }, { label: 'ngrambf_v1', - idx: { name: 'idx_name', expression: 'name', type: 'ngrambf_v1', typeArgs: '3, 256, 2, 0', granularity: 1 }, + idx: { + name: 'idx_name', + expression: 'name', + type: 'ngrambf_v1', + ngramSize: 3, + sizeBytes: 256, + hashFunctions: 2, + randomSeed: 0, + granularity: 1, + }, }, { label: 'expression index', @@ -879,19 +992,52 @@ ORDER BY (\`id\`, toDate(\`created_at\`))` }, { label: 'set', - idx: { name: 'idx_status', expression: 'status', type: 'set', typeArgs: '100', granularity: 2 }, + idx: { + name: 'idx_status', + expression: 'status', + type: 'set', + maxRows: 100, + granularity: 2, + }, }, { label: 'bloom_filter', idx: { name: 'idx_email', expression: 'email', type: 'bloom_filter', granularity: 1 }, }, + { + label: 'bloom_filter with falsePositiveRate', + idx: { + name: 'idx_email_tuned', + expression: 'email', + type: 'bloom_filter', + falsePositiveRate: 0.01, + granularity: 1, + }, + }, { label: 'tokenbf_v1', - idx: { name: 'idx_body', expression: 'body', type: 'tokenbf_v1', typeArgs: '10240, 3, 0', granularity: 1 }, + idx: { + name: 'idx_body', + expression: 'body', + type: 'tokenbf_v1', + sizeBytes: 10240, + hashFunctions: 3, + randomSeed: 0, + granularity: 1, + }, }, { label: 'ngrambf_v1', - idx: { name: 'idx_name', expression: 'name', type: 'ngrambf_v1', typeArgs: '3, 256, 2, 0', granularity: 1 }, + idx: { + name: 'idx_name', + expression: 'name', + type: 'ngrambf_v1', + ngramSize: 3, + sizeBytes: 256, + hashFunctions: 2, + randomSeed: 0, + granularity: 1, + }, }, ] diff --git a/packages/core/src/sql.ts b/packages/core/src/sql.ts index e097c4f..c2d7cc3 100644 --- a/packages/core/src/sql.ts +++ b/packages/core/src/sql.ts @@ -7,6 +7,7 @@ import type { TableDefinition, ViewDefinition, } from './model.js' +import { renderCodec } from './codec.js' import { normalizeKeyColumns } from './key-clause.js' import { assertValidDefinitions } from './validate.js' @@ -22,6 +23,7 @@ function renderColumn(col: ColumnDefinition): string { let out = `\`${col.name}\` ${col.nullable ? `Nullable(${col.type})` : col.type}` if (col.default !== undefined) out += ` DEFAULT ${renderDefault(col.default)}` if (col.comment) out += ` COMMENT '${col.comment.replace(/'/g, "''")}'` + if (col.codec) out += ` ${renderCodec(col.codec)}` return out } @@ -32,7 +34,20 @@ function renderKeyClauseColumns(columns: string[]): string { } function renderIndexType(idx: SkipIndexDefinition): string { - return idx.typeArgs !== undefined ? `${idx.type}(${idx.typeArgs})` : idx.type + switch (idx.type) { + case 'minmax': + return 'minmax' + case 'set': + return `set(${idx.maxRows})` + case 'bloom_filter': + return idx.falsePositiveRate !== undefined + ? `bloom_filter(${idx.falsePositiveRate})` + : 'bloom_filter' + case 'tokenbf_v1': + return `tokenbf_v1(${idx.sizeBytes}, ${idx.hashFunctions}, ${idx.randomSeed})` + case 'ngrambf_v1': + return `ngrambf_v1(${idx.ngramSize}, ${idx.sizeBytes}, ${idx.hashFunctions}, ${idx.randomSeed})` + } } function renderTableSQL(def: TableDefinition): string { @@ -143,6 +158,10 @@ export function renderAlterDropColumn(def: TableDefinition, columnName: string): return `ALTER TABLE ${def.database}.${def.name} DROP COLUMN IF EXISTS \`${columnName}\`;` } +export function renderAlterRemoveCodec(def: TableDefinition, columnName: string): string { + return `ALTER TABLE ${def.database}.${def.name} MODIFY COLUMN \`${columnName}\` REMOVE CODEC;` +} + export function renderAlterAddIndex(def: TableDefinition, index: SkipIndexDefinition): string { return `ALTER TABLE ${def.database}.${def.name} ADD INDEX IF NOT EXISTS \`${index.name}\` (${index.expression}) TYPE ${renderIndexType(index)} GRANULARITY ${index.granularity};` } diff --git a/packages/core/src/validate.ts b/packages/core/src/validate.ts index 371ae8a..efe4e0c 100644 --- a/packages/core/src/validate.ts +++ b/packages/core/src/validate.ts @@ -1,6 +1,8 @@ import { definitionKey } from './canonical.js' +import { canonicalizeCodec, isGeneralCodec, isRawCodec } from './codec.js' import { normalizeKeyColumns } from './key-clause.js' import type { + ColumnDefinition, MaterializedViewDefinition, MaterializedViewRefresh, SchemaDefinition, @@ -25,6 +27,53 @@ function pushValidationIssue( }) } +function validateColumnCodec( + def: TableDefinition, + column: ColumnDefinition, + issues: ValidationIssue[] +): void { + if (!column.codec) return + const steps = canonicalizeCodec(column.codec) + if (steps.length === 0) { + pushValidationIssue( + issues, + def, + 'codec_chain_empty', + `Table ${def.database}.${def.name} column "${column.name}" codec chain is empty; provide at least one codec or omit the field` + ) + return + } + let generalCount = 0 + let generalIndex = -1 + for (let i = 0; i < steps.length; i++) { + const step = steps[i]! + if (isRawCodec(step)) continue + if (isGeneralCodec(step)) { + generalCount += 1 + generalIndex = i + } + } + + if (generalCount > 1) { + pushValidationIssue( + issues, + def, + 'codec_chain_multiple_general', + `Table ${def.database}.${def.name} column "${column.name}" codec chain has more than one general codec; only one general codec is allowed at the end of a chain` + ) + return + } + + if (steps.length > 1 && generalCount === 1 && generalIndex !== steps.length - 1) { + pushValidationIssue( + issues, + def, + 'codec_chain_must_end_with_general', + `Table ${def.database}.${def.name} column "${column.name}" codec chain must end with a general codec (NONE, LZ4, LZ4HC, ZSTD, T64, GCD, ALP)` + ) + } +} + function validateTableDefinition(def: TableDefinition, issues: ValidationIssue[]): void { const columnSeen = new Set() const columnSet = new Set() @@ -40,9 +89,9 @@ function validateTableDefinition(def: TableDefinition, issues: ValidationIssue[] } columnSeen.add(column.name) columnSet.add(column.name) + validateColumnCodec(def, column, issues) } - const TYPES_REQUIRING_ARGS = new Set(['set', 'tokenbf_v1', 'ngrambf_v1']) const indexSeen = new Set() for (const index of def.indexes ?? []) { if (indexSeen.has(index.name)) { @@ -55,14 +104,6 @@ function validateTableDefinition(def: TableDefinition, issues: ValidationIssue[] continue } indexSeen.add(index.name) - if (TYPES_REQUIRING_ARGS.has(index.type) && !index.typeArgs) { - pushValidationIssue( - issues, - def, - 'index_type_missing_args', - `Table ${def.database}.${def.name} index "${index.name}" uses type "${index.type}" which requires typeArgs (e.g. typeArgs: '0' for set(0))` - ) - } } const projectionSeen = new Set() diff --git a/packages/plugin-pull/src/index.test.ts b/packages/plugin-pull/src/index.test.ts index 28549c7..54e4673 100644 --- a/packages/plugin-pull/src/index.test.ts +++ b/packages/plugin-pull/src/index.test.ts @@ -29,26 +29,69 @@ describe('@chkit/plugin-pull renderSchemaFile', () => { expect(content).toContain("export default schema(app_events)") }) - test('renders index typeArgs in pulled schema', () => { + test('renders structured index args in pulled schema', () => { const content = renderSchemaFile([ { kind: 'table', database: 'app', name: 'events', engine: 'MergeTree()', - columns: [{ name: 'id', type: 'UInt64' }, { name: 'source', type: 'String' }], + columns: [ + { name: 'id', type: 'UInt64' }, + { name: 'source', type: 'String' }, + { name: 'email', type: 'String' }, + { name: 'body', type: 'String' }, + { name: 'name', type: 'String' }, + ], primaryKey: ['id'], orderBy: ['id'], indexes: [ - { name: 'idx_source', expression: 'source', type: 'set', typeArgs: '0', granularity: 1 }, + { name: 'idx_source', expression: 'source', type: 'set', maxRows: 0, granularity: 1 }, { name: 'idx_id', expression: 'id', type: 'minmax', granularity: 3 }, + { + name: 'idx_bloom', + expression: 'email', + type: 'bloom_filter', + falsePositiveRate: 0.01, + granularity: 1, + }, + { name: 'idx_bloom_default', expression: 'email', type: 'bloom_filter', granularity: 1 }, + { + name: 'idx_body', + expression: 'body', + type: 'tokenbf_v1', + sizeBytes: 256, + hashFunctions: 2, + randomSeed: 0, + granularity: 1, + }, + { + name: 'idx_name', + expression: 'name', + type: 'ngrambf_v1', + ngramSize: 3, + sizeBytes: 256, + hashFunctions: 2, + randomSeed: 0, + granularity: 1, + }, ], }, ]) - expect(content).toContain('type: "set", typeArgs: "0", granularity: 1') + expect(content).toContain('type: "set", maxRows: 0, granularity: 1') expect(content).toContain('type: "minmax", granularity: 3') - expect(content).not.toContain('typeArgs: undefined') + expect(content).toContain('type: "bloom_filter", falsePositiveRate: 0.01, granularity: 1') + expect(content).toContain( + 'name: "idx_bloom_default", expression: "email", type: "bloom_filter", granularity: 1' + ) + expect(content).toContain( + 'type: "tokenbf_v1", sizeBytes: 256, hashFunctions: 2, randomSeed: 0, granularity: 1' + ) + expect(content).toContain( + 'type: "ngrambf_v1", ngramSize: 3, sizeBytes: 256, hashFunctions: 2, randomSeed: 0, granularity: 1' + ) + expect(content).not.toContain('typeArgs') }) test('renders view and materialized view definitions', () => { diff --git a/packages/plugin-pull/src/pull.e2e.test.ts b/packages/plugin-pull/src/pull.e2e.test.ts index 7200830..4f78411 100644 --- a/packages/plugin-pull/src/pull.e2e.test.ts +++ b/packages/plugin-pull/src/pull.e2e.test.ts @@ -103,7 +103,7 @@ describe('@chkit/plugin-pull live env e2e', () => { try { await executor.command( - `CREATE TABLE ${quoteIdent(targetDatabase)}.${quoteIdent(eventsTable)} (id UInt64, source String, received_at DateTime64(3) DEFAULT now64(3)) ENGINE = MergeTree() PARTITION BY toYYYYMM(received_at) PRIMARY KEY (id) ORDER BY (id) SETTINGS index_granularity = 8192` + `CREATE TABLE ${quoteIdent(targetDatabase)}.${quoteIdent(eventsTable)} (id UInt64, source String, received_at DateTime64(3) DEFAULT now64(3), ts DateTime CODEC(Delta, ZSTD)) ENGINE = MergeTree() PARTITION BY toYYYYMM(received_at) PRIMARY KEY (id) ORDER BY (id) SETTINGS index_granularity = 8192` ) // Wait for table to be visible before creating dependent objects @@ -188,6 +188,10 @@ describe('@chkit/plugin-pull live env e2e', () => { expect(pulledSchema).not.toContain('DEFINER') expect(pulledSchema).not.toContain('SQL SECURITY') expect(pulledSchema).not.toContain(noiseDatabase) + + // Codec column — assert structured codec is emitted and round-trips + expect(pulledSchema).toContain('codec: [{ kind: "Delta"') + expect(pulledSchema).toContain('{ kind: "ZSTD"') } finally { await rm(dir, { recursive: true, force: true }) } diff --git a/packages/plugin-pull/src/render-schema.ts b/packages/plugin-pull/src/render-schema.ts index 7cb92cc..2b63993 100644 --- a/packages/plugin-pull/src/render-schema.ts +++ b/packages/plugin-pull/src/render-schema.ts @@ -1,5 +1,8 @@ import { canonicalizeDefinitions, + isRawCodec, + type ColumnCodec, + type ColumnCodecSpec, type MaterializedViewRefresh, type SchemaDefinition, } from '@chkit/core' @@ -10,10 +13,16 @@ export function renderSchemaFile(definitions: SchemaDefinition[]): string { const hasTable = canonical.some((definition) => definition.kind === 'table') const hasView = canonical.some((definition) => definition.kind === 'view') const hasMaterializedView = canonical.some((definition) => definition.kind === 'materialized_view') + const hasRawCodec = canonical.some( + (definition) => + definition.kind === 'table' && + definition.columns.some((column) => column.codec && codecContainsRaw(column.codec)) + ) const imports = ['schema'] if (hasTable) imports.push('table') if (hasView) imports.push('view') if (hasMaterializedView) imports.push('materializedView') + if (hasRawCodec) imports.push('codec') const lines: string[] = [ `import { ${imports.join(', ')} } from '@chkit/core'`, '', @@ -42,6 +51,7 @@ export function renderSchemaFile(definitions: SchemaDefinition[]): string { if (column.nullable) parts.push('nullable: true') if (column.default !== undefined) parts.push(`default: ${renderLiteral(column.default)}`) if (column.comment) parts.push(`comment: ${renderString(column.comment)}`) + if (column.codec) parts.push(`codec: ${renderCodecSource(column.codec)}`) lines.push(` { ${parts.join(', ')} },`) } @@ -74,7 +84,33 @@ export function renderSchemaFile(definitions: SchemaDefinition[]): string { `expression: ${renderString(index.expression)}`, `type: ${renderString(index.type)}`, ] - if (index.typeArgs !== undefined) parts.push(`typeArgs: ${renderString(index.typeArgs)}`) + switch (index.type) { + case 'minmax': + break + case 'set': + parts.push(`maxRows: ${index.maxRows}`) + break + case 'bloom_filter': + if (index.falsePositiveRate !== undefined) { + parts.push(`falsePositiveRate: ${index.falsePositiveRate}`) + } + break + case 'tokenbf_v1': + parts.push( + `sizeBytes: ${index.sizeBytes}`, + `hashFunctions: ${index.hashFunctions}`, + `randomSeed: ${index.randomSeed}` + ) + break + case 'ngrambf_v1': + parts.push( + `ngramSize: ${index.ngramSize}`, + `sizeBytes: ${index.sizeBytes}`, + `hashFunctions: ${index.hashFunctions}`, + `randomSeed: ${index.randomSeed}` + ) + break + } parts.push(`granularity: ${index.granularity}`) lines.push(` { ${parts.join(', ')} },`) } @@ -181,3 +217,28 @@ function renderKey(value: string): string { if (/^[a-zA-Z_$][a-zA-Z0-9_$]*$/.test(value)) return value return renderString(value) } + +function codecContainsRaw(spec: ColumnCodecSpec): boolean { + const steps = Array.isArray(spec) ? spec : [spec] + return steps.some((step) => isRawCodec(step)) +} + +function renderCodecStepSource(step: ColumnCodec): string { + if (isRawCodec(step)) return `codec.raw(${renderString(step.expression)})` + const parts: string[] = [`kind: ${renderString(step.kind)}`] + if (step.kind === 'ZSTD' || step.kind === 'LZ4HC') { + if (step.level !== undefined) parts.push(`level: ${step.level}`) + } else if (step.kind === 'Delta' || step.kind === 'DoubleDelta' || step.kind === 'Gorilla') { + if (step.size !== undefined) parts.push(`size: ${step.size}`) + } else if (step.kind === 'FPC') { + parts.push(`level: ${step.level}`) + parts.push(`floatSize: ${step.floatSize}`) + } + return `{ ${parts.join(', ')} }` +} + +function renderCodecSource(spec: ColumnCodecSpec): string { + const steps = Array.isArray(spec) ? spec : [spec] + if (steps.length === 1) return renderCodecStepSource(steps[0]!) + return `[${steps.map(renderCodecStepSource).join(', ')}]` +} diff --git a/skills/chkit/SKILL.md b/skills/chkit/SKILL.md index 3465f77..47b37ae 100644 --- a/skills/chkit/SKILL.md +++ b/skills/chkit/SKILL.md @@ -67,7 +67,7 @@ const events = table({ ttl: 'received_at + INTERVAL 90 DAY', settings: { index_granularity: 8192 }, indexes: [ - { name: 'idx_source', expression: 'source', type: 'set', typeArgs: '0', granularity: 1 }, + { name: 'idx_source', expression: 'source', type: 'set', maxRows: 0, granularity: 1 }, ], })