From 25b195d76f2745dab579c9986492d468d51ef648 Mon Sep 17 00:00:00 2001 From: Takahiro Ebato Date: Sat, 27 Jun 2026 16:28:45 +0900 Subject: [PATCH 01/11] fix: bind QUALIFY against the post-projection scope MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit QUALIFY filters on window / projection outputs, so it sees the SELECT's output aliases. It was bound against the FROM scope (in select_clause_reads), so an alias reference (`QUALIFY rn = 1` for `… AS rn`) resolved to a phantom base column `t.rn`. Bind it in bind_select against the output-aware clause_scope like HAVING — an alias binds Derived (dropped from reads, the dependency already counted at the window expr) while a real column / inline window key / subquery source still reads. QUALIFY is filter-position, so it adds reads but never lineage. Co-Authored-By: Claude Opus 4.8 (1M context) --- sql-insight/src/resolver/binder/expr.rs | 9 ++-- sql-insight/src/resolver/binder/query.rs | 17 ++++-- .../reads_semantics.rs | 53 +++++++++++++++++++ 3 files changed, 72 insertions(+), 7 deletions(-) diff --git a/sql-insight/src/resolver/binder/expr.rs b/sql-insight/src/resolver/binder/expr.rs index f33b64e..b19d4db 100644 --- a/sql-insight/src/resolver/binder/expr.rs +++ b/sql-insight/src/resolver/binder/expr.rs @@ -561,9 +561,11 @@ impl<'a> Binder<'a> { } /// Filter-position reads from a SELECT's auxiliary clauses (`DISTINCT ON` - /// keys, `TOP n`, Hive `LATERAL VIEW`, `PREWHERE`, `QUALIFY`, `CONNECT BY` - /// / `START WITH`, `CLUSTER BY` / `DISTRIBUTE BY`, named `WINDOW` specs), - /// resolved against the FROM scope. None feed values. + /// keys, `TOP n`, Hive `LATERAL VIEW`, `PREWHERE`, `CONNECT BY` / `START + /// WITH`, `CLUSTER BY` / `DISTRIBUTE BY`, named `WINDOW` specs), resolved + /// against the FROM scope. None feed values. `QUALIFY` is *not* here — it + /// filters on window / projection outputs (post-projection), so it binds + /// against the output-aware scope in [`bind_select`](Self::bind_select). pub(super) fn select_clause_reads(&mut self, select: &Select, scope: &Scope) -> Vec { let mut reads = Vec::new(); if let Some(Distinct::On(exprs)) = &select.distinct { @@ -578,7 +580,6 @@ impl<'a> Binder<'a> { reads.push(self.bind_expr(&lateral_view.lateral_view, scope)); } reads.extend(select.prewhere.iter().map(|e| self.bind_expr(e, scope))); - reads.extend(select.qualify.iter().map(|e| self.bind_expr(e, scope))); for connect_by in &select.connect_by { match connect_by { ConnectByKind::ConnectBy { relationships, .. } => { diff --git a/sql-insight/src/resolver/binder/query.rs b/sql-insight/src/resolver/binder/query.rs index fa18a14..4bae4e7 100644 --- a/sql-insight/src/resolver/binder/query.rs +++ b/sql-insight/src/resolver/binder/query.rs @@ -437,9 +437,9 @@ impl<'a> Binder<'a> { pub(super) fn bind_select(&mut self, select: &Select) -> (LogicalPlan, Scope) { let (from, scope) = self.bind_from(&select.from); // WHERE + the WHERE-family auxiliary clauses (DISTINCT ON / TOP / - // LATERAL VIEW / PREWHERE / QUALIFY / CONNECT BY / CLUSTER BY / named - // WINDOW) filter rows before grouping — a filter over the FROM (no - // output aliases visible). + // LATERAL VIEW / PREWHERE / CONNECT BY / CLUSTER BY / named WINDOW) + // filter rows before grouping — a filter over the FROM (no output + // aliases visible). QUALIFY is post-projection (handled below). let mut where_reads: Vec = select .selection .iter() @@ -481,6 +481,17 @@ impl<'a> Binder<'a> { input: Box::new(node), exprs, }); + // QUALIFY filters on window / projection outputs (it runs after the + // window functions the projection computes), so it sees output aliases + // — bind it against `clause_scope` like HAVING, so an alias reference + // (`QUALIFY rn = 1`) binds `Derived` and drops from reads rather than + // surfacing a phantom base column. Filter-position: reads, no lineage. + if let Some(qualify) = &select.qualify { + node = LogicalPlan::Filter(Filter { + input: Box::new(node), + predicate: vec![self.bind_expr(qualify, &clause_scope)], + }); + } // SORT BY (Hive) sees the outputs, like a trailing ORDER BY. let sort_keys = self.order_by_expr_keys(&select.sort_by, &clause_scope); if !sort_keys.is_empty() { diff --git a/sql-insight/tests/column_operation_extractor/reads_semantics.rs b/sql-insight/tests/column_operation_extractor/reads_semantics.rs index b37d6eb..510c6d1 100644 --- a/sql-insight/tests/column_operation_extractor/reads_semantics.rs +++ b/sql-insight/tests/column_operation_extractor/reads_semantics.rs @@ -1024,6 +1024,59 @@ mod output_alias_visibility { ); } + #[test] + fn qualify_window_alias_is_suppressed() { + // `rn` aliases a window output; QUALIFY references it post-projection, + // not a stored column — so no phantom `t.rn` read. The real dependency + // (the window's `PARTITION BY a`) is counted at the projection. + assert_column_ops( + "SELECT a, ROW_NUMBER() OVER (PARTITION BY a) AS rn FROM t QUALIFY rn = 1", + ColumnOperation { + statement_kind: StatementKind::Select, + reads: vec![read("t", "a"), read("t", "a")], + writes: vec![], + lineage: vec![passthrough(col("t", "a"), out("a", 0))], + diagnostics: vec![], + }, + ); + } + + #[test] + fn qualify_window_key_is_a_read() { + // A window key referenced only in QUALIFY's inline window (`b` is not + // projected) is a genuine new read — QUALIFY adds reads even as it + // suppresses alias references. Still no lineage (filter position). + assert_column_ops( + "SELECT a FROM t QUALIFY ROW_NUMBER() OVER (PARTITION BY b) = 1", + ColumnOperation { + statement_kind: StatementKind::Select, + reads: vec![read("t", "a"), read("t", "b")], + writes: vec![], + lineage: vec![passthrough(col("t", "a"), out("a", 0))], + diagnostics: vec![], + }, + ); + } + + #[test] + fn qualify_base_column_is_a_read() { + // A QUALIFY referencing a real column (not an output alias) is a + // (filter-position) read, like any predicate. + assert_column_ops( + "SELECT a, b FROM t QUALIFY b > 0", + ColumnOperation { + statement_kind: StatementKind::Select, + reads: vec![read("t", "a"), read("t", "b"), read("t", "b")], + writes: vec![], + lineage: vec![ + passthrough(col("t", "a"), out("a", 0)), + passthrough(col("t", "b"), out("b", 1)), + ], + diagnostics: vec![], + }, + ); + } + #[test] fn qualified_ref_in_order_by_is_not_an_alias() { // `t.total` is qualified — aliases are unqualified-only, so it's From 92e4100809985b43fc4b0d2ea32afa6b3c884a12 Mon Sep 17 00:00:00 2001 From: Takahiro Ebato Date: Sat, 27 Jun 2026 16:42:15 +0900 Subject: [PATCH 02/11] fix: extract MERGE RETURNING / OUTPUT columns MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit The Merge node carried no `returning`, and bind_merge ignored `merge.output`, so `MERGE … RETURNING s.b, t.c` / MSSQL `OUTPUT inserted.a, s.b` dropped its projected columns entirely (no reads, no lineage, no diagnostic). Add a `returning` field to the Merge plan node (like Insert/Update/Delete), bind the OUTPUT clause's select items over the target + source scope, and walk them for reads + `QueryOutput` lineage. MSSQL `OUTPUT … INTO ` (a secondary write) and the `inserted` / `deleted` pseudo-tables are not modelled yet. Co-Authored-By: Claude Opus 4.8 (1M context) --- sql-insight/src/resolver/binder.rs | 6 ++-- sql-insight/src/resolver/binder/statement.rs | 9 ++++++ sql-insight/src/resolver/lineage.rs | 3 ++ sql-insight/src/resolver/logical_plan.rs | 6 +++- .../column_operation_extractor/cte_and_dml.rs | 28 +++++++++++++++++++ 5 files changed, 48 insertions(+), 4 deletions(-) diff --git a/sql-insight/src/resolver/binder.rs b/sql-insight/src/resolver/binder.rs index 31857ed..df94de8 100644 --- a/sql-insight/src/resolver/binder.rs +++ b/sql-insight/src/resolver/binder.rs @@ -42,9 +42,9 @@ use sqlparser::ast::{ Expr as SqlExpr, FromTable, Function, FunctionArg, FunctionArgExpr, FunctionArguments, GroupByExpr, GroupByWithModifier, Ident, Insert as SqlInsert, JoinConstraint, JoinOperator, JsonPathElem, Merge as SqlMerge, MergeAction, MergeInsertKind, ObjectName, ObjectType, - OnConflictAction, OnInsert, OrderBy, OrderByExpr, OrderByKind, PipeOperator, PivotValueSource, - Query, Select, SelectItem, SetExpr, Statement, TableAlias, TableFactor, TableObject, - TableWithJoins, Update as SqlUpdate, UpdateTableFromKind, Values as SqlValues, + OnConflictAction, OnInsert, OrderBy, OrderByExpr, OrderByKind, OutputClause, PipeOperator, + PivotValueSource, Query, Select, SelectItem, SetExpr, Statement, TableAlias, TableFactor, + TableObject, TableWithJoins, Update as SqlUpdate, UpdateTableFromKind, Values as SqlValues, }; use sqlparser::ast::{ diff --git a/sql-insight/src/resolver/binder/statement.rs b/sql-insight/src/resolver/binder/statement.rs index aff9c28..5d5e192 100644 --- a/sql-insight/src/resolver/binder/statement.rs +++ b/sql-insight/src/resolver/binder/statement.rs @@ -595,6 +595,14 @@ impl<'a> Binder<'a> { } } } + // RETURNING (Snowflake) / OUTPUT (MSSQL) projects the affected rows + // over the target + source scope, like the other DML roots. (MSSQL + // `OUTPUT … INTO
`'s secondary write is not modelled yet.) + let output_items = merge.output.as_ref().map(|o| match o { + OutputClause::Output { select_items, .. } + | OutputClause::Returning { select_items, .. } => select_items.clone(), + }); + let returning = self.bind_returning(&output_items, &scope); LogicalPlan::Merge(Merge { target: TableWrite { reference: target, @@ -603,6 +611,7 @@ impl<'a> Binder<'a> { source: Box::new(source), on, clauses, + returning, }) } diff --git a/sql-insight/src/resolver/lineage.rs b/sql-insight/src/resolver/lineage.rs index e59d4ee..7f719d3 100644 --- a/sql-insight/src/resolver/lineage.rs +++ b/sql-insight/src/resolver/lineage.rs @@ -134,6 +134,9 @@ pub(super) fn collect_column_lineage( MergeClause::Delete => {} } } + // RETURNING / OUTPUT projects the affected rows (target + source), + // traced through the source like the other DMLs' RETURNING. + returning_lineage(&m.returning, &m.source, &mut context, &mut edges); } // A bare query (or unmodelled root): one `QueryOutput` group per // projection (a set operation has one per branch — positions restart diff --git a/sql-insight/src/resolver/logical_plan.rs b/sql-insight/src/resolver/logical_plan.rs index dfd6af1..736b9d6 100644 --- a/sql-insight/src/resolver/logical_plan.rs +++ b/sql-insight/src/resolver/logical_plan.rs @@ -251,13 +251,16 @@ pub(crate) struct Delete { } /// `MERGE INTO target USING source ON on WHEN ...`: the `clauses` drive -/// writes / lineage (UPDATE SET / INSERT VALUES). +/// writes / lineage (UPDATE SET / INSERT VALUES). `returning` projects the +/// affected rows — the `RETURNING` (Snowflake) / `OUTPUT` (MSSQL) clause's +/// select items, over the target + source scope, like the other DML roots. #[derive(Clone, Debug, PartialEq, Eq)] pub(crate) struct Merge { pub(crate) target: TableWrite, pub(crate) source: Box, pub(crate) on: Vec, pub(crate) clauses: Vec, + pub(crate) returning: Vec, } /// One `WHEN [NOT] MATCHED ...` action of a [`Merge`]. @@ -593,6 +596,7 @@ pub(super) fn own_exprs(op: &LogicalPlan) -> Vec<&Expr> { MergeClause::Delete => {} } } + exprs.extend(m.returning.iter().map(|ne| &ne.expr)); exprs } LogicalPlan::Scan(_) diff --git a/sql-insight/tests/column_operation_extractor/cte_and_dml.rs b/sql-insight/tests/column_operation_extractor/cte_and_dml.rs index a5f6989..b1915e8 100644 --- a/sql-insight/tests/column_operation_extractor/cte_and_dml.rs +++ b/sql-insight/tests/column_operation_extractor/cte_and_dml.rs @@ -124,6 +124,34 @@ mod merge { ); } + #[test] + fn merge_returning_projects_affected_rows() { + // RETURNING (Snowflake) / OUTPUT (MSSQL) projects the affected rows over + // the target + source — its columns surface as reads with `QueryOutput` + // lineage, alongside the WHEN action's write / lineage. + assert_column_ops( + "MERGE INTO t USING s ON t.id = s.id \ + WHEN MATCHED THEN UPDATE SET t.a = s.a RETURNING s.b, t.c", + ColumnOperation { + statement_kind: StatementKind::Merge, + reads: vec![ + read("t", "id"), + read("s", "id"), + read("s", "a"), + read("s", "b"), + read("t", "c"), + ], + writes: vec![write("t", "a")], + lineage: vec![ + passthrough(col("s", "a"), relation("t", "a")), + passthrough(col("s", "b"), out("b", 0)), + passthrough(col("t", "c"), out("c", 1)), + ], + diagnostics: vec![], + }, + ); + } + #[test] fn merge_when_not_matched_insert_emits_lineage_and_write() { assert_column_ops( From e057b3a6fdb812542274fed9053d7c306ab4c449 Mon Sep 17 00:00:00 2001 From: Takahiro Ebato Date: Sat, 27 Jun 2026 16:55:50 +0900 Subject: [PATCH 03/11] fix: extract wildcard REPLACE (expr AS col) outputs MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit A `SELECT * REPLACE (price * 1.1 AS price)` discarded its REPLACE clause entirely — the replacement expression's reads and lineage were lost. Each REPLACE element is a real value-producing output (the wildcard's same-named column with its value swapped), so bind it as a named projection output, like a standalone `expr AS col`: its reads / lineage are identical. The wildcard's own columns stay unexpanded (WildcardSuppressed), so the replacement's output position is best-effort. bind_select_item now returns Vec (a wildcard can yield several REPLACE outputs); the three call sites flat_map over it. Documented the behavior + position caveat in the crate Limitations. Co-Authored-By: Claude Opus 4.8 (1M context) --- sql-insight/src/lib.rs | 16 ++++-- sql-insight/src/resolver/binder.rs | 4 +- sql-insight/src/resolver/binder/expr.rs | 57 +++++++++++++------ sql-insight/src/resolver/binder/query.rs | 4 +- sql-insight/src/resolver/binder/statement.rs | 2 +- .../column_operation_extractor/diagnostics.rs | 19 +++++++ 6 files changed, 74 insertions(+), 28 deletions(-) diff --git a/sql-insight/src/lib.rs b/sql-insight/src/lib.rs index 94ee8f9..96a8073 100644 --- a/sql-insight/src/lib.rs +++ b/sql-insight/src/lib.rs @@ -117,13 +117,17 @@ //! Intentional non-support and known gaps — set expectations before //! relying on a given output: //! -//! - **Wildcards not expanded**: `SELECT *` / `t.*` contribute -//! nothing to `reads` / `lineage`. Expanding them safely would -//! require modelling USING / NATURAL JOIN merge, EXCLUDE / REPLACE -//! clauses, and multi-level aliases — too much rigor for a -//! SQL-text-only library. Surfaced as +//! - **Wildcards not expanded**: the `*` / `t.*` itself contributes +//! nothing to `reads` / `lineage` (expanding it safely would require +//! modelling USING / NATURAL JOIN merge, EXCLUDE / EXCEPT / RENAME, and +//! multi-level aliases — too much rigor for a SQL-text-only library). +//! Surfaced as //! [`WildcardSuppressed`](diagnostic::ColumnLevelDiagnosticKind::WildcardSuppressed) -//! so consumers can detect incomplete projections. +//! so consumers can detect incomplete projections. A `REPLACE (expr AS +//! col)` clause *is* extracted — each replacement's `expr` contributes +//! reads and a `col` lineage edge, exactly like a standalone `expr AS col` +//! — but its **output position** is best-effort, since the wildcard's own +//! columns aren't enumerated to place it among them. //! - **Table functions are opaque**: `UNNEST` / `generate_series` / //! `JSON_TABLE` / `PIVOT` etc. produce dynamic columns that aren't //! enumerated. Their argument expressions surface as reads, but a diff --git a/sql-insight/src/resolver/binder.rs b/sql-insight/src/resolver/binder.rs index df94de8..be398ba 100644 --- a/sql-insight/src/resolver/binder.rs +++ b/sql-insight/src/resolver/binder.rs @@ -49,8 +49,8 @@ use sqlparser::ast::{ use sqlparser::ast::{ AccessExpr, ConnectByKind, Distinct, FunctionArgumentClause, LimitClause, ListAggOnOverflow, - NamedWindowExpr, SelectItemQualifiedWildcardKind, Subscript, TopQuantity, WindowFrameBound, - WindowSpec, WindowType, + NamedWindowExpr, SelectItemQualifiedWildcardKind, Subscript, TopQuantity, + WildcardAdditionalOptions, WindowFrameBound, WindowSpec, WindowType, }; use sqlparser::tokenizer::Span; diff --git a/sql-insight/src/resolver/binder/expr.rs b/sql-insight/src/resolver/binder/expr.rs index b19d4db..c9bb50c 100644 --- a/sql-insight/src/resolver/binder/expr.rs +++ b/sql-insight/src/resolver/binder/expr.rs @@ -5,26 +5,26 @@ use super::*; impl<'a> Binder<'a> { - pub(super) fn bind_select_item( - &mut self, - item: &SelectItem, - scope: &Scope, - ) -> Option { + pub(super) fn bind_select_item(&mut self, item: &SelectItem, scope: &Scope) -> Vec { match item { - SelectItem::UnnamedExpr(expr) => Some(NamedExpr { + SelectItem::UnnamedExpr(expr) => vec![NamedExpr { name: inferred_name(expr), expr: self.bind_expr(expr, scope), - }), - SelectItem::ExprWithAlias { expr, alias } => Some(NamedExpr { + }], + SelectItem::ExprWithAlias { expr, alias } => vec![NamedExpr { name: Some(alias.clone()), expr: self.bind_expr(expr, scope), - }), + }], // A wildcard isn't expanded (the rigor cost is too high for a // SQL-text-only library); record it so consumers know this - // projection's column lineage is incomplete, and skip it. + // projection's column lineage is incomplete. A `REPLACE (expr AS + // col)` clause is a real value-producing output, though — bind each + // replacement as a named output (its reads / lineage are exactly a + // standalone `expr AS col`; only the output position is best-effort, + // since the wildcard's own columns aren't enumerated). SelectItem::Wildcard(options) => { self.record_wildcard_suppressed("wildcard `*`", options.wildcard_token.0.span); - None + self.replace_outputs(options, scope) } SelectItem::QualifiedWildcard(kind, options) => { let description = match kind { @@ -38,20 +38,43 @@ impl<'a> Binder<'a> { self.record_wildcard_suppressed(&description, options.wildcard_token.0.span); // `(expr).*` still projects its base expression as one // Transformation output (a structural field access); `alias.*` - // has no inspectable base. - match kind { - SelectItemQualifiedWildcardKind::Expr(expr) => Some(NamedExpr { + // has no inspectable base. Either way a `REPLACE` clause's + // explicit outputs follow. + let mut out = match kind { + SelectItemQualifiedWildcardKind::Expr(expr) => vec![NamedExpr { name: None, expr: Expr::Call { args: vec![self.bind_expr(expr, scope)], }, - }), - SelectItemQualifiedWildcardKind::ObjectName(_) => None, - } + }], + SelectItemQualifiedWildcardKind::ObjectName(_) => Vec::new(), + }; + out.extend(self.replace_outputs(options, scope)); + out } } } + /// A wildcard's `REPLACE (expr AS col, …)` outputs: each replacement is a + /// value-producing column named by `col`, bound like a standalone + /// `expr AS col` (its reads / lineage are identical). The wildcard's other + /// columns stay unexpanded. + fn replace_outputs( + &mut self, + options: &WildcardAdditionalOptions, + scope: &Scope, + ) -> Vec { + options + .opt_replace + .iter() + .flat_map(|replace| &replace.items) + .map(|element| NamedExpr { + name: Some(element.column_name.clone()), + expr: self.bind_expr(&element.expr, scope), + }) + .collect() + } + /// Resolve a `sqlparser` expression into a bound [`Expr`], mirroring the /// resolver's `collect_expr`. A bare column reference is the only /// [`Passthrough`](crate::extractor::ColumnLineageKind::Passthrough) shape diff --git a/sql-insight/src/resolver/binder/query.rs b/sql-insight/src/resolver/binder/query.rs index 4bae4e7..918e935 100644 --- a/sql-insight/src/resolver/binder/query.rs +++ b/sql-insight/src/resolver/binder/query.rs @@ -257,7 +257,7 @@ impl<'a> Binder<'a> { ) -> Vec { items .iter() - .filter_map(|i| self.bind_select_item(i, scope)) + .flat_map(|i| self.bind_select_item(i, scope)) .collect() } @@ -458,7 +458,7 @@ impl<'a> Binder<'a> { let exprs: Vec = select .projection .iter() - .filter_map(|item| self.bind_select_item(item, &scope)) + .flat_map(|item| self.bind_select_item(item, &scope)) .collect(); let clause_scope = scope.with_query_outputs(self.output_cols(&exprs)); // GROUP BY → an `Aggregate` over the filtered rows; its keys are reads. diff --git a/sql-insight/src/resolver/binder/statement.rs b/sql-insight/src/resolver/binder/statement.rs index 5d5e192..17156c7 100644 --- a/sql-insight/src/resolver/binder/statement.rs +++ b/sql-insight/src/resolver/binder/statement.rs @@ -818,7 +818,7 @@ impl<'a> Binder<'a> { returning .iter() .flatten() - .filter_map(|item| self.bind_select_item(item, scope)) + .flat_map(|item| self.bind_select_item(item, scope)) .collect() } diff --git a/sql-insight/tests/column_operation_extractor/diagnostics.rs b/sql-insight/tests/column_operation_extractor/diagnostics.rs index 9f6ab44..a2fed47 100644 --- a/sql-insight/tests/column_operation_extractor/diagnostics.rs +++ b/sql-insight/tests/column_operation_extractor/diagnostics.rs @@ -2,6 +2,7 @@ use crate::support::*; mod reported { use super::*; + use sql_insight::sqlparser::dialect::BigQueryDialect; #[test] fn unsupported_statement_reports_diagnostic() { @@ -95,6 +96,24 @@ mod reported { ); } + #[test] + fn wildcard_replace_expression_contributes_reads_and_lineage() { + // The `*` stays suppressed, but a `REPLACE (expr AS col)` is a real + // value-producing output: `products.price` reads and feeds `price` + // (Transformation), exactly like a standalone `price * 1.1 AS price`. + assert_column_ops_with_dialect( + &BigQueryDialect {}, + "SELECT * REPLACE (price * 1.1 AS price) FROM products", + ColumnOperation { + statement_kind: StatementKind::Select, + reads: vec![read("products", "price")], + writes: vec![], + lineage: vec![transformation(col("products", "price"), out("price", 0))], + diagnostics: vec![diag(ColumnLevelDiagnosticKind::WildcardSuppressed)], + }, + ); + } + #[test] fn multiple_statements_produce_multiple_results() { let sql = "SELECT t1.a FROM t1; SELECT t2.b FROM t2"; From 8e487fb9f51a75c2197dabf81cfae5aa9214020c Mon Sep 17 00:00:00 2001 From: Takahiro Ebato Date: Sat, 27 Jun 2026 17:04:20 +0900 Subject: [PATCH 04/11] fix: bind DELETE/UPDATE ORDER BY and LIMIT clauses MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit bind_delete ignored the MySQL `ORDER BY` / `LIMIT` tail, so `DELETE … ORDER BY priority LIMIT 5` dropped `priority` from reads. Bind both as filter-position reads (ORDER BY keys reference the target's columns → reads; a constant LIMIT adds none). bind_update gets the same treatment for its `LIMIT` (parity; no observable read since LIMIT is a row count, but no longer silently dropped). Audit of the other DML clauses: INSERT ON DUPLICATE / ON CONFLICT, the SET form, and MERGE RETURNING are already bound; INSERT PARTITION stays intentionally unextracted. UPDATE has no ORDER BY in this sqlparser version. Co-Authored-By: Claude Opus 4.8 (1M context) --- sql-insight/src/resolver/binder/statement.rs | 31 +++++++++++++++---- .../writes_deletes.rs | 19 ++++++++++++ 2 files changed, 44 insertions(+), 6 deletions(-) diff --git a/sql-insight/src/resolver/binder/statement.rs b/sql-insight/src/resolver/binder/statement.rs index 17156c7..3d964be 100644 --- a/sql-insight/src/resolver/binder/statement.rs +++ b/sql-insight/src/resolver/binder/statement.rs @@ -375,12 +375,20 @@ impl<'a> Binder<'a> { input = combine(input, node); } } - // WHERE: a filter over the read input (picks which rows update; its - // reads / subqueries do not feed the new value). - if let Some(predicate) = &update.selection { + // WHERE + the MySQL `LIMIT` tail are filter-position reads (they pick / + // bound which rows update; their reads / subqueries never feed the new + // value). A `LIMIT` is normally a constant, so it adds no read, but it's + // bound for parity with SELECT / DELETE rather than silently dropped. + let mut filter_reads: Vec = update + .selection + .iter() + .map(|predicate| self.bind_expr(predicate, &scope)) + .collect(); + filter_reads.extend(update.limit.iter().map(|e| self.bind_expr(e, &scope))); + if !filter_reads.is_empty() { input = LogicalPlan::Filter(Filter { input: Box::new(input), - predicate: vec![self.bind_expr(predicate, &scope)], + predicate: filter_reads, }); } // SET assignments resolve against the target + FROM scope; each writes @@ -463,10 +471,21 @@ impl<'a> Binder<'a> { targets.push(target); } } - if let Some(predicate) = &delete.selection { + // WHERE + the MySQL `ORDER BY` / `LIMIT` tail are filter-position reads + // (row selection / positioning / count), none feeding lineage. ORDER BY + // keys reference the target's columns, so they read it (source/sink); a + // constant LIMIT adds no read but is bound for parity, not dropped. + let mut filter_reads: Vec = delete + .selection + .iter() + .map(|predicate| self.bind_expr(predicate, &scope)) + .collect(); + filter_reads.extend(self.order_by_expr_keys(&delete.order_by, &scope)); + filter_reads.extend(delete.limit.iter().map(|e| self.bind_expr(e, &scope))); + if !filter_reads.is_empty() { input = LogicalPlan::Filter(Filter { input: Box::new(input), - predicate: vec![self.bind_expr(predicate, &scope)], + predicate: filter_reads, }); } // RETURNING resolves against the FROM / USING scope (which holds the diff --git a/sql-insight/tests/column_operation_extractor/writes_deletes.rs b/sql-insight/tests/column_operation_extractor/writes_deletes.rs index 8308b6a..7b22ee5 100644 --- a/sql-insight/tests/column_operation_extractor/writes_deletes.rs +++ b/sql-insight/tests/column_operation_extractor/writes_deletes.rs @@ -167,6 +167,7 @@ mod writes { mod delete { use super::*; + use sql_insight::sqlparser::dialect::MySqlDialect; #[test] fn delete_qualified_predicate_is_a_read() { @@ -181,4 +182,22 @@ mod delete { }, ); } + + #[test] + fn delete_order_by_keys_are_reads_constant_limit_is_not() { + // MySQL `DELETE … ORDER BY … LIMIT`: ORDER BY keys reference the + // target's columns (reads, filter position); the constant LIMIT adds + // no read. Neither feeds lineage. + assert_column_ops_with_dialect( + &MySqlDialect {}, + "DELETE FROM t WHERE flag = 1 ORDER BY priority DESC LIMIT 5", + ColumnOperation { + statement_kind: StatementKind::Delete, + reads: vec![read("t", "flag"), read("t", "priority")], + writes: vec![], + lineage: vec![], + diagnostics: vec![], + }, + ); + } } From c8c0fa61c3024634a43e9acb548b9ad9b6fab3ab Mon Sep 17 00:00:00 2001 From: Takahiro Ebato Date: Sat, 27 Jun 2026 17:11:22 +0900 Subject: [PATCH 05/11] fix: flow the LHS of IN (subquery) in value position MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit In value position, `a IN (list)` flowed `a` to the output (a boolean transformation), but `a IN (subquery)` flowed nothing — its origins were empty. Treat the InSubquery LHS as a value operand (a Transformation), like InList; the subquery's columns stay a membership test (reads, no origin). Fixed in both origins_of_expr and conflict_value_origins. Filter-position IN is unchanged. Co-Authored-By: Claude Opus 4.8 (1M context) --- sql-insight/src/resolver/origins.rs | 13 +++++++++++-- .../tests/column_operation_extractor/lineage.rs | 17 +++++++++++++++++ 2 files changed, 28 insertions(+), 2 deletions(-) diff --git a/sql-insight/src/resolver/origins.rs b/sql-insight/src/resolver/origins.rs index 22b1c5c..99dd0b2 100644 --- a/sql-insight/src/resolver/origins.rs +++ b/sql-insight/src/resolver/origins.rs @@ -151,8 +151,12 @@ pub(super) fn origins_of_expr<'a>( .iter() .flat_map(|c| origins_of_ref(c, input, context)) .collect(), + // `a IN (subquery)`: the LHS is a value operand — `a IN (…)` is a + // boolean transformation of it, like `a IN (list)`. The subquery's + // columns are a membership test (filter), contributing no origin. + Expr::InSubquery { expr, .. } => transform(origins_of_expr(expr, input, context)), // Tests / suppressed operands contribute no value origin (reads only). - Expr::Exists(_) | Expr::InSubquery { .. } | Expr::Filter(_) => Vec::new(), + Expr::Exists(_) | Expr::Filter(_) => Vec::new(), } } @@ -423,7 +427,12 @@ pub(super) fn conflict_value_origins<'a>( .iter() .filter_map(|c| column_read(c).map(|r| (r, ColumnLineageKind::Passthrough))) .collect(), - Expr::Exists(_) | Expr::InSubquery { .. } | Expr::Filter(_) => Vec::new(), + // `a IN (subquery)`: the LHS flows as a value operand (see + // `origins_of_expr`); the subquery side is a filter. + Expr::InSubquery { expr, .. } => { + transform(conflict_value_origins(expr, columns, source, context)) + } + Expr::Exists(_) | Expr::Filter(_) => Vec::new(), } } diff --git a/sql-insight/tests/column_operation_extractor/lineage.rs b/sql-insight/tests/column_operation_extractor/lineage.rs index 44e78f0..7cf4e60 100644 --- a/sql-insight/tests/column_operation_extractor/lineage.rs +++ b/sql-insight/tests/column_operation_extractor/lineage.rs @@ -48,6 +48,23 @@ mod projections { ); } + #[test] + fn in_subquery_in_value_position_flows_lhs_not_the_subquery() { + // `a IN (subquery)` projected as a value: the LHS `a` flows to the + // output (a boolean transformation of it, symmetric with `a IN (list)`); + // the subquery's `s.x` is a membership test — a read, never a source. + assert_column_ops( + "SELECT (a IN (SELECT x FROM s)) AS f FROM t", + ColumnOperation { + statement_kind: StatementKind::Select, + reads: vec![read("t", "a"), read("s", "x")], + writes: vec![], + lineage: vec![transformation(col("t", "a"), out("f", 0))], + diagnostics: vec![], + }, + ); + } + #[test] fn select_mixed_projection_separates_targets_by_position() { assert_column_ops( From fee6320db7157e45633ea223233c7535377a0a98 Mon Sep 17 00:00:00 2001 From: Takahiro Ebato Date: Sat, 27 Jun 2026 17:37:30 +0900 Subject: [PATCH 06/11] =?UTF-8?q?fix:=20extract=20tuple=20SET=20(a,=20b)?= =?UTF-8?q?=20=3D=20=E2=80=A6=20assignments?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit A tuple assignment target dropped its whole assignment (no writes, the RHS reads/lineage lost). Expand a tuple `SET (a, b) = …` to one Assignment per target column, paired positionally with the RHS: a row value `(e0, e1)` element-wise, a `(SELECT x, y)` subquery by output column. Generalize Expr::Subquery with an `output` index (0 for a scalar; the i-th for each tuple target), reusing trace_nth_output — so `(a,b)=(SELECT x,y)` flows a←x, b←y (and over a UNION, each branch's i-th output), with the shared subquery read once. Applies to UPDATE, MERGE WHEN UPDATE, and ON CONFLICT / ON DUPLICATE SET. Co-Authored-By: Claude Opus 4.8 (1M context) --- sql-insight/src/resolver/binder.rs | 17 +-- sql-insight/src/resolver/binder/expr.rs | 10 +- sql-insight/src/resolver/binder/query.rs | 5 +- sql-insight/src/resolver/binder/statement.rs | 136 ++++++++++++------ sql-insight/src/resolver/logical_plan.rs | 27 ++-- sql-insight/src/resolver/origins.rs | 19 ++- .../arm_coverage.rs | 11 +- .../column_operation_extractor/lineage.rs | 39 +++++ 8 files changed, 189 insertions(+), 75 deletions(-) diff --git a/sql-insight/src/resolver/binder.rs b/sql-insight/src/resolver/binder.rs index be398ba..14cd091 100644 --- a/sql-insight/src/resolver/binder.rs +++ b/sql-insight/src/resolver/binder.rs @@ -37,14 +37,15 @@ //! named on the root, never a read scan. use sqlparser::ast::{ - AlterTable as SqlAlterTable, AlterTableOperation, AssignmentTarget, CreateTable, - CreateTableLikeKind, CreateView as SqlCreateView, Cte as SqlCte, Delete as SqlDelete, - Expr as SqlExpr, FromTable, Function, FunctionArg, FunctionArgExpr, FunctionArguments, - GroupByExpr, GroupByWithModifier, Ident, Insert as SqlInsert, JoinConstraint, JoinOperator, - JsonPathElem, Merge as SqlMerge, MergeAction, MergeInsertKind, ObjectName, ObjectType, - OnConflictAction, OnInsert, OrderBy, OrderByExpr, OrderByKind, OutputClause, PipeOperator, - PivotValueSource, Query, Select, SelectItem, SetExpr, Statement, TableAlias, TableFactor, - TableObject, TableWithJoins, Update as SqlUpdate, UpdateTableFromKind, Values as SqlValues, + AlterTable as SqlAlterTable, AlterTableOperation, Assignment as SqlAssignment, + AssignmentTarget, CreateTable, CreateTableLikeKind, CreateView as SqlCreateView, Cte as SqlCte, + Delete as SqlDelete, Expr as SqlExpr, FromTable, Function, FunctionArg, FunctionArgExpr, + FunctionArguments, GroupByExpr, GroupByWithModifier, Ident, Insert as SqlInsert, + JoinConstraint, JoinOperator, JsonPathElem, Merge as SqlMerge, MergeAction, MergeInsertKind, + ObjectName, ObjectType, OnConflictAction, OnInsert, OrderBy, OrderByExpr, OrderByKind, + OutputClause, PipeOperator, PivotValueSource, Query, Select, SelectItem, SetExpr, Statement, + TableAlias, TableFactor, TableObject, TableWithJoins, Update as SqlUpdate, UpdateTableFromKind, + Values as SqlValues, }; use sqlparser::ast::{ diff --git a/sql-insight/src/resolver/binder/expr.rs b/sql-insight/src/resolver/binder/expr.rs index c9bb50c..37d2948 100644 --- a/sql-insight/src/resolver/binder/expr.rs +++ b/sql-insight/src/resolver/binder/expr.rs @@ -284,7 +284,10 @@ impl<'a> Binder<'a> { self.call([member_of.value.as_ref(), member_of.array.as_ref()], scope) } // A scalar subquery (value position): its output flows in. - SqlExpr::Subquery(query) => Expr::Subquery(Box::new(self.bind_subquery(query, scope))), + SqlExpr::Subquery(query) => Expr::Subquery { + plan: Box::new(self.bind_subquery(query, scope)), + output: 0, + }, // Tests (filter position): columns read, never an origin. SqlExpr::Exists { subquery, .. } => { Expr::Exists(Box::new(self.bind_subquery(subquery, scope))) @@ -477,7 +480,10 @@ impl<'a> Binder<'a> { } } } else if let FunctionArguments::Subquery(query) = &function.args { - args.push(Expr::Subquery(Box::new(self.bind_subquery(query, scope)))); + args.push(Expr::Subquery { + plan: Box::new(self.bind_subquery(query, scope)), + output: 0, + }); } if let Some(filter) = &function.filter { suppressed.push(self.bind_expr(filter, scope)); diff --git a/sql-insight/src/resolver/binder/query.rs b/sql-insight/src/resolver/binder/query.rs index 918e935..f2ab448 100644 --- a/sql-insight/src/resolver/binder/query.rs +++ b/sql-insight/src/resolver/binder/query.rs @@ -836,7 +836,10 @@ impl<'a> Binder<'a> { .map(|o| self.bind_expr(&o.expr, scope)) .collect(), PivotValueSource::Subquery(query) => { - vec![Expr::Subquery(Box::new(self.bind_subquery(query, scope)))] + vec![Expr::Subquery { + plan: Box::new(self.bind_subquery(query, scope)), + output: 0, + }] } } } diff --git a/sql-insight/src/resolver/binder/statement.rs b/sql-insight/src/resolver/binder/statement.rs index 3d964be..9a258c4 100644 --- a/sql-insight/src/resolver/binder/statement.rs +++ b/sql-insight/src/resolver/binder/statement.rs @@ -316,16 +316,7 @@ impl<'a> Binder<'a> { // A conflict-action SET always targets the insert target's own columns. let bound = assignments .iter() - .filter_map(|a| { - let value = self.bind_expr(&a.value, &scope); - let (target, target_resolution) = - self.assignment_target(&a.target, &scope, target)?; - Some(Assignment { - target, - target_resolution, - value, - }) - }) + .flat_map(|a| self.bind_assignment(a, &scope, target)) .collect(); let predicate = selection .map(|s| self.bind_expr(s, &scope)) @@ -393,20 +384,12 @@ impl<'a> Binder<'a> { } // SET assignments resolve against the target + FROM scope; each writes // its resolved target table (the root, or the relation a qualifier names - // in a multi-table `UPDATE t1 JOIN t2 SET t2.col = …`). + // in a multi-table `UPDATE t1 JOIN t2 SET t2.col = …`). A tuple + // `SET (a, b) = …` expands to one assignment per target column. let assignments = update .assignments .iter() - .filter_map(|a| { - let value = self.bind_expr(&a.value, &scope); - let (target, target_resolution) = - self.assignment_target(&a.target, &scope, &target)?; - Some(Assignment { - target, - target_resolution, - value, - }) - }) + .flat_map(|a| self.bind_assignment(a, &scope, &target)) .collect(); // RETURNING resolves against the statement scope (target + FROM). let returning = self.bind_returning(&update.returning, &scope); @@ -592,20 +575,11 @@ impl<'a> Binder<'a> { on.push(self.bind_expr(predicate, &scope)); } // A MERGE WHEN UPDATE always targets the merge target's - // own columns. + // own columns (a tuple SET expands per target column). let assignments = update .assignments .iter() - .filter_map(|a| { - let value = self.bind_expr(&a.value, &scope); - let (target, target_resolution) = - self.assignment_target(&a.target, &scope, &target)?; - Some(Assignment { - target, - target_resolution, - value, - }) - }) + .flat_map(|a| self.bind_assignment(a, &scope, &target)) .collect(); clauses.push(MergeClause::Update { assignments }); } @@ -746,22 +720,96 @@ impl<'a> Binder<'a> { }) } - /// Resolve a SET assignment's target to the column it writes, qualified by - /// its **resolved table**: an unqualified column writes the DML `root`; a - /// qualified `t2.col` (a multi-table `UPDATE t1 JOIN t2 SET t2.col = …`) - /// writes whichever in-scope real table the qualifier names. Returns `None` - /// — dropped (+ flagged by the caller) — for a tuple target (`SET (a,b)=…`, - /// not modelled) or a qualifier that names no writable table (a derived - /// table / CTE / unknown alias can't be a write target). - fn assignment_target( + /// Bind one SET assignment into the per-column [`Assignment`]s it writes — + /// one for a single `col = expr`, several for a tuple `(a, b) = …` (one per + /// target column). See [`bind_tuple_assignment`](Self::bind_tuple_assignment) + /// for the tuple pairing. `root` is the DML target an unqualified column + /// writes; `scope` resolves a qualified `t2.col` and the RHS reads. + pub(super) fn bind_assignment( + &mut self, + assignment: &SqlAssignment, + scope: &Scope, + root: &TableReference, + ) -> Vec { + match &assignment.target { + AssignmentTarget::ColumnName(name) => { + let value = self.bind_expr(&assignment.value, scope); + self.resolve_assignment_column(name, scope, root) + .map(|(target, target_resolution)| Assignment { + target, + target_resolution, + value, + }) + .into_iter() + .collect() + } + AssignmentTarget::Tuple(names) => { + self.bind_tuple_assignment(names, &assignment.value, scope, root) + } + } + } + + /// Expand a tuple `SET (a, b, …) = rhs` into one [`Assignment`] per target, + /// pairing each with its positional value: a row value `(e0, e1)` + /// element-wise; a `(SELECT x, y)` subquery by output column (each target + /// projects its positional output via [`Expr::Subquery`]'s `output`). The + /// subquery is bound once (reads count once); each target's value clones + /// that plan, differing only in which output column it projects — so the + /// reads walker must see the subquery on exactly one target (the first; the + /// rest carry `output > 0`, which it skips). Targets past the RHS arity, or + /// that resolve to no writable table, are dropped. + fn bind_tuple_assignment( + &mut self, + names: &[ObjectName], + rhs: &SqlExpr, + scope: &Scope, + root: &TableReference, + ) -> Vec { + let values: Vec = match rhs { + // `(a, b) = (e0, e1)` — a row value: each element is one value. + SqlExpr::Tuple(elems) => elems.iter().map(|e| self.bind_expr(e, scope)).collect(), + // `(a, b) = (SELECT x, y …)` — each target projects its positional + // output column of the one subquery. + SqlExpr::Subquery(query) => { + let plan = self.bind_subquery(query, scope); + (0..names.len()) + .map(|output| Expr::Subquery { + plan: Box::new(plan.clone()), + output, + }) + .collect() + } + // Any other RHS on a tuple target isn't SQL we model row-wise; bind + // it once so its reads still surface (paired with the first target). + other => vec![self.bind_expr(other, scope)], + }; + names + .iter() + .zip(values) + .filter_map(|(name, value)| { + self.resolve_assignment_column(name, scope, root).map( + |(target, target_resolution)| Assignment { + target, + target_resolution, + value, + }, + ) + }) + .collect() + } + + /// Resolve a SET assignment's target column to the column it writes, + /// qualified by its **resolved table**: an unqualified column writes the DML + /// `root`; a qualified `t2.col` (a multi-table `UPDATE t1 JOIN t2 SET t2.col + /// = …`) writes whichever in-scope real table the qualifier names. Returns + /// `None` — dropped — for a qualifier that names no writable table (a + /// derived table / CTE / unknown alias can't be a write target). + fn resolve_assignment_column( &self, - target: &AssignmentTarget, + name: &ObjectName, scope: &Scope, root: &TableReference, ) -> Option<(ColumnWrite, ResolutionKind)> { - let AssignmentTarget::ColumnName(name) = target else { - return None; // tuple `SET (a, b) = …` — not modelled - }; let parts: Vec = name .0 .iter() diff --git a/sql-insight/src/resolver/logical_plan.rs b/sql-insight/src/resolver/logical_plan.rs index 736b9d6..00ebe8b 100644 --- a/sql-insight/src/resolver/logical_plan.rs +++ b/sql-insight/src/resolver/logical_plan.rs @@ -371,9 +371,14 @@ pub(crate) enum Expr { partition: Vec, order: Vec, }, - /// A scalar subquery (value position): its first output column's origins - /// flow into the enclosing value (as a `Transformation`). - Subquery(Box), + /// A subquery in value position: the `output`-th output column's origins + /// flow into the enclosing value (as a `Transformation`). `output` is 0 for + /// a scalar `(SELECT …)`; a tuple assignment `SET (a, b) = (SELECT x, y …)` + /// binds one per target column, each projecting its positional output. + Subquery { + plan: Box, + output: usize, + }, /// `EXISTS (subquery)` (filter position): a test, never a value origin. Exists(Box), /// `expr IN (subquery)` (filter position). @@ -420,7 +425,7 @@ impl Expr { } Expr::Window { arg, .. } => vec![arg], Expr::Column(_) - | Expr::Subquery(_) + | Expr::Subquery { .. } | Expr::Exists(_) | Expr::InSubquery { .. } | Expr::Filter(_) @@ -439,7 +444,7 @@ impl Expr { Expr::Filter(exprs) => exprs.iter().collect(), Expr::Column(_) | Expr::Call { .. } - | Expr::Subquery(_) + | Expr::Subquery { .. } | Expr::Exists(_) | Expr::Fanin(_) => Vec::new(), } @@ -451,8 +456,14 @@ impl Expr { /// it for the source-table feed). pub(super) fn value_subplans(&self) -> Vec<&LogicalPlan> { match self { - Expr::Subquery(plan) => vec![plan], - Expr::Column(_) + Expr::Subquery { plan, output: 0 } => vec![plan], + // `output > 0` is a tuple `SET (a, b) = (SELECT …)` clone: every + // target column clones the same subquery, differing only in which + // output it projects. Expose the shared plan once (at output 0 + // above) so reads / source-feeds stay occurrence-correct; column + // lineage traces each output separately via `origins`, not here. + Expr::Subquery { .. } + | Expr::Column(_) | Expr::Call { .. } | Expr::Case { .. } | Expr::Window { .. } @@ -474,7 +485,7 @@ impl Expr { | Expr::Call { .. } | Expr::Case { .. } | Expr::Window { .. } - | Expr::Subquery(_) + | Expr::Subquery { .. } | Expr::Filter(_) | Expr::Fanin(_) => Vec::new(), } diff --git a/sql-insight/src/resolver/origins.rs b/sql-insight/src/resolver/origins.rs index 99dd0b2..19b5cf2 100644 --- a/sql-insight/src/resolver/origins.rs +++ b/sql-insight/src/resolver/origins.rs @@ -140,8 +140,10 @@ pub(super) fn origins_of_expr<'a>( } // The function argument is value; partition / order keys are filter. Expr::Window { arg, .. } => transform(origins_of_expr(arg, input, context)), - // A scalar subquery's first output column flows as a transformation. - Expr::Subquery(plan) => transform(scalar_subquery_origins(plan, context)), + // A subquery's `output`-th column flows as a transformation. + Expr::Subquery { plan, output } => { + transform(subquery_output_origins(plan, *output, context)) + } // A merge-column fan-in: each owning side is its own origin, traced // like a column ref — a real-table side is a `Passthrough` base read, // a derived / CTE side traces into its producing subquery (so a @@ -341,15 +343,18 @@ fn origins_into<'a>( /// value) — fanning across **every** set-operation branch (each branch's /// position-0 output), not just the leftmost, since the branches merge /// positionally. -fn scalar_subquery_origins<'a>( +fn subquery_output_origins<'a>( op: &'a LogicalPlan, + output: usize, context: &mut TraceContext<'a>, ) -> Vec<(ColumnRead, ColumnLineageKind)> { - // Position 0 of every branch. `output_operands` accumulates each peeled + // The `output`-th column of every branch (0 for a scalar subquery; the i-th + // for a tuple `SET (a, b) = (SELECT x, y)` assignment — a set-op body fans + // the position over each branch). `output_operands` accumulates each peeled // `With`'s CTE declarations onto the operand and `trace_nth_output` // registers them — so a `CteRef` in the body (including the subquery's *own* // leading `WITH`, which `TraceContext::new` doesn't see) resolves. - trace_nth_output(&output_operands(op), 0, context) + trace_nth_output(&output_operands(op), output, context) } /// Trace the `i`-th output column of every operand — the positional fan-out @@ -422,7 +427,9 @@ pub(super) fn conflict_value_origins<'a>( Expr::Window { arg, .. } => { transform(conflict_value_origins(arg, columns, source, context)) } - Expr::Subquery(plan) => transform(scalar_subquery_origins(plan, context)), + Expr::Subquery { plan, output } => { + transform(subquery_output_origins(plan, *output, context)) + } Expr::Fanin(refs) => refs .iter() .filter_map(|c| column_read(c).map(|r| (r, ColumnLineageKind::Passthrough))) diff --git a/sql-insight/tests/column_operation_extractor/arm_coverage.rs b/sql-insight/tests/column_operation_extractor/arm_coverage.rs index 5abd557..184990a 100644 --- a/sql-insight/tests/column_operation_extractor/arm_coverage.rs +++ b/sql-insight/tests/column_operation_extractor/arm_coverage.rs @@ -1301,14 +1301,13 @@ mod relation_arm_coverage { } #[test] - fn update_tuple_assignment_target_is_skipped() { - // `AssignmentTarget::Tuple(_)` returns `None` from - // `assignment_target_parts`, so the SET position is skipped: - // no writes emitted (tuple targets are not yet supported for - // column-level pairing). The RHS literals contribute no reads. + fn update_tuple_assignment_writes_each_target_column() { + // `AssignmentTarget::Tuple` expands to one write per target column, + // paired positionally with the RHS row value. The literals `(1, 2)` + // contribute no reads. let result = op("UPDATE t SET (a, b) = (1, 2)"); assert_unordered_eq!(result.reads, vec![]); - assert_eq!(result.writes, vec![]); + assert_eq!(result.writes, vec![w("t", "a"), w("t", "b")]); } #[test] diff --git a/sql-insight/tests/column_operation_extractor/lineage.rs b/sql-insight/tests/column_operation_extractor/lineage.rs index 7cf4e60..98823c8 100644 --- a/sql-insight/tests/column_operation_extractor/lineage.rs +++ b/sql-insight/tests/column_operation_extractor/lineage.rs @@ -2,6 +2,7 @@ use crate::support::*; mod projections { use super::*; + use sql_insight::sqlparser::dialect::PostgreSqlDialect; #[test] fn select_bare_column_emits_passthrough_edge_to_query_output() { @@ -265,6 +266,44 @@ mod projections { ); } + #[test] + fn update_tuple_set_subquery_pairs_each_target_with_its_output() { + // `SET (a, b) = (SELECT x, y FROM s)`: the one subquery is read once, + // and each target column pairs with its positional output (a ← x via + // output 0, b ← y via output 1) — not the first output for both. + assert_column_ops_with_dialect( + &PostgreSqlDialect {}, + "UPDATE t SET (a, b) = (SELECT x, y FROM s)", + ColumnOperation { + statement_kind: StatementKind::Update, + reads: vec![read("s", "x"), read("s", "y")], + writes: vec![write("t", "a"), write("t", "b")], + lineage: vec![ + transformation(col("s", "x"), relation("t", "a")), + transformation(col("s", "y"), relation("t", "b")), + ], + diagnostics: vec![], + }, + ); + } + + #[test] + fn update_tuple_set_row_value_pairs_each_target_element_wise() { + // `SET (a, b) = (10, c)`: a row value — each element is one target's + // value (a ← 10, no source; b ← c passthrough). + assert_column_ops_with_dialect( + &PostgreSqlDialect {}, + "UPDATE t SET (a, b) = (10, c)", + ColumnOperation { + statement_kind: StatementKind::Update, + reads: vec![read("t", "c")], + writes: vec![write("t", "a"), write("t", "b")], + lineage: vec![passthrough(col("t", "c"), relation("t", "b"))], + diagnostics: vec![], + }, + ); + } + #[test] fn update_set_with_qualified_rhs_resolves_to_other_table() { assert_column_ops( From cfc13ee7e0636176c974d4800698ef99d73f27b9 Mon Sep 17 00:00:00 2001 From: Takahiro Ebato Date: Sat, 27 Jun 2026 17:51:50 +0900 Subject: [PATCH 07/11] docs: note that normalization replaces all literals, including structural ones MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit The normalizer's "every literal → ?" pass also rewrites JSON paths, CAST FORMAT strings, and TABLESAMPLE counts — consistent with how the same literals normalize as function arguments (JSON_EXTRACT path, TO_CHAR format) and with LIMIT/OFFSET. Document this neutrally so consumers know queries differing only in such a literal collapse together. Co-Authored-By: Claude Opus 4.8 (1M context) --- sql-insight/src/normalizer.rs | 15 +++++++++++++-- 1 file changed, 13 insertions(+), 2 deletions(-) diff --git a/sql-insight/src/normalizer.rs b/sql-insight/src/normalizer.rs index 9bc8d66..307f45f 100644 --- a/sql-insight/src/normalizer.rs +++ b/sql-insight/src/normalizer.rs @@ -4,8 +4,19 @@ //! //! The base pass replaces every literal `Value` with a `?` //! placeholder, so queries that differ only in their parameter -//! values collapse to the same string. Three opt-in toggles -//! ([`NormalizerOptions`]) further collapse repetitive shapes: +//! values collapse to the same string. +//! +//! "Every literal" is meant literally: it includes `Value`s in +//! structurally significant positions, not just bound-parameter slots. +//! A JSON path (`JSON_TABLE(data, '$.a')`, `JSON_EXTRACT(data, '$.a')`), +//! a `CAST(x AS DATE FORMAT 'YYYY-MM-DD')` format string, the +//! `TABLESAMPLE (BUCKET 3 OUT OF 10)` / `(10 PERCENT)` counts, and +//! `LIMIT` / `OFFSET` are all rewritten to `?`. So two queries differing +//! only in such a literal — e.g. selecting a different JSON field or +//! sampling a different bucket — collapse to the same normalized string. +//! +//! Three opt-in toggles ([`NormalizerOptions`]) further collapse +//! repetitive shapes: //! //! - [`unify_in_list`](NormalizerOptions::unify_in_list): //! `IN (1, 2, 3)` → `IN (...)`. From 1058d683f705bd9291776fe2a8131fc484718d40 Mon Sep 17 00:00:00 2001 From: Takahiro Ebato Date: Sat, 27 Jun 2026 18:04:15 +0900 Subject: [PATCH 08/11] fix: treat a parameterized FROM function as an opaque table factor MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit `generate_series(1, 10) AS g` parses as TableFactor::Table with args, which was bound as a real-table Scan — so the function name appeared in table reads and a ref through its alias (`g.value`) surfaced as a phantom base read `generate_series.value`. Bind it as an opaque table-producing factor (like UNNEST / TableFactor::Function): its args are reads, a ref through its alias is a synthetic source, and the name is not a real table. This matches the documented "table functions are opaque" contract and the UNNEST path. Co-Authored-By: Claude Opus 4.8 (1M context) --- sql-insight/src/resolver/binder/query.rs | 25 +++++++++---------- .../arm_coverage.rs | 19 ++++++++++++++ 2 files changed, 31 insertions(+), 13 deletions(-) diff --git a/sql-insight/src/resolver/binder/query.rs b/sql-insight/src/resolver/binder/query.rs index f2ab448..d52274e 100644 --- a/sql-insight/src/resolver/binder/query.rs +++ b/sql-insight/src/resolver/binder/query.rs @@ -585,6 +585,17 @@ impl<'a> Binder<'a> { TableFactor::Table { name, alias, args, .. } => { + // `foo(args)` in FROM is a table-valued function, not a base + // table — bind it as an opaque table-producing factor (like + // UNNEST / `TableFactor::Function`): its produced columns are + // dynamic, so a reference through its alias is a synthetic source + // and the function name is *not* a real table read. The argument + // expressions read against the sibling scope. + if let Some(args) = args { + let bound = + self.bind_function_arg_list(&args.args, &Scope::from_relations(left)); + return self.opaque(LogicalPlan::Empty, bound, alias.as_ref()); + } let Some(written) = self.table_ref(name) else { return (LogicalPlan::Empty, Scope::default()); }; @@ -611,19 +622,7 @@ impl<'a> Binder<'a> { ); } } - let (scan, scope) = self.bind_named_table(&written, alias_name); - // A parameterised table reference `foo(args)`: the argument - // expressions read against the surrounding (sibling) scope — - // attach them as a non-feeding filter over the scan. - let node = match args { - Some(args) => LogicalPlan::Filter(Filter { - input: Box::new(scan), - predicate: self - .bind_function_arg_list(&args.args, &Scope::from_relations(left)), - }), - None => scan, - }; - (node, scope) + self.bind_named_table(&written, alias_name) } // A derived table `() AS d`: bind the subquery, expose // its output columns as a synthetic relation under the alias. A diff --git a/sql-insight/tests/column_operation_extractor/arm_coverage.rs b/sql-insight/tests/column_operation_extractor/arm_coverage.rs index 184990a..6883b1c 100644 --- a/sql-insight/tests/column_operation_extractor/arm_coverage.rs +++ b/sql-insight/tests/column_operation_extractor/arm_coverage.rs @@ -872,6 +872,25 @@ mod relation_arm_coverage { ); } + #[test] + fn parameterized_table_function_is_opaque() { + // `generate_series(1, 10) AS g` is a table-valued function, not a base + // table: a reference through its alias (`g.value`) is a synthetic + // source, dropped from reads, and `generate_series` is not a real table + // read. (It used to be scanned as a real table, surfacing a phantom + // `generate_series.value` base read.) Literal args contribute nothing. + assert_unordered_eq!( + reads("SELECT g.value FROM generate_series(1, 10) AS g"), + vec![] + ); + // A column argument is still walked (here `t` is not in FROM, so it's + // unresolved) — proving the args read like any other opaque factor. + assert_unordered_eq!( + reads("SELECT g.v FROM generate_series(t.a) AS g"), + vec![unresolved("a")] + ); + } + #[test] fn pivot() { let result = op("SELECT * FROM t PIVOT(SUM(t.amt) FOR t.mon IN ('a', 'b'))"); From 19c3a7389a2c5d73c43f56aaf6c94a741f87a6c5 Mon Sep 17 00:00:00 2001 From: Takahiro Ebato Date: Sat, 27 Jun 2026 18:23:14 +0900 Subject: [PATCH 09/11] feat: flag INSERT/MERGE arity mismatches for VALUES and column-less sources The InsertColumnsArityMismatch diagnostic only fired for an explicit column list against a SELECT source. Extend it: derive the source value count from a VALUES row width too, and check it for plain INSERT, MERGE WHEN-INSERT, and the column-less catalog-filled case. An explicit list is flagged on any mismatch (both directions are a hard error in real engines); a column-less target is flagged only when the source overflows it (a narrower source may fill the first N columns + defaults, e.g. PostgreSQL). New diagnose_insert_arity helper. Co-Authored-By: Claude Opus 4.8 (1M context) --- sql-insight/src/resolver/binder.rs | 24 ++++++++ sql-insight/src/resolver/binder/statement.rs | 59 ++++++++++++++----- .../column_operation_extractor/cte_and_dml.rs | 25 ++++++++ .../column_operation_extractor/diagnostics.rs | 17 ++++++ .../column_operation_extractor/resolution.rs | 6 +- 5 files changed, 113 insertions(+), 18 deletions(-) diff --git a/sql-insight/src/resolver/binder.rs b/sql-insight/src/resolver/binder.rs index 14cd091..da45cb2 100644 --- a/sql-insight/src/resolver/binder.rs +++ b/sql-insight/src/resolver/binder.rs @@ -283,6 +283,30 @@ impl<'a> Binder<'a> { }); } + /// Record an INSERT / MERGE-INSERT arity mismatch between the target columns + /// and the source values *if* they disagree (a no-op otherwise) — the caller + /// passes the two determinate counts (no wildcard). An **explicit** column + /// list must match exactly: either direction silently zips to the shorter + /// side. A **column-less** target filled from the catalog is flagged only + /// when the source is *wider* (the surplus is dropped); a narrower source + /// may rely on column defaults, so it isn't. + pub(super) fn diagnose_insert_arity( + &mut self, + target: &TableReference, + explicit: bool, + target_columns: usize, + source_columns: usize, + ) { + let mismatch = if explicit { + source_columns != target_columns + } else { + source_columns > target_columns + }; + if mismatch { + self.record_insert_columns_arity_mismatch(target, target_columns, source_columns); + } + } + /// Flag a CTAS / CREATE VIEW (without an explicit column list) whose source /// projects unaliased expressions: those columns have no name recoverable /// from the SQL text, so they're dropped from column `writes` / `lineage`. diff --git a/sql-insight/src/resolver/binder/statement.rs b/sql-insight/src/resolver/binder/statement.rs index 9a258c4..55eaa2b 100644 --- a/sql-insight/src/resolver/binder/statement.rs +++ b/sql-insight/src/resolver/binder/statement.rs @@ -160,21 +160,29 @@ impl<'a> Binder<'a> { if columns.is_empty() && insert.source.is_some() { self.record_insert_columns_unresolved(&target, source_wildcard); } - // An *explicit* target column list whose count differs from the source - // query's projected columns: relation lineage zips to the shorter side, - // silently dropping the surplus. Flag it. (Only when the source exposes - // a determinate projection — a `VALUES` / pure-wildcard source yields no - // operands here, and a wildcard-bearing source is indeterminate.) - if !insert.columns.is_empty() && !source_wildcard { - if let Some(operand) = output_operands(&input).first() { - let outputs = operand.outputs; - if !outputs.is_empty() && outputs.len() != columns.len() { - self.record_insert_columns_arity_mismatch( - &target, - columns.len(), - outputs.len(), - ); - } + // The source's determinate value count: a `VALUES` row width, or a + // SELECT's projected output count. `None` when indeterminate (a + // wildcard-bearing source, or no inspectable operand). + let source_count = if source_wildcard { + None + } else { + match &input { + LogicalPlan::Values(v) => v.rows.first().map(Vec::len), + other => output_operands(other) + .first() + .map(|operand| operand.outputs.len()) + .filter(|n| *n > 0), + } + }; + if let Some(source_count) = source_count { + if !insert.columns.is_empty() { + // An explicit list must match the source exactly — either + // direction silently zips to the shorter side. + self.diagnose_insert_arity(&target, true, columns.len(), source_count); + } else if !m.columns.is_empty() { + // A column-less target filled from the catalog: a wider source + // overflows the table, dropping its surplus columns. + self.diagnose_insert_arity(&target, false, m.columns.len(), source_count); } } // ON CONFLICT DO UPDATE / ON DUPLICATE KEY UPDATE: extra writes + their @@ -547,6 +555,27 @@ impl<'a> Binder<'a> { if columns.is_empty() && !row.is_empty() { self.record_insert_columns_unresolved(&target, false); } + // Arity (mirrors `bind_insert`): an explicit column + // list must match the row exactly; a column-less + // catalog-filled target is flagged only when the row + // overflows it. + if !row.is_empty() { + if !insert.columns.is_empty() { + self.diagnose_insert_arity( + &target, + true, + columns.len(), + row.len(), + ); + } else if !catalog_cols.is_empty() { + self.diagnose_insert_arity( + &target, + false, + catalog_cols.len(), + row.len(), + ); + } + } clauses.push(MergeClause::Insert { columns: self.column_writes(&target, &catalog_cols, columns), values: row, diff --git a/sql-insight/tests/column_operation_extractor/cte_and_dml.rs b/sql-insight/tests/column_operation_extractor/cte_and_dml.rs index b1915e8..c201ad2 100644 --- a/sql-insight/tests/column_operation_extractor/cte_and_dml.rs +++ b/sql-insight/tests/column_operation_extractor/cte_and_dml.rs @@ -152,6 +152,31 @@ mod merge { ); } + #[test] + fn merge_insert_arity_mismatch_is_flagged() { + // 3 target columns, 2 values: the surplus column is dropped (no value to + // pair with), flagged like a plain INSERT arity mismatch. + assert_column_ops( + "MERGE INTO t USING s ON t.id = s.id \ + WHEN NOT MATCHED THEN INSERT (a, b, c) VALUES (s.x, s.y)", + ColumnOperation { + statement_kind: StatementKind::Merge, + reads: vec![ + read("t", "id"), + read("s", "id"), + read("s", "x"), + read("s", "y"), + ], + writes: vec![write("t", "a"), write("t", "b")], + lineage: vec![ + passthrough(col("s", "x"), relation("t", "a")), + passthrough(col("s", "y"), relation("t", "b")), + ], + diagnostics: vec![diag(ColumnLevelDiagnosticKind::InsertColumnsArityMismatch)], + }, + ); + } + #[test] fn merge_when_not_matched_insert_emits_lineage_and_write() { assert_column_ops( diff --git a/sql-insight/tests/column_operation_extractor/diagnostics.rs b/sql-insight/tests/column_operation_extractor/diagnostics.rs index a2fed47..548ef82 100644 --- a/sql-insight/tests/column_operation_extractor/diagnostics.rs +++ b/sql-insight/tests/column_operation_extractor/diagnostics.rs @@ -191,6 +191,23 @@ mod reported { ); } + #[test] + fn insert_values_arity_mismatch_is_flagged() { + // An explicit column list against a VALUES row: 3 columns, 2 values → + // flagged like the SELECT-source case. All three columns still surface + // as writes (from syntax); a VALUES source has no traceable lineage. + assert_column_ops( + "INSERT INTO t (a, b, c) VALUES (1, 2)", + ColumnOperation { + statement_kind: StatementKind::Insert, + reads: vec![], + writes: vec![write("t", "a"), write("t", "b"), write("t", "c")], + lineage: vec![], + diagnostics: vec![diag(ColumnLevelDiagnosticKind::InsertColumnsArityMismatch)], + }, + ); + } + #[test] fn insert_with_wildcard_source_drops_lineage_and_skips_arity_diagnostic() { // A wildcard in the source projection (`SELECT *, y`) makes the column diff --git a/sql-insight/tests/column_operation_extractor/resolution.rs b/sql-insight/tests/column_operation_extractor/resolution.rs index 66389fd..cc9d472 100644 --- a/sql-insight/tests/column_operation_extractor/resolution.rs +++ b/sql-insight/tests/column_operation_extractor/resolution.rs @@ -207,8 +207,8 @@ mod catalog_strict { #[test] fn catalog_insert_without_explicit_columns_source_longer_than_target() { - // 3 source projections vs t = [x, y] — pair what fits, - // surplus source column gets no lineage. + // 3 source projections vs t = [x, y] — pair what fits, the surplus + // source column gets no lineage, and the overflow is flagged. let catalog = TestCatalog::default().with("t", vec!["x", "y"]); assert_column_ops_with_catalog( "INSERT INTO t SELECT a, b, c FROM s", @@ -221,7 +221,7 @@ mod catalog_strict { passthrough(col("s", "a"), relation("t", "x")), passthrough(col("s", "b"), relation("t", "y")), ], - diagnostics: vec![], + diagnostics: vec![diag(ColumnLevelDiagnosticKind::InsertColumnsArityMismatch)], }, ); } From d7950f83c14af2fe6b3e7af28540bb6c3262f628 Mon Sep 17 00:00:00 2001 From: Takahiro Ebato Date: Sat, 27 Jun 2026 18:33:34 +0900 Subject: [PATCH 10/11] feat: count GROUP BY / ORDER BY positional ordinals as reads MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit A positional ordinal key (`GROUP BY 1`, `ORDER BY 2`) bound to an empty literal, so it added no read — `GROUP BY 1` undercounted vs `GROUP BY a` (occurrence-based reads). Resolve a plain positive-integer key to the 1-based n-th query output and bind it by name, reusing clause-alias resolution: an identity output reads its column again, an introduced alias binds Derived (suppressed, the dependency is at the projection). Out-of-range / non-integer / unnamed positions bind as written. New ordinal_output_name helper. Co-Authored-By: Claude Opus 4.8 (1M context) --- sql-insight/src/resolver/binder.rs | 17 +++++- sql-insight/src/resolver/binder/expr.rs | 17 +++++- .../reads_semantics.rs | 55 +++++++++++++++++++ 3 files changed, 86 insertions(+), 3 deletions(-) diff --git a/sql-insight/src/resolver/binder.rs b/sql-insight/src/resolver/binder.rs index da45cb2..2bfde37 100644 --- a/sql-insight/src/resolver/binder.rs +++ b/sql-insight/src/resolver/binder.rs @@ -45,7 +45,7 @@ use sqlparser::ast::{ ObjectName, ObjectType, OnConflictAction, OnInsert, OrderBy, OrderByExpr, OrderByKind, OutputClause, PipeOperator, PivotValueSource, Query, Select, SelectItem, SetExpr, Statement, TableAlias, TableFactor, TableObject, TableWithJoins, Update as SqlUpdate, UpdateTableFromKind, - Values as SqlValues, + Value, Values as SqlValues, }; use sqlparser::ast::{ @@ -696,6 +696,21 @@ fn inferred_name(expr: &SqlExpr) -> Option { } } +/// The name of the query output a positional ordinal key (`GROUP BY 1` / +/// `ORDER BY 1`) refers to — the 1-based n-th [`Scope::query_outputs`] entry, +/// if it has one. `None` for a non-integer / zero / out-of-range position, or +/// an anonymous output: the caller then binds the literal as written. +fn ordinal_output_name(expr: &SqlExpr, scope: &Scope) -> Option { + let SqlExpr::Value(v) = expr else { + return None; + }; + let Value::Number(digits, _) = &v.value else { + return None; + }; + let n: usize = digits.parse().ok()?; + scope.query_outputs.get(n.checked_sub(1)?)?.name.clone() +} + /// The `ON` predicate of a join operator, if any. /// The constraint of any constraint-carrying join operator (everything but /// `CROSS APPLY` / `OUTER APPLY`). diff --git a/sql-insight/src/resolver/binder/expr.rs b/sql-insight/src/resolver/binder/expr.rs index 37d2948..bdbf30e 100644 --- a/sql-insight/src/resolver/binder/expr.rs +++ b/sql-insight/src/resolver/binder/expr.rs @@ -687,12 +687,25 @@ impl<'a> Binder<'a> { _ => None, })); for expr in members { - keys.push(self.bind_expr(expr, scope)); + keys.push(self.bind_clause_key(expr, scope)); } } keys } + /// Bind a GROUP BY / ORDER BY key. A positional ordinal (`GROUP BY 1`) binds + /// as if the 1-based n-th output column were named explicitly — so it reads + /// (an identity output) or suppresses (an introduced alias) exactly like the + /// by-name form, keeping `reads` occurrence-consistent (`GROUP BY a` and + /// `GROUP BY 1` agree). Any other key — or an out-of-range / unnamed + /// position — binds as written. + fn bind_clause_key(&mut self, expr: &SqlExpr, scope: &Scope) -> Expr { + match ordinal_output_name(expr, scope) { + Some(name) => self.bind_expr(&SqlExpr::Identifier(name), scope), + None => self.bind_expr(expr, scope), + } + } + /// The ORDER BY key expressions (a trailing `query.order_by`). pub(super) fn order_by_keys(&mut self, order_by: &OrderBy, scope: &Scope) -> Vec { let OrderByKind::Expressions(exprs) = &order_by.kind else { @@ -706,7 +719,7 @@ impl<'a> Binder<'a> { pub(super) fn order_by_expr_keys(&mut self, exprs: &[OrderByExpr], scope: &Scope) -> Vec { exprs .iter() - .map(|e| self.bind_expr(&e.expr, scope)) + .map(|e| self.bind_clause_key(&e.expr, scope)) .collect() } diff --git a/sql-insight/tests/column_operation_extractor/reads_semantics.rs b/sql-insight/tests/column_operation_extractor/reads_semantics.rs index 510c6d1..acecd86 100644 --- a/sql-insight/tests/column_operation_extractor/reads_semantics.rs +++ b/sql-insight/tests/column_operation_extractor/reads_semantics.rs @@ -1008,6 +1008,61 @@ mod output_alias_visibility { ); } + #[test] + fn group_by_ordinal_reads_like_the_named_column() { + // `GROUP BY 1` is the 1st output (`a`, an identity output), so it reads + // `t.a` a second time — occurrence-consistent with `GROUP BY a`. + assert_column_ops( + "SELECT a FROM t GROUP BY 1", + ColumnOperation { + statement_kind: StatementKind::Select, + reads: vec![read("t", "a"), read("t", "a")], + writes: vec![], + lineage: vec![passthrough(col("t", "a"), out("a", 0))], + diagnostics: vec![], + }, + ); + } + + #[test] + fn group_by_ordinal_of_introduced_alias_is_suppressed() { + // `GROUP BY 1` here is `a + b AS x` (an introduced alias) — like + // `GROUP BY x`, it binds Derived and adds no read; the dependency on + // `a`, `b` is already counted at the projection. + assert_column_ops( + "SELECT a + b AS x FROM t GROUP BY 1", + ColumnOperation { + statement_kind: StatementKind::Select, + reads: vec![read("t", "a"), read("t", "b")], + writes: vec![], + lineage: vec![ + transformation(col("t", "a"), out("x", 0)), + transformation(col("t", "b"), out("x", 0)), + ], + diagnostics: vec![], + }, + ); + } + + #[test] + fn order_by_ordinal_reads_the_referenced_output() { + // `ORDER BY 2` is the 2nd output (`b`), so it reads `t.b` again — like + // `ORDER BY b`. (`a` is read once, at the projection.) + assert_column_ops( + "SELECT a, b FROM t ORDER BY 2", + ColumnOperation { + statement_kind: StatementKind::Select, + reads: vec![read("t", "a"), read("t", "b"), read("t", "b")], + writes: vec![], + lineage: vec![ + passthrough(col("t", "a"), out("a", 0)), + passthrough(col("t", "b"), out("b", 1)), + ], + diagnostics: vec![], + }, + ); + } + #[test] fn having_computed_alias_is_suppressed() { // `s` = `SUM(a)` is a computed alias; HAVING s references the From 1c503c7ed1995651b9f6bd6ab3c51672a784d584 Mon Sep 17 00:00:00 2001 From: Takahiro Ebato Date: Sat, 27 Jun 2026 19:28:16 +0900 Subject: [PATCH 11/11] test: cover untested SQL constructs and public reference conversions Add coverage for constructs the suite didn't exercise: a function called with a subquery argument, a named-window `OVER w` reference with frame-bound expressions, Hive `CLUSTER BY` / `DISTRIBUTE BY`, and an `ON CONFLICT DO UPDATE` value that is a subquery. Add unit tests for `TableReference`'s public `TryFrom` conversions (`&ObjectName` / `&TableFactor` / `&Insert`, plus the derived-factor error path) and three-part-name `Display`. Lifts the weakest module (reference.rs 83% -> 95% region, functions -> 100%); total region 94.2% -> 94.7%, line 95.6% -> 96.1%. Co-Authored-By: Claude Opus 4.8 (1M context) --- sql-insight/src/reference.rs | 63 +++++++++++++++++++ .../arm_coverage.rs | 36 +++++++++++ .../column_operation_extractor/cte_and_dml.rs | 18 ++++++ 3 files changed, 117 insertions(+) diff --git a/sql-insight/src/reference.rs b/sql-insight/src/reference.rs index 9a80cfa..93d41c0 100644 --- a/sql-insight/src/reference.rs +++ b/sql-insight/src/reference.rs @@ -552,3 +552,66 @@ impl TryFrom<&ObjectName> for TableReference { Self::try_from_name(obj_name) } } + +#[cfg(test)] +mod tests { + use super::*; + use sqlparser::ast::{SetExpr, Statement}; + use sqlparser::dialect::GenericDialect; + use sqlparser::parser::Parser; + + /// The first FROM factor of `SELECT 1 FROM ` — a handle on a parsed + /// `TableFactor` (and, for a `Table`, its `ObjectName`) to drive the public + /// `TryFrom` conversions. + fn first_table_factor(from: &str) -> TableFactor { + let sql = format!("SELECT 1 FROM {from}"); + let mut stmts = Parser::parse_sql(&GenericDialect {}, &sql).unwrap(); + let Statement::Query(query) = stmts.remove(0) else { + panic!("expected a query"); + }; + let SetExpr::Select(select) = *query.body else { + panic!("expected a SELECT"); + }; + select.from.into_iter().next().unwrap().relation + } + + #[test] + fn try_from_object_name_keeps_catalog_schema_name_and_displays_all_parts() { + let factor = first_table_factor("cat.sch.tbl"); + let TableFactor::Table { name, .. } = &factor else { + panic!("expected a table factor"); + }; + let reference = TableReference::try_from(name).unwrap(); + assert_eq!(reference.catalog.as_ref().unwrap().value, "cat"); + assert_eq!(reference.schema.as_ref().unwrap().value, "sch"); + assert_eq!(reference.name.value, "tbl"); + // Display renders every present part (the three-part / catalog branch). + assert_eq!(reference.to_string(), "cat.sch.tbl"); + } + + #[test] + fn try_from_table_factor_converts_a_table_and_rejects_a_derived_factor() { + let table = first_table_factor("a.b"); + let reference = TableReference::try_from(&table).unwrap(); + assert_eq!(reference.schema.as_ref().unwrap().value, "a"); + assert_eq!(reference.name.value, "b"); + // A non-`Table` factor names no stored table — an analysis error. + let derived = first_table_factor("(SELECT 1) AS d"); + assert!(matches!( + TableReference::try_from(&derived), + Err(Error::AnalysisError(_)) + )); + } + + #[test] + fn try_from_insert_takes_the_target_name() { + let mut stmts = + Parser::parse_sql(&GenericDialect {}, "INSERT INTO a.b VALUES (1)").unwrap(); + let Statement::Insert(insert) = stmts.remove(0) else { + panic!("expected an insert"); + }; + let reference = TableReference::try_from(&insert).unwrap(); + assert_eq!(reference.schema.as_ref().unwrap().value, "a"); + assert_eq!(reference.name.value, "b"); + } +} diff --git a/sql-insight/tests/column_operation_extractor/arm_coverage.rs b/sql-insight/tests/column_operation_extractor/arm_coverage.rs index 6883b1c..9386cfd 100644 --- a/sql-insight/tests/column_operation_extractor/arm_coverage.rs +++ b/sql-insight/tests/column_operation_extractor/arm_coverage.rs @@ -355,6 +355,42 @@ mod expr_arm_coverage { ); } + #[test] + fn function_subquery_argument() { + // A function called with a subquery argument (`f((SELECT …))`): the + // subquery's columns are walked as reads (value-position). + assert_unordered_eq!(reads("SELECT f((SELECT x FROM t)) FROM t"), vec![c("x")]); + } + + #[test] + fn named_window_reference_with_frame() { + // `OVER w` references a `WINDOW w AS (…)` definition: the function arg is + // a value, and the named window's ORDER BY key + frame-bound expressions + // surface as (row-positioning) reads. + assert_unordered_eq!( + reads( + "SELECT SUM(a) OVER w FROM t \ + WINDOW w AS (ORDER BY b ROWS BETWEEN c PRECEDING AND d FOLLOWING)" + ), + vec![c("a"), c("b"), c("c"), c("d")] + ); + } + + #[test] + fn cluster_by() { + // Hive `CLUSTER BY` keys are row-positioning reads. + assert_unordered_eq!(reads("SELECT a FROM t CLUSTER BY b"), vec![c("a"), c("b")]); + } + + #[test] + fn distribute_by() { + // Hive `DISTRIBUTE BY` keys are row-positioning reads. + assert_unordered_eq!( + reads("SELECT a FROM t DISTRIBUTE BY b"), + vec![c("a"), c("b")] + ); + } + #[test] fn limit_offset_comma() { // The `LIMIT , ` form (`OffsetCommaLimit`): both diff --git a/sql-insight/tests/column_operation_extractor/cte_and_dml.rs b/sql-insight/tests/column_operation_extractor/cte_and_dml.rs index c201ad2..f5f3f64 100644 --- a/sql-insight/tests/column_operation_extractor/cte_and_dml.rs +++ b/sql-insight/tests/column_operation_extractor/cte_and_dml.rs @@ -614,6 +614,24 @@ mod on_conflict { ); } + #[test] + fn pg_on_conflict_do_update_set_subquery_value() { + // DO UPDATE SET x = (SELECT max(y) FROM s): the conflict-action value is + // a subquery — its column flows to the SET target (Transformation, via + // `max`). `x` is written twice (the INSERT column + the SET target). + assert_column_ops_with_dialect( + "INSERT INTO t (x) VALUES (1) ON CONFLICT (x) DO UPDATE SET x = (SELECT max(y) FROM s)", + &PostgreSqlDialect {}, + ColumnOperation { + statement_kind: StatementKind::Insert, + reads: vec![read("s", "y")], + writes: vec![write("t", "x"), write("t", "x")], + lineage: vec![transformation(col("s", "y"), relation("t", "x"))], + diagnostics: vec![], + }, + ); + } + #[test] fn pg_on_conflict_do_nothing_is_indistinguishable_from_plain_insert() { assert_column_ops_with_dialect(