Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
16 changes: 10 additions & 6 deletions sql-insight/src/lib.rs
Original file line number Diff line number Diff line change
Expand Up @@ -117,13 +117,17 @@
//! Intentional non-support and known gaps — set expectations before
//! relying on a given output:
//!
//! - **Wildcards not expanded**: `SELECT *` / `t.*` contribute
//! nothing to `reads` / `lineage`. Expanding them safely would
//! require modelling USING / NATURAL JOIN merge, EXCLUDE / REPLACE
//! clauses, and multi-level aliases — too much rigor for a
//! SQL-text-only library. Surfaced as
//! - **Wildcards not expanded**: the `*` / `t.*` itself contributes
//! nothing to `reads` / `lineage` (expanding it safely would require
//! modelling USING / NATURAL JOIN merge, EXCLUDE / EXCEPT / RENAME, and
//! multi-level aliases — too much rigor for a SQL-text-only library).
//! Surfaced as
//! [`WildcardSuppressed`](diagnostic::ColumnLevelDiagnosticKind::WildcardSuppressed)
//! so consumers can detect incomplete projections.
//! so consumers can detect incomplete projections. A `REPLACE (expr AS
//! col)` clause *is* extracted — each replacement's `expr` contributes
//! reads and a `col` lineage edge, exactly like a standalone `expr AS col`
//! — but its **output position** is best-effort, since the wildcard's own
//! columns aren't enumerated to place it among them.
//! - **Table functions are opaque**: `UNNEST` / `generate_series` /
//! `JSON_TABLE` / `PIVOT` etc. produce dynamic columns that aren't
//! enumerated. Their argument expressions surface as reads, but a
Expand Down
15 changes: 13 additions & 2 deletions sql-insight/src/normalizer.rs
Original file line number Diff line number Diff line change
Expand Up @@ -4,8 +4,19 @@
//!
//! The base pass replaces every literal `Value` with a `?`
//! placeholder, so queries that differ only in their parameter
//! values collapse to the same string. Three opt-in toggles
//! ([`NormalizerOptions`]) further collapse repetitive shapes:
//! values collapse to the same string.
//!
//! "Every literal" is meant literally: it includes `Value`s in
//! structurally significant positions, not just bound-parameter slots.
//! A JSON path (`JSON_TABLE(data, '$.a')`, `JSON_EXTRACT(data, '$.a')`),
//! a `CAST(x AS DATE FORMAT 'YYYY-MM-DD')` format string, the
//! `TABLESAMPLE (BUCKET 3 OUT OF 10)` / `(10 PERCENT)` counts, and
//! `LIMIT` / `OFFSET` are all rewritten to `?`. So two queries differing
//! only in such a literal — e.g. selecting a different JSON field or
//! sampling a different bucket — collapse to the same normalized string.
//!
//! Three opt-in toggles ([`NormalizerOptions`]) further collapse
//! repetitive shapes:
//!
//! - [`unify_in_list`](NormalizerOptions::unify_in_list):
//! `IN (1, 2, 3)` → `IN (...)`.
Expand Down
63 changes: 63 additions & 0 deletions sql-insight/src/reference.rs
Original file line number Diff line number Diff line change
Expand Up @@ -552,3 +552,66 @@ impl TryFrom<&ObjectName> for TableReference {
Self::try_from_name(obj_name)
}
}

#[cfg(test)]
mod tests {
    use super::*;
    use sqlparser::ast::{SetExpr, Statement};
    use sqlparser::dialect::GenericDialect;
    use sqlparser::parser::Parser;

    /// Parses `SELECT 1 FROM <from>` with the generic dialect and returns the
    /// relation of its first FROM item, giving the tests a real parsed
    /// `TableFactor` to feed into the public `TryFrom` conversions.
    ///
    /// Panics if the SQL does not parse, is not a query, or is not a plain
    /// `SELECT` with at least one FROM item.
    fn first_table_factor(from: &str) -> TableFactor {
        let sql = format!("SELECT 1 FROM {from}");
        let mut stmts = Parser::parse_sql(&GenericDialect {}, &sql).unwrap();
        let query = match stmts.remove(0) {
            Statement::Query(query) => query,
            _ => panic!("expected a query"),
        };
        let select = match *query.body {
            SetExpr::Select(select) => select,
            _ => panic!("expected a SELECT"),
        };
        select.from.into_iter().next().unwrap().relation
    }

    #[test]
    fn try_from_object_name_keeps_catalog_schema_name_and_displays_all_parts() {
        let factor = first_table_factor("cat.sch.tbl");
        let name = match &factor {
            TableFactor::Table { name, .. } => name,
            _ => panic!("expected a table factor"),
        };
        let reference = TableReference::try_from(name).unwrap();
        let catalog = reference.catalog.as_ref().map(|ident| ident.value.as_str());
        let schema = reference.schema.as_ref().map(|ident| ident.value.as_str());
        assert_eq!(catalog, Some("cat"));
        assert_eq!(schema, Some("sch"));
        assert_eq!(reference.name.value, "tbl");
        // With all three parts present, Display renders the fully qualified name.
        assert_eq!(reference.to_string(), "cat.sch.tbl");
    }

    #[test]
    fn try_from_table_factor_converts_a_table_and_rejects_a_derived_factor() {
        // A plain two-part table name converts into schema + name.
        let table = first_table_factor("a.b");
        let reference = TableReference::try_from(&table).unwrap();
        let schema = reference.schema.as_ref().map(|ident| ident.value.as_str());
        assert_eq!(schema, Some("a"));
        assert_eq!(reference.name.value, "b");

        // A derived table (subquery) is not a `Table` factor, so the
        // conversion must fail with an analysis error.
        let derived = first_table_factor("(SELECT 1) AS d");
        let outcome = TableReference::try_from(&derived);
        assert!(matches!(outcome, Err(Error::AnalysisError(_))));
    }

    #[test]
    fn try_from_insert_takes_the_target_name() {
        let mut stmts =
            Parser::parse_sql(&GenericDialect {}, "INSERT INTO a.b VALUES (1)").unwrap();
        let insert = match stmts.remove(0) {
            Statement::Insert(insert) => insert,
            _ => panic!("expected an insert"),
        };
        let reference = TableReference::try_from(&insert).unwrap();
        let schema = reference.schema.as_ref().map(|ident| ident.value.as_str());
        assert_eq!(schema, Some("a"));
        assert_eq!(reference.name.value, "b");
    }
}
60 changes: 50 additions & 10 deletions sql-insight/src/resolver/binder.rs
Original file line number Diff line number Diff line change
Expand Up @@ -37,20 +37,21 @@
//! named on the root, never a read scan.

use sqlparser::ast::{
AlterTable as SqlAlterTable, AlterTableOperation, AssignmentTarget, CreateTable,
CreateTableLikeKind, CreateView as SqlCreateView, Cte as SqlCte, Delete as SqlDelete,
Expr as SqlExpr, FromTable, Function, FunctionArg, FunctionArgExpr, FunctionArguments,
GroupByExpr, GroupByWithModifier, Ident, Insert as SqlInsert, JoinConstraint, JoinOperator,
JsonPathElem, Merge as SqlMerge, MergeAction, MergeInsertKind, ObjectName, ObjectType,
OnConflictAction, OnInsert, OrderBy, OrderByExpr, OrderByKind, PipeOperator, PivotValueSource,
Query, Select, SelectItem, SetExpr, Statement, TableAlias, TableFactor, TableObject,
TableWithJoins, Update as SqlUpdate, UpdateTableFromKind, Values as SqlValues,
AlterTable as SqlAlterTable, AlterTableOperation, Assignment as SqlAssignment,
AssignmentTarget, CreateTable, CreateTableLikeKind, CreateView as SqlCreateView, Cte as SqlCte,
Delete as SqlDelete, Expr as SqlExpr, FromTable, Function, FunctionArg, FunctionArgExpr,
FunctionArguments, GroupByExpr, GroupByWithModifier, Ident, Insert as SqlInsert,
JoinConstraint, JoinOperator, JsonPathElem, Merge as SqlMerge, MergeAction, MergeInsertKind,
ObjectName, ObjectType, OnConflictAction, OnInsert, OrderBy, OrderByExpr, OrderByKind,
OutputClause, PipeOperator, PivotValueSource, Query, Select, SelectItem, SetExpr, Statement,
TableAlias, TableFactor, TableObject, TableWithJoins, Update as SqlUpdate, UpdateTableFromKind,
Value, Values as SqlValues,
};

use sqlparser::ast::{
AccessExpr, ConnectByKind, Distinct, FunctionArgumentClause, LimitClause, ListAggOnOverflow,
NamedWindowExpr, SelectItemQualifiedWildcardKind, Subscript, TopQuantity, WindowFrameBound,
WindowSpec, WindowType,
NamedWindowExpr, SelectItemQualifiedWildcardKind, Subscript, TopQuantity,
WildcardAdditionalOptions, WindowFrameBound, WindowSpec, WindowType,
};
use sqlparser::tokenizer::Span;

Expand Down Expand Up @@ -282,6 +283,30 @@ impl<'a> Binder<'a> {
});
}

/// Checks the arity of an INSERT / MERGE-INSERT and records a mismatch
/// diagnostic when the target column count and the source value count
/// disagree; does nothing when they are compatible. Both counts are expected
/// to be determinate (the caller has already ruled out wildcards).
///
/// - `explicit == true` (the statement spells out a column list): the counts
///   must be equal — a surplus on either side would silently zip to the
///   shorter one.
/// - `explicit == false` (column-less target filled from the catalog): only a
///   *wider* source is reported, since its surplus is dropped; a narrower
///   source may rely on column defaults and is accepted.
pub(super) fn diagnose_insert_arity(
    &mut self,
    target: &TableReference,
    explicit: bool,
    target_columns: usize,
    source_columns: usize,
) {
    let compatible = if explicit {
        source_columns == target_columns
    } else {
        source_columns <= target_columns
    };
    if compatible {
        return;
    }
    self.record_insert_columns_arity_mismatch(target, target_columns, source_columns);
}

/// Flag a CTAS / CREATE VIEW (without an explicit column list) whose source
/// projects unaliased expressions: those columns have no name recoverable
/// from the SQL text, so they're dropped from column `writes` / `lineage`.
Expand Down Expand Up @@ -671,6 +696,21 @@ fn inferred_name(expr: &SqlExpr) -> Option<Ident> {
}
}

/// Resolves a positional ordinal key (`GROUP BY 1` / `ORDER BY 1`) to the name
/// of the query output it points at: the 1-based n-th entry of
/// [`Scope::query_outputs`].
///
/// Returns `None` — so the caller binds the literal as written — when `expr`
/// is not a numeric literal, the number does not parse as a `usize`, the
/// position is zero or past the end of the outputs, or the addressed output
/// has no name.
fn ordinal_output_name(expr: &SqlExpr, scope: &Scope) -> Option<Ident> {
    let digits = match expr {
        SqlExpr::Value(literal) => match &literal.value {
            Value::Number(digits, _) => digits,
            _ => return None,
        },
        _ => return None,
    };
    let position = digits.parse::<usize>().ok()?;
    // Ordinals are 1-based; `checked_sub` rejects position 0.
    let index = position.checked_sub(1)?;
    scope
        .query_outputs
        .get(index)
        .and_then(|output| output.name.clone())
}

/// The `ON` predicate of a join operator, if any.
/// The constraint of any constraint-carrying join operator (everything but
/// `CROSS APPLY` / `OUTER APPLY`).
Expand Down
93 changes: 68 additions & 25 deletions sql-insight/src/resolver/binder/expr.rs
Original file line number Diff line number Diff line change
Expand Up @@ -5,26 +5,26 @@
use super::*;

impl<'a> Binder<'a> {
pub(super) fn bind_select_item(
&mut self,
item: &SelectItem,
scope: &Scope,
) -> Option<NamedExpr> {
pub(super) fn bind_select_item(&mut self, item: &SelectItem, scope: &Scope) -> Vec<NamedExpr> {
match item {
SelectItem::UnnamedExpr(expr) => Some(NamedExpr {
SelectItem::UnnamedExpr(expr) => vec![NamedExpr {
name: inferred_name(expr),
expr: self.bind_expr(expr, scope),
}),
SelectItem::ExprWithAlias { expr, alias } => Some(NamedExpr {
}],
SelectItem::ExprWithAlias { expr, alias } => vec![NamedExpr {
name: Some(alias.clone()),
expr: self.bind_expr(expr, scope),
}),
}],
// A wildcard isn't expanded (the rigor cost is too high for a
// SQL-text-only library); record it so consumers know this
// projection's column lineage is incomplete, and skip it.
// projection's column lineage is incomplete. A `REPLACE (expr AS
// col)` clause is a real value-producing output, though — bind each
// replacement as a named output (its reads / lineage are exactly a
// standalone `expr AS col`; only the output position is best-effort,
// since the wildcard's own columns aren't enumerated).
SelectItem::Wildcard(options) => {
self.record_wildcard_suppressed("wildcard `*`", options.wildcard_token.0.span);
None
self.replace_outputs(options, scope)
}
SelectItem::QualifiedWildcard(kind, options) => {
let description = match kind {
Expand All @@ -38,20 +38,43 @@ impl<'a> Binder<'a> {
self.record_wildcard_suppressed(&description, options.wildcard_token.0.span);
// `(expr).*` still projects its base expression as one
// Transformation output (a structural field access); `alias.*`
// has no inspectable base.
match kind {
SelectItemQualifiedWildcardKind::Expr(expr) => Some(NamedExpr {
// has no inspectable base. Either way a `REPLACE` clause's
// explicit outputs follow.
let mut out = match kind {
SelectItemQualifiedWildcardKind::Expr(expr) => vec![NamedExpr {
name: None,
expr: Expr::Call {
args: vec![self.bind_expr(expr, scope)],
},
}),
SelectItemQualifiedWildcardKind::ObjectName(_) => None,
}
}],
SelectItemQualifiedWildcardKind::ObjectName(_) => Vec::new(),
};
out.extend(self.replace_outputs(options, scope));
out
}
}
}

/// Collects the explicit outputs of a wildcard's `REPLACE (expr AS col, …)`
/// clause. Each replacement becomes a [`NamedExpr`] named `col` whose
/// expression is `expr` bound against `scope`, the same shape a standalone
/// `expr AS col` select item produces. Returns an empty vector when the
/// wildcard carries no `REPLACE` clause; the wildcard's remaining columns are
/// not expanded here.
fn replace_outputs(
    &mut self,
    options: &WildcardAdditionalOptions,
    scope: &Scope,
) -> Vec<NamedExpr> {
    let mut outputs = Vec::new();
    if let Some(replace) = &options.opt_replace {
        for element in &replace.items {
            outputs.push(NamedExpr {
                name: Some(element.column_name.clone()),
                expr: self.bind_expr(&element.expr, scope),
            });
        }
    }
    outputs
}

/// Resolve a `sqlparser` expression into a bound [`Expr`], mirroring the
/// resolver's `collect_expr`. A bare column reference is the only
/// [`Passthrough`](crate::extractor::ColumnLineageKind::Passthrough) shape
Expand Down Expand Up @@ -261,7 +284,10 @@ impl<'a> Binder<'a> {
self.call([member_of.value.as_ref(), member_of.array.as_ref()], scope)
}
// A scalar subquery (value position): its output flows in.
SqlExpr::Subquery(query) => Expr::Subquery(Box::new(self.bind_subquery(query, scope))),
SqlExpr::Subquery(query) => Expr::Subquery {
plan: Box::new(self.bind_subquery(query, scope)),
output: 0,
},
// Tests (filter position): columns read, never an origin.
SqlExpr::Exists { subquery, .. } => {
Expr::Exists(Box::new(self.bind_subquery(subquery, scope)))
Expand Down Expand Up @@ -454,7 +480,10 @@ impl<'a> Binder<'a> {
}
}
} else if let FunctionArguments::Subquery(query) = &function.args {
args.push(Expr::Subquery(Box::new(self.bind_subquery(query, scope))));
args.push(Expr::Subquery {
plan: Box::new(self.bind_subquery(query, scope)),
output: 0,
});
}
if let Some(filter) = &function.filter {
suppressed.push(self.bind_expr(filter, scope));
Expand Down Expand Up @@ -561,9 +590,11 @@ impl<'a> Binder<'a> {
}

/// Filter-position reads from a SELECT's auxiliary clauses (`DISTINCT ON`
/// keys, `TOP n`, Hive `LATERAL VIEW`, `PREWHERE`, `QUALIFY`, `CONNECT BY`
/// / `START WITH`, `CLUSTER BY` / `DISTRIBUTE BY`, named `WINDOW` specs),
/// resolved against the FROM scope. None feed values.
/// keys, `TOP n`, Hive `LATERAL VIEW`, `PREWHERE`, `CONNECT BY` / `START
/// WITH`, `CLUSTER BY` / `DISTRIBUTE BY`, named `WINDOW` specs), resolved
/// against the FROM scope. None feed values. `QUALIFY` is *not* here — it
/// filters on window / projection outputs (post-projection), so it binds
/// against the output-aware scope in [`bind_select`](Self::bind_select).
pub(super) fn select_clause_reads(&mut self, select: &Select, scope: &Scope) -> Vec<Expr> {
let mut reads = Vec::new();
if let Some(Distinct::On(exprs)) = &select.distinct {
Expand All @@ -578,7 +609,6 @@ impl<'a> Binder<'a> {
reads.push(self.bind_expr(&lateral_view.lateral_view, scope));
}
reads.extend(select.prewhere.iter().map(|e| self.bind_expr(e, scope)));
reads.extend(select.qualify.iter().map(|e| self.bind_expr(e, scope)));
for connect_by in &select.connect_by {
match connect_by {
ConnectByKind::ConnectBy { relationships, .. } => {
Expand Down Expand Up @@ -657,12 +687,25 @@ impl<'a> Binder<'a> {
_ => None,
}));
for expr in members {
keys.push(self.bind_expr(expr, scope));
keys.push(self.bind_clause_key(expr, scope));
}
}
keys
}

/// Binds a single GROUP BY / ORDER BY key.
///
/// When the key is a positional ordinal (`GROUP BY 1`) that resolves to a
/// named output column, it is bound as though that column had been written by
/// name, so `GROUP BY a` and `GROUP BY 1` produce the same reads (or the same
/// suppression, for an introduced alias). Every other key — including an
/// out-of-range or unnamed position — is bound exactly as written.
fn bind_clause_key(&mut self, expr: &SqlExpr, scope: &Scope) -> Expr {
    if let Some(name) = ordinal_output_name(expr, scope) {
        let by_name = SqlExpr::Identifier(name);
        return self.bind_expr(&by_name, scope);
    }
    self.bind_expr(expr, scope)
}

/// The ORDER BY key expressions (a trailing `query.order_by`).
pub(super) fn order_by_keys(&mut self, order_by: &OrderBy, scope: &Scope) -> Vec<Expr> {
let OrderByKind::Expressions(exprs) = &order_by.kind else {
Expand All @@ -676,7 +719,7 @@ impl<'a> Binder<'a> {
pub(super) fn order_by_expr_keys(&mut self, exprs: &[OrderByExpr], scope: &Scope) -> Vec<Expr> {
exprs
.iter()
.map(|e| self.bind_expr(&e.expr, scope))
.map(|e| self.bind_clause_key(&e.expr, scope))
.collect()
}

Expand Down
Loading