diff --git a/AGENTS.md b/AGENTS.md index 7d3262710..6f27bdb0c 100644 --- a/AGENTS.md +++ b/AGENTS.md @@ -33,6 +33,12 @@ Skills follow the [Agent Skills](https://agentskills.io) open standard. Each ski - `SKILL.md` — The skill definition with YAML frontmatter (name, description, argument-hint) and detailed instructions. - Additional supporting files as needed. +Currently available skills: + +- [`check-upstream`](.ai/skills/check-upstream/SKILL.md) — audit upstream + Apache DataFusion features (functions, DataFrame ops, SessionContext + methods, FFI types) not yet exposed in the Python bindings. + ## Pull Requests Every pull request must follow the template in diff --git a/dev/release/rat_exclude_files.txt b/dev/release/rat_exclude_files.txt index a7a497dab..50fe94e8d 100644 --- a/dev/release/rat_exclude_files.txt +++ b/dev/release/rat_exclude_files.txt @@ -49,4 +49,5 @@ benchmarks/tpch/create_tables.sql **/.cargo/config.toml uv.lock examples/tpch/answers_sf1/*.tbl -SKILL.md \ No newline at end of file +SKILL.md +docs/source/llms.txt \ No newline at end of file diff --git a/docs/source/ai-coding-assistants.rst b/docs/source/ai-coding-assistants.rst new file mode 100644 index 000000000..7c12cb43b --- /dev/null +++ b/docs/source/ai-coding-assistants.rst @@ -0,0 +1,82 @@ +.. Licensed to the Apache Software Foundation (ASF) under one +.. or more contributor license agreements. See the NOTICE file +.. distributed with this work for additional information +.. regarding copyright ownership. The ASF licenses this file +.. to you under the Apache License, Version 2.0 (the +.. "License"); you may not use this file except in compliance +.. with the License. You may obtain a copy of the License at + +.. http://www.apache.org/licenses/LICENSE-2.0 + +.. Unless required by applicable law or agreed to in writing, +.. software distributed under the License is distributed on an +.. "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +.. KIND, either express or implied. 
See the License for the +.. specific language governing permissions and limitations +.. under the License. + +Using AI Coding Assistants +========================== + +If you write DataFusion Python code with an AI coding assistant, this +project ships machine-readable guidance so the assistant produces +idiomatic code rather than guessing from its training data. + +What is published +----------------- + +- `SKILL.md `_ — + a dense, skill-oriented reference covering imports, data loading, + DataFrame operations, expression building, SQL-to-DataFrame mappings, + idiomatic patterns, and common pitfalls. Follows the + `Agent Skills `_ open standard. +- `llms.txt `_ — an entry point for LLM-based tools following the + `llmstxt.org `_ convention. Categorized links to the + skill, user guide, API reference, and examples. + +Both files live at stable URLs so an agent can discover them without a +checkout of the repo. + +Installing the skill +-------------------- + +**Preferred:** run + +.. code-block:: shell + + npx skills add apache/datafusion-python + +This installs the skill in any supported agent on your machine (Claude +Code, Cursor, Windsurf, Cline, Codex, Copilot, Gemini CLI, and others). +The command writes the pointer into the agent's configuration so that any +project you open that uses DataFusion Python picks up the skill +automatically. + +**Manual:** if you are not using the ``skills`` registry, paste this +single line into your project's ``AGENTS.md`` or ``CLAUDE.md``:: + + For DataFusion Python code, see https://github.com/apache/datafusion-python/blob/main/SKILL.md + +Most assistants resolve that pointer the first time they see a +DataFusion-related prompt in the project. 
+ +What the skill covers +--------------------- + +Writing DataFusion Python code has a handful of conventions that are easy +for a model to miss — bitwise ``&`` / ``|`` / ``~`` instead of Python +``and`` / ``or`` / ``not``, the lazy-DataFrame immutability model, how +window functions replace SQL correlated subqueries, the ``case`` / +``when`` builder syntax, and the ``in_list`` / ``array_position`` options +for membership tests. The skill enumerates each of these with short, +copyable examples. + +It is *not* a replacement for this user guide. Think of it as a distilled +reference the assistant keeps open while it writes code for you. + +If you are an agent author +-------------------------- + +The skill file and ``llms.txt`` are the two supported integration +points. Both are versioned along with the release and follow open +standards — no project-specific handshake is required. diff --git a/docs/source/conf.py b/docs/source/conf.py index 01813b032..b2e9bb8c3 100644 --- a/docs/source/conf.py +++ b/docs/source/conf.py @@ -129,6 +129,10 @@ def setup(sphinx) -> None: # so a file named "default.css" will overwrite the builtin "default.css". html_static_path = ["_static"] +# Copy agent-facing files (llms.txt) verbatim to the site root so they +# resolve at conventional URLs like `https://.../python/llms.txt`. +html_extra_path = ["llms.txt"] + html_logo = "_static/images/2x_bgwhite_original.png" html_css_files = ["theme_overrides.css"] diff --git a/docs/source/index.rst b/docs/source/index.rst index 134d41cb6..0e2b065c1 100644 --- a/docs/source/index.rst +++ b/docs/source/index.rst @@ -78,6 +78,7 @@ Example user-guide/configuration user-guide/sql user-guide/upgrade-guides + ai-coding-assistants .. 
_toc.contributor_guide: diff --git a/docs/source/llms.txt b/docs/source/llms.txt new file mode 100644 index 000000000..4d6680426 --- /dev/null +++ b/docs/source/llms.txt @@ -0,0 +1,36 @@ +# DataFusion in Python + +> Apache DataFusion Python is a Python binding for Apache DataFusion, an in-process, Arrow-native query engine. It exposes a SQL interface and a lazy DataFrame API over PyArrow and any Arrow C Data Interface source. This file points agents and LLM-based tools at the most useful entry points for writing DataFusion Python code. + +## Agent Guide + +- [SKILL.md (agent skill, raw)](https://raw.githubusercontent.com/apache/datafusion-python/main/SKILL.md): idiomatic DataFrame API patterns, SQL-to-DataFrame mappings, common pitfalls, and the full `functions` catalog. Primary source of truth for writing datafusion-python code. +- [Using DataFusion with AI coding assistants](https://datafusion.apache.org/python/ai-coding-assistants.html): human-readable guide for installing the skill and manual setup pointers. + +## User Guide + +- [Introduction](https://datafusion.apache.org/python/user-guide/introduction.html): install, the Pokemon quick start, Jupyter tips. +- [Basics](https://datafusion.apache.org/python/user-guide/basics.html): `SessionContext`, `DataFrame`, and `Expr` at a glance. +- [Data sources](https://datafusion.apache.org/python/user-guide/data-sources.html): Parquet, CSV, JSON, Arrow, Pandas, Polars, and Python objects. +- [DataFrame operations](https://datafusion.apache.org/python/user-guide/dataframe/index.html): the lazy query-building interface. +- [Common operations](https://datafusion.apache.org/python/user-guide/common-operations/index.html): select, filter, join, aggregate, window, expressions, and functions. +- [SQL](https://datafusion.apache.org/python/user-guide/sql.html): running SQL against registered tables. +- [Configuration](https://datafusion.apache.org/python/user-guide/configuration.html): session and runtime options. 
+ +## DataFrame API reference + +- [`datafusion.dataframe.DataFrame`](https://datafusion.apache.org/python/autoapi/datafusion/dataframe/index.html): the lazy DataFrame builder (`select`, `filter`, `aggregate`, `join`, `sort`, `limit`, set operations). +- [`datafusion.expr`](https://datafusion.apache.org/python/autoapi/datafusion/expr/index.html): expression tree nodes (`Expr`, `Window`, `WindowFrame`, `GroupingSet`). +- [`datafusion.functions`](https://datafusion.apache.org/python/autoapi/datafusion/functions/index.html): 290+ scalar, aggregate, and window functions. +- [`datafusion.context.SessionContext`](https://datafusion.apache.org/python/autoapi/datafusion/context/index.html): session entry point, data loading, SQL execution. + +## Examples + +- [TPC-H queries (GitHub)](https://github.com/apache/datafusion-python/tree/main/examples/tpch): canonical translations of TPC-H Q01–Q22 to idiomatic DataFrame code, each with reference SQL embedded in the module docstring. +- [Other examples (GitHub)](https://github.com/apache/datafusion-python/tree/main/examples): UDF/UDAF/UDWF, Substrait, Pandas/Polars interop, S3 reads. + +## Optional + +- [Contributor guide](https://datafusion.apache.org/python/contributor-guide/introduction.html): building from source, extending the Python bindings. +- [Upgrade guides](https://datafusion.apache.org/python/user-guide/upgrade-guides.html): migration notes between releases. +- [Upstream Rust `DataFusion`](https://datafusion.apache.org/): the underlying query engine. 
diff --git a/docs/source/user-guide/common-operations/aggregations.rst b/docs/source/user-guide/common-operations/aggregations.rst index de24a2ba5..a902fab5c 100644 --- a/docs/source/user-guide/common-operations/aggregations.rst +++ b/docs/source/user-guide/common-operations/aggregations.rst @@ -163,6 +163,59 @@ Suppose we want to find the speed values for only Pokemon that have low Attack v f.avg(col_speed, filter=col_attack < lit(50)).alias("Avg Speed Low Attack")]) +Building per-group arrays +^^^^^^^^^^^^^^^^^^^^^^^^^ + +:py:func:`~datafusion.functions.array_agg` collects the values within each +group into a list. Combined with ``distinct=True`` and the ``filter`` +argument, it lets you ask two questions of the same group in one pass — +"what are all the values?" and "what are the values that satisfy some +condition?". + +Suppose each row records a line item with the supplier that fulfilled it and +a flag for whether that supplier met the commit date. We want to identify +orders where exactly one supplier failed, among two or more suppliers in +total: + +.. ipython:: python + + orders_df = ctx.from_pydict( + { + "order_id": [1, 1, 1, 2, 2, 3], + "supplier_id": [100, 101, 102, 200, 201, 300], + "failed": [False, True, False, False, False, True], + }, + ) + + grouped = orders_df.aggregate( + [col("order_id")], + [ + f.array_agg(col("supplier_id"), distinct=True).alias("all_suppliers"), + f.array_agg( + col("supplier_id"), + filter=col("failed"), + distinct=True, + ).alias("failed_suppliers"), + ], + ) + + grouped.filter( + (f.array_length(col("failed_suppliers")) == lit(1)) + & (f.array_length(col("all_suppliers")) > lit(1)) + ).select( + col("order_id"), + f.array_element(col("failed_suppliers"), lit(1)).alias("the_one_bad_supplier"), + ) + +Two aspects of the pattern are worth calling out: + +- ``filter=`` on an aggregate narrows the rows contributing to *that* + aggregate only. 
Filtering the DataFrame before the aggregate would have + dropped whole groups that no longer had any rows. +- :py:func:`~datafusion.functions.array_length` tests group size without + another aggregate pass, and :py:func:`~datafusion.functions.array_element` + extracts a single value when you have proven the array has length one. + Grouping Sets ------------- diff --git a/docs/source/user-guide/common-operations/expressions.rst b/docs/source/user-guide/common-operations/expressions.rst index 7848b4ee7..aeb6e2ed1 100644 --- a/docs/source/user-guide/common-operations/expressions.rst +++ b/docs/source/user-guide/common-operations/expressions.rst @@ -146,6 +146,98 @@ This function returns a new array with the elements repeated. In this example, the `repeated_array` column will contain `[[1, 2, 3], [1, 2, 3]]`. +Testing membership in a list +---------------------------- + +A common need is filtering rows where a column equals *any* of a small set of +values. DataFusion offers three forms; they differ in readability and in how +they scale: + +1. A compound boolean using ``|`` across explicit equalities. +2. :py:func:`~datafusion.functions.in_list`, which accepts a list of + expressions and tests equality against all of them in one call. +3. A trick with :py:func:`~datafusion.functions.array_position` and + :py:func:`~datafusion.functions.make_array`, which returns the 1-based + index of the value in a constructed array, or null if it is not present. + +.. ipython:: python + + from datafusion import SessionContext, col, lit + from datafusion import functions as f + + ctx = SessionContext() + df = ctx.from_pydict({"shipmode": ["MAIL", "SHIP", "AIR", "TRUCK", "RAIL"]}) + + # Option 1: compound boolean. Fine for two values; awkward past three. + df.filter((col("shipmode") == lit("MAIL")) | (col("shipmode") == lit("SHIP"))) + + # Option 2: in_list. Preferred for readability as the set grows. 
+ df.filter(f.in_list(col("shipmode"), [lit("MAIL"), lit("SHIP")])) + + # Option 3: array_position / make_array. Useful when you already have the + # set as an array column and want "is in that array" semantics. + df.filter( + ~f.array_position( + f.make_array(lit("MAIL"), lit("SHIP")), col("shipmode") + ).is_null() + ) + +Use ``in_list`` as the default. It is explicit, readable, and matches the +semantics users expect from SQL's ``IN (...)``. Reach for the +``array_position`` form only when the membership set is itself an array +column rather than a literal list. + +Conditional expressions +----------------------- + +DataFusion provides :py:func:`~datafusion.functions.case` for the SQL +``CASE`` expression in both its switched and searched forms, along with +:py:func:`~datafusion.functions.when` as a standalone builder for the +searched form. + +**Switched CASE** (one expression compared against several literal values): + +.. ipython:: python + + df = ctx.from_pydict( + {"priority": ["1-URGENT", "2-HIGH", "3-MEDIUM", "5-LOW"]}, + ) + + df.select( + col("priority"), + f.case(col("priority")) + .when(lit("1-URGENT"), lit(1)) + .when(lit("2-HIGH"), lit(1)) + .otherwise(lit(0)) + .alias("is_high_priority"), + ) + +**Searched CASE** (an independent boolean predicate per branch). Use this +form whenever a branch tests more than simple equality — for example, +checking whether a joined column is ``NULL`` to gate a computed value: + +.. ipython:: python + + df = ctx.from_pydict( + {"volume": [10.0, 20.0, 30.0], "supplier_id": [1, None, 2]}, + ) + + df.select( + col("volume"), + col("supplier_id"), + f.when(col("supplier_id").is_not_null(), col("volume")) + .otherwise(lit(0.0)) + .alias("attributed_volume"), + ) + +This searched-CASE pattern is idiomatic for "attribute the measure to the +matching side of a left join, otherwise contribute zero" — a shape that +appears in TPC-H Q08 and similar market-share calculations. 
+ +If a switched CASE has only two or three branches that test equality, a +single searched ``when`` whose predicate is an ``in_list`` test, finished with ``.otherwise``, +is often simpler than the full ``case`` builder. + Structs ------- diff --git a/docs/source/user-guide/common-operations/udf-and-udfa.rst b/docs/source/user-guide/common-operations/udf-and-udfa.rst index f669721a3..48249bb0d 100644 --- a/docs/source/user-guide/common-operations/udf-and-udfa.rst +++ b/docs/source/user-guide/common-operations/udf-and-udfa.rst @@ -101,6 +101,124 @@ write Rust based UDFs and to expose them to Python. There is an example in the `DataFusion blog `_ describing how to do this. + +When not to use a UDF +^^^^^^^^^^^^^^^^^^^^^ + +A UDF is the right tool when the per-row computation genuinely cannot be +expressed with built-in functions. It is often the *wrong* tool for a +predicate that happens to be easier to write in Python. A UDF is opaque +to the optimizer, which means filters expressed as UDFs lose several +rewrites that the engine applies to filters built from native +expressions. The most visible of these is **Parquet predicate pushdown**: +a native predicate can prune entire row groups using the min/max +statistics in the Parquet footer, while a UDF predicate cannot. + +The following example writes a small Parquet file, then filters it two +ways: first with a native expression, then with a UDF that computes the +same result. The filter itself is simple on purpose so we can compare +the plans side by side. + +.. 
ipython:: python + + import tempfile, os + import pyarrow as pa + import pyarrow.parquet as pq + from datafusion import SessionContext, col, lit, udf + + tmpdir = tempfile.mkdtemp() + parquet_path = os.path.join(tmpdir, "items.parquet") + pq.write_table( + pa.table({ + "id": list(range(100)), + "brand": ["A", "B", "C", "D"] * 25, + "qty": [i * 10 for i in range(100)], + }), + parquet_path, + ) + + ctx = SessionContext() + items = ctx.read_parquet(parquet_path) + +**Native-expression predicate.** The filter is a plain boolean tree +over column references and literals, so the optimizer can analyze it: + +.. ipython:: python + + native_filtered = items.filter( + (col("brand") == lit("A")) & (col("qty") >= lit(150)) + ) + print(native_filtered.execution_plan().display_indent()) + +Notice the ``DataSourceExec`` line. It carries three annotations the +optimizer computed from the predicate: + +- ``predicate=brand@1 = A AND qty@2 >= 150`` — the filter is pushed + into the Parquet scan itself, so the scan only reads matching rows. +- ``pruning_predicate=... brand_min@0 <= A AND A <= brand_max@1 ... + qty_max@4 >= 150`` — the scan prunes whole row groups by consulting + the Parquet min/max statistics in the footer *before* reading any + column data. +- ``required_guarantees=[brand in (A)]`` — the scan uses this when a + bloom filter or dictionary is available to skip pages. + +**UDF predicate.** Now wrap the same logic in a Python UDF: + +.. ipython:: python + + def brand_qty_filter(brand_arr: pa.Array, qty_arr: pa.Array) -> pa.Array: + return pa.array([ + b.as_py() == "A" and q.as_py() >= 150 + for b, q in zip(brand_arr, qty_arr) + ]) + + pred_udf = udf( + brand_qty_filter, [pa.string(), pa.int64()], pa.bool_(), "stable", + ) + udf_filtered = items.filter(pred_udf(col("brand"), col("qty"))) + print(udf_filtered.execution_plan().display_indent()) + +The ``DataSourceExec`` now carries only ``predicate=brand_qty_filter(...)``. 
+There is no ``pruning_predicate`` and no ``required_guarantees``: the +scan has to materialize every row group and hand each row to the +Python callback just to decide whether to keep it. + +At small scale the cost difference is invisible; on a Parquet file with +many row groups, or data whose min/max statistics line up well with +the predicate, the native form can skip most of the file. The UDF form +reads all of it. + +**Takeaway.** Reach for a UDF when the per-row computation is genuinely +not expressible as a tree of built-in functions (custom numerical work, +external lookups, complex business rules). When it *is* expressible — +even if the native form is a little more verbose — build the ``Expr`` +tree directly so the optimizer can see through it. For disjunctive +predicates the idiom is to produce one clause per bucket and combine +them with ``|``: + +.. code-block:: python + + from functools import reduce + from operator import or_ + from datafusion import col, lit, functions as f + + buckets = { + "Brand#12": {"containers": ["SM CASE", "SM BOX"], "min_qty": 1, "max_size": 5}, + "Brand#23": {"containers": ["MED BAG", "MED BOX"], "min_qty": 10, "max_size": 10}, + } + + def bucket_clause(brand, spec): + return ( + (col("brand") == lit(brand)) + & f.in_list(col("container"), [lit(c) for c in spec["containers"]]) + & (col("quantity") >= lit(spec["min_qty"])) + & (col("quantity") <= lit(spec["min_qty"] + 10)) + & (col("size") >= lit(1)) + & (col("size") <= lit(spec["max_size"])) + ) + + predicate = reduce(or_, (bucket_clause(b, s) for b, s in buckets.items())) + df = df.filter(predicate) + Aggregate Functions ------------------- diff --git a/examples/README.md b/examples/README.md index 0ef194afe..3024c782f 100644 --- a/examples/README.md +++ b/examples/README.md @@ -37,6 +37,7 @@ Here is a direct link to the file used in the examples: - [Query a Parquet file using the DataFrame API](./dataframe-parquet.py) - [Run a SQL query and store the results in a Pandas 
DataFrame](./sql-to-pandas.py) - [Query PyArrow Data](./query-pyarrow-data.py) +- [Array operations: membership tests, array_agg patterns, array inspection](./array-operations.py) ### Running User-Defined Python Code diff --git a/examples/array-operations.py b/examples/array-operations.py new file mode 100644 index 000000000..884f93974 --- /dev/null +++ b/examples/array-operations.py @@ -0,0 +1,104 @@ +# Licensed to the Apache Software Foundation (ASF) under one +# or more contributor license agreements. See the NOTICE file +# distributed with this work for additional information +# regarding copyright ownership. The ASF licenses this file +# to you under the Apache License, Version 2.0 (the +# "License"); you may not use this file except in compliance +# with the License. You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, +# software distributed under the License is distributed on an +# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +# KIND, either express or implied. See the License for the +# specific language governing permissions and limitations +# under the License. + +"""Array operations in DataFusion Python. + +Runnable reference for the idiomatic array-building and array-inspection +patterns. No external data is required -- the example constructs all inputs +through ``from_pydict``. + +Topics covered: + +- ``F.make_array`` to build a literal array expression. +- ``F.array_position`` and ``F.in_list`` for membership tests. +- ``F.array_length`` and ``F.array_element`` for inspecting an aggregated + array. +- ``F.array_agg(distinct=True, filter=...)`` for building two related arrays + per group in one pass, and filtering groups by array size afterwards. 
+ +Run with:: + + python examples/array-operations.py +""" + +from datafusion import SessionContext, col, lit +from datafusion import functions as F + +ctx = SessionContext() + + +# --------------------------------------------------------------------------- +# 1. Membership tests: in_list vs. array_position / make_array +# --------------------------------------------------------------------------- + +shipments = ctx.from_pydict( + { + "order_id": [1, 2, 3, 4, 5], + "shipmode": ["MAIL", "SHIP", "AIR", "TRUCK", "RAIL"], + } +) + +print("\n== in_list: is shipmode one of {MAIL, SHIP}? ==") +shipments.filter(F.in_list(col("shipmode"), [lit("MAIL"), lit("SHIP")])).show() + +print("\n== array_position / make_array: same question via a literal array ==") +shipments.filter( + ~F.array_position(F.make_array(lit("MAIL"), lit("SHIP")), col("shipmode")).is_null() +).show() + + +# --------------------------------------------------------------------------- +# 2. array_agg with filter to inspect groups of two related arrays +# --------------------------------------------------------------------------- +# +# Input represents line items per order, each fulfilled by one supplier. The +# `failed` column marks whether the supplier met the commit date. We want to +# find orders with multiple suppliers where exactly one of them failed, and +# report that single failing supplier. 
+ +line_items = ctx.from_pydict( + { + "order_id": [1, 1, 1, 2, 2, 3, 3, 3, 3], + "supplier_id": [100, 101, 102, 200, 201, 300, 301, 302, 303], + "failed": [False, True, False, False, False, True, False, False, False], + } +) + +grouped = line_items.aggregate( + [col("order_id")], + [ + F.array_agg(col("supplier_id"), distinct=True).alias("all_suppliers"), + F.array_agg( + col("supplier_id"), + filter=col("failed"), + distinct=True, + ).alias("failed_suppliers"), + ], +) + +print("\n== per-order supplier arrays ==") +grouped.sort(col("order_id").sort()).show() + +print("\n== orders with >1 supplier and exactly one failure ==") +singled_out = grouped.filter( + (F.array_length(col("failed_suppliers")) == lit(1)) + & (F.array_length(col("all_suppliers")) > lit(1)) +).select( + col("order_id"), + F.array_element(col("failed_suppliers"), lit(1)).alias("bad_supplier"), +) +singled_out.sort(col("order_id").sort()).show()