diff --git a/AGENTS.md b/AGENTS.md index 7d3262710..6f27bdb0c 100644 --- a/AGENTS.md +++ b/AGENTS.md @@ -33,6 +33,12 @@ Skills follow the [Agent Skills](https://agentskills.io) open standard. Each ski - `SKILL.md` — The skill definition with YAML frontmatter (name, description, argument-hint) and detailed instructions. - Additional supporting files as needed. +Currently available skills: + +- [`check-upstream`](.ai/skills/check-upstream/SKILL.md) — audit upstream + Apache DataFusion features (functions, DataFrame ops, SessionContext + methods, FFI types) not yet exposed in the Python bindings. + ## Pull Requests Every pull request must follow the template in diff --git a/dev/release/rat_exclude_files.txt b/dev/release/rat_exclude_files.txt index a7a497dab..50fe94e8d 100644 --- a/dev/release/rat_exclude_files.txt +++ b/dev/release/rat_exclude_files.txt @@ -49,4 +49,5 @@ benchmarks/tpch/create_tables.sql **/.cargo/config.toml uv.lock examples/tpch/answers_sf1/*.tbl -SKILL.md \ No newline at end of file +SKILL.md +docs/source/llms.txt \ No newline at end of file diff --git a/docs/source/ai-coding-assistants.rst b/docs/source/ai-coding-assistants.rst new file mode 100644 index 000000000..7c12cb43b --- /dev/null +++ b/docs/source/ai-coding-assistants.rst @@ -0,0 +1,82 @@ +.. Licensed to the Apache Software Foundation (ASF) under one +.. or more contributor license agreements. See the NOTICE file +.. distributed with this work for additional information +.. regarding copyright ownership. The ASF licenses this file +.. to you under the Apache License, Version 2.0 (the +.. "License"); you may not use this file except in compliance +.. with the License. You may obtain a copy of the License at + +.. http://www.apache.org/licenses/LICENSE-2.0 + +.. Unless required by applicable law or agreed to in writing, +.. software distributed under the License is distributed on an +.. "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +.. KIND, either express or implied. 
See the License for the +.. specific language governing permissions and limitations +.. under the License. + +Using AI Coding Assistants +========================== + +If you write DataFusion Python code with an AI coding assistant, this +project ships machine-readable guidance so the assistant produces +idiomatic code rather than guessing from its training data. + +What is published +----------------- + +- `SKILL.md `_ — + a dense, skill-oriented reference covering imports, data loading, + DataFrame operations, expression building, SQL-to-DataFrame mappings, + idiomatic patterns, and common pitfalls. Follows the + `Agent Skills `_ open standard. +- `llms.txt `_ — an entry point for LLM-based tools following the + `llmstxt.org `_ convention. Categorized links to the + skill, user guide, API reference, and examples. + +Both files live at stable URLs so an agent can discover them without a +checkout of the repo. + +Installing the skill +-------------------- + +**Preferred:** run + +.. code-block:: shell + + npx skills add apache/datafusion-python + +This installs the skill in any supported agent on your machine (Claude +Code, Cursor, Windsurf, Cline, Codex, Copilot, Gemini CLI, and others). +The command writes the pointer into the agent's configuration so that any +project you open that uses DataFusion Python picks up the skill +automatically. + +**Manual:** if you are not using the ``skills`` registry, paste this +single line into your project's ``AGENTS.md`` or ``CLAUDE.md``:: + + For DataFusion Python code, see https://github.com/apache/datafusion-python/blob/main/SKILL.md + +Most assistants resolve that pointer the first time they see a +DataFusion-related prompt in the project. 
+ +What the skill covers +--------------------- + +Writing DataFusion Python code has a handful of conventions that are easy +for a model to miss — bitwise ``&`` / ``|`` / ``~`` instead of Python +``and`` / ``or`` / ``not``, the lazy-DataFrame immutability model, how +window functions replace SQL correlated subqueries, the ``case`` / +``when`` builder syntax, and the ``in_list`` / ``array_position`` options +for membership tests. The skill enumerates each of these with short, +copyable examples. + +It is *not* a replacement for this user guide. Think of it as a distilled +reference the assistant keeps open while it writes code for you. + +If you are an agent author +-------------------------- + +The skill file and ``llms.txt`` are the two supported integration +points. Both are versioned along with the release and follow open +standards — no project-specific handshake is required. diff --git a/docs/source/conf.py b/docs/source/conf.py index 01813b032..b2e9bb8c3 100644 --- a/docs/source/conf.py +++ b/docs/source/conf.py @@ -129,6 +129,10 @@ def setup(sphinx) -> None: # so a file named "default.css" will overwrite the builtin "default.css". html_static_path = ["_static"] +# Copy agent-facing files (llms.txt) verbatim to the site root so they +# resolve at conventional URLs like `https://.../python/llms.txt`. +html_extra_path = ["llms.txt"] + html_logo = "_static/images/2x_bgwhite_original.png" html_css_files = ["theme_overrides.css"] diff --git a/docs/source/index.rst b/docs/source/index.rst index 134d41cb6..0e2b065c1 100644 --- a/docs/source/index.rst +++ b/docs/source/index.rst @@ -78,6 +78,7 @@ Example user-guide/configuration user-guide/sql user-guide/upgrade-guides + ai-coding-assistants .. 
_toc.contributor_guide: diff --git a/docs/source/llms.txt b/docs/source/llms.txt new file mode 100644 index 000000000..4d6680426 --- /dev/null +++ b/docs/source/llms.txt @@ -0,0 +1,36 @@ +# DataFusion in Python + +> Apache DataFusion Python is a Python binding for Apache DataFusion, an in-process, Arrow-native query engine. It exposes a SQL interface and a lazy DataFrame API over PyArrow and any Arrow C Data Interface source. This file points agents and LLM-based tools at the most useful entry points for writing DataFusion Python code. + +## Agent Guide + +- [SKILL.md (agent skill, raw)](https://raw.githubusercontent.com/apache/datafusion-python/main/SKILL.md): idiomatic DataFrame API patterns, SQL-to-DataFrame mappings, common pitfalls, and the full `functions` catalog. Primary source of truth for writing datafusion-python code. +- [Using DataFusion with AI coding assistants](https://datafusion.apache.org/python/ai-coding-assistants.html): human-readable guide for installing the skill and manual setup pointers. + +## User Guide + +- [Introduction](https://datafusion.apache.org/python/user-guide/introduction.html): install, the Pokemon quick start, Jupyter tips. +- [Basics](https://datafusion.apache.org/python/user-guide/basics.html): `SessionContext`, `DataFrame`, and `Expr` at a glance. +- [Data sources](https://datafusion.apache.org/python/user-guide/data-sources.html): Parquet, CSV, JSON, Arrow, Pandas, Polars, and Python objects. +- [DataFrame operations](https://datafusion.apache.org/python/user-guide/dataframe/index.html): the lazy query-building interface. +- [Common operations](https://datafusion.apache.org/python/user-guide/common-operations/index.html): select, filter, join, aggregate, window, expressions, and functions. +- [SQL](https://datafusion.apache.org/python/user-guide/sql.html): running SQL against registered tables. +- [Configuration](https://datafusion.apache.org/python/user-guide/configuration.html): session and runtime options. 
+ +## DataFrame API reference + +- [`datafusion.dataframe.DataFrame`](https://datafusion.apache.org/python/autoapi/datafusion/dataframe/index.html): the lazy DataFrame builder (`select`, `filter`, `aggregate`, `join`, `sort`, `limit`, set operations). +- [`datafusion.expr`](https://datafusion.apache.org/python/autoapi/datafusion/expr/index.html): expression tree nodes (`Expr`, `Window`, `WindowFrame`, `GroupingSet`). +- [`datafusion.functions`](https://datafusion.apache.org/python/autoapi/datafusion/functions/index.html): 290+ scalar, aggregate, and window functions. +- [`datafusion.context.SessionContext`](https://datafusion.apache.org/python/autoapi/datafusion/context/index.html): session entry point, data loading, SQL execution. + +## Examples + +- [TPC-H queries (GitHub)](https://github.com/apache/datafusion-python/tree/main/examples/tpch): canonical translations of TPC-H Q01–Q22 to idiomatic DataFrame code, each with reference SQL embedded in the module docstring. +- [Other examples (GitHub)](https://github.com/apache/datafusion-python/tree/main/examples): UDF/UDAF/UDWF, Substrait, Pandas/Polars interop, S3 reads. + +## Optional + +- [Contributor guide](https://datafusion.apache.org/python/contributor-guide/introduction.html): building from source, extending the Python bindings. +- [Upgrade guides](https://datafusion.apache.org/python/user-guide/upgrade-guides.html): migration notes between releases. +- [Upstream Rust `DataFusion`](https://datafusion.apache.org/): the underlying query engine. 
diff --git a/docs/source/user-guide/common-operations/aggregations.rst b/docs/source/user-guide/common-operations/aggregations.rst index de24a2ba5..a902fab5c 100644 --- a/docs/source/user-guide/common-operations/aggregations.rst +++ b/docs/source/user-guide/common-operations/aggregations.rst @@ -163,6 +163,59 @@ Suppose we want to find the speed values for only Pokemon that have low Attack v f.avg(col_speed, filter=col_attack < lit(50)).alias("Avg Speed Low Attack")]) +Building per-group arrays +^^^^^^^^^^^^^^^^^^^^^^^^^ + +:py:func:`~datafusion.functions.array_agg` collects the values within each +group into a list. Combined with ``distinct=True`` and the ``filter`` +argument, it lets you ask two questions of the same group in one pass — +"what are all the values?" and "what are the values that satisfy some +condition?". + +Suppose each row records a line item with the supplier that fulfilled it and +a flag for whether that supplier met the commit date. We want to identify +orders where exactly one supplier failed, among two or more suppliers in +total: + +.. ipython:: python + + orders_df = ctx.from_pydict( + { + "order_id": [1, 1, 1, 2, 2, 3], + "supplier_id": [100, 101, 102, 200, 201, 300], + "failed": [False, True, False, False, False, True], + }, + ) + + grouped = orders_df.aggregate( + [col("order_id")], + [ + f.array_agg(col("supplier_id"), distinct=True).alias("all_suppliers"), + f.array_agg( + col("supplier_id"), + filter=col("failed"), + distinct=True, + ).alias("failed_suppliers"), + ], + ) + + grouped.filter( + (f.array_length(col("failed_suppliers")) == lit(1)) + & (f.array_length(col("all_suppliers")) > lit(1)) + ).select( + col("order_id"), + f.array_element(col("failed_suppliers"), lit(1)).alias("the_one_bad_supplier"), + ) + +Two aspects of the pattern are worth calling out: + +- ``filter=`` on an aggregate narrows the rows contributing to *that* + aggregate only. 
Filtering the DataFrame before the aggregate would have + dropped whole groups that no longer had any rows. +- :py:func:`~datafusion.functions.array_length` tests group size without + another aggregate pass, and :py:func:`~datafusion.functions.array_element` + extracts a single value when you have proven the array has length one. + Grouping Sets ------------- diff --git a/docs/source/user-guide/common-operations/expressions.rst b/docs/source/user-guide/common-operations/expressions.rst index 7848b4ee7..aeb6e2ed1 100644 --- a/docs/source/user-guide/common-operations/expressions.rst +++ b/docs/source/user-guide/common-operations/expressions.rst @@ -146,6 +146,98 @@ This function returns a new array with the elements repeated. In this example, the `repeated_array` column will contain `[[1, 2, 3], [1, 2, 3]]`. +Testing membership in a list +---------------------------- + +A common need is filtering rows where a column equals *any* of a small set of +values. DataFusion offers three forms; they differ in readability and in how +they scale: + +1. A compound boolean using ``|`` across explicit equalities. +2. :py:func:`~datafusion.functions.in_list`, which accepts a list of + expressions and tests equality against all of them in one call. +3. A trick with :py:func:`~datafusion.functions.array_position` and + :py:func:`~datafusion.functions.make_array`, which returns the 1-based + index of the value in a constructed array, or null if it is not present. + +.. ipython:: python + + from datafusion import SessionContext, col, lit + from datafusion import functions as f + + ctx = SessionContext() + df = ctx.from_pydict({"shipmode": ["MAIL", "SHIP", "AIR", "TRUCK", "RAIL"]}) + + # Option 1: compound boolean. Fine for two values; awkward past three. + df.filter((col("shipmode") == lit("MAIL")) | (col("shipmode") == lit("SHIP"))) + + # Option 2: in_list. Preferred for readability as the set grows. 
+ df.filter(f.in_list(col("shipmode"), [lit("MAIL"), lit("SHIP")])) + + # Option 3: array_position / make_array. Useful when you already have the + # set as an array column and want "is in that array" semantics. + df.filter( + ~f.array_position( + f.make_array(lit("MAIL"), lit("SHIP")), col("shipmode") + ).is_null() + ) + +Use ``in_list`` as the default. It is explicit, readable, and matches the +semantics users expect from SQL's ``IN (...)``. Reach for the +``array_position`` form only when the membership set is itself an array +column rather than a literal list. + +Conditional expressions +----------------------- + +DataFusion provides :py:func:`~datafusion.functions.case` for the SQL +``CASE`` expression in both its switched and searched forms, along with +:py:func:`~datafusion.functions.when` as a standalone builder for the +searched form. + +**Switched CASE** (one expression compared against several literal values): + +.. ipython:: python + + df = ctx.from_pydict( + {"priority": ["1-URGENT", "2-HIGH", "3-MEDIUM", "5-LOW"]}, + ) + + df.select( + col("priority"), + f.case(col("priority")) + .when(lit("1-URGENT"), lit(1)) + .when(lit("2-HIGH"), lit(1)) + .otherwise(lit(0)) + .alias("is_high_priority"), + ) + +**Searched CASE** (an independent boolean predicate per branch). Use this +form whenever a branch tests more than simple equality — for example, +checking whether a joined column is ``NULL`` to gate a computed value: + +.. ipython:: python + + df = ctx.from_pydict( + {"volume": [10.0, 20.0, 30.0], "supplier_id": [1, None, 2]}, + ) + + df.select( + col("volume"), + col("supplier_id"), + f.when(col("supplier_id").is_not_null(), col("volume")) + .otherwise(lit(0.0)) + .alias("attributed_volume"), + ) + +This searched-CASE pattern is idiomatic for "attribute the measure to the +matching side of a left join, otherwise contribute zero" — a shape that +appears in TPC-H Q08 and similar market-share calculations. 
+ +If a switched CASE has only two or three branches that test equality, a +single searched ``when`` whose predicate is an ``in_list`` test, finished with ``.otherwise``, +is often simpler than the full ``case`` builder. + Structs ------- diff --git a/docs/source/user-guide/common-operations/udf-and-udfa.rst b/docs/source/user-guide/common-operations/udf-and-udfa.rst index f669721a3..48249bb0d 100644 --- a/docs/source/user-guide/common-operations/udf-and-udfa.rst +++ b/docs/source/user-guide/common-operations/udf-and-udfa.rst @@ -101,6 +101,124 @@ write Rust based UDFs and to expose them to Python. There is an example in the `DataFusion blog `_ describing how to do this. + +When not to use a UDF +^^^^^^^^^^^^^^^^^^^^^ + +A UDF is the right tool when the per-row computation genuinely cannot be +expressed with built-in functions. It is often the *wrong* tool for a +predicate that happens to be easier to write in Python. A UDF is opaque +to the optimizer, which means filters expressed as UDFs lose several +rewrites that the engine applies to filters built from native +expressions. The most visible of these is **Parquet predicate pushdown**: +a native predicate can prune entire row groups using the min/max +statistics in the Parquet footer, while a UDF predicate cannot. + +The following example writes a small Parquet file, then filters it two +ways: first with a native expression, then with a UDF that computes the +same result. The filter itself is simple on purpose so we can compare +the plans side by side. + +.. 
ipython:: python + + import tempfile, os + import pyarrow as pa + import pyarrow.parquet as pq + from datafusion import SessionContext, col, lit, udf + + tmpdir = tempfile.mkdtemp() + parquet_path = os.path.join(tmpdir, "items.parquet") + pq.write_table( + pa.table({ + "id": list(range(100)), + "brand": ["A", "B", "C", "D"] * 25, + "qty": [i * 10 for i in range(100)], + }), + parquet_path, + ) + + ctx = SessionContext() + items = ctx.read_parquet(parquet_path) + +**Native-expression predicate.** The filter is a plain boolean tree +over column references and literals, so the optimizer can analyze it: + +.. ipython:: python + + native_filtered = items.filter( + (col("brand") == lit("A")) & (col("qty") >= lit(150)) + ) + print(native_filtered.execution_plan().display_indent()) + +Notice the ``DataSourceExec`` line. It carries three annotations the +optimizer computed from the predicate: + +- ``predicate=brand@1 = A AND qty@2 >= 150`` — the filter is pushed + into the Parquet scan itself, so the scan only reads matching rows. +- ``pruning_predicate=... brand_min@0 <= A AND A <= brand_max@1 ... + qty_max@4 >= 150`` — the scan prunes whole row groups by consulting + the Parquet min/max statistics in the footer *before* reading any + column data. +- ``required_guarantees=[brand in (A)]`` — the scan uses this when a + bloom filter or dictionary is available to skip pages. + +**UDF predicate.** Now wrap the same logic in a Python UDF: + +.. ipython:: python + + def brand_qty_filter(brand_arr: pa.Array, qty_arr: pa.Array) -> pa.Array: + return pa.array([ + b.as_py() == "A" and q.as_py() >= 150 + for b, q in zip(brand_arr, qty_arr) + ]) + + pred_udf = udf( + brand_qty_filter, [pa.string(), pa.int64()], pa.bool_(), "stable", + ) + udf_filtered = items.filter(pred_udf(col("brand"), col("qty"))) + print(udf_filtered.execution_plan().display_indent()) + +The ``DataSourceExec`` now carries only ``predicate=brand_qty_filter(...)``. 
+There is no ``pruning_predicate`` and no ``required_guarantees``: the +scan has to materialize every row group and hand each row to the +Python callback just to decide whether to keep it. + +At small scale the cost difference is invisible; on a Parquet file with +many row groups, or data whose min/max statistics line up well with +the predicate, the native form can skip most of the file. The UDF form +reads all of it. + +**Takeaway.** Reach for a UDF when the per-row computation is genuinely +not expressible as a tree of built-in functions (custom numerical work, +external lookups, complex business rules). When it *is* expressible — +even if the native form is a little more verbose — build the ``Expr`` +tree directly so the optimizer can see through it. For disjunctive +predicates the idiom is to produce one clause per bucket and combine +them with ``|``: + +.. code-block:: python + + from functools import reduce + from operator import or_ + from datafusion import col, lit, functions as f + + buckets = { + "Brand#12": {"containers": ["SM CASE", "SM BOX"], "min_qty": 1, "max_size": 5}, + "Brand#23": {"containers": ["MED BAG", "MED BOX"], "min_qty": 10, "max_size": 10}, + } + + def bucket_clause(brand, spec): + return ( + (col("brand") == lit(brand)) + & f.in_list(col("container"), [lit(c) for c in spec["containers"]]) + & (col("quantity") >= lit(spec["min_qty"])) + & (col("quantity") <= lit(spec["min_qty"] + 10)) + & (col("size") >= lit(1)) + & (col("size") <= lit(spec["max_size"])) + ) + + predicate = reduce(or_, (bucket_clause(b, s) for b, s in buckets.items())) + df = df.filter(predicate) + Aggregate Functions ------------------- diff --git a/examples/README.md b/examples/README.md index 0ef194afe..3024c782f 100644 --- a/examples/README.md +++ b/examples/README.md @@ -37,6 +37,7 @@ Here is a direct link to the file used in the examples: - [Query a Parquet file using the DataFrame API](./dataframe-parquet.py) - [Run a SQL query and store the results in a Pandas 
DataFrame](./sql-to-pandas.py) - [Query PyArrow Data](./query-pyarrow-data.py) +- [Array operations: membership tests, array_agg patterns, array inspection](./array-operations.py) ### Running User-Defined Python Code diff --git a/examples/array-operations.py b/examples/array-operations.py new file mode 100644 index 000000000..884f93974 --- /dev/null +++ b/examples/array-operations.py @@ -0,0 +1,104 @@ +# Licensed to the Apache Software Foundation (ASF) under one +# or more contributor license agreements. See the NOTICE file +# distributed with this work for additional information +# regarding copyright ownership. The ASF licenses this file +# to you under the Apache License, Version 2.0 (the +# "License"); you may not use this file except in compliance +# with the License. You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, +# software distributed under the License is distributed on an +# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +# KIND, either express or implied. See the License for the +# specific language governing permissions and limitations +# under the License. + +"""Array operations in DataFusion Python. + +Runnable reference for the idiomatic array-building and array-inspection +patterns. No external data is required -- the example constructs all inputs +through ``from_pydict``. + +Topics covered: + +- ``F.make_array`` to build a literal array expression. +- ``F.array_position`` and ``F.in_list`` for membership tests. +- ``F.array_length`` and ``F.array_element`` for inspecting an aggregated + array. +- ``F.array_agg(distinct=True, filter=...)`` for building two related arrays + per group in one pass, and filtering groups by array size afterwards. 
+ +Run with:: + + python examples/array-operations.py +""" + +from datafusion import SessionContext, col, lit +from datafusion import functions as F + +ctx = SessionContext() + + +# --------------------------------------------------------------------------- +# 1. Membership tests: in_list vs. array_position / make_array +# --------------------------------------------------------------------------- + +shipments = ctx.from_pydict( + { + "order_id": [1, 2, 3, 4, 5], + "shipmode": ["MAIL", "SHIP", "AIR", "TRUCK", "RAIL"], + } +) + +print("\n== in_list: is shipmode one of {MAIL, SHIP}? ==") +shipments.filter(F.in_list(col("shipmode"), [lit("MAIL"), lit("SHIP")])).show() + +print("\n== array_position / make_array: same question via a literal array ==") +shipments.filter( + ~F.array_position(F.make_array(lit("MAIL"), lit("SHIP")), col("shipmode")).is_null() +).show() + + +# --------------------------------------------------------------------------- +# 2. array_agg with filter to inspect groups of two related arrays +# --------------------------------------------------------------------------- +# +# Input represents line items per order, each fulfilled by one supplier. The +# `failed` column marks whether the supplier met the commit date. We want to +# find orders with multiple suppliers where exactly one of them failed, and +# report that single failing supplier. 
+ +line_items = ctx.from_pydict( + { + "order_id": [1, 1, 1, 2, 2, 3, 3, 3, 3], + "supplier_id": [100, 101, 102, 200, 201, 300, 301, 302, 303], + "failed": [False, True, False, False, False, True, False, False, False], + } +) + +grouped = line_items.aggregate( + [col("order_id")], + [ + F.array_agg(col("supplier_id"), distinct=True).alias("all_suppliers"), + F.array_agg( + col("supplier_id"), + filter=col("failed"), + distinct=True, + ).alias("failed_suppliers"), + ], +) + +print("\n== per-order supplier arrays ==") +grouped.sort(col("order_id").sort()).show() + +print("\n== orders with >1 supplier and exactly one failure ==") +singled_out = grouped.filter( + (F.array_length(col("failed_suppliers")) == lit(1)) + & (F.array_length(col("all_suppliers")) > lit(1)) +).select( + col("order_id"), + F.array_element(col("failed_suppliers"), lit(1)).alias("bad_supplier"), +) +singled_out.sort(col("order_id").sort()).show()