From 28b9dafbe0fb0e8429038770368834dc643aac35 Mon Sep 17 00:00:00 2001 From: Andy Grove Date: Wed, 29 Apr 2026 17:48:06 -0600 Subject: [PATCH 1/8] feat: add support for parse_url, url_encode, url_decode expressions Wire up Spark's URL functions to the datafusion-spark implementations: - parse_url: maps to parse_url/try_parse_url based on failOnError (ANSI mode). Supports 2-arg (url, part) and 3-arg (url, part, key) forms with all Spark URL parts (HOST, PATH, QUERY, REF, etc.) - url_encode/url_decode: Spark rewrites these as RuntimeReplaceable to StaticInvoke(UrlCodec, "encode"/"decode"). Added handlers in CometStaticInvoke to intercept these and route to the native url_encode/url_decode functions. Closes #4150 (partial) --- .../spark_expressions_support.md | 6 +- native/core/src/execution/jni_api.rs | 8 ++ .../apache/comet/serde/QueryPlanSerde.scala | 5 +- .../org/apache/comet/serde/statics.scala | 31 ++++++- .../scala/org/apache/comet/serde/url.scala | 36 ++++++++ .../sql-tests/expressions/url/parse_url.sql | 88 +++++++++++++++++++ .../sql-tests/expressions/url/url_decode.sql | 54 ++++++++++++ .../sql-tests/expressions/url/url_encode.sql | 54 ++++++++++++ 8 files changed, 276 insertions(+), 6 deletions(-) create mode 100644 spark/src/main/scala/org/apache/comet/serde/url.scala create mode 100644 spark/src/test/resources/sql-tests/expressions/url/parse_url.sql create mode 100644 spark/src/test/resources/sql-tests/expressions/url/url_decode.sql create mode 100644 spark/src/test/resources/sql-tests/expressions/url/url_encode.sql diff --git a/docs/source/contributor-guide/spark_expressions_support.md b/docs/source/contributor-guide/spark_expressions_support.md index 1e4b4e34bc..c290f33227 100644 --- a/docs/source/contributor-guide/spark_expressions_support.md +++ b/docs/source/contributor-guide/spark_expressions_support.md @@ -476,9 +476,9 @@ ### url_funcs -- [ ] parse_url -- [ ] url_decode -- [ ] url_encode +- [x] parse_url +- [x] url_decode +- [x] url_encode ### 
window_funcs diff --git a/native/core/src/execution/jni_api.rs b/native/core/src/execution/jni_api.rs index 8da688563c..aa00c3ce24 100644 --- a/native/core/src/execution/jni_api.rs +++ b/native/core/src/execution/jni_api.rs @@ -62,6 +62,10 @@ use datafusion_spark::function::string::char::CharFunc; use datafusion_spark::function::string::concat::SparkConcat; use datafusion_spark::function::string::luhn_check::SparkLuhnCheck; use datafusion_spark::function::string::space::SparkSpace; +use datafusion_spark::function::url::parse_url::ParseUrl as SparkParseUrl; +use datafusion_spark::function::url::try_parse_url::TryParseUrl as SparkTryParseUrl; +use datafusion_spark::function::url::url_decode::UrlDecode as SparkUrlDecode; +use datafusion_spark::function::url::url_encode::UrlEncode as SparkUrlEncode; use futures::poll; use futures::stream::StreamExt; use futures::FutureExt; @@ -567,6 +571,10 @@ fn register_datafusion_spark_function(session_ctx: &SessionContext) { session_ctx.register_udf(ScalarUDF::new_from_impl(SparkArrayContains::default())); session_ctx.register_udf(ScalarUDF::new_from_impl(SparkBin::default())); session_ctx.register_udf(ScalarUDF::new_from_impl(SparkStrToMap::default())); + session_ctx.register_udf(ScalarUDF::new_from_impl(SparkParseUrl::default())); + session_ctx.register_udf(ScalarUDF::new_from_impl(SparkTryParseUrl::default())); + session_ctx.register_udf(ScalarUDF::new_from_impl(SparkUrlDecode::default())); + session_ctx.register_udf(ScalarUDF::new_from_impl(SparkUrlEncode::default())); } /// Prepares arrow arrays for output. 
diff --git a/spark/src/main/scala/org/apache/comet/serde/QueryPlanSerde.scala b/spark/src/main/scala/org/apache/comet/serde/QueryPlanSerde.scala index 448c2c2cb3..842226e391 100644 --- a/spark/src/main/scala/org/apache/comet/serde/QueryPlanSerde.scala +++ b/spark/src/main/scala/org/apache/comet/serde/QueryPlanSerde.scala @@ -233,6 +233,9 @@ object QueryPlanSerde extends Logging with CometExprShim with CometTypeShim { private val conversionExpressions: Map[Class[_ <: Expression], CometExpressionSerde[_]] = Map( classOf[Cast] -> CometCast) + private val urlExpressions: Map[Class[_ <: Expression], CometExpressionSerde[_]] = Map( + classOf[ParseUrl] -> CometParseUrl) + private[comet] val miscExpressions: Map[Class[_ <: Expression], CometExpressionSerde[_]] = Map( // TODO PromotePrecision classOf[Alias] -> CometAlias, @@ -257,7 +260,7 @@ object QueryPlanSerde extends Logging with CometExprShim with CometTypeShim { mathExpressions ++ hashExpressions ++ stringExpressions ++ conditionalExpressions ++ mapExpressions ++ predicateExpressions ++ structExpressions ++ bitwiseExpressions ++ miscExpressions ++ arrayExpressions ++ - temporalExpressions ++ conversionExpressions + temporalExpressions ++ conversionExpressions ++ urlExpressions /** * Mapping of Spark aggregate expression class to Comet expression handler. 
diff --git a/spark/src/main/scala/org/apache/comet/serde/statics.scala b/spark/src/main/scala/org/apache/comet/serde/statics.scala index 9dbc6d169f..60a66d8b10 100644 --- a/spark/src/main/scala/org/apache/comet/serde/statics.scala +++ b/spark/src/main/scala/org/apache/comet/serde/statics.scala @@ -19,11 +19,12 @@ package org.apache.comet.serde -import org.apache.spark.sql.catalyst.expressions.{Attribute, ExpressionImplUtils} +import org.apache.spark.sql.catalyst.expressions.{Attribute, ExpressionImplUtils, UrlCodec} import org.apache.spark.sql.catalyst.expressions.objects.StaticInvoke import org.apache.spark.sql.catalyst.util.CharVarcharCodegenUtils import org.apache.comet.CometSparkSessionExtensions.withInfo +import org.apache.comet.serde.QueryPlanSerde.{exprToProtoInternal, optExprWithInfo, scalarFunctionExprToProto} object CometStaticInvoke extends CometExpressionSerde[StaticInvoke] { @@ -35,7 +36,9 @@ object CometStaticInvoke extends CometExpressionSerde[StaticInvoke] { Map( ("readSidePadding", classOf[CharVarcharCodegenUtils]) -> CometScalarFunction( "read_side_padding"), - ("isLuhnNumber", classOf[ExpressionImplUtils]) -> CometScalarFunction("luhn_check")) + ("isLuhnNumber", classOf[ExpressionImplUtils]) -> CometScalarFunction("luhn_check"), + ("encode", UrlCodec.getClass) -> CometUrlEncodeStaticInvoke, + ("decode", UrlCodec.getClass) -> CometUrlDecodeStaticInvoke) override def convert( expr: StaticInvoke, @@ -53,3 +56,27 @@ object CometStaticInvoke extends CometExpressionSerde[StaticInvoke] { } } } + +// UrlCodec.encode(child, "UTF-8") -> url_encode(child) +object CometUrlEncodeStaticInvoke extends CometExpressionSerde[StaticInvoke] { + override def convert( + expr: StaticInvoke, + inputs: Seq[Attribute], + binding: Boolean): Option[ExprOuterClass.Expr] = { + val childExpr = exprToProtoInternal(expr.children.head, inputs, binding) + val optExpr = scalarFunctionExprToProto("url_encode", childExpr) + optExprWithInfo(optExpr, expr, expr.children: _*) + } +} + 
+// UrlCodec.decode(child, "UTF-8") -> url_decode(child) +object CometUrlDecodeStaticInvoke extends CometExpressionSerde[StaticInvoke] { + override def convert( + expr: StaticInvoke, + inputs: Seq[Attribute], + binding: Boolean): Option[ExprOuterClass.Expr] = { + val childExpr = exprToProtoInternal(expr.children.head, inputs, binding) + val optExpr = scalarFunctionExprToProto("url_decode", childExpr) + optExprWithInfo(optExpr, expr, expr.children: _*) + } +} diff --git a/spark/src/main/scala/org/apache/comet/serde/url.scala b/spark/src/main/scala/org/apache/comet/serde/url.scala new file mode 100644 index 0000000000..4a7db808aa --- /dev/null +++ b/spark/src/main/scala/org/apache/comet/serde/url.scala @@ -0,0 +1,36 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY + * KIND, either express or implied. See the License for the + * specific language governing permissions and limitations + * under the License. 
+ */ + +package org.apache.comet.serde + +import org.apache.spark.sql.catalyst.expressions.{Attribute, ParseUrl} + +import org.apache.comet.serde.QueryPlanSerde.{exprToProtoInternal, optExprWithInfo, scalarFunctionExprToProto} + +object CometParseUrl extends CometExpressionSerde[ParseUrl] { + override def convert( + expr: ParseUrl, + inputs: Seq[Attribute], + binding: Boolean): Option[ExprOuterClass.Expr] = { + val funcName = if (expr.failOnError) "parse_url" else "try_parse_url" + val childExprs = expr.children.map(exprToProtoInternal(_, inputs, binding)) + val optExpr = scalarFunctionExprToProto(funcName, childExprs: _*) + optExprWithInfo(optExpr, expr, expr.children: _*) + } +} diff --git a/spark/src/test/resources/sql-tests/expressions/url/parse_url.sql b/spark/src/test/resources/sql-tests/expressions/url/parse_url.sql new file mode 100644 index 0000000000..9be2b94898 --- /dev/null +++ b/spark/src/test/resources/sql-tests/expressions/url/parse_url.sql @@ -0,0 +1,88 @@ +-- Licensed to the Apache Software Foundation (ASF) under one +-- or more contributor license agreements. See the NOTICE file +-- distributed with this work for additional information +-- regarding copyright ownership. The ASF licenses this file +-- to you under the Apache License, Version 2.0 (the +-- "License"); you may not use this file except in compliance +-- with the License. You may obtain a copy of the License at +-- +-- http://www.apache.org/licenses/LICENSE-2.0 +-- +-- Unless required by applicable law or agreed to in writing, +-- software distributed under the License is distributed on an +-- "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +-- KIND, either express or implied. See the License for the +-- specific language governing permissions and limitations +-- under the License. 
+ +-- parse_url function +statement +CREATE TABLE test_urls(url string) USING parquet + +statement +INSERT INTO test_urls VALUES + ('http://spark.apache.org/path?query=1'), + ('http://user:password@host:8080/path?key=value&key2=value2#ref'), + ('http://example.com/path'), + ('http://example.com?foo=bar&baz=qux'), + ('http://example.com#fragment'), + (''), + (NULL) + +-- extract HOST +query +SELECT parse_url(url, 'HOST') FROM test_urls + +-- extract PATH +query +SELECT parse_url(url, 'PATH') FROM test_urls + +-- extract QUERY +query +SELECT parse_url(url, 'QUERY') FROM test_urls + +-- extract REF (fragment) +query +SELECT parse_url(url, 'REF') FROM test_urls + +-- extract PROTOCOL +query +SELECT parse_url(url, 'PROTOCOL') FROM test_urls + +-- extract FILE +query +SELECT parse_url(url, 'FILE') FROM test_urls + +-- extract AUTHORITY +query +SELECT parse_url(url, 'AUTHORITY') FROM test_urls + +-- extract USERINFO +query +SELECT parse_url(url, 'USERINFO') FROM test_urls + +-- extract specific query parameter (3-arg form) +query +SELECT parse_url(url, 'QUERY', 'query') FROM test_urls + +query +SELECT parse_url(url, 'QUERY', 'key') FROM test_urls + +query +SELECT parse_url(url, 'QUERY', 'key2') FROM test_urls + +query +SELECT parse_url(url, 'QUERY', 'nonexistent') FROM test_urls + +-- literal arguments +query +SELECT parse_url('http://spark.apache.org/path?query=1', 'HOST') + +query +SELECT parse_url('http://spark.apache.org/path?query=1', 'QUERY', 'query') + +query +SELECT parse_url(NULL, 'HOST') + +query +SELECT parse_url('http://example.com', 'QUERY') diff --git a/spark/src/test/resources/sql-tests/expressions/url/url_decode.sql b/spark/src/test/resources/sql-tests/expressions/url/url_decode.sql new file mode 100644 index 0000000000..cdf04804c1 --- /dev/null +++ b/spark/src/test/resources/sql-tests/expressions/url/url_decode.sql @@ -0,0 +1,54 @@ +-- Licensed to the Apache Software Foundation (ASF) under one +-- or more contributor license agreements. 
See the NOTICE file +-- distributed with this work for additional information +-- regarding copyright ownership. The ASF licenses this file +-- to you under the Apache License, Version 2.0 (the +-- "License"); you may not use this file except in compliance +-- with the License. You may obtain a copy of the License at +-- +-- http://www.apache.org/licenses/LICENSE-2.0 +-- +-- Unless required by applicable law or agreed to in writing, +-- software distributed under the License is distributed on an +-- "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +-- KIND, either express or implied. See the License for the +-- specific language governing permissions and limitations +-- under the License. + +-- url_decode function +statement +CREATE TABLE test_decode(s string) USING parquet + +statement +INSERT INTO test_decode VALUES + ('https%3A%2F%2Fspark.apache.org'), + ('hello+world'), + ('a%2Bb%3Dc%26d%3De'), + ('caf%C3%A9'), + (''), + (NULL), + ('no+encoding+needed') + +query +SELECT url_decode(s) FROM test_decode + +-- literal arguments +query +SELECT url_decode('https%3A%2F%2Fspark.apache.org') + +query +SELECT url_decode('hello+world') + +query +SELECT url_decode('') + +query +SELECT url_decode(NULL) + +-- roundtrip: encode then decode +query +SELECT url_decode(url_encode('hello world & goodbye')) + +-- multibyte UTF-8 +query +SELECT url_decode('%E6%97%A5%E6%9C%AC%E8%AA%9E%E3%83%86%E3%82%B9%E3%83%88') diff --git a/spark/src/test/resources/sql-tests/expressions/url/url_encode.sql b/spark/src/test/resources/sql-tests/expressions/url/url_encode.sql new file mode 100644 index 0000000000..7a7491eb8d --- /dev/null +++ b/spark/src/test/resources/sql-tests/expressions/url/url_encode.sql @@ -0,0 +1,54 @@ +-- Licensed to the Apache Software Foundation (ASF) under one +-- or more contributor license agreements. See the NOTICE file +-- distributed with this work for additional information +-- regarding copyright ownership. 
The ASF licenses this file +-- to you under the Apache License, Version 2.0 (the +-- "License"); you may not use this file except in compliance +-- with the License. You may obtain a copy of the License at +-- +-- http://www.apache.org/licenses/LICENSE-2.0 +-- +-- Unless required by applicable law or agreed to in writing, +-- software distributed under the License is distributed on an +-- "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +-- KIND, either express or implied. See the License for the +-- specific language governing permissions and limitations +-- under the License. + +-- url_encode function +statement +CREATE TABLE test_encode(s string) USING parquet + +statement +INSERT INTO test_encode VALUES + ('https://spark.apache.org'), + ('hello world'), + ('a+b=c&d=e'), + ('café'), + (''), + (NULL), + ('foo bar/baz?x=1&y=2') + +query +SELECT url_encode(s) FROM test_encode + +-- literal arguments +query +SELECT url_encode('https://spark.apache.org') + +query +SELECT url_encode('hello world') + +query +SELECT url_encode('') + +query +SELECT url_encode(NULL) + +-- special characters +query +SELECT url_encode('a b+c&d=e/f') + +-- multibyte UTF-8 +query +SELECT url_encode('日本語テスト') From c1f6c13e27ab96654b3b67524638f8f24eb5068c Mon Sep 17 00:00:00 2001 From: Andy Grove Date: Wed, 29 Apr 2026 18:22:25 -0600 Subject: [PATCH 2/8] feat: mark parse_url as Incompatible and split SQL tests parse_url diverges from Spark for empty-string URLs (Comet returns NULL where Spark returns "") and for FILE extraction on URLs without an explicit path (Comet inserts a leading "/"). Mark CometParseUrl as Incompatible so it falls back to Spark by default. Split the SQL tests into a default-mode fallback assertion and an opt-in parse_url_native.sql that exercises the native implementation under inputs that match Spark. 
--- .../scala/org/apache/comet/serde/url.scala | 9 +++ .../sql-tests/expressions/url/parse_url.sql | 72 ++--------------- .../expressions/url/parse_url_native.sql | 77 +++++++++++++++++++ 3 files changed, 91 insertions(+), 67 deletions(-) create mode 100644 spark/src/test/resources/sql-tests/expressions/url/parse_url_native.sql diff --git a/spark/src/main/scala/org/apache/comet/serde/url.scala b/spark/src/main/scala/org/apache/comet/serde/url.scala index 4a7db808aa..77bc4cf3b6 100644 --- a/spark/src/main/scala/org/apache/comet/serde/url.scala +++ b/spark/src/main/scala/org/apache/comet/serde/url.scala @@ -24,6 +24,15 @@ import org.apache.spark.sql.catalyst.expressions.{Attribute, ParseUrl} import org.apache.comet.serde.QueryPlanSerde.{exprToProtoInternal, optExprWithInfo, scalarFunctionExprToProto} object CometParseUrl extends CometExpressionSerde[ParseUrl] { + + private val incompatibleReason = + "Comet returns NULL for an empty-string URL, while Spark returns an empty string" + + override def getIncompatibleReasons(): Seq[String] = Seq(incompatibleReason) + + override def getSupportLevel(expr: ParseUrl): SupportLevel = + Incompatible(Some(incompatibleReason)) + override def convert( expr: ParseUrl, inputs: Seq[Attribute], diff --git a/spark/src/test/resources/sql-tests/expressions/url/parse_url.sql b/spark/src/test/resources/sql-tests/expressions/url/parse_url.sql index 9be2b94898..4f17909b11 100644 --- a/spark/src/test/resources/sql-tests/expressions/url/parse_url.sql +++ b/spark/src/test/resources/sql-tests/expressions/url/parse_url.sql @@ -15,74 +15,12 @@ -- specific language governing permissions and limitations -- under the License. --- parse_url function -statement -CREATE TABLE test_urls(url string) USING parquet +-- parse_url is marked Incompatible (see CometParseUrl). In the default +-- configuration, Comet falls back to Spark. See parse_url_native.sql for +-- coverage of the native implementation. 
-statement -INSERT INTO test_urls VALUES - ('http://spark.apache.org/path?query=1'), - ('http://user:password@host:8080/path?key=value&key2=value2#ref'), - ('http://example.com/path'), - ('http://example.com?foo=bar&baz=qux'), - ('http://example.com#fragment'), - (''), - (NULL) - --- extract HOST -query -SELECT parse_url(url, 'HOST') FROM test_urls - --- extract PATH -query -SELECT parse_url(url, 'PATH') FROM test_urls - --- extract QUERY -query -SELECT parse_url(url, 'QUERY') FROM test_urls - --- extract REF (fragment) -query -SELECT parse_url(url, 'REF') FROM test_urls - --- extract PROTOCOL -query -SELECT parse_url(url, 'PROTOCOL') FROM test_urls - --- extract FILE -query -SELECT parse_url(url, 'FILE') FROM test_urls - --- extract AUTHORITY -query -SELECT parse_url(url, 'AUTHORITY') FROM test_urls - --- extract USERINFO -query -SELECT parse_url(url, 'USERINFO') FROM test_urls - --- extract specific query parameter (3-arg form) -query -SELECT parse_url(url, 'QUERY', 'query') FROM test_urls - -query -SELECT parse_url(url, 'QUERY', 'key') FROM test_urls - -query -SELECT parse_url(url, 'QUERY', 'key2') FROM test_urls - -query -SELECT parse_url(url, 'QUERY', 'nonexistent') FROM test_urls - --- literal arguments -query +query expect_fallback(not fully compatible with Spark) SELECT parse_url('http://spark.apache.org/path?query=1', 'HOST') -query +query expect_fallback(not fully compatible with Spark) SELECT parse_url('http://spark.apache.org/path?query=1', 'QUERY', 'query') - -query -SELECT parse_url(NULL, 'HOST') - -query -SELECT parse_url('http://example.com', 'QUERY') diff --git a/spark/src/test/resources/sql-tests/expressions/url/parse_url_native.sql b/spark/src/test/resources/sql-tests/expressions/url/parse_url_native.sql new file mode 100644 index 0000000000..066634cd51 --- /dev/null +++ b/spark/src/test/resources/sql-tests/expressions/url/parse_url_native.sql @@ -0,0 +1,77 @@ +-- Licensed to the Apache Software Foundation (ASF) under one +-- or more contributor 
license agreements. See the NOTICE file +-- distributed with this work for additional information +-- regarding copyright ownership. The ASF licenses this file +-- to you under the Apache License, Version 2.0 (the +-- "License"); you may not use this file except in compliance +-- with the License. You may obtain a copy of the License at +-- +-- http://www.apache.org/licenses/LICENSE-2.0 +-- +-- Unless required by applicable law or agreed to in writing, +-- software distributed under the License is distributed on an +-- "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +-- KIND, either express or implied. See the License for the +-- specific language governing permissions and limitations +-- under the License. + +-- Exercises the native parse_url implementation. Inputs are restricted to +-- URLs with explicit paths to avoid known divergences (e.g. FILE on +-- path-less URLs, and empty-string input). + +-- Config: spark.comet.expression.ParseUrl.allowIncompatible=true + +statement +CREATE TABLE test_urls_native(url string) USING parquet + +statement +INSERT INTO test_urls_native VALUES + ('http://spark.apache.org/path?query=1'), + ('http://user:password@host:8080/path?key=value&key2=value2#ref'), + ('http://example.com/path'), + (NULL) + +query +SELECT parse_url(url, 'HOST') FROM test_urls_native + +query +SELECT parse_url(url, 'PATH') FROM test_urls_native + +query +SELECT parse_url(url, 'QUERY') FROM test_urls_native + +query +SELECT parse_url(url, 'REF') FROM test_urls_native + +query +SELECT parse_url(url, 'PROTOCOL') FROM test_urls_native + +query +SELECT parse_url(url, 'FILE') FROM test_urls_native + +query +SELECT parse_url(url, 'AUTHORITY') FROM test_urls_native + +query +SELECT parse_url(url, 'USERINFO') FROM test_urls_native + +query +SELECT parse_url(url, 'QUERY', 'query') FROM test_urls_native + +query +SELECT parse_url(url, 'QUERY', 'key') FROM test_urls_native + +query +SELECT parse_url(url, 'QUERY', 'key2') FROM test_urls_native + +query +SELECT 
parse_url(url, 'QUERY', 'nonexistent') FROM test_urls_native + +query +SELECT parse_url('http://spark.apache.org/path?query=1', 'HOST') + +query +SELECT parse_url('http://spark.apache.org/path?query=1', 'QUERY', 'query') + +query +SELECT parse_url(NULL, 'HOST') From de725f7278801b00a61fec355a497b8c9d4181c2 Mon Sep 17 00:00:00 2001 From: Andy Grove Date: Wed, 29 Apr 2026 18:24:20 -0600 Subject: [PATCH 3/8] docs: link parse_url incompatibility to upstream issue 21943 Reference https://github.com/apache/datafusion/issues/21943 from the CometParseUrl serde and the two parse_url SQL test files so the source of the divergence is traceable. --- spark/src/main/scala/org/apache/comet/serde/url.scala | 5 ++++- .../test/resources/sql-tests/expressions/url/parse_url.sql | 5 ++++- .../resources/sql-tests/expressions/url/parse_url_native.sql | 5 +++-- 3 files changed, 11 insertions(+), 4 deletions(-) diff --git a/spark/src/main/scala/org/apache/comet/serde/url.scala b/spark/src/main/scala/org/apache/comet/serde/url.scala index 77bc4cf3b6..5bc68aefcb 100644 --- a/spark/src/main/scala/org/apache/comet/serde/url.scala +++ b/spark/src/main/scala/org/apache/comet/serde/url.scala @@ -25,8 +25,11 @@ import org.apache.comet.serde.QueryPlanSerde.{exprToProtoInternal, optExprWithIn object CometParseUrl extends CometExpressionSerde[ParseUrl] { + // See https://github.com/apache/datafusion/issues/21943 for the upstream + // tracking issue covering the divergences from Spark. private val incompatibleReason = - "Comet returns NULL for an empty-string URL, while Spark returns an empty string" + "Comet returns NULL for an empty-string URL, while Spark returns an empty string. " + + "See https://github.com/apache/datafusion/issues/21943." 
override def getIncompatibleReasons(): Seq[String] = Seq(incompatibleReason) diff --git a/spark/src/test/resources/sql-tests/expressions/url/parse_url.sql b/spark/src/test/resources/sql-tests/expressions/url/parse_url.sql index 4f17909b11..bddbf1b775 100644 --- a/spark/src/test/resources/sql-tests/expressions/url/parse_url.sql +++ b/spark/src/test/resources/sql-tests/expressions/url/parse_url.sql @@ -15,7 +15,10 @@ -- specific language governing permissions and limitations -- under the License. --- parse_url is marked Incompatible (see CometParseUrl). In the default +-- parse_url is marked Incompatible (see CometParseUrl) because the native +-- implementation diverges from Spark for empty-string input and for FILE +-- extraction on path-less URLs. Tracked upstream at +-- https://github.com/apache/datafusion/issues/21943. In the default -- configuration, Comet falls back to Spark. See parse_url_native.sql for -- coverage of the native implementation. diff --git a/spark/src/test/resources/sql-tests/expressions/url/parse_url_native.sql b/spark/src/test/resources/sql-tests/expressions/url/parse_url_native.sql index 066634cd51..8d228d3768 100644 --- a/spark/src/test/resources/sql-tests/expressions/url/parse_url_native.sql +++ b/spark/src/test/resources/sql-tests/expressions/url/parse_url_native.sql @@ -16,8 +16,9 @@ -- under the License. -- Exercises the native parse_url implementation. Inputs are restricted to --- URLs with explicit paths to avoid known divergences (e.g. FILE on --- path-less URLs, and empty-string input). +-- URLs with explicit paths because the native implementation diverges from +-- Spark for empty-string input and for FILE extraction on path-less URLs. +-- Tracked upstream at https://github.com/apache/datafusion/issues/21943. 
-- Config: spark.comet.expression.ParseUrl.allowIncompatible=true From cd2b7d7adc458f30cb9da6bf15ded40c62c35f2f Mon Sep 17 00:00:00 2001 From: Andy Grove Date: Wed, 29 Apr 2026 18:29:13 -0600 Subject: [PATCH 4/8] test: extend url_encode coverage and record audit notes Add boundary, already-encoded, and whitespace-control rows to the url_encode SQL test, and run it across both Parquet dictionary settings via ConfigMatrix. Record per-Spark-version audit notes for parse_url and url_encode in spark_expressions_support.md. --- .../spark_expressions_support.md | 6 ++++++ .../sql-tests/expressions/url/url_encode.sql | 19 ++++++++++++++++++- 2 files changed, 24 insertions(+), 1 deletion(-) diff --git a/docs/source/contributor-guide/spark_expressions_support.md b/docs/source/contributor-guide/spark_expressions_support.md index c290f33227..906a904814 100644 --- a/docs/source/contributor-guide/spark_expressions_support.md +++ b/docs/source/contributor-guide/spark_expressions_support.md @@ -477,8 +477,14 @@ ### url_funcs - [x] parse_url + - 3.4.3, 2026-04-29 + - 3.5.8, 2026-04-29 + - 4.0.1, 2026-04-29: marked Incompatible. Native impl returns NULL for empty-string input where Spark returns "", and inserts a leading "/" when extracting FILE from a URL with no explicit path. Tracked at https://github.com/apache/datafusion/issues/21943. - [x] url_decode - [x] url_encode + - 3.4.3, 2026-04-29 + - 3.5.8, 2026-04-29 + - 4.0.1, 2026-04-29: replacement StaticInvoke is single-argument and inputs are collation-aware, but encoded output matches 3.4/3.5. No version-specific shim required. 
### window_funcs diff --git a/spark/src/test/resources/sql-tests/expressions/url/url_encode.sql b/spark/src/test/resources/sql-tests/expressions/url/url_encode.sql index 7a7491eb8d..29b3ab77ac 100644 --- a/spark/src/test/resources/sql-tests/expressions/url/url_encode.sql +++ b/spark/src/test/resources/sql-tests/expressions/url/url_encode.sql @@ -15,6 +15,8 @@ -- specific language governing permissions and limitations -- under the License. +-- ConfigMatrix: parquet.enable.dictionary=false,true + -- url_encode function statement CREATE TABLE test_encode(s string) USING parquet @@ -27,7 +29,10 @@ INSERT INTO test_encode VALUES ('café'), (''), (NULL), - ('foo bar/baz?x=1&y=2') + ('foo bar/baz?x=1&y=2'), + ('~*''()'), + ('a%20b'), + ('\t\n\r') query SELECT url_encode(s) FROM test_encode @@ -52,3 +57,15 @@ SELECT url_encode('a b+c&d=e/f') -- multibyte UTF-8 query SELECT url_encode('日本語テスト') + +-- boundary characters in the preserved set +query +SELECT url_encode('~*''()') + +-- already-encoded input (verify double-encoding of percent) +query +SELECT url_encode('a%20b') + +-- whitespace control characters +query +SELECT url_encode('\t\n\r') From 0831a2b6e02231529e26af56f06ad8587cc6cd7a Mon Sep 17 00:00:00 2001 From: Andy Grove Date: Wed, 29 Apr 2026 18:30:47 -0600 Subject: [PATCH 5/8] test: drop dictionary ConfigMatrix from url_encode SQL test The url_encode test data has no duplicate rows so dictionary encoding is not exercised. Run the test once instead of doubling the runtime. 
--- .../src/test/resources/sql-tests/expressions/url/url_encode.sql | 2 -- 1 file changed, 2 deletions(-) diff --git a/spark/src/test/resources/sql-tests/expressions/url/url_encode.sql b/spark/src/test/resources/sql-tests/expressions/url/url_encode.sql index 29b3ab77ac..3cb5933223 100644 --- a/spark/src/test/resources/sql-tests/expressions/url/url_encode.sql +++ b/spark/src/test/resources/sql-tests/expressions/url/url_encode.sql @@ -15,8 +15,6 @@ -- specific language governing permissions and limitations -- under the License. --- ConfigMatrix: parquet.enable.dictionary=false,true - -- url_encode function statement CREATE TABLE test_encode(s string) USING parquet From 3e4a351a1f7dccf25cd2d45da263882268ed8802 Mon Sep 17 00:00:00 2001 From: Andy Grove Date: Wed, 29 Apr 2026 18:39:54 -0600 Subject: [PATCH 6/8] test: add malformed-input and lowercase-hex coverage for url_decode Add a `query expect_error(%2s)` case locking in the contract that both Spark and Comet error on malformed percent-encoding with the bad sequence in the message. Add lowercase-hex inputs to verify case- insensitive percent-decoding. Record per-version url_decode audit notes in spark_expressions_support.md, including a pointer to issue #4155 for the Spark 4.0 `try_url_decode` gap. --- .../contributor-guide/spark_expressions_support.md | 3 +++ .../sql-tests/expressions/url/url_decode.sql | 13 ++++++++++++- 2 files changed, 15 insertions(+), 1 deletion(-) diff --git a/docs/source/contributor-guide/spark_expressions_support.md b/docs/source/contributor-guide/spark_expressions_support.md index 906a904814..74f198ed05 100644 --- a/docs/source/contributor-guide/spark_expressions_support.md +++ b/docs/source/contributor-guide/spark_expressions_support.md @@ -481,6 +481,9 @@ - 3.5.8, 2026-04-29 - 4.0.1, 2026-04-29: marked Incompatible. Native impl returns NULL for empty-string input where Spark returns "", and inserts a leading "/" when extracting FILE from a URL with no explicit path. 
Tracked at https://github.com/apache/datafusion/issues/21943. - [x] url_decode + - 3.4.3, 2026-04-29 + - 3.5.8, 2026-04-29 + - 4.0.1, 2026-04-29: `try_url_decode` is not handled correctly. Spark rewrites `try_url_decode(x)` to `StaticInvoke(UrlCodec, "decode", [x, Literal(false)], ...)`, but `CometUrlDecodeStaticInvoke` drops the `failOnError=false` flag and emits a plain `url_decode(x)`, so Comet errors on malformed input where Spark returns NULL. Tracked at https://github.com/apache/datafusion-comet/issues/4155. - [x] url_encode - 3.4.3, 2026-04-29 - 3.5.8, 2026-04-29 diff --git a/spark/src/test/resources/sql-tests/expressions/url/url_decode.sql b/spark/src/test/resources/sql-tests/expressions/url/url_decode.sql index cdf04804c1..42b22f575f 100644 --- a/spark/src/test/resources/sql-tests/expressions/url/url_decode.sql +++ b/spark/src/test/resources/sql-tests/expressions/url/url_decode.sql @@ -27,7 +27,9 @@ INSERT INTO test_decode VALUES ('caf%C3%A9'), (''), (NULL), - ('no+encoding+needed') + ('no+encoding+needed'), + ('%21%40%23%24%25%5E%26%2A%28%29%5F%2B'), + ('%2a%2b%2c') query SELECT url_decode(s) FROM test_decode @@ -52,3 +54,12 @@ SELECT url_decode(url_encode('hello world & goodbye')) -- multibyte UTF-8 query SELECT url_decode('%E6%97%A5%E6%9C%AC%E8%AA%9E%E3%83%86%E3%82%B9%E3%83%88') + +-- lowercase hex (RFC 3986 says hex digits are case-insensitive) +query +SELECT url_decode('%2a%2b%2c') + +-- malformed percent-encoding: both Spark and Comet must error and the bad +-- sequence must appear in the error message +query expect_error(%2s) +SELECT url_decode('http%3A%2F%2spark.apache.org') From ad3a1d21c69a1652df42f1ea3822b3c34c425780 Mon Sep 17 00:00:00 2001 From: Andy Grove Date: Wed, 29 Apr 2026 18:50:23 -0600 Subject: [PATCH 7/8] test: extend parse_url coverage and trim incompat reason - Trim CometParseUrl.incompatibleReason to one sentence linking to the upstream issue, since the reason appears in EXPLAIN output. 
- parse_url.sql: document the three known divergent shapes and add a fallback assertion for trailing-slash PATH (Spark "/", native ""). - parse_url_native.sql: add an ANSI-mode invalid-URL expect_error case (locks in agreement on the "The url is invalid" message). - try_parse_url.sql: new file gated on MinSparkVersion 4.0 covering valid input, malformed input (NULL fallback), and NULL input. --- .../scala/org/apache/comet/serde/url.scala | 6 +-- .../sql-tests/expressions/url/parse_url.sql | 20 +++++++--- .../expressions/url/parse_url_native.sql | 7 ++++ .../expressions/url/try_parse_url.sql | 40 +++++++++++++++++++ 4 files changed, 64 insertions(+), 9 deletions(-) create mode 100644 spark/src/test/resources/sql-tests/expressions/url/try_parse_url.sql diff --git a/spark/src/main/scala/org/apache/comet/serde/url.scala b/spark/src/main/scala/org/apache/comet/serde/url.scala index 5bc68aefcb..c00c65977f 100644 --- a/spark/src/main/scala/org/apache/comet/serde/url.scala +++ b/spark/src/main/scala/org/apache/comet/serde/url.scala @@ -25,10 +25,10 @@ import org.apache.comet.serde.QueryPlanSerde.{exprToProtoInternal, optExprWithIn object CometParseUrl extends CometExpressionSerde[ParseUrl] { - // See https://github.com/apache/datafusion/issues/21943 for the upstream - // tracking issue covering the divergences from Spark. + // The full list of edge-case divergences is tracked at + // https://github.com/apache/datafusion/issues/21943. private val incompatibleReason = - "Comet returns NULL for an empty-string URL, while Spark returns an empty string. " + + "Native parse_url diverges from Spark on several edge cases. " + "See https://github.com/apache/datafusion/issues/21943." 
override def getIncompatibleReasons(): Seq[String] = Seq(incompatibleReason) diff --git a/spark/src/test/resources/sql-tests/expressions/url/parse_url.sql b/spark/src/test/resources/sql-tests/expressions/url/parse_url.sql index bddbf1b775..20e80a2d4a 100644 --- a/spark/src/test/resources/sql-tests/expressions/url/parse_url.sql +++ b/spark/src/test/resources/sql-tests/expressions/url/parse_url.sql @@ -15,15 +15,23 @@ -- specific language governing permissions and limitations -- under the License. --- parse_url is marked Incompatible (see CometParseUrl) because the native --- implementation diverges from Spark for empty-string input and for FILE --- extraction on path-less URLs. Tracked upstream at --- https://github.com/apache/datafusion/issues/21943. In the default --- configuration, Comet falls back to Spark. See parse_url_native.sql for --- coverage of the native implementation. +-- parse_url is marked Incompatible (see CometParseUrl). Known divergences from +-- Spark, tracked upstream at https://github.com/apache/datafusion/issues/21943: +-- 1. empty-string URL returns NULL instead of "" for any part +-- 2. FILE on a URL without an explicit path returns "/?..." instead of "?..." +-- 3. PATH on a URL with a bare trailing slash returns "" instead of "/" +-- In the default configuration, Comet falls back to Spark. The first two queries +-- below verify that normal-shape URLs take the fallback path; the third query +-- exercises one of the divergent shapes (trailing-slash PATH) to lock in that +-- the fallback handles it correctly. See parse_url_native.sql for native-execution coverage. query expect_fallback(not fully compatible with Spark) SELECT parse_url('http://spark.apache.org/path?query=1', 'HOST') query expect_fallback(not fully compatible with Spark) SELECT parse_url('http://spark.apache.org/path?query=1', 'QUERY', 'query') + +-- Trailing-slash PATH: Spark returns "/", native impl returns "". Verifying +-- the fallback path emits Spark's "/". 
+query expect_fallback(not fully compatible with Spark) +SELECT parse_url('http://example.com/', 'PATH') diff --git a/spark/src/test/resources/sql-tests/expressions/url/parse_url_native.sql b/spark/src/test/resources/sql-tests/expressions/url/parse_url_native.sql index 8d228d3768..654506ebaf 100644 --- a/spark/src/test/resources/sql-tests/expressions/url/parse_url_native.sql +++ b/spark/src/test/resources/sql-tests/expressions/url/parse_url_native.sql @@ -21,6 +21,7 @@ -- Tracked upstream at https://github.com/apache/datafusion/issues/21943. -- Config: spark.comet.expression.ParseUrl.allowIncompatible=true +-- Config: spark.sql.ansi.enabled=true statement CREATE TABLE test_urls_native(url string) USING parquet @@ -76,3 +77,9 @@ SELECT parse_url('http://spark.apache.org/path?query=1', 'QUERY', 'query') query SELECT parse_url(NULL, 'HOST') + +-- ANSI-mode invalid URL: parse_url's failOnError is driven by spark.sql.ansi.enabled +-- (set above). Both Spark (INVALID_URL error class) and Comet's native impl +-- produce a message starting "The url is invalid". +query expect_error(The url is invalid) +SELECT parse_url('inva lid://user:pass@host/file', 'HOST') diff --git a/spark/src/test/resources/sql-tests/expressions/url/try_parse_url.sql b/spark/src/test/resources/sql-tests/expressions/url/try_parse_url.sql new file mode 100644 index 0000000000..e62dc329e4 --- /dev/null +++ b/spark/src/test/resources/sql-tests/expressions/url/try_parse_url.sql @@ -0,0 +1,40 @@ +-- Licensed to the Apache Software Foundation (ASF) under one +-- or more contributor license agreements. See the NOTICE file +-- distributed with this work for additional information +-- regarding copyright ownership. The ASF licenses this file +-- to you under the Apache License, Version 2.0 (the +-- "License"); you may not use this file except in compliance +-- with the License. 
You may obtain a copy of the License at +-- +-- http://www.apache.org/licenses/LICENSE-2.0 +-- +-- Unless required by applicable law or agreed to in writing, +-- software distributed under the License is distributed on an +-- "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +-- KIND, either express or implied. See the License for the +-- specific language governing permissions and limitations +-- under the License. + +-- try_parse_url is Spark 4.0+. It rewrites to ParseUrl(_, failOnError=false), +-- so Comet emits the native try_parse_url scalar function. parse_url is marked +-- Incompatible (see CometParseUrl), so this test opts in via allowIncompatible. + +-- MinSparkVersion: 4.0 +-- Config: spark.comet.expression.ParseUrl.allowIncompatible=true + +-- Valid URL: same answer as parse_url. +query +SELECT try_parse_url('http://spark.apache.org/path?query=1', 'HOST') + +query +SELECT try_parse_url('http://spark.apache.org/path?query=1', 'QUERY', 'query') + +-- Malformed URL with a scheme: Spark returns NULL, Comet's try_parse_url +-- returns NULL (failOnError=false propagates through CometParseUrl to the +-- native try_parse_url UDF). +query +SELECT try_parse_url('inva lid://user:pass@host/file', 'HOST') + +-- NULL input. 
+query +SELECT try_parse_url(NULL, 'HOST') From db441ad8e937f51c66c620d47b4a86a82522b169 Mon Sep 17 00:00:00 2001 From: Andy Grove Date: Wed, 29 Apr 2026 18:51:04 -0600 Subject: [PATCH 8/8] docs: point parse_url support note at Comet issue 4156 --- docs/source/contributor-guide/spark_expressions_support.md | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/docs/source/contributor-guide/spark_expressions_support.md b/docs/source/contributor-guide/spark_expressions_support.md index 74f198ed05..857689b89a 100644 --- a/docs/source/contributor-guide/spark_expressions_support.md +++ b/docs/source/contributor-guide/spark_expressions_support.md @@ -479,7 +479,7 @@ - [x] parse_url - 3.4.3, 2026-04-29 - 3.5.8, 2026-04-29 - - 4.0.1, 2026-04-29: marked Incompatible. Native impl returns NULL for empty-string input where Spark returns "", and inserts a leading "/" when extracting FILE from a URL with no explicit path. Tracked at https://github.com/apache/datafusion/issues/21943. + - 4.0.1, 2026-04-29: marked Incompatible. Comet tracks the work at https://github.com/apache/datafusion-comet/issues/4156, with the divergences enumerated upstream at https://github.com/apache/datafusion/issues/21943. - [x] url_decode - 3.4.3, 2026-04-29 - 3.5.8, 2026-04-29