From fdc962a0cfb6e381f2ab1b55dbe94bf6bac855c0 Mon Sep 17 00:00:00 2001 From: Bolin Lin Date: Wed, 13 May 2026 14:53:14 -0400 Subject: [PATCH 1/9] feat: add stringEncode in CommonStringExprs --- .../org/apache/comet/serde/strings.scala | 24 +++++++++++++++++++ 1 file changed, 24 insertions(+) diff --git a/spark/src/main/scala/org/apache/comet/serde/strings.scala b/spark/src/main/scala/org/apache/comet/serde/strings.scala index 968fe8cd69..10ab2348da 100644 --- a/spark/src/main/scala/org/apache/comet/serde/strings.scala +++ b/spark/src/main/scala/org/apache/comet/serde/strings.scala @@ -479,4 +479,28 @@ trait CommonStringExprs { None } } + + def stringEncode( + expr: Expression, + charset: Expression, + value: Expression, + inputs: Seq[Attribute], + binding: Boolean): Option[Expr] = { + charset match { + case Literal(str, DataTypes.StringType) + if str.toString.toLowerCase(Locale.ROOT) == "utf-8" => + // encode(col, 'utf-8') is byte-equivalent to cast(string AS binary) + // because Spark's UTF8String already holds valid UTF-8 bytes. 
+ val strExpr = exprToProtoInternal(value, inputs, binding) + if (strExpr.isDefined) { + CometCast.castToProto(expr, None, DataTypes.BinaryType, strExpr.get, CometEvalMode.LEGACY) + } else { + withInfo(expr, value) + None + } + case _ => + withInfo(expr, "Comet only supports encoding with 'utf-8'.") + None + } + } } From 91f1b5733510ed6ef56f4017089a8711b7d3c78e Mon Sep 17 00:00:00 2001 From: Bolin Lin Date: Wed, 13 May 2026 15:31:45 -0400 Subject: [PATCH 2/9] feat: add encode check in CometExprShim --- .../org/apache/comet/shims/CometExprShim.scala | 3 ++- .../org/apache/comet/shims/CometExprShim.scala | 3 ++- .../org/apache/comet/shims/CometExprShim.scala | 13 +++++++++++++ .../org/apache/comet/shims/CometExprShim.scala | 13 +++++++++++++ .../org/apache/comet/shims/CometExprShim.scala | 13 +++++++++++++ 5 files changed, 43 insertions(+), 2 deletions(-) diff --git a/spark/src/main/spark-3.4/org/apache/comet/shims/CometExprShim.scala b/spark/src/main/spark-3.4/org/apache/comet/shims/CometExprShim.scala index f80a8909f6..09be02d8e0 100644 --- a/spark/src/main/spark-3.4/org/apache/comet/shims/CometExprShim.scala +++ b/spark/src/main/spark-3.4/org/apache/comet/shims/CometExprShim.scala @@ -43,7 +43,8 @@ trait CometExprShim extends CommonStringExprs { case s: StringDecode => // Right child is the encoding expression. stringDecode(expr, s.charset, s.bin, inputs, binding) - + case e: Encode => + stringEncode(expr, e.charset, e.value, inputs, binding) case _ => None } } diff --git a/spark/src/main/spark-3.5/org/apache/comet/shims/CometExprShim.scala b/spark/src/main/spark-3.5/org/apache/comet/shims/CometExprShim.scala index d3e3270700..2b095249fd 100644 --- a/spark/src/main/spark-3.5/org/apache/comet/shims/CometExprShim.scala +++ b/spark/src/main/spark-3.5/org/apache/comet/shims/CometExprShim.scala @@ -46,7 +46,8 @@ trait CometExprShim extends CommonStringExprs { case s: StringDecode => // Right child is the encoding expression. 
stringDecode(expr, s.charset, s.bin, inputs, binding) - + case e: Encode => + stringEncode(expr, e.charset, e.value, inputs, binding) case expr @ ToPrettyString(child, timeZoneId) => val castSupported = CometCast.isSupported( child.dataType, diff --git a/spark/src/main/spark-4.0/org/apache/comet/shims/CometExprShim.scala b/spark/src/main/spark-4.0/org/apache/comet/shims/CometExprShim.scala index 3d5b34bfd2..7890214253 100644 --- a/spark/src/main/spark-4.0/org/apache/comet/shims/CometExprShim.scala +++ b/spark/src/main/spark-4.0/org/apache/comet/shims/CometExprShim.scala @@ -93,6 +93,19 @@ trait CometExprShim extends CommonStringExprs { val Seq(bin, charset, _, _) = s.arguments stringDecode(expr, charset, bin, inputs, binding) + case s: StaticInvoke + if s.staticObject == classOf[Encode] && + s.dataType.isInstanceOf[BinaryType] && + s.functionName == "encode" && + s.arguments.size == 4 && + s.inputTypes == Seq( + StringTypeWithCollation(supportsTrimCollation = true), + StringTypeWithCollation(supportsTrimCollation = true), + BooleanType, + BooleanType) => + val Seq(value, charset, _, _) = s.arguments + stringEncode(expr, charset, value, inputs, binding) + case expr @ ToPrettyString(child, timeZoneId) => val castSupported = CometCast.isSupported( child.dataType, diff --git a/spark/src/main/spark-4.1/org/apache/comet/shims/CometExprShim.scala b/spark/src/main/spark-4.1/org/apache/comet/shims/CometExprShim.scala index 5e906a0d83..a94669f9b4 100644 --- a/spark/src/main/spark-4.1/org/apache/comet/shims/CometExprShim.scala +++ b/spark/src/main/spark-4.1/org/apache/comet/shims/CometExprShim.scala @@ -92,6 +92,19 @@ trait CometExprShim extends CommonStringExprs { val Seq(bin, charset, _, _) = s.arguments stringDecode(expr, charset, bin, inputs, binding) + case s: StaticInvoke + if s.staticObject == classOf[Encode] && + s.dataType.isInstanceOf[BinaryType] && + s.functionName == "encode" && + s.arguments.size == 4 && + s.inputTypes == Seq( + 
StringTypeWithCollation(supportsTrimCollation = true), + StringTypeWithCollation(supportsTrimCollation = true), + BooleanType, + BooleanType) => + val Seq(value, charset, _, _) = s.arguments + stringEncode(expr, charset, value, inputs, binding) + case expr @ ToPrettyString(child, timeZoneId) => val castSupported = CometCast.isSupported( child.dataType, diff --git a/spark/src/main/spark-4.2/org/apache/comet/shims/CometExprShim.scala b/spark/src/main/spark-4.2/org/apache/comet/shims/CometExprShim.scala index 5e906a0d83..a94669f9b4 100644 --- a/spark/src/main/spark-4.2/org/apache/comet/shims/CometExprShim.scala +++ b/spark/src/main/spark-4.2/org/apache/comet/shims/CometExprShim.scala @@ -92,6 +92,19 @@ trait CometExprShim extends CommonStringExprs { val Seq(bin, charset, _, _) = s.arguments stringDecode(expr, charset, bin, inputs, binding) + case s: StaticInvoke + if s.staticObject == classOf[Encode] && + s.dataType.isInstanceOf[BinaryType] && + s.functionName == "encode" && + s.arguments.size == 4 && + s.inputTypes == Seq( + StringTypeWithCollation(supportsTrimCollation = true), + StringTypeWithCollation(supportsTrimCollation = true), + BooleanType, + BooleanType) => + val Seq(value, charset, _, _) = s.arguments + stringEncode(expr, charset, value, inputs, binding) + case expr @ ToPrettyString(child, timeZoneId) => val castSupported = CometCast.isSupported( child.dataType, From d46c128340e22a5c8ae99e80ef8e146e3048cc2b Mon Sep 17 00:00:00 2001 From: Bolin Lin Date: Wed, 13 May 2026 15:58:38 -0400 Subject: [PATCH 3/9] chore: spotless check --- .../scala/org/apache/comet/serde/strings.scala | 7 ++++++- .../org/apache/comet/shims/CometExprShim.scala | 18 +++++++++--------- .../org/apache/comet/shims/CometExprShim.scala | 18 +++++++++--------- 3 files changed, 24 insertions(+), 19 deletions(-) diff --git a/spark/src/main/scala/org/apache/comet/serde/strings.scala b/spark/src/main/scala/org/apache/comet/serde/strings.scala index 10ab2348da..7e7bd8144f 100644 --- 
a/spark/src/main/scala/org/apache/comet/serde/strings.scala +++ b/spark/src/main/scala/org/apache/comet/serde/strings.scala @@ -493,7 +493,12 @@ trait CommonStringExprs { // because Spark's UTF8String already holds valid UTF-8 bytes. val strExpr = exprToProtoInternal(value, inputs, binding) if (strExpr.isDefined) { - CometCast.castToProto(expr, None, DataTypes.BinaryType, strExpr.get, CometEvalMode.LEGACY) + CometCast.castToProto( + expr, + None, + DataTypes.BinaryType, + strExpr.get, + CometEvalMode.LEGACY) } else { withInfo(expr, value) None diff --git a/spark/src/main/spark-4.1/org/apache/comet/shims/CometExprShim.scala b/spark/src/main/spark-4.1/org/apache/comet/shims/CometExprShim.scala index a94669f9b4..60313f97aa 100644 --- a/spark/src/main/spark-4.1/org/apache/comet/shims/CometExprShim.scala +++ b/spark/src/main/spark-4.1/org/apache/comet/shims/CometExprShim.scala @@ -93,15 +93,15 @@ trait CometExprShim extends CommonStringExprs { stringDecode(expr, charset, bin, inputs, binding) case s: StaticInvoke - if s.staticObject == classOf[Encode] && - s.dataType.isInstanceOf[BinaryType] && - s.functionName == "encode" && - s.arguments.size == 4 && - s.inputTypes == Seq( - StringTypeWithCollation(supportsTrimCollation = true), - StringTypeWithCollation(supportsTrimCollation = true), - BooleanType, - BooleanType) => + if s.staticObject == classOf[Encode] && + s.dataType.isInstanceOf[BinaryType] && + s.functionName == "encode" && + s.arguments.size == 4 && + s.inputTypes == Seq( + StringTypeWithCollation(supportsTrimCollation = true), + StringTypeWithCollation(supportsTrimCollation = true), + BooleanType, + BooleanType) => val Seq(value, charset, _, _) = s.arguments stringEncode(expr, charset, value, inputs, binding) diff --git a/spark/src/main/spark-4.2/org/apache/comet/shims/CometExprShim.scala b/spark/src/main/spark-4.2/org/apache/comet/shims/CometExprShim.scala index a94669f9b4..60313f97aa 100644 --- 
a/spark/src/main/spark-4.2/org/apache/comet/shims/CometExprShim.scala +++ b/spark/src/main/spark-4.2/org/apache/comet/shims/CometExprShim.scala @@ -93,15 +93,15 @@ trait CometExprShim extends CommonStringExprs { stringDecode(expr, charset, bin, inputs, binding) case s: StaticInvoke - if s.staticObject == classOf[Encode] && - s.dataType.isInstanceOf[BinaryType] && - s.functionName == "encode" && - s.arguments.size == 4 && - s.inputTypes == Seq( - StringTypeWithCollation(supportsTrimCollation = true), - StringTypeWithCollation(supportsTrimCollation = true), - BooleanType, - BooleanType) => + if s.staticObject == classOf[Encode] && + s.dataType.isInstanceOf[BinaryType] && + s.functionName == "encode" && + s.arguments.size == 4 && + s.inputTypes == Seq( + StringTypeWithCollation(supportsTrimCollation = true), + StringTypeWithCollation(supportsTrimCollation = true), + BooleanType, + BooleanType) => val Seq(value, charset, _, _) = s.arguments stringEncode(expr, charset, value, inputs, binding) From 8ff992212cd8cb0593d1881807ae1bc2028440b5 Mon Sep 17 00:00:00 2001 From: Bolin Lin Date: Wed, 13 May 2026 15:59:06 -0400 Subject: [PATCH 4/9] test: add encode sql --- .../sql-tests/expressions/string/encode.sql | 61 +++++++++++++++++++ 1 file changed, 61 insertions(+) create mode 100644 spark/src/test/resources/sql-tests/expressions/string/encode.sql diff --git a/spark/src/test/resources/sql-tests/expressions/string/encode.sql b/spark/src/test/resources/sql-tests/expressions/string/encode.sql new file mode 100644 index 0000000000..2a6a142426 --- /dev/null +++ b/spark/src/test/resources/sql-tests/expressions/string/encode.sql @@ -0,0 +1,61 @@ +-- Licensed to the Apache Software Foundation (ASF) under one +-- or more contributor license agreements. See the NOTICE file +-- distributed with this work for additional information +-- regarding copyright ownership. 
The ASF licenses this file +-- to you under the Apache License, Version 2.0 (the +-- "License"); you may not use this file except in compliance +-- with the License. You may obtain a copy of the License at +-- +-- http://www.apache.org/licenses/LICENSE-2.0 +-- +-- Unless required by applicable law or agreed to in writing, +-- software distributed under the License is distributed on an +-- "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +-- KIND, either express or implied. See the License for the +-- specific language governing permissions and limitations +-- under the License. + +-- Tests for the SQL `encode(str, charset)` function. +-- +-- Spark 3.x: Encode is a BinaryExpression(value, charset). +-- Spark 4.x+: Encode is RuntimeReplaceable; the analyzer rewrites it to +-- StaticInvoke(classOf[Encode], BinaryType, "encode", ...) + +statement +CREATE TABLE test_encode_utf8(s string) USING parquet + +statement +INSERT INTO test_encode_utf8 VALUES ('hello'), ('world'), (''), ('café'), (NULL) + +query +SELECT encode(s, 'utf-8') FROM test_encode_utf8 + +query +SELECT encode(s, 'UTF-8') FROM test_encode_utf8 + +-- Mixed-case charset literal exercises toLowerCase normalization +query +SELECT encode(s, 'Utf-8') FROM test_encode_utf8 + +query +SELECT encode('hello', 'utf-8'), encode('', 'utf-8'), encode(CAST(NULL AS STRING), 'utf-8') + +-- Different languages (French, Japanese) +query +SELECT encode('café', 'utf-8'), encode('日本語', 'utf-8') + +-- non-UTF-8 falls back to Spark JVM +statement +CREATE TABLE test_encode_charset_safe(s string) USING parquet + +statement +INSERT INTO test_encode_charset_safe VALUES ('hello'), ('world'), (''), (NULL) + +query expect_fallback(Comet only supports encoding with 'utf-8'.) +SELECT encode(s, 'UTF-16BE') FROM test_encode_charset_safe + +query expect_fallback(Comet only supports encoding with 'utf-8'.) +SELECT encode(s, 'US-ASCII') FROM test_encode_charset_safe + +query expect_fallback(Comet only supports encoding with 'utf-8'.) 
+SELECT encode(s, 'ISO-8859-1') FROM test_encode_charset_safe \ No newline at end of file From 3c08105819becf1717fd1746b7634b3f3af50c62 Mon Sep 17 00:00:00 2001 From: Bolin Lin Date: Wed, 13 May 2026 16:04:20 -0400 Subject: [PATCH 5/9] docs: support encode --- docs/source/contributor-guide/spark_expressions_support.md | 2 +- docs/source/user-guide/latest/expressions.md | 3 ++- 2 files changed, 3 insertions(+), 2 deletions(-) diff --git a/docs/source/contributor-guide/spark_expressions_support.md b/docs/source/contributor-guide/spark_expressions_support.md index 588ae5b457..06d971e81e 100644 --- a/docs/source/contributor-guide/spark_expressions_support.md +++ b/docs/source/contributor-guide/spark_expressions_support.md @@ -519,7 +519,7 @@ - [x] contains - [ ] decode - [ ] elt -- [ ] encode +- [x] encode - [x] endswith - [ ] find_in_set - [ ] format_number diff --git a/docs/source/user-guide/latest/expressions.md b/docs/source/user-guide/latest/expressions.md index 3842148a43..0969f72a90 100644 --- a/docs/source/user-guide/latest/expressions.md +++ b/docs/source/user-guide/latest/expressions.md @@ -55,13 +55,14 @@ of expressions that be disabled. 
## String Functions | Expression | -| --------------- | +|-----------------| | Ascii | | BitLength | | Chr | | Concat | | ConcatWs | | Contains | +| Encode | | EndsWith | | InitCap | | Left | From 16d4f89f1606dee6d76fa678e2493fd039da7c08 Mon Sep 17 00:00:00 2001 From: Bolin Lin Date: Wed, 13 May 2026 17:19:06 -0400 Subject: [PATCH 6/9] refactor: extract encode to common shim --- .../apache/comet/shims/CometExprShim.scala | 18 +----- .../apache/comet/shims/CometExprShim.scala | 4 +- .../apache/comet/shims/CometExprShim.scala | 18 +----- .../apache/comet/shims/ShimCometExprs.scala | 55 +++++++++++++++++++ 4 files changed, 61 insertions(+), 34 deletions(-) create mode 100644 spark/src/main/spark-4.x/org/apache/comet/shims/ShimCometExprs.scala diff --git a/spark/src/main/spark-4.0/org/apache/comet/shims/CometExprShim.scala b/spark/src/main/spark-4.0/org/apache/comet/shims/CometExprShim.scala index 7890214253..97450d24bd 100644 --- a/spark/src/main/spark-4.0/org/apache/comet/shims/CometExprShim.scala +++ b/spark/src/main/spark-4.0/org/apache/comet/shims/CometExprShim.scala @@ -37,7 +37,7 @@ import org.apache.comet.serde.QueryPlanSerde.{exprToProtoInternal, optExprWithIn /** * `CometExprShim` acts as a shim for parsing expressions from different Spark versions. 
*/ -trait CometExprShim extends CommonStringExprs { +trait CometExprShim extends ShimCometExprs { protected def evalMode(c: Cast): CometEvalMode.Value = CometEvalModeUtil.fromSparkEvalMode(c.evalMode) @@ -93,19 +93,6 @@ trait CometExprShim extends CommonStringExprs { val Seq(bin, charset, _, _) = s.arguments stringDecode(expr, charset, bin, inputs, binding) - case s: StaticInvoke - if s.staticObject == classOf[Encode] && - s.dataType.isInstanceOf[BinaryType] && - s.functionName == "encode" && - s.arguments.size == 4 && - s.inputTypes == Seq( - StringTypeWithCollation(supportsTrimCollation = true), - StringTypeWithCollation(supportsTrimCollation = true), - BooleanType, - BooleanType) => - val Seq(value, charset, _, _) = s.arguments - stringEncode(expr, charset, value, inputs, binding) - case expr @ ToPrettyString(child, timeZoneId) => val castSupported = CometCast.isSupported( child.dataType, @@ -181,8 +168,7 @@ trait CometExprShim extends CommonStringExprs { childExpr) optExprWithInfo(mapSortExpr, ms, ms.child) } - - case _ => None + case _ => sparkExprToProto(expr, inputs, binding) } } } diff --git a/spark/src/main/spark-4.1/org/apache/comet/shims/CometExprShim.scala b/spark/src/main/spark-4.1/org/apache/comet/shims/CometExprShim.scala index 60313f97aa..81a6971631 100644 --- a/spark/src/main/spark-4.1/org/apache/comet/shims/CometExprShim.scala +++ b/spark/src/main/spark-4.1/org/apache/comet/shims/CometExprShim.scala @@ -37,7 +37,7 @@ import org.apache.comet.serde.QueryPlanSerde.{exprToProtoInternal, optExprWithIn /** * `CometExprShim` acts as a shim for parsing expressions from different Spark versions. 
*/ -trait CometExprShim extends CommonStringExprs { +trait CometExprShim extends ShimCometExprs { protected def evalMode(c: Cast): CometEvalMode.Value = CometEvalModeUtil.fromSparkEvalMode(c.evalMode) @@ -181,7 +181,7 @@ trait CometExprShim extends CommonStringExprs { optExprWithInfo(mapSortExpr, ms, ms.child) } - case _ => None + case _ => sparkExprToProto(expr, inputs, binding) } } } diff --git a/spark/src/main/spark-4.2/org/apache/comet/shims/CometExprShim.scala b/spark/src/main/spark-4.2/org/apache/comet/shims/CometExprShim.scala index 60313f97aa..909428cbaf 100644 --- a/spark/src/main/spark-4.2/org/apache/comet/shims/CometExprShim.scala +++ b/spark/src/main/spark-4.2/org/apache/comet/shims/CometExprShim.scala @@ -37,7 +37,7 @@ import org.apache.comet.serde.QueryPlanSerde.{exprToProtoInternal, optExprWithIn /** * `CometExprShim` acts as a shim for parsing expressions from different Spark versions. */ -trait CometExprShim extends CommonStringExprs { +trait CometExprShim extends ShimCometExprs { protected def evalMode(c: Cast): CometEvalMode.Value = CometEvalModeUtil.fromSparkEvalMode(c.evalMode) @@ -92,19 +92,6 @@ trait CometExprShim extends CommonStringExprs { val Seq(bin, charset, _, _) = s.arguments stringDecode(expr, charset, bin, inputs, binding) - case s: StaticInvoke - if s.staticObject == classOf[Encode] && - s.dataType.isInstanceOf[BinaryType] && - s.functionName == "encode" && - s.arguments.size == 4 && - s.inputTypes == Seq( - StringTypeWithCollation(supportsTrimCollation = true), - StringTypeWithCollation(supportsTrimCollation = true), - BooleanType, - BooleanType) => - val Seq(value, charset, _, _) = s.arguments - stringEncode(expr, charset, value, inputs, binding) - case expr @ ToPrettyString(child, timeZoneId) => val castSupported = CometCast.isSupported( child.dataType, @@ -180,8 +167,7 @@ trait CometExprShim extends CommonStringExprs { childExpr) optExprWithInfo(mapSortExpr, ms, ms.child) } - - case _ => None + case _ => sparkExprToProto(expr, 
inputs, binding) } } } diff --git a/spark/src/main/spark-4.x/org/apache/comet/shims/ShimCometExprs.scala b/spark/src/main/spark-4.x/org/apache/comet/shims/ShimCometExprs.scala new file mode 100644 index 0000000000..af43b9e76f --- /dev/null +++ b/spark/src/main/spark-4.x/org/apache/comet/shims/ShimCometExprs.scala @@ -0,0 +1,55 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY + * KIND, either express or implied. See the License for the + * specific language governing permissions and limitations + * under the License. + */ + +package org.apache.comet.shims + +import org.apache.spark.sql.catalyst.expressions._ +import org.apache.spark.sql.catalyst.expressions.objects.StaticInvoke +import org.apache.spark.sql.internal.types.StringTypeWithCollation +import org.apache.spark.sql.types.{BinaryType, BooleanType} + +import org.apache.comet.serde.CommonStringExprs +import org.apache.comet.serde.ExprOuterClass.Expr + +trait ShimCometExprs extends CommonStringExprs { + + protected def sparkExprToProto( + expr: Expression, + inputs: Seq[Attribute], + binding: Boolean): Option[Expr] = { + expr match { + // encode(str, 'utf-8') -> cast(string AS binary) — Arrow's Utf8->Binary + // is a zero-copy reinterpret, matching Spark's UTF8String.getBytes() exactly. 
+ case s: StaticInvoke + if s.staticObject == classOf[Encode] && + s.dataType.isInstanceOf[BinaryType] && + s.functionName == "encode" && + s.arguments.size == 4 && + s.inputTypes == Seq( + StringTypeWithCollation(supportsTrimCollation = true), + StringTypeWithCollation(supportsTrimCollation = true), + BooleanType, + BooleanType) => + val Seq(value, charset, _, _) = s.arguments + stringEncode(expr, charset, value, inputs, binding) + + case _ => None + } + } +} From bc1bbcf9cfec99aa98ce0c81503bb32afc270708 Mon Sep 17 00:00:00 2001 From: Bolin Lin Date: Wed, 13 May 2026 17:33:24 -0400 Subject: [PATCH 7/9] feat: add null check --- spark/src/main/scala/org/apache/comet/serde/strings.scala | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/spark/src/main/scala/org/apache/comet/serde/strings.scala b/spark/src/main/scala/org/apache/comet/serde/strings.scala index 30544fe2df..e09b134ed9 100644 --- a/spark/src/main/scala/org/apache/comet/serde/strings.scala +++ b/spark/src/main/scala/org/apache/comet/serde/strings.scala @@ -479,7 +479,7 @@ trait CommonStringExprs { binding: Boolean): Option[Expr] = { charset match { case Literal(str, DataTypes.StringType) - if str.toString.toLowerCase(Locale.ROOT) == "utf-8" => + if str != null && str.toString.toLowerCase(Locale.ROOT) == "utf-8" => // decode(col, 'utf-8') can be treated as a cast with "try" eval mode that puts nulls // for invalid strings. // Left child is the binary expression. @@ -504,7 +504,7 @@ trait CommonStringExprs { binding: Boolean): Option[Expr] = { charset match { case Literal(str, DataTypes.StringType) - if str.toString.toLowerCase(Locale.ROOT) == "utf-8" => + if str != null && str.toString.toLowerCase(Locale.ROOT) == "utf-8" => // encode(col, 'utf-8') is byte-equivalent to cast(string AS binary) // because Spark's UTF8String already holds valid UTF-8 bytes. 
val strExpr = exprToProtoInternal(value, inputs, binding) From 3e55030c6ae8e716f3dc9b49809fefce06cedaaf Mon Sep 17 00:00:00 2001 From: Bolin Lin Date: Wed, 13 May 2026 20:42:59 -0400 Subject: [PATCH 8/9] fix: lint issue --- docs/source/user-guide/latest/expressions.md | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/docs/source/user-guide/latest/expressions.md b/docs/source/user-guide/latest/expressions.md index d9913111e7..dac3969b73 100644 --- a/docs/source/user-guide/latest/expressions.md +++ b/docs/source/user-guide/latest/expressions.md @@ -56,7 +56,7 @@ of expressions that be disabled. ## String Functions | Expression | -|-----------------| +| --------------- | | Ascii | | BitLength | | Chr | From dd2f63d10b513cac707b81635b48ccdc4073585d Mon Sep 17 00:00:00 2001 From: Bolin Lin Date: Thu, 14 May 2026 12:32:44 -0400 Subject: [PATCH 9/9] fix: scalafix check --- .../main/spark-4.0/org/apache/comet/shims/CometExprShim.scala | 2 +- .../main/spark-4.1/org/apache/comet/shims/CometExprShim.scala | 2 +- .../main/spark-4.2/org/apache/comet/shims/CometExprShim.scala | 2 +- 3 files changed, 3 insertions(+), 3 deletions(-) diff --git a/spark/src/main/spark-4.0/org/apache/comet/shims/CometExprShim.scala b/spark/src/main/spark-4.0/org/apache/comet/shims/CometExprShim.scala index 97450d24bd..085ee8cc9e 100644 --- a/spark/src/main/spark-4.0/org/apache/comet/shims/CometExprShim.scala +++ b/spark/src/main/spark-4.0/org/apache/comet/shims/CometExprShim.scala @@ -30,7 +30,7 @@ import org.apache.spark.sql.types.{ArrayType, BinaryType, BooleanType, DataTypes import org.apache.comet.CometConf import org.apache.comet.CometSparkSessionExtensions.withInfo import org.apache.comet.expressions.{CometCast, CometEvalMode} -import org.apache.comet.serde.{CommonStringExprs, Compatible, ExprOuterClass, Incompatible, SupportLevel} +import org.apache.comet.serde.{Compatible, ExprOuterClass, Incompatible, SupportLevel} import 
org.apache.comet.serde.ExprOuterClass.{BinaryOutputStyle, Expr} import org.apache.comet.serde.QueryPlanSerde.{exprToProtoInternal, optExprWithInfo, scalarFunctionExprToProto, scalarFunctionExprToProtoWithReturnType, supportedScalarSortElementType} diff --git a/spark/src/main/spark-4.1/org/apache/comet/shims/CometExprShim.scala b/spark/src/main/spark-4.1/org/apache/comet/shims/CometExprShim.scala index 81a6971631..e8b2b94acb 100644 --- a/spark/src/main/spark-4.1/org/apache/comet/shims/CometExprShim.scala +++ b/spark/src/main/spark-4.1/org/apache/comet/shims/CometExprShim.scala @@ -30,7 +30,7 @@ import org.apache.spark.sql.types.{ArrayType, BinaryType, BooleanType, DataTypes import org.apache.comet.CometConf import org.apache.comet.CometSparkSessionExtensions.withInfo import org.apache.comet.expressions.{CometCast, CometEvalMode} -import org.apache.comet.serde.{CommonStringExprs, Compatible, ExprOuterClass, Incompatible, SupportLevel} +import org.apache.comet.serde.{Compatible, ExprOuterClass, Incompatible, SupportLevel} import org.apache.comet.serde.ExprOuterClass.{BinaryOutputStyle, Expr} import org.apache.comet.serde.QueryPlanSerde.{exprToProtoInternal, optExprWithInfo, scalarFunctionExprToProto, scalarFunctionExprToProtoWithReturnType, supportedScalarSortElementType} diff --git a/spark/src/main/spark-4.2/org/apache/comet/shims/CometExprShim.scala b/spark/src/main/spark-4.2/org/apache/comet/shims/CometExprShim.scala index 909428cbaf..ee0c8d6810 100644 --- a/spark/src/main/spark-4.2/org/apache/comet/shims/CometExprShim.scala +++ b/spark/src/main/spark-4.2/org/apache/comet/shims/CometExprShim.scala @@ -30,7 +30,7 @@ import org.apache.spark.sql.types.{ArrayType, BinaryType, BooleanType, DataTypes import org.apache.comet.CometConf import org.apache.comet.CometSparkSessionExtensions.withInfo import org.apache.comet.expressions.{CometCast, CometEvalMode} -import org.apache.comet.serde.{CommonStringExprs, Compatible, ExprOuterClass, Incompatible, SupportLevel} +import 
org.apache.comet.serde.{Compatible, ExprOuterClass, Incompatible, SupportLevel} import org.apache.comet.serde.ExprOuterClass.{BinaryOutputStyle, Expr} import org.apache.comet.serde.QueryPlanSerde.{exprToProtoInternal, optExprWithInfo, scalarFunctionExprToProto, scalarFunctionExprToProtoWithReturnType, supportedScalarSortElementType}