diff --git a/docs/source/contributor-guide/spark_expressions_support.md b/docs/source/contributor-guide/spark_expressions_support.md index 588ae5b457..e74a97bc69 100644 --- a/docs/source/contributor-guide/spark_expressions_support.md +++ b/docs/source/contributor-guide/spark_expressions_support.md @@ -238,7 +238,11 @@ - [x] dayofyear - [x] extract - [x] from_unixtime -- [ ] from_utc_timestamp +- [x] from_utc_timestamp + - Spark 3.4.3 (audited 2026-05-12): identical to 3.5.8. + - Spark 3.5.8 (audited 2026-05-12): baseline. + - Spark 4.0.1 (audited 2026-05-12): `inputTypes` widened to `StringTypeWithCollation`; behaviour unchanged for ASCII timezone strings. + - Known divergence: Comet's native timezone parser does not accept Spark's legacy zone forms (`GMT+1`, `UTC+1`, three-letter abbreviations like `PST`). Such timezones throw a native parse error at execution. - [x] hour - [x] last_day - [ ] localtimestamp @@ -270,7 +274,11 @@ - [ ] to_timestamp_ltz - [ ] to_timestamp_ntz - [ ] to_unix_timestamp -- [ ] to_utc_timestamp +- [x] to_utc_timestamp + - Spark 3.4.3 (audited 2026-05-12): identical to 3.5.8. + - Spark 3.5.8 (audited 2026-05-12): baseline. + - Spark 4.0.1 (audited 2026-05-12): `inputTypes` widened to `StringTypeWithCollation`; behaviour unchanged for ASCII timezone strings. + - Known divergence: Comet's native timezone parser does not accept Spark's legacy zone forms (`GMT+1`, `UTC+1`, three-letter abbreviations like `PST`). Such timezones throw a native parse error at execution. 
- [x] trunc - [ ] try_make_interval - [ ] try_make_timestamp diff --git a/native/core/src/execution/jni_api.rs b/native/core/src/execution/jni_api.rs index 5d3dbb8266..6dbcc20354 100644 --- a/native/core/src/execution/jni_api.rs +++ b/native/core/src/execution/jni_api.rs @@ -48,8 +48,10 @@ use datafusion_spark::function::bitwise::bit_get::SparkBitGet; use datafusion_spark::function::bitwise::bitwise_not::SparkBitwiseNot; use datafusion_spark::function::datetime::date_add::SparkDateAdd; use datafusion_spark::function::datetime::date_sub::SparkDateSub; +use datafusion_spark::function::datetime::from_utc_timestamp::SparkFromUtcTimestamp; use datafusion_spark::function::datetime::last_day::SparkLastDay; use datafusion_spark::function::datetime::next_day::SparkNextDay; +use datafusion_spark::function::datetime::to_utc_timestamp::SparkToUtcTimestamp; use datafusion_spark::function::hash::crc32::SparkCrc32; use datafusion_spark::function::hash::sha1::SparkSha1; use datafusion_spark::function::hash::sha2::SparkSha2; @@ -555,8 +557,10 @@ fn register_datafusion_spark_function(session_ctx: &SessionContext) { session_ctx.register_udf(ScalarUDF::new_from_impl(SparkBitGet::default())); session_ctx.register_udf(ScalarUDF::new_from_impl(SparkDateAdd::default())); session_ctx.register_udf(ScalarUDF::new_from_impl(SparkDateSub::default())); + session_ctx.register_udf(ScalarUDF::new_from_impl(SparkFromUtcTimestamp::default())); session_ctx.register_udf(ScalarUDF::new_from_impl(SparkLastDay::default())); session_ctx.register_udf(ScalarUDF::new_from_impl(SparkNextDay::default())); + session_ctx.register_udf(ScalarUDF::new_from_impl(SparkToUtcTimestamp::default())); session_ctx.register_udf(ScalarUDF::new_from_impl(SparkSha1::default())); session_ctx.register_udf(ScalarUDF::new_from_impl(SparkConcat::default())); session_ctx.register_udf(ScalarUDF::new_from_impl(SparkBitwiseNot::default())); diff --git a/spark/src/main/scala/org/apache/comet/serde/QueryPlanSerde.scala 
b/spark/src/main/scala/org/apache/comet/serde/QueryPlanSerde.scala index 2d138450e9..f1581a92a8 100644 --- a/spark/src/main/scala/org/apache/comet/serde/QueryPlanSerde.scala +++ b/spark/src/main/scala/org/apache/comet/serde/QueryPlanSerde.scala @@ -221,6 +221,8 @@ object QueryPlanSerde extends Logging with CometExprShim with CometTypeShim { classOf[DateSub] -> CometDateSub, classOf[UnixDate] -> CometUnixDate, classOf[FromUnixTime] -> CometFromUnixTime, + classOf[FromUTCTimestamp] -> CometFromUTCTimestamp, + classOf[ToUTCTimestamp] -> CometToUTCTimestamp, classOf[LastDay] -> CometLastDay, classOf[Hour] -> CometHour, classOf[MakeDate] -> CometMakeDate, diff --git a/spark/src/main/scala/org/apache/comet/serde/datetime.scala b/spark/src/main/scala/org/apache/comet/serde/datetime.scala index cb3be75717..0f01e1ccfd 100644 --- a/spark/src/main/scala/org/apache/comet/serde/datetime.scala +++ b/spark/src/main/scala/org/apache/comet/serde/datetime.scala @@ -21,7 +21,7 @@ package org.apache.comet.serde import java.util.Locale -import org.apache.spark.sql.catalyst.expressions.{Attribute, DateAdd, DateDiff, DateFormatClass, DateFromUnixDate, DateSub, DayOfMonth, DayOfWeek, DayOfYear, Days, GetDateField, Hour, Hours, LastDay, Literal, MakeDate, Minute, Month, NextDay, Quarter, Second, SecondsToTimestamp, TruncDate, TruncTimestamp, UnixDate, UnixTimestamp, WeekDay, WeekOfYear, Year} +import org.apache.spark.sql.catalyst.expressions.{Attribute, DateAdd, DateDiff, DateFormatClass, DateFromUnixDate, DateSub, DayOfMonth, DayOfWeek, DayOfYear, Days, FromUTCTimestamp, GetDateField, Hour, Hours, LastDay, Literal, MakeDate, Minute, Month, NextDay, Quarter, Second, SecondsToTimestamp, ToUTCTimestamp, TruncDate, TruncTimestamp, UnixDate, UnixTimestamp, WeekDay, WeekOfYear, Year} import org.apache.spark.sql.internal.SQLConf import org.apache.spark.sql.types.{DateType, DoubleType, FloatType, IntegerType, LongType, StringType, TimestampNTZType, TimestampType} import 
org.apache.spark.unsafe.types.UTF8String @@ -364,6 +364,45 @@ object CometDateAdd extends CometScalarFunction[DateAdd]("date_add") object CometDateSub extends CometScalarFunction[DateSub]("date_sub") +/** Holds the incompatibility text shared by the from_utc_timestamp and to_utc_timestamp serdes. */ private object UTCTimestampSerde { + /** Reason string returned by getIncompatibleReasons in both UTC timestamp serdes. */ val tzParseIncompatReason: String = + "Comet's native timezone parser only accepts IANA zone IDs (e.g." + + " `America/Los_Angeles`) and fixed offsets in `+HH:MM` form. Spark also" + + " accepts forms such as `GMT+1`, `UTC+1`, or three-letter abbreviations like" + + " `PST`; queries using those forms will throw a native parse error at" + + " execution time." +} + +/** Serde for the Spark FromUTCTimestamp expression: converts each child and wraps the results in a scalar-function proto named `from_utc_timestamp`. */ object CometFromUTCTimestamp extends CometExpressionSerde[FromUTCTimestamp] { + + /** Returns the single shared timezone-parsing reason from UTCTimestampSerde. */ override def getIncompatibleReasons(): Seq[String] = + Seq(UTCTimestampSerde.tzParseIncompatReason) + + /** Converts every child with exprToProtoInternal, builds the `from_utc_timestamp` scalar-function call from them, and passes the result through optExprWithInfo together with the original expression and its children. NOTE(review): presumably yields None when any child cannot be converted; confirm against scalarFunctionExprToProto. */ override def convert( + expr: FromUTCTimestamp, + inputs: Seq[Attribute], + binding: Boolean): Option[ExprOuterClass.Expr] = { + val childExprs = expr.children.map(exprToProtoInternal(_, inputs, binding)) + val optExpr = scalarFunctionExprToProto("from_utc_timestamp", childExprs: _*) + optExprWithInfo(optExpr, expr, expr.children: _*) + } +} + +/** Serde for the Spark ToUTCTimestamp expression: converts each child and wraps the results in a scalar-function proto named `to_utc_timestamp`. */ object CometToUTCTimestamp extends CometExpressionSerde[ToUTCTimestamp] { + + /** Returns the single shared timezone-parsing reason from UTCTimestampSerde. */ override def getIncompatibleReasons(): Seq[String] = + Seq(UTCTimestampSerde.tzParseIncompatReason) + + /** Converts every child with exprToProtoInternal, builds the `to_utc_timestamp` scalar-function call from them, and passes the result through optExprWithInfo together with the original expression and its children. NOTE(review): presumably yields None when any child cannot be converted; confirm against scalarFunctionExprToProto. */ override def convert( + expr: ToUTCTimestamp, + inputs: Seq[Attribute], + binding: Boolean): Option[ExprOuterClass.Expr] = { + val childExprs = expr.children.map(exprToProtoInternal(_, inputs, binding)) + val optExpr = scalarFunctionExprToProto("to_utc_timestamp", childExprs: _*) + optExprWithInfo(optExpr, expr, expr.children: _*) + } +} + object CometNextDay extends CometScalarFunction[NextDay]("next_day") object CometMakeDate extends CometScalarFunction[MakeDate]("make_date") diff --git a/spark/src/test/resources/sql-tests/expressions/datetime/from_utc_timestamp.sql b/spark/src/test/resources/sql-tests/expressions/datetime/from_utc_timestamp.sql new file mode 100644 
index 0000000000..4e6a125fae --- /dev/null +++ b/spark/src/test/resources/sql-tests/expressions/datetime/from_utc_timestamp.sql @@ -0,0 +1,67 @@ +-- Licensed to the Apache Software Foundation (ASF) under one +-- or more contributor license agreements. See the NOTICE file +-- distributed with this work for additional information +-- regarding copyright ownership. The ASF licenses this file +-- to you under the Apache License, Version 2.0 (the +-- "License"); you may not use this file except in compliance +-- with the License. You may obtain a copy of the License at +-- +-- http://www.apache.org/licenses/LICENSE-2.0 +-- +-- Unless required by applicable law or agreed to in writing, +-- software distributed under the License is distributed on an +-- "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +-- KIND, either express or implied. See the License for the +-- specific language governing permissions and limitations +-- under the License. + +-- The result of from_utc_timestamp is a shift of the underlying microsecond +-- value, so it must not depend on the session timezone. Verify across two +-- representative session zones. +-- ConfigMatrix: spark.sql.session.timeZone=UTC,America/Los_Angeles + +statement +CREATE TABLE test_from_utc_timestamp(ts timestamp, tz string) USING parquet + +-- Includes a summer and a winter row for the LA timezone so that both DST +-- branches are exercised. The third row uses a fixed-offset tz to match the +-- form Spark's own DateFunctionsSuite covers via CEST = +02:00. 
+statement +INSERT INTO test_from_utc_timestamp VALUES + (timestamp('2015-07-24 00:00:00'), 'America/Los_Angeles'), + (timestamp('2015-01-24 00:00:00'), 'America/Los_Angeles'), + (timestamp('2024-06-15 10:30:45'), '+02:00'), + (timestamp('2024-01-01 00:00:00'), 'Asia/Seoul'), + (timestamp('1969-12-31 23:59:59'), 'UTC'), + (NULL, 'UTC'), + (timestamp('2024-06-15 10:30:45'), NULL), + (NULL, NULL) + +-- column timestamp, literal IANA timezone +query +SELECT from_utc_timestamp(ts, 'America/Los_Angeles') FROM test_from_utc_timestamp + +query +SELECT from_utc_timestamp(ts, 'Asia/Seoul') FROM test_from_utc_timestamp + +query +SELECT from_utc_timestamp(ts, 'UTC') FROM test_from_utc_timestamp + +-- column timestamp, literal fixed-offset timezone +query +SELECT from_utc_timestamp(ts, '+02:00') FROM test_from_utc_timestamp + +-- column timestamp, column timezone (mix of IANA and fixed-offset values) +query +SELECT from_utc_timestamp(ts, tz) FROM test_from_utc_timestamp + +-- literal arguments +query +SELECT from_utc_timestamp(timestamp('2017-07-14 02:40:00'), 'Etc/GMT-1') + +query +SELECT from_utc_timestamp(timestamp('2016-08-31 00:00:00'), 'Asia/Seoul') + +-- null handling +query +SELECT from_utc_timestamp(NULL, 'UTC'), from_utc_timestamp(timestamp('2024-01-01 00:00:00'), NULL), from_utc_timestamp(NULL, NULL) diff --git a/spark/src/test/resources/sql-tests/expressions/datetime/to_utc_timestamp.sql b/spark/src/test/resources/sql-tests/expressions/datetime/to_utc_timestamp.sql new file mode 100644 index 0000000000..c44f66bdee --- /dev/null +++ b/spark/src/test/resources/sql-tests/expressions/datetime/to_utc_timestamp.sql @@ -0,0 +1,67 @@ +-- Licensed to the Apache Software Foundation (ASF) under one +-- or more contributor license agreements. See the NOTICE file +-- distributed with this work for additional information +-- regarding copyright ownership. 
The ASF licenses this file +-- to you under the Apache License, Version 2.0 (the +-- "License"); you may not use this file except in compliance +-- with the License. You may obtain a copy of the License at +-- +-- http://www.apache.org/licenses/LICENSE-2.0 +-- +-- Unless required by applicable law or agreed to in writing, +-- software distributed under the License is distributed on an +-- "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +-- KIND, either express or implied. See the License for the +-- specific language governing permissions and limitations +-- under the License. + +-- The result of to_utc_timestamp is a shift of the underlying microsecond +-- value, so it must not depend on the session timezone. Verify across two +-- representative session zones. +-- ConfigMatrix: spark.sql.session.timeZone=UTC,America/Los_Angeles + +statement +CREATE TABLE test_to_utc_timestamp(ts timestamp, tz string) USING parquet + +-- Includes a summer and a winter row for the LA timezone so that both DST +-- branches are exercised. The third row uses a fixed-offset tz to match the +-- form Spark's own DateFunctionsSuite covers via CEST = +02:00. 
+statement +INSERT INTO test_to_utc_timestamp VALUES + (timestamp('2015-07-24 00:00:00'), 'America/Los_Angeles'), + (timestamp('2015-01-24 00:00:00'), 'America/Los_Angeles'), + (timestamp('2024-06-15 10:30:45'), '+02:00'), + (timestamp('2024-01-01 00:00:00'), 'Asia/Seoul'), + (timestamp('1969-12-31 23:59:59'), 'UTC'), + (NULL, 'UTC'), + (timestamp('2024-06-15 10:30:45'), NULL), + (NULL, NULL) + +-- column timestamp, literal IANA timezone +query +SELECT to_utc_timestamp(ts, 'America/Los_Angeles') FROM test_to_utc_timestamp + +query +SELECT to_utc_timestamp(ts, 'Asia/Seoul') FROM test_to_utc_timestamp + +query +SELECT to_utc_timestamp(ts, 'UTC') FROM test_to_utc_timestamp + +-- column timestamp, literal fixed-offset timezone +query +SELECT to_utc_timestamp(ts, '+02:00') FROM test_to_utc_timestamp + +-- column timestamp, column timezone (mix of IANA and fixed-offset values) +query +SELECT to_utc_timestamp(ts, tz) FROM test_to_utc_timestamp + +-- literal arguments +query +SELECT to_utc_timestamp(timestamp('2017-07-14 02:40:00'), 'Etc/GMT-1') + +query +SELECT to_utc_timestamp(timestamp('2016-08-31 00:00:00'), 'Asia/Seoul') + +-- null handling +query +SELECT to_utc_timestamp(NULL, 'UTC'), to_utc_timestamp(timestamp('2024-01-01 00:00:00'), NULL), to_utc_timestamp(NULL, NULL)