diff --git a/.github/workflows/docker-publish.yml b/.github/workflows/docker-publish.yml index 63b9d0cd37..790cafa5f5 100644 --- a/.github/workflows/docker-publish.yml +++ b/.github/workflows/docker-publish.yml @@ -74,6 +74,6 @@ jobs: with: platforms: linux/amd64,linux/arm64 push: true - tags: ghcr.io/apache/datafusion-comet:spark-3.5-scala-2.12-${{ env.COMET_VERSION }} + tags: ghcr.io/apache/datafusion-comet:spark-4.0-scala-2.13-${{ env.COMET_VERSION }} file: kube/Dockerfile no-cache: true diff --git a/.github/workflows/pr_build_linux.yml b/.github/workflows/pr_build_linux.yml index a8d925b1c2..f57a95c0f7 100644 --- a/.github/workflows/pr_build_linux.yml +++ b/.github/workflows/pr_build_linux.yml @@ -444,7 +444,7 @@ jobs: uses: ./.github/actions/setup-builder with: rust-version: ${{ env.RUST_VERSION }} - jdk-version: 11 + jdk-version: 17 - name: Download native library uses: actions/download-artifact@v8 @@ -502,7 +502,7 @@ jobs: uses: ./.github/actions/setup-builder with: rust-version: ${{ env.RUST_VERSION }} - jdk-version: 11 + jdk-version: 17 - name: Download native library uses: actions/download-artifact@v8 diff --git a/docs/source/contributor-guide/benchmarking_aws_ec2.md b/docs/source/contributor-guide/benchmarking_aws_ec2.md index 81f15d64ea..bc02d2bf79 100644 --- a/docs/source/contributor-guide/benchmarking_aws_ec2.md +++ b/docs/source/contributor-guide/benchmarking_aws_ec2.md @@ -104,7 +104,7 @@ make release Set `COMET_JAR` environment variable. 
```shell -export COMET_JAR=/home/ec2-user/datafusion-comet/spark/target/comet-spark-spark3.5_2.12-$COMET_VERSION.jar +export COMET_JAR=/home/ec2-user/datafusion-comet/spark/target/comet-spark-spark4.0_2.13-$COMET_VERSION.jar ``` ## Run Benchmarks diff --git a/docs/source/contributor-guide/benchmarking_macos.md b/docs/source/contributor-guide/benchmarking_macos.md index e75261e8d5..20ec0f6f11 100644 --- a/docs/source/contributor-guide/benchmarking_macos.md +++ b/docs/source/contributor-guide/benchmarking_macos.md @@ -55,13 +55,13 @@ export DF_BENCH=`pwd` ## Install Spark -Install Apache Spark. This example refers to 3.5.4 version. +Install Apache Spark. This example refers to 4.0.2 version. ```shell -wget https://archive.apache.org/dist/spark/spark-3.5.4/spark-3.5.4-bin-hadoop3.tgz -tar xzf spark-3.5.4-bin-hadoop3.tgz -sudo mv spark-3.5.4-bin-hadoop3 /opt -export SPARK_HOME=/opt/spark-3.5.4-bin-hadoop3/ +wget https://archive.apache.org/dist/spark/spark-4.0.2/spark-4.0.2-bin-hadoop3.tgz +tar xzf spark-4.0.2-bin-hadoop3.tgz +sudo mv spark-4.0.2-bin-hadoop3 /opt +export SPARK_HOME=/opt/spark-4.0.2-bin-hadoop3/ ``` Start Spark in standalone mode: @@ -129,7 +129,7 @@ make release COMET_FEATURES=mimalloc Set `COMET_JAR` to point to the location of the Comet jar file. Example for Comet 0.8 ```shell -export COMET_JAR=`pwd`/spark/target/comet-spark-spark3.5_2.12-0.8.0-SNAPSHOT.jar +export COMET_JAR=`pwd`/spark/target/comet-spark-spark4.0_2.13-0.8.0-SNAPSHOT.jar ``` Run the following command (the `--data` parameter will need to be updated to point to your S3 bucket): diff --git a/docs/source/contributor-guide/benchmarking_spark_sql_perf.md b/docs/source/contributor-guide/benchmarking_spark_sql_perf.md index 538539759c..b9af6469b7 100644 --- a/docs/source/contributor-guide/benchmarking_spark_sql_perf.md +++ b/docs/source/contributor-guide/benchmarking_spark_sql_perf.md @@ -34,8 +34,8 @@ partitioning and writing to Parquet format automatically. 
## Prerequisites -- Java 17 (for Spark 3.5+) -- Apache Spark 3.5.x +- Java 17 +- Apache Spark 4.0.x - SBT (Scala Build Tool) - C compiler toolchain (`gcc`, `make`, `flex`, `bison`, `byacc`) @@ -225,7 +225,7 @@ Build Comet from source and launch `spark-shell` with both the Comet and spark-s ```shell make release -export COMET_JAR=$(pwd)/spark/target/comet-spark-spark3.5_2.12-*.jar +export COMET_JAR=$(pwd)/spark/target/comet-spark-spark4.0_2.13-*.jar $SPARK_HOME/bin/spark-shell \ --master $SPARK_MASTER \ diff --git a/docs/source/contributor-guide/debugging.md b/docs/source/contributor-guide/debugging.md index 3356a83893..2c9909c910 100644 --- a/docs/source/contributor-guide/debugging.md +++ b/docs/source/contributor-guide/debugging.md @@ -136,7 +136,7 @@ make release COMET_FEATURES=backtrace Set `RUST_BACKTRACE=1` for the Spark worker/executor process, or for `spark-submit` if running in local mode. ```console -RUST_BACKTRACE=1 $SPARK_HOME/spark-shell --jars spark/target/comet-spark-spark3.5_2.12-$COMET_VERSION.jar --conf spark.plugins=org.apache.spark.CometPlugin --conf spark.comet.enabled=true --conf spark.comet.exec.enabled=true +RUST_BACKTRACE=1 $SPARK_HOME/spark-shell --jars spark/target/comet-spark-spark4.0_2.13-$COMET_VERSION.jar --conf spark.plugins=org.apache.spark.CometPlugin --conf spark.comet.enabled=true --conf spark.comet.exec.enabled=true ``` Get the expanded exception details diff --git a/docs/source/contributor-guide/iceberg-spark-tests.md b/docs/source/contributor-guide/iceberg-spark-tests.md index 38becc0208..cd73ff089f 100644 --- a/docs/source/contributor-guide/iceberg-spark-tests.md +++ b/docs/source/contributor-guide/iceberg-spark-tests.md @@ -40,7 +40,7 @@ Here is an overview of the changes that the diffs make to Iceberg: Run `make release` in Comet to install the Comet JAR into the local Maven repository, specifying the Spark version. ```shell -PROFILES="-Pspark-3.5" make release +PROFILES="-Pspark-4.0" make release ``` ## 2. 
Clone Iceberg and Apply Diff diff --git a/docs/source/user-guide/latest/datasources.md b/docs/source/user-guide/latest/datasources.md index b79831d804..ef1e343cb5 100644 --- a/docs/source/user-guide/latest/datasources.md +++ b/docs/source/user-guide/latest/datasources.md @@ -69,12 +69,12 @@ Unlike to native Comet reader the Datafusion reader fully supports nested types To build Comet with native DataFusion reader and remote HDFS support it is required to have a JDK installed Example: -Build a Comet for `spark-3.5` provide a JDK path in `JAVA_HOME` +Build a Comet for `spark-4.0` provide a JDK path in `JAVA_HOME` Provide the JRE linker path in `RUSTFLAGS`, the path can vary depending on the system. Typically JRE linker is a part of installed JDK ```shell -export JAVA_HOME="/opt/homebrew/opt/openjdk@11" -make release PROFILES="-Pspark-3.5" COMET_FEATURES=hdfs RUSTFLAGS="-L $JAVA_HOME/libexec/openjdk.jdk/Contents/Home/lib/server" +export JAVA_HOME="/opt/homebrew/opt/openjdk@17" +make release PROFILES="-Pspark-4.0" COMET_FEATURES=hdfs RUSTFLAGS="-L $JAVA_HOME/libexec/openjdk.jdk/Contents/Home/lib/server" ``` Start Comet with experimental reader and HDFS support as [described](installation.md/#run-spark-shell-with-comet-enabled) @@ -149,7 +149,7 @@ docker compose -f kube/local/hdfs-docker-compose.yml up - Build a project with HDFS support ```shell -JAVA_HOME="/opt/homebrew/opt/openjdk@11" make release PROFILES="-Pspark-3.5" COMET_FEATURES=hdfs RUSTFLAGS="-L /opt/homebrew/opt/openjdk@11/libexec/openjdk.jdk/Contents/Home/lib/server" +JAVA_HOME="/opt/homebrew/opt/openjdk@17" make release PROFILES="-Pspark-4.0" COMET_FEATURES=hdfs RUSTFLAGS="-L /opt/homebrew/opt/openjdk@17/libexec/openjdk.jdk/Contents/Home/lib/server" ``` - Run local test diff --git a/docs/source/user-guide/latest/iceberg.md b/docs/source/user-guide/latest/iceberg.md index 24a4bda057..12418b9545 100644 --- a/docs/source/user-guide/latest/iceberg.md +++ b/docs/source/user-guide/latest/iceberg.md @@ -31,7 
+31,7 @@ reader is enabled by default. To disable it, set `spark.comet.scan.icebergNative ```shell $SPARK_HOME/bin/spark-shell \ - --packages org.apache.datafusion:comet-spark-spark3.5_2.12:0.14.0,org.apache.iceberg:iceberg-spark-runtime-3.5_2.12:1.8.1,org.apache.iceberg:iceberg-core:1.8.1 \ + --packages org.apache.datafusion:comet-spark-spark4.0_2.13:0.14.0,org.apache.iceberg:iceberg-spark-runtime-4.0_2.13:1.10.0,org.apache.iceberg:iceberg-core:1.10.0 \ --repositories https://repo1.maven.org/maven2/ \ --conf spark.sql.extensions=org.apache.iceberg.spark.extensions.IcebergSparkSessionExtensions \ --conf spark.sql.catalog.spark_catalog=org.apache.iceberg.spark.SparkCatalog \ @@ -106,7 +106,7 @@ configure Spark to use a REST catalog with Comet's native Iceberg scan: ```shell $SPARK_HOME/bin/spark-shell \ - --packages org.apache.datafusion:comet-spark-spark3.5_2.12:0.14.0,org.apache.iceberg:iceberg-spark-runtime-3.5_2.12:1.8.1,org.apache.iceberg:iceberg-core:1.8.1 \ + --packages org.apache.datafusion:comet-spark-spark4.0_2.13:0.14.0,org.apache.iceberg:iceberg-spark-runtime-4.0_2.13:1.10.0,org.apache.iceberg:iceberg-core:1.10.0 \ --repositories https://repo1.maven.org/maven2/ \ --conf spark.sql.extensions=org.apache.iceberg.spark.extensions.IcebergSparkSessionExtensions \ --conf spark.sql.catalog.rest_cat=org.apache.iceberg.spark.SparkCatalog \ diff --git a/docs/source/user-guide/latest/installation.md b/docs/source/user-guide/latest/installation.md index fdae0f126c..50bd430f10 100644 --- a/docs/source/user-guide/latest/installation.md +++ b/docs/source/user-guide/latest/installation.md @@ -105,7 +105,7 @@ See the [Comet Kubernetes Guide](kubernetes.md) guide. Make sure `SPARK_HOME` points to the same Spark version as Comet was built for. 
```shell -export COMET_JAR=spark/target/comet-spark-spark3.5_2.12-$COMET_VERSION.jar +export COMET_JAR=spark/target/comet-spark-spark4.0_2.13-$COMET_VERSION.jar $SPARK_HOME/bin/spark-shell \ --jars $COMET_JAR \ @@ -161,7 +161,7 @@ explicitly contain Comet otherwise Spark may use a different class-loader for th components which will then fail at runtime. For example: ``` ---driver-class-path spark/target/comet-spark-spark3.5_2.12-$COMET_VERSION.jar +--driver-class-path spark/target/comet-spark-spark4.0_2.13-$COMET_VERSION.jar ``` Some cluster managers may require additional configuration, see diff --git a/docs/source/user-guide/latest/kubernetes.md b/docs/source/user-guide/latest/kubernetes.md index 2fb037d630..718da49d32 100644 --- a/docs/source/user-guide/latest/kubernetes.md +++ b/docs/source/user-guide/latest/kubernetes.md @@ -69,30 +69,30 @@ metadata: spec: type: Scala mode: cluster - image: apache/datafusion-comet:0.7.0-spark3.5.5-scala2.12-java11 + image: apache/datafusion-comet:0.7.0-spark4.0.2-scala2.13-java17 imagePullPolicy: IfNotPresent mainClass: org.apache.spark.examples.SparkPi - mainApplicationFile: local:///opt/spark/examples/jars/spark-examples_2.12-3.5.5.jar + mainApplicationFile: local:///opt/spark/examples/jars/spark-examples_2.13-4.0.2.jar sparkConf: - "spark.executor.extraClassPath": "/opt/spark/jars/comet-spark-spark3.5_2.12-0.7.0.jar" - "spark.driver.extraClassPath": "/opt/spark/jars/comet-spark-spark3.5_2.12-0.7.0.jar" + "spark.executor.extraClassPath": "/opt/spark/jars/comet-spark-spark4.0_2.13-0.7.0.jar" + "spark.driver.extraClassPath": "/opt/spark/jars/comet-spark-spark4.0_2.13-0.7.0.jar" "spark.plugins": "org.apache.spark.CometPlugin" "spark.comet.enabled": "true" "spark.comet.exec.enabled": "true" "spark.comet.exec.shuffle.enabled": "true" "spark.comet.exec.shuffle.mode": "auto" "spark.shuffle.manager": "org.apache.spark.sql.comet.execution.shuffle.CometShuffleManager" - sparkVersion: 3.5.6 + sparkVersion: 4.0.2 driver: labels: - 
version: 3.5.6 + version: 4.0.2 cores: 1 coreLimit: 1200m memory: 512m serviceAccount: spark-operator-spark executor: labels: - version: 3.5.6 + version: 4.0.2 instances: 1 cores: 1 coreLimit: 1200m diff --git a/docs/source/user-guide/latest/source.md b/docs/source/user-guide/latest/source.md index eb56e1f21b..5ad33ecfe1 100644 --- a/docs/source/user-guide/latest/source.md +++ b/docs/source/user-guide/latest/source.md @@ -38,7 +38,7 @@ cd apache-datafusion-comet-$COMET_VERSION Build ```console -make release-nogit PROFILES="-Pspark-3.5" +make release-nogit PROFILES="-Pspark-4.0" ``` ## Building from the GitHub repository @@ -53,17 +53,17 @@ Build Comet for a specific Spark version: ```console cd datafusion-comet -make release PROFILES="-Pspark-3.5" +make release PROFILES="-Pspark-4.0" ``` -Note that the project builds for Scala 2.12 by default but can be built for Scala 2.13 using an additional profile: +Note that the project builds for Scala 2.13 by default but can be built for Scala 2.12 using an additional profile: ```console -make release PROFILES="-Pspark-3.5 -Pscala-2.13" +make release PROFILES="-Pspark-3.5 -Pscala-2.12" ``` To build Comet from the source distribution on an isolated environment without an access to `github.com` it is necessary to disable `git-commit-id-maven-plugin`, otherwise you will face errors that there is no access to the git during the build process. In that case you may use: ```console -make release-nogit PROFILES="-Pspark-3.5" +make release-nogit PROFILES="-Pspark-4.0" ``` diff --git a/kube/Dockerfile b/kube/Dockerfile index 699aeeb210..a078277407 100644 --- a/kube/Dockerfile +++ b/kube/Dockerfile @@ -15,14 +15,14 @@ # limitations under the License. 
# -FROM apache/spark:3.5.8 AS builder +FROM apache/spark:4.0.2 AS builder USER root -# Installing JDK11 as the image comes with JRE +# Installing JDK17 as the image comes with JRE RUN apt update \ && apt install -y curl \ - && apt install -y openjdk-11-jdk \ + && apt install -y openjdk-17-jdk \ && apt clean RUN apt install -y gcc-10 g++-10 cpp-10 unzip @@ -37,8 +37,8 @@ ENV PATH="$PATH:/root/.local/bin" RUN curl --proto '=https' --tlsv1.2 -sSf https://sh.rustup.rs | sh -s -- -y ENV PATH="/root/.cargo/bin:${PATH}" ENV RUSTFLAGS="-C debuginfo=line-tables-only -C incremental=false" -ENV SPARK_VERSION=3.5 -ENV SCALA_VERSION=2.12 +ENV SPARK_VERSION=4.0 +ENV SCALA_VERSION=2.13 # copy source files to Docker image RUN mkdir /comet @@ -70,9 +70,9 @@ RUN mkdir -p /root/.m2 && \ RUN cd /comet \ && JAVA_HOME=$(readlink -f $(which javac) | sed "s/\/bin\/javac//") make release-nogit PROFILES="-Pspark-$SPARK_VERSION -Pscala-$SCALA_VERSION" -FROM apache/spark:3.5.8 -ENV SPARK_VERSION=3.5 -ENV SCALA_VERSION=2.12 +FROM apache/spark:4.0.2 +ENV SPARK_VERSION=4.0 +ENV SCALA_VERSION=2.13 USER root # note the use of a wildcard in the file name so that this works with both snapshot and final release versions diff --git a/pom.xml b/pom.xml index b83a6fd45b..5199c8a453 100644 --- a/pom.xml +++ b/pom.xml @@ -65,24 +65,24 @@ under the License. 1.7.0 3.6.1 0.16.1 - 2.12.18 - 2.12 + 2.13.16 + 2.13 4.9.6 3.2.16 2.2.0 - 3.5.8 - 3.5 + 4.0.2 + 4.0 provided 3.25.5 - 1.13.1 + 1.15.2 provided 3.3.4 18.3.0 1.9.13 2.43.0 0.8.11 - 4.8.8 - 2.0.7 + 4.13.6 + 2.0.16 33.2.1-jre 1.21.0 2.31.51 @@ -116,8 +116,8 @@ under the License. -Djdk.reflect.useDirectMethodHandle=false -ea -Xmx4g -Xss4m ${extraJavaTestArgs} - spark-3.x - spark-3.5 + spark-4.x + spark-4.0 @@ -635,10 +635,13 @@ under the License. spark-3.4 2.12.17 + 2.12 3.4.3 3.4 1.13.1 + 4.8.8 2.0.6 + spark-3.x spark-3.4 11 ${java.version} @@ -650,10 +653,13 @@ under the License. 
spark-3.5 2.12.18 + 2.12 3.5.8 3.5 1.13.1 + 4.8.8 2.0.7 + spark-3.x spark-3.5 11 ${java.version} @@ -662,10 +668,8 @@ under the License. - spark-4.0 - 2.13.16 2.13 4.0.2 @@ -675,7 +679,6 @@ under the License. 2.0.16 spark-4.x spark-4.0 - 17 ${java.version} ${java.version} @@ -729,6 +732,11 @@ under the License. scala-2.12 + + 2.12.18 + 2.12 + 4.8.8 + diff --git a/spark/pom.xml b/spark/pom.xml index 722cebade3..fc0fc5c99a 100644 --- a/spark/pom.xml +++ b/spark/pom.xml @@ -223,9 +223,6 @@ under the License. spark-3.5 - - true - org.apache.iceberg @@ -251,6 +248,9 @@ under the License. spark-4.0 + + true + org.apache.iceberg