Commit

rebase master
panbingkun committed Nov 29, 2024
1 parent 54b1e42 commit f7dbc20
Showing 4,937 changed files with 175,700 additions and 89,879 deletions.
This diff is too large to render in full; only the first 3,000 changed files are loaded.
2 changes: 2 additions & 0 deletions .asf.yaml
@@ -31,6 +31,8 @@ github:
merge: false
squash: true
rebase: true
+ghp_branch: master
+ghp_path: /docs

notifications:
pullrequests: reviews@spark.apache.org
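
For context: ghp_branch and ghp_path are the .asf.yaml keys that ask ASF infrastructure to publish GitHub Pages for the repository. Below is a minimal sketch of the resulting github: block, assuming the merge-button flags above sit under an enabled_merge_buttons: key as in other ASF repositories (indentation is illustrative):

    github:
      enabled_merge_buttons:
        merge: false
        squash: true
        rebase: true
      # Added by this commit: serve GitHub Pages from the master branch, out of /docs
      ghp_branch: master
      ghp_path: /docs
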
9 changes: 4 additions & 5 deletions .github/labeler.yml
@@ -26,16 +26,14 @@ INFRA:
'.asf.yaml',
'.gitattributes',
'.gitignore',
-'dev/merge_spark_pr.py',
-'dev/run-tests-jenkins*'
+'dev/merge_spark_pr.py'
]

BUILD:
- changed-files:
- all-globs-to-any-file: [
'dev/**/*',
-'!dev/merge_spark_pr.py',
-'!dev/run-tests-jenkins*'
+'!dev/merge_spark_pr.py'
]
- any-glob-to-any-file: [
'build/**/*',
@@ -199,6 +197,7 @@ YARN:
KUBERNETES:
- changed-files:
- any-glob-to-any-file: [
+'bin/docker-image-tool.sh',
'resource-managers/kubernetes/**/*'
]

@@ -225,7 +224,7 @@ DEPLOY:
CONNECT:
- changed-files:
- any-glob-to-any-file: [
-'connect/**/*',
+'sql/connect/**/*',
'connector/connect/**/*',
'python/pyspark/sql/**/connect/**/*',
'python/pyspark/ml/**/connect/**/*'
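
A note on the labeler syntax above (these keys correspond to actions/labeler v5): any-glob-to-any-file applies a label when any listed glob matches some changed file, while all-globs-to-any-file requires every glob, including negated ones such as '!dev/merge_spark_pr.py', to match at least one changed file; that is what keeps merge-script-only changes from picking up the BUILD label. A minimal sketch with a made-up label name:

    # Hypothetical label, for illustration only
    DOCS-ONLY:
      - changed-files:
        - any-glob-to-any-file: [
            'docs/**/*',
            '*.md'
          ]
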
194 changes: 128 additions & 66 deletions .github/workflows/build_and_test.yml

Large diffs are not rendered by default.

44 changes: 43 additions & 1 deletion .github/workflows/build_infra_images_cache.yml
@@ -27,6 +27,9 @@ on:
- 'branch-*'
paths:
- 'dev/infra/Dockerfile'
+- 'dev/spark-test-image/docs/Dockerfile'
+- 'dev/spark-test-image/lint/Dockerfile'
+- 'dev/spark-test-image/sparkr/Dockerfile'
- '.github/workflows/build_infra_images_cache.yml'
# Create infra image when cutting down branches/tags
create:
@@ -51,7 +54,7 @@ jobs:
password: ${{ secrets.GITHUB_TOKEN }}
- name: Build and push
id: docker_build
-uses: docker/build-push-action@v5
+uses: docker/build-push-action@v6
with:
context: ./dev/infra/
push: true
@@ -60,3 +63,42 @@ jobs:
cache-to: type=registry,ref=ghcr.io/apache/spark/apache-spark-github-action-image-cache:${{ github.ref_name }},mode=max
- name: Image digest
run: echo ${{ steps.docker_build.outputs.digest }}
- name: Build and push (Documentation)
if: hashFiles('dev/spark-test-image/docs/Dockerfile') != ''
id: docker_build_docs
uses: docker/build-push-action@v6
with:
context: ./dev/spark-test-image/docs/
push: true
tags: ghcr.io/apache/spark/apache-spark-github-action-image-docs-cache:${{ github.ref_name }}-static
cache-from: type=registry,ref=ghcr.io/apache/spark/apache-spark-github-action-image-docs-cache:${{ github.ref_name }}
cache-to: type=registry,ref=ghcr.io/apache/spark/apache-spark-github-action-image-docs-cache:${{ github.ref_name }},mode=max
- name: Image digest (Documentation)
if: hashFiles('dev/spark-test-image/docs/Dockerfile') != ''
run: echo ${{ steps.docker_build_docs.outputs.digest }}
- name: Build and push (Linter)
if: hashFiles('dev/spark-test-image/lint/Dockerfile') != ''
id: docker_build_lint
uses: docker/build-push-action@v6
with:
context: ./dev/spark-test-image/lint/
push: true
tags: ghcr.io/apache/spark/apache-spark-github-action-image-lint-cache:${{ github.ref_name }}-static
cache-from: type=registry,ref=ghcr.io/apache/spark/apache-spark-github-action-image-lint-cache:${{ github.ref_name }}
cache-to: type=registry,ref=ghcr.io/apache/spark/apache-spark-github-action-image-lint-cache:${{ github.ref_name }},mode=max
- name: Image digest (Linter)
if: hashFiles('dev/spark-test-image/lint/Dockerfile') != ''
run: echo ${{ steps.docker_build_lint.outputs.digest }}
- name: Build and push (SparkR)
if: hashFiles('dev/spark-test-image/sparkr/Dockerfile') != ''
id: docker_build_sparkr
uses: docker/build-push-action@v6
with:
context: ./dev/spark-test-image/sparkr/
push: true
tags: ghcr.io/apache/spark/apache-spark-github-action-image-sparkr-cache:${{ github.ref_name }}-static
cache-from: type=registry,ref=ghcr.io/apache/spark/apache-spark-github-action-image-sparkr-cache:${{ github.ref_name }}
cache-to: type=registry,ref=ghcr.io/apache/spark/apache-spark-github-action-image-sparkr-cache:${{ github.ref_name }},mode=max
- name: Image digest (SparkR)
if: hashFiles('dev/spark-test-image/sparkr/Dockerfile') != ''
run: echo ${{ steps.docker_build_sparkr.outputs.digest }}
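
The three new steps above follow one pattern: build an image only when its Dockerfile exists on the branch (the hashFiles guard), push it to a dedicated GHCR repository, and reuse that repository as a registry build cache. A sketch of what a step for a further, hypothetical image (called pyspark-deps here) would look like under the same conventions; the name and path are illustrative, not part of this commit:

    - name: Build and push (PySpark deps)   # hypothetical additional image
      if: hashFiles('dev/spark-test-image/pyspark-deps/Dockerfile') != ''
      id: docker_build_pyspark_deps
      uses: docker/build-push-action@v6
      with:
        context: ./dev/spark-test-image/pyspark-deps/
        push: true
        tags: ghcr.io/apache/spark/apache-spark-github-action-image-pyspark-deps-cache:${{ github.ref_name }}-static
        cache-from: type=registry,ref=ghcr.io/apache/spark/apache-spark-github-action-image-pyspark-deps-cache:${{ github.ref_name }}
        cache-to: type=registry,ref=ghcr.io/apache/spark/apache-spark-github-action-image-pyspark-deps-cache:${{ github.ref_name }},mode=max
    - name: Image digest (PySpark deps)
      if: hashFiles('dev/spark-test-image/pyspark-deps/Dockerfile') != ''
      run: echo ${{ steps.docker_build_pyspark_deps.outputs.digest }}
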
5 changes: 0 additions & 5 deletions .github/workflows/build_maven.yml
@@ -30,8 +30,3 @@ jobs:
name: Run
uses: ./.github/workflows/maven_test.yml
if: github.repository == 'apache/spark'
-with:
-envs: >-
-{
-"SKIP_SPARK_RELEASE_VERSIONS": "3.4.2"
-}
@@ -17,7 +17,7 @@
# under the License.
#

name: "Build / Maven (master, Scala 2.13, Hadoop 3, JDK 21, macos-14)"
name: "Build / Maven (master, Scala 2.13, Hadoop 3, JDK 21, MacOS-15)"

on:
schedule:
@@ -32,7 +32,7 @@ jobs:
if: github.repository == 'apache/spark'
with:
java: 21
-os: macos-14
+os: macos-15
envs: >-
{
"OBJC_DISABLE_INITIALIZE_FORK_SAFETY": "YES"
32 changes: 32 additions & 0 deletions .github/workflows/build_python_3.11_macos.yml
@@ -0,0 +1,32 @@
#
# Licensed to the Apache Software Foundation (ASF) under one
# or more contributor license agreements. See the NOTICE file
# distributed with this work for additional information
# regarding copyright ownership. The ASF licenses this file
# to you under the Apache License, Version 2.0 (the
# "License"); you may not use this file except in compliance
# with the License. You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing,
# software distributed under the License is distributed on an
# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
# KIND, either express or implied. See the License for the
# specific language governing permissions and limitations
# under the License.
#

name: "Build / Python-only (master, Python 3.11, MacOS)"

on:
schedule:
- cron: '0 21 * * *'

jobs:
run-build:
permissions:
packages: write
name: Run
uses: ./.github/workflows/python_macos_test.yml
if: github.repository == 'apache/spark'
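
This new workflow only defines the nightly schedule and delegates the actual test run to python_macos_test.yml through a reusable-workflow call. The called workflow is not part of the rendered diff; to be callable this way it needs a workflow_call trigger, roughly as sketched below (the job layout and runner label are assumptions, not the actual file):

    # Assumed shape of .github/workflows/python_macos_test.yml, not the real file
    name: "Python-only test (macOS, reusable)"
    on:
      workflow_call:
    jobs:
      pyspark-macos:
        runs-on: macos-15
        steps:
          - uses: actions/checkout@v4
          - name: Run PySpark tests
            run: echo "run the PySpark test modules here"
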
@@ -17,7 +17,7 @@
# under the License.
#

name: "Build (branch-3.4, Scala 2.13, Hadoop 3, JDK 8)"
name: "Build / Python-only (master, Python 3.13)"

on:
schedule:
-- cron: '0 9 * * *'
+- cron: '0 20 * * *'

jobs:
run-build:
@@ -31,21 +31,15 @@ jobs:
uses: ./.github/workflows/build_and_test.yml
if: github.repository == 'apache/spark'
with:
-java: 8
-branch: branch-3.4
+java: 17
+branch: master
hadoop: hadoop3
envs: >-
{
-"SCALA_PROFILE": "scala2.13",
-"PYTHON_TO_TEST": "",
-"ORACLE_DOCKER_IMAGE_NAME": "gvenzl/oracle-xe:21.3.0"
+"PYTHON_TO_TEST": "python3.13"
}
jobs: >-
{
-"build": "true",
-"sparkr": "true",
-"tpcds-1g": "true",
-"docker-integration-tests": "true",
-"k8s-integration-tests": "true",
-"lint" : "true"
+"pyspark": "true",
+"pyspark-pandas": "true"
}
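
In the rewritten nightly builds (this one and the Python 3.9 one that follows), envs and jobs are passed to the reusable build_and_test.yml workflow as JSON strings. How that workflow consumes them is not shown in this diff; a common pattern, sketched here with hypothetical job and input names, is to parse the strings with fromJSON:

    # Consuming side, sketched; the real build_and_test.yml may differ
    on:
      workflow_call:
        inputs:
          envs:
            type: string
            default: '{}'
          jobs:
            type: string
            default: '{}'
    jobs:
      pyspark:
        if: fromJSON(inputs.jobs).pyspark == 'true'
        runs-on: ubuntu-latest
        steps:
          - name: Run PySpark tests
            env:
              PYTHON_TO_TEST: ${{ fromJSON(inputs.envs).PYTHON_TO_TEST }}
            run: echo "would run PySpark tests against ${PYTHON_TO_TEST}"
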
@@ -17,7 +17,7 @@
# under the License.
#

name: "Build / Python-only (branch-3.4)"
name: "Build / Python-only (master, Python 3.9)"

on:
schedule:
-- cron: '0 9 * * *'
+- cron: '0 21 * * *'

jobs:
run-build:
@@ -31,12 +31,12 @@ jobs:
uses: ./.github/workflows/build_and_test.yml
if: github.repository == 'apache/spark'
with:
-java: 8
-branch: branch-3.4
+java: 17
+branch: master
hadoop: hadoop3
envs: >-
{
"PYTHON_TO_TEST": ""
"PYTHON_TO_TEST": "python3.9"
}
jobs: >-
{
8 changes: 4 additions & 4 deletions .github/workflows/build_python_connect.yml
@@ -71,7 +71,7 @@ jobs:
python packaging/connect/setup.py sdist
cd dist
pip install pyspark*connect-*.tar.gz
-pip install 'six==1.16.0' 'pandas<=2.2.2' scipy 'plotly>=4.8' 'mlflow>=2.8.1' coverage matplotlib openpyxl 'memory-profiler>=0.61.0' 'scikit-learn>=1.3.2' 'graphviz==0.20.3' torch torchvision torcheval deepspeed unittest-xml-reporting
+pip install 'six==1.16.0' 'pandas<=2.2.2' scipy 'plotly>=4.8' 'mlflow>=2.8.1' coverage matplotlib openpyxl 'memory-profiler>=0.61.0' 'scikit-learn>=1.3.2' 'graphviz==0.20.3' torch torchvision torcheval deepspeed unittest-xml-reporting 'plotly>=4.8'
- name: Run tests
env:
SPARK_TESTING: 1
@@ -84,7 +84,7 @@ jobs:
# Start a Spark Connect server for local
PYTHONPATH="python/lib/pyspark.zip:python/lib/py4j-0.10.9.7-src.zip:$PYTHONPATH" ./sbin/start-connect-server.sh \
--driver-java-options "-Dlog4j.configurationFile=file:$GITHUB_WORKSPACE/conf/log4j2.properties" \
--jars "`find connect/server/target -name spark-connect-*SNAPSHOT.jar`,`find connector/protobuf/target -name spark-protobuf-*SNAPSHOT.jar`,`find connector/avro/target -name spark-avro*SNAPSHOT.jar`"
--jars "`find connector/protobuf/target -name spark-protobuf-*SNAPSHOT.jar`,`find connector/avro/target -name spark-avro*SNAPSHOT.jar`"
# Remove Py4J and PySpark zipped library to make sure there is no JVM connection
mv python/lib lib.back
@@ -93,7 +93,7 @@ jobs:
# Several tests related to catalog requires to run them sequencially, e.g., writing a table in a listener.
./python/run-tests --parallelism=1 --python-executables=python3 --modules pyspark-connect,pyspark-ml-connect
# None of tests are dependent on each other in Pandas API on Spark so run them in parallel
-./python/run-tests --parallelism=4 --python-executables=python3 --modules pyspark-pandas-connect-part0,pyspark-pandas-connect-part1,pyspark-pandas-connect-part2,pyspark-pandas-connect-part3
+./python/run-tests --parallelism=1 --python-executables=python3 --modules pyspark-pandas-connect-part0,pyspark-pandas-connect-part1,pyspark-pandas-connect-part2,pyspark-pandas-connect-part3
# Stop Spark Connect server.
./sbin/stop-connect-server.sh
@@ -104,7 +104,7 @@ jobs:
PYTHONPATH="python/lib/pyspark.zip:python/lib/py4j-0.10.9.7-src.zip:$PYTHONPATH" ./sbin/start-connect-server.sh \
--master "local-cluster[2, 4, 1024]" \
--driver-java-options "-Dlog4j.configurationFile=file:$GITHUB_WORKSPACE/conf/log4j2.properties" \
--jars "`find connect/server/target -name spark-connect-*SNAPSHOT.jar`,`find connector/protobuf/target -name spark-protobuf-*SNAPSHOT.jar`,`find connector/avro/target -name spark-avro*SNAPSHOT.jar`"
--jars "`find connector/protobuf/target -name spark-protobuf-*SNAPSHOT.jar`,`find connector/avro/target -name spark-avro*SNAPSHOT.jar`"
# Remove Py4J and PySpark zipped library to make sure there is no JVM connection
mv python/lib lib.back
6 changes: 3 additions & 3 deletions .github/workflows/build_python_connect35.yml
@@ -70,7 +70,7 @@ jobs:
pip install 'numpy==1.25.1' 'pyarrow==12.0.1' 'pandas<=2.0.3' scipy unittest-xml-reporting plotly>=4.8 'mlflow>=2.3.1' coverage 'matplotlib==3.7.2' openpyxl 'memory-profiler==0.60.0' 'scikit-learn==1.1.*'
# Add Python deps for Spark Connect.
-pip install 'grpcio>=1.48,<1.57' 'grpcio-status>=1.48,<1.57' 'protobuf==3.20.3' 'googleapis-common-protos==1.56.4' 'graphviz==0.20.3'
+pip install 'grpcio==1.67.0' 'grpcio-status==1.67.0' 'protobuf==5.28.3' 'googleapis-common-protos==1.65.0' 'graphviz==0.20.3'
# Add torch as a testing dependency for TorchDistributor
pip install 'torch==2.0.1' 'torchvision==0.15.2' torcheval
@@ -87,7 +87,7 @@ jobs:
# Start a Spark Connect server for local
PYTHONPATH="python/lib/pyspark.zip:python/lib/py4j-0.10.9.7-src.zip:$PYTHONPATH" ./sbin/start-connect-server.sh \
--driver-java-options "-Dlog4j.configurationFile=file:$GITHUB_WORKSPACE/conf/log4j2.properties" \
--jars "`find connect/server/target -name spark-connect-*SNAPSHOT.jar`,`find connector/protobuf/target -name spark-protobuf-*SNAPSHOT.jar`,`find connector/avro/target -name spark-avro*SNAPSHOT.jar`"
--jars "`find connector/protobuf/target -name spark-protobuf-*SNAPSHOT.jar`,`find connector/avro/target -name spark-avro*SNAPSHOT.jar`"
# Checkout to branch-3.5 to use the tests in branch-3.5.
cd ..
@@ -98,7 +98,7 @@ jobs:
# Run branch-3.5 tests
./python/run-tests --parallelism=1 --python-executables=python3 --modules pyspark-connect
# None of tests are dependent on each other in Pandas API on Spark so run them in parallel
-./python/run-tests --parallelism=4 --python-executables=python3 --modules pyspark-pandas-connect,pyspark-pandas-slow-connect
+./python/run-tests --parallelism=1 --python-executables=python3 --modules pyspark-pandas-connect,pyspark-pandas-slow-connect
- name: Upload test results to report
if: always()
uses: actions/upload-artifact@v4
@@ -17,7 +17,7 @@
# under the License.
#

name: "Build / Python-only (master, PyPy 3.9)"
name: "Build / Python-only (master, PyPy 3.10)"

on:
schedule:
7 changes: 4 additions & 3 deletions .github/workflows/build_sparkr_window.yml
@@ -16,7 +16,7 @@
# specific language governing permissions and limitations
# under the License.
#
name: "Build / SparkR-only (master, 4.4.1, windows-2022)"
name: "Build / SparkR-only (master, 4.4.2, windows-2022)"

on:
schedule:
@@ -50,10 +50,10 @@ jobs:
with:
distribution: zulu
java-version: 17
-- name: Install R 4.4.1
+- name: Install R 4.4.2
uses: r-lib/actions/setup-r@v2
with:
-r-version: 4.4.1
+r-version: 4.4.2
- name: Install R dependencies
run: |
Rscript -e "install.packages(c('knitr', 'rmarkdown', 'testthat', 'e1071', 'survival', 'arrow', 'xml2'), repos='https://cloud.r-project.org/')"
@@ -85,6 +85,7 @@ jobs:
shell: cmd
env:
NOT_CRAN: true
+SPARKR_SUPPRESS_DEPRECATION_WARNING: 1
# See SPARK-27848. Currently installing some dependent packages causes
# "(converted from warning) unable to identify current timezone 'C':" for an unknown reason.
# This environment variable works around to test SparkR against a higher version.
6 changes: 3 additions & 3 deletions .github/workflows/maven_test.yml
@@ -40,7 +40,7 @@ on:
description: OS to run this build.
required: false
type: string
-default: ubuntu-22.04
+default: ubuntu-latest
envs:
description: Additional environment variables to set when running the tests. Should be in JSON format.
required: false
@@ -178,7 +178,7 @@ jobs:
- name: Install Python packages (Python 3.11)
if: (contains(matrix.modules, 'sql#core')) || contains(matrix.modules, 'connect')
run: |
-python3.11 -m pip install 'numpy>=1.20.0' pyarrow pandas scipy unittest-xml-reporting 'grpcio==1.62.0' 'grpcio-status==1.62.0' 'protobuf==4.25.1'
+python3.11 -m pip install 'numpy>=1.20.0' pyarrow pandas scipy unittest-xml-reporting 'grpcio==1.67.0' 'grpcio-status==1.67.0' 'protobuf==5.28.3'
python3.11 -m pip list
# Run the tests.
- name: Run tests
@@ -194,7 +194,7 @@ jobs:
if [[ "$INCLUDED_TAGS" != "" ]]; then
./build/mvn $MAVEN_CLI_OPTS -pl "$TEST_MODULES" -Pyarn -Pkubernetes -Pvolcano -Phive -Phive-thriftserver -Phadoop-cloud -Pjvm-profiler -Pspark-ganglia-lgpl -Pkinesis-asl -Djava.version=${JAVA_VERSION/-ea} -Dtest.include.tags="$INCLUDED_TAGS" test -fae
elif [[ "$MODULES_TO_TEST" == "connect" ]]; then
-./build/mvn $MAVEN_CLI_OPTS -Dtest.exclude.tags="$EXCLUDED_TAGS" -Djava.version=${JAVA_VERSION/-ea} -pl connector/connect/client/jvm,connect/common,connect/server test -fae
+./build/mvn $MAVEN_CLI_OPTS -Dtest.exclude.tags="$EXCLUDED_TAGS" -Djava.version=${JAVA_VERSION/-ea} -pl connector/connect/client/jvm,sql/connect/common,sql/connect/server test -fae
elif [[ "$EXCLUDED_TAGS" != "" ]]; then
./build/mvn $MAVEN_CLI_OPTS -pl "$TEST_MODULES" -Pyarn -Pkubernetes -Pvolcano -Phive -Phive-thriftserver -Phadoop-cloud -Pjvm-profiler -Pspark-ganglia-lgpl -Pkinesis-asl -Djava.version=${JAVA_VERSION/-ea} -Dtest.exclude.tags="$EXCLUDED_TAGS" test -fae
elif [[ "$MODULES_TO_TEST" == *"sql#hive-thriftserver"* ]]; then