Skip to content

Commit 3f1cf4f

Browse files
Merge remote-tracking branch 'spark/master' into shuffle-spec-direct-partition
2 parents 350ebca + 0e10341 commit 3f1cf4f

File tree

1,794 files changed

+77492
-13121
lines changed

Some content is hidden

Large commits have some content hidden by default. Use the search box below to find content that may be hidden.

1,794 files changed

+77492
-13121
lines changed

.github/workflows/build_and_test.yml

Lines changed: 11 additions & 10 deletions
Original file line numberDiff line numberDiff line change
@@ -362,7 +362,7 @@ jobs:
362362
- name: Install Python packages (Python 3.11)
363363
if: (contains(matrix.modules, 'sql') && !contains(matrix.modules, 'sql-')) || contains(matrix.modules, 'connect') || contains(matrix.modules, 'yarn')
364364
run: |
365-
python3.11 -m pip install 'numpy>=1.22' pyarrow pandas scipy unittest-xml-reporting 'lxml==4.9.4' 'grpcio==1.67.0' 'grpcio-status==1.67.0' 'protobuf==5.29.1'
365+
python3.11 -m pip install 'numpy>=1.22' pyarrow pandas pyyaml scipy unittest-xml-reporting 'lxml==4.9.4' 'grpcio==1.67.0' 'grpcio-status==1.67.0' 'protobuf==5.29.5'
366366
python3.11 -m pip list
367367
# Run the tests.
368368
- name: Run tests
@@ -514,33 +514,34 @@ jobs:
514514
pyspark-core, pyspark-errors, pyspark-streaming, pyspark-logger
515515
- >-
516516
pyspark-mllib, pyspark-ml, pyspark-ml-connect, pyspark-pipelines
517+
- >-
518+
pyspark-structured-streaming, pyspark-structured-streaming-connect
517519
- >-
518520
pyspark-connect
519521
- >-
520522
pyspark-pandas
521523
- >-
522524
pyspark-pandas-slow
523525
- >-
524-
pyspark-pandas-connect-part0, pyspark-pandas-connect-part3
526+
pyspark-pandas-connect
525527
- >-
526-
pyspark-pandas-connect-part1, pyspark-pandas-connect-part2
528+
pyspark-pandas-slow-connect
527529
exclude:
528530
# Always run if pyspark == 'true', even infra-image is skip (such as non-master job)
529531
# In practice, the build will run in individual PR, but not against the individual commit
530532
# in Apache Spark repository.
531533
- modules: ${{ fromJson(needs.precondition.outputs.required).pyspark != 'true' && 'pyspark-sql, pyspark-resource, pyspark-testing' }}
532534
- modules: ${{ fromJson(needs.precondition.outputs.required).pyspark != 'true' && 'pyspark-core, pyspark-errors, pyspark-streaming, pyspark-logger' }}
533535
- modules: ${{ fromJson(needs.precondition.outputs.required).pyspark != 'true' && 'pyspark-mllib, pyspark-ml, pyspark-ml-connect' }}
536+
- modules: ${{ fromJson(needs.precondition.outputs.required).pyspark != 'true' && 'pyspark-structured-streaming, pyspark-structured-streaming-connect' }}
534537
- modules: ${{ fromJson(needs.precondition.outputs.required).pyspark != 'true' && 'pyspark-connect' }}
535538
# Always run if pyspark-pandas == 'true', even infra-image is skip (such as non-master job)
536539
# In practice, the build will run in individual PR, but not against the individual commit
537540
# in Apache Spark repository.
538541
- modules: ${{ fromJson(needs.precondition.outputs.required).pyspark-pandas != 'true' && 'pyspark-pandas' }}
539542
- modules: ${{ fromJson(needs.precondition.outputs.required).pyspark-pandas != 'true' && 'pyspark-pandas-slow' }}
540-
- modules: ${{ fromJson(needs.precondition.outputs.required).pyspark-pandas != 'true' && 'pyspark-pandas-connect-part0' }}
541-
- modules: ${{ fromJson(needs.precondition.outputs.required).pyspark-pandas != 'true' && 'pyspark-pandas-connect-part1' }}
542-
- modules: ${{ fromJson(needs.precondition.outputs.required).pyspark-pandas != 'true' && 'pyspark-pandas-connect-part2' }}
543-
- modules: ${{ fromJson(needs.precondition.outputs.required).pyspark-pandas != 'true' && 'pyspark-pandas-connect-part3' }}
543+
- modules: ${{ fromJson(needs.precondition.outputs.required).pyspark-pandas != 'true' && 'pyspark-pandas-connect' }}
544+
- modules: ${{ fromJson(needs.precondition.outputs.required).pyspark-pandas != 'true' && 'pyspark-pandas-slow-connect' }}
544545
env:
545546
MODULES_TO_TEST: ${{ matrix.modules }}
546547
HADOOP_PROFILE: ${{ inputs.hadoop }}
@@ -765,7 +766,7 @@ jobs:
765766
python-version: '3.11'
766767
- name: Install dependencies for Python CodeGen check
767768
run: |
768-
python3.11 -m pip install 'black==23.12.1' 'protobuf==5.29.1' 'mypy==1.8.0' 'mypy-protobuf==3.3.0'
769+
python3.11 -m pip install 'black==23.12.1' 'protobuf==5.29.5' 'mypy==1.8.0' 'mypy-protobuf==3.3.0'
769770
python3.11 -m pip list
770771
- name: Python CodeGen check for branch-3.5
771772
if: inputs.branch == 'branch-3.5'
@@ -947,7 +948,7 @@ jobs:
947948
- uses: actions/setup-java@v4
948949
with:
949950
distribution: zulu
950-
java-version: 25-ea
951+
java-version: 25
951952
- name: Build with Maven
952953
run: |
953954
export MAVEN_OPTS="-Xss64m -Xmx4g -Xms4g -XX:ReservedCodeCacheSize=128m -Dorg.slf4j.simpleLogger.defaultLogLevel=WARN"
@@ -1325,7 +1326,7 @@ jobs:
13251326
- name: Start Minikube
13261327
uses: medyagh/[email protected]
13271328
with:
1328-
kubernetes-version: "1.33.0"
1329+
kubernetes-version: "1.34.0"
13291330
# GitHub Actions limit is cpu: 2, memory: 6947MB; limit to 2U6G for better resource statistics
13301331
cpus: 2
13311332
memory: 6144m

.github/workflows/build_infra_images_cache.yml

Lines changed: 14 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -39,6 +39,7 @@ on:
3939
- 'dev/spark-test-image/python-312/Dockerfile'
4040
- 'dev/spark-test-image/python-313/Dockerfile'
4141
- 'dev/spark-test-image/python-313-nogil/Dockerfile'
42+
- 'dev/spark-test-image/python-314/Dockerfile'
4243
- 'dev/spark-test-image/numpy-213/Dockerfile'
4344
- '.github/workflows/build_infra_images_cache.yml'
4445
# Create infra image when cutting down branches/tags
@@ -230,6 +231,19 @@ jobs:
230231
- name: Image digest (PySpark with Python 3.13 no GIL)
231232
if: hashFiles('dev/spark-test-image/python-313-nogil/Dockerfile') != ''
232233
run: echo ${{ steps.docker_build_pyspark_python_313_nogil.outputs.digest }}
234+
- name: Build and push (PySpark with Python 3.14)
235+
if: hashFiles('dev/spark-test-image/python-314/Dockerfile') != ''
236+
id: docker_build_pyspark_python_314
237+
uses: docker/build-push-action@v6
238+
with:
239+
context: ./dev/spark-test-image/python-314/
240+
push: true
241+
tags: ghcr.io/apache/spark/apache-spark-github-action-image-pyspark-python-314-cache:${{ github.ref_name }}-static
242+
cache-from: type=registry,ref=ghcr.io/apache/spark/apache-spark-github-action-image-pyspark-python-314-cache:${{ github.ref_name }}
243+
cache-to: type=registry,ref=ghcr.io/apache/spark/apache-spark-github-action-image-pyspark-python-314-cache:${{ github.ref_name }},mode=max
244+
- name: Image digest (PySpark with Python 3.14)
245+
if: hashFiles('dev/spark-test-image/python-314/Dockerfile') != ''
246+
run: echo ${{ steps.docker_build_pyspark_python_314.outputs.digest }}
233247
- name: Build and push (PySpark with Numpy 2.1.3)
234248
if: hashFiles('dev/spark-test-image/numpy-213/Dockerfile') != ''
235249
id: docker_build_pyspark_numpy_213

.github/workflows/build_maven_java21_macos15.yml renamed to .github/workflows/build_maven_java21_macos26.yml

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -17,7 +17,7 @@
1717
# under the License.
1818
#
1919

20-
name: "Build / Maven (master, Scala 2.13, Hadoop 3, JDK 21, MacOS-15)"
20+
name: "Build / Maven (master, Scala 2.13, Hadoop 3, JDK 21, MacOS-26)"
2121

2222
on:
2323
schedule:
@@ -33,7 +33,7 @@ jobs:
3333
if: github.repository == 'apache/spark'
3434
with:
3535
java: 21
36-
os: macos-15
36+
os: macos-26
3737
arch: arm64
3838
envs: >-
3939
{
Lines changed: 47 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,47 @@
1+
#
2+
# Licensed to the Apache Software Foundation (ASF) under one
3+
# or more contributor license agreements. See the NOTICE file
4+
# distributed with this work for additional information
5+
# regarding copyright ownership. The ASF licenses this file
6+
# to you under the Apache License, Version 2.0 (the
7+
# "License"); you may not use this file except in compliance
8+
# with the License. You may obtain a copy of the License at
9+
#
10+
# http://www.apache.org/licenses/LICENSE-2.0
11+
#
12+
# Unless required by applicable law or agreed to in writing,
13+
# software distributed under the License is distributed on an
14+
# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
15+
# KIND, either express or implied. See the License for the
16+
# specific language governing permissions and limitations
17+
# under the License.
18+
#
19+
20+
name: "Build / Python-only (master, Python 3.14)"
21+
22+
on:
23+
schedule:
24+
- cron: '0 21 * * *'
25+
workflow_dispatch:
26+
27+
jobs:
28+
run-build:
29+
permissions:
30+
packages: write
31+
name: Run
32+
uses: ./.github/workflows/build_and_test.yml
33+
if: github.repository == 'apache/spark'
34+
with:
35+
java: 17
36+
branch: master
37+
hadoop: hadoop3
38+
envs: >-
39+
{
40+
"PYSPARK_IMAGE_TO_TEST": "python-314",
41+
"PYTHON_TO_TEST": "python3.14"
42+
}
43+
jobs: >-
44+
{
45+
"pyspark": "true",
46+
"pyspark-pandas": "true"
47+
}

.github/workflows/build_python_connect.yml

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -72,7 +72,7 @@ jobs:
7272
python packaging/client/setup.py sdist
7373
cd dist
7474
pip install pyspark*client-*.tar.gz
75-
pip install 'grpcio==1.67.0' 'grpcio-status==1.67.0' 'protobuf==5.29.1' 'googleapis-common-protos==1.65.0' 'graphviz==0.20.3' 'six==1.16.0' 'pandas==2.3.2' scipy 'plotly<6.0.0' 'mlflow>=2.8.1' coverage matplotlib openpyxl 'memory-profiler>=0.61.0' 'scikit-learn>=1.3.2' 'graphviz==0.20.3' 'torch<2.6.0' torchvision torcheval deepspeed unittest-xml-reporting
75+
pip install 'grpcio==1.67.0' 'grpcio-status==1.67.0' 'protobuf==5.29.5' 'googleapis-common-protos==1.65.0' 'graphviz==0.20.3' 'six==1.16.0' 'pandas==2.3.2' scipy 'plotly<6.0.0' 'mlflow>=2.8.1' coverage matplotlib openpyxl 'memory-profiler>=0.61.0' 'scikit-learn>=1.3.2' 'graphviz==0.20.3' 'torch<2.6.0' torchvision torcheval deepspeed unittest-xml-reporting
7676
- name: List Python packages
7777
run: python -m pip list
7878
- name: Run tests (local)
@@ -96,7 +96,7 @@ jobs:
9696
# Several tests related to catalog require running them sequentially, e.g., writing a table in a listener.
9797
./python/run-tests --parallelism=1 --python-executables=python3 --modules pyspark-connect,pyspark-ml-connect
9898
# None of tests are dependent on each other in Pandas API on Spark so run them in parallel
99-
./python/run-tests --parallelism=1 --python-executables=python3 --modules pyspark-pandas-connect-part0,pyspark-pandas-connect-part1,pyspark-pandas-connect-part2,pyspark-pandas-connect-part3
99+
./python/run-tests --parallelism=1 --python-executables=python3 --modules pyspark-pandas-connect,pyspark-pandas-slow-connect
100100
101101
# Stop Spark Connect server.
102102
./sbin/stop-connect-server.sh

.github/workflows/build_python_connect35.yml

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -71,7 +71,7 @@ jobs:
7171
pip install 'numpy==1.25.1' 'pyarrow>=18.0.0' 'pandas<=2.0.3' scipy unittest-xml-reporting 'plotly<6.0.0' 'mlflow>=2.3.1' coverage 'matplotlib==3.7.2' openpyxl 'memory-profiler==0.60.0' 'scikit-learn==1.1.*'
7272
7373
# Add Python deps for Spark Connect.
74-
pip install 'grpcio==1.67.0' 'grpcio-status==1.67.0' 'protobuf==5.29.1' 'googleapis-common-protos==1.65.0' 'graphviz==0.20.3'
74+
pip install 'grpcio==1.67.0' 'grpcio-status==1.67.0' 'protobuf==5.29.5' 'googleapis-common-protos==1.65.0' 'graphviz==0.20.3'
7575
7676
# Add torch as a testing dependency for TorchDistributor
7777
pip install 'torch==2.0.1' 'torchvision==0.15.2' torcheval

.github/workflows/maven_test.yml

Lines changed: 16 additions & 11 deletions
Original file line numberDiff line numberDiff line change
@@ -78,19 +78,13 @@ jobs:
7878
connector#kafka-0-10,connector#kafka-0-10-sql,connector#kafka-0-10-token-provider,connector#spark-ganglia-lgpl,connector#protobuf,connector#avro,connector#kinesis-asl
7979
- >-
8080
sql#api,sql#catalyst,resource-managers#yarn,resource-managers#kubernetes#core
81+
- >-
82+
connect
8183
# Here, we split Hive and SQL tests into some of slow ones and the rest of them.
8284
included-tags: [ "" ]
8385
excluded-tags: [ "" ]
8486
comment: [ "" ]
8587
include:
86-
# Connect tests
87-
- modules: connect
88-
java: ${{ inputs.java }}
89-
hadoop: ${{ inputs.hadoop }}
90-
hive: hive2.3
91-
# TODO(SPARK-47110): Reenable AmmoniteTest tests in Maven builds
92-
excluded-tags: org.apache.spark.tags.AmmoniteTest
93-
comment: ""
9488
# Hive tests
9589
- modules: sql#hive
9690
java: ${{ inputs.java }}
@@ -181,12 +175,20 @@ jobs:
181175
- name: Install Python packages (Python 3.11)
182176
if: contains(matrix.modules, 'resource-managers#yarn') || (contains(matrix.modules, 'sql#core')) || contains(matrix.modules, 'connect')
183177
run: |
184-
python3.11 -m pip install 'numpy>=1.22' pyarrow pandas scipy unittest-xml-reporting 'grpcio==1.67.0' 'grpcio-status==1.67.0' 'protobuf==5.29.1'
178+
python3.11 -m pip install 'numpy>=1.22' pyarrow pandas pyyaml scipy unittest-xml-reporting 'grpcio==1.67.0' 'grpcio-status==1.67.0' 'protobuf==5.29.5'
185179
python3.11 -m pip list
186-
# Run the tests.
180+
# Run the tests using script command.
181+
# BSD's script command doesn't support -c option, and the usage is different from Linux's one.
182+
# The kind of script command is tested by `script -qec true`.
187183
- name: Run tests
188184
env: ${{ fromJSON(inputs.envs) }}
185+
shell: |
186+
bash -c "if script -qec true 2>/dev/null; then script -qec bash\ {0}; else script -qe /dev/null bash {0}; fi"
189187
run: |
188+
# Fix for TTY related issues when launching the Ammonite REPL in tests.
189+
export TERM=vt100
190+
# `set -e` to make the exit status as expected due to use script command to run the commands
191+
set -e
190192
export MAVEN_OPTS="-Xss64m -Xmx4g -Xms4g -XX:ReservedCodeCacheSize=128m -Dorg.slf4j.simpleLogger.defaultLogLevel=WARN"
191193
export MAVEN_CLI_OPTS="--no-transfer-progress"
192194
export JAVA_VERSION=${{ matrix.java }}
@@ -204,7 +206,10 @@ jobs:
204206
if [[ "$INCLUDED_TAGS" != "" ]]; then
205207
./build/mvn $MAVEN_CLI_OPTS -pl "$TEST_MODULES" -Pyarn -Pkubernetes -Pvolcano -Phive -Phive-thriftserver -Phadoop-cloud -Pjvm-profiler -Pspark-ganglia-lgpl -Pkinesis-asl -Djava.version=${JAVA_VERSION/-ea} -Dtest.include.tags="$INCLUDED_TAGS" test -fae
206208
elif [[ "$MODULES_TO_TEST" == "connect" ]]; then
207-
./build/mvn $MAVEN_CLI_OPTS -Dtest.exclude.tags="$EXCLUDED_TAGS" -Djava.version=${JAVA_VERSION/-ea} -pl sql/connect/client/jvm,sql/connect/common,sql/connect/server test -fae
209+
./build/mvn $MAVEN_CLI_OPTS -Djava.version=${JAVA_VERSION/-ea} -pl sql/connect/client/jdbc,sql/connect/client/jvm,sql/connect/common,sql/connect/server test -fae
210+
elif [[ "$MODULES_TO_TEST" == "connect" && "$INPUT_BRANCH" == "branch-4.0" ]]; then
211+
# SPARK-53914: Remove sql/connect/client/jdbc from `-pl` for branch-4.0, this branch can be deleted after the EOL of branch-4.0.
212+
./build/mvn $MAVEN_CLI_OPTS -Djava.version=${JAVA_VERSION/-ea} -pl sql/connect/client/jvm,sql/connect/common,sql/connect/server test -fae
208213
elif [[ "$EXCLUDED_TAGS" != "" ]]; then
209214
./build/mvn $MAVEN_CLI_OPTS -pl "$TEST_MODULES" -Pyarn -Pkubernetes -Pvolcano -Phive -Phive-thriftserver -Phadoop-cloud -Pjvm-profiler -Pspark-ganglia-lgpl -Pkinesis-asl -Djava.version=${JAVA_VERSION/-ea} -Dtest.exclude.tags="$EXCLUDED_TAGS" test -fae
210215
elif [[ "$MODULES_TO_TEST" == *"sql#hive-thriftserver"* ]]; then

.github/workflows/pages.yml

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -63,7 +63,7 @@ jobs:
6363
pip install 'sphinx==4.5.0' mkdocs 'pydata_sphinx_theme>=0.13' sphinx-copybutton nbsphinx numpydoc jinja2 markupsafe 'pyzmq<24.0.0' \
6464
ipython ipython_genutils sphinx_plotly_directive 'numpy>=1.22' pyarrow 'pandas==2.3.2' 'plotly>=4.8' 'docutils<0.18.0' \
6565
'flake8==3.9.0' 'mypy==1.8.0' 'pytest==7.1.3' 'pytest-mypy-plugins==1.9.3' 'black==23.12.1' \
66-
'pandas-stubs==1.2.0.53' 'grpcio==1.67.0' 'grpcio-status==1.67.0' 'protobuf==5.29.1' 'grpc-stubs==1.24.11' 'googleapis-common-protos-stubs==2.2.0' \
66+
'pandas-stubs==1.2.0.53' 'grpcio==1.67.0' 'grpcio-status==1.67.0' 'protobuf==5.29.5' 'grpc-stubs==1.24.11' 'googleapis-common-protos-stubs==2.2.0' \
6767
'sphinxcontrib-applehelp==1.0.4' 'sphinxcontrib-devhelp==1.0.2' 'sphinxcontrib-htmlhelp==2.0.1' 'sphinxcontrib-qthelp==1.0.3' 'sphinxcontrib-serializinghtml==1.1.5'
6868
- name: Install Ruby for documentation generation
6969
uses: ruby/setup-ruby@v1

.github/workflows/python_hosted_runner_test.yml

Lines changed: 5 additions & 7 deletions
Original file line numberDiff line numberDiff line change
@@ -74,20 +74,18 @@ jobs:
7474
pyspark-core, pyspark-errors, pyspark-streaming
7575
- >-
7676
pyspark-mllib, pyspark-ml, pyspark-ml-connect
77+
- >-
78+
pyspark-structured-streaming, pyspark-structured-streaming-connect
7779
- >-
7880
pyspark-connect
7981
- >-
8082
pyspark-pandas
8183
- >-
8284
pyspark-pandas-slow
8385
- >-
84-
pyspark-pandas-connect-part0
85-
- >-
86-
pyspark-pandas-connect-part1
87-
- >-
88-
pyspark-pandas-connect-part2
86+
pyspark-pandas-connect
8987
- >-
90-
pyspark-pandas-connect-part3
88+
pyspark-pandas-slow-connect
9189
env:
9290
MODULES_TO_TEST: ${{ matrix.modules }}
9391
PYTHON_TO_TEST: python${{inputs.python}}
@@ -150,7 +148,7 @@ jobs:
150148
python${{matrix.python}} -m pip install --ignore-installed 'blinker>=1.6.2'
151149
python${{matrix.python}} -m pip install --ignore-installed 'six==1.16.0'
152150
python${{matrix.python}} -m pip install numpy 'pyarrow>=21.0.0' 'six==1.16.0' 'pandas==2.3.2' scipy 'plotly<6.0.0' 'mlflow>=2.8.1' coverage matplotlib openpyxl 'memory-profiler>=0.61.0' 'scikit-learn>=1.3.2' unittest-xml-reporting && \
153-
python${{matrix.python}} -m pip install 'grpcio==1.67.0' 'grpcio-status==1.67.0' 'protobuf==5.29.1' 'googleapis-common-protos==1.65.0' 'graphviz==0.20.3' && \
151+
python${{matrix.python}} -m pip install 'grpcio==1.67.0' 'grpcio-status==1.67.0' 'protobuf==5.29.5' 'googleapis-common-protos==1.65.0' 'graphviz==0.20.3' && \
154152
python${{matrix.python}} -m pip cache purge
155153
- name: List Python packages
156154
run: python${{matrix.python}} -m pip list

.github/workflows/release.yml

Lines changed: 5 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -114,6 +114,11 @@ jobs:
114114
with:
115115
repository: apache/spark
116116
ref: "${{ inputs.branch }}"
117+
- name: Free up disk space
118+
run: |
119+
if [ -f ./dev/free_disk_space ]; then
120+
./dev/free_disk_space
121+
fi
117122
- name: Release Apache Spark
118123
env:
119124
GIT_BRANCH: "${{ inputs.branch }}"

0 commit comments

Comments
 (0)