Merge branch 'master' into remove_spark_2
Jonathan Vexler committed Sep 30, 2024
2 parents 468e6f8 + d504a99 commit 7f7a7a8
Showing 41 changed files with 1,062 additions and 340 deletions.
85 changes: 42 additions & 43 deletions .github/workflows/bot.yml
@@ -624,49 +624,48 @@ jobs:
HUDI_VERSION=$(mvn help:evaluate -Dexpression=project.version -q -DforceStdout)
./packaging/bundle-validation/ci_run.sh hudi_docker_java17 $HUDI_VERSION openjdk17
# integration-tests:
# runs-on: ubuntu-latest
# strategy:
# matrix:
# include:
# - sparkProfile: 'sparkX.X'
# sparkArchive: 'spark-X.X.X/spark-X.X.X-bin-hadoop2.7.tgz'
# steps:
# - uses: actions/checkout@v3
# - name: Set up JDK 8
# uses: actions/setup-java@v3
# with:
# java-version: '8'
# distribution: 'temurin'
# architecture: x64
# cache: maven
# - name: Build Project
# env:
# SPARK_PROFILE: ${{ matrix.sparkProfile }}
# SCALA_PROFILE: '-Dscala-2.1X -Dscala.binary.version=2.1X'
# run:
# mvn clean install -T 2 $SCALA_PROFILE -D"$SPARK_PROFILE" -Pintegration-tests -DskipTests=true $MVN_ARGS
# - name: 'UT integ-test'
# env:
# SPARK_PROFILE: ${{ matrix.sparkProfile }}
# SCALA_PROFILE: '-Dscala-2.1X -Dscala.binary.version=2.1X'
# run:
# mvn test $SCALA_PROFILE -D"$SPARK_PROFILE" -Pintegration-tests -DskipUTs=false -DskipITs=true -pl hudi-integ-test $MVN_ARGS
# - name: 'IT'
# env:
# SPARK_PROFILE: ${{ matrix.sparkProfile }}
# SPARK_ARCHIVE: ${{ matrix.sparkArchive }}
# SCALA_PROFILE: '-Dscala-2.1X -Dscala.binary.version=2.1X'
# run: |
# echo "Downloading $SPARK_ARCHIVE"
# curl https://archive.apache.org/dist/spark/$SPARK_ARCHIVE --create-dirs -o $GITHUB_WORKSPACE/$SPARK_ARCHIVE
# tar -xvf $GITHUB_WORKSPACE/$SPARK_ARCHIVE -C $GITHUB_WORKSPACE/
# mkdir /tmp/spark-events/
# SPARK_ARCHIVE_BASENAME=$(basename $SPARK_ARCHIVE)
# export SPARK_HOME=$GITHUB_WORKSPACE/${SPARK_ARCHIVE_BASENAME%.*}
# rm -f $GITHUB_WORKSPACE/$SPARK_ARCHIVE
# docker system prune --all --force
# mvn verify $SCALA_PROFILE -D"$SPARK_PROFILE" -Pintegration-tests -pl !hudi-flink-datasource/hudi-flink $MVN_ARGS
integration-tests:
runs-on: ubuntu-latest
strategy:
matrix:
include:
- sparkProfile: 'spark3.5'
sparkArchive: 'spark-3.5.3/spark-3.5.3-bin-hadoop3.tgz'
steps:
- uses: actions/checkout@v3
- name: Set up JDK 8
uses: actions/setup-java@v3
with:
java-version: '8'
distribution: 'temurin'
architecture: x64
- name: Build Project
env:
SPARK_PROFILE: ${{ matrix.sparkProfile }}
SCALA_PROFILE: '-Dscala-2.12 -Dscala.binary.version=2.12'
run:
mvn clean install -T 2 $SCALA_PROFILE -D"$SPARK_PROFILE" -Pintegration-tests -DskipTests=true $MVN_ARGS
- name: 'UT integ-test'
env:
SPARK_PROFILE: ${{ matrix.sparkProfile }}
SCALA_PROFILE: '-Dscala-2.12 -Dscala.binary.version=2.12'
run:
mvn test $SCALA_PROFILE -D"$SPARK_PROFILE" -Pintegration-tests -DskipUTs=false -DskipITs=true -pl hudi-integ-test $MVN_ARGS
- name: 'IT'
env:
SPARK_PROFILE: ${{ matrix.sparkProfile }}
SPARK_ARCHIVE: ${{ matrix.sparkArchive }}
SCALA_PROFILE: '-Dscala-2.12 -Dscala.binary.version=2.12'
run: |
echo "Downloading $SPARK_ARCHIVE"
curl https://archive.apache.org/dist/spark/$SPARK_ARCHIVE --create-dirs -o $GITHUB_WORKSPACE/$SPARK_ARCHIVE
tar -xvf $GITHUB_WORKSPACE/$SPARK_ARCHIVE -C $GITHUB_WORKSPACE/
mkdir /tmp/spark-events/
SPARK_ARCHIVE_BASENAME=$(basename $SPARK_ARCHIVE)
export SPARK_HOME=$GITHUB_WORKSPACE/${SPARK_ARCHIVE_BASENAME%.*}
rm -f $GITHUB_WORKSPACE/$SPARK_ARCHIVE
docker system prune --all --force
mvn verify $SCALA_PROFILE -D"$SPARK_PROFILE" -Pintegration-tests -pl !hudi-flink-datasource/hudi-flink $MVN_ARGS
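The IT step above recovers SPARK_HOME from the archive name via the ${VAR%.*} parameter expansion, which strips the shortest suffix matching ".*" (here, the .tgz extension). A minimal bash sketch of just that derivation, using only values already present in the matrix:

SPARK_ARCHIVE='spark-3.5.3/spark-3.5.3-bin-hadoop3.tgz'
SPARK_ARCHIVE_BASENAME=$(basename "$SPARK_ARCHIVE")   # spark-3.5.3-bin-hadoop3.tgz
echo "${SPARK_ARCHIVE_BASENAME%.*}"                   # spark-3.5.3-bin-hadoop3
# This matches the directory created by 'tar -xvf', so SPARK_HOME ends up
# pointing at the extracted Spark distribution.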
build-spark-java17:
runs-on: ubuntu-latest
@@ -13,16 +13,15 @@
# See the License for the specific language governing permissions and
# limitations under the License.

version: "3.3"

services:

namenode:
image: apachehudi/hudi-hadoop_2.8.4-namenode:latest
platform: linux/amd64
hostname: namenode
container_name: namenode
environment:
- CLUSTER_NAME=hudi_hadoop284_hive232_spark244
- CLUSTER_NAME=hudi_hadoop284_hive232_spark353
ports:
- "50070:50070"
- "8020:8020"
@@ -38,10 +37,11 @@ services:

datanode1:
image: apachehudi/hudi-hadoop_2.8.4-datanode:latest
platform: linux/amd64
container_name: datanode1
hostname: datanode1
environment:
- CLUSTER_NAME=hudi_hadoop284_hive232_spark244
- CLUSTER_NAME=hudi_hadoop284_hive232_spark353
env_file:
- ./hadoop.env
ports:
@@ -62,10 +62,11 @@ services:

historyserver:
image: apachehudi/hudi-hadoop_2.8.4-history:latest
platform: linux/amd64
hostname: historyserver
container_name: historyserver
environment:
- CLUSTER_NAME=hudi_hadoop284_hive232_spark244
- CLUSTER_NAME=hudi_hadoop284_hive232_spark353
depends_on:
- "namenode"
links:
@@ -91,6 +92,7 @@ services:

hivemetastore:
image: apachehudi/hudi-hadoop_2.8.4-hive_2.3.3:latest
platform: linux/amd64
hostname: hivemetastore
container_name: hivemetastore
links:
@@ -116,6 +118,7 @@ services:

hiveserver:
image: apachehudi/hudi-hadoop_2.8.4-hive_2.3.3:latest
platform: linux/amd64
hostname: hiveserver
container_name: hiveserver
env_file:
@@ -136,7 +139,8 @@ services:
- ${HUDI_WS}:/var/hoodie/ws

sparkmaster:
image: apachehudi/hudi-hadoop_2.8.4-hive_2.3.3-sparkmaster_2.4.4:latest
image: apachehudi/hudi-hadoop_2.8.4-hive_2.3.3-sparkmaster_3.5.3:latest
platform: linux/amd64
hostname: sparkmaster
container_name: sparkmaster
env_file:
@@ -155,7 +159,8 @@ services:
- "namenode"

spark-worker-1:
image: apachehudi/hudi-hadoop_2.8.4-hive_2.3.3-sparkworker_2.4.4:latest
image: apachehudi/hudi-hadoop_2.8.4-hive_2.3.3-sparkworker_3.5.3:latest
platform: linux/amd64
hostname: spark-worker-1
container_name: spark-worker-1
env_file:
@@ -197,6 +202,7 @@ services:
container_name: presto-coordinator-1
hostname: presto-coordinator-1
image: apachehudi/hudi-hadoop_2.8.4-prestobase_0.271:latest
platform: linux/amd64
ports:
- "8090:8090"
# JVM debugging port (will be mapped to a random port on host)
@@ -218,6 +224,7 @@ services:
container_name: presto-worker-1
hostname: presto-worker-1
image: apachehudi/hudi-hadoop_2.8.4-prestobase_0.271:latest
platform: linux/amd64
depends_on: [ "presto-coordinator-1" ]
environment:
- PRESTO_JVM_MAX_HEAP=512M
@@ -239,6 +246,7 @@ services:
container_name: trino-coordinator-1
hostname: trino-coordinator-1
image: apachehudi/hudi-hadoop_2.8.4-trinocoordinator_368:latest
platform: linux/amd64
ports:
- "8091:8091"
# JVM debugging port (will be mapped to a random port on host)
@@ -253,6 +261,7 @@ services:
container_name: trino-worker-1
hostname: trino-worker-1
image: apachehudi/hudi-hadoop_2.8.4-trinoworker_368:latest
platform: linux/amd64
depends_on: [ "trino-coordinator-1" ]
ports:
- "8092:8092"
@@ -277,7 +286,8 @@ services:
- 8126:8126

adhoc-1:
image: apachehudi/hudi-hadoop_2.8.4-hive_2.3.3-sparkadhoc_2.4.4:latest
image: apachehudi/hudi-hadoop_2.8.4-hive_2.3.3-sparkadhoc_3.5.3:latest
platform: linux/amd64
hostname: adhoc-1
container_name: adhoc-1
env_file:
@@ -301,7 +311,8 @@ services:
- ${HUDI_WS}:/var/hoodie/ws

adhoc-2:
image: apachehudi/hudi-hadoop_2.8.4-hive_2.3.3-sparkadhoc_2.4.4:latest
image: apachehudi/hudi-hadoop_2.8.4-hive_2.3.3-sparkadhoc_3.5.3:latest
platform: linux/amd64
hostname: adhoc-2
container_name: adhoc-2
env_file:
1 change: 1 addition & 0 deletions docker/demo/sparksql-incremental.commands
@@ -28,6 +28,7 @@ import org.apache.hadoop.fs.FileSystem;

val fs = FileSystem.get(spark.sparkContext.hadoopConfiguration)
val beginInstantTime = HoodieDataSourceHelpers.listCommitsSince(fs, "/user/hive/warehouse/stock_ticks_cow", "00000").get(0)
println("Begin instant time for incremental query: " + beginInstantTime)
val hoodieIncQueryDF = spark.read.format("org.apache.hudi").
option(DataSourceReadOptions.QUERY_TYPE.key(), DataSourceReadOptions.QUERY_TYPE_INCREMENTAL_OPT_VAL).
option(DataSourceReadOptions.BEGIN_INSTANTTIME.key(), beginInstantTime).
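For context, this commands file is replayed non-interactively inside one of the demo's adhoc containers. A sketch of the invocation (the container name and the /var/hoodie/ws mount come from the compose file above; the spark-shell flags and bundle-jar glob are assumptions for illustration):

# From the host: open a shell in the adhoc container.
docker exec -it adhoc-1 /bin/bash
# Inside the container (bundle jar location assumed):
spark-shell \
  --jars /var/hoodie/ws/packaging/hudi-spark-bundle/target/hudi-spark*-bundle*.jar \
  -i /var/hoodie/ws/docker/demo/sparksql-incremental.commands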
2 changes: 1 addition & 1 deletion docker/hoodie/hadoop/base/Dockerfile
@@ -15,7 +15,7 @@
# See the License for the specific language governing permissions and
# limitations under the License.

FROM openjdk:8u212-jdk-slim-stretch
FROM openjdk:8u342-jdk-slim-bullseye
MAINTAINER Hoodie
USER root

2 changes: 1 addition & 1 deletion docker/hoodie/hadoop/pom.xml
@@ -54,7 +54,7 @@
<properties>
<skipITs>false</skipITs>
<docker.build.skip>true</docker.build.skip>
<docker.spark.version>2.4.4</docker.spark.version>
<docker.spark.version>3.5.3</docker.spark.version>
<docker.hive.version>2.3.3</docker.hive.version>
<docker.hadoop.version>2.8.4</docker.hadoop.version>
<docker.presto.version>0.271</docker.presto.version>
7 changes: 4 additions & 3 deletions docker/hoodie/hadoop/spark_base/Dockerfile
@@ -15,16 +15,16 @@
# See the License for the specific language governing permissions and
# limitations under the License.

ARG HADOOP_VERSION=2.8.4
ARG HIVE_VERSION=2.3.3
FROM apachehudi/hudi-hadoop_${HADOOP_VERSION}-hive_${HIVE_VERSION}

ENV ENABLE_INIT_DAEMON true
ENV INIT_DAEMON_BASE_URI http://identifier/init-daemon
ENV INIT_DAEMON_STEP spark_master_init

ARG SPARK_VERSION=2.4.4
ARG SPARK_HADOOP_VERSION=2.7
ARG SPARK_VERSION=3.5.3
ARG SPARK_HADOOP_VERSION=3

ENV SPARK_VERSION ${SPARK_VERSION}
ENV HADOOP_VERSION ${SPARK_HADOOP_VERSION}
@@ -33,6 +33,7 @@ COPY wait-for-step.sh /
COPY execute-step.sh /
COPY finish-step.sh /

# Need to do this all in one step because running separate commands doubles the image size
RUN echo "Installing Spark-version (${SPARK_VERSION})" \
&& wget http://archive.apache.org/dist/spark/spark-${SPARK_VERSION}/spark-${SPARK_VERSION}-bin-hadoop${HADOOP_VERSION}.tgz \
&& tar -xvzf spark-${SPARK_VERSION}-bin-hadoop${HADOOP_VERSION}.tgz \
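The single-RUN comment above reflects how image layers work: every RUN commits an immutable layer, so a file downloaded in one RUN and deleted in a later RUN still ships inside the earlier layer. A toy bash demonstration of the effect (the alpine base and image tags are hypothetical, not part of this commit):

# One RUN: the 100 MB file is created and removed before the layer is committed.
docker build -t layers-one-run - <<'EOF'
FROM alpine:3.19
RUN dd if=/dev/zero of=/big bs=1M count=100 && rm /big
EOF

# Two RUNs: the first layer keeps the 100 MB file even though a later layer deletes it.
docker build -t layers-two-runs - <<'EOF'
FROM alpine:3.19
RUN dd if=/dev/zero of=/big bs=1M count=100
RUN rm /big
EOF

docker images | grep layers-      # layers-two-runs is ~100 MB larger
docker history layers-two-runs    # per-layer sizes show where the space went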
4 changes: 2 additions & 2 deletions docker/hoodie/hadoop/sparkadhoc/Dockerfile
@@ -15,9 +15,9 @@
# See the License for the specific language governing permissions and
# limitations under the License.

ARG HADOOP_VERSION=2.8.4
ARG HIVE_VERSION=2.3.3
ARG SPARK_VERSION=2.4.4
ARG SPARK_VERSION=3.5.3
FROM apachehudi/hudi-hadoop_${HADOOP_VERSION}-hive_${HIVE_VERSION}-sparkbase_${SPARK_VERSION}

ARG PRESTO_VERSION=0.268
4 changes: 2 additions & 2 deletions docker/hoodie/hadoop/sparkmaster/Dockerfile
@@ -15,9 +15,9 @@
# See the License for the specific language governing permissions and
# limitations under the License.

ARG HADOOP_VERSION=2.8.4
ARG HIVE_VERSION=2.3.3
ARG SPARK_VERSION=2.4.4
ARG SPARK_VERSION=3.5.3
FROM apachehudi/hudi-hadoop_${HADOOP_VERSION}-hive_${HIVE_VERSION}-sparkbase_${SPARK_VERSION}

COPY master.sh /opt/spark
4 changes: 2 additions & 2 deletions docker/hoodie/hadoop/sparkworker/Dockerfile
@@ -15,9 +15,9 @@
# See the License for the specific language governing permissions and
# limitations under the License.

ARG HADOOP_VERSION=2.8.4
ARG HIVE_VERSION=2.3.3
ARG SPARK_VERSION=2.4.4
ARG SPARK_VERSION=3.5.3
FROM apachehudi/hudi-hadoop_${HADOOP_VERSION}-hive_${HIVE_VERSION}-sparkbase_${SPARK_VERSION}

COPY worker.sh /opt/spark
2 changes: 1 addition & 1 deletion docker/setup_demo.sh
@@ -19,7 +19,7 @@
SCRIPT_PATH=$(cd `dirname $0`; pwd)
HUDI_DEMO_ENV=$1
WS_ROOT=`dirname $SCRIPT_PATH`
COMPOSE_FILE_NAME="docker-compose_hadoop284_hive233_spark244.yml"
COMPOSE_FILE_NAME="docker-compose_hadoop284_hive233_spark353_amd64.yml"
if [ "$HUDI_DEMO_ENV" = "--mac-aarch64" ]; then
COMPOSE_FILE_NAME="docker-compose_hadoop284_hive233_spark244_mac_aarch64.yml"
fi
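The net effect for demo users, sketched below (script names and the --mac-aarch64 flag are taken from the diff): x86_64 hosts now get the Spark 3.5.3 compose file by default, while the Apple Silicon variant stays on Spark 2.4.4.

cd docker
./setup_demo.sh                  # docker-compose_hadoop284_hive233_spark353_amd64.yml
./setup_demo.sh --mac-aarch64    # docker-compose_hadoop284_hive233_spark244_mac_aarch64.yml
./stop_demo.sh                   # pass the same flag used at setup so the matching file is torn down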
2 changes: 1 addition & 1 deletion docker/stop_demo.sh
@@ -20,7 +20,7 @@ SCRIPT_PATH=$(cd `dirname $0`; pwd)
HUDI_DEMO_ENV=$1
# set up root directory
WS_ROOT=`dirname $SCRIPT_PATH`
COMPOSE_FILE_NAME="docker-compose_hadoop284_hive233_spark244.yml"
COMPOSE_FILE_NAME="docker-compose_hadoop284_hive233_spark353_amd64.yml"
if [ "$HUDI_DEMO_ENV" = "--mac-aarch64" ]; then
COMPOSE_FILE_NAME="docker-compose_hadoop284_hive233_spark244_mac_aarch64.yml"
fi
6 changes: 4 additions & 2 deletions hudi-aws/pom.xml
@@ -256,6 +256,7 @@
<name>amazon/dynamodb-local:${dynamodb-local.version}</name>
<alias>it-database</alias>
<run>
<platform>linux/amd64</platform>
<ports>
<port>${dynamodb-local.port}:${dynamodb-local.port}</port>
</ports>
Expand All @@ -268,11 +269,12 @@
</run>
</image>
<image>
<name>motoserver/moto:${moto.version}</name>
<name>apachehudi/moto:${moto.version}</name>
<alias>it-aws</alias>
<run>
<platform>linux/amd64</platform>
<ports>
<port>${moto.port}:${moto.port}</port>
<port>${moto.port}:5000</port>
</ports>
<wait>
<http>
@@ -60,8 +60,9 @@

@Disabled("HUDI-7475 The tests do not work. Disabling them to unblock Azure CI")
public class ITTestGluePartitionPushdown {

private static final String MOTO_ENDPOINT = "http://localhost:5000";
// This port number must be the same as {@code moto.port} defined in pom.xml
private static final int MOTO_PORT = 5002;
private static final String MOTO_ENDPOINT = "http://localhost:" + MOTO_PORT;
private static final String DB_NAME = "db_name";
private static final String TABLE_NAME = "tbl_name";
private String basePath = Files.createTempDirectory("hivesynctest" + Instant.now().toEpochMilli()).toUri().toString();
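The pom.xml and test changes above fit together: the moto server listens on a fixed port 5000 inside its container, so only the host side of the mapping uses the configurable ${moto.port}, and the test now derives its endpoint from that same number. A hedged docker-level sketch of what the Maven plugin config amounts to (image tag and the smoke-test request are assumptions):

# Host port 5002 (moto.port in pom.xml) forwarded to moto's in-container port 5000.
docker run --rm -d -p 5002:5000 apachehudi/moto:latest
curl -s http://localhost:5002/   # the test's MOTO_ENDPOINT resolves to this address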
@@ -527,6 +527,9 @@ private Pair<Integer, HoodieData<HoodieRecord>> initializeFunctionalIndexPartiti
}

private Set<String> getFunctionalIndexPartitionsToInit() {
if (dataMetaClient.getIndexMetadata().isEmpty()) {
return Collections.emptySet();
}
Set<String> functionalIndexPartitions = dataMetaClient.getIndexMetadata().get().getIndexDefinitions().keySet();
Set<String> completedMetadataPartitions = dataMetaClient.getTableConfig().getMetadataPartitions();
functionalIndexPartitions.removeAll(completedMetadataPartitions);
@@ -1050,6 +1053,9 @@ public void update(HoodieCommitMetadata commitMetadata, HoodieData<HoodieRecord>
* Update functional index from {@link HoodieCommitMetadata}.
*/
private void updateFunctionalIndexIfPresent(HoodieCommitMetadata commitMetadata, String instantTime, Map<String, HoodieData<HoodieRecord>> partitionToRecordMap) {
if (!dataWriteConfig.getMetadataConfig().isFunctionalIndexEnabled()) {
return;
}
dataMetaClient.getTableConfig().getMetadataPartitions()
.stream()
.filter(partition -> partition.startsWith(HoodieTableMetadataUtil.PARTITION_NAME_FUNCTIONAL_INDEX_PREFIX))