-
Notifications
You must be signed in to change notification settings - Fork 695
Commit
This commit does not belong to any branch on this repository, and may belong to a fork outside of the repository.
[SEDONA-285] Sedona Spark Jupyterlab docker image (#939)
Co-authored-by: Hadiya Kartikey <kartikey.hadiya@esri.in> Co-authored-by: Kartikey <56991178+kartikeyhadiya@users.noreply.github.com> Co-authored-by: Kartikey <hadiyakartikey123@gmail.com> Co-authored-by: yyy1000 <992364620@qq.com>
- Loading branch information
1 parent
8d5c76d
commit bb87200
Showing
11 changed files
with
431 additions
and
1 deletion.
There are no files selected for viewing
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,53 @@ | ||
name: Docker build

on:
  push:
    branches:
      - master
    paths:
      - 'docker/**'
  pull_request:
    branches:
      - '*'

env:
  MAVEN_OPTS: -Dmaven.wagon.httpconnectionManager.ttlSeconds=60

jobs:
  build:
    strategy:
      fail-fast: true
      matrix:
        # Build every intended (spark, sedona) pair on every OS.
        # NOTE(fix): the previous version listed the pairs as `include:`
        # entries without an `os` key; per GitHub's matrix semantics each
        # entry overwrote the spark/sedona values added by the previous one,
        # so only the last pair was actually built. Declaring them as axes
        # with an `exclude` restores the three intended pairs per OS.
        os: ['ubuntu-latest', 'macos-latest']
        spark: ['3.4.1', '3.3.2']
        sedona: ['1.4.1', 'latest']
        exclude:
          # Sedona 1.4.1 is only built against Spark 3.4.1 here.
          - spark: '3.3.2'
            sedona: '1.4.1'
    runs-on: ${{ matrix.os }}
    defaults:
      run:
        shell: bash

    steps:
      - uses: actions/checkout@v2
      - uses: actions/setup-java@v1
        with:
          java-version: 11
      - name: Cache Maven packages
        uses: actions/cache@v2
        with:
          path: ~/.m2
          key: ${{ runner.os }}-m2-${{ hashFiles('**/pom.xml') }}
          restore-keys: ${{ runner.os }}-m2
      - name: Setup docker (missing on MacOS)
        # Expression string comparison is case-insensitive, so 'macos'
        # matches runner.os == 'macOS'.
        if: runner.os == 'macos'
        run: |
          brew install docker
          colima start
      - env:
          SPARK_VERSION: ${{ matrix.spark }}
          SEDONA_VERSION: ${{ matrix.sedona }}
        run: ./docker/sedona-spark-jupyterlab/build.sh ${SPARK_VERSION} ${SEDONA_VERSION}
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,3 @@ | ||
Dockerfile
compose.yml
README.md
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,6 @@ | ||
commands.txt
docker-compose-orig.yml
Dockerfile.bak
log.txt
examples
.ipynb_checkpoints
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,47 @@ | ||
#!/bin/bash

#
# Licensed to the Apache Software Foundation (ASF) under one or more
# contributor license agreements.  See the NOTICE file distributed with
# this work for additional information regarding copyright ownership.
# The ASF licenses this file to You under the Apache License, Version 2.0
# (the "License"); you may not use this file except in compliance with
# the License.  You may obtain a copy of the License at
#
#    http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
#

# Build the Sedona Spark JupyterLab docker image.
# Usage: build.sh <spark_version> <sedona_version> [local|release]

set -e

SPARK_VERSION=$1
SEDONA_VERSION=$2
BUILD_MODE=$3

if [ "$SEDONA_VERSION" = "latest" ]; then
    # Map the Spark version to the Sedona build profile: 3.4 for Spark >= 3.4,
    # 3.0 otherwise (same logic as docker/sedona.sh).
    # NOTE(fix): the previous version passed the never-defined
    # ${SEDONA_SPARK_VERSION}, so maven ran with an empty -Dspark= profile.
    lower_version=$(printf '%s\n%s\n' "${SPARK_VERSION}" "3.4" | sort -V | head -n1)
    if [ "$lower_version" = "3.4" ]; then
        sedona_spark_version=3.4
    else
        sedona_spark_version=3.0
    fi
    # The compilation must take place outside Docker to avoid unnecessary maven packages
    mvn clean install -DskipTests -Dspark="${sedona_spark_version}" -Dgeotools -Dscala=2.12
fi

# -- Building the image

if [ -z "$BUILD_MODE" ] || [ "$BUILD_MODE" = "local" ]; then
    # If local, build the image for the local environment
    docker build \
        --build-arg spark_version="${SPARK_VERSION}" \
        --build-arg sedona_version="${SEDONA_VERSION}" \
        -f docker/sedona-spark-jupyterlab/sedona-jupyterlab.dockerfile \
        -t sedona/sedona-jupyterlab:"${SEDONA_VERSION}" .
else
    # If release, build the image for cross-platform
    docker buildx build --platform linux/amd64,linux/arm64 \
        --progress=plain \
        --output type=registry \
        --build-arg spark_version="${SPARK_VERSION}" \
        --build-arg sedona_version="${SEDONA_VERSION}" \
        -f docker/sedona-spark-jupyterlab/sedona-jupyterlab.dockerfile \
        -t drjiayu/sedona-jupyterlab:"${SEDONA_VERSION}" .
fi
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,12 @@ | ||
pandas==1.3.5
fiona==1.8.22
geopandas==0.10.2
keplergl==0.3.2
pydeck==0.8.0
attrs
matplotlib
descartes
ipywidgets
jupyterlab-widgets
ipykernel
jupyterlab==3.6.4
73 changes: 73 additions & 0 deletions
73
docker/sedona-spark-jupyterlab/sedona-jupyterlab.dockerfile
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,73 @@ | ||
#
# Licensed to the Apache Software Foundation (ASF) under one or more
# contributor license agreements.  See the NOTICE file distributed with
# this work for additional information regarding copyright ownership.
# The ASF licenses this file to You under the Apache License, Version 2.0
# (the "License"); you may not use this file except in compliance with
# the License.  You may obtain a copy of the License at
#
#    http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
#

FROM ubuntu:22.04

# Build-time knobs for the Spark/Sedona stack baked into the image.
ARG shared_workspace=/opt/workspace
ARG spark_version=3.3.2
ARG hadoop_version=3
ARG hadoop_s3_version=3.3.4
ARG aws_sdk_version=1.12.402
ARG spark_xml_version=0.16.0
ARG sedona_version=1.4.1
ARG geotools_wrapper_version=1.4.0-28.2

# Runtime environment consumed by Spark, PySpark and Jupyter.
ENV SHARED_WORKSPACE=${shared_workspace}
ENV SPARK_HOME=/opt/spark
RUN mkdir ${SPARK_HOME}
ENV SEDONA_HOME=/opt/sedona
RUN mkdir ${SEDONA_HOME}

ENV SPARK_MASTER_HOST=localhost
ENV SPARK_MASTER_PORT=7077
ENV PYTHONPATH=$SPARK_HOME/python
ENV PYSPARK_PYTHON=python3
ENV PYSPARK_DRIVER_PYTHON=jupyter

# Copy the whole source tree in; the setup scripts below consume it and the
# tree is removed once installation is complete.
COPY ./ ${SEDONA_HOME}/

RUN chmod +x ${SEDONA_HOME}/docker/spark.sh
RUN chmod +x ${SEDONA_HOME}/docker/sedona.sh
RUN ${SEDONA_HOME}/docker/spark.sh ${spark_version} ${hadoop_version} ${hadoop_s3_version} ${aws_sdk_version} ${spark_xml_version}
RUN ${SEDONA_HOME}/docker/sedona.sh ${sedona_version} ${geotools_wrapper_version} ${spark_version}

# Python dependencies for the notebooks (pip itself is installed by spark.sh).
COPY docker/sedona-spark-jupyterlab/requirements.txt /opt/requirements.txt
RUN pip3 install -r /opt/requirements.txt

# Ship the example notebooks and their data.
COPY binder/*.ipynb /opt/workspace/examples/
COPY binder/*.py /opt/workspace/examples/
COPY binder/data /opt/workspace/examples/data

# Point every example notebook at the local standalone master
# (quotes are JSON-escaped as \" inside the .ipynb files).
RUN find /opt/workspace/examples/ -type f -name "*.ipynb" -exec sed -i 's/config = SedonaContext.builder()/config = SedonaContext.builder().master(\\"spark:\/\/localhost:7077\\")/' {} +
# Drop the spark.jars.packages configuration: the jars are already installed.
RUN find /opt/workspace/examples/ -type f -name "*.ipynb" -exec sed -i '/spark\.jars\.packages/d' {} +
RUN find /opt/workspace/examples/ -type f -name "*.ipynb" -exec sed -i '/org\.apache\.sedona:sedona-spark-shaded-/d' {} +
RUN find /opt/workspace/examples/ -type f -name "*.ipynb" -exec sed -i '/org\.datasyslab:geotools-wrapper:/d' {} +

# The copied source tree is no longer needed.
RUN rm -rf ${SEDONA_HOME}

# Jupyter (8888), Spark master UI (8080), worker UI (8081), app UI (4040).
EXPOSE 8888
EXPOSE 8080
EXPOSE 8081
EXPOSE 4040

WORKDIR ${SHARED_WORKSPACE}

CMD service ssh start && ${SPARK_HOME}/sbin/start-all.sh && jupyter lab --ip=0.0.0.0 --port=8888 --no-browser --allow-root --NotebookApp.token=
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,45 @@ | ||
#!/bin/bash

# Licensed to the Apache Software Foundation (ASF) under one or more
# contributor license agreements.  See the NOTICE file distributed with
# this work for additional information regarding copyright ownership.
# The ASF licenses this file to You under the Apache License, Version 2.0
# (the "License"); you may not use this file except in compliance with
# the License.  You may obtain a copy of the License at
#
#    http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

# Install Sedona jars and the Sedona Python package into the image.
# Usage: sedona.sh <sedona_version|latest> <geotools_wrapper_version> <spark_version>
# Requires SEDONA_HOME and SPARK_HOME to be set (see the dockerfile).

set -e

# Define variables
sedona_version=$1
geotools_wrapper_version=$2
spark_version=$3

# Sedona ships one artifact line per Spark minor: 3.4 for Spark >= 3.4,
# 3.0 otherwise. (printf instead of the non-portable `echo -e`.)
lower_version=$(printf '%s\n%s\n' "${spark_version}" "3.4" | sort -V | head -n1)
if [ "$lower_version" = "3.4" ]; then
    sedona_spark_version=3.4
else
    sedona_spark_version=3.0
fi

if [ "$sedona_version" = "latest" ]; then
    # "latest": use the jars produced by the local Maven build (see build.sh)
    # and install the Python bindings from the source tree.
    cp "${SEDONA_HOME}"/spark-shaded/target/sedona-spark-shaded-*.jar "${SPARK_HOME}/jars/"
    cd "${SEDONA_HOME}/python"
    pip3 install shapely==1.8.4
    pip3 install .
else
    # Released version: download the jars from Maven Central.
    # -f makes curl fail on HTTP errors instead of saving an error page as
    # the jar; -L follows redirects.
    curl -fL "https://repo1.maven.org/maven2/org/apache/sedona/sedona-spark-shaded-${sedona_spark_version}_2.12/${sedona_version}/sedona-spark-shaded-${sedona_spark_version}_2.12-${sedona_version}.jar" -o "${SPARK_HOME}/jars/sedona-spark-shaded-${sedona_spark_version}_2.12-${sedona_version}.jar"
    curl -fL "https://repo1.maven.org/maven2/org/datasyslab/geotools-wrapper/${geotools_wrapper_version}/geotools-wrapper-${geotools_wrapper_version}.jar" -o "${SPARK_HOME}/jars/geotools-wrapper-${geotools_wrapper_version}.jar"

    # Install Sedona Python
    pip3 install shapely==1.8.4
    pip3 install "apache-sedona==${sedona_version}"
fi
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,60 @@ | ||
#!/bin/bash

# Licensed to the Apache Software Foundation (ASF) under one or more
# contributor license agreements.  See the NOTICE file distributed with
# this work for additional information regarding copyright ownership.
# The ASF licenses this file to You under the Apache License, Version 2.0
# (the "License"); you may not use this file except in compliance with
# the License.  You may obtain a copy of the License at
#
#    http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

# Install OS packages, the Spark distribution, and the extra jars the image
# needs. Usage:
#   spark.sh <spark_version> <hadoop_version> <hadoop_s3_version> \
#            <aws_sdk_version> <spark_xml_version>
# Requires SPARK_HOME to be set (see the dockerfile).

set -e

# Define variables
spark_version=$1
hadoop_version=$2
hadoop_s3_version=$3
aws_sdk_version=$4
spark_xml_version=$5

# Set up OS libraries
apt-get update
apt-get install -y openjdk-19-jdk-headless curl python3-pip maven
pip3 install --upgrade pip && pip3 install pipenv

# Download Spark and set up PySpark.
# -f makes curl fail on HTTP errors instead of unpacking an error page;
# -L follows redirects.
curl -fL "https://archive.apache.org/dist/spark/spark-${spark_version}/spark-${spark_version}-bin-hadoop${hadoop_version}.tgz" -o spark.tgz
tar -xf spark.tgz && mv "spark-${spark_version}-bin-hadoop${hadoop_version}"/* "${SPARK_HOME}/"
rm spark.tgz && rm -rf "spark-${spark_version}-bin-hadoop${hadoop_version}"
pip3 install "pyspark==${spark_version}"

# Add S3 jars
curl -fL "https://repo1.maven.org/maven2/org/apache/hadoop/hadoop-aws/${hadoop_s3_version}/hadoop-aws-${hadoop_s3_version}.jar" -o "${SPARK_HOME}/jars/hadoop-aws-${hadoop_s3_version}.jar"
curl -fL "https://repo1.maven.org/maven2/com/amazonaws/aws-java-sdk-bundle/${aws_sdk_version}/aws-java-sdk-bundle-${aws_sdk_version}.jar" -o "${SPARK_HOME}/jars/aws-java-sdk-bundle-${aws_sdk_version}.jar"

# Add spark-xml jar
curl -fL "https://repo1.maven.org/maven2/com/databricks/spark-xml_2.12/${spark_xml_version}/spark-xml_2.12-${spark_xml_version}.jar" -o "${SPARK_HOME}/jars/spark-xml_2.12-${spark_xml_version}.jar"

# Set up master IP address and executor memory
cp "${SPARK_HOME}/conf/spark-defaults.conf.template" "${SPARK_HOME}/conf/spark-defaults.conf"
echo "spark.driver.memory 4g" >> "${SPARK_HOME}/conf/spark-defaults.conf"
echo "spark.executor.memory 4g" >> "${SPARK_HOME}/conf/spark-defaults.conf"

# Install required libraries for GeoPandas on Apple chip mac
apt-get install -y gdal-bin libgdal-dev

# Install OpenSSH for cluster mode
apt-get install -y openssh-client openssh-server
systemctl enable ssh

# Enable passwordless ssh (start-all.sh ssh-es into localhost)
ssh-keygen -t rsa -f ~/.ssh/id_rsa -N ""
cat ~/.ssh/id_rsa.pub >> ~/.ssh/authorized_keys
chmod 600 ~/.ssh/authorized_keys
Oops, something went wrong.