Merge pull request duckdb#27 from Fokko/fd-test-rest
Add REST Catalog and generate some test data
samansmink authored Nov 27, 2023
2 parents 8136370 + f102378 commit 10dfb33
Showing 6 changed files with 311 additions and 1 deletion.
64 changes: 64 additions & 0 deletions .github/workflows/Rest.yml
@@ -0,0 +1,64 @@
name: Rest
on: [pull_request, repository_dispatch]
concurrency:
group: ${{ github.workflow }}-${{ github.ref }}-${{ github.head_ref || '' }}-${{ github.base_ref || '' }}-${{ github.ref != 'refs/heads/main' || github.sha }}
cancel-in-progress: true
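# One run per ref: non-main refs share a concurrency group, so superseded runs are cancelled;
# on main the group includes github.sha, so every push gets its own run.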
defaults:
run:
shell: bash

jobs:
rest:
name: Test against Rest Catalog
runs-on: ubuntu-latest
env:
VCPKG_TARGET_TRIPLET: 'x64-linux'
GEN: Ninja
VCPKG_TOOLCHAIN_PATH: ${{ github.workspace }}/vcpkg/scripts/buildsystems/vcpkg.cmake

steps:
- name: Install required ubuntu packages
run: |
sudo apt-get update -y -qq
sudo apt-get install -y -qq software-properties-common
sudo add-apt-repository ppa:git-core/ppa
sudo apt-get update -y -qq
sudo apt-get install -y -qq ninja-build make gcc-multilib g++-multilib libssl-dev wget openjdk-8-jdk zip maven unixodbc-dev libc6-dev-i386 lib32readline6-dev libssl-dev libcurl4-gnutls-dev libexpat1-dev gettext unzip build-essential checkinstall libffi-dev curl libz-dev openssh-client
sudo apt-get install -y -qq tar pkg-config
sudo curl -L "https://github.com/docker/compose/releases/download/1.29.2/docker-compose-$(uname -s)-$(uname -m)" -o /usr/local/bin/docker-compose
sudo chmod +x /usr/local/bin/docker-compose
- name: Install Git 2.18.5
run: |
wget https://github.com/git/git/archive/refs/tags/v2.18.5.tar.gz
tar xvf v2.18.5.tar.gz
cd git-2.18.5
make
sudo make prefix=/usr install
git --version
- uses: actions/checkout@v3
with:
fetch-depth: 0
submodules: 'true'

- name: Setup ManyLinux2014
run: |
./duckdb/scripts/setup_manylinux2014.sh general aws-cli ccache ssh openssl python_alias
- name: Setup vcpkg
uses: lukka/run-vcpkg@v11.1
with:
vcpkgGitCommitId: 501db0f17ef6df184fcdbfbe0f87cde2313b6ab1

- name: Build extension
env:
GEN: ninja
STATIC_LIBCPP: 1
run: |
make release
- name: Start Rest Catalog
working-directory: scripts/
run: |
./start-rest-catalog.sh
5 changes: 4 additions & 1 deletion .gitignore
@@ -7,4 +7,7 @@ testext
test/python/__pycache__/
.Rhistory
test/sql/tmp.test
data/iceberg/generated_*
scripts/metastore_db/
scripts/derby.log
scripts/test-script-with-path.sql
52 changes: 52 additions & 0 deletions scripts/docker-compose.yml
@@ -0,0 +1,52 @@
version: "3"

services:
rest:
image: tabulario/iceberg-rest
container_name: iceberg-rest
networks:
iceberg_net:
ports:
- 8181:8181
environment:
- AWS_ACCESS_KEY_ID=admin
- AWS_SECRET_ACCESS_KEY=password
- AWS_REGION=us-east-1
- CATALOG_WAREHOUSE=s3://warehouse/
- CATALOG_IO__IMPL=org.apache.iceberg.aws.s3.S3FileIO
- CATALOG_S3_ENDPOINT=http://minio:9000
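# The tabulario/iceberg-rest image maps CATALOG_* env vars onto Iceberg catalog properties;
# a double underscore becomes a dash (CATALOG_IO__IMPL -> io-impl).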
minio:
image: minio/minio
container_name: minio
environment:
- MINIO_ROOT_USER=admin
- MINIO_ROOT_PASSWORD=password
- MINIO_DOMAIN=minio
networks:
iceberg_net:
aliases:
- warehouse.minio
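# The alias lets virtual-hosted-style requests for bucket 'warehouse'
# (warehouse.minio, per MINIO_DOMAIN) resolve to this container inside iceberg_net.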
ports:
- 9001:9001
- 9000:9000
command: ["server", "/data", "--console-address", ":9001"]
mc:
depends_on:
- minio
image: minio/mc
container_name: mc
networks:
iceberg_net:
environment:
- AWS_ACCESS_KEY_ID=admin
- AWS_SECRET_ACCESS_KEY=password
- AWS_REGION=us-east-1
entrypoint: >
/bin/sh -c "
until (/usr/bin/mc config host add minio http://minio:9000 admin password) do echo '...waiting...' && sleep 1; done;
/usr/bin/mc rm -r --force minio/warehouse;
/usr/bin/mc mb minio/warehouse;
/usr/bin/mc policy set public minio/warehouse;
"
networks:
iceberg_net:
153 changes: 153 additions & 0 deletions scripts/provision.py
@@ -0,0 +1,153 @@
# Licensed to the Apache Software Foundation (ASF) under one
# or more contributor license agreements. See the NOTICE file
# distributed with this work for additional information
# regarding copyright ownership. The ASF licenses this file
# to you under the Apache License, Version 2.0 (the
# "License"); you may not use this file except in compliance
# with the License. You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing,
# software distributed under the License is distributed on an
# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
# KIND, either express or implied. See the License for the
# specific language governing permissions and limitations
# under the License.

from pyspark.sql import SparkSession

import os

os.environ[
"PYSPARK_SUBMIT_ARGS"
] = "--packages org.apache.iceberg:iceberg-spark-runtime-3.4_2.12:1.4.2,org.apache.iceberg:iceberg-aws-bundle:1.4.2 pyspark-shell"
os.environ["AWS_REGION"] = "us-east-1"
os.environ["AWS_ACCESS_KEY_ID"] = "admin"
os.environ["AWS_SECRET_ACCESS_KEY"] = "password"

spark = (
SparkSession.builder.appName("DuckDB REST Integeration test")
.config(
"spark.sql.extensions",
"org.apache.iceberg.spark.extensions.IcebergSparkSessionExtensions",
)
.config("spark.sql.catalog.demo", "org.apache.iceberg.spark.SparkCatalog")
.config("spark.sql.catalog.demo.type", "rest")
.config("spark.sql.catalog.demo.uri", "http://127.0.0.1:8181")
.config("spark.sql.catalog.demo.io-impl", "org.apache.iceberg.aws.s3.S3FileIO")
.config("spark.sql.catalog.demo.warehouse", "s3://warehouse/wh/")
.config("spark.sql.catalog.demo.s3.endpoint", "http://127.0.0.1:9000")
.config("spark.sql.catalog.demo.s3.path-style-access", "true")
.config("spark.sql.defaultCatalog", "demo")
.config("spark.sql.catalogImplementation", "in-memory")
.getOrCreate()
)
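# Path-style access keeps S3 URLs as endpoint/bucket/key, which MinIO expects
# when addressed by host:port rather than by a bucket subdomain.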

spark.sql(
"""
CREATE DATABASE IF NOT EXISTS default;
"""
)

spark.sql(
"""
CREATE OR REPLACE TABLE default.table_unpartitioned (
dt date,
number integer,
letter string
)
USING iceberg
"""
)

spark.sql(
"""
INSERT INTO default.table_unpartitioned
VALUES
(CAST('2023-03-01' AS date), 1, 'a'),
(CAST('2023-03-02' AS date), 2, 'b'),
(CAST('2023-03-03' AS date), 3, 'c'),
(CAST('2023-03-04' AS date), 4, 'd'),
(CAST('2023-03-05' AS date), 5, 'e'),
(CAST('2023-03-06' AS date), 6, 'f'),
(CAST('2023-03-07' AS date), 7, 'g'),
(CAST('2023-03-08' AS date), 8, 'h'),
(CAST('2023-03-09' AS date), 9, 'i'),
(CAST('2023-03-10' AS date), 10, 'j'),
(CAST('2023-03-11' AS date), 11, 'k'),
(CAST('2023-03-12' AS date), 12, 'l');
"""
)


spark.sql(
"""
CREATE OR REPLACE TABLE default.table_partitioned (
dt date,
number integer,
letter string
)
USING iceberg
PARTITIONED BY (days(dt))
"""
)
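# days(dt) is an Iceberg hidden-partition transform: the day is derived from dt
# at write time, with no separate partition column in the schema.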

spark.sql(
"""
INSERT INTO default.table_partitioned
VALUES
(CAST('2023-03-01' AS date), 1, 'a'),
(CAST('2023-03-02' AS date), 2, 'b'),
(CAST('2023-03-03' AS date), 3, 'c'),
(CAST('2023-03-04' AS date), 4, 'd'),
(CAST('2023-03-05' AS date), 5, 'e'),
(CAST('2023-03-06' AS date), 6, 'f'),
(CAST('2023-03-07' AS date), 7, 'g'),
(CAST('2023-03-08' AS date), 8, 'h'),
(CAST('2023-03-09' AS date), 9, 'i'),
(CAST('2023-03-10' AS date), 10, 'j'),
(CAST('2023-03-11' AS date), 11, 'k'),
(CAST('2023-03-12' AS date), 12, 'l');
"""
)

# By default, Iceberg tables use copy-on-write deletes, which optimize for read performance;
# this table opts into merge-on-read so row-level deletes are written as delete files.

spark.sql(
"""
CREATE OR REPLACE TABLE default.table_mor_deletes (
dt date,
number integer,
letter string
)
USING iceberg
TBLPROPERTIES (
'write.delete.mode'='merge-on-read',
'write.update.mode'='merge-on-read',
'write.merge.mode'='merge-on-read',
'format-version'='2'
);
"""
)
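# Merge-on-read requires format-version 2, which introduces row-level delete files.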


spark.sql(
"""
INSERT INTO default.table_mor_deletes
VALUES
(CAST('2023-03-01' AS date), 1, 'a'),
(CAST('2023-03-02' AS date), 2, 'b'),
(CAST('2023-03-03' AS date), 3, 'c'),
(CAST('2023-03-04' AS date), 4, 'd'),
(CAST('2023-03-05' AS date), 5, 'e'),
(CAST('2023-03-06' AS date), 6, 'f'),
(CAST('2023-03-07' AS date), 7, 'g'),
(CAST('2023-03-08' AS date), 8, 'h'),
(CAST('2023-03-09' AS date), 9, 'i'),
(CAST('2023-03-10' AS date), 10, 'j'),
(CAST('2023-03-11' AS date), 11, 'k'),
(CAST('2023-03-12' AS date), 12, 'l');
"""
)
1 change: 1 addition & 0 deletions scripts/requirements.txt
@@ -0,0 +1 @@
pyspark==3.4.1
37 changes: 37 additions & 0 deletions scripts/start-rest-catalog.sh
@@ -0,0 +1,37 @@
#!/bin/bash
set -ex

docker-compose kill
docker-compose rm -f
docker-compose up -d
docker-compose logs -f mc
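# 'logs -f mc' blocks until the one-shot mc container exits,
# i.e. until the warehouse bucket is ready.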

pip3 install -r requirements.txt

python3 provision.py

# Would be nice to have REST catalog support in the extension itself :)
UNPARTITIONED_TABLE_PATH=$(curl -s http://127.0.0.1:8181/v1/namespaces/default/tables/table_unpartitioned | jq -r '."metadata-location"')
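# The REST catalog's loadTable response includes the table's current
# metadata-location on S3, which iceberg_scan can read directly.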

SQL=$(cat <<-END
INSTALL iceberg;
LOAD iceberg;
SET s3_access_key_id='admin';
SET s3_secret_access_key='password';
SET s3_endpoint='127.0.0.1:9000';
SET s3_url_style='path';
SET s3_use_ssl=false;
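-- These settings mirror the MinIO service in docker-compose.yml: path-style URLs, no SSL.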
SELECT * FROM iceberg_scan('${UNPARTITIONED_TABLE_PATH}');
END
)

if test -f "../build/release/duckdb"
then
# in CI
../build/release/duckdb -s "$SQL"
else
duckdb -s "$SQL"
fi
