Add REST Catalog and generate some test data #27

Merged · 4 commits · Nov 27, 2023
64 changes: 64 additions & 0 deletions .github/workflows/Rest.yml
@@ -0,0 +1,64 @@
name: Rest
on: [pull_request, repository_dispatch]
concurrency:
group: ${{ github.workflow }}-${{ github.ref }}-${{ github.head_ref || '' }}-${{ github.base_ref || '' }}-${{ github.ref != 'refs/heads/main' || github.sha }}
cancel-in-progress: true
defaults:
run:
shell: bash

jobs:
rest:
name: Test against Rest Catalog
runs-on: ubuntu-latest
env:
VCPKG_TARGET_TRIPLET: 'x64-linux'
GEN: Ninja
VCPKG_TOOLCHAIN_PATH: ${{ github.workspace }}/vcpkg/scripts/buildsystems/vcpkg.cmake

steps:
- name: Install required ubuntu packages
run: |
sudo apt-get update -y -qq
sudo apt-get install -y -qq software-properties-common
sudo add-apt-repository ppa:git-core/ppa
sudo apt-get update -y -qq
sudo apt-get install -y -qq ninja-build make gcc-multilib g++-multilib libssl-dev wget openjdk-8-jdk zip maven unixodbc-dev libc6-dev-i386 lib32readline6-dev libssl-dev libcurl4-gnutls-dev libexpat1-dev gettext unzip build-essential checkinstall libffi-dev curl libz-dev openssh-client
sudo apt-get install -y -qq tar pkg-config
sudo curl -L "https://github.com/docker/compose/releases/download/1.29.2/docker-compose-$(uname -s)-$(uname -m)" -o /usr/local/bin/docker-compose
sudo chmod +x /usr/local/bin/docker-compose

- name: Install Git 2.18.5
run: |
wget https://github.com/git/git/archive/refs/tags/v2.18.5.tar.gz
tar xvf v2.18.5.tar.gz
cd git-2.18.5
make
sudo make prefix=/usr install
git --version

- uses: actions/checkout@v3
with:
fetch-depth: 0
submodules: 'true'

- name: Setup ManyLinux2014
run: |
./duckdb/scripts/setup_manylinux2014.sh general aws-cli ccache ssh openssl python_alias

- name: Setup vcpkg
uses: lukka/run-vcpkg@v11.1
with:
vcpkgGitCommitId: 501db0f17ef6df184fcdbfbe0f87cde2313b6ab1

- name: Build extension
env:
GEN: ninja
STATIC_LIBCPP: 1
run: |
make release

- name: Start Rest Catalog
working-directory: scripts/
run: |
./start-rest-catalog.sh
5 changes: 4 additions & 1 deletion .gitignore
@@ -7,4 +7,7 @@ testext
test/python/__pycache__/
.Rhistory
test/sql/tmp.test
data/iceberg/generated_*
data/iceberg/generated_*
scripts/metastore_db/
scripts/derby.log
scripts/test-script-with-path.sql
52 changes: 52 additions & 0 deletions scripts/docker-compose.yml
@@ -0,0 +1,52 @@
version: "3"

services:
rest:
image: tabulario/iceberg-rest
container_name: iceberg-rest
networks:
iceberg_net:
ports:
- 8181:8181
environment:
- AWS_ACCESS_KEY_ID=admin
- AWS_SECRET_ACCESS_KEY=password
- AWS_REGION=us-east-1
- CATALOG_WAREHOUSE=s3://warehouse/
- CATALOG_IO__IMPL=org.apache.iceberg.aws.s3.S3FileIO
- CATALOG_S3_ENDPOINT=http://minio:9000
minio:
image: minio/minio
container_name: minio
environment:
- MINIO_ROOT_USER=admin
- MINIO_ROOT_PASSWORD=password
- MINIO_DOMAIN=minio
networks:
iceberg_net:
aliases:
- warehouse.minio
ports:
- 9001:9001
- 9000:9000
command: ["server", "/data", "--console-address", ":9001"]
mc:
depends_on:
- minio
image: minio/mc
container_name: mc
networks:
iceberg_net:
environment:
- AWS_ACCESS_KEY_ID=admin
- AWS_SECRET_ACCESS_KEY=password
- AWS_REGION=us-east-1
entrypoint: >
/bin/sh -c "
until (/usr/bin/mc config host add minio http://minio:9000 admin password) do echo '...waiting...' && sleep 1; done;
/usr/bin/mc rm -r --force minio/warehouse;
/usr/bin/mc mb minio/warehouse;
/usr/bin/mc policy set public minio/warehouse;
"
networks:
iceberg_net:
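
Once the stack is up, readiness can also be checked against the catalog itself, in the same spirit as the mc wait loop above; a minimal Python sketch, assuming the 8181 port mapping from this compose file and using only the standard library (GET /v1/config is part of the Iceberg REST spec):

# Hedged readiness probe for the REST catalog (not part of this PR).
import json
import time
import urllib.request

def wait_for_catalog(url="http://127.0.0.1:8181/v1/config", timeout=60):
    # Poll the Iceberg REST /v1/config endpoint until it answers.
    deadline = time.time() + timeout
    while time.time() < deadline:
        try:
            with urllib.request.urlopen(url) as resp:
                return json.load(resp)  # catalog defaults and overrides
        except OSError:
            time.sleep(1)
    raise TimeoutError(f"catalog not reachable at {url}")

print(wait_for_catalog())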
153 changes: 153 additions & 0 deletions scripts/provision.py
@@ -0,0 +1,153 @@
# Licensed to the Apache Software Foundation (ASF) under one
# or more contributor license agreements. See the NOTICE file
# distributed with this work for additional information
# regarding copyright ownership. The ASF licenses this file
# to you under the Apache License, Version 2.0 (the
# "License"); you may not use this file except in compliance
# with the License. You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing,
# software distributed under the License is distributed on an
# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
# KIND, either express or implied. See the License for the
# specific language governing permissions and limitations
# under the License.

from pyspark.sql import SparkSession

import os

os.environ[
"PYSPARK_SUBMIT_ARGS"
] = "--packages org.apache.iceberg:iceberg-spark-runtime-3.4_2.12:1.4.2,org.apache.iceberg:iceberg-aws-bundle:1.4.2 pyspark-shell"
os.environ["AWS_REGION"] = "us-east-1"
os.environ["AWS_ACCESS_KEY_ID"] = "admin"
os.environ["AWS_SECRET_ACCESS_KEY"] = "password"

spark = (
SparkSession.builder.appName("DuckDB REST Integration test")
.config(
"spark.sql.extensions",
"org.apache.iceberg.spark.extensions.IcebergSparkSessionExtensions",
)
.config("spark.sql.catalog.demo", "org.apache.iceberg.spark.SparkCatalog")
.config("spark.sql.catalog.demo.type", "rest")
.config("spark.sql.catalog.demo.uri", "http://127.0.0.1:8181")
.config("spark.sql.catalog.demo.io-impl", "org.apache.iceberg.aws.s3.S3FileIO")
.config("spark.sql.catalog.demo.warehouse", "s3://warehouse/wh/")
.config("spark.sql.catalog.demo.s3.endpoint", "http://127.0.0.1:9000")
.config("spark.sql.catalog.demo.s3.path-style-access", "true")
.config("spark.sql.defaultCatalog", "demo")
.config("spark.sql.catalogImplementation", "in-memory")
.getOrCreate()
)

spark.sql(
"""
CREATE DATABASE IF NOT EXISTS default;
"""
)

spark.sql(
"""
CREATE OR REPLACE TABLE default.table_unpartitioned (
dt date,
number integer,
letter string
)
USING iceberg
"""
)

spark.sql(
"""
INSERT INTO default.table_unpartitioned
VALUES
(CAST('2023-03-01' AS date), 1, 'a'),
(CAST('2023-03-02' AS date), 2, 'b'),
(CAST('2023-03-03' AS date), 3, 'c'),
(CAST('2023-03-04' AS date), 4, 'd'),
(CAST('2023-03-05' AS date), 5, 'e'),
(CAST('2023-03-06' AS date), 6, 'f'),
(CAST('2023-03-07' AS date), 7, 'g'),
(CAST('2023-03-08' AS date), 8, 'h'),
(CAST('2023-03-09' AS date), 9, 'i'),
(CAST('2023-03-10' AS date), 10, 'j'),
(CAST('2023-03-11' AS date), 11, 'k'),
(CAST('2023-03-12' AS date), 12, 'l');
"""
)


spark.sql(
"""
CREATE OR REPLACE TABLE default.table_partitioned (
dt date,
number integer,
letter string
)
USING iceberg
PARTITIONED BY (days(dt))
"""
)

spark.sql(
"""
INSERT INTO default.table_partitioned
VALUES
(CAST('2023-03-01' AS date), 1, 'a'),
(CAST('2023-03-02' AS date), 2, 'b'),
(CAST('2023-03-03' AS date), 3, 'c'),
(CAST('2023-03-04' AS date), 4, 'd'),
(CAST('2023-03-05' AS date), 5, 'e'),
(CAST('2023-03-06' AS date), 6, 'f'),
(CAST('2023-03-07' AS date), 7, 'g'),
(CAST('2023-03-08' AS date), 8, 'h'),
(CAST('2023-03-09' AS date), 9, 'i'),
(CAST('2023-03-10' AS date), 10, 'j'),
(CAST('2023-03-11' AS date), 11, 'k'),
(CAST('2023-03-12' AS date), 12, 'l');
"""
)

# By default, Iceberg uses copy-on-write deletes,
# which optimize for read performance; this table opts into
# merge-on-read instead (see the hedged delete sketch after this file)

spark.sql(
"""
CREATE OR REPLACE TABLE default.table_mor_deletes (
dt date,
number integer,
letter string
)
USING iceberg
TBLPROPERTIES (
'write.delete.mode'='merge-on-read',
'write.update.mode'='merge-on-read',
'write.merge.mode'='merge-on-read',
'format-version'='2'
);
"""
)


spark.sql(
"""
INSERT INTO default.table_mor_deletes
VALUES
(CAST('2023-03-01' AS date), 1, 'a'),
(CAST('2023-03-02' AS date), 2, 'b'),
(CAST('2023-03-03' AS date), 3, 'c'),
(CAST('2023-03-04' AS date), 4, 'd'),
(CAST('2023-03-05' AS date), 5, 'e'),
(CAST('2023-03-06' AS date), 6, 'f'),
(CAST('2023-03-07' AS date), 7, 'g'),
(CAST('2023-03-08' AS date), 8, 'h'),
(CAST('2023-03-09' AS date), 9, 'i'),
(CAST('2023-03-10' AS date), 10, 'j'),
(CAST('2023-03-11' AS date), 11, 'k'),
(CAST('2023-03-12' AS date), 12, 'l');
"""
)
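
Note that provision.py only inserts rows, so table_mor_deletes does not yet contain any delete files; a hedged sketch (not part of this PR) of a row-level delete that would exercise the merge-on-read path, using the same Spark session:

# Hypothetical follow-up: a row-level delete produces delete files rather
# than rewritten data files because write.delete.mode=merge-on-read.
spark.sql(
    """
    DELETE FROM default.table_mor_deletes
    WHERE number > 9;
    """
)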
1 change: 1 addition & 0 deletions scripts/requirements.txt
@@ -0,0 +1 @@
pyspark==3.4.1
37 changes: 37 additions & 0 deletions scripts/start-rest-catalog.sh
@@ -0,0 +1,37 @@

#!/bin/bash
set -ex

docker-compose kill
docker-compose rm -f
docker-compose up -d
docker-compose logs -f mc

pip3 install -r requirements.txt

python3 provision.py

# Would be nice to have rest support in there :)
UNPARTITIONED_TABLE_PATH=$(curl -s http://127.0.0.1:8181/v1/namespaces/default/tables/table_unpartitioned | jq -r '."metadata-location"')

SQL=$(cat <<-END
INSTALL iceberg;
LOAD iceberg;

SET s3_access_key_id='admin';
SET s3_secret_access_key='password';
SET s3_endpoint='127.0.0.1:9000';
SET s3_url_style='path';
SET s3_use_ssl=false;

SELECT * FROM iceberg_scan('${UNPARTITIONED_TABLE_PATH}');
END

)

if test -f "../build/release/duckdb"
then
# in CI
../build/release/duckdb -s "$SQL"
else
duckdb -s "$SQL"
fi
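
The same lookup works for the other tables; a hedged Python sketch (helper name assumed, not part of this PR) that mirrors the curl | jq line above, resolving a table's current metadata file from the REST catalog and printing the matching iceberg_scan query:

# Hypothetical helper: the REST catalog's LoadTableResult response
# carries the table's current "metadata-location".
import json
import urllib.request

def metadata_location(table, catalog="http://127.0.0.1:8181", namespace="default"):
    url = f"{catalog}/v1/namespaces/{namespace}/tables/{table}"
    with urllib.request.urlopen(url) as resp:
        return json.load(resp)["metadata-location"]

print(f"SELECT * FROM iceberg_scan('{metadata_location('table_partitioned')}');")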