Tpch: Dask vs PySpark and PySpark, Polars and DuckDB single node #1044

Merged · 60 commits · Oct 13, 2023

Changes from all commits

Commits (60)
a886336
Add tpch benchmark queries
phofl Sep 1, 2023
e394327
Remove anon
phofl Sep 1, 2023
a654c8e
Remove unnecessary files
phofl Sep 1, 2023
ba474eb
Update memory and query
phofl Sep 1, 2023
35d256f
Revert "Remove unnecessary files"
phofl Sep 1, 2023
fe26959
Fix query
phofl Sep 8, 2023
a650024
Update
phofl Sep 8, 2023
65f4457
Update client
phofl Sep 19, 2023
09f8c42
Merge remote-tracking branch 'origin/main' into phofl/tpch
phofl Sep 25, 2023
1b30944
Use compute
phofl Sep 26, 2023
083756e
Add pyspark tpch
milesgranger Sep 28, 2023
ac82ef1
Reduce setup calls and materialize df
milesgranger Sep 30, 2023
4e2285c
Fix bad method calls to plugins on teardown/close
milesgranger Sep 30, 2023
5bb4073
Bump to 12 workers
milesgranger Sep 30, 2023
11539b6
Add idle_timeout=None for scheduler
milesgranger Sep 30, 2023
7fd2ca5
Add retry logic for S3 403 sporadic errors on materialization
milesgranger Sep 30, 2023
bf03130
Resolve conflicts
milesgranger Oct 2, 2023
0dd202f
Make direct dask vs pyspark tpch comparison tests
milesgranger Oct 3, 2023
80d4bdd
DROP COMMIT: Filter for tpch dask vs pyspark tests
milesgranger Oct 3, 2023
b6c58eb
DROP COMMIT: Try again with 20 nodes
milesgranger Oct 3, 2023
600e166
DROP COMMIT: Try again with scale 1000 and 100 nodes
milesgranger Oct 3, 2023
07c8161
Rename scale 1000 s3 directory
milesgranger Oct 4, 2023
31a6547
Fix pyspark executor memory settings
milesgranger Oct 4, 2023
1bd9c28
Add dask-expr
milesgranger Oct 4, 2023
77972b4
Resolve conflicts - merge main
milesgranger Oct 4, 2023
fa11466
Formatting [skip ci]
milesgranger Oct 4, 2023
20dc6f8
Add dask-expr
milesgranger Oct 4, 2023
f5b1fca
Try again with 100 nodes
milesgranger Oct 4, 2023
a3b3769
Refine clearing pyspark session memory between queries
milesgranger Oct 4, 2023
bcf14d8
Formatting
milesgranger Oct 4, 2023
32a3481
Fix date casting in queries for scale 1000
milesgranger Oct 5, 2023
267e1d7
Change test method names under TestTpchDaskVsPySpark
milesgranger Oct 5, 2023
d2d315b
Re-run
milesgranger Oct 5, 2023
08adbdf
rerun
milesgranger Oct 5, 2023
94ffd51
rerun
milesgranger Oct 5, 2023
f036902
Try without AWS env var provider
milesgranger Oct 5, 2023
046e22b
Resolve conflicts - merge main [skip ci]
milesgranger Oct 5, 2023
bebeb22
Refactoring - Still sporadic S3 403 Forbidden [skip ci]
milesgranger Oct 5, 2023
7c2fedb
Fix sporadic S3 Forbidden errors - use EnvVar provider
milesgranger Oct 7, 2023
f1655c8
Handle timestamp[ns/us] diff between scale 100 and 1000
milesgranger Oct 9, 2023
1dd1f94
Initial impl of single VM w/ DuckDB [skip ci]
milesgranger Oct 10, 2023
87fd928
Demo: timing queries
milesgranger Oct 11, 2023
ca5b153
Refactor - bit flatter structure w/ explicit single VM test mod
milesgranger Oct 11, 2023
c3bd971
Add Polars and other refactorings/experiments [skip ci]
milesgranger Oct 11, 2023
4022df0
Add cluster comparision tests and other small fixes [skip ci]
milesgranger Oct 12, 2023
2daa27f
Flag to skip other tpch benchmarks
milesgranger Oct 12, 2023
16b7223
Fix PyTorch Optuna (#1050)
milesgranger Oct 10, 2023
a44785c
Disable task queuing for tpch (#1054)
phofl Oct 11, 2023
c95d1b1
Formatting [skip ci]
milesgranger Oct 12, 2023
78a3d47
Remove debug timing stuff
milesgranger Oct 12, 2023
2b3eddd
Add duckdb and polars to env
milesgranger Oct 12, 2023
c8ac8b1
Remove other debug timing [skip ci]
milesgranger Oct 12, 2023
3b5edc6
Restructure into tpch directory [skip ci]
milesgranger Oct 12, 2023
850f859
Flatten Polars (#1058)
mrocklin Oct 13, 2023
d14e1a9
Flatten duckdb [skip ci]
milesgranger Oct 13, 2023
f308a10
Reuse fixtures in tpch.conftest from test_polars [skip ci]
milesgranger Oct 13, 2023
8e3c684
Remove dask-expr from environment-test.yml [skip ci]
milesgranger Oct 13, 2023
a516003
Remove test_comparison_single_vm
milesgranger Oct 13, 2023
5d6da86
Remove duckdb queries 8-22 [skip ci]
milesgranger Oct 13, 2023
a0b2203
Remove pyspark queries 8-22 [skip ci]
milesgranger Oct 13, 2023
7 changes: 6 additions & 1 deletion ci/environment.yml
@@ -1,7 +1,12 @@
channels:
- conda-forge
dependencies:
- python >=3.9
- conda
- python =3.11
- pyspark ==3.4.1
- openjdk ==20.0.2
- python-duckdb ==0.9.1
- polars ==0.19.8
- pip
- coiled >=0.2.54
- numpy ==1.24.4
21 changes: 21 additions & 0 deletions cluster_kwargs.yaml
@@ -57,9 +57,30 @@ uber_lyft_large:
  n_workers: 50
  worker_vm_types: [m6i.xlarge] # 4 CPU, 16 GiB (preferred default instance)

tpch_pyspark:
  n_workers: 20
  worker_vm_types: [m6i.xlarge] # 4 CPU, 16 GiB (preferred default instance)
  # Keep the same, since spark.driver/executor.memory is calculated from the scheduler size.
  # PySpark doesn't automatically calculate the memory available to executors,
  # so we do it in pyspark_queries.utils; that calculation runs on the scheduler.
  scheduler_vm_types: [m6i.xlarge]
  scheduler_options:
    idle_timeout: null # Scheduler won't get any Dask tasks.
  backend_options:
    ingress:
      - ports: [8786, 8787, 7077, 8080, 4040, 9797]
        cidr: "0.0.0.0/0"
    spot: false
    spot_on_demand_fallback: true
    multizone: true

tpch:
  n_workers: 20
  worker_vm_types: [m6i.xlarge] # 4 CPU, 16 GiB (preferred default instance)
  backend_options:
    spot: false
    spot_on_demand_fallback: true
    multizone: true

# For tests/workflows/test_pytorch_optuna.py
pytorch_optuna:
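The tpch_pyspark comment above refers to a memory calculation that lives in tests/benchmarks/tpch/pyspark_queries/utils.py, which is not part of this excerpt. A minimal sketch of what such a calculation could look like, assuming spark.executor.memory is derived from the total memory of the machine the code runs on (the scheduler); the function name and the 0.8 headroom fraction are illustrative only, not taken from the PR:

import psutil  # assumed available; any way of reading total memory works


def guess_executor_memory(fraction: float = 0.8) -> str:
    # Derive a spark.executor.memory value from the memory of the current
    # machine, leaving some headroom for the JVM and OS. Illustrative only;
    # the real logic lives in pyspark_queries/utils.py.
    total_gib = psutil.virtual_memory().total / 2**30
    return f"{int(total_gib * fraction)}g"


# e.g. conf.set("spark.executor.memory", guess_executor_memory())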
1 change: 1 addition & 0 deletions tests/benchmarks/tpch/__init__.py
@@ -0,0 +1 @@
from . import test_dask, test_duckdb, test_polars, test_pyspark # noqa: F401
71 changes: 71 additions & 0 deletions tests/benchmarks/tpch/conftest.py
@@ -0,0 +1,71 @@
import functools
import os

import coiled
import pytest

DATASETS = {
    "local": "./tpch-data/scale10/",
    "scale 100": "s3://coiled-runtime-ci/tpch_scale_100/",
    "scale 1000": "s3://coiled-runtime-ci/tpch-scale-1000/",
}

ENABLED_DATASET = os.getenv("TPCH_SCALE")
if ENABLED_DATASET is not None:
    if ENABLED_DATASET not in DATASETS:
        raise ValueError("Unknown tpch dataset: ", ENABLED_DATASET)
else:
    ENABLED_DATASET = "scale 100"

machine = {
    "memory": "256 GiB",
}


@pytest.fixture(scope="module")
def warm_start():
    @coiled.function(**machine)
    def _():
        pass

    _()  # run once to give us a warm start


@pytest.fixture(scope="function")
def restart(warm_start):
    @coiled.function(**machine)
    def _():
        pass

    _.client.restart()
    yield


def coiled_function(**kwargs):
    # Shouldn't be necessary
    # See https://github.com/coiled/platform/issues/3519
    def _(function):
        return functools.wraps(function)(coiled.function(**kwargs, **machine)(function))

    return _


@pytest.fixture
def tpch_dataset_name():
    return ENABLED_DATASET


@pytest.fixture
def tpch_dataset_path(tpch_dataset_name):
    return DATASETS[tpch_dataset_name]


@pytest.fixture
def vm_type():
    return "m6i.16xlarge"


@pytest.fixture
def region():
    # Region of the TPCH data
    return "us-east-2"
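For orientation, a hypothetical sketch of how these fixtures and the coiled_function helper might be combined in a benchmark test; test_simple_read and the query body below are illustrative and not part of this PR:

import dask.dataframe as dd

from .conftest import coiled_function


@coiled_function()
def head_of_lineitem(dataset_path):
    # Runs remotely on a 256 GiB VM, per the `machine` settings above.
    return dd.read_parquet(dataset_path + "lineitem").head()


def test_simple_read(tpch_dataset_path, restart):
    # `restart` restarts the client on the already-warm VM before each test.
    head_of_lineitem(tpch_dataset_path)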
25 changes: 25 additions & 0 deletions tests/benchmarks/tpch/pyspark_queries/__init__.py
@@ -0,0 +1,25 @@
from . import (  # noqa: F401
    q1,
    q2,
    q3,
    q4,
    q5,
    q6,
    q7,
    q8,
    q9,
    q10,
    q11,
    q12,
    q13,
    q14,
    q15,
    q16,
    q17,
    q18,
    q19,
    q20,
    q21,
    q22,
    utils,
)
28 changes: 28 additions & 0 deletions tests/benchmarks/tpch/pyspark_queries/q1.py
@@ -0,0 +1,28 @@
query = """select
l_returnflag,
l_linestatus,
sum(l_quantity) as sum_qty,
sum(l_extendedprice) as sum_base_price,
sum(l_extendedprice * (1 - l_discount)) as sum_disc_price,
sum(l_extendedprice * (1 - l_discount) * (1 + l_tax)) as sum_charge,
avg(l_quantity) as avg_qty,
avg(l_extendedprice) as avg_price,
avg(l_discount) as avg_disc,
count(*) as count_order
from
lineitem
where
l_shipdate <= date('1998-09-02')
group by
l_returnflag,
l_linestatus
order by
l_returnflag,
l_linestatus
"""


def setup(spark):
    from .utils import read_parquet_spark

    read_parquet_spark(spark, "lineitem", "lineitem")
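read_parquet_spark comes from pyspark_queries/utils.py, which is not shown in this diff. A plausible sketch of what it does, assuming it loads one TPC-H table from the configured dataset path and registers it under the given view name so the SQL above can reference it:

def read_parquet_spark(spark, table: str, view_name: str):
    # Hypothetical reconstruction -- the real helper lives in pyspark_queries/utils.py.
    from ..conftest import DATASETS, ENABLED_DATASET

    path = DATASETS[ENABLED_DATASET] + table
    df = spark.read.parquet(path)
    df.createOrReplaceTempView(view_name)
    return df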
52 changes: 52 additions & 0 deletions tests/benchmarks/tpch/pyspark_queries/q2.py
@@ -0,0 +1,52 @@
query = """select
s_acctbal,
s_name,
n_name,
p_partkey,
p_mfgr,
s_address,
s_phone,
s_comment
from
part,
supplier,
partsupp,
nation,
region
where
p_partkey = ps_partkey
and s_suppkey = ps_suppkey
and p_size = 15
and p_type like '%BRASS'
and s_nationkey = n_nationkey
and n_regionkey = r_regionkey
and r_name = 'EUROPE'
and ps_supplycost = (
select
min(ps_supplycost)
from
partsupp,
supplier,
nation,
region
where
p_partkey = ps_partkey
and s_suppkey = ps_suppkey
and s_nationkey = n_nationkey
and n_regionkey = r_regionkey
and r_name = 'EUROPE'
)
order by
s_acctbal desc,
n_name,
s_name,
p_partkey
limit 100
"""


def setup(spark):
    from .utils import read_parquet_spark

    for name in ("part", "supplier", "partsupp", "nation", "region"):
        read_parquet_spark(spark, name, name)
45 changes: 45 additions & 0 deletions tests/benchmarks/tpch/pyspark_queries/q22.py
@@ -0,0 +1,45 @@
query = """
select
cntrycode,
count(*) as numcust,
sum(c_acctbal) as totacctbal
from (
select
substring(c_phone from 1 for 2) as cntrycode,
c_acctbal
from
customer
where
substring(c_phone from 1 for 2) in
("13","31","23", "29", "30", "18", "17")
and c_acctbal > (
select
avg(c_acctbal)
from
customer
where
c_acctbal > 0.00
and substring (c_phone from 1 for 2) in
("13","31","23", "29", "30", "18", "17")
)
and not exists (
select
*
from
orders
where
o_custkey = c_custkey
)
) as custsale
group by
cntrycode
order by
cntrycode
"""


def setup(spark):
    from .utils import read_parquet_spark

    for name in ("orders", "customer"):
        read_parquet_spark(spark, name, name)
32 changes: 32 additions & 0 deletions tests/benchmarks/tpch/pyspark_queries/q3.py
@@ -0,0 +1,32 @@
query = """
select
l_orderkey,
sum(l_extendedprice * (1 - l_discount)) as revenue,
o_orderdate,
o_shippriority
from
customer,
orders,
lineitem
where
c_mktsegment = 'BUILDING'
and c_custkey = o_custkey
and l_orderkey = o_orderkey
and o_orderdate < date '1995-03-15'
and l_shipdate > date '1995-03-15'
group by
l_orderkey,
o_orderdate,
o_shippriority
order by
revenue desc,
o_orderdate
limit 10
"""


def setup(spark):
    from .utils import read_parquet_spark

    for name in ("customer", "orders", "lineitem"):
        read_parquet_spark(spark, name, name)
30 changes: 30 additions & 0 deletions tests/benchmarks/tpch/pyspark_queries/q4.py
@@ -0,0 +1,30 @@
query = """
select
o_orderpriority,
count(*) as order_count
from
orders
where
o_orderdate >= date '1993-07-01'
and o_orderdate < date '1993-07-01' + interval '3' month
and exists (
select
*
from
lineitem
where
l_orderkey = o_orderkey
and l_commitdate < l_receiptdate
)
group by
o_orderpriority
order by
o_orderpriority
"""


def setup(spark):
    from .utils import read_parquet_spark

    for name in ("orders", "lineitem"):
        read_parquet_spark(spark, name, name)
40 changes: 40 additions & 0 deletions tests/benchmarks/tpch/pyspark_queries/q5.py
@@ -0,0 +1,40 @@
query = """
select
n_name,
sum(l_extendedprice * (1 - l_discount)) as revenue
from
customer,
orders,
lineitem,
supplier,
nation,
region
where
c_custkey = o_custkey
and l_orderkey = o_orderkey
and l_suppkey = s_suppkey
and c_nationkey = s_nationkey
and s_nationkey = n_nationkey
and n_regionkey = r_regionkey
and r_name = 'ASIA'
and o_orderdate >= date '1994-01-01'
and o_orderdate < date '1994-01-01' + interval '1' year
group by
n_name
order by
revenue desc
"""


def setup(spark):
    from .utils import read_parquet_spark

    for name in (
        "customer",
        "orders",
        "lineitem",
        "supplier",
        "nation",
        "region",
    ):
        read_parquet_spark(spark, name, name)
17 changes: 17 additions & 0 deletions tests/benchmarks/tpch/pyspark_queries/q6.py
@@ -0,0 +1,17 @@
query = """
select
sum(l_extendedprice * l_discount) as revenue
from
lineitem
where
l_shipdate >= date '1994-01-01'
and l_shipdate < date '1994-01-01' + interval '1' year
and l_discount between .06 - 0.01 and .06 + 0.01
and l_quantity < 24
"""


def setup(spark):
    from .utils import read_parquet_spark

    read_parquet_spark(spark, "lineitem", "lineitem")
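To see how the query modules fit together, a minimal sketch of running one of them end to end, assuming a plain SparkSession; the actual benchmark harness added by this PR lives in tests/benchmarks/tpch/test_pyspark.py and is not shown in this excerpt:

from pyspark.sql import SparkSession

from tests.benchmarks.tpch.pyspark_queries import q1

spark = SparkSession.builder.appName("tpch-q1").getOrCreate()
q1.setup(spark)  # registers the parquet tables the query needs as temp views
result = spark.sql(q1.query).toPandas()
print(result)
spark.stop()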