From 7a29fe6bd75b038a89aaa8da8c3d1973e2dfaa5a Mon Sep 17 00:00:00 2001
From: Andrew Lamb <andrew@nerdnetworks.org>
Date: Wed, 26 Apr 2023 14:03:47 -0400
Subject: [PATCH] Add bench script to benchmark datafusion against itself

---
 benchmarks/.gitignore  |   3 +-
 benchmarks/README.md   | 142 +++++++++++++++++---
 benchmarks/bench.sh    | 293 +++++++++++++++++++++++++++++++++++++++++
 benchmarks/tpch-gen.sh |  42 ------
 4 files changed, 421 insertions(+), 59 deletions(-)
 create mode 100755 benchmarks/bench.sh
 delete mode 100755 benchmarks/tpch-gen.sh

diff --git a/benchmarks/.gitignore b/benchmarks/.gitignore
index 6320cd248dd8..2c574ff30d12 100644
--- a/benchmarks/.gitignore
+++ b/benchmarks/.gitignore
@@ -1 +1,2 @@
-data
\ No newline at end of file
+data
+results
\ No newline at end of file
diff --git a/benchmarks/README.md b/benchmarks/README.md
index d397def8f8e2..cf8a20a823f5 100644
--- a/benchmarks/README.md
+++ b/benchmarks/README.md
@@ -19,29 +19,139 @@
 
 # DataFusion Benchmarks
 
-This crate contains benchmarks based on popular public data sets and open source benchmark suites, making it easy to
-run real-world benchmarks to help with performance and scalability testing and for comparing performance with other Arrow
-implementations as well as other query engines.
+This crate contains benchmarks based on popular public data sets and
+open source benchmark suites, making it easy to run more realistic
+benchmarks to help with performance and scalability testing of DataFusion.
 
-## Benchmark derived from TPC-H
+# Benchmarks Against Other Engines
 
-These benchmarks are derived from the [TPC-H][1] benchmark. And we use this repo as the source of tpch-gen and answers:
-https://github.com/databricks/tpch-dbgen.git, based on [2.17.1](https://www.tpc.org/tpc_documents_current_versions/pdf/tpc-h_v2.17.1.pdf) version of TPC-H.
+DataFusion is included in the benchmark setups for several popular
+benchmarks that compare performance with other engines. For example:
 
-## Generating Test Data
+* [ClickBench] scripts are in the [ClickBench repo](https://github.com/ClickHouse/ClickBench/tree/main/datafusion)
+* [H2o.ai `db-benchmark`] scripts are in the [db-benchmark](db-benchmark) directory
 
-TPC-H data can be generated using the `tpch-gen.sh` script, which creates a Docker image containing the TPC-DS data
-generator.
+[ClickBench]: https://github.com/ClickHouse/ClickBench/tree/main
+[H2o.ai `db-benchmark`]: https://github.com/h2oai/db-benchmark
 
-```bash
-# scale_factor: scale of the database population. scale 1.0 represents ~1 GB of data
-./tpch-gen.sh <scale_factor>
+# Running the benchmarks
+
+## Running Benchmarks
+
+The easiest way to run benchmarks from DataFusion source checkouts is
+to use the [bench.sh](bench.sh) script. Usage instructions can be
+found with:
+
+```shell
+# show usage
+./bench.sh
+```
+
+## Generating Data
+
+You can create data for all these benchmarks using the [bench.sh](bench.sh) script:
+
+```shell
+./bench.sh data
+```
+
+Data is generated in the `data` subdirectory and will not be checked
+in because this directory has been added to the `.gitignore` file.
+
+
+## Example to compare performance on main to a branch
+
+```shell
+git checkout main
+
+# Create the data
+./benchmarks/bench.sh data
+
+# Gather baseline data for tpch benchmark
+./benchmarks/bench.sh run tpch
+
+# Switch to the branch (here named mybranch) and gather data
+git checkout mybranch
+./benchmarks/bench.sh run tpch
+
+# Compare results in the two branches:
+./benchmarks/bench.sh compare main mybranch
 ```
 
-Data will be generated into the `data` subdirectory and will not be checked in because this directory has been added
-to the `.gitignore` file.
+This produces results like:
+
+```shell
+Comparing main and mybranch
+--------------------
+Benchmark tpch.json
+--------------------
+┏━━━━━━━━━━━━━━┳━━━━━━━━━━━━━━┳━━━━━━━━━━━━━━┳━━━━━━━━━━━━━━━┓
+┃ Query        ┃         main ┃     mybranch ┃        Change ┃
+┡━━━━━━━━━━━━━━╇━━━━━━━━━━━━━━╇━━━━━━━━━━━━━━╇━━━━━━━━━━━━━━━┩
+│ QQuery 1     │    2520.52ms │    2795.09ms │  1.11x slower │
+│ QQuery 2     │     222.37ms │     216.01ms │     no change │
+│ QQuery 3     │     248.41ms │     239.07ms │     no change │
+│ QQuery 4     │     144.01ms │     129.28ms │ +1.11x faster │
+│ QQuery 5     │     339.54ms │     327.53ms │     no change │
+│ QQuery 6     │     147.59ms │     138.73ms │ +1.06x faster │
+│ QQuery 7     │     605.72ms │     631.23ms │     no change │
+│ QQuery 8     │     326.35ms │     372.12ms │  1.14x slower │
+│ QQuery 9     │     579.02ms │     634.73ms │  1.10x slower │
+│ QQuery 10    │     403.38ms │     420.39ms │     no change │
+│ QQuery 11    │     201.94ms │     212.12ms │  1.05x slower │
+│ QQuery 12    │     235.94ms │     254.58ms │  1.08x slower │
+│ QQuery 13    │     738.40ms │     789.67ms │  1.07x slower │
+│ QQuery 14    │     198.73ms │     206.96ms │     no change │
+│ QQuery 15    │     183.32ms │     179.53ms │     no change │
+│ QQuery 16    │     168.57ms │     186.43ms │  1.11x slower │
+│ QQuery 17    │    2032.57ms │    2108.12ms │     no change │
+│ QQuery 18    │    1912.80ms │    2134.82ms │  1.12x slower │
+│ QQuery 19    │     391.64ms │     368.53ms │ +1.06x faster │
+│ QQuery 20    │     648.22ms │     691.41ms │  1.07x slower │
+│ QQuery 21    │     866.25ms │    1020.37ms │  1.18x slower │
+│ QQuery 22    │     115.94ms │     117.27ms │     no change │
+└──────────────┴──────────────┴──────────────┴───────────────┘
+--------------------
+Benchmark tpch_mem.json
+--------------------
+┏━━━━━━━━━━━━━━┳━━━━━━━━━━━━━━┳━━━━━━━━━━━━━━┳━━━━━━━━━━━━━━━┓
+┃ Query        ┃         main ┃     mybranch ┃        Change ┃
+┡━━━━━━━━━━━━━━╇━━━━━━━━━━━━━━╇━━━━━━━━━━━━━━╇━━━━━━━━━━━━━━━┩
+│ QQuery 1     │    2182.44ms │    2390.39ms │  1.10x slower │
+│ QQuery 2     │     181.16ms │     153.94ms │ +1.18x faster │
+│ QQuery 3     │      98.89ms │      95.51ms │     no change │
+│ QQuery 4     │      61.43ms │      66.15ms │  1.08x slower │
+│ QQuery 5     │     260.20ms │     283.65ms │  1.09x slower │
+│ QQuery 6     │      24.24ms │      23.39ms │     no change │
+│ QQuery 7     │     545.87ms │     653.34ms │  1.20x slower │
+│ QQuery 8     │     147.48ms │     136.00ms │ +1.08x faster │
+│ QQuery 9     │     371.53ms │     363.61ms │     no change │
+│ QQuery 10    │     197.91ms │     190.37ms │     no change │
+│ QQuery 11    │     197.91ms │     183.70ms │ +1.08x faster │
+│ QQuery 12    │     100.32ms │     103.08ms │     no change │
+│ QQuery 13    │     428.02ms │     440.26ms │     no change │
+│ QQuery 14    │      38.50ms │      27.11ms │ +1.42x faster │
+│ QQuery 15    │     101.15ms │      63.25ms │ +1.60x faster │
+│ QQuery 16    │     171.15ms │     142.44ms │ +1.20x faster │
+│ QQuery 17    │    1885.05ms │    1953.58ms │     no change │
+│ QQuery 18    │    1549.92ms │    1914.06ms │  1.23x slower │
+│ QQuery 19    │     106.53ms │     104.28ms │     no change │
+│ QQuery 20    │     532.11ms │     610.62ms │  1.15x slower │
+│ QQuery 21    │     723.39ms │     823.34ms │  1.14x slower │
+│ QQuery 22    │      91.84ms │      89.89ms │     no change │
+└──────────────┴──────────────┴──────────────┴───────────────┘
+```
+
+
+# Benchmark Descriptions:
+
+## `tpch` Benchmark derived from TPC-H
+
+These benchmarks are derived from the [TPC-H][1] benchmark. And we use this repo as the source of tpch-gen and answers:
+https://github.com/databricks/tpch-dbgen.git, based on [2.17.1](https://www.tpc.org/tpc_documents_current_versions/pdf/tpc-h_v2.17.1.pdf) version of TPC-H.
+
 
-## Running the DataFusion Benchmarks
+### Running the DataFusion Benchmarks Manually
 
 The benchmark can then be run (assuming the data created from `dbgen` is in `./data`) with a command such as:
 
@@ -126,7 +236,7 @@ This will produce output like
 └──────────────┴──────────────┴──────────────┴───────────────┘
 ```
 
-## Expected output
+### Expected output
 
 The result of query 1 should produce the following output when executed against the SF=1 dataset.
 
diff --git a/benchmarks/bench.sh b/benchmarks/bench.sh
new file mode 100755
index 000000000000..c8c265e5d76a
--- /dev/null
+++ b/benchmarks/bench.sh
@@ -0,0 +1,293 @@
+#!/bin/bash
+# Licensed to the Apache Software Foundation (ASF) under one
+# or more contributor license agreements.  See the NOTICE file
+# distributed with this work for additional information
+# regarding copyright ownership.  The ASF licenses this file
+# to you under the Apache License, Version 2.0 (the
+# "License"); you may not use this file except in compliance
+# with the License.  You may obtain a copy of the License at
+#
+#   http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing,
+# software distributed under the License is distributed on an
+# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+# KIND, either express or implied.  See the License for the
+# specific language governing permissions and limitations
+# under the License.
+
+# This script is meant for developers of DataFusion -- it is runnable
+# from the standard DataFusion development environment and uses cargo,
+# etc.
+
+# Exit on error
+set -e
+
+# Absolute path of the directory that contains this script, see
+# https://stackoverflow.com/questions/59895/how-do-i-get-the-directory-where-a-bash-script-is-located-from-within-the-script
+SCRIPT_DIR=$( cd -- "$( dirname -- "${BASH_SOURCE[0]}" )" &> /dev/null && pwd )
+
+# Set Defaults (DATAFUSION_DIR, DATA_DIR and CARGO_COMMAND can be overridden from the environment)
+COMMAND=
+BENCHMARK=all
+DATAFUSION_DIR=${DATAFUSION_DIR:-$SCRIPT_DIR/..}
+DATA_DIR=${DATA_DIR:-$SCRIPT_DIR/data}
+#CARGO_COMMAND=${CARGO_COMMAND:-"cargo run --release"}
+CARGO_COMMAND=${CARGO_COMMAND:-"cargo run --profile release-nonlto"}  # TEMP: for faster iterations
+
+usage() {
+    # Prints the help text and exits with status 1; also used to reject bad arguments.
+    echo "
+DataFusion Benchmark script
+
+This script orchestrates running benchmarks for DataFusion
+
+Usage:
+$0 data [benchmark]
+$0 run [benchmark]
+$0 compare <branch1> <branch2>
+
+**********
+Examples:
+**********
+
+# Create the datasets for all benchmarks in $DATA_DIR
+./bench.sh data
+
+# Run the 'tpch' benchmark on the datafusion checkout in /source/arrow-datafusion
+DATAFUSION_DIR=/source/arrow-datafusion ./bench.sh run tpch
+
+**********
+* Commands
+**********
+
+data:         Generates data needed for benchmarking
+run:          Runs the named benchmark
+compare:      Compares results from benchmark runs
+
+**********
+* Benchmarks
+**********
+
+all(default): Data/Run/Compare for all benchmarks
+tpch:         TPCH inspired benchmark on Scale Factor (SF) 1 (~1GB), single parquet file per table
+tpch_mem:     TPCH inspired benchmark on Scale Factor (SF) 1 (~1GB), query from memory
+
+**********
+* Environment Variables
+**********
+
+The following environment variables control this script:
+
+DATA_DIR = directory to store datasets
+DATAFUSION_DIR = DataFusion checkout used by 'run' (default: parent directory of this script)
+RESULTS_DIR = directory where 'run' stores results (default: results/<branch name> next to this script)
+CARGO_COMMAND = command that runs the benchmark binary
+"
+    exit 1
+}
+
+# Split the command line into options and positional arguments, following
+# https://stackoverflow.com/questions/192249/how-do-i-parse-command-line-arguments-in-bash
+POSITIONAL_ARGS=()
+while [[ $# -gt 0 ]]; do
+    case "$1" in
+        # Template for a future option that takes a value:
+        # -e|--extension)
+        #   EXTENSION="$2"
+        #   shift # past argument
+        #   shift # past value
+        #   ;;
+        -h|--help)
+            usage # prints the help text and exits, so no shift is needed
+            ;;
+        -*) # also matches --long options, so a separate --* pattern is redundant
+            echo "Unknown option $1" >&2
+            exit 1
+            ;;
+        *)
+            POSITIONAL_ARGS+=("$1") # save positional arg
+            shift # past argument
+            ;;
+    esac
+done
+
+set -- "${POSITIONAL_ARGS[@]}" # restore positional parameters
+COMMAND=${1:-"${COMMAND}"}     # first positional: data | run | compare
+ARG2=$2                        # benchmark name, or <branch1> for compare
+ARG3=$3                        # <branch2> for compare
+
+# Does what was requested on the command line. Takes no arguments; it reads the
+# globals COMMAND, ARG2 and ARG3 that were filled in by the argument parsing:
+#   data    -> generate the datasets for BENCHMARK (ARG2, default: all)
+#   run     -> run BENCHMARK and store the results below RESULTS_DIR
+#   compare -> compare the stored results of two branches
+# Any other command prints an error followed by the usage text and exits.
+main() {
+    # Command Dispatch
+    case "$COMMAND" in
+        data)
+            BENCHMARK=${ARG2:-"${BENCHMARK}"}
+            echo "***************************"
+            echo "DataFusion Benchmark Data Generation"
+            echo "COMMAND: ${COMMAND}"
+            echo "BENCHMARK: ${BENCHMARK}"
+            echo "DATA_DIR: ${DATA_DIR}"
+            echo "CARGO_COMMAND: ${CARGO_COMMAND}"
+            echo "***************************"
+            case "$BENCHMARK" in
+                all|tpch|tpch_mem)
+                    # every benchmark currently uses the same tpch data
+                    data_tpch
+                    ;;
+                *)
+                    echo "Error: unknown benchmark '$BENCHMARK' for data generation"
+                    usage
+                    ;;
+            esac
+            ;;
+        run)
+            # Parse positional parameters
+            BENCHMARK=${ARG2:-"${BENCHMARK}"}
+            # Results are stored per branch of the checkout in DATAFUSION_DIR
+            BRANCH_NAME=$(cd "${DATAFUSION_DIR}" && git rev-parse --abbrev-ref HEAD)
+            BRANCH_NAME=${BRANCH_NAME//\//_} # replace every / with _ so the name is a single path component
+            RESULTS_DIR=${RESULTS_DIR:-"$SCRIPT_DIR/results/$BRANCH_NAME"}
+
+            echo "***************************"
+            echo "DataFusion Benchmark Script"
+            echo "COMMAND: ${COMMAND}"
+            echo "BENCHMARK: ${BENCHMARK}"
+            echo "DATAFUSION_DIR: ${DATAFUSION_DIR}"
+            echo "BRANCH_NAME: ${BRANCH_NAME}"
+            echo "DATA_DIR: ${DATA_DIR}"
+            echo "RESULTS_DIR: ${RESULTS_DIR}"
+            echo "CARGO_COMMAND: ${CARGO_COMMAND}"
+            echo "***************************"
+
+            # navigate to the appropriate directory
+            pushd "${DATAFUSION_DIR}/benchmarks" > /dev/null
+            mkdir -p "${RESULTS_DIR}"
+            case "$BENCHMARK" in
+                all)
+                    run_tpch
+                    run_tpch_mem
+                    ;;
+                tpch)
+                    run_tpch
+                    ;;
+                tpch_mem)
+                    run_tpch_mem
+                    ;;
+                *)
+                    echo "Error: unknown benchmark '$BENCHMARK' for run"
+                    usage
+                    ;;
+            esac
+            popd > /dev/null
+            echo "Done"
+            ;;
+        compare)
+            # compare_benchmarks reads <branch1> and <branch2> from the globals
+            # ARG2 and ARG3, so nothing needs to be passed here
+            compare_benchmarks
+            ;;
+        *)
+            echo "Error: unknown command: $COMMAND"
+            usage
+            ;;
+    esac
+}
+
+
+
+# Creates the TPCH dataset in DATA_DIR. Each step is skipped when its output
+# already exists: the 'tbl' files, the expected answers, the parquet conversion.
+data_tpch() {
+    local scale_factor=1
+    local tbl_file="${DATA_DIR}/supplier.tbl"
+    local answer_file="${DATA_DIR}/answers/q1.out"
+    local parquet_dir="${DATA_DIR}/supplier"
+
+    echo "Creating tpch dataset..."
+    mkdir -p "${DATA_DIR}" # ensure the target data directory exists
+
+    # Create 'tbl' (CSV format) data into $DATA_DIR if it does not already exist
+    if [[ -f "${tbl_file}" ]]; then
+        echo " tbl files exist ($tbl_file exists)."
+    else
+        echo " creating tbl files with tpch_dbgen..."
+        docker run -v "${DATA_DIR}":/data -it --rm ghcr.io/databloom-ai/tpch-docker:main -vf -s "${scale_factor}"
+    fi
+
+    # Copy expected answers into the ./data/answers directory if it does not already exist
+    if [[ -f "${answer_file}" ]]; then
+        echo " Expected answers exist (${answer_file} exists)."
+    else
+        echo " Copying answers to ${DATA_DIR}/answers"
+        mkdir -p "${DATA_DIR}/answers"
+        docker run -v "${DATA_DIR}":/data -it --entrypoint /bin/bash --rm ghcr.io/databloom-ai/tpch-docker:main -c "cp -f /opt/tpch/2.18.0_rc2/dbgen/answers/* /data/answers/"
+    fi
+
+    # Create 'parquet' files from tbl ($CARGO_COMMAND is unquoted on purpose: it holds a command plus arguments)
+    if [[ -d "${parquet_dir}" ]]; then
+        echo " parquet files exist ($parquet_dir exists)."
+    else
+        echo " creating parquet files using benchmark binary ..."
+        pushd "${SCRIPT_DIR}" > /dev/null
+        $CARGO_COMMAND --bin tpch -- convert --input "${DATA_DIR}" --output "${DATA_DIR}" --format parquet
+        popd > /dev/null
+    fi
+}
+
+# Runs the tpch benchmark on the parquet data in DATA_DIR, passing ${RESULTS_DIR}/tpch.json as the -o path
+run_tpch() {
+    local results_file="${RESULTS_DIR}/tpch.json"
+    echo "RESULTS_FILE: ${results_file}"
+    echo "Running tpch benchmark..."
+    $CARGO_COMMAND --bin tpch -- benchmark datafusion --iterations 5 --path "${DATA_DIR}" --format parquet -o "${results_file}"
+}
+
+# Runs the tpch benchmark in memory, passing ${RESULTS_DIR}/tpch_mem.json as the -o path
+run_tpch_mem() {
+    local results_file="${RESULTS_DIR}/tpch_mem.json"
+    echo "RESULTS_FILE: ${results_file}"
+    echo "Running tpch_mem benchmark..."
+    # -m means in memory
+    $CARGO_COMMAND --bin tpch -- benchmark datafusion --iterations 5 --path "${DATA_DIR}" -m --format parquet -o "${results_file}"
+}
+
+compare_benchmarks() {
+    # Compares the stored results of the branches named by ARG2 and ARG3: each file
+    # in results/<branch1> that also exists in results/<branch2> goes to compare.py.
+    # 'run' stores results with / replaced by _ in the branch name, so map it here too.
+    local base_results_dir="${SCRIPT_DIR}/results" results_file1 results_file2
+    local branch1="${ARG2//\//_}" branch2="${ARG3//\//_}"
+    if [[ -z "$branch1" ]]; then
+        echo "<branch1> not specified. Available branches:"
+        ls -1 "${base_results_dir}"
+        exit 1
+    fi
+    if [[ -z "$branch2" ]]; then
+        echo "<branch2> not specified. Available branches:"
+        ls -1 "${base_results_dir}"
+        exit 1
+    fi
+    [[ -d "${base_results_dir}/${branch1}" ]] || { echo "No results found for ${branch1} in ${base_results_dir}"; exit 1; }
+    echo "Comparing ${branch1} and ${branch2}"
+    for results_file1 in "${base_results_dir}/${branch1}"/*; do
+        [[ -e "${results_file1}" ]] || continue # the glob matched nothing
+        results_file2="${base_results_dir}/${branch2}/${results_file1##*/}"
+        if [[ -f "${results_file2}" ]]; then
+            echo "--------------------"
+            echo "Benchmark ${results_file1##*/}"
+            echo "--------------------"
+            python3 "${SCRIPT_DIR}"/compare.py "${results_file1}" "${results_file2}"
+        else
+            echo "Note: Skipping ${results_file1} as ${results_file2} does not exist"
+        fi
+    done
+}
+
+# All functions are defined above, so start the process: dispatch the requested command
+main
diff --git a/benchmarks/tpch-gen.sh b/benchmarks/tpch-gen.sh
deleted file mode 100755
index 90230e666e4c..000000000000
--- a/benchmarks/tpch-gen.sh
+++ /dev/null
@@ -1,42 +0,0 @@
-#!/bin/bash
-# Licensed to the Apache Software Foundation (ASF) under one
-# or more contributor license agreements.  See the NOTICE file
-# distributed with this work for additional information
-# regarding copyright ownership.  The ASF licenses this file
-# to you under the Apache License, Version 2.0 (the
-# "License"); you may not use this file except in compliance
-# with the License.  You may obtain a copy of the License at
-#
-#   http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing,
-# software distributed under the License is distributed on an
-# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
-# KIND, either express or implied.  See the License for the
-# specific language governing permissions and limitations
-# under the License.
-
-mkdir -p data/answers 2>/dev/null
-
-set -e
-
-pushd ..
-. ./dev/build-set-env.sh
-popd
-
-# Generate data into the ./data directory if it does not already exist
-FILE=./data/supplier.tbl
-if test -f "$FILE"; then
-    echo "$FILE exists."
-else
-  docker run -v `pwd`/data:/data -it --rm ghcr.io/databloom-ai/tpch-docker:main -vf -s $1
-  ls -l data
-fi
-
-# Copy expected answers (at SF=1) into the ./data/answers directory if it does not already exist
-FILE=./data/answers/q1.out
-if test -f "$FILE"; then
-    echo "$FILE exists."
-else
-  docker run -v `pwd`/data:/data -it --entrypoint /bin/bash --rm ghcr.io/databloom-ai/tpch-docker:main -c "cp /opt/tpch/2.18.0_rc2/dbgen/answers/* /data/answers/"
-fi
\ No newline at end of file