diff --git a/benchmarks/.gitignore b/benchmarks/.gitignore
index 6320cd248dd8..2c574ff30d12 100644
--- a/benchmarks/.gitignore
+++ b/benchmarks/.gitignore
@@ -1 +1,2 @@
-data
\ No newline at end of file
+data
+results
\ No newline at end of file
diff --git a/benchmarks/README.md b/benchmarks/README.md
index d397def8f8e2..cf8a20a823f5 100644
--- a/benchmarks/README.md
+++ b/benchmarks/README.md
@@ -19,29 +19,139 @@
 # DataFusion Benchmarks
 
-This crate contains benchmarks based on popular public data sets and open source benchmark suites, making it easy to
-run real-world benchmarks to help with performance and scalability testing and for comparing performance with other Arrow
-implementations as well as other query engines.
+This crate contains benchmarks based on popular public data sets and
+open source benchmark suites, making it easy to run more realistic
+benchmarks to help with performance and scalability testing of DataFusion.
 
-## Benchmark derived from TPC-H
+# Benchmarks Against Other Engines
 
-These benchmarks are derived from the [TPC-H][1] benchmark. And we use this repo as the source of tpch-gen and answers:
-https://github.com/databricks/tpch-dbgen.git, based on [2.17.1](https://www.tpc.org/tpc_documents_current_versions/pdf/tpc-h_v2.17.1.pdf) version of TPC-H.
+DataFusion is included in the setups of several popular benchmarks
+that compare its performance with other engines. For example:
 
-## Generating Test Data
+* [ClickBench] scripts are in the [ClickBench repo](https://github.com/ClickHouse/ClickBench/tree/main/datafusion)
+* [H2o.ai `db-benchmark`] scripts are in the [db-benchmark](db-benchmark) directory
 
-TPC-H data can be generated using the `tpch-gen.sh` script, which creates a Docker image containing the TPC-DS data
-generator.
+[ClickBench]: https://github.com/ClickHouse/ClickBench/tree/main
+[H2o.ai `db-benchmark`]: https://github.com/h2oai/db-benchmark
 
-```bash
-# scale_factor: scale of the database population. scale 1.0 represents ~1 GB of data
-./tpch-gen.sh
+# Running the benchmarks
+
+## Running Benchmarks
+
+The easiest way to run benchmarks from a DataFusion source checkout is
+to use the [bench.sh](bench.sh) script. Usage instructions can be
+found with:
+
+```shell
+# show usage
+./bench.sh
+```
+
+## Generating Data
+
+You can create the data for all of these benchmarks using the [bench.sh](bench.sh) script:
+
+```shell
+./bench.sh data
+```
+
+Data is generated in the `data` subdirectory and will not be checked
+in because this directory has been added to the `.gitignore` file.
+
+## Example: comparing performance of `main` and a branch
+
+```shell
+git checkout main
+
+# Create the data
+./benchmarks/bench.sh data
+
+# Gather baseline data for the tpch benchmark
+./benchmarks/bench.sh run tpch
+
+# Switch to the branch named mybranch and gather its data
+git checkout mybranch
+./benchmarks/bench.sh run tpch
+
+# Compare the results of the two branches:
+./benchmarks/bench.sh compare main mybranch
 ```
-
-Data will be generated into the `data` subdirectory and will not be checked in because this directory has been added
-to the `.gitignore` file.
+This produces results like:
+
+```shell
+Comparing main and mybranch
+--------------------
+Benchmark tpch.json
+--------------------
+┏━━━━━━━━━━━━━━┳━━━━━━━━━━━━━━┳━━━━━━━━━━━━━━┳━━━━━━━━━━━━━━━┓
+┃ Query        ┃         main ┃     mybranch ┃        Change ┃
+┡━━━━━━━━━━━━━━╇━━━━━━━━━━━━━━╇━━━━━━━━━━━━━━╇━━━━━━━━━━━━━━━┩
+│ QQuery 1     │    2520.52ms │    2795.09ms │  1.11x slower │
+│ QQuery 2     │     222.37ms │     216.01ms │     no change │
+│ QQuery 3     │     248.41ms │     239.07ms │     no change │
+│ QQuery 4     │     144.01ms │     129.28ms │ +1.11x faster │
+│ QQuery 5     │     339.54ms │     327.53ms │     no change │
+│ QQuery 6     │     147.59ms │     138.73ms │ +1.06x faster │
+│ QQuery 7     │     605.72ms │     631.23ms │     no change │
+│ QQuery 8     │     326.35ms │     372.12ms │  1.14x slower │
+│ QQuery 9     │     579.02ms │     634.73ms │  1.10x slower │
+│ QQuery 10    │     403.38ms │     420.39ms │     no change │
+│ QQuery 11    │     201.94ms │     212.12ms │  1.05x slower │
+│ QQuery 12    │     235.94ms │     254.58ms │  1.08x slower │
+│ QQuery 13    │     738.40ms │     789.67ms │  1.07x slower │
+│ QQuery 14    │     198.73ms │     206.96ms │     no change │
+│ QQuery 15    │     183.32ms │     179.53ms │     no change │
+│ QQuery 16    │     168.57ms │     186.43ms │  1.11x slower │
+│ QQuery 17    │    2032.57ms │    2108.12ms │     no change │
+│ QQuery 18    │    1912.80ms │    2134.82ms │  1.12x slower │
+│ QQuery 19    │     391.64ms │     368.53ms │ +1.06x faster │
+│ QQuery 20    │     648.22ms │     691.41ms │  1.07x slower │
+│ QQuery 21    │     866.25ms │    1020.37ms │  1.18x slower │
+│ QQuery 22    │     115.94ms │     117.27ms │     no change │
+└──────────────┴──────────────┴──────────────┴───────────────┘
+--------------------
+Benchmark tpch_mem.json
+--------------------
+┏━━━━━━━━━━━━━━┳━━━━━━━━━━━━━━┳━━━━━━━━━━━━━━┳━━━━━━━━━━━━━━━┓
+┃ Query        ┃         main ┃     mybranch ┃        Change ┃
+┡━━━━━━━━━━━━━━╇━━━━━━━━━━━━━━╇━━━━━━━━━━━━━━╇━━━━━━━━━━━━━━━┩
+│ QQuery 1     │    2182.44ms │    2390.39ms │  1.10x slower │
+│ QQuery 2     │     181.16ms │     153.94ms │ +1.18x faster │
+│ QQuery 3     │      98.89ms │      95.51ms │     no change │
+│ QQuery 4     │      61.43ms │      66.15ms │  1.08x slower │
+│ QQuery 5     │     260.20ms │     283.65ms │  1.09x slower │
+│ QQuery 6     │      24.24ms │      23.39ms │     no change │
+│ QQuery 7     │     545.87ms │     653.34ms │  1.20x slower │
+│ QQuery 8     │     147.48ms │     136.00ms │ +1.08x faster │
+│ QQuery 9     │     371.53ms │     363.61ms │     no change │
+│ QQuery 10    │     197.91ms │     190.37ms │     no change │
+│ QQuery 11    │     197.91ms │     183.70ms │ +1.08x faster │
+│ QQuery 12    │     100.32ms │     103.08ms │     no change │
+│ QQuery 13    │     428.02ms │     440.26ms │     no change │
+│ QQuery 14    │      38.50ms │      27.11ms │ +1.42x faster │
+│ QQuery 15    │     101.15ms │      63.25ms │ +1.60x faster │
+│ QQuery 16    │     171.15ms │     142.44ms │ +1.20x faster │
+│ QQuery 17    │    1885.05ms │    1953.58ms │     no change │
+│ QQuery 18    │    1549.92ms │    1914.06ms │  1.23x slower │
+│ QQuery 19    │     106.53ms │     104.28ms │     no change │
+│ QQuery 20    │     532.11ms │     610.62ms │  1.15x slower │
+│ QQuery 21    │     723.39ms │     823.34ms │  1.14x slower │
+│ QQuery 22    │      91.84ms │      89.89ms │     no change │
+└──────────────┴──────────────┴──────────────┴───────────────┘
+```
+
+# Benchmark Descriptions
+
+## `tpch` Benchmark derived from TPC-H
+
+These benchmarks are derived from the [TPC-H][1] benchmark, using
+https://github.com/databricks/tpch-dbgen.git as the source of the data
+generator (`dbgen`) and the expected answers, based on version
+[2.17.1](https://www.tpc.org/tpc_documents_current_versions/pdf/tpc-h_v2.17.1.pdf) of TPC-H.
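+
+For example, to generate the TPC-H data and then run only the `tpch` and
+`tpch_mem` benchmarks with `bench.sh` (a sketch of typical usage; run
+`./bench.sh` with no arguments for the full set of options), you might use:
+
+```shell
+# create the SF=1 TPC-H dataset in ./data
+./bench.sh data tpch
+
+# run the parquet-backed and in-memory variants; results are written to
+# ./results/<branch name>/tpch.json and ./results/<branch name>/tpch_mem.json
+./bench.sh run tpch
+./bench.sh run tpch_mem
+```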
+
 
-## Running the DataFusion Benchmarks
+### Running the DataFusion Benchmarks Manually
 
 The benchmark can then be run (assuming the data created from `dbgen` is in `./data`) with a command such as:
 
@@ -126,7 +236,7 @@ This will produce output like
 └──────────────┴──────────────┴──────────────┴───────────────┘
 ```
 
-## Expected output
+### Expected output
 
 The result of query 1 should produce the following output when executed against the SF=1 dataset.
 
diff --git a/benchmarks/bench.sh b/benchmarks/bench.sh
new file mode 100755
index 000000000000..24286014a06e
--- /dev/null
+++ b/benchmarks/bench.sh
@@ -0,0 +1,283 @@
+#!/bin/bash
+# Licensed to the Apache Software Foundation (ASF) under one
+# or more contributor license agreements.  See the NOTICE file
+# distributed with this work for additional information
+# regarding copyright ownership.  The ASF licenses this file
+# to you under the Apache License, Version 2.0 (the
+# "License"); you may not use this file except in compliance
+# with the License.  You may obtain a copy of the License at
+#
+#   http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing,
+# software distributed under the License is distributed on an
+# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+# KIND, either express or implied.  See the License for the
+# specific language governing permissions and limitations
+# under the License.
+
+# This script is meant for developers of DataFusion -- it is runnable
+# from the standard DataFusion development environment and uses cargo,
+# etc.
+
+# Exit on error
+set -e
+
+# https://stackoverflow.com/questions/59895/how-do-i-get-the-directory-where-a-bash-script-is-located-from-within-the-script
+SCRIPT_DIR=$( cd -- "$( dirname -- "${BASH_SOURCE[0]}" )" &> /dev/null && pwd )
+
+# Set Defaults
+COMMAND=
+BENCHMARK=all
+DATAFUSION_DIR=${DATAFUSION_DIR:-$SCRIPT_DIR/..}
+DATA_DIR=${DATA_DIR:-$SCRIPT_DIR/data}
+#CARGO_COMMAND=${CARGO_COMMAND:-"cargo run --release"}
+CARGO_COMMAND=${CARGO_COMMAND:-"cargo run --profile release-nonlto"} # TEMP: for faster iterations
+
+usage() {
+    echo "
+Orchestrates running benchmarks against DataFusion checkouts
+
+Usage:
+$0 data [benchmark]
+$0 run [benchmark]
+$0 compare <branch1> <branch2>
+
+**********
+Examples:
+**********
+# Create the datasets for all benchmarks in $DATA_DIR
+./bench.sh data
+
+# Run the 'tpch' benchmark on the datafusion checkout in /source/arrow-datafusion
+DATAFUSION_DIR=/source/arrow-datafusion ./bench.sh run tpch
+
+**********
+* Commands
+**********
+data:         Generates data needed for benchmarking
+run:          Runs the named benchmark
+compare:      Compares results from benchmark runs
+
+**********
+* Benchmarks
+**********
+all(default): Data/Run/Compare for all benchmarks
+tpch:         TPCH inspired benchmark on Scale Factor (SF) 1 (~1GB), single parquet file per table
+tpch_mem:     TPCH inspired benchmark on Scale Factor (SF) 1 (~1GB), query from memory
+
+**********
+* Supported Configuration (Environment Variables)
+**********
+DATA_DIR        directory to store datasets
+CARGO_COMMAND   command that runs the benchmark binary
+DATAFUSION_DIR  directory to use (default $DATAFUSION_DIR)
+"
+    exit 1
+}
+
+# https://stackoverflow.com/questions/192249/how-do-i-parse-command-line-arguments-in-bash
+POSITIONAL_ARGS=()
+
+while [[ $# -gt 0 ]]; do
+    case $1 in
+        # -e|--extension)
+        #     EXTENSION="$2"
+        #     shift # past argument
+        #     shift # past value
+        #     ;;
+        -h|--help)
+            shift # past argument
+            usage
+            ;;
+        -*|--*)
+            echo "Unknown option $1"
+            exit 1
+            ;;
+        *)
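+            # anything that is not a recognized flag is collected as a
+            # positional argument: the command, then a benchmark name or
+            # branch names, depending on the command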
+            POSITIONAL_ARGS+=("$1") # save positional arg
+            shift # past argument
+            ;;
+    esac
+done
+
+set -- "${POSITIONAL_ARGS[@]}" # restore positional parameters
+COMMAND=${1:-"${COMMAND}"}
+ARG2=$2
+ARG3=$3
+
+# Do what is requested
+main() {
+    # Command Dispatch
+    case "$COMMAND" in
+        data)
+            BENCHMARK=${ARG2:-"${BENCHMARK}"}
+            echo "***************************"
+            echo "DataFusion Benchmark Data Generation"
+            echo "COMMAND: ${COMMAND}"
+            echo "BENCHMARK: ${BENCHMARK}"
+            echo "DATA_DIR: ${DATA_DIR}"
+            echo "CARGO_COMMAND: ${CARGO_COMMAND}"
+            echo "***************************"
+            case "$BENCHMARK" in
+                all)
+                    data_tpch
+                    ;;
+                tpch)
+                    data_tpch
+                    ;;
+                tpch_mem)
+                    # tpch_mem uses the same data as tpch
+                    data_tpch
+                    ;;
+                *)
+                    echo "Error: unknown benchmark '$BENCHMARK' for data generation"
+                    usage
+                    ;;
+            esac
+            ;;
+        run)
+            # Parse positional parameters
+            BENCHMARK=${ARG2:-"${BENCHMARK}"}
+            BRANCH_NAME=$(cd ${DATAFUSION_DIR} && git rev-parse --abbrev-ref HEAD)
+            BRANCH_NAME=${BRANCH_NAME//\//_} # replace '/' with '_' so the branch name can be used as a directory name
+            RESULTS_DIR=${RESULTS_DIR:-"$SCRIPT_DIR/results/$BRANCH_NAME"}
+
+            echo "***************************"
+            echo "DataFusion Benchmark Script"
+            echo "COMMAND: ${COMMAND}"
+            echo "BENCHMARK: ${BENCHMARK}"
+            echo "DATAFUSION_DIR: ${DATAFUSION_DIR}"
+            echo "BRANCH_NAME: ${BRANCH_NAME}"
+            echo "DATA_DIR: ${DATA_DIR}"
+            echo "RESULTS_DIR: ${RESULTS_DIR}"
+            echo "CARGO_COMMAND: ${CARGO_COMMAND}"
+            echo "***************************"
+
+            # navigate to the appropriate directory
+            pushd "${DATAFUSION_DIR}/benchmarks" > /dev/null
+            mkdir -p "${RESULTS_DIR}"
+            case "$BENCHMARK" in
+                all)
+                    run_tpch
+                    run_tpch_mem
+                    ;;
+                tpch)
+                    run_tpch
+                    ;;
+                tpch_mem)
+                    run_tpch_mem
+                    ;;
+                *)
+                    echo "Error: unknown benchmark '$BENCHMARK' for run"
+                    usage
+                    ;;
+            esac
+            popd > /dev/null
+            echo "Done"
+            ;;
+        compare)
+            BRANCH1="$ARG2"
+            BRANCH2="$ARG3"
+            compare_benchmarks
+            ;;
+        *)
+            echo "Error: unknown command: $COMMAND"
+            usage
+            ;;
+    esac
+}
+
+# Creates TPCH data if it doesn't already exist
+data_tpch() {
+    echo "Creating tpch dataset..."
+
+    # Ensure the target data directory exists
+    mkdir -p "${DATA_DIR}"
+
+    # Create 'tbl' (CSV format) data into $DATA_DIR if it does not already exist
+    SCALE_FACTOR=1
+    FILE="${DATA_DIR}/supplier.tbl"
+    if test -f "${FILE}"; then
+        echo " tbl files exist ($FILE exists)."
+    else
+        echo " creating tbl files with tpch_dbgen..."
+        docker run -v "${DATA_DIR}":/data -it --rm ghcr.io/databloom-ai/tpch-docker:main -vf -s ${SCALE_FACTOR}
+    fi
+
+    # Copy expected answers into the ./data/answers directory if it does not already exist
+    FILE="${DATA_DIR}/answers/q1.out"
+    if test -f "${FILE}"; then
+        echo " Expected answers exist (${FILE} exists)."
+    else
+        echo " Copying answers to ${DATA_DIR}/answers"
+        mkdir -p "${DATA_DIR}/answers"
+        docker run -v "${DATA_DIR}":/data -it --entrypoint /bin/bash --rm ghcr.io/databloom-ai/tpch-docker:main -c "cp -f /opt/tpch/2.18.0_rc2/dbgen/answers/* /data/answers/"
+    fi
+
+    # Create 'parquet' files from tbl
+    FILE="${DATA_DIR}/supplier"
+    if test -d "${FILE}"; then
+        echo " parquet files exist ($FILE exists)."
+    else
+        echo " creating parquet files using benchmark binary ..."
+        pushd "${SCRIPT_DIR}" > /dev/null
+        $CARGO_COMMAND --bin tpch -- convert --input "${DATA_DIR}" --output "${DATA_DIR}" --format parquet
+        popd > /dev/null
+    fi
+}
+
+# Runs the tpch benchmark
+run_tpch() {
+    RESULTS_FILE="${RESULTS_DIR}/tpch.json"
+    echo "RESULTS_FILE: ${RESULTS_FILE}"
+    echo "Running tpch benchmark..."
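+    # write machine-readable timings to RESULTS_FILE (-o below) so runs
+    # from different branches can later be compared with compare.py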
+    $CARGO_COMMAND --bin tpch -- benchmark datafusion --iterations 5 --path "${DATA_DIR}" --format parquet -o ${RESULTS_FILE}
+}
+
+# Runs the tpch benchmark with the data loaded into memory
+run_tpch_mem() {
+    RESULTS_FILE="${RESULTS_DIR}/tpch_mem.json"
+    echo "RESULTS_FILE: ${RESULTS_FILE}"
+    echo "Running tpch_mem benchmark..."
+    # -m means in memory
+    $CARGO_COMMAND --bin tpch -- benchmark datafusion --iterations 5 --path "${DATA_DIR}" -m --format parquet -o ${RESULTS_FILE}
+}
+
+# Compares the results of two benchmark runs stored in results/<branch>
+compare_benchmarks() {
+    BASE_RESULTS_DIR="${SCRIPT_DIR}/results"
+    BRANCH1="${ARG2}"
+    BRANCH2="${ARG3}"
+    if [ -z "$BRANCH1" ] ; then
+        echo "Error: <branch1> not specified. Available branches:"
+        ls -1 "${BASE_RESULTS_DIR}"
+        exit 1
+    fi
+
+    if [ -z "$BRANCH2" ] ; then
+        echo "Error: <branch2> not specified. Available branches:"
+        ls -1 "${BASE_RESULTS_DIR}"
+        exit 1
+    fi
+
+    echo "Comparing ${BRANCH1} and ${BRANCH2}"
+    for bench in `ls ${BASE_RESULTS_DIR}/${BRANCH1}` ; do
+        RESULTS_FILE1="${BASE_RESULTS_DIR}/${BRANCH1}/${bench}"
+        RESULTS_FILE2="${BASE_RESULTS_DIR}/${BRANCH2}/${bench}"
+        if test -f "${RESULTS_FILE2}" ; then
+            echo "--------------------"
+            echo "Benchmark ${bench}"
+            echo "--------------------"
+            python3 "${SCRIPT_DIR}"/compare.py "${RESULTS_FILE1}" "${RESULTS_FILE2}"
+        else
+            echo "Note: Skipping ${RESULTS_FILE1} as ${RESULTS_FILE2} does not exist"
+        fi
+    done
+}
+
+# And start the process up
+main
diff --git a/benchmarks/tpch-gen.sh b/benchmarks/tpch-gen.sh
deleted file mode 100755
index 90230e666e4c..000000000000
--- a/benchmarks/tpch-gen.sh
+++ /dev/null
@@ -1,42 +0,0 @@
-#!/bin/bash
-# Licensed to the Apache Software Foundation (ASF) under one
-# or more contributor license agreements.  See the NOTICE file
-# distributed with this work for additional information
-# regarding copyright ownership.  The ASF licenses this file
-# to you under the Apache License, Version 2.0 (the
-# "License"); you may not use this file except in compliance
-# with the License.  You may obtain a copy of the License at
-#
-#   http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing,
-# software distributed under the License is distributed on an
-# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
-# KIND, either express or implied.  See the License for the
-# specific language governing permissions and limitations
-# under the License.
-
-mkdir -p data/answers 2>/dev/null
-
-set -e
-
-pushd ..
-. ./dev/build-set-env.sh
-popd
-
-# Generate data into the ./data directory if it does not already exist
-FILE=./data/supplier.tbl
-if test -f "$FILE"; then
-  echo "$FILE exists."
-else
-  docker run -v `pwd`/data:/data -it --rm ghcr.io/databloom-ai/tpch-docker:main -vf -s $1
-  ls -l data
-fi
-
-# Copy expected answers (at SF=1) into the ./data/answers directory if it does not already exist
-FILE=./data/answers/q1.out
-if test -f "$FILE"; then
-  echo "$FILE exists."
-else
-  docker run -v `pwd`/data:/data -it --entrypoint /bin/bash --rm ghcr.io/databloom-ai/tpch-docker:main -c "cp /opt/tpch/2.18.0_rc2/dbgen/answers/* /data/answers/"
-fi
\ No newline at end of file