tpcds benchmark runner: add orc format support (#639)
Co-authored-by: zhangleisx3662 <zhangleiSX3662@autohome.com.cn>
leizhang5s and zhangleisx3662 authored Oct 30, 2024
1 parent 566b197 commit ee4e2ec
Showing 3 changed files with 15 additions and 3 deletions.
1 change: 1 addition & 0 deletions tpcds/README.md
@@ -37,6 +37,7 @@ If benchmarking with Blaze, ensure that the Blaze jar package is correctly built
# use correct SPARK_HOME and data location
SPARK_HOME=$HOME/software/spark ./bin/run \
--data-location /user/hive/data/tpcds-1000 \
--format parquet \
--output-dir ./benchmark-result
```

@@ -24,6 +24,7 @@ class TPCDSBenchmarkArgs(val args: Array[String]) {
var outputDir: String = _
var queryFilter: Set[String] = Set.empty
var round: Int = 2
var format: String = "parquet"

parseArgs(args.toList)
validateArguments()
@@ -53,6 +54,10 @@ class TPCDSBenchmarkArgs(val args: Array[String]) {
round = value.toInt
args = tail

case optName :: value :: tail if optionMatch("--format", optName) =>
format = value
args = tail

case _ =>
System.err.println("Unknown/unsupported param " + args)
printUsageAndExit(1)
@@ -68,6 +73,7 @@ class TPCDSBenchmarkArgs(val args: Array[String]) {
| --output-dir Output directory for results
| --query-filter Queries to filter, e.g., q3,q5,q13
| --round Run each query for a specified number of rounds, default: 2
| --format Data format, e.g. orc, parquet. Default: parquet
| """.stripMargin)
System.exit(exitCode)
}
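For reference, a minimal sketch (not part of the commit) of how the new flag is consumed, assuming `TPCDSBenchmarkArgs` is constructed directly from the CLI arguments shown in the README:

```scala
// Hypothetical driver snippet: build the args parser the way the runner would.
// When --format is omitted, `format` keeps its default value "parquet".
val benchmarkArgs = new TPCDSBenchmarkArgs(Array(
  "--data-location", "/user/hive/data/tpcds-1000",
  "--output-dir", "./benchmark-result",
  "--format", "orc"))
assert(benchmarkArgs.format == "orc")
```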
@@ -25,8 +25,7 @@ import java.util.Date

import org.apache.commons.io.IOUtils
import org.apache.spark.SparkConf
import org.apache.spark.sql.SparkSession
import org.apache.spark.sql.Row
import org.apache.spark.sql.{DataFrame, Row, SparkSession}

object TPCDSBenchmarkRunner {
def main(args: Array[String]): Unit = {
@@ -79,7 +78,13 @@ object TPCDSBenchmarkRunner {
"web_returns", "web_site", "reason", "call_center", "warehouse", "ship_mode", "income_band",
"time_dim", "web_page")
tables.par.foreach { tableName =>
spark.read.parquet(s"$dataLocation/$tableName").createOrReplaceTempView(tableName)
val df: DataFrame = benchmarkArgs.format match {
case "orc" => spark.read.orc(s"$dataLocation/$tableName")
case "parquet" => spark.read.parquet(s"$dataLocation/$tableName")
case _ => throw new RuntimeException(
s"Unknown format , avaliable formats: orc,parquet, current input: ${benchmarkArgs.format}")
}
df.createOrReplaceTempView(tableName)
tableName -> spark.table(tableName).count()
}

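The same dispatch could also be expressed through Spark's generic `DataFrameReader`, which avoids a per-format match arm. A sketch under the assumption that only the source name differs between the two formats (this is not the committed code; `readTable` is a hypothetical helper):

```scala
import org.apache.spark.sql.{DataFrame, SparkSession}

// Resolve the table path once and let DataFrameReader.format() pick the
// source ("orc" or "parquet"); unsupported formats fail fast with the same message.
def readTable(spark: SparkSession, format: String,
              dataLocation: String, tableName: String): DataFrame = {
  require(Set("orc", "parquet").contains(format),
    s"Unknown format, available formats: orc, parquet, current input: $format")
  spark.read.format(format).load(s"$dataLocation/$tableName")
}
```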
