From 408f574fd17730362dac0a6b966d4470df71cc1c Mon Sep 17 00:00:00 2001 From: Zongheng Yang Date: Fri, 6 Jun 2014 13:51:06 -0700 Subject: [PATCH 1/6] SPARK-1704: Add CommandStrategy and ExplainCommandPhysical. --- .../org/apache/spark/sql/SQLContext.scala | 6 ++ .../spark/sql/execution/SparkStrategies.scala | 11 +++ .../apache/spark/sql/execution/commands.scala | 68 +++++++++++++++++++ .../apache/spark/sql/hive/HiveContext.scala | 3 +- .../sql/hive/execution/HiveQuerySuite.scala | 1 + 5 files changed, 88 insertions(+), 1 deletion(-) create mode 100644 sql/core/src/main/scala/org/apache/spark/sql/execution/commands.scala diff --git a/sql/core/src/main/scala/org/apache/spark/sql/SQLContext.scala b/sql/core/src/main/scala/org/apache/spark/sql/SQLContext.scala index 043be58edc91b..a2d095debfc87 100644 --- a/sql/core/src/main/scala/org/apache/spark/sql/SQLContext.scala +++ b/sql/core/src/main/scala/org/apache/spark/sql/SQLContext.scala @@ -191,6 +191,7 @@ class SQLContext(@transient val sparkContext: SparkContext) val sparkContext = self.sparkContext val strategies: Seq[Strategy] = + CommandStrategy(self) :: TakeOrdered :: PartialAggregation :: HashJoin :: @@ -255,6 +256,11 @@ class SQLContext(@transient val sparkContext: SparkContext) Batch("Prepare Expressions", Once, new BindReferences[SparkPlan]) :: Nil } + // TODO: or should we make QueryExecution protected[sql]? + protected[sql] def mkQueryExecution(plan: LogicalPlan) = new QueryExecution { + val logical = plan + } + /** * The primary workflow for executing relational queries using Spark. Designed to allow easy * access to the intermediate phases of query execution for developers. diff --git a/sql/core/src/main/scala/org/apache/spark/sql/execution/SparkStrategies.scala b/sql/core/src/main/scala/org/apache/spark/sql/execution/SparkStrategies.scala index cfa8bdae58b11..7a7ce66fd82d4 100644 --- a/sql/core/src/main/scala/org/apache/spark/sql/execution/SparkStrategies.scala +++ b/sql/core/src/main/scala/org/apache/spark/sql/execution/SparkStrategies.scala @@ -217,4 +217,15 @@ private[sql] abstract class SparkStrategies extends QueryPlanner[SparkPlan] { case _ => Nil } } + + // TODO: this should be merged with SPARK-1508's SetCommandStrategy + case class CommandStrategy(context: SQLContext) extends Strategy { + def apply(plan: LogicalPlan): Seq[SparkPlan] = plan match { + case logical.ExplainCommand(child) => + val qe = context.mkQueryExecution(child) + Seq(execution.ExplainCommandPhysical(qe.executedPlan)(context)) + case _ => Nil + } + } + } diff --git a/sql/core/src/main/scala/org/apache/spark/sql/execution/commands.scala b/sql/core/src/main/scala/org/apache/spark/sql/execution/commands.scala new file mode 100644 index 0000000000000..85e0fa9594e03 --- /dev/null +++ b/sql/core/src/main/scala/org/apache/spark/sql/execution/commands.scala @@ -0,0 +1,68 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package org.apache.spark.sql.execution + +import org.apache.spark.annotation.DeveloperApi +import org.apache.spark.rdd.RDD +import org.apache.spark.sql.{SQLContext, Row} +import org.apache.spark.sql.catalyst.expressions.{GenericRow, Attribute} +import org.apache.spark.sql.catalyst.plans.logical.LogicalPlan + +/** + * :: DeveloperApi :: + */ +//@DeveloperApi +//case class SetCommandPhysical( +// key: Option[String], +// value: Option[String], +// context: SQLContext) +// extends LeafNode { +// +// "hi".split("=", 2) +// +// def execute(): RDD[Row] = (key, value) match { +// case (Some(k), Some(v)) => context.emptyResult +// case (Some(k), None) => +// val resultString = context.sqlConf.getOption(k) match { +// case Some(v) => s"$k=$v" +// case None => s"$k is undefined" +// } +// context.sparkContext.parallelize(Seq(new GenericRow(Array[Any](resultString))), 1) +// case (None, None) => +// val pairs = context.sqlConf.getAll +// val rows = pairs.map { case (k, v) => +// new GenericRow(Array[Any](s"$k=$v")) +// }.toSeq +// // Assume config parameters can fit into one split (machine) ;) +// context.sparkContext.parallelize(rows, 1) +// case _ => context.emptyResult +// } +// +// def output: Seq[Attribute] = Seq.empty // TODO: right thing? +//} + + +case class ExplainCommandPhysical(child: SparkPlan) + (@transient context: SQLContext) extends UnaryNode { + def execute(): RDD[Row] = { + val lines = child.toString.split("\n").map(s => new GenericRow(Array[Any](s))) + context.sparkContext.parallelize(lines) + } + + def output: Seq[Attribute] = child.output // right thing to do? +} diff --git a/sql/hive/src/main/scala/org/apache/spark/sql/hive/HiveContext.scala b/sql/hive/src/main/scala/org/apache/spark/sql/hive/HiveContext.scala index b21f24dad785d..b6648ab55d761 100644 --- a/sql/hive/src/main/scala/org/apache/spark/sql/hive/HiveContext.scala +++ b/sql/hive/src/main/scala/org/apache/spark/sql/hive/HiveContext.scala @@ -218,6 +218,7 @@ class HiveContext(sc: SparkContext) extends SQLContext(sc) { val hiveContext = self override val strategies: Seq[Strategy] = Seq( + CommandStrategy(self), TakeOrdered, ParquetOperations, HiveTableScans, @@ -303,7 +304,7 @@ class HiveContext(sc: SparkContext) extends SQLContext(sc) { */ def stringResult(): Seq[String] = analyzed match { case NativeCommand(cmd) => runSqlHive(cmd) - case ExplainCommand(plan) => new QueryExecution { val logical = plan }.toString.split("\n") + case ExplainCommand(plan) => mkQueryExecution(plan).toString.split("\n") case query => val result: Seq[Seq[Any]] = toRdd.collect().toSeq // We need the types so we can output struct field names diff --git a/sql/hive/src/test/scala/org/apache/spark/sql/hive/execution/HiveQuerySuite.scala b/sql/hive/src/test/scala/org/apache/spark/sql/hive/execution/HiveQuerySuite.scala index 125cc18bfb2b5..c378b9da5fc9d 100644 --- a/sql/hive/src/test/scala/org/apache/spark/sql/hive/execution/HiveQuerySuite.scala +++ b/sql/hive/src/test/scala/org/apache/spark/sql/hive/execution/HiveQuerySuite.scala @@ -159,4 +159,5 @@ class HiveQuerySuite extends HiveComparisonTest { hql("SHOW TABLES").toString hql("SELECT * FROM src").toString } + } From 439c6abb689b3859a63ec2b850586b362dc3a3a1 Mon Sep 17 00:00:00 2001 From: Zongheng Yang Date: Fri, 6 Jun 2014 17:02:45 -0700 Subject: [PATCH 2/6] Minor cleanups. --- .../apache/spark/sql/execution/commands.scala | 36 ------------------- 1 file changed, 36 deletions(-) diff --git a/sql/core/src/main/scala/org/apache/spark/sql/execution/commands.scala b/sql/core/src/main/scala/org/apache/spark/sql/execution/commands.scala index 85e0fa9594e03..575778f4ecda4 100644 --- a/sql/core/src/main/scala/org/apache/spark/sql/execution/commands.scala +++ b/sql/core/src/main/scala/org/apache/spark/sql/execution/commands.scala @@ -17,45 +17,9 @@ package org.apache.spark.sql.execution -import org.apache.spark.annotation.DeveloperApi import org.apache.spark.rdd.RDD import org.apache.spark.sql.{SQLContext, Row} import org.apache.spark.sql.catalyst.expressions.{GenericRow, Attribute} -import org.apache.spark.sql.catalyst.plans.logical.LogicalPlan - -/** - * :: DeveloperApi :: - */ -//@DeveloperApi -//case class SetCommandPhysical( -// key: Option[String], -// value: Option[String], -// context: SQLContext) -// extends LeafNode { -// -// "hi".split("=", 2) -// -// def execute(): RDD[Row] = (key, value) match { -// case (Some(k), Some(v)) => context.emptyResult -// case (Some(k), None) => -// val resultString = context.sqlConf.getOption(k) match { -// case Some(v) => s"$k=$v" -// case None => s"$k is undefined" -// } -// context.sparkContext.parallelize(Seq(new GenericRow(Array[Any](resultString))), 1) -// case (None, None) => -// val pairs = context.sqlConf.getAll -// val rows = pairs.map { case (k, v) => -// new GenericRow(Array[Any](s"$k=$v")) -// }.toSeq -// // Assume config parameters can fit into one split (machine) ;) -// context.sparkContext.parallelize(rows, 1) -// case _ => context.emptyResult -// } -// -// def output: Seq[Attribute] = Seq.empty // TODO: right thing? -//} - case class ExplainCommandPhysical(child: SparkPlan) (@transient context: SQLContext) extends UnaryNode { From 4318fd7ed84f3f08f3e74d0b70befc01c3d77971 Mon Sep 17 00:00:00 2001 From: Zongheng Yang Date: Fri, 6 Jun 2014 23:46:21 -0700 Subject: [PATCH 3/6] Make all output one Row. --- .../main/scala/org/apache/spark/sql/execution/commands.scala | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/sql/core/src/main/scala/org/apache/spark/sql/execution/commands.scala b/sql/core/src/main/scala/org/apache/spark/sql/execution/commands.scala index 575778f4ecda4..2441bc5f6c876 100644 --- a/sql/core/src/main/scala/org/apache/spark/sql/execution/commands.scala +++ b/sql/core/src/main/scala/org/apache/spark/sql/execution/commands.scala @@ -24,8 +24,8 @@ import org.apache.spark.sql.catalyst.expressions.{GenericRow, Attribute} case class ExplainCommandPhysical(child: SparkPlan) (@transient context: SQLContext) extends UnaryNode { def execute(): RDD[Row] = { - val lines = child.toString.split("\n").map(s => new GenericRow(Array[Any](s))) - context.sparkContext.parallelize(lines) + val planString = new GenericRow(Array[Any](child.toString)) + context.sparkContext.parallelize(Seq(planString)) } def output: Seq[Attribute] = child.output // right thing to do? From 719ada9e90787cc4fe9e2293f2a056b393348e08 Mon Sep 17 00:00:00 2001 From: Zongheng Yang Date: Sat, 7 Jun 2014 10:38:40 -0700 Subject: [PATCH 4/6] Override otherCopyArgs for ExplainCommandPhysical. --- .../main/scala/org/apache/spark/sql/execution/commands.scala | 2 ++ 1 file changed, 2 insertions(+) diff --git a/sql/core/src/main/scala/org/apache/spark/sql/execution/commands.scala b/sql/core/src/main/scala/org/apache/spark/sql/execution/commands.scala index 2441bc5f6c876..5afa5e43a31ed 100644 --- a/sql/core/src/main/scala/org/apache/spark/sql/execution/commands.scala +++ b/sql/core/src/main/scala/org/apache/spark/sql/execution/commands.scala @@ -29,4 +29,6 @@ case class ExplainCommandPhysical(child: SparkPlan) } def output: Seq[Attribute] = child.output // right thing to do? + + override def otherCopyArgs = context :: Nil } From 1bfa379024e518fc7dc99eaa10894fd48740bf45 Mon Sep 17 00:00:00 2001 From: Zongheng Yang Date: Sat, 7 Jun 2014 11:35:08 -0700 Subject: [PATCH 5/6] Modify output(). --- .../spark/sql/catalyst/plans/logical/LogicalPlan.scala | 8 +++++--- .../org/apache/spark/sql/execution/SparkStrategies.scala | 2 +- .../scala/org/apache/spark/sql/execution/commands.scala | 4 +--- 3 files changed, 7 insertions(+), 7 deletions(-) diff --git a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/plans/logical/LogicalPlan.scala b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/plans/logical/LogicalPlan.scala index 2b8fbdcde9d37..4f641cd3a656b 100644 --- a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/plans/logical/LogicalPlan.scala +++ b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/plans/logical/LogicalPlan.scala @@ -20,7 +20,7 @@ package org.apache.spark.sql.catalyst.plans.logical import org.apache.spark.sql.catalyst.errors.TreeNodeException import org.apache.spark.sql.catalyst.expressions._ import org.apache.spark.sql.catalyst.plans.QueryPlan -import org.apache.spark.sql.catalyst.types.StructType +import org.apache.spark.sql.catalyst.types.{StringType, StructType} import org.apache.spark.sql.catalyst.trees abstract class LogicalPlan extends QueryPlan[LogicalPlan] { @@ -102,7 +102,7 @@ abstract class LeafNode extends LogicalPlan with trees.LeafNode[LogicalPlan] { */ abstract class Command extends LeafNode { self: Product => - def output = Seq.empty + def output: Seq[Attribute] = Seq.empty } /** @@ -115,7 +115,9 @@ case class NativeCommand(cmd: String) extends Command * Returned by a parser when the users only wants to see what query plan would be executed, without * actually performing the execution. */ -case class ExplainCommand(plan: LogicalPlan) extends Command +case class ExplainCommand(plan: LogicalPlan) extends Command { + override def output = Seq(AttributeReference("plan", StringType, nullable = false)()) +} /** * A logical plan node with single child. diff --git a/sql/core/src/main/scala/org/apache/spark/sql/execution/SparkStrategies.scala b/sql/core/src/main/scala/org/apache/spark/sql/execution/SparkStrategies.scala index 7a7ce66fd82d4..a516cf478df85 100644 --- a/sql/core/src/main/scala/org/apache/spark/sql/execution/SparkStrategies.scala +++ b/sql/core/src/main/scala/org/apache/spark/sql/execution/SparkStrategies.scala @@ -223,7 +223,7 @@ private[sql] abstract class SparkStrategies extends QueryPlanner[SparkPlan] { def apply(plan: LogicalPlan): Seq[SparkPlan] = plan match { case logical.ExplainCommand(child) => val qe = context.mkQueryExecution(child) - Seq(execution.ExplainCommandPhysical(qe.executedPlan)(context)) + Seq(execution.ExplainCommandPhysical(qe.executedPlan, plan.output)(context)) case _ => Nil } } diff --git a/sql/core/src/main/scala/org/apache/spark/sql/execution/commands.scala b/sql/core/src/main/scala/org/apache/spark/sql/execution/commands.scala index 5afa5e43a31ed..5371d2f479e73 100644 --- a/sql/core/src/main/scala/org/apache/spark/sql/execution/commands.scala +++ b/sql/core/src/main/scala/org/apache/spark/sql/execution/commands.scala @@ -21,14 +21,12 @@ import org.apache.spark.rdd.RDD import org.apache.spark.sql.{SQLContext, Row} import org.apache.spark.sql.catalyst.expressions.{GenericRow, Attribute} -case class ExplainCommandPhysical(child: SparkPlan) +case class ExplainCommandPhysical(child: SparkPlan, output: Seq[Attribute]) (@transient context: SQLContext) extends UnaryNode { def execute(): RDD[Row] = { val planString = new GenericRow(Array[Any](child.toString)) context.sparkContext.parallelize(Seq(planString)) } - def output: Seq[Attribute] = child.output // right thing to do? - override def otherCopyArgs = context :: Nil } From 5b7911f6d8564ace5bd88409a2e01a8bbfa355f7 Mon Sep 17 00:00:00 2001 From: Zongheng Yang Date: Mon, 9 Jun 2014 12:06:46 -0700 Subject: [PATCH 6/6] Add a regression test. --- .../spark/sql/hive/execution/HiveQuerySuite.scala | 11 +++++++++++ 1 file changed, 11 insertions(+) diff --git a/sql/hive/src/test/scala/org/apache/spark/sql/hive/execution/HiveQuerySuite.scala b/sql/hive/src/test/scala/org/apache/spark/sql/hive/execution/HiveQuerySuite.scala index c378b9da5fc9d..c56eee258047f 100644 --- a/sql/hive/src/test/scala/org/apache/spark/sql/hive/execution/HiveQuerySuite.scala +++ b/sql/hive/src/test/scala/org/apache/spark/sql/hive/execution/HiveQuerySuite.scala @@ -18,6 +18,7 @@ package org.apache.spark.sql.hive.execution import org.apache.spark.sql.hive.test.TestHive._ +import org.apache.spark.sql.hive.test.TestHive /** * A set of test cases expressed in Hive QL that are not covered by the tests included in the hive distribution. @@ -160,4 +161,14 @@ class HiveQuerySuite extends HiveComparisonTest { hql("SELECT * FROM src").toString } + test("SPARK-1704: Explain commands as a SchemaRDD") { + hql("CREATE TABLE IF NOT EXISTS src (key INT, value STRING)") + val rdd = hql("explain select key, count(value) from src group by key") + assert(rdd.collect().size == 1) + assert(rdd.toString.contains("ExplainCommand")) + assert(rdd.filter(row => row.toString.contains("ExplainCommand")).collect().size == 0, + "actual contents of the result should be the plans of the query to be explained") + TestHive.reset() + } + }