From 3ff7c26cc3caa8fe559fad31e420f8b3292ec0aa Mon Sep 17 00:00:00 2001 From: Gera Shegalov Date: Fri, 25 Jun 2021 11:24:58 -0700 Subject: [PATCH 1/2] Truncate and link plan labels to "print-plans" Properly treat graphviz compiled with the default 16K label max Closes #2710. Signed-off-by: Gera Shegalov --- .../rapids/tool/ToolTextFileWriter.scala | 3 ++ .../rapids/tool/profiling/GenerateDot.scala | 49 ++++++++++++++----- 2 files changed, 40 insertions(+), 12 deletions(-) diff --git a/tools/src/main/scala/com/nvidia/spark/rapids/tool/ToolTextFileWriter.scala b/tools/src/main/scala/com/nvidia/spark/rapids/tool/ToolTextFileWriter.scala index d46d08adbe8..0ba0ac14da7 100644 --- a/tools/src/main/scala/com/nvidia/spark/rapids/tool/ToolTextFileWriter.scala +++ b/tools/src/main/scala/com/nvidia/spark/rapids/tool/ToolTextFileWriter.scala @@ -29,6 +29,9 @@ class ToolTextFileWriter(finalOutputDir: String, logFileName: String) extends Lo private val textOutputPath = new Path(s"$finalOutputDir/$logFileName") private val fs = FileSystem.get(textOutputPath.toUri, new Configuration()) + + def outputFilePath = textOutputPath + // this overwrites existing path private var outFile: Option[FSDataOutputStream] = Some(fs.create(textOutputPath)) logInfo(s"Output directory: $finalOutputDir") diff --git a/tools/src/main/scala/com/nvidia/spark/rapids/tool/profiling/GenerateDot.scala b/tools/src/main/scala/com/nvidia/spark/rapids/tool/profiling/GenerateDot.scala index b35d7601cb2..99548520d05 100644 --- a/tools/src/main/scala/com/nvidia/spark/rapids/tool/profiling/GenerateDot.scala +++ b/tools/src/main/scala/com/nvidia/spark/rapids/tool/profiling/GenerateDot.scala @@ -80,7 +80,7 @@ object GenerateDot { appId: String): Unit = { val graph = SparkPlanGraph(plan.plan, appId, sqlId.toString, physicalPlanString, accumIdToStageId, stageIdToStageMetrics) - val str = graph.makeDotFile(plan.metrics) + val str = graph.makeDotFile(plan.metrics, fileWriter.outputFilePath.getParent.toString) 
fileWriter.write(str) } } @@ -108,20 +108,12 @@ case class SparkPlanGraph( sqlId: String, physicalPlan: String) { - def makeDotFile(metrics: Map[Long, Long]): String = { - val leftAlignedLabel = - s""" - |Application: $appId - |Query: $sqlId - | - |$physicalPlan""" - .stripMargin - .replace("\n", "\\l") - + def makeDotFile(metrics: Map[Long, Long], outputDir: String): String = { + val queryLabel = SparkPlanGraph.createDotLabel(outputDir, appId, sqlId, physicalPlan) val dotFile = new StringBuilder dotFile.append("digraph G {\n") - dotFile.append(s"""label="$leftAlignedLabel"\n""") + dotFile.append(s"label=$queryLabel\n") dotFile.append("labelloc=b\n") dotFile.append("fontname=Courier\n") dotFile.append(s"""tooltip="APP: $appId Query: $sqlId"\n""") @@ -292,6 +284,39 @@ object SparkPlanGraph { exchanges, accumIdToStageId, stageIdToStageMetrics)) } } + + def createDotLabel( + outputDir: String, + appId: String, + sqlId: String, + physicalPlan: String, + maxLength: Int = 16384 + ): String = { + val sqlPlanFile = s"file://$outputDir/planDescriptions-$appId" + val sqlPlanPlaceHolder = "%s" + val htmlLineBreak = """
""" + "\n" + val queryLabelFormat = + s"""< + | + | + | + |
Application: $appId, Query: $sqlId
$sqlPlanPlaceHolder
+ |Large physical may be truncated. Start the profiling tool with --print-plans + |and open the link to locate + |Plan for SQL ID : $sqlId + |
>""".stripMargin + + // pre-calculate size post substitutions + val formatBytes = queryLabelFormat.length() - sqlPlanPlaceHolder.length() + val numLinebreaks = physicalPlan.count(_ == '\n') + val lineBreakBytes = numLinebreaks * htmlLineBreak.length() + val maxPlanLength = maxLength - formatBytes - lineBreakBytes + + queryLabelFormat.format( + physicalPlan.take(maxPlanLength) + .replaceAll("\n", htmlLineBreak) + ) + } } /** From b9035ba1a0e1a14668ccfb600b2a9c14d8f327a7 Mon Sep 17 00:00:00 2001 From: Gera Shegalov Date: Fri, 25 Jun 2021 17:02:34 -0700 Subject: [PATCH 2/2] test and review --- .../rapids/tool/ToolTextFileWriter.scala | 3 - .../rapids/tool/profiling/GenerateDot.scala | 29 +++---- .../tool/profiling/GenerateDotSuite.scala | 85 ++++++++++++++++--- 3 files changed, 85 insertions(+), 32 deletions(-) diff --git a/tools/src/main/scala/com/nvidia/spark/rapids/tool/ToolTextFileWriter.scala b/tools/src/main/scala/com/nvidia/spark/rapids/tool/ToolTextFileWriter.scala index 0ba0ac14da7..d46d08adbe8 100644 --- a/tools/src/main/scala/com/nvidia/spark/rapids/tool/ToolTextFileWriter.scala +++ b/tools/src/main/scala/com/nvidia/spark/rapids/tool/ToolTextFileWriter.scala @@ -29,9 +29,6 @@ class ToolTextFileWriter(finalOutputDir: String, logFileName: String) extends Lo private val textOutputPath = new Path(s"$finalOutputDir/$logFileName") private val fs = FileSystem.get(textOutputPath.toUri, new Configuration()) - - def outputFilePath = textOutputPath - // this overwrites existing path private var outFile: Option[FSDataOutputStream] = Some(fs.create(textOutputPath)) logInfo(s"Output directory: $finalOutputDir") diff --git a/tools/src/main/scala/com/nvidia/spark/rapids/tool/profiling/GenerateDot.scala b/tools/src/main/scala/com/nvidia/spark/rapids/tool/profiling/GenerateDot.scala index 99548520d05..122b95c62d2 100644 --- a/tools/src/main/scala/com/nvidia/spark/rapids/tool/profiling/GenerateDot.scala +++ 
b/tools/src/main/scala/com/nvidia/spark/rapids/tool/profiling/GenerateDot.scala @@ -80,7 +80,7 @@ object GenerateDot { appId: String): Unit = { val graph = SparkPlanGraph(plan.plan, appId, sqlId.toString, physicalPlanString, accumIdToStageId, stageIdToStageMetrics) - val str = graph.makeDotFile(plan.metrics, fileWriter.outputFilePath.getParent.toString) + val str = graph.makeDotFile(plan.metrics) fileWriter.write(str) } } @@ -108,8 +108,8 @@ case class SparkPlanGraph( sqlId: String, physicalPlan: String) { - def makeDotFile(metrics: Map[Long, Long], outputDir: String): String = { - val queryLabel = SparkPlanGraph.createDotLabel(outputDir, appId, sqlId, physicalPlan) + def makeDotFile(metrics: Map[Long, Long]): String = { + val queryLabel = SparkPlanGraph.makeDotLabel(appId, sqlId, physicalPlan) val dotFile = new StringBuilder dotFile.append("digraph G {\n") @@ -285,33 +285,30 @@ object SparkPlanGraph { } } - def createDotLabel( - outputDir: String, - appId: String, + val htmlLineBreak = """
""" + "\n" + + def makeDotLabel( + appId: String, sqlId: String, physicalPlan: String, - maxLength: Int = 16384 + maxLength: Int = 16384 ): String = { - val sqlPlanFile = s"file://$outputDir/planDescriptions-$appId" val sqlPlanPlaceHolder = "%s" - val htmlLineBreak = """
""" + "\n" - val queryLabelFormat = + val queryLabelFormat = s"""< | | - | |
Application: $appId, Query: $sqlId
$sqlPlanPlaceHolder
- |Large physical may be truncated. Start the profiling tool with --print-plans - |and open the link to locate - |Plan for SQL ID : $sqlId + |
Large physical plans may be truncated. See output from + |--print-plans captioned "Plan for SQL ID : $sqlId" |
>""".stripMargin - // pre-calculate size post substitutions + // pre-calculate size post substitutions val formatBytes = queryLabelFormat.length() - sqlPlanPlaceHolder.length() val numLinebreaks = physicalPlan.count(_ == '\n') val lineBreakBytes = numLinebreaks * htmlLineBreak.length() val maxPlanLength = maxLength - formatBytes - lineBreakBytes - + queryLabelFormat.format( physicalPlan.take(maxPlanLength) .replaceAll("\n", htmlLineBreak) diff --git a/tools/src/test/scala/com/nvidia/spark/rapids/tool/profiling/GenerateDotSuite.scala b/tools/src/test/scala/com/nvidia/spark/rapids/tool/profiling/GenerateDotSuite.scala index 2d7b33004b2..3f98eab921d 100644 --- a/tools/src/test/scala/com/nvidia/spark/rapids/tool/profiling/GenerateDotSuite.scala +++ b/tools/src/test/scala/com/nvidia/spark/rapids/tool/profiling/GenerateDotSuite.scala @@ -16,7 +16,9 @@ package com.nvidia.spark.rapids.tool.profiling import java.io.File +import java.security.SecureRandom +import scala.collection.mutable import scala.io.Source import com.nvidia.spark.rapids.tool.ToolTestUtils @@ -67,21 +69,78 @@ class GenerateDotSuite extends FunSuite with BeforeAndAfterAll with Logging { for (file <- dotDirs) { assert(file.getAbsolutePath.endsWith(".dot")) val source = Source.fromFile(file) - try { - val lines = source.getLines().toArray - assert(lines.head === "digraph G {") - assert(lines.last === "}") - hashAggCount += lines.count(_.contains("HashAggregate")) - stageCount += lines.count(_.contains("STAGE ")) - } finally { - source.close() - } + val dotFileStr = source.mkString + source.close() + assert(dotFileStr.startsWith("digraph G {")) + assert(dotFileStr.endsWith("}")) + val hashAggr = "HashAggregate" + val stageWord = "STAGE" + hashAggCount += dotFileStr.sliding(hashAggr.length).count(_ == hashAggr) + stageCount += dotFileStr.sliding(stageWord.length).count(_ == stageWord) } - // 2 node labels + 1 graph label - assert(hashAggCount === 3) - // Initial Aggregation, Final Aggregation, Sorting final 
output - assert(stageCount === 3) + + assert(hashAggCount === 8, "Expected: 4 in node labels + 4 in graph label") + assert(stageCount === 4, "Expected: UNKNOWN Stage, Initial Aggregation, " + + "Final Aggregation, Sorting final output") } } } + + test("Empty physical plan") { + val planLabel = SparkPlanGraph.makeDotLabel( + appId = "local-12345-1", + sqlId = "120", + physicalPlan = "") + + planLabelChecks(planLabel) + } + + test("Long physical plan") { + val random = new SecureRandom() + val seed = System.currentTimeMillis(); + random.setSeed(seed); + info("Seeding test with: " + seed) + val numTests = 100 + + val lineLengthRange = 50 until 200 + + + val planLengthSeq = mutable.ArrayBuffer.empty[Int] + val labelLengthSeq = mutable.ArrayBuffer.empty[Int] + + // some imperfect randomness for edge cases + for (_ <- 1 to numTests) { + val lineLength = lineLengthRange.start + + random.nextInt(lineLengthRange.length) - + SparkPlanGraph.htmlLineBreak.length() + + val sign = if (random.nextBoolean()) 1 else -1 + val planLength = 16 * 1024 + sign * lineLength * (1 + random.nextInt(5)); + val planStr = (0 to planLength / lineLength).map(_ => "a" * lineLength).mkString("\n") + + planLengthSeq += planStr.length() + + val planLabel = SparkPlanGraph.makeDotLabel( + appId = "local-12345-1", + sqlId = "120", + physicalPlan = planStr) + + labelLengthSeq += planLabel.length() + + planLabelChecks(planLabel) + assert(planLabel.length() <= 16 * 1024) + assert(planLabel.contains("a" * lineLength)) + assert(planLabel.contains(SparkPlanGraph.htmlLineBreak)) + } + + info(s"Plan length summary: min=${planLengthSeq.min} max=${planLengthSeq.max}") + info(s"Label length summary: min=${labelLengthSeq.min} max=${labelLengthSeq.max}") + } + + private def planLabelChecks(planLabel: String) { + assert(planLabel.startsWith("<<table ")) + assert(planLabel.endsWith("</table>>")) + assert(planLabel.contains("local-12345-1")) + assert(planLabel.contains("Plan for SQL ID : 120")) + } }