From 2bcd3d28d73c094ae9365b40fa12486eebd3118f Mon Sep 17 00:00:00 2001 From: Niranjan Artal Date: Thu, 29 Feb 2024 17:23:21 -0800 Subject: [PATCH 1/2] Fix ReadSchema bug and NPE in Profiling tool Signed-off-by: Niranjan Artal --- .../spark/sql/rapids/tool/AppBase.scala | 20 +++++++++++-------- .../tool/profiling/ApplicationInfo.scala | 2 +- 2 files changed, 13 insertions(+), 9 deletions(-) diff --git a/core/src/main/scala/org/apache/spark/sql/rapids/tool/AppBase.scala b/core/src/main/scala/org/apache/spark/sql/rapids/tool/AppBase.scala index c4f711e41..7a786d0e2 100644 --- a/core/src/main/scala/org/apache/spark/sql/rapids/tool/AppBase.scala +++ b/core/src/main/scala/org/apache/spark/sql/rapids/tool/AppBase.scala @@ -426,15 +426,19 @@ abstract class AppBase( // Get ReadSchema of each Node and sanitize it for comparison val trimmedNode = trimSchema(ReadParser.parseReadNode(node).schema) readSchema.contains(trimmedNode) - }).filter(ReadParser.isScanNode(_)).head + }).filter(ReadParser.isScanNode(_)) - dataSourceInfo += DataSourceCase(sqlID, - scanNode.id, - meta.getOrElse("Format", "unknown"), - meta.getOrElse("Location", "unknown"), - meta.getOrElse("PushedFilters", "unknown"), - readSchema - ) + // If the ReadSchema is empty or if the Scan is not supported, then we don't need to + // add it to the dataSourceInfo + if (scanNode.nonEmpty) { + dataSourceInfo += DataSourceCase(sqlID, + scanNode.head.id, + meta.getOrElse("Format", "unknown"), + meta.getOrElse("Location", "unknown"), + meta.getOrElse("PushedFilters", "unknown"), + readSchema + ) + } } // "scan hive" has no "ReadSchema" defined. So, we need to look explicitly for nodes // that are scan hive and add them one by one to the dataSource diff --git a/core/src/main/scala/org/apache/spark/sql/rapids/tool/profiling/ApplicationInfo.scala b/core/src/main/scala/org/apache/spark/sql/rapids/tool/profiling/ApplicationInfo.scala index 231e8d77c..fca67ea98 100644 --- a/core/src/main/scala/org/apache/spark/sql/rapids/tool/profiling/ApplicationInfo.scala +++ b/core/src/main/scala/org/apache/spark/sql/rapids/tool/profiling/ApplicationInfo.scala @@ -335,7 +335,7 @@ class ApplicationInfo( nodeIds.contains((j.sqlID.get, n.id)) } validNodes.map(n => s"${n.name}(${n.id.toString})") - }.getOrElse(null) + }.getOrElse(Seq.empty) SQLStageInfoProfileResult(index, j.sqlID.get, jobId, s, sa, info.duration, nodeNames) } } From 307304cfa6f60651c7ca0871e6946ae2bf918ec3 Mon Sep 17 00:00:00 2001 From: Niranjan Artal Date: Fri, 1 Mar 2024 14:09:52 -0800 Subject: [PATCH 2/2] Update comment Signed-off-by: Niranjan Artal --- .../main/scala/org/apache/spark/sql/rapids/tool/AppBase.scala | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/core/src/main/scala/org/apache/spark/sql/rapids/tool/AppBase.scala b/core/src/main/scala/org/apache/spark/sql/rapids/tool/AppBase.scala index 7a786d0e2..20f197739 100644 --- a/core/src/main/scala/org/apache/spark/sql/rapids/tool/AppBase.scala +++ b/core/src/main/scala/org/apache/spark/sql/rapids/tool/AppBase.scala @@ -428,8 +428,9 @@ abstract class AppBase( readSchema.contains(trimmedNode) }).filter(ReadParser.isScanNode(_)) - // If the ReadSchema is empty or if the Scan is not supported, then we don't need to + // If the ReadSchema is empty or if it is PhotonScan, then we don't need to // add it to the dataSourceInfo + // Processing Photon eventlogs issue: https://github.com/NVIDIA/spark-rapids-tools/issues/251 if (scanNode.nonEmpty) { dataSourceInfo += DataSourceCase(sqlID, scanNode.head.id,