apache-spark-on-k8s · ifilonenko · Jun 16, 2017 · Jun 17, 2017 · Jun 17, 2017 · Jun 20, 2017
diff --git a/README.md b/README.md
@@ -24,6 +24,7 @@ We've been asked by an Apache Spark Committer to work outside of the Apache infr
 
 This is a collaborative effort by several folks from different companies who are interested in seeing this feature be successful.  Companies active in this project include (alphabetically):
 
+- Bloomberg
 - Google
 - Haiwen
 - Hyperpilot

diff --git a/core/src/main/scala/org/apache/spark/deploy/SparkSubmit.scala b/core/src/main/scala/org/apache/spark/deploy/SparkSubmit.scala
@@ -335,8 +335,8 @@ object SparkSubmit {
     (clusterManager, deployMode) match {
       case (KUBERNETES, CLIENT) =>
         printErrorAndExit("Client mode is currently not supported for Kubernetes.")
-      case (KUBERNETES, CLUSTER) if args.isPython || args.isR =>
-        printErrorAndExit("Kubernetes does not currently support python or R applications.")
+      case (KUBERNETES, CLUSTER) if args.isR =>
+        printErrorAndExit("Kubernetes does not currently support R applications.")
       case (STANDALONE, CLUSTER) if args.isPython =>
         printErrorAndExit("Cluster deploy mode is currently not supported for python " +
           "applications on standalone clusters.")
@@ -620,8 +620,14 @@ object SparkSubmit {
 
     if (isKubernetesCluster) {
       childMainClass = "org.apache.spark.deploy.kubernetes.submit.Client"
-      childArgs += args.primaryResource
-      childArgs += args.mainClass
+      if (args.isPython) {
+        childArgs += args.primaryResource
+        childArgs += "org.apache.spark.deploy.PythonRunner"
+        childArgs += args.pyFiles
+      } else {
+        childArgs += args.primaryResource
+        childArgs += args.mainClass
+      }
       childArgs ++= args.childArgs
     }
 

diff --git a/docs/running-on-kubernetes.md b/docs/running-on-kubernetes.md
@@ -180,6 +180,32 @@ The above mechanism using `kubectl proxy` can be used when we have authenticatio
 kubernetes-client library does not support. Authentication using X509 Client Certs and OAuth tokens
 is currently supported.
 
+### Running PySpark
+
+Running PySpark on Kubernetes uses the same spark-submit logic as launching on YARN and Mesos.
+Additional Python files can be distributed by passing them with the `--py-files` option.
+
+Below is an example submission: 
+
+
+```
+    bin/spark-submit \
+      --deploy-mode cluster \
+      --master k8s://http://127.0.0.1:8001 \
+      --kubernetes-namespace default \
+      --conf spark.executor.memory=500m \
+      --conf spark.driver.memory=1G \
+      --conf spark.driver.cores=1 \
+      --conf spark.executor.cores=1 \
+      --conf spark.executor.instances=1 \
+      --conf spark.app.name=spark-pi \
+      --conf spark.kubernetes.driver.docker.image=spark-driver-py:latest \
+      --conf spark.kubernetes.executor.docker.image=spark-executor-py:latest \
+      --conf spark.kubernetes.initcontainer.docker.image=spark-init:latest \
+      --py-files local:///opt/spark/examples/src/main/python/sort.py \
+      local:///opt/spark/examples/src/main/python/pi.py 100
+```
+
 ## Dynamic Executor Scaling
 
 Spark on Kubernetes supports Dynamic Allocation with cluster mode. This mode requires running

diff --git a/...anagers/kubernetes/core/src/main/scala/org/apache/spark/deploy/kubernetes/constants.scala b/...anagers/kubernetes/core/src/main/scala/org/apache/spark/deploy/kubernetes/constants.scala
@@ -67,6 +67,8 @@ package object constants {
   private[spark] val ENV_DRIVER_ARGS = "SPARK_DRIVER_ARGS"
   private[spark] val ENV_DRIVER_JAVA_OPTS = "SPARK_DRIVER_JAVA_OPTS"
   private[spark] val ENV_MOUNTED_FILES_DIR = "SPARK_MOUNTED_FILES_DIR"
+  private[spark] val ENV_PYSPARK_FILES = "PYSPARK_FILES"
+  private[spark] val ENV_PYSPARK_PRIMARY = "PYSPARK_PRIMARY"
 
   // Bootstrapping dependencies with the init-container
   private[spark] val INIT_CONTAINER_ANNOTATION = "pod.beta.kubernetes.io/init-containers"

diff --git a/...ers/kubernetes/core/src/main/scala/org/apache/spark/deploy/kubernetes/submit/Client.scala b/...ers/kubernetes/core/src/main/scala/org/apache/spark/deploy/kubernetes/submit/Client.scala
@@ -47,11 +47,11 @@ private[spark] class Client(
     appName: String,
     kubernetesResourceNamePrefix: String,
     kubernetesAppId: String,
+    mainAppResource: String,
+    pythonResource: Option[PythonSubmissionResources],
     mainClass: String,
     sparkConf: SparkConf,
     appArgs: Array[String],
-    sparkJars: Seq[String],
-    sparkFiles: Seq[String],
     waitForAppCompletion: Boolean,
     kubernetesClient: KubernetesClient,
     initContainerComponentsProvider: DriverInitContainerComponentsProvider,
@@ -82,9 +82,10 @@ private[spark] class Client(
     org.apache.spark.internal.config.DRIVER_JAVA_OPTIONS)
 
   def run(): Unit = {
-    validateNoDuplicateFileNames(sparkJars)
-    validateNoDuplicateFileNames(sparkFiles)
-
+    val arguments = pythonResource match {
+      case Some(p) => p.arguments
+      case None => appArgs
+    }
     val driverCustomLabels = ConfigurationUtils.combinePrefixedKeyValuePairsWithDeprecatedConf(
       sparkConf,
       KUBERNETES_DRIVER_LABEL_PREFIX,
@@ -136,7 +137,7 @@ private[spark] class Client(
         .endEnv()
       .addNewEnv()
         .withName(ENV_DRIVER_ARGS)
-        .withValue(appArgs.mkString(" "))
+        .withValue(arguments.mkString(" "))
         .endEnv()
       .withNewResources()
         .addToRequests("cpu", driverCpuQuantity)
@@ -182,9 +183,14 @@ private[spark] class Client(
       .map(_.build())
 
     val containerLocalizedFilesResolver = initContainerComponentsProvider
-      .provideContainerLocalizedFilesResolver()
+      .provideContainerLocalizedFilesResolver(mainAppResource)
     val resolvedSparkJars = containerLocalizedFilesResolver.resolveSubmittedSparkJars()
     val resolvedSparkFiles = containerLocalizedFilesResolver.resolveSubmittedSparkFiles()
+    val resolvedPySparkFiles = containerLocalizedFilesResolver.resolveSubmittedPySparkFiles()
+    val resolvedPrimaryPySparkResource = pythonResource match {
+      case Some(p) => p.primarySparkResource(containerLocalizedFilesResolver)
+      case None => ""
+    }
 
     val initContainerBundler = initContainerComponentsProvider
       .provideInitContainerBundle(maybeSubmittedResourceIdentifiers.map(_.ids()),
@@ -221,7 +227,7 @@ private[spark] class Client(
     val resolvedDriverJavaOpts = resolvedSparkConf.getAll.map {
       case (confKey, confValue) => s"-D$confKey=$confValue"
     }.mkString(" ") + driverJavaOptions.map(" " + _).getOrElse("")
-    val resolvedDriverPod = podWithInitContainerAndMountedCreds.editSpec()
+    val resolvedDriverPodBuilder = podWithInitContainerAndMountedCreds.editSpec()
       .editMatchingContainer(new ContainerNameEqualityPredicate(driverContainer.getName))
         .addNewEnv()
           .withName(ENV_MOUNTED_CLASSPATH)
@@ -233,7 +239,16 @@ private[spark] class Client(
           .endEnv()
         .endContainer()
       .endSpec()
-      .build()
+    val resolvedDriverPod = pythonResource match {
+      case Some(p) => p.driverPod(
+        initContainerComponentsProvider,
+        resolvedPrimaryPySparkResource,
+        resolvedPySparkFiles.mkString(","),
+        driverContainer.getName,
+        resolvedDriverPodBuilder
+      )
+      case None => resolvedDriverPodBuilder.build()
+    }
     Utils.tryWithResource(
         kubernetesClient
             .pods()
@@ -271,17 +286,6 @@ private[spark] class Client(
       }
     }
   }
-
-  private def validateNoDuplicateFileNames(allFiles: Seq[String]): Unit = {
-    val fileNamesToUris = allFiles.map { file =>
-      (new File(Utils.resolveURI(file).getPath).getName, file)
-    }
-    fileNamesToUris.groupBy(_._1).foreach {
-      case (fileName, urisWithFileName) =>
-        require(urisWithFileName.size == 1, "Cannot add multiple files with the same name, but" +
-          s" file name $fileName is shared by all of these URIs: $urisWithFileName")
-    }
-  }
 }
 
 private[spark] object Client {
@@ -292,22 +296,38 @@ private[spark] object Client {
     val appArgs = args.drop(2)
     run(sparkConf, mainAppResource, mainClass, appArgs)
   }
-
   def run(
       sparkConf: SparkConf,
       mainAppResource: String,
       mainClass: String,
       appArgs: Array[String]): Unit = {
+    val isPython = mainAppResource.endsWith(".py")
+    val pythonResource: Option[PythonSubmissionResources] =
+      if (isPython) {
+        Option(new PythonSubmissionResources(mainAppResource, appArgs))
+      } else None
+    // Since you might need jars for SQL UDFs in PySpark
+    def sparkJarFilter() : Seq[String] = pythonResource match {
+      case Some(p) => p.sparkJars
+      case None =>
+        Option(mainAppResource)
+          .filterNot(_ == SparkLauncher.NO_RESOURCE)
+          .toSeq
+    }
     val sparkJars = sparkConf.getOption("spark.jars")
       .map(_.split(","))
-      .getOrElse(Array.empty[String]) ++
-      Option(mainAppResource)
-        .filterNot(_ == SparkLauncher.NO_RESOURCE)
-        .toSeq
+      .getOrElse(Array.empty[String]) ++ sparkJarFilter()
     val launchTime = System.currentTimeMillis
     val sparkFiles = sparkConf.getOption("spark.files")
       .map(_.split(","))
       .getOrElse(Array.empty[String])
+    val pySparkFiles: Array[String] = pythonResource match {
+      case Some(p) => p.pySparkFiles
+      case None => Array.empty[String]
+    }
+    validateNoDuplicateFileNames(sparkJars)
+    validateNoDuplicateFileNames(sparkFiles)
+    if (pythonResource.isDefined) {validateNoDuplicateFileNames(pySparkFiles)}
     val appName = sparkConf.getOption("spark.app.name").getOrElse("spark")
     // The resource name prefix is derived from the application name, making it easy to connect the
     // names of the Kubernetes resources from e.g. Kubectl or the Kubernetes dashboard to the
@@ -326,6 +346,7 @@ private[spark] object Client {
         namespace,
         sparkJars,
         sparkFiles,
+        pySparkFiles,
         sslOptionsProvider.getSslOptions)
     Utils.tryWithResource(SparkKubernetesClientFactory.createKubernetesClient(
         master,
@@ -346,16 +367,26 @@ private[spark] object Client {
           appName,
           kubernetesResourceNamePrefix,
           kubernetesAppId,
+          mainAppResource,
+          pythonResource,
           mainClass,
           sparkConf,
           appArgs,
-          sparkJars,
-          sparkFiles,
           waitForAppCompletion,
           kubernetesClient,
           initContainerComponentsProvider,
           kubernetesCredentialsMounterProvider,
           loggingPodStatusWatcher).run()
     }
   }
+  /** Requires that no two of the given URIs resolve to the same base file name. */
+  private def validateNoDuplicateFileNames(allFiles: Seq[String]): Unit = {
+    // Pair every URI with its base file name, then check that each name occurs exactly once;
+    // `require` raises IllegalArgumentException for a name that is shared.
+    val namedUris = allFiles.map(uri => (new File(Utils.resolveURI(uri).getPath).getName, uri))
+    for ((fileName, urisWithFileName) <- namedUris.groupBy { case (name, _) => name }) {
+      require(urisWithFileName.size == 1, "Cannot add multiple files with the same name, but" +
+        s" file name $fileName is shared by all of these URIs: $urisWithFileName")
+    }
+  }
 }
diff --git a/...ain/scala/org/apache/spark/deploy/kubernetes/submit/ContainerLocalizedFilesResolver.scala b/...ain/scala/org/apache/spark/deploy/kubernetes/submit/ContainerLocalizedFilesResolver.scala
@@ -24,13 +24,19 @@ private[spark] trait ContainerLocalizedFilesResolver {
   def resolveSubmittedAndRemoteSparkJars(): Seq[String]
   def resolveSubmittedSparkJars(): Seq[String]
   def resolveSubmittedSparkFiles(): Seq[String]
+  def resolveSubmittedPySparkFiles(): Seq[String]
+  def resolvePrimaryResourceFile(): String
 }
 
 private[spark] class ContainerLocalizedFilesResolverImpl(
     sparkJars: Seq[String],
     sparkFiles: Seq[String],
+    pySparkFiles: Seq[String],
+    primaryPyFile: String,
     jarsDownloadPath: String,
-    filesDownloadPath: String) extends ContainerLocalizedFilesResolver {
+    filesDownloadPath: String
+    ) extends ContainerLocalizedFilesResolver {
+
 
   override def resolveSubmittedAndRemoteSparkJars(): Seq[String] = {
     sparkJars.map { jar =>
@@ -53,16 +59,33 @@ private[spark] class ContainerLocalizedFilesResolverImpl(
     resolveSubmittedFiles(sparkFiles, filesDownloadPath)
   }
 
-  private def resolveSubmittedFiles(files: Seq[String], downloadPath: String): Seq[String] = {
-    files.map { file =>
-      val fileUri = Utils.resolveURI(file)
-      Option(fileUri.getScheme).getOrElse("file") match {
-        case "file" =>
-          val fileName = new File(fileUri.getPath).getName
-          s"$downloadPath/$fileName"
-        case _ =>
-          file
-      }
+  override def resolveSubmittedPySparkFiles(): Seq[String] = {
+    // The primary Python file is handled by resolvePrimaryResourceFile(), so it is dropped here
+    // and only the remaining Python files are resolved against the files download path.
+    pySparkFiles
+      .filterNot(_ == primaryPyFile)
+      .map(resolveFile(_, filesDownloadPath))
+  }
+
+  override def resolvePrimaryResourceFile(): String = {
+    Option(primaryPyFile) match {
+      case Some(primary) => resolveFile(primary, filesDownloadPath)
+      case None => "" // primaryPyFile is null: there is nothing to resolve
     }
   }
+
+  /** Rewrites "file"/scheme-less URIs to downloadPath/<base name>; other URIs are unchanged. */
+  private def resolveFile(file: String, downloadPath: String) = {
+    val uri = Utils.resolveURI(file)
+    val scheme = Option(uri.getScheme).getOrElse("file")
+    if (scheme == "file") {
+      s"$downloadPath/${new File(uri.getPath).getName}"
+    } else {
+      file
+    }
+  }
+
+  private def resolveSubmittedFiles(files: Seq[String], downloadPath: String): Seq[String] = {
+    for (submitted <- files) yield resolveFile(submitted, downloadPath) // same order as input
+  }
 }
-Original file line number
+Diff line change
@@ Expand Up @@
     This is a collaborative effort by several folks from different companies who are interested in seeing this feature be successful.  Companies active in this project include (alphabetically):
+    - Bloomberg
     - Google
     - Haiwen
     - Hyperpilot
@@ Expand Down @@