apache-spark-on-k8s · ifilonenko · Jun 16, 2017 · Jun 17, 2017 · Jun 17, 2017 · Jun 20, 2017
diff --git a/README.md b/README.md
@@ -24,6 +24,7 @@ We've been asked by an Apache Spark Committer to work outside of the Apache infr
 
 This is a collaborative effort by several folks from different companies who are interested in seeing this feature be successful.  Companies active in this project include (alphabetically):
 
+- Bloomberg
 - Google
 - Haiwen
 - Hyperpilot

diff --git a/core/src/main/scala/org/apache/spark/deploy/SparkSubmit.scala b/core/src/main/scala/org/apache/spark/deploy/SparkSubmit.scala
@@ -335,8 +335,8 @@ object SparkSubmit {
     (clusterManager, deployMode) match {
       case (KUBERNETES, CLIENT) =>
         printErrorAndExit("Client mode is currently not supported for Kubernetes.")
-      case (KUBERNETES, CLUSTER) if args.isPython || args.isR =>
-        printErrorAndExit("Kubernetes does not currently support python or R applications.")
+      case (KUBERNETES, CLUSTER) if args.isR =>
+        printErrorAndExit("Kubernetes does not currently support R applications.")
       case (STANDALONE, CLUSTER) if args.isPython =>
         printErrorAndExit("Cluster deploy mode is currently not supported for python " +
           "applications on standalone clusters.")
@@ -620,8 +620,14 @@ object SparkSubmit {
 
     if (isKubernetesCluster) {
       childMainClass = "org.apache.spark.deploy.kubernetes.submit.Client"
-      childArgs += args.primaryResource
-      childArgs += args.mainClass
+      if (args.isPython) {
+        childArgs += args.primaryResource
+        childArgs += "org.apache.spark.deploy.PythonRunner"
+        childArgs += args.pyFiles
+      } else {
+        childArgs += args.primaryResource
+        childArgs += args.mainClass
+      }
       childArgs ++= args.childArgs
     }
 

diff --git a/...anagers/kubernetes/core/src/main/scala/org/apache/spark/deploy/kubernetes/constants.scala b/...anagers/kubernetes/core/src/main/scala/org/apache/spark/deploy/kubernetes/constants.scala
@@ -67,6 +67,8 @@ package object constants {
   private[spark] val ENV_DRIVER_ARGS = "SPARK_DRIVER_ARGS"
   private[spark] val ENV_DRIVER_JAVA_OPTS = "SPARK_DRIVER_JAVA_OPTS"
   private[spark] val ENV_MOUNTED_FILES_DIR = "SPARK_MOUNTED_FILES_DIR"
+  private[spark] val ENV_PYSPARK_FILES = "PYSPARK_FILES"
+  private[spark] val ENV_PYSPARK_PRIMARY = "PYSPARK_PRIMARY"
 
   // Bootstrapping dependencies with the init-container
   private[spark] val INIT_CONTAINER_ANNOTATION = "pod.beta.kubernetes.io/init-containers"

diff --git a/...ers/kubernetes/core/src/main/scala/org/apache/spark/deploy/kubernetes/submit/Client.scala b/...ers/kubernetes/core/src/main/scala/org/apache/spark/deploy/kubernetes/submit/Client.scala
@@ -47,11 +47,14 @@ private[spark] class Client(
     appName: String,
     kubernetesResourceNamePrefix: String,
     kubernetesAppId: String,
+    mainAppResource: String,
+    isPython: Boolean,
     mainClass: String,
     sparkConf: SparkConf,
     appArgs: Array[String],
     sparkJars: Seq[String],
     sparkFiles: Seq[String],
+    pySparkFiles: List[String],
     waitForAppCompletion: Boolean,
     kubernetesClient: KubernetesClient,
     initContainerComponentsProvider: DriverInitContainerComponentsProvider,
@@ -83,7 +86,14 @@ private[spark] class Client(
   def run(): Unit = {
     validateNoDuplicateFileNames(sparkJars)
     validateNoDuplicateFileNames(sparkFiles)
-
+    if (isPython) {validateNoDuplicateFileNames(pySparkFiles)}
+    // Arguments handed to the driver. Non-Python applications, and Python applications whose
+    // only PySpark file is the primary resource itself, receive appArgs unchanged; every other
+    // Python submission has the first element of appArgs removed.
+    val arguments =
+      if (!isPython) {
+        appArgs
+      } else {
+        pySparkFiles match {
+          case Nil => appArgs
+          case onlyFile :: Nil if onlyFile == mainAppResource => appArgs
+          case _ => appArgs.drop(1)
+        }
+      }
     val driverCustomLabels = ConfigurationUtils.combinePrefixedKeyValuePairsWithDeprecatedConf(
       sparkConf,
       KUBERNETES_DRIVER_LABEL_PREFIX,
@@ -135,7 +145,7 @@ private[spark] class Client(
         .endEnv()
       .addNewEnv()
         .withName(ENV_DRIVER_ARGS)
-        .withValue(appArgs.mkString(" "))
+        .withValue(arguments.mkString(" "))
         .endEnv()
       .withNewResources()
         .addToRequests("cpu", driverCpuQuantity)
@@ -173,10 +183,12 @@ private[spark] class Client(
         .bootstrapInitContainerAndVolumes(driverContainer.getName, basePod)
 
     val containerLocalizedFilesResolver = initContainerComponentsProvider
-        .provideContainerLocalizedFilesResolver()
+        .provideContainerLocalizedFilesResolver(mainAppResource)
     val resolvedSparkJars = containerLocalizedFilesResolver.resolveSubmittedSparkJars()
     val resolvedSparkFiles = containerLocalizedFilesResolver.resolveSubmittedSparkFiles()
-
+    val resolvedPySparkFiles = containerLocalizedFilesResolver.resolveSubmittedPySparkFiles()
+    val resolvedPrimaryPySparkResource = if (!isPython) ""
+     else { containerLocalizedFilesResolver.resolvePrimaryResourceFile() }
     val executorInitContainerConfiguration = initContainerComponentsProvider
         .provideExecutorInitContainerConfiguration()
     val sparkConfWithExecutorInit = executorInitContainerConfiguration
@@ -204,7 +216,7 @@ private[spark] class Client(
     val resolvedDriverJavaOpts = resolvedSparkConf.getAll.map {
       case (confKey, confValue) => s"-D$confKey=$confValue"
     }.mkString(" ") + driverJavaOptions.map(" " + _).getOrElse("")
-    val resolvedDriverPod = podWithInitContainerAndMountedCreds.editSpec()
+    val resolvedDriverPodBuilder = podWithInitContainerAndMountedCreds.editSpec()
       .editMatchingContainer(new ContainerNameEqualityPredicate(driverContainer.getName))
         .addNewEnv()
           .withName(ENV_MOUNTED_CLASSPATH)
@@ -216,7 +228,18 @@ private[spark] class Client(
           .endEnv()
         .endContainer()
       .endSpec()
-      .build()
+    val resolvedDriverPod = if (!isPython) {
+      resolvedDriverPodBuilder.build()
+    } else {
+      initContainerComponentsProvider
+        .provideDriverPodFileMounter()
+        .addPySparkFiles(
+          resolvedPrimaryPySparkResource,
+          resolvedPySparkFiles.mkString(","),
+          driverContainer.getName,
+          resolvedDriverPodBuilder)
+        .build()
+    }
     Utils.tryWithResource(
         kubernetesClient
             .pods()
@@ -274,22 +297,31 @@ private[spark] object Client {
     val appArgs = args.drop(2)
     run(sparkConf, mainAppResource, mainClass, appArgs)
   }
-
   def run(
       sparkConf: SparkConf,
       mainAppResource: String,
       mainClass: String,
       appArgs: Array[String]): Unit = {
-    val sparkJars = sparkConf.getOption("spark.jars")
-      .map(_.split(","))
-      .getOrElse(Array.empty[String]) ++
-      Option(mainAppResource)
+    // Null-safe check, consistent with the Option(mainAppResource) handling just below: a
+    // missing primary resource is simply treated as a non-Python application.
+    val isPython = Option(mainAppResource).exists(_.endsWith(".py"))
+    // Jars listed in spark.jars are always kept, since you might need jars for SQL UDFs in
+    // PySpark. Only the primary resource is filtered here: a Python primary resource is not
+    // a jar, so it contributes nothing; otherwise it is appended unless it is NO_RESOURCE.
+    def sparkJarFilter() : Seq[String] = isPython match {
+      case true => Seq.empty[String]
+      case false => Option(mainAppResource)
         .filterNot(_ == SparkLauncher.NO_RESOURCE)
         .toSeq
+    }
+    val sparkJars = sparkConf.getOption("spark.jars")
+      .map(_.split(","))
+      .getOrElse(Array.empty[String]) ++ sparkJarFilter()
     val launchTime = System.currentTimeMillis
     val sparkFiles = sparkConf.getOption("spark.files")
       .map(_.split(","))
       .getOrElse(Array.empty[String])
+    // For Python applications the first application argument holds the comma-separated list
+    // of additional Python files. It may be null or missing entirely, in which case only the
+    // primary resource is used. headOption avoids indexing into an empty argument array.
+    val pySparkFiles: Array[String] = if (isPython) {
+      appArgs.headOption.flatMap(Option(_)) match {
+        case Some(submittedPyFiles) => mainAppResource +: submittedPyFiles.split(",")
+        case None => Array(mainAppResource)
+      }
+    } else {
+      Array.empty[String]
+    }
     val appName = sparkConf.getOption("spark.app.name").getOrElse("spark")
     // The resource name prefix is derived from the application name, making it easy to connect the
     // names of the Kubernetes resources from e.g. Kubectl or the Kubernetes dashboard to the
@@ -308,6 +340,7 @@ private[spark] object Client {
         namespace,
         sparkJars,
         sparkFiles,
+        pySparkFiles,
         sslOptionsProvider.getSslOptions)
     Utils.tryWithResource(SparkKubernetesClientFactory.createKubernetesClient(
         master,
@@ -328,11 +361,14 @@ private[spark] object Client {
           appName,
           kubernetesResourceNamePrefix,
           kubernetesAppId,
+          mainAppResource,
+          isPython,
           mainClass,
           sparkConf,
           appArgs,
           sparkJars,
           sparkFiles,
+          pySparkFiles.toList,
           waitForAppCompletion,
           kubernetesClient,
           initContainerComponentsProvider,

diff --git a/...ain/scala/org/apache/spark/deploy/kubernetes/submit/ContainerLocalizedFilesResolver.scala b/...ain/scala/org/apache/spark/deploy/kubernetes/submit/ContainerLocalizedFilesResolver.scala
@@ -24,13 +24,19 @@ private[spark] trait ContainerLocalizedFilesResolver {
   def resolveSubmittedAndRemoteSparkJars(): Seq[String]
   def resolveSubmittedSparkJars(): Seq[String]
   def resolveSubmittedSparkFiles(): Seq[String]
+  def resolveSubmittedPySparkFiles(): Seq[String]
+  def resolvePrimaryResourceFile(): String
 }
 
 private[spark] class ContainerLocalizedFilesResolverImpl(
     sparkJars: Seq[String],
     sparkFiles: Seq[String],
+    pySparkFiles: Seq[String],
+    primaryPyFile: String,
     jarsDownloadPath: String,
-    filesDownloadPath: String) extends ContainerLocalizedFilesResolver {
+    filesDownloadPath: String
+    ) extends ContainerLocalizedFilesResolver {
+
 
   override def resolveSubmittedAndRemoteSparkJars(): Seq[String] = {
     sparkJars.map { jar =>
@@ -53,16 +59,33 @@ private[spark] class ContainerLocalizedFilesResolverImpl(
     resolveSubmittedFiles(sparkFiles, filesDownloadPath)
   }
 
-  private def resolveSubmittedFiles(files: Seq[String], downloadPath: String): Seq[String] = {
-    files.map { file =>
-      val fileUri = Utils.resolveURI(file)
-      Option(fileUri.getScheme).getOrElse("file") match {
-        case "file" =>
-          val fileName = new File(fileUri.getPath).getName
-          s"$downloadPath/$fileName"
-        case _ =>
-          file
-      }
+  /**
+   * Maps every PySpark file other than the primary Python file through resolveFile
+   * against filesDownloadPath, preserving the original order.
+   */
+  override def resolveSubmittedPySparkFiles(): Seq[String] = {
+    pySparkFiles
+      .filterNot(_ == primaryPyFile)
+      .map(resolveFile(_, filesDownloadPath))
+  }
+
+  /**
+   * Resolves the primary Python file against filesDownloadPath via resolveFile.
+   *
+   * @return the resolved location, or an empty string when primaryPyFile is null
+   */
+  override def resolvePrimaryResourceFile(): String = {
+    Option(primaryPyFile) match {
+      case None => ""
+      case Some(p) => resolveFile(p, filesDownloadPath)
     }
   }
+
+  /**
+   * Resolves a single file reference. A URI with the "file" scheme (or with no scheme at
+   * all) is rewritten to its bare file name under downloadPath; any other URI is returned
+   * unchanged.
+   */
+  private def resolveFile(file: String, downloadPath: String): String = {
+    val uri = Utils.resolveURI(file)
+    val scheme = Option(uri.getScheme).getOrElse("file")
+    if (scheme == "file") {
+      val baseName = new File(uri.getPath).getName
+      s"$downloadPath/$baseName"
+    } else {
+      file
+    }
+  }
+
+  /** Applies resolveFile to each entry of files, using the same downloadPath for all. */
+  private def resolveSubmittedFiles(files: Seq[String], downloadPath: String): Seq[String] =
+    files.map(resolveFile(_, downloadPath))
 }
diff --git a/...ala/org/apache/spark/deploy/kubernetes/submit/DriverInitContainerComponentsProvider.scala b/...ala/org/apache/spark/deploy/kubernetes/submit/DriverInitContainerComponentsProvider.scala
@@ -33,14 +33,17 @@ private[spark] trait DriverInitContainerComponentsProvider {
   def provideInitContainerConfigMapBuilder(
       maybeSubmittedResourceIds: Option[SubmittedResourceIds])
       : SparkInitContainerConfigMapBuilder
-  def provideContainerLocalizedFilesResolver(): ContainerLocalizedFilesResolver
+  def provideContainerLocalizedFilesResolver(
+      mainAppResource: String) : ContainerLocalizedFilesResolver
   def provideExecutorInitContainerConfiguration(): ExecutorInitContainerConfiguration
   def provideInitContainerSubmittedDependencyUploader(
       driverPodLabels: Map[String, String]): Option[SubmittedDependencyUploader]
   def provideSubmittedDependenciesSecretBuilder(
       maybeSubmittedResourceSecrets: Option[SubmittedResourceSecrets])
       : Option[SubmittedDependencySecretBuilder]
   def provideInitContainerBootstrap(): SparkPodInitContainerBootstrap
+  def provideDriverPodFileMounter(): DriverPodKubernetesFileMounter
+
 }
 
 private[spark] class DriverInitContainerComponentsProviderImpl(
@@ -49,6 +52,7 @@ private[spark] class DriverInitContainerComponentsProviderImpl(
       namespace: String,
       sparkJars: Seq[String],
       sparkFiles: Seq[String],
+      pySparkFiles: Seq[String],
       resourceStagingServerExternalSslOptions: SSLOptions)
     extends DriverInitContainerComponentsProvider {
 
@@ -104,6 +108,7 @@ private[spark] class DriverInitContainerComponentsProviderImpl(
   private val initContainerImage = sparkConf.get(INIT_CONTAINER_DOCKER_IMAGE)
   private val dockerImagePullPolicy = sparkConf.get(DOCKER_IMAGE_PULL_POLICY)
   private val downloadTimeoutMinutes = sparkConf.get(INIT_CONTAINER_MOUNT_TIMEOUT)
+  private val pySparkSubmitted = KubernetesFileUtils.getOnlySubmitterLocalFiles(pySparkFiles)
 
   override def provideInitContainerConfigMapBuilder(
       maybeSubmittedResourceIds: Option[SubmittedResourceIds])
@@ -131,17 +136,18 @@ private[spark] class DriverInitContainerComponentsProviderImpl(
     }
     new SparkInitContainerConfigMapBuilderImpl(
         sparkJars,
-        sparkFiles,
+        sparkFiles ++ pySparkSubmitted,
         jarsDownloadPath,
         filesDownloadPath,
         configMapName,
         configMapKey,
         submittedDependencyConfigPlugin)
   }
 
-  override def provideContainerLocalizedFilesResolver(): ContainerLocalizedFilesResolver = {
+  override def provideContainerLocalizedFilesResolver(mainAppResource: String)
+    : ContainerLocalizedFilesResolver = {
     new ContainerLocalizedFilesResolverImpl(
-        sparkJars, sparkFiles, jarsDownloadPath, filesDownloadPath)
+        sparkJars, sparkFiles, pySparkFiles, mainAppResource, jarsDownloadPath, filesDownloadPath)
   }
 
   override def provideExecutorInitContainerConfiguration(): ExecutorInitContainerConfiguration = {
@@ -160,7 +166,7 @@ private[spark] class DriverInitContainerComponentsProviderImpl(
           namespace,
           stagingServerUri,
           sparkJars,
-          sparkFiles,
+          sparkFiles ++ pySparkSubmitted,
           resourceStagingServerExternalSslOptions,
           RetrofitClientFactoryImpl)
     }
@@ -202,4 +208,7 @@ private[spark] class DriverInitContainerComponentsProviderImpl(
         configMapKey,
         resourceStagingServerSecretPlugin)
   }
+  /** Creates a new DriverPodKubernetesFileMounterImpl on every call. */
+  override def provideDriverPodFileMounter(): DriverPodKubernetesFileMounter =
+    new DriverPodKubernetesFileMounterImpl()
 }
diff --git a/...main/scala/org/apache/spark/deploy/kubernetes/submit/DriverPodKubernetesFileMounter.scala b/...main/scala/org/apache/spark/deploy/kubernetes/submit/DriverPodKubernetesFileMounter.scala
@@ -0,0 +1,55 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License.  You may obtain a copy of the License at
+ *
+ *    http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.spark.deploy.kubernetes.submit
+
+import io.fabric8.kubernetes.api.model.{Container, PodBuilder}
+
+import org.apache.spark.deploy.kubernetes.constants._
+
+ /**
+  * Trait that is responsible for providing full file paths dynamically after
+  * the filesDownloadPath has been defined. The file names are then stored in
+  * environment variables in the driver pod.
+  */
+private[spark] trait DriverPodKubernetesFileMounter {
+  /**
+   * @param primaryFile location of the primary Python file
+   * @param pySparkFiles locations of the additional PySpark files, passed as a single string
+   * @param mainContainerName name of the container in the pod that should be edited
+   * @param originalPodSpec pod builder to extend
+   * @return a pod builder carrying the additional configuration
+   */
+  def addPySparkFiles(primaryFile: String, pySparkFiles: String,
+    mainContainerName: String, originalPodSpec: PodBuilder) : PodBuilder
+}
+
+/**
+ * Default implementation: records the PySpark file locations as environment variables on
+ * the container of the pod whose name matches mainContainerName.
+ */
+private[spark] class DriverPodKubernetesFileMounterImpl()
+  extends DriverPodKubernetesFileMounter {
+  /**
+   * Sets ENV_PYSPARK_PRIMARY to primaryFile and ENV_PYSPARK_FILES to pySparkFiles on the
+   * matching container. Both values are written as given; no validation is done here.
+   */
+  override def addPySparkFiles(
+        primaryFile: String,
+        pySparkFiles: String,
+        mainContainerName: String,
+        originalPodSpec: PodBuilder): PodBuilder = {
+
+    originalPodSpec
+      .editSpec()
+        // Only the container selected by name is edited.
+        .editMatchingContainer(new ContainerNameEqualityPredicate(mainContainerName))
+          .addNewEnv()
+            .withName(ENV_PYSPARK_PRIMARY)
+            .withValue(primaryFile)
+          .endEnv()
+          .addNewEnv()
+            .withName(ENV_PYSPARK_FILES)
+            .withValue(pySparkFiles)
+          .endEnv()
+        .endContainer()
+      .endSpec()
+  }
+}
-Original file line number
+Diff line change
@@ Expand Up @@
     This is a collaborative effort by several folks from different companies who are interested in seeing this feature be successful.  Companies active in this project include (alphabetically):
+    - Bloomberg
     - Google
     - Haiwen
     - Hyperpilot
@@ Expand Down @@