apache · chamikaramj · Oct 17, 2022 · Oct 13, 2022 · Oct 13, 2022 · Oct 13, 2022
diff --git a/examples/java/build.gradle b/examples/java/build.gradle
@@ -57,6 +57,7 @@ dependencies {
   implementation library.java.kafka_clients
   implementation project(path: ":sdks:java:core", configuration: "shadow")
   implementation project(":sdks:java:extensions:google-cloud-platform-core")
+  implementation project(":sdks:java:extensions:python")
   implementation project(":sdks:java:io:google-cloud-platform")
   implementation project(":sdks:java:io:kafka")
   implementation project(":sdks:java:extensions:ml")

diff --git a/...java/src/main/java/org/apache/beam/examples/multilanguage/SklearnMnistClassification.java b/...java/src/main/java/org/apache/beam/examples/multilanguage/SklearnMnistClassification.java
@@ -0,0 +1,161 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements.  See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership.  The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.beam.examples.multilanguage;
+
+import java.util.ArrayList;
+import java.util.List;
+import org.apache.beam.sdk.Pipeline;
+import org.apache.beam.sdk.coders.VarLongCoder;
+import org.apache.beam.sdk.extensions.python.transforms.RunInference;
+import org.apache.beam.sdk.io.TextIO;
+import org.apache.beam.sdk.options.Default;
+import org.apache.beam.sdk.options.Description;
+import org.apache.beam.sdk.options.PipelineOptions;
+import org.apache.beam.sdk.options.PipelineOptionsFactory;
+import org.apache.beam.sdk.options.Validation.Required;
+import org.apache.beam.sdk.schemas.Schema;
+import org.apache.beam.sdk.schemas.Schema.FieldType;
+import org.apache.beam.sdk.transforms.Filter;
+import org.apache.beam.sdk.transforms.MapElements;
+import org.apache.beam.sdk.transforms.SerializableFunction;
+import org.apache.beam.sdk.transforms.SimpleFunction;
+import org.apache.beam.sdk.values.KV;
+import org.apache.beam.sdk.values.PCollection;
+import org.apache.beam.sdk.values.Row;
+import org.apache.beam.vendor.guava.v26_0_jre.com.google.common.base.Splitter;
+
+/**
+ * An example Java multi-language pipeline that performs image classification on handwritten digits
+ * from the <a href="https://en.wikipedia.org/wiki/MNIST_database">MNIST</a> database.
+ *
+ * <p>For more details and instructions for running this please see <a
+ * href="https://github.com/apache/beam/tree/master/examples/multi-language">here</a>.
+ */
+public class SklearnMnistClassification {
+
+  /**
+   * Builds the Python source of {@code get_model_handler(model_uri)}, which returns a keyed
+   * sklearn model handler, for instantiating {@link RunInference}. Note that {@code RunInference}
+   * can be instantiated with any arbitrary function that produces a model loader.
+   */
+  private String getModelLoaderScript() {
+    String s = "from apache_beam.ml.inference.sklearn_inference import SklearnModelHandlerNumpy\n";
+    s = s + "from apache_beam.ml.inference.base import KeyedModelHandler\n";
+    s = s + "def get_model_handler(model_uri):\n";
+    s = s + "  return KeyedModelHandler(SklearnModelHandlerNumpy(model_uri))\n";
+
+    return s;
+  }
+
+  /** Keeps only data records by dropping lines that start with {@code label} (the CSV header). */
+  static class FilterNonRecordsFn implements SerializableFunction<String, Boolean> {
+
+    @Override
+    public Boolean apply(String input) {
+      return !input.startsWith("label");
+    }
+  }
+
+  /**
+   * Separates an input record into label and data. Each input record is a set of comma separated
+   * string digits where the first digit is the label and the rest are data (pixels that represent
+   * the digit).
+   */
+  static class RecordsToLabeledPixelsFn extends SimpleFunction<String, KV<Long, Iterable<Long>>> {
+
+    @Override
+    public KV<Long, Iterable<Long>> apply(String input) {
+      String[] data = Splitter.on(',').splitToList(input).toArray(new String[] {});
+      Long label = Long.valueOf(data[0]);
+      List<Long> pixels = new ArrayList<Long>();
+      for (int i = 1; i < data.length; i++) {
+        pixels.add(Long.valueOf(data[i]));
+      }
+
+      return KV.of(label, pixels);
+    }
+  }
+
+  /** Formats the output as "expected digit,inferred digit" using the row's "inference" field. */
+  static class FormatOutput extends SimpleFunction<KV<Long, Row>, String> {
+
+    @Override
+    public String apply(KV<Long, Row> input) {
+      return input.getKey() + "," + input.getValue().getString("inference");
+    }
+  }
+
+  void runExample(SklearnMnistClassificationOptions options, String expansionService) {
+    // Schema of the output PCollection Row type to be provided to the RunInference transform.
+    Schema schema =
+        Schema.of(
+            Schema.Field.of("example", Schema.FieldType.array(Schema.FieldType.INT64)),
+            Schema.Field.of("inference", FieldType.STRING));
+
+    Pipeline pipeline = Pipeline.create(options);
+    PCollection<KV<Long, Iterable<Long>>> col =
+        pipeline
+            .apply(TextIO.read().from(options.getInput()))
+            .apply(Filter.by(new FilterNonRecordsFn()))
+            .apply(MapElements.via(new RecordsToLabeledPixelsFn()));
+    col.apply(
+            RunInference.ofKVs(getModelLoaderScript(), schema, VarLongCoder.of())
+                .withKwarg("model_uri", options.getModelPath())
+                .withExpansionService(expansionService))
+        .apply(MapElements.via(new FormatOutput()))
+        .apply(TextIO.write().to(options.getOutput()));
+
+    pipeline.run().waitUntilFinish();
+  }
+
+  public interface SklearnMnistClassificationOptions extends PipelineOptions {
+
+    @Description("Path to an input file that contains labels and pixels to feed into the model")
+    @Default.String("gs://apache-beam-samples/multi-language/mnist/example_input.csv")
+    String getInput();
+
+    void setInput(String value);
+
+    @Description("Path for storing the output")
+    @Required
+    String getOutput();
+
+    void setOutput(String value);
+
+    @Description(
+        "Path to a model file that contains the pickled file of a scikit-learn model trained on MNIST data")
+    @Default.String("gs://apache-beam-samples/multi-language/mnist/example_model")
+    String getModelPath();
+
+    void setModelPath(String value);
+
+    /** Set this option to specify the Python expansion service URL; defaults to an empty string. */
+    @Description("URL of Python expansion service")
+    @Default.String("")
+    String getExpansionService();
+
+    void setExpansionService(String value);
+  }
+
+  public static void main(String[] args) {
+    SklearnMnistClassificationOptions options =
+        PipelineOptionsFactory.fromArgs(args).as(SklearnMnistClassificationOptions.class);
+    SklearnMnistClassification example = new SklearnMnistClassification();
+    example.runExample(options, options.getExpansionService());
+  }
+}
diff --git a/examples/multi-language/README.md b/examples/multi-language/README.md
@@ -22,29 +22,147 @@
 This project provides examples of Apache Beam
 [multi-language pipelines](https://beam.apache.org/documentation/programming-guide/#multi-language-pipelines):
 
+## Using Java transforms from Python
+
 * **python/addprefix** - A Python pipeline that reads a text file and attaches a prefix on the Java side to each input.
 * **python/javacount** - A Python pipeline that counts words using the Java `Count.perElement()` transform.
 * **python/javadatagenerator** - A Python pipeline that produces a set of strings generated from Java.
                                   This example demonstrates the `JavaExternalTransform` API.
 
-## Instructions for running the pipelines
+### Instructions for running the pipelines
 
-### 1) Start the expansion service
+#### 1) Start the expansion service
 
 1. Download the latest 'beam-examples-multi-language' JAR. Starting with Apache Beam 2.36.0,
    you can find it in [the Maven Central Repository](https://search.maven.org/search?q=g:org.apache.beam).
 2. Run the following command, replacing `<version>` and `<port>` with valid values:
   `java -jar beam-examples-multi-language-<version>.jar <port> --javaClassLookupAllowlistFile='*'`
 
-### 2) Set up a Python virtual environment for Beam
+#### 2) Set up a Python virtual environment for Beam
 
 1. See [the Python quickstart](https://beam.apache.org/get-started/quickstart-py/)
    for more information.
 
-### 3) Execute the Python pipeline
+#### 3) Execute the Python pipeline
 
 1. In a new shell, run a pipeline in the **python** directory using a Beam runner that supports
    multi-language pipelines.
 
    The Python files contain details about the actual commands to run.
 
+## Using Python transforms from Java
+
+### Sklearn Mnist Classification
+
+Performs image classification on handwritten digits from the [MNIST](https://en.wikipedia.org/wiki/MNIST_database)
+database.
+
+Please see [here](https://github.com/apache/beam/tree/master/sdks/python/apache_beam/examples/inference) for
+context and information regarding the corresponding Python pipeline.
+
+Please note that the Java pipeline is
+[available in the Beam Java examples module](https://github.com/apache/beam/tree/master/examples/java/src/main/java/org/apache/beam/examples/multilanguage/SklearnMnistClassification.java).
+
+#### Setup
+
+* Obtain/generate a csv input file that contains labels and pixels to feed into the model and store it in
+GCS. An example input is available
+[here](https://storage.googleapis.com/apache-beam-samples/multi-language/mnist/example_input.csv).
+
+* Create a model file that contains the pickled file of a scikit-learn model
+trained on MNIST data and store it in GCS. An example model file is available
+[here](https://storage.googleapis.com/apache-beam-samples/multi-language/mnist/example_model).
+This model was generated by running the program given
+[here](https://python-course.eu/machine-learning/training-and-testing-with-mnist.php)
+on the
+[example input dataset](https://storage.googleapis.com/apache-beam-samples/multi-language/mnist/example_input.csv).
+
+* Perform Beam runner specific setup according to instructions
+[here](https://beam.apache.org/get-started/quickstart-java/#run-a-pipeline).
+
+The following instructions are for running the pipeline with the Dataflow runner. For other portable
+runners, please modify the instructions according to the guidelines
+[here](https://beam.apache.org/documentation/sdks/java-multi-language-pipelines/#run-with-directrunner).
+
+#### Instructions for running the Java pipeline on released Beam (Beam 2.43.0 and later).
+
+* Generate a project from the Beam examples Maven archetype for the relevant Beam version.
+
+```
+export BEAM_VERSION=<Beam version>
+
+mvn archetype:generate \
+    -DarchetypeGroupId=org.apache.beam \
+    -DarchetypeArtifactId=beam-sdks-java-maven-archetypes-examples \
+    -DarchetypeVersion=$BEAM_VERSION \
+    -DgroupId=org.example \
+    -DartifactId=multi-language-beam \
+    -Dversion="0.1" \
+    -Dpackage=org.apache.beam.examples \
+    -DinteractiveMode=false
+```
+
+* Run the pipeline.
+
+```
+export GCP_PROJECT=<GCP project>
+export GCP_BUCKET=<GCP bucket>
+export GCP_REGION=<GCP region>
+
+mvn compile exec:java -Dexec.mainClass=org.apache.beam.examples.multilanguage.SklearnMnistClassification \
+    -Dexec.args="--runner=DataflowRunner --project=$GCP_PROJECT \
+                 --region=$GCP_REGION \
+                 --gcpTempLocation=gs://$GCP_BUCKET/multi-language-beam/tmp \
+                 --output=gs://$GCP_BUCKET/multi-language-beam/output" \
+    -Pdataflow-runner
+```
+
+* Inspect the output. Each line has data separated by a comma ",". The first item is the actual label of
+the digit. The second item is the predicted label of the digit.
+
+```
+gsutil cat gs://$GCP_BUCKET/multi-language-beam/output*
+```
+
+#### Instructions for running the Java pipeline at HEAD (Beam 2.41.0 and 2.42.0).
+
+* Make sure that Docker is installed and available on your system.
+
+* Build and push Python and Java Docker containers.
+
+```
+export DOCKER_ROOT=<Docker root>
+
+./gradlew :sdks:python:container:py38:docker -Pdocker-repository-root=$DOCKER_ROOT -Pdocker-tag=latest
+
+docker push $DOCKER_ROOT/beam_python3.8_sdk:latest
+
+./gradlew :sdks:java:container:java11:docker -Pdocker-repository-root=$DOCKER_ROOT -Pdocker-tag=latest
+
+docker push $DOCKER_ROOT/beam_java11_sdk:latest
+```
+
+* Run the pipeline using the following Gradle command (this guide assumes Dataflow runner).
+Note that we override both the Java and Python SDK harness containers here.
+
+```
+export GCP_PROJECT=<GCP project>
+export GCP_BUCKET=<GCP bucket>
+export GCP_REGION=<GCP region>
+
+./gradlew :examples:multi-language:sklearnMinstClassification --args=" \
+--runner=DataflowRunner \
+--project=$GCP_PROJECT \
+--gcpTempLocation=gs://$GCP_BUCKET/multi-language-beam/tmp \
+--output=gs://$GCP_BUCKET/multi-language-beam/output \
+--sdkContainerImage=$DOCKER_ROOT/beam_java11_sdk:latest \
+--sdkHarnessContainerImageOverrides=.*python.*,$DOCKER_ROOT/beam_python3.8_sdk:latest \
+--region=${GCP_REGION}"
+```
+
+* Inspect the output. Each line has data separated by a comma ",". The first item is the actual label
+of the digit. The second item is the predicted label of the digit.
+
+```
+gsutil cat gs://$GCP_BUCKET/multi-language-beam/output*
+```
diff --git a/examples/multi-language/build.gradle b/examples/multi-language/build.gradle
@@ -34,6 +34,7 @@ ext.summary = "Java Classes for Multi-language Examples"
 dependencies {
     implementation library.java.vendored_guava_26_0_jre
     implementation project(path: ":sdks:java:core", configuration: "shadow")
+    runtimeOnly project(path: ":examples:java")
     runtimeOnly project(path: ":runners:direct-java", configuration: "shadow")
     runtimeOnly project(path: ":runners:google-cloud-dataflow-java")
     runtimeOnly project(path: ":runners:portability:java")
@@ -47,4 +48,11 @@ task pythonDataframeWordCount(type: JavaExec) {
     description "Run the Java word count example using external Python DataframeTransform"
     mainClass = "org.apache.beam.examples.multilanguage.PythonDataframeWordCount"
     classpath = sourceSets.main.runtimeClasspath
-}
+}
+
+task sklearnMinstClassification(type: JavaExec) {
+    description "Run the Java pipeline that performs image classification on handwritten digits from the MNIST database"
+    mainClass = "org.apache.beam.examples.multilanguage.SklearnMnistClassification"
+    classpath = sourceSets.main.runtimeClasspath
+}
+
diff --git a/...nguage/src/main/java/org/apache/beam/examples/multilanguage/PythonDataframeWordCount.java b/...nguage/src/main/java/org/apache/beam/examples/multilanguage/PythonDataframeWordCount.java
@@ -49,7 +49,6 @@
  * ./gradlew :examples:multi-language:pythonDataframeWordCount --args=" \
  * --runner=DataflowRunner \
  * --output=gs://{$OUTPUT_BUCKET}/count \
- * --experiments=use_runner_v2 \
  * --sdkHarnessContainerImageOverrides=.*python.*,gcr.io/apache-beam-testing/beam-sdk/beam_python{$PYTHON_VERSION}_sdk:latest"
  * }</pre>
  */

diff --git a/sdks/java/maven-archetypes/examples/generate-sources.sh b/sdks/java/maven-archetypes/examples/generate-sources.sh
@@ -70,6 +70,16 @@ rsync -a                                                                      \
     "${EXAMPLES_ROOT}"/src/test/java/org/apache/beam/examples/complete/game/  \
     "${ARCHETYPE_ROOT}/src/test/java/complete/game"
 
+#
+# Copy the Java multi-language examples.
+#
+
+mkdir -p "${ARCHETYPE_ROOT}/src/test/java/multilanguage/"
+
+rsync -a                                                                      \
+    "${EXAMPLES_ROOT}"/src/main/java/org/apache/beam/examples/multilanguage/  \
+    "${ARCHETYPE_ROOT}/src/main/java/multilanguage"
+
 #
 # Replace 'package org.apache.beam.examples' with 'package ${package}' in all Java code
 #

diff --git a/sdks/java/maven-archetypes/examples/src/main/resources/archetype-resources/pom.xml b/sdks/java/maven-archetypes/examples/src/main/resources/archetype-resources/pom.xml
@@ -357,6 +357,13 @@
       <version>${beam.version}</version>
     </dependency>
 
+    <!-- Adds a dependency on the Python Multi-language pipelines API module. -->
+    <dependency>
+      <groupId>org.apache.beam</groupId>
+      <artifactId>beam-sdks-java-extensions-python</artifactId>
+      <version>${beam.version}</version>
+    </dependency>
+
     <!-- Dependencies below this line are specific dependencies needed by the examples code. -->
     <dependency>
       <groupId>com.google.api-client</groupId>