Skip to content

Commit

Permalink
Batch Ingestion Job rewritten on Spark (#1020)
Browse files Browse the repository at this point in the history
* test scala spark

Signed-off-by: Oleksii Moskalenko <moskalenko.alexey@gmail.com>

* offline batch ingestion in spark

Signed-off-by: Oleksii Moskalenko <moskalenko.alexey@gmail.com>

* clean up

Signed-off-by: Oleksii Moskalenko <moskalenko.alexey@gmail.com>

* deduplicate rows & use latest

Signed-off-by: Oleksii Moskalenko <moskalenko.alexey@gmail.com>

* clarify

Signed-off-by: Oleksii Moskalenko <moskalenko.alexey@gmail.com>

* validation & deadletter

Signed-off-by: Oleksii Moskalenko <moskalenko.alexey@gmail.com>

* tests on mapping & deadletter

Signed-off-by: Oleksii Moskalenko <moskalenko.alexey@gmail.com>

* scala styling

Signed-off-by: Oleksii Moskalenko <moskalenko.alexey@gmail.com>

* integration test stage

Signed-off-by: Oleksii Moskalenko <moskalenko.alexey@gmail.com>

* remove version from ingestion-spark pom

Signed-off-by: Oleksii Moskalenko <moskalenko.alexey@gmail.com>

* refactor job options

Signed-off-by: Oleksii Moskalenko <moskalenko.alexey@gmail.com>

* clean up dependencies + some api docs

Signed-off-by: Oleksii Moskalenko <moskalenko.alexey@gmail.com>

* extend mapping test

Signed-off-by: Oleksii Moskalenko <moskalenko.alexey@gmail.com>

* add shade plugin version & group

Signed-off-by: Oleksii Moskalenko <moskalenko.alexey@gmail.com>

* disable buildkit on docker build

Signed-off-by: Oleksii Moskalenko <moskalenko.alexey@gmail.com>
  • Loading branch information
pyalex authored Oct 6, 2020
1 parent 8bf55de commit e47903f
Show file tree
Hide file tree
Showing 23 changed files with 2,024 additions and 21 deletions.
1 change: 0 additions & 1 deletion .github/workflows/complete.yml
Original file line number Diff line number Diff line change
Expand Up @@ -12,7 +12,6 @@ jobs:
GITHUB_PR_SHA: ${{ github.event.pull_request.head.sha }}
REGISTRY: gcr.io/kf-feast
MAVEN_CACHE: gs://feast-templocation-kf-feast/.m2.2020-08-19.tar
DOCKER_BUILDKIT: '1'
steps:
- uses: actions/checkout@v2
- uses: GoogleCloudPlatform/github-actions/setup-gcloud@master
Expand Down
2 changes: 2 additions & 0 deletions .scalafmt.conf
Original file line number Diff line number Diff line change
@@ -0,0 +1,2 @@
# scalafmt settings; the <scalafmt> formatter step in the root pom.xml points at this file.
# Use scalafmt's "more" alignment preset.
align.preset = more
# Maximum line width the formatter targets.
maxColumn = 100
52 changes: 32 additions & 20 deletions pom.xml
Original file line number Diff line number Diff line change
Expand Up @@ -39,6 +39,7 @@
<module>common</module>
<module>job-controller</module>
<module>common-test</module>
<module>spark/ingestion</module>
</modules>

<properties>
Expand Down Expand Up @@ -85,6 +86,27 @@
<org.hibernate.validator.version>6.1.2.Final</org.hibernate.validator.version>
<auto.value.version>1.6.6</auto.value.version>

<license.content><![CDATA[
/*
* SPDX-License-Identifier: Apache-2.0
* Copyright 2018-$YEAR The Feast Authors
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* https://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
]]>
</license.content>
<parent.basedir>${maven.multiModuleProjectDirectory}</parent.basedir>

<skipUTs>false</skipUTs>
<feast.auth.providers.http.client.package.name>feast.common.auth.providers.http.client</feast.auth.providers.http.client.package.name>
</properties>
Expand Down Expand Up @@ -528,26 +550,7 @@
<configuration>
<java>
<licenseHeader>
<content>
<![CDATA[
/*
* SPDX-License-Identifier: Apache-2.0
* Copyright 2018-$YEAR The Feast Authors
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* https://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
]]>
</content>
<content>${license.content}</content>
</licenseHeader>
<googleJavaFormat>
<version>1.7</version>
Expand All @@ -558,6 +561,15 @@
</excludes>
<removeUnusedImports />
</java>
<scala>
<licenseHeader>
<content>${license.content}</content>
</licenseHeader>
<scalafmt>
<version>2.7.2</version>
<file>${parent.basedir}/.scalafmt.conf</file>
</scalafmt>
</scala>
</configuration>
<executions>
<!-- Move check to fail faster, but after compilation. Default is verify phase -->
Expand Down
299 changes: 299 additions & 0 deletions spark/ingestion/pom.xml
Original file line number Diff line number Diff line change
@@ -0,0 +1,299 @@
<?xml version="1.0" encoding="UTF-8"?>
<!--
~ Copyright 2018 The Feast Authors
~
~ Licensed under the Apache License, Version 2.0 (the "License");
~ you may not use this file except in compliance with the License.
~ You may obtain a copy of the License at
~
~ https://www.apache.org/licenses/LICENSE-2.0
~
~ Unless required by applicable law or agreed to in writing, software
~ distributed under the License is distributed on an "AS IS" BASIS,
~ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
~ See the License for the specific language governing permissions and
~ limitations under the License.
~
-->
<project xmlns="http://maven.apache.org/POM/4.0.0"
xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance"
xsi:schemaLocation="http://maven.apache.org/POM/4.0.0 http://maven.apache.org/maven-v4_0_0.xsd">
<modelVersion>4.0.0</modelVersion>

<!-- Inherit from the feast-parent POM. The version is taken from the ${revision} property
     instead of being hard-coded in this module. -->
<parent>
<groupId>dev.feast</groupId>
<artifactId>feast-parent</artifactId>
<version>${revision}</version>
<!-- The parent lists this module as spark/ingestion, so its POM sits two directories up. -->
<relativePath>../..</relativePath>
</parent>

<name>Feast Spark Ingestion</name>
<artifactId>feast-ingestion-spark</artifactId>

<properties>
    <!-- Scala binary version; used as the _suffix of the Scala artifact ids in this POM. -->
    <scala.version>2.12</scala.version>
    <!-- Full Scala release; used for scala-library and for the compiler plugin's scalaVersion. -->
    <scala.fullVersion>${scala.version}.12</scala.fullVersion>
    <!-- Version shared by all org.apache.spark dependencies. -->
    <spark.version>2.4.7</spark.version>
    <!-- Plugin versions consumed by the pluginManagement section of this POM. -->
    <scala-maven-plugin.version>4.4.0</scala-maven-plugin.version>
    <maven-assembly-plugin.version>3.3.0</maven-assembly-plugin.version>
    <!-- Deliberately no "project.version" property here: ${project.version} is a built-in
         model expression that already resolves to the version inherited from the parent
         (${revision}). A hard-coded property of the same name is shadowed by the model value
         and silently goes stale when the release version changes. -->
</properties>


<!-- Dependencies of the Spark ingestion module. Scala artifacts carry the ${scala.version}
     binary suffix in their artifactId. Declaration order is kept as-is because it can
     influence dependency mediation and classpath order. -->
<dependencies>
<!-- Feast datatypes-java at this build's version. The wildcard exclusion drops every
     transitive dependency of that artifact. -->
<dependency>
<groupId>dev.feast</groupId>
<artifactId>datatypes-java</artifactId>
<version>${project.version}</version>
<exclusions>
<exclusion>
<groupId>*</groupId>
<artifactId>*</artifactId>
</exclusion>
</exclusions>
</dependency>

<!-- Declared explicitly; presumably needed because the transitive dependencies of
     datatypes-java are excluded above (TODO confirm). The shade plugin configuration in this
     POM relocates the com.google.protobuf package. -->
<dependency>
<groupId>com.google.protobuf</groupId>
<artifactId>protobuf-java</artifactId>
<version>3.12.2</version>
</dependency>

<dependency>
<groupId>org.scala-lang</groupId>
<artifactId>scala-library</artifactId>
<version>${scala.fullVersion}</version>
</dependency>

<dependency>
<groupId>org.scala-lang.modules</groupId>
<artifactId>scala-collection-compat_${scala.version}</artifactId>
<version>2.2.0</version>
</dependency>

<!-- Spark artifacts use the provided scope: they are on the compile classpath but are
     expected to be supplied by the runtime environment, so they are not packaged. -->
<dependency>
<groupId>org.apache.spark</groupId>
<artifactId>spark-core_${scala.version}</artifactId>
<version>${spark.version}</version>
<scope>provided</scope>
</dependency>

<dependency>
<groupId>org.apache.spark</groupId>
<artifactId>spark-streaming_${scala.version}</artifactId>
<version>${spark.version}</version>
<scope>provided</scope>
</dependency>

<dependency>
<groupId>org.apache.spark</groupId>
<artifactId>spark-sql_${scala.version}</artifactId>
<version>${spark.version}</version>
<scope>provided</scope>
</dependency>

<!-- NOTE(review): janino is pinned explicitly and uses the default (compile) scope; the reason
     is not visible in this POM. Confirm it is still required. -->
<dependency>
<groupId>org.codehaus.janino</groupId>
<artifactId>janino</artifactId>
<version>3.0.16</version>
</dependency>

<dependency>
<groupId>org.apache.spark</groupId>
<artifactId>spark-sql-kafka-0-10_${scala.version}</artifactId>
<version>${spark.version}</version>
<scope>provided</scope>
</dependency>

<dependency>
<groupId>com.github.scopt</groupId>
<artifactId>scopt_${scala.version}</artifactId>
<version>3.7.1</version>
</dependency>

<!-- Also provided scope, so it is not packaged with this module. -->
<dependency>
<groupId>com.google.cloud.spark</groupId>
<artifactId>spark-bigquery_${scala.version}</artifactId>
<version>0.17.2</version>
<scope>provided</scope>
</dependency>

<dependency>
<groupId>joda-time</groupId>
<artifactId>joda-time</artifactId>
<version>2.10.6</version>
</dependency>

<dependency>
<groupId>com.redislabs</groupId>
<artifactId>spark-redis_${scala.version}</artifactId>
<version>2.5.0</version>
</dependency>

<dependency>
<groupId>org.apache.arrow</groupId>
<artifactId>arrow-vector</artifactId>
<version>0.16.0</version>
</dependency>

<dependency>
<groupId>io.netty</groupId>
<artifactId>netty-all</artifactId>
<version>4.1.52.Final</version>
</dependency>

<!-- NOTE(review): this is a milestone (-M6) release; confirm it is compatible with the json4s
     version used by the Spark runtime. -->
<dependency>
<groupId>org.json4s</groupId>
<artifactId>json4s-ext_${scala.version}</artifactId>
<version>3.7.0-M6</version>
</dependency>

<!-- Test-only dependencies (test scope). -->
<dependency>
<groupId>org.scalatest</groupId>
<artifactId>scalatest_${scala.version}</artifactId>
<version>3.2.2</version>
<scope>test</scope>
</dependency>

<dependency>
<groupId>org.scalacheck</groupId>
<artifactId>scalacheck_${scala.version}</artifactId>
<version>1.14.3</version>
<scope>test</scope>
</dependency>

<dependency>
<groupId>com.dimafeng</groupId>
<artifactId>testcontainers-scala-scalatest_${scala.version}</artifactId>
<version>0.38.3</version>
<scope>test</scope>
</dependency>

</dependencies>


<build>
<sourceDirectory>src/main/scala</sourceDirectory>
<testSourceDirectory>src/test/scala</testSourceDirectory>
<plugins>
<plugin>
    <!-- Compiles the Scala main and test sources. The plugin version is supplied by the
         pluginManagement section of this POM. -->
    <groupId>net.alchim31.maven</groupId>
    <artifactId>scala-maven-plugin</artifactId>
    <configuration>
        <scalaVersion>${scala.fullVersion}</scalaVersion>
    </configuration>
    <executions>
        <!-- Bound to process-resources, which precedes the compile phase in the default
             lifecycle, so Scala sources are compiled before the compile phase runs. -->
        <execution>
            <id>scala-compile-first</id>
            <phase>process-resources</phase>
            <goals>
                <goal>add-source</goal>
                <goal>compile</goal>
            </goals>
        </execution>
        <!-- Same idea for test sources: compile them during process-test-resources. -->
        <execution>
            <id>scala-test-compile</id>
            <phase>process-test-resources</phase>
            <goals>
                <goal>testCompile</goal>
            </goals>
        </execution>
    </executions>
</plugin>
<plugin>
    <!-- Runs the ScalaTest suites. The test goal is bound to the integration-test phase
         in the execution below. -->
    <groupId>org.scalatest</groupId>
    <artifactId>scalatest-maven-plugin</artifactId>
    <version>2.0.0</version>
    <configuration>
        <!-- Reports are written under the build directory's surefire-reports folder. -->
        <reportsDirectory>${project.build.directory}/surefire-reports</reportsDirectory>
        <junitxml>.</junitxml>
        <filereports>TestSuiteReport.txt</filereports>
    </configuration>
    <executions>
        <execution>
            <id>test</id>
            <phase>integration-test</phase>
            <goals>
                <goal>test</goal>
            </goals>
        </execution>
    </executions>
</plugin>
<plugin>
    <!-- Builds an extra jar with the predefined jar-with-dependencies descriptor during
         package. The plugin version is supplied by the pluginManagement section of this POM.
         NOTE(review): the shade plugin in this POM also builds a bundled jar at package;
         confirm that both outputs are needed. -->
    <groupId>org.apache.maven.plugins</groupId>
    <artifactId>maven-assembly-plugin</artifactId>
    <configuration>
        <descriptorRefs>
            <descriptorRef>jar-with-dependencies</descriptorRef>
        </descriptorRefs>
    </configuration>
    <executions>
        <execution>
            <id>make-assembly</id>
            <phase>package</phase>
            <goals>
                <goal>single</goal>
            </goals>
        </execution>
    </executions>
</plugin>
<plugin>
    <!-- Explicit compile execution at the compile phase, i.e. after the Scala compilation
         that scala-maven-plugin binds to process-resources. The execution has no id. -->
    <groupId>org.apache.maven.plugins</groupId>
    <artifactId>maven-compiler-plugin</artifactId>
    <executions>
        <execution>
            <phase>compile</phase>
            <goals>
                <goal>compile</goal>
            </goals>
        </execution>
    </executions>
</plugin>
<plugin>
    <!-- Produces the shaded jar during package. -->
    <groupId>org.apache.maven.plugins</groupId>
    <artifactId>maven-shade-plugin</artifactId>
    <version>3.2.4</version>
    <executions>
        <execution>
            <phase>package</phase>
            <goals>
                <goal>shade</goal>
            </goals>
            <configuration>
                <filters>
                    <!-- For every artifact, leave signature files out of the shaded jar. -->
                    <filter>
                        <artifact>*:*</artifact>
                        <excludes>
                            <exclude>META-INF/*.SF</exclude>
                            <exclude>META-INF/*.DSA</exclude>
                            <exclude>META-INF/*.RSA</exclude>
                        </excludes>
                    </filter>
                </filters>
                <relocations>
                    <!-- Move protobuf classes into a vendor package inside the shaded jar;
                         presumably to avoid clashing with a protobuf version on the Spark
                         runtime classpath (TODO confirm). -->
                    <relocation>
                        <pattern>com.google.protobuf</pattern>
                        <shadedPattern>com.google.protobuf.vendor</shadedPattern>
                    </relocation>
                </relocations>
            </configuration>
        </execution>
    </executions>
</plugin>
</plugins>
<pluginManagement>
    <!-- Pins the versions of plugins that are declared without a version in the plugins
         section of this POM; the values come from the properties section. -->
    <plugins>
        <plugin>
            <groupId>net.alchim31.maven</groupId>
            <artifactId>scala-maven-plugin</artifactId>
            <version>${scala-maven-plugin.version}</version>
        </plugin>
        <plugin>
            <groupId>org.apache.maven.plugins</groupId>
            <artifactId>maven-assembly-plugin</artifactId>
            <version>${maven-assembly-plugin.version}</version>
        </plugin>
    </plugins>
</pluginManagement>
</build>
</project>
Loading

0 comments on commit e47903f

Please sign in to comment.