Merge branch 'master' into odfv-updated
judahrand authored Feb 2, 2022
2 parents 8aa39d3 + 6e30457 commit b90e125
Showing 49 changed files with 1,510 additions and 301 deletions.
15 changes: 8 additions & 7 deletions docs/SUMMARY.md
@@ -38,12 +38,12 @@
## How-to Guides

* [Running Feast with Snowflake/GCP/AWS](how-to-guides/feast-snowflake-gcp-aws/README.md)
-* [Install Feast](how-to-guides/feast-gcp-aws/install-feast.md)
-* [Create a feature repository](how-to-guides/feast-gcp-aws/create-a-feature-repository.md)
-* [Deploy a feature store](how-to-guides/feast-gcp-aws/deploy-a-feature-store.md)
-* [Build a training dataset](how-to-guides/feast-gcp-aws/build-a-training-dataset.md)
-* [Load data into the online store](how-to-guides/feast-gcp-aws/load-data-into-the-online-store.md)
-* [Read features from the online store](how-to-guides/feast-gcp-aws/read-features-from-the-online-store.md)
+* [Install Feast](how-to-guides/feast-snowflake-gcp-aws/install-feast.md)
+* [Create a feature repository](how-to-guides/feast-snowflake-gcp-aws/create-a-feature-repository.md)
+* [Deploy a feature store](how-to-guides/feast-snowflake-gcp-aws/deploy-a-feature-store.md)
+* [Build a training dataset](how-to-guides/feast-snowflake-gcp-aws/build-a-training-dataset.md)
+* [Load data into the online store](how-to-guides/feast-snowflake-gcp-aws/load-data-into-the-online-store.md)
+* [Read features from the online store](how-to-guides/feast-snowflake-gcp-aws/read-features-from-the-online-store.md)
* [Running Feast in production](how-to-guides/running-feast-in-production.md)
* [Upgrading from Feast 0.9](https://docs.google.com/document/u/1/d/1AOsr\_baczuARjCpmZgVd8mCqTF4AZ49OEyU4Cn-uTT0/edit)
* [Adding a custom provider](how-to-guides/creating-a-custom-provider.md)
@@ -75,9 +75,10 @@
* [Feature repository](reference/feature-repository/README.md)
* [feature\_store.yaml](reference/feature-repository/feature-store-yaml.md)
* [.feastignore](reference/feature-repository/feast-ignore.md)
+* [Feature servers](reference/feature-servers/README.md)
+* [Local feature server](reference/feature-servers/local-feature-server.md)
* [\[Alpha\] On demand feature view](reference/alpha-on-demand-feature-view.md)
* [\[Alpha\] Stream ingestion](reference/alpha-stream-ingestion.md)
-* [\[Alpha\] Local feature server](reference/feature-server.md)
* [\[Alpha\] AWS Lambda feature server](reference/alpha-aws-lambda-feature-server.md)
* [Feast CLI reference](reference/feast-cli-commands.md)
* [Python API reference](http://rtd.feast.dev)
2 changes: 0 additions & 2 deletions docs/architecture.md

This file was deleted.

2 changes: 0 additions & 2 deletions docs/build-a-training-dataset.md

This file was deleted.

2 changes: 0 additions & 2 deletions docs/create-a-feature-repository.md

This file was deleted.

2 changes: 0 additions & 2 deletions docs/deploy-a-feature-store.md

This file was deleted.

2 changes: 0 additions & 2 deletions docs/entities.md

This file was deleted.

2 changes: 0 additions & 2 deletions docs/feature-views.md

This file was deleted.

2 changes: 0 additions & 2 deletions docs/load-data-into-the-online-store.md

This file was deleted.

2 changes: 0 additions & 2 deletions docs/read-features-from-the-online-store.md

This file was deleted.

77 changes: 77 additions & 0 deletions docs/reference/dqm.md
@@ -0,0 +1,77 @@
# Data Quality Monitoring

Data Quality Monitoring (DQM) is a Feast module that helps users validate their data against a user-curated set of rules.
Validation can be applied during:
* Historical retrieval (training dataset generation)
* [planned] Writing features into an online store
* [planned] Reading features from an online store

Its goal is to address several complex data problems, namely:
* Data consistency - a new training dataset can differ significantly from previous datasets, which might require a change in model architecture.
* Issues/bugs in the upstream pipeline - bugs in upstream pipelines can cause invalid values to overwrite existing valid values in an online store.
* Training/serving skew - a distribution shift can significantly decrease model performance.

> To monitor data quality, we check that the characteristics of the tested dataset (aka the tested dataset's profile) are "equivalent" to the characteristics of the reference dataset.
> How exactly profile equivalency should be measured is up to the user.

### Overview

The validation process consists of the following steps:
1. The user prepares a reference dataset (currently only [saved datasets](../getting-started/concepts/dataset.md) produced by historical retrieval are supported; see the sketch below).
2. The user defines a profiler function, which produces a profile from a given dataset (currently only profilers based on [Great Expectations](https://docs.greatexpectations.io) are supported).
3. The tested dataset is validated against the reference dataset, with the profiler passed in as a parameter.
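
A minimal sketch of step 1, assuming the file offline store; `my_reference_dataset` and the elided `get_historical_features` arguments are placeholders:
```python
from feast import FeatureStore
from feast.infra.offline_stores.file_source import SavedDatasetFileStorage

store = FeatureStore(".")

# Run a historical retrieval and persist its result as a saved dataset,
# which can later be used as the validation reference.
job = store.get_historical_features(entity_df=..., features=...)
store.create_saved_dataset(
    from_=job,
    name="my_reference_dataset",
    storage=SavedDatasetFileStorage(path="my_reference_dataset.parquet"),
)
```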

### Preparations

Feast with Great Expectations support can be installed via:
```shell
pip install 'feast[ge]'
```

### Dataset profile
Currently, Feast supports only [Great Expectations'](https://greatexpectations.io/) [ExpectationSuite](https://legacy.docs.greatexpectations.io/en/latest/autoapi/great_expectations/core/expectation_suite/index.html#great_expectations.core.expectation_suite.ExpectationSuite)
as the dataset profile format. Hence, the user needs to define a function (a profiler) that receives a dataset and returns an [ExpectationSuite](https://legacy.docs.greatexpectations.io/en/latest/autoapi/great_expectations/core/expectation_suite/index.html#great_expectations.core.expectation_suite.ExpectationSuite).

Great Expectations supports automatic profiling as well as manually specifying expectations:
```python
from great_expectations.dataset import Dataset
from great_expectations.core.expectation_suite import ExpectationSuite

from feast.dqm.profilers.ge_profiler import ge_profiler


@ge_profiler
def automatic_profiler(dataset: Dataset) -> ExpectationSuite:
    from great_expectations.profile.user_configurable_profiler import UserConfigurableProfiler

    return UserConfigurableProfiler(
        profile_dataset=dataset,
        ignored_columns=['conv_rate'],
        value_set_threshold='few'
    ).build_suite()
```
However, in our experience the capabilities of the automatic profiler are quite limited, so we recommend crafting your own expectations:
```python
@ge_profiler
def manual_profiler(dataset: Dataset) -> ExpectationSuite:
    dataset.expect_column_max_to_be_between("column", 1, 2)
    return dataset.get_expectation_suite()
```



### Validating Training Dataset
During historical feature retrieval, `validation_reference` can be passed to the `.to_df(validation_reference=...)` or `.to_arrow(validation_reference=...)` methods of `RetrievalJob`.
If this parameter is provided, Feast runs validation once the dataset is materialized. If validation succeeds, the materialized dataset is returned.
Otherwise, a `feast.dqm.errors.ValidationFailed` exception is raised, containing the details of every expectation that did not pass.

```python
from feast import FeatureStore

fs = FeatureStore(".")

job = fs.get_historical_features(...)
job.to_df(
    validation_reference=fs
    .get_saved_dataset("my_reference_dataset")
    .as_reference(profiler=manual_profiler)
)
```
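
If validation fails, the raised exception carries the full report of failed expectations. A minimal sketch of handling it (the `validation_report` attribute is assumed from `feast.dqm.errors`):
```python
from feast.dqm.errors import ValidationFailed

try:
    df = job.to_df(
        validation_reference=fs
        .get_saved_dataset("my_reference_dataset")
        .as_reference(profiler=manual_profiler)
    )
except ValidationFailed as exc:
    # Each entry describes one expectation that did not pass.
    print(exc.validation_report)
```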
5 changes: 5 additions & 0 deletions docs/reference/feature-servers/README.md
@@ -0,0 +1,5 @@
# Feature servers

Feast users can choose to retrieve features from a feature server rather than through the Python SDK.

{% page-ref page="local-feature-server.md" %}
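
As a quick illustration, once a local feature server is running (e.g. started with `feast serve`), features can be retrieved over HTTP. A hedged sketch; the feature and entity names below are placeholders from a typical quickstart repo:
```shell
curl -X POST \
  "http://localhost:6566/get-online-features" \
  -d '{
    "features": ["driver_hourly_stats:conv_rate"],
    "entities": {"driver_id": [1001]}
  }'
```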
@@ -1,10 +1,4 @@
-# \[Alpha\] Local feature server
-
-**Warning**: This is an _experimental_ feature. It's intended for early testing and feedback, and could change without warnings in future releases.
-
-{% hint style="info" %}
-To enable this feature, run **`feast alpha enable python_feature_server`**
-{% endhint %}
+# Local feature server

## Overview

@@ -122,4 +116,3 @@ curl -X POST \
}
}' | jq
```

2 changes: 0 additions & 2 deletions docs/reference/repository-config.md

This file was deleted.

12 changes: 0 additions & 12 deletions docs/reference/telemetry.md

This file was deleted.

2 changes: 0 additions & 2 deletions docs/repository-config.md

This file was deleted.

2 changes: 0 additions & 2 deletions docs/sources.md

This file was deleted.

12 changes: 0 additions & 12 deletions infra/charts/feast-python-server/README.md
@@ -56,16 +56,4 @@ RUN pip install pip --upgrade
RUN pip install feast
COPY feature_store.yaml /feature_store.yaml
```

-Make sure that you have enabled the flags for the python server. Example `feature_store.yaml`:
-```
-project: feature_repo
-registry: data/registry.db
-provider: local
-online_store:
-  path: data/online_store.db
-flags:
-  alpha_features: true
-  python_feature_server: true
-```
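
With the feature server enabled by default, these flags are no longer required; a minimal `feature_store.yaml` (a sketch based on the example removed above) reduces to:
```
project: feature_repo
registry: data/registry.db
provider: local
online_store:
  path: data/online_store.db
```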
@@ -5,5 +5,4 @@ provider: local
project: {{ .Values.global.project }}
flags:
  on_demand_transforms: true
-  python_feature_server: true
  alpha_features: true
23 changes: 23 additions & 0 deletions java/serving/src/test/java/feast/serving/it/ServingBaseTests.java
@@ -157,5 +157,28 @@ public void shouldRefreshRegistryAndServeNewFeatures() throws InterruptedException
        equalTo(3));
  }

+  /** https://github.com/feast-dev/feast/issues/2253 */
+  @Test
+  public void shouldGetOnlineFeaturesWithStringEntity() {
+    Map<String, ValueProto.RepeatedValue> entityRows =
+        ImmutableMap.of(
+            "entity",
+            ValueProto.RepeatedValue.newBuilder()
+                .addVal(DataGenerator.createStrValue("key-1"))
+                .build());
+
+    ImmutableList<String> featureReferences =
+        ImmutableList.of("feature_view_0:feature_0", "feature_view_0:feature_1");
+
+    ServingAPIProto.GetOnlineFeaturesRequest req =
+        TestUtils.createOnlineFeatureRequest(featureReferences, entityRows);
+
+    ServingAPIProto.GetOnlineFeaturesResponse resp = servingStub.getOnlineFeatures(req);
+
+    for (final int featureIdx : List.of(0, 1)) {
+      assertEquals(FieldStatus.PRESENT, resp.getResults(featureIdx).getStatuses(0));
+    }
+  }

  abstract void updateRegistryFile(RegistryProto.Registry registry);
}
@@ -51,7 +51,8 @@ protected ServingAPIProto.GetOnlineFeaturesRequest buildOnlineRequest(
      int rowsCount, int featuresCount) {
    List<ValueProto.Value> entities =
        IntStream.range(0, rowsCount)
-            .mapToObj(i -> DataGenerator.createInt64Value(rand.nextInt(1000)))
+            .mapToObj(
+                i -> DataGenerator.createStrValue(String.format("key-%s", rand.nextInt(1000))))
            .collect(Collectors.toList());

    List<String> featureReferences =
Expand Up @@ -73,7 +73,7 @@ def transformed_conv_rate(features_df: pd.DataFrame) -> pd.DataFrame:

entity = Entity(
    name="entity",
-    value_type=ValueType.INT64,
+    value_type=ValueType.STRING,
)

benchmark_feature_views = [
@@ -8,4 +8,3 @@ offline_store: {}
flags:
  alpha_features: true
  on_demand_transforms: true
-  python_feature_server: true
@@ -28,22 +28,26 @@
# for more info.
df.to_parquet("driver_stats.parquet")


# For Benchmarks
# Please read more in Feast RFC-031
# (link https://docs.google.com/document/d/12UuvTQnTTCJhdRgy6h10zSbInNGSyEJkIxpOcgOen1I/edit)
# about this benchmark setup
-def generate_data(num_rows: int, num_features: int, key_space: int, destination: str) -> pd.DataFrame:
+def generate_data(num_rows: int, num_features: int, destination: str) -> pd.DataFrame:
    features = [f"feature_{i}" for i in range(num_features)]
    columns = ["entity", "event_timestamp"] + features
    df = pd.DataFrame(0, index=np.arange(num_rows), columns=columns)
    df["event_timestamp"] = datetime.utcnow()
-    for column in ["entity"] + features:
-        df[column] = np.random.randint(1, key_space, num_rows)
+    for column in features:
+        df[column] = np.random.randint(1, num_rows, num_rows)
+
+    df["entity"] = "key-" + \
+        pd.Series(np.arange(1, num_rows + 1)).astype(pd.StringDtype())

    df.to_parquet(destination)


-generate_data(10**3, 250, 10**3, "benchmark_data.parquet")
+generate_data(10**3, 250, "benchmark_data.parquet")


fs = FeatureStore(".")