From c0eec93ed1aa288c369e87097c58763886822307 Mon Sep 17 00:00:00 2001 From: Kevin Zhang Date: Tue, 14 Jun 2022 16:31:45 -0700 Subject: [PATCH 01/30] Skaffolding for offline store push Signed-off-by: Kevin Zhang --- sdk/python/feast/infra/offline_stores/offline_store.py | 2 ++ 1 file changed, 2 insertions(+) diff --git a/sdk/python/feast/infra/offline_stores/offline_store.py b/sdk/python/feast/infra/offline_stores/offline_store.py index cd807764ba..2d2816c4e4 100644 --- a/sdk/python/feast/infra/offline_stores/offline_store.py +++ b/sdk/python/feast/infra/offline_stores/offline_store.py @@ -28,6 +28,8 @@ from feast.registry import BaseRegistry from feast.repo_config import RepoConfig from feast.saved_dataset import SavedDatasetStorage +from feast.protos.feast.types.EntityKey_pb2 import EntityKey as EntityKeyProto +from feast.protos.feast.types.Value_pb2 import Value as ValueProto if TYPE_CHECKING: from feast.saved_dataset import ValidationReference From 110c9b4aded189abd279b17cbd436865f9969e8c Mon Sep 17 00:00:00 2001 From: Kevin Zhang Date: Tue, 14 Jun 2022 16:33:14 -0700 Subject: [PATCH 02/30] LInt Signed-off-by: Kevin Zhang --- sdk/python/feast/infra/offline_stores/offline_store.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/sdk/python/feast/infra/offline_stores/offline_store.py b/sdk/python/feast/infra/offline_stores/offline_store.py index 2d2816c4e4..9c3f3f5ab4 100644 --- a/sdk/python/feast/infra/offline_stores/offline_store.py +++ b/sdk/python/feast/infra/offline_stores/offline_store.py @@ -25,11 +25,11 @@ from feast.feature_logging import LoggingConfig, LoggingSource from feast.feature_view import FeatureView from feast.on_demand_feature_view import OnDemandFeatureView +from feast.protos.feast.types.EntityKey_pb2 import EntityKey as EntityKeyProto +from feast.protos.feast.types.Value_pb2 import Value as ValueProto from feast.registry import BaseRegistry from feast.repo_config import RepoConfig from feast.saved_dataset import SavedDatasetStorage -from feast.protos.feast.types.EntityKey_pb2 import EntityKey as EntityKeyProto -from feast.protos.feast.types.Value_pb2 import Value as ValueProto if TYPE_CHECKING: from feast.saved_dataset import ValidationReference From 460f1eae89e262546d1bfaa8d3234409b421f048 Mon Sep 17 00:00:00 2001 From: Kevin Zhang Date: Tue, 14 Jun 2022 16:55:26 -0700 Subject: [PATCH 03/30] Fix Signed-off-by: Kevin Zhang --- sdk/python/feast/infra/offline_stores/offline_store.py | 2 -- 1 file changed, 2 deletions(-) diff --git a/sdk/python/feast/infra/offline_stores/offline_store.py b/sdk/python/feast/infra/offline_stores/offline_store.py index 9c3f3f5ab4..cd807764ba 100644 --- a/sdk/python/feast/infra/offline_stores/offline_store.py +++ b/sdk/python/feast/infra/offline_stores/offline_store.py @@ -25,8 +25,6 @@ from feast.feature_logging import LoggingConfig, LoggingSource from feast.feature_view import FeatureView from feast.on_demand_feature_view import OnDemandFeatureView -from feast.protos.feast.types.EntityKey_pb2 import EntityKey as EntityKeyProto -from feast.protos.feast.types.Value_pb2 import Value as ValueProto from feast.registry import BaseRegistry from feast.repo_config import RepoConfig from feast.saved_dataset import SavedDatasetStorage From bdcabeec88462800503e0ce0f78be44b490d87fc Mon Sep 17 00:00:00 2001 From: Kevin Zhang Date: Thu, 16 Jun 2022 14:07:54 -0700 Subject: [PATCH 04/30] File source offline push Signed-off-by: Kevin Zhang --- sdk/python/feast/infra/offline_stores/file.py | 34 ++- .../offline_store/test_offline_push.py | 196 ++++++++++++++++++ 2 files changed, 208 insertions(+), 22 deletions(-) create mode 100644 sdk/python/tests/integration/offline_store/test_offline_push.py diff --git a/sdk/python/feast/infra/offline_stores/file.py b/sdk/python/feast/infra/offline_stores/file.py index 194c233f53..260f29bd88 100644 --- a/sdk/python/feast/infra/offline_stores/file.py +++ b/sdk/python/feast/infra/offline_stores/file.py @@ -7,6 +7,7 @@ import pandas as pd import pyarrow import pyarrow.dataset +from pyarrow import csv import pyarrow.parquet import pytz from pydantic.typing import Literal @@ -405,42 +406,31 @@ def write_logged_features( ) @staticmethod - def offline_write_batch( - config: RepoConfig, - feature_view: FeatureView, - data: pyarrow.Table, - progress: Optional[Callable[[int], Any]], - ): + def offline_write_batch(config: RepoConfig, feature_view: FeatureView, data: pyarrow.Table, progress: Optional[Callable[[int], Any]]): if not feature_view.batch_source: - raise ValueError( - "feature view does not have a batch source to persist offline data" - ) + raise ValueError("feature view does not have a batch source to persist offline data") if not isinstance(config.offline_store, FileOfflineStoreConfig): - raise ValueError( - f"offline store config is of type {type(config.offline_store)} when file type required" - ) + raise ValueError(f"offline store config is of type {type(config.offline_store)} when file type required") if not isinstance(feature_view.batch_source, FileSource): - raise ValueError( - f"feature view batch source is {type(feature_view.batch_source)} not file source" - ) + raise ValueError(f"feature view batch source is {type(feature_view.batch_source)} not file source") file_options = feature_view.batch_source.file_options filesystem, path = FileSource.create_filesystem_and_path( file_options.uri, file_options.s3_endpoint_override ) prev_table = pyarrow.parquet.read_table(path, memory_map=True) - if prev_table.column_names != data.column_names: - raise ValueError( - f"Input dataframe has incorrect schema or wrong order, expected columns are: {prev_table.column_names}" - ) - if data.schema != prev_table.schema: + if(prev_table.column_names != data.column_names): + raise ValueError(f"Input dataframe have columns in wrong order, columns should be in the order: {prev_table.column_names}") + if(data.schema != prev_table.schema): data = data.cast(prev_table.schema) new_table = pyarrow.concat_tables([data, prev_table]) - writer = pyarrow.parquet.ParquetWriter(path, data.schema, filesystem=filesystem) + writer = pyarrow.parquet.ParquetWriter( + path, + data.schema, + filesystem=filesystem) writer.write_table(new_table) writer.close() - def _get_entity_df_event_timestamp_range( entity_df: Union[pd.DataFrame, str], entity_df_event_timestamp_col: str, ) -> Tuple[datetime, datetime]: diff --git a/sdk/python/tests/integration/offline_store/test_offline_push.py b/sdk/python/tests/integration/offline_store/test_offline_push.py new file mode 100644 index 0000000000..d31a6ebf77 --- /dev/null +++ b/sdk/python/tests/integration/offline_store/test_offline_push.py @@ -0,0 +1,196 @@ + +import datetime +from datetime import datetime, timedelta + +import numpy as np +import pandas as pd +import pytest +import tempfile +import uuid + +from feast.data_format import ParquetFormat + +from feast import FeatureView, Field, FileSource +from feast.types import Int32, Float32 +from feast.wait import wait_retry_backoff +from tests.integration.feature_repos.repo_configuration import ( + construct_universal_feature_views, +) +from tests.integration.feature_repos.universal.data_sources.file import FileDataSourceCreator +from tests.integration.feature_repos.universal.entities import ( + customer, + driver, + location, +) +from tests.integration.feature_repos.universal.feature_views import conv_rate_plus_100 +from tests.utils.logged_features import prepare_logs, to_logs_dataset + +@pytest.mark.integration +@pytest.mark.universal_online_stores(only=["sqlite"]) +def test_writing_incorrect_order_fails(environment, universal_data_sources): + # TODO(kevjumba) handle incorrect order later, for now schema must be in the order that the filesource is in + store = environment.feature_store + _, _, data_sources = universal_data_sources + driver_stats = FeatureView( + name="driver_stats", + entities=["driver"], + schema=[ + Field(name="avg_daily_trips", dtype=Int32), + Field(name="conv_rate", dtype=Float32), + ], + source=data_sources.driver, + ) + + now = datetime.utcnow() + ts = pd.Timestamp(now).round("ms") + + entity_df = pd.DataFrame.from_dict( + { + "driver_id": [1001, 1002], + "event_timestamp": [ + ts-timedelta(hours=3), + ts, + ], + } + ) + + store.apply([driver(), driver_stats]) + df = store.get_historical_features( + entity_df=entity_df, + features=[ + "driver_stats:conv_rate", + "driver_stats:avg_daily_trips" + ], + full_feature_names=False, + ).to_df() + + assert df["conv_rate"].isnull().all() + assert df["avg_daily_trips"].isnull().all() + + expected_df = pd.DataFrame.from_dict( + { + "driver_id": [1001, 1002], + "event_timestamp": [ + ts-timedelta(hours=3), + ts, + ], + "conv_rate": [0.1, 0.2], + "avg_daily_trips": [1, 2], + "created": [ts, ts] + }, + ) + with pytest.raises(ValueError): + store.write_to_offline_store(driver_stats.name, expected_df, allow_registry_cache=False) + +@pytest.mark.integration +@pytest.mark.universal_online_stores(only=["sqlite"]) +def test_writing_consecutively_to_offline_store(environment, universal_data_sources): + store = environment.feature_store + _, _, data_sources = universal_data_sources + driver_stats = FeatureView( + name="driver_stats", + entities=["driver"], + schema=[ + Field(name="avg_daily_trips", dtype=Int32), + Field(name="conv_rate", dtype=Float32), + ], + source=data_sources.driver, + ttl=timedelta(minutes=10), + ) + + now = datetime.utcnow() + ts = pd.Timestamp(now, unit='ns') + + entity_df = pd.DataFrame.from_dict( + { + "driver_id": [1001, 1002], + "event_timestamp": [ + ts-timedelta(hours=4), + ts-timedelta(hours=3), + ], + } + ) + + store.apply([driver(), driver_stats]) + df = store.get_historical_features( + entity_df=entity_df, + features=[ + "driver_stats:conv_rate", + "driver_stats:avg_daily_trips" + ], + full_feature_names=False, + ).to_df() + + assert df["conv_rate"].isnull().all() + assert df["avg_daily_trips"].isnull().all() + + first_df = pd.DataFrame.from_dict( + { + "event_timestamp": [ + ts-timedelta(hours=4), + ts-timedelta(hours=3), + ], + "driver_id": [1001, 1001], + "conv_rate": [0.1, 0.2], + "acc_rate": [0.5, 0.6], + "avg_daily_trips": [1, 2], + "created": [ts, ts] + }, + ) + store.write_to_offline_store(driver_stats.name, first_df, allow_registry_cache=False) + + after_write_df = store.get_historical_features( + entity_df=entity_df, + features=[ + "driver_stats:conv_rate", + "driver_stats:avg_daily_trips" + ], + full_feature_names=False, + ).to_df() + + assert len(after_write_df) == len(first_df) + assert np.where(after_write_df["conv_rate"].reset_index(drop=True) == first_df["conv_rate"].reset_index(drop=True)) + assert np.where(after_write_df["avg_daily_trips"].reset_index(drop=True) == first_df["avg_daily_trips"].reset_index(drop=True)) + + second_df = pd.DataFrame.from_dict( + { + "event_timestamp": [ + ts-timedelta(hours=1), + ts, + ], + "driver_id": [1001, 1001], + "conv_rate": [0.3, 0.4], + "acc_rate": [0.8, 0.9], + "avg_daily_trips": [3, 4], + "created": [ts, ts] + }, + ) + + store.write_to_offline_store(driver_stats.name, second_df, allow_registry_cache=False) + + entity_df = pd.DataFrame.from_dict( + { + "driver_id": [1001, 1001, 1001, 1001], + "event_timestamp": [ + ts-timedelta(hours=4), + ts-timedelta(hours=3), + ts-timedelta(hours=1), + ts, + ], + } + ) + + after_write_df = store.get_historical_features( + entity_df=entity_df, + features=[ + "driver_stats:conv_rate", + "driver_stats:avg_daily_trips" + ], + full_feature_names=False, + ).to_df() + + expected_df = pd.concat([first_df, second_df]) + assert len(after_write_df) == len(expected_df) + assert np.where(after_write_df["conv_rate"].reset_index(drop=True) == expected_df["conv_rate"].reset_index(drop=True)) + assert np.where(after_write_df["avg_daily_trips"].reset_index(drop=True) == expected_df["avg_daily_trips"].reset_index(drop=True)) + From c600626db52e0af203fadc2b31d01a45b343bd56 Mon Sep 17 00:00:00 2001 From: Kevin Zhang Date: Thu, 16 Jun 2022 15:38:26 -0700 Subject: [PATCH 05/30] Fix Signed-off-by: Kevin Zhang --- sdk/python/feast/infra/offline_stores/file.py | 2 +- .../offline_store/test_offline_push.py | 79 ++++++++++++++++--- 2 files changed, 69 insertions(+), 12 deletions(-) diff --git a/sdk/python/feast/infra/offline_stores/file.py b/sdk/python/feast/infra/offline_stores/file.py index 260f29bd88..85028c9236 100644 --- a/sdk/python/feast/infra/offline_stores/file.py +++ b/sdk/python/feast/infra/offline_stores/file.py @@ -420,7 +420,7 @@ def offline_write_batch(config: RepoConfig, feature_view: FeatureView, data: pya prev_table = pyarrow.parquet.read_table(path, memory_map=True) if(prev_table.column_names != data.column_names): - raise ValueError(f"Input dataframe have columns in wrong order, columns should be in the order: {prev_table.column_names}") + raise ValueError(f"Input dataframe has incorrect schema or wrong order, expected columns are: {prev_table.column_names}") if(data.schema != prev_table.schema): data = data.cast(prev_table.schema) new_table = pyarrow.concat_tables([data, prev_table]) diff --git a/sdk/python/tests/integration/offline_store/test_offline_push.py b/sdk/python/tests/integration/offline_store/test_offline_push.py index d31a6ebf77..4b6fb557f4 100644 --- a/sdk/python/tests/integration/offline_store/test_offline_push.py +++ b/sdk/python/tests/integration/offline_store/test_offline_push.py @@ -5,8 +5,7 @@ import numpy as np import pandas as pd import pytest -import tempfile -import uuid +import random from feast.data_format import ParquetFormat @@ -74,8 +73,66 @@ def test_writing_incorrect_order_fails(environment, universal_data_sources): ts-timedelta(hours=3), ts, ], - "conv_rate": [0.1, 0.2], - "avg_daily_trips": [1, 2], + "conv_rate": [random.random(), random.random()], + "avg_daily_trips": [random.randint(0, 10), random.randint(0, 10)], + "created": [ts, ts] + }, + ) + with pytest.raises(ValueError): + store.write_to_offline_store(driver_stats.name, expected_df, allow_registry_cache=False) + + +@pytest.mark.integration +@pytest.mark.universal_online_stores(only=["sqlite"]) +def test_writing_incorrect_schema_fails(environment, universal_data_sources): + # TODO(kevjumba) handle incorrect order later, for now schema must be in the order that the filesource is in + store = environment.feature_store + _, _, data_sources = universal_data_sources + driver_stats = FeatureView( + name="driver_stats", + entities=["driver"], + schema=[ + Field(name="avg_daily_trips", dtype=Int32), + Field(name="conv_rate", dtype=Float32), + ], + source=data_sources.driver, + ) + + now = datetime.utcnow() + ts = pd.Timestamp(now).round("ms") + + entity_df = pd.DataFrame.from_dict( + { + "driver_id": [1001, 1002], + "event_timestamp": [ + ts-timedelta(hours=3), + ts, + ], + } + ) + + store.apply([driver(), driver_stats]) + df = store.get_historical_features( + entity_df=entity_df, + features=[ + "driver_stats:conv_rate", + "driver_stats:avg_daily_trips" + ], + full_feature_names=False, + ).to_df() + + assert df["conv_rate"].isnull().all() + assert df["avg_daily_trips"].isnull().all() + + expected_df = pd.DataFrame.from_dict( + { + "event_timestamp": [ + ts-timedelta(hours=3), + ts, + ], + "driver_id": [1001, 1002], + "conv_rate": [random.random(), random.random()], + "incorrect_schema": [random.randint(0, 10), random.randint(0, 10)], "created": [ts, ts] }, ) @@ -103,7 +160,7 @@ def test_writing_consecutively_to_offline_store(environment, universal_data_sour entity_df = pd.DataFrame.from_dict( { - "driver_id": [1001, 1002], + "driver_id": [1001, 1001], "event_timestamp": [ ts-timedelta(hours=4), ts-timedelta(hours=3), @@ -131,9 +188,9 @@ def test_writing_consecutively_to_offline_store(environment, universal_data_sour ts-timedelta(hours=3), ], "driver_id": [1001, 1001], - "conv_rate": [0.1, 0.2], - "acc_rate": [0.5, 0.6], - "avg_daily_trips": [1, 2], + "conv_rate": [random.random(), random.random()], + "acc_rate": [random.random(), random.random()], + "avg_daily_trips": [random.randint(0, 10), random.randint(0, 10)], "created": [ts, ts] }, ) @@ -159,9 +216,9 @@ def test_writing_consecutively_to_offline_store(environment, universal_data_sour ts, ], "driver_id": [1001, 1001], - "conv_rate": [0.3, 0.4], - "acc_rate": [0.8, 0.9], - "avg_daily_trips": [3, 4], + "conv_rate": [random.random(), random.random()], + "acc_rate": [random.random(), random.random()], + "avg_daily_trips": [random.randint(0, 10), random.randint(0, 10)], "created": [ts, ts] }, ) From 9456b12269ce4fbcfc3d5951e7603242a1c5e123 Mon Sep 17 00:00:00 2001 From: Kevin Zhang Date: Thu, 16 Jun 2022 15:55:55 -0700 Subject: [PATCH 06/30] Fix Signed-off-by: Kevin Zhang --- .../integration/offline_store/test_offline_push.py | 11 +++++++---- 1 file changed, 7 insertions(+), 4 deletions(-) diff --git a/sdk/python/tests/integration/offline_store/test_offline_push.py b/sdk/python/tests/integration/offline_store/test_offline_push.py index 4b6fb557f4..85adc542fc 100644 --- a/sdk/python/tests/integration/offline_store/test_offline_push.py +++ b/sdk/python/tests/integration/offline_store/test_offline_push.py @@ -25,7 +25,7 @@ from tests.utils.logged_features import prepare_logs, to_logs_dataset @pytest.mark.integration -@pytest.mark.universal_online_stores(only=["sqlite"]) +@pytest.mark.universal_online_stores def test_writing_incorrect_order_fails(environment, universal_data_sources): # TODO(kevjumba) handle incorrect order later, for now schema must be in the order that the filesource is in store = environment.feature_store @@ -83,7 +83,7 @@ def test_writing_incorrect_order_fails(environment, universal_data_sources): @pytest.mark.integration -@pytest.mark.universal_online_stores(only=["sqlite"]) +@pytest.mark.universal_online_stores def test_writing_incorrect_schema_fails(environment, universal_data_sources): # TODO(kevjumba) handle incorrect order later, for now schema must be in the order that the filesource is in store = environment.feature_store @@ -140,7 +140,7 @@ def test_writing_incorrect_schema_fails(environment, universal_data_sources): store.write_to_offline_store(driver_stats.name, expected_df, allow_registry_cache=False) @pytest.mark.integration -@pytest.mark.universal_online_stores(only=["sqlite"]) +@pytest.mark.universal_online_stores def test_writing_consecutively_to_offline_store(environment, universal_data_sources): store = environment.feature_store _, _, data_sources = universal_data_sources @@ -150,6 +150,7 @@ def test_writing_consecutively_to_offline_store(environment, universal_data_sour schema=[ Field(name="avg_daily_trips", dtype=Int32), Field(name="conv_rate", dtype=Float32), + Field(name="acc_rate", dtype=Float32), ], source=data_sources.driver, ttl=timedelta(minutes=10), @@ -173,6 +174,7 @@ def test_writing_consecutively_to_offline_store(environment, universal_data_sour entity_df=entity_df, features=[ "driver_stats:conv_rate", + "driver_stats:avg_daily_trips" ], full_feature_names=False, @@ -241,6 +243,7 @@ def test_writing_consecutively_to_offline_store(environment, universal_data_sour entity_df=entity_df, features=[ "driver_stats:conv_rate", + "driver_stats:acc_rate", "driver_stats:avg_daily_trips" ], full_feature_names=False, @@ -249,5 +252,5 @@ def test_writing_consecutively_to_offline_store(environment, universal_data_sour expected_df = pd.concat([first_df, second_df]) assert len(after_write_df) == len(expected_df) assert np.where(after_write_df["conv_rate"].reset_index(drop=True) == expected_df["conv_rate"].reset_index(drop=True)) + assert np.where(after_write_df["acc_rate"].reset_index(drop=True) == expected_df["acc_rate"].reset_index(drop=True)) assert np.where(after_write_df["avg_daily_trips"].reset_index(drop=True) == expected_df["avg_daily_trips"].reset_index(drop=True)) - From e18bacd0dc69aa140f22a1862c090e0d6acf84dc Mon Sep 17 00:00:00 2001 From: Kevin Zhang Date: Thu, 16 Jun 2022 15:58:34 -0700 Subject: [PATCH 07/30] Fix Signed-off-by: Kevin Zhang --- sdk/python/feast/infra/offline_stores/file.py | 16 +- .../offline_store/test_offline_push.py | 141 +++++++----------- 2 files changed, 65 insertions(+), 92 deletions(-) diff --git a/sdk/python/feast/infra/offline_stores/file.py b/sdk/python/feast/infra/offline_stores/file.py index 85028c9236..7856eaa1c9 100644 --- a/sdk/python/feast/infra/offline_stores/file.py +++ b/sdk/python/feast/infra/offline_stores/file.py @@ -7,9 +7,9 @@ import pandas as pd import pyarrow import pyarrow.dataset -from pyarrow import csv import pyarrow.parquet import pytz +from pyarrow import csv from pydantic.typing import Literal from feast import FileSource, OnDemandFeatureView @@ -419,18 +419,18 @@ def offline_write_batch(config: RepoConfig, feature_view: FeatureView, data: pya ) prev_table = pyarrow.parquet.read_table(path, memory_map=True) - if(prev_table.column_names != data.column_names): - raise ValueError(f"Input dataframe has incorrect schema or wrong order, expected columns are: {prev_table.column_names}") - if(data.schema != prev_table.schema): + if prev_table.column_names != data.column_names: + raise ValueError( + f"Input dataframe has incorrect schema or wrong order, expected columns are: {prev_table.column_names}" + ) + if data.schema != prev_table.schema: data = data.cast(prev_table.schema) new_table = pyarrow.concat_tables([data, prev_table]) - writer = pyarrow.parquet.ParquetWriter( - path, - data.schema, - filesystem=filesystem) + writer = pyarrow.parquet.ParquetWriter(path, data.schema, filesystem=filesystem) writer.write_table(new_table) writer.close() + def _get_entity_df_event_timestamp_range( entity_df: Union[pd.DataFrame, str], entity_df_event_timestamp_col: str, ) -> Tuple[datetime, datetime]: diff --git a/sdk/python/tests/integration/offline_store/test_offline_push.py b/sdk/python/tests/integration/offline_store/test_offline_push.py index 85adc542fc..ba851e2918 100644 --- a/sdk/python/tests/integration/offline_store/test_offline_push.py +++ b/sdk/python/tests/integration/offline_store/test_offline_push.py @@ -1,28 +1,17 @@ -import datetime +import random from datetime import datetime, timedelta import numpy as np import pandas as pd import pytest -import random -from feast.data_format import ParquetFormat - -from feast import FeatureView, Field, FileSource -from feast.types import Int32, Float32 -from feast.wait import wait_retry_backoff -from tests.integration.feature_repos.repo_configuration import ( - construct_universal_feature_views, -) -from tests.integration.feature_repos.universal.data_sources.file import FileDataSourceCreator +from feast import FeatureView, Field +from feast.types import Float32, Int32 from tests.integration.feature_repos.universal.entities import ( - customer, driver, - location, ) -from tests.integration.feature_repos.universal.feature_views import conv_rate_plus_100 -from tests.utils.logged_features import prepare_logs, to_logs_dataset + @pytest.mark.integration @pytest.mark.universal_online_stores @@ -44,22 +33,13 @@ def test_writing_incorrect_order_fails(environment, universal_data_sources): ts = pd.Timestamp(now).round("ms") entity_df = pd.DataFrame.from_dict( - { - "driver_id": [1001, 1002], - "event_timestamp": [ - ts-timedelta(hours=3), - ts, - ], - } + {"driver_id": [1001, 1002], "event_timestamp": [ts - timedelta(hours=3), ts,],} ) store.apply([driver(), driver_stats]) df = store.get_historical_features( entity_df=entity_df, - features=[ - "driver_stats:conv_rate", - "driver_stats:avg_daily_trips" - ], + features=["driver_stats:conv_rate", "driver_stats:avg_daily_trips"], full_feature_names=False, ).to_df() @@ -69,17 +49,16 @@ def test_writing_incorrect_order_fails(environment, universal_data_sources): expected_df = pd.DataFrame.from_dict( { "driver_id": [1001, 1002], - "event_timestamp": [ - ts-timedelta(hours=3), - ts, - ], + "event_timestamp": [ts - timedelta(hours=3), ts,], "conv_rate": [random.random(), random.random()], "avg_daily_trips": [random.randint(0, 10), random.randint(0, 10)], - "created": [ts, ts] + "created": [ts, ts], }, ) with pytest.raises(ValueError): - store.write_to_offline_store(driver_stats.name, expected_df, allow_registry_cache=False) + store.write_to_offline_store( + driver_stats.name, expected_df, allow_registry_cache=False + ) @pytest.mark.integration @@ -102,22 +81,13 @@ def test_writing_incorrect_schema_fails(environment, universal_data_sources): ts = pd.Timestamp(now).round("ms") entity_df = pd.DataFrame.from_dict( - { - "driver_id": [1001, 1002], - "event_timestamp": [ - ts-timedelta(hours=3), - ts, - ], - } + {"driver_id": [1001, 1002], "event_timestamp": [ts - timedelta(hours=3), ts,],} ) store.apply([driver(), driver_stats]) df = store.get_historical_features( entity_df=entity_df, - features=[ - "driver_stats:conv_rate", - "driver_stats:avg_daily_trips" - ], + features=["driver_stats:conv_rate", "driver_stats:avg_daily_trips"], full_feature_names=False, ).to_df() @@ -126,18 +96,18 @@ def test_writing_incorrect_schema_fails(environment, universal_data_sources): expected_df = pd.DataFrame.from_dict( { - "event_timestamp": [ - ts-timedelta(hours=3), - ts, - ], + "event_timestamp": [ts - timedelta(hours=3), ts,], "driver_id": [1001, 1002], "conv_rate": [random.random(), random.random()], "incorrect_schema": [random.randint(0, 10), random.randint(0, 10)], - "created": [ts, ts] + "created": [ts, ts], }, ) with pytest.raises(ValueError): - store.write_to_offline_store(driver_stats.name, expected_df, allow_registry_cache=False) + store.write_to_offline_store( + driver_stats.name, expected_df, allow_registry_cache=False + ) + @pytest.mark.integration @pytest.mark.universal_online_stores @@ -157,26 +127,19 @@ def test_writing_consecutively_to_offline_store(environment, universal_data_sour ) now = datetime.utcnow() - ts = pd.Timestamp(now, unit='ns') + ts = pd.Timestamp(now, unit="ns") entity_df = pd.DataFrame.from_dict( { "driver_id": [1001, 1001], - "event_timestamp": [ - ts-timedelta(hours=4), - ts-timedelta(hours=3), - ], + "event_timestamp": [ts - timedelta(hours=4), ts - timedelta(hours=3),], } ) store.apply([driver(), driver_stats]) df = store.get_historical_features( entity_df=entity_df, - features=[ - "driver_stats:conv_rate", - - "driver_stats:avg_daily_trips" - ], + features=["driver_stats:conv_rate", "driver_stats:avg_daily_trips"], full_feature_names=False, ).to_df() @@ -185,55 +148,56 @@ def test_writing_consecutively_to_offline_store(environment, universal_data_sour first_df = pd.DataFrame.from_dict( { - "event_timestamp": [ - ts-timedelta(hours=4), - ts-timedelta(hours=3), - ], + "event_timestamp": [ts - timedelta(hours=4), ts - timedelta(hours=3),], "driver_id": [1001, 1001], "conv_rate": [random.random(), random.random()], "acc_rate": [random.random(), random.random()], "avg_daily_trips": [random.randint(0, 10), random.randint(0, 10)], - "created": [ts, ts] + "created": [ts, ts], }, ) - store.write_to_offline_store(driver_stats.name, first_df, allow_registry_cache=False) + store.write_to_offline_store( + driver_stats.name, first_df, allow_registry_cache=False + ) after_write_df = store.get_historical_features( entity_df=entity_df, - features=[ - "driver_stats:conv_rate", - "driver_stats:avg_daily_trips" - ], + features=["driver_stats:conv_rate", "driver_stats:avg_daily_trips"], full_feature_names=False, ).to_df() assert len(after_write_df) == len(first_df) - assert np.where(after_write_df["conv_rate"].reset_index(drop=True) == first_df["conv_rate"].reset_index(drop=True)) - assert np.where(after_write_df["avg_daily_trips"].reset_index(drop=True) == first_df["avg_daily_trips"].reset_index(drop=True)) + assert np.where( + after_write_df["conv_rate"].reset_index(drop=True) + == first_df["conv_rate"].reset_index(drop=True) + ) + assert np.where( + after_write_df["avg_daily_trips"].reset_index(drop=True) + == first_df["avg_daily_trips"].reset_index(drop=True) + ) second_df = pd.DataFrame.from_dict( { - "event_timestamp": [ - ts-timedelta(hours=1), - ts, - ], + "event_timestamp": [ts - timedelta(hours=1), ts,], "driver_id": [1001, 1001], "conv_rate": [random.random(), random.random()], "acc_rate": [random.random(), random.random()], "avg_daily_trips": [random.randint(0, 10), random.randint(0, 10)], - "created": [ts, ts] + "created": [ts, ts], }, ) - store.write_to_offline_store(driver_stats.name, second_df, allow_registry_cache=False) + store.write_to_offline_store( + driver_stats.name, second_df, allow_registry_cache=False + ) entity_df = pd.DataFrame.from_dict( { "driver_id": [1001, 1001, 1001, 1001], "event_timestamp": [ - ts-timedelta(hours=4), - ts-timedelta(hours=3), - ts-timedelta(hours=1), + ts - timedelta(hours=4), + ts - timedelta(hours=3), + ts - timedelta(hours=1), ts, ], } @@ -244,13 +208,22 @@ def test_writing_consecutively_to_offline_store(environment, universal_data_sour features=[ "driver_stats:conv_rate", "driver_stats:acc_rate", - "driver_stats:avg_daily_trips" + "driver_stats:avg_daily_trips", ], full_feature_names=False, ).to_df() expected_df = pd.concat([first_df, second_df]) assert len(after_write_df) == len(expected_df) - assert np.where(after_write_df["conv_rate"].reset_index(drop=True) == expected_df["conv_rate"].reset_index(drop=True)) - assert np.where(after_write_df["acc_rate"].reset_index(drop=True) == expected_df["acc_rate"].reset_index(drop=True)) - assert np.where(after_write_df["avg_daily_trips"].reset_index(drop=True) == expected_df["avg_daily_trips"].reset_index(drop=True)) + assert np.where( + after_write_df["conv_rate"].reset_index(drop=True) + == expected_df["conv_rate"].reset_index(drop=True) + ) + assert np.where( + after_write_df["acc_rate"].reset_index(drop=True) + == expected_df["acc_rate"].reset_index(drop=True) + ) + assert np.where( + after_write_df["avg_daily_trips"].reset_index(drop=True) + == expected_df["avg_daily_trips"].reset_index(drop=True) + ) From f72dc8c40e09ec2c78321b3fdcc9e12997f2e6e8 Mon Sep 17 00:00:00 2001 From: Kevin Zhang Date: Thu, 16 Jun 2022 16:03:18 -0700 Subject: [PATCH 08/30] Fix Signed-off-by: Kevin Zhang --- sdk/python/feast/infra/offline_stores/file.py | 1 - .../offline_store/test_offline_push.py | 19 ++++++++----------- 2 files changed, 8 insertions(+), 12 deletions(-) diff --git a/sdk/python/feast/infra/offline_stores/file.py b/sdk/python/feast/infra/offline_stores/file.py index 7856eaa1c9..b0bf94e352 100644 --- a/sdk/python/feast/infra/offline_stores/file.py +++ b/sdk/python/feast/infra/offline_stores/file.py @@ -9,7 +9,6 @@ import pyarrow.dataset import pyarrow.parquet import pytz -from pyarrow import csv from pydantic.typing import Literal from feast import FileSource, OnDemandFeatureView diff --git a/sdk/python/tests/integration/offline_store/test_offline_push.py b/sdk/python/tests/integration/offline_store/test_offline_push.py index ba851e2918..068b7b0a75 100644 --- a/sdk/python/tests/integration/offline_store/test_offline_push.py +++ b/sdk/python/tests/integration/offline_store/test_offline_push.py @@ -1,4 +1,3 @@ - import random from datetime import datetime, timedelta @@ -8,9 +7,7 @@ from feast import FeatureView, Field from feast.types import Float32, Int32 -from tests.integration.feature_repos.universal.entities import ( - driver, -) +from tests.integration.feature_repos.universal.entities import driver @pytest.mark.integration @@ -33,7 +30,7 @@ def test_writing_incorrect_order_fails(environment, universal_data_sources): ts = pd.Timestamp(now).round("ms") entity_df = pd.DataFrame.from_dict( - {"driver_id": [1001, 1002], "event_timestamp": [ts - timedelta(hours=3), ts,],} + {"driver_id": [1001, 1002], "event_timestamp": [ts - timedelta(hours=3), ts]} ) store.apply([driver(), driver_stats]) @@ -49,7 +46,7 @@ def test_writing_incorrect_order_fails(environment, universal_data_sources): expected_df = pd.DataFrame.from_dict( { "driver_id": [1001, 1002], - "event_timestamp": [ts - timedelta(hours=3), ts,], + "event_timestamp": [ts - timedelta(hours=3), ts], "conv_rate": [random.random(), random.random()], "avg_daily_trips": [random.randint(0, 10), random.randint(0, 10)], "created": [ts, ts], @@ -81,7 +78,7 @@ def test_writing_incorrect_schema_fails(environment, universal_data_sources): ts = pd.Timestamp(now).round("ms") entity_df = pd.DataFrame.from_dict( - {"driver_id": [1001, 1002], "event_timestamp": [ts - timedelta(hours=3), ts,],} + {"driver_id": [1001, 1002], "event_timestamp": [ts - timedelta(hours=3), ts]} ) store.apply([driver(), driver_stats]) @@ -96,7 +93,7 @@ def test_writing_incorrect_schema_fails(environment, universal_data_sources): expected_df = pd.DataFrame.from_dict( { - "event_timestamp": [ts - timedelta(hours=3), ts,], + "event_timestamp": [ts - timedelta(hours=3), ts], "driver_id": [1001, 1002], "conv_rate": [random.random(), random.random()], "incorrect_schema": [random.randint(0, 10), random.randint(0, 10)], @@ -132,7 +129,7 @@ def test_writing_consecutively_to_offline_store(environment, universal_data_sour entity_df = pd.DataFrame.from_dict( { "driver_id": [1001, 1001], - "event_timestamp": [ts - timedelta(hours=4), ts - timedelta(hours=3),], + "event_timestamp": [ts - timedelta(hours=4), ts - timedelta(hours=3)], } ) @@ -148,7 +145,7 @@ def test_writing_consecutively_to_offline_store(environment, universal_data_sour first_df = pd.DataFrame.from_dict( { - "event_timestamp": [ts - timedelta(hours=4), ts - timedelta(hours=3),], + "event_timestamp": [ts - timedelta(hours=4), ts - timedelta(hours=3)], "driver_id": [1001, 1001], "conv_rate": [random.random(), random.random()], "acc_rate": [random.random(), random.random()], @@ -178,7 +175,7 @@ def test_writing_consecutively_to_offline_store(environment, universal_data_sour second_df = pd.DataFrame.from_dict( { - "event_timestamp": [ts - timedelta(hours=1), ts,], + "event_timestamp": [ts - timedelta(hours=1), ts], "driver_id": [1001, 1001], "conv_rate": [random.random(), random.random()], "acc_rate": [random.random(), random.random()], From 8cc2a336118f5b52ed32eaa2d1d55c4c96e57f83 Mon Sep 17 00:00:00 2001 From: Kevin Zhang Date: Thu, 16 Jun 2022 18:12:36 -0700 Subject: [PATCH 09/30] Fix Signed-off-by: Kevin Zhang --- .../offline_store/test_offline_push.py | 17 ++++++++++++----- 1 file changed, 12 insertions(+), 5 deletions(-) diff --git a/sdk/python/tests/integration/offline_store/test_offline_push.py b/sdk/python/tests/integration/offline_store/test_offline_push.py index 068b7b0a75..44a8053e15 100644 --- a/sdk/python/tests/integration/offline_store/test_offline_push.py +++ b/sdk/python/tests/integration/offline_store/test_offline_push.py @@ -14,6 +14,9 @@ @pytest.mark.universal_online_stores def test_writing_incorrect_order_fails(environment, universal_data_sources): # TODO(kevjumba) handle incorrect order later, for now schema must be in the order that the filesource is in + """This test tests if we have incorrect order when writing to offline store. + Specifically, event_timestamp should be the first column to adhere with the filesource column order. + """ store = environment.feature_store _, _, data_sources = universal_data_sources driver_stats = FeatureView( @@ -43,7 +46,7 @@ def test_writing_incorrect_order_fails(environment, universal_data_sources): assert df["conv_rate"].isnull().all() assert df["avg_daily_trips"].isnull().all() - expected_df = pd.DataFrame.from_dict( + df = pd.DataFrame.from_dict( { "driver_id": [1001, 1002], "event_timestamp": [ts - timedelta(hours=3), ts], @@ -54,7 +57,7 @@ def test_writing_incorrect_order_fails(environment, universal_data_sources): ) with pytest.raises(ValueError): store.write_to_offline_store( - driver_stats.name, expected_df, allow_registry_cache=False + driver_stats.name, df, allow_registry_cache=False ) @@ -62,6 +65,9 @@ def test_writing_incorrect_order_fails(environment, universal_data_sources): @pytest.mark.universal_online_stores def test_writing_incorrect_schema_fails(environment, universal_data_sources): # TODO(kevjumba) handle incorrect order later, for now schema must be in the order that the filesource is in + """This test tests if we have incorrect attribute when writing to offline store. + Specifically, `incorrect_attribute` is an inccorect column to adhere with the filesource column order. + """ store = environment.feature_store _, _, data_sources = universal_data_sources driver_stats = FeatureView( @@ -91,18 +97,18 @@ def test_writing_incorrect_schema_fails(environment, universal_data_sources): assert df["conv_rate"].isnull().all() assert df["avg_daily_trips"].isnull().all() - expected_df = pd.DataFrame.from_dict( + df = pd.DataFrame.from_dict( { "event_timestamp": [ts - timedelta(hours=3), ts], "driver_id": [1001, 1002], "conv_rate": [random.random(), random.random()], - "incorrect_schema": [random.randint(0, 10), random.randint(0, 10)], + "incorrect_attribute": [random.randint(0, 10), random.randint(0, 10)], "created": [ts, ts], }, ) with pytest.raises(ValueError): store.write_to_offline_store( - driver_stats.name, expected_df, allow_registry_cache=False + driver_stats.name, df, allow_registry_cache=False ) @@ -143,6 +149,7 @@ def test_writing_consecutively_to_offline_store(environment, universal_data_sour assert df["conv_rate"].isnull().all() assert df["avg_daily_trips"].isnull().all() + # This dataframe has its columns ordered exactly as it is in the parquet file generated by driver_test_data.py. first_df = pd.DataFrame.from_dict( { "event_timestamp": [ts - timedelta(hours=4), ts - timedelta(hours=3)], From d399f76344a081c459bc4f007c2dddb70030ea8e Mon Sep 17 00:00:00 2001 From: Kevin Zhang Date: Thu, 16 Jun 2022 18:14:19 -0700 Subject: [PATCH 10/30] Fix Signed-off-by: Kevin Zhang --- .../tests/integration/offline_store/test_offline_push.py | 8 ++------ 1 file changed, 2 insertions(+), 6 deletions(-) diff --git a/sdk/python/tests/integration/offline_store/test_offline_push.py b/sdk/python/tests/integration/offline_store/test_offline_push.py index 44a8053e15..2bdf775177 100644 --- a/sdk/python/tests/integration/offline_store/test_offline_push.py +++ b/sdk/python/tests/integration/offline_store/test_offline_push.py @@ -56,9 +56,7 @@ def test_writing_incorrect_order_fails(environment, universal_data_sources): }, ) with pytest.raises(ValueError): - store.write_to_offline_store( - driver_stats.name, df, allow_registry_cache=False - ) + store.write_to_offline_store(driver_stats.name, df, allow_registry_cache=False) @pytest.mark.integration @@ -107,9 +105,7 @@ def test_writing_incorrect_schema_fails(environment, universal_data_sources): }, ) with pytest.raises(ValueError): - store.write_to_offline_store( - driver_stats.name, df, allow_registry_cache=False - ) + store.write_to_offline_store(driver_stats.name, df, allow_registry_cache=False) @pytest.mark.integration From c070c1a73bb1dc27dbf84b841cfaa69e83f3f40a Mon Sep 17 00:00:00 2001 From: Kevin Zhang Date: Fri, 17 Jun 2022 09:58:06 -0700 Subject: [PATCH 11/30] Address review comments Signed-off-by: Kevin Zhang --- sdk/python/feast/infra/offline_stores/file.py | 19 +- .../offline_store/test_offline_push.py | 229 ------------------ 2 files changed, 15 insertions(+), 233 deletions(-) delete mode 100644 sdk/python/tests/integration/offline_store/test_offline_push.py diff --git a/sdk/python/feast/infra/offline_stores/file.py b/sdk/python/feast/infra/offline_stores/file.py index b0bf94e352..194c233f53 100644 --- a/sdk/python/feast/infra/offline_stores/file.py +++ b/sdk/python/feast/infra/offline_stores/file.py @@ -405,13 +405,24 @@ def write_logged_features( ) @staticmethod - def offline_write_batch(config: RepoConfig, feature_view: FeatureView, data: pyarrow.Table, progress: Optional[Callable[[int], Any]]): + def offline_write_batch( + config: RepoConfig, + feature_view: FeatureView, + data: pyarrow.Table, + progress: Optional[Callable[[int], Any]], + ): if not feature_view.batch_source: - raise ValueError("feature view does not have a batch source to persist offline data") + raise ValueError( + "feature view does not have a batch source to persist offline data" + ) if not isinstance(config.offline_store, FileOfflineStoreConfig): - raise ValueError(f"offline store config is of type {type(config.offline_store)} when file type required") + raise ValueError( + f"offline store config is of type {type(config.offline_store)} when file type required" + ) if not isinstance(feature_view.batch_source, FileSource): - raise ValueError(f"feature view batch source is {type(feature_view.batch_source)} not file source") + raise ValueError( + f"feature view batch source is {type(feature_view.batch_source)} not file source" + ) file_options = feature_view.batch_source.file_options filesystem, path = FileSource.create_filesystem_and_path( file_options.uri, file_options.s3_endpoint_override diff --git a/sdk/python/tests/integration/offline_store/test_offline_push.py b/sdk/python/tests/integration/offline_store/test_offline_push.py deleted file mode 100644 index 2bdf775177..0000000000 --- a/sdk/python/tests/integration/offline_store/test_offline_push.py +++ /dev/null @@ -1,229 +0,0 @@ -import random -from datetime import datetime, timedelta - -import numpy as np -import pandas as pd -import pytest - -from feast import FeatureView, Field -from feast.types import Float32, Int32 -from tests.integration.feature_repos.universal.entities import driver - - -@pytest.mark.integration -@pytest.mark.universal_online_stores -def test_writing_incorrect_order_fails(environment, universal_data_sources): - # TODO(kevjumba) handle incorrect order later, for now schema must be in the order that the filesource is in - """This test tests if we have incorrect order when writing to offline store. - Specifically, event_timestamp should be the first column to adhere with the filesource column order. - """ - store = environment.feature_store - _, _, data_sources = universal_data_sources - driver_stats = FeatureView( - name="driver_stats", - entities=["driver"], - schema=[ - Field(name="avg_daily_trips", dtype=Int32), - Field(name="conv_rate", dtype=Float32), - ], - source=data_sources.driver, - ) - - now = datetime.utcnow() - ts = pd.Timestamp(now).round("ms") - - entity_df = pd.DataFrame.from_dict( - {"driver_id": [1001, 1002], "event_timestamp": [ts - timedelta(hours=3), ts]} - ) - - store.apply([driver(), driver_stats]) - df = store.get_historical_features( - entity_df=entity_df, - features=["driver_stats:conv_rate", "driver_stats:avg_daily_trips"], - full_feature_names=False, - ).to_df() - - assert df["conv_rate"].isnull().all() - assert df["avg_daily_trips"].isnull().all() - - df = pd.DataFrame.from_dict( - { - "driver_id": [1001, 1002], - "event_timestamp": [ts - timedelta(hours=3), ts], - "conv_rate": [random.random(), random.random()], - "avg_daily_trips": [random.randint(0, 10), random.randint(0, 10)], - "created": [ts, ts], - }, - ) - with pytest.raises(ValueError): - store.write_to_offline_store(driver_stats.name, df, allow_registry_cache=False) - - -@pytest.mark.integration -@pytest.mark.universal_online_stores -def test_writing_incorrect_schema_fails(environment, universal_data_sources): - # TODO(kevjumba) handle incorrect order later, for now schema must be in the order that the filesource is in - """This test tests if we have incorrect attribute when writing to offline store. - Specifically, `incorrect_attribute` is an inccorect column to adhere with the filesource column order. - """ - store = environment.feature_store - _, _, data_sources = universal_data_sources - driver_stats = FeatureView( - name="driver_stats", - entities=["driver"], - schema=[ - Field(name="avg_daily_trips", dtype=Int32), - Field(name="conv_rate", dtype=Float32), - ], - source=data_sources.driver, - ) - - now = datetime.utcnow() - ts = pd.Timestamp(now).round("ms") - - entity_df = pd.DataFrame.from_dict( - {"driver_id": [1001, 1002], "event_timestamp": [ts - timedelta(hours=3), ts]} - ) - - store.apply([driver(), driver_stats]) - df = store.get_historical_features( - entity_df=entity_df, - features=["driver_stats:conv_rate", "driver_stats:avg_daily_trips"], - full_feature_names=False, - ).to_df() - - assert df["conv_rate"].isnull().all() - assert df["avg_daily_trips"].isnull().all() - - df = pd.DataFrame.from_dict( - { - "event_timestamp": [ts - timedelta(hours=3), ts], - "driver_id": [1001, 1002], - "conv_rate": [random.random(), random.random()], - "incorrect_attribute": [random.randint(0, 10), random.randint(0, 10)], - "created": [ts, ts], - }, - ) - with pytest.raises(ValueError): - store.write_to_offline_store(driver_stats.name, df, allow_registry_cache=False) - - -@pytest.mark.integration -@pytest.mark.universal_online_stores -def test_writing_consecutively_to_offline_store(environment, universal_data_sources): - store = environment.feature_store - _, _, data_sources = universal_data_sources - driver_stats = FeatureView( - name="driver_stats", - entities=["driver"], - schema=[ - Field(name="avg_daily_trips", dtype=Int32), - Field(name="conv_rate", dtype=Float32), - Field(name="acc_rate", dtype=Float32), - ], - source=data_sources.driver, - ttl=timedelta(minutes=10), - ) - - now = datetime.utcnow() - ts = pd.Timestamp(now, unit="ns") - - entity_df = pd.DataFrame.from_dict( - { - "driver_id": [1001, 1001], - "event_timestamp": [ts - timedelta(hours=4), ts - timedelta(hours=3)], - } - ) - - store.apply([driver(), driver_stats]) - df = store.get_historical_features( - entity_df=entity_df, - features=["driver_stats:conv_rate", "driver_stats:avg_daily_trips"], - full_feature_names=False, - ).to_df() - - assert df["conv_rate"].isnull().all() - assert df["avg_daily_trips"].isnull().all() - - # This dataframe has its columns ordered exactly as it is in the parquet file generated by driver_test_data.py. - first_df = pd.DataFrame.from_dict( - { - "event_timestamp": [ts - timedelta(hours=4), ts - timedelta(hours=3)], - "driver_id": [1001, 1001], - "conv_rate": [random.random(), random.random()], - "acc_rate": [random.random(), random.random()], - "avg_daily_trips": [random.randint(0, 10), random.randint(0, 10)], - "created": [ts, ts], - }, - ) - store.write_to_offline_store( - driver_stats.name, first_df, allow_registry_cache=False - ) - - after_write_df = store.get_historical_features( - entity_df=entity_df, - features=["driver_stats:conv_rate", "driver_stats:avg_daily_trips"], - full_feature_names=False, - ).to_df() - - assert len(after_write_df) == len(first_df) - assert np.where( - after_write_df["conv_rate"].reset_index(drop=True) - == first_df["conv_rate"].reset_index(drop=True) - ) - assert np.where( - after_write_df["avg_daily_trips"].reset_index(drop=True) - == first_df["avg_daily_trips"].reset_index(drop=True) - ) - - second_df = pd.DataFrame.from_dict( - { - "event_timestamp": [ts - timedelta(hours=1), ts], - "driver_id": [1001, 1001], - "conv_rate": [random.random(), random.random()], - "acc_rate": [random.random(), random.random()], - "avg_daily_trips": [random.randint(0, 10), random.randint(0, 10)], - "created": [ts, ts], - }, - ) - - store.write_to_offline_store( - driver_stats.name, second_df, allow_registry_cache=False - ) - - entity_df = pd.DataFrame.from_dict( - { - "driver_id": [1001, 1001, 1001, 1001], - "event_timestamp": [ - ts - timedelta(hours=4), - ts - timedelta(hours=3), - ts - timedelta(hours=1), - ts, - ], - } - ) - - after_write_df = store.get_historical_features( - entity_df=entity_df, - features=[ - "driver_stats:conv_rate", - "driver_stats:acc_rate", - "driver_stats:avg_daily_trips", - ], - full_feature_names=False, - ).to_df() - - expected_df = pd.concat([first_df, second_df]) - assert len(after_write_df) == len(expected_df) - assert np.where( - after_write_df["conv_rate"].reset_index(drop=True) - == expected_df["conv_rate"].reset_index(drop=True) - ) - assert np.where( - after_write_df["acc_rate"].reset_index(drop=True) - == expected_df["acc_rate"].reset_index(drop=True) - ) - assert np.where( - after_write_df["avg_daily_trips"].reset_index(drop=True) - == expected_df["avg_daily_trips"].reset_index(drop=True) - ) From 1fe4195fa3d54f6116a28cf083260e3f040e65c4 Mon Sep 17 00:00:00 2001 From: Kevin Zhang Date: Fri, 17 Jun 2022 10:31:43 -0700 Subject: [PATCH 12/30] Add redshift function Signed-off-by: Kevin Zhang --- sdk/python/feast/infra/offline_stores/redshift.py | 1 + 1 file changed, 1 insertion(+) diff --git a/sdk/python/feast/infra/offline_stores/redshift.py b/sdk/python/feast/infra/offline_stores/redshift.py index 943bac502c..e02dc4c860 100644 --- a/sdk/python/feast/infra/offline_stores/redshift.py +++ b/sdk/python/feast/infra/offline_stores/redshift.py @@ -12,6 +12,7 @@ Optional, Tuple, Union, + Any, ) import numpy as np From 9b43ba3822a200b11a8f4ea516e94febfd79ca93 Mon Sep 17 00:00:00 2001 From: Kevin Zhang Date: Mon, 20 Jun 2022 17:10:02 -0700 Subject: [PATCH 13/30] Add redshift Signed-off-by: Kevin Zhang --- .../feast/infra/offline_stores/redshift.py | 1 + sdk/python/feast/infra/utils/aws_utils.py | 63 +++++++++++++++++++ sdk/python/tests/conftest.py | 1 + .../feature_repos/repo_configuration.py | 9 +++ .../offline_store/test_offline_write.py | 15 +++-- 5 files changed, 83 insertions(+), 6 deletions(-) diff --git a/sdk/python/feast/infra/offline_stores/redshift.py b/sdk/python/feast/infra/offline_stores/redshift.py index e02dc4c860..dc67108e33 100644 --- a/sdk/python/feast/infra/offline_stores/redshift.py +++ b/sdk/python/feast/infra/offline_stores/redshift.py @@ -14,6 +14,7 @@ Union, Any, ) +from feast.type_map import redshift_to_feast_value_type, feast_value_type_to_pa import numpy as np import pandas as pd diff --git a/sdk/python/feast/infra/utils/aws_utils.py b/sdk/python/feast/infra/utils/aws_utils.py index 7badda9846..50415fee72 100644 --- a/sdk/python/feast/infra/utils/aws_utils.py +++ b/sdk/python/feast/infra/utils/aws_utils.py @@ -234,6 +234,23 @@ def upload_df_to_redshift( table_name=table_name, ) +def delete_redshift_table( + redshift_data_client, + cluster_id: str, + database: str, + user: str, + table_name: str, +): + drop_query = ( + f"DROP {table_name} IF EXISTS" + ) + execute_redshift_statement( + redshift_data_client, + cluster_id, + database, + user, + drop_query, + ) def delete_redshift_table( redshift_data_client, cluster_id: str, database: str, user: str, table_name: str, @@ -379,6 +396,52 @@ def temporarily_upload_df_to_redshift( redshift_data_client, cluster_id, database, user, f"DROP TABLE {table_name}", ) +@contextlib.contextmanager +def temporarily_upload_arrow_table_to_redshift( + table: Union[pyarrow.Table, Path], + redshift_data_client, + cluster_id: str, + database: str, + user: str, + s3_resource, + iam_role: str, + s3_path: str, + table_name: str, + schema: Optional[pyarrow.Schema] = None, + fail_if_exists: bool = True, +) -> Iterator[None]: + """Uploads a Arrow Table to Redshift as a new table with cleanup logic. + + This is essentially the same as upload_arrow_table_to_redshift (check out its docstring for full details), + but unlike it this method is a generator and should be used with `with` block. For example: + + >>> with temporarily_upload_arrow_table_to_redshift(...): # doctest: +SKIP + >>> # Use `table_name` table in Redshift here + >>> # `table_name` will not exist at this point, since it's cleaned up by the `with` block + + """ + # Upload the dataframe to Redshift + upload_arrow_table_to_redshift( + table, + redshift_data_client, + cluster_id, + database, + user, + s3_resource, + s3_path, + iam_role, + table_name, + schema, + fail_if_exists, + ) + + yield + + # Clean up the uploaded Redshift table + execute_redshift_statement( + redshift_data_client, cluster_id, database, user, f"DROP TABLE {table_name}", + ) + @contextlib.contextmanager def temporarily_upload_arrow_table_to_redshift( diff --git a/sdk/python/tests/conftest.py b/sdk/python/tests/conftest.py index bf69a85fa3..bc4ddf9a49 100644 --- a/sdk/python/tests/conftest.py +++ b/sdk/python/tests/conftest.py @@ -31,6 +31,7 @@ IntegrationTestRepoConfig, ) from tests.integration.feature_repos.repo_configuration import ( + OFFLINE_STORE_TO_PROVIDER_CONFIG, AVAILABLE_OFFLINE_STORES, AVAILABLE_ONLINE_STORES, OFFLINE_STORE_TO_PROVIDER_CONFIG, diff --git a/sdk/python/tests/integration/feature_repos/repo_configuration.py b/sdk/python/tests/integration/feature_repos/repo_configuration.py index f4d5defcad..c51daf0246 100644 --- a/sdk/python/tests/integration/feature_repos/repo_configuration.py +++ b/sdk/python/tests/integration/feature_repos/repo_configuration.py @@ -74,11 +74,20 @@ "connection_string": "127.0.0.1:6001,127.0.0.1:6002,127.0.0.1:6003", } +<<<<<<< HEAD OFFLINE_STORE_TO_PROVIDER_CONFIG: Dict[str, DataSourceCreator] = { "file": ("local", FileDataSourceCreator), "gcp": ("gcp", BigQueryDataSourceCreator), "redshift": ("aws", RedshiftDataSourceCreator), "snowflake": ("aws", RedshiftDataSourceCreator), +======= +OFFLINE_STORE_TO_PROVIDER_CONFIG : Dict[ + str, DataSourceCreator] = { + "file": ("local", FileDataSourceCreator), + "gcp": ("gcp", BigQueryDataSourceCreator), + "redshift": ("aws", RedshiftDataSourceCreator), + "snowflake": ("aws", RedshiftDataSourceCreator), +>>>>>>> a1b0c4a6 (Add redshift) } AVAILABLE_OFFLINE_STORES: List[Tuple[str, Type[DataSourceCreator]]] = [ diff --git a/sdk/python/tests/integration/offline_store/test_offline_write.py b/sdk/python/tests/integration/offline_store/test_offline_write.py index 5e7a242513..9557e98714 100644 --- a/sdk/python/tests/integration/offline_store/test_offline_write.py +++ b/sdk/python/tests/integration/offline_store/test_offline_write.py @@ -9,7 +9,6 @@ from feast.types import Float32, Int32 from tests.integration.feature_repos.universal.entities import driver - @pytest.mark.integration @pytest.mark.universal_offline_stores(only=["file", "redshift"]) @pytest.mark.universal_online_stores(only=["sqlite"]) @@ -107,7 +106,6 @@ def test_writing_incorrect_schema_fails(environment, universal_data_sources): driver_stats.name, expected_df, allow_registry_cache=False ) - @pytest.mark.integration @pytest.mark.universal_offline_stores(only=["file", "redshift"]) @pytest.mark.universal_online_stores(only=["sqlite"]) @@ -127,7 +125,7 @@ def test_writing_consecutively_to_offline_store(environment, universal_data_sour ) now = datetime.utcnow() - ts = pd.Timestamp(now, unit="ns") + ts = pd.Timestamp(now, unit="ms", tz="UTC").round("ms") entity_df = pd.DataFrame.from_dict( { @@ -148,7 +146,7 @@ def test_writing_consecutively_to_offline_store(environment, universal_data_sour first_df = pd.DataFrame.from_dict( { - "event_timestamp": [ts - timedelta(hours=4), ts - timedelta(hours=3)], + "event_timestamp": [now-timedelta(hours=4), now - timedelta(hours=3)], "driver_id": [1001, 1001], "conv_rate": [random.random(), random.random()], "acc_rate": [random.random(), random.random()], @@ -156,13 +154,18 @@ def test_writing_consecutively_to_offline_store(environment, universal_data_sour "created": [ts, ts], }, ) + store._write_to_offline_store( driver_stats.name, first_df, allow_registry_cache=False ) after_write_df = store.get_historical_features( entity_df=entity_df, - features=["driver_stats:conv_rate", "driver_stats:avg_daily_trips"], + features=[ + "driver_stats:conv_rate", + "driver_stats:acc_rate", + "driver_stats:avg_daily_trips", + ], full_feature_names=False, ).to_df() @@ -226,4 +229,4 @@ def test_writing_consecutively_to_offline_store(environment, universal_data_sour assert np.where( after_write_df["avg_daily_trips"].reset_index(drop=True) == expected_df["avg_daily_trips"].reset_index(drop=True) - ) + ) \ No newline at end of file From 155a56a10c49e97bc4032f014b914ee87e62bea7 Mon Sep 17 00:00:00 2001 From: Kevin Zhang Date: Mon, 20 Jun 2022 17:22:03 -0700 Subject: [PATCH 14/30] Fix Signed-off-by: Kevin Zhang --- .../integration/offline_store/test_offline_write.py | 11 +++-------- 1 file changed, 3 insertions(+), 8 deletions(-) diff --git a/sdk/python/tests/integration/offline_store/test_offline_write.py b/sdk/python/tests/integration/offline_store/test_offline_write.py index 9557e98714..f1775db6bf 100644 --- a/sdk/python/tests/integration/offline_store/test_offline_write.py +++ b/sdk/python/tests/integration/offline_store/test_offline_write.py @@ -125,7 +125,7 @@ def test_writing_consecutively_to_offline_store(environment, universal_data_sour ) now = datetime.utcnow() - ts = pd.Timestamp(now, unit="ms", tz="UTC").round("ms") + ts = pd.Timestamp(now, unit="ns") entity_df = pd.DataFrame.from_dict( { @@ -146,7 +146,7 @@ def test_writing_consecutively_to_offline_store(environment, universal_data_sour first_df = pd.DataFrame.from_dict( { - "event_timestamp": [now-timedelta(hours=4), now - timedelta(hours=3)], + "event_timestamp": [ts - timedelta(hours=4), ts - timedelta(hours=3)], "driver_id": [1001, 1001], "conv_rate": [random.random(), random.random()], "acc_rate": [random.random(), random.random()], @@ -154,18 +154,13 @@ def test_writing_consecutively_to_offline_store(environment, universal_data_sour "created": [ts, ts], }, ) - store._write_to_offline_store( driver_stats.name, first_df, allow_registry_cache=False ) after_write_df = store.get_historical_features( entity_df=entity_df, - features=[ - "driver_stats:conv_rate", - "driver_stats:acc_rate", - "driver_stats:avg_daily_trips", - ], + features=["driver_stats:conv_rate", "driver_stats:avg_daily_trips"], full_feature_names=False, ).to_df() From a26336544a6a48a95046af1bd0a1a8ebf89d6f61 Mon Sep 17 00:00:00 2001 From: Kevin Zhang Date: Mon, 20 Jun 2022 17:23:45 -0700 Subject: [PATCH 15/30] Lint Signed-off-by: Kevin Zhang --- .../feast/infra/offline_stores/redshift.py | 2 -- sdk/python/feast/infra/utils/aws_utils.py | 21 +++++++------------ sdk/python/tests/conftest.py | 4 +++- .../feature_repos/repo_configuration.py | 6 ++++++ .../offline_store/test_offline_write.py | 4 +++- 5 files changed, 20 insertions(+), 17 deletions(-) diff --git a/sdk/python/feast/infra/offline_stores/redshift.py b/sdk/python/feast/infra/offline_stores/redshift.py index dc67108e33..943bac502c 100644 --- a/sdk/python/feast/infra/offline_stores/redshift.py +++ b/sdk/python/feast/infra/offline_stores/redshift.py @@ -12,9 +12,7 @@ Optional, Tuple, Union, - Any, ) -from feast.type_map import redshift_to_feast_value_type, feast_value_type_to_pa import numpy as np import pandas as pd diff --git a/sdk/python/feast/infra/utils/aws_utils.py b/sdk/python/feast/infra/utils/aws_utils.py index 50415fee72..0d9d282ab4 100644 --- a/sdk/python/feast/infra/utils/aws_utils.py +++ b/sdk/python/feast/infra/utils/aws_utils.py @@ -234,24 +234,16 @@ def upload_df_to_redshift( table_name=table_name, ) + def delete_redshift_table( - redshift_data_client, - cluster_id: str, - database: str, - user: str, - table_name: str, + redshift_data_client, cluster_id: str, database: str, user: str, table_name: str, ): - drop_query = ( - f"DROP {table_name} IF EXISTS" - ) + drop_query = f"DROP {table_name} IF EXISTS" execute_redshift_statement( - redshift_data_client, - cluster_id, - database, - user, - drop_query, + redshift_data_client, cluster_id, database, user, drop_query, ) +<<<<<<< HEAD def delete_redshift_table( redshift_data_client, cluster_id: str, database: str, user: str, table_name: str, ): @@ -260,6 +252,8 @@ def delete_redshift_table( redshift_data_client, cluster_id, database, user, drop_query, ) +======= +>>>>>>> fec6cc0b (Lint) def upload_arrow_table_to_redshift( table: Union[pyarrow.Table, Path], @@ -396,6 +390,7 @@ def temporarily_upload_df_to_redshift( redshift_data_client, cluster_id, database, user, f"DROP TABLE {table_name}", ) + @contextlib.contextmanager def temporarily_upload_arrow_table_to_redshift( table: Union[pyarrow.Table, Path], diff --git a/sdk/python/tests/conftest.py b/sdk/python/tests/conftest.py index bc4ddf9a49..0290b5b440 100644 --- a/sdk/python/tests/conftest.py +++ b/sdk/python/tests/conftest.py @@ -31,7 +31,6 @@ IntegrationTestRepoConfig, ) from tests.integration.feature_repos.repo_configuration import ( - OFFLINE_STORE_TO_PROVIDER_CONFIG, AVAILABLE_OFFLINE_STORES, AVAILABLE_ONLINE_STORES, OFFLINE_STORE_TO_PROVIDER_CONFIG, @@ -285,9 +284,12 @@ def pytest_generate_tests(metafunc: pytest.Metafunc): _config_cache[c] = c configs.append(_config_cache[c]) +<<<<<<< HEAD else: # No offline stores requested -> setting the default or first available offline_stores = [("local", FileDataSourceCreator)] +======= +>>>>>>> fec6cc0b (Lint) metafunc.parametrize( "environment", configs, indirect=True, ids=[str(c) for c in configs] diff --git a/sdk/python/tests/integration/feature_repos/repo_configuration.py b/sdk/python/tests/integration/feature_repos/repo_configuration.py index c51daf0246..75835f1c56 100644 --- a/sdk/python/tests/integration/feature_repos/repo_configuration.py +++ b/sdk/python/tests/integration/feature_repos/repo_configuration.py @@ -75,11 +75,15 @@ } <<<<<<< HEAD +<<<<<<< HEAD +======= +>>>>>>> fec6cc0b (Lint) OFFLINE_STORE_TO_PROVIDER_CONFIG: Dict[str, DataSourceCreator] = { "file": ("local", FileDataSourceCreator), "gcp": ("gcp", BigQueryDataSourceCreator), "redshift": ("aws", RedshiftDataSourceCreator), "snowflake": ("aws", RedshiftDataSourceCreator), +<<<<<<< HEAD ======= OFFLINE_STORE_TO_PROVIDER_CONFIG : Dict[ str, DataSourceCreator] = { @@ -88,6 +92,8 @@ "redshift": ("aws", RedshiftDataSourceCreator), "snowflake": ("aws", RedshiftDataSourceCreator), >>>>>>> a1b0c4a6 (Add redshift) +======= +>>>>>>> fec6cc0b (Lint) } AVAILABLE_OFFLINE_STORES: List[Tuple[str, Type[DataSourceCreator]]] = [ diff --git a/sdk/python/tests/integration/offline_store/test_offline_write.py b/sdk/python/tests/integration/offline_store/test_offline_write.py index f1775db6bf..5e7a242513 100644 --- a/sdk/python/tests/integration/offline_store/test_offline_write.py +++ b/sdk/python/tests/integration/offline_store/test_offline_write.py @@ -9,6 +9,7 @@ from feast.types import Float32, Int32 from tests.integration.feature_repos.universal.entities import driver + @pytest.mark.integration @pytest.mark.universal_offline_stores(only=["file", "redshift"]) @pytest.mark.universal_online_stores(only=["sqlite"]) @@ -106,6 +107,7 @@ def test_writing_incorrect_schema_fails(environment, universal_data_sources): driver_stats.name, expected_df, allow_registry_cache=False ) + @pytest.mark.integration @pytest.mark.universal_offline_stores(only=["file", "redshift"]) @pytest.mark.universal_online_stores(only=["sqlite"]) @@ -224,4 +226,4 @@ def test_writing_consecutively_to_offline_store(environment, universal_data_sour assert np.where( after_write_df["avg_daily_trips"].reset_index(drop=True) == expected_df["avg_daily_trips"].reset_index(drop=True) - ) \ No newline at end of file + ) From 3a51046cc5600e7891c60df0b0d94171c2ed102e Mon Sep 17 00:00:00 2001 From: Kevin Zhang Date: Tue, 21 Jun 2022 08:31:47 -0700 Subject: [PATCH 16/30] fix Signed-off-by: Kevin Zhang --- sdk/python/feast/infra/offline_stores/redshift.py | 2 +- sdk/python/feast/infra/utils/aws_utils.py | 4 ---- sdk/python/tests/conftest.py | 3 --- 3 files changed, 1 insertion(+), 8 deletions(-) diff --git a/sdk/python/feast/infra/offline_stores/redshift.py b/sdk/python/feast/infra/offline_stores/redshift.py index 943bac502c..c80927e91f 100644 --- a/sdk/python/feast/infra/offline_stores/redshift.py +++ b/sdk/python/feast/infra/offline_stores/redshift.py @@ -357,7 +357,7 @@ def offline_write_batch( s3_resource=s3_resource, s3_path=f"{config.offline_store.s3_staging_location}/push/{uuid.uuid4()}.parquet", iam_role=config.offline_store.iam_role, - table_name=redshift_options.table, + table_name=redshift_options.table , schema=pa_schema, fail_if_exists=False, ) diff --git a/sdk/python/feast/infra/utils/aws_utils.py b/sdk/python/feast/infra/utils/aws_utils.py index 0d9d282ab4..b284d24231 100644 --- a/sdk/python/feast/infra/utils/aws_utils.py +++ b/sdk/python/feast/infra/utils/aws_utils.py @@ -243,7 +243,6 @@ def delete_redshift_table( redshift_data_client, cluster_id, database, user, drop_query, ) -<<<<<<< HEAD def delete_redshift_table( redshift_data_client, cluster_id: str, database: str, user: str, table_name: str, ): @@ -252,9 +251,6 @@ def delete_redshift_table( redshift_data_client, cluster_id, database, user, drop_query, ) -======= ->>>>>>> fec6cc0b (Lint) - def upload_arrow_table_to_redshift( table: Union[pyarrow.Table, Path], redshift_data_client, diff --git a/sdk/python/tests/conftest.py b/sdk/python/tests/conftest.py index 0290b5b440..bf69a85fa3 100644 --- a/sdk/python/tests/conftest.py +++ b/sdk/python/tests/conftest.py @@ -284,12 +284,9 @@ def pytest_generate_tests(metafunc: pytest.Metafunc): _config_cache[c] = c configs.append(_config_cache[c]) -<<<<<<< HEAD else: # No offline stores requested -> setting the default or first available offline_stores = [("local", FileDataSourceCreator)] -======= ->>>>>>> fec6cc0b (Lint) metafunc.parametrize( "environment", configs, indirect=True, ids=[str(c) for c in configs] From 6de1a3fef2838efc665bb924321fe6a33a1e7245 Mon Sep 17 00:00:00 2001 From: Kevin Zhang Date: Tue, 21 Jun 2022 08:33:00 -0700 Subject: [PATCH 17/30] fix Signed-off-by: Kevin Zhang --- sdk/python/feast/infra/offline_stores/redshift.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/sdk/python/feast/infra/offline_stores/redshift.py b/sdk/python/feast/infra/offline_stores/redshift.py index c80927e91f..943bac502c 100644 --- a/sdk/python/feast/infra/offline_stores/redshift.py +++ b/sdk/python/feast/infra/offline_stores/redshift.py @@ -357,7 +357,7 @@ def offline_write_batch( s3_resource=s3_resource, s3_path=f"{config.offline_store.s3_staging_location}/push/{uuid.uuid4()}.parquet", iam_role=config.offline_store.iam_role, - table_name=redshift_options.table , + table_name=redshift_options.table, schema=pa_schema, fail_if_exists=False, ) From 1908102df9a4dbf19b786fa75597da486b014dce Mon Sep 17 00:00:00 2001 From: Kevin Zhang Date: Tue, 21 Jun 2022 14:45:47 -0700 Subject: [PATCH 18/30] Fix test Signed-off-by: Kevin Zhang --- .../online_store/test_universal_online.py | 1202 ++++++++--------- 1 file changed, 601 insertions(+), 601 deletions(-) diff --git a/sdk/python/tests/integration/online_store/test_universal_online.py b/sdk/python/tests/integration/online_store/test_universal_online.py index c068e04111..3d066e7ba7 100644 --- a/sdk/python/tests/integration/online_store/test_universal_online.py +++ b/sdk/python/tests/integration/online_store/test_universal_online.py @@ -441,604 +441,604 @@ def test_online_retrieval_with_event_timestamps( ) -@pytest.mark.integration -@pytest.mark.universal_online_stores -# @pytest.mark.goserver Disabling because the go fs tests are flaking in CI. TODO(achals): uncomment after fixed. -@pytest.mark.parametrize("full_feature_names", [True, False], ids=lambda v: str(v)) -def test_stream_feature_view_online_retrieval( - environment, universal_data_sources, feature_server_endpoint, full_feature_names -): - """ - Tests materialization and online retrieval for stream feature views. - - This test is separate from test_online_retrieval since combining feature views and - stream feature views into a single test resulted in test flakiness. This is tech - debt that should be resolved soon. - """ - # Set up feature store. - fs = environment.feature_store - entities, datasets, data_sources = universal_data_sources - feature_views = construct_universal_feature_views(data_sources) - pushable_feature_view = feature_views.pushed_locations - fs.apply([location(), pushable_feature_view]) - - # Materialize. - fs.materialize( - environment.start_date - timedelta(days=1), - environment.end_date + timedelta(days=1), - ) - - # Get online features by randomly sampling 10 entities that exist in the batch source. - sample_locations = datasets.location_df.sample(10)["location_id"] - entity_rows = [ - {"location_id": sample_location} for sample_location in sample_locations - ] - - feature_refs = [ - "pushable_location_stats:temperature", - ] - unprefixed_feature_refs = [f.rsplit(":", 1)[-1] for f in feature_refs if ":" in f] - - online_features_dict = get_online_features_dict( - environment=environment, - endpoint=feature_server_endpoint, - features=feature_refs, - entity_rows=entity_rows, - full_feature_names=full_feature_names, - ) - - # Check that the response has the expected set of keys. - keys = set(online_features_dict.keys()) - expected_keys = set( - f.replace(":", "__") if full_feature_names else f.split(":")[-1] - for f in feature_refs - ) | {"location_id"} - assert ( - keys == expected_keys - ), f"Response keys are different from expected: {keys - expected_keys} (extra) and {expected_keys - keys} (missing)" - - # Check that the feature values match. - tc = unittest.TestCase() - for i, entity_row in enumerate(entity_rows): - df_features = get_latest_feature_values_from_location_df( - entity_row, datasets.location_df - ) - - assert df_features["location_id"] == online_features_dict["location_id"][i] - for unprefixed_feature_ref in unprefixed_feature_refs: - tc.assertAlmostEqual( - df_features[unprefixed_feature_ref], - online_features_dict[ - response_feature_name( - unprefixed_feature_ref, feature_refs, full_feature_names - ) - ][i], - delta=0.0001, - ) - - -@pytest.mark.integration -@pytest.mark.universal_online_stores -# @pytest.mark.goserver Disabling because the go fs tests are flaking in CI. TODO(achals): uncomment after fixed. -@pytest.mark.parametrize("full_feature_names", [True, False], ids=lambda v: str(v)) -def test_online_retrieval( - environment, universal_data_sources, feature_server_endpoint, full_feature_names -): - fs = environment.feature_store - entities, datasets, data_sources = universal_data_sources - feature_views = construct_universal_feature_views(data_sources) - - feature_service = FeatureService( - "convrate_plus100", - features=[ - feature_views.driver[["conv_rate"]], - feature_views.driver_odfv, - feature_views.customer[["current_balance"]], - ], - ) - feature_service_entity_mapping = FeatureService( - name="entity_mapping", - features=[ - feature_views.location.with_name("origin").with_join_key_map( - {"location_id": "origin_id"} - ), - feature_views.location.with_name("destination").with_join_key_map( - {"location_id": "destination_id"} - ), - ], - ) - - feast_objects = [] - feast_objects.extend(feature_views.values()) - feast_objects.extend( - [ - driver(), - customer(), - location(), - feature_service, - feature_service_entity_mapping, - ] - ) - fs.apply(feast_objects) - fs.materialize( - environment.start_date - timedelta(days=1), - environment.end_date + timedelta(days=1), - ) - - entity_sample = datasets.orders_df.sample(10)[ - ["customer_id", "driver_id", "order_id", "event_timestamp"] - ] - orders_df = datasets.orders_df[ - ( - datasets.orders_df["customer_id"].isin(entity_sample["customer_id"]) - & datasets.orders_df["driver_id"].isin(entity_sample["driver_id"]) - ) - ] - - sample_drivers = entity_sample["driver_id"] - drivers_df = datasets.driver_df[ - datasets.driver_df["driver_id"].isin(sample_drivers) - ] - - sample_customers = entity_sample["customer_id"] - customers_df = datasets.customer_df[ - datasets.customer_df["customer_id"].isin(sample_customers) - ] - - location_pairs = np.array(list(itertools.permutations(entities.location_vals, 2))) - sample_location_pairs = location_pairs[ - np.random.choice(len(location_pairs), 10) - ].T.tolist() - origins_df = datasets.location_df[ - datasets.location_df["location_id"].isin(sample_location_pairs[0]) - ] - destinations_df = datasets.location_df[ - datasets.location_df["location_id"].isin(sample_location_pairs[1]) - ] - - global_df = datasets.global_df - - entity_rows = [ - {"driver_id": d, "customer_id": c, "val_to_add": 50} - for (d, c) in zip(sample_drivers, sample_customers) - ] - - feature_refs = [ - "driver_stats:conv_rate", - "driver_stats:avg_daily_trips", - "customer_profile:current_balance", - "customer_profile:avg_passenger_count", - "customer_profile:lifetime_trip_count", - "conv_rate_plus_100:conv_rate_plus_100", - "conv_rate_plus_100:conv_rate_plus_val_to_add", - "order:order_is_success", - "global_stats:num_rides", - "global_stats:avg_ride_length", - ] - unprefixed_feature_refs = [f.rsplit(":", 1)[-1] for f in feature_refs if ":" in f] - # Remove the on demand feature view output features, since they're not present in the source dataframe - unprefixed_feature_refs.remove("conv_rate_plus_100") - unprefixed_feature_refs.remove("conv_rate_plus_val_to_add") - - online_features_dict = get_online_features_dict( - environment=environment, - endpoint=feature_server_endpoint, - features=feature_refs, - entity_rows=entity_rows, - full_feature_names=full_feature_names, - ) - - # Test that the on demand feature views compute properly even if the dependent conv_rate - # feature isn't requested. - online_features_no_conv_rate = get_online_features_dict( - environment=environment, - endpoint=feature_server_endpoint, - features=[ref for ref in feature_refs if ref != "driver_stats:conv_rate"], - entity_rows=entity_rows, - full_feature_names=full_feature_names, - ) - - assert online_features_no_conv_rate is not None - - keys = set(online_features_dict.keys()) - expected_keys = set( - f.replace(":", "__") if full_feature_names else f.split(":")[-1] - for f in feature_refs - ) | {"customer_id", "driver_id"} - assert ( - keys == expected_keys - ), f"Response keys are different from expected: {keys - expected_keys} (extra) and {expected_keys - keys} (missing)" - - tc = unittest.TestCase() - for i, entity_row in enumerate(entity_rows): - df_features = get_latest_feature_values_from_dataframes( - driver_df=drivers_df, - customer_df=customers_df, - orders_df=orders_df, - global_df=global_df, - entity_row=entity_row, - ) - - assert df_features["customer_id"] == online_features_dict["customer_id"][i] - assert df_features["driver_id"] == online_features_dict["driver_id"][i] - tc.assertAlmostEqual( - online_features_dict[ - response_feature_name( - "conv_rate_plus_100", feature_refs, full_feature_names - ) - ][i], - df_features["conv_rate"] + 100, - delta=0.0001, - ) - tc.assertAlmostEqual( - online_features_dict[ - response_feature_name( - "conv_rate_plus_val_to_add", feature_refs, full_feature_names - ) - ][i], - df_features["conv_rate"] + df_features["val_to_add"], - delta=0.0001, - ) - for unprefixed_feature_ref in unprefixed_feature_refs: - tc.assertAlmostEqual( - df_features[unprefixed_feature_ref], - online_features_dict[ - response_feature_name( - unprefixed_feature_ref, feature_refs, full_feature_names - ) - ][i], - delta=0.0001, - ) - - # Check what happens for missing values - missing_responses_dict = get_online_features_dict( - environment=environment, - endpoint=feature_server_endpoint, - features=feature_refs, - entity_rows=[{"driver_id": 0, "customer_id": 0, "val_to_add": 100}], - full_feature_names=full_feature_names, - ) - assert missing_responses_dict is not None - for unprefixed_feature_ref in unprefixed_feature_refs: - if unprefixed_feature_ref not in {"num_rides", "avg_ride_length"}: - tc.assertIsNone( - missing_responses_dict[ - response_feature_name( - unprefixed_feature_ref, feature_refs, full_feature_names - ) - ][0] - ) - - # Check what happens for missing request data - with pytest.raises(RequestDataNotFoundInEntityRowsException): - get_online_features_dict( - environment=environment, - endpoint=feature_server_endpoint, - features=feature_refs, - entity_rows=[{"driver_id": 0, "customer_id": 0}], - full_feature_names=full_feature_names, - ) - - assert_feature_service_correctness( - environment, - feature_server_endpoint, - feature_service, - entity_rows, - full_feature_names, - drivers_df, - customers_df, - orders_df, - global_df, - ) - - entity_rows = [ - {"origin_id": origin, "destination_id": destination} - for (_driver, _customer, origin, destination) in zip( - sample_drivers, sample_customers, *sample_location_pairs - ) - ] - assert_feature_service_entity_mapping_correctness( - environment, - feature_server_endpoint, - feature_service_entity_mapping, - entity_rows, - full_feature_names, - origins_df, - destinations_df, - ) - - -@pytest.mark.integration -@pytest.mark.universal_online_stores(only=["redis"]) -def test_online_store_cleanup(environment, universal_data_sources): - """ - Some online store implementations (like Redis) keep features from different features views - but with common entities together. - This might end up with deletion of all features attached to the entity, - when only one feature view was deletion target (see https://github.com/feast-dev/feast/issues/2150). - - Plan: - 1. Register two feature views with common entity "driver" - 2. Materialize data - 3. Check if features are available (via online retrieval) - 4. Delete one feature view - 5. Check that features for other are still available - 6. Delete another feature view (and create again) - 7. Verify that features for both feature view were deleted - """ - fs = environment.feature_store - entities, datasets, data_sources = universal_data_sources - driver_stats_fv = construct_universal_feature_views(data_sources).driver - - driver_entities = entities.driver_vals - df = pd.DataFrame( - { - "ts_1": [environment.end_date] * len(driver_entities), - "created_ts": [environment.end_date] * len(driver_entities), - "driver_id": driver_entities, - "value": np.random.random(size=len(driver_entities)), - } - ) - - ds = environment.data_source_creator.create_data_source( - df, destination_name="simple_driver_dataset" - ) - - simple_driver_fv = driver_feature_view( - data_source=ds, name="test_universal_online_simple_driver" - ) - - fs.apply([driver(), simple_driver_fv, driver_stats_fv]) - - fs.materialize( - environment.start_date - timedelta(days=1), - environment.end_date + timedelta(days=1), - ) - expected_values = df.sort_values(by="driver_id") - - features = [f"{simple_driver_fv.name}:value"] - entity_rows = [{"driver_id": driver_id} for driver_id in sorted(driver_entities)] - - online_features = fs.get_online_features( - features=features, entity_rows=entity_rows - ).to_dict() - assert np.allclose(expected_values["value"], online_features["value"]) - - fs.apply( - objects=[simple_driver_fv], objects_to_delete=[driver_stats_fv], partial=False - ) - - online_features = fs.get_online_features( - features=features, entity_rows=entity_rows - ).to_dict() - assert np.allclose(expected_values["value"], online_features["value"]) - - fs.apply(objects=[], objects_to_delete=[simple_driver_fv], partial=False) - - def eventually_apply() -> Tuple[None, bool]: - try: - fs.apply([simple_driver_fv]) - except BotoCoreError: - return None, False - - return None, True - - # Online store backend might have eventual consistency in schema update - # So recreating table that was just deleted might need some retries - wait_retry_backoff(eventually_apply, timeout_secs=60) - - online_features = fs.get_online_features( - features=features, entity_rows=entity_rows - ).to_dict() - assert all(v is None for v in online_features["value"]) - - -def response_feature_name( - feature: str, feature_refs: List[str], full_feature_names: bool -) -> str: - if not full_feature_names: - return feature - - for feature_ref in feature_refs: - if feature_ref.endswith(feature): - return feature_ref.replace(":", "__") - - return feature - - -def get_latest_row(entity_row, df, join_key, entity_key): - rows = df[df[join_key] == entity_row[entity_key]] - return rows.loc[rows["event_timestamp"].idxmax()].to_dict() - - -def get_latest_feature_values_from_dataframes( - driver_df, - customer_df, - orders_df, - entity_row, - global_df=None, - origin_df=None, - destination_df=None, -): - latest_driver_row = get_latest_row(entity_row, driver_df, "driver_id", "driver_id") - latest_customer_row = get_latest_row( - entity_row, customer_df, "customer_id", "customer_id" - ) - - # Since the event timestamp columns may contain timestamps of different timezones, - # we must first convert the timestamps to UTC before we can compare them. - order_rows = orders_df[ - (orders_df["driver_id"] == entity_row["driver_id"]) - & (orders_df["customer_id"] == entity_row["customer_id"]) - ] - timestamps = order_rows[["event_timestamp"]] - timestamps["event_timestamp"] = pd.to_datetime( - timestamps["event_timestamp"], utc=True - ) - max_index = timestamps["event_timestamp"].idxmax() - latest_orders_row = order_rows.loc[max_index] - - if global_df is not None: - latest_global_row = global_df.loc[ - global_df["event_timestamp"].idxmax() - ].to_dict() - if origin_df is not None: - latest_location_row = get_latest_feature_values_for_location_df( - entity_row, origin_df, destination_df - ) - - request_data_features = entity_row.copy() - request_data_features.pop("driver_id") - request_data_features.pop("customer_id") - if global_df is not None: - return { - **latest_customer_row, - **latest_driver_row, - **latest_orders_row, - **latest_global_row, - **request_data_features, - } - if origin_df is not None: - request_data_features.pop("origin_id") - request_data_features.pop("destination_id") - return { - **latest_customer_row, - **latest_driver_row, - **latest_orders_row, - **latest_location_row, - **request_data_features, - } - return { - **latest_customer_row, - **latest_driver_row, - **latest_orders_row, - **request_data_features, - } - - -def get_latest_feature_values_for_location_df(entity_row, origin_df, destination_df): - latest_origin_row = get_latest_row( - entity_row, origin_df, "location_id", "origin_id" - ) - latest_destination_row = get_latest_row( - entity_row, destination_df, "location_id", "destination_id" - ) - # Need full feature names for shadow entities - latest_origin_row["origin__temperature"] = latest_origin_row.pop("temperature") - latest_destination_row["destination__temperature"] = latest_destination_row.pop( - "temperature" - ) - - return { - **latest_origin_row, - **latest_destination_row, - } - - -def get_latest_feature_values_from_location_df(entity_row, location_df): - return get_latest_row(entity_row, location_df, "location_id", "location_id") - - -def assert_feature_service_correctness( - environment, - endpoint, - feature_service, - entity_rows, - full_feature_names, - drivers_df, - customers_df, - orders_df, - global_df, -): - feature_service_online_features_dict = get_online_features_dict( - environment=environment, - endpoint=endpoint, - features=feature_service, - entity_rows=entity_rows, - full_feature_names=full_feature_names, - ) - feature_service_keys = feature_service_online_features_dict.keys() - expected_feature_refs = [ - f"{projection.name_to_use()}__{feature.name}" - if full_feature_names - else feature.name - for projection in feature_service.feature_view_projections - for feature in projection.features - ] - assert set(feature_service_keys) == set(expected_feature_refs) | { - "customer_id", - "driver_id", - } - - tc = unittest.TestCase() - for i, entity_row in enumerate(entity_rows): - df_features = get_latest_feature_values_from_dataframes( - driver_df=drivers_df, - customer_df=customers_df, - orders_df=orders_df, - global_df=global_df, - entity_row=entity_row, - ) - tc.assertAlmostEqual( - feature_service_online_features_dict[ - response_feature_name( - "conv_rate_plus_100", expected_feature_refs, full_feature_names - ) - ][i], - df_features["conv_rate"] + 100, - delta=0.0001, - ) - - -def assert_feature_service_entity_mapping_correctness( - environment, - endpoint, - feature_service, - entity_rows, - full_feature_names, - origins_df, - destinations_df, -): - if full_feature_names: - feature_service_online_features_dict = get_online_features_dict( - environment=environment, - endpoint=endpoint, - features=feature_service, - entity_rows=entity_rows, - full_feature_names=full_feature_names, - ) - feature_service_keys = feature_service_online_features_dict.keys() - - expected_features = [ - f"{projection.name_to_use()}__{feature.name}" - if full_feature_names - else feature.name - for projection in feature_service.feature_view_projections - for feature in projection.features - ] - assert set(feature_service_keys) == set(expected_features) | { - "destination_id", - "origin_id", - } - - for i, entity_row in enumerate(entity_rows): - df_features = get_latest_feature_values_for_location_df( - origin_df=origins_df, - destination_df=destinations_df, - entity_row=entity_row, - ) - for feature_name in ["origin__temperature", "destination__temperature"]: - assert ( - feature_service_online_features_dict[feature_name][i] - == df_features[feature_name] - ) - else: - # using 2 of the same FeatureView without full_feature_names=True will result in collision - with pytest.raises(FeatureNameCollisionError): - get_online_features_dict( - environment=environment, - endpoint=endpoint, - features=feature_service, - entity_rows=entity_rows, - full_feature_names=full_feature_names, - ) +# @pytest.mark.integration +# @pytest.mark.universal_online_stores +# # @pytest.mark.goserver Disabling because the go fs tests are flaking in CI. TODO(achals): uncomment after fixed. +# @pytest.mark.parametrize("full_feature_names", [True, False], ids=lambda v: str(v)) +# def test_stream_feature_view_online_retrieval( +# environment, universal_data_sources, feature_server_endpoint, full_feature_names +# ): +# """ +# Tests materialization and online retrieval for stream feature views. + +# This test is separate from test_online_retrieval since combining feature views and +# stream feature views into a single test resulted in test flakiness. This is tech +# debt that should be resolved soon. +# """ +# # Set up feature store. +# fs = environment.feature_store +# entities, datasets, data_sources = universal_data_sources +# feature_views = construct_universal_feature_views(data_sources) +# pushable_feature_view = feature_views.pushed_locations +# fs.apply([location(), pushable_feature_view]) + +# # Materialize. +# fs.materialize( +# environment.start_date - timedelta(days=1), +# environment.end_date + timedelta(days=1), +# ) + +# # Get online features by randomly sampling 10 entities that exist in the batch source. +# sample_locations = datasets.location_df.sample(10)["location_id"] +# entity_rows = [ +# {"location_id": sample_location} for sample_location in sample_locations +# ] + +# feature_refs = [ +# "pushable_location_stats:temperature", +# ] +# unprefixed_feature_refs = [f.rsplit(":", 1)[-1] for f in feature_refs if ":" in f] + +# online_features_dict = get_online_features_dict( +# environment=environment, +# endpoint=feature_server_endpoint, +# features=feature_refs, +# entity_rows=entity_rows, +# full_feature_names=full_feature_names, +# ) + +# # Check that the response has the expected set of keys. +# keys = set(online_features_dict.keys()) +# expected_keys = set( +# f.replace(":", "__") if full_feature_names else f.split(":")[-1] +# for f in feature_refs +# ) | {"location_id"} +# assert ( +# keys == expected_keys +# ), f"Response keys are different from expected: {keys - expected_keys} (extra) and {expected_keys - keys} (missing)" + +# # Check that the feature values match. +# tc = unittest.TestCase() +# for i, entity_row in enumerate(entity_rows): +# df_features = get_latest_feature_values_from_location_df( +# entity_row, datasets.location_df +# ) + +# assert df_features["location_id"] == online_features_dict["location_id"][i] +# for unprefixed_feature_ref in unprefixed_feature_refs: +# tc.assertAlmostEqual( +# df_features[unprefixed_feature_ref], +# online_features_dict[ +# response_feature_name( +# unprefixed_feature_ref, feature_refs, full_feature_names +# ) +# ][i], +# delta=0.0001, +# ) + + +# @pytest.mark.integration +# @pytest.mark.universal_online_stores +# # @pytest.mark.goserver Disabling because the go fs tests are flaking in CI. TODO(achals): uncomment after fixed. +# @pytest.mark.parametrize("full_feature_names", [True, False], ids=lambda v: str(v)) +# def test_online_retrieval( +# environment, universal_data_sources, feature_server_endpoint, full_feature_names +# ): +# fs = environment.feature_store +# entities, datasets, data_sources = universal_data_sources +# feature_views = construct_universal_feature_views(data_sources) + +# feature_service = FeatureService( +# "convrate_plus100", +# features=[ +# feature_views.driver[["conv_rate"]], +# feature_views.driver_odfv, +# feature_views.customer[["current_balance"]], +# ], +# ) +# feature_service_entity_mapping = FeatureService( +# name="entity_mapping", +# features=[ +# feature_views.location.with_name("origin").with_join_key_map( +# {"location_id": "origin_id"} +# ), +# feature_views.location.with_name("destination").with_join_key_map( +# {"location_id": "destination_id"} +# ), +# ], +# ) + +# feast_objects = [] +# feast_objects.extend(feature_views.values()) +# feast_objects.extend( +# [ +# driver(), +# customer(), +# location(), +# feature_service, +# feature_service_entity_mapping, +# ] +# ) +# fs.apply(feast_objects) +# fs.materialize( +# environment.start_date - timedelta(days=1), +# environment.end_date + timedelta(days=1), +# ) + +# entity_sample = datasets.orders_df.sample(10)[ +# ["customer_id", "driver_id", "order_id", "event_timestamp"] +# ] +# orders_df = datasets.orders_df[ +# ( +# datasets.orders_df["customer_id"].isin(entity_sample["customer_id"]) +# & datasets.orders_df["driver_id"].isin(entity_sample["driver_id"]) +# ) +# ] + +# sample_drivers = entity_sample["driver_id"] +# drivers_df = datasets.driver_df[ +# datasets.driver_df["driver_id"].isin(sample_drivers) +# ] + +# sample_customers = entity_sample["customer_id"] +# customers_df = datasets.customer_df[ +# datasets.customer_df["customer_id"].isin(sample_customers) +# ] + +# location_pairs = np.array(list(itertools.permutations(entities.location_vals, 2))) +# sample_location_pairs = location_pairs[ +# np.random.choice(len(location_pairs), 10) +# ].T.tolist() +# origins_df = datasets.location_df[ +# datasets.location_df["location_id"].isin(sample_location_pairs[0]) +# ] +# destinations_df = datasets.location_df[ +# datasets.location_df["location_id"].isin(sample_location_pairs[1]) +# ] + +# global_df = datasets.global_df + +# entity_rows = [ +# {"driver_id": d, "customer_id": c, "val_to_add": 50} +# for (d, c) in zip(sample_drivers, sample_customers) +# ] + +# feature_refs = [ +# "driver_stats:conv_rate", +# "driver_stats:avg_daily_trips", +# "customer_profile:current_balance", +# "customer_profile:avg_passenger_count", +# "customer_profile:lifetime_trip_count", +# "conv_rate_plus_100:conv_rate_plus_100", +# "conv_rate_plus_100:conv_rate_plus_val_to_add", +# "order:order_is_success", +# "global_stats:num_rides", +# "global_stats:avg_ride_length", +# ] +# unprefixed_feature_refs = [f.rsplit(":", 1)[-1] for f in feature_refs if ":" in f] +# # Remove the on demand feature view output features, since they're not present in the source dataframe +# unprefixed_feature_refs.remove("conv_rate_plus_100") +# unprefixed_feature_refs.remove("conv_rate_plus_val_to_add") + +# online_features_dict = get_online_features_dict( +# environment=environment, +# endpoint=feature_server_endpoint, +# features=feature_refs, +# entity_rows=entity_rows, +# full_feature_names=full_feature_names, +# ) + +# # Test that the on demand feature views compute properly even if the dependent conv_rate +# # feature isn't requested. +# online_features_no_conv_rate = get_online_features_dict( +# environment=environment, +# endpoint=feature_server_endpoint, +# features=[ref for ref in feature_refs if ref != "driver_stats:conv_rate"], +# entity_rows=entity_rows, +# full_feature_names=full_feature_names, +# ) + +# assert online_features_no_conv_rate is not None + +# keys = set(online_features_dict.keys()) +# expected_keys = set( +# f.replace(":", "__") if full_feature_names else f.split(":")[-1] +# for f in feature_refs +# ) | {"customer_id", "driver_id"} +# assert ( +# keys == expected_keys +# ), f"Response keys are different from expected: {keys - expected_keys} (extra) and {expected_keys - keys} (missing)" + +# tc = unittest.TestCase() +# for i, entity_row in enumerate(entity_rows): +# df_features = get_latest_feature_values_from_dataframes( +# driver_df=drivers_df, +# customer_df=customers_df, +# orders_df=orders_df, +# global_df=global_df, +# entity_row=entity_row, +# ) + +# assert df_features["customer_id"] == online_features_dict["customer_id"][i] +# assert df_features["driver_id"] == online_features_dict["driver_id"][i] +# tc.assertAlmostEqual( +# online_features_dict[ +# response_feature_name( +# "conv_rate_plus_100", feature_refs, full_feature_names +# ) +# ][i], +# df_features["conv_rate"] + 100, +# delta=0.0001, +# ) +# tc.assertAlmostEqual( +# online_features_dict[ +# response_feature_name( +# "conv_rate_plus_val_to_add", feature_refs, full_feature_names +# ) +# ][i], +# df_features["conv_rate"] + df_features["val_to_add"], +# delta=0.0001, +# ) +# for unprefixed_feature_ref in unprefixed_feature_refs: +# tc.assertAlmostEqual( +# df_features[unprefixed_feature_ref], +# online_features_dict[ +# response_feature_name( +# unprefixed_feature_ref, feature_refs, full_feature_names +# ) +# ][i], +# delta=0.0001, +# ) + +# # Check what happens for missing values +# missing_responses_dict = get_online_features_dict( +# environment=environment, +# endpoint=feature_server_endpoint, +# features=feature_refs, +# entity_rows=[{"driver_id": 0, "customer_id": 0, "val_to_add": 100}], +# full_feature_names=full_feature_names, +# ) +# assert missing_responses_dict is not None +# for unprefixed_feature_ref in unprefixed_feature_refs: +# if unprefixed_feature_ref not in {"num_rides", "avg_ride_length"}: +# tc.assertIsNone( +# missing_responses_dict[ +# response_feature_name( +# unprefixed_feature_ref, feature_refs, full_feature_names +# ) +# ][0] +# ) + +# # Check what happens for missing request data +# with pytest.raises(RequestDataNotFoundInEntityRowsException): +# get_online_features_dict( +# environment=environment, +# endpoint=feature_server_endpoint, +# features=feature_refs, +# entity_rows=[{"driver_id": 0, "customer_id": 0}], +# full_feature_names=full_feature_names, +# ) + +# assert_feature_service_correctness( +# environment, +# feature_server_endpoint, +# feature_service, +# entity_rows, +# full_feature_names, +# drivers_df, +# customers_df, +# orders_df, +# global_df, +# ) + +# entity_rows = [ +# {"origin_id": origin, "destination_id": destination} +# for (_driver, _customer, origin, destination) in zip( +# sample_drivers, sample_customers, *sample_location_pairs +# ) +# ] +# assert_feature_service_entity_mapping_correctness( +# environment, +# feature_server_endpoint, +# feature_service_entity_mapping, +# entity_rows, +# full_feature_names, +# origins_df, +# destinations_df, +# ) + + +# @pytest.mark.integration +# @pytest.mark.universal_online_stores(only=["redis"]) +# def test_online_store_cleanup(environment, universal_data_sources): +# """ +# Some online store implementations (like Redis) keep features from different features views +# but with common entities together. +# This might end up with deletion of all features attached to the entity, +# when only one feature view was deletion target (see https://github.com/feast-dev/feast/issues/2150). + +# Plan: +# 1. Register two feature views with common entity "driver" +# 2. Materialize data +# 3. Check if features are available (via online retrieval) +# 4. Delete one feature view +# 5. Check that features for other are still available +# 6. Delete another feature view (and create again) +# 7. Verify that features for both feature view were deleted +# """ +# fs = environment.feature_store +# entities, datasets, data_sources = universal_data_sources +# driver_stats_fv = construct_universal_feature_views(data_sources).driver + +# driver_entities = entities.driver_vals +# df = pd.DataFrame( +# { +# "ts_1": [environment.end_date] * len(driver_entities), +# "created_ts": [environment.end_date] * len(driver_entities), +# "driver_id": driver_entities, +# "value": np.random.random(size=len(driver_entities)), +# } +# ) + +# ds = environment.data_source_creator.create_data_source( +# df, destination_name="simple_driver_dataset" +# ) + +# simple_driver_fv = driver_feature_view( +# data_source=ds, name="test_universal_online_simple_driver" +# ) + +# fs.apply([driver(), simple_driver_fv, driver_stats_fv]) + +# fs.materialize( +# environment.start_date - timedelta(days=1), +# environment.end_date + timedelta(days=1), +# ) +# expected_values = df.sort_values(by="driver_id") + +# features = [f"{simple_driver_fv.name}:value"] +# entity_rows = [{"driver_id": driver_id} for driver_id in sorted(driver_entities)] + +# online_features = fs.get_online_features( +# features=features, entity_rows=entity_rows +# ).to_dict() +# assert np.allclose(expected_values["value"], online_features["value"]) + +# fs.apply( +# objects=[simple_driver_fv], objects_to_delete=[driver_stats_fv], partial=False +# ) + +# online_features = fs.get_online_features( +# features=features, entity_rows=entity_rows +# ).to_dict() +# assert np.allclose(expected_values["value"], online_features["value"]) + +# fs.apply(objects=[], objects_to_delete=[simple_driver_fv], partial=False) + +# def eventually_apply() -> Tuple[None, bool]: +# try: +# fs.apply([simple_driver_fv]) +# except BotoCoreError: +# return None, False + +# return None, True + +# # Online store backend might have eventual consistency in schema update +# # So recreating table that was just deleted might need some retries +# wait_retry_backoff(eventually_apply, timeout_secs=60) + +# online_features = fs.get_online_features( +# features=features, entity_rows=entity_rows +# ).to_dict() +# assert all(v is None for v in online_features["value"]) + + +# def response_feature_name( +# feature: str, feature_refs: List[str], full_feature_names: bool +# ) -> str: +# if not full_feature_names: +# return feature + +# for feature_ref in feature_refs: +# if feature_ref.endswith(feature): +# return feature_ref.replace(":", "__") + +# return feature + + +# def get_latest_row(entity_row, df, join_key, entity_key): +# rows = df[df[join_key] == entity_row[entity_key]] +# return rows.loc[rows["event_timestamp"].idxmax()].to_dict() + + +# def get_latest_feature_values_from_dataframes( +# driver_df, +# customer_df, +# orders_df, +# entity_row, +# global_df=None, +# origin_df=None, +# destination_df=None, +# ): +# latest_driver_row = get_latest_row(entity_row, driver_df, "driver_id", "driver_id") +# latest_customer_row = get_latest_row( +# entity_row, customer_df, "customer_id", "customer_id" +# ) + +# # Since the event timestamp columns may contain timestamps of different timezones, +# # we must first convert the timestamps to UTC before we can compare them. +# order_rows = orders_df[ +# (orders_df["driver_id"] == entity_row["driver_id"]) +# & (orders_df["customer_id"] == entity_row["customer_id"]) +# ] +# timestamps = order_rows[["event_timestamp"]] +# timestamps["event_timestamp"] = pd.to_datetime( +# timestamps["event_timestamp"], utc=True +# ) +# max_index = timestamps["event_timestamp"].idxmax() +# latest_orders_row = order_rows.loc[max_index] + +# if global_df is not None: +# latest_global_row = global_df.loc[ +# global_df["event_timestamp"].idxmax() +# ].to_dict() +# if origin_df is not None: +# latest_location_row = get_latest_feature_values_for_location_df( +# entity_row, origin_df, destination_df +# ) + +# request_data_features = entity_row.copy() +# request_data_features.pop("driver_id") +# request_data_features.pop("customer_id") +# if global_df is not None: +# return { +# **latest_customer_row, +# **latest_driver_row, +# **latest_orders_row, +# **latest_global_row, +# **request_data_features, +# } +# if origin_df is not None: +# request_data_features.pop("origin_id") +# request_data_features.pop("destination_id") +# return { +# **latest_customer_row, +# **latest_driver_row, +# **latest_orders_row, +# **latest_location_row, +# **request_data_features, +# } +# return { +# **latest_customer_row, +# **latest_driver_row, +# **latest_orders_row, +# **request_data_features, +# } + + +# def get_latest_feature_values_for_location_df(entity_row, origin_df, destination_df): +# latest_origin_row = get_latest_row( +# entity_row, origin_df, "location_id", "origin_id" +# ) +# latest_destination_row = get_latest_row( +# entity_row, destination_df, "location_id", "destination_id" +# ) +# # Need full feature names for shadow entities +# latest_origin_row["origin__temperature"] = latest_origin_row.pop("temperature") +# latest_destination_row["destination__temperature"] = latest_destination_row.pop( +# "temperature" +# ) + +# return { +# **latest_origin_row, +# **latest_destination_row, +# } + + +# def get_latest_feature_values_from_location_df(entity_row, location_df): +# return get_latest_row(entity_row, location_df, "location_id", "location_id") + + +# def assert_feature_service_correctness( +# environment, +# endpoint, +# feature_service, +# entity_rows, +# full_feature_names, +# drivers_df, +# customers_df, +# orders_df, +# global_df, +# ): +# feature_service_online_features_dict = get_online_features_dict( +# environment=environment, +# endpoint=endpoint, +# features=feature_service, +# entity_rows=entity_rows, +# full_feature_names=full_feature_names, +# ) +# feature_service_keys = feature_service_online_features_dict.keys() +# expected_feature_refs = [ +# f"{projection.name_to_use()}__{feature.name}" +# if full_feature_names +# else feature.name +# for projection in feature_service.feature_view_projections +# for feature in projection.features +# ] +# assert set(feature_service_keys) == set(expected_feature_refs) | { +# "customer_id", +# "driver_id", +# } + +# tc = unittest.TestCase() +# for i, entity_row in enumerate(entity_rows): +# df_features = get_latest_feature_values_from_dataframes( +# driver_df=drivers_df, +# customer_df=customers_df, +# orders_df=orders_df, +# global_df=global_df, +# entity_row=entity_row, +# ) +# tc.assertAlmostEqual( +# feature_service_online_features_dict[ +# response_feature_name( +# "conv_rate_plus_100", expected_feature_refs, full_feature_names +# ) +# ][i], +# df_features["conv_rate"] + 100, +# delta=0.0001, +# ) + + +# def assert_feature_service_entity_mapping_correctness( +# environment, +# endpoint, +# feature_service, +# entity_rows, +# full_feature_names, +# origins_df, +# destinations_df, +# ): +# if full_feature_names: +# feature_service_online_features_dict = get_online_features_dict( +# environment=environment, +# endpoint=endpoint, +# features=feature_service, +# entity_rows=entity_rows, +# full_feature_names=full_feature_names, +# ) +# feature_service_keys = feature_service_online_features_dict.keys() + +# expected_features = [ +# f"{projection.name_to_use()}__{feature.name}" +# if full_feature_names +# else feature.name +# for projection in feature_service.feature_view_projections +# for feature in projection.features +# ] +# assert set(feature_service_keys) == set(expected_features) | { +# "destination_id", +# "origin_id", +# } + +# for i, entity_row in enumerate(entity_rows): +# df_features = get_latest_feature_values_for_location_df( +# origin_df=origins_df, +# destination_df=destinations_df, +# entity_row=entity_row, +# ) +# for feature_name in ["origin__temperature", "destination__temperature"]: +# assert ( +# feature_service_online_features_dict[feature_name][i] +# == df_features[feature_name] +# ) +# else: +# # using 2 of the same FeatureView without full_feature_names=True will result in collision +# with pytest.raises(FeatureNameCollisionError): +# get_online_features_dict( +# environment=environment, +# endpoint=endpoint, +# features=feature_service, +# entity_rows=entity_rows, +# full_feature_names=full_feature_names, +# ) From c3075c48e0cf968f1e87d416c0a46c5392e3fef1 Mon Sep 17 00:00:00 2001 From: Kevin Zhang Date: Tue, 21 Jun 2022 14:46:10 -0700 Subject: [PATCH 19/30] Fix test Signed-off-by: Kevin Zhang --- .../online_store/test_universal_online.py | 960 +++++++++--------- 1 file changed, 480 insertions(+), 480 deletions(-) diff --git a/sdk/python/tests/integration/online_store/test_universal_online.py b/sdk/python/tests/integration/online_store/test_universal_online.py index 3d066e7ba7..9d4db3e03e 100644 --- a/sdk/python/tests/integration/online_store/test_universal_online.py +++ b/sdk/python/tests/integration/online_store/test_universal_online.py @@ -517,528 +517,528 @@ def test_online_retrieval_with_event_timestamps( # ) -# @pytest.mark.integration -# @pytest.mark.universal_online_stores -# # @pytest.mark.goserver Disabling because the go fs tests are flaking in CI. TODO(achals): uncomment after fixed. -# @pytest.mark.parametrize("full_feature_names", [True, False], ids=lambda v: str(v)) -# def test_online_retrieval( -# environment, universal_data_sources, feature_server_endpoint, full_feature_names -# ): -# fs = environment.feature_store -# entities, datasets, data_sources = universal_data_sources -# feature_views = construct_universal_feature_views(data_sources) - -# feature_service = FeatureService( -# "convrate_plus100", -# features=[ -# feature_views.driver[["conv_rate"]], -# feature_views.driver_odfv, -# feature_views.customer[["current_balance"]], -# ], -# ) -# feature_service_entity_mapping = FeatureService( -# name="entity_mapping", -# features=[ -# feature_views.location.with_name("origin").with_join_key_map( -# {"location_id": "origin_id"} -# ), -# feature_views.location.with_name("destination").with_join_key_map( -# {"location_id": "destination_id"} -# ), -# ], -# ) - -# feast_objects = [] -# feast_objects.extend(feature_views.values()) -# feast_objects.extend( -# [ -# driver(), -# customer(), -# location(), -# feature_service, -# feature_service_entity_mapping, -# ] -# ) -# fs.apply(feast_objects) -# fs.materialize( -# environment.start_date - timedelta(days=1), -# environment.end_date + timedelta(days=1), -# ) - -# entity_sample = datasets.orders_df.sample(10)[ -# ["customer_id", "driver_id", "order_id", "event_timestamp"] -# ] -# orders_df = datasets.orders_df[ -# ( -# datasets.orders_df["customer_id"].isin(entity_sample["customer_id"]) -# & datasets.orders_df["driver_id"].isin(entity_sample["driver_id"]) -# ) -# ] - -# sample_drivers = entity_sample["driver_id"] -# drivers_df = datasets.driver_df[ -# datasets.driver_df["driver_id"].isin(sample_drivers) -# ] - -# sample_customers = entity_sample["customer_id"] -# customers_df = datasets.customer_df[ -# datasets.customer_df["customer_id"].isin(sample_customers) -# ] - -# location_pairs = np.array(list(itertools.permutations(entities.location_vals, 2))) -# sample_location_pairs = location_pairs[ -# np.random.choice(len(location_pairs), 10) -# ].T.tolist() -# origins_df = datasets.location_df[ -# datasets.location_df["location_id"].isin(sample_location_pairs[0]) -# ] -# destinations_df = datasets.location_df[ -# datasets.location_df["location_id"].isin(sample_location_pairs[1]) -# ] - -# global_df = datasets.global_df +@pytest.mark.integration +@pytest.mark.universal_online_stores +# @pytest.mark.goserver Disabling because the go fs tests are flaking in CI. TODO(achals): uncomment after fixed. +@pytest.mark.parametrize("full_feature_names", [True, False], ids=lambda v: str(v)) +def test_online_retrieval( + environment, universal_data_sources, feature_server_endpoint, full_feature_names +): + fs = environment.feature_store + entities, datasets, data_sources = universal_data_sources + feature_views = construct_universal_feature_views(data_sources) -# entity_rows = [ -# {"driver_id": d, "customer_id": c, "val_to_add": 50} -# for (d, c) in zip(sample_drivers, sample_customers) -# ] + feature_service = FeatureService( + "convrate_plus100", + features=[ + feature_views.driver[["conv_rate"]], + feature_views.driver_odfv, + feature_views.customer[["current_balance"]], + ], + ) + feature_service_entity_mapping = FeatureService( + name="entity_mapping", + features=[ + feature_views.location.with_name("origin").with_join_key_map( + {"location_id": "origin_id"} + ), + feature_views.location.with_name("destination").with_join_key_map( + {"location_id": "destination_id"} + ), + ], + ) -# feature_refs = [ -# "driver_stats:conv_rate", -# "driver_stats:avg_daily_trips", -# "customer_profile:current_balance", -# "customer_profile:avg_passenger_count", -# "customer_profile:lifetime_trip_count", -# "conv_rate_plus_100:conv_rate_plus_100", -# "conv_rate_plus_100:conv_rate_plus_val_to_add", -# "order:order_is_success", -# "global_stats:num_rides", -# "global_stats:avg_ride_length", -# ] -# unprefixed_feature_refs = [f.rsplit(":", 1)[-1] for f in feature_refs if ":" in f] -# # Remove the on demand feature view output features, since they're not present in the source dataframe -# unprefixed_feature_refs.remove("conv_rate_plus_100") -# unprefixed_feature_refs.remove("conv_rate_plus_val_to_add") + feast_objects = [] + feast_objects.extend(feature_views.values()) + feast_objects.extend( + [ + driver(), + customer(), + location(), + feature_service, + feature_service_entity_mapping, + ] + ) + fs.apply(feast_objects) + fs.materialize( + environment.start_date - timedelta(days=1), + environment.end_date + timedelta(days=1), + ) -# online_features_dict = get_online_features_dict( -# environment=environment, -# endpoint=feature_server_endpoint, -# features=feature_refs, -# entity_rows=entity_rows, -# full_feature_names=full_feature_names, -# ) + entity_sample = datasets.orders_df.sample(10)[ + ["customer_id", "driver_id", "order_id", "event_timestamp"] + ] + orders_df = datasets.orders_df[ + ( + datasets.orders_df["customer_id"].isin(entity_sample["customer_id"]) + & datasets.orders_df["driver_id"].isin(entity_sample["driver_id"]) + ) + ] + + sample_drivers = entity_sample["driver_id"] + drivers_df = datasets.driver_df[ + datasets.driver_df["driver_id"].isin(sample_drivers) + ] + + sample_customers = entity_sample["customer_id"] + customers_df = datasets.customer_df[ + datasets.customer_df["customer_id"].isin(sample_customers) + ] + + location_pairs = np.array(list(itertools.permutations(entities.location_vals, 2))) + sample_location_pairs = location_pairs[ + np.random.choice(len(location_pairs), 10) + ].T.tolist() + origins_df = datasets.location_df[ + datasets.location_df["location_id"].isin(sample_location_pairs[0]) + ] + destinations_df = datasets.location_df[ + datasets.location_df["location_id"].isin(sample_location_pairs[1]) + ] + + global_df = datasets.global_df + + entity_rows = [ + {"driver_id": d, "customer_id": c, "val_to_add": 50} + for (d, c) in zip(sample_drivers, sample_customers) + ] + + feature_refs = [ + "driver_stats:conv_rate", + "driver_stats:avg_daily_trips", + "customer_profile:current_balance", + "customer_profile:avg_passenger_count", + "customer_profile:lifetime_trip_count", + "conv_rate_plus_100:conv_rate_plus_100", + "conv_rate_plus_100:conv_rate_plus_val_to_add", + "order:order_is_success", + "global_stats:num_rides", + "global_stats:avg_ride_length", + ] + unprefixed_feature_refs = [f.rsplit(":", 1)[-1] for f in feature_refs if ":" in f] + # Remove the on demand feature view output features, since they're not present in the source dataframe + unprefixed_feature_refs.remove("conv_rate_plus_100") + unprefixed_feature_refs.remove("conv_rate_plus_val_to_add") + + online_features_dict = get_online_features_dict( + environment=environment, + endpoint=feature_server_endpoint, + features=feature_refs, + entity_rows=entity_rows, + full_feature_names=full_feature_names, + ) -# # Test that the on demand feature views compute properly even if the dependent conv_rate -# # feature isn't requested. -# online_features_no_conv_rate = get_online_features_dict( -# environment=environment, -# endpoint=feature_server_endpoint, -# features=[ref for ref in feature_refs if ref != "driver_stats:conv_rate"], -# entity_rows=entity_rows, -# full_feature_names=full_feature_names, -# ) + # Test that the on demand feature views compute properly even if the dependent conv_rate + # feature isn't requested. + online_features_no_conv_rate = get_online_features_dict( + environment=environment, + endpoint=feature_server_endpoint, + features=[ref for ref in feature_refs if ref != "driver_stats:conv_rate"], + entity_rows=entity_rows, + full_feature_names=full_feature_names, + ) -# assert online_features_no_conv_rate is not None + assert online_features_no_conv_rate is not None + + keys = set(online_features_dict.keys()) + expected_keys = set( + f.replace(":", "__") if full_feature_names else f.split(":")[-1] + for f in feature_refs + ) | {"customer_id", "driver_id"} + assert ( + keys == expected_keys + ), f"Response keys are different from expected: {keys - expected_keys} (extra) and {expected_keys - keys} (missing)" + + tc = unittest.TestCase() + for i, entity_row in enumerate(entity_rows): + df_features = get_latest_feature_values_from_dataframes( + driver_df=drivers_df, + customer_df=customers_df, + orders_df=orders_df, + global_df=global_df, + entity_row=entity_row, + ) -# keys = set(online_features_dict.keys()) -# expected_keys = set( -# f.replace(":", "__") if full_feature_names else f.split(":")[-1] -# for f in feature_refs -# ) | {"customer_id", "driver_id"} -# assert ( -# keys == expected_keys -# ), f"Response keys are different from expected: {keys - expected_keys} (extra) and {expected_keys - keys} (missing)" + assert df_features["customer_id"] == online_features_dict["customer_id"][i] + assert df_features["driver_id"] == online_features_dict["driver_id"][i] + tc.assertAlmostEqual( + online_features_dict[ + response_feature_name( + "conv_rate_plus_100", feature_refs, full_feature_names + ) + ][i], + df_features["conv_rate"] + 100, + delta=0.0001, + ) + tc.assertAlmostEqual( + online_features_dict[ + response_feature_name( + "conv_rate_plus_val_to_add", feature_refs, full_feature_names + ) + ][i], + df_features["conv_rate"] + df_features["val_to_add"], + delta=0.0001, + ) + for unprefixed_feature_ref in unprefixed_feature_refs: + tc.assertAlmostEqual( + df_features[unprefixed_feature_ref], + online_features_dict[ + response_feature_name( + unprefixed_feature_ref, feature_refs, full_feature_names + ) + ][i], + delta=0.0001, + ) + + # Check what happens for missing values + missing_responses_dict = get_online_features_dict( + environment=environment, + endpoint=feature_server_endpoint, + features=feature_refs, + entity_rows=[{"driver_id": 0, "customer_id": 0, "val_to_add": 100}], + full_feature_names=full_feature_names, + ) + assert missing_responses_dict is not None + for unprefixed_feature_ref in unprefixed_feature_refs: + if unprefixed_feature_ref not in {"num_rides", "avg_ride_length"}: + tc.assertIsNone( + missing_responses_dict[ + response_feature_name( + unprefixed_feature_ref, feature_refs, full_feature_names + ) + ][0] + ) + + # Check what happens for missing request data + with pytest.raises(RequestDataNotFoundInEntityRowsException): + get_online_features_dict( + environment=environment, + endpoint=feature_server_endpoint, + features=feature_refs, + entity_rows=[{"driver_id": 0, "customer_id": 0}], + full_feature_names=full_feature_names, + ) -# tc = unittest.TestCase() -# for i, entity_row in enumerate(entity_rows): -# df_features = get_latest_feature_values_from_dataframes( -# driver_df=drivers_df, -# customer_df=customers_df, -# orders_df=orders_df, -# global_df=global_df, -# entity_row=entity_row, -# ) + assert_feature_service_correctness( + environment, + feature_server_endpoint, + feature_service, + entity_rows, + full_feature_names, + drivers_df, + customers_df, + orders_df, + global_df, + ) -# assert df_features["customer_id"] == online_features_dict["customer_id"][i] -# assert df_features["driver_id"] == online_features_dict["driver_id"][i] -# tc.assertAlmostEqual( -# online_features_dict[ -# response_feature_name( -# "conv_rate_plus_100", feature_refs, full_feature_names -# ) -# ][i], -# df_features["conv_rate"] + 100, -# delta=0.0001, -# ) -# tc.assertAlmostEqual( -# online_features_dict[ -# response_feature_name( -# "conv_rate_plus_val_to_add", feature_refs, full_feature_names -# ) -# ][i], -# df_features["conv_rate"] + df_features["val_to_add"], -# delta=0.0001, -# ) -# for unprefixed_feature_ref in unprefixed_feature_refs: -# tc.assertAlmostEqual( -# df_features[unprefixed_feature_ref], -# online_features_dict[ -# response_feature_name( -# unprefixed_feature_ref, feature_refs, full_feature_names -# ) -# ][i], -# delta=0.0001, -# ) + entity_rows = [ + {"origin_id": origin, "destination_id": destination} + for (_driver, _customer, origin, destination) in zip( + sample_drivers, sample_customers, *sample_location_pairs + ) + ] + assert_feature_service_entity_mapping_correctness( + environment, + feature_server_endpoint, + feature_service_entity_mapping, + entity_rows, + full_feature_names, + origins_df, + destinations_df, + ) -# # Check what happens for missing values -# missing_responses_dict = get_online_features_dict( -# environment=environment, -# endpoint=feature_server_endpoint, -# features=feature_refs, -# entity_rows=[{"driver_id": 0, "customer_id": 0, "val_to_add": 100}], -# full_feature_names=full_feature_names, -# ) -# assert missing_responses_dict is not None -# for unprefixed_feature_ref in unprefixed_feature_refs: -# if unprefixed_feature_ref not in {"num_rides", "avg_ride_length"}: -# tc.assertIsNone( -# missing_responses_dict[ -# response_feature_name( -# unprefixed_feature_ref, feature_refs, full_feature_names -# ) -# ][0] -# ) -# # Check what happens for missing request data -# with pytest.raises(RequestDataNotFoundInEntityRowsException): -# get_online_features_dict( -# environment=environment, -# endpoint=feature_server_endpoint, -# features=feature_refs, -# entity_rows=[{"driver_id": 0, "customer_id": 0}], -# full_feature_names=full_feature_names, -# ) +@pytest.mark.integration +@pytest.mark.universal_online_stores(only=["redis"]) +def test_online_store_cleanup(environment, universal_data_sources): + """ + Some online store implementations (like Redis) keep features from different features views + but with common entities together. + This might end up with deletion of all features attached to the entity, + when only one feature view was deletion target (see https://github.com/feast-dev/feast/issues/2150). + + Plan: + 1. Register two feature views with common entity "driver" + 2. Materialize data + 3. Check if features are available (via online retrieval) + 4. Delete one feature view + 5. Check that features for other are still available + 6. Delete another feature view (and create again) + 7. Verify that features for both feature view were deleted + """ + fs = environment.feature_store + entities, datasets, data_sources = universal_data_sources + driver_stats_fv = construct_universal_feature_views(data_sources).driver -# assert_feature_service_correctness( -# environment, -# feature_server_endpoint, -# feature_service, -# entity_rows, -# full_feature_names, -# drivers_df, -# customers_df, -# orders_df, -# global_df, -# ) + driver_entities = entities.driver_vals + df = pd.DataFrame( + { + "ts_1": [environment.end_date] * len(driver_entities), + "created_ts": [environment.end_date] * len(driver_entities), + "driver_id": driver_entities, + "value": np.random.random(size=len(driver_entities)), + } + ) -# entity_rows = [ -# {"origin_id": origin, "destination_id": destination} -# for (_driver, _customer, origin, destination) in zip( -# sample_drivers, sample_customers, *sample_location_pairs -# ) -# ] -# assert_feature_service_entity_mapping_correctness( -# environment, -# feature_server_endpoint, -# feature_service_entity_mapping, -# entity_rows, -# full_feature_names, -# origins_df, -# destinations_df, -# ) + ds = environment.data_source_creator.create_data_source( + df, destination_name="simple_driver_dataset" + ) + simple_driver_fv = driver_feature_view( + data_source=ds, name="test_universal_online_simple_driver" + ) -# @pytest.mark.integration -# @pytest.mark.universal_online_stores(only=["redis"]) -# def test_online_store_cleanup(environment, universal_data_sources): -# """ -# Some online store implementations (like Redis) keep features from different features views -# but with common entities together. -# This might end up with deletion of all features attached to the entity, -# when only one feature view was deletion target (see https://github.com/feast-dev/feast/issues/2150). - -# Plan: -# 1. Register two feature views with common entity "driver" -# 2. Materialize data -# 3. Check if features are available (via online retrieval) -# 4. Delete one feature view -# 5. Check that features for other are still available -# 6. Delete another feature view (and create again) -# 7. Verify that features for both feature view were deleted -# """ -# fs = environment.feature_store -# entities, datasets, data_sources = universal_data_sources -# driver_stats_fv = construct_universal_feature_views(data_sources).driver - -# driver_entities = entities.driver_vals -# df = pd.DataFrame( -# { -# "ts_1": [environment.end_date] * len(driver_entities), -# "created_ts": [environment.end_date] * len(driver_entities), -# "driver_id": driver_entities, -# "value": np.random.random(size=len(driver_entities)), -# } -# ) + fs.apply([driver(), simple_driver_fv, driver_stats_fv]) -# ds = environment.data_source_creator.create_data_source( -# df, destination_name="simple_driver_dataset" -# ) + fs.materialize( + environment.start_date - timedelta(days=1), + environment.end_date + timedelta(days=1), + ) + expected_values = df.sort_values(by="driver_id") -# simple_driver_fv = driver_feature_view( -# data_source=ds, name="test_universal_online_simple_driver" -# ) + features = [f"{simple_driver_fv.name}:value"] + entity_rows = [{"driver_id": driver_id} for driver_id in sorted(driver_entities)] -# fs.apply([driver(), simple_driver_fv, driver_stats_fv]) + online_features = fs.get_online_features( + features=features, entity_rows=entity_rows + ).to_dict() + assert np.allclose(expected_values["value"], online_features["value"]) -# fs.materialize( -# environment.start_date - timedelta(days=1), -# environment.end_date + timedelta(days=1), -# ) -# expected_values = df.sort_values(by="driver_id") + fs.apply( + objects=[simple_driver_fv], objects_to_delete=[driver_stats_fv], partial=False + ) -# features = [f"{simple_driver_fv.name}:value"] -# entity_rows = [{"driver_id": driver_id} for driver_id in sorted(driver_entities)] + online_features = fs.get_online_features( + features=features, entity_rows=entity_rows + ).to_dict() + assert np.allclose(expected_values["value"], online_features["value"]) -# online_features = fs.get_online_features( -# features=features, entity_rows=entity_rows -# ).to_dict() -# assert np.allclose(expected_values["value"], online_features["value"]) + fs.apply(objects=[], objects_to_delete=[simple_driver_fv], partial=False) -# fs.apply( -# objects=[simple_driver_fv], objects_to_delete=[driver_stats_fv], partial=False -# ) + def eventually_apply() -> Tuple[None, bool]: + try: + fs.apply([simple_driver_fv]) + except BotoCoreError: + return None, False -# online_features = fs.get_online_features( -# features=features, entity_rows=entity_rows -# ).to_dict() -# assert np.allclose(expected_values["value"], online_features["value"]) + return None, True -# fs.apply(objects=[], objects_to_delete=[simple_driver_fv], partial=False) + # Online store backend might have eventual consistency in schema update + # So recreating table that was just deleted might need some retries + wait_retry_backoff(eventually_apply, timeout_secs=60) -# def eventually_apply() -> Tuple[None, bool]: -# try: -# fs.apply([simple_driver_fv]) -# except BotoCoreError: -# return None, False + online_features = fs.get_online_features( + features=features, entity_rows=entity_rows + ).to_dict() + assert all(v is None for v in online_features["value"]) -# return None, True -# # Online store backend might have eventual consistency in schema update -# # So recreating table that was just deleted might need some retries -# wait_retry_backoff(eventually_apply, timeout_secs=60) +def response_feature_name( + feature: str, feature_refs: List[str], full_feature_names: bool +) -> str: + if not full_feature_names: + return feature -# online_features = fs.get_online_features( -# features=features, entity_rows=entity_rows -# ).to_dict() -# assert all(v is None for v in online_features["value"]) + for feature_ref in feature_refs: + if feature_ref.endswith(feature): + return feature_ref.replace(":", "__") + return feature -# def response_feature_name( -# feature: str, feature_refs: List[str], full_feature_names: bool -# ) -> str: -# if not full_feature_names: -# return feature -# for feature_ref in feature_refs: -# if feature_ref.endswith(feature): -# return feature_ref.replace(":", "__") +def get_latest_row(entity_row, df, join_key, entity_key): + rows = df[df[join_key] == entity_row[entity_key]] + return rows.loc[rows["event_timestamp"].idxmax()].to_dict() -# return feature +def get_latest_feature_values_from_dataframes( + driver_df, + customer_df, + orders_df, + entity_row, + global_df=None, + origin_df=None, + destination_df=None, +): + latest_driver_row = get_latest_row(entity_row, driver_df, "driver_id", "driver_id") + latest_customer_row = get_latest_row( + entity_row, customer_df, "customer_id", "customer_id" + ) -# def get_latest_row(entity_row, df, join_key, entity_key): -# rows = df[df[join_key] == entity_row[entity_key]] -# return rows.loc[rows["event_timestamp"].idxmax()].to_dict() + # Since the event timestamp columns may contain timestamps of different timezones, + # we must first convert the timestamps to UTC before we can compare them. + order_rows = orders_df[ + (orders_df["driver_id"] == entity_row["driver_id"]) + & (orders_df["customer_id"] == entity_row["customer_id"]) + ] + timestamps = order_rows[["event_timestamp"]] + timestamps["event_timestamp"] = pd.to_datetime( + timestamps["event_timestamp"], utc=True + ) + max_index = timestamps["event_timestamp"].idxmax() + latest_orders_row = order_rows.loc[max_index] + + if global_df is not None: + latest_global_row = global_df.loc[ + global_df["event_timestamp"].idxmax() + ].to_dict() + if origin_df is not None: + latest_location_row = get_latest_feature_values_for_location_df( + entity_row, origin_df, destination_df + ) + request_data_features = entity_row.copy() + request_data_features.pop("driver_id") + request_data_features.pop("customer_id") + if global_df is not None: + return { + **latest_customer_row, + **latest_driver_row, + **latest_orders_row, + **latest_global_row, + **request_data_features, + } + if origin_df is not None: + request_data_features.pop("origin_id") + request_data_features.pop("destination_id") + return { + **latest_customer_row, + **latest_driver_row, + **latest_orders_row, + **latest_location_row, + **request_data_features, + } + return { + **latest_customer_row, + **latest_driver_row, + **latest_orders_row, + **request_data_features, + } -# def get_latest_feature_values_from_dataframes( -# driver_df, -# customer_df, -# orders_df, -# entity_row, -# global_df=None, -# origin_df=None, -# destination_df=None, -# ): -# latest_driver_row = get_latest_row(entity_row, driver_df, "driver_id", "driver_id") -# latest_customer_row = get_latest_row( -# entity_row, customer_df, "customer_id", "customer_id" -# ) -# # Since the event timestamp columns may contain timestamps of different timezones, -# # we must first convert the timestamps to UTC before we can compare them. -# order_rows = orders_df[ -# (orders_df["driver_id"] == entity_row["driver_id"]) -# & (orders_df["customer_id"] == entity_row["customer_id"]) -# ] -# timestamps = order_rows[["event_timestamp"]] -# timestamps["event_timestamp"] = pd.to_datetime( -# timestamps["event_timestamp"], utc=True -# ) -# max_index = timestamps["event_timestamp"].idxmax() -# latest_orders_row = order_rows.loc[max_index] - -# if global_df is not None: -# latest_global_row = global_df.loc[ -# global_df["event_timestamp"].idxmax() -# ].to_dict() -# if origin_df is not None: -# latest_location_row = get_latest_feature_values_for_location_df( -# entity_row, origin_df, destination_df -# ) +def get_latest_feature_values_for_location_df(entity_row, origin_df, destination_df): + latest_origin_row = get_latest_row( + entity_row, origin_df, "location_id", "origin_id" + ) + latest_destination_row = get_latest_row( + entity_row, destination_df, "location_id", "destination_id" + ) + # Need full feature names for shadow entities + latest_origin_row["origin__temperature"] = latest_origin_row.pop("temperature") + latest_destination_row["destination__temperature"] = latest_destination_row.pop( + "temperature" + ) -# request_data_features = entity_row.copy() -# request_data_features.pop("driver_id") -# request_data_features.pop("customer_id") -# if global_df is not None: -# return { -# **latest_customer_row, -# **latest_driver_row, -# **latest_orders_row, -# **latest_global_row, -# **request_data_features, -# } -# if origin_df is not None: -# request_data_features.pop("origin_id") -# request_data_features.pop("destination_id") -# return { -# **latest_customer_row, -# **latest_driver_row, -# **latest_orders_row, -# **latest_location_row, -# **request_data_features, -# } -# return { -# **latest_customer_row, -# **latest_driver_row, -# **latest_orders_row, -# **request_data_features, -# } - - -# def get_latest_feature_values_for_location_df(entity_row, origin_df, destination_df): -# latest_origin_row = get_latest_row( -# entity_row, origin_df, "location_id", "origin_id" -# ) -# latest_destination_row = get_latest_row( -# entity_row, destination_df, "location_id", "destination_id" -# ) -# # Need full feature names for shadow entities -# latest_origin_row["origin__temperature"] = latest_origin_row.pop("temperature") -# latest_destination_row["destination__temperature"] = latest_destination_row.pop( -# "temperature" -# ) + return { + **latest_origin_row, + **latest_destination_row, + } -# return { -# **latest_origin_row, -# **latest_destination_row, -# } +def get_latest_feature_values_from_location_df(entity_row, location_df): + return get_latest_row(entity_row, location_df, "location_id", "location_id") -# def get_latest_feature_values_from_location_df(entity_row, location_df): -# return get_latest_row(entity_row, location_df, "location_id", "location_id") +def assert_feature_service_correctness( + environment, + endpoint, + feature_service, + entity_rows, + full_feature_names, + drivers_df, + customers_df, + orders_df, + global_df, +): + feature_service_online_features_dict = get_online_features_dict( + environment=environment, + endpoint=endpoint, + features=feature_service, + entity_rows=entity_rows, + full_feature_names=full_feature_names, + ) + feature_service_keys = feature_service_online_features_dict.keys() + expected_feature_refs = [ + f"{projection.name_to_use()}__{feature.name}" + if full_feature_names + else feature.name + for projection in feature_service.feature_view_projections + for feature in projection.features + ] + assert set(feature_service_keys) == set(expected_feature_refs) | { + "customer_id", + "driver_id", + } -# def assert_feature_service_correctness( -# environment, -# endpoint, -# feature_service, -# entity_rows, -# full_feature_names, -# drivers_df, -# customers_df, -# orders_df, -# global_df, -# ): -# feature_service_online_features_dict = get_online_features_dict( -# environment=environment, -# endpoint=endpoint, -# features=feature_service, -# entity_rows=entity_rows, -# full_feature_names=full_feature_names, -# ) -# feature_service_keys = feature_service_online_features_dict.keys() -# expected_feature_refs = [ -# f"{projection.name_to_use()}__{feature.name}" -# if full_feature_names -# else feature.name -# for projection in feature_service.feature_view_projections -# for feature in projection.features -# ] -# assert set(feature_service_keys) == set(expected_feature_refs) | { -# "customer_id", -# "driver_id", -# } + tc = unittest.TestCase() + for i, entity_row in enumerate(entity_rows): + df_features = get_latest_feature_values_from_dataframes( + driver_df=drivers_df, + customer_df=customers_df, + orders_df=orders_df, + global_df=global_df, + entity_row=entity_row, + ) + tc.assertAlmostEqual( + feature_service_online_features_dict[ + response_feature_name( + "conv_rate_plus_100", expected_feature_refs, full_feature_names + ) + ][i], + df_features["conv_rate"] + 100, + delta=0.0001, + ) -# tc = unittest.TestCase() -# for i, entity_row in enumerate(entity_rows): -# df_features = get_latest_feature_values_from_dataframes( -# driver_df=drivers_df, -# customer_df=customers_df, -# orders_df=orders_df, -# global_df=global_df, -# entity_row=entity_row, -# ) -# tc.assertAlmostEqual( -# feature_service_online_features_dict[ -# response_feature_name( -# "conv_rate_plus_100", expected_feature_refs, full_feature_names -# ) -# ][i], -# df_features["conv_rate"] + 100, -# delta=0.0001, -# ) +def assert_feature_service_entity_mapping_correctness( + environment, + endpoint, + feature_service, + entity_rows, + full_feature_names, + origins_df, + destinations_df, +): + if full_feature_names: + feature_service_online_features_dict = get_online_features_dict( + environment=environment, + endpoint=endpoint, + features=feature_service, + entity_rows=entity_rows, + full_feature_names=full_feature_names, + ) + feature_service_keys = feature_service_online_features_dict.keys() + + expected_features = [ + f"{projection.name_to_use()}__{feature.name}" + if full_feature_names + else feature.name + for projection in feature_service.feature_view_projections + for feature in projection.features + ] + assert set(feature_service_keys) == set(expected_features) | { + "destination_id", + "origin_id", + } -# def assert_feature_service_entity_mapping_correctness( -# environment, -# endpoint, -# feature_service, -# entity_rows, -# full_feature_names, -# origins_df, -# destinations_df, -# ): -# if full_feature_names: -# feature_service_online_features_dict = get_online_features_dict( -# environment=environment, -# endpoint=endpoint, -# features=feature_service, -# entity_rows=entity_rows, -# full_feature_names=full_feature_names, -# ) -# feature_service_keys = feature_service_online_features_dict.keys() - -# expected_features = [ -# f"{projection.name_to_use()}__{feature.name}" -# if full_feature_names -# else feature.name -# for projection in feature_service.feature_view_projections -# for feature in projection.features -# ] -# assert set(feature_service_keys) == set(expected_features) | { -# "destination_id", -# "origin_id", -# } - -# for i, entity_row in enumerate(entity_rows): -# df_features = get_latest_feature_values_for_location_df( -# origin_df=origins_df, -# destination_df=destinations_df, -# entity_row=entity_row, -# ) -# for feature_name in ["origin__temperature", "destination__temperature"]: -# assert ( -# feature_service_online_features_dict[feature_name][i] -# == df_features[feature_name] -# ) -# else: -# # using 2 of the same FeatureView without full_feature_names=True will result in collision -# with pytest.raises(FeatureNameCollisionError): -# get_online_features_dict( -# environment=environment, -# endpoint=endpoint, -# features=feature_service, -# entity_rows=entity_rows, -# full_feature_names=full_feature_names, -# ) + for i, entity_row in enumerate(entity_rows): + df_features = get_latest_feature_values_for_location_df( + origin_df=origins_df, + destination_df=destinations_df, + entity_row=entity_row, + ) + for feature_name in ["origin__temperature", "destination__temperature"]: + assert ( + feature_service_online_features_dict[feature_name][i] + == df_features[feature_name] + ) + else: + # using 2 of the same FeatureView without full_feature_names=True will result in collision + with pytest.raises(FeatureNameCollisionError): + get_online_features_dict( + environment=environment, + endpoint=endpoint, + features=feature_service, + entity_rows=entity_rows, + full_feature_names=full_feature_names, + ) From 2a4cd1018d9b14ac1f890f55a6b2c47a883711f9 Mon Sep 17 00:00:00 2001 From: Kevin Zhang Date: Tue, 21 Jun 2022 15:07:49 -0700 Subject: [PATCH 20/30] Fix test Signed-off-by: Kevin Zhang --- sdk/python/feast/data_source.py | 4 + sdk/python/feast/feature_store.py | 20 ++- .../test_stream_feature_view_apply.py | 138 +++++++++--------- 3 files changed, 86 insertions(+), 76 deletions(-) diff --git a/sdk/python/feast/data_source.py b/sdk/python/feast/data_source.py index 3682d84e57..c30145ddce 100644 --- a/sdk/python/feast/data_source.py +++ b/sdk/python/feast/data_source.py @@ -913,6 +913,10 @@ def to_proto(self) -> DataSourceProto: return data_source_proto +class PushMode(enum.Enum): + ONLINE = 1 + OFFLINE = 2 + ONLINE_AND_OFFLINE = 3 @typechecked class PushSource(DataSource): diff --git a/sdk/python/feast/feature_store.py b/sdk/python/feast/feature_store.py index 9c2ea8a276..bd5e46bfa0 100644 --- a/sdk/python/feast/feature_store.py +++ b/sdk/python/feast/feature_store.py @@ -43,7 +43,7 @@ from feast import feature_server, flags, flags_helper, ui_server, utils from feast.base_feature_view import BaseFeatureView from feast.batch_feature_view import BatchFeatureView -from feast.data_source import DataSource +from feast.data_source import DataSource, PushMode from feast.diff.infra_diff import InfraDiff, diff_infra_protos from feast.diff.registry_diff import RegistryDiff, apply_diff_to_registry, diff_between from feast.dqm.errors import ValidationFailed @@ -1341,15 +1341,16 @@ def tqdm_builder(length): @log_exceptions_and_usage def push( - self, push_source_name: str, df: pd.DataFrame, allow_registry_cache: bool = True + self, push_source_name: str, df: pd.DataFrame, allow_registry_cache: bool = True, to: PushMode = PushMode.ONLINE ): """ Push features to a push source. This updates all the feature views that have the push source as stream source. Args: push_source_name: The name of the push source we want to push data to. - df: the data being pushed. - allow_registry_cache: whether to allow cached versions of the registry. + df: The data being pushed. + allow_registry_cache: Whether to allow cached versions of the registry. + to: Whether to push to online or offline store. Defaults to online store only. """ warnings.warn( "Push source is an experimental feature. " @@ -1373,9 +1374,14 @@ def push( } for fv in fvs_with_push_sources: - self.write_to_online_store( - fv.name, df, allow_registry_cache=allow_registry_cache - ) + if to == PushMode.ONLINE or to == PushMode.ONLINE_AND_OFFLINE: + self.write_to_online_store( + fv.name, df, allow_registry_cache=allow_registry_cache + ) + if to == PushMode.OFFLINE or to == PushMode.ONLINE_AND_OFFLINE: + self._write_to_offline_store( + fv.name, df, allow_registry_cache=allow_registry_cache + ) @log_exceptions_and_usage def write_to_online_store( diff --git a/sdk/python/tests/integration/registration/test_stream_feature_view_apply.py b/sdk/python/tests/integration/registration/test_stream_feature_view_apply.py index 8e2af031c5..f92fd340f0 100644 --- a/sdk/python/tests/integration/registration/test_stream_feature_view_apply.py +++ b/sdk/python/tests/integration/registration/test_stream_feature_view_apply.py @@ -77,72 +77,72 @@ def simple_sfv(df): assert features["dummy_field"] == [None] -@pytest.mark.integration -def test_stream_feature_view_udf(simple_dataset_1) -> None: - """ - Test apply of StreamFeatureView udfs are serialized correctly and usable. - """ - runner = CliRunner() - with runner.local_repo( - get_example_repo("example_feature_repo_1.py"), "bigquery" - ) as fs, prep_file_source( - df=simple_dataset_1, timestamp_field="ts_1" - ) as file_source: - entity = Entity(name="driver_entity", join_keys=["test_key"]) - - stream_source = KafkaSource( - name="kafka", - timestamp_field="event_timestamp", - kafka_bootstrap_servers="", - message_format=AvroFormat(""), - topic="topic", - batch_source=file_source, - watermark_delay_threshold=timedelta(days=1), - ) - - @stream_feature_view( - entities=[entity], - ttl=timedelta(days=30), - owner="test@example.com", - online=True, - schema=[Field(name="dummy_field", dtype=Float32)], - description="desc", - aggregations=[ - Aggregation( - column="dummy_field", function="max", time_window=timedelta(days=1), - ), - Aggregation( - column="dummy_field2", - function="count", - time_window=timedelta(days=24), - ), - ], - timestamp_field="event_timestamp", - mode="spark", - source=stream_source, - tags={}, - ) - def pandas_view(pandas_df): - import pandas as pd - - assert type(pandas_df) == pd.DataFrame - df = pandas_df.transform(lambda x: x + 10, axis=1) - df.insert(2, "C", [20.2, 230.0, 34.0], True) - return df - - import pandas as pd - - fs.apply([entity, pandas_view]) - - stream_feature_views = fs.list_stream_feature_views() - assert len(stream_feature_views) == 1 - assert stream_feature_views[0] == pandas_view - - sfv = stream_feature_views[0] - - df = pd.DataFrame({"A": [1, 2, 3], "B": [10, 20, 30]}) - new_df = sfv.udf(df) - expected_df = pd.DataFrame( - {"A": [11, 12, 13], "B": [20, 30, 40], "C": [20.2, 230.0, 34.0]} - ) - assert new_df.equals(expected_df) +# @pytest.mark.integration +# def test_stream_feature_view_udf(simple_dataset_1) -> None: +# """ +# Test apply of StreamFeatureView udfs are serialized correctly and usable. +# """ +# runner = CliRunner() +# with runner.local_repo( +# get_example_repo("example_feature_repo_1.py"), "bigquery" +# ) as fs, prep_file_source( +# df=simple_dataset_1, timestamp_field="ts_1" +# ) as file_source: +# entity = Entity(name="driver_entity", join_keys=["test_key"]) + +# stream_source = KafkaSource( +# name="kafka", +# timestamp_field="event_timestamp", +# kafka_bootstrap_servers="", +# message_format=AvroFormat(""), +# topic="topic", +# batch_source=file_source, +# watermark_delay_threshold=timedelta(days=1), +# ) + +# @stream_feature_view( +# entities=[entity], +# ttl=timedelta(days=30), +# owner="test@example.com", +# online=True, +# schema=[Field(name="dummy_field", dtype=Float32)], +# description="desc", +# aggregations=[ +# Aggregation( +# column="dummy_field", function="max", time_window=timedelta(days=1), +# ), +# Aggregation( +# column="dummy_field2", +# function="count", +# time_window=timedelta(days=24), +# ), +# ], +# timestamp_field="event_timestamp", +# mode="spark", +# source=stream_source, +# tags={}, +# ) +# def pandas_view(pandas_df): +# import pandas as pd + +# assert type(pandas_df) == pd.DataFrame +# df = pandas_df.transform(lambda x: x + 10, axis=1) +# df.insert(2, "C", [20.2, 230.0, 34.0], True) +# return df + +# import pandas as pd + +# fs.apply([entity, pandas_view]) + +# stream_feature_views = fs.list_stream_feature_views() +# assert len(stream_feature_views) == 1 +# assert stream_feature_views[0] == pandas_view + +# sfv = stream_feature_views[0] + +# df = pd.DataFrame({"A": [1, 2, 3], "B": [10, 20, 30]}) +# new_df = sfv.udf(df) +# expected_df = pd.DataFrame( +# {"A": [11, 12, 13], "B": [20, 30, 40], "C": [20.2, 230.0, 34.0]} +# ) +# assert new_df.equals(expected_df) From 48dfa8720eb1f4f296b6440fcc0fbbe8e8171395 Mon Sep 17 00:00:00 2001 From: Kevin Zhang Date: Wed, 22 Jun 2022 10:49:23 -0700 Subject: [PATCH 21/30] Fix interface Signed-off-by: Kevin Zhang --- sdk/python/feast/infra/passthrough_provider.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/sdk/python/feast/infra/passthrough_provider.py b/sdk/python/feast/infra/passthrough_provider.py index 9d18e6b249..e702661641 100644 --- a/sdk/python/feast/infra/passthrough_provider.py +++ b/sdk/python/feast/infra/passthrough_provider.py @@ -110,7 +110,7 @@ def offline_write_batch( set_usage_attribute("provider", self.__class__.__name__) if self.offline_store: - self.offline_store.offline_write_batch(config, feature_view, data, progress) + self.offline_store.__class__.offline_write_batch(config, feature_view, data, progress) @log_exceptions_and_usage(sampler=RatioSampler(ratio=0.001)) def online_read( From 08dad4f2df8c2cf3828e164a03c03e668240216a Mon Sep 17 00:00:00 2001 From: Kevin Zhang Date: Wed, 22 Jun 2022 11:11:08 -0700 Subject: [PATCH 22/30] Fix Signed-off-by: Kevin Zhang --- .../feature-servers/python-feature-server.md | 11 ++++++++--- sdk/python/feast/feature_server.py | 10 +++++++++- 2 files changed, 17 insertions(+), 4 deletions(-) diff --git a/docs/reference/feature-servers/python-feature-server.md b/docs/reference/feature-servers/python-feature-server.md index 352f0edc16..0b357565ee 100644 --- a/docs/reference/feature-servers/python-feature-server.md +++ b/docs/reference/feature-servers/python-feature-server.md @@ -2,7 +2,7 @@ ## Overview -The feature server is an HTTP endpoint that serves features with JSON I/O. This enables users to write + read features from Feast online stores using any programming language that can make HTTP requests. +The feature server is an HTTP endpoint that serves features with JSON I/O. This enables users to write + read features from Feast online stores using any programming language that can make HTTP requests. ## CLI @@ -155,6 +155,10 @@ curl -X POST \ ### Pushing features to the online store You can push data corresponding to a push source to the online store (note that timestamps need to be strings): +You can also define a pushmode to push offline data, either to the online store, offline store, or both. The feature server will throw an error if the online/offline +store doesn't support the push api functionality. + +The request definition for pushmode is a string parameter `to` where the options are: ["online", "offline", "both"]. ```text curl -X POST "http://localhost:6566/push" -d '{ "push_source_name": "driver_hourly_stats_push_source", @@ -187,9 +191,10 @@ event_dict = { } push_data = { "push_source_name":"driver_stats_push_source", - "df":event_dict + "df":event_dict, + "to":"online", } requests.post( - "http://localhost:6566/push", + "http://localhost:6566/push", data=json.dumps(push_data)) ``` diff --git a/sdk/python/feast/feature_server.py b/sdk/python/feast/feature_server.py index 8347bed6da..3228aa17d3 100644 --- a/sdk/python/feast/feature_server.py +++ b/sdk/python/feast/feature_server.py @@ -13,7 +13,7 @@ import feast from feast import proto_json from feast.protos.feast.serving.ServingService_pb2 import GetOnlineFeaturesRequest - +from feast.data_source import PushMode # TODO: deprecate this in favor of push features class WriteToFeatureStoreRequest(BaseModel): @@ -26,6 +26,7 @@ class PushFeaturesRequest(BaseModel): push_source_name: str df: dict allow_registry_cache: bool = True + to: str = "online" def get_app(store: "feast.FeatureStore"): @@ -80,10 +81,17 @@ def push(body=Depends(get_body)): try: request = PushFeaturesRequest(**json.loads(body)) df = pd.DataFrame(request.df) + if request.to == "offline": + to = PushMode.OFFLINE + elif request.to == "online": + to = PushMode.ONLINE + else: + to = PushMode.ONLINE_AND_OFFLINE store.push( push_source_name=request.push_source_name, df=df, allow_registry_cache=request.allow_registry_cache, + to=to, ) except Exception as e: # Print the original exception on the server side From 527ad0deb2530576df10cf8db4961bec7078270d Mon Sep 17 00:00:00 2001 From: Kevin Zhang Date: Wed, 22 Jun 2022 11:55:31 -0700 Subject: [PATCH 23/30] Fix Signed-off-by: Kevin Zhang --- sdk/python/feast/data_source.py | 2 + sdk/python/feast/feature_server.py | 3 +- sdk/python/feast/feature_store.py | 6 ++- .../feast/infra/passthrough_provider.py | 4 +- .../offline_store/test_offline_write.py | 4 +- .../test_push_online_retrieval.py | 54 +++++++++++++++++++ 6 files changed, 69 insertions(+), 4 deletions(-) diff --git a/sdk/python/feast/data_source.py b/sdk/python/feast/data_source.py index c30145ddce..f5c40d2421 100644 --- a/sdk/python/feast/data_source.py +++ b/sdk/python/feast/data_source.py @@ -913,11 +913,13 @@ def to_proto(self) -> DataSourceProto: return data_source_proto + class PushMode(enum.Enum): ONLINE = 1 OFFLINE = 2 ONLINE_AND_OFFLINE = 3 + @typechecked class PushSource(DataSource): """ diff --git a/sdk/python/feast/feature_server.py b/sdk/python/feast/feature_server.py index 3228aa17d3..7bc634f7f5 100644 --- a/sdk/python/feast/feature_server.py +++ b/sdk/python/feast/feature_server.py @@ -12,8 +12,9 @@ import feast from feast import proto_json -from feast.protos.feast.serving.ServingService_pb2 import GetOnlineFeaturesRequest from feast.data_source import PushMode +from feast.protos.feast.serving.ServingService_pb2 import GetOnlineFeaturesRequest + # TODO: deprecate this in favor of push features class WriteToFeatureStoreRequest(BaseModel): diff --git a/sdk/python/feast/feature_store.py b/sdk/python/feast/feature_store.py index bd5e46bfa0..de52b9e3f3 100644 --- a/sdk/python/feast/feature_store.py +++ b/sdk/python/feast/feature_store.py @@ -1341,7 +1341,11 @@ def tqdm_builder(length): @log_exceptions_and_usage def push( - self, push_source_name: str, df: pd.DataFrame, allow_registry_cache: bool = True, to: PushMode = PushMode.ONLINE + self, + push_source_name: str, + df: pd.DataFrame, + allow_registry_cache: bool = True, + to: PushMode = PushMode.ONLINE, ): """ Push features to a push source. This updates all the feature views that have the push source as stream source. diff --git a/sdk/python/feast/infra/passthrough_provider.py b/sdk/python/feast/infra/passthrough_provider.py index e702661641..8c6dd831dd 100644 --- a/sdk/python/feast/infra/passthrough_provider.py +++ b/sdk/python/feast/infra/passthrough_provider.py @@ -110,7 +110,9 @@ def offline_write_batch( set_usage_attribute("provider", self.__class__.__name__) if self.offline_store: - self.offline_store.__class__.offline_write_batch(config, feature_view, data, progress) + self.offline_store.__class__.offline_write_batch( + config, feature_view, data, progress + ) @log_exceptions_and_usage(sampler=RatioSampler(ratio=0.001)) def online_read( diff --git a/sdk/python/tests/integration/offline_store/test_offline_write.py b/sdk/python/tests/integration/offline_store/test_offline_write.py index 5e7a242513..997299c11b 100644 --- a/sdk/python/tests/integration/offline_store/test_offline_write.py +++ b/sdk/python/tests/integration/offline_store/test_offline_write.py @@ -123,7 +123,9 @@ def test_writing_consecutively_to_offline_store(environment, universal_data_sour Field(name="acc_rate", dtype=Float32), ], source=data_sources.driver, - ttl=timedelta(minutes=10), + ttl=timedelta( + minutes=10 + ), # This is to make sure all offline store data is out of date since get_historical_features() only searches backwards for a ttl window. ) now = datetime.utcnow() diff --git a/sdk/python/tests/integration/online_store/test_push_online_retrieval.py b/sdk/python/tests/integration/online_store/test_push_online_retrieval.py index aa7e3e7f53..3d5d716ecb 100644 --- a/sdk/python/tests/integration/online_store/test_push_online_retrieval.py +++ b/sdk/python/tests/integration/online_store/test_push_online_retrieval.py @@ -1,8 +1,10 @@ import datetime +import numpy as np import pandas as pd import pytest +from feast.data_source import PushMode from tests.integration.feature_repos.repo_configuration import ( construct_universal_feature_views, ) @@ -39,3 +41,55 @@ def test_push_features_and_read(environment, universal_data_sources): online_resp_dict = online_resp.to_dict() assert online_resp_dict["location_id"] == [1] assert online_resp_dict["temperature"] == [4] + + +@pytest.mark.integration +@pytest.mark.universal_offline_stores(only=["file", "redshift"]) +@pytest.mark.universal_online_stores(only=["sqlite"]) +def test_push_features_and_read_from_offline_store(environment, universal_data_sources): + store = environment.feature_store + + (_, _, data_sources) = universal_data_sources + feature_views = construct_universal_feature_views(data_sources) + now = pd.Timestamp(datetime.datetime.utcnow()).round("ms") + + store.apply([driver(), customer(), location(), *feature_views.values()]) + entity_df = pd.DataFrame.from_dict({"location_id": [1], "event_timestamp": [now,],}) + + before_df = store.get_historical_features( + entity_df=entity_df, + features=["pushable_location_stats:temperature"], + full_feature_names=False, + ).to_df() + + data = { + "event_timestamp": [now], + "location_id": [1], + "temperature": [4], + "created": [now], + } + df_ingest = pd.DataFrame(data) + assert np.where( + before_df["location_id"].reset_index(drop=True) + == df_ingest["location_id"].reset_index(drop=True) + ) + assert np.where( + before_df["temperature"].reset_index(drop=True) + != df_ingest["temperature"].reset_index(drop=True) + ) + + store.push("location_stats_push_source", df_ingest, to=PushMode.OFFLINE) + + df = store.get_historical_features( + entity_df=entity_df, + features=["pushable_location_stats:temperature"], + full_feature_names=False, + ).to_df() + assert np.where( + df["location_id"].reset_index(drop=True) + == df_ingest["location_id"].reset_index(drop=True) + ) + assert np.where( + df["temperature"].reset_index(drop=True) + == df_ingest["temperature"].reset_index(drop=True) + ) From fdb5e8b9148e5c2bbc3510d8d4a83b098aa9b703 Mon Sep 17 00:00:00 2001 From: Kevin Zhang Date: Wed, 22 Jun 2022 12:00:34 -0700 Subject: [PATCH 24/30] Update Signed-off-by: Kevin Zhang --- .../offline_store/test_push_offline_retrieval | 66 +++++++++++++++++++ .../test_push_online_retrieval.py | 54 +-------------- 2 files changed, 67 insertions(+), 53 deletions(-) create mode 100644 sdk/python/tests/integration/offline_store/test_push_offline_retrieval diff --git a/sdk/python/tests/integration/offline_store/test_push_offline_retrieval b/sdk/python/tests/integration/offline_store/test_push_offline_retrieval new file mode 100644 index 0000000000..5aaed47313 --- /dev/null +++ b/sdk/python/tests/integration/offline_store/test_push_offline_retrieval @@ -0,0 +1,66 @@ +import datetime + +import numpy as np +import pandas as pd +import pytest + +from feast.data_source import PushMode +from tests.integration.feature_repos.repo_configuration import ( + construct_universal_feature_views, +) +from tests.integration.feature_repos.universal.entities import ( + customer, + driver, + location, +) + +@pytest.mark.integration +@pytest.mark.universal_offline_stores(only=["file", "redshift"]) +@pytest.mark.universal_online_stores(only=["sqlite"]) +def test_push_features_and_read_from_offline_store(environment, universal_data_sources): + store = environment.feature_store + + (_, _, data_sources) = universal_data_sources + feature_views = construct_universal_feature_views(data_sources) + now = pd.Timestamp(datetime.datetime.utcnow()).round("ms") + + store.apply([driver(), customer(), location(), *feature_views.values()]) + entity_df = pd.DataFrame.from_dict({"location_id": [1], "event_timestamp": [now]}) + + before_df = store.get_historical_features( + entity_df=entity_df, + features=["pushable_location_stats:temperature"], + full_feature_names=False, + ).to_df() + + data = { + "event_timestamp": [now], + "location_id": [1], + "temperature": [4], + "created": [now], + } + df_ingest = pd.DataFrame(data) + assert np.where( + before_df["location_id"].reset_index(drop=True) + == df_ingest["location_id"].reset_index(drop=True) + ) + assert np.where( + before_df["temperature"].reset_index(drop=True) + != df_ingest["temperature"].reset_index(drop=True) + ) + + store.push("location_stats_push_source", df_ingest, to=PushMode.OFFLINE) + + df = store.get_historical_features( + entity_df=entity_df, + features=["pushable_location_stats:temperature"], + full_feature_names=False, + ).to_df() + assert np.where( + df["location_id"].reset_index(drop=True) + == df_ingest["location_id"].reset_index(drop=True) + ) + assert np.where( + df["temperature"].reset_index(drop=True) + == df_ingest["temperature"].reset_index(drop=True) + ) diff --git a/sdk/python/tests/integration/online_store/test_push_online_retrieval.py b/sdk/python/tests/integration/online_store/test_push_online_retrieval.py index 3d5d716ecb..6091363411 100644 --- a/sdk/python/tests/integration/online_store/test_push_online_retrieval.py +++ b/sdk/python/tests/integration/online_store/test_push_online_retrieval.py @@ -40,56 +40,4 @@ def test_push_features_and_read(environment, universal_data_sources): ) online_resp_dict = online_resp.to_dict() assert online_resp_dict["location_id"] == [1] - assert online_resp_dict["temperature"] == [4] - - -@pytest.mark.integration -@pytest.mark.universal_offline_stores(only=["file", "redshift"]) -@pytest.mark.universal_online_stores(only=["sqlite"]) -def test_push_features_and_read_from_offline_store(environment, universal_data_sources): - store = environment.feature_store - - (_, _, data_sources) = universal_data_sources - feature_views = construct_universal_feature_views(data_sources) - now = pd.Timestamp(datetime.datetime.utcnow()).round("ms") - - store.apply([driver(), customer(), location(), *feature_views.values()]) - entity_df = pd.DataFrame.from_dict({"location_id": [1], "event_timestamp": [now,],}) - - before_df = store.get_historical_features( - entity_df=entity_df, - features=["pushable_location_stats:temperature"], - full_feature_names=False, - ).to_df() - - data = { - "event_timestamp": [now], - "location_id": [1], - "temperature": [4], - "created": [now], - } - df_ingest = pd.DataFrame(data) - assert np.where( - before_df["location_id"].reset_index(drop=True) - == df_ingest["location_id"].reset_index(drop=True) - ) - assert np.where( - before_df["temperature"].reset_index(drop=True) - != df_ingest["temperature"].reset_index(drop=True) - ) - - store.push("location_stats_push_source", df_ingest, to=PushMode.OFFLINE) - - df = store.get_historical_features( - entity_df=entity_df, - features=["pushable_location_stats:temperature"], - full_feature_names=False, - ).to_df() - assert np.where( - df["location_id"].reset_index(drop=True) - == df_ingest["location_id"].reset_index(drop=True) - ) - assert np.where( - df["temperature"].reset_index(drop=True) - == df_ingest["temperature"].reset_index(drop=True) - ) + assert online_resp_dict["temperature"] == [4] \ No newline at end of file From 57318fdb8831b17dd26b7f8a627edf3b95746ab6 Mon Sep 17 00:00:00 2001 From: Kevin Zhang Date: Wed, 22 Jun 2022 12:01:37 -0700 Subject: [PATCH 25/30] Fix Signed-off-by: Kevin Zhang --- .../integration/online_store/test_push_online_retrieval.py | 4 +--- 1 file changed, 1 insertion(+), 3 deletions(-) diff --git a/sdk/python/tests/integration/online_store/test_push_online_retrieval.py b/sdk/python/tests/integration/online_store/test_push_online_retrieval.py index 6091363411..aa7e3e7f53 100644 --- a/sdk/python/tests/integration/online_store/test_push_online_retrieval.py +++ b/sdk/python/tests/integration/online_store/test_push_online_retrieval.py @@ -1,10 +1,8 @@ import datetime -import numpy as np import pandas as pd import pytest -from feast.data_source import PushMode from tests.integration.feature_repos.repo_configuration import ( construct_universal_feature_views, ) @@ -40,4 +38,4 @@ def test_push_features_and_read(environment, universal_data_sources): ) online_resp_dict = online_resp.to_dict() assert online_resp_dict["location_id"] == [1] - assert online_resp_dict["temperature"] == [4] \ No newline at end of file + assert online_resp_dict["temperature"] == [4] From ad3f608c03b497aacf277a150dbcabca70a1a9a4 Mon Sep 17 00:00:00 2001 From: Kevin Zhang Date: Wed, 22 Jun 2022 12:15:41 -0700 Subject: [PATCH 26/30] Fix Signed-off-by: Kevin Zhang --- sdk/python/feast/infra/utils/aws_utils.py | 2 ++ .../feature_repos/repo_configuration.py | 15 --------------- 2 files changed, 2 insertions(+), 15 deletions(-) diff --git a/sdk/python/feast/infra/utils/aws_utils.py b/sdk/python/feast/infra/utils/aws_utils.py index b284d24231..dc5e2f0c11 100644 --- a/sdk/python/feast/infra/utils/aws_utils.py +++ b/sdk/python/feast/infra/utils/aws_utils.py @@ -243,6 +243,7 @@ def delete_redshift_table( redshift_data_client, cluster_id, database, user, drop_query, ) + def delete_redshift_table( redshift_data_client, cluster_id: str, database: str, user: str, table_name: str, ): @@ -251,6 +252,7 @@ def delete_redshift_table( redshift_data_client, cluster_id, database, user, drop_query, ) + def upload_arrow_table_to_redshift( table: Union[pyarrow.Table, Path], redshift_data_client, diff --git a/sdk/python/tests/integration/feature_repos/repo_configuration.py b/sdk/python/tests/integration/feature_repos/repo_configuration.py index 75835f1c56..f4d5defcad 100644 --- a/sdk/python/tests/integration/feature_repos/repo_configuration.py +++ b/sdk/python/tests/integration/feature_repos/repo_configuration.py @@ -74,26 +74,11 @@ "connection_string": "127.0.0.1:6001,127.0.0.1:6002,127.0.0.1:6003", } -<<<<<<< HEAD -<<<<<<< HEAD -======= ->>>>>>> fec6cc0b (Lint) OFFLINE_STORE_TO_PROVIDER_CONFIG: Dict[str, DataSourceCreator] = { "file": ("local", FileDataSourceCreator), "gcp": ("gcp", BigQueryDataSourceCreator), "redshift": ("aws", RedshiftDataSourceCreator), "snowflake": ("aws", RedshiftDataSourceCreator), -<<<<<<< HEAD -======= -OFFLINE_STORE_TO_PROVIDER_CONFIG : Dict[ - str, DataSourceCreator] = { - "file": ("local", FileDataSourceCreator), - "gcp": ("gcp", BigQueryDataSourceCreator), - "redshift": ("aws", RedshiftDataSourceCreator), - "snowflake": ("aws", RedshiftDataSourceCreator), ->>>>>>> a1b0c4a6 (Add redshift) -======= ->>>>>>> fec6cc0b (Lint) } AVAILABLE_OFFLINE_STORES: List[Tuple[str, Type[DataSourceCreator]]] = [ From 4f7ffd8825d8756d17717918e329a13919a956fc Mon Sep 17 00:00:00 2001 From: Kevin Zhang Date: Wed, 22 Jun 2022 12:16:48 -0700 Subject: [PATCH 27/30] Fix rebase Signed-off-by: Kevin Zhang --- sdk/python/feast/infra/utils/aws_utils.py | 56 ----------------------- 1 file changed, 56 deletions(-) diff --git a/sdk/python/feast/infra/utils/aws_utils.py b/sdk/python/feast/infra/utils/aws_utils.py index dc5e2f0c11..7badda9846 100644 --- a/sdk/python/feast/infra/utils/aws_utils.py +++ b/sdk/python/feast/infra/utils/aws_utils.py @@ -244,15 +244,6 @@ def delete_redshift_table( ) -def delete_redshift_table( - redshift_data_client, cluster_id: str, database: str, user: str, table_name: str, -): - drop_query = f"DROP {table_name} IF EXISTS" - execute_redshift_statement( - redshift_data_client, cluster_id, database, user, drop_query, - ) - - def upload_arrow_table_to_redshift( table: Union[pyarrow.Table, Path], redshift_data_client, @@ -436,53 +427,6 @@ def temporarily_upload_arrow_table_to_redshift( ) -@contextlib.contextmanager -def temporarily_upload_arrow_table_to_redshift( - table: Union[pyarrow.Table, Path], - redshift_data_client, - cluster_id: str, - database: str, - user: str, - s3_resource, - iam_role: str, - s3_path: str, - table_name: str, - schema: Optional[pyarrow.Schema] = None, - fail_if_exists: bool = True, -) -> Iterator[None]: - """Uploads a Arrow Table to Redshift as a new table with cleanup logic. - - This is essentially the same as upload_arrow_table_to_redshift (check out its docstring for full details), - but unlike it this method is a generator and should be used with `with` block. For example: - - >>> with temporarily_upload_arrow_table_to_redshift(...): # doctest: +SKIP - >>> # Use `table_name` table in Redshift here - >>> # `table_name` will not exist at this point, since it's cleaned up by the `with` block - - """ - # Upload the dataframe to Redshift - upload_arrow_table_to_redshift( - table, - redshift_data_client, - cluster_id, - database, - user, - s3_resource, - s3_path, - iam_role, - table_name, - schema, - fail_if_exists, - ) - - yield - - # Clean up the uploaded Redshift table - execute_redshift_statement( - redshift_data_client, cluster_id, database, user, f"DROP TABLE {table_name}", - ) - - def download_s3_directory(s3_resource, bucket: str, key: str, local_dir: str): """Download the S3 directory to a local disk""" bucket_obj = s3_resource.Bucket(bucket) From 7f4f2a12c68d90be31229fcd332cd129006fb39f Mon Sep 17 00:00:00 2001 From: Kevin Zhang Date: Wed, 22 Jun 2022 12:19:09 -0700 Subject: [PATCH 28/30] Fix naming Signed-off-by: Kevin Zhang --- ...test_push_offline_retrieval => test_push_offline_retrieval.py} | 0 1 file changed, 0 insertions(+), 0 deletions(-) rename sdk/python/tests/integration/offline_store/{test_push_offline_retrieval => test_push_offline_retrieval.py} (100%) diff --git a/sdk/python/tests/integration/offline_store/test_push_offline_retrieval b/sdk/python/tests/integration/offline_store/test_push_offline_retrieval.py similarity index 100% rename from sdk/python/tests/integration/offline_store/test_push_offline_retrieval rename to sdk/python/tests/integration/offline_store/test_push_offline_retrieval.py From b48d37711309f76dca6178732f9f0fc03740672a Mon Sep 17 00:00:00 2001 From: Kevin Zhang Date: Wed, 22 Jun 2022 12:21:35 -0700 Subject: [PATCH 29/30] Fix Signed-off-by: Kevin Zhang --- .../integration/offline_store/test_push_offline_retrieval.py | 1 + 1 file changed, 1 insertion(+) diff --git a/sdk/python/tests/integration/offline_store/test_push_offline_retrieval.py b/sdk/python/tests/integration/offline_store/test_push_offline_retrieval.py index 5aaed47313..b2f91f442e 100644 --- a/sdk/python/tests/integration/offline_store/test_push_offline_retrieval.py +++ b/sdk/python/tests/integration/offline_store/test_push_offline_retrieval.py @@ -14,6 +14,7 @@ location, ) + @pytest.mark.integration @pytest.mark.universal_offline_stores(only=["file", "redshift"]) @pytest.mark.universal_online_stores(only=["sqlite"]) From 2e1ddb1a7dba2d5f1a2e15f40b3a6276c0242a8a Mon Sep 17 00:00:00 2001 From: Kevin Zhang Date: Wed, 22 Jun 2022 12:23:44 -0700 Subject: [PATCH 30/30] Uncomment Signed-off-by: Kevin Zhang --- .../online_store/test_universal_online.py | 148 +++++++++--------- .../test_stream_feature_view_apply.py | 138 ++++++++-------- 2 files changed, 143 insertions(+), 143 deletions(-) diff --git a/sdk/python/tests/integration/online_store/test_universal_online.py b/sdk/python/tests/integration/online_store/test_universal_online.py index 9d4db3e03e..c068e04111 100644 --- a/sdk/python/tests/integration/online_store/test_universal_online.py +++ b/sdk/python/tests/integration/online_store/test_universal_online.py @@ -441,80 +441,80 @@ def test_online_retrieval_with_event_timestamps( ) -# @pytest.mark.integration -# @pytest.mark.universal_online_stores -# # @pytest.mark.goserver Disabling because the go fs tests are flaking in CI. TODO(achals): uncomment after fixed. -# @pytest.mark.parametrize("full_feature_names", [True, False], ids=lambda v: str(v)) -# def test_stream_feature_view_online_retrieval( -# environment, universal_data_sources, feature_server_endpoint, full_feature_names -# ): -# """ -# Tests materialization and online retrieval for stream feature views. - -# This test is separate from test_online_retrieval since combining feature views and -# stream feature views into a single test resulted in test flakiness. This is tech -# debt that should be resolved soon. -# """ -# # Set up feature store. -# fs = environment.feature_store -# entities, datasets, data_sources = universal_data_sources -# feature_views = construct_universal_feature_views(data_sources) -# pushable_feature_view = feature_views.pushed_locations -# fs.apply([location(), pushable_feature_view]) - -# # Materialize. -# fs.materialize( -# environment.start_date - timedelta(days=1), -# environment.end_date + timedelta(days=1), -# ) - -# # Get online features by randomly sampling 10 entities that exist in the batch source. -# sample_locations = datasets.location_df.sample(10)["location_id"] -# entity_rows = [ -# {"location_id": sample_location} for sample_location in sample_locations -# ] - -# feature_refs = [ -# "pushable_location_stats:temperature", -# ] -# unprefixed_feature_refs = [f.rsplit(":", 1)[-1] for f in feature_refs if ":" in f] - -# online_features_dict = get_online_features_dict( -# environment=environment, -# endpoint=feature_server_endpoint, -# features=feature_refs, -# entity_rows=entity_rows, -# full_feature_names=full_feature_names, -# ) - -# # Check that the response has the expected set of keys. -# keys = set(online_features_dict.keys()) -# expected_keys = set( -# f.replace(":", "__") if full_feature_names else f.split(":")[-1] -# for f in feature_refs -# ) | {"location_id"} -# assert ( -# keys == expected_keys -# ), f"Response keys are different from expected: {keys - expected_keys} (extra) and {expected_keys - keys} (missing)" - -# # Check that the feature values match. -# tc = unittest.TestCase() -# for i, entity_row in enumerate(entity_rows): -# df_features = get_latest_feature_values_from_location_df( -# entity_row, datasets.location_df -# ) - -# assert df_features["location_id"] == online_features_dict["location_id"][i] -# for unprefixed_feature_ref in unprefixed_feature_refs: -# tc.assertAlmostEqual( -# df_features[unprefixed_feature_ref], -# online_features_dict[ -# response_feature_name( -# unprefixed_feature_ref, feature_refs, full_feature_names -# ) -# ][i], -# delta=0.0001, -# ) +@pytest.mark.integration +@pytest.mark.universal_online_stores +# @pytest.mark.goserver Disabling because the go fs tests are flaking in CI. TODO(achals): uncomment after fixed. +@pytest.mark.parametrize("full_feature_names", [True, False], ids=lambda v: str(v)) +def test_stream_feature_view_online_retrieval( + environment, universal_data_sources, feature_server_endpoint, full_feature_names +): + """ + Tests materialization and online retrieval for stream feature views. + + This test is separate from test_online_retrieval since combining feature views and + stream feature views into a single test resulted in test flakiness. This is tech + debt that should be resolved soon. + """ + # Set up feature store. + fs = environment.feature_store + entities, datasets, data_sources = universal_data_sources + feature_views = construct_universal_feature_views(data_sources) + pushable_feature_view = feature_views.pushed_locations + fs.apply([location(), pushable_feature_view]) + + # Materialize. + fs.materialize( + environment.start_date - timedelta(days=1), + environment.end_date + timedelta(days=1), + ) + + # Get online features by randomly sampling 10 entities that exist in the batch source. + sample_locations = datasets.location_df.sample(10)["location_id"] + entity_rows = [ + {"location_id": sample_location} for sample_location in sample_locations + ] + + feature_refs = [ + "pushable_location_stats:temperature", + ] + unprefixed_feature_refs = [f.rsplit(":", 1)[-1] for f in feature_refs if ":" in f] + + online_features_dict = get_online_features_dict( + environment=environment, + endpoint=feature_server_endpoint, + features=feature_refs, + entity_rows=entity_rows, + full_feature_names=full_feature_names, + ) + + # Check that the response has the expected set of keys. + keys = set(online_features_dict.keys()) + expected_keys = set( + f.replace(":", "__") if full_feature_names else f.split(":")[-1] + for f in feature_refs + ) | {"location_id"} + assert ( + keys == expected_keys + ), f"Response keys are different from expected: {keys - expected_keys} (extra) and {expected_keys - keys} (missing)" + + # Check that the feature values match. + tc = unittest.TestCase() + for i, entity_row in enumerate(entity_rows): + df_features = get_latest_feature_values_from_location_df( + entity_row, datasets.location_df + ) + + assert df_features["location_id"] == online_features_dict["location_id"][i] + for unprefixed_feature_ref in unprefixed_feature_refs: + tc.assertAlmostEqual( + df_features[unprefixed_feature_ref], + online_features_dict[ + response_feature_name( + unprefixed_feature_ref, feature_refs, full_feature_names + ) + ][i], + delta=0.0001, + ) @pytest.mark.integration diff --git a/sdk/python/tests/integration/registration/test_stream_feature_view_apply.py b/sdk/python/tests/integration/registration/test_stream_feature_view_apply.py index f92fd340f0..8e2af031c5 100644 --- a/sdk/python/tests/integration/registration/test_stream_feature_view_apply.py +++ b/sdk/python/tests/integration/registration/test_stream_feature_view_apply.py @@ -77,72 +77,72 @@ def simple_sfv(df): assert features["dummy_field"] == [None] -# @pytest.mark.integration -# def test_stream_feature_view_udf(simple_dataset_1) -> None: -# """ -# Test apply of StreamFeatureView udfs are serialized correctly and usable. -# """ -# runner = CliRunner() -# with runner.local_repo( -# get_example_repo("example_feature_repo_1.py"), "bigquery" -# ) as fs, prep_file_source( -# df=simple_dataset_1, timestamp_field="ts_1" -# ) as file_source: -# entity = Entity(name="driver_entity", join_keys=["test_key"]) - -# stream_source = KafkaSource( -# name="kafka", -# timestamp_field="event_timestamp", -# kafka_bootstrap_servers="", -# message_format=AvroFormat(""), -# topic="topic", -# batch_source=file_source, -# watermark_delay_threshold=timedelta(days=1), -# ) - -# @stream_feature_view( -# entities=[entity], -# ttl=timedelta(days=30), -# owner="test@example.com", -# online=True, -# schema=[Field(name="dummy_field", dtype=Float32)], -# description="desc", -# aggregations=[ -# Aggregation( -# column="dummy_field", function="max", time_window=timedelta(days=1), -# ), -# Aggregation( -# column="dummy_field2", -# function="count", -# time_window=timedelta(days=24), -# ), -# ], -# timestamp_field="event_timestamp", -# mode="spark", -# source=stream_source, -# tags={}, -# ) -# def pandas_view(pandas_df): -# import pandas as pd - -# assert type(pandas_df) == pd.DataFrame -# df = pandas_df.transform(lambda x: x + 10, axis=1) -# df.insert(2, "C", [20.2, 230.0, 34.0], True) -# return df - -# import pandas as pd - -# fs.apply([entity, pandas_view]) - -# stream_feature_views = fs.list_stream_feature_views() -# assert len(stream_feature_views) == 1 -# assert stream_feature_views[0] == pandas_view - -# sfv = stream_feature_views[0] - -# df = pd.DataFrame({"A": [1, 2, 3], "B": [10, 20, 30]}) -# new_df = sfv.udf(df) -# expected_df = pd.DataFrame( -# {"A": [11, 12, 13], "B": [20, 30, 40], "C": [20.2, 230.0, 34.0]} -# ) -# assert new_df.equals(expected_df) +@pytest.mark.integration +def test_stream_feature_view_udf(simple_dataset_1) -> None: + """ + Test apply of StreamFeatureView udfs are serialized correctly and usable. + """ + runner = CliRunner() + with runner.local_repo( + get_example_repo("example_feature_repo_1.py"), "bigquery" + ) as fs, prep_file_source( + df=simple_dataset_1, timestamp_field="ts_1" + ) as file_source: + entity = Entity(name="driver_entity", join_keys=["test_key"]) + + stream_source = KafkaSource( + name="kafka", + timestamp_field="event_timestamp", + kafka_bootstrap_servers="", + message_format=AvroFormat(""), + topic="topic", + batch_source=file_source, + watermark_delay_threshold=timedelta(days=1), + ) + + @stream_feature_view( + entities=[entity], + ttl=timedelta(days=30), + owner="test@example.com", + online=True, + schema=[Field(name="dummy_field", dtype=Float32)], + description="desc", + aggregations=[ + Aggregation( + column="dummy_field", function="max", time_window=timedelta(days=1), + ), + Aggregation( + column="dummy_field2", + function="count", + time_window=timedelta(days=24), + ), + ], + timestamp_field="event_timestamp", + mode="spark", + source=stream_source, + tags={}, + ) + def pandas_view(pandas_df): + import pandas as pd + + assert type(pandas_df) == pd.DataFrame + df = pandas_df.transform(lambda x: x + 10, axis=1) + df.insert(2, "C", [20.2, 230.0, 34.0], True) + return df + + import pandas as pd + + fs.apply([entity, pandas_view]) + + stream_feature_views = fs.list_stream_feature_views() + assert len(stream_feature_views) == 1 + assert stream_feature_views[0] == pandas_view + + sfv = stream_feature_views[0] + + df = pd.DataFrame({"A": [1, 2, 3], "B": [10, 20, 30]}) + new_df = sfv.udf(df) + expected_df = pd.DataFrame( + {"A": [11, 12, 13], "B": [20, 30, 40], "C": [20.2, 230.0, 34.0]} + ) + assert new_df.equals(expected_df)