From 9a1f24aba6abf50f5a91cd123d1d1a939cc0137c Mon Sep 17 00:00:00 2001
From: David Heryanto
Date: Fri, 31 Jan 2020 14:33:57 +0800
Subject: [PATCH] Add export_schema method to export schema from FeatureSet

---
 sdk/python/feast/feature_set.py      | 68 ++++++++++++++++++++++++++++
 sdk/python/tests/test_feature_set.py | 12 +++---
 2 files changed, 74 insertions(+), 6 deletions(-)

diff --git a/sdk/python/feast/feature_set.py b/sdk/python/feast/feature_set.py
index b7b00fea17..13a0428911 100644
--- a/sdk/python/feast/feature_set.py
+++ b/sdk/python/feast/feature_set.py
@@ -21,8 +21,10 @@
 from google.protobuf import json_format
 from google.protobuf.duration_pb2 import Duration
 from google.protobuf.json_format import MessageToJson
+from google.protobuf.message import Message
 from pandas.api.types import is_datetime64_ns_dtype
 from pyarrow.lib import TimestampType
+from tensorflow_metadata.proto.v0 import schema_pb2
 from tensorflow_metadata.proto.v0.schema_pb2 import Schema
 
 from feast.core.FeatureSet_pb2 import FeatureSet as FeatureSetProto
@@ -695,6 +697,72 @@ def update_schema(self, schema: Schema):
                     f"that does not exist in the FeatureSet '{self.name}' in Feast"
                 )
 
+    def export_schema(self) -> Schema:
+        """
+        Export the fields of this FeatureSet as a Tensorflow metadata Schema.
+
+        A Feature is appended to the Schema for every entry in
+        self._fields. Attributes that are None on a field are skipped, so
+        they stay unset on the exported Feature.
+
+        Returns:
+            Schema holding one Feature per field of this FeatureSet
+        """
+        # Attributes copied from each field to its exported Feature. Built
+        # once, since the list is the same for every field.
+        attributes_to_copy_from_field_to_feature = [
+            "name",
+            "presence",
+            "group_presence",
+            "shape",
+            "value_count",
+            "domain",
+            "int_domain",
+            "float_domain",
+            "string_domain",
+            "bool_domain",
+            "struct_domain",
+            "_natural_language_domain",
+            "image_domain",
+            "mid_domain",
+            "url_domain",
+            "time_domain",
+            "time_of_day_domain",
+        ]
+
+        schema = Schema()
+        for field in self._fields.values():
+            # TODO: export type as well
+            feature = schema_pb2.Feature()
+            for attr in attributes_to_copy_from_field_to_feature:
+                value = getattr(field, attr)
+                if value is None:
+                    continue
+
+                # Feature is a Protobuf message that only exposes its declared
+                # field names, so the leading underscore used in the Field
+                # attribute name (e.g. "_natural_language_domain") is dropped.
+                feature_attr = attr.lstrip("_")
+                target = getattr(feature, feature_attr)
+                if isinstance(target, Message):
+                    # Proto message field to copy is an embedded field, so
+                    # MergeFrom() method must be used
+                    target.MergeFrom(value)
+                elif isinstance(target, (int, str, bool)):
+                    # Proto message field is a simple Python type, so
+                    # setattr() can be used
+                    setattr(feature, feature_attr, value)
+                else:
+                    warnings.warn(
+                        f"Attribute '{attr}' cannot be copied from Field "
+                        f"'{field.name}' in FeatureSet '{self.name}' to a "
+                        f"Feature in the Schema in Tensorflow metadata, because "
+                        f"the type is neither a Protobuf message or Python "
+                        f"int, str and bool"
+                    )
+            schema.feature.append(feature)
+        return schema
+
     @classmethod
     def from_yaml(cls, yml: str):
         """
diff --git a/sdk/python/tests/test_feature_set.py b/sdk/python/tests/test_feature_set.py
index 687e08724e..c328eb6c98 100644
--- a/sdk/python/tests/test_feature_set.py
+++ b/sdk/python/tests/test_feature_set.py
@@ -179,7 +179,7 @@ def test_update_schema(self):
         json_format.Parse(
             open(test_data_folder / "schema_bikeshare.json").read(), schema_bikeshare
         )
-        feature_set_bikeshare = FeatureSet(
+        feature_set = FeatureSet(
             name="bikeshare",
             entities=[Entity(name="station_id", dtype=ValueType.INT64),],
             features=[
@@ -191,23 +191,23 @@ def test_update_schema(self):
             ],
         )
         # Before update
-        for entity in feature_set_bikeshare.entities:
+        for entity in feature_set.entities:
             assert entity.presence is None
             assert entity.shape is None
-        for feature in feature_set_bikeshare.features:
+        for feature in feature_set.features:
             assert feature.presence is None
             assert feature.shape is None
             assert feature.string_domain is None
             assert feature.float_domain is None
             assert feature.int_domain is None
 
-        feature_set_bikeshare.update_schema(schema_bikeshare)
+        feature_set.update_schema(schema_bikeshare)
 
         # After update
-        for entity in feature_set_bikeshare.entities:
+        for entity in feature_set.entities:
             assert entity.presence is not None
             assert entity.shape is not None
-        for feature in feature_set_bikeshare.features:
+        for feature in feature_set.features:
             assert feature.presence is not None
             assert feature.shape is not None
             if feature.name in ["location", "name", "status"]: