From 80e5c4af7aa233cb891c6a30f1a48fcb3963e6c2 Mon Sep 17 00:00:00 2001 From: Anand Inguva Date: Wed, 29 Nov 2023 10:45:54 -0500 Subject: [PATCH 01/52] Make base.py framework agnostic and add helper transforms --- sdks/python/apache_beam/ml/transforms/base.py | 437 ++++++++++++++++-- 1 file changed, 394 insertions(+), 43 deletions(-) diff --git a/sdks/python/apache_beam/ml/transforms/base.py b/sdks/python/apache_beam/ml/transforms/base.py index b3a30bb5f1256..b8a9beafb9862 100644 --- a/sdks/python/apache_beam/ml/transforms/base.py +++ b/sdks/python/apache_beam/ml/transforms/base.py @@ -14,18 +14,34 @@ # See the License for the specific language governing permissions and # limitations under the License. -# pytype: skip-file - import abc +import collections +import logging +import os +import tempfile +import uuid +from typing import Any from typing import Dict from typing import Generic from typing import List +from typing import Mapping from typing import Optional from typing import Sequence from typing import TypeVar +from typing import Union + +import jsonpickle +import numpy as np import apache_beam as beam +from apache_beam.io.filesystems import FileSystems from apache_beam.metrics.metric import Metrics +from apache_beam.ml.inference.base import ModelHandler +from apache_beam.ml.inference.base import ModelT +from apache_beam.options.pipeline_options import PipelineOptions + +_LOGGER = logging.getLogger(__name__) +_ATTRIBUTE_FILE_NAME = 'attributes.json' __all__ = ['MLTransform', 'ProcessHandler', 'BaseOperation'] @@ -42,12 +58,68 @@ OperationOutputT = TypeVar('OperationOutputT') +def _convert_list_of_dicts_to_dict_of_lists( + list_of_dicts: Sequence[Dict[str, Any]]) -> Dict[str, List[Any]]: + keys_to_element_list = collections.defaultdict(list) + for d in list_of_dicts: + for key, value in d.items(): + keys_to_element_list[key].append(value) + return keys_to_element_list + + +def _convert_dict_of_lists_to_lists_of_dict( + dict_of_lists: Dict[str, List[Any]], + batch_length: int) -> List[Dict[str, Any]]: + result: List[Dict[str, Any]] = [{} for _ in range(batch_length)] + for key, values in dict_of_lists.items(): + for i in range(len(values)): + result[i][key] = values[i] + return result + + class ArtifactMode(object): PRODUCE = 'produce' CONSUME = 'consume' -class BaseOperation(Generic[OperationInputT, OperationOutputT], abc.ABC): +class PTransformProvider: + """ + Data processing transforms that are intended to be used with MLTransform + should subclass PTransformProvider and implement the following methods: + 1. get_ptransform_for_processing() + 2. requires_chaining() + + get_ptransform_for_processing() method should return a PTransform that can be + used to process the data. + + requires_chaining() method should return True if the data processing + transforms needs to be chained sequentially with compatible data processing + transforms. + """ + @abc.abstractmethod + def get_ptransform_for_processing(self, **kwargs) -> beam.PTransform: + """ + Returns a PTransform that can be used to process the data. + """ + + @abc.abstractmethod + def requires_chaining(self): + """ + Returns True if the data processing transforms needs to be chained + sequentially with compatible data processing transforms. + """ + + def get_counter(self): + """ + Returns the counter name for the data processing transform. 
+ """ + counter_name = self.__class__.__name__ + return Metrics.counter(MLTransform, f'BeamML_{counter_name}') + + +class BaseOperation(Generic[OperationInputT, OperationOutputT], + PTransformProvider, + abc.ABC): def __init__(self, columns: List[str]) -> None: """ Base Opertation class data processing transformations. @@ -76,33 +148,55 @@ def __call__(self, data: OperationInputT, transformed_data = self.apply_transform(data, output_column_name) return transformed_data - def get_counter(self): - """ - Returns the counter name for the operation. - """ - counter_name = self.__class__.__name__ - return Metrics.counter(MLTransform, f'BeamML_{counter_name}') - -class ProcessHandler(Generic[ExampleT, MLTransformOutputT], abc.ABC): +class ProcessHandler(beam.PTransform[beam.PCollection[ExampleT], + beam.PCollection[MLTransformOutputT]], + abc.ABC): """ Only for internal use. No backwards compatibility guarantees. """ @abc.abstractmethod - def process_data( - self, pcoll: beam.PCollection[ExampleT] - ) -> beam.PCollection[MLTransformOutputT]: + def append_transform(self, transform: BaseOperation): """ - Logic to process the data. This will be the entrypoint in - beam.MLTransform to process incoming data. + Append transforms to the ProcessHandler. """ + +# TODO: Add support for inference_fn +class EmbeddingsManager(PTransformProvider): + def __init__( + self, + columns: List[str], + *, + # common args for all ModelHandlers. + load_model_args: Optional[Dict[str, Any]] = None, + min_batch_size: Optional[int] = None, + max_batch_size: Optional[int] = None, + large_model: bool = False, + **kwargs): + self.load_model_args = load_model_args or {} + self.min_batch_size = min_batch_size + self.max_batch_size = max_batch_size + self.large_model = large_model + self.columns = columns + + if kwargs: + _LOGGER.warning("Ignoring the following arguments: %s", kwargs.keys()) + + # TODO: Add set_model_handler method. @abc.abstractmethod - def append_transform(self, transform: BaseOperation): + def get_model_handler(self) -> ModelHandler: """ - Append transforms to the ProcessHandler. + Return framework specific model handler. """ + def requires_chaining(self): + # each embedding config requires a separate PTransform. so no chaining. + return False + + def get_columns_to_apply(self): + return self.columns + class MLTransform(beam.PTransform[beam.PCollection[ExampleT], beam.PCollection[MLTransformOutputT]], @@ -112,7 +206,8 @@ def __init__( *, write_artifact_location: Optional[str] = None, read_artifact_location: Optional[str] = None, - transforms: Optional[Sequence[BaseOperation]] = None): + transforms: Optional[List[Union[BaseOperation, + EmbeddingsManager]]] = None): """ MLTransform is a Beam PTransform that can be used to apply transformations to the data. MLTransform is used to wrap the @@ -157,9 +252,6 @@ def __init__( i-th transform is the output of the (i-1)-th transform. Multi-input transforms are not supported yet. 
""" - if transforms: - _ = [self._validate_transform(transform) for transform in transforms] - if read_artifact_location and write_artifact_location: raise ValueError( 'Only one of read_artifact_location or write_artifact_location can ' @@ -177,19 +269,10 @@ def __init__( artifact_location = write_artifact_location # type: ignore[assignment] artifact_mode = ArtifactMode.PRODUCE - # avoid circular import - # pylint: disable=wrong-import-order, wrong-import-position - from apache_beam.ml.transforms.handlers import TFTProcessHandler - # TODO: When new ProcessHandlers(eg: JaxProcessHandler) are introduced, - # create a mapping between transforms and ProcessHandler since - # ProcessHandler is not exposed to the user. - process_handler: ProcessHandler = TFTProcessHandler( - artifact_location=artifact_location, - artifact_mode=artifact_mode, - transforms=transforms) # type: ignore[arg-type] - - self._process_handler = process_handler - self.transforms = transforms + self._parent_artifact_location = artifact_location + + self._artifact_mode = artifact_mode + self.transforms = transforms or [] self._counter = Metrics.counter( MLTransform, f'BeamML_{self.__class__.__name__}') @@ -209,10 +292,33 @@ def expand( Returns: A PCollection of MLTransformOutputT type """ + _ = [self._validate_transform(transform) for transform in self.transforms] + if self._artifact_mode == ArtifactMode.PRODUCE: + ptransform_partitioner = _MLTransformToPTransformMapper( + transforms=self.transforms, + artifact_location=self._parent_artifact_location, + artifact_mode=self._artifact_mode, + pipeline_options=pcoll.pipeline.options) + ptransform_list = ptransform_partitioner.create_and_save_ptransform_list() + else: + ptransform_list = ( + _MLTransformToPTransformMapper.load_transforms_from_artifact_location( + self._parent_artifact_location)) + + # the saved transforms has artifact mode set to PRODUCE. + # set the artifact mode to CONSUME. + if self._artifact_mode == ArtifactMode.CONSUME: + for i in range(len(ptransform_list)): + if hasattr(ptransform_list[i], 'artifact_mode'): + ptransform_list[i].artifact_mode = self._artifact_mode + + for ptransform in ptransform_list: + pcoll = pcoll | ptransform + _ = ( pcoll.pipeline | "MLTransformMetricsUsage" >> MLTransformMetricsUsage(self)) - return self._process_handler.process_data(pcoll) + return pcoll # type: ignore[return-value] def with_transform(self, transform: BaseOperation): """ @@ -222,14 +328,21 @@ def with_transform(self, transform: BaseOperation): Returns: A MLTransform instance. """ - self._validate_transform(transform) - self._process_handler.append_transform(transform) + # self._validate_transform(transform) + # avoid circular import + # pylint: disable=wrong-import-order, wrong-import-position + self.transforms.append(transform) return self def _validate_transform(self, transform): - if not isinstance(transform, BaseOperation): + # every data processing transform should subclass PTransformProvider. Raise + # an error if the transform does not subclass PTransformProvider since the + # downstream code expects the transform to be a subclass of + # PTransformProvider. + if not isinstance(transform, PTransformProvider): raise TypeError( - 'transform must be a subclass of BaseOperation. ' + 'transform must be a subclass of PTransformProvider and implement ' + 'get_ptransform_for_processing() method.' 'Got: %s instead.' % type(transform)) @@ -243,9 +356,7 @@ def _increment_counters(): # increment for MLTransform. 
      self._ml_transform._counter.inc()
      # increment if data processing transforms are passed.
-      transforms = (
-          self._ml_transform.transforms or
-          self._ml_transform._process_handler.transforms)
+      transforms = self._ml_transform.transforms
      if transforms:
        for transform in transforms:
          transform.get_counter().inc()

    return (
        pipeline
        | beam.Create([None])
        | beam.Map(lambda _: _increment_counters()))
+
+
+class _TransformAttributeManager:
+  """
+  Base class used for saving and loading the attributes.
+  """
+  @staticmethod
+  def save_attributes(artifact_location):
+    """
+    Save the attributes to a JSON file.
+    """
+    raise NotImplementedError
+
+  @staticmethod
+  def load_attributes(artifact_location):
+    """
+    Load the attributes from a JSON file.
+    """
+    raise NotImplementedError
+
+
+class _JsonPickleTransformAttributeManager(_TransformAttributeManager):
+  """
+  Uses jsonpickle to save and load the attributes. Here the attributes refer
+  to the list of PTransforms that are used to process the data.
+
+  jsonpickle serializes the PTransforms to a JSON file in a representation
+  that is compatible across Python versions.
+  """
+  @staticmethod
+  def _is_remote_path(path):
+    is_gcs = path.find('gs://') != -1
+    # TODO: Add support for other remote paths.
+    if not is_gcs and path.find('://') != -1:
+      raise RuntimeError(
+          "Artifact locations are currently supported only for "
+          "local paths and GCS paths. Got: %s" % path)
+    return is_gcs
+
+  @staticmethod
+  def save_attributes(
+      ptransform_list,
+      artifact_location,
+      **kwargs,
+  ):
+    if _JsonPickleTransformAttributeManager._is_remote_path(artifact_location):
+      # kwargs.get() returns None for a missing key and never raises
+      # KeyError, so check the value explicitly.
+      options = kwargs.get('options')
+      if options is None:
+        raise RuntimeError(
+            'Pipeline options are required to save the attributes '
+            'in the artifact location %s' % artifact_location)
+
+      temp_dir = tempfile.mkdtemp()
+      temp_json_file = os.path.join(temp_dir, _ATTRIBUTE_FILE_NAME)
+      with open(temp_json_file, 'w+') as f:
+        f.write(jsonpickle.encode(ptransform_list))
+      with open(temp_json_file, 'rb') as f:
+        from apache_beam.runners.dataflow.internal import apiclient
+        _LOGGER.info('Creating artifact location: %s', artifact_location)
+        apiclient.DataflowApplicationClient(options=options).stage_file(
+            gcs_or_local_path=artifact_location,
+            file_name=_ATTRIBUTE_FILE_NAME,
+            stream=f,
+            mime_type='application/json')
+    else:
+      if not FileSystems.exists(artifact_location):
+        FileSystems.mkdirs(artifact_location)
+      # FileSystems.open() fails if the file does not exist.
+      with open(os.path.join(artifact_location, _ATTRIBUTE_FILE_NAME),
+                'w+') as f:
+        f.write(jsonpickle.encode(ptransform_list))
+
+  @staticmethod
+  def load_attributes(artifact_location):
+    with FileSystems.open(os.path.join(artifact_location,
+                                       _ATTRIBUTE_FILE_NAME),
+                          'rb') as f:
+      return jsonpickle.decode(f.read())
+
+
+_transform_attribute_manager = _JsonPickleTransformAttributeManager
+
+
+class _MLTransformToPTransformMapper:
+  """
+  Takes in a list of data processing transforms compatible with MLTransform
+  and returns a list of PTransforms that are used to run the data
+  processing transforms.
+
+  The _MLTransformToPTransformMapper is responsible for loading and saving
+  the PTransforms, or the attributes of PTransforms, to the artifact
+  location, bridging the gap between the training and inference pipelines.
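+
+  In PRODUCE mode the PTransform list is serialized with jsonpickle and
+  written to the artifact location; in CONSUME mode the same list is loaded
+  back, so an inference pipeline reuses the exact configuration that the
+  training pipeline saved.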
+ """ + def __init__( + self, + transforms: List[Union[BaseOperation, EmbeddingsManager]], + artifact_location: str, + artifact_mode: str, + pipeline_options: Optional[PipelineOptions] = None, + ): + self.transforms = transforms + self._parent_artifact_location = artifact_location + self.artifact_mode = artifact_mode + self.pipeline_options = pipeline_options + + def create_and_save_ptransform_list(self): + ptransform_list = self.create_ptransform_list() + self.save_transforms_in_artifact_location(ptransform_list) + return ptransform_list + + def create_ptransform_list(self): + previous_ptransform_type = None + current_ptransform = None + ptransform_list = [] + for transform in self.transforms: + if not isinstance(transform, PTransformProvider): + raise RuntimeError( + 'Transforms must be instances of PTransformProvider and ' + 'implement get_ptransform_for_processing() method.') + # for each instance of PTransform, create a new artifact location + current_ptransform = transform.get_ptransform_for_processing( + artifact_location=os.path.join( + self._parent_artifact_location, uuid.uuid4().hex[:6]), + artifact_mode=self.artifact_mode) + # Determine if a new ptransform should be added to the list + is_different_type = (type(current_ptransform) != previous_ptransform_type) + if is_different_type or not transform.requires_chaining(): + ptransform_list.append(current_ptransform) + previous_ptransform_type = type(current_ptransform) + + if hasattr(ptransform_list[-1], 'append_transform'): + ptransform_list[-1].append_transform(transform) + + return ptransform_list + + def save_transforms_in_artifact_location(self, ptransform_list): + """ + Save the ptransform references to json file. + """ + _transform_attribute_manager.save_attributes( + ptransform_list=ptransform_list, + artifact_location=self._parent_artifact_location, + options=self.pipeline_options) + + @staticmethod + def load_transforms_from_artifact_location(artifact_location): + return _transform_attribute_manager.load_attributes(artifact_location) + + +class _TextEmbeddingHandler(ModelHandler): + """ + A ModelHandler intended to be work on list[dict[str, str]] inputs. + + The inputs to the model handler are expected to be a list of dicts. + + For example, if the original mode is used with RunInference to take a + PCollection[E] to a PCollection[P], this ModelHandler would take a + PCollection[Dict[str, E]] to a PCollection[Dict[str, P]]. + + _TextEmbeddingHandler will accept an EmbeddingsManager instance, which + contains the details of the model to be loaded and the inference_fn to be + used. The purpose of _TextEmbeddingHandler is to generate embeddings for + text inputs using the EmbeddingsManager instance. + + If the input is not a text column, a RuntimeError will be raised. + + This is an internal class and offers no backwards compatibility guarantees. + + Args: + embeddings_manager: An EmbeddingsManager instance. 
+ """ + def __init__(self, embeddings_manager: EmbeddingsManager): + self.embedding_config = embeddings_manager + self._underlying = self.embedding_config.get_model_handler() + self.columns = self.embedding_config.get_columns_to_apply() + + def load_model(self): + model = self._underlying.load_model() + return model + + def _validate_column_data(self, batch): + if not isinstance(batch[0], (str, bytes)): + raise TypeError('Embeddings can only be generated on text columns.') + + def _validate_batch(self, batch: Sequence[Dict[str, List[str]]]): + if not batch or not isinstance(batch[0], dict): + raise TypeError( + 'Expected data to be dicts, got ' + f'{type(batch[0])} instead.') + + def _process_batch( + self, + dict_batch: Dict[str, List[Any]], + model: ModelT, + inference_args: Optional[Dict[str, Any]]) -> Dict[str, List[Any]]: + result: Dict[str, List[Any]] = collections.defaultdict(list) + for key, batch in dict_batch.items(): + if key in self.columns: + self._validate_column_data(batch) + prediction = self._underlying.run_inference( + batch, model, inference_args) + if isinstance(prediction, np.ndarray): + prediction = prediction.tolist() + result[key] = prediction # type: ignore[assignment] + else: + result[key] = prediction # type: ignore[assignment] + else: + result[key] = batch + return result + + def run_inference( + self, + batch: Sequence[Dict[str, List[str]]], + model: ModelT, + inference_args: Optional[Dict[str, Any]] = None, + ) -> List[Dict[str, Union[List[float], List[str]]]]: + """ + Runs inference on a batch of text inputs. The inputs are expected to be + a list of dicts. Each dict should have the same keys, and the shape + should be of the same size for a single key across the batch. + """ + self._validate_batch(batch) + batch_len = len(batch) + dict_batch = _convert_list_of_dicts_to_dict_of_lists(list_of_dicts=batch) + transformed_batch = self._process_batch(dict_batch, model, inference_args) + return _convert_dict_of_lists_to_lists_of_dict( + dict_of_lists=transformed_batch, batch_length=batch_len) + + def get_metrics_namespace(self) -> str: + return ( + self._underlying.get_metrics_namespace() or + 'BeamML_TextEmbeddingHandler') + + def batch_elements_kwargs(self) -> Mapping[str, Any]: + batch_sizes_map = {} + if self.embedding_config.max_batch_size: + batch_sizes_map['max_batch_size'] = self.embedding_config.max_batch_size + if self.embedding_config.min_batch_size: + batch_sizes_map['min_batch_size'] = self.embedding_config.min_batch_size + return (self._underlying.batch_elements_kwargs() or batch_sizes_map) From 0d34847252c6457a305a704729a4173bdd110c22 Mon Sep 17 00:00:00 2001 From: Anand Inguva Date: Wed, 29 Nov 2023 10:46:20 -0500 Subject: [PATCH 02/52] Add tests for base.py --- .../apache_beam/ml/transforms/base_test.py | 185 ++++++++++++++++-- 1 file changed, 170 insertions(+), 15 deletions(-) diff --git a/sdks/python/apache_beam/ml/transforms/base_test.py b/sdks/python/apache_beam/ml/transforms/base_test.py index 2e447964541ba..1f9e5a85d1c2a 100644 --- a/sdks/python/apache_beam/ml/transforms/base_test.py +++ b/sdks/python/apache_beam/ml/transforms/base_test.py @@ -20,7 +20,11 @@ import tempfile import typing import unittest +from typing import Any +from typing import Dict from typing import List +from typing import Optional +from typing import Sequence import numpy as np from parameterized import param @@ -28,28 +32,30 @@ import apache_beam as beam from apache_beam.metrics.metric import MetricsFilter +from apache_beam.ml.inference.base import ModelHandler +from 
apache_beam.ml.inference.base import RunInference +from apache_beam.ml.transforms import base from apache_beam.testing.util import assert_that from apache_beam.testing.util import equal_to # pylint: disable=wrong-import-order, wrong-import-position, ungrouped-imports try: - from apache_beam.ml.transforms import base from apache_beam.ml.transforms import tft from apache_beam.ml.transforms.tft import TFTOperation except ImportError: tft = None # type: ignore -if tft is None: - raise unittest.SkipTest('tensorflow_transform is not installed') - +try: -class _FakeOperation(TFTOperation): - def __init__(self, name, *args, **kwargs): - super().__init__(*args, **kwargs) - self.name = name + class _FakeOperation(TFTOperation): + def __init__(self, name, *args, **kwargs): + super().__init__(*args, **kwargs) + self.name = name - def apply_transform(self, inputs, output_column_name, **kwargs): - return {output_column_name: inputs} + def apply_transform(self, inputs, output_column_name, **kwargs): + return {output_column_name: inputs} +except: # pylint: disable=bare-except + pass class BaseMLTransformTest(unittest.TestCase): @@ -59,6 +65,7 @@ def setUp(self) -> None: def tearDown(self): shutil.rmtree(self.artifact_location) + @unittest.skipIf(tft is None, 'tft module is not installed.') def test_ml_transform_appends_transforms_to_process_handler_correctly(self): fake_fn_1 = _FakeOperation(name='fake_fn_1', columns=['x']) transforms = [fake_fn_1] @@ -67,12 +74,11 @@ def test_ml_transform_appends_transforms_to_process_handler_correctly(self): ml_transform = ml_transform.with_transform( transform=_FakeOperation(name='fake_fn_2', columns=['x'])) - self.assertEqual(len(ml_transform._process_handler.transforms), 2) - self.assertEqual( - ml_transform._process_handler.transforms[0].name, 'fake_fn_1') - self.assertEqual( - ml_transform._process_handler.transforms[1].name, 'fake_fn_2') + self.assertEqual(len(ml_transform.transforms), 2) + self.assertEqual(ml_transform.transforms[0].name, 'fake_fn_1') + self.assertEqual(ml_transform.transforms[1].name, 'fake_fn_2') + @unittest.skipIf(tft is None, 'tft module is not installed.') def test_ml_transform_on_dict(self): transforms = [tft.ScaleTo01(columns=['x'])] data = [{'x': 1}, {'x': 2}] @@ -91,6 +97,7 @@ def test_ml_transform_on_dict(self): assert_that( actual_output, equal_to(expected_output, equals_fn=np.array_equal)) + @unittest.skipIf(tft is None, 'tft module is not installed.') def test_ml_transform_on_list_dict(self): transforms = [tft.ScaleTo01(columns=['x'])] data = [{'x': [1, 2, 3]}, {'x': [4, 5, 6]}] @@ -162,6 +169,7 @@ def test_ml_transform_on_list_dict(self): }, ), ]) + @unittest.skipIf(tft is None, 'tft module is not installed.') def test_ml_transform_dict_output_pcoll_schema( self, input_data, input_types, expected_dtype): transforms = [tft.ScaleTo01(columns=['x'])] @@ -178,6 +186,7 @@ def test_ml_transform_dict_output_pcoll_schema( if name in expected_dtype: self.assertEqual(expected_dtype[name], typ) + @unittest.skipIf(tft is None, 'tft module is not installed.') def test_ml_transform_fail_for_non_global_windows_in_produce_mode(self): transforms = [tft.ScaleTo01(columns=['x'])] with beam.Pipeline() as p: @@ -193,6 +202,7 @@ def test_ml_transform_fail_for_non_global_windows_in_produce_mode(self): write_artifact_location=self.artifact_location, )) + @unittest.skipIf(tft is None, 'tft module is not installed.') def test_ml_transform_on_multiple_columns_single_transform(self): transforms = [tft.ScaleTo01(columns=['x', 'y'])] data = [{'x': [1, 2, 3], 
'y': [1.0, 10.0, 20.0]}] @@ -217,6 +227,7 @@ def test_ml_transform_on_multiple_columns_single_transform(self): equal_to(expected_output_y, equals_fn=np.array_equal), label='y') + @unittest.skipIf(tft is None, 'tft module is not installed.') def test_ml_transforms_on_multiple_columns_multiple_transforms(self): transforms = [ tft.ScaleTo01(columns=['x']), @@ -245,6 +256,7 @@ def test_ml_transforms_on_multiple_columns_multiple_transforms(self): equal_to(expected_output_y, equals_fn=np.array_equal), label='actual_output_y') + @unittest.skipIf(tft is None, 'tft module is not installed.') def test_mltransform_with_counter(self): transforms = [ tft.ComputeAndApplyVocabulary(columns=['y']), @@ -269,6 +281,149 @@ def test_mltransform_with_counter(self): self.assertEqual( result.metrics().query(mltransform_counter)['counters'][0].result, 1) + def test_non_ptransfrom_provider_class_to_mltransform(self): + class Add: + def __call__(self, x): + return x + 1 + + with self.assertRaisesRegex( + TypeError, 'transform must be a subclass of PTransformProvider'): + with beam.Pipeline() as p: + _ = ( + p + | beam.Create([{ + 'x': 1 + }]) + | base.MLTransform( + write_artifact_location=self.artifact_location).with_transform( + Add())) + + +class FakeModel: + def __call__(self, example: List[str]) -> List[str]: + for i in range(len(example)): + example[i] = example[i][::-1] + return example + + +class FakeModelHandler(ModelHandler): + def run_inference( + self, + batch: Sequence[str], + model: Any, + inference_args: Optional[Dict[str, Any]] = None): + return model(batch) + + def load_model(self): + return FakeModel() + + +class FakeEmbeddingsManager(base.EmbeddingsManager): + def __init__(self, columns): + super().__init__(columns=columns) + + def get_model_handler(self) -> ModelHandler: + return FakeModelHandler() + + def get_ptransform_for_processing(self, **kwargs) -> beam.PTransform: + return (RunInference(model_handler=base._TextEmbeddingHandler(self))) + + +class TextEmbeddingHandlerTest(unittest.TestCase): + def setUp(self) -> None: + self.embedding_conig = FakeEmbeddingsManager(columns=['x']) + self.artifact_location = tempfile.mkdtemp() + + def tearDown(self) -> None: + shutil.rmtree(self.artifact_location) + + def test_handler_with_incompatible_datatype(self): + text_handler = base._TextEmbeddingHandler( + embeddings_manager=self.embedding_conig) + data = [ + ('x', 1), + ('x', 2), + ('x', 3), + ] + with self.assertRaises(TypeError): + text_handler.run_inference(data, None, None) + + def test_handler_with_dict_inputs(self): + data = [ + { + 'x': "Hello world" + }, + { + 'x': "Apache Beam" + }, + ] + expected_data = [{key: value[::-1] + for key, value in d.items()} for d in data] + with beam.Pipeline() as p: + result = ( + p + | beam.Create(data) + | base.MLTransform( + write_artifact_location=self.artifact_location).with_transform( + self.embedding_conig)) + assert_that( + result, + equal_to(expected_data), + ) + + def test_handler_with_batch_sizes(self): + self.embedding_conig.max_batch_size = 100 + self.embedding_conig.min_batch_size = 10 + data = [ + { + 'x': "Hello world" + }, + { + 'x': "Apache Beam" + }, + ] * 100 + expected_data = [{key: value[::-1] + for key, value in d.items()} for d in data] + with beam.Pipeline() as p: + result = ( + p + | beam.Create(data) + | base.MLTransform( + write_artifact_location=self.artifact_location).with_transform( + self.embedding_conig)) + assert_that( + result, + equal_to(expected_data), + ) + + def test_handler_on_multiple_columns(self): + 
self.embedding_conig.columns = ['x', 'y']
+    data = [
+        {
+            'x': "Hello world", 'y': "Apache Beam", 'z': 'unchanged'
+        },
+        {
+            'x': "Apache Beam", 'y': "Hello world", 'z': 'unchanged'
+        },
+    ]
+    expected_data = [{
+        key: (value[::-1] if key in self.embedding_conig.columns else value)
+        for key,
+        value in d.items()
+    } for d in data]
+    with beam.Pipeline() as p:
+      result = (
+          p
+          | beam.Create(data)
+          | base.MLTransform(
+              write_artifact_location=self.artifact_location).with_transform(
+                  self.embedding_conig))
+      assert_that(
+          result,
+          equal_to(expected_data),
+      )
+

 if __name__ == '__main__':
   unittest.main()

From 58b24f6ea63384dd9492a8d83112775eddd6c8d3 Mon Sep 17 00:00:00 2001
From: Anand Inguva
Date: Wed, 29 Nov 2023 10:47:39 -0500
Subject: [PATCH 03/52] Add sentence-transformers

---
 .../ml/transforms/embeddings/__init__.py      |  21 ++
 .../embeddings/sentence_transformer.py        | 128 +++++++++++
 .../embeddings/sentence_transformer_test.py   | 212 ++++++++++++++++++
 3 files changed, 361 insertions(+)
 create mode 100644 sdks/python/apache_beam/ml/transforms/embeddings/__init__.py
 create mode 100644 sdks/python/apache_beam/ml/transforms/embeddings/sentence_transformer.py
 create mode 100644 sdks/python/apache_beam/ml/transforms/embeddings/sentence_transformer_test.py

diff --git a/sdks/python/apache_beam/ml/transforms/embeddings/__init__.py b/sdks/python/apache_beam/ml/transforms/embeddings/__init__.py
new file mode 100644
index 0000000000000..bda6256b79ef4
--- /dev/null
+++ b/sdks/python/apache_beam/ml/transforms/embeddings/__init__.py
@@ -0,0 +1,21 @@
+#
+# Licensed to the Apache Software Foundation (ASF) under one or more
+# contributor license agreements. See the NOTICE file distributed with
+# this work for additional information regarding copyright ownership.
+# The ASF licenses this file to You under the Apache License, Version 2.0
+# (the "License"); you may not use this file except in compliance with
+# the License. You may obtain a copy of the License at
+#
+#    http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+# TODO: Add dead letter queue for RunInference transforms.
+
+"""
+This module contains embedding configs that can be used to generate
+embeddings using MLTransform.
+"""
diff --git a/sdks/python/apache_beam/ml/transforms/embeddings/sentence_transformer.py b/sdks/python/apache_beam/ml/transforms/embeddings/sentence_transformer.py
new file mode 100644
index 0000000000000..5b31dbca00820
--- /dev/null
+++ b/sdks/python/apache_beam/ml/transforms/embeddings/sentence_transformer.py
@@ -0,0 +1,128 @@
+#
+# Licensed to the Apache Software Foundation (ASF) under one or more
+# contributor license agreements. See the NOTICE file distributed with
+# this work for additional information regarding copyright ownership.
+# The ASF licenses this file to You under the Apache License, Version 2.0
+# (the "License"); you may not use this file except in compliance with
+# the License. 
You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +__all__ = ["SentenceTransformerEmbeddings"] + +from typing import Any +from typing import Callable +from typing import Dict +from typing import List +from typing import Mapping +from typing import Optional +from typing import Sequence + +import apache_beam as beam +from apache_beam.ml.inference.base import ModelHandler +from apache_beam.ml.inference.base import RunInference +from apache_beam.ml.transforms.base import EmbeddingsManager +from apache_beam.ml.transforms.base import _TextEmbeddingHandler +from sentence_transformers import SentenceTransformer + + +# TODO: Use HuggingFaceModelHandlerTensor once the import issue is fixed. +# Right now, the hugging face model handler import torch and tensorflow +# at the same time, which adds too much weigth to the container unnecessarily. +class _SentenceTransformerModelHandler(ModelHandler): + """ + Note: Intended for internal use and guarantees no backwards compatibility. + """ + def __init__( + self, + model_name: str, + model_class: Callable, + load_model_args: Optional[dict] = None, + min_batch_size: Optional[int] = None, + max_batch_size: Optional[int] = None, + max_seq_length: Optional[int] = None, + large_model: bool = False, + **kwargs): + self._max_seq_length = max_seq_length + self._model_uri = model_name + self._model_class = model_class + self._load_model_args = load_model_args + self._min_batch_size = min_batch_size + self._max_batch_size = max_batch_size + self._large_model = large_model + self._kwargs = kwargs + + def run_inference( + self, + batch: Sequence[str], + model: SentenceTransformer, + inference_args: Optional[Dict[str, Any]] = None, + ): + inference_args = inference_args or {} + return model.encode(batch, **inference_args) + + def load_model(self): + model = self._model_class(self._model_uri) + if self._max_seq_length: + model.max_seq_length = self._max_seq_length + return model + + def share_model_across_processes(self) -> bool: + return self._large_model + + def batch_elements_kwargs(self) -> Mapping[str, Any]: + batch_sizes = {} + if self._min_batch_size: + batch_sizes["min_batch_size"] = self._min_batch_size + if self._max_batch_size: + batch_sizes["max_batch_size"] = self._max_batch_size + return batch_sizes + + +class SentenceTransformerEmbeddings(EmbeddingsManager): + def __init__( + self, + model_name: str, + columns: List[str], + max_seq_length: Optional[int] = None, + **kwargs): + """ + Embedding config for sentence-transformers. This config can be used with + MLTransform to embed text data. Models are loaded using the RunInference + PTransform with the help of ModelHandler. + Args: + model_name: Name of the model to use. The model should be hosted on + HuggingFace Hub or compatible with sentence_transformers. + columns: List of columns to be embedded. + max_seq_length: Max sequence length to use for the model if applicable. + min_batch_size: The minimum batch size to be used for inference. + max_batch_size: The maximum batch size to be used for inference. + large_model: Whether to share the model across processes. 
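+
+    A minimal configuration sketch (the model name mirrors the default used
+    in the tests; the column name is a placeholder):
+
+      SentenceTransformerEmbeddings(
+          model_name='sentence-transformers/all-mpnet-base-v2',
+          columns=['text'])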
+ """ + super().__init__(columns, **kwargs) + self.model_name = model_name + self.max_seq_length = max_seq_length + + def get_model_handler(self): + return _SentenceTransformerModelHandler( + model_class=SentenceTransformer, + max_seq_length=self.max_seq_length, + model_name=self.model_name, + load_model_args=self.load_model_args, + min_batch_size=self.min_batch_size, + max_batch_size=self.max_batch_size, + large_model=self.large_model) + + def get_ptransform_for_processing(self, **kwargs) -> beam.PTransform: + # wrap the model handler in a _TextEmbeddingHandler since + # the SentenceTransformerEmbeddings works on text input data. + return (RunInference(model_handler=_TextEmbeddingHandler(self))) + + def requires_chaining(self): + return False diff --git a/sdks/python/apache_beam/ml/transforms/embeddings/sentence_transformer_test.py b/sdks/python/apache_beam/ml/transforms/embeddings/sentence_transformer_test.py new file mode 100644 index 0000000000000..63f401180dc2d --- /dev/null +++ b/sdks/python/apache_beam/ml/transforms/embeddings/sentence_transformer_test.py @@ -0,0 +1,212 @@ +# +# Licensed to the Apache Software Foundation (ASF) under one or more +# contributor license agreements. See the NOTICE file distributed with +# this work for additional information regarding copyright ownership. +# The ASF licenses this file to You under the Apache License, Version 2.0 +# (the "License"); you may not use this file except in compliance with +# the License. You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +import shutil +import tempfile +import unittest + +import apache_beam as beam +from apache_beam.ml.transforms.base import MLTransform + +# pylint: disable=ungrouped-imports +try: + from apache_beam.ml.transforms.embeddings.sentence_transformer import SentenceTransformerEmbeddings +except ImportError: + SentenceTransformerEmbeddings = None # type: ignore + +# pylint: disable=ungrouped-imports +try: + import tensorflow_transform as tft + from apache_beam.ml.transforms.tft import ScaleTo01 +except ImportError: + tft = None + +test_query = "This is a test" +test_query_column = "feature_1" +DEFAULT_MODEL_NAME = "sentence-transformers/all-mpnet-base-v2" + + +def get_pipeline_wth_embedding_config( + pipeline: beam.Pipeline, embedding_config, artifact_location): + transformed_pcoll = ( + pipeline + | "CreateData" >> beam.Create([{ + test_query_column: test_query + }]) + | "MLTransform" >> MLTransform(write_artifact_location=artifact_location). 
+ with_transform(embedding_config)) + return transformed_pcoll + + +@unittest.skipIf( + SentenceTransformerEmbeddings is None, + 'sentence-transformers is not installed.') +class SentenceTrasformerEmbeddingsTest(unittest.TestCase): + def setUp(self) -> None: + self.artifact_location = tempfile.mkdtemp() + + def tearDown(self) -> None: + shutil.rmtree(self.artifact_location) + + def test_sentence_transformer_embeddings(self): + model_name = DEFAULT_MODEL_NAME + embedding_config = SentenceTransformerEmbeddings( + model_name=model_name, columns=[test_query_column]) + with beam.Pipeline() as pipeline: + result_pcoll = get_pipeline_wth_embedding_config( + pipeline=pipeline, + embedding_config=embedding_config, + artifact_location=self.artifact_location) + + def assert_element(element): + assert len(element[test_query_column]) == 768 + + _ = (result_pcoll | beam.Map(assert_element)) + + @unittest.skipIf(tft is None, 'Tensorflow Transform is not installed.') + def test_embeddings_with_scale_to_0_1(self): + model_name = DEFAULT_MODEL_NAME + embedding_config = SentenceTransformerEmbeddings( + model_name=model_name, + columns=[test_query_column], + ) + with beam.Pipeline() as pipeline: + transformed_pcoll = ( + pipeline + | "CreateData" >> beam.Create([{ + test_query_column: test_query + }]) + | "MLTransform" >> MLTransform( + write_artifact_location=self.artifact_location).with_transform( + embedding_config).with_transform( + ScaleTo01(columns=[test_query_column]))) + + def assert_element(element): + assert max(element.feature_1) == 1 + + _ = (transformed_pcoll | beam.Map(assert_element)) + + def pipeline_with_configurable_artifact_location( + self, + pipeline, + embedding_config=None, + read_artifact_location=None, + write_artifact_location=None): + if write_artifact_location: + return ( + pipeline + | MLTransform(write_artifact_location=write_artifact_location). 
+ with_transform(embedding_config)) + elif read_artifact_location: + return ( + pipeline + | MLTransform(read_artifact_location=read_artifact_location)) + else: + raise NotImplementedError + + def test_embeddings_with_read_artifact_location(self): + with beam.Pipeline() as p: + model_name = DEFAULT_MODEL_NAME + embedding_config = SentenceTransformerEmbeddings( + model_name=model_name, columns=[test_query_column]) + + with beam.Pipeline() as p: + data = ( + p + | "CreateData" >> beam.Create([{ + test_query_column: test_query + }])) + _ = self.pipeline_with_configurable_artifact_location( + pipeline=data, + embedding_config=embedding_config, + write_artifact_location=self.artifact_location) + + with beam.Pipeline() as p: + data = ( + p + | "CreateData" >> beam.Create([{ + test_query_column: test_query + }, { + test_query_column: test_query + }])) + result_pcoll = self.pipeline_with_configurable_artifact_location( + pipeline=data, read_artifact_location=self.artifact_location) + + def assert_element(element): + assert round(element, 2) == 0.13 + + _ = ( + result_pcoll + | beam.Map(lambda x: max(x[test_query_column])) + # 0.1342099905014038 + | beam.Map(assert_element)) + + def test_sentence_transformer_with_int_data_types(self): + model_name = DEFAULT_MODEL_NAME + embedding_config = SentenceTransformerEmbeddings( + model_name=model_name, columns=[test_query_column]) + with self.assertRaises(TypeError): + with beam.Pipeline() as pipeline: + _ = ( + pipeline + | "CreateData" >> beam.Create([{ + test_query_column: 1 + }]) + | "MLTransform" >> MLTransform( + write_artifact_location=self.artifact_location).with_transform( + embedding_config)) + + def test_with_gcs_artifact_location(self): + artifact_location = ('gs://apache-beam-ml/testing/sentence_transformers') + with beam.Pipeline() as p: + model_name = DEFAULT_MODEL_NAME + embedding_config = SentenceTransformerEmbeddings( + model_name=model_name, columns=[test_query_column]) + + with beam.Pipeline() as p: + data = ( + p + | "CreateData" >> beam.Create([{ + test_query_column: test_query + }])) + _ = self.pipeline_with_configurable_artifact_location( + pipeline=data, + embedding_config=embedding_config, + write_artifact_location=artifact_location) + + with beam.Pipeline() as p: + data = ( + p + | "CreateData" >> beam.Create([{ + test_query_column: test_query + }, { + test_query_column: test_query + }])) + result_pcoll = self.pipeline_with_configurable_artifact_location( + pipeline=data, read_artifact_location=artifact_location) + + def assert_element(element): + assert round(element, 2) == 0.13 + + _ = ( + result_pcoll + | beam.Map(lambda x: max(x[test_query_column])) + # 0.1342099905014038 + | beam.Map(assert_element)) + + +if __name__ == '__main__': + unittest.main() From 88f9ceb61bced33c68e3f2790e37d7ff15b6508c Mon Sep 17 00:00:00 2001 From: Anand Inguva Date: Wed, 29 Nov 2023 10:47:53 -0500 Subject: [PATCH 04/52] Add tensorflow hub --- .../transforms/embeddings/tensorflow_hub.py | 124 +++++++++++ .../embeddings/tensorflow_hub_test.py | 198 ++++++++++++++++++ 2 files changed, 322 insertions(+) create mode 100644 sdks/python/apache_beam/ml/transforms/embeddings/tensorflow_hub.py create mode 100644 sdks/python/apache_beam/ml/transforms/embeddings/tensorflow_hub_test.py diff --git a/sdks/python/apache_beam/ml/transforms/embeddings/tensorflow_hub.py b/sdks/python/apache_beam/ml/transforms/embeddings/tensorflow_hub.py new file mode 100644 index 0000000000000..62bd00e10359a --- /dev/null +++ 
b/sdks/python/apache_beam/ml/transforms/embeddings/tensorflow_hub.py @@ -0,0 +1,124 @@ +# +# Licensed to the Apache Software Foundation (ASF) under one or more +# contributor license agreements. See the NOTICE file distributed with +# this work for additional information regarding copyright ownership. +# The ASF licenses this file to You under the Apache License, Version 2.0 +# (the "License"); you may not use this file except in compliance with +# the License. You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +from typing import Iterable +from typing import List +from typing import Optional + +import apache_beam as beam +import tensorflow as tf +import tensorflow_hub as hub +import tensorflow_text as text # required to register TF ops. # pylint: disable=unused-import +from apache_beam.ml.inference import utils +from apache_beam.ml.inference.base import ModelHandler +from apache_beam.ml.inference.base import PredictionResult +from apache_beam.ml.inference.base import RunInference +from apache_beam.ml.inference.tensorflow_inference import TFModelHandlerTensor +from apache_beam.ml.inference.tensorflow_inference import default_tensor_inference_fn +from apache_beam.ml.transforms.base import EmbeddingsManager +from apache_beam.ml.transforms.base import _TextEmbeddingHandler + +__all__ = ['TensorflowHubTextEmbeddings'] + + +class _TensorflowHubModelHandler(TFModelHandlerTensor): + """ + Note: Intended for internal use only. No backwards compatibility guarantees. + """ + def __init__(self, preprocessing_url: Optional[str], *args, **kwargs): + self.preprocessing_url = preprocessing_url + super().__init__(*args, **kwargs) + + def load_model(self): + # unable to load the models with tf.keras.models.load_model so + # using hub.KerasLayer instead + model = hub.KerasLayer(self._model_uri) + return model + + def _convert_prediction_result_to_list( + self, predictions: Iterable[PredictionResult]): + result = [] + for prediction in predictions: + inference = prediction.inference.numpy().tolist() + result.append(inference) + return result + + def run_inference(self, batch, model, inference_args, model_id=None): + if not inference_args: + inference_args = {} + if not self.preprocessing_url: + predictions = default_tensor_inference_fn( + model=model, + batch=batch, + inference_args=inference_args, + model_id=model_id) + return self._convert_prediction_result_to_list(predictions) + + vectorized_batch = tf.stack(batch, axis=0) + preprocessor_fn = hub.KerasLayer(self.preprocessing_url) + vectorized_batch = preprocessor_fn(vectorized_batch) + predictions = model(vectorized_batch) + # https://www.tensorflow.org/text/tutorials/classify_text_with_bert#using_the_bert_model # pylint: disable=line-too-long + # pooled_output -> represents the text as a whole. This is an embeddings + # of the whole text. The shape is [batch_size, embedding_dimension] + # sequence_output -> represents the text as a sequence of tokens. This is + # an embeddings of each token in the text. The shape is + # [batch_size, max_sequence_length, embedding_dimension] + # pooled output is the embeedings as per the documentation. so let's use + # that. 
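+    # embeddings below has shape [batch_size, embedding_dimension];
+    # _convert_to_result pairs each input string with its vector so the
+    # final flattening step keeps inputs and embeddings aligned.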
+ embeddings = predictions['pooled_output'] + predictions = utils._convert_to_result(batch, embeddings, model_id) + return self._convert_prediction_result_to_list(predictions) + + +class TensorflowHubTextEmbeddings(EmbeddingsManager): + def __init__( + self, + columns: List[str], + hub_url: str, + preprocessing_url: Optional[str] = None, + **kwargs): + super().__init__(columns=columns, **kwargs) + self.model_uri = hub_url + self.preprocessing_url = preprocessing_url + """ + Embedding config for tensorflow hub models. This config can be used with + MLTransform to embed text data. Models are loaded using the RunInference + PTransform with the help of a ModelHandler. + + Args: + columns: The columns containing the text to be embedded. + hub_url: The url of the tensorflow hub model. + preprocessing_url: The url of the preprocessing model. This is optional. + If provided, the preprocessing model will be used to preprocess the + text before feeding it to the main model. + min_batch_size: The minimum batch size to be used for inference. + max_batch_size: The maximum batch size to be used for inference. + large_model: Whether to share the model across processes. + """ + + def get_model_handler(self) -> ModelHandler: + # override the default inference function + return _TensorflowHubModelHandler( + model_uri=self.model_uri, + preprocessing_url=self.preprocessing_url, + min_batch_size=self.min_batch_size, + max_batch_size=self.max_batch_size, + large_model=self.large_model, + ) + + def get_ptransform_for_processing(self, **kwargs) -> beam.PTransform: + return (RunInference(model_handler=_TextEmbeddingHandler(self))) diff --git a/sdks/python/apache_beam/ml/transforms/embeddings/tensorflow_hub_test.py b/sdks/python/apache_beam/ml/transforms/embeddings/tensorflow_hub_test.py new file mode 100644 index 0000000000000..6b918153945ae --- /dev/null +++ b/sdks/python/apache_beam/ml/transforms/embeddings/tensorflow_hub_test.py @@ -0,0 +1,198 @@ +# +# Licensed to the Apache Software Foundation (ASF) under one or more +# contributor license agreements. See the NOTICE file distributed with +# this work for additional information regarding copyright ownership. +# The ASF licenses this file to You under the Apache License, Version 2.0 +# (the "License"); you may not use this file except in compliance with +# the License. You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+ +import shutil +import tempfile +import unittest + +import apache_beam as beam +from apache_beam.ml.transforms.base import MLTransform + +hub_url = 'https://tfhub.dev/google/LEALLA/LEALLA-small/1' +test_query_column = 'test_query' +test_query = 'This is a test query' + +# pylint: disable=ungrouped-imports +try: + import tensorflow as tf # disable=unused-import + from apache_beam.ml.transforms.embeddings.tensorflow_hub import TensorflowHubTextEmbeddings +except ImportError: + tf = None + +try: + from apache_beam.ml.transforms.tft import ScaleTo01 +except ImportError: + ScaleTo01 = None # type: ignore + + +@unittest.skipIf(tf is None, 'Tensorflow is not installed.') +class TFHubEmbeddingsTest(unittest.TestCase): + def setUp(self) -> None: + self.artifact_location = tempfile.mkdtemp() + + def tearDown(self) -> None: + shutil.rmtree(self.artifact_location) + + def test_tfhub_text_embeddings(self): + embedding_config = TensorflowHubTextEmbeddings( + hub_url=hub_url, columns=[test_query_column]) + with beam.Pipeline() as pipeline: + transformed_pcoll = ( + pipeline + | "CreateData" >> beam.Create([{ + test_query_column: test_query + }]) + | "MLTransform" >> MLTransform( + write_artifact_location=self.artifact_location).with_transform( + embedding_config)) + + def assert_element(element): + assert len(element[test_query_column]) == 128 + + _ = (transformed_pcoll | beam.Map(assert_element)) + + @unittest.skipIf(ScaleTo01 is None, 'Tensorflow Transform is not installed.') + def test_embeddings_with_scale_to_0_1(self): + embedding_config = TensorflowHubTextEmbeddings( + hub_url=hub_url, + columns=[test_query_column], + ) + with beam.Pipeline() as pipeline: + transformed_pcoll = ( + pipeline + | "CreateData" >> beam.Create([{ + test_query_column: test_query + }]) + | "MLTransform" >> MLTransform( + write_artifact_location=self.artifact_location).with_transform( + embedding_config).with_transform( + ScaleTo01(columns=[test_query_column]))) + + def assert_element(element): + assert max(element[test_query_column]) == 1 + + _ = ( + transformed_pcoll | beam.Map(lambda x: x.as_dict()) + | beam.Map(assert_element)) + + def pipeline_with_configurable_artifact_location( + self, + pipeline, + embedding_config=None, + read_artifact_location=None, + write_artifact_location=None): + if write_artifact_location: + return ( + pipeline + | MLTransform(write_artifact_location=write_artifact_location). 
+ with_transform(embedding_config)) + elif read_artifact_location: + return ( + pipeline + | MLTransform(read_artifact_location=read_artifact_location)) + else: + raise NotImplementedError + + def test_embeddings_with_read_artifact_location(self): + with beam.Pipeline() as p: + embedding_config = TensorflowHubTextEmbeddings( + hub_url=hub_url, columns=[test_query_column]) + + with beam.Pipeline() as p: + data = ( + p + | "CreateData" >> beam.Create([{ + test_query_column: test_query + }])) + _ = self.pipeline_with_configurable_artifact_location( + pipeline=data, + embedding_config=embedding_config, + write_artifact_location=self.artifact_location) + + with beam.Pipeline() as p: + data = ( + p + | "CreateData" >> beam.Create([{ + test_query_column: test_query + }, { + test_query_column: test_query + }])) + result_pcoll = self.pipeline_with_configurable_artifact_location( + pipeline=data, read_artifact_location=self.artifact_location) + + def assert_element(element): + assert round(element, 2) == 0.21 + + _ = ( + result_pcoll + | beam.Map(lambda x: max(x[test_query_column])) + # 0.14797046780586243 + | beam.Map(assert_element)) + + def test_with_int_data_types(self): + embedding_config = TensorflowHubTextEmbeddings( + hub_url=hub_url, columns=[test_query_column]) + with self.assertRaises(TypeError): + with beam.Pipeline() as pipeline: + _ = ( + pipeline + | "CreateData" >> beam.Create([{ + test_query_column: 1 + }]) + | "MLTransform" >> MLTransform( + write_artifact_location=self.artifact_location).with_transform( + embedding_config)) + + def test_with_gcs_artifact_location(self): + artifact_location = 'gs://apache-beam-ml/testing/tensorflow_hub' + with beam.Pipeline() as p: + embedding_config = TensorflowHubTextEmbeddings( + hub_url=hub_url, columns=[test_query_column]) + + with beam.Pipeline() as p: + data = ( + p + | "CreateData" >> beam.Create([{ + test_query_column: test_query + }])) + _ = self.pipeline_with_configurable_artifact_location( + pipeline=data, + embedding_config=embedding_config, + write_artifact_location=artifact_location) + + with beam.Pipeline() as p: + data = ( + p + | "CreateData" >> beam.Create([{ + test_query_column: test_query + }, { + test_query_column: test_query + }])) + result_pcoll = self.pipeline_with_configurable_artifact_location( + pipeline=data, read_artifact_location=artifact_location) + + def assert_element(element): + assert round(element, 2) == 0.21 + + _ = ( + result_pcoll + | beam.Map(lambda x: max(x[test_query_column])) + # 0.14797046780586243 + | beam.Map(assert_element)) + + +if __name__ == '__main__': + unittest.main() From 23f70278e2bf5fd0f34cf5f496184906522b59f3 Mon Sep 17 00:00:00 2001 From: Anand Inguva Date: Wed, 29 Nov 2023 10:48:22 -0500 Subject: [PATCH 05/52] Add vertex_ai --- .../ml/transforms/embeddings/vertex_ai.py | 160 ++++++++++++++ .../transforms/embeddings/vertex_ai_test.py | 197 ++++++++++++++++++ 2 files changed, 357 insertions(+) create mode 100644 sdks/python/apache_beam/ml/transforms/embeddings/vertex_ai.py create mode 100644 sdks/python/apache_beam/ml/transforms/embeddings/vertex_ai_test.py diff --git a/sdks/python/apache_beam/ml/transforms/embeddings/vertex_ai.py b/sdks/python/apache_beam/ml/transforms/embeddings/vertex_ai.py new file mode 100644 index 0000000000000..31f5240093441 --- /dev/null +++ b/sdks/python/apache_beam/ml/transforms/embeddings/vertex_ai.py @@ -0,0 +1,160 @@ +# +# Licensed to the Apache Software Foundation (ASF) under one or more +# contributor license agreements. 
See the NOTICE file distributed with +# this work for additional information regarding copyright ownership. +# The ASF licenses this file to You under the Apache License, Version 2.0 +# (the "License"); you may not use this file except in compliance with +# the License. You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + + +# Vertex AI Python SDK is required for this module. +# Follow https://cloud.google.com/vertex-ai/docs/python-sdk/use-vertex-ai-python-sdk # pylint: disable=line-too-long +# to install Vertex AI Python SDK. + +from typing import Any +from typing import Dict +from typing import Iterable +from typing import List +from typing import Optional +from typing import Sequence + +from google.auth.credentials import Credentials + +import apache_beam as beam +import vertexai +from apache_beam.ml.inference.base import ModelHandler +from apache_beam.ml.inference.base import RunInference +from apache_beam.ml.transforms.base import EmbeddingsManager +from apache_beam.ml.transforms.base import _TextEmbeddingHandler +from vertexai.language_models import TextEmbeddingInput +from vertexai.language_models import TextEmbeddingModel + +__all__ = ["VertexAITextEmbeddings"] + +TASK_TYPE = "RETRIEVAL_DOCUMENT" +TASK_TYPE_INPUTS = [ + "RETRIEVAL_DOCUMENT", + "RETRIEVAL_QUERY", + "SEMANTIC_SIMILARITY", + "CLASSIFICATION", + "CLUSTERING" +] + + +class _VertexAITextEmbeddingHandler(ModelHandler): + """ + Note: Intended for internal use and guarantees no backwards compatibility. + """ + def __init__( + self, + model_name: str, + title: Optional[str] = None, + task_type: str = TASK_TYPE, + project: Optional[str] = None, + location: Optional[str] = None, + credentials: Optional[Credentials] = None, + ): + vertexai.init(project=project, location=location, credentials=credentials) + self.model_name = model_name + if task_type not in TASK_TYPE_INPUTS: + raise ValueError( + f"task_type must be one of {TASK_TYPE_INPUTS}, got {task_type}") + self.task_type = task_type + self.title = title + + def run_inference( + self, + batch: Sequence[str], + model: Any, + inference_args: Optional[Dict[str, Any]] = None, + ) -> Iterable: + embeddings = [] + batch_size = 5 # Vertex AI limits requests to 5 at a time. 
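+    # Process the batch in slices of at most batch_size texts, issuing one
+    # model call per slice, so callers can hand in batches larger than the
+    # service limit.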
+ for i in range(0, len(batch), batch_size): + text_batch = batch[i:i + batch_size] + text_batch = [ + TextEmbeddingInput( + text=text, title=self.title, task_type=self.task_type) + for text in text_batch + ] + embeddings_batch = model.get_embeddings(text_batch) + embeddings.extend([el.values for el in embeddings_batch]) + return embeddings + + def load_model(self): + model = TextEmbeddingModel.from_pretrained(self.model_name) + return model + + +class VertexAITextEmbeddings(EmbeddingsManager): + def __init__( + self, + model_name: str, + columns: List[str], + title: Optional[str] = None, + task_type: str = TASK_TYPE, + project: Optional[str] = None, + location: Optional[str] = None, + credentials: Optional[Credentials] = None, + **kwargs, + ): + """ + Embedding Config for Vertex AI Text Embedding models following + https://cloud.google.com/vertex-ai/docs/generative-ai/embeddings/get-text-embeddings # pylint: disable=line-too-long + + Text Embeddings are generated for a batch of text using the Vertex AI SDK. + Embeddings are returned in a list for each text in the batch. Look at + https://cloud.google.com/vertex-ai/docs/generative-ai/learn/model-versioning#stable-versions-available.md # pylint: disable=line-too-long + for more information on model versions and lifecycle. + + Args: + model_name: The name of the Vertex AI Text Embedding model. + columns: The columns containing the text to be embedded. + task_type: The name of the downstream task the embeddings will be used for. + Valid values: + RETRIEVAL_QUERY + Specifies the given text is a query in a search/retrieval setting. + RETRIEVAL_DOCUMENT + Specifies the given text is a document from the corpus being searched. + SEMANTIC_SIMILARITY + Specifies the given text will be used for STS. + CLASSIFICATION + Specifies that the given text will be classified. + CLUSTERING + Specifies that the embeddings will be used for clustering. + title: Optional identifier of the text content. + project: The default GCP project to make Vertex API calls. + location: The default location to use when making API calls. + credentials: The default custom + credentials to use when making API calls. If not provided credentials + will be ascertained from the environment. + + """ + self.model_name = model_name + self.project = project + self.location = location + self.credentials = credentials + self.title = title + self.task_type = task_type + super().__init__(columns=columns, **kwargs) + + def get_model_handler(self) -> ModelHandler: + return _VertexAITextEmbeddingHandler( + model_name=self.model_name, + project=self.project, + location=self.location, + credentials=self.credentials, + title=self.title, + task_type=self.task_type, + ) + + def get_ptransform_for_processing(self, **kwargs) -> beam.PTransform: + return (RunInference(model_handler=_TextEmbeddingHandler(self))) diff --git a/sdks/python/apache_beam/ml/transforms/embeddings/vertex_ai_test.py b/sdks/python/apache_beam/ml/transforms/embeddings/vertex_ai_test.py new file mode 100644 index 0000000000000..7124aab9cbf23 --- /dev/null +++ b/sdks/python/apache_beam/ml/transforms/embeddings/vertex_ai_test.py @@ -0,0 +1,197 @@ +# +# Licensed to the Apache Software Foundation (ASF) under one or more +# contributor license agreements. See the NOTICE file distributed with +# this work for additional information regarding copyright ownership. +# The ASF licenses this file to You under the Apache License, Version 2.0 +# (the "License"); you may not use this file except in compliance with +# the License. 
You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +import shutil +import tempfile +import unittest + +import apache_beam as beam +from apache_beam.ml.transforms.base import MLTransform + +try: + from apache_beam.ml.transforms.embeddings.vertex_ai import VertexAITextEmbeddings +except ImportError: + VertexAITextEmbeddings = None # type: ignore + +# pylint: disable=ungrouped-imports +try: + import tensorflow_transform as tft + from apache_beam.ml.transforms.tft import ScaleTo01 +except ImportError: + tft = None + +test_query = "This is a test" +test_query_column = "feature_1" +model_name: str = "textembedding-gecko@002" + + +@unittest.skipIf( + VertexAITextEmbeddings is None, 'Vertex AI Python SDK is not installed.') +class VertexAIEmbeddingsTest(unittest.TestCase): + def setUp(self) -> None: + self.artifact_location = tempfile.mkdtemp() + + def tearDown(self) -> None: + shutil.rmtree(self.artifact_location) + + def test_vertex_ai_text_embeddings(self): + embedding_config = VertexAITextEmbeddings( + model_name=model_name, columns=[test_query_column]) + with beam.Pipeline() as pipeline: + transformed_pcoll = ( + pipeline + | "CreateData" >> beam.Create([{ + test_query_column: test_query + }]) + | "MLTransform" >> MLTransform( + write_artifact_location=self.artifact_location).with_transform( + embedding_config)) + + def assert_element(element): + assert len(element[test_query_column]) == 768 + + _ = (transformed_pcoll | beam.Map(assert_element)) + + @unittest.skipIf(tft is None, 'Tensorflow Transform is not installed.') + def test_embeddings_with_scale_to_0_1(self): + embedding_config = VertexAITextEmbeddings( + model_name=model_name, + columns=[test_query_column], + ) + with beam.Pipeline() as pipeline: + transformed_pcoll = ( + pipeline + | "CreateData" >> beam.Create([{ + test_query_column: test_query + }]) + | "MLTransform" >> MLTransform( + write_artifact_location=self.artifact_location).with_transform( + embedding_config).with_transform( + ScaleTo01(columns=[test_query_column]))) + + def assert_element(element): + assert max(element.feature_1) == 1 + + _ = (transformed_pcoll | beam.Map(assert_element)) + + def pipeline_with_configurable_artifact_location( + self, + pipeline, + embedding_config=None, + read_artifact_location=None, + write_artifact_location=None): + if write_artifact_location: + return ( + pipeline + | MLTransform(write_artifact_location=write_artifact_location). 
+ with_transform(embedding_config)) + elif read_artifact_location: + return ( + pipeline + | MLTransform(read_artifact_location=read_artifact_location)) + else: + raise NotImplementedError + + def test_embeddings_with_read_artifact_location(self): + with beam.Pipeline() as p: + embedding_config = VertexAITextEmbeddings( + model_name=model_name, columns=[test_query_column]) + + with beam.Pipeline() as p: + data = ( + p + | "CreateData" >> beam.Create([{ + test_query_column: test_query + }])) + _ = self.pipeline_with_configurable_artifact_location( + pipeline=data, + embedding_config=embedding_config, + write_artifact_location=self.artifact_location) + + with beam.Pipeline() as p: + data = ( + p + | "CreateData" >> beam.Create([{ + test_query_column: test_query + }, { + test_query_column: test_query + }])) + result_pcoll = self.pipeline_with_configurable_artifact_location( + pipeline=data, read_artifact_location=self.artifact_location) + + def assert_element(element): + assert round(element, 2) == 0.15 + + _ = ( + result_pcoll + | beam.Map(lambda x: max(x[test_query_column])) + # 0.14797046780586243 + | beam.Map(assert_element)) + + def test_with_int_data_types(self): + embedding_config = VertexAITextEmbeddings( + model_name=model_name, columns=[test_query_column]) + with self.assertRaises(TypeError): + with beam.Pipeline() as pipeline: + _ = ( + pipeline + | "CreateData" >> beam.Create([{ + test_query_column: 1 + }]) + | "MLTransform" >> MLTransform( + write_artifact_location=self.artifact_location).with_transform( + embedding_config)) + + def test_with_gcs_artifact_location(self): + artifact_location = ('gs://apache-beam-ml/testing/vertex_ai') + with beam.Pipeline() as p: + embedding_config = VertexAITextEmbeddings( + model_name=model_name, columns=[test_query_column]) + + with beam.Pipeline() as p: + data = ( + p + | "CreateData" >> beam.Create([{ + test_query_column: test_query + }])) + _ = self.pipeline_with_configurable_artifact_location( + pipeline=data, + embedding_config=embedding_config, + write_artifact_location=artifact_location) + + with beam.Pipeline() as p: + data = ( + p + | "CreateData" >> beam.Create([{ + test_query_column: test_query + }, { + test_query_column: test_query + }])) + result_pcoll = self.pipeline_with_configurable_artifact_location( + pipeline=data, read_artifact_location=artifact_location) + + def assert_element(element): + assert round(element, 2) == 0.15 + + _ = ( + result_pcoll + | beam.Map(lambda x: max(x[test_query_column])) + # 0.14797046780586243 + | beam.Map(assert_element)) + + +if __name__ == '__main__': + unittest.main() From 04ebdb0a3079e8ba5b5d69af9dbb010f38998493 Mon Sep 17 00:00:00 2001 From: Anand Inguva Date: Wed, 29 Nov 2023 10:48:49 -0500 Subject: [PATCH 06/52] Make TFTProcessHandler a PTransform --- .../apache_beam/ml/transforms/handlers.py | 11 +++++++--- .../ml/transforms/handlers_test.py | 10 ++++----- sdks/python/apache_beam/ml/transforms/tft.py | 22 +++++++++++++++++++ 3 files changed, 35 insertions(+), 8 deletions(-) diff --git a/sdks/python/apache_beam/ml/transforms/handlers.py b/sdks/python/apache_beam/ml/transforms/handlers.py index 8695d5146efae..1a673c51df261 100644 --- a/sdks/python/apache_beam/ml/transforms/handlers.py +++ b/sdks/python/apache_beam/ml/transforms/handlers.py @@ -15,6 +15,7 @@ # limitations under the License. 
# # pytype: skip-file +# pylint: skip-file import collections import hashlib @@ -217,6 +218,9 @@ def __init__( def append_transform(self, transform): self.transforms.append(transform) + def get_transforms(self): + return self.transforms + def _map_column_names_to_types(self, row_type): """ Return a dictionary of column names and types. @@ -319,6 +323,7 @@ def _get_raw_data_feature_spec_per_column( f"Please provide a valid type from the following: " f"{_default_type_to_tensor_type_map.keys()}") return tf.io.VarLenFeature(_default_type_to_tensor_type_map[dtype]) + # return tf.io.VarLenFeature() def get_raw_data_metadata( self, input_types: Dict[str, type]) -> dataset_metadata.DatasetMetadata: @@ -387,7 +392,7 @@ def _get_transformed_data_schema( transformed_types[name] = typing.Sequence[bytes] # type: ignore[assignment] return transformed_types - def process_data( + def expand( self, raw_data: beam.PCollection[tft_process_handler_input_type] ) -> beam.PCollection[tft_process_handler_output_type]: """ @@ -512,7 +517,7 @@ def process_data( # The schema only contains the columns that are transformed. transformed_dataset = ( - transformed_dataset | "ConvertToRowType" >> + transformed_dataset + | "ConvertToRowType" >> beam.Map(lambda x: beam.Row(**x)).with_output_types(row_type)) - return transformed_dataset diff --git a/sdks/python/apache_beam/ml/transforms/handlers_test.py b/sdks/python/apache_beam/ml/transforms/handlers_test.py index 327c8c76c0e9f..d39a1d775f3f2 100644 --- a/sdks/python/apache_beam/ml/transforms/handlers_test.py +++ b/sdks/python/apache_beam/ml/transforms/handlers_test.py @@ -298,7 +298,7 @@ def test_tft_process_handler_verify_artifacts(self): transforms=[tft.ScaleTo01(columns=['x'])], artifact_location=self.artifact_location, ) - _ = process_handler.process_data(raw_data) + _ = raw_data | process_handler self.assertTrue( os.path.exists( @@ -315,7 +315,7 @@ def test_tft_process_handler_verify_artifacts(self): raw_data = (p | beam.Create([{'x': np.array([2, 5])}])) process_handler = handlers.TFTProcessHandler( artifact_location=self.artifact_location, artifact_mode='consume') - transformed_data = process_handler.process_data(raw_data) + transformed_data = raw_data | process_handler transformed_data |= beam.Map(lambda x: x.x) # the previous min is 1 and max is 6. 
So this should scale by (1, 6) @@ -494,7 +494,7 @@ def test_tft_process_handler_unused_column(self): transforms=[scale_to_0_1_fn], artifact_location=self.artifact_location, ) - transformed_pcoll = process_handler.process_data(raw_data) + transformed_pcoll = raw_data | process_handler transformed_pcoll_x = transformed_pcoll | beam.Map(lambda x: x.x) transformed_pcoll_y = transformed_pcoll | beam.Map(lambda x: x.y) assert_that( @@ -520,7 +520,7 @@ def test_consume_mode_with_extra_columns_in_the_input(self): transforms=[tft.ScaleTo01(columns=['x'])], artifact_location=self.artifact_location, ) - _ = process_handler.process_data(raw_data) + _ = raw_data | process_handler test_data = [{ 'x': np.array([2, 5]), 'y': np.array([1, 2]), 'z': 'fake_string' @@ -548,7 +548,7 @@ def test_consume_mode_with_extra_columns_in_the_input(self): raw_data = (p | beam.Create(test_data)) process_handler = handlers.TFTProcessHandler( artifact_location=self.artifact_location, artifact_mode='consume') - transformed_data = process_handler.process_data(raw_data) + transformed_data = raw_data | process_handler transformed_data_x = transformed_data | beam.Map(lambda x: x.x) transformed_data_y = transformed_data | beam.Map(lambda x: x.y) diff --git a/sdks/python/apache_beam/ml/transforms/tft.py b/sdks/python/apache_beam/ml/transforms/tft.py index c7b8ff0153247..8705b79aa309a 100644 --- a/sdks/python/apache_beam/ml/transforms/tft.py +++ b/sdks/python/apache_beam/ml/transforms/tft.py @@ -42,6 +42,7 @@ from typing import Tuple from typing import Union +import apache_beam as beam import tensorflow as tf import tensorflow_transform as tft from apache_beam.ml.transforms.base import BaseOperation @@ -95,6 +96,27 @@ def __init__(self, columns: List[str]) -> None: "Columns are not specified. Please specify the column for the " " op %s" % self.__class__.__name__) + def get_ptransform_for_processing(self, **kwargs) -> beam.PTransform: + from apache_beam.ml.transforms.handlers import TFTProcessHandler + params = {} + artifact_location = kwargs.get('artifact_location') + if not artifact_location: + raise RuntimeError( + "artifact_location is not specified. 
Please specify the " + "artifact_location for the op %s" % self.__class__.__name__) + + transforms = kwargs.get('transforms') + if transforms: + params['transforms'] = transforms + + artifact_mode = kwargs.get('artifact_mode') + if artifact_mode: + params['artifact_mode'] = artifact_mode + return TFTProcessHandler(artifact_location=artifact_location, **params) + + def requires_chaining(self): + return True + @tf.function def _split_string_with_delimiter(self, data, delimiter): """ From f86c259d51e312a91776f4e40c8172645e02a9ff Mon Sep 17 00:00:00 2001 From: Anand Inguva Date: Wed, 29 Nov 2023 10:50:18 -0500 Subject: [PATCH 07/52] raise RuntimeError in ArtifactsFetcher when it is used for embeddings --- sdks/python/apache_beam/ml/transforms/tft_test.py | 7 ++++++- sdks/python/apache_beam/ml/transforms/utils.py | 11 +++++++++-- 2 files changed, 15 insertions(+), 3 deletions(-) diff --git a/sdks/python/apache_beam/ml/transforms/tft_test.py b/sdks/python/apache_beam/ml/transforms/tft_test.py index 38ded6a809af0..9f15db45bd285 100644 --- a/sdks/python/apache_beam/ml/transforms/tft_test.py +++ b/sdks/python/apache_beam/ml/transforms/tft_test.py @@ -711,8 +711,13 @@ def test_count_per_key_on_list(self): ])) def validate_count_per_key(key_vocab_filename): + files = os.listdir(self.artifact_location) + files.remove(base._ATTRIBUTE_FILE_NAME) key_vocab_location = os.path.join( - self.artifact_location, 'transform_fn/assets', key_vocab_filename) + self.artifact_location, + files[0], + 'transform_fn/assets', + key_vocab_filename) with open(key_vocab_location, 'r') as f: key_vocab_list = [line.strip() for line in f] return key_vocab_list diff --git a/sdks/python/apache_beam/ml/transforms/utils.py b/sdks/python/apache_beam/ml/transforms/utils.py index 19bb02c5ae1b9..b66cb4162ce29 100644 --- a/sdks/python/apache_beam/ml/transforms/utils.py +++ b/sdks/python/apache_beam/ml/transforms/utils.py @@ -17,9 +17,11 @@ __all__ = ['ArtifactsFetcher'] +import os import typing import tensorflow_transform as tft +from apache_beam.ml.transforms import base class ArtifactsFetcher(): @@ -28,8 +30,13 @@ class ArtifactsFetcher(): to the TFTProcessHandlers in MLTransform. 
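  Only a single set of TFT artifacts per artifact location is supported;
  when more than one artifact directory is present, a NotImplementedError
  is raised.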
""" def __init__(self, artifact_location): - self.artifact_location = artifact_location - self.transform_output = tft.TFTransformOutput(self.artifact_location) + files = os.listdir(artifact_location) + files.remove(base._ATTRIBUTE_FILE_NAME) + if len(files) > 1: + raise NotImplementedError( + 'Multiple files in artifact location not supported yet.') + self._artifact_location = os.path.join(artifact_location, files[0]) + self.transform_output = tft.TFTransformOutput(self._artifact_location) def get_vocab_list( self, From fc4ec0086be15e43f435eb340942a7f497d874e3 Mon Sep 17 00:00:00 2001 From: Anand Inguva Date: Wed, 29 Nov 2023 10:50:45 -0500 Subject: [PATCH 08/52] Add JsonPickle to requirements --- sdks/python/container/py310/base_image_requirements.txt | 1 + sdks/python/container/py311/base_image_requirements.txt | 1 + sdks/python/container/py38/base_image_requirements.txt | 1 + sdks/python/container/py39/base_image_requirements.txt | 1 + sdks/python/setup.py | 1 + 5 files changed, 5 insertions(+) diff --git a/sdks/python/container/py310/base_image_requirements.txt b/sdks/python/container/py310/base_image_requirements.txt index a9f94104374e0..fc1ce3f28eeab 100644 --- a/sdks/python/container/py310/base_image_requirements.txt +++ b/sdks/python/container/py310/base_image_requirements.txt @@ -82,6 +82,7 @@ idna==3.4 iniconfig==2.0.0 joblib==1.3.2 Js2Py==0.74 +jsonpickle==3.0.2 jsonschema==4.19.1 jsonschema-specifications==2023.7.1 mmh3==4.0.1 diff --git a/sdks/python/container/py311/base_image_requirements.txt b/sdks/python/container/py311/base_image_requirements.txt index 865b856683a4d..7b55936530a09 100644 --- a/sdks/python/container/py311/base_image_requirements.txt +++ b/sdks/python/container/py311/base_image_requirements.txt @@ -79,6 +79,7 @@ idna==3.4 iniconfig==2.0.0 joblib==1.3.2 Js2Py==0.74 +jsonpickle==3.0.2 jsonschema==4.19.1 jsonschema-specifications==2023.7.1 mmh3==4.0.1 diff --git a/sdks/python/container/py38/base_image_requirements.txt b/sdks/python/container/py38/base_image_requirements.txt index 5dffff5f80d9a..fb89284967167 100644 --- a/sdks/python/container/py38/base_image_requirements.txt +++ b/sdks/python/container/py38/base_image_requirements.txt @@ -85,6 +85,7 @@ importlib-resources==6.1.0 iniconfig==2.0.0 joblib==1.3.2 Js2Py==0.74 +jsonpickle==3.0.2 jsonschema==4.19.1 jsonschema-specifications==2023.7.1 mmh3==4.0.1 diff --git a/sdks/python/container/py39/base_image_requirements.txt b/sdks/python/container/py39/base_image_requirements.txt index 1b8ad7a2e748f..c0dcd6baf6a33 100644 --- a/sdks/python/container/py39/base_image_requirements.txt +++ b/sdks/python/container/py39/base_image_requirements.txt @@ -83,6 +83,7 @@ importlib-metadata==6.8.0 iniconfig==2.0.0 joblib==1.3.2 Js2Py==0.74 +jsonpickle==3.0.2 jsonschema==4.19.1 jsonschema-specifications==2023.7.1 mmh3==4.0.1 diff --git a/sdks/python/setup.py b/sdks/python/setup.py index 1785cd75df80b..6c99dad55504d 100644 --- a/sdks/python/setup.py +++ b/sdks/python/setup.py @@ -286,6 +286,7 @@ def get_portability_package_data(): 'httplib2>=0.8,<0.23.0', 'js2py>=0.74,<1', 'jsonschema>=4.0.0,<5.0.0', + 'jsonpickle>=3.0.0,<4.0.0', # numpy can have breaking changes in minor versions. # Use a strict upper bound. 'numpy>=1.14.3,<1.25.0', # Update pyproject.toml as well. 
From 3da5ce836bd752169f1f58daa7ea15bfc77d139f Mon Sep 17 00:00:00 2001 From: Anand Inguva Date: Wed, 29 Nov 2023 10:51:03 -0500 Subject: [PATCH 09/52] Add tox tests --- sdks/python/test-suites/tox/py38/build.gradle | 12 +++++++++ sdks/python/tox.ini | 27 +++++++++++++++++++ 2 files changed, 39 insertions(+) diff --git a/sdks/python/test-suites/tox/py38/build.gradle b/sdks/python/test-suites/tox/py38/build.gradle index b1ed5f88c7c93..c4fd300ca9435 100644 --- a/sdks/python/test-suites/tox/py38/build.gradle +++ b/sdks/python/test-suites/tox/py38/build.gradle @@ -141,6 +141,18 @@ toxTask "testPy38transformers-430", "py38-transformers-430", "${posargs}" test.dependsOn "testPy38transformers-430" preCommitPyCoverage.dependsOn "testPy38transformers-430" +toxTask "testPy38sentenceTransformers-222", "py38-sentence-transformers-222", "${posargs}" +test.dependsOn "testPy38sentenceTransformers-222" +preCommitPyCoverage.dependsOn "testPy38sentenceTransformers-222" + +toxTask "testPy38tensorflowHub-014", "py38-tfhub-014", "${posargs}" +test.dependsOn "testPy38tensorflowHub-014" +preCommitPyCoverage.dependsOn "testPy38tensorflowHub-014" + +toxTask "testPy38tensorflowHub-015", "py38-tfhub-015", "${posargs}" +test.dependsOn "testPy38tensorflowHub-015" +preCommitPyCoverage.dependsOn "testPy38tensorflowHub-015" + toxTask "whitespacelint", "whitespacelint", "${posargs}" task archiveFilesToLint(type: Zip) { diff --git a/sdks/python/tox.ini b/sdks/python/tox.ini index e4cf09cacba40..57533af31a286 100644 --- a/sdks/python/tox.ini +++ b/sdks/python/tox.ini @@ -423,3 +423,30 @@ commands = # Run all Vertex AI unit tests # Allow exit code 5 (no tests run) so that we can run this command safely on arbitrary subdirectories. /bin/sh -c 'pytest -o junit_suite_name={envname} --junitxml=pytest_{envname}.xml -n 6 -m uses_vertex_ai {posargs}; ret=$?; [ $ret = 5 ] && exit 0 || exit $ret' + + +[testenv:py{38,39,310,311}-sentence-transformers-222] +deps = + sentence-transformers==2.2.2 +extras = test,gcp +commands = + # Log aiplatform and its dependencies version for debugging + /bin/sh -c "pip freeze | grep -E sentence-transformers" + # Allow exit code 5 (no tests run) so that we can run this command safely on arbitrary subdirectories. + bash {toxinidir}/scripts/run_pytest.sh {envname} 'apache_beam/ml/transforms/embeddings' + +[testenv:py{38,39,310,311}-tfhub-{014,015}] +deps = + 014: + tensorflow-hub>=0.14.0,<0.15.0 + tensorflow-text + 015: + tensorflow-hub>=0.15.0,<0.16.0 + tensorflow-text + +extras = test,gcp +commands = + # Log aiplatform and its dependencies version for debugging + /bin/sh -c "pip freeze | grep -E tensorflow" + # Allow exit code 5 (no tests run) so that we can run this command safely on arbitrary subdirectories. 
+ bash {toxinidir}/scripts/run_pytest.sh {envname} 'apache_beam/ml/transforms/embeddings' From 4b4ee588c2db1185d37f0a1c38eed1df3d7275b7 Mon Sep 17 00:00:00 2001 From: Anand Inguva Date: Wed, 29 Nov 2023 13:16:08 -0500 Subject: [PATCH 10/52] Mock frameworks in pydocs Fix tox.ini Fix pydoc Fix indent in pydoc --- sdks/python/apache_beam/ml/transforms/base.py | 8 +++++++- .../ml/transforms/embeddings/tensorflow_hub.py | 6 +++--- .../apache_beam/ml/transforms/embeddings/vertex_ai.py | 4 +--- sdks/python/scripts/generate_pydoc.sh | 4 +++- sdks/python/tox.ini | 9 +++------ 5 files changed, 17 insertions(+), 14 deletions(-) diff --git a/sdks/python/apache_beam/ml/transforms/base.py b/sdks/python/apache_beam/ml/transforms/base.py index b8a9beafb9862..4ecbeacb8e672 100644 --- a/sdks/python/apache_beam/ml/transforms/base.py +++ b/sdks/python/apache_beam/ml/transforms/base.py @@ -43,7 +43,13 @@ _LOGGER = logging.getLogger(__name__) _ATTRIBUTE_FILE_NAME = 'attributes.json' -__all__ = ['MLTransform', 'ProcessHandler', 'BaseOperation'] +__all__ = [ + 'MLTransform', + 'ProcessHandler', + 'PTransformProvider', + 'BaseOperation', + 'EmbeddingsManager' +] TransformedDatasetT = TypeVar('TransformedDatasetT') TransformedMetadataT = TypeVar('TransformedMetadataT') diff --git a/sdks/python/apache_beam/ml/transforms/embeddings/tensorflow_hub.py b/sdks/python/apache_beam/ml/transforms/embeddings/tensorflow_hub.py index 62bd00e10359a..4b01f7ec44b9a 100644 --- a/sdks/python/apache_beam/ml/transforms/embeddings/tensorflow_hub.py +++ b/sdks/python/apache_beam/ml/transforms/embeddings/tensorflow_hub.py @@ -91,9 +91,6 @@ def __init__( hub_url: str, preprocessing_url: Optional[str] = None, **kwargs): - super().__init__(columns=columns, **kwargs) - self.model_uri = hub_url - self.preprocessing_url = preprocessing_url """ Embedding config for tensorflow hub models. This config can be used with MLTransform to embed text data. Models are loaded using the RunInference @@ -109,6 +106,9 @@ def __init__( max_batch_size: The maximum batch size to be used for inference. large_model: Whether to share the model across processes. """ + super().__init__(columns=columns, **kwargs) + self.model_uri = hub_url + self.preprocessing_url = preprocessing_url def get_model_handler(self) -> ModelHandler: # override the default inference function diff --git a/sdks/python/apache_beam/ml/transforms/embeddings/vertex_ai.py b/sdks/python/apache_beam/ml/transforms/embeddings/vertex_ai.py index 31f5240093441..e4c6745bb5665 100644 --- a/sdks/python/apache_beam/ml/transforms/embeddings/vertex_ai.py +++ b/sdks/python/apache_beam/ml/transforms/embeddings/vertex_ai.py @@ -104,12 +104,10 @@ def __init__( project: Optional[str] = None, location: Optional[str] = None, credentials: Optional[Credentials] = None, - **kwargs, - ): + **kwargs): """ Embedding Config for Vertex AI Text Embedding models following https://cloud.google.com/vertex-ai/docs/generative-ai/embeddings/get-text-embeddings # pylint: disable=line-too-long - Text Embeddings are generated for a batch of text using the Vertex AI SDK. Embeddings are returned in a list for each text in the batch. 
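    For the textembedding-gecko family of models, each returned embedding is
    a 768-dimensional vector.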
Look at https://cloud.google.com/vertex-ai/docs/generative-ai/learn/model-versioning#stable-versions-available.md # pylint: disable=line-too-long diff --git a/sdks/python/scripts/generate_pydoc.sh b/sdks/python/scripts/generate_pydoc.sh index 06ad06320fcf4..8d5b43167dd11 100755 --- a/sdks/python/scripts/generate_pydoc.sh +++ b/sdks/python/scripts/generate_pydoc.sh @@ -133,7 +133,9 @@ autodoc_inherit_docstrings = False autodoc_member_order = 'bysource' autodoc_mock_imports = ["tensorrt", "cuda", "torch", "onnxruntime", "onnx", "tensorflow", "tensorflow_hub", - "tensorflow_transform", "tensorflow_metadata", "transformers"] + "tensorflow_transform", "tensorflow_metadata", "transformers", "tensorflow_text", + "sentence_transformers", + ] # Allow a special section for documenting DataFrame API napoleon_custom_sections = ['Differences from pandas'] diff --git a/sdks/python/tox.ini b/sdks/python/tox.ini index 57533af31a286..1cea858e8bbc2 100644 --- a/sdks/python/tox.ini +++ b/sdks/python/tox.ini @@ -437,12 +437,9 @@ commands = [testenv:py{38,39,310,311}-tfhub-{014,015}] deps = - 014: - tensorflow-hub>=0.14.0,<0.15.0 - tensorflow-text - 015: - tensorflow-hub>=0.15.0,<0.16.0 - tensorflow-text + 014: tensorflow-hub>=0.14.0,<0.15.0 + 015: tensorflow-hub>=0.15.0,<0.16.0 + tensorflow-text extras = test,gcp commands = From 01ba2175330257745c890b80f6db7f26eae24c62 Mon Sep 17 00:00:00 2001 From: Anand Inguva Date: Mon, 4 Dec 2023 18:25:24 +0000 Subject: [PATCH 11/52] Add Row type check --- sdks/python/apache_beam/typehints/typehints.py | 8 ++++++-- 1 file changed, 6 insertions(+), 2 deletions(-) diff --git a/sdks/python/apache_beam/typehints/typehints.py b/sdks/python/apache_beam/typehints/typehints.py index 4fd4b97e82cde..70eb78b6ffc60 100644 --- a/sdks/python/apache_beam/typehints/typehints.py +++ b/sdks/python/apache_beam/typehints/typehints.py @@ -1020,13 +1020,13 @@ def __getitem__(self, type_param): class CollectionHint(CompositeTypeHint): """ A Collection type-hint. - + Collection[X] defines a type-hint for a collection of homogenous types. 'X' may be either a built-in Python type or another nested TypeConstraint. This represents a collections.abc.Collection type, which implements __contains__, __iter__, and __len__. This acts as a parent type for - sets but has fewer guarantees for mixins. + sets but has fewer guarantees for mixins. """ class CollectionTypeConstraint(SequenceTypeConstraint): def __init__(self, type_param): @@ -1302,6 +1302,8 @@ def is_consistent_with(sub, base): relation, but also handles the special Any type as well as type parameterization. """ + from apache_beam.pvalue import Row + from apache_beam.typehints.row_type import RowTypeConstraint if sub == base: # Common special case. return True @@ -1313,6 +1315,8 @@ def is_consistent_with(sub, base): return all(is_consistent_with(c, base) for c in sub.union_types) elif isinstance(base, TypeConstraint): return base._consistent_with_check_(sub) + elif isinstance(sub, RowTypeConstraint): + return base == Row elif isinstance(sub, TypeConstraint): # Nothing but object lives above any type constraints. 
return base == object From f080c25ca6310cc8a4614d71f23d3632f716bdc7 Mon Sep 17 00:00:00 2001 From: Anand Inguva Date: Mon, 4 Dec 2023 18:29:35 +0000 Subject: [PATCH 12/52] Remove requires_chaining --- sdks/python/apache_beam/ml/transforms/base.py | 26 ++++--------------- .../embeddings/sentence_transformer.py | 5 +--- .../transforms/embeddings/tensorflow_hub.py | 2 +- .../embeddings/tensorflow_hub_test.py | 10 +++---- 4 files changed, 12 insertions(+), 31 deletions(-) diff --git a/sdks/python/apache_beam/ml/transforms/base.py b/sdks/python/apache_beam/ml/transforms/base.py index 4ecbeacb8e672..580c7c7e912d4 100644 --- a/sdks/python/apache_beam/ml/transforms/base.py +++ b/sdks/python/apache_beam/ml/transforms/base.py @@ -93,14 +93,10 @@ class PTransformProvider: Data processing transforms that are intended to be used with MLTransform should subclass PTransformProvider and implement the following methods: 1. get_ptransform_for_processing() - 2. requires_chaining() get_ptransform_for_processing() method should return a PTransform that can be used to process the data. - requires_chaining() method should return True if the data processing - transforms needs to be chained sequentially with compatible data processing - transforms. """ @abc.abstractmethod def get_ptransform_for_processing(self, **kwargs) -> beam.PTransform: @@ -108,13 +104,6 @@ def get_ptransform_for_processing(self, **kwargs) -> beam.PTransform: Returns a PTransform that can be used to process the data. """ - @abc.abstractmethod - def requires_chaining(self): - """ - Returns True if the data processing transforms needs to be chained - sequentially with compatible data processing transforms. - """ - def get_counter(self): """ Returns the counter name for the data processing transform. @@ -196,10 +185,6 @@ def get_model_handler(self) -> ModelHandler: Return framework specific model handler. """ - def requires_chaining(self): - # each embedding config requires a separate PTransform. so no chaining. - return False - def get_columns_to_apply(self): return self.columns @@ -495,15 +480,14 @@ def create_ptransform_list(self): artifact_location=os.path.join( self._parent_artifact_location, uuid.uuid4().hex[:6]), artifact_mode=self.artifact_mode) - # Determine if a new ptransform should be added to the list - is_different_type = (type(current_ptransform) != previous_ptransform_type) - if is_different_type or not transform.requires_chaining(): + append_transform = hasattr(current_ptransform, 'append_transform') + if (type(current_ptransform) != previous_ptransform_type) or not append_transform: ptransform_list.append(current_ptransform) previous_ptransform_type = type(current_ptransform) - - if hasattr(ptransform_list[-1], 'append_transform'): + # If different PTransform is appended to the list and the PTransform + # supports append_transform, append the transform to the PTransform. 
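+      # In practice, consecutive TFT operations collapse into a single
+      # TFTProcessHandler here, while each embedding config contributes its
+      # own RunInference-based PTransform.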
+ if append_transform: ptransform_list[-1].append_transform(transform) - return ptransform_list def save_transforms_in_artifact_location(self, ptransform_list): diff --git a/sdks/python/apache_beam/ml/transforms/embeddings/sentence_transformer.py b/sdks/python/apache_beam/ml/transforms/embeddings/sentence_transformer.py index 5b31dbca00820..f41e24c0f7a4a 100644 --- a/sdks/python/apache_beam/ml/transforms/embeddings/sentence_transformer.py +++ b/sdks/python/apache_beam/ml/transforms/embeddings/sentence_transformer.py @@ -122,7 +122,4 @@ def get_model_handler(self): def get_ptransform_for_processing(self, **kwargs) -> beam.PTransform: # wrap the model handler in a _TextEmbeddingHandler since # the SentenceTransformerEmbeddings works on text input data. - return (RunInference(model_handler=_TextEmbeddingHandler(self))) - - def requires_chaining(self): - return False + return (RunInference(model_handler=_TextEmbeddingHandler(self))) \ No newline at end of file diff --git a/sdks/python/apache_beam/ml/transforms/embeddings/tensorflow_hub.py b/sdks/python/apache_beam/ml/transforms/embeddings/tensorflow_hub.py index 4b01f7ec44b9a..4612ca9d1d425 100644 --- a/sdks/python/apache_beam/ml/transforms/embeddings/tensorflow_hub.py +++ b/sdks/python/apache_beam/ml/transforms/embeddings/tensorflow_hub.py @@ -45,7 +45,7 @@ def __init__(self, preprocessing_url: Optional[str], *args, **kwargs): def load_model(self): # unable to load the models with tf.keras.models.load_model so # using hub.KerasLayer instead - model = hub.KerasLayer(self._model_uri) + model = hub.KerasLayer(self._model_uri, ) return model def _convert_prediction_result_to_list( diff --git a/sdks/python/apache_beam/ml/transforms/embeddings/tensorflow_hub_test.py b/sdks/python/apache_beam/ml/transforms/embeddings/tensorflow_hub_test.py index 6b918153945ae..8c571e0cf4621 100644 --- a/sdks/python/apache_beam/ml/transforms/embeddings/tensorflow_hub_test.py +++ b/sdks/python/apache_beam/ml/transforms/embeddings/tensorflow_hub_test.py @@ -21,7 +21,7 @@ import apache_beam as beam from apache_beam.ml.transforms.base import MLTransform -hub_url = 'https://tfhub.dev/google/LEALLA/LEALLA-small/1' +hub_url = 'https://tfhub.dev/google/nnlm-en-dim128/2' test_query_column = 'test_query' test_query = 'This is a test query' @@ -134,12 +134,12 @@ def test_embeddings_with_read_artifact_location(self): pipeline=data, read_artifact_location=self.artifact_location) def assert_element(element): - assert round(element, 2) == 0.21 + # 0.29836970567703247 + assert round(element, 2) == 0.3 _ = ( result_pcoll | beam.Map(lambda x: max(x[test_query_column])) - # 0.14797046780586243 | beam.Map(assert_element)) def test_with_int_data_types(self): @@ -185,12 +185,12 @@ def test_with_gcs_artifact_location(self): pipeline=data, read_artifact_location=artifact_location) def assert_element(element): - assert round(element, 2) == 0.21 + # 0.29836970567703247 + assert round(element, 2) == 0.3 _ = ( result_pcoll | beam.Map(lambda x: max(x[test_query_column])) - # 0.14797046780586243 | beam.Map(assert_element)) From 6111c31066264f1387395fc8c86d7f061720c921 Mon Sep 17 00:00:00 2001 From: Anand Inguva Date: Mon, 4 Dec 2023 18:39:48 +0000 Subject: [PATCH 13/52] change name of PTransformProvider to MLTransformProvider --- sdks/python/apache_beam/ml/transforms/base.py | 27 ++++++++++--------- .../apache_beam/ml/transforms/base_test.py | 2 +- 2 files changed, 15 insertions(+), 14 deletions(-) diff --git a/sdks/python/apache_beam/ml/transforms/base.py 
b/sdks/python/apache_beam/ml/transforms/base.py index 580c7c7e912d4..096773339bb7e 100644 --- a/sdks/python/apache_beam/ml/transforms/base.py +++ b/sdks/python/apache_beam/ml/transforms/base.py @@ -46,7 +46,7 @@ __all__ = [ 'MLTransform', 'ProcessHandler', - 'PTransformProvider', + 'MLTransformProvider', 'BaseOperation', 'EmbeddingsManager' ] @@ -88,10 +88,10 @@ class ArtifactMode(object): CONSUME = 'consume' -class PTransformProvider: +class MLTransformProvider: """ Data processing transforms that are intended to be used with MLTransform - should subclass PTransformProvider and implement the following methods: + should subclass MLTransformProvider and implement the following methods: 1. get_ptransform_for_processing() get_ptransform_for_processing() method should return a PTransform that can be @@ -113,7 +113,7 @@ def get_counter(self): class BaseOperation(Generic[OperationInputT, OperationOutputT], - PTransformProvider, + MLTransformProvider, abc.ABC): def __init__(self, columns: List[str]) -> None: """ @@ -158,7 +158,7 @@ def append_transform(self, transform: BaseOperation): # TODO: Add support for inference_fn -class EmbeddingsManager(PTransformProvider): +class EmbeddingsManager(MLTransformProvider): def __init__( self, columns: List[str], @@ -326,13 +326,13 @@ def with_transform(self, transform: BaseOperation): return self def _validate_transform(self, transform): - # every data processing transform should subclass PTransformProvider. Raise - # an error if the transform does not subclass PTransformProvider since the + # every data processing transform should subclass MLTransformProvider. Raise + # an error if the transform does not subclass MLTransformProvider since the # downstream code expects the transform to be a subclass of - # PTransformProvider. - if not isinstance(transform, PTransformProvider): + # MLTransformProvider. + if not isinstance(transform, MLTransformProvider): raise TypeError( - 'transform must be a subclass of PTransformProvider and implement ' + 'transform must be a subclass of MLTransformProvider and implement ' 'get_ptransform_for_processing() method.' 'Got: %s instead.' 
% type(transform)) @@ -471,9 +471,9 @@ def create_ptransform_list(self): current_ptransform = None ptransform_list = [] for transform in self.transforms: - if not isinstance(transform, PTransformProvider): + if not isinstance(transform, MLTransformProvider): raise RuntimeError( - 'Transforms must be instances of PTransformProvider and ' + 'Transforms must be instances of MLTransformProvider and ' 'implement get_ptransform_for_processing() method.') # for each instance of PTransform, create a new artifact location current_ptransform = transform.get_ptransform_for_processing( @@ -481,7 +481,8 @@ def create_ptransform_list(self): self._parent_artifact_location, uuid.uuid4().hex[:6]), artifact_mode=self.artifact_mode) append_transform = hasattr(current_ptransform, 'append_transform') - if (type(current_ptransform) != previous_ptransform_type) or not append_transform: + if (type(current_ptransform) != + previous_ptransform_type) or not append_transform: ptransform_list.append(current_ptransform) previous_ptransform_type = type(current_ptransform) # If different PTransform is appended to the list and the PTransform diff --git a/sdks/python/apache_beam/ml/transforms/base_test.py b/sdks/python/apache_beam/ml/transforms/base_test.py index 1f9e5a85d1c2a..e79157cea7565 100644 --- a/sdks/python/apache_beam/ml/transforms/base_test.py +++ b/sdks/python/apache_beam/ml/transforms/base_test.py @@ -287,7 +287,7 @@ def __call__(self, x): return x + 1 with self.assertRaisesRegex( - TypeError, 'transform must be a subclass of PTransformProvider'): + TypeError, 'transform must be a subclass of MLTransformProvider'): with beam.Pipeline() as p: _ = ( p From ba24e81e0994a78791f030ac42abc0a6525966b7 Mon Sep 17 00:00:00 2001 From: Anand Inguva Date: Mon, 4 Dec 2023 18:57:00 +0000 Subject: [PATCH 14/52] remove batch_len in utility fun --- sdks/python/apache_beam/ml/transforms/base.py | 14 +++++++--- .../apache_beam/ml/transforms/base_test.py | 28 +++++++++++++++++++ 2 files changed, 38 insertions(+), 4 deletions(-) diff --git a/sdks/python/apache_beam/ml/transforms/base.py b/sdks/python/apache_beam/ml/transforms/base.py index 096773339bb7e..667c9acb069ad 100644 --- a/sdks/python/apache_beam/ml/transforms/base.py +++ b/sdks/python/apache_beam/ml/transforms/base.py @@ -74,10 +74,16 @@ def _convert_list_of_dicts_to_dict_of_lists( def _convert_dict_of_lists_to_lists_of_dict( - dict_of_lists: Dict[str, List[Any]], - batch_length: int) -> List[Dict[str, Any]]: + dict_of_lists: Dict[str, List[Any]]) -> List[Dict[str, Any]]: + + batch_length = len(next(iter(dict_of_lists.values()))) result: List[Dict[str, Any]] = [{} for _ in range(batch_length)] + # all the values in the dict_of_lists should have same length for key, values in dict_of_lists.items(): + assert len(values) == batch_length, ( + "This function expects all the values " + "in the dict_of_lists to have same length." + ) for i in range(len(values)): result[i][key] = values[i] return result @@ -578,11 +584,11 @@ def run_inference( should be of the same size for a single key across the batch. 
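    For example, [{'a': 'x'}, {'a': 'y'}] is regrouped as {'a': ['x', 'y']}
    before the model call and converted back to a list of dicts afterwards.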
""" self._validate_batch(batch) - batch_len = len(batch) dict_batch = _convert_list_of_dicts_to_dict_of_lists(list_of_dicts=batch) transformed_batch = self._process_batch(dict_batch, model, inference_args) return _convert_dict_of_lists_to_lists_of_dict( - dict_of_lists=transformed_batch, batch_length=batch_len) + dict_of_lists=transformed_batch, + ) def get_metrics_namespace(self) -> str: return ( diff --git a/sdks/python/apache_beam/ml/transforms/base_test.py b/sdks/python/apache_beam/ml/transforms/base_test.py index e79157cea7565..4e73a915adc50 100644 --- a/sdks/python/apache_beam/ml/transforms/base_test.py +++ b/sdks/python/apache_beam/ml/transforms/base_test.py @@ -425,5 +425,33 @@ def test_handler_on_multiple_columns(self): ) +class TestUtilFunctions(unittest.TestCase): + def test_list_of_dicts_to_dict_of_lists_normal(self): + input_list = [{'a': 1, 'b': 2}, {'a': 3, 'b': 4}] + expected_output = {'a': [1, 3], 'b': [2, 4]} + self.assertEqual( + base._convert_list_of_dicts_to_dict_of_lists(input_list), + expected_output) + + def test_list_of_dicts_to_dict_of_lists_on_list_inputs(self): + input_list = [{'a': [1, 2, 10], 'b': 3}, {'a': [1], 'b': 5}] + expected_output = {'a': [[1, 2, 10], [1]], 'b': [3, 5]} + self.assertEqual( + base._convert_list_of_dicts_to_dict_of_lists(input_list), + expected_output) + + def test_dict_of_lists_to_lists_of_dict_normal(self): + input_dict = {'a': [1, 3], 'b': [2, 4]} + expected_output = [{'a': 1, 'b': 2}, {'a': 3, 'b': 4}] + self.assertEqual( + base._convert_dict_of_lists_to_lists_of_dict(input_dict), + expected_output) + + def test_dict_of_lists_to_lists_of_dict_unequal_length(self): + input_dict = {'a': [1, 3], 'b': [2]} + with self.assertRaises(AssertionError): + base._convert_dict_of_lists_to_lists_of_dict(input_dict) + + if __name__ == '__main__': unittest.main() From d690aec81da16aa491254301ca92c477bf71b75b Mon Sep 17 00:00:00 2001 From: Anand Inguva Date: Mon, 4 Dec 2023 20:13:14 +0000 Subject: [PATCH 15/52] Change type annotation and redundant comments --- sdks/python/apache_beam/ml/transforms/base.py | 12 ++++-------- 1 file changed, 4 insertions(+), 8 deletions(-) diff --git a/sdks/python/apache_beam/ml/transforms/base.py b/sdks/python/apache_beam/ml/transforms/base.py index 667c9acb069ad..fcd2c3299e384 100644 --- a/sdks/python/apache_beam/ml/transforms/base.py +++ b/sdks/python/apache_beam/ml/transforms/base.py @@ -75,7 +75,6 @@ def _convert_list_of_dicts_to_dict_of_lists( def _convert_dict_of_lists_to_lists_of_dict( dict_of_lists: Dict[str, List[Any]]) -> List[Dict[str, Any]]: - batch_length = len(next(iter(dict_of_lists.values()))) result: List[Dict[str, Any]] = [{} for _ in range(batch_length)] # all the values in the dict_of_lists should have same length @@ -203,8 +202,7 @@ def __init__( *, write_artifact_location: Optional[str] = None, read_artifact_location: Optional[str] = None, - transforms: Optional[List[Union[BaseOperation, - EmbeddingsManager]]] = None): + transforms: Optional[MLTransformProvider] = None): """ MLTransform is a Beam PTransform that can be used to apply transformations to the data. MLTransform is used to wrap the @@ -317,7 +315,7 @@ def expand( | "MLTransformMetricsUsage" >> MLTransformMetricsUsage(self)) return pcoll # type: ignore[return-value] - def with_transform(self, transform: BaseOperation): + def with_transform(self, transform: MLTransformProvider): """ Add a transform to the MLTransform pipeline. Args: @@ -325,9 +323,7 @@ def with_transform(self, transform: BaseOperation): Returns: A MLTransform instance. 
""" - # self._validate_transform(transform) - # avoid circular import - # pylint: disable=wrong-import-order, wrong-import-position + self._validate_transform(transform) self.transforms.append(transform) return self @@ -457,7 +453,7 @@ class _MLTransformToPTransformMapper: """ def __init__( self, - transforms: List[Union[BaseOperation, EmbeddingsManager]], + transforms: List[MLTransformProvider], artifact_location: str, artifact_mode: str, pipeline_options: Optional[PipelineOptions] = None, From af7496b8bcdecc929cc7224a23cec6df0a50df93 Mon Sep 17 00:00:00 2001 From: Anand Inguva Date: Mon, 4 Dec 2023 20:16:44 +0000 Subject: [PATCH 16/52] Remove get_transforms method --- sdks/python/apache_beam/ml/transforms/handlers.py | 5 ----- 1 file changed, 5 deletions(-) diff --git a/sdks/python/apache_beam/ml/transforms/handlers.py b/sdks/python/apache_beam/ml/transforms/handlers.py index 1a673c51df261..620a417c29422 100644 --- a/sdks/python/apache_beam/ml/transforms/handlers.py +++ b/sdks/python/apache_beam/ml/transforms/handlers.py @@ -15,7 +15,6 @@ # limitations under the License. # # pytype: skip-file -# pylint: skip-file import collections import hashlib @@ -218,9 +217,6 @@ def __init__( def append_transform(self, transform): self.transforms.append(transform) - def get_transforms(self): - return self.transforms - def _map_column_names_to_types(self, row_type): """ Return a dictionary of column names and types. @@ -323,7 +319,6 @@ def _get_raw_data_feature_spec_per_column( f"Please provide a valid type from the following: " f"{_default_type_to_tensor_type_map.keys()}") return tf.io.VarLenFeature(_default_type_to_tensor_type_map[dtype]) - # return tf.io.VarLenFeature() def get_raw_data_metadata( self, input_types: Dict[str, type]) -> dataset_metadata.DatasetMetadata: From d713555e24cb2f02c3ce40d51f6b82a4176ef856 Mon Sep 17 00:00:00 2001 From: Anand Inguva Date: Mon, 4 Dec 2023 20:18:42 +0000 Subject: [PATCH 17/52] remove requires_chaining from tft --- sdks/python/apache_beam/ml/transforms/tft.py | 3 --- 1 file changed, 3 deletions(-) diff --git a/sdks/python/apache_beam/ml/transforms/tft.py b/sdks/python/apache_beam/ml/transforms/tft.py index 8705b79aa309a..3a103962045f6 100644 --- a/sdks/python/apache_beam/ml/transforms/tft.py +++ b/sdks/python/apache_beam/ml/transforms/tft.py @@ -114,9 +114,6 @@ def get_ptransform_for_processing(self, **kwargs) -> beam.PTransform: params['artifact_mode'] = artifact_mode return TFTProcessHandler(artifact_location=artifact_location, **params) - def requires_chaining(self): - return True - @tf.function def _split_string_with_delimiter(self, data, delimiter): """ From 50450f34b67d80375f1a3bf739de8ade4ee2f40c Mon Sep 17 00:00:00 2001 From: Anand Inguva Date: Mon, 4 Dec 2023 22:42:54 +0000 Subject: [PATCH 18/52] add tests to sentence-transformers --- .../embeddings/sentence_transformer_test.py | 172 +++++++++--------- 1 file changed, 89 insertions(+), 83 deletions(-) diff --git a/sdks/python/apache_beam/ml/transforms/embeddings/sentence_transformer_test.py b/sdks/python/apache_beam/ml/transforms/embeddings/sentence_transformer_test.py index 63f401180dc2d..bdf30ec14fc36 100644 --- a/sdks/python/apache_beam/ml/transforms/embeddings/sentence_transformer_test.py +++ b/sdks/python/apache_beam/ml/transforms/embeddings/sentence_transformer_test.py @@ -18,8 +18,12 @@ import tempfile import unittest +from parameterized import parameterized + import apache_beam as beam from apache_beam.ml.transforms.base import MLTransform +from apache_beam.testing.util import assert_that +from 
apache_beam.testing.util import equal_to # pylint: disable=ungrouped-imports try: @@ -37,6 +41,39 @@ test_query = "This is a test" test_query_column = "feature_1" DEFAULT_MODEL_NAME = "sentence-transformers/all-mpnet-base-v2" +_parameterized_inputs = [ + ([{ + test_query_column: '样例数据-1' + }, { + test_query_column: '样例数据-2' + }, { + test_query_column: '样例数据-3' + }, { + test_query_column: '样例数据-4' + }], + 'BAAI/bge-base-en-v1.5', [0.1091, 0.122, 0.104, 0.1093]), + ([{ + test_query_column: test_query, + }], DEFAULT_MODEL_NAME, [0.1342]), + ( + [{ + test_query_column: 'query: how much protein should a female eat', + }, + { + test_query_column: ( + "passage: As a general guideline, the CDC's " + "average requirement of protein for women " + "ages 19 to 70 is 46 grams per day. But, " + "as you can see from this chart, you'll need " + "to increase that if you're expecting or training" + " for a marathon. Check out the chart below " + "to see how much protein " + "you should be eating each day.") + }], + 'intfloat/e5-base-v2', + # this model requires inputs to be specified as query: and passage: + [0.0982, 0.1033]), +] def get_pipeline_wth_embedding_config( @@ -99,60 +136,36 @@ def assert_element(element): _ = (transformed_pcoll | beam.Map(assert_element)) - def pipeline_with_configurable_artifact_location( - self, - pipeline, - embedding_config=None, - read_artifact_location=None, - write_artifact_location=None): - if write_artifact_location: - return ( - pipeline - | MLTransform(write_artifact_location=write_artifact_location). - with_transform(embedding_config)) - elif read_artifact_location: - return ( - pipeline - | MLTransform(read_artifact_location=read_artifact_location)) - else: - raise NotImplementedError + @parameterized.expand(_parameterized_inputs) + def test_embeddings_with_read_artifact_location( + self, inputs, model_name, output): + embedding_config = SentenceTransformerEmbeddings( + model_name=model_name, columns=[test_query_column]) - def test_embeddings_with_read_artifact_location(self): with beam.Pipeline() as p: - model_name = DEFAULT_MODEL_NAME - embedding_config = SentenceTransformerEmbeddings( - model_name=model_name, columns=[test_query_column]) + result_pcoll = ( + p + | "CreateData" >> beam.Create(inputs) + | "MLTransform" >> MLTransform( + write_artifact_location=self.artifact_location).with_transform( + embedding_config)) + max_ele_pcoll = ( + result_pcoll + | beam.Map(lambda x: round(max(x[test_query_column]), 4))) - with beam.Pipeline() as p: - data = ( - p - | "CreateData" >> beam.Create([{ - test_query_column: test_query - }])) - _ = self.pipeline_with_configurable_artifact_location( - pipeline=data, - embedding_config=embedding_config, - write_artifact_location=self.artifact_location) - - with beam.Pipeline() as p: - data = ( - p - | "CreateData" >> beam.Create([{ - test_query_column: test_query - }, { - test_query_column: test_query - }])) - result_pcoll = self.pipeline_with_configurable_artifact_location( - pipeline=data, read_artifact_location=self.artifact_location) + assert_that(max_ele_pcoll, equal_to(output)) - def assert_element(element): - assert round(element, 2) == 0.13 + with beam.Pipeline() as p: + result_pcoll = ( + p + | "CreateData" >> beam.Create(inputs) + | "MLTransform" >> + MLTransform(read_artifact_location=self.artifact_location)) + max_ele_pcoll = ( + result_pcoll + | beam.Map(lambda x: round(max(x[test_query_column]), 4))) - _ = ( - result_pcoll - | beam.Map(lambda x: max(x[test_query_column])) - # 0.1342099905014038 - | 
beam.Map(assert_element)) + assert_that(max_ele_pcoll, equal_to(output)) def test_sentence_transformer_with_int_data_types(self): model_name = DEFAULT_MODEL_NAME @@ -169,43 +182,36 @@ def test_sentence_transformer_with_int_data_types(self): write_artifact_location=self.artifact_location).with_transform( embedding_config)) - def test_with_gcs_artifact_location(self): + @parameterized.expand(_parameterized_inputs) + def test_with_gcs_artifact_location(self, inputs, model_name, output): artifact_location = ('gs://apache-beam-ml/testing/sentence_transformers') - with beam.Pipeline() as p: - model_name = DEFAULT_MODEL_NAME - embedding_config = SentenceTransformerEmbeddings( - model_name=model_name, columns=[test_query_column]) - - with beam.Pipeline() as p: - data = ( - p - | "CreateData" >> beam.Create([{ - test_query_column: test_query - }])) - _ = self.pipeline_with_configurable_artifact_location( - pipeline=data, - embedding_config=embedding_config, - write_artifact_location=artifact_location) - - with beam.Pipeline() as p: - data = ( - p - | "CreateData" >> beam.Create([{ - test_query_column: test_query - }, { - test_query_column: test_query - }])) - result_pcoll = self.pipeline_with_configurable_artifact_location( - pipeline=data, read_artifact_location=artifact_location) + embedding_config = SentenceTransformerEmbeddings( + model_name=model_name, columns=[test_query_column]) - def assert_element(element): - assert round(element, 2) == 0.13 + with beam.Pipeline() as p: + result_pcoll = ( + p + | "CreateData" >> beam.Create(inputs) + | "MLTransform" >> + MLTransform(write_artifact_location=artifact_location).with_transform( + embedding_config)) + max_ele_pcoll = ( + result_pcoll + | beam.Map(lambda x: round(max(x[test_query_column]), 4))) + + assert_that(max_ele_pcoll, equal_to(output)) - _ = ( - result_pcoll - | beam.Map(lambda x: max(x[test_query_column])) - # 0.1342099905014038 - | beam.Map(assert_element)) + with beam.Pipeline() as p: + result_pcoll = ( + p + | "CreateData" >> beam.Create(inputs) + | "MLTransform" >> + MLTransform(read_artifact_location=artifact_location)) + max_ele_pcoll = ( + result_pcoll + | beam.Map(lambda x: round(max(x[test_query_column]), 4))) + + assert_that(max_ele_pcoll, equal_to(output)) if __name__ == '__main__': From 8823a752e76f660278f00d9891c42fe87d0f5a9a Mon Sep 17 00:00:00 2001 From: Anand Inguva Date: Tue, 5 Dec 2023 15:22:41 -0500 Subject: [PATCH 19/52] Pass inference_args to RunInference --- sdks/python/apache_beam/ml/transforms/base.py | 9 ++-- .../embeddings/sentence_transformer.py | 8 ++- .../embeddings/sentence_transformer_test.py | 51 +++++++++++++------ .../transforms/embeddings/tensorflow_hub.py | 8 ++- .../ml/transforms/embeddings/vertex_ai.py | 5 +- 5 files changed, 57 insertions(+), 24 deletions(-) diff --git a/sdks/python/apache_beam/ml/transforms/base.py b/sdks/python/apache_beam/ml/transforms/base.py index fcd2c3299e384..7b4bf0b0ef2a2 100644 --- a/sdks/python/apache_beam/ml/transforms/base.py +++ b/sdks/python/apache_beam/ml/transforms/base.py @@ -179,6 +179,7 @@ def __init__( self.max_batch_size = max_batch_size self.large_model = large_model self.columns = columns + self.inference_args = kwargs.pop('inference_args', {}) if kwargs: _LOGGER.warning("Ignoring the following arguments: %s", kwargs.keys()) @@ -300,9 +301,8 @@ def expand( _MLTransformToPTransformMapper.load_transforms_from_artifact_location( self._parent_artifact_location)) - # the saved transforms has artifact mode set to PRODUCE. - # set the artifact mode to CONSUME. 
- if self._artifact_mode == ArtifactMode.CONSUME: + # the saved transforms has artifact mode set to PRODUCE. + # set the artifact mode to CONSUME. for i in range(len(ptransform_list)): if hasattr(ptransform_list[i], 'artifact_mode'): ptransform_list[i].artifact_mode = self._artifact_mode @@ -598,3 +598,6 @@ def batch_elements_kwargs(self) -> Mapping[str, Any]: if self.embedding_config.min_batch_size: batch_sizes_map['min_batch_size'] = self.embedding_config.min_batch_size return (self._underlying.batch_elements_kwargs() or batch_sizes_map) + + def validate_inference_args(self, _): + pass diff --git a/sdks/python/apache_beam/ml/transforms/embeddings/sentence_transformer.py b/sdks/python/apache_beam/ml/transforms/embeddings/sentence_transformer.py index f41e24c0f7a4a..935e9281c2d59 100644 --- a/sdks/python/apache_beam/ml/transforms/embeddings/sentence_transformer.py +++ b/sdks/python/apache_beam/ml/transforms/embeddings/sentence_transformer.py @@ -68,7 +68,7 @@ def run_inference( return model.encode(batch, **inference_args) def load_model(self): - model = self._model_class(self._model_uri) + model = self._model_class(self._model_uri, **self._load_model_args) if self._max_seq_length: model.max_seq_length = self._max_seq_length return model @@ -122,4 +122,8 @@ def get_model_handler(self): def get_ptransform_for_processing(self, **kwargs) -> beam.PTransform: # wrap the model handler in a _TextEmbeddingHandler since # the SentenceTransformerEmbeddings works on text input data. - return (RunInference(model_handler=_TextEmbeddingHandler(self))) \ No newline at end of file + return ( + RunInference( + model_handler=_TextEmbeddingHandler(self), + inference_args=self.inference_args, + )) diff --git a/sdks/python/apache_beam/ml/transforms/embeddings/sentence_transformer_test.py b/sdks/python/apache_beam/ml/transforms/embeddings/sentence_transformer_test.py index bdf30ec14fc36..21289797133c0 100644 --- a/sdks/python/apache_beam/ml/transforms/embeddings/sentence_transformer_test.py +++ b/sdks/python/apache_beam/ml/transforms/embeddings/sentence_transformer_test.py @@ -28,6 +28,7 @@ # pylint: disable=ungrouped-imports try: from apache_beam.ml.transforms.embeddings.sentence_transformer import SentenceTransformerEmbeddings + import torch except ImportError: SentenceTransformerEmbeddings = None # type: ignore @@ -76,18 +77,6 @@ ] -def get_pipeline_wth_embedding_config( - pipeline: beam.Pipeline, embedding_config, artifact_location): - transformed_pcoll = ( - pipeline - | "CreateData" >> beam.Create([{ - test_query_column: test_query - }]) - | "MLTransform" >> MLTransform(write_artifact_location=artifact_location). 
- with_transform(embedding_config)) - return transformed_pcoll - - @unittest.skipIf( SentenceTransformerEmbeddings is None, 'sentence-transformers is not installed.') @@ -103,10 +92,14 @@ def test_sentence_transformer_embeddings(self): embedding_config = SentenceTransformerEmbeddings( model_name=model_name, columns=[test_query_column]) with beam.Pipeline() as pipeline: - result_pcoll = get_pipeline_wth_embedding_config( - pipeline=pipeline, - embedding_config=embedding_config, - artifact_location=self.artifact_location) + result_pcoll = ( + pipeline + | "CreateData" >> beam.Create([{ + test_query_column: test_query + }]) + | "MLTransform" >> MLTransform( + write_artifact_location=self.artifact_location).with_transform( + embedding_config)) def assert_element(element): assert len(element[test_query_column]) == 768 @@ -213,6 +206,32 @@ def test_with_gcs_artifact_location(self, inputs, model_name, output): assert_that(max_ele_pcoll, equal_to(output)) + def test_embeddings_with_inference_args(self): + model_name = DEFAULT_MODEL_NAME + + inference_args = {'convert_to_numpy': False} + embedding_config = SentenceTransformerEmbeddings( + model_name=model_name, + columns=[test_query_column], + inference_args=inference_args) + with beam.Pipeline() as pipeline: + result_pcoll = ( + pipeline + | "CreateData" >> beam.Create([{ + test_query_column: test_query + }]) + | "MLTransform" >> MLTransform( + write_artifact_location=self.artifact_location).with_transform( + embedding_config)) + + def assert_element(element): + assert type(element) == torch.Tensor + + _ = ( + result_pcoll + | beam.Map(lambda x: x[test_query_column]) + | beam.Map(assert_element)) + if __name__ == '__main__': unittest.main() diff --git a/sdks/python/apache_beam/ml/transforms/embeddings/tensorflow_hub.py b/sdks/python/apache_beam/ml/transforms/embeddings/tensorflow_hub.py index 4612ca9d1d425..a545d4b3d3a20 100644 --- a/sdks/python/apache_beam/ml/transforms/embeddings/tensorflow_hub.py +++ b/sdks/python/apache_beam/ml/transforms/embeddings/tensorflow_hub.py @@ -45,7 +45,7 @@ def __init__(self, preprocessing_url: Optional[str], *args, **kwargs): def load_model(self): # unable to load the models with tf.keras.models.load_model so # using hub.KerasLayer instead - model = hub.KerasLayer(self._model_uri, ) + model = hub.KerasLayer(self._model_uri, **self._load_model_args) return model def _convert_prediction_result_to_list( @@ -121,4 +121,8 @@ def get_model_handler(self) -> ModelHandler: ) def get_ptransform_for_processing(self, **kwargs) -> beam.PTransform: - return (RunInference(model_handler=_TextEmbeddingHandler(self))) + return ( + RunInference( + model_handler=_TextEmbeddingHandler(self), + inference_args=self.inference_args, + )) diff --git a/sdks/python/apache_beam/ml/transforms/embeddings/vertex_ai.py b/sdks/python/apache_beam/ml/transforms/embeddings/vertex_ai.py index e4c6745bb5665..b61dc98fd5cdd 100644 --- a/sdks/python/apache_beam/ml/transforms/embeddings/vertex_ai.py +++ b/sdks/python/apache_beam/ml/transforms/embeddings/vertex_ai.py @@ -155,4 +155,7 @@ def get_model_handler(self) -> ModelHandler: ) def get_ptransform_for_processing(self, **kwargs) -> beam.PTransform: - return (RunInference(model_handler=_TextEmbeddingHandler(self))) + return ( + RunInference( + model_handler=_TextEmbeddingHandler(self), + inference_args=self.inference_args)) From a7e2bd354c2f3901d8a3f1ab5207b78a87ef5b18 Mon Sep 17 00:00:00 2001 From: Anand Inguva <34158215+AnandInguva@users.noreply.github.com> Date: Tue, 5 Dec 2023 20:25:08 +0000 Subject: 
[PATCH 20/52] Add TODO GH issue

---
 .../ml/transforms/embeddings/sentence_transformer.py          | 3 ++-
 1 file changed, 2 insertions(+), 1 deletion(-)

diff --git a/sdks/python/apache_beam/ml/transforms/embeddings/sentence_transformer.py b/sdks/python/apache_beam/ml/transforms/embeddings/sentence_transformer.py
index f41e24c0f7a4a..fa4c210860fa7 100644
--- a/sdks/python/apache_beam/ml/transforms/embeddings/sentence_transformer.py
+++ b/sdks/python/apache_beam/ml/transforms/embeddings/sentence_transformer.py
@@ -32,7 +32,8 @@
 from sentence_transformers import SentenceTransformer


-# TODO: Use HuggingFaceModelHandlerTensor once the import issue is fixed.
+# TODO: https://github.com/apache/beam/issues/29621
+# Use HuggingFaceModelHandlerTensor once the import issue is fixed.
 # Right now, the hugging face model handler imports torch and tensorflow
 # at the same time, which adds too much weight to the container unnecessarily.
 class _SentenceTransformerModelHandler(ModelHandler):

From f77ae6046fea55a35bc9dd5c1d9e35de72c3552e Mon Sep 17 00:00:00 2001
From: Anand Inguva <34158215+AnandInguva@users.noreply.github.com>
Date: Tue, 5 Dec 2023 20:49:02 +0000
Subject: [PATCH 21/52] refactor variables in vertex_ai embeddings

---
 .../apache_beam/ml/transforms/embeddings/vertex_ai.py | 10 ++++++----
 1 file changed, 6 insertions(+), 4 deletions(-)

diff --git a/sdks/python/apache_beam/ml/transforms/embeddings/vertex_ai.py b/sdks/python/apache_beam/ml/transforms/embeddings/vertex_ai.py
index b61dc98fd5cdd..297549d4f3284 100644
--- a/sdks/python/apache_beam/ml/transforms/embeddings/vertex_ai.py
+++ b/sdks/python/apache_beam/ml/transforms/embeddings/vertex_ai.py
@@ -39,7 +39,8 @@

 __all__ = ["VertexAITextEmbeddings"]

-TASK_TYPE = "RETRIEVAL_DOCUMENT"
+DEFAULT_TASK_TYPE = "RETRIEVAL_DOCUMENT"
+# TODO: Can this list be automatically pulled from Vertex SDK?
 TASK_TYPE_INPUTS = [
     "RETRIEVAL_DOCUMENT",
     "RETRIEVAL_QUERY",
     "SEMANTIC_SIMILARITY",
     "CLASSIFICATION",
     "CLUSTERING"
 ]
+_BATCH_SIZE = 5  # Vertex AI limits requests to 5 at a time.


 class _VertexAITextEmbeddingHandler(ModelHandler):
@@ -57,7 +59,7 @@ def __init__(
       self,
       model_name: str,
       title: Optional[str] = None,
-      task_type: str = TASK_TYPE,
+      task_type: str = DEFAULT_TASK_TYPE,
       project: Optional[str] = None,
       location: Optional[str] = None,
       credentials: Optional[Credentials] = None,
@@ -77,7 +79,7 @@ def run_inference(
       inference_args: Optional[Dict[str, Any]] = None,
   ) -> Iterable:
     embeddings = []
-    batch_size = 5  # Vertex AI limits requests to 5 at a time.
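+    # Vertex AI caps each embedding request at _BATCH_SIZE (currently 5)
+    # texts, so the loop below slices the incoming batch into request-sized
+    # chunks before calling the model.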
+    batch_size = _BATCH_SIZE
     for i in range(0, len(batch), batch_size):
       text_batch = batch[i:i + batch_size]
       text_batch = [
@@ -100,7 +102,7 @@ def __init__(
       model_name: str,
       columns: List[str],
       title: Optional[str] = None,
-      task_type: str = TASK_TYPE,
+      task_type: str = DEFAULT_TASK_TYPE,
       project: Optional[str] = None,
       location: Optional[str] = None,
       credentials: Optional[Credentials] = None,

From 95ed3c59e3c29a3f0709f558f04d3f20e4fc5406 Mon Sep 17 00:00:00 2001
From: Anand Inguva <34158215+AnandInguva@users.noreply.github.com>
Date: Tue, 5 Dec 2023 21:05:03 +0000
Subject: [PATCH 22/52] remove try/catch and throw error if options is empty
 for GCS artifact location

---
 sdks/python/apache_beam/ml/transforms/base.py | 20 ++++++++------------
 1 file changed, 8 insertions(+), 12 deletions(-)

diff --git a/sdks/python/apache_beam/ml/transforms/base.py b/sdks/python/apache_beam/ml/transforms/base.py
index 7b4bf0b0ef2a2..4d90346e61b42 100644
--- a/sdks/python/apache_beam/ml/transforms/base.py
+++ b/sdks/python/apache_beam/ml/transforms/base.py
@@ -323,11 +323,6 @@ def with_transform(self, transform: MLTransformProvider):
     Returns:
       A MLTransform instance.
     """
-    self._validate_transform(transform)
-    self.transforms.append(transform)
-    return self
-
-  def _validate_transform(self, transform):
     # every data processing transform should subclass MLTransformProvider. Raise
     # an error if the transform does not subclass MLTransformProvider since the
     # downstream code expects the transform to be a subclass of
     # MLTransformProvider.
     if not isinstance(transform, MLTransformProvider):
       raise TypeError(
           'transform must be a subclass of MLTransformProvider and implement '
           'get_ptransform_for_processing() method.'
           'Got: %s instead.' % type(transform))
+    self.transforms.append(transform)
+    return self


 class MLTransformMetricsUsage(beam.PTransform):
@@ -404,13 +401,6 @@ def save_attributes(
       **kwargs,
   ):
     if _JsonPickleTransformAttributeManager._is_remote_path(artifact_location):
-      try:
-        options = kwargs.get('options')
-      except KeyError:
-        raise RuntimeError(
-            'pipeline options are required to save the attributes.'
-            'in the artifact location %s' % artifact_location)
-
       temp_dir = tempfile.mkdtemp()
       temp_json_file = os.path.join(temp_dir, _ATTRIBUTE_FILE_NAME)
       with open(temp_json_file, 'w+') as f:
@@ -418,6 +408,12 @@ def save_attributes(
       with open(temp_json_file, 'rb') as f:
         from apache_beam.runners.dataflow.internal import apiclient
         _LOGGER.info('Creating artifact location: %s', artifact_location)
+        # pipeline options are required for the client to configure the project.
+        options = kwargs.get('options')
+        if not options:
+          raise RuntimeError(
+              'pipeline options are required to save the attributes '
+ 'in the artifact location %s' % artifact_location) apiclient.DataflowApplicationClient(options=options).stage_file( gcs_or_local_path=artifact_location, file_name=_ATTRIBUTE_FILE_NAME, From c235499cfbbe0dab53eb7cbb721810415bfb074c Mon Sep 17 00:00:00 2001 From: Anand Inguva <34158215+AnandInguva@users.noreply.github.com> Date: Tue, 5 Dec 2023 21:29:02 +0000 Subject: [PATCH 23/52] Refactor NotImplementedError message --- sdks/python/apache_beam/ml/transforms/utils.py | 6 +++++- 1 file changed, 5 insertions(+), 1 deletion(-) diff --git a/sdks/python/apache_beam/ml/transforms/utils.py b/sdks/python/apache_beam/ml/transforms/utils.py index b66cb4162ce29..b0aef5898cf08 100644 --- a/sdks/python/apache_beam/ml/transforms/utils.py +++ b/sdks/python/apache_beam/ml/transforms/utils.py @@ -32,9 +32,13 @@ class ArtifactsFetcher(): def __init__(self, artifact_location): files = os.listdir(artifact_location) files.remove(base._ATTRIBUTE_FILE_NAME) + # TODO: Integrate ArtifactFetcher into MLTransform. if len(files) > 1: raise NotImplementedError( - 'Multiple files in artifact location not supported yet.') + "MLTransform may have been utilized alongside transforms written " + "in TensorFlow Transform, in conjunction with those from different " + "frameworks. Currently, retrieving artifacts from this " + "multi-framework setup is not supported.") self._artifact_location = os.path.join(artifact_location, files[0]) self.transform_output = tft.TFTransformOutput(self._artifact_location) From 6eebfa40c63707ab7c7991fd31a93cba748ed973 Mon Sep 17 00:00:00 2001 From: Anand Inguva <34158215+AnandInguva@users.noreply.github.com> Date: Tue, 5 Dec 2023 21:33:34 +0000 Subject: [PATCH 24/52] remove tensorflow hub from this PR --- .../transforms/embeddings/tensorflow_hub.py | 128 ----------- .../embeddings/tensorflow_hub_test.py | 198 ------------------ sdks/python/tox.ini | 15 +- 3 files changed, 1 insertion(+), 340 deletions(-) delete mode 100644 sdks/python/apache_beam/ml/transforms/embeddings/tensorflow_hub.py delete mode 100644 sdks/python/apache_beam/ml/transforms/embeddings/tensorflow_hub_test.py diff --git a/sdks/python/apache_beam/ml/transforms/embeddings/tensorflow_hub.py b/sdks/python/apache_beam/ml/transforms/embeddings/tensorflow_hub.py deleted file mode 100644 index a545d4b3d3a20..0000000000000 --- a/sdks/python/apache_beam/ml/transforms/embeddings/tensorflow_hub.py +++ /dev/null @@ -1,128 +0,0 @@ -# -# Licensed to the Apache Software Foundation (ASF) under one or more -# contributor license agreements. See the NOTICE file distributed with -# this work for additional information regarding copyright ownership. -# The ASF licenses this file to You under the Apache License, Version 2.0 -# (the "License"); you may not use this file except in compliance with -# the License. You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. - -from typing import Iterable -from typing import List -from typing import Optional - -import apache_beam as beam -import tensorflow as tf -import tensorflow_hub as hub -import tensorflow_text as text # required to register TF ops. 
# pylint: disable=unused-import -from apache_beam.ml.inference import utils -from apache_beam.ml.inference.base import ModelHandler -from apache_beam.ml.inference.base import PredictionResult -from apache_beam.ml.inference.base import RunInference -from apache_beam.ml.inference.tensorflow_inference import TFModelHandlerTensor -from apache_beam.ml.inference.tensorflow_inference import default_tensor_inference_fn -from apache_beam.ml.transforms.base import EmbeddingsManager -from apache_beam.ml.transforms.base import _TextEmbeddingHandler - -__all__ = ['TensorflowHubTextEmbeddings'] - - -class _TensorflowHubModelHandler(TFModelHandlerTensor): - """ - Note: Intended for internal use only. No backwards compatibility guarantees. - """ - def __init__(self, preprocessing_url: Optional[str], *args, **kwargs): - self.preprocessing_url = preprocessing_url - super().__init__(*args, **kwargs) - - def load_model(self): - # unable to load the models with tf.keras.models.load_model so - # using hub.KerasLayer instead - model = hub.KerasLayer(self._model_uri, **self._load_model_args) - return model - - def _convert_prediction_result_to_list( - self, predictions: Iterable[PredictionResult]): - result = [] - for prediction in predictions: - inference = prediction.inference.numpy().tolist() - result.append(inference) - return result - - def run_inference(self, batch, model, inference_args, model_id=None): - if not inference_args: - inference_args = {} - if not self.preprocessing_url: - predictions = default_tensor_inference_fn( - model=model, - batch=batch, - inference_args=inference_args, - model_id=model_id) - return self._convert_prediction_result_to_list(predictions) - - vectorized_batch = tf.stack(batch, axis=0) - preprocessor_fn = hub.KerasLayer(self.preprocessing_url) - vectorized_batch = preprocessor_fn(vectorized_batch) - predictions = model(vectorized_batch) - # https://www.tensorflow.org/text/tutorials/classify_text_with_bert#using_the_bert_model # pylint: disable=line-too-long - # pooled_output -> represents the text as a whole. This is an embeddings - # of the whole text. The shape is [batch_size, embedding_dimension] - # sequence_output -> represents the text as a sequence of tokens. This is - # an embeddings of each token in the text. The shape is - # [batch_size, max_sequence_length, embedding_dimension] - # pooled output is the embeedings as per the documentation. so let's use - # that. - embeddings = predictions['pooled_output'] - predictions = utils._convert_to_result(batch, embeddings, model_id) - return self._convert_prediction_result_to_list(predictions) - - -class TensorflowHubTextEmbeddings(EmbeddingsManager): - def __init__( - self, - columns: List[str], - hub_url: str, - preprocessing_url: Optional[str] = None, - **kwargs): - """ - Embedding config for tensorflow hub models. This config can be used with - MLTransform to embed text data. Models are loaded using the RunInference - PTransform with the help of a ModelHandler. - - Args: - columns: The columns containing the text to be embedded. - hub_url: The url of the tensorflow hub model. - preprocessing_url: The url of the preprocessing model. This is optional. - If provided, the preprocessing model will be used to preprocess the - text before feeding it to the main model. - min_batch_size: The minimum batch size to be used for inference. - max_batch_size: The maximum batch size to be used for inference. - large_model: Whether to share the model across processes. 
- """ - super().__init__(columns=columns, **kwargs) - self.model_uri = hub_url - self.preprocessing_url = preprocessing_url - - def get_model_handler(self) -> ModelHandler: - # override the default inference function - return _TensorflowHubModelHandler( - model_uri=self.model_uri, - preprocessing_url=self.preprocessing_url, - min_batch_size=self.min_batch_size, - max_batch_size=self.max_batch_size, - large_model=self.large_model, - ) - - def get_ptransform_for_processing(self, **kwargs) -> beam.PTransform: - return ( - RunInference( - model_handler=_TextEmbeddingHandler(self), - inference_args=self.inference_args, - )) diff --git a/sdks/python/apache_beam/ml/transforms/embeddings/tensorflow_hub_test.py b/sdks/python/apache_beam/ml/transforms/embeddings/tensorflow_hub_test.py deleted file mode 100644 index 8c571e0cf4621..0000000000000 --- a/sdks/python/apache_beam/ml/transforms/embeddings/tensorflow_hub_test.py +++ /dev/null @@ -1,198 +0,0 @@ -# -# Licensed to the Apache Software Foundation (ASF) under one or more -# contributor license agreements. See the NOTICE file distributed with -# this work for additional information regarding copyright ownership. -# The ASF licenses this file to You under the Apache License, Version 2.0 -# (the "License"); you may not use this file except in compliance with -# the License. You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. - -import shutil -import tempfile -import unittest - -import apache_beam as beam -from apache_beam.ml.transforms.base import MLTransform - -hub_url = 'https://tfhub.dev/google/nnlm-en-dim128/2' -test_query_column = 'test_query' -test_query = 'This is a test query' - -# pylint: disable=ungrouped-imports -try: - import tensorflow as tf # disable=unused-import - from apache_beam.ml.transforms.embeddings.tensorflow_hub import TensorflowHubTextEmbeddings -except ImportError: - tf = None - -try: - from apache_beam.ml.transforms.tft import ScaleTo01 -except ImportError: - ScaleTo01 = None # type: ignore - - -@unittest.skipIf(tf is None, 'Tensorflow is not installed.') -class TFHubEmbeddingsTest(unittest.TestCase): - def setUp(self) -> None: - self.artifact_location = tempfile.mkdtemp() - - def tearDown(self) -> None: - shutil.rmtree(self.artifact_location) - - def test_tfhub_text_embeddings(self): - embedding_config = TensorflowHubTextEmbeddings( - hub_url=hub_url, columns=[test_query_column]) - with beam.Pipeline() as pipeline: - transformed_pcoll = ( - pipeline - | "CreateData" >> beam.Create([{ - test_query_column: test_query - }]) - | "MLTransform" >> MLTransform( - write_artifact_location=self.artifact_location).with_transform( - embedding_config)) - - def assert_element(element): - assert len(element[test_query_column]) == 128 - - _ = (transformed_pcoll | beam.Map(assert_element)) - - @unittest.skipIf(ScaleTo01 is None, 'Tensorflow Transform is not installed.') - def test_embeddings_with_scale_to_0_1(self): - embedding_config = TensorflowHubTextEmbeddings( - hub_url=hub_url, - columns=[test_query_column], - ) - with beam.Pipeline() as pipeline: - transformed_pcoll = ( - pipeline - | "CreateData" >> beam.Create([{ - test_query_column: test_query - }]) - | "MLTransform" >> 
MLTransform( - write_artifact_location=self.artifact_location).with_transform( - embedding_config).with_transform( - ScaleTo01(columns=[test_query_column]))) - - def assert_element(element): - assert max(element[test_query_column]) == 1 - - _ = ( - transformed_pcoll | beam.Map(lambda x: x.as_dict()) - | beam.Map(assert_element)) - - def pipeline_with_configurable_artifact_location( - self, - pipeline, - embedding_config=None, - read_artifact_location=None, - write_artifact_location=None): - if write_artifact_location: - return ( - pipeline - | MLTransform(write_artifact_location=write_artifact_location). - with_transform(embedding_config)) - elif read_artifact_location: - return ( - pipeline - | MLTransform(read_artifact_location=read_artifact_location)) - else: - raise NotImplementedError - - def test_embeddings_with_read_artifact_location(self): - with beam.Pipeline() as p: - embedding_config = TensorflowHubTextEmbeddings( - hub_url=hub_url, columns=[test_query_column]) - - with beam.Pipeline() as p: - data = ( - p - | "CreateData" >> beam.Create([{ - test_query_column: test_query - }])) - _ = self.pipeline_with_configurable_artifact_location( - pipeline=data, - embedding_config=embedding_config, - write_artifact_location=self.artifact_location) - - with beam.Pipeline() as p: - data = ( - p - | "CreateData" >> beam.Create([{ - test_query_column: test_query - }, { - test_query_column: test_query - }])) - result_pcoll = self.pipeline_with_configurable_artifact_location( - pipeline=data, read_artifact_location=self.artifact_location) - - def assert_element(element): - # 0.29836970567703247 - assert round(element, 2) == 0.3 - - _ = ( - result_pcoll - | beam.Map(lambda x: max(x[test_query_column])) - | beam.Map(assert_element)) - - def test_with_int_data_types(self): - embedding_config = TensorflowHubTextEmbeddings( - hub_url=hub_url, columns=[test_query_column]) - with self.assertRaises(TypeError): - with beam.Pipeline() as pipeline: - _ = ( - pipeline - | "CreateData" >> beam.Create([{ - test_query_column: 1 - }]) - | "MLTransform" >> MLTransform( - write_artifact_location=self.artifact_location).with_transform( - embedding_config)) - - def test_with_gcs_artifact_location(self): - artifact_location = 'gs://apache-beam-ml/testing/tensorflow_hub' - with beam.Pipeline() as p: - embedding_config = TensorflowHubTextEmbeddings( - hub_url=hub_url, columns=[test_query_column]) - - with beam.Pipeline() as p: - data = ( - p - | "CreateData" >> beam.Create([{ - test_query_column: test_query - }])) - _ = self.pipeline_with_configurable_artifact_location( - pipeline=data, - embedding_config=embedding_config, - write_artifact_location=artifact_location) - - with beam.Pipeline() as p: - data = ( - p - | "CreateData" >> beam.Create([{ - test_query_column: test_query - }, { - test_query_column: test_query - }])) - result_pcoll = self.pipeline_with_configurable_artifact_location( - pipeline=data, read_artifact_location=artifact_location) - - def assert_element(element): - # 0.29836970567703247 - assert round(element, 2) == 0.3 - - _ = ( - result_pcoll - | beam.Map(lambda x: max(x[test_query_column])) - | beam.Map(assert_element)) - - -if __name__ == '__main__': - unittest.main() diff --git a/sdks/python/tox.ini b/sdks/python/tox.ini index 1cea858e8bbc2..88c60bce4b190 100644 --- a/sdks/python/tox.ini +++ b/sdks/python/tox.ini @@ -433,17 +433,4 @@ commands = # Log aiplatform and its dependencies version for debugging /bin/sh -c "pip freeze | grep -E sentence-transformers" # Allow exit code 5 (no tests run) so 
that we can run this command safely on arbitrary subdirectories.
-  bash {toxinidir}/scripts/run_pytest.sh {envname} 'apache_beam/ml/transforms/embeddings'
-
-[testenv:py{38,39,310,311}-tfhub-{014,015}]
-deps =
-  014: tensorflow-hub>=0.14.0,<0.15.0
-  015: tensorflow-hub>=0.15.0,<0.16.0
-  tensorflow-text
-
-extras = test,gcp
-commands =
-  # Log aiplatform and its dependencies version for debugging
-  /bin/sh -c "pip freeze | grep -E tensorflow"
-  # Allow exit code 5 (no tests run) so that we can run this command safely on arbitrary subdirectories.
-  bash {toxinidir}/scripts/run_pytest.sh {envname} 'apache_beam/ml/transforms/embeddings'
+  bash {toxinidir}/scripts/run_pytest.sh {envname} 'apache_beam/ml/transforms/embeddings'
\ No newline at end of file

From c27aabba78d8b504de34d25b28f6a8a3652595cc Mon Sep 17 00:00:00 2001
From: Anand Inguva <34158215+AnandInguva@users.noreply.github.com>
Date: Tue, 5 Dec 2023 21:55:52 +0000
Subject: [PATCH 25/52] Add _validate_transform method

---
 sdks/python/apache_beam/ml/transforms/base.py | 18 +++++++++---------
 1 file changed, 9 insertions(+), 9 deletions(-)

diff --git a/sdks/python/apache_beam/ml/transforms/base.py b/sdks/python/apache_beam/ml/transforms/base.py
index 4d90346e61b42..e7498e3aad34f 100644
--- a/sdks/python/apache_beam/ml/transforms/base.py
+++ b/sdks/python/apache_beam/ml/transforms/base.py
@@ -323,17 +323,15 @@ def with_transform(self, transform: MLTransformProvider):
     Returns:
       A MLTransform instance.
     """
-    # every data processing transform should subclass MLTransformProvider. Raise
-    # an error if the transform does not subclass MLTransformProvider since the
-    # downstream code expects the transform to be a subclass of
-    # MLTransformProvider.
+    self._validate_transform(transform)
+    self.transforms.append(transform)
+    return self
+
+  def _validate_transform(self, transform):
     if not isinstance(transform, MLTransformProvider):
       raise TypeError(
-          'transform must be a subclass of MLTransformProvider and implement '
-          'get_ptransform_for_processing() method.'
+          'transform must be a subclass of MLTransformProvider. '
           'Got: %s instead.' % type(transform))
-    self.transforms.append(transform)
-    return self


 class MLTransformMetricsUsage(beam.PTransform):
@@ -536,7 +534,9 @@ def load_model(self):

   def _validate_column_data(self, batch):
     if not isinstance(batch[0], (str, bytes)):
-      raise TypeError('Embeddings can only be generated on text columns.')
+      raise TypeError(
+          'Embeddings can only be generated on Dict[str, str]. '
+ f'Got Dict[str, {type(batch[0])}] instead.') def _validate_batch(self, batch: Sequence[Dict[str, List[str]]]): if not batch or not isinstance(batch[0], dict): From 422a86a2c7a966a59199b30529506304eb6d2a4b Mon Sep 17 00:00:00 2001 From: Anand Inguva <34158215+AnandInguva@users.noreply.github.com> Date: Tue, 5 Dec 2023 23:17:28 +0000 Subject: [PATCH 26/52] add more tests --- sdks/python/apache_beam/ml/transforms/base.py | 2 +- .../apache_beam/ml/transforms/base_test.py | 101 ++++++++++++++++++ .../transforms/embeddings/vertex_ai_test.py | 49 +++++++++ 3 files changed, 151 insertions(+), 1 deletion(-) diff --git a/sdks/python/apache_beam/ml/transforms/base.py b/sdks/python/apache_beam/ml/transforms/base.py index e7498e3aad34f..8c192759c46e3 100644 --- a/sdks/python/apache_beam/ml/transforms/base.py +++ b/sdks/python/apache_beam/ml/transforms/base.py @@ -449,7 +449,7 @@ def __init__( self, transforms: List[MLTransformProvider], artifact_location: str, - artifact_mode: str, + artifact_mode: str = ArtifactMode.PRODUCE, pipeline_options: Optional[PipelineOptions] = None, ): self.transforms = transforms diff --git a/sdks/python/apache_beam/ml/transforms/base_test.py b/sdks/python/apache_beam/ml/transforms/base_test.py index 4e73a915adc50..6badf79369657 100644 --- a/sdks/python/apache_beam/ml/transforms/base_test.py +++ b/sdks/python/apache_beam/ml/transforms/base_test.py @@ -16,6 +16,7 @@ # # pytype: skip-file +import os import shutil import tempfile import typing @@ -41,6 +42,7 @@ # pylint: disable=wrong-import-order, wrong-import-position, ungrouped-imports try: from apache_beam.ml.transforms import tft + from apache_beam.ml.transforms.handlers import TFTProcessHandler from apache_beam.ml.transforms.tft import TFTOperation except ImportError: tft = None # type: ignore @@ -424,6 +426,21 @@ def test_handler_on_multiple_columns(self): equal_to(expected_data), ) + def test_handler_with_list_data(self): + data = [{ + 'x': ['Hello world', 'Apache Beam'], + }, { + 'x': ['Apache Beam', 'Hello world'], + }] + with self.assertRaises(TypeError): + with beam.Pipeline() as p: + _ = ( + p + | beam.Create(data) + | base.MLTransform( + write_artifact_location=self.artifact_location).with_transform( + self.embedding_conig)) + class TestUtilFunctions(unittest.TestCase): def test_list_of_dicts_to_dict_of_lists_normal(self): @@ -453,5 +470,89 @@ def test_dict_of_lists_to_lists_of_dict_unequal_length(self): base._convert_dict_of_lists_to_lists_of_dict(input_dict) +class TestJsonPickleTransformAttributeManager(unittest.TestCase): + def setUp(self): + self.attribute_manager = base._transform_attribute_manager + self.artifact_location = tempfile.mkdtemp() + + def tearDown(self): + shutil.rmtree(self.artifact_location) + + @unittest.skipIf(tft is None, 'tft module is not installed.') + def test_save_tft_process_handler(self): + transforms = [ + tft.ScaleTo01(columns=['x']), + tft.ComputeAndApplyVocabulary(columns=['y']) + ] + process_handler = TFTProcessHandler( + transforms=transforms, + artifact_location=self.artifact_location, + ) + self.attribute_manager.save_attributes( + ptransform_list=[process_handler], + artifact_location=self.artifact_location, + ) + + files = os.listdir(self.artifact_location) + self.assertTrue(len(files) == 1) + self.assertTrue(files[0] == base._ATTRIBUTE_FILE_NAME) + + def test_save_run_inference(self): + self.attribute_manager.save_attributes( + ptransform_list=[RunInference(model_handler=FakeModelHandler())], + artifact_location=self.artifact_location, + ) + files = 
os.listdir(self.artifact_location) + self.assertTrue(len(files) == 1) + self.assertTrue(files[0] == base._ATTRIBUTE_FILE_NAME) + + def test_save_and_load_run_inference(self): + ptransform_list = [RunInference(model_handler=FakeModelHandler())] + self.attribute_manager.save_attributes( + ptransform_list=ptransform_list, + artifact_location=self.artifact_location, + ) + loaded_ptransform_list = self.attribute_manager.load_attributes( + artifact_location=self.artifact_location, + ) + + self.assertTrue(len(loaded_ptransform_list) == len(ptransform_list)) + self.assertListEqual( + list(loaded_ptransform_list[0].__dict__.keys()), + list(ptransform_list[0].__dict__.keys())) + + get_keys = lambda x: list(x.__dict__.keys()) + for i, transform in enumerate(ptransform_list): + self.assertListEqual( + get_keys(transform), get_keys(loaded_ptransform_list[i])) + if hasattr(transform, 'model_handler'): + model_handler = transform.model_handler + loaded_model_handler = loaded_ptransform_list[i].model_handler + self.assertListEqual( + get_keys(model_handler), get_keys(loaded_model_handler)) + + def test_mltransform_to_ptransform_wrapper(self): + transforms = [ + FakeEmbeddingsManager(columns=['x']), + FakeEmbeddingsManager(columns=['y', 'z']), + ] + ptransform_mapper = base._MLTransformToPTransformMapper( + transforms=transforms, + artifact_location=self.artifact_location, + artifact_mode=None) + + ptransform_list = ptransform_mapper.create_ptransform_list() + self.assertTrue(len(ptransform_list) == 2) + + self.assertEqual(type(ptransform_list[0]), RunInference) + expected_columns = [['x'], ['y', 'z']] + for i in range(len(ptransform_list)): + self.assertEqual(type(ptransform_list[i]), RunInference) + self.assertEqual( + type(ptransform_list[i]._model_handler), base._TextEmbeddingHandler) + self.assertEqual( + ptransform_list[i]._model_handler.columns, expected_columns[i]) + + if __name__ == '__main__': unittest.main() diff --git a/sdks/python/apache_beam/ml/transforms/embeddings/vertex_ai_test.py b/sdks/python/apache_beam/ml/transforms/embeddings/vertex_ai_test.py index 7124aab9cbf23..388df7ae30da5 100644 --- a/sdks/python/apache_beam/ml/transforms/embeddings/vertex_ai_test.py +++ b/sdks/python/apache_beam/ml/transforms/embeddings/vertex_ai_test.py @@ -19,6 +19,8 @@ import unittest import apache_beam as beam +from apache_beam.ml.inference.base import RunInference +from apache_beam.ml.transforms import base from apache_beam.ml.transforms.base import MLTransform try: @@ -192,6 +194,53 @@ def assert_element(element): # 0.14797046780586243 | beam.Map(assert_element)) + def test_mltransform_to_ptransform_with_vertex(self): + model_name = 'textembedding-gecko@002' + transforms = [ + VertexAITextEmbeddings( + columns=['x'], + model_name=model_name, + task_type='RETRIEVAL_DOCUMENT'), + VertexAITextEmbeddings( + columns=['y', 'z'], model_name=model_name, task_type='CLUSTERING') + ] + ptransform_mapper = base._MLTransformToPTransformMapper( + transforms=transforms, + artifact_location=self.artifact_location, + artifact_mode=None) + + ptransform_list = ptransform_mapper.create_and_save_ptransform_list() + self.assertTrue(len(ptransform_list) == 2) + + self.assertEqual(type(ptransform_list[0]), RunInference) + expected_columns = [['x'], ['y', 'z']] + expected_task_type = ['RETRIEVAL_DOCUMENT', 'CLUSTERING'] + for i in range(len(ptransform_list)): + self.assertEqual(type(ptransform_list[i]), RunInference) + self.assertEqual( + type(ptransform_list[i]._model_handler), base._TextEmbeddingHandler) + self.assertEqual( + 
ptransform_list[i]._model_handler.columns, expected_columns[i]) + self.assertEqual( + ptransform_list[i]._model_handler._underlying.task_type, + expected_task_type[i]) + self.assertEqual( + ptransform_list[i]._model_handler._underlying.model_name, model_name) + ptransform_list = ( + base._MLTransformToPTransformMapper. + load_transforms_from_artifact_location(self.artifact_location)) + for i in range(len(ptransform_list)): + self.assertEqual(type(ptransform_list[i]), RunInference) + self.assertEqual( + type(ptransform_list[i]._model_handler), base._TextEmbeddingHandler) + self.assertEqual( + ptransform_list[i]._model_handler.columns, expected_columns[i]) + self.assertEqual( + ptransform_list[i]._model_handler._underlying.task_type, + expected_task_type[i]) + self.assertEqual( + ptransform_list[i]._model_handler._underlying.model_name, model_name) + if __name__ == '__main__': unittest.main() From 08b36658f706bd2e235c0b8b5ff02134aea63807 Mon Sep 17 00:00:00 2001 From: Anand Inguva <34158215+AnandInguva@users.noreply.github.com> Date: Wed, 6 Dec 2023 03:19:06 +0000 Subject: [PATCH 27/52] fix test --- sdks/python/apache_beam/ml/transforms/handlers_test.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/sdks/python/apache_beam/ml/transforms/handlers_test.py b/sdks/python/apache_beam/ml/transforms/handlers_test.py index 730f25fe8c141..f13a916824c4c 100644 --- a/sdks/python/apache_beam/ml/transforms/handlers_test.py +++ b/sdks/python/apache_beam/ml/transforms/handlers_test.py @@ -596,7 +596,7 @@ def test_handler_with_same_input_elements(self): transforms=[tft.ComputeAndApplyVocabulary(columns=['x'])], artifact_location=self.artifact_location, ) - transformed_data = process_handler.process_data(raw_data) + transformed_data = raw_data | process_handler expected_data = [ beam.Row(x=np.array([4])), From 91255adcdd67053bd5f811c5781e0a71cec1c8c2 Mon Sep 17 00:00:00 2001 From: Anand Inguva <34158215+AnandInguva@users.noreply.github.com> Date: Wed, 6 Dec 2023 16:09:59 +0000 Subject: [PATCH 28/52] Fix test --- sdks/python/apache_beam/ml/transforms/base.py | 2 +- sdks/python/apache_beam/ml/transforms/base_test.py | 3 +-- 2 files changed, 2 insertions(+), 3 deletions(-) diff --git a/sdks/python/apache_beam/ml/transforms/base.py b/sdks/python/apache_beam/ml/transforms/base.py index 8c192759c46e3..a174c328623ae 100644 --- a/sdks/python/apache_beam/ml/transforms/base.py +++ b/sdks/python/apache_beam/ml/transforms/base.py @@ -203,7 +203,7 @@ def __init__( *, write_artifact_location: Optional[str] = None, read_artifact_location: Optional[str] = None, - transforms: Optional[MLTransformProvider] = None): + transforms: Optional[List[MLTransformProvider]] = None): """ MLTransform is a Beam PTransform that can be used to apply transformations to the data. 
MLTransform is used to wrap the diff --git a/sdks/python/apache_beam/ml/transforms/base_test.py b/sdks/python/apache_beam/ml/transforms/base_test.py index 6badf79369657..2374d110e708c 100644 --- a/sdks/python/apache_beam/ml/transforms/base_test.py +++ b/sdks/python/apache_beam/ml/transforms/base_test.py @@ -288,8 +288,7 @@ class Add: def __call__(self, x): return x + 1 - with self.assertRaisesRegex( - TypeError, 'transform must be a subclass of MLTransformProvider'): + with self.assertRaisesRegex(TypeError, 'transform must be a subclass of'): with beam.Pipeline() as p: _ = ( p From c7237c3e204c07ed991b4404a48129c6d73f8736 Mon Sep 17 00:00:00 2001 From: Anand Inguva <34158215+AnandInguva@users.noreply.github.com> Date: Wed, 6 Dec 2023 17:06:12 +0000 Subject: [PATCH 29/52] Add more tests in sentence-transformer --- .../embeddings/sentence_transformer.py | 4 +- .../embeddings/sentence_transformer_test.py | 39 +++++++++++++++++++ 2 files changed, 41 insertions(+), 2 deletions(-) diff --git a/sdks/python/apache_beam/ml/transforms/embeddings/sentence_transformer.py b/sdks/python/apache_beam/ml/transforms/embeddings/sentence_transformer.py index 8eeaa67ce611c..044c4bb003763 100644 --- a/sdks/python/apache_beam/ml/transforms/embeddings/sentence_transformer.py +++ b/sdks/python/apache_beam/ml/transforms/embeddings/sentence_transformer.py @@ -51,7 +51,7 @@ def __init__( large_model: bool = False, **kwargs): self._max_seq_length = max_seq_length - self._model_uri = model_name + self.model_name = model_name self._model_class = model_class self._load_model_args = load_model_args self._min_batch_size = min_batch_size @@ -69,7 +69,7 @@ def run_inference( return model.encode(batch, **inference_args) def load_model(self): - model = self._model_class(self._model_uri, **self._load_model_args) + model = self._model_class(self.model_name, **self._load_model_args) if self._max_seq_length: model.max_seq_length = self._max_seq_length return model diff --git a/sdks/python/apache_beam/ml/transforms/embeddings/sentence_transformer_test.py b/sdks/python/apache_beam/ml/transforms/embeddings/sentence_transformer_test.py index 21289797133c0..f346b52fcaa88 100644 --- a/sdks/python/apache_beam/ml/transforms/embeddings/sentence_transformer_test.py +++ b/sdks/python/apache_beam/ml/transforms/embeddings/sentence_transformer_test.py @@ -21,6 +21,8 @@ from parameterized import parameterized import apache_beam as beam +from apache_beam.ml.inference.base import RunInference +from apache_beam.ml.transforms import base from apache_beam.ml.transforms.base import MLTransform from apache_beam.testing.util import assert_that from apache_beam.testing.util import equal_to @@ -232,6 +234,43 @@ def assert_element(element): | beam.Map(lambda x: x[test_query_column]) | beam.Map(assert_element)) + def test_mltransform_to_ptransform_with_vertex(self): + model_name = '' + transforms = [ + SentenceTransformerEmbeddings(columns=['x'], model_name=model_name), + SentenceTransformerEmbeddings( + columns=['y', 'z'], model_name=model_name) + ] + ptransform_mapper = base._MLTransformToPTransformMapper( + transforms=transforms, + artifact_location=self.artifact_location, + artifact_mode=None) + + ptransform_list = ptransform_mapper.create_and_save_ptransform_list() + self.assertTrue(len(ptransform_list) == 2) + + self.assertEqual(type(ptransform_list[0]), RunInference) + expected_columns = [['x'], ['y', 'z']] + for i in range(len(ptransform_list)): + self.assertEqual(type(ptransform_list[i]), RunInference) + self.assertEqual( + 
type(ptransform_list[i]._model_handler), base._TextEmbeddingHandler) + self.assertEqual( + ptransform_list[i]._model_handler.columns, expected_columns[i]) + self.assertEqual( + ptransform_list[i]._model_handler._underlying.model_name, model_name) + ptransform_list = ( + base._MLTransformToPTransformMapper. + load_transforms_from_artifact_location(self.artifact_location)) + for i in range(len(ptransform_list)): + self.assertEqual(type(ptransform_list[i]), RunInference) + self.assertEqual( + type(ptransform_list[i]._model_handler), base._TextEmbeddingHandler) + self.assertEqual( + ptransform_list[i]._model_handler.columns, expected_columns[i]) + self.assertEqual( + ptransform_list[i]._model_handler._underlying.model_name, model_name) + if __name__ == '__main__': unittest.main() From a9428855efc4541ba6873989036ab5b338806a1d Mon Sep 17 00:00:00 2001 From: Anand Inguva <34158215+AnandInguva@users.noreply.github.com> Date: Wed, 6 Dec 2023 18:50:41 +0000 Subject: [PATCH 30/52] use np.max instead of max --- .../ml/transforms/embeddings/sentence_transformer_test.py | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) diff --git a/sdks/python/apache_beam/ml/transforms/embeddings/sentence_transformer_test.py b/sdks/python/apache_beam/ml/transforms/embeddings/sentence_transformer_test.py index f346b52fcaa88..99b362674813c 100644 --- a/sdks/python/apache_beam/ml/transforms/embeddings/sentence_transformer_test.py +++ b/sdks/python/apache_beam/ml/transforms/embeddings/sentence_transformer_test.py @@ -18,6 +18,7 @@ import tempfile import unittest +import numpy as np from parameterized import parameterized import apache_beam as beam @@ -192,7 +193,7 @@ def test_with_gcs_artifact_location(self, inputs, model_name, output): embedding_config)) max_ele_pcoll = ( result_pcoll - | beam.Map(lambda x: round(max(x[test_query_column]), 4))) + | beam.Map(lambda x: round(np.max(x[test_query_column]), 4))) assert_that(max_ele_pcoll, equal_to(output)) @@ -204,7 +205,7 @@ def test_with_gcs_artifact_location(self, inputs, model_name, output): MLTransform(read_artifact_location=artifact_location)) max_ele_pcoll = ( result_pcoll - | beam.Map(lambda x: round(max(x[test_query_column]), 4))) + | beam.Map(lambda x: round(np.max(x[test_query_column]), 4))) assert_that(max_ele_pcoll, equal_to(output)) From 89c19fb912c9c1627175900891781b9145428d0b Mon Sep 17 00:00:00 2001 From: Anand Inguva <34158215+AnandInguva@users.noreply.github.com> Date: Wed, 6 Dec 2023 18:52:30 +0000 Subject: [PATCH 31/52] round to 2 decimals --- .../embeddings/sentence_transformer_test.py | 14 +++++++------- 1 file changed, 7 insertions(+), 7 deletions(-) diff --git a/sdks/python/apache_beam/ml/transforms/embeddings/sentence_transformer_test.py b/sdks/python/apache_beam/ml/transforms/embeddings/sentence_transformer_test.py index 99b362674813c..e2d108e8fefcc 100644 --- a/sdks/python/apache_beam/ml/transforms/embeddings/sentence_transformer_test.py +++ b/sdks/python/apache_beam/ml/transforms/embeddings/sentence_transformer_test.py @@ -55,10 +55,10 @@ }, { test_query_column: '样例数据-4' }], - 'BAAI/bge-base-en-v1.5', [0.1091, 0.122, 0.104, 0.1093]), + 'BAAI/bge-base-en-v1.5', [0.11, 0.12, 0.10, 0.11]), ([{ test_query_column: test_query, - }], DEFAULT_MODEL_NAME, [0.1342]), + }], DEFAULT_MODEL_NAME, [0.13]), ( [{ test_query_column: 'query: how much protein should a female eat', @@ -76,7 +76,7 @@ }], 'intfloat/e5-base-v2', # this model requires inputs to be specified as query: and passage: - [0.0982, 0.1033]), + [0.1, 0.1]), ] @@ -147,7 +147,7 @@ def 
test_embeddings_with_read_artifact_location(
           embedding_config))
       max_ele_pcoll = (
           result_pcoll
-          | beam.Map(lambda x: round(max(x[test_query_column]), 4)))
+          | beam.Map(lambda x: round(max(x[test_query_column]), 2)))

       assert_that(max_ele_pcoll, equal_to(output))

@@ -159,7 +159,7 @@ def test_embeddings_with_read_artifact_location(
           MLTransform(read_artifact_location=self.artifact_location))
       max_ele_pcoll = (
           result_pcoll
-          | beam.Map(lambda x: round(max(x[test_query_column]), 4)))
+          | beam.Map(lambda x: round(max(x[test_query_column]), 2)))

       assert_that(max_ele_pcoll, equal_to(output))

@@ -193,7 +193,7 @@ def test_with_gcs_artifact_location(self, inputs, model_name, output):
           embedding_config))
       max_ele_pcoll = (
           result_pcoll
-          | beam.Map(lambda x: round(np.max(x[test_query_column]), 4)))
+          | beam.Map(lambda x: round(np.max(x[test_query_column]), 2)))

       assert_that(max_ele_pcoll, equal_to(output))

@@ -205,7 +205,7 @@ def test_with_gcs_artifact_location(self, inputs, model_name, output):
           MLTransform(read_artifact_location=artifact_location))
       max_ele_pcoll = (
           result_pcoll
-          | beam.Map(lambda x: round(np.max(x[test_query_column]), 4)))
+          | beam.Map(lambda x: round(np.max(x[test_query_column]), 2)))

       assert_that(max_ele_pcoll, equal_to(output))

From 2db4a20e1871effb34147f5ff6c1ab4a2330e76d Mon Sep 17 00:00:00 2001
From: Anand Inguva
Date: Wed, 6 Dec 2023 14:00:01 -0500
Subject: [PATCH 32/52] Remove gradle command action

---
 .github/actions/gradle-command-action | 1 -
 1 file changed, 1 deletion(-)
 delete mode 160000 .github/actions/gradle-command-action

diff --git a/.github/actions/gradle-command-action b/.github/actions/gradle-command-action
deleted file mode 160000
index 90ccf054e6b99..0000000000000
--- a/.github/actions/gradle-command-action
+++ /dev/null
@@ -1 +0,0 @@
-Subproject commit 90ccf054e6b9905f30f98c938bce4c6acd323b6b

From b7a48d5af9e7783bbdd3f7ec027899c01343d762 Mon Sep 17 00:00:00 2001
From: Anand Inguva <34158215+AnandInguva@users.noreply.github.com>
Date: Wed, 6 Dec 2023 19:23:36 +0000
Subject: [PATCH 33/52] Refactor throwing dataflow client exception

---
 sdks/python/apache_beam/ml/transforms/base.py | 22 +++++++++++--------
 .../apache_beam/ml/transforms/base_test.py    |  9 ++++++++
 2 files changed, 22 insertions(+), 9 deletions(-)

diff --git a/sdks/python/apache_beam/ml/transforms/base.py b/sdks/python/apache_beam/ml/transforms/base.py
index a174c328623ae..c0eadedc85ac5 100644
--- a/sdks/python/apache_beam/ml/transforms/base.py
+++ b/sdks/python/apache_beam/ml/transforms/base.py
@@ -408,15 +408,19 @@ def save_attributes(
         _LOGGER.info('Creating artifact location: %s', artifact_location)
         # pipeline options are required for the client to configure the project.
         options = kwargs.get('options')
-        if not options:
-          raise RuntimeError(
-              'pipeline options are required to save the attributes '
-              'in the artifact location %s' % artifact_location)
-        apiclient.DataflowApplicationClient(options=options).stage_file(
-            gcs_or_local_path=artifact_location,
-            file_name=_ATTRIBUTE_FILE_NAME,
-            stream=f,
-            mime_type='application/json')
+        try:
+          apiclient.DataflowApplicationClient(options=options).stage_file(
+              gcs_or_local_path=artifact_location,
+              file_name=_ATTRIBUTE_FILE_NAME,
+              stream=f,
+              mime_type='application/json')
+        except Exception as exc:
+          if not options:
+            raise RuntimeError(
+                "Failed to create Dataflow client. "
+                "Pipeline options are required to save the attributes "
+ "in the artifact location %s" % artifact_location) from exc + raise else: if not FileSystems.exists(artifact_location): FileSystems.mkdirs(artifact_location) diff --git a/sdks/python/apache_beam/ml/transforms/base_test.py b/sdks/python/apache_beam/ml/transforms/base_test.py index 2374d110e708c..6b381ba272487 100644 --- a/sdks/python/apache_beam/ml/transforms/base_test.py +++ b/sdks/python/apache_beam/ml/transforms/base_test.py @@ -552,6 +552,15 @@ def test_mltransform_to_ptransform_wrapper(self): self.assertEqual( ptransform_list[i]._model_handler.columns, expected_columns[i]) + def test_with_gcs_location_with_none_options(self): + path = 'gs://fake_path' + with self.assertRaises(RuntimeError): + self.attribute_manager.save_attributes( + ptransform_list=[], artifact_location=path, options=None) + with self.assertRaises(RuntimeError): + self.attribute_manager.save_attributes( + ptransform_list=[], artifact_location=path) + if __name__ == '__main__': unittest.main() From bad1b3b7ff892463957a12b322003bf37a1a188b Mon Sep 17 00:00:00 2001 From: Anand Inguva <34158215+AnandInguva@users.noreply.github.com> Date: Wed, 6 Dec 2023 20:23:46 +0000 Subject: [PATCH 34/52] skip the test if gcp is not installed --- sdks/python/apache_beam/ml/transforms/base_test.py | 6 ++++++ 1 file changed, 6 insertions(+) diff --git a/sdks/python/apache_beam/ml/transforms/base_test.py b/sdks/python/apache_beam/ml/transforms/base_test.py index 6b381ba272487..8e1515e7ece1a 100644 --- a/sdks/python/apache_beam/ml/transforms/base_test.py +++ b/sdks/python/apache_beam/ml/transforms/base_test.py @@ -59,6 +59,11 @@ def apply_transform(self, inputs, output_column_name, **kwargs): except: # pylint: disable=bare-except pass +try: + from apache_beam.runners.dataflow.internal import apiclient +except ImportError: + apiclient = None # type: ignore + class BaseMLTransformTest(unittest.TestCase): def setUp(self) -> None: @@ -552,6 +557,7 @@ def test_mltransform_to_ptransform_wrapper(self): self.assertEqual( ptransform_list[i]._model_handler.columns, expected_columns[i]) + @unittest.skipIf(apiclient is None, 'apache_beam[gcp] is not installed.') def test_with_gcs_location_with_none_options(self): path = 'gs://fake_path' with self.assertRaises(RuntimeError): From b850cee2260c2131bbceb1e536dfc250684e5d83 Mon Sep 17 00:00:00 2001 From: Anand Inguva <34158215+AnandInguva@users.noreply.github.com> Date: Wed, 6 Dec 2023 22:19:06 +0000 Subject: [PATCH 35/52] remove toxTests for hub --- .../ml/transforms/embeddings/sentence_transformer_test.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/sdks/python/apache_beam/ml/transforms/embeddings/sentence_transformer_test.py b/sdks/python/apache_beam/ml/transforms/embeddings/sentence_transformer_test.py index e2d108e8fefcc..832d9b77932f7 100644 --- a/sdks/python/apache_beam/ml/transforms/embeddings/sentence_transformer_test.py +++ b/sdks/python/apache_beam/ml/transforms/embeddings/sentence_transformer_test.py @@ -76,7 +76,7 @@ }], 'intfloat/e5-base-v2', # this model requires inputs to be specified as query: and passage: - [0.1, 0.1]), + [0.12, 0.13]), ] From ffff21a1386cfd8ddadc74cacb5b460032b6228e Mon Sep 17 00:00:00 2001 From: Anand Inguva <34158215+AnandInguva@users.noreply.github.com> Date: Wed, 6 Dec 2023 22:19:46 +0000 Subject: [PATCH 36/52] remove toxTests for hub --- sdks/python/test-suites/tox/py38/build.gradle | 8 -------- 1 file changed, 8 deletions(-) diff --git a/sdks/python/test-suites/tox/py38/build.gradle b/sdks/python/test-suites/tox/py38/build.gradle index 
c4fd300ca9435..a5a624b998e8d 100644 --- a/sdks/python/test-suites/tox/py38/build.gradle +++ b/sdks/python/test-suites/tox/py38/build.gradle @@ -145,14 +145,6 @@ toxTask "testPy38sentenceTransformers-222", "py38-sentence-transformers-222", "$ test.dependsOn "testPy38sentenceTransformers-222" preCommitPyCoverage.dependsOn "testPy38sentenceTransformers-222" -toxTask "testPy38tensorflowHub-014", "py38-tfhub-014", "${posargs}" -test.dependsOn "testPy38tensorflowHub-014" -preCommitPyCoverage.dependsOn "testPy38tensorflowHub-014" - -toxTask "testPy38tensorflowHub-015", "py38-tfhub-015", "${posargs}" -test.dependsOn "testPy38tensorflowHub-015" -preCommitPyCoverage.dependsOn "testPy38tensorflowHub-015" - toxTask "whitespacelint", "whitespacelint", "${posargs}" task archiveFilesToLint(type: Zip) { From 88412ea622bd8f329f55b49e16afccf0b8799fd4 Mon Sep 17 00:00:00 2001 From: Anand Inguva <34158215+AnandInguva@users.noreply.github.com> Date: Thu, 7 Dec 2023 18:14:15 +0000 Subject: [PATCH 37/52] Fix values in assert for sentence_transformer_test --- .../ml/transforms/embeddings/sentence_transformer_test.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/sdks/python/apache_beam/ml/transforms/embeddings/sentence_transformer_test.py b/sdks/python/apache_beam/ml/transforms/embeddings/sentence_transformer_test.py index 832d9b77932f7..e2d108e8fefcc 100644 --- a/sdks/python/apache_beam/ml/transforms/embeddings/sentence_transformer_test.py +++ b/sdks/python/apache_beam/ml/transforms/embeddings/sentence_transformer_test.py @@ -76,7 +76,7 @@ }], 'intfloat/e5-base-v2', # this model requires inputs to be specified as query: and passage: - [0.12, 0.13]), + [0.1, 0.1]), ] From 617f9d6e97acd381c2e95794e9ef1016a4a3c4c7 Mon Sep 17 00:00:00 2001 From: Anand Inguva <34158215+AnandInguva@users.noreply.github.com> Date: Thu, 7 Dec 2023 18:19:35 +0000 Subject: [PATCH 38/52] rename sentence_transformers to huggingface --- .../embeddings/{sentence_transformer.py => huggingface.py} | 0 .../{sentence_transformer_test.py => huggingface_test.py} | 2 +- 2 files changed, 1 insertion(+), 1 deletion(-) rename sdks/python/apache_beam/ml/transforms/embeddings/{sentence_transformer.py => huggingface.py} (100%) rename sdks/python/apache_beam/ml/transforms/embeddings/{sentence_transformer_test.py => huggingface_test.py} (98%) diff --git a/sdks/python/apache_beam/ml/transforms/embeddings/sentence_transformer.py b/sdks/python/apache_beam/ml/transforms/embeddings/huggingface.py similarity index 100% rename from sdks/python/apache_beam/ml/transforms/embeddings/sentence_transformer.py rename to sdks/python/apache_beam/ml/transforms/embeddings/huggingface.py diff --git a/sdks/python/apache_beam/ml/transforms/embeddings/sentence_transformer_test.py b/sdks/python/apache_beam/ml/transforms/embeddings/huggingface_test.py similarity index 98% rename from sdks/python/apache_beam/ml/transforms/embeddings/sentence_transformer_test.py rename to sdks/python/apache_beam/ml/transforms/embeddings/huggingface_test.py index e2d108e8fefcc..d8b77ba2d83e8 100644 --- a/sdks/python/apache_beam/ml/transforms/embeddings/sentence_transformer_test.py +++ b/sdks/python/apache_beam/ml/transforms/embeddings/huggingface_test.py @@ -30,7 +30,7 @@ # pylint: disable=ungrouped-imports try: - from apache_beam.ml.transforms.embeddings.sentence_transformer import SentenceTransformerEmbeddings + from apache_beam.ml.transforms.embeddings.huggingface import SentenceTransformerEmbeddings import torch except ImportError: SentenceTransformerEmbeddings = None # type: 
ignore From 5cae04b2dddcfa426f3d64cf52235f9952c718ba Mon Sep 17 00:00:00 2001 From: Anand Inguva <34158215+AnandInguva@users.noreply.github.com> Date: Thu, 7 Dec 2023 18:34:30 +0000 Subject: [PATCH 39/52] fix pydocs --- .../apache_beam/ml/transforms/embeddings/huggingface.py | 1 + sdks/python/apache_beam/ml/transforms/embeddings/vertex_ai.py | 4 ++-- 2 files changed, 3 insertions(+), 2 deletions(-) diff --git a/sdks/python/apache_beam/ml/transforms/embeddings/huggingface.py b/sdks/python/apache_beam/ml/transforms/embeddings/huggingface.py index 044c4bb003763..e979296b0b830 100644 --- a/sdks/python/apache_beam/ml/transforms/embeddings/huggingface.py +++ b/sdks/python/apache_beam/ml/transforms/embeddings/huggingface.py @@ -97,6 +97,7 @@ def __init__( Embedding config for sentence-transformers. This config can be used with MLTransform to embed text data. Models are loaded using the RunInference PTransform with the help of ModelHandler. + Args: model_name: Name of the model to use. The model should be hosted on HuggingFace Hub or compatible with sentence_transformers. diff --git a/sdks/python/apache_beam/ml/transforms/embeddings/vertex_ai.py b/sdks/python/apache_beam/ml/transforms/embeddings/vertex_ai.py index 297549d4f3284..c0b7aa41819ec 100644 --- a/sdks/python/apache_beam/ml/transforms/embeddings/vertex_ai.py +++ b/sdks/python/apache_beam/ml/transforms/embeddings/vertex_ai.py @@ -119,7 +119,7 @@ def __init__( model_name: The name of the Vertex AI Text Embedding model. columns: The columns containing the text to be embedded. task_type: The name of the downstream task the embeddings will be used for. - Valid values: + Valid values are listed below. RETRIEVAL_QUERY Specifies the given text is a query in a search/retrieval setting. RETRIEVAL_DOCUMENT @@ -129,7 +129,7 @@ def __init__( CLASSIFICATION Specifies that the given text will be classified. CLUSTERING - Specifies that the embeddings will be used for clustering. + Specifies that the embeddings will be used for clustering. title: Optional identifier of the text content. project: The default GCP project to make Vertex API calls. location: The default location to use when making API calls. 
From 489200f20dfee5cdd0b596f30ce83f225679d13e Mon Sep 17 00:00:00 2001 From: Anand Inguva <34158215+AnandInguva@users.noreply.github.com> Date: Thu, 7 Dec 2023 20:55:57 +0000 Subject: [PATCH 40/52] Change the model name for tests since it is getting different results on different machines --- .../ml/transforms/embeddings/huggingface_test.py | 10 +++------- 1 file changed, 3 insertions(+), 7 deletions(-) diff --git a/sdks/python/apache_beam/ml/transforms/embeddings/huggingface_test.py b/sdks/python/apache_beam/ml/transforms/embeddings/huggingface_test.py index d8b77ba2d83e8..d932eb1212dfa 100644 --- a/sdks/python/apache_beam/ml/transforms/embeddings/huggingface_test.py +++ b/sdks/python/apache_beam/ml/transforms/embeddings/huggingface_test.py @@ -47,15 +47,11 @@ DEFAULT_MODEL_NAME = "sentence-transformers/all-mpnet-base-v2" _parameterized_inputs = [ ([{ - test_query_column: '样例数据-1' + test_query_column: 'That is a happy person' }, { - test_query_column: '样例数据-2' - }, { - test_query_column: '样例数据-3' - }, { - test_query_column: '样例数据-4' + test_query_column: 'That is a very happy person' }], - 'BAAI/bge-base-en-v1.5', [0.11, 0.12, 0.10, 0.11]), + 'thenlper/gte-base', [0.11, 0.11]), ([{ test_query_column: test_query, }], DEFAULT_MODEL_NAME, [0.13]), From 816174a26e2002111405119d18a7d890c7abfafc Mon Sep 17 00:00:00 2001 From: Anand Inguva <34158215+AnandInguva@users.noreply.github.com> Date: Thu, 7 Dec 2023 21:00:04 +0000 Subject: [PATCH 41/52] Fix pydoc in vertexai --- .../ml/transforms/embeddings/vertex_ai.py | 27 ++++++------------- 1 file changed, 8 insertions(+), 19 deletions(-) diff --git a/sdks/python/apache_beam/ml/transforms/embeddings/vertex_ai.py b/sdks/python/apache_beam/ml/transforms/embeddings/vertex_ai.py index c0b7aa41819ec..843f15293a130 100644 --- a/sdks/python/apache_beam/ml/transforms/embeddings/vertex_ai.py +++ b/sdks/python/apache_beam/ml/transforms/embeddings/vertex_ai.py @@ -118,25 +118,14 @@ def __init__( Args: model_name: The name of the Vertex AI Text Embedding model. columns: The columns containing the text to be embedded. - task_type: The name of the downstream task the embeddings will be used for. - Valid values are listed below. - RETRIEVAL_QUERY - Specifies the given text is a query in a search/retrieval setting. - RETRIEVAL_DOCUMENT - Specifies the given text is a document from the corpus being searched. - SEMANTIC_SIMILARITY - Specifies the given text will be used for STS. - CLASSIFICATION - Specifies that the given text will be classified. - CLUSTERING - Specifies that the embeddings will be used for clustering. - title: Optional identifier of the text content. - project: The default GCP project to make Vertex API calls. - location: The default location to use when making API calls. - credentials: The default custom - credentials to use when making API calls. If not provided credentials - will be ascertained from the environment. - + task_type: The downstream task for the embeddings. + Valid values: RETRIEVAL_QUERY, RETRIEVAL_DOCUMENT, + SEMANTIC_SIMILARITY, CLASSIFICATION, CLUSTERING. + title: Identifier of the text content. + project: The default GCP project for API calls. + location: The default location for API calls. + credentials: Custom credentials for API calls. + Defaults to environment credentials. 
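+
+    Example (an illustrative sketch; assumes ``artifact_location`` points to
+    a valid local or GCS directory)::
+
+      embedding_config = VertexAITextEmbeddings(
+          model_name='textembedding-gecko@002',
+          columns=['text'],
+          task_type='RETRIEVAL_DOCUMENT')
+      with beam.Pipeline() as pipeline:
+        _ = (
+            pipeline
+            | beam.Create([{'text': 'Hello world'}])
+            | MLTransform(
+                write_artifact_location=artifact_location).with_transform(
+                    embedding_config))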
""" self.model_name = model_name self.project = project From cfb18831abc458ed5ed987bcd729d2f06d9710c1 Mon Sep 17 00:00:00 2001 From: Anand Inguva <34158215+AnandInguva@users.noreply.github.com> Date: Fri, 8 Dec 2023 00:55:03 +0000 Subject: [PATCH 42/52] add suffix to artifact_location --- .../transforms/embeddings/huggingface_test.py | 68 +++++++++---------- .../ml/transforms/embeddings/vertex_ai.py | 2 +- 2 files changed, 33 insertions(+), 37 deletions(-) diff --git a/sdks/python/apache_beam/ml/transforms/embeddings/huggingface_test.py b/sdks/python/apache_beam/ml/transforms/embeddings/huggingface_test.py index d932eb1212dfa..516c13a2d60be 100644 --- a/sdks/python/apache_beam/ml/transforms/embeddings/huggingface_test.py +++ b/sdks/python/apache_beam/ml/transforms/embeddings/huggingface_test.py @@ -14,9 +14,11 @@ # See the License for the specific language governing permissions and # limitations under the License. +import os import shutil import tempfile import unittest +import uuid import numpy as np from parameterized import parameterized @@ -55,24 +57,12 @@ ([{ test_query_column: test_query, }], DEFAULT_MODEL_NAME, [0.13]), - ( - [{ - test_query_column: 'query: how much protein should a female eat', - }, - { - test_query_column: ( - "passage: As a general guideline, the CDC's " - "average requirement of protein for women " - "ages 19 to 70 is 46 grams per day. But, " - "as you can see from this chart, you'll need " - "to increase that if you're expecting or training" - " for a marathon. Check out the chart below " - "to see how much protein " - "you should be eating each day.") - }], - 'intfloat/e5-base-v2', - # this model requires inputs to be specified as query: and passage: - [0.1, 0.1]), + ([{ + test_query_column: 'This is an example sentence', + }, { + test_query_column: ("Each sentence is converted") + }], + 'sentence-transformers/all-MiniLM-L6-v2', [0.15, 0.14]), ] @@ -88,6 +78,7 @@ def tearDown(self) -> None: def test_sentence_transformer_embeddings(self): model_name = DEFAULT_MODEL_NAME + artifact_location = os.path.join(self.artifact_location, uuid.uuid4().hex) embedding_config = SentenceTransformerEmbeddings( model_name=model_name, columns=[test_query_column]) with beam.Pipeline() as pipeline: @@ -96,9 +87,9 @@ def test_sentence_transformer_embeddings(self): | "CreateData" >> beam.Create([{ test_query_column: test_query }]) - | "MLTransform" >> MLTransform( - write_artifact_location=self.artifact_location).with_transform( - embedding_config)) + | "MLTransform" >> + MLTransform(write_artifact_location=artifact_location).with_transform( + embedding_config)) def assert_element(element): assert len(element[test_query_column]) == 768 @@ -107,6 +98,7 @@ def assert_element(element): @unittest.skipIf(tft is None, 'Tensorflow Transform is not installed.') def test_embeddings_with_scale_to_0_1(self): + artifact_location = os.path.join(self.artifact_location, uuid.uuid4().hex) model_name = DEFAULT_MODEL_NAME embedding_config = SentenceTransformerEmbeddings( model_name=model_name, @@ -118,10 +110,10 @@ def test_embeddings_with_scale_to_0_1(self): | "CreateData" >> beam.Create([{ test_query_column: test_query }]) - | "MLTransform" >> MLTransform( - write_artifact_location=self.artifact_location).with_transform( - embedding_config).with_transform( - ScaleTo01(columns=[test_query_column]))) + | "MLTransform" >> + MLTransform(write_artifact_location=artifact_location).with_transform( + embedding_config).with_transform( + ScaleTo01(columns=[test_query_column]))) def assert_element(element): 
assert max(element.feature_1) == 1 @@ -134,13 +126,14 @@ def test_embeddings_with_read_artifact_location( embedding_config = SentenceTransformerEmbeddings( model_name=model_name, columns=[test_query_column]) + artifact_location = os.path.join(self.artifact_location, uuid.uuid4().hex) with beam.Pipeline() as p: result_pcoll = ( p | "CreateData" >> beam.Create(inputs) - | "MLTransform" >> MLTransform( - write_artifact_location=self.artifact_location).with_transform( - embedding_config)) + | "MLTransform" >> + MLTransform(write_artifact_location=artifact_location).with_transform( + embedding_config)) max_ele_pcoll = ( result_pcoll | beam.Map(lambda x: round(max(x[test_query_column]), 2))) @@ -152,7 +145,7 @@ def test_embeddings_with_read_artifact_location( p | "CreateData" >> beam.Create(inputs) | "MLTransform" >> - MLTransform(read_artifact_location=self.artifact_location)) + MLTransform(read_artifact_location=artifact_location)) max_ele_pcoll = ( result_pcoll | beam.Map(lambda x: round(max(x[test_query_column]), 2))) @@ -161,6 +154,7 @@ def test_embeddings_with_read_artifact_location( def test_sentence_transformer_with_int_data_types(self): model_name = DEFAULT_MODEL_NAME + artifact_location = os.path.join(self.artifact_location, uuid.uuid4().hex) embedding_config = SentenceTransformerEmbeddings( model_name=model_name, columns=[test_query_column]) with self.assertRaises(TypeError): @@ -171,12 +165,13 @@ def test_sentence_transformer_with_int_data_types(self): test_query_column: 1 }]) | "MLTransform" >> MLTransform( - write_artifact_location=self.artifact_location).with_transform( + write_artifact_location=artifact_location).with_transform( embedding_config)) @parameterized.expand(_parameterized_inputs) def test_with_gcs_artifact_location(self, inputs, model_name, output): - artifact_location = ('gs://apache-beam-ml/testing/sentence_transformers') + artifact_location = os.path.join( + 'gs://apache-beam-ml/testing/sentence_transformers', uuid.uuid4().hex) embedding_config = SentenceTransformerEmbeddings( model_name=model_name, columns=[test_query_column]) @@ -207,7 +202,7 @@ def test_with_gcs_artifact_location(self, inputs, model_name, output): def test_embeddings_with_inference_args(self): model_name = DEFAULT_MODEL_NAME - + artifact_location = os.path.join(self.artifact_location, uuid.uuid4().hex) inference_args = {'convert_to_numpy': False} embedding_config = SentenceTransformerEmbeddings( model_name=model_name, @@ -219,9 +214,9 @@ def test_embeddings_with_inference_args(self): | "CreateData" >> beam.Create([{ test_query_column: test_query }]) - | "MLTransform" >> MLTransform( - write_artifact_location=self.artifact_location).with_transform( - embedding_config)) + | "MLTransform" >> + MLTransform(write_artifact_location=artifact_location).with_transform( + embedding_config)) def assert_element(element): assert type(element) == torch.Tensor @@ -233,6 +228,7 @@ def assert_element(element): def test_mltransform_to_ptransform_with_vertex(self): model_name = '' + artifact_location = os.path.join(self.artifact_location, uuid.uuid4().hex) transforms = [ SentenceTransformerEmbeddings(columns=['x'], model_name=model_name), SentenceTransformerEmbeddings( @@ -240,7 +236,7 @@ def test_mltransform_to_ptransform_with_vertex(self): ] ptransform_mapper = base._MLTransformToPTransformMapper( transforms=transforms, - artifact_location=self.artifact_location, + artifact_location=artifact_location, artifact_mode=None) ptransform_list = ptransform_mapper.create_and_save_ptransform_list() diff --git 
a/sdks/python/apache_beam/ml/transforms/embeddings/vertex_ai.py b/sdks/python/apache_beam/ml/transforms/embeddings/vertex_ai.py index 843f15293a130..2dacb6f88b71f 100644 --- a/sdks/python/apache_beam/ml/transforms/embeddings/vertex_ai.py +++ b/sdks/python/apache_beam/ml/transforms/embeddings/vertex_ai.py @@ -119,7 +119,7 @@ def __init__( model_name: The name of the Vertex AI Text Embedding model. columns: The columns containing the text to be embedded. task_type: The downstream task for the embeddings. - Valid values: RETRIEVAL_QUERY, RETRIEVAL_DOCUMENT, + Valid values are RETRIEVAL_QUERY, RETRIEVAL_DOCUMENT, SEMANTIC_SIMILARITY, CLASSIFICATION, CLUSTERING. title: Identifier of the text content. project: The default GCP project for API calls. From 2cb6f03763d18dff1c86cf4f8f5ef8cb47457233 Mon Sep 17 00:00:00 2001 From: Anand Inguva <34158215+AnandInguva@users.noreply.github.com> Date: Fri, 8 Dec 2023 01:14:09 +0000 Subject: [PATCH 43/52] Revert "add suffix to artifact_location" This reverts commit cfb18831abc458ed5ed987bcd729d2f06d9710c1. --- .../transforms/embeddings/huggingface_test.py | 68 ++++++++++--------- .../ml/transforms/embeddings/vertex_ai.py | 2 +- 2 files changed, 37 insertions(+), 33 deletions(-) diff --git a/sdks/python/apache_beam/ml/transforms/embeddings/huggingface_test.py b/sdks/python/apache_beam/ml/transforms/embeddings/huggingface_test.py index 516c13a2d60be..d932eb1212dfa 100644 --- a/sdks/python/apache_beam/ml/transforms/embeddings/huggingface_test.py +++ b/sdks/python/apache_beam/ml/transforms/embeddings/huggingface_test.py @@ -14,11 +14,9 @@ # See the License for the specific language governing permissions and # limitations under the License. -import os import shutil import tempfile import unittest -import uuid import numpy as np from parameterized import parameterized @@ -57,12 +55,24 @@ ([{ test_query_column: test_query, }], DEFAULT_MODEL_NAME, [0.13]), - ([{ - test_query_column: 'This is an example sentence', - }, { - test_query_column: ("Each sentence is converted") - }], - 'sentence-transformers/all-MiniLM-L6-v2', [0.15, 0.14]), + ( + [{ + test_query_column: 'query: how much protein should a female eat', + }, + { + test_query_column: ( + "passage: As a general guideline, the CDC's " + "average requirement of protein for women " + "ages 19 to 70 is 46 grams per day. But, " + "as you can see from this chart, you'll need " + "to increase that if you're expecting or training" + " for a marathon. 
Check out the chart below " + "to see how much protein " + "you should be eating each day.") + }], + 'intfloat/e5-base-v2', + # this model requires inputs to be specified as query: and passage: + [0.1, 0.1]), ] @@ -78,7 +88,6 @@ def tearDown(self) -> None: def test_sentence_transformer_embeddings(self): model_name = DEFAULT_MODEL_NAME - artifact_location = os.path.join(self.artifact_location, uuid.uuid4().hex) embedding_config = SentenceTransformerEmbeddings( model_name=model_name, columns=[test_query_column]) with beam.Pipeline() as pipeline: @@ -87,9 +96,9 @@ def test_sentence_transformer_embeddings(self): | "CreateData" >> beam.Create([{ test_query_column: test_query }]) - | "MLTransform" >> - MLTransform(write_artifact_location=artifact_location).with_transform( - embedding_config)) + | "MLTransform" >> MLTransform( + write_artifact_location=self.artifact_location).with_transform( + embedding_config)) def assert_element(element): assert len(element[test_query_column]) == 768 @@ -98,7 +107,6 @@ def assert_element(element): @unittest.skipIf(tft is None, 'Tensorflow Transform is not installed.') def test_embeddings_with_scale_to_0_1(self): - artifact_location = os.path.join(self.artifact_location, uuid.uuid4().hex) model_name = DEFAULT_MODEL_NAME embedding_config = SentenceTransformerEmbeddings( model_name=model_name, @@ -110,10 +118,10 @@ def test_embeddings_with_scale_to_0_1(self): | "CreateData" >> beam.Create([{ test_query_column: test_query }]) - | "MLTransform" >> - MLTransform(write_artifact_location=artifact_location).with_transform( - embedding_config).with_transform( - ScaleTo01(columns=[test_query_column]))) + | "MLTransform" >> MLTransform( + write_artifact_location=self.artifact_location).with_transform( + embedding_config).with_transform( + ScaleTo01(columns=[test_query_column]))) def assert_element(element): assert max(element.feature_1) == 1 @@ -126,14 +134,13 @@ def test_embeddings_with_read_artifact_location( embedding_config = SentenceTransformerEmbeddings( model_name=model_name, columns=[test_query_column]) - artifact_location = os.path.join(self.artifact_location, uuid.uuid4().hex) with beam.Pipeline() as p: result_pcoll = ( p | "CreateData" >> beam.Create(inputs) - | "MLTransform" >> - MLTransform(write_artifact_location=artifact_location).with_transform( - embedding_config)) + | "MLTransform" >> MLTransform( + write_artifact_location=self.artifact_location).with_transform( + embedding_config)) max_ele_pcoll = ( result_pcoll | beam.Map(lambda x: round(max(x[test_query_column]), 2))) @@ -145,7 +152,7 @@ def test_embeddings_with_read_artifact_location( p | "CreateData" >> beam.Create(inputs) | "MLTransform" >> - MLTransform(read_artifact_location=artifact_location)) + MLTransform(read_artifact_location=self.artifact_location)) max_ele_pcoll = ( result_pcoll | beam.Map(lambda x: round(max(x[test_query_column]), 2))) @@ -154,7 +161,6 @@ def test_embeddings_with_read_artifact_location( def test_sentence_transformer_with_int_data_types(self): model_name = DEFAULT_MODEL_NAME - artifact_location = os.path.join(self.artifact_location, uuid.uuid4().hex) embedding_config = SentenceTransformerEmbeddings( model_name=model_name, columns=[test_query_column]) with self.assertRaises(TypeError): @@ -165,13 +171,12 @@ def test_sentence_transformer_with_int_data_types(self): test_query_column: 1 }]) | "MLTransform" >> MLTransform( - write_artifact_location=artifact_location).with_transform( + write_artifact_location=self.artifact_location).with_transform( embedding_config)) 
@parameterized.expand(_parameterized_inputs) def test_with_gcs_artifact_location(self, inputs, model_name, output): - artifact_location = os.path.join( - 'gs://apache-beam-ml/testing/sentence_transformers', uuid.uuid4().hex) + artifact_location = ('gs://apache-beam-ml/testing/sentence_transformers') embedding_config = SentenceTransformerEmbeddings( model_name=model_name, columns=[test_query_column]) @@ -202,7 +207,7 @@ def test_with_gcs_artifact_location(self, inputs, model_name, output): def test_embeddings_with_inference_args(self): model_name = DEFAULT_MODEL_NAME - artifact_location = os.path.join(self.artifact_location, uuid.uuid4().hex) + inference_args = {'convert_to_numpy': False} embedding_config = SentenceTransformerEmbeddings( model_name=model_name, @@ -214,9 +219,9 @@ def test_embeddings_with_inference_args(self): | "CreateData" >> beam.Create([{ test_query_column: test_query }]) - | "MLTransform" >> - MLTransform(write_artifact_location=artifact_location).with_transform( - embedding_config)) + | "MLTransform" >> MLTransform( + write_artifact_location=self.artifact_location).with_transform( + embedding_config)) def assert_element(element): assert type(element) == torch.Tensor @@ -228,7 +233,6 @@ def assert_element(element): def test_mltransform_to_ptransform_with_vertex(self): model_name = '' - artifact_location = os.path.join(self.artifact_location, uuid.uuid4().hex) transforms = [ SentenceTransformerEmbeddings(columns=['x'], model_name=model_name), SentenceTransformerEmbeddings( @@ -236,7 +240,7 @@ def test_mltransform_to_ptransform_with_vertex(self): ] ptransform_mapper = base._MLTransformToPTransformMapper( transforms=transforms, - artifact_location=artifact_location, + artifact_location=self.artifact_location, artifact_mode=None) ptransform_list = ptransform_mapper.create_and_save_ptransform_list() diff --git a/sdks/python/apache_beam/ml/transforms/embeddings/vertex_ai.py b/sdks/python/apache_beam/ml/transforms/embeddings/vertex_ai.py index 2dacb6f88b71f..843f15293a130 100644 --- a/sdks/python/apache_beam/ml/transforms/embeddings/vertex_ai.py +++ b/sdks/python/apache_beam/ml/transforms/embeddings/vertex_ai.py @@ -119,7 +119,7 @@ def __init__( model_name: The name of the Vertex AI Text Embedding model. columns: The columns containing the text to be embedded. task_type: The downstream task for the embeddings. - Valid values are RETRIEVAL_QUERY, RETRIEVAL_DOCUMENT, + Valid values: RETRIEVAL_QUERY, RETRIEVAL_DOCUMENT, SEMANTIC_SIMILARITY, CLASSIFICATION, CLUSTERING. title: Identifier of the text content. project: The default GCP project for API calls. 
From cd7050e693c693de4063a632a95d4b0ff5ac84e3 Mon Sep 17 00:00:00 2001 From: Anand Inguva <34158215+AnandInguva@users.noreply.github.com> Date: Fri, 8 Dec 2023 01:17:08 +0000 Subject: [PATCH 44/52] add no_xdist --- .../apache_beam/ml/transforms/embeddings/huggingface_test.py | 2 ++ 1 file changed, 2 insertions(+) diff --git a/sdks/python/apache_beam/ml/transforms/embeddings/huggingface_test.py b/sdks/python/apache_beam/ml/transforms/embeddings/huggingface_test.py index d932eb1212dfa..5e98f77d2deb5 100644 --- a/sdks/python/apache_beam/ml/transforms/embeddings/huggingface_test.py +++ b/sdks/python/apache_beam/ml/transforms/embeddings/huggingface_test.py @@ -19,6 +19,7 @@ import unittest import numpy as np +import pytest from parameterized import parameterized import apache_beam as beam @@ -79,6 +80,7 @@ @unittest.skipIf( SentenceTransformerEmbeddings is None, 'sentence-transformers is not installed.') +@pytest.mark.no_xdist class SentenceTrasformerEmbeddingsTest(unittest.TestCase): def setUp(self) -> None: self.artifact_location = tempfile.mkdtemp() From 98cd949bc408a18c2184fcf014531a4e89f7a0f8 Mon Sep 17 00:00:00 2001 From: Anand Inguva <34158215+AnandInguva@users.noreply.github.com> Date: Fri, 8 Dec 2023 01:17:48 +0000 Subject: [PATCH 45/52] Try fixing pydoc for vertexai --- sdks/python/apache_beam/ml/transforms/embeddings/vertex_ai.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/sdks/python/apache_beam/ml/transforms/embeddings/vertex_ai.py b/sdks/python/apache_beam/ml/transforms/embeddings/vertex_ai.py index 843f15293a130..2dacb6f88b71f 100644 --- a/sdks/python/apache_beam/ml/transforms/embeddings/vertex_ai.py +++ b/sdks/python/apache_beam/ml/transforms/embeddings/vertex_ai.py @@ -119,7 +119,7 @@ def __init__( model_name: The name of the Vertex AI Text Embedding model. columns: The columns containing the text to be embedded. task_type: The downstream task for the embeddings. - Valid values: RETRIEVAL_QUERY, RETRIEVAL_DOCUMENT, + Valid values are RETRIEVAL_QUERY, RETRIEVAL_DOCUMENT, SEMANTIC_SIMILARITY, CLASSIFICATION, CLUSTERING. title: Identifier of the text content. project: The default GCP project for API calls. From 8ea0906be53f0c4d64236cd8525cf43afc800ff5 Mon Sep 17 00:00:00 2001 From: Anand Inguva <34158215+AnandInguva@users.noreply.github.com> Date: Fri, 8 Dec 2023 03:51:22 +0000 Subject: [PATCH 46/52] change tox.ini to use pytest directly --- sdks/python/tox.ini | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/sdks/python/tox.ini b/sdks/python/tox.ini index 88c60bce4b190..eb0bbddd09e8b 100644 --- a/sdks/python/tox.ini +++ b/sdks/python/tox.ini @@ -433,4 +433,4 @@ commands = # Log aiplatform and its dependencies version for debugging /bin/sh -c "pip freeze | grep -E sentence-transformers" # Allow exit code 5 (no tests run) so that we can run this command safely on arbitrary subdirectories. 
- bash {toxinidir}/scripts/run_pytest.sh {envname} 'apache_beam/ml/transforms/embeddings' \ No newline at end of file + /bin/sh -c 'pytest apache_beam/ml/transforms/embeddings -o junit_suite_name={envname} --junitxml=pytest_{envname}.xml {posargs}; ret=$?; [ $ret = 5 ] && exit 0 || exit $ret' From 6f83d3cee96af25f1aa6be451468a2007a15ceb5 Mon Sep 17 00:00:00 2001 From: Anand Inguva <34158215+AnandInguva@users.noreply.github.com> Date: Fri, 8 Dec 2023 20:00:40 +0000 Subject: [PATCH 47/52] raise FileExistsError if attribute file is already present --- sdks/python/apache_beam/ml/transforms/base.py | 17 ++++++++++++++--- .../apache_beam/ml/transforms/base_test.py | 13 +++++++++++++ .../transforms/embeddings/huggingface_test.py | 17 +++++++++-------- .../ml/transforms/embeddings/vertex_ai.py | 7 ++++--- .../ml/transforms/embeddings/vertex_ai_test.py | 11 +++++++---- sdks/python/apache_beam/ml/transforms/tft.py | 4 ---- sdks/python/tox.ini | 5 +++-- 7 files changed, 50 insertions(+), 24 deletions(-) diff --git a/sdks/python/apache_beam/ml/transforms/base.py b/sdks/python/apache_beam/ml/transforms/base.py index c0eadedc85ac5..859fbc68ffd68 100644 --- a/sdks/python/apache_beam/ml/transforms/base.py +++ b/sdks/python/apache_beam/ml/transforms/base.py @@ -96,8 +96,8 @@ class ArtifactMode(object): class MLTransformProvider: """ Data processing transforms that are intended to be used with MLTransform - should subclass MLTransformProvider and implement the following methods: - 1. get_ptransform_for_processing() + should subclass MLTransformProvider and implement + get_ptransform_for_processing(). get_ptransform_for_processing() method should return a PTransform that can be used to process the data. @@ -184,7 +184,7 @@ def __init__( if kwargs: _LOGGER.warning("Ignoring the following arguments: %s", kwargs.keys()) - # TODO: Add set_model_handler method. + # TODO:https://github.com/apache/beam/pull/29564 add set_model_handler method @abc.abstractmethod def get_model_handler(self) -> ModelHandler: """ @@ -398,6 +398,17 @@ def save_attributes( artifact_location, **kwargs, ): + # if the attributes file already exists at the artifact location, raise + # an error instead of overwriting it, since the same artifact location + # can be used by multiple beam jobs and this could result in undesired + # behavior. + if FileSystems.exists(FileSystems.join(artifact_location, + _ATTRIBUTE_FILE_NAME)): + raise FileExistsError( + "The artifact location %s already exists and contains %s. Please " + "specify a different location."
% + (artifact_location, _ATTRIBUTE_FILE_NAME)) + if _JsonPickleTransformAttributeManager._is_remote_path(artifact_location): temp_dir = tempfile.mkdtemp() temp_json_file = os.path.join(temp_dir, _ATTRIBUTE_FILE_NAME) diff --git a/sdks/python/apache_beam/ml/transforms/base_test.py b/sdks/python/apache_beam/ml/transforms/base_test.py index 8e1515e7ece1a..e079594361980 100644 --- a/sdks/python/apache_beam/ml/transforms/base_test.py +++ b/sdks/python/apache_beam/ml/transforms/base_test.py @@ -567,6 +567,19 @@ def test_with_gcs_location_with_none_options(self): self.attribute_manager.save_attributes( ptransform_list=[], artifact_location=path) + def test_with_same_local_artifact_location(self): + artifact_location = self.artifact_location + attribute_manager = base._JsonPickleTransformAttributeManager() + + ptransform_list = [RunInference(model_handler=FakeModelHandler())] + + attribute_manager.save_attributes( + ptransform_list, artifact_location=artifact_location) + + with self.assertRaises(FileExistsError): + attribute_manager.save_attributes([lambda x: x], + artifact_location=artifact_location) + if __name__ == '__main__': unittest.main() diff --git a/sdks/python/apache_beam/ml/transforms/embeddings/huggingface_test.py b/sdks/python/apache_beam/ml/transforms/embeddings/huggingface_test.py index 5e98f77d2deb5..e59090151c5e9 100644 --- a/sdks/python/apache_beam/ml/transforms/embeddings/huggingface_test.py +++ b/sdks/python/apache_beam/ml/transforms/embeddings/huggingface_test.py @@ -14,12 +14,13 @@ # See the License for the specific language governing permissions and # limitations under the License. +import os import shutil import tempfile import unittest +import uuid import numpy as np -import pytest from parameterized import parameterized import apache_beam as beam @@ -80,10 +81,11 @@ @unittest.skipIf( SentenceTransformerEmbeddings is None, 'sentence-transformers is not installed.') -@pytest.mark.no_xdist class SentenceTrasformerEmbeddingsTest(unittest.TestCase): def setUp(self) -> None: - self.artifact_location = tempfile.mkdtemp() + self.artifact_location = tempfile.mkdtemp(prefix='sentence_transformers_') + self.gcs_artifact_location = os.path.join( + 'gs://apache-beam-ml/testing/sentence_transformers', uuid.uuid4().hex) def tearDown(self) -> None: shutil.rmtree(self.artifact_location) @@ -178,7 +180,6 @@ def test_sentence_transformer_with_int_data_types(self): @parameterized.expand(_parameterized_inputs) def test_with_gcs_artifact_location(self, inputs, model_name, output): - artifact_location = ('gs://apache-beam-ml/testing/sentence_transformers') embedding_config = SentenceTransformerEmbeddings( model_name=model_name, columns=[test_query_column]) @@ -187,8 +188,8 @@ def test_with_gcs_artifact_location(self, inputs, model_name, output): p | "CreateData" >> beam.Create(inputs) | "MLTransform" >> - MLTransform(write_artifact_location=artifact_location).with_transform( - embedding_config)) + MLTransform(write_artifact_location=self.gcs_artifact_location + ).with_transform(embedding_config)) max_ele_pcoll = ( result_pcoll | beam.Map(lambda x: round(np.max(x[test_query_column]), 2))) @@ -200,7 +201,7 @@ def test_with_gcs_artifact_location(self, inputs, model_name, output): p | "CreateData" >> beam.Create(inputs) | "MLTransform" >> - MLTransform(read_artifact_location=artifact_location)) + MLTransform(read_artifact_location=self.gcs_artifact_location)) max_ele_pcoll = ( result_pcoll | beam.Map(lambda x: round(np.max(x[test_query_column]), 2))) @@ -233,7 +234,7 @@ def assert_element(element): 
| beam.Map(lambda x: x[test_query_column]) | beam.Map(assert_element)) - def test_mltransform_to_ptransform_with_vertex(self): + def test_mltransform_to_ptransform_with_sentence_transformer(self): model_name = '' transforms = [ SentenceTransformerEmbeddings(columns=['x'], model_name=model_name), diff --git a/sdks/python/apache_beam/ml/transforms/embeddings/vertex_ai.py b/sdks/python/apache_beam/ml/transforms/embeddings/vertex_ai.py index 2dacb6f88b71f..b80498fc7a13d 100644 --- a/sdks/python/apache_beam/ml/transforms/embeddings/vertex_ai.py +++ b/sdks/python/apache_beam/ml/transforms/embeddings/vertex_ai.py @@ -118,9 +118,10 @@ def __init__( Args: model_name: The name of the Vertex AI Text Embedding model. columns: The columns containing the text to be embedded. - task_type: The downstream task for the embeddings. - Valid values are RETRIEVAL_QUERY, RETRIEVAL_DOCUMENT, - SEMANTIC_SIMILARITY, CLASSIFICATION, CLUSTERING. + task_type: The downstream task for the embeddings. Valid values are + RETRIEVAL_QUERY, RETRIEVAL_DOCUMENT, SEMANTIC_SIMILARITY, + CLASSIFICATION, CLUSTERING. For more information on the task type, + look at https://cloud.google.com/vertex-ai/docs/generative-ai/embeddings/get-text-embeddings # pylint: disable=line-too-long title: Identifier of the text content. project: The default GCP project for API calls. location: The default location for API calls. diff --git a/sdks/python/apache_beam/ml/transforms/embeddings/vertex_ai_test.py b/sdks/python/apache_beam/ml/transforms/embeddings/vertex_ai_test.py index 388df7ae30da5..3d8e1ea31673a 100644 --- a/sdks/python/apache_beam/ml/transforms/embeddings/vertex_ai_test.py +++ b/sdks/python/apache_beam/ml/transforms/embeddings/vertex_ai_test.py @@ -14,9 +14,11 @@ # See the License for the specific language governing permissions and # limitations under the License. 
+import os import shutil import tempfile import unittest +import uuid import apache_beam as beam from apache_beam.ml.inference.base import RunInference @@ -44,7 +46,9 @@ VertexAITextEmbeddings is None, 'Vertex AI Python SDK is not installed.') class VertexAIEmbeddingsTest(unittest.TestCase): def setUp(self) -> None: - self.artifact_location = tempfile.mkdtemp() + self.artifact_location = tempfile.mkdtemp(prefix='_vertex_ai_test') + self.gcs_artifact_location = os.path.join( + 'gs://apache-beam-ml/testing/vertex_ai', uuid.uuid4().hex) def tearDown(self) -> None: shutil.rmtree(self.artifact_location) @@ -158,7 +162,6 @@ def test_with_int_data_types(self): embedding_config)) def test_with_gcs_artifact_location(self): - artifact_location = ('gs://apache-beam-ml/testing/vertex_ai') with beam.Pipeline() as p: embedding_config = VertexAITextEmbeddings( model_name=model_name, columns=[test_query_column]) @@ -172,7 +175,7 @@ def test_with_gcs_artifact_location(self): _ = self.pipeline_with_configurable_artifact_location( pipeline=data, embedding_config=embedding_config, - write_artifact_location=artifact_location) + write_artifact_location=self.gcs_artifact_location) with beam.Pipeline() as p: data = ( @@ -183,7 +186,7 @@ def test_with_gcs_artifact_location(self): test_query_column: test_query }])) result_pcoll = self.pipeline_with_configurable_artifact_location( - pipeline=data, read_artifact_location=artifact_location) + pipeline=data, read_artifact_location=self.gcs_artifact_location) def assert_element(element): assert round(element, 2) == 0.15 diff --git a/sdks/python/apache_beam/ml/transforms/tft.py b/sdks/python/apache_beam/ml/transforms/tft.py index 3a103962045f6..8b571d9a685e9 100644 --- a/sdks/python/apache_beam/ml/transforms/tft.py +++ b/sdks/python/apache_beam/ml/transforms/tft.py @@ -105,10 +105,6 @@ def get_ptransform_for_processing(self, **kwargs) -> beam.PTransform: "artifact_location is not specified. Please specify the " "artifact_location for the op %s" % self.__class__.__name__) - transforms = kwargs.get('transforms') - if transforms: - params['transforms'] = transforms - artifact_mode = kwargs.get('artifact_mode') if artifact_mode: params['artifact_mode'] = artifact_mode diff --git a/sdks/python/tox.ini b/sdks/python/tox.ini index eb0bbddd09e8b..dc9e6a28cb9ed 100644 --- a/sdks/python/tox.ini +++ b/sdks/python/tox.ini @@ -425,12 +425,13 @@ commands = /bin/sh -c 'pytest -o junit_suite_name={envname} --junitxml=pytest_{envname}.xml -n 6 -m uses_vertex_ai {posargs}; ret=$?; [ $ret = 5 ] && exit 0 || exit $ret' -[testenv:py{38,39,310,311}-sentence-transformers-222] +[testenv:py{38,39,310,311}-embeddings] deps = sentence-transformers==2.2.2 extras = test,gcp commands = # Log aiplatform and its dependencies version for debugging /bin/sh -c "pip freeze | grep -E sentence-transformers" + /bin/sh -c "pip freeze | grep -E google-cloud-aiplatform" # Allow exit code 5 (no tests run) so that we can run this command safely on arbitrary subdirectories. 
- /bin/sh -c 'pytest apache_beam/ml/transforms/embeddings -o junit_suite_name={envname} --junitxml=pytest_{envname}.xml {posargs}; ret=$?; [ $ret = 5 ] && exit 0 || exit $ret' + /bin/sh -c 'pytest apache_beam/ml/transforms/embeddings -o junit_suite_name={envname} --junitxml=pytest_{envname}.xml -n 6 {posargs}; ret=$?; [ $ret = 5 ] && exit 0 || exit $ret' From 9dce3cf09491f86f4e8bf8b1e336b1c41b651423 Mon Sep 17 00:00:00 2001 From: Anand Inguva <34158215+AnandInguva@users.noreply.github.com> Date: Fri, 8 Dec 2023 20:04:03 +0000 Subject: [PATCH 48/52] modify build.gradle to match tox task names --- sdks/python/test-suites/tox/py38/build.gradle | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/sdks/python/test-suites/tox/py38/build.gradle b/sdks/python/test-suites/tox/py38/build.gradle index a5a624b998e8d..1e03b50580830 100644 --- a/sdks/python/test-suites/tox/py38/build.gradle +++ b/sdks/python/test-suites/tox/py38/build.gradle @@ -141,9 +141,9 @@ toxTask "testPy38transformers-430", "py38-transformers-430", "${posargs}" test.dependsOn "testPy38transformers-430" preCommitPyCoverage.dependsOn "testPy38transformers-430" -toxTask "testPy38sentenceTransformers-222", "py38-sentence-transformers-222", "${posargs}" -test.dependsOn "testPy38sentenceTransformers-222" -preCommitPyCoverage.dependsOn "testPy38sentenceTransformers-222" +toxTask "testPy38embeddingsMLTransform", "py38-embeddings", "${posargs}" +test.dependsOn "testPy38embeddingsMLTransform" +preCommitPyCoverage.dependsOn "testPy38embeddingsMLTransform" toxTask "whitespacelint", "whitespacelint", "${posargs}" From 539c9adf8040320e76d6870e954967b38b634f21 Mon Sep 17 00:00:00 2001 From: Anand Inguva <34158215+AnandInguva@users.noreply.github.com> Date: Fri, 8 Dec 2023 20:04:23 +0000 Subject: [PATCH 49/52] Add note to CHANGES.md --- CHANGES.md | 1 + 1 file changed, 1 insertion(+) diff --git a/CHANGES.md b/CHANGES.md index 7686b7a92d96a..0cc27d3560475 100644 --- a/CHANGES.md +++ b/CHANGES.md @@ -67,6 +67,7 @@ * Python GCSIO is now implemented with GCP GCS Client instead of apitools ([#25676](https://github.com/apache/beam/issues/25676)) * Adding support for LowCardinality DataType in ClickHouse (Java) ([#29533](https://github.com/apache/beam/pull/29533)). 
* Added support for handling bad records to KafkaIO (Java) ([#29546](https://github.com/apache/beam/pull/29546)) +* Add support for generating text embeddings in MLTransform for Vertex AI and Huggingface hub models.([#29564](https://github.com/apache/beam/pull/29564)) ## New Features / Improvements From b967cd8fb9183ee557d0d757ee75cbc7e353a060 Mon Sep 17 00:00:00 2001 From: Anand Inguva <34158215+AnandInguva@users.noreply.github.com> Date: Fri, 8 Dec 2023 20:51:25 +0000 Subject: [PATCH 50/52] change gcs bucket to gs://temp-storage-for-perf-tests --- .../apache_beam/ml/transforms/embeddings/huggingface_test.py | 4 +++- .../apache_beam/ml/transforms/embeddings/vertex_ai_test.py | 2 +- 2 files changed, 4 insertions(+), 2 deletions(-) diff --git a/sdks/python/apache_beam/ml/transforms/embeddings/huggingface_test.py b/sdks/python/apache_beam/ml/transforms/embeddings/huggingface_test.py index e59090151c5e9..779a6daf8f3c1 100644 --- a/sdks/python/apache_beam/ml/transforms/embeddings/huggingface_test.py +++ b/sdks/python/apache_beam/ml/transforms/embeddings/huggingface_test.py @@ -84,8 +84,10 @@ class SentenceTrasformerEmbeddingsTest(unittest.TestCase): def setUp(self) -> None: self.artifact_location = tempfile.mkdtemp(prefix='sentence_transformers_') + # this bucket has TTL and will be deleted periodically self.gcs_artifact_location = os.path.join( - 'gs://apache-beam-ml/testing/sentence_transformers', uuid.uuid4().hex) + 'gs://temp-storage-for-perf-tests/sentence_transformers', + uuid.uuid4().hex) def tearDown(self) -> None: shutil.rmtree(self.artifact_location) diff --git a/sdks/python/apache_beam/ml/transforms/embeddings/vertex_ai_test.py b/sdks/python/apache_beam/ml/transforms/embeddings/vertex_ai_test.py index 3d8e1ea31673a..04a730eaefb0f 100644 --- a/sdks/python/apache_beam/ml/transforms/embeddings/vertex_ai_test.py +++ b/sdks/python/apache_beam/ml/transforms/embeddings/vertex_ai_test.py @@ -48,7 +48,7 @@ class VertexAIEmbeddingsTest(unittest.TestCase): def setUp(self) -> None: self.artifact_location = tempfile.mkdtemp(prefix='_vertex_ai_test') self.gcs_artifact_location = os.path.join( - 'gs://apache-beam-ml/testing/vertex_ai', uuid.uuid4().hex) + 'gs://temp-storage-for-perf-tests/vertex_ai', uuid.uuid4().hex) def tearDown(self) -> None: shutil.rmtree(self.artifact_location) From f1bb42c3376520da4a06006f5c6fc24acda5da90 Mon Sep 17 00:00:00 2001 From: Anand Inguva Date: Mon, 11 Dec 2023 09:55:51 -0500 Subject: [PATCH 51/52] Add TODO GH links --- sdks/python/apache_beam/ml/transforms/base.py | 6 ++++-- .../apache_beam/ml/transforms/embeddings/vertex_ai.py | 3 ++- sdks/python/apache_beam/ml/transforms/utils.py | 3 ++- 3 files changed, 8 insertions(+), 4 deletions(-) diff --git a/sdks/python/apache_beam/ml/transforms/base.py b/sdks/python/apache_beam/ml/transforms/base.py index 859fbc68ffd68..d5f4d1b60e140 100644 --- a/sdks/python/apache_beam/ml/transforms/base.py +++ b/sdks/python/apache_beam/ml/transforms/base.py @@ -162,7 +162,8 @@ def append_transform(self, transform: BaseOperation): """ -# TODO: Add support for inference_fn +# TODO:https://github.com/apache/beam/issues/29356 +# Add support for inference_fn class EmbeddingsManager(MLTransformProvider): def __init__( self, @@ -385,7 +386,8 @@ class _JsonPickleTransformAttributeManager(_TransformAttributeManager): @staticmethod def _is_remote_path(path): is_gcs = path.find('gs://') != -1 - # TODO: Add support for other remote paths. + # TODO:https://github.com/apache/beam/issues/29356 + # Add support for other remote paths. 
if not is_gcs and path.find('://') != -1: raise RuntimeError( "Artifact locations are currently supported for only available for " diff --git a/sdks/python/apache_beam/ml/transforms/embeddings/vertex_ai.py b/sdks/python/apache_beam/ml/transforms/embeddings/vertex_ai.py index b80498fc7a13d..1f4c1577eb797 100644 --- a/sdks/python/apache_beam/ml/transforms/embeddings/vertex_ai.py +++ b/sdks/python/apache_beam/ml/transforms/embeddings/vertex_ai.py @@ -40,7 +40,8 @@ __all__ = ["VertexAITextEmbeddings"] DEFAULT_TASK_TYPE = "RETRIEVAL_DOCUMENT" -# TODO: Can this list be automatically pulled from Vertex SDK? +# TODO: https://github.com/apache/beam/issues/29356 +# Can this list be automatically pulled from Vertex SDK? TASK_TYPE_INPUTS = [ "RETRIEVAL_DOCUMENT", "RETRIEVAL_QUERY", diff --git a/sdks/python/apache_beam/ml/transforms/utils.py b/sdks/python/apache_beam/ml/transforms/utils.py index b0aef5898cf08..fadf611b0e66e 100644 --- a/sdks/python/apache_beam/ml/transforms/utils.py +++ b/sdks/python/apache_beam/ml/transforms/utils.py @@ -32,7 +32,8 @@ class ArtifactsFetcher(): def __init__(self, artifact_location): files = os.listdir(artifact_location) files.remove(base._ATTRIBUTE_FILE_NAME) - # TODO: Integrate ArtifactFetcher into MLTransform. + # TODO: https://github.com/apache/beam/issues/29356 + # Integrate ArtifactFetcher into MLTransform. if len(files) > 1: raise NotImplementedError( "MLTransform may have been utilized alongside transforms written " From c173d6ad13ab24d469ca251bd81e0bf539faff20 Mon Sep 17 00:00:00 2001 From: Anand Inguva <34158215+AnandInguva@users.noreply.github.com> Date: Mon, 11 Dec 2023 10:14:18 -0500 Subject: [PATCH 52/52] Update CHANGES.md Co-authored-by: Danny McCormick --- CHANGES.md | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/CHANGES.md b/CHANGES.md index 0cc27d3560475..60b5a820cf3bd 100644 --- a/CHANGES.md +++ b/CHANGES.md @@ -67,7 +67,7 @@ * Python GCSIO is now implemented with GCP GCS Client instead of apitools ([#25676](https://github.com/apache/beam/issues/25676)) * Adding support for LowCardinality DataType in ClickHouse (Java) ([#29533](https://github.com/apache/beam/pull/29533)). * Added support for handling bad records to KafkaIO (Java) ([#29546](https://github.com/apache/beam/pull/29546)) -* Add support for generating text embeddings in MLTransform for Vertex AI and Huggingface hub models.([#29564](https://github.com/apache/beam/pull/29564)) +* Add support for generating text embeddings in MLTransform for Vertex AI and Hugging Face Hub models. ([#29564](https://github.com/apache/beam/pull/29564)) ## New Features / Improvements
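Taken together, these patches let MLTransform generate text embeddings and guard artifact locations against accidental reuse. Below is a minimal sketch of the resulting usage, mirroring the test pattern in huggingface_test.py. The import paths are inferred from the modules touched in this series; the column name and input sentences are illustrative placeholders, and the model name matches DEFAULT_MODEL_NAME from the tests.

    import tempfile

    import apache_beam as beam
    from apache_beam.ml.transforms.base import MLTransform
    from apache_beam.ml.transforms.embeddings.huggingface import (
        SentenceTransformerEmbeddings)

    # Use a fresh directory: after PATCH 47, save_attributes raises
    # FileExistsError if the location already contains attributes.json.
    artifact_location = tempfile.mkdtemp(prefix='mltransform_')

    embedding_config = SentenceTransformerEmbeddings(
        model_name='sentence-transformers/all-mpnet-base-v2',
        columns=['text'])  # 'text' is a placeholder column name

    # The first pipeline computes embeddings and writes the transform
    # attributes to the artifact location.
    with beam.Pipeline() as p:
        _ = (
            p
            | beam.Create([{'text': 'That is a happy person'}])
            | MLTransform(
                write_artifact_location=artifact_location).with_transform(
                    embedding_config))

    # A later pipeline reuses the saved attributes instead of
    # re-specifying the transforms.
    with beam.Pipeline() as p:
        _ = (
            p
            | beam.Create([{'text': 'That is a very happy person'}])
            | MLTransform(read_artifact_location=artifact_location))

This is the same write-then-read flow exercised by test_embeddings_with_read_artifact_location; re-running the write step against an already-populated location is exactly the misuse the new FileExistsError check in base.py is meant to catch.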