From 80e5c4af7aa233cb891c6a30f1a48fcb3963e6c2 Mon Sep 17 00:00:00 2001 From: Anand Inguva Date: Wed, 29 Nov 2023 10:45:54 -0500 Subject: [PATCH 01/52] Make base.py framework agnostic and add helper transforms --- sdks/python/apache_beam/ml/transforms/base.py | 437 ++++++++++++++++-- 1 file changed, 394 insertions(+), 43 deletions(-) diff --git a/sdks/python/apache_beam/ml/transforms/base.py b/sdks/python/apache_beam/ml/transforms/base.py index b3a30bb5f1256..b8a9beafb9862 100644 --- a/sdks/python/apache_beam/ml/transforms/base.py +++ b/sdks/python/apache_beam/ml/transforms/base.py @@ -14,18 +14,34 @@ # See the License for the specific language governing permissions and # limitations under the License. -# pytype: skip-file - import abc +import collections +import logging +import os +import tempfile +import uuid +from typing import Any from typing import Dict from typing import Generic from typing import List +from typing import Mapping from typing import Optional from typing import Sequence from typing import TypeVar +from typing import Union + +import jsonpickle +import numpy as np import apache_beam as beam +from apache_beam.io.filesystems import FileSystems from apache_beam.metrics.metric import Metrics +from apache_beam.ml.inference.base import ModelHandler +from apache_beam.ml.inference.base import ModelT +from apache_beam.options.pipeline_options import PipelineOptions + +_LOGGER = logging.getLogger(__name__) +_ATTRIBUTE_FILE_NAME = 'attributes.json' __all__ = ['MLTransform', 'ProcessHandler', 'BaseOperation'] @@ -42,12 +58,68 @@ OperationOutputT = TypeVar('OperationOutputT') +def _convert_list_of_dicts_to_dict_of_lists( + list_of_dicts: Sequence[Dict[str, Any]]) -> Dict[str, List[Any]]: + keys_to_element_list = collections.defaultdict(list) + for d in list_of_dicts: + for key, value in d.items(): + keys_to_element_list[key].append(value) + return keys_to_element_list + + +def _convert_dict_of_lists_to_lists_of_dict( + dict_of_lists: Dict[str, List[Any]], + batch_length: int) -> List[Dict[str, Any]]: + result: List[Dict[str, Any]] = [{} for _ in range(batch_length)] + for key, values in dict_of_lists.items(): + for i in range(len(values)): + result[i][key] = values[i] + return result + + class ArtifactMode(object): PRODUCE = 'produce' CONSUME = 'consume' -class BaseOperation(Generic[OperationInputT, OperationOutputT], abc.ABC): +class PTransformProvider: + """ + Data processing transforms that are intended to be used with MLTransform + should subclass PTransformProvider and implement the following methods: + 1. get_ptransform_for_processing() + 2. requires_chaining() + + get_ptransform_for_processing() method should return a PTransform that can be + used to process the data. + + requires_chaining() method should return True if the data processing + transforms needs to be chained sequentially with compatible data processing + transforms. + """ + @abc.abstractmethod + def get_ptransform_for_processing(self, **kwargs) -> beam.PTransform: + """ + Returns a PTransform that can be used to process the data. + """ + + @abc.abstractmethod + def requires_chaining(self): + """ + Returns True if the data processing transforms needs to be chained + sequentially with compatible data processing transforms. + """ + + def get_counter(self): + """ + Returns the counter name for the data processing transform. 
+ """ + counter_name = self.__class__.__name__ + return Metrics.counter(MLTransform, f'BeamML_{counter_name}') + + +class BaseOperation(Generic[OperationInputT, OperationOutputT], + PTransformProvider, + abc.ABC): def __init__(self, columns: List[str]) -> None: """ Base Opertation class data processing transformations. @@ -76,33 +148,55 @@ def __call__(self, data: OperationInputT, transformed_data = self.apply_transform(data, output_column_name) return transformed_data - def get_counter(self): - """ - Returns the counter name for the operation. - """ - counter_name = self.__class__.__name__ - return Metrics.counter(MLTransform, f'BeamML_{counter_name}') - -class ProcessHandler(Generic[ExampleT, MLTransformOutputT], abc.ABC): +class ProcessHandler(beam.PTransform[beam.PCollection[ExampleT], + beam.PCollection[MLTransformOutputT]], + abc.ABC): """ Only for internal use. No backwards compatibility guarantees. """ @abc.abstractmethod - def process_data( - self, pcoll: beam.PCollection[ExampleT] - ) -> beam.PCollection[MLTransformOutputT]: + def append_transform(self, transform: BaseOperation): """ - Logic to process the data. This will be the entrypoint in - beam.MLTransform to process incoming data. + Append transforms to the ProcessHandler. """ + +# TODO: Add support for inference_fn +class EmbeddingsManager(PTransformProvider): + def __init__( + self, + columns: List[str], + *, + # common args for all ModelHandlers. + load_model_args: Optional[Dict[str, Any]] = None, + min_batch_size: Optional[int] = None, + max_batch_size: Optional[int] = None, + large_model: bool = False, + **kwargs): + self.load_model_args = load_model_args or {} + self.min_batch_size = min_batch_size + self.max_batch_size = max_batch_size + self.large_model = large_model + self.columns = columns + + if kwargs: + _LOGGER.warning("Ignoring the following arguments: %s", kwargs.keys()) + + # TODO: Add set_model_handler method. @abc.abstractmethod - def append_transform(self, transform: BaseOperation): + def get_model_handler(self) -> ModelHandler: """ - Append transforms to the ProcessHandler. + Return framework specific model handler. """ + def requires_chaining(self): + # each embedding config requires a separate PTransform. so no chaining. + return False + + def get_columns_to_apply(self): + return self.columns + class MLTransform(beam.PTransform[beam.PCollection[ExampleT], beam.PCollection[MLTransformOutputT]], @@ -112,7 +206,8 @@ def __init__( *, write_artifact_location: Optional[str] = None, read_artifact_location: Optional[str] = None, - transforms: Optional[Sequence[BaseOperation]] = None): + transforms: Optional[List[Union[BaseOperation, + EmbeddingsManager]]] = None): """ MLTransform is a Beam PTransform that can be used to apply transformations to the data. MLTransform is used to wrap the @@ -157,9 +252,6 @@ def __init__( i-th transform is the output of the (i-1)-th transform. Multi-input transforms are not supported yet. 
""" - if transforms: - _ = [self._validate_transform(transform) for transform in transforms] - if read_artifact_location and write_artifact_location: raise ValueError( 'Only one of read_artifact_location or write_artifact_location can ' @@ -177,19 +269,10 @@ def __init__( artifact_location = write_artifact_location # type: ignore[assignment] artifact_mode = ArtifactMode.PRODUCE - # avoid circular import - # pylint: disable=wrong-import-order, wrong-import-position - from apache_beam.ml.transforms.handlers import TFTProcessHandler - # TODO: When new ProcessHandlers(eg: JaxProcessHandler) are introduced, - # create a mapping between transforms and ProcessHandler since - # ProcessHandler is not exposed to the user. - process_handler: ProcessHandler = TFTProcessHandler( - artifact_location=artifact_location, - artifact_mode=artifact_mode, - transforms=transforms) # type: ignore[arg-type] - - self._process_handler = process_handler - self.transforms = transforms + self._parent_artifact_location = artifact_location + + self._artifact_mode = artifact_mode + self.transforms = transforms or [] self._counter = Metrics.counter( MLTransform, f'BeamML_{self.__class__.__name__}') @@ -209,10 +292,33 @@ def expand( Returns: A PCollection of MLTransformOutputT type """ + _ = [self._validate_transform(transform) for transform in self.transforms] + if self._artifact_mode == ArtifactMode.PRODUCE: + ptransform_partitioner = _MLTransformToPTransformMapper( + transforms=self.transforms, + artifact_location=self._parent_artifact_location, + artifact_mode=self._artifact_mode, + pipeline_options=pcoll.pipeline.options) + ptransform_list = ptransform_partitioner.create_and_save_ptransform_list() + else: + ptransform_list = ( + _MLTransformToPTransformMapper.load_transforms_from_artifact_location( + self._parent_artifact_location)) + + # the saved transforms has artifact mode set to PRODUCE. + # set the artifact mode to CONSUME. + if self._artifact_mode == ArtifactMode.CONSUME: + for i in range(len(ptransform_list)): + if hasattr(ptransform_list[i], 'artifact_mode'): + ptransform_list[i].artifact_mode = self._artifact_mode + + for ptransform in ptransform_list: + pcoll = pcoll | ptransform + _ = ( pcoll.pipeline | "MLTransformMetricsUsage" >> MLTransformMetricsUsage(self)) - return self._process_handler.process_data(pcoll) + return pcoll # type: ignore[return-value] def with_transform(self, transform: BaseOperation): """ @@ -222,14 +328,21 @@ def with_transform(self, transform: BaseOperation): Returns: A MLTransform instance. """ - self._validate_transform(transform) - self._process_handler.append_transform(transform) + # self._validate_transform(transform) + # avoid circular import + # pylint: disable=wrong-import-order, wrong-import-position + self.transforms.append(transform) return self def _validate_transform(self, transform): - if not isinstance(transform, BaseOperation): + # every data processing transform should subclass PTransformProvider. Raise + # an error if the transform does not subclass PTransformProvider since the + # downstream code expects the transform to be a subclass of + # PTransformProvider. + if not isinstance(transform, PTransformProvider): raise TypeError( - 'transform must be a subclass of BaseOperation. ' + 'transform must be a subclass of PTransformProvider and implement ' + 'get_ptransform_for_processing() method.' 'Got: %s instead.' % type(transform)) @@ -243,9 +356,7 @@ def _increment_counters(): # increment for MLTransform. 
      self._ml_transform._counter.inc()
      # increment if data processing transforms are passed.
-      transforms = (
-          self._ml_transform.transforms or
-          self._ml_transform._process_handler.transforms)
+      transforms = self._ml_transform.transforms
      if transforms:
        for transform in transforms:
          transform.get_counter().inc()

    return (
        pipeline
        | beam.Create([None])
        | beam.Map(lambda _: _increment_counters()))
+
+
+class _TransformAttributeManager:
+  """
+  Base class used for saving and loading the attributes.
+  """
+  @staticmethod
+  def save_attributes(artifact_location):
+    """
+    Save the attributes to a JSON file.
+    """
+    raise NotImplementedError
+
+  @staticmethod
+  def load_attributes(artifact_location):
+    """
+    Load the attributes from a JSON file.
+    """
+    raise NotImplementedError
+
+
+class _JsonPickleTransformAttributeManager(_TransformAttributeManager):
+  """
+  Uses jsonpickle to save and load the attributes. Here the attributes refer
+  to the list of PTransforms that are used to process the data.
+
+  jsonpickle serializes the PTransforms to a JSON file in a representation
+  that is compatible across Python versions.
+  """
+  @staticmethod
+  def _is_remote_path(path):
+    is_gcs = path.find('gs://') != -1
+    # TODO: Add support for other remote paths.
+    if not is_gcs and path.find('://') != -1:
+      raise RuntimeError(
+          "Artifact locations are currently supported only for "
+          "local paths and GCS paths. Got: %s" % path)
+    return is_gcs
+
+  @staticmethod
+  def save_attributes(
+      ptransform_list,
+      artifact_location,
+      **kwargs,
+  ):
+    if _JsonPickleTransformAttributeManager._is_remote_path(artifact_location):
+      # kwargs.get() returns None for a missing key and never raises
+      # KeyError, so check the value explicitly.
+      options = kwargs.get('options')
+      if options is None:
+        raise RuntimeError(
+            'Pipeline options are required to save the attributes '
+            'in the artifact location %s' % artifact_location)
+
+      temp_dir = tempfile.mkdtemp()
+      temp_json_file = os.path.join(temp_dir, _ATTRIBUTE_FILE_NAME)
+      with open(temp_json_file, 'w+') as f:
+        f.write(jsonpickle.encode(ptransform_list))
+      with open(temp_json_file, 'rb') as f:
+        from apache_beam.runners.dataflow.internal import apiclient
+        _LOGGER.info('Creating artifact location: %s', artifact_location)
+        apiclient.DataflowApplicationClient(options=options).stage_file(
+            gcs_or_local_path=artifact_location,
+            file_name=_ATTRIBUTE_FILE_NAME,
+            stream=f,
+            mime_type='application/json')
+    else:
+      if not FileSystems.exists(artifact_location):
+        FileSystems.mkdirs(artifact_location)
+      # FileSystems.open() fails if the file does not exist.
+      with open(os.path.join(artifact_location, _ATTRIBUTE_FILE_NAME),
+                'w+') as f:
+        f.write(jsonpickle.encode(ptransform_list))
+
+  @staticmethod
+  def load_attributes(artifact_location):
+    with FileSystems.open(os.path.join(artifact_location,
+                                       _ATTRIBUTE_FILE_NAME),
+                          'rb') as f:
+      return jsonpickle.decode(f.read())
+
+
+_transform_attribute_manager = _JsonPickleTransformAttributeManager
+
+
+class _MLTransformToPTransformMapper:
+  """
+  Takes in a list of data processing transforms compatible with MLTransform
+  and returns a list of PTransforms that are used to run the data
+  processing transforms.
+
+  The _MLTransformToPTransformMapper is responsible for loading and saving
+  the PTransforms, or the attributes of PTransforms, to the artifact
+  location, bridging the gap between the training and inference pipelines.
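+
+  In PRODUCE mode the PTransform list is serialized with jsonpickle and
+  written to the artifact location; in CONSUME mode the same list is loaded
+  back, so an inference pipeline reuses the exact configuration that the
+  training pipeline saved.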
+ """ + def __init__( + self, + transforms: List[Union[BaseOperation, EmbeddingsManager]], + artifact_location: str, + artifact_mode: str, + pipeline_options: Optional[PipelineOptions] = None, + ): + self.transforms = transforms + self._parent_artifact_location = artifact_location + self.artifact_mode = artifact_mode + self.pipeline_options = pipeline_options + + def create_and_save_ptransform_list(self): + ptransform_list = self.create_ptransform_list() + self.save_transforms_in_artifact_location(ptransform_list) + return ptransform_list + + def create_ptransform_list(self): + previous_ptransform_type = None + current_ptransform = None + ptransform_list = [] + for transform in self.transforms: + if not isinstance(transform, PTransformProvider): + raise RuntimeError( + 'Transforms must be instances of PTransformProvider and ' + 'implement get_ptransform_for_processing() method.') + # for each instance of PTransform, create a new artifact location + current_ptransform = transform.get_ptransform_for_processing( + artifact_location=os.path.join( + self._parent_artifact_location, uuid.uuid4().hex[:6]), + artifact_mode=self.artifact_mode) + # Determine if a new ptransform should be added to the list + is_different_type = (type(current_ptransform) != previous_ptransform_type) + if is_different_type or not transform.requires_chaining(): + ptransform_list.append(current_ptransform) + previous_ptransform_type = type(current_ptransform) + + if hasattr(ptransform_list[-1], 'append_transform'): + ptransform_list[-1].append_transform(transform) + + return ptransform_list + + def save_transforms_in_artifact_location(self, ptransform_list): + """ + Save the ptransform references to json file. + """ + _transform_attribute_manager.save_attributes( + ptransform_list=ptransform_list, + artifact_location=self._parent_artifact_location, + options=self.pipeline_options) + + @staticmethod + def load_transforms_from_artifact_location(artifact_location): + return _transform_attribute_manager.load_attributes(artifact_location) + + +class _TextEmbeddingHandler(ModelHandler): + """ + A ModelHandler intended to be work on list[dict[str, str]] inputs. + + The inputs to the model handler are expected to be a list of dicts. + + For example, if the original mode is used with RunInference to take a + PCollection[E] to a PCollection[P], this ModelHandler would take a + PCollection[Dict[str, E]] to a PCollection[Dict[str, P]]. + + _TextEmbeddingHandler will accept an EmbeddingsManager instance, which + contains the details of the model to be loaded and the inference_fn to be + used. The purpose of _TextEmbeddingHandler is to generate embeddings for + text inputs using the EmbeddingsManager instance. + + If the input is not a text column, a RuntimeError will be raised. + + This is an internal class and offers no backwards compatibility guarantees. + + Args: + embeddings_manager: An EmbeddingsManager instance. 
+ """ + def __init__(self, embeddings_manager: EmbeddingsManager): + self.embedding_config = embeddings_manager + self._underlying = self.embedding_config.get_model_handler() + self.columns = self.embedding_config.get_columns_to_apply() + + def load_model(self): + model = self._underlying.load_model() + return model + + def _validate_column_data(self, batch): + if not isinstance(batch[0], (str, bytes)): + raise TypeError('Embeddings can only be generated on text columns.') + + def _validate_batch(self, batch: Sequence[Dict[str, List[str]]]): + if not batch or not isinstance(batch[0], dict): + raise TypeError( + 'Expected data to be dicts, got ' + f'{type(batch[0])} instead.') + + def _process_batch( + self, + dict_batch: Dict[str, List[Any]], + model: ModelT, + inference_args: Optional[Dict[str, Any]]) -> Dict[str, List[Any]]: + result: Dict[str, List[Any]] = collections.defaultdict(list) + for key, batch in dict_batch.items(): + if key in self.columns: + self._validate_column_data(batch) + prediction = self._underlying.run_inference( + batch, model, inference_args) + if isinstance(prediction, np.ndarray): + prediction = prediction.tolist() + result[key] = prediction # type: ignore[assignment] + else: + result[key] = prediction # type: ignore[assignment] + else: + result[key] = batch + return result + + def run_inference( + self, + batch: Sequence[Dict[str, List[str]]], + model: ModelT, + inference_args: Optional[Dict[str, Any]] = None, + ) -> List[Dict[str, Union[List[float], List[str]]]]: + """ + Runs inference on a batch of text inputs. The inputs are expected to be + a list of dicts. Each dict should have the same keys, and the shape + should be of the same size for a single key across the batch. + """ + self._validate_batch(batch) + batch_len = len(batch) + dict_batch = _convert_list_of_dicts_to_dict_of_lists(list_of_dicts=batch) + transformed_batch = self._process_batch(dict_batch, model, inference_args) + return _convert_dict_of_lists_to_lists_of_dict( + dict_of_lists=transformed_batch, batch_length=batch_len) + + def get_metrics_namespace(self) -> str: + return ( + self._underlying.get_metrics_namespace() or + 'BeamML_TextEmbeddingHandler') + + def batch_elements_kwargs(self) -> Mapping[str, Any]: + batch_sizes_map = {} + if self.embedding_config.max_batch_size: + batch_sizes_map['max_batch_size'] = self.embedding_config.max_batch_size + if self.embedding_config.min_batch_size: + batch_sizes_map['min_batch_size'] = self.embedding_config.min_batch_size + return (self._underlying.batch_elements_kwargs() or batch_sizes_map) From 0d34847252c6457a305a704729a4173bdd110c22 Mon Sep 17 00:00:00 2001 From: Anand Inguva Date: Wed, 29 Nov 2023 10:46:20 -0500 Subject: [PATCH 02/52] Add tests for base.py --- .../apache_beam/ml/transforms/base_test.py | 185 ++++++++++++++++-- 1 file changed, 170 insertions(+), 15 deletions(-) diff --git a/sdks/python/apache_beam/ml/transforms/base_test.py b/sdks/python/apache_beam/ml/transforms/base_test.py index 2e447964541ba..1f9e5a85d1c2a 100644 --- a/sdks/python/apache_beam/ml/transforms/base_test.py +++ b/sdks/python/apache_beam/ml/transforms/base_test.py @@ -20,7 +20,11 @@ import tempfile import typing import unittest +from typing import Any +from typing import Dict from typing import List +from typing import Optional +from typing import Sequence import numpy as np from parameterized import param @@ -28,28 +32,30 @@ import apache_beam as beam from apache_beam.metrics.metric import MetricsFilter +from apache_beam.ml.inference.base import ModelHandler +from 
apache_beam.ml.inference.base import RunInference +from apache_beam.ml.transforms import base from apache_beam.testing.util import assert_that from apache_beam.testing.util import equal_to # pylint: disable=wrong-import-order, wrong-import-position, ungrouped-imports try: - from apache_beam.ml.transforms import base from apache_beam.ml.transforms import tft from apache_beam.ml.transforms.tft import TFTOperation except ImportError: tft = None # type: ignore -if tft is None: - raise unittest.SkipTest('tensorflow_transform is not installed') - +try: -class _FakeOperation(TFTOperation): - def __init__(self, name, *args, **kwargs): - super().__init__(*args, **kwargs) - self.name = name + class _FakeOperation(TFTOperation): + def __init__(self, name, *args, **kwargs): + super().__init__(*args, **kwargs) + self.name = name - def apply_transform(self, inputs, output_column_name, **kwargs): - return {output_column_name: inputs} + def apply_transform(self, inputs, output_column_name, **kwargs): + return {output_column_name: inputs} +except: # pylint: disable=bare-except + pass class BaseMLTransformTest(unittest.TestCase): @@ -59,6 +65,7 @@ def setUp(self) -> None: def tearDown(self): shutil.rmtree(self.artifact_location) + @unittest.skipIf(tft is None, 'tft module is not installed.') def test_ml_transform_appends_transforms_to_process_handler_correctly(self): fake_fn_1 = _FakeOperation(name='fake_fn_1', columns=['x']) transforms = [fake_fn_1] @@ -67,12 +74,11 @@ def test_ml_transform_appends_transforms_to_process_handler_correctly(self): ml_transform = ml_transform.with_transform( transform=_FakeOperation(name='fake_fn_2', columns=['x'])) - self.assertEqual(len(ml_transform._process_handler.transforms), 2) - self.assertEqual( - ml_transform._process_handler.transforms[0].name, 'fake_fn_1') - self.assertEqual( - ml_transform._process_handler.transforms[1].name, 'fake_fn_2') + self.assertEqual(len(ml_transform.transforms), 2) + self.assertEqual(ml_transform.transforms[0].name, 'fake_fn_1') + self.assertEqual(ml_transform.transforms[1].name, 'fake_fn_2') + @unittest.skipIf(tft is None, 'tft module is not installed.') def test_ml_transform_on_dict(self): transforms = [tft.ScaleTo01(columns=['x'])] data = [{'x': 1}, {'x': 2}] @@ -91,6 +97,7 @@ def test_ml_transform_on_dict(self): assert_that( actual_output, equal_to(expected_output, equals_fn=np.array_equal)) + @unittest.skipIf(tft is None, 'tft module is not installed.') def test_ml_transform_on_list_dict(self): transforms = [tft.ScaleTo01(columns=['x'])] data = [{'x': [1, 2, 3]}, {'x': [4, 5, 6]}] @@ -162,6 +169,7 @@ def test_ml_transform_on_list_dict(self): }, ), ]) + @unittest.skipIf(tft is None, 'tft module is not installed.') def test_ml_transform_dict_output_pcoll_schema( self, input_data, input_types, expected_dtype): transforms = [tft.ScaleTo01(columns=['x'])] @@ -178,6 +186,7 @@ def test_ml_transform_dict_output_pcoll_schema( if name in expected_dtype: self.assertEqual(expected_dtype[name], typ) + @unittest.skipIf(tft is None, 'tft module is not installed.') def test_ml_transform_fail_for_non_global_windows_in_produce_mode(self): transforms = [tft.ScaleTo01(columns=['x'])] with beam.Pipeline() as p: @@ -193,6 +202,7 @@ def test_ml_transform_fail_for_non_global_windows_in_produce_mode(self): write_artifact_location=self.artifact_location, )) + @unittest.skipIf(tft is None, 'tft module is not installed.') def test_ml_transform_on_multiple_columns_single_transform(self): transforms = [tft.ScaleTo01(columns=['x', 'y'])] data = [{'x': [1, 2, 3], 
'y': [1.0, 10.0, 20.0]}] @@ -217,6 +227,7 @@ def test_ml_transform_on_multiple_columns_single_transform(self): equal_to(expected_output_y, equals_fn=np.array_equal), label='y') + @unittest.skipIf(tft is None, 'tft module is not installed.') def test_ml_transforms_on_multiple_columns_multiple_transforms(self): transforms = [ tft.ScaleTo01(columns=['x']), @@ -245,6 +256,7 @@ def test_ml_transforms_on_multiple_columns_multiple_transforms(self): equal_to(expected_output_y, equals_fn=np.array_equal), label='actual_output_y') + @unittest.skipIf(tft is None, 'tft module is not installed.') def test_mltransform_with_counter(self): transforms = [ tft.ComputeAndApplyVocabulary(columns=['y']), @@ -269,6 +281,149 @@ def test_mltransform_with_counter(self): self.assertEqual( result.metrics().query(mltransform_counter)['counters'][0].result, 1) + def test_non_ptransfrom_provider_class_to_mltransform(self): + class Add: + def __call__(self, x): + return x + 1 + + with self.assertRaisesRegex( + TypeError, 'transform must be a subclass of PTransformProvider'): + with beam.Pipeline() as p: + _ = ( + p + | beam.Create([{ + 'x': 1 + }]) + | base.MLTransform( + write_artifact_location=self.artifact_location).with_transform( + Add())) + + +class FakeModel: + def __call__(self, example: List[str]) -> List[str]: + for i in range(len(example)): + example[i] = example[i][::-1] + return example + + +class FakeModelHandler(ModelHandler): + def run_inference( + self, + batch: Sequence[str], + model: Any, + inference_args: Optional[Dict[str, Any]] = None): + return model(batch) + + def load_model(self): + return FakeModel() + + +class FakeEmbeddingsManager(base.EmbeddingsManager): + def __init__(self, columns): + super().__init__(columns=columns) + + def get_model_handler(self) -> ModelHandler: + return FakeModelHandler() + + def get_ptransform_for_processing(self, **kwargs) -> beam.PTransform: + return (RunInference(model_handler=base._TextEmbeddingHandler(self))) + + +class TextEmbeddingHandlerTest(unittest.TestCase): + def setUp(self) -> None: + self.embedding_conig = FakeEmbeddingsManager(columns=['x']) + self.artifact_location = tempfile.mkdtemp() + + def tearDown(self) -> None: + shutil.rmtree(self.artifact_location) + + def test_handler_with_incompatible_datatype(self): + text_handler = base._TextEmbeddingHandler( + embeddings_manager=self.embedding_conig) + data = [ + ('x', 1), + ('x', 2), + ('x', 3), + ] + with self.assertRaises(TypeError): + text_handler.run_inference(data, None, None) + + def test_handler_with_dict_inputs(self): + data = [ + { + 'x': "Hello world" + }, + { + 'x': "Apache Beam" + }, + ] + expected_data = [{key: value[::-1] + for key, value in d.items()} for d in data] + with beam.Pipeline() as p: + result = ( + p + | beam.Create(data) + | base.MLTransform( + write_artifact_location=self.artifact_location).with_transform( + self.embedding_conig)) + assert_that( + result, + equal_to(expected_data), + ) + + def test_handler_with_batch_sizes(self): + self.embedding_conig.max_batch_size = 100 + self.embedding_conig.min_batch_size = 10 + data = [ + { + 'x': "Hello world" + }, + { + 'x': "Apache Beam" + }, + ] * 100 + expected_data = [{key: value[::-1] + for key, value in d.items()} for d in data] + with beam.Pipeline() as p: + result = ( + p + | beam.Create(data) + | base.MLTransform( + write_artifact_location=self.artifact_location).with_transform( + self.embedding_conig)) + assert_that( + result, + equal_to(expected_data), + ) + + def test_handler_on_multiple_columns(self): + 
self.embedding_conig.columns = ['x', 'y']
+    data = [
+        {
+            'x': "Hello world", 'y': "Apache Beam", 'z': 'unchanged'
+        },
+        {
+            'x': "Apache Beam", 'y': "Hello world", 'z': 'unchanged'
+        },
+    ]
+    expected_data = [{
+        key: (value[::-1] if key in self.embedding_conig.columns else value)
+        for key,
+        value in d.items()
+    } for d in data]
+    with beam.Pipeline() as p:
+      result = (
+          p
+          | beam.Create(data)
+          | base.MLTransform(
+              write_artifact_location=self.artifact_location).with_transform(
+                  self.embedding_conig))
+      assert_that(
+          result,
+          equal_to(expected_data),
+      )
+

 if __name__ == '__main__':
   unittest.main()

From 58b24f6ea63384dd9492a8d83112775eddd6c8d3 Mon Sep 17 00:00:00 2001
From: Anand Inguva
Date: Wed, 29 Nov 2023 10:47:39 -0500
Subject: [PATCH 03/52] Add sentence-transformers

---
 .../ml/transforms/embeddings/__init__.py      |  21 ++
 .../embeddings/sentence_transformer.py        | 128 +++++++++++
 .../embeddings/sentence_transformer_test.py   | 212 ++++++++++++++++++
 3 files changed, 361 insertions(+)
 create mode 100644 sdks/python/apache_beam/ml/transforms/embeddings/__init__.py
 create mode 100644 sdks/python/apache_beam/ml/transforms/embeddings/sentence_transformer.py
 create mode 100644 sdks/python/apache_beam/ml/transforms/embeddings/sentence_transformer_test.py

diff --git a/sdks/python/apache_beam/ml/transforms/embeddings/__init__.py b/sdks/python/apache_beam/ml/transforms/embeddings/__init__.py
new file mode 100644
index 0000000000000..bda6256b79ef4
--- /dev/null
+++ b/sdks/python/apache_beam/ml/transforms/embeddings/__init__.py
@@ -0,0 +1,21 @@
+#
+# Licensed to the Apache Software Foundation (ASF) under one or more
+# contributor license agreements. See the NOTICE file distributed with
+# this work for additional information regarding copyright ownership.
+# The ASF licenses this file to You under the Apache License, Version 2.0
+# (the "License"); you may not use this file except in compliance with
+# the License. You may obtain a copy of the License at
+#
+#    http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+# TODO: Add dead letter queue for RunInference transforms.
+
+"""
+This module contains embedding configs that can be used to generate
+embeddings using MLTransform.
+"""
diff --git a/sdks/python/apache_beam/ml/transforms/embeddings/sentence_transformer.py b/sdks/python/apache_beam/ml/transforms/embeddings/sentence_transformer.py
new file mode 100644
index 0000000000000..5b31dbca00820
--- /dev/null
+++ b/sdks/python/apache_beam/ml/transforms/embeddings/sentence_transformer.py
@@ -0,0 +1,128 @@
+#
+# Licensed to the Apache Software Foundation (ASF) under one or more
+# contributor license agreements. See the NOTICE file distributed with
+# this work for additional information regarding copyright ownership.
+# The ASF licenses this file to You under the Apache License, Version 2.0
+# (the "License"); you may not use this file except in compliance with
+# the License. 
You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +__all__ = ["SentenceTransformerEmbeddings"] + +from typing import Any +from typing import Callable +from typing import Dict +from typing import List +from typing import Mapping +from typing import Optional +from typing import Sequence + +import apache_beam as beam +from apache_beam.ml.inference.base import ModelHandler +from apache_beam.ml.inference.base import RunInference +from apache_beam.ml.transforms.base import EmbeddingsManager +from apache_beam.ml.transforms.base import _TextEmbeddingHandler +from sentence_transformers import SentenceTransformer + + +# TODO: Use HuggingFaceModelHandlerTensor once the import issue is fixed. +# Right now, the hugging face model handler import torch and tensorflow +# at the same time, which adds too much weigth to the container unnecessarily. +class _SentenceTransformerModelHandler(ModelHandler): + """ + Note: Intended for internal use and guarantees no backwards compatibility. + """ + def __init__( + self, + model_name: str, + model_class: Callable, + load_model_args: Optional[dict] = None, + min_batch_size: Optional[int] = None, + max_batch_size: Optional[int] = None, + max_seq_length: Optional[int] = None, + large_model: bool = False, + **kwargs): + self._max_seq_length = max_seq_length + self._model_uri = model_name + self._model_class = model_class + self._load_model_args = load_model_args + self._min_batch_size = min_batch_size + self._max_batch_size = max_batch_size + self._large_model = large_model + self._kwargs = kwargs + + def run_inference( + self, + batch: Sequence[str], + model: SentenceTransformer, + inference_args: Optional[Dict[str, Any]] = None, + ): + inference_args = inference_args or {} + return model.encode(batch, **inference_args) + + def load_model(self): + model = self._model_class(self._model_uri) + if self._max_seq_length: + model.max_seq_length = self._max_seq_length + return model + + def share_model_across_processes(self) -> bool: + return self._large_model + + def batch_elements_kwargs(self) -> Mapping[str, Any]: + batch_sizes = {} + if self._min_batch_size: + batch_sizes["min_batch_size"] = self._min_batch_size + if self._max_batch_size: + batch_sizes["max_batch_size"] = self._max_batch_size + return batch_sizes + + +class SentenceTransformerEmbeddings(EmbeddingsManager): + def __init__( + self, + model_name: str, + columns: List[str], + max_seq_length: Optional[int] = None, + **kwargs): + """ + Embedding config for sentence-transformers. This config can be used with + MLTransform to embed text data. Models are loaded using the RunInference + PTransform with the help of ModelHandler. + Args: + model_name: Name of the model to use. The model should be hosted on + HuggingFace Hub or compatible with sentence_transformers. + columns: List of columns to be embedded. + max_seq_length: Max sequence length to use for the model if applicable. + min_batch_size: The minimum batch size to be used for inference. + max_batch_size: The maximum batch size to be used for inference. + large_model: Whether to share the model across processes. 
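+
+    A minimal configuration sketch (the model name mirrors the default used
+    in the tests; the column name is a placeholder):
+
+      SentenceTransformerEmbeddings(
+          model_name='sentence-transformers/all-mpnet-base-v2',
+          columns=['text'])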
+ """ + super().__init__(columns, **kwargs) + self.model_name = model_name + self.max_seq_length = max_seq_length + + def get_model_handler(self): + return _SentenceTransformerModelHandler( + model_class=SentenceTransformer, + max_seq_length=self.max_seq_length, + model_name=self.model_name, + load_model_args=self.load_model_args, + min_batch_size=self.min_batch_size, + max_batch_size=self.max_batch_size, + large_model=self.large_model) + + def get_ptransform_for_processing(self, **kwargs) -> beam.PTransform: + # wrap the model handler in a _TextEmbeddingHandler since + # the SentenceTransformerEmbeddings works on text input data. + return (RunInference(model_handler=_TextEmbeddingHandler(self))) + + def requires_chaining(self): + return False diff --git a/sdks/python/apache_beam/ml/transforms/embeddings/sentence_transformer_test.py b/sdks/python/apache_beam/ml/transforms/embeddings/sentence_transformer_test.py new file mode 100644 index 0000000000000..63f401180dc2d --- /dev/null +++ b/sdks/python/apache_beam/ml/transforms/embeddings/sentence_transformer_test.py @@ -0,0 +1,212 @@ +# +# Licensed to the Apache Software Foundation (ASF) under one or more +# contributor license agreements. See the NOTICE file distributed with +# this work for additional information regarding copyright ownership. +# The ASF licenses this file to You under the Apache License, Version 2.0 +# (the "License"); you may not use this file except in compliance with +# the License. You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +import shutil +import tempfile +import unittest + +import apache_beam as beam +from apache_beam.ml.transforms.base import MLTransform + +# pylint: disable=ungrouped-imports +try: + from apache_beam.ml.transforms.embeddings.sentence_transformer import SentenceTransformerEmbeddings +except ImportError: + SentenceTransformerEmbeddings = None # type: ignore + +# pylint: disable=ungrouped-imports +try: + import tensorflow_transform as tft + from apache_beam.ml.transforms.tft import ScaleTo01 +except ImportError: + tft = None + +test_query = "This is a test" +test_query_column = "feature_1" +DEFAULT_MODEL_NAME = "sentence-transformers/all-mpnet-base-v2" + + +def get_pipeline_wth_embedding_config( + pipeline: beam.Pipeline, embedding_config, artifact_location): + transformed_pcoll = ( + pipeline + | "CreateData" >> beam.Create([{ + test_query_column: test_query + }]) + | "MLTransform" >> MLTransform(write_artifact_location=artifact_location). 
+ with_transform(embedding_config)) + return transformed_pcoll + + +@unittest.skipIf( + SentenceTransformerEmbeddings is None, + 'sentence-transformers is not installed.') +class SentenceTrasformerEmbeddingsTest(unittest.TestCase): + def setUp(self) -> None: + self.artifact_location = tempfile.mkdtemp() + + def tearDown(self) -> None: + shutil.rmtree(self.artifact_location) + + def test_sentence_transformer_embeddings(self): + model_name = DEFAULT_MODEL_NAME + embedding_config = SentenceTransformerEmbeddings( + model_name=model_name, columns=[test_query_column]) + with beam.Pipeline() as pipeline: + result_pcoll = get_pipeline_wth_embedding_config( + pipeline=pipeline, + embedding_config=embedding_config, + artifact_location=self.artifact_location) + + def assert_element(element): + assert len(element[test_query_column]) == 768 + + _ = (result_pcoll | beam.Map(assert_element)) + + @unittest.skipIf(tft is None, 'Tensorflow Transform is not installed.') + def test_embeddings_with_scale_to_0_1(self): + model_name = DEFAULT_MODEL_NAME + embedding_config = SentenceTransformerEmbeddings( + model_name=model_name, + columns=[test_query_column], + ) + with beam.Pipeline() as pipeline: + transformed_pcoll = ( + pipeline + | "CreateData" >> beam.Create([{ + test_query_column: test_query + }]) + | "MLTransform" >> MLTransform( + write_artifact_location=self.artifact_location).with_transform( + embedding_config).with_transform( + ScaleTo01(columns=[test_query_column]))) + + def assert_element(element): + assert max(element.feature_1) == 1 + + _ = (transformed_pcoll | beam.Map(assert_element)) + + def pipeline_with_configurable_artifact_location( + self, + pipeline, + embedding_config=None, + read_artifact_location=None, + write_artifact_location=None): + if write_artifact_location: + return ( + pipeline + | MLTransform(write_artifact_location=write_artifact_location). 
+ with_transform(embedding_config)) + elif read_artifact_location: + return ( + pipeline + | MLTransform(read_artifact_location=read_artifact_location)) + else: + raise NotImplementedError + + def test_embeddings_with_read_artifact_location(self): + with beam.Pipeline() as p: + model_name = DEFAULT_MODEL_NAME + embedding_config = SentenceTransformerEmbeddings( + model_name=model_name, columns=[test_query_column]) + + with beam.Pipeline() as p: + data = ( + p + | "CreateData" >> beam.Create([{ + test_query_column: test_query + }])) + _ = self.pipeline_with_configurable_artifact_location( + pipeline=data, + embedding_config=embedding_config, + write_artifact_location=self.artifact_location) + + with beam.Pipeline() as p: + data = ( + p + | "CreateData" >> beam.Create([{ + test_query_column: test_query + }, { + test_query_column: test_query + }])) + result_pcoll = self.pipeline_with_configurable_artifact_location( + pipeline=data, read_artifact_location=self.artifact_location) + + def assert_element(element): + assert round(element, 2) == 0.13 + + _ = ( + result_pcoll + | beam.Map(lambda x: max(x[test_query_column])) + # 0.1342099905014038 + | beam.Map(assert_element)) + + def test_sentence_transformer_with_int_data_types(self): + model_name = DEFAULT_MODEL_NAME + embedding_config = SentenceTransformerEmbeddings( + model_name=model_name, columns=[test_query_column]) + with self.assertRaises(TypeError): + with beam.Pipeline() as pipeline: + _ = ( + pipeline + | "CreateData" >> beam.Create([{ + test_query_column: 1 + }]) + | "MLTransform" >> MLTransform( + write_artifact_location=self.artifact_location).with_transform( + embedding_config)) + + def test_with_gcs_artifact_location(self): + artifact_location = ('gs://apache-beam-ml/testing/sentence_transformers') + with beam.Pipeline() as p: + model_name = DEFAULT_MODEL_NAME + embedding_config = SentenceTransformerEmbeddings( + model_name=model_name, columns=[test_query_column]) + + with beam.Pipeline() as p: + data = ( + p + | "CreateData" >> beam.Create([{ + test_query_column: test_query + }])) + _ = self.pipeline_with_configurable_artifact_location( + pipeline=data, + embedding_config=embedding_config, + write_artifact_location=artifact_location) + + with beam.Pipeline() as p: + data = ( + p + | "CreateData" >> beam.Create([{ + test_query_column: test_query + }, { + test_query_column: test_query + }])) + result_pcoll = self.pipeline_with_configurable_artifact_location( + pipeline=data, read_artifact_location=artifact_location) + + def assert_element(element): + assert round(element, 2) == 0.13 + + _ = ( + result_pcoll + | beam.Map(lambda x: max(x[test_query_column])) + # 0.1342099905014038 + | beam.Map(assert_element)) + + +if __name__ == '__main__': + unittest.main() From 88f9ceb61bced33c68e3f2790e37d7ff15b6508c Mon Sep 17 00:00:00 2001 From: Anand Inguva Date: Wed, 29 Nov 2023 10:47:53 -0500 Subject: [PATCH 04/52] Add tensorflow hub --- .../transforms/embeddings/tensorflow_hub.py | 124 +++++++++++ .../embeddings/tensorflow_hub_test.py | 198 ++++++++++++++++++ 2 files changed, 322 insertions(+) create mode 100644 sdks/python/apache_beam/ml/transforms/embeddings/tensorflow_hub.py create mode 100644 sdks/python/apache_beam/ml/transforms/embeddings/tensorflow_hub_test.py diff --git a/sdks/python/apache_beam/ml/transforms/embeddings/tensorflow_hub.py b/sdks/python/apache_beam/ml/transforms/embeddings/tensorflow_hub.py new file mode 100644 index 0000000000000..62bd00e10359a --- /dev/null +++ 
b/sdks/python/apache_beam/ml/transforms/embeddings/tensorflow_hub.py @@ -0,0 +1,124 @@ +# +# Licensed to the Apache Software Foundation (ASF) under one or more +# contributor license agreements. See the NOTICE file distributed with +# this work for additional information regarding copyright ownership. +# The ASF licenses this file to You under the Apache License, Version 2.0 +# (the "License"); you may not use this file except in compliance with +# the License. You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +from typing import Iterable +from typing import List +from typing import Optional + +import apache_beam as beam +import tensorflow as tf +import tensorflow_hub as hub +import tensorflow_text as text # required to register TF ops. # pylint: disable=unused-import +from apache_beam.ml.inference import utils +from apache_beam.ml.inference.base import ModelHandler +from apache_beam.ml.inference.base import PredictionResult +from apache_beam.ml.inference.base import RunInference +from apache_beam.ml.inference.tensorflow_inference import TFModelHandlerTensor +from apache_beam.ml.inference.tensorflow_inference import default_tensor_inference_fn +from apache_beam.ml.transforms.base import EmbeddingsManager +from apache_beam.ml.transforms.base import _TextEmbeddingHandler + +__all__ = ['TensorflowHubTextEmbeddings'] + + +class _TensorflowHubModelHandler(TFModelHandlerTensor): + """ + Note: Intended for internal use only. No backwards compatibility guarantees. + """ + def __init__(self, preprocessing_url: Optional[str], *args, **kwargs): + self.preprocessing_url = preprocessing_url + super().__init__(*args, **kwargs) + + def load_model(self): + # unable to load the models with tf.keras.models.load_model so + # using hub.KerasLayer instead + model = hub.KerasLayer(self._model_uri) + return model + + def _convert_prediction_result_to_list( + self, predictions: Iterable[PredictionResult]): + result = [] + for prediction in predictions: + inference = prediction.inference.numpy().tolist() + result.append(inference) + return result + + def run_inference(self, batch, model, inference_args, model_id=None): + if not inference_args: + inference_args = {} + if not self.preprocessing_url: + predictions = default_tensor_inference_fn( + model=model, + batch=batch, + inference_args=inference_args, + model_id=model_id) + return self._convert_prediction_result_to_list(predictions) + + vectorized_batch = tf.stack(batch, axis=0) + preprocessor_fn = hub.KerasLayer(self.preprocessing_url) + vectorized_batch = preprocessor_fn(vectorized_batch) + predictions = model(vectorized_batch) + # https://www.tensorflow.org/text/tutorials/classify_text_with_bert#using_the_bert_model # pylint: disable=line-too-long + # pooled_output -> represents the text as a whole. This is an embeddings + # of the whole text. The shape is [batch_size, embedding_dimension] + # sequence_output -> represents the text as a sequence of tokens. This is + # an embeddings of each token in the text. The shape is + # [batch_size, max_sequence_length, embedding_dimension] + # pooled output is the embeedings as per the documentation. so let's use + # that. 
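+    # embeddings below has shape [batch_size, embedding_dimension];
+    # _convert_to_result pairs each input string with its vector so the
+    # final flattening step keeps inputs and embeddings aligned.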
+ embeddings = predictions['pooled_output'] + predictions = utils._convert_to_result(batch, embeddings, model_id) + return self._convert_prediction_result_to_list(predictions) + + +class TensorflowHubTextEmbeddings(EmbeddingsManager): + def __init__( + self, + columns: List[str], + hub_url: str, + preprocessing_url: Optional[str] = None, + **kwargs): + super().__init__(columns=columns, **kwargs) + self.model_uri = hub_url + self.preprocessing_url = preprocessing_url + """ + Embedding config for tensorflow hub models. This config can be used with + MLTransform to embed text data. Models are loaded using the RunInference + PTransform with the help of a ModelHandler. + + Args: + columns: The columns containing the text to be embedded. + hub_url: The url of the tensorflow hub model. + preprocessing_url: The url of the preprocessing model. This is optional. + If provided, the preprocessing model will be used to preprocess the + text before feeding it to the main model. + min_batch_size: The minimum batch size to be used for inference. + max_batch_size: The maximum batch size to be used for inference. + large_model: Whether to share the model across processes. + """ + + def get_model_handler(self) -> ModelHandler: + # override the default inference function + return _TensorflowHubModelHandler( + model_uri=self.model_uri, + preprocessing_url=self.preprocessing_url, + min_batch_size=self.min_batch_size, + max_batch_size=self.max_batch_size, + large_model=self.large_model, + ) + + def get_ptransform_for_processing(self, **kwargs) -> beam.PTransform: + return (RunInference(model_handler=_TextEmbeddingHandler(self))) diff --git a/sdks/python/apache_beam/ml/transforms/embeddings/tensorflow_hub_test.py b/sdks/python/apache_beam/ml/transforms/embeddings/tensorflow_hub_test.py new file mode 100644 index 0000000000000..6b918153945ae --- /dev/null +++ b/sdks/python/apache_beam/ml/transforms/embeddings/tensorflow_hub_test.py @@ -0,0 +1,198 @@ +# +# Licensed to the Apache Software Foundation (ASF) under one or more +# contributor license agreements. See the NOTICE file distributed with +# this work for additional information regarding copyright ownership. +# The ASF licenses this file to You under the Apache License, Version 2.0 +# (the "License"); you may not use this file except in compliance with +# the License. You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+ +import shutil +import tempfile +import unittest + +import apache_beam as beam +from apache_beam.ml.transforms.base import MLTransform + +hub_url = 'https://tfhub.dev/google/LEALLA/LEALLA-small/1' +test_query_column = 'test_query' +test_query = 'This is a test query' + +# pylint: disable=ungrouped-imports +try: + import tensorflow as tf # disable=unused-import + from apache_beam.ml.transforms.embeddings.tensorflow_hub import TensorflowHubTextEmbeddings +except ImportError: + tf = None + +try: + from apache_beam.ml.transforms.tft import ScaleTo01 +except ImportError: + ScaleTo01 = None # type: ignore + + +@unittest.skipIf(tf is None, 'Tensorflow is not installed.') +class TFHubEmbeddingsTest(unittest.TestCase): + def setUp(self) -> None: + self.artifact_location = tempfile.mkdtemp() + + def tearDown(self) -> None: + shutil.rmtree(self.artifact_location) + + def test_tfhub_text_embeddings(self): + embedding_config = TensorflowHubTextEmbeddings( + hub_url=hub_url, columns=[test_query_column]) + with beam.Pipeline() as pipeline: + transformed_pcoll = ( + pipeline + | "CreateData" >> beam.Create([{ + test_query_column: test_query + }]) + | "MLTransform" >> MLTransform( + write_artifact_location=self.artifact_location).with_transform( + embedding_config)) + + def assert_element(element): + assert len(element[test_query_column]) == 128 + + _ = (transformed_pcoll | beam.Map(assert_element)) + + @unittest.skipIf(ScaleTo01 is None, 'Tensorflow Transform is not installed.') + def test_embeddings_with_scale_to_0_1(self): + embedding_config = TensorflowHubTextEmbeddings( + hub_url=hub_url, + columns=[test_query_column], + ) + with beam.Pipeline() as pipeline: + transformed_pcoll = ( + pipeline + | "CreateData" >> beam.Create([{ + test_query_column: test_query + }]) + | "MLTransform" >> MLTransform( + write_artifact_location=self.artifact_location).with_transform( + embedding_config).with_transform( + ScaleTo01(columns=[test_query_column]))) + + def assert_element(element): + assert max(element[test_query_column]) == 1 + + _ = ( + transformed_pcoll | beam.Map(lambda x: x.as_dict()) + | beam.Map(assert_element)) + + def pipeline_with_configurable_artifact_location( + self, + pipeline, + embedding_config=None, + read_artifact_location=None, + write_artifact_location=None): + if write_artifact_location: + return ( + pipeline + | MLTransform(write_artifact_location=write_artifact_location). 
+ with_transform(embedding_config)) + elif read_artifact_location: + return ( + pipeline + | MLTransform(read_artifact_location=read_artifact_location)) + else: + raise NotImplementedError + + def test_embeddings_with_read_artifact_location(self): + with beam.Pipeline() as p: + embedding_config = TensorflowHubTextEmbeddings( + hub_url=hub_url, columns=[test_query_column]) + + with beam.Pipeline() as p: + data = ( + p + | "CreateData" >> beam.Create([{ + test_query_column: test_query + }])) + _ = self.pipeline_with_configurable_artifact_location( + pipeline=data, + embedding_config=embedding_config, + write_artifact_location=self.artifact_location) + + with beam.Pipeline() as p: + data = ( + p + | "CreateData" >> beam.Create([{ + test_query_column: test_query + }, { + test_query_column: test_query + }])) + result_pcoll = self.pipeline_with_configurable_artifact_location( + pipeline=data, read_artifact_location=self.artifact_location) + + def assert_element(element): + assert round(element, 2) == 0.21 + + _ = ( + result_pcoll + | beam.Map(lambda x: max(x[test_query_column])) + # 0.14797046780586243 + | beam.Map(assert_element)) + + def test_with_int_data_types(self): + embedding_config = TensorflowHubTextEmbeddings( + hub_url=hub_url, columns=[test_query_column]) + with self.assertRaises(TypeError): + with beam.Pipeline() as pipeline: + _ = ( + pipeline + | "CreateData" >> beam.Create([{ + test_query_column: 1 + }]) + | "MLTransform" >> MLTransform( + write_artifact_location=self.artifact_location).with_transform( + embedding_config)) + + def test_with_gcs_artifact_location(self): + artifact_location = 'gs://apache-beam-ml/testing/tensorflow_hub' + with beam.Pipeline() as p: + embedding_config = TensorflowHubTextEmbeddings( + hub_url=hub_url, columns=[test_query_column]) + + with beam.Pipeline() as p: + data = ( + p + | "CreateData" >> beam.Create([{ + test_query_column: test_query + }])) + _ = self.pipeline_with_configurable_artifact_location( + pipeline=data, + embedding_config=embedding_config, + write_artifact_location=artifact_location) + + with beam.Pipeline() as p: + data = ( + p + | "CreateData" >> beam.Create([{ + test_query_column: test_query + }, { + test_query_column: test_query + }])) + result_pcoll = self.pipeline_with_configurable_artifact_location( + pipeline=data, read_artifact_location=artifact_location) + + def assert_element(element): + assert round(element, 2) == 0.21 + + _ = ( + result_pcoll + | beam.Map(lambda x: max(x[test_query_column])) + # 0.14797046780586243 + | beam.Map(assert_element)) + + +if __name__ == '__main__': + unittest.main() From 23f70278e2bf5fd0f34cf5f496184906522b59f3 Mon Sep 17 00:00:00 2001 From: Anand Inguva Date: Wed, 29 Nov 2023 10:48:22 -0500 Subject: [PATCH 05/52] Add vertex_ai --- .../ml/transforms/embeddings/vertex_ai.py | 160 ++++++++++++++ .../transforms/embeddings/vertex_ai_test.py | 197 ++++++++++++++++++ 2 files changed, 357 insertions(+) create mode 100644 sdks/python/apache_beam/ml/transforms/embeddings/vertex_ai.py create mode 100644 sdks/python/apache_beam/ml/transforms/embeddings/vertex_ai_test.py diff --git a/sdks/python/apache_beam/ml/transforms/embeddings/vertex_ai.py b/sdks/python/apache_beam/ml/transforms/embeddings/vertex_ai.py new file mode 100644 index 0000000000000..31f5240093441 --- /dev/null +++ b/sdks/python/apache_beam/ml/transforms/embeddings/vertex_ai.py @@ -0,0 +1,160 @@ +# +# Licensed to the Apache Software Foundation (ASF) under one or more +# contributor license agreements. 
See the NOTICE file distributed with +# this work for additional information regarding copyright ownership. +# The ASF licenses this file to You under the Apache License, Version 2.0 +# (the "License"); you may not use this file except in compliance with +# the License. You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + + +# Vertex AI Python SDK is required for this module. +# Follow https://cloud.google.com/vertex-ai/docs/python-sdk/use-vertex-ai-python-sdk # pylint: disable=line-too-long +# to install Vertex AI Python SDK. + +from typing import Any +from typing import Dict +from typing import Iterable +from typing import List +from typing import Optional +from typing import Sequence + +from google.auth.credentials import Credentials + +import apache_beam as beam +import vertexai +from apache_beam.ml.inference.base import ModelHandler +from apache_beam.ml.inference.base import RunInference +from apache_beam.ml.transforms.base import EmbeddingsManager +from apache_beam.ml.transforms.base import _TextEmbeddingHandler +from vertexai.language_models import TextEmbeddingInput +from vertexai.language_models import TextEmbeddingModel + +__all__ = ["VertexAITextEmbeddings"] + +TASK_TYPE = "RETRIEVAL_DOCUMENT" +TASK_TYPE_INPUTS = [ + "RETRIEVAL_DOCUMENT", + "RETRIEVAL_QUERY", + "SEMANTIC_SIMILARITY", + "CLASSIFICATION", + "CLUSTERING" +] + + +class _VertexAITextEmbeddingHandler(ModelHandler): + """ + Note: Intended for internal use and guarantees no backwards compatibility. + """ + def __init__( + self, + model_name: str, + title: Optional[str] = None, + task_type: str = TASK_TYPE, + project: Optional[str] = None, + location: Optional[str] = None, + credentials: Optional[Credentials] = None, + ): + vertexai.init(project=project, location=location, credentials=credentials) + self.model_name = model_name + if task_type not in TASK_TYPE_INPUTS: + raise ValueError( + f"task_type must be one of {TASK_TYPE_INPUTS}, got {task_type}") + self.task_type = task_type + self.title = title + + def run_inference( + self, + batch: Sequence[str], + model: Any, + inference_args: Optional[Dict[str, Any]] = None, + ) -> Iterable: + embeddings = [] + batch_size = 5 # Vertex AI limits requests to 5 at a time. 
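+    # Process the batch in slices of at most batch_size texts, issuing one
+    # model call per slice, so callers can hand in batches larger than the
+    # service limit.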
+ for i in range(0, len(batch), batch_size): + text_batch = batch[i:i + batch_size] + text_batch = [ + TextEmbeddingInput( + text=text, title=self.title, task_type=self.task_type) + for text in text_batch + ] + embeddings_batch = model.get_embeddings(text_batch) + embeddings.extend([el.values for el in embeddings_batch]) + return embeddings + + def load_model(self): + model = TextEmbeddingModel.from_pretrained(self.model_name) + return model + + +class VertexAITextEmbeddings(EmbeddingsManager): + def __init__( + self, + model_name: str, + columns: List[str], + title: Optional[str] = None, + task_type: str = TASK_TYPE, + project: Optional[str] = None, + location: Optional[str] = None, + credentials: Optional[Credentials] = None, + **kwargs, + ): + """ + Embedding Config for Vertex AI Text Embedding models following + https://cloud.google.com/vertex-ai/docs/generative-ai/embeddings/get-text-embeddings # pylint: disable=line-too-long + + Text Embeddings are generated for a batch of text using the Vertex AI SDK. + Embeddings are returned in a list for each text in the batch. Look at + https://cloud.google.com/vertex-ai/docs/generative-ai/learn/model-versioning#stable-versions-available.md # pylint: disable=line-too-long + for more information on model versions and lifecycle. + + Args: + model_name: The name of the Vertex AI Text Embedding model. + columns: The columns containing the text to be embedded. + task_type: The name of the downstream task the embeddings will be used for. + Valid values: + RETRIEVAL_QUERY + Specifies the given text is a query in a search/retrieval setting. + RETRIEVAL_DOCUMENT + Specifies the given text is a document from the corpus being searched. + SEMANTIC_SIMILARITY + Specifies the given text will be used for STS. + CLASSIFICATION + Specifies that the given text will be classified. + CLUSTERING + Specifies that the embeddings will be used for clustering. + title: Optional identifier of the text content. + project: The default GCP project to make Vertex API calls. + location: The default location to use when making API calls. + credentials: The default custom + credentials to use when making API calls. If not provided credentials + will be ascertained from the environment. + + """ + self.model_name = model_name + self.project = project + self.location = location + self.credentials = credentials + self.title = title + self.task_type = task_type + super().__init__(columns=columns, **kwargs) + + def get_model_handler(self) -> ModelHandler: + return _VertexAITextEmbeddingHandler( + model_name=self.model_name, + project=self.project, + location=self.location, + credentials=self.credentials, + title=self.title, + task_type=self.task_type, + ) + + def get_ptransform_for_processing(self, **kwargs) -> beam.PTransform: + return (RunInference(model_handler=_TextEmbeddingHandler(self))) diff --git a/sdks/python/apache_beam/ml/transforms/embeddings/vertex_ai_test.py b/sdks/python/apache_beam/ml/transforms/embeddings/vertex_ai_test.py new file mode 100644 index 0000000000000..7124aab9cbf23 --- /dev/null +++ b/sdks/python/apache_beam/ml/transforms/embeddings/vertex_ai_test.py @@ -0,0 +1,197 @@ +# +# Licensed to the Apache Software Foundation (ASF) under one or more +# contributor license agreements. See the NOTICE file distributed with +# this work for additional information regarding copyright ownership. +# The ASF licenses this file to You under the Apache License, Version 2.0 +# (the "License"); you may not use this file except in compliance with +# the License. 
You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +import shutil +import tempfile +import unittest + +import apache_beam as beam +from apache_beam.ml.transforms.base import MLTransform + +try: + from apache_beam.ml.transforms.embeddings.vertex_ai import VertexAITextEmbeddings +except ImportError: + VertexAITextEmbeddings = None # type: ignore + +# pylint: disable=ungrouped-imports +try: + import tensorflow_transform as tft + from apache_beam.ml.transforms.tft import ScaleTo01 +except ImportError: + tft = None + +test_query = "This is a test" +test_query_column = "feature_1" +model_name: str = "textembedding-gecko@002" + + +@unittest.skipIf( + VertexAITextEmbeddings is None, 'Vertex AI Python SDK is not installed.') +class VertexAIEmbeddingsTest(unittest.TestCase): + def setUp(self) -> None: + self.artifact_location = tempfile.mkdtemp() + + def tearDown(self) -> None: + shutil.rmtree(self.artifact_location) + + def test_vertex_ai_text_embeddings(self): + embedding_config = VertexAITextEmbeddings( + model_name=model_name, columns=[test_query_column]) + with beam.Pipeline() as pipeline: + transformed_pcoll = ( + pipeline + | "CreateData" >> beam.Create([{ + test_query_column: test_query + }]) + | "MLTransform" >> MLTransform( + write_artifact_location=self.artifact_location).with_transform( + embedding_config)) + + def assert_element(element): + assert len(element[test_query_column]) == 768 + + _ = (transformed_pcoll | beam.Map(assert_element)) + + @unittest.skipIf(tft is None, 'Tensorflow Transform is not installed.') + def test_embeddings_with_scale_to_0_1(self): + embedding_config = VertexAITextEmbeddings( + model_name=model_name, + columns=[test_query_column], + ) + with beam.Pipeline() as pipeline: + transformed_pcoll = ( + pipeline + | "CreateData" >> beam.Create([{ + test_query_column: test_query + }]) + | "MLTransform" >> MLTransform( + write_artifact_location=self.artifact_location).with_transform( + embedding_config).with_transform( + ScaleTo01(columns=[test_query_column]))) + + def assert_element(element): + assert max(element.feature_1) == 1 + + _ = (transformed_pcoll | beam.Map(assert_element)) + + def pipeline_with_configurable_artifact_location( + self, + pipeline, + embedding_config=None, + read_artifact_location=None, + write_artifact_location=None): + if write_artifact_location: + return ( + pipeline + | MLTransform(write_artifact_location=write_artifact_location). 
+ with_transform(embedding_config)) + elif read_artifact_location: + return ( + pipeline + | MLTransform(read_artifact_location=read_artifact_location)) + else: + raise NotImplementedError + + def test_embeddings_with_read_artifact_location(self): + with beam.Pipeline() as p: + embedding_config = VertexAITextEmbeddings( + model_name=model_name, columns=[test_query_column]) + + with beam.Pipeline() as p: + data = ( + p + | "CreateData" >> beam.Create([{ + test_query_column: test_query + }])) + _ = self.pipeline_with_configurable_artifact_location( + pipeline=data, + embedding_config=embedding_config, + write_artifact_location=self.artifact_location) + + with beam.Pipeline() as p: + data = ( + p + | "CreateData" >> beam.Create([{ + test_query_column: test_query + }, { + test_query_column: test_query + }])) + result_pcoll = self.pipeline_with_configurable_artifact_location( + pipeline=data, read_artifact_location=self.artifact_location) + + def assert_element(element): + assert round(element, 2) == 0.15 + + _ = ( + result_pcoll + | beam.Map(lambda x: max(x[test_query_column])) + # 0.14797046780586243 + | beam.Map(assert_element)) + + def test_with_int_data_types(self): + embedding_config = VertexAITextEmbeddings( + model_name=model_name, columns=[test_query_column]) + with self.assertRaises(TypeError): + with beam.Pipeline() as pipeline: + _ = ( + pipeline + | "CreateData" >> beam.Create([{ + test_query_column: 1 + }]) + | "MLTransform" >> MLTransform( + write_artifact_location=self.artifact_location).with_transform( + embedding_config)) + + def test_with_gcs_artifact_location(self): + artifact_location = ('gs://apache-beam-ml/testing/vertex_ai') + with beam.Pipeline() as p: + embedding_config = VertexAITextEmbeddings( + model_name=model_name, columns=[test_query_column]) + + with beam.Pipeline() as p: + data = ( + p + | "CreateData" >> beam.Create([{ + test_query_column: test_query + }])) + _ = self.pipeline_with_configurable_artifact_location( + pipeline=data, + embedding_config=embedding_config, + write_artifact_location=artifact_location) + + with beam.Pipeline() as p: + data = ( + p + | "CreateData" >> beam.Create([{ + test_query_column: test_query + }, { + test_query_column: test_query + }])) + result_pcoll = self.pipeline_with_configurable_artifact_location( + pipeline=data, read_artifact_location=artifact_location) + + def assert_element(element): + assert round(element, 2) == 0.15 + + _ = ( + result_pcoll + | beam.Map(lambda x: max(x[test_query_column])) + # 0.14797046780586243 + | beam.Map(assert_element)) + + +if __name__ == '__main__': + unittest.main() From 04ebdb0a3079e8ba5b5d69af9dbb010f38998493 Mon Sep 17 00:00:00 2001 From: Anand Inguva Date: Wed, 29 Nov 2023 10:48:49 -0500 Subject: [PATCH 06/52] Make TFTProcessHandler a PTransform --- .../apache_beam/ml/transforms/handlers.py | 11 +++++++--- .../ml/transforms/handlers_test.py | 10 ++++----- sdks/python/apache_beam/ml/transforms/tft.py | 22 +++++++++++++++++++ 3 files changed, 35 insertions(+), 8 deletions(-) diff --git a/sdks/python/apache_beam/ml/transforms/handlers.py b/sdks/python/apache_beam/ml/transforms/handlers.py index 8695d5146efae..1a673c51df261 100644 --- a/sdks/python/apache_beam/ml/transforms/handlers.py +++ b/sdks/python/apache_beam/ml/transforms/handlers.py @@ -15,6 +15,7 @@ # limitations under the License. 
# # pytype: skip-file +# pylint: skip-file import collections import hashlib @@ -217,6 +218,9 @@ def __init__( def append_transform(self, transform): self.transforms.append(transform) + def get_transforms(self): + return self.transforms + def _map_column_names_to_types(self, row_type): """ Return a dictionary of column names and types. @@ -319,6 +323,7 @@ def _get_raw_data_feature_spec_per_column( f"Please provide a valid type from the following: " f"{_default_type_to_tensor_type_map.keys()}") return tf.io.VarLenFeature(_default_type_to_tensor_type_map[dtype]) + # return tf.io.VarLenFeature() def get_raw_data_metadata( self, input_types: Dict[str, type]) -> dataset_metadata.DatasetMetadata: @@ -387,7 +392,7 @@ def _get_transformed_data_schema( transformed_types[name] = typing.Sequence[bytes] # type: ignore[assignment] return transformed_types - def process_data( + def expand( self, raw_data: beam.PCollection[tft_process_handler_input_type] ) -> beam.PCollection[tft_process_handler_output_type]: """ @@ -512,7 +517,7 @@ def process_data( # The schema only contains the columns that are transformed. transformed_dataset = ( - transformed_dataset | "ConvertToRowType" >> + transformed_dataset + | "ConvertToRowType" >> beam.Map(lambda x: beam.Row(**x)).with_output_types(row_type)) - return transformed_dataset diff --git a/sdks/python/apache_beam/ml/transforms/handlers_test.py b/sdks/python/apache_beam/ml/transforms/handlers_test.py index 327c8c76c0e9f..d39a1d775f3f2 100644 --- a/sdks/python/apache_beam/ml/transforms/handlers_test.py +++ b/sdks/python/apache_beam/ml/transforms/handlers_test.py @@ -298,7 +298,7 @@ def test_tft_process_handler_verify_artifacts(self): transforms=[tft.ScaleTo01(columns=['x'])], artifact_location=self.artifact_location, ) - _ = process_handler.process_data(raw_data) + _ = raw_data | process_handler self.assertTrue( os.path.exists( @@ -315,7 +315,7 @@ def test_tft_process_handler_verify_artifacts(self): raw_data = (p | beam.Create([{'x': np.array([2, 5])}])) process_handler = handlers.TFTProcessHandler( artifact_location=self.artifact_location, artifact_mode='consume') - transformed_data = process_handler.process_data(raw_data) + transformed_data = raw_data | process_handler transformed_data |= beam.Map(lambda x: x.x) # the previous min is 1 and max is 6. 
So this should scale by (1, 6) @@ -494,7 +494,7 @@ def test_tft_process_handler_unused_column(self): transforms=[scale_to_0_1_fn], artifact_location=self.artifact_location, ) - transformed_pcoll = process_handler.process_data(raw_data) + transformed_pcoll = raw_data | process_handler transformed_pcoll_x = transformed_pcoll | beam.Map(lambda x: x.x) transformed_pcoll_y = transformed_pcoll | beam.Map(lambda x: x.y) assert_that( @@ -520,7 +520,7 @@ def test_consume_mode_with_extra_columns_in_the_input(self): transforms=[tft.ScaleTo01(columns=['x'])], artifact_location=self.artifact_location, ) - _ = process_handler.process_data(raw_data) + _ = raw_data | process_handler test_data = [{ 'x': np.array([2, 5]), 'y': np.array([1, 2]), 'z': 'fake_string' @@ -548,7 +548,7 @@ def test_consume_mode_with_extra_columns_in_the_input(self): raw_data = (p | beam.Create(test_data)) process_handler = handlers.TFTProcessHandler( artifact_location=self.artifact_location, artifact_mode='consume') - transformed_data = process_handler.process_data(raw_data) + transformed_data = raw_data | process_handler transformed_data_x = transformed_data | beam.Map(lambda x: x.x) transformed_data_y = transformed_data | beam.Map(lambda x: x.y) diff --git a/sdks/python/apache_beam/ml/transforms/tft.py b/sdks/python/apache_beam/ml/transforms/tft.py index c7b8ff0153247..8705b79aa309a 100644 --- a/sdks/python/apache_beam/ml/transforms/tft.py +++ b/sdks/python/apache_beam/ml/transforms/tft.py @@ -42,6 +42,7 @@ from typing import Tuple from typing import Union +import apache_beam as beam import tensorflow as tf import tensorflow_transform as tft from apache_beam.ml.transforms.base import BaseOperation @@ -95,6 +96,27 @@ def __init__(self, columns: List[str]) -> None: "Columns are not specified. Please specify the column for the " " op %s" % self.__class__.__name__) + def get_ptransform_for_processing(self, **kwargs) -> beam.PTransform: + from apache_beam.ml.transforms.handlers import TFTProcessHandler + params = {} + artifact_location = kwargs.get('artifact_location') + if not artifact_location: + raise RuntimeError( + "artifact_location is not specified. 
Please specify the " + "artifact_location for the op %s" % self.__class__.__name__) + + transforms = kwargs.get('transforms') + if transforms: + params['transforms'] = transforms + + artifact_mode = kwargs.get('artifact_mode') + if artifact_mode: + params['artifact_mode'] = artifact_mode + return TFTProcessHandler(artifact_location=artifact_location, **params) + + def requires_chaining(self): + return True + @tf.function def _split_string_with_delimiter(self, data, delimiter): """ From f86c259d51e312a91776f4e40c8172645e02a9ff Mon Sep 17 00:00:00 2001 From: Anand Inguva Date: Wed, 29 Nov 2023 10:50:18 -0500 Subject: [PATCH 07/52] raise RuntimeError in ArtifactsFetcher when it is used for embeddings --- sdks/python/apache_beam/ml/transforms/tft_test.py | 7 ++++++- sdks/python/apache_beam/ml/transforms/utils.py | 11 +++++++++-- 2 files changed, 15 insertions(+), 3 deletions(-) diff --git a/sdks/python/apache_beam/ml/transforms/tft_test.py b/sdks/python/apache_beam/ml/transforms/tft_test.py index 38ded6a809af0..9f15db45bd285 100644 --- a/sdks/python/apache_beam/ml/transforms/tft_test.py +++ b/sdks/python/apache_beam/ml/transforms/tft_test.py @@ -711,8 +711,13 @@ def test_count_per_key_on_list(self): ])) def validate_count_per_key(key_vocab_filename): + files = os.listdir(self.artifact_location) + files.remove(base._ATTRIBUTE_FILE_NAME) key_vocab_location = os.path.join( - self.artifact_location, 'transform_fn/assets', key_vocab_filename) + self.artifact_location, + files[0], + 'transform_fn/assets', + key_vocab_filename) with open(key_vocab_location, 'r') as f: key_vocab_list = [line.strip() for line in f] return key_vocab_list diff --git a/sdks/python/apache_beam/ml/transforms/utils.py b/sdks/python/apache_beam/ml/transforms/utils.py index 19bb02c5ae1b9..b66cb4162ce29 100644 --- a/sdks/python/apache_beam/ml/transforms/utils.py +++ b/sdks/python/apache_beam/ml/transforms/utils.py @@ -17,9 +17,11 @@ __all__ = ['ArtifactsFetcher'] +import os import typing import tensorflow_transform as tft +from apache_beam.ml.transforms import base class ArtifactsFetcher(): @@ -28,8 +30,13 @@ class ArtifactsFetcher(): to the TFTProcessHandlers in MLTransform. 
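  Only a single set of TFT artifacts per artifact location is supported;
  when more than one artifact directory is present, a NotImplementedError
  is raised.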
""" def __init__(self, artifact_location): - self.artifact_location = artifact_location - self.transform_output = tft.TFTransformOutput(self.artifact_location) + files = os.listdir(artifact_location) + files.remove(base._ATTRIBUTE_FILE_NAME) + if len(files) > 1: + raise NotImplementedError( + 'Multiple files in artifact location not supported yet.') + self._artifact_location = os.path.join(artifact_location, files[0]) + self.transform_output = tft.TFTransformOutput(self._artifact_location) def get_vocab_list( self, From fc4ec0086be15e43f435eb340942a7f497d874e3 Mon Sep 17 00:00:00 2001 From: Anand Inguva Date: Wed, 29 Nov 2023 10:50:45 -0500 Subject: [PATCH 08/52] Add JsonPickle to requirements --- sdks/python/container/py310/base_image_requirements.txt | 1 + sdks/python/container/py311/base_image_requirements.txt | 1 + sdks/python/container/py38/base_image_requirements.txt | 1 + sdks/python/container/py39/base_image_requirements.txt | 1 + sdks/python/setup.py | 1 + 5 files changed, 5 insertions(+) diff --git a/sdks/python/container/py310/base_image_requirements.txt b/sdks/python/container/py310/base_image_requirements.txt index a9f94104374e0..fc1ce3f28eeab 100644 --- a/sdks/python/container/py310/base_image_requirements.txt +++ b/sdks/python/container/py310/base_image_requirements.txt @@ -82,6 +82,7 @@ idna==3.4 iniconfig==2.0.0 joblib==1.3.2 Js2Py==0.74 +jsonpickle==3.0.2 jsonschema==4.19.1 jsonschema-specifications==2023.7.1 mmh3==4.0.1 diff --git a/sdks/python/container/py311/base_image_requirements.txt b/sdks/python/container/py311/base_image_requirements.txt index 865b856683a4d..7b55936530a09 100644 --- a/sdks/python/container/py311/base_image_requirements.txt +++ b/sdks/python/container/py311/base_image_requirements.txt @@ -79,6 +79,7 @@ idna==3.4 iniconfig==2.0.0 joblib==1.3.2 Js2Py==0.74 +jsonpickle==3.0.2 jsonschema==4.19.1 jsonschema-specifications==2023.7.1 mmh3==4.0.1 diff --git a/sdks/python/container/py38/base_image_requirements.txt b/sdks/python/container/py38/base_image_requirements.txt index 5dffff5f80d9a..fb89284967167 100644 --- a/sdks/python/container/py38/base_image_requirements.txt +++ b/sdks/python/container/py38/base_image_requirements.txt @@ -85,6 +85,7 @@ importlib-resources==6.1.0 iniconfig==2.0.0 joblib==1.3.2 Js2Py==0.74 +jsonpickle==3.0.2 jsonschema==4.19.1 jsonschema-specifications==2023.7.1 mmh3==4.0.1 diff --git a/sdks/python/container/py39/base_image_requirements.txt b/sdks/python/container/py39/base_image_requirements.txt index 1b8ad7a2e748f..c0dcd6baf6a33 100644 --- a/sdks/python/container/py39/base_image_requirements.txt +++ b/sdks/python/container/py39/base_image_requirements.txt @@ -83,6 +83,7 @@ importlib-metadata==6.8.0 iniconfig==2.0.0 joblib==1.3.2 Js2Py==0.74 +jsonpickle==3.0.2 jsonschema==4.19.1 jsonschema-specifications==2023.7.1 mmh3==4.0.1 diff --git a/sdks/python/setup.py b/sdks/python/setup.py index 1785cd75df80b..6c99dad55504d 100644 --- a/sdks/python/setup.py +++ b/sdks/python/setup.py @@ -286,6 +286,7 @@ def get_portability_package_data(): 'httplib2>=0.8,<0.23.0', 'js2py>=0.74,<1', 'jsonschema>=4.0.0,<5.0.0', + 'jsonpickle>=3.0.0,<4.0.0', # numpy can have breaking changes in minor versions. # Use a strict upper bound. 'numpy>=1.14.3,<1.25.0', # Update pyproject.toml as well. 
From 3da5ce836bd752169f1f58daa7ea15bfc77d139f Mon Sep 17 00:00:00 2001 From: Anand Inguva Date: Wed, 29 Nov 2023 10:51:03 -0500 Subject: [PATCH 09/52] Add tox tests --- sdks/python/test-suites/tox/py38/build.gradle | 12 +++++++++ sdks/python/tox.ini | 27 +++++++++++++++++++ 2 files changed, 39 insertions(+) diff --git a/sdks/python/test-suites/tox/py38/build.gradle b/sdks/python/test-suites/tox/py38/build.gradle index b1ed5f88c7c93..c4fd300ca9435 100644 --- a/sdks/python/test-suites/tox/py38/build.gradle +++ b/sdks/python/test-suites/tox/py38/build.gradle @@ -141,6 +141,18 @@ toxTask "testPy38transformers-430", "py38-transformers-430", "${posargs}" test.dependsOn "testPy38transformers-430" preCommitPyCoverage.dependsOn "testPy38transformers-430" +toxTask "testPy38sentenceTransformers-222", "py38-sentence-transformers-222", "${posargs}" +test.dependsOn "testPy38sentenceTransformers-222" +preCommitPyCoverage.dependsOn "testPy38sentenceTransformers-222" + +toxTask "testPy38tensorflowHub-014", "py38-tfhub-014", "${posargs}" +test.dependsOn "testPy38tensorflowHub-014" +preCommitPyCoverage.dependsOn "testPy38tensorflowHub-014" + +toxTask "testPy38tensorflowHub-015", "py38-tfhub-015", "${posargs}" +test.dependsOn "testPy38tensorflowHub-015" +preCommitPyCoverage.dependsOn "testPy38tensorflowHub-015" + toxTask "whitespacelint", "whitespacelint", "${posargs}" task archiveFilesToLint(type: Zip) { diff --git a/sdks/python/tox.ini b/sdks/python/tox.ini index e4cf09cacba40..57533af31a286 100644 --- a/sdks/python/tox.ini +++ b/sdks/python/tox.ini @@ -423,3 +423,30 @@ commands = # Run all Vertex AI unit tests # Allow exit code 5 (no tests run) so that we can run this command safely on arbitrary subdirectories. /bin/sh -c 'pytest -o junit_suite_name={envname} --junitxml=pytest_{envname}.xml -n 6 -m uses_vertex_ai {posargs}; ret=$?; [ $ret = 5 ] && exit 0 || exit $ret' + + +[testenv:py{38,39,310,311}-sentence-transformers-222] +deps = + sentence-transformers==2.2.2 +extras = test,gcp +commands = + # Log aiplatform and its dependencies version for debugging + /bin/sh -c "pip freeze | grep -E sentence-transformers" + # Allow exit code 5 (no tests run) so that we can run this command safely on arbitrary subdirectories. + bash {toxinidir}/scripts/run_pytest.sh {envname} 'apache_beam/ml/transforms/embeddings' + +[testenv:py{38,39,310,311}-tfhub-{014,015}] +deps = + 014: + tensorflow-hub>=0.14.0,<0.15.0 + tensorflow-text + 015: + tensorflow-hub>=0.15.0,<0.16.0 + tensorflow-text + +extras = test,gcp +commands = + # Log aiplatform and its dependencies version for debugging + /bin/sh -c "pip freeze | grep -E tensorflow" + # Allow exit code 5 (no tests run) so that we can run this command safely on arbitrary subdirectories. 
+ bash {toxinidir}/scripts/run_pytest.sh {envname} 'apache_beam/ml/transforms/embeddings' From 4b4ee588c2db1185d37f0a1c38eed1df3d7275b7 Mon Sep 17 00:00:00 2001 From: Anand Inguva Date: Wed, 29 Nov 2023 13:16:08 -0500 Subject: [PATCH 10/52] Mock frameworks in pydocs Fix tox.ini Fix pydoc Fix indent in pydoc --- sdks/python/apache_beam/ml/transforms/base.py | 8 +++++++- .../ml/transforms/embeddings/tensorflow_hub.py | 6 +++--- .../apache_beam/ml/transforms/embeddings/vertex_ai.py | 4 +--- sdks/python/scripts/generate_pydoc.sh | 4 +++- sdks/python/tox.ini | 9 +++------ 5 files changed, 17 insertions(+), 14 deletions(-) diff --git a/sdks/python/apache_beam/ml/transforms/base.py b/sdks/python/apache_beam/ml/transforms/base.py index b8a9beafb9862..4ecbeacb8e672 100644 --- a/sdks/python/apache_beam/ml/transforms/base.py +++ b/sdks/python/apache_beam/ml/transforms/base.py @@ -43,7 +43,13 @@ _LOGGER = logging.getLogger(__name__) _ATTRIBUTE_FILE_NAME = 'attributes.json' -__all__ = ['MLTransform', 'ProcessHandler', 'BaseOperation'] +__all__ = [ + 'MLTransform', + 'ProcessHandler', + 'PTransformProvider', + 'BaseOperation', + 'EmbeddingsManager' +] TransformedDatasetT = TypeVar('TransformedDatasetT') TransformedMetadataT = TypeVar('TransformedMetadataT') diff --git a/sdks/python/apache_beam/ml/transforms/embeddings/tensorflow_hub.py b/sdks/python/apache_beam/ml/transforms/embeddings/tensorflow_hub.py index 62bd00e10359a..4b01f7ec44b9a 100644 --- a/sdks/python/apache_beam/ml/transforms/embeddings/tensorflow_hub.py +++ b/sdks/python/apache_beam/ml/transforms/embeddings/tensorflow_hub.py @@ -91,9 +91,6 @@ def __init__( hub_url: str, preprocessing_url: Optional[str] = None, **kwargs): - super().__init__(columns=columns, **kwargs) - self.model_uri = hub_url - self.preprocessing_url = preprocessing_url """ Embedding config for tensorflow hub models. This config can be used with MLTransform to embed text data. Models are loaded using the RunInference @@ -109,6 +106,9 @@ def __init__( max_batch_size: The maximum batch size to be used for inference. large_model: Whether to share the model across processes. """ + super().__init__(columns=columns, **kwargs) + self.model_uri = hub_url + self.preprocessing_url = preprocessing_url def get_model_handler(self) -> ModelHandler: # override the default inference function diff --git a/sdks/python/apache_beam/ml/transforms/embeddings/vertex_ai.py b/sdks/python/apache_beam/ml/transforms/embeddings/vertex_ai.py index 31f5240093441..e4c6745bb5665 100644 --- a/sdks/python/apache_beam/ml/transforms/embeddings/vertex_ai.py +++ b/sdks/python/apache_beam/ml/transforms/embeddings/vertex_ai.py @@ -104,12 +104,10 @@ def __init__( project: Optional[str] = None, location: Optional[str] = None, credentials: Optional[Credentials] = None, - **kwargs, - ): + **kwargs): """ Embedding Config for Vertex AI Text Embedding models following https://cloud.google.com/vertex-ai/docs/generative-ai/embeddings/get-text-embeddings # pylint: disable=line-too-long - Text Embeddings are generated for a batch of text using the Vertex AI SDK. Embeddings are returned in a list for each text in the batch. 
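    For the textembedding-gecko family of models, each returned embedding is
    a 768-dimensional vector.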
Look at https://cloud.google.com/vertex-ai/docs/generative-ai/learn/model-versioning#stable-versions-available.md # pylint: disable=line-too-long diff --git a/sdks/python/scripts/generate_pydoc.sh b/sdks/python/scripts/generate_pydoc.sh index 06ad06320fcf4..8d5b43167dd11 100755 --- a/sdks/python/scripts/generate_pydoc.sh +++ b/sdks/python/scripts/generate_pydoc.sh @@ -133,7 +133,9 @@ autodoc_inherit_docstrings = False autodoc_member_order = 'bysource' autodoc_mock_imports = ["tensorrt", "cuda", "torch", "onnxruntime", "onnx", "tensorflow", "tensorflow_hub", - "tensorflow_transform", "tensorflow_metadata", "transformers"] + "tensorflow_transform", "tensorflow_metadata", "transformers", "tensorflow_text", + "sentence_transformers", + ] # Allow a special section for documenting DataFrame API napoleon_custom_sections = ['Differences from pandas'] diff --git a/sdks/python/tox.ini b/sdks/python/tox.ini index 57533af31a286..1cea858e8bbc2 100644 --- a/sdks/python/tox.ini +++ b/sdks/python/tox.ini @@ -437,12 +437,9 @@ commands = [testenv:py{38,39,310,311}-tfhub-{014,015}] deps = - 014: - tensorflow-hub>=0.14.0,<0.15.0 - tensorflow-text - 015: - tensorflow-hub>=0.15.0,<0.16.0 - tensorflow-text + 014: tensorflow-hub>=0.14.0,<0.15.0 + 015: tensorflow-hub>=0.15.0,<0.16.0 + tensorflow-text extras = test,gcp commands = From 01ba2175330257745c890b80f6db7f26eae24c62 Mon Sep 17 00:00:00 2001 From: Anand Inguva Date: Mon, 4 Dec 2023 18:25:24 +0000 Subject: [PATCH 11/52] Add Row type check --- sdks/python/apache_beam/typehints/typehints.py | 8 ++++++-- 1 file changed, 6 insertions(+), 2 deletions(-) diff --git a/sdks/python/apache_beam/typehints/typehints.py b/sdks/python/apache_beam/typehints/typehints.py index 4fd4b97e82cde..70eb78b6ffc60 100644 --- a/sdks/python/apache_beam/typehints/typehints.py +++ b/sdks/python/apache_beam/typehints/typehints.py @@ -1020,13 +1020,13 @@ def __getitem__(self, type_param): class CollectionHint(CompositeTypeHint): """ A Collection type-hint. - + Collection[X] defines a type-hint for a collection of homogenous types. 'X' may be either a built-in Python type or another nested TypeConstraint. This represents a collections.abc.Collection type, which implements __contains__, __iter__, and __len__. This acts as a parent type for - sets but has fewer guarantees for mixins. + sets but has fewer guarantees for mixins. """ class CollectionTypeConstraint(SequenceTypeConstraint): def __init__(self, type_param): @@ -1302,6 +1302,8 @@ def is_consistent_with(sub, base): relation, but also handles the special Any type as well as type parameterization. """ + from apache_beam.pvalue import Row + from apache_beam.typehints.row_type import RowTypeConstraint if sub == base: # Common special case. return True @@ -1313,6 +1315,8 @@ def is_consistent_with(sub, base): return all(is_consistent_with(c, base) for c in sub.union_types) elif isinstance(base, TypeConstraint): return base._consistent_with_check_(sub) + elif isinstance(sub, RowTypeConstraint): + return base == Row elif isinstance(sub, TypeConstraint): # Nothing but object lives above any type constraints. 
return base == object From f080c25ca6310cc8a4614d71f23d3632f716bdc7 Mon Sep 17 00:00:00 2001 From: Anand Inguva Date: Mon, 4 Dec 2023 18:29:35 +0000 Subject: [PATCH 12/52] Remove requires_chaining --- sdks/python/apache_beam/ml/transforms/base.py | 26 ++++--------------- .../embeddings/sentence_transformer.py | 5 +--- .../transforms/embeddings/tensorflow_hub.py | 2 +- .../embeddings/tensorflow_hub_test.py | 10 +++---- 4 files changed, 12 insertions(+), 31 deletions(-) diff --git a/sdks/python/apache_beam/ml/transforms/base.py b/sdks/python/apache_beam/ml/transforms/base.py index 4ecbeacb8e672..580c7c7e912d4 100644 --- a/sdks/python/apache_beam/ml/transforms/base.py +++ b/sdks/python/apache_beam/ml/transforms/base.py @@ -93,14 +93,10 @@ class PTransformProvider: Data processing transforms that are intended to be used with MLTransform should subclass PTransformProvider and implement the following methods: 1. get_ptransform_for_processing() - 2. requires_chaining() get_ptransform_for_processing() method should return a PTransform that can be used to process the data. - requires_chaining() method should return True if the data processing - transforms needs to be chained sequentially with compatible data processing - transforms. """ @abc.abstractmethod def get_ptransform_for_processing(self, **kwargs) -> beam.PTransform: @@ -108,13 +104,6 @@ def get_ptransform_for_processing(self, **kwargs) -> beam.PTransform: Returns a PTransform that can be used to process the data. """ - @abc.abstractmethod - def requires_chaining(self): - """ - Returns True if the data processing transforms needs to be chained - sequentially with compatible data processing transforms. - """ - def get_counter(self): """ Returns the counter name for the data processing transform. @@ -196,10 +185,6 @@ def get_model_handler(self) -> ModelHandler: Return framework specific model handler. """ - def requires_chaining(self): - # each embedding config requires a separate PTransform. so no chaining. - return False - def get_columns_to_apply(self): return self.columns @@ -495,15 +480,14 @@ def create_ptransform_list(self): artifact_location=os.path.join( self._parent_artifact_location, uuid.uuid4().hex[:6]), artifact_mode=self.artifact_mode) - # Determine if a new ptransform should be added to the list - is_different_type = (type(current_ptransform) != previous_ptransform_type) - if is_different_type or not transform.requires_chaining(): + append_transform = hasattr(current_ptransform, 'append_transform') + if (type(current_ptransform) != previous_ptransform_type) or not append_transform: ptransform_list.append(current_ptransform) previous_ptransform_type = type(current_ptransform) - - if hasattr(ptransform_list[-1], 'append_transform'): + # If different PTransform is appended to the list and the PTransform + # supports append_transform, append the transform to the PTransform. 
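+      # In practice, consecutive TFT operations collapse into a single
+      # TFTProcessHandler here, while each embedding config contributes its
+      # own RunInference-based PTransform.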
+ if append_transform: ptransform_list[-1].append_transform(transform) - return ptransform_list def save_transforms_in_artifact_location(self, ptransform_list): diff --git a/sdks/python/apache_beam/ml/transforms/embeddings/sentence_transformer.py b/sdks/python/apache_beam/ml/transforms/embeddings/sentence_transformer.py index 5b31dbca00820..f41e24c0f7a4a 100644 --- a/sdks/python/apache_beam/ml/transforms/embeddings/sentence_transformer.py +++ b/sdks/python/apache_beam/ml/transforms/embeddings/sentence_transformer.py @@ -122,7 +122,4 @@ def get_model_handler(self): def get_ptransform_for_processing(self, **kwargs) -> beam.PTransform: # wrap the model handler in a _TextEmbeddingHandler since # the SentenceTransformerEmbeddings works on text input data. - return (RunInference(model_handler=_TextEmbeddingHandler(self))) - - def requires_chaining(self): - return False + return (RunInference(model_handler=_TextEmbeddingHandler(self))) \ No newline at end of file diff --git a/sdks/python/apache_beam/ml/transforms/embeddings/tensorflow_hub.py b/sdks/python/apache_beam/ml/transforms/embeddings/tensorflow_hub.py index 4b01f7ec44b9a..4612ca9d1d425 100644 --- a/sdks/python/apache_beam/ml/transforms/embeddings/tensorflow_hub.py +++ b/sdks/python/apache_beam/ml/transforms/embeddings/tensorflow_hub.py @@ -45,7 +45,7 @@ def __init__(self, preprocessing_url: Optional[str], *args, **kwargs): def load_model(self): # unable to load the models with tf.keras.models.load_model so # using hub.KerasLayer instead - model = hub.KerasLayer(self._model_uri) + model = hub.KerasLayer(self._model_uri, ) return model def _convert_prediction_result_to_list( diff --git a/sdks/python/apache_beam/ml/transforms/embeddings/tensorflow_hub_test.py b/sdks/python/apache_beam/ml/transforms/embeddings/tensorflow_hub_test.py index 6b918153945ae..8c571e0cf4621 100644 --- a/sdks/python/apache_beam/ml/transforms/embeddings/tensorflow_hub_test.py +++ b/sdks/python/apache_beam/ml/transforms/embeddings/tensorflow_hub_test.py @@ -21,7 +21,7 @@ import apache_beam as beam from apache_beam.ml.transforms.base import MLTransform -hub_url = 'https://tfhub.dev/google/LEALLA/LEALLA-small/1' +hub_url = 'https://tfhub.dev/google/nnlm-en-dim128/2' test_query_column = 'test_query' test_query = 'This is a test query' @@ -134,12 +134,12 @@ def test_embeddings_with_read_artifact_location(self): pipeline=data, read_artifact_location=self.artifact_location) def assert_element(element): - assert round(element, 2) == 0.21 + # 0.29836970567703247 + assert round(element, 2) == 0.3 _ = ( result_pcoll | beam.Map(lambda x: max(x[test_query_column])) - # 0.14797046780586243 | beam.Map(assert_element)) def test_with_int_data_types(self): @@ -185,12 +185,12 @@ def test_with_gcs_artifact_location(self): pipeline=data, read_artifact_location=artifact_location) def assert_element(element): - assert round(element, 2) == 0.21 + # 0.29836970567703247 + assert round(element, 2) == 0.3 _ = ( result_pcoll | beam.Map(lambda x: max(x[test_query_column])) - # 0.14797046780586243 | beam.Map(assert_element)) From 6111c31066264f1387395fc8c86d7f061720c921 Mon Sep 17 00:00:00 2001 From: Anand Inguva Date: Mon, 4 Dec 2023 18:39:48 +0000 Subject: [PATCH 13/52] change name of PTransformProvider to MLTransformProvider --- sdks/python/apache_beam/ml/transforms/base.py | 27 ++++++++++--------- .../apache_beam/ml/transforms/base_test.py | 2 +- 2 files changed, 15 insertions(+), 14 deletions(-) diff --git a/sdks/python/apache_beam/ml/transforms/base.py 
b/sdks/python/apache_beam/ml/transforms/base.py index 580c7c7e912d4..096773339bb7e 100644 --- a/sdks/python/apache_beam/ml/transforms/base.py +++ b/sdks/python/apache_beam/ml/transforms/base.py @@ -46,7 +46,7 @@ __all__ = [ 'MLTransform', 'ProcessHandler', - 'PTransformProvider', + 'MLTransformProvider', 'BaseOperation', 'EmbeddingsManager' ] @@ -88,10 +88,10 @@ class ArtifactMode(object): CONSUME = 'consume' -class PTransformProvider: +class MLTransformProvider: """ Data processing transforms that are intended to be used with MLTransform - should subclass PTransformProvider and implement the following methods: + should subclass MLTransformProvider and implement the following methods: 1. get_ptransform_for_processing() get_ptransform_for_processing() method should return a PTransform that can be @@ -113,7 +113,7 @@ def get_counter(self): class BaseOperation(Generic[OperationInputT, OperationOutputT], - PTransformProvider, + MLTransformProvider, abc.ABC): def __init__(self, columns: List[str]) -> None: """ @@ -158,7 +158,7 @@ def append_transform(self, transform: BaseOperation): # TODO: Add support for inference_fn -class EmbeddingsManager(PTransformProvider): +class EmbeddingsManager(MLTransformProvider): def __init__( self, columns: List[str], @@ -326,13 +326,13 @@ def with_transform(self, transform: BaseOperation): return self def _validate_transform(self, transform): - # every data processing transform should subclass PTransformProvider. Raise - # an error if the transform does not subclass PTransformProvider since the + # every data processing transform should subclass MLTransformProvider. Raise + # an error if the transform does not subclass MLTransformProvider since the # downstream code expects the transform to be a subclass of - # PTransformProvider. - if not isinstance(transform, PTransformProvider): + # MLTransformProvider. + if not isinstance(transform, MLTransformProvider): raise TypeError( - 'transform must be a subclass of PTransformProvider and implement ' + 'transform must be a subclass of MLTransformProvider and implement ' 'get_ptransform_for_processing() method.' 'Got: %s instead.' 
% type(transform)) @@ -471,9 +471,9 @@ def create_ptransform_list(self): current_ptransform = None ptransform_list = [] for transform in self.transforms: - if not isinstance(transform, PTransformProvider): + if not isinstance(transform, MLTransformProvider): raise RuntimeError( - 'Transforms must be instances of PTransformProvider and ' + 'Transforms must be instances of MLTransformProvider and ' 'implement get_ptransform_for_processing() method.') # for each instance of PTransform, create a new artifact location current_ptransform = transform.get_ptransform_for_processing( @@ -481,7 +481,8 @@ def create_ptransform_list(self): self._parent_artifact_location, uuid.uuid4().hex[:6]), artifact_mode=self.artifact_mode) append_transform = hasattr(current_ptransform, 'append_transform') - if (type(current_ptransform) != previous_ptransform_type) or not append_transform: + if (type(current_ptransform) != + previous_ptransform_type) or not append_transform: ptransform_list.append(current_ptransform) previous_ptransform_type = type(current_ptransform) # If different PTransform is appended to the list and the PTransform diff --git a/sdks/python/apache_beam/ml/transforms/base_test.py b/sdks/python/apache_beam/ml/transforms/base_test.py index 1f9e5a85d1c2a..e79157cea7565 100644 --- a/sdks/python/apache_beam/ml/transforms/base_test.py +++ b/sdks/python/apache_beam/ml/transforms/base_test.py @@ -287,7 +287,7 @@ def __call__(self, x): return x + 1 with self.assertRaisesRegex( - TypeError, 'transform must be a subclass of PTransformProvider'): + TypeError, 'transform must be a subclass of MLTransformProvider'): with beam.Pipeline() as p: _ = ( p From ba24e81e0994a78791f030ac42abc0a6525966b7 Mon Sep 17 00:00:00 2001 From: Anand Inguva Date: Mon, 4 Dec 2023 18:57:00 +0000 Subject: [PATCH 14/52] remove batch_len in utility fun --- sdks/python/apache_beam/ml/transforms/base.py | 14 +++++++--- .../apache_beam/ml/transforms/base_test.py | 28 +++++++++++++++++++ 2 files changed, 38 insertions(+), 4 deletions(-) diff --git a/sdks/python/apache_beam/ml/transforms/base.py b/sdks/python/apache_beam/ml/transforms/base.py index 096773339bb7e..667c9acb069ad 100644 --- a/sdks/python/apache_beam/ml/transforms/base.py +++ b/sdks/python/apache_beam/ml/transforms/base.py @@ -74,10 +74,16 @@ def _convert_list_of_dicts_to_dict_of_lists( def _convert_dict_of_lists_to_lists_of_dict( - dict_of_lists: Dict[str, List[Any]], - batch_length: int) -> List[Dict[str, Any]]: + dict_of_lists: Dict[str, List[Any]]) -> List[Dict[str, Any]]: + + batch_length = len(next(iter(dict_of_lists.values()))) result: List[Dict[str, Any]] = [{} for _ in range(batch_length)] + # all the values in the dict_of_lists should have same length for key, values in dict_of_lists.items(): + assert len(values) == batch_length, ( + "This function expects all the values " + "in the dict_of_lists to have same length." + ) for i in range(len(values)): result[i][key] = values[i] return result @@ -578,11 +584,11 @@ def run_inference( should be of the same size for a single key across the batch. 
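    For example, [{'a': 'x'}, {'a': 'y'}] is regrouped as {'a': ['x', 'y']}
    before the model call and converted back to a list of dicts afterwards.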
""" self._validate_batch(batch) - batch_len = len(batch) dict_batch = _convert_list_of_dicts_to_dict_of_lists(list_of_dicts=batch) transformed_batch = self._process_batch(dict_batch, model, inference_args) return _convert_dict_of_lists_to_lists_of_dict( - dict_of_lists=transformed_batch, batch_length=batch_len) + dict_of_lists=transformed_batch, + ) def get_metrics_namespace(self) -> str: return ( diff --git a/sdks/python/apache_beam/ml/transforms/base_test.py b/sdks/python/apache_beam/ml/transforms/base_test.py index e79157cea7565..4e73a915adc50 100644 --- a/sdks/python/apache_beam/ml/transforms/base_test.py +++ b/sdks/python/apache_beam/ml/transforms/base_test.py @@ -425,5 +425,33 @@ def test_handler_on_multiple_columns(self): ) +class TestUtilFunctions(unittest.TestCase): + def test_list_of_dicts_to_dict_of_lists_normal(self): + input_list = [{'a': 1, 'b': 2}, {'a': 3, 'b': 4}] + expected_output = {'a': [1, 3], 'b': [2, 4]} + self.assertEqual( + base._convert_list_of_dicts_to_dict_of_lists(input_list), + expected_output) + + def test_list_of_dicts_to_dict_of_lists_on_list_inputs(self): + input_list = [{'a': [1, 2, 10], 'b': 3}, {'a': [1], 'b': 5}] + expected_output = {'a': [[1, 2, 10], [1]], 'b': [3, 5]} + self.assertEqual( + base._convert_list_of_dicts_to_dict_of_lists(input_list), + expected_output) + + def test_dict_of_lists_to_lists_of_dict_normal(self): + input_dict = {'a': [1, 3], 'b': [2, 4]} + expected_output = [{'a': 1, 'b': 2}, {'a': 3, 'b': 4}] + self.assertEqual( + base._convert_dict_of_lists_to_lists_of_dict(input_dict), + expected_output) + + def test_dict_of_lists_to_lists_of_dict_unequal_length(self): + input_dict = {'a': [1, 3], 'b': [2]} + with self.assertRaises(AssertionError): + base._convert_dict_of_lists_to_lists_of_dict(input_dict) + + if __name__ == '__main__': unittest.main() From d690aec81da16aa491254301ca92c477bf71b75b Mon Sep 17 00:00:00 2001 From: Anand Inguva Date: Mon, 4 Dec 2023 20:13:14 +0000 Subject: [PATCH 15/52] Change type annotation and redundant comments --- sdks/python/apache_beam/ml/transforms/base.py | 12 ++++-------- 1 file changed, 4 insertions(+), 8 deletions(-) diff --git a/sdks/python/apache_beam/ml/transforms/base.py b/sdks/python/apache_beam/ml/transforms/base.py index 667c9acb069ad..fcd2c3299e384 100644 --- a/sdks/python/apache_beam/ml/transforms/base.py +++ b/sdks/python/apache_beam/ml/transforms/base.py @@ -75,7 +75,6 @@ def _convert_list_of_dicts_to_dict_of_lists( def _convert_dict_of_lists_to_lists_of_dict( dict_of_lists: Dict[str, List[Any]]) -> List[Dict[str, Any]]: - batch_length = len(next(iter(dict_of_lists.values()))) result: List[Dict[str, Any]] = [{} for _ in range(batch_length)] # all the values in the dict_of_lists should have same length @@ -203,8 +202,7 @@ def __init__( *, write_artifact_location: Optional[str] = None, read_artifact_location: Optional[str] = None, - transforms: Optional[List[Union[BaseOperation, - EmbeddingsManager]]] = None): + transforms: Optional[MLTransformProvider] = None): """ MLTransform is a Beam PTransform that can be used to apply transformations to the data. MLTransform is used to wrap the @@ -317,7 +315,7 @@ def expand( | "MLTransformMetricsUsage" >> MLTransformMetricsUsage(self)) return pcoll # type: ignore[return-value] - def with_transform(self, transform: BaseOperation): + def with_transform(self, transform: MLTransformProvider): """ Add a transform to the MLTransform pipeline. Args: @@ -325,9 +323,7 @@ def with_transform(self, transform: BaseOperation): Returns: A MLTransform instance. 
""" - # self._validate_transform(transform) - # avoid circular import - # pylint: disable=wrong-import-order, wrong-import-position + self._validate_transform(transform) self.transforms.append(transform) return self @@ -457,7 +453,7 @@ class _MLTransformToPTransformMapper: """ def __init__( self, - transforms: List[Union[BaseOperation, EmbeddingsManager]], + transforms: List[MLTransformProvider], artifact_location: str, artifact_mode: str, pipeline_options: Optional[PipelineOptions] = None, From af7496b8bcdecc929cc7224a23cec6df0a50df93 Mon Sep 17 00:00:00 2001 From: Anand Inguva Date: Mon, 4 Dec 2023 20:16:44 +0000 Subject: [PATCH 16/52] Remove get_transforms method --- sdks/python/apache_beam/ml/transforms/handlers.py | 5 ----- 1 file changed, 5 deletions(-) diff --git a/sdks/python/apache_beam/ml/transforms/handlers.py b/sdks/python/apache_beam/ml/transforms/handlers.py index 1a673c51df261..620a417c29422 100644 --- a/sdks/python/apache_beam/ml/transforms/handlers.py +++ b/sdks/python/apache_beam/ml/transforms/handlers.py @@ -15,7 +15,6 @@ # limitations under the License. # # pytype: skip-file -# pylint: skip-file import collections import hashlib @@ -218,9 +217,6 @@ def __init__( def append_transform(self, transform): self.transforms.append(transform) - def get_transforms(self): - return self.transforms - def _map_column_names_to_types(self, row_type): """ Return a dictionary of column names and types. @@ -323,7 +319,6 @@ def _get_raw_data_feature_spec_per_column( f"Please provide a valid type from the following: " f"{_default_type_to_tensor_type_map.keys()}") return tf.io.VarLenFeature(_default_type_to_tensor_type_map[dtype]) - # return tf.io.VarLenFeature() def get_raw_data_metadata( self, input_types: Dict[str, type]) -> dataset_metadata.DatasetMetadata: From d713555e24cb2f02c3ce40d51f6b82a4176ef856 Mon Sep 17 00:00:00 2001 From: Anand Inguva Date: Mon, 4 Dec 2023 20:18:42 +0000 Subject: [PATCH 17/52] remove requires_chaining from tft --- sdks/python/apache_beam/ml/transforms/tft.py | 3 --- 1 file changed, 3 deletions(-) diff --git a/sdks/python/apache_beam/ml/transforms/tft.py b/sdks/python/apache_beam/ml/transforms/tft.py index 8705b79aa309a..3a103962045f6 100644 --- a/sdks/python/apache_beam/ml/transforms/tft.py +++ b/sdks/python/apache_beam/ml/transforms/tft.py @@ -114,9 +114,6 @@ def get_ptransform_for_processing(self, **kwargs) -> beam.PTransform: params['artifact_mode'] = artifact_mode return TFTProcessHandler(artifact_location=artifact_location, **params) - def requires_chaining(self): - return True - @tf.function def _split_string_with_delimiter(self, data, delimiter): """ From 50450f34b67d80375f1a3bf739de8ade4ee2f40c Mon Sep 17 00:00:00 2001 From: Anand Inguva Date: Mon, 4 Dec 2023 22:42:54 +0000 Subject: [PATCH 18/52] add tests to sentence-transformers --- .../embeddings/sentence_transformer_test.py | 172 +++++++++--------- 1 file changed, 89 insertions(+), 83 deletions(-) diff --git a/sdks/python/apache_beam/ml/transforms/embeddings/sentence_transformer_test.py b/sdks/python/apache_beam/ml/transforms/embeddings/sentence_transformer_test.py index 63f401180dc2d..bdf30ec14fc36 100644 --- a/sdks/python/apache_beam/ml/transforms/embeddings/sentence_transformer_test.py +++ b/sdks/python/apache_beam/ml/transforms/embeddings/sentence_transformer_test.py @@ -18,8 +18,12 @@ import tempfile import unittest +from parameterized import parameterized + import apache_beam as beam from apache_beam.ml.transforms.base import MLTransform +from apache_beam.testing.util import assert_that +from 
apache_beam.testing.util import equal_to # pylint: disable=ungrouped-imports try: @@ -37,6 +41,39 @@ test_query = "This is a test" test_query_column = "feature_1" DEFAULT_MODEL_NAME = "sentence-transformers/all-mpnet-base-v2" +_parameterized_inputs = [ + ([{ + test_query_column: '样例数据-1' + }, { + test_query_column: '样例数据-2' + }, { + test_query_column: '样例数据-3' + }, { + test_query_column: '样例数据-4' + }], + 'BAAI/bge-base-en-v1.5', [0.1091, 0.122, 0.104, 0.1093]), + ([{ + test_query_column: test_query, + }], DEFAULT_MODEL_NAME, [0.1342]), + ( + [{ + test_query_column: 'query: how much protein should a female eat', + }, + { + test_query_column: ( + "passage: As a general guideline, the CDC's " + "average requirement of protein for women " + "ages 19 to 70 is 46 grams per day. But, " + "as you can see from this chart, you'll need " + "to increase that if you're expecting or training" + " for a marathon. Check out the chart below " + "to see how much protein " + "you should be eating each day.") + }], + 'intfloat/e5-base-v2', + # this model requires inputs to be specified as query: and passage: + [0.0982, 0.1033]), +] def get_pipeline_wth_embedding_config( @@ -99,60 +136,36 @@ def assert_element(element): _ = (transformed_pcoll | beam.Map(assert_element)) - def pipeline_with_configurable_artifact_location( - self, - pipeline, - embedding_config=None, - read_artifact_location=None, - write_artifact_location=None): - if write_artifact_location: - return ( - pipeline - | MLTransform(write_artifact_location=write_artifact_location). - with_transform(embedding_config)) - elif read_artifact_location: - return ( - pipeline - | MLTransform(read_artifact_location=read_artifact_location)) - else: - raise NotImplementedError + @parameterized.expand(_parameterized_inputs) + def test_embeddings_with_read_artifact_location( + self, inputs, model_name, output): + embedding_config = SentenceTransformerEmbeddings( + model_name=model_name, columns=[test_query_column]) - def test_embeddings_with_read_artifact_location(self): with beam.Pipeline() as p: - model_name = DEFAULT_MODEL_NAME - embedding_config = SentenceTransformerEmbeddings( - model_name=model_name, columns=[test_query_column]) + result_pcoll = ( + p + | "CreateData" >> beam.Create(inputs) + | "MLTransform" >> MLTransform( + write_artifact_location=self.artifact_location).with_transform( + embedding_config)) + max_ele_pcoll = ( + result_pcoll + | beam.Map(lambda x: round(max(x[test_query_column]), 4))) - with beam.Pipeline() as p: - data = ( - p - | "CreateData" >> beam.Create([{ - test_query_column: test_query - }])) - _ = self.pipeline_with_configurable_artifact_location( - pipeline=data, - embedding_config=embedding_config, - write_artifact_location=self.artifact_location) - - with beam.Pipeline() as p: - data = ( - p - | "CreateData" >> beam.Create([{ - test_query_column: test_query - }, { - test_query_column: test_query - }])) - result_pcoll = self.pipeline_with_configurable_artifact_location( - pipeline=data, read_artifact_location=self.artifact_location) + assert_that(max_ele_pcoll, equal_to(output)) - def assert_element(element): - assert round(element, 2) == 0.13 + with beam.Pipeline() as p: + result_pcoll = ( + p + | "CreateData" >> beam.Create(inputs) + | "MLTransform" >> + MLTransform(read_artifact_location=self.artifact_location)) + max_ele_pcoll = ( + result_pcoll + | beam.Map(lambda x: round(max(x[test_query_column]), 4))) - _ = ( - result_pcoll - | beam.Map(lambda x: max(x[test_query_column])) - # 0.1342099905014038 - | 
beam.Map(assert_element)) + assert_that(max_ele_pcoll, equal_to(output)) def test_sentence_transformer_with_int_data_types(self): model_name = DEFAULT_MODEL_NAME @@ -169,43 +182,36 @@ def test_sentence_transformer_with_int_data_types(self): write_artifact_location=self.artifact_location).with_transform( embedding_config)) - def test_with_gcs_artifact_location(self): + @parameterized.expand(_parameterized_inputs) + def test_with_gcs_artifact_location(self, inputs, model_name, output): artifact_location = ('gs://apache-beam-ml/testing/sentence_transformers') - with beam.Pipeline() as p: - model_name = DEFAULT_MODEL_NAME - embedding_config = SentenceTransformerEmbeddings( - model_name=model_name, columns=[test_query_column]) - - with beam.Pipeline() as p: - data = ( - p - | "CreateData" >> beam.Create([{ - test_query_column: test_query - }])) - _ = self.pipeline_with_configurable_artifact_location( - pipeline=data, - embedding_config=embedding_config, - write_artifact_location=artifact_location) - - with beam.Pipeline() as p: - data = ( - p - | "CreateData" >> beam.Create([{ - test_query_column: test_query - }, { - test_query_column: test_query - }])) - result_pcoll = self.pipeline_with_configurable_artifact_location( - pipeline=data, read_artifact_location=artifact_location) + embedding_config = SentenceTransformerEmbeddings( + model_name=model_name, columns=[test_query_column]) - def assert_element(element): - assert round(element, 2) == 0.13 + with beam.Pipeline() as p: + result_pcoll = ( + p + | "CreateData" >> beam.Create(inputs) + | "MLTransform" >> + MLTransform(write_artifact_location=artifact_location).with_transform( + embedding_config)) + max_ele_pcoll = ( + result_pcoll + | beam.Map(lambda x: round(max(x[test_query_column]), 4))) + + assert_that(max_ele_pcoll, equal_to(output)) - _ = ( - result_pcoll - | beam.Map(lambda x: max(x[test_query_column])) - # 0.1342099905014038 - | beam.Map(assert_element)) + with beam.Pipeline() as p: + result_pcoll = ( + p + | "CreateData" >> beam.Create(inputs) + | "MLTransform" >> + MLTransform(read_artifact_location=artifact_location)) + max_ele_pcoll = ( + result_pcoll + | beam.Map(lambda x: round(max(x[test_query_column]), 4))) + + assert_that(max_ele_pcoll, equal_to(output)) if __name__ == '__main__': From 8823a752e76f660278f00d9891c42fe87d0f5a9a Mon Sep 17 00:00:00 2001 From: Anand Inguva Date: Tue, 5 Dec 2023 15:22:41 -0500 Subject: [PATCH 19/52] Pass inference_args to RunInference --- sdks/python/apache_beam/ml/transforms/base.py | 9 ++-- .../embeddings/sentence_transformer.py | 8 ++- .../embeddings/sentence_transformer_test.py | 51 +++++++++++++------ .../transforms/embeddings/tensorflow_hub.py | 8 ++- .../ml/transforms/embeddings/vertex_ai.py | 5 +- 5 files changed, 57 insertions(+), 24 deletions(-) diff --git a/sdks/python/apache_beam/ml/transforms/base.py b/sdks/python/apache_beam/ml/transforms/base.py index fcd2c3299e384..7b4bf0b0ef2a2 100644 --- a/sdks/python/apache_beam/ml/transforms/base.py +++ b/sdks/python/apache_beam/ml/transforms/base.py @@ -179,6 +179,7 @@ def __init__( self.max_batch_size = max_batch_size self.large_model = large_model self.columns = columns + self.inference_args = kwargs.pop('inference_args', {}) if kwargs: _LOGGER.warning("Ignoring the following arguments: %s", kwargs.keys()) @@ -300,9 +301,8 @@ def expand( _MLTransformToPTransformMapper.load_transforms_from_artifact_location( self._parent_artifact_location)) - # the saved transforms has artifact mode set to PRODUCE. - # set the artifact mode to CONSUME. 
- if self._artifact_mode == ArtifactMode.CONSUME: + # the saved transforms has artifact mode set to PRODUCE. + # set the artifact mode to CONSUME. for i in range(len(ptransform_list)): if hasattr(ptransform_list[i], 'artifact_mode'): ptransform_list[i].artifact_mode = self._artifact_mode @@ -598,3 +598,6 @@ def batch_elements_kwargs(self) -> Mapping[str, Any]: if self.embedding_config.min_batch_size: batch_sizes_map['min_batch_size'] = self.embedding_config.min_batch_size return (self._underlying.batch_elements_kwargs() or batch_sizes_map) + + def validate_inference_args(self, _): + pass diff --git a/sdks/python/apache_beam/ml/transforms/embeddings/sentence_transformer.py b/sdks/python/apache_beam/ml/transforms/embeddings/sentence_transformer.py index f41e24c0f7a4a..935e9281c2d59 100644 --- a/sdks/python/apache_beam/ml/transforms/embeddings/sentence_transformer.py +++ b/sdks/python/apache_beam/ml/transforms/embeddings/sentence_transformer.py @@ -68,7 +68,7 @@ def run_inference( return model.encode(batch, **inference_args) def load_model(self): - model = self._model_class(self._model_uri) + model = self._model_class(self._model_uri, **self._load_model_args) if self._max_seq_length: model.max_seq_length = self._max_seq_length return model @@ -122,4 +122,8 @@ def get_model_handler(self): def get_ptransform_for_processing(self, **kwargs) -> beam.PTransform: # wrap the model handler in a _TextEmbeddingHandler since # the SentenceTransformerEmbeddings works on text input data. - return (RunInference(model_handler=_TextEmbeddingHandler(self))) \ No newline at end of file + return ( + RunInference( + model_handler=_TextEmbeddingHandler(self), + inference_args=self.inference_args, + )) diff --git a/sdks/python/apache_beam/ml/transforms/embeddings/sentence_transformer_test.py b/sdks/python/apache_beam/ml/transforms/embeddings/sentence_transformer_test.py index bdf30ec14fc36..21289797133c0 100644 --- a/sdks/python/apache_beam/ml/transforms/embeddings/sentence_transformer_test.py +++ b/sdks/python/apache_beam/ml/transforms/embeddings/sentence_transformer_test.py @@ -28,6 +28,7 @@ # pylint: disable=ungrouped-imports try: from apache_beam.ml.transforms.embeddings.sentence_transformer import SentenceTransformerEmbeddings + import torch except ImportError: SentenceTransformerEmbeddings = None # type: ignore @@ -76,18 +77,6 @@ ] -def get_pipeline_wth_embedding_config( - pipeline: beam.Pipeline, embedding_config, artifact_location): - transformed_pcoll = ( - pipeline - | "CreateData" >> beam.Create([{ - test_query_column: test_query - }]) - | "MLTransform" >> MLTransform(write_artifact_location=artifact_location). 
- with_transform(embedding_config)) - return transformed_pcoll - - @unittest.skipIf( SentenceTransformerEmbeddings is None, 'sentence-transformers is not installed.') @@ -103,10 +92,14 @@ def test_sentence_transformer_embeddings(self): embedding_config = SentenceTransformerEmbeddings( model_name=model_name, columns=[test_query_column]) with beam.Pipeline() as pipeline: - result_pcoll = get_pipeline_wth_embedding_config( - pipeline=pipeline, - embedding_config=embedding_config, - artifact_location=self.artifact_location) + result_pcoll = ( + pipeline + | "CreateData" >> beam.Create([{ + test_query_column: test_query + }]) + | "MLTransform" >> MLTransform( + write_artifact_location=self.artifact_location).with_transform( + embedding_config)) def assert_element(element): assert len(element[test_query_column]) == 768 @@ -213,6 +206,32 @@ def test_with_gcs_artifact_location(self, inputs, model_name, output): assert_that(max_ele_pcoll, equal_to(output)) + def test_embeddings_with_inference_args(self): + model_name = DEFAULT_MODEL_NAME + + inference_args = {'convert_to_numpy': False} + embedding_config = SentenceTransformerEmbeddings( + model_name=model_name, + columns=[test_query_column], + inference_args=inference_args) + with beam.Pipeline() as pipeline: + result_pcoll = ( + pipeline + | "CreateData" >> beam.Create([{ + test_query_column: test_query + }]) + | "MLTransform" >> MLTransform( + write_artifact_location=self.artifact_location).with_transform( + embedding_config)) + + def assert_element(element): + assert type(element) == torch.Tensor + + _ = ( + result_pcoll + | beam.Map(lambda x: x[test_query_column]) + | beam.Map(assert_element)) + if __name__ == '__main__': unittest.main() diff --git a/sdks/python/apache_beam/ml/transforms/embeddings/tensorflow_hub.py b/sdks/python/apache_beam/ml/transforms/embeddings/tensorflow_hub.py index 4612ca9d1d425..a545d4b3d3a20 100644 --- a/sdks/python/apache_beam/ml/transforms/embeddings/tensorflow_hub.py +++ b/sdks/python/apache_beam/ml/transforms/embeddings/tensorflow_hub.py @@ -45,7 +45,7 @@ def __init__(self, preprocessing_url: Optional[str], *args, **kwargs): def load_model(self): # unable to load the models with tf.keras.models.load_model so # using hub.KerasLayer instead - model = hub.KerasLayer(self._model_uri, ) + model = hub.KerasLayer(self._model_uri, **self._load_model_args) return model def _convert_prediction_result_to_list( @@ -121,4 +121,8 @@ def get_model_handler(self) -> ModelHandler: ) def get_ptransform_for_processing(self, **kwargs) -> beam.PTransform: - return (RunInference(model_handler=_TextEmbeddingHandler(self))) + return ( + RunInference( + model_handler=_TextEmbeddingHandler(self), + inference_args=self.inference_args, + )) diff --git a/sdks/python/apache_beam/ml/transforms/embeddings/vertex_ai.py b/sdks/python/apache_beam/ml/transforms/embeddings/vertex_ai.py index e4c6745bb5665..b61dc98fd5cdd 100644 --- a/sdks/python/apache_beam/ml/transforms/embeddings/vertex_ai.py +++ b/sdks/python/apache_beam/ml/transforms/embeddings/vertex_ai.py @@ -155,4 +155,7 @@ def get_model_handler(self) -> ModelHandler: ) def get_ptransform_for_processing(self, **kwargs) -> beam.PTransform: - return (RunInference(model_handler=_TextEmbeddingHandler(self))) + return ( + RunInference( + model_handler=_TextEmbeddingHandler(self), + inference_args=self.inference_args)) From a7e2bd354c2f3901d8a3f1ab5207b78a87ef5b18 Mon Sep 17 00:00:00 2001 From: Anand Inguva <34158215+AnandInguva@users.noreply.github.com> Date: Tue, 5 Dec 2023 20:25:08 +0000 Subject: 
[PATCH 20/52] Add TODO GH issue

---
 .../ml/transforms/embeddings/sentence_transformer.py          | 3 ++-
 1 file changed, 2 insertions(+), 1 deletion(-)

diff --git a/sdks/python/apache_beam/ml/transforms/embeddings/sentence_transformer.py b/sdks/python/apache_beam/ml/transforms/embeddings/sentence_transformer.py
index f41e24c0f7a4a..fa4c210860fa7 100644
--- a/sdks/python/apache_beam/ml/transforms/embeddings/sentence_transformer.py
+++ b/sdks/python/apache_beam/ml/transforms/embeddings/sentence_transformer.py
@@ -32,7 +32,8 @@
 from sentence_transformers import SentenceTransformer


-# TODO: Use HuggingFaceModelHandlerTensor once the import issue is fixed.
+# TODO: https://github.com/apache/beam/issues/29621
+# Use HuggingFaceModelHandlerTensor once the import issue is fixed.
 # Right now, the hugging face model handler imports torch and tensorflow
 # at the same time, which adds too much weight to the container unnecessarily.
 class _SentenceTransformerModelHandler(ModelHandler):

From f77ae6046fea55a35bc9dd5c1d9e35de72c3552e Mon Sep 17 00:00:00 2001
From: Anand Inguva <34158215+AnandInguva@users.noreply.github.com>
Date: Tue, 5 Dec 2023 20:49:02 +0000
Subject: [PATCH 21/52] refactor variables in vertex_ai embeddings

---
 .../apache_beam/ml/transforms/embeddings/vertex_ai.py | 10 ++++++----
 1 file changed, 6 insertions(+), 4 deletions(-)

diff --git a/sdks/python/apache_beam/ml/transforms/embeddings/vertex_ai.py b/sdks/python/apache_beam/ml/transforms/embeddings/vertex_ai.py
index b61dc98fd5cdd..297549d4f3284 100644
--- a/sdks/python/apache_beam/ml/transforms/embeddings/vertex_ai.py
+++ b/sdks/python/apache_beam/ml/transforms/embeddings/vertex_ai.py
@@ -39,7 +39,8 @@

 __all__ = ["VertexAITextEmbeddings"]

-TASK_TYPE = "RETRIEVAL_DOCUMENT"
+DEFAULT_TASK_TYPE = "RETRIEVAL_DOCUMENT"
+# TODO: Can this list be automatically pulled from Vertex SDK?
 TASK_TYPE_INPUTS = [
     "RETRIEVAL_DOCUMENT",
     "RETRIEVAL_QUERY",
     "SEMANTIC_SIMILARITY",
     "CLASSIFICATION",
     "CLUSTERING"
 ]
+_BATCH_SIZE = 5  # Vertex AI limits requests to 5 at a time.


 class _VertexAITextEmbeddingHandler(ModelHandler):
@@ -57,7 +59,7 @@ def __init__(
       self,
       model_name: str,
       title: Optional[str] = None,
-      task_type: str = TASK_TYPE,
+      task_type: str = DEFAULT_TASK_TYPE,
       project: Optional[str] = None,
       location: Optional[str] = None,
       credentials: Optional[Credentials] = None,
@@ -77,7 +79,7 @@ def run_inference(
       inference_args: Optional[Dict[str, Any]] = None,
   ) -> Iterable:
     embeddings = []
-    batch_size = 5  # Vertex AI limits requests to 5 at a time.
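+    # Vertex AI caps each embedding request at _BATCH_SIZE (currently 5)
+    # texts, so the loop below slices the incoming batch into request-sized
+    # chunks before calling the model.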
+    batch_size = _BATCH_SIZE
     for i in range(0, len(batch), batch_size):
       text_batch = batch[i:i + batch_size]
       text_batch = [
@@ -100,7 +102,7 @@ def __init__(
       model_name: str,
       columns: List[str],
       title: Optional[str] = None,
-      task_type: str = TASK_TYPE,
+      task_type: str = DEFAULT_TASK_TYPE,
       project: Optional[str] = None,
       location: Optional[str] = None,
       credentials: Optional[Credentials] = None,

From 95ed3c59e3c29a3f0709f558f04d3f20e4fc5406 Mon Sep 17 00:00:00 2001
From: Anand Inguva <34158215+AnandInguva@users.noreply.github.com>
Date: Tue, 5 Dec 2023 21:05:03 +0000
Subject: [PATCH 22/52] remove try/catch and throw error if options is empty
 for GCS artifact location

---
 sdks/python/apache_beam/ml/transforms/base.py | 20 ++++++++------------
 1 file changed, 8 insertions(+), 12 deletions(-)

diff --git a/sdks/python/apache_beam/ml/transforms/base.py b/sdks/python/apache_beam/ml/transforms/base.py
index 7b4bf0b0ef2a2..4d90346e61b42 100644
--- a/sdks/python/apache_beam/ml/transforms/base.py
+++ b/sdks/python/apache_beam/ml/transforms/base.py
@@ -323,11 +323,6 @@ def with_transform(self, transform: MLTransformProvider):
     Returns:
       A MLTransform instance.
     """
-    self._validate_transform(transform)
-    self.transforms.append(transform)
-    return self
-
-  def _validate_transform(self, transform):
     # every data processing transform should subclass MLTransformProvider. Raise
     # an error if the transform does not subclass MLTransformProvider since the
     # downstream code expects the transform to be a subclass of
     # MLTransformProvider.
     if not isinstance(transform, MLTransformProvider):
       raise TypeError(
           'transform must be a subclass of MLTransformProvider and implement '
           'get_ptransform_for_processing() method.'
           'Got: %s instead.' % type(transform))
+    self.transforms.append(transform)
+    return self


 class MLTransformMetricsUsage(beam.PTransform):
@@ -404,13 +401,6 @@ def save_attributes(
       **kwargs,
   ):
     if _JsonPickleTransformAttributeManager._is_remote_path(artifact_location):
-      try:
-        options = kwargs.get('options')
-      except KeyError:
-        raise RuntimeError(
-            'pipeline options are required to save the attributes.'
-            'in the artifact location %s' % artifact_location)
-
       temp_dir = tempfile.mkdtemp()
       temp_json_file = os.path.join(temp_dir, _ATTRIBUTE_FILE_NAME)
       with open(temp_json_file, 'w+') as f:
@@ -418,6 +408,12 @@ def save_attributes(
       with open(temp_json_file, 'rb') as f:
         from apache_beam.runners.dataflow.internal import apiclient
         _LOGGER.info('Creating artifact location: %s', artifact_location)
+        # pipeline options are required for the client to configure the project.
+        options = kwargs.get('options')
+        if not options:
+          raise RuntimeError(
+              'pipeline options are required to save the attributes '
+ 'in the artifact location %s' % artifact_location) apiclient.DataflowApplicationClient(options=options).stage_file( gcs_or_local_path=artifact_location, file_name=_ATTRIBUTE_FILE_NAME, From c235499cfbbe0dab53eb7cbb721810415bfb074c Mon Sep 17 00:00:00 2001 From: Anand Inguva <34158215+AnandInguva@users.noreply.github.com> Date: Tue, 5 Dec 2023 21:29:02 +0000 Subject: [PATCH 23/52] Refactor NotImplementedError message --- sdks/python/apache_beam/ml/transforms/utils.py | 6 +++++- 1 file changed, 5 insertions(+), 1 deletion(-) diff --git a/sdks/python/apache_beam/ml/transforms/utils.py b/sdks/python/apache_beam/ml/transforms/utils.py index b66cb4162ce29..b0aef5898cf08 100644 --- a/sdks/python/apache_beam/ml/transforms/utils.py +++ b/sdks/python/apache_beam/ml/transforms/utils.py @@ -32,9 +32,13 @@ class ArtifactsFetcher(): def __init__(self, artifact_location): files = os.listdir(artifact_location) files.remove(base._ATTRIBUTE_FILE_NAME) + # TODO: Integrate ArtifactFetcher into MLTransform. if len(files) > 1: raise NotImplementedError( - 'Multiple files in artifact location not supported yet.') + "MLTransform may have been utilized alongside transforms written " + "in TensorFlow Transform, in conjunction with those from different " + "frameworks. Currently, retrieving artifacts from this " + "multi-framework setup is not supported.") self._artifact_location = os.path.join(artifact_location, files[0]) self.transform_output = tft.TFTransformOutput(self._artifact_location) From 6eebfa40c63707ab7c7991fd31a93cba748ed973 Mon Sep 17 00:00:00 2001 From: Anand Inguva <34158215+AnandInguva@users.noreply.github.com> Date: Tue, 5 Dec 2023 21:33:34 +0000 Subject: [PATCH 24/52] remove tensorflow hub from this PR --- .../transforms/embeddings/tensorflow_hub.py | 128 ----------- .../embeddings/tensorflow_hub_test.py | 198 ------------------ sdks/python/tox.ini | 15 +- 3 files changed, 1 insertion(+), 340 deletions(-) delete mode 100644 sdks/python/apache_beam/ml/transforms/embeddings/tensorflow_hub.py delete mode 100644 sdks/python/apache_beam/ml/transforms/embeddings/tensorflow_hub_test.py diff --git a/sdks/python/apache_beam/ml/transforms/embeddings/tensorflow_hub.py b/sdks/python/apache_beam/ml/transforms/embeddings/tensorflow_hub.py deleted file mode 100644 index a545d4b3d3a20..0000000000000 --- a/sdks/python/apache_beam/ml/transforms/embeddings/tensorflow_hub.py +++ /dev/null @@ -1,128 +0,0 @@ -# -# Licensed to the Apache Software Foundation (ASF) under one or more -# contributor license agreements. See the NOTICE file distributed with -# this work for additional information regarding copyright ownership. -# The ASF licenses this file to You under the Apache License, Version 2.0 -# (the "License"); you may not use this file except in compliance with -# the License. You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. - -from typing import Iterable -from typing import List -from typing import Optional - -import apache_beam as beam -import tensorflow as tf -import tensorflow_hub as hub -import tensorflow_text as text # required to register TF ops. 
# pylint: disable=unused-import -from apache_beam.ml.inference import utils -from apache_beam.ml.inference.base import ModelHandler -from apache_beam.ml.inference.base import PredictionResult -from apache_beam.ml.inference.base import RunInference -from apache_beam.ml.inference.tensorflow_inference import TFModelHandlerTensor -from apache_beam.ml.inference.tensorflow_inference import default_tensor_inference_fn -from apache_beam.ml.transforms.base import EmbeddingsManager -from apache_beam.ml.transforms.base import _TextEmbeddingHandler - -__all__ = ['TensorflowHubTextEmbeddings'] - - -class _TensorflowHubModelHandler(TFModelHandlerTensor): - """ - Note: Intended for internal use only. No backwards compatibility guarantees. - """ - def __init__(self, preprocessing_url: Optional[str], *args, **kwargs): - self.preprocessing_url = preprocessing_url - super().__init__(*args, **kwargs) - - def load_model(self): - # unable to load the models with tf.keras.models.load_model so - # using hub.KerasLayer instead - model = hub.KerasLayer(self._model_uri, **self._load_model_args) - return model - - def _convert_prediction_result_to_list( - self, predictions: Iterable[PredictionResult]): - result = [] - for prediction in predictions: - inference = prediction.inference.numpy().tolist() - result.append(inference) - return result - - def run_inference(self, batch, model, inference_args, model_id=None): - if not inference_args: - inference_args = {} - if not self.preprocessing_url: - predictions = default_tensor_inference_fn( - model=model, - batch=batch, - inference_args=inference_args, - model_id=model_id) - return self._convert_prediction_result_to_list(predictions) - - vectorized_batch = tf.stack(batch, axis=0) - preprocessor_fn = hub.KerasLayer(self.preprocessing_url) - vectorized_batch = preprocessor_fn(vectorized_batch) - predictions = model(vectorized_batch) - # https://www.tensorflow.org/text/tutorials/classify_text_with_bert#using_the_bert_model # pylint: disable=line-too-long - # pooled_output -> represents the text as a whole. This is an embeddings - # of the whole text. The shape is [batch_size, embedding_dimension] - # sequence_output -> represents the text as a sequence of tokens. This is - # an embeddings of each token in the text. The shape is - # [batch_size, max_sequence_length, embedding_dimension] - # pooled output is the embeedings as per the documentation. so let's use - # that. - embeddings = predictions['pooled_output'] - predictions = utils._convert_to_result(batch, embeddings, model_id) - return self._convert_prediction_result_to_list(predictions) - - -class TensorflowHubTextEmbeddings(EmbeddingsManager): - def __init__( - self, - columns: List[str], - hub_url: str, - preprocessing_url: Optional[str] = None, - **kwargs): - """ - Embedding config for tensorflow hub models. This config can be used with - MLTransform to embed text data. Models are loaded using the RunInference - PTransform with the help of a ModelHandler. - - Args: - columns: The columns containing the text to be embedded. - hub_url: The url of the tensorflow hub model. - preprocessing_url: The url of the preprocessing model. This is optional. - If provided, the preprocessing model will be used to preprocess the - text before feeding it to the main model. - min_batch_size: The minimum batch size to be used for inference. - max_batch_size: The maximum batch size to be used for inference. - large_model: Whether to share the model across processes. 
- """ - super().__init__(columns=columns, **kwargs) - self.model_uri = hub_url - self.preprocessing_url = preprocessing_url - - def get_model_handler(self) -> ModelHandler: - # override the default inference function - return _TensorflowHubModelHandler( - model_uri=self.model_uri, - preprocessing_url=self.preprocessing_url, - min_batch_size=self.min_batch_size, - max_batch_size=self.max_batch_size, - large_model=self.large_model, - ) - - def get_ptransform_for_processing(self, **kwargs) -> beam.PTransform: - return ( - RunInference( - model_handler=_TextEmbeddingHandler(self), - inference_args=self.inference_args, - )) diff --git a/sdks/python/apache_beam/ml/transforms/embeddings/tensorflow_hub_test.py b/sdks/python/apache_beam/ml/transforms/embeddings/tensorflow_hub_test.py deleted file mode 100644 index 8c571e0cf4621..0000000000000 --- a/sdks/python/apache_beam/ml/transforms/embeddings/tensorflow_hub_test.py +++ /dev/null @@ -1,198 +0,0 @@ -# -# Licensed to the Apache Software Foundation (ASF) under one or more -# contributor license agreements. See the NOTICE file distributed with -# this work for additional information regarding copyright ownership. -# The ASF licenses this file to You under the Apache License, Version 2.0 -# (the "License"); you may not use this file except in compliance with -# the License. You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. - -import shutil -import tempfile -import unittest - -import apache_beam as beam -from apache_beam.ml.transforms.base import MLTransform - -hub_url = 'https://tfhub.dev/google/nnlm-en-dim128/2' -test_query_column = 'test_query' -test_query = 'This is a test query' - -# pylint: disable=ungrouped-imports -try: - import tensorflow as tf # disable=unused-import - from apache_beam.ml.transforms.embeddings.tensorflow_hub import TensorflowHubTextEmbeddings -except ImportError: - tf = None - -try: - from apache_beam.ml.transforms.tft import ScaleTo01 -except ImportError: - ScaleTo01 = None # type: ignore - - -@unittest.skipIf(tf is None, 'Tensorflow is not installed.') -class TFHubEmbeddingsTest(unittest.TestCase): - def setUp(self) -> None: - self.artifact_location = tempfile.mkdtemp() - - def tearDown(self) -> None: - shutil.rmtree(self.artifact_location) - - def test_tfhub_text_embeddings(self): - embedding_config = TensorflowHubTextEmbeddings( - hub_url=hub_url, columns=[test_query_column]) - with beam.Pipeline() as pipeline: - transformed_pcoll = ( - pipeline - | "CreateData" >> beam.Create([{ - test_query_column: test_query - }]) - | "MLTransform" >> MLTransform( - write_artifact_location=self.artifact_location).with_transform( - embedding_config)) - - def assert_element(element): - assert len(element[test_query_column]) == 128 - - _ = (transformed_pcoll | beam.Map(assert_element)) - - @unittest.skipIf(ScaleTo01 is None, 'Tensorflow Transform is not installed.') - def test_embeddings_with_scale_to_0_1(self): - embedding_config = TensorflowHubTextEmbeddings( - hub_url=hub_url, - columns=[test_query_column], - ) - with beam.Pipeline() as pipeline: - transformed_pcoll = ( - pipeline - | "CreateData" >> beam.Create([{ - test_query_column: test_query - }]) - | "MLTransform" >> 
MLTransform( - write_artifact_location=self.artifact_location).with_transform( - embedding_config).with_transform( - ScaleTo01(columns=[test_query_column]))) - - def assert_element(element): - assert max(element[test_query_column]) == 1 - - _ = ( - transformed_pcoll | beam.Map(lambda x: x.as_dict()) - | beam.Map(assert_element)) - - def pipeline_with_configurable_artifact_location( - self, - pipeline, - embedding_config=None, - read_artifact_location=None, - write_artifact_location=None): - if write_artifact_location: - return ( - pipeline - | MLTransform(write_artifact_location=write_artifact_location). - with_transform(embedding_config)) - elif read_artifact_location: - return ( - pipeline - | MLTransform(read_artifact_location=read_artifact_location)) - else: - raise NotImplementedError - - def test_embeddings_with_read_artifact_location(self): - with beam.Pipeline() as p: - embedding_config = TensorflowHubTextEmbeddings( - hub_url=hub_url, columns=[test_query_column]) - - with beam.Pipeline() as p: - data = ( - p - | "CreateData" >> beam.Create([{ - test_query_column: test_query - }])) - _ = self.pipeline_with_configurable_artifact_location( - pipeline=data, - embedding_config=embedding_config, - write_artifact_location=self.artifact_location) - - with beam.Pipeline() as p: - data = ( - p - | "CreateData" >> beam.Create([{ - test_query_column: test_query - }, { - test_query_column: test_query - }])) - result_pcoll = self.pipeline_with_configurable_artifact_location( - pipeline=data, read_artifact_location=self.artifact_location) - - def assert_element(element): - # 0.29836970567703247 - assert round(element, 2) == 0.3 - - _ = ( - result_pcoll - | beam.Map(lambda x: max(x[test_query_column])) - | beam.Map(assert_element)) - - def test_with_int_data_types(self): - embedding_config = TensorflowHubTextEmbeddings( - hub_url=hub_url, columns=[test_query_column]) - with self.assertRaises(TypeError): - with beam.Pipeline() as pipeline: - _ = ( - pipeline - | "CreateData" >> beam.Create([{ - test_query_column: 1 - }]) - | "MLTransform" >> MLTransform( - write_artifact_location=self.artifact_location).with_transform( - embedding_config)) - - def test_with_gcs_artifact_location(self): - artifact_location = 'gs://apache-beam-ml/testing/tensorflow_hub' - with beam.Pipeline() as p: - embedding_config = TensorflowHubTextEmbeddings( - hub_url=hub_url, columns=[test_query_column]) - - with beam.Pipeline() as p: - data = ( - p - | "CreateData" >> beam.Create([{ - test_query_column: test_query - }])) - _ = self.pipeline_with_configurable_artifact_location( - pipeline=data, - embedding_config=embedding_config, - write_artifact_location=artifact_location) - - with beam.Pipeline() as p: - data = ( - p - | "CreateData" >> beam.Create([{ - test_query_column: test_query - }, { - test_query_column: test_query - }])) - result_pcoll = self.pipeline_with_configurable_artifact_location( - pipeline=data, read_artifact_location=artifact_location) - - def assert_element(element): - # 0.29836970567703247 - assert round(element, 2) == 0.3 - - _ = ( - result_pcoll - | beam.Map(lambda x: max(x[test_query_column])) - | beam.Map(assert_element)) - - -if __name__ == '__main__': - unittest.main() diff --git a/sdks/python/tox.ini b/sdks/python/tox.ini index 1cea858e8bbc2..88c60bce4b190 100644 --- a/sdks/python/tox.ini +++ b/sdks/python/tox.ini @@ -433,17 +433,4 @@ commands = # Log aiplatform and its dependencies version for debugging /bin/sh -c "pip freeze | grep -E sentence-transformers" # Allow exit code 5 (no tests run) so 
that we can run this command safely on arbitrary subdirectories.
-  bash {toxinidir}/scripts/run_pytest.sh {envname} 'apache_beam/ml/transforms/embeddings'
-
-[testenv:py{38,39,310,311}-tfhub-{014,015}]
-deps =
-  014: tensorflow-hub>=0.14.0,<0.15.0
-  015: tensorflow-hub>=0.15.0,<0.16.0
-  tensorflow-text
-
-extras = test,gcp
-commands =
-  # Log aiplatform and its dependencies version for debugging
-  /bin/sh -c "pip freeze | grep -E tensorflow"
-  # Allow exit code 5 (no tests run) so that we can run this command safely on arbitrary subdirectories.
-  bash {toxinidir}/scripts/run_pytest.sh {envname} 'apache_beam/ml/transforms/embeddings'
+  bash {toxinidir}/scripts/run_pytest.sh {envname} 'apache_beam/ml/transforms/embeddings'
\ No newline at end of file

From c27aabba78d8b504de34d25b28f6a8a3652595cc Mon Sep 17 00:00:00 2001
From: Anand Inguva <34158215+AnandInguva@users.noreply.github.com>
Date: Tue, 5 Dec 2023 21:55:52 +0000
Subject: [PATCH 25/52] Add _validate_transform method

---
 sdks/python/apache_beam/ml/transforms/base.py | 18 +++++++++---------
 1 file changed, 9 insertions(+), 9 deletions(-)

diff --git a/sdks/python/apache_beam/ml/transforms/base.py b/sdks/python/apache_beam/ml/transforms/base.py
index 4d90346e61b42..e7498e3aad34f 100644
--- a/sdks/python/apache_beam/ml/transforms/base.py
+++ b/sdks/python/apache_beam/ml/transforms/base.py
@@ -323,17 +323,15 @@ def with_transform(self, transform: MLTransformProvider):
     Returns:
       A MLTransform instance.
     """
-    # every data processing transform should subclass MLTransformProvider. Raise
-    # an error if the transform does not subclass MLTransformProvider since the
-    # downstream code expects the transform to be a subclass of
-    # MLTransformProvider.
+    self._validate_transform(transform)
+    self.transforms.append(transform)
+    return self
+
+  def _validate_transform(self, transform):
     if not isinstance(transform, MLTransformProvider):
       raise TypeError(
-          'transform must be a subclass of MLTransformProvider and implement '
-          'get_ptransform_for_processing() method.'
+          'transform must be a subclass of MLTransformProvider. '
           'Got: %s instead.' % type(transform))
-    self.transforms.append(transform)
-    return self


 class MLTransformMetricsUsage(beam.PTransform):
@@ -536,7 +534,9 @@ def load_model(self):

   def _validate_column_data(self, batch):
     if not isinstance(batch[0], (str, bytes)):
-      raise TypeError('Embeddings can only be generated on text columns.')
+      raise TypeError(
+          'Embeddings can only be generated on Dict[str, str]. '
+ f'Got Dict[str, {type(batch[0])}] instead.') def _validate_batch(self, batch: Sequence[Dict[str, List[str]]]): if not batch or not isinstance(batch[0], dict): From 422a86a2c7a966a59199b30529506304eb6d2a4b Mon Sep 17 00:00:00 2001 From: Anand Inguva <34158215+AnandInguva@users.noreply.github.com> Date: Tue, 5 Dec 2023 23:17:28 +0000 Subject: [PATCH 26/52] add more tests --- sdks/python/apache_beam/ml/transforms/base.py | 2 +- .../apache_beam/ml/transforms/base_test.py | 101 ++++++++++++++++++ .../transforms/embeddings/vertex_ai_test.py | 49 +++++++++ 3 files changed, 151 insertions(+), 1 deletion(-) diff --git a/sdks/python/apache_beam/ml/transforms/base.py b/sdks/python/apache_beam/ml/transforms/base.py index e7498e3aad34f..8c192759c46e3 100644 --- a/sdks/python/apache_beam/ml/transforms/base.py +++ b/sdks/python/apache_beam/ml/transforms/base.py @@ -449,7 +449,7 @@ def __init__( self, transforms: List[MLTransformProvider], artifact_location: str, - artifact_mode: str, + artifact_mode: str = ArtifactMode.PRODUCE, pipeline_options: Optional[PipelineOptions] = None, ): self.transforms = transforms diff --git a/sdks/python/apache_beam/ml/transforms/base_test.py b/sdks/python/apache_beam/ml/transforms/base_test.py index 4e73a915adc50..6badf79369657 100644 --- a/sdks/python/apache_beam/ml/transforms/base_test.py +++ b/sdks/python/apache_beam/ml/transforms/base_test.py @@ -16,6 +16,7 @@ # # pytype: skip-file +import os import shutil import tempfile import typing @@ -41,6 +42,7 @@ # pylint: disable=wrong-import-order, wrong-import-position, ungrouped-imports try: from apache_beam.ml.transforms import tft + from apache_beam.ml.transforms.handlers import TFTProcessHandler from apache_beam.ml.transforms.tft import TFTOperation except ImportError: tft = None # type: ignore @@ -424,6 +426,21 @@ def test_handler_on_multiple_columns(self): equal_to(expected_data), ) + def test_handler_with_list_data(self): + data = [{ + 'x': ['Hello world', 'Apache Beam'], + }, { + 'x': ['Apache Beam', 'Hello world'], + }] + with self.assertRaises(TypeError): + with beam.Pipeline() as p: + _ = ( + p + | beam.Create(data) + | base.MLTransform( + write_artifact_location=self.artifact_location).with_transform( + self.embedding_conig)) + class TestUtilFunctions(unittest.TestCase): def test_list_of_dicts_to_dict_of_lists_normal(self): @@ -453,5 +470,89 @@ def test_dict_of_lists_to_lists_of_dict_unequal_length(self): base._convert_dict_of_lists_to_lists_of_dict(input_dict) +class TestJsonPickleTransformAttributeManager(unittest.TestCase): + def setUp(self): + self.attribute_manager = base._transform_attribute_manager + self.artifact_location = tempfile.mkdtemp() + + def tearDown(self): + shutil.rmtree(self.artifact_location) + + @unittest.skipIf(tft is None, 'tft module is not installed.') + def test_save_tft_process_handler(self): + transforms = [ + tft.ScaleTo01(columns=['x']), + tft.ComputeAndApplyVocabulary(columns=['y']) + ] + process_handler = TFTProcessHandler( + transforms=transforms, + artifact_location=self.artifact_location, + ) + self.attribute_manager.save_attributes( + ptransform_list=[process_handler], + artifact_location=self.artifact_location, + ) + + files = os.listdir(self.artifact_location) + self.assertTrue(len(files) == 1) + self.assertTrue(files[0] == base._ATTRIBUTE_FILE_NAME) + + def test_save_run_inference(self): + self.attribute_manager.save_attributes( + ptransform_list=[RunInference(model_handler=FakeModelHandler())], + artifact_location=self.artifact_location, + ) + files = 
os.listdir(self.artifact_location) + self.assertTrue(len(files) == 1) + self.assertTrue(files[0] == base._ATTRIBUTE_FILE_NAME) + + def test_save_and_load_run_inference(self): + ptransform_list = [RunInference(model_handler=FakeModelHandler())] + self.attribute_manager.save_attributes( + ptransform_list=ptransform_list, + artifact_location=self.artifact_location, + ) + loaded_ptransform_list = self.attribute_manager.load_attributes( + artifact_location=self.artifact_location, + ) + + self.assertTrue(len(loaded_ptransform_list) == len(ptransform_list)) + self.assertListEqual( + list(loaded_ptransform_list[0].__dict__.keys()), + list(ptransform_list[0].__dict__.keys())) + + get_keys = lambda x: list(x.__dict__.keys()) + for i, transform in enumerate(ptransform_list): + self.assertListEqual( + get_keys(transform), get_keys(loaded_ptransform_list[i])) + if hasattr(transform, 'model_handler'): + model_handler = transform.model_handler + loaded_model_handler = loaded_ptransform_list[i].model_handler + self.assertListEqual( + get_keys(model_handler), get_keys(loaded_model_handler)) + + def test_mltransform_to_ptransform_wrapper(self): + transforms = [ + FakeEmbeddingsManager(columns=['x']), + FakeEmbeddingsManager(columns=['y', 'z']), + ] + ptransform_mapper = base._MLTransformToPTransformMapper( + transforms=transforms, + artifact_location=self.artifact_location, + artifact_mode=None) + + ptransform_list = ptransform_mapper.create_ptransform_list() + self.assertTrue(len(ptransform_list) == 2) + + self.assertEqual(type(ptransform_list[0]), RunInference) + expected_columns = [['x'], ['y', 'z']] + for i in range(len(ptransform_list)): + self.assertEqual(type(ptransform_list[i]), RunInference) + self.assertEqual( + type(ptransform_list[i]._model_handler), base._TextEmbeddingHandler) + self.assertEqual( + ptransform_list[i]._model_handler.columns, expected_columns[i]) + + if __name__ == '__main__': unittest.main() diff --git a/sdks/python/apache_beam/ml/transforms/embeddings/vertex_ai_test.py b/sdks/python/apache_beam/ml/transforms/embeddings/vertex_ai_test.py index 7124aab9cbf23..388df7ae30da5 100644 --- a/sdks/python/apache_beam/ml/transforms/embeddings/vertex_ai_test.py +++ b/sdks/python/apache_beam/ml/transforms/embeddings/vertex_ai_test.py @@ -19,6 +19,8 @@ import unittest import apache_beam as beam +from apache_beam.ml.inference.base import RunInference +from apache_beam.ml.transforms import base from apache_beam.ml.transforms.base import MLTransform try: @@ -192,6 +194,53 @@ def assert_element(element): # 0.14797046780586243 | beam.Map(assert_element)) + def test_mltransform_to_ptransform_with_vertex(self): + model_name = 'textembedding-gecko@002' + transforms = [ + VertexAITextEmbeddings( + columns=['x'], + model_name=model_name, + task_type='RETRIEVAL_DOCUMENT'), + VertexAITextEmbeddings( + columns=['y', 'z'], model_name=model_name, task_type='CLUSTERING') + ] + ptransform_mapper = base._MLTransformToPTransformMapper( + transforms=transforms, + artifact_location=self.artifact_location, + artifact_mode=None) + + ptransform_list = ptransform_mapper.create_and_save_ptransform_list() + self.assertTrue(len(ptransform_list) == 2) + + self.assertEqual(type(ptransform_list[0]), RunInference) + expected_columns = [['x'], ['y', 'z']] + expected_task_type = ['RETRIEVAL_DOCUMENT', 'CLUSTERING'] + for i in range(len(ptransform_list)): + self.assertEqual(type(ptransform_list[i]), RunInference) + self.assertEqual( + type(ptransform_list[i]._model_handler), base._TextEmbeddingHandler) + self.assertEqual( + 
ptransform_list[i]._model_handler.columns, expected_columns[i]) + self.assertEqual( + ptransform_list[i]._model_handler._underlying.task_type, + expected_task_type[i]) + self.assertEqual( + ptransform_list[i]._model_handler._underlying.model_name, model_name) + ptransform_list = ( + base._MLTransformToPTransformMapper. + load_transforms_from_artifact_location(self.artifact_location)) + for i in range(len(ptransform_list)): + self.assertEqual(type(ptransform_list[i]), RunInference) + self.assertEqual( + type(ptransform_list[i]._model_handler), base._TextEmbeddingHandler) + self.assertEqual( + ptransform_list[i]._model_handler.columns, expected_columns[i]) + self.assertEqual( + ptransform_list[i]._model_handler._underlying.task_type, + expected_task_type[i]) + self.assertEqual( + ptransform_list[i]._model_handler._underlying.model_name, model_name) + if __name__ == '__main__': unittest.main() From 08b36658f706bd2e235c0b8b5ff02134aea63807 Mon Sep 17 00:00:00 2001 From: Anand Inguva <34158215+AnandInguva@users.noreply.github.com> Date: Wed, 6 Dec 2023 03:19:06 +0000 Subject: [PATCH 27/52] fix test --- sdks/python/apache_beam/ml/transforms/handlers_test.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/sdks/python/apache_beam/ml/transforms/handlers_test.py b/sdks/python/apache_beam/ml/transforms/handlers_test.py index 730f25fe8c141..f13a916824c4c 100644 --- a/sdks/python/apache_beam/ml/transforms/handlers_test.py +++ b/sdks/python/apache_beam/ml/transforms/handlers_test.py @@ -596,7 +596,7 @@ def test_handler_with_same_input_elements(self): transforms=[tft.ComputeAndApplyVocabulary(columns=['x'])], artifact_location=self.artifact_location, ) - transformed_data = process_handler.process_data(raw_data) + transformed_data = raw_data | process_handler expected_data = [ beam.Row(x=np.array([4])), From 91255adcdd67053bd5f811c5781e0a71cec1c8c2 Mon Sep 17 00:00:00 2001 From: Anand Inguva <34158215+AnandInguva@users.noreply.github.com> Date: Wed, 6 Dec 2023 16:09:59 +0000 Subject: [PATCH 28/52] Fix test --- sdks/python/apache_beam/ml/transforms/base.py | 2 +- sdks/python/apache_beam/ml/transforms/base_test.py | 3 +-- 2 files changed, 2 insertions(+), 3 deletions(-) diff --git a/sdks/python/apache_beam/ml/transforms/base.py b/sdks/python/apache_beam/ml/transforms/base.py index 8c192759c46e3..a174c328623ae 100644 --- a/sdks/python/apache_beam/ml/transforms/base.py +++ b/sdks/python/apache_beam/ml/transforms/base.py @@ -203,7 +203,7 @@ def __init__( *, write_artifact_location: Optional[str] = None, read_artifact_location: Optional[str] = None, - transforms: Optional[MLTransformProvider] = None): + transforms: Optional[List[MLTransformProvider]] = None): """ MLTransform is a Beam PTransform that can be used to apply transformations to the data. 
MLTransform is used to wrap the diff --git a/sdks/python/apache_beam/ml/transforms/base_test.py b/sdks/python/apache_beam/ml/transforms/base_test.py index 6badf79369657..2374d110e708c 100644 --- a/sdks/python/apache_beam/ml/transforms/base_test.py +++ b/sdks/python/apache_beam/ml/transforms/base_test.py @@ -288,8 +288,7 @@ class Add: def __call__(self, x): return x + 1 - with self.assertRaisesRegex( - TypeError, 'transform must be a subclass of MLTransformProvider'): + with self.assertRaisesRegex(TypeError, 'transform must be a subclass of'): with beam.Pipeline() as p: _ = ( p From c7237c3e204c07ed991b4404a48129c6d73f8736 Mon Sep 17 00:00:00 2001 From: Anand Inguva <34158215+AnandInguva@users.noreply.github.com> Date: Wed, 6 Dec 2023 17:06:12 +0000 Subject: [PATCH 29/52] Add more tests in sentence-transformer --- .../embeddings/sentence_transformer.py | 4 +- .../embeddings/sentence_transformer_test.py | 39 +++++++++++++++++++ 2 files changed, 41 insertions(+), 2 deletions(-) diff --git a/sdks/python/apache_beam/ml/transforms/embeddings/sentence_transformer.py b/sdks/python/apache_beam/ml/transforms/embeddings/sentence_transformer.py index 8eeaa67ce611c..044c4bb003763 100644 --- a/sdks/python/apache_beam/ml/transforms/embeddings/sentence_transformer.py +++ b/sdks/python/apache_beam/ml/transforms/embeddings/sentence_transformer.py @@ -51,7 +51,7 @@ def __init__( large_model: bool = False, **kwargs): self._max_seq_length = max_seq_length - self._model_uri = model_name + self.model_name = model_name self._model_class = model_class self._load_model_args = load_model_args self._min_batch_size = min_batch_size @@ -69,7 +69,7 @@ def run_inference( return model.encode(batch, **inference_args) def load_model(self): - model = self._model_class(self._model_uri, **self._load_model_args) + model = self._model_class(self.model_name, **self._load_model_args) if self._max_seq_length: model.max_seq_length = self._max_seq_length return model diff --git a/sdks/python/apache_beam/ml/transforms/embeddings/sentence_transformer_test.py b/sdks/python/apache_beam/ml/transforms/embeddings/sentence_transformer_test.py index 21289797133c0..f346b52fcaa88 100644 --- a/sdks/python/apache_beam/ml/transforms/embeddings/sentence_transformer_test.py +++ b/sdks/python/apache_beam/ml/transforms/embeddings/sentence_transformer_test.py @@ -21,6 +21,8 @@ from parameterized import parameterized import apache_beam as beam +from apache_beam.ml.inference.base import RunInference +from apache_beam.ml.transforms import base from apache_beam.ml.transforms.base import MLTransform from apache_beam.testing.util import assert_that from apache_beam.testing.util import equal_to @@ -232,6 +234,43 @@ def assert_element(element): | beam.Map(lambda x: x[test_query_column]) | beam.Map(assert_element)) + def test_mltransform_to_ptransform_with_vertex(self): + model_name = '' + transforms = [ + SentenceTransformerEmbeddings(columns=['x'], model_name=model_name), + SentenceTransformerEmbeddings( + columns=['y', 'z'], model_name=model_name) + ] + ptransform_mapper = base._MLTransformToPTransformMapper( + transforms=transforms, + artifact_location=self.artifact_location, + artifact_mode=None) + + ptransform_list = ptransform_mapper.create_and_save_ptransform_list() + self.assertTrue(len(ptransform_list) == 2) + + self.assertEqual(type(ptransform_list[0]), RunInference) + expected_columns = [['x'], ['y', 'z']] + for i in range(len(ptransform_list)): + self.assertEqual(type(ptransform_list[i]), RunInference) + self.assertEqual( + 
type(ptransform_list[i]._model_handler), base._TextEmbeddingHandler) + self.assertEqual( + ptransform_list[i]._model_handler.columns, expected_columns[i]) + self.assertEqual( + ptransform_list[i]._model_handler._underlying.model_name, model_name) + ptransform_list = ( + base._MLTransformToPTransformMapper. + load_transforms_from_artifact_location(self.artifact_location)) + for i in range(len(ptransform_list)): + self.assertEqual(type(ptransform_list[i]), RunInference) + self.assertEqual( + type(ptransform_list[i]._model_handler), base._TextEmbeddingHandler) + self.assertEqual( + ptransform_list[i]._model_handler.columns, expected_columns[i]) + self.assertEqual( + ptransform_list[i]._model_handler._underlying.model_name, model_name) + if __name__ == '__main__': unittest.main() From a9428855efc4541ba6873989036ab5b338806a1d Mon Sep 17 00:00:00 2001 From: Anand Inguva <34158215+AnandInguva@users.noreply.github.com> Date: Wed, 6 Dec 2023 18:50:41 +0000 Subject: [PATCH 30/52] use np.max instead of max --- .../ml/transforms/embeddings/sentence_transformer_test.py | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) diff --git a/sdks/python/apache_beam/ml/transforms/embeddings/sentence_transformer_test.py b/sdks/python/apache_beam/ml/transforms/embeddings/sentence_transformer_test.py index f346b52fcaa88..99b362674813c 100644 --- a/sdks/python/apache_beam/ml/transforms/embeddings/sentence_transformer_test.py +++ b/sdks/python/apache_beam/ml/transforms/embeddings/sentence_transformer_test.py @@ -18,6 +18,7 @@ import tempfile import unittest +import numpy as np from parameterized import parameterized import apache_beam as beam @@ -192,7 +193,7 @@ def test_with_gcs_artifact_location(self, inputs, model_name, output): embedding_config)) max_ele_pcoll = ( result_pcoll - | beam.Map(lambda x: round(max(x[test_query_column]), 4))) + | beam.Map(lambda x: round(np.max(x[test_query_column]), 4))) assert_that(max_ele_pcoll, equal_to(output)) @@ -204,7 +205,7 @@ def test_with_gcs_artifact_location(self, inputs, model_name, output): MLTransform(read_artifact_location=artifact_location)) max_ele_pcoll = ( result_pcoll - | beam.Map(lambda x: round(max(x[test_query_column]), 4))) + | beam.Map(lambda x: round(np.max(x[test_query_column]), 4))) assert_that(max_ele_pcoll, equal_to(output)) From 89c19fb912c9c1627175900891781b9145428d0b Mon Sep 17 00:00:00 2001 From: Anand Inguva <34158215+AnandInguva@users.noreply.github.com> Date: Wed, 6 Dec 2023 18:52:30 +0000 Subject: [PATCH 31/52] round to 2 decimals --- .../embeddings/sentence_transformer_test.py | 14 +++++++------- 1 file changed, 7 insertions(+), 7 deletions(-) diff --git a/sdks/python/apache_beam/ml/transforms/embeddings/sentence_transformer_test.py b/sdks/python/apache_beam/ml/transforms/embeddings/sentence_transformer_test.py index 99b362674813c..e2d108e8fefcc 100644 --- a/sdks/python/apache_beam/ml/transforms/embeddings/sentence_transformer_test.py +++ b/sdks/python/apache_beam/ml/transforms/embeddings/sentence_transformer_test.py @@ -55,10 +55,10 @@ }, { test_query_column: '样例数据-4' }], - 'BAAI/bge-base-en-v1.5', [0.1091, 0.122, 0.104, 0.1093]), + 'BAAI/bge-base-en-v1.5', [0.11, 0.12, 0.10, 0.11]), ([{ test_query_column: test_query, - }], DEFAULT_MODEL_NAME, [0.1342]), + }], DEFAULT_MODEL_NAME, [0.13]), ( [{ test_query_column: 'query: how much protein should a female eat', @@ -76,7 +76,7 @@ }], 'intfloat/e5-base-v2', # this model requires inputs to be specified as query: and passage: - [0.0982, 0.1033]), + [0.1, 0.1]), ] @@ -147,7 +147,7 @@ def 
test_embeddings_with_read_artifact_location(
           embedding_config))
       max_ele_pcoll = (
           result_pcoll
-          | beam.Map(lambda x: round(max(x[test_query_column]), 4)))
+          | beam.Map(lambda x: round(max(x[test_query_column]), 2)))

       assert_that(max_ele_pcoll, equal_to(output))

@@ -159,7 +159,7 @@ def test_embeddings_with_read_artifact_location(
           MLTransform(read_artifact_location=self.artifact_location))
       max_ele_pcoll = (
           result_pcoll
-          | beam.Map(lambda x: round(max(x[test_query_column]), 4)))
+          | beam.Map(lambda x: round(max(x[test_query_column]), 2)))

       assert_that(max_ele_pcoll, equal_to(output))

@@ -193,7 +193,7 @@ def test_with_gcs_artifact_location(self, inputs, model_name, output):
           embedding_config))
       max_ele_pcoll = (
           result_pcoll
-          | beam.Map(lambda x: round(np.max(x[test_query_column]), 4)))
+          | beam.Map(lambda x: round(np.max(x[test_query_column]), 2)))

       assert_that(max_ele_pcoll, equal_to(output))

@@ -205,7 +205,7 @@ def test_with_gcs_artifact_location(self, inputs, model_name, output):
           MLTransform(read_artifact_location=artifact_location))
       max_ele_pcoll = (
           result_pcoll
-          | beam.Map(lambda x: round(np.max(x[test_query_column]), 4)))
+          | beam.Map(lambda x: round(np.max(x[test_query_column]), 2)))

       assert_that(max_ele_pcoll, equal_to(output))

From 2db4a20e1871effb34147f5ff6c1ab4a2330e76d Mon Sep 17 00:00:00 2001
From: Anand Inguva
Date: Wed, 6 Dec 2023 14:00:01 -0500
Subject: [PATCH 32/52] Remove gradle command action

---
 .github/actions/gradle-command-action | 1 -
 1 file changed, 1 deletion(-)
 delete mode 160000 .github/actions/gradle-command-action

diff --git a/.github/actions/gradle-command-action b/.github/actions/gradle-command-action
deleted file mode 160000
index 90ccf054e6b99..0000000000000
--- a/.github/actions/gradle-command-action
+++ /dev/null
@@ -1 +0,0 @@
-Subproject commit 90ccf054e6b9905f30f98c938bce4c6acd323b6b

From b7a48d5af9e7783bbdd3f7ec027899c01343d762 Mon Sep 17 00:00:00 2001
From: Anand Inguva <34158215+AnandInguva@users.noreply.github.com>
Date: Wed, 6 Dec 2023 19:23:36 +0000
Subject: [PATCH 33/52] Refactor throwing dataflow client exception

---
 sdks/python/apache_beam/ml/transforms/base.py | 22 +++++++++++--------
 .../apache_beam/ml/transforms/base_test.py    |  9 ++++++++
 2 files changed, 22 insertions(+), 9 deletions(-)

diff --git a/sdks/python/apache_beam/ml/transforms/base.py b/sdks/python/apache_beam/ml/transforms/base.py
index a174c328623ae..c0eadedc85ac5 100644
--- a/sdks/python/apache_beam/ml/transforms/base.py
+++ b/sdks/python/apache_beam/ml/transforms/base.py
@@ -408,15 +408,19 @@ def save_attributes(
         _LOGGER.info('Creating artifact location: %s', artifact_location)
         # pipeline options are required for the client to configure the project.
         options = kwargs.get('options')
-        if not options:
-          raise RuntimeError(
-              'pipeline options are required to save the attributes '
-              'in the artifact location %s' % artifact_location)
-        apiclient.DataflowApplicationClient(options=options).stage_file(
-            gcs_or_local_path=artifact_location,
-            file_name=_ATTRIBUTE_FILE_NAME,
-            stream=f,
-            mime_type='application/json')
+        try:
+          apiclient.DataflowApplicationClient(options=options).stage_file(
+              gcs_or_local_path=artifact_location,
+              file_name=_ATTRIBUTE_FILE_NAME,
+              stream=f,
+              mime_type='application/json')
+        except Exception as exc:
+          if not options:
+            raise RuntimeError(
+                "Failed to create Dataflow client. "
+                "Pipeline options are required to save the attributes "
+ "in the artifact location %s" % artifact_location) from exc + raise else: if not FileSystems.exists(artifact_location): FileSystems.mkdirs(artifact_location) diff --git a/sdks/python/apache_beam/ml/transforms/base_test.py b/sdks/python/apache_beam/ml/transforms/base_test.py index 2374d110e708c..6b381ba272487 100644 --- a/sdks/python/apache_beam/ml/transforms/base_test.py +++ b/sdks/python/apache_beam/ml/transforms/base_test.py @@ -552,6 +552,15 @@ def test_mltransform_to_ptransform_wrapper(self): self.assertEqual( ptransform_list[i]._model_handler.columns, expected_columns[i]) + def test_with_gcs_location_with_none_options(self): + path = 'gs://fake_path' + with self.assertRaises(RuntimeError): + self.attribute_manager.save_attributes( + ptransform_list=[], artifact_location=path, options=None) + with self.assertRaises(RuntimeError): + self.attribute_manager.save_attributes( + ptransform_list=[], artifact_location=path) + if __name__ == '__main__': unittest.main() From bad1b3b7ff892463957a12b322003bf37a1a188b Mon Sep 17 00:00:00 2001 From: Anand Inguva <34158215+AnandInguva@users.noreply.github.com> Date: Wed, 6 Dec 2023 20:23:46 +0000 Subject: [PATCH 34/52] skip the test if gcp is not installed --- sdks/python/apache_beam/ml/transforms/base_test.py | 6 ++++++ 1 file changed, 6 insertions(+) diff --git a/sdks/python/apache_beam/ml/transforms/base_test.py b/sdks/python/apache_beam/ml/transforms/base_test.py index 6b381ba272487..8e1515e7ece1a 100644 --- a/sdks/python/apache_beam/ml/transforms/base_test.py +++ b/sdks/python/apache_beam/ml/transforms/base_test.py @@ -59,6 +59,11 @@ def apply_transform(self, inputs, output_column_name, **kwargs): except: # pylint: disable=bare-except pass +try: + from apache_beam.runners.dataflow.internal import apiclient +except ImportError: + apiclient = None # type: ignore + class BaseMLTransformTest(unittest.TestCase): def setUp(self) -> None: @@ -552,6 +557,7 @@ def test_mltransform_to_ptransform_wrapper(self): self.assertEqual( ptransform_list[i]._model_handler.columns, expected_columns[i]) + @unittest.skipIf(apiclient is None, 'apache_beam[gcp] is not installed.') def test_with_gcs_location_with_none_options(self): path = 'gs://fake_path' with self.assertRaises(RuntimeError): From b850cee2260c2131bbceb1e536dfc250684e5d83 Mon Sep 17 00:00:00 2001 From: Anand Inguva <34158215+AnandInguva@users.noreply.github.com> Date: Wed, 6 Dec 2023 22:19:06 +0000 Subject: [PATCH 35/52] remove toxTests for hub --- .../ml/transforms/embeddings/sentence_transformer_test.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/sdks/python/apache_beam/ml/transforms/embeddings/sentence_transformer_test.py b/sdks/python/apache_beam/ml/transforms/embeddings/sentence_transformer_test.py index e2d108e8fefcc..832d9b77932f7 100644 --- a/sdks/python/apache_beam/ml/transforms/embeddings/sentence_transformer_test.py +++ b/sdks/python/apache_beam/ml/transforms/embeddings/sentence_transformer_test.py @@ -76,7 +76,7 @@ }], 'intfloat/e5-base-v2', # this model requires inputs to be specified as query: and passage: - [0.1, 0.1]), + [0.12, 0.13]), ] From ffff21a1386cfd8ddadc74cacb5b460032b6228e Mon Sep 17 00:00:00 2001 From: Anand Inguva <34158215+AnandInguva@users.noreply.github.com> Date: Wed, 6 Dec 2023 22:19:46 +0000 Subject: [PATCH 36/52] remove toxTests for hub --- sdks/python/test-suites/tox/py38/build.gradle | 8 -------- 1 file changed, 8 deletions(-) diff --git a/sdks/python/test-suites/tox/py38/build.gradle b/sdks/python/test-suites/tox/py38/build.gradle index 
c4fd300ca9435..a5a624b998e8d 100644 --- a/sdks/python/test-suites/tox/py38/build.gradle +++ b/sdks/python/test-suites/tox/py38/build.gradle @@ -145,14 +145,6 @@ toxTask "testPy38sentenceTransformers-222", "py38-sentence-transformers-222", "$ test.dependsOn "testPy38sentenceTransformers-222" preCommitPyCoverage.dependsOn "testPy38sentenceTransformers-222" -toxTask "testPy38tensorflowHub-014", "py38-tfhub-014", "${posargs}" -test.dependsOn "testPy38tensorflowHub-014" -preCommitPyCoverage.dependsOn "testPy38tensorflowHub-014" - -toxTask "testPy38tensorflowHub-015", "py38-tfhub-015", "${posargs}" -test.dependsOn "testPy38tensorflowHub-015" -preCommitPyCoverage.dependsOn "testPy38tensorflowHub-015" - toxTask "whitespacelint", "whitespacelint", "${posargs}" task archiveFilesToLint(type: Zip) { From 88412ea622bd8f329f55b49e16afccf0b8799fd4 Mon Sep 17 00:00:00 2001 From: Anand Inguva <34158215+AnandInguva@users.noreply.github.com> Date: Thu, 7 Dec 2023 18:14:15 +0000 Subject: [PATCH 37/52] Fix values in assert for sentence_transformer_test --- .../ml/transforms/embeddings/sentence_transformer_test.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/sdks/python/apache_beam/ml/transforms/embeddings/sentence_transformer_test.py b/sdks/python/apache_beam/ml/transforms/embeddings/sentence_transformer_test.py index 832d9b77932f7..e2d108e8fefcc 100644 --- a/sdks/python/apache_beam/ml/transforms/embeddings/sentence_transformer_test.py +++ b/sdks/python/apache_beam/ml/transforms/embeddings/sentence_transformer_test.py @@ -76,7 +76,7 @@ }], 'intfloat/e5-base-v2', # this model requires inputs to be specified as query: and passage: - [0.12, 0.13]), + [0.1, 0.1]), ] From 617f9d6e97acd381c2e95794e9ef1016a4a3c4c7 Mon Sep 17 00:00:00 2001 From: Anand Inguva <34158215+AnandInguva@users.noreply.github.com> Date: Thu, 7 Dec 2023 18:19:35 +0000 Subject: [PATCH 38/52] rename sentence_transformers to huggingface --- .../embeddings/{sentence_transformer.py => huggingface.py} | 0 .../{sentence_transformer_test.py => huggingface_test.py} | 2 +- 2 files changed, 1 insertion(+), 1 deletion(-) rename sdks/python/apache_beam/ml/transforms/embeddings/{sentence_transformer.py => huggingface.py} (100%) rename sdks/python/apache_beam/ml/transforms/embeddings/{sentence_transformer_test.py => huggingface_test.py} (98%) diff --git a/sdks/python/apache_beam/ml/transforms/embeddings/sentence_transformer.py b/sdks/python/apache_beam/ml/transforms/embeddings/huggingface.py similarity index 100% rename from sdks/python/apache_beam/ml/transforms/embeddings/sentence_transformer.py rename to sdks/python/apache_beam/ml/transforms/embeddings/huggingface.py diff --git a/sdks/python/apache_beam/ml/transforms/embeddings/sentence_transformer_test.py b/sdks/python/apache_beam/ml/transforms/embeddings/huggingface_test.py similarity index 98% rename from sdks/python/apache_beam/ml/transforms/embeddings/sentence_transformer_test.py rename to sdks/python/apache_beam/ml/transforms/embeddings/huggingface_test.py index e2d108e8fefcc..d8b77ba2d83e8 100644 --- a/sdks/python/apache_beam/ml/transforms/embeddings/sentence_transformer_test.py +++ b/sdks/python/apache_beam/ml/transforms/embeddings/huggingface_test.py @@ -30,7 +30,7 @@ # pylint: disable=ungrouped-imports try: - from apache_beam.ml.transforms.embeddings.sentence_transformer import SentenceTransformerEmbeddings + from apache_beam.ml.transforms.embeddings.huggingface import SentenceTransformerEmbeddings import torch except ImportError: SentenceTransformerEmbeddings = None # type: 
ignore From 5cae04b2dddcfa426f3d64cf52235f9952c718ba Mon Sep 17 00:00:00 2001 From: Anand Inguva <34158215+AnandInguva@users.noreply.github.com> Date: Thu, 7 Dec 2023 18:34:30 +0000 Subject: [PATCH 39/52] fix pydocs --- .../apache_beam/ml/transforms/embeddings/huggingface.py | 1 + sdks/python/apache_beam/ml/transforms/embeddings/vertex_ai.py | 4 ++-- 2 files changed, 3 insertions(+), 2 deletions(-) diff --git a/sdks/python/apache_beam/ml/transforms/embeddings/huggingface.py b/sdks/python/apache_beam/ml/transforms/embeddings/huggingface.py index 044c4bb003763..e979296b0b830 100644 --- a/sdks/python/apache_beam/ml/transforms/embeddings/huggingface.py +++ b/sdks/python/apache_beam/ml/transforms/embeddings/huggingface.py @@ -97,6 +97,7 @@ def __init__( Embedding config for sentence-transformers. This config can be used with MLTransform to embed text data. Models are loaded using the RunInference PTransform with the help of ModelHandler. + Args: model_name: Name of the model to use. The model should be hosted on HuggingFace Hub or compatible with sentence_transformers. diff --git a/sdks/python/apache_beam/ml/transforms/embeddings/vertex_ai.py b/sdks/python/apache_beam/ml/transforms/embeddings/vertex_ai.py index 297549d4f3284..c0b7aa41819ec 100644 --- a/sdks/python/apache_beam/ml/transforms/embeddings/vertex_ai.py +++ b/sdks/python/apache_beam/ml/transforms/embeddings/vertex_ai.py @@ -119,7 +119,7 @@ def __init__( model_name: The name of the Vertex AI Text Embedding model. columns: The columns containing the text to be embedded. task_type: The name of the downstream task the embeddings will be used for. - Valid values: + Valid values are listed below. RETRIEVAL_QUERY Specifies the given text is a query in a search/retrieval setting. RETRIEVAL_DOCUMENT @@ -129,7 +129,7 @@ def __init__( CLASSIFICATION Specifies that the given text will be classified. CLUSTERING - Specifies that the embeddings will be used for clustering. + Specifies that the embeddings will be used for clustering. title: Optional identifier of the text content. project: The default GCP project to make Vertex API calls. location: The default location to use when making API calls. 
From 489200f20dfee5cdd0b596f30ce83f225679d13e Mon Sep 17 00:00:00 2001 From: Anand Inguva <34158215+AnandInguva@users.noreply.github.com> Date: Thu, 7 Dec 2023 20:55:57 +0000 Subject: [PATCH 40/52] Change the model name for tests since it is getting different results on different machines --- .../ml/transforms/embeddings/huggingface_test.py | 10 +++------- 1 file changed, 3 insertions(+), 7 deletions(-) diff --git a/sdks/python/apache_beam/ml/transforms/embeddings/huggingface_test.py b/sdks/python/apache_beam/ml/transforms/embeddings/huggingface_test.py index d8b77ba2d83e8..d932eb1212dfa 100644 --- a/sdks/python/apache_beam/ml/transforms/embeddings/huggingface_test.py +++ b/sdks/python/apache_beam/ml/transforms/embeddings/huggingface_test.py @@ -47,15 +47,11 @@ DEFAULT_MODEL_NAME = "sentence-transformers/all-mpnet-base-v2" _parameterized_inputs = [ ([{ - test_query_column: '样例数据-1' + test_query_column: 'That is a happy person' }, { - test_query_column: '样例数据-2' - }, { - test_query_column: '样例数据-3' - }, { - test_query_column: '样例数据-4' + test_query_column: 'That is a very happy person' }], - 'BAAI/bge-base-en-v1.5', [0.11, 0.12, 0.10, 0.11]), + 'thenlper/gte-base', [0.11, 0.11]), ([{ test_query_column: test_query, }], DEFAULT_MODEL_NAME, [0.13]), From 816174a26e2002111405119d18a7d890c7abfafc Mon Sep 17 00:00:00 2001 From: Anand Inguva <34158215+AnandInguva@users.noreply.github.com> Date: Thu, 7 Dec 2023 21:00:04 +0000 Subject: [PATCH 41/52] Fix pydoc in vertexai --- .../ml/transforms/embeddings/vertex_ai.py | 27 ++++++------------- 1 file changed, 8 insertions(+), 19 deletions(-) diff --git a/sdks/python/apache_beam/ml/transforms/embeddings/vertex_ai.py b/sdks/python/apache_beam/ml/transforms/embeddings/vertex_ai.py index c0b7aa41819ec..843f15293a130 100644 --- a/sdks/python/apache_beam/ml/transforms/embeddings/vertex_ai.py +++ b/sdks/python/apache_beam/ml/transforms/embeddings/vertex_ai.py @@ -118,25 +118,14 @@ def __init__( Args: model_name: The name of the Vertex AI Text Embedding model. columns: The columns containing the text to be embedded. - task_type: The name of the downstream task the embeddings will be used for. - Valid values are listed below. - RETRIEVAL_QUERY - Specifies the given text is a query in a search/retrieval setting. - RETRIEVAL_DOCUMENT - Specifies the given text is a document from the corpus being searched. - SEMANTIC_SIMILARITY - Specifies the given text will be used for STS. - CLASSIFICATION - Specifies that the given text will be classified. - CLUSTERING - Specifies that the embeddings will be used for clustering. - title: Optional identifier of the text content. - project: The default GCP project to make Vertex API calls. - location: The default location to use when making API calls. - credentials: The default custom - credentials to use when making API calls. If not provided credentials - will be ascertained from the environment. - + task_type: The downstream task for the embeddings. + Valid values: RETRIEVAL_QUERY, RETRIEVAL_DOCUMENT, + SEMANTIC_SIMILARITY, CLASSIFICATION, CLUSTERING. + title: Identifier of the text content. + project: The default GCP project for API calls. + location: The default location for API calls. + credentials: Custom credentials for API calls. + Defaults to environment credentials. 
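+
+    Example (an illustrative sketch; assumes ``artifact_location`` points to
+    a valid local or GCS directory)::
+
+      embedding_config = VertexAITextEmbeddings(
+          model_name='textembedding-gecko@002',
+          columns=['text'],
+          task_type='RETRIEVAL_DOCUMENT')
+      with beam.Pipeline() as pipeline:
+        _ = (
+            pipeline
+            | beam.Create([{'text': 'Hello world'}])
+            | MLTransform(
+                write_artifact_location=artifact_location).with_transform(
+                    embedding_config))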
""" self.model_name = model_name self.project = project From cfb18831abc458ed5ed987bcd729d2f06d9710c1 Mon Sep 17 00:00:00 2001 From: Anand Inguva <34158215+AnandInguva@users.noreply.github.com> Date: Fri, 8 Dec 2023 00:55:03 +0000 Subject: [PATCH 42/52] add suffix to artifact_location --- .../transforms/embeddings/huggingface_test.py | 68 +++++++++---------- .../ml/transforms/embeddings/vertex_ai.py | 2 +- 2 files changed, 33 insertions(+), 37 deletions(-) diff --git a/sdks/python/apache_beam/ml/transforms/embeddings/huggingface_test.py b/sdks/python/apache_beam/ml/transforms/embeddings/huggingface_test.py index d932eb1212dfa..516c13a2d60be 100644 --- a/sdks/python/apache_beam/ml/transforms/embeddings/huggingface_test.py +++ b/sdks/python/apache_beam/ml/transforms/embeddings/huggingface_test.py @@ -14,9 +14,11 @@ # See the License for the specific language governing permissions and # limitations under the License. +import os import shutil import tempfile import unittest +import uuid import numpy as np from parameterized import parameterized @@ -55,24 +57,12 @@ ([{ test_query_column: test_query, }], DEFAULT_MODEL_NAME, [0.13]), - ( - [{ - test_query_column: 'query: how much protein should a female eat', - }, - { - test_query_column: ( - "passage: As a general guideline, the CDC's " - "average requirement of protein for women " - "ages 19 to 70 is 46 grams per day. But, " - "as you can see from this chart, you'll need " - "to increase that if you're expecting or training" - " for a marathon. Check out the chart below " - "to see how much protein " - "you should be eating each day.") - }], - 'intfloat/e5-base-v2', - # this model requires inputs to be specified as query: and passage: - [0.1, 0.1]), + ([{ + test_query_column: 'This is an example sentence', + }, { + test_query_column: ("Each sentence is converted") + }], + 'sentence-transformers/all-MiniLM-L6-v2', [0.15, 0.14]), ] @@ -88,6 +78,7 @@ def tearDown(self) -> None: def test_sentence_transformer_embeddings(self): model_name = DEFAULT_MODEL_NAME + artifact_location = os.path.join(self.artifact_location, uuid.uuid4().hex) embedding_config = SentenceTransformerEmbeddings( model_name=model_name, columns=[test_query_column]) with beam.Pipeline() as pipeline: @@ -96,9 +87,9 @@ def test_sentence_transformer_embeddings(self): | "CreateData" >> beam.Create([{ test_query_column: test_query }]) - | "MLTransform" >> MLTransform( - write_artifact_location=self.artifact_location).with_transform( - embedding_config)) + | "MLTransform" >> + MLTransform(write_artifact_location=artifact_location).with_transform( + embedding_config)) def assert_element(element): assert len(element[test_query_column]) == 768 @@ -107,6 +98,7 @@ def assert_element(element): @unittest.skipIf(tft is None, 'Tensorflow Transform is not installed.') def test_embeddings_with_scale_to_0_1(self): + artifact_location = os.path.join(self.artifact_location, uuid.uuid4().hex) model_name = DEFAULT_MODEL_NAME embedding_config = SentenceTransformerEmbeddings( model_name=model_name, @@ -118,10 +110,10 @@ def test_embeddings_with_scale_to_0_1(self): | "CreateData" >> beam.Create([{ test_query_column: test_query }]) - | "MLTransform" >> MLTransform( - write_artifact_location=self.artifact_location).with_transform( - embedding_config).with_transform( - ScaleTo01(columns=[test_query_column]))) + | "MLTransform" >> + MLTransform(write_artifact_location=artifact_location).with_transform( + embedding_config).with_transform( + ScaleTo01(columns=[test_query_column]))) def assert_element(element): 
assert max(element.feature_1) == 1 @@ -134,13 +126,14 @@ def test_embeddings_with_read_artifact_location( embedding_config = SentenceTransformerEmbeddings( model_name=model_name, columns=[test_query_column]) + artifact_location = os.path.join(self.artifact_location, uuid.uuid4().hex) with beam.Pipeline() as p: result_pcoll = ( p | "CreateData" >> beam.Create(inputs) - | "MLTransform" >> MLTransform( - write_artifact_location=self.artifact_location).with_transform( - embedding_config)) + | "MLTransform" >> + MLTransform(write_artifact_location=artifact_location).with_transform( + embedding_config)) max_ele_pcoll = ( result_pcoll | beam.Map(lambda x: round(max(x[test_query_column]), 2))) @@ -152,7 +145,7 @@ def test_embeddings_with_read_artifact_location( p | "CreateData" >> beam.Create(inputs) | "MLTransform" >> - MLTransform(read_artifact_location=self.artifact_location)) + MLTransform(read_artifact_location=artifact_location)) max_ele_pcoll = ( result_pcoll | beam.Map(lambda x: round(max(x[test_query_column]), 2))) @@ -161,6 +154,7 @@ def test_embeddings_with_read_artifact_location( def test_sentence_transformer_with_int_data_types(self): model_name = DEFAULT_MODEL_NAME + artifact_location = os.path.join(self.artifact_location, uuid.uuid4().hex) embedding_config = SentenceTransformerEmbeddings( model_name=model_name, columns=[test_query_column]) with self.assertRaises(TypeError): @@ -171,12 +165,13 @@ def test_sentence_transformer_with_int_data_types(self): test_query_column: 1 }]) | "MLTransform" >> MLTransform( - write_artifact_location=self.artifact_location).with_transform( + write_artifact_location=artifact_location).with_transform( embedding_config)) @parameterized.expand(_parameterized_inputs) def test_with_gcs_artifact_location(self, inputs, model_name, output): - artifact_location = ('gs://apache-beam-ml/testing/sentence_transformers') + artifact_location = os.path.join( + 'gs://apache-beam-ml/testing/sentence_transformers', uuid.uuid4().hex) embedding_config = SentenceTransformerEmbeddings( model_name=model_name, columns=[test_query_column]) @@ -207,7 +202,7 @@ def test_with_gcs_artifact_location(self, inputs, model_name, output): def test_embeddings_with_inference_args(self): model_name = DEFAULT_MODEL_NAME - + artifact_location = os.path.join(self.artifact_location, uuid.uuid4().hex) inference_args = {'convert_to_numpy': False} embedding_config = SentenceTransformerEmbeddings( model_name=model_name, @@ -219,9 +214,9 @@ def test_embeddings_with_inference_args(self): | "CreateData" >> beam.Create([{ test_query_column: test_query }]) - | "MLTransform" >> MLTransform( - write_artifact_location=self.artifact_location).with_transform( - embedding_config)) + | "MLTransform" >> + MLTransform(write_artifact_location=artifact_location).with_transform( + embedding_config)) def assert_element(element): assert type(element) == torch.Tensor @@ -233,6 +228,7 @@ def assert_element(element): def test_mltransform_to_ptransform_with_vertex(self): model_name = '' + artifact_location = os.path.join(self.artifact_location, uuid.uuid4().hex) transforms = [ SentenceTransformerEmbeddings(columns=['x'], model_name=model_name), SentenceTransformerEmbeddings( @@ -240,7 +236,7 @@ def test_mltransform_to_ptransform_with_vertex(self): ] ptransform_mapper = base._MLTransformToPTransformMapper( transforms=transforms, - artifact_location=self.artifact_location, + artifact_location=artifact_location, artifact_mode=None) ptransform_list = ptransform_mapper.create_and_save_ptransform_list() diff --git 
a/sdks/python/apache_beam/ml/transforms/embeddings/vertex_ai.py b/sdks/python/apache_beam/ml/transforms/embeddings/vertex_ai.py index 843f15293a130..2dacb6f88b71f 100644 --- a/sdks/python/apache_beam/ml/transforms/embeddings/vertex_ai.py +++ b/sdks/python/apache_beam/ml/transforms/embeddings/vertex_ai.py @@ -119,7 +119,7 @@ def __init__( model_name: The name of the Vertex AI Text Embedding model. columns: The columns containing the text to be embedded. task_type: The downstream task for the embeddings. - Valid values: RETRIEVAL_QUERY, RETRIEVAL_DOCUMENT, + Valid values are RETRIEVAL_QUERY, RETRIEVAL_DOCUMENT, SEMANTIC_SIMILARITY, CLASSIFICATION, CLUSTERING. title: Identifier of the text content. project: The default GCP project for API calls. From 2cb6f03763d18dff1c86cf4f8f5ef8cb47457233 Mon Sep 17 00:00:00 2001 From: Anand Inguva <34158215+AnandInguva@users.noreply.github.com> Date: Fri, 8 Dec 2023 01:14:09 +0000 Subject: [PATCH 43/52] Revert "add suffix to artifact_location" This reverts commit cfb18831abc458ed5ed987bcd729d2f06d9710c1. --- .../transforms/embeddings/huggingface_test.py | 68 ++++++++++--------- .../ml/transforms/embeddings/vertex_ai.py | 2 +- 2 files changed, 37 insertions(+), 33 deletions(-) diff --git a/sdks/python/apache_beam/ml/transforms/embeddings/huggingface_test.py b/sdks/python/apache_beam/ml/transforms/embeddings/huggingface_test.py index 516c13a2d60be..d932eb1212dfa 100644 --- a/sdks/python/apache_beam/ml/transforms/embeddings/huggingface_test.py +++ b/sdks/python/apache_beam/ml/transforms/embeddings/huggingface_test.py @@ -14,11 +14,9 @@ # See the License for the specific language governing permissions and # limitations under the License. -import os import shutil import tempfile import unittest -import uuid import numpy as np from parameterized import parameterized @@ -57,12 +55,24 @@ ([{ test_query_column: test_query, }], DEFAULT_MODEL_NAME, [0.13]), - ([{ - test_query_column: 'This is an example sentence', - }, { - test_query_column: ("Each sentence is converted") - }], - 'sentence-transformers/all-MiniLM-L6-v2', [0.15, 0.14]), + ( + [{ + test_query_column: 'query: how much protein should a female eat', + }, + { + test_query_column: ( + "passage: As a general guideline, the CDC's " + "average requirement of protein for women " + "ages 19 to 70 is 46 grams per day. But, " + "as you can see from this chart, you'll need " + "to increase that if you're expecting or training" + " for a marathon. 
Check out the chart below " + "to see how much protein " + "you should be eating each day.") + }], + 'intfloat/e5-base-v2', + # this model requires inputs to be specified as query: and passage: + [0.1, 0.1]), ] @@ -78,7 +88,6 @@ def tearDown(self) -> None: def test_sentence_transformer_embeddings(self): model_name = DEFAULT_MODEL_NAME - artifact_location = os.path.join(self.artifact_location, uuid.uuid4().hex) embedding_config = SentenceTransformerEmbeddings( model_name=model_name, columns=[test_query_column]) with beam.Pipeline() as pipeline: @@ -87,9 +96,9 @@ def test_sentence_transformer_embeddings(self): | "CreateData" >> beam.Create([{ test_query_column: test_query }]) - | "MLTransform" >> - MLTransform(write_artifact_location=artifact_location).with_transform( - embedding_config)) + | "MLTransform" >> MLTransform( + write_artifact_location=self.artifact_location).with_transform( + embedding_config)) def assert_element(element): assert len(element[test_query_column]) == 768 @@ -98,7 +107,6 @@ def assert_element(element): @unittest.skipIf(tft is None, 'Tensorflow Transform is not installed.') def test_embeddings_with_scale_to_0_1(self): - artifact_location = os.path.join(self.artifact_location, uuid.uuid4().hex) model_name = DEFAULT_MODEL_NAME embedding_config = SentenceTransformerEmbeddings( model_name=model_name, @@ -110,10 +118,10 @@ def test_embeddings_with_scale_to_0_1(self): | "CreateData" >> beam.Create([{ test_query_column: test_query }]) - | "MLTransform" >> - MLTransform(write_artifact_location=artifact_location).with_transform( - embedding_config).with_transform( - ScaleTo01(columns=[test_query_column]))) + | "MLTransform" >> MLTransform( + write_artifact_location=self.artifact_location).with_transform( + embedding_config).with_transform( + ScaleTo01(columns=[test_query_column]))) def assert_element(element): assert max(element.feature_1) == 1 @@ -126,14 +134,13 @@ def test_embeddings_with_read_artifact_location( embedding_config = SentenceTransformerEmbeddings( model_name=model_name, columns=[test_query_column]) - artifact_location = os.path.join(self.artifact_location, uuid.uuid4().hex) with beam.Pipeline() as p: result_pcoll = ( p | "CreateData" >> beam.Create(inputs) - | "MLTransform" >> - MLTransform(write_artifact_location=artifact_location).with_transform( - embedding_config)) + | "MLTransform" >> MLTransform( + write_artifact_location=self.artifact_location).with_transform( + embedding_config)) max_ele_pcoll = ( result_pcoll | beam.Map(lambda x: round(max(x[test_query_column]), 2))) @@ -145,7 +152,7 @@ def test_embeddings_with_read_artifact_location( p | "CreateData" >> beam.Create(inputs) | "MLTransform" >> - MLTransform(read_artifact_location=artifact_location)) + MLTransform(read_artifact_location=self.artifact_location)) max_ele_pcoll = ( result_pcoll | beam.Map(lambda x: round(max(x[test_query_column]), 2))) @@ -154,7 +161,6 @@ def test_embeddings_with_read_artifact_location( def test_sentence_transformer_with_int_data_types(self): model_name = DEFAULT_MODEL_NAME - artifact_location = os.path.join(self.artifact_location, uuid.uuid4().hex) embedding_config = SentenceTransformerEmbeddings( model_name=model_name, columns=[test_query_column]) with self.assertRaises(TypeError): @@ -165,13 +171,12 @@ def test_sentence_transformer_with_int_data_types(self): test_query_column: 1 }]) | "MLTransform" >> MLTransform( - write_artifact_location=artifact_location).with_transform( + write_artifact_location=self.artifact_location).with_transform( embedding_config)) 
@parameterized.expand(_parameterized_inputs) def test_with_gcs_artifact_location(self, inputs, model_name, output): - artifact_location = os.path.join( - 'gs://apache-beam-ml/testing/sentence_transformers', uuid.uuid4().hex) + artifact_location = ('gs://apache-beam-ml/testing/sentence_transformers') embedding_config = SentenceTransformerEmbeddings( model_name=model_name, columns=[test_query_column]) @@ -202,7 +207,7 @@ def test_with_gcs_artifact_location(self, inputs, model_name, output): def test_embeddings_with_inference_args(self): model_name = DEFAULT_MODEL_NAME - artifact_location = os.path.join(self.artifact_location, uuid.uuid4().hex) + inference_args = {'convert_to_numpy': False} embedding_config = SentenceTransformerEmbeddings( model_name=model_name, @@ -214,9 +219,9 @@ def test_embeddings_with_inference_args(self): | "CreateData" >> beam.Create([{ test_query_column: test_query }]) - | "MLTransform" >> - MLTransform(write_artifact_location=artifact_location).with_transform( - embedding_config)) + | "MLTransform" >> MLTransform( + write_artifact_location=self.artifact_location).with_transform( + embedding_config)) def assert_element(element): assert type(element) == torch.Tensor @@ -228,7 +233,6 @@ def assert_element(element): def test_mltransform_to_ptransform_with_vertex(self): model_name = '' - artifact_location = os.path.join(self.artifact_location, uuid.uuid4().hex) transforms = [ SentenceTransformerEmbeddings(columns=['x'], model_name=model_name), SentenceTransformerEmbeddings( @@ -236,7 +240,7 @@ def test_mltransform_to_ptransform_with_vertex(self): ] ptransform_mapper = base._MLTransformToPTransformMapper( transforms=transforms, - artifact_location=artifact_location, + artifact_location=self.artifact_location, artifact_mode=None) ptransform_list = ptransform_mapper.create_and_save_ptransform_list() diff --git a/sdks/python/apache_beam/ml/transforms/embeddings/vertex_ai.py b/sdks/python/apache_beam/ml/transforms/embeddings/vertex_ai.py index 2dacb6f88b71f..843f15293a130 100644 --- a/sdks/python/apache_beam/ml/transforms/embeddings/vertex_ai.py +++ b/sdks/python/apache_beam/ml/transforms/embeddings/vertex_ai.py @@ -119,7 +119,7 @@ def __init__( model_name: The name of the Vertex AI Text Embedding model. columns: The columns containing the text to be embedded. task_type: The downstream task for the embeddings. - Valid values are RETRIEVAL_QUERY, RETRIEVAL_DOCUMENT, + Valid values: RETRIEVAL_QUERY, RETRIEVAL_DOCUMENT, SEMANTIC_SIMILARITY, CLASSIFICATION, CLUSTERING. title: Identifier of the text content. project: The default GCP project for API calls. 
From cd7050e693c693de4063a632a95d4b0ff5ac84e3 Mon Sep 17 00:00:00 2001 From: Anand Inguva <34158215+AnandInguva@users.noreply.github.com> Date: Fri, 8 Dec 2023 01:17:08 +0000 Subject: [PATCH 44/52] add no_xdist --- .../apache_beam/ml/transforms/embeddings/huggingface_test.py | 2 ++ 1 file changed, 2 insertions(+) diff --git a/sdks/python/apache_beam/ml/transforms/embeddings/huggingface_test.py b/sdks/python/apache_beam/ml/transforms/embeddings/huggingface_test.py index d932eb1212dfa..5e98f77d2deb5 100644 --- a/sdks/python/apache_beam/ml/transforms/embeddings/huggingface_test.py +++ b/sdks/python/apache_beam/ml/transforms/embeddings/huggingface_test.py @@ -19,6 +19,7 @@ import unittest import numpy as np +import pytest from parameterized import parameterized import apache_beam as beam @@ -79,6 +80,7 @@ @unittest.skipIf( SentenceTransformerEmbeddings is None, 'sentence-transformers is not installed.') +@pytest.mark.no_xdist class SentenceTrasformerEmbeddingsTest(unittest.TestCase): def setUp(self) -> None: self.artifact_location = tempfile.mkdtemp() From 98cd949bc408a18c2184fcf014531a4e89f7a0f8 Mon Sep 17 00:00:00 2001 From: Anand Inguva <34158215+AnandInguva@users.noreply.github.com> Date: Fri, 8 Dec 2023 01:17:48 +0000 Subject: [PATCH 45/52] Try fixing pydoc for vertexai --- sdks/python/apache_beam/ml/transforms/embeddings/vertex_ai.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/sdks/python/apache_beam/ml/transforms/embeddings/vertex_ai.py b/sdks/python/apache_beam/ml/transforms/embeddings/vertex_ai.py index 843f15293a130..2dacb6f88b71f 100644 --- a/sdks/python/apache_beam/ml/transforms/embeddings/vertex_ai.py +++ b/sdks/python/apache_beam/ml/transforms/embeddings/vertex_ai.py @@ -119,7 +119,7 @@ def __init__( model_name: The name of the Vertex AI Text Embedding model. columns: The columns containing the text to be embedded. task_type: The downstream task for the embeddings. - Valid values: RETRIEVAL_QUERY, RETRIEVAL_DOCUMENT, + Valid values are RETRIEVAL_QUERY, RETRIEVAL_DOCUMENT, SEMANTIC_SIMILARITY, CLASSIFICATION, CLUSTERING. title: Identifier of the text content. project: The default GCP project for API calls. From 8ea0906be53f0c4d64236cd8525cf43afc800ff5 Mon Sep 17 00:00:00 2001 From: Anand Inguva <34158215+AnandInguva@users.noreply.github.com> Date: Fri, 8 Dec 2023 03:51:22 +0000 Subject: [PATCH 46/52] change tox.ini to use pytest directly --- sdks/python/tox.ini | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/sdks/python/tox.ini b/sdks/python/tox.ini index 88c60bce4b190..eb0bbddd09e8b 100644 --- a/sdks/python/tox.ini +++ b/sdks/python/tox.ini @@ -433,4 +433,4 @@ commands = # Log aiplatform and its dependencies version for debugging /bin/sh -c "pip freeze | grep -E sentence-transformers" # Allow exit code 5 (no tests run) so that we can run this command safely on arbitrary subdirectories. 
- bash {toxinidir}/scripts/run_pytest.sh {envname} 'apache_beam/ml/transforms/embeddings' \ No newline at end of file + /bin/sh -c 'pytest apache_beam/ml/transforms/embeddings -o junit_suite_name={envname} --junitxml=pytest_{envname}.xml {posargs}; ret=$?; [ $ret = 5 ] && exit 0 || exit $ret' From 6f83d3cee96af25f1aa6be451468a2007a15ceb5 Mon Sep 17 00:00:00 2001 From: Anand Inguva <34158215+AnandInguva@users.noreply.github.com> Date: Fri, 8 Dec 2023 20:00:40 +0000 Subject: [PATCH 47/52] raise FileExistsError if attribute file is already present --- sdks/python/apache_beam/ml/transforms/base.py | 17 ++++++++++++++--- .../apache_beam/ml/transforms/base_test.py | 13 +++++++++++++ .../transforms/embeddings/huggingface_test.py | 17 +++++++++-------- .../ml/transforms/embeddings/vertex_ai.py | 7 ++++--- .../ml/transforms/embeddings/vertex_ai_test.py | 11 +++++++---- sdks/python/apache_beam/ml/transforms/tft.py | 4 ---- sdks/python/tox.ini | 5 +++-- 7 files changed, 50 insertions(+), 24 deletions(-) diff --git a/sdks/python/apache_beam/ml/transforms/base.py b/sdks/python/apache_beam/ml/transforms/base.py index c0eadedc85ac5..859fbc68ffd68 100644 --- a/sdks/python/apache_beam/ml/transforms/base.py +++ b/sdks/python/apache_beam/ml/transforms/base.py @@ -96,8 +96,8 @@ class ArtifactMode(object): class MLTransformProvider: """ Data processing transforms that are intended to be used with MLTransform - should subclass MLTransformProvider and implement the following methods: - 1. get_ptransform_for_processing() + should subclass MLTransformProvider and implement + get_ptransform_for_processing(). get_ptransform_for_processing() method should return a PTransform that can be used to process the data. @@ -184,7 +184,7 @@ def __init__( if kwargs: _LOGGER.warning("Ignoring the following arguments: %s", kwargs.keys()) - # TODO: Add set_model_handler method. + # TODO:https://github.com/apache/beam/pull/29564 add set_model_handler method @abc.abstractmethod def get_model_handler(self) -> ModelHandler: """ @@ -398,6 +398,17 @@ def save_attributes( artifact_location, **kwargs, ): + # if the attributes file already exists at the artifact location, raise + # an error instead of overwriting it, since the same artifact location + # can be used by multiple beam jobs and this could result in undesired + # behavior. + if FileSystems.exists(FileSystems.join(artifact_location, + _ATTRIBUTE_FILE_NAME)): + raise FileExistsError( + "The artifact location %s already exists and contains %s. Please " + "specify a different location."
% + (artifact_location, _ATTRIBUTE_FILE_NAME)) + if _JsonPickleTransformAttributeManager._is_remote_path(artifact_location): temp_dir = tempfile.mkdtemp() temp_json_file = os.path.join(temp_dir, _ATTRIBUTE_FILE_NAME) diff --git a/sdks/python/apache_beam/ml/transforms/base_test.py b/sdks/python/apache_beam/ml/transforms/base_test.py index 8e1515e7ece1a..e079594361980 100644 --- a/sdks/python/apache_beam/ml/transforms/base_test.py +++ b/sdks/python/apache_beam/ml/transforms/base_test.py @@ -567,6 +567,19 @@ def test_with_gcs_location_with_none_options(self): self.attribute_manager.save_attributes( ptransform_list=[], artifact_location=path) + def test_with_same_local_artifact_location(self): + artifact_location = self.artifact_location + attribute_manager = base._JsonPickleTransformAttributeManager() + + ptransform_list = [RunInference(model_handler=FakeModelHandler())] + + attribute_manager.save_attributes( + ptransform_list, artifact_location=artifact_location) + + with self.assertRaises(FileExistsError): + attribute_manager.save_attributes([lambda x: x], + artifact_location=artifact_location) + if __name__ == '__main__': unittest.main() diff --git a/sdks/python/apache_beam/ml/transforms/embeddings/huggingface_test.py b/sdks/python/apache_beam/ml/transforms/embeddings/huggingface_test.py index 5e98f77d2deb5..e59090151c5e9 100644 --- a/sdks/python/apache_beam/ml/transforms/embeddings/huggingface_test.py +++ b/sdks/python/apache_beam/ml/transforms/embeddings/huggingface_test.py @@ -14,12 +14,13 @@ # See the License for the specific language governing permissions and # limitations under the License. +import os import shutil import tempfile import unittest +import uuid import numpy as np -import pytest from parameterized import parameterized import apache_beam as beam @@ -80,10 +81,11 @@ @unittest.skipIf( SentenceTransformerEmbeddings is None, 'sentence-transformers is not installed.') -@pytest.mark.no_xdist class SentenceTrasformerEmbeddingsTest(unittest.TestCase): def setUp(self) -> None: - self.artifact_location = tempfile.mkdtemp() + self.artifact_location = tempfile.mkdtemp(prefix='sentence_transformers_') + self.gcs_artifact_location = os.path.join( + 'gs://apache-beam-ml/testing/sentence_transformers', uuid.uuid4().hex) def tearDown(self) -> None: shutil.rmtree(self.artifact_location) @@ -178,7 +180,6 @@ def test_sentence_transformer_with_int_data_types(self): @parameterized.expand(_parameterized_inputs) def test_with_gcs_artifact_location(self, inputs, model_name, output): - artifact_location = ('gs://apache-beam-ml/testing/sentence_transformers') embedding_config = SentenceTransformerEmbeddings( model_name=model_name, columns=[test_query_column]) @@ -187,8 +188,8 @@ def test_with_gcs_artifact_location(self, inputs, model_name, output): p | "CreateData" >> beam.Create(inputs) | "MLTransform" >> - MLTransform(write_artifact_location=artifact_location).with_transform( - embedding_config)) + MLTransform(write_artifact_location=self.gcs_artifact_location + ).with_transform(embedding_config)) max_ele_pcoll = ( result_pcoll | beam.Map(lambda x: round(np.max(x[test_query_column]), 2))) @@ -200,7 +201,7 @@ def test_with_gcs_artifact_location(self, inputs, model_name, output): p | "CreateData" >> beam.Create(inputs) | "MLTransform" >> - MLTransform(read_artifact_location=artifact_location)) + MLTransform(read_artifact_location=self.gcs_artifact_location)) max_ele_pcoll = ( result_pcoll | beam.Map(lambda x: round(np.max(x[test_query_column]), 2))) @@ -233,7 +234,7 @@ def assert_element(element): 
| beam.Map(lambda x: x[test_query_column]) | beam.Map(assert_element)) - def test_mltransform_to_ptransform_with_vertex(self): + def test_mltransform_to_ptransform_with_sentence_transformer(self): model_name = '' transforms = [ SentenceTransformerEmbeddings(columns=['x'], model_name=model_name), diff --git a/sdks/python/apache_beam/ml/transforms/embeddings/vertex_ai.py b/sdks/python/apache_beam/ml/transforms/embeddings/vertex_ai.py index 2dacb6f88b71f..b80498fc7a13d 100644 --- a/sdks/python/apache_beam/ml/transforms/embeddings/vertex_ai.py +++ b/sdks/python/apache_beam/ml/transforms/embeddings/vertex_ai.py @@ -118,9 +118,10 @@ def __init__( Args: model_name: The name of the Vertex AI Text Embedding model. columns: The columns containing the text to be embedded. - task_type: The downstream task for the embeddings. - Valid values are RETRIEVAL_QUERY, RETRIEVAL_DOCUMENT, - SEMANTIC_SIMILARITY, CLASSIFICATION, CLUSTERING. + task_type: The downstream task for the embeddings. Valid values are + RETRIEVAL_QUERY, RETRIEVAL_DOCUMENT, SEMANTIC_SIMILARITY, + CLASSIFICATION, CLUSTERING. For more information on the task type, + look at https://cloud.google.com/vertex-ai/docs/generative-ai/embeddings/get-text-embeddings # pylint: disable=line-too-long title: Identifier of the text content. project: The default GCP project for API calls. location: The default location for API calls. diff --git a/sdks/python/apache_beam/ml/transforms/embeddings/vertex_ai_test.py b/sdks/python/apache_beam/ml/transforms/embeddings/vertex_ai_test.py index 388df7ae30da5..3d8e1ea31673a 100644 --- a/sdks/python/apache_beam/ml/transforms/embeddings/vertex_ai_test.py +++ b/sdks/python/apache_beam/ml/transforms/embeddings/vertex_ai_test.py @@ -14,9 +14,11 @@ # See the License for the specific language governing permissions and # limitations under the License. 
+import os import shutil import tempfile import unittest +import uuid import apache_beam as beam from apache_beam.ml.inference.base import RunInference @@ -44,7 +46,9 @@ VertexAITextEmbeddings is None, 'Vertex AI Python SDK is not installed.') class VertexAIEmbeddingsTest(unittest.TestCase): def setUp(self) -> None: - self.artifact_location = tempfile.mkdtemp() + self.artifact_location = tempfile.mkdtemp(prefix='_vertex_ai_test') + self.gcs_artifact_location = os.path.join( + 'gs://apache-beam-ml/testing/vertex_ai', uuid.uuid4().hex) def tearDown(self) -> None: shutil.rmtree(self.artifact_location) @@ -158,7 +162,6 @@ def test_with_int_data_types(self): embedding_config)) def test_with_gcs_artifact_location(self): - artifact_location = ('gs://apache-beam-ml/testing/vertex_ai') with beam.Pipeline() as p: embedding_config = VertexAITextEmbeddings( model_name=model_name, columns=[test_query_column]) @@ -172,7 +175,7 @@ def test_with_gcs_artifact_location(self): _ = self.pipeline_with_configurable_artifact_location( pipeline=data, embedding_config=embedding_config, - write_artifact_location=artifact_location) + write_artifact_location=self.gcs_artifact_location) with beam.Pipeline() as p: data = ( @@ -183,7 +186,7 @@ def test_with_gcs_artifact_location(self): test_query_column: test_query }])) result_pcoll = self.pipeline_with_configurable_artifact_location( - pipeline=data, read_artifact_location=artifact_location) + pipeline=data, read_artifact_location=self.gcs_artifact_location) def assert_element(element): assert round(element, 2) == 0.15 diff --git a/sdks/python/apache_beam/ml/transforms/tft.py b/sdks/python/apache_beam/ml/transforms/tft.py index 3a103962045f6..8b571d9a685e9 100644 --- a/sdks/python/apache_beam/ml/transforms/tft.py +++ b/sdks/python/apache_beam/ml/transforms/tft.py @@ -105,10 +105,6 @@ def get_ptransform_for_processing(self, **kwargs) -> beam.PTransform: "artifact_location is not specified. Please specify the " "artifact_location for the op %s" % self.__class__.__name__) - transforms = kwargs.get('transforms') - if transforms: - params['transforms'] = transforms - artifact_mode = kwargs.get('artifact_mode') if artifact_mode: params['artifact_mode'] = artifact_mode diff --git a/sdks/python/tox.ini b/sdks/python/tox.ini index eb0bbddd09e8b..dc9e6a28cb9ed 100644 --- a/sdks/python/tox.ini +++ b/sdks/python/tox.ini @@ -425,12 +425,13 @@ commands = /bin/sh -c 'pytest -o junit_suite_name={envname} --junitxml=pytest_{envname}.xml -n 6 -m uses_vertex_ai {posargs}; ret=$?; [ $ret = 5 ] && exit 0 || exit $ret' -[testenv:py{38,39,310,311}-sentence-transformers-222] +[testenv:py{38,39,310,311}-embeddings] deps = sentence-transformers==2.2.2 extras = test,gcp commands = # Log aiplatform and its dependencies version for debugging /bin/sh -c "pip freeze | grep -E sentence-transformers" + /bin/sh -c "pip freeze | grep -E google-cloud-aiplatform" # Allow exit code 5 (no tests run) so that we can run this command safely on arbitrary subdirectories. 
- /bin/sh -c 'pytest apache_beam/ml/transforms/embeddings -o junit_suite_name={envname} --junitxml=pytest_{envname}.xml {posargs}; ret=$?; [ $ret = 5 ] && exit 0 || exit $ret' + /bin/sh -c 'pytest apache_beam/ml/transforms/embeddings -o junit_suite_name={envname} --junitxml=pytest_{envname}.xml -n 6 {posargs}; ret=$?; [ $ret = 5 ] && exit 0 || exit $ret' From 9dce3cf09491f86f4e8bf8b1e336b1c41b651423 Mon Sep 17 00:00:00 2001 From: Anand Inguva <34158215+AnandInguva@users.noreply.github.com> Date: Fri, 8 Dec 2023 20:04:03 +0000 Subject: [PATCH 48/52] modify build.gradle to match tox task names --- sdks/python/test-suites/tox/py38/build.gradle | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/sdks/python/test-suites/tox/py38/build.gradle b/sdks/python/test-suites/tox/py38/build.gradle index a5a624b998e8d..1e03b50580830 100644 --- a/sdks/python/test-suites/tox/py38/build.gradle +++ b/sdks/python/test-suites/tox/py38/build.gradle @@ -141,9 +141,9 @@ toxTask "testPy38transformers-430", "py38-transformers-430", "${posargs}" test.dependsOn "testPy38transformers-430" preCommitPyCoverage.dependsOn "testPy38transformers-430" -toxTask "testPy38sentenceTransformers-222", "py38-sentence-transformers-222", "${posargs}" -test.dependsOn "testPy38sentenceTransformers-222" -preCommitPyCoverage.dependsOn "testPy38sentenceTransformers-222" +toxTask "testPy38embeddingsMLTransform", "py38-embeddings", "${posargs}" +test.dependsOn "testPy38embeddingsMLTransform" +preCommitPyCoverage.dependsOn "testPy38embeddingsMLTransform" toxTask "whitespacelint", "whitespacelint", "${posargs}" From 539c9adf8040320e76d6870e954967b38b634f21 Mon Sep 17 00:00:00 2001 From: Anand Inguva <34158215+AnandInguva@users.noreply.github.com> Date: Fri, 8 Dec 2023 20:04:23 +0000 Subject: [PATCH 49/52] Add note to CHANGES.md --- CHANGES.md | 1 + 1 file changed, 1 insertion(+) diff --git a/CHANGES.md b/CHANGES.md index 7686b7a92d96a..0cc27d3560475 100644 --- a/CHANGES.md +++ b/CHANGES.md @@ -67,6 +67,7 @@ * Python GCSIO is now implemented with GCP GCS Client instead of apitools ([#25676](https://github.com/apache/beam/issues/25676)) * Adding support for LowCardinality DataType in ClickHouse (Java) ([#29533](https://github.com/apache/beam/pull/29533)). 
* Added support for handling bad records to KafkaIO (Java) ([#29546](https://github.com/apache/beam/pull/29546)) +* Add support for generating text embeddings in MLTransform for Vertex AI and Huggingface hub models.([#29564](https://github.com/apache/beam/pull/29564)) ## New Features / Improvements From b967cd8fb9183ee557d0d757ee75cbc7e353a060 Mon Sep 17 00:00:00 2001 From: Anand Inguva <34158215+AnandInguva@users.noreply.github.com> Date: Fri, 8 Dec 2023 20:51:25 +0000 Subject: [PATCH 50/52] change gcs bucket to gs://temp-storage-for-perf-tests --- .../apache_beam/ml/transforms/embeddings/huggingface_test.py | 4 +++- .../apache_beam/ml/transforms/embeddings/vertex_ai_test.py | 2 +- 2 files changed, 4 insertions(+), 2 deletions(-) diff --git a/sdks/python/apache_beam/ml/transforms/embeddings/huggingface_test.py b/sdks/python/apache_beam/ml/transforms/embeddings/huggingface_test.py index e59090151c5e9..779a6daf8f3c1 100644 --- a/sdks/python/apache_beam/ml/transforms/embeddings/huggingface_test.py +++ b/sdks/python/apache_beam/ml/transforms/embeddings/huggingface_test.py @@ -84,8 +84,10 @@ class SentenceTrasformerEmbeddingsTest(unittest.TestCase): def setUp(self) -> None: self.artifact_location = tempfile.mkdtemp(prefix='sentence_transformers_') + # this bucket has TTL and will be deleted periodically self.gcs_artifact_location = os.path.join( - 'gs://apache-beam-ml/testing/sentence_transformers', uuid.uuid4().hex) + 'gs://temp-storage-for-perf-tests/sentence_transformers', + uuid.uuid4().hex) def tearDown(self) -> None: shutil.rmtree(self.artifact_location) diff --git a/sdks/python/apache_beam/ml/transforms/embeddings/vertex_ai_test.py b/sdks/python/apache_beam/ml/transforms/embeddings/vertex_ai_test.py index 3d8e1ea31673a..04a730eaefb0f 100644 --- a/sdks/python/apache_beam/ml/transforms/embeddings/vertex_ai_test.py +++ b/sdks/python/apache_beam/ml/transforms/embeddings/vertex_ai_test.py @@ -48,7 +48,7 @@ class VertexAIEmbeddingsTest(unittest.TestCase): def setUp(self) -> None: self.artifact_location = tempfile.mkdtemp(prefix='_vertex_ai_test') self.gcs_artifact_location = os.path.join( - 'gs://apache-beam-ml/testing/vertex_ai', uuid.uuid4().hex) + 'gs://temp-storage-for-perf-tests/vertex_ai', uuid.uuid4().hex) def tearDown(self) -> None: shutil.rmtree(self.artifact_location) From f1bb42c3376520da4a06006f5c6fc24acda5da90 Mon Sep 17 00:00:00 2001 From: Anand Inguva Date: Mon, 11 Dec 2023 09:55:51 -0500 Subject: [PATCH 51/52] Add TODO GH links --- sdks/python/apache_beam/ml/transforms/base.py | 6 ++++-- .../apache_beam/ml/transforms/embeddings/vertex_ai.py | 3 ++- sdks/python/apache_beam/ml/transforms/utils.py | 3 ++- 3 files changed, 8 insertions(+), 4 deletions(-) diff --git a/sdks/python/apache_beam/ml/transforms/base.py b/sdks/python/apache_beam/ml/transforms/base.py index 859fbc68ffd68..d5f4d1b60e140 100644 --- a/sdks/python/apache_beam/ml/transforms/base.py +++ b/sdks/python/apache_beam/ml/transforms/base.py @@ -162,7 +162,8 @@ def append_transform(self, transform: BaseOperation): """ -# TODO: Add support for inference_fn +# TODO:https://github.com/apache/beam/issues/29356 +# Add support for inference_fn class EmbeddingsManager(MLTransformProvider): def __init__( self, @@ -385,7 +386,8 @@ class _JsonPickleTransformAttributeManager(_TransformAttributeManager): @staticmethod def _is_remote_path(path): is_gcs = path.find('gs://') != -1 - # TODO: Add support for other remote paths. + # TODO:https://github.com/apache/beam/issues/29356 + # Add support for other remote paths. 
if not is_gcs and path.find('://') != -1: raise RuntimeError( "Artifact locations are currently supported for only available for " diff --git a/sdks/python/apache_beam/ml/transforms/embeddings/vertex_ai.py b/sdks/python/apache_beam/ml/transforms/embeddings/vertex_ai.py index b80498fc7a13d..1f4c1577eb797 100644 --- a/sdks/python/apache_beam/ml/transforms/embeddings/vertex_ai.py +++ b/sdks/python/apache_beam/ml/transforms/embeddings/vertex_ai.py @@ -40,7 +40,8 @@ __all__ = ["VertexAITextEmbeddings"] DEFAULT_TASK_TYPE = "RETRIEVAL_DOCUMENT" -# TODO: Can this list be automatically pulled from Vertex SDK? +# TODO: https://github.com/apache/beam/issues/29356 +# Can this list be automatically pulled from Vertex SDK? TASK_TYPE_INPUTS = [ "RETRIEVAL_DOCUMENT", "RETRIEVAL_QUERY", diff --git a/sdks/python/apache_beam/ml/transforms/utils.py b/sdks/python/apache_beam/ml/transforms/utils.py index b0aef5898cf08..fadf611b0e66e 100644 --- a/sdks/python/apache_beam/ml/transforms/utils.py +++ b/sdks/python/apache_beam/ml/transforms/utils.py @@ -32,7 +32,8 @@ class ArtifactsFetcher(): def __init__(self, artifact_location): files = os.listdir(artifact_location) files.remove(base._ATTRIBUTE_FILE_NAME) - # TODO: Integrate ArtifactFetcher into MLTransform. + # TODO: https://github.com/apache/beam/issues/29356 + # Integrate ArtifactFetcher into MLTransform. if len(files) > 1: raise NotImplementedError( "MLTransform may have been utilized alongside transforms written " From c173d6ad13ab24d469ca251bd81e0bf539faff20 Mon Sep 17 00:00:00 2001 From: Anand Inguva <34158215+AnandInguva@users.noreply.github.com> Date: Mon, 11 Dec 2023 10:14:18 -0500 Subject: [PATCH 52/52] Update CHANGES.md Co-authored-by: Danny McCormick --- CHANGES.md | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/CHANGES.md b/CHANGES.md index 0cc27d3560475..60b5a820cf3bd 100644 --- a/CHANGES.md +++ b/CHANGES.md @@ -67,7 +67,7 @@ * Python GCSIO is now implemented with GCP GCS Client instead of apitools ([#25676](https://github.com/apache/beam/issues/25676)) * Adding support for LowCardinality DataType in ClickHouse (Java) ([#29533](https://github.com/apache/beam/pull/29533)). * Added support for handling bad records to KafkaIO (Java) ([#29546](https://github.com/apache/beam/pull/29546)) -* Add support for generating text embeddings in MLTransform for Vertex AI and Huggingface hub models.([#29564](https://github.com/apache/beam/pull/29564)) +* Add support for generating text embeddings in MLTransform for Vertex AI and Hugging Face Hub models. ([#29564](https://github.com/apache/beam/pull/29564)) ## New Features / Improvements
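Taken together, these patches let MLTransform generate text embeddings and guard artifact locations against accidental reuse. Below is a minimal sketch of the resulting usage, mirroring the test pattern in huggingface_test.py. The import paths are inferred from the modules touched in this series; the column name and input sentences are illustrative placeholders, and the model name matches DEFAULT_MODEL_NAME from the tests.

    import tempfile

    import apache_beam as beam
    from apache_beam.ml.transforms.base import MLTransform
    from apache_beam.ml.transforms.embeddings.huggingface import (
        SentenceTransformerEmbeddings)

    # Use a fresh directory: after PATCH 47, save_attributes raises
    # FileExistsError if the location already contains attributes.json.
    artifact_location = tempfile.mkdtemp(prefix='mltransform_')

    embedding_config = SentenceTransformerEmbeddings(
        model_name='sentence-transformers/all-mpnet-base-v2',
        columns=['text'])  # 'text' is a placeholder column name

    # The first pipeline computes embeddings and writes the transform
    # attributes to the artifact location.
    with beam.Pipeline() as p:
        _ = (
            p
            | beam.Create([{'text': 'That is a happy person'}])
            | MLTransform(
                write_artifact_location=artifact_location).with_transform(
                    embedding_config))

    # A later pipeline reuses the saved attributes instead of
    # re-specifying the transforms.
    with beam.Pipeline() as p:
        _ = (
            p
            | beam.Create([{'text': 'That is a very happy person'}])
            | MLTransform(read_artifact_location=artifact_location))

This is the same write-then-read flow exercised by test_embeddings_with_read_artifact_location; re-running the write step against an already-populated location is exactly the misuse the new FileExistsError check in base.py is meant to catch.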