Default truncation to "second" for the text similarity task type (#713)

In reranking, the first input (the query) is generally shorter. In this case, it makes more sense to truncate the second input (the document text).
elastic · Aug 5, 2024 · fd8886d · fd8886d
1 parent bee6d0e
commit fd8886d
Show file tree

Hide file tree

Showing 2 changed files with 6 additions and 0 deletions.
diff --git a/eland/ml/pytorch/transformers.py b/eland/ml/pytorch/transformers.py
@@ -770,6 +770,9 @@ def _create_config(
             tokenization_config.span = 128
             tokenization_config.truncate = "none"
 
+        if self._task_type == "text_similarity":
+            tokenization_config.truncate = "second"
+
         if self._traceable_model.classification_labels():
             inference_config = TASK_TYPE_TO_INFERENCE_CONFIG[self._task_type](
                 tokenization=tokenization_config,

diff --git a/tests/ml/pytorch/test_pytorch_model_config_pytest.py b/tests/ml/pytorch/test_pytorch_model_config_pytest.py
@@ -217,6 +217,9 @@ def test_model_config(
                 assert isinstance(config.inference_config.classification_labels, list)
                 assert len(config.inference_config.classification_labels) > 0
 
+            if task_type == "text_similarity":
+                assert tokenization.truncate == "second"
+
             del tm
 
     def test_model_config_with_prefix_string(self):