Skip to content

Commit

Permalink
Default truncation to second for the text similarity task type (#713)
Browse files Browse the repository at this point in the history
In reranking, the first input (the query) is generally shorter. In this case,
it makes more sense to truncate the second input (the document text).
  • Loading branch information
davidkyle authored Aug 5, 2024
1 parent bee6d0e commit fd8886d
Show file tree
Hide file tree
Showing 2 changed files with 6 additions and 0 deletions.
3 changes: 3 additions & 0 deletions eland/ml/pytorch/transformers.py
Original file line number Diff line number Diff line change
Expand Up @@ -770,6 +770,9 @@ def _create_config(
tokenization_config.span = 128
tokenization_config.truncate = "none"

if self._task_type == "text_similarity":
tokenization_config.truncate = "second"

if self._traceable_model.classification_labels():
inference_config = TASK_TYPE_TO_INFERENCE_CONFIG[self._task_type](
tokenization=tokenization_config,
Expand Down
3 changes: 3 additions & 0 deletions tests/ml/pytorch/test_pytorch_model_config_pytest.py
Original file line number Diff line number Diff line change
Expand Up @@ -217,6 +217,9 @@ def test_model_config(
assert isinstance(config.inference_config.classification_labels, list)
assert len(config.inference_config.classification_labels) > 0

if task_type == "text_similarity":
assert tokenization.truncate == "second"

del tm

def test_model_config_with_prefix_string(self):
Expand Down

0 comments on commit fd8886d

Please sign in to comment.