diff --git a/haystack/preview/components/preprocessors/__init__.py b/haystack/preview/components/preprocessors/__init__.py
new file mode 100644
index 0000000000..33a0e2cd18
--- /dev/null
+++ b/haystack/preview/components/preprocessors/__init__.py
@@ -0,0 +1,3 @@
+from haystack.preview.components.preprocessors.text_document_splitter import TextDocumentSplitter
+
+__all__ = ["TextDocumentSplitter"]
diff --git a/haystack/preview/components/preprocessors/text_document_splitter.py b/haystack/preview/components/preprocessors/text_document_splitter.py
new file mode 100644
index 0000000000..6273f8167f
--- /dev/null
+++ b/haystack/preview/components/preprocessors/text_document_splitter.py
@@ -0,0 +1,104 @@
+from copy import deepcopy
+from typing import List, Dict, Any, Literal
+
+from more_itertools import windowed
+
+from haystack.preview import component, Document, default_from_dict, default_to_dict
+
+
+@component
+class TextDocumentSplitter:
+    """
+    Splits a list of text documents into a list of text documents with shorter texts.
+    This is useful for documents whose long texts otherwise would not fit into the maximum input length of language models.
+    """
+
+    def __init__(
+        self, split_by: Literal["word", "sentence", "passage"] = "word", split_length: int = 200, split_overlap: int = 0
+    ):
+        """
+        :param split_by: The unit by which the document should be split. Choose from "word" for splitting by " ",
+            "sentence" for splitting by ".", or "passage" for splitting by "\n\n".
+        :param split_length: The maximum number of units in each split.
+        :param split_overlap: The number of units that each split should overlap.
+        """
+
+        if split_by not in ["word", "sentence", "passage"]:
+            raise ValueError("split_by must be one of 'word', 'sentence' or 'passage'.")
+        if split_length <= 0:
+            raise ValueError("split_length must be greater than 0.")
+        if split_overlap < 0:
+            raise ValueError("split_overlap must be greater than or equal to 0.")
+        self.split_by = split_by
+        self.split_length = split_length
+        self.split_overlap = split_overlap
+
+    @component.output_types(documents=List[Document])
+    def run(self, documents: List[Document]):
+        """
+        Splits the documents into chunks of split_length units (as defined by split_by), with consecutive chunks overlapping by split_overlap units.
+        Returns a list of documents with the split texts.
+        A metadata field "source_id" is added to each document to keep track of the original document that was split.
+        :param documents: The documents to split.
+        :return: A list of documents with the split texts.
+        """
+
+        if not documents or not isinstance(documents, list) or not isinstance(documents[0], Document):
+            raise TypeError("TextDocumentSplitter expects a List of Documents as input.")
+        split_docs = []
+        for doc in documents:
+            if doc.text is None:
+                raise ValueError(
+                    f"TextDocumentSplitter only works with text documents but document.text for document ID {doc.id} is None."
+                )
+            units = self._split_into_units(doc.text, self.split_by)
+            text_splits = self._concatenate_units(units, self.split_length, self.split_overlap)
+            metadata = deepcopy(doc.metadata)
+            metadata["source_id"] = doc.id
+            split_docs += [Document(text=txt, metadata=metadata) for txt in text_splits]
+        return {"documents": split_docs}
+
+    def to_dict(self) -> Dict[str, Any]:
+        """
+        Serialize this component to a dictionary.
+        """
+        return default_to_dict(
+            self, split_by=self.split_by, split_length=self.split_length, split_overlap=self.split_overlap
+        )
+
+    @classmethod
+    def from_dict(cls, data: Dict[str, Any]) -> "TextDocumentSplitter":
+        """
+        Deserialize this component from a dictionary.
+        """
+        return default_from_dict(cls, data)
+
+    def _split_into_units(self, text: str, split_by: Literal["word", "sentence", "passage"]) -> List[str]:
+        if split_by == "passage":
+            split_at = "\n\n"
+        elif split_by == "sentence":
+            split_at = "."
+        elif split_by == "word":
+            split_at = " "
+        else:
+            raise NotImplementedError(
+                "TextDocumentSplitter only supports 'passage', 'sentence' or 'word' split_by options."
+            )
+        units = text.split(split_at)
+        # Add the delimiter back to all units except the last one
+        for i in range(len(units) - 1):
+            units[i] += split_at
+        return units
+
+    def _concatenate_units(self, elements: List[str], split_length: int, split_overlap: int) -> List[str]:
+        """
+        Concatenates the elements into parts of split_length units.
+        """
+        text_splits = []
+        segments = windowed(elements, n=split_length, step=split_length - split_overlap)
+        for seg in segments:
+            current_units = [unit for unit in seg if unit is not None]
+            txt = "".join(current_units)
+            if len(txt) > 0:
+                text_splits.append(txt)
+        return text_splits
diff --git a/releasenotes/notes/preprocessor-2-0-9828d930562fa3f5.yaml b/releasenotes/notes/preprocessor-2-0-9828d930562fa3f5.yaml
new file mode 100644
index 0000000000..bb58645d6d
--- /dev/null
+++ b/releasenotes/notes/preprocessor-2-0-9828d930562fa3f5.yaml
@@ -0,0 +1,4 @@
+---
+preview:
+  - |
+    Add the `TextDocumentSplitter` component for Haystack 2.0 that splits a Document with long text into multiple Documents with shorter texts. This way, the resulting texts fit within the maximum input length of the language models used in Embedders or other components.
diff --git a/test/preview/components/preprocessors/__init__.py b/test/preview/components/preprocessors/__init__.py
new file mode 100644
index 0000000000..e69de29bb2
diff --git a/test/preview/components/preprocessors/test_text_document_splitter.py b/test/preview/components/preprocessors/test_text_document_splitter.py
new file mode 100644
index 0000000000..584afc6b3e
--- /dev/null
+++ b/test/preview/components/preprocessors/test_text_document_splitter.py
@@ -0,0 +1,157 @@
+import pytest
+
+from haystack.preview import Document
+from haystack.preview.components.preprocessors import TextDocumentSplitter
+
+
+class TestTextDocumentSplitter:
+    @pytest.mark.unit
+    def test_non_text_document(self):
+        with pytest.raises(
+            ValueError, match="TextDocumentSplitter only works with text documents but document.text for document ID"
+        ):
+            splitter = TextDocumentSplitter()
+            splitter.run(documents=[Document()])
+
+    @pytest.mark.unit
+    def test_single_doc(self):
+        with pytest.raises(TypeError, match="TextDocumentSplitter expects a List of Documents as input."):
+            splitter = TextDocumentSplitter()
+            splitter.run(documents=Document())
+
+    @pytest.mark.unit
+    def test_empty_list(self):
+        with pytest.raises(TypeError, match="TextDocumentSplitter expects a List of Documents as input."):
+            splitter = TextDocumentSplitter()
+            splitter.run(documents=[])
+
+    @pytest.mark.unit
+    def test_unsupported_split_by(self):
+        with pytest.raises(ValueError, match="split_by must be one of 'word', 'sentence' or 'passage'."):
+            TextDocumentSplitter(split_by="unsupported")
+
+    @pytest.mark.unit
+    def test_unsupported_split_length(self):
+        with pytest.raises(ValueError, match="split_length must be greater than 0."):
+            TextDocumentSplitter(split_length=0)
+
+    @pytest.mark.unit
+    def test_unsupported_split_overlap(self):
+        with pytest.raises(ValueError, match="split_overlap must be greater than or equal to 0."):
+            TextDocumentSplitter(split_overlap=-1)
+
+    @pytest.mark.unit
+    def test_split_by_word(self):
+        splitter = TextDocumentSplitter(split_by="word", split_length=10)
+        result = splitter.run(
+            documents=[
+                Document(
+                    text="This is a text with some words. There is a second sentence. And there is a third sentence."
+                )
+            ]
+        )
+        assert len(result["documents"]) == 2
+        assert result["documents"][0].text == "This is a text with some words. There is a "
+        assert result["documents"][1].text == "second sentence. And there is a third sentence."
+
+    @pytest.mark.unit
+    def test_split_by_word_multiple_input_docs(self):
+        splitter = TextDocumentSplitter(split_by="word", split_length=10)
+        result = splitter.run(
+            documents=[
+                Document(
+                    text="This is a text with some words. There is a second sentence. And there is a third sentence."
+                ),
+                Document(
+                    text="This is a different text with some words. There is a second sentence. And there is a third sentence. And there is a fourth sentence."
+                ),
+            ]
+        )
+        assert len(result["documents"]) == 5
+        assert result["documents"][0].text == "This is a text with some words. There is a "
+        assert result["documents"][1].text == "second sentence. And there is a third sentence."
+        assert result["documents"][2].text == "This is a different text with some words. There is "
+        assert result["documents"][3].text == "a second sentence. And there is a third sentence. And "
+        assert result["documents"][4].text == "there is a fourth sentence."
+
+    @pytest.mark.unit
+    def test_split_by_sentence(self):
+        splitter = TextDocumentSplitter(split_by="sentence", split_length=1)
+        result = splitter.run(
+            documents=[
+                Document(
+                    text="This is a text with some words. There is a second sentence. And there is a third sentence."
+                )
+            ]
+        )
+        assert len(result["documents"]) == 3
+        assert result["documents"][0].text == "This is a text with some words."
+        assert result["documents"][1].text == " There is a second sentence."
+        assert result["documents"][2].text == " And there is a third sentence."
+
+    @pytest.mark.unit
+    def test_split_by_passage(self):
+        splitter = TextDocumentSplitter(split_by="passage", split_length=1)
+        result = splitter.run(
+            documents=[
+                Document(
+                    text="This is a text with some words. There is a second sentence.\n\nAnd there is a third sentence.\n\n And another passage."
+                )
+            ]
+        )
+        assert len(result["documents"]) == 3
+        assert result["documents"][0].text == "This is a text with some words. There is a second sentence.\n\n"
+        assert result["documents"][1].text == "And there is a third sentence.\n\n"
+        assert result["documents"][2].text == " And another passage."
+
+    @pytest.mark.unit
+    def test_split_by_word_with_overlap(self):
+        splitter = TextDocumentSplitter(split_by="word", split_length=10, split_overlap=2)
+        result = splitter.run(
+            documents=[
+                Document(
+                    text="This is a text with some words. There is a second sentence. And there is a third sentence."
+                )
+            ]
+        )
+        assert len(result["documents"]) == 2
+        assert result["documents"][0].text == "This is a text with some words. There is a "
+        assert result["documents"][1].text == "is a second sentence. And there is a third sentence."
+
+    @pytest.mark.unit
+    def test_to_dict(self):
+        splitter = TextDocumentSplitter()
+        data = splitter.to_dict()
+        assert data == {
+            "type": "TextDocumentSplitter",
+            "init_parameters": {"split_by": "word", "split_length": 200, "split_overlap": 0},
+        }
+
+    @pytest.mark.unit
+    def test_to_dict_with_custom_init_parameters(self):
+        splitter = TextDocumentSplitter(split_by="passage", split_length=100, split_overlap=1)
+        data = splitter.to_dict()
+        assert data == {
+            "type": "TextDocumentSplitter",
+            "init_parameters": {"split_by": "passage", "split_length": 100, "split_overlap": 1},
+        }
+
+    @pytest.mark.unit
+    def test_from_dict(self):
+        data = {
+            "type": "TextDocumentSplitter",
+            "init_parameters": {"split_by": "passage", "split_length": 100, "split_overlap": 1},
+        }
+        splitter = TextDocumentSplitter.from_dict(data)
+        assert splitter.split_by == "passage"
+        assert splitter.split_length == 100
+        assert splitter.split_overlap == 1
+
+    @pytest.mark.unit
+    def test_source_id_stored_in_metadata(self):
+        splitter = TextDocumentSplitter(split_by="word", split_length=10)
+        doc1 = Document(text="This is a text with some words.")
+        doc2 = Document(text="This is a different text with some words.")
+        result = splitter.run(documents=[doc1, doc2])
+        assert result["documents"][0].metadata["source_id"] == doc1.id
+        assert result["documents"][1].metadata["source_id"] == doc2.id
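
A side note on the overlap arithmetic: the `windowed` call in `_concatenate_units` is what produces the overlapping chunks, and its padding behavior is why the code filters out `None` before joining. A minimal standalone sketch, assuming only `more_itertools` is installed:

    from more_itertools import windowed

    # Six "units" as produced by _split_into_units (delimiter kept on all but the last).
    units = ["a ", "b ", "c ", "d ", "e ", "f"]

    # split_length=3 and split_overlap=1 give step = 3 - 1 = 2,
    # so consecutive windows share exactly one unit.
    for window in windowed(units, n=3, step=2):
        print(window)

    # ('a ', 'b ', 'c ')
    # ('c ', 'd ', 'e ')
    # ('e ', 'f', None)   <- the last window is padded with None, hence the filter

Note that `windowed` requires step >= 1, so passing split_overlap >= split_length would raise a ValueError at run time; the constructor does not currently guard against that combination.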