From 2b3c77e41dab4ee4c97fbeeeb6c42454f1cf0404 Mon Sep 17 00:00:00 2001 From: Stefano Fiorucci <44616784+anakin87@users.noreply.github.com> Date: Thu, 9 Nov 2023 14:28:56 +0100 Subject: [PATCH] fix: make `JoinDocuments` correctly handle duplicate documents with null scores (#6261) * fix error with null values * release note * simplify --- haystack/nodes/other/join_docs.py | 6 ++-- ...join-docs-null-score-746c392a87adffcc.yaml | 7 ++++ test/nodes/test_join_documents.py | 35 +++++++++++++++++++ 3 files changed, 45 insertions(+), 3 deletions(-) create mode 100644 releasenotes/notes/fix-join-docs-null-score-746c392a87adffcc.yaml diff --git a/haystack/nodes/other/join_docs.py b/haystack/nodes/other/join_docs.py index 7ce0de819d..274e90a38d 100644 --- a/haystack/nodes/other/join_docs.py +++ b/haystack/nodes/other/join_docs.py @@ -14,8 +14,8 @@ class JoinDocuments(JoinNode): A node to join documents outputted by multiple retriever nodes. The node allows multiple join modes: - * concatenate: combine the documents from multiple nodes. Any duplicate documents are discarded. - The score is only determined by the last node that outputs the document. + * concatenate: combine the documents from multiple nodes. + In case of duplicate documents, the one with the highest score is kept. * merge: merge scores of documents from multiple nodes. Optionally, each input score can be given a different `weight` & a `top_k` limit can be set. This mode can also be used for "reranking" retrieved documents. * reciprocal_rank_fusion: combines the documents based on their rank in multiple nodes. 
@@ -130,7 +130,7 @@ def _concatenate_results(self, results, document_map): for doc in result: if doc.id == idx: tmp.append(doc) - item_best_score = max(tmp, key=lambda x: x.score) + item_best_score = max(tmp, key=lambda x: x.score if x.score is not None else -inf) scores_map.update({idx: item_best_score.score}) return scores_map diff --git a/releasenotes/notes/fix-join-docs-null-score-746c392a87adffcc.yaml b/releasenotes/notes/fix-join-docs-null-score-746c392a87adffcc.yaml new file mode 100644 index 0000000000..33d2e9311d --- /dev/null +++ b/releasenotes/notes/fix-join-docs-null-score-746c392a87adffcc.yaml @@ -0,0 +1,7 @@ +--- +fixes: + - | + When using `JoinDocuments` with `join_mode=concatenate` (default) and + passing duplicate documents, including some with a null score, this + node raised an exception. + Now this case is handled correctly and the documents are joined as expected. diff --git a/test/nodes/test_join_documents.py b/test/nodes/test_join_documents.py index aa303e26b2..463aeaa577 100644 --- a/test/nodes/test_join_documents.py +++ b/test/nodes/test_join_documents.py @@ -78,3 +78,38 @@ def test_joindocuments_concatenate_keep_only_highest_ranking_duplicate(): result, _ = join_docs.run(inputs) assert len(result["documents"]) == 2 assert result["documents"] == expected_outputs["documents"] + + +@pytest.mark.unit +def test_joindocuments_concatenate_duplicate_docs_null_score(): + """ + Test that the concatenate method correctly handles duplicate documents, + when one has a null score. 
+ """ + inputs = [ + { + "documents": [ + Document(content="text document 1", content_type="text", score=0.2), + Document(content="text document 2", content_type="text", score=0.3), + Document(content="text document 3", content_type="text", score=None), + ] + }, + { + "documents": [ + Document(content="text document 2", content_type="text", score=0.7), + Document(content="text document 1", content_type="text", score=None), + ] + }, + ] + expected_outputs = { + "documents": [ + Document(content="text document 2", content_type="text", score=0.7), + Document(content="text document 1", content_type="text", score=0.2), + Document(content="text document 3", content_type="text", score=None), + ] + } + + join_docs = JoinDocuments(join_mode="concatenate") + result, _ = join_docs.run(inputs) + assert len(result["documents"]) == 3 + assert result["documents"] == expected_outputs["documents"]