fix: Support isolated node eval in run_batch in Generators (#5291)
* Add isolated node eval to BaseGenerator's run_batch

* Add unit tests
bogdankostic authored Jul 7, 2023
1 parent 395854d commit 0697f5c
Showing 4 changed files with 146 additions and 82 deletions.
19 changes: 19 additions & 0 deletions haystack/nodes/answer_generator/base.py
@@ -61,12 +61,31 @@ def run_batch(  # type: ignore
        queries: List[str],
        documents: Union[List[Document], List[List[Document]]],
        top_k: Optional[int] = None,
        labels: Optional[List[MultiLabel]] = None,
        batch_size: Optional[int] = None,
        add_isolated_node_eval: bool = False,
        max_tokens: Optional[int] = None,
    ):
        results = self.predict_batch(
            queries=queries, documents=documents, top_k=top_k, batch_size=batch_size, max_tokens=max_tokens
        )

        # run evaluation with "perfect" labels as node inputs to calculate "upper bound" metrics for just this node
        if add_isolated_node_eval and labels is not None:
            relevant_documents = []
            for labelx in labels:
                # Deduplicate same Documents in a MultiLabel based on their Document ID and filter out empty Documents
                relevant_docs_labels = list(
                    {
                        label.document.id: label.document
                        for label in labelx.labels
                        if not isinstance(label.document.content, str) or label.document.content.strip() != ""
                    }.values()
                )
                relevant_documents.append(relevant_docs_labels)
            results_label_input = self.predict_batch(queries=queries, documents=relevant_documents, top_k=top_k)

            results["answers_isolated"] = results_label_input["answers"]
        return results, "output_1"

    def _flatten_docs(self, documents: List[Document]):
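The block added above deduplicates each MultiLabel's gold documents by Document ID, drops documents whose string content is empty, and then runs predict_batch on those gold documents only. A minimal, self-contained sketch of that dedup-and-filter step (not code from the diff; a plain dataclass stands in for haystack.schema.Document so the snippet runs on its own):

# Sketch of the dedup-and-filter step used in run_batch above; `Doc` is a stand-in
# for haystack.schema.Document, so this runs without Haystack installed.
from dataclasses import dataclass
from typing import List, Optional


@dataclass
class Doc:
    id: str
    content: Optional[str]


def dedup_non_empty(docs: List[Doc]) -> List[Doc]:
    # Keep one entry per Document ID; drop documents whose content is an empty or
    # whitespace-only string. Non-string content is kept, mirroring the isinstance check.
    return list({d.id: d for d in docs if not isinstance(d.content, str) or d.content.strip() != ""}.values())


docs = [
    Doc(id="a", content="My name is Carla and I live in Berlin"),
    Doc(id="a", content="My name is Carla and I live in Berlin"),  # duplicate ID, collapsed
    Doc(id="b", content="   "),  # whitespace-only content, filtered out
]
assert [d.id for d in dedup_non_empty(docs)] == ["a"]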
39 changes: 38 additions & 1 deletion test/conftest.py
@@ -47,7 +47,7 @@
    PromptTemplate,
)
from haystack.nodes.prompt import PromptNode
from haystack.schema import Document, FilterType
from haystack.schema import Document, FilterType, MultiLabel, Label, Span

from .mocks import pinecone as pinecone_mock

@@ -476,6 +476,43 @@ def gc_cleanup(request):
    gc.collect()


@pytest.fixture
def eval_labels() -> List[MultiLabel]:
    EVAL_LABELS = [
        MultiLabel(
            labels=[
                Label(
                    query="Who lives in Berlin?",
                    answer=Answer(answer="Carla", offsets_in_context=[Span(11, 16)]),
                    document=Document(
                        id="a0747b83aea0b60c4b114b15476dd32d",
                        content_type="text",
                        content="My name is Carla and I live in Berlin",
                    ),
                    is_correct_answer=True,
                    is_correct_document=True,
                    origin="gold-label",
                )
            ]
        ),
        MultiLabel(
            labels=[
                Label(
                    query="Who lives in Munich?",
                    answer=Answer(answer="Carla", offsets_in_context=[Span(11, 16)]),
                    document=Document(
                        id="something_else", content_type="text", content="My name is Carla and I live in Munich"
                    ),
                    is_correct_answer=True,
                    is_correct_document=True,
                    origin="gold-label",
                )
            ]
        ),
    ]
    return EVAL_LABELS


@pytest.fixture
def deepset_cloud_fixture():
    if MOCK_DC:
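The fixture above is consumed by the new generator tests through the MultiLabel's aggregated attributes. A short hedged sketch of that access pattern (not part of the commit; it only uses the attributes the tests themselves rely on, and assumes Haystack 1.x is installed):

# Sketch: how a test reads the query and gold documents from a MultiLabel like eval_labels[0].
from haystack.schema import Answer, Document, Label, MultiLabel, Span

multi_label = MultiLabel(
    labels=[
        Label(
            query="Who lives in Berlin?",
            answer=Answer(answer="Carla", offsets_in_context=[Span(11, 16)]),
            document=Document(id="doc-1", content_type="text", content="My name is Carla and I live in Berlin"),
            is_correct_answer=True,
            is_correct_document=True,
            origin="gold-label",
        )
    ]
)

# The tests below read the query and the gold document this way; these gold documents
# are what run/run_batch pass to predict for the "answers_isolated" output.
assert multi_label.query == "Who lives in Berlin?"
assert [label.document.id for label in multi_label.labels] == ["doc-1"]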
41 changes: 40 additions & 1 deletion test/nodes/test_generator.py
@@ -2,7 +2,7 @@

import pytest
from haystack import Pipeline
from haystack.schema import Document
from haystack.schema import Document, Answer
from haystack.nodes.answer_generator import OpenAIAnswerGenerator
from haystack.nodes import PromptTemplate

@@ -135,3 +135,42 @@ def test_openai_answer_generator_pipeline_max_tokens():
    result = pipeline.run(query=question, documents=nyc_docs, params={"generator": {"max_tokens": 3}})
    assert result["answers"] == mocked_response
    openai_generator.run.assert_called_with(query=question, documents=nyc_docs, max_tokens=3)


@pytest.mark.unit
@patch("haystack.nodes.answer_generator.openai.OpenAIAnswerGenerator.predict")
def test_openai_answer_generator_run_with_labels_and_isolated_node_eval(patched_predict, eval_labels):
    label = eval_labels[0]
    query = label.query
    document = label.labels[0].document

    patched_predict.return_value = {
        "answers": [Answer(answer=label.labels[0].answer.answer, document_ids=[document.id])]
    }
    with patch("haystack.nodes.answer_generator.openai.load_openai_tokenizer"):
        openai_generator = OpenAIAnswerGenerator(api_key="fake_api_key", model="text-babbage-001", top_k=1)
    result, _ = openai_generator.run(query=query, documents=[document], labels=label, add_isolated_node_eval=True)

    assert "answers_isolated" in result


@pytest.mark.unit
@patch("haystack.nodes.answer_generator.base.BaseGenerator.predict_batch")
def test_openai_answer_generator_run_batch_with_labels_and_isolated_node_eval(patched_predict_batch, eval_labels):
    queries = [label.query for label in eval_labels]
    documents = [[label.labels[0].document] for label in eval_labels]

    patched_predict_batch.return_value = {
        "queries": queries,
        "answers": [
            [Answer(answer=label.labels[0].answer.answer, document_ids=[label.labels[0].document.id])]
            for label in eval_labels
        ],
    }
    with patch("haystack.nodes.answer_generator.openai.load_openai_tokenizer"):
        openai_generator = OpenAIAnswerGenerator(api_key="fake_api_key", model="text-babbage-001", top_k=1)
    result, _ = openai_generator.run_batch(
        queries=queries, documents=documents, labels=eval_labels, add_isolated_node_eval=True
    )

    assert "answers_isolated" in result
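For broader context, the new answers_isolated output is what Haystack's pipeline-level evaluation consumes when computing isolated, "upper bound" metrics for a node. A rough sketch of how this would typically be exercised, assuming a Haystack 1.x generative QA pipeline; retriever, generator and eval_labels are placeholders for components and labels you already have, and the metric calls are the standard EvaluationResult API rather than anything added by this commit:

# Hedged sketch: isolated node eval at the pipeline level. `retriever`, `generator`
# and `eval_labels` are placeholders. eval_batch forwards labels and
# add_isolated_node_eval to each node's run_batch, which is the code path this
# commit enables for generators.
from haystack import Pipeline

pipeline = Pipeline()
pipeline.add_node(component=retriever, name="Retriever", inputs=["Query"])
pipeline.add_node(component=generator, name="Generator", inputs=["Retriever"])

eval_result = pipeline.eval_batch(labels=eval_labels, add_isolated_node_eval=True)

# "integrated" metrics use the upstream node's real output as input;
# "isolated" metrics feed the gold documents instead, giving a per-node upper bound.
print(eval_result.calculate_metrics(eval_mode="integrated"))
print(eval_result.calculate_metrics(eval_mode="isolated"))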