python[patch]: evaluators can return primitives (#1203)
```python
from langsmith import evaluate

def foo(run, example):
    return 0

def bar(run, example):
    return "long"

# unsupported: a bare list must be a list of dicts, so baz is omitted below
# def baz(run, example):
#     return [0, 0.2, "how are ya"]

def app(inputs):
    return inputs

evaluate(app, data="Sample Dataset 3", evaluators=[foo, bar])
```
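For reference, the coercion introduced in `_format_result` (see the `evaluator.py` diff below) maps primitive returns onto feedback fields. A minimal standalone sketch of that mapping follows; the `coerce_primitive` helper is illustrative only and is not part of the langsmith API:

```python
from typing import Any, Dict

def coerce_primitive(result: Any) -> Dict[str, Any]:
    """Illustrative sketch: how primitive evaluator returns are coerced to feedback."""
    if isinstance(result, (bool, int, float)):
        # numbers and booleans become the metric's score
        return {"score": result}
    if isinstance(result, str):
        # strings become a categorical value
        return {"value": result}
    if isinstance(result, list):
        # lists are allowed only if every element is already a feedback dict
        if not all(isinstance(x, dict) for x in result):
            raise ValueError(f"Expected a list of dicts. Got {result}")
        return {"results": result}
    raise ValueError(f"Unsupported evaluator return type: {type(result)}")

assert coerce_primitive(0) == {"score": 0}            # foo above
assert coerce_primitive("long") == {"value": "long"}  # bar above
```

The metric key still defaults to the evaluator function's name, which `_coerce_evaluation_result` fills in when no `key` is provided.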
baskaryan authored Nov 14, 2024
2 parents 33a621c + cf80a82 commit d8adcde
Showing 4 changed files with 126 additions and 28 deletions.
2 changes: 1 addition & 1 deletion python/langsmith/evaluation/_runner.py
@@ -1848,7 +1848,7 @@ def extract_evaluation_results_keys(node, variables):
     try:
         tree = ast.parse(python_code)
         function_def = tree.body[0]
-        if not isinstance(function_def, ast.FunctionDef):
+        if not isinstance(function_def, (ast.FunctionDef, ast.AsyncFunctionDef)):
             return []

         variables = {}
41 changes: 27 additions & 14 deletions python/langsmith/evaluation/evaluator.py
@@ -236,9 +236,8 @@ def _coerce_evaluation_result(
                 "Expected an EvaluationResult object, or dict with a metric"
                 f" 'key' and optional 'score'; got empty result: {result}"
             )
-        if "key" not in result:
-            if allow_no_key:
-                result["key"] = self._name
+        if "key" not in result and allow_no_key:
+            result["key"] = self._name
         if all(k not in result for k in ("score", "value", "comment")):
             raise ValueError(
                 "Expected an EvaluationResult object, or dict with a metric"
@@ -265,27 +264,41 @@ def _coerce_evaluation_results(
             return EvaluationResults(**cp)

         return self._coerce_evaluation_result(
-            cast(dict, results), allow_no_key=True, source_run_id=source_run_id
+            cast(dict, results), source_run_id=source_run_id, allow_no_key=True
         )

     def _format_result(
         self,
-        result: Union[EvaluationResult, EvaluationResults, dict],
+        result: Union[
+            EvaluationResult, EvaluationResults, dict, str, int, bool, float, list
+        ],
         source_run_id: uuid.UUID,
     ) -> Union[EvaluationResult, EvaluationResults]:
-        if isinstance(result, EvaluationResult):
+        if isinstance(result, (bool, float, int)):
+            result = {"score": result}
+        elif not result:
+            raise ValueError(
+                f"Expected a non-empty dict, str, bool, int, float, list, "
+                f"EvaluationResult, or EvaluationResults. Got {result}"
+            )
+        elif isinstance(result, EvaluationResult):
             if not result.source_run_id:
                 result.source_run_id = source_run_id
             return result
-        if not result:
-            raise ValueError(
-                "Expected an EvaluationResult or EvaluationResults object, or a"
-                " dict with key and one of score or value, EvaluationResults,"
-                f" got {result}"
-            )
-        if not isinstance(result, dict):
+        elif isinstance(result, list):
+            if not all(isinstance(x, dict) for x in result):
+                raise ValueError(
+                    f"Expected a list of dicts or EvaluationResult. Received {result}."
+                )
+            result = {"results": result}  # type: ignore[misc]
+        elif isinstance(result, str):
+            result = {"value": result}
+        elif isinstance(result, dict):
+            pass
+        else:
             raise ValueError(
-                f"Expected a dict, EvaluationResult, or EvaluationResults, got {result}"
+                f"Expected a dict, str, bool, int, float, list, EvaluationResult, or "
+                f"EvaluationResults. Got {result}"
             )

         return self._coerce_evaluation_results(result, source_run_id)
33 changes: 28 additions & 5 deletions python/tests/unit_tests/evaluation/test_evaluator.py
@@ -321,8 +321,8 @@ async def sample_evaluator(
     assert result["results"][1].score == 2.0


-@pytest.mark.parametrize("response", [None, {}, {"accuracy": 5}])
-async def test_evaluator_raises_for_null_ouput(response: Any):
+@pytest.mark.parametrize("response", [None, {}, []])
+async def test_evaluator_raises_for_null_output(response: Any):
     @run_evaluator  # type: ignore
     def bad_evaluator(run: schemas.Run, example: schemas.Example):
         return response
@@ -334,13 +334,36 @@ async def abad_evaluator(run: schemas.Run, example: schemas.Example):
     fake_run = MagicMock()
     fake_example = MagicMock()

-    with pytest.raises(ValueError, match="Expected an EvaluationResult "):
+    with pytest.raises(ValueError, match="Expected a non-empty "):
         bad_evaluator.evaluate_run(fake_run, fake_example)

-    with pytest.raises(ValueError, match="Expected an EvaluationResult "):
+    with pytest.raises(ValueError, match="Expected a non-empty "):
         await bad_evaluator.aevaluate_run(fake_run, fake_example)

-    with pytest.raises(ValueError, match="Expected an EvaluationResult "):
+    with pytest.raises(ValueError, match="Expected a non-empty "):
         await abad_evaluator.aevaluate_run(fake_run, fake_example)


+@pytest.mark.parametrize("response", [[5], {"accuracy": 5}])
+async def test_evaluator_raises_for_bad_output(response: Any):
+    @run_evaluator  # type: ignore
+    def bad_evaluator(run: schemas.Run, example: schemas.Example):
+        return response
+
+    @run_evaluator  # type: ignore
+    async def abad_evaluator(run: schemas.Run, example: schemas.Example):
+        return response
+
+    fake_run = MagicMock()
+    fake_example = MagicMock()
+
+    with pytest.raises(ValueError, match="Expected"):
+        bad_evaluator.evaluate_run(fake_run, fake_example)
+
+    with pytest.raises(ValueError, match="Expected"):
+        await bad_evaluator.aevaluate_run(fake_run, fake_example)
+
+    with pytest.raises(ValueError, match="Expected"):
+        await abad_evaluator.aevaluate_run(fake_run, fake_example)
+
+
78 changes: 70 additions & 8 deletions python/tests/unit_tests/evaluation/test_runner.py
@@ -196,11 +196,26 @@ def score_value_first(run, example):
         ordering_of_stuff.append("evaluate")
         return {"score": 0.3}

+    def eval_float(run, example):
+        ordering_of_stuff.append("evaluate")
+        return 0.2
+
+    def eval_str(run, example):
+        ordering_of_stuff.append("evaluate")
+        return "good"
+
+    def eval_list(run, example):
+        ordering_of_stuff.append("evaluate")
+        return [
+            {"score": True, "key": "list_eval_bool"},
+            {"score": 1, "key": "list_eval_int"},
+        ]
+
     results = evaluate(
         predict,
         client=client,
         data=dev_split,
-        evaluators=[score_value_first],
+        evaluators=[score_value_first, eval_float, eval_str, eval_list],
         num_repetitions=NUM_REPETITIONS,
         blocking=blocking,
     )
@@ -231,14 +246,14 @@ def score_value_first(run, example):
     assert fake_request.created_session
     _wait_until(lambda: fake_request.runs)
     N_PREDS = SPLIT_SIZE * NUM_REPETITIONS
-    _wait_until(lambda: len(ordering_of_stuff) == N_PREDS * 2)
+    _wait_until(lambda: len(ordering_of_stuff) == N_PREDS * 5)
     _wait_until(lambda: slow_index is not None)
     # Want it to be interleaved
     assert ordering_of_stuff != ["predict"] * N_PREDS + ["evaluate"] * N_PREDS
     assert ordering_of_stuff[:N_PREDS] != ["predict"] * N_PREDS

-    # It's delayed, so it'll be the penultimate event
-    assert slow_index == (N_PREDS * 2) - 2
+    # Will run all other preds and evals, then this, then the last eval
+    assert slow_index == (N_PREDS - 1) * 5

     def score_value(run, example):
         return {"score": 0.7}
@@ -260,6 +275,22 @@ def score_value(run, example):
         assert r["run"].reference_example_id in dev_xample_ids
     assert not fake_request.should_fail

+    # Returning list of non-dicts not supported.
+    def bad_eval_list(run, example):
+        ordering_of_stuff.append("evaluate")
+        return ["foo", 1]
+
+    results = evaluate(
+        predict,
+        client=client,
+        data=dev_split,
+        evaluators=[bad_eval_list],
+        num_repetitions=NUM_REPETITIONS,
+        blocking=blocking,
+    )
+    for r in results:
+        assert r["evaluation_results"]["results"][0].extra == {"error": True}
+

 def test_evaluate_raises_for_async():
     async def my_func(inputs: dict):
@@ -366,11 +397,26 @@ async def score_value_first(run, example):
         ordering_of_stuff.append("evaluate")
         return {"score": 0.3}

+    async def eval_float(run, example):
+        ordering_of_stuff.append("evaluate")
+        return 0.2
+
+    async def eval_str(run, example):
+        ordering_of_stuff.append("evaluate")
+        return "good"
+
+    async def eval_list(run, example):
+        ordering_of_stuff.append("evaluate")
+        return [
+            {"score": True, "key": "list_eval_bool"},
+            {"score": 1, "key": "list_eval_int"},
+        ]
+
     results = await aevaluate(
         predict,
         client=client,
         data=dev_split,
-        evaluators=[score_value_first],
+        evaluators=[score_value_first, eval_float, eval_str, eval_list],
         num_repetitions=NUM_REPETITIONS,
         blocking=blocking,
     )
@@ -406,14 +452,14 @@ async def score_value_first(run, example):
     assert fake_request.created_session
     _wait_until(lambda: fake_request.runs)
     N_PREDS = SPLIT_SIZE * NUM_REPETITIONS
-    _wait_until(lambda: len(ordering_of_stuff) == N_PREDS * 2)
+    _wait_until(lambda: len(ordering_of_stuff) == N_PREDS * 5)
     _wait_until(lambda: slow_index is not None)
     # Want it to be interleaved
     assert ordering_of_stuff != ["predict"] * N_PREDS + ["evaluate"] * N_PREDS
     assert ordering_of_stuff[:N_PREDS] != ["predict"] * N_PREDS
     assert slow_index is not None
-    # It's delayed, so it'll be the penultimate event
-    assert slow_index == (N_PREDS * 2) - 2
+    # Will run all other preds and evals, then this, then the last eval
+    assert slow_index == (N_PREDS - 1) * 5

     assert fake_request.created_session["name"]

@@ -431,3 +477,19 @@ async def score_value(run, example):
         assert r["evaluation_results"]["results"][0].score == 0.7
         assert r["run"].reference_example_id in dev_xample_ids
     assert not fake_request.should_fail
+    # Returning list of non-dicts not supported.
+
+    async def bad_eval_list(run, example):
+        ordering_of_stuff.append("evaluate")
+        return ["foo", 1]
+
+    results = await aevaluate(
+        predict,
+        client=client,
+        data=dev_split,
+        evaluators=[bad_eval_list],
+        num_repetitions=NUM_REPETITIONS,
+        blocking=blocking,
+    )
+    async for r in results:
+        assert r["evaluation_results"]["results"][0].extra == {"error": True}
