diff --git a/python/langsmith/evaluation/_runner.py b/python/langsmith/evaluation/_runner.py
index ceb5d856..382bc639 100644
--- a/python/langsmith/evaluation/_runner.py
+++ b/python/langsmith/evaluation/_runner.py
@@ -1848,7 +1848,7 @@ def extract_evaluation_results_keys(node, variables):
     try:
         tree = ast.parse(python_code)
         function_def = tree.body[0]
-        if not isinstance(function_def, ast.FunctionDef):
+        if not isinstance(function_def, (ast.FunctionDef, ast.AsyncFunctionDef)):
             return []

         variables = {}
diff --git a/python/langsmith/evaluation/evaluator.py b/python/langsmith/evaluation/evaluator.py
index 065f5b16..21e475d6 100644
--- a/python/langsmith/evaluation/evaluator.py
+++ b/python/langsmith/evaluation/evaluator.py
@@ -236,9 +236,8 @@ def _coerce_evaluation_result(
                     "Expected an EvaluationResult object, or dict with a metric"
                     f" 'key' and optional 'score'; got empty result: {result}"
                 )
-            if "key" not in result:
-                if allow_no_key:
-                    result["key"] = self._name
+            if "key" not in result and allow_no_key:
+                result["key"] = self._name
             if all(k not in result for k in ("score", "value", "comment")):
                 raise ValueError(
                     "Expected an EvaluationResult object, or dict with a metric"
@@ -265,27 +264,41 @@ def _coerce_evaluation_results(
             return EvaluationResults(**cp)

         return self._coerce_evaluation_result(
-            cast(dict, results), allow_no_key=True, source_run_id=source_run_id
+            cast(dict, results), source_run_id=source_run_id, allow_no_key=True
         )

     def _format_result(
         self,
-        result: Union[EvaluationResult, EvaluationResults, dict],
+        result: Union[
+            EvaluationResult, EvaluationResults, dict, str, int, bool, float, list
+        ],
         source_run_id: uuid.UUID,
     ) -> Union[EvaluationResult, EvaluationResults]:
-        if isinstance(result, EvaluationResult):
+        if isinstance(result, (bool, float, int)):
+            result = {"score": result}
+        elif not result:
+            raise ValueError(
+                f"Expected a non-empty dict, str, bool, int, float, list, "
+                f"EvaluationResult, or EvaluationResults. Got {result}"
+            )
+        elif isinstance(result, EvaluationResult):
             if not result.source_run_id:
                 result.source_run_id = source_run_id
             return result
-        if not result:
-            raise ValueError(
-                "Expected an EvaluationResult or EvaluationResults object, or a"
-                " dict with key and one of score or value, EvaluationResults,"
-                f" got {result}"
-            )
-        if not isinstance(result, dict):
+        elif isinstance(result, list):
+            if not all(isinstance(x, dict) for x in result):
+                raise ValueError(
+                    f"Expected a list of dicts or EvaluationResult. Received {result}."
+                )
+            result = {"results": result}  # type: ignore[misc]
+        elif isinstance(result, str):
+            result = {"value": result}
+        elif isinstance(result, dict):
+            pass
+        else:
             raise ValueError(
-                f"Expected a dict, EvaluationResult, or EvaluationResults, got {result}"
+                f"Expected a dict, str, bool, int, float, list, EvaluationResult, or "
+                f"EvaluationResults. Got {result}"
             )
         return self._coerce_evaluation_results(result, source_run_id)

diff --git a/python/tests/unit_tests/evaluation/test_evaluator.py b/python/tests/unit_tests/evaluation/test_evaluator.py
index c3b90770..09f1d7eb 100644
--- a/python/tests/unit_tests/evaluation/test_evaluator.py
+++ b/python/tests/unit_tests/evaluation/test_evaluator.py
@@ -321,8 +321,8 @@ async def sample_evaluator(
     assert result["results"][1].score == 2.0


-@pytest.mark.parametrize("response", [None, {}, {"accuracy": 5}])
-async def test_evaluator_raises_for_null_ouput(response: Any):
+@pytest.mark.parametrize("response", [None, {}, []])
+async def test_evaluator_raises_for_null_output(response: Any):
     @run_evaluator  # type: ignore
     def bad_evaluator(run: schemas.Run, example: schemas.Example):
         return response
@@ -334,13 +334,36 @@ async def abad_evaluator(run: schemas.Run, example: schemas.Example):
     fake_run = MagicMock()
     fake_example = MagicMock()

-    with pytest.raises(ValueError, match="Expected an EvaluationResult "):
+    with pytest.raises(ValueError, match="Expected a non-empty "):
         bad_evaluator.evaluate_run(fake_run, fake_example)

-    with pytest.raises(ValueError, match="Expected an EvaluationResult "):
+    with pytest.raises(ValueError, match="Expected a non-empty "):
         await bad_evaluator.aevaluate_run(fake_run, fake_example)

-    with pytest.raises(ValueError, match="Expected an EvaluationResult "):
+    with pytest.raises(ValueError, match="Expected a non-empty "):
+        await abad_evaluator.aevaluate_run(fake_run, fake_example)
+
+
+@pytest.mark.parametrize("response", [[5], {"accuracy": 5}])
+async def test_evaluator_raises_for_bad_output(response: Any):
+    @run_evaluator  # type: ignore
+    def bad_evaluator(run: schemas.Run, example: schemas.Example):
+        return response
+
+    @run_evaluator  # type: ignore
+    async def abad_evaluator(run: schemas.Run, example: schemas.Example):
+        return response
+
+    fake_run = MagicMock()
+    fake_example = MagicMock()
+
+    with pytest.raises(ValueError, match="Expected"):
+        bad_evaluator.evaluate_run(fake_run, fake_example)
+
+    with pytest.raises(ValueError, match="Expected"):
+        await bad_evaluator.aevaluate_run(fake_run, fake_example)
+
+    with pytest.raises(ValueError, match="Expected"):
         await abad_evaluator.aevaluate_run(fake_run, fake_example)
diff --git a/python/tests/unit_tests/evaluation/test_runner.py b/python/tests/unit_tests/evaluation/test_runner.py
index 943f55b2..38cee048 100644
--- a/python/tests/unit_tests/evaluation/test_runner.py
+++ b/python/tests/unit_tests/evaluation/test_runner.py
@@ -196,11 +196,26 @@ def score_value_first(run, example):
         ordering_of_stuff.append("evaluate")
         return {"score": 0.3}

+    def eval_float(run, example):
+        ordering_of_stuff.append("evaluate")
+        return 0.2
+
+    def eval_str(run, example):
+        ordering_of_stuff.append("evaluate")
+        return "good"
+
+    def eval_list(run, example):
+        ordering_of_stuff.append("evaluate")
+        return [
+            {"score": True, "key": "list_eval_bool"},
+            {"score": 1, "key": "list_eval_int"},
+        ]
+
     results = evaluate(
         predict,
         client=client,
         data=dev_split,
-        evaluators=[score_value_first],
+        evaluators=[score_value_first, eval_float, eval_str, eval_list],
         num_repetitions=NUM_REPETITIONS,
         blocking=blocking,
     )
@@ -231,14 +246,14 @@ def score_value_first(run, example):
     assert fake_request.created_session
     _wait_until(lambda: fake_request.runs)
     N_PREDS = SPLIT_SIZE * NUM_REPETITIONS
-    _wait_until(lambda: len(ordering_of_stuff) == N_PREDS * 2)
+    _wait_until(lambda: len(ordering_of_stuff) == N_PREDS * 5)
     _wait_until(lambda: slow_index is not None)

     # Want it to be interleaved
-    assert ordering_of_stuff != ["predict"] * N_PREDS + ["evaluate"] * N_PREDS
+    assert ordering_of_stuff[:N_PREDS] != ["predict"] * N_PREDS
     # It's delayed, so it'll be the penultimate event
     # Will run all other preds and evals, then this, then the last eval
-    assert slow_index == (N_PREDS * 2) - 2
+    assert slow_index == (N_PREDS - 1) * 5

     def score_value(run, example):
         return {"score": 0.7}
@@ -260,6 +275,22 @@ def score_value(run, example):
         assert r["run"].reference_example_id in dev_xample_ids
     assert not fake_request.should_fail

+    # Returning list of non-dicts not supported.
+    def bad_eval_list(run, example):
+        ordering_of_stuff.append("evaluate")
+        return ["foo", 1]
+
+    results = evaluate(
+        predict,
+        client=client,
+        data=dev_split,
+        evaluators=[bad_eval_list],
+        num_repetitions=NUM_REPETITIONS,
+        blocking=blocking,
+    )
+    for r in results:
+        assert r["evaluation_results"]["results"][0].extra == {"error": True}
+

 def test_evaluate_raises_for_async():
     async def my_func(inputs: dict):
@@ -366,11 +397,26 @@ async def score_value_first(run, example):
         ordering_of_stuff.append("evaluate")
         return {"score": 0.3}

+    async def eval_float(run, example):
+        ordering_of_stuff.append("evaluate")
+        return 0.2
+
+    async def eval_str(run, example):
+        ordering_of_stuff.append("evaluate")
+        return "good"
+
+    async def eval_list(run, example):
+        ordering_of_stuff.append("evaluate")
+        return [
+            {"score": True, "key": "list_eval_bool"},
+            {"score": 1, "key": "list_eval_int"},
+        ]
+
     results = await aevaluate(
         predict,
         client=client,
         data=dev_split,
-        evaluators=[score_value_first],
+        evaluators=[score_value_first, eval_float, eval_str, eval_list],
         num_repetitions=NUM_REPETITIONS,
         blocking=blocking,
     )
@@ -406,14 +452,14 @@ async def score_value_first(run, example):
     assert fake_request.created_session
     _wait_until(lambda: fake_request.runs)
     N_PREDS = SPLIT_SIZE * NUM_REPETITIONS
-    _wait_until(lambda: len(ordering_of_stuff) == N_PREDS * 2)
+    _wait_until(lambda: len(ordering_of_stuff) == N_PREDS * 5)
     _wait_until(lambda: slow_index is not None)

     # Want it to be interleaved
-    assert ordering_of_stuff != ["predict"] * N_PREDS + ["evaluate"] * N_PREDS
+    assert ordering_of_stuff[:N_PREDS] != ["predict"] * N_PREDS
     assert slow_index is not None
     # It's delayed, so it'll be the penultimate event
     # Will run all other preds and evals, then this, then the last eval
-    assert slow_index == (N_PREDS * 2) - 2
+    assert slow_index == (N_PREDS - 1) * 5

     assert fake_request.created_session["name"]
@@ -431,3 +477,19 @@ async def score_value(run, example):
         assert r["evaluation_results"]["results"][0].score == 0.7
         assert r["run"].reference_example_id in dev_xample_ids
     assert not fake_request.should_fail
+    # Returning list of non-dicts not supported.
+
+    async def bad_eval_list(run, example):
+        ordering_of_stuff.append("evaluate")
+        return ["foo", 1]
+
+    results = await aevaluate(
+        predict,
+        client=client,
+        data=dev_split,
+        evaluators=[bad_eval_list],
+        num_repetitions=NUM_REPETITIONS,
+        blocking=blocking,
+    )
+    async for r in results:
+        assert r["evaluation_results"]["results"][0].extra == {"error": True}
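
A quick illustration of the `_runner.py` change, for reviewers: `ast.parse` produces an `ast.AsyncFunctionDef` node (not `ast.FunctionDef`) for `async def` evaluators, so the old isinstance check returned `[]` for them. A minimal standalone check of that behavior (the `eval_fn` snippet is just an example):

```python
import ast

# async defs parse to ast.AsyncFunctionDef, which the old check rejected.
fn = ast.parse("async def eval_fn(run, example):\n    return {'score': 1}").body[0]
assert not isinstance(fn, ast.FunctionDef)
assert isinstance(fn, (ast.FunctionDef, ast.AsyncFunctionDef))  # the widened check
```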
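And a standalone sketch of the return-value coercion `_format_result` gains in this diff, for anyone skimming the tests: bools/ints/floats become a `score`, bare strings become a `value`, lists of dicts are wrapped as sub-results, and empty results or lists containing non-dicts raise. The `coerce` helper below is hypothetical (it omits the `EvaluationResult`/`EvaluationResults` branches), not the library code; it only mirrors the branch order added above.

```python
from typing import Any


def coerce(result: Any) -> dict:
    """Hypothetical helper mirroring the branch order added to _format_result."""
    if isinstance(result, (bool, float, int)):
        # Primitive scores; checked before the falsy branch so 0 / False still count.
        return {"score": result}
    elif not result:
        # None, {}, [], and "" are rejected up front.
        raise ValueError(f"Expected a non-empty result. Got {result}")
    elif isinstance(result, list):
        # Only lists of dicts are accepted; they become sub-results.
        if not all(isinstance(x, dict) for x in result):
            raise ValueError(f"Expected a list of dicts. Received {result}.")
        return {"results": result}
    elif isinstance(result, str):
        # Bare strings become categorical values.
        return {"value": result}
    elif isinstance(result, dict):
        return result
    raise ValueError(f"Unsupported evaluator output: {result}")


assert coerce(0.2) == {"score": 0.2}
assert coerce("good") == {"value": "good"}
assert coerce([{"key": "list_eval_int", "score": 1}]) == {
    "results": [{"key": "list_eval_int", "score": 1}]
}
```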