python[patch]: evaluators can return primitives (#1203)
```python
from langsmith import evaluate

def foo(run, example):
    return 0

def bar(run, example):
    return "long"

# unsupported: a bare list must be a list of dicts, so baz is omitted below
# def baz(run, example):
#     return [0, 0.2, "how are ya"]

def app(inputs):
    return inputs

evaluate(app, data="Sample Dataset 3", evaluators=[foo, bar])
```
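For reference, the coercion introduced in `_format_result` (see the `evaluator.py` diff below) maps primitive returns onto feedback fields. A minimal standalone sketch of that mapping follows; the `coerce_primitive` helper is illustrative only and is not part of the langsmith API:

```python
from typing import Any, Dict

def coerce_primitive(result: Any) -> Dict[str, Any]:
    """Illustrative sketch: how primitive evaluator returns are coerced to feedback."""
    if isinstance(result, (bool, int, float)):
        # numbers and booleans become the metric's score
        return {"score": result}
    if isinstance(result, str):
        # strings become a categorical value
        return {"value": result}
    if isinstance(result, list):
        # lists are allowed only if every element is already a feedback dict
        if not all(isinstance(x, dict) for x in result):
            raise ValueError(f"Expected a list of dicts. Got {result}")
        return {"results": result}
    raise ValueError(f"Unsupported evaluator return type: {type(result)}")

assert coerce_primitive(0) == {"score": 0}            # foo above
assert coerce_primitive("long") == {"value": "long"}  # bar above
```

The metric key still defaults to the evaluator function's name, which `_coerce_evaluation_result` fills in when no `key` is provided.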
baskaryan authored Nov 14, 2024
2 parents 33a621c + cf80a82 commit d8adcde
Showing 4 changed files with 126 additions and 28 deletions.
2 changes: 1 addition & 1 deletion python/langsmith/evaluation/_runner.py
@@ -1848,7 +1848,7 @@ def extract_evaluation_results_keys(node, variables):
     try:
         tree = ast.parse(python_code)
         function_def = tree.body[0]
-        if not isinstance(function_def, ast.FunctionDef):
+        if not isinstance(function_def, (ast.FunctionDef, ast.AsyncFunctionDef)):
             return []

         variables = {}
41 changes: 27 additions & 14 deletions python/langsmith/evaluation/evaluator.py
@@ -236,9 +236,8 @@ def _coerce_evaluation_result(
                 "Expected an EvaluationResult object, or dict with a metric"
                 f" 'key' and optional 'score'; got empty result: {result}"
             )
-        if "key" not in result:
-            if allow_no_key:
-                result["key"] = self._name
+        if "key" not in result and allow_no_key:
+            result["key"] = self._name
         if all(k not in result for k in ("score", "value", "comment")):
             raise ValueError(
                 "Expected an EvaluationResult object, or dict with a metric"
@@ -265,27 +264,41 @@ def _coerce_evaluation_results(
             return EvaluationResults(**cp)

         return self._coerce_evaluation_result(
-            cast(dict, results), allow_no_key=True, source_run_id=source_run_id
+            cast(dict, results), source_run_id=source_run_id, allow_no_key=True
         )

     def _format_result(
         self,
-        result: Union[EvaluationResult, EvaluationResults, dict],
+        result: Union[
+            EvaluationResult, EvaluationResults, dict, str, int, bool, float, list
+        ],
         source_run_id: uuid.UUID,
     ) -> Union[EvaluationResult, EvaluationResults]:
-        if isinstance(result, EvaluationResult):
+        if isinstance(result, (bool, float, int)):
+            result = {"score": result}
+        elif not result:
+            raise ValueError(
+                f"Expected a non-empty dict, str, bool, int, float, list, "
+                f"EvaluationResult, or EvaluationResults. Got {result}"
+            )
+        elif isinstance(result, EvaluationResult):
             if not result.source_run_id:
                 result.source_run_id = source_run_id
             return result
-        if not result:
-            raise ValueError(
-                "Expected an EvaluationResult or EvaluationResults object, or a"
-                " dict with key and one of score or value, EvaluationResults,"
-                f" got {result}"
-            )
-        if not isinstance(result, dict):
+        elif isinstance(result, list):
+            if not all(isinstance(x, dict) for x in result):
+                raise ValueError(
+                    f"Expected a list of dicts or EvaluationResult. Received {result}."
+                )
+            result = {"results": result}  # type: ignore[misc]
+        elif isinstance(result, str):
+            result = {"value": result}
+        elif isinstance(result, dict):
+            pass
+        else:
             raise ValueError(
-                f"Expected a dict, EvaluationResult, or EvaluationResults, got {result}"
+                f"Expected a dict, str, bool, int, float, list, EvaluationResult, or "
+                f"EvaluationResults. Got {result}"
             )

         return self._coerce_evaluation_results(result, source_run_id)
33 changes: 28 additions & 5 deletions python/tests/unit_tests/evaluation/test_evaluator.py
@@ -321,8 +321,8 @@ async def sample_evaluator(
     assert result["results"][1].score == 2.0


-@pytest.mark.parametrize("response", [None, {}, {"accuracy": 5}])
-async def test_evaluator_raises_for_null_ouput(response: Any):
+@pytest.mark.parametrize("response", [None, {}, []])
+async def test_evaluator_raises_for_null_output(response: Any):
     @run_evaluator  # type: ignore
     def bad_evaluator(run: schemas.Run, example: schemas.Example):
         return response
@@ -334,13 +334,36 @@ async def abad_evaluator(run: schemas.Run, example: schemas.Example):
     fake_run = MagicMock()
     fake_example = MagicMock()

-    with pytest.raises(ValueError, match="Expected an EvaluationResult "):
+    with pytest.raises(ValueError, match="Expected a non-empty "):
         bad_evaluator.evaluate_run(fake_run, fake_example)

-    with pytest.raises(ValueError, match="Expected an EvaluationResult "):
+    with pytest.raises(ValueError, match="Expected a non-empty "):
         await bad_evaluator.aevaluate_run(fake_run, fake_example)

-    with pytest.raises(ValueError, match="Expected an EvaluationResult "):
+    with pytest.raises(ValueError, match="Expected a non-empty "):
         await abad_evaluator.aevaluate_run(fake_run, fake_example)


+@pytest.mark.parametrize("response", [[5], {"accuracy": 5}])
+async def test_evaluator_raises_for_bad_output(response: Any):
+    @run_evaluator  # type: ignore
+    def bad_evaluator(run: schemas.Run, example: schemas.Example):
+        return response
+
+    @run_evaluator  # type: ignore
+    async def abad_evaluator(run: schemas.Run, example: schemas.Example):
+        return response
+
+    fake_run = MagicMock()
+    fake_example = MagicMock()
+
+    with pytest.raises(ValueError, match="Expected"):
+        bad_evaluator.evaluate_run(fake_run, fake_example)
+
+    with pytest.raises(ValueError, match="Expected"):
+        await bad_evaluator.aevaluate_run(fake_run, fake_example)
+
+    with pytest.raises(ValueError, match="Expected"):
+        await abad_evaluator.aevaluate_run(fake_run, fake_example)
+
+
78 changes: 70 additions & 8 deletions python/tests/unit_tests/evaluation/test_runner.py
@@ -196,11 +196,26 @@ def score_value_first(run, example):
         ordering_of_stuff.append("evaluate")
         return {"score": 0.3}

+    def eval_float(run, example):
+        ordering_of_stuff.append("evaluate")
+        return 0.2
+
+    def eval_str(run, example):
+        ordering_of_stuff.append("evaluate")
+        return "good"
+
+    def eval_list(run, example):
+        ordering_of_stuff.append("evaluate")
+        return [
+            {"score": True, "key": "list_eval_bool"},
+            {"score": 1, "key": "list_eval_int"},
+        ]
+
     results = evaluate(
         predict,
         client=client,
         data=dev_split,
-        evaluators=[score_value_first],
+        evaluators=[score_value_first, eval_float, eval_str, eval_list],
         num_repetitions=NUM_REPETITIONS,
         blocking=blocking,
     )
@@ -231,14 +246,14 @@ def score_value_first(run, example):
     assert fake_request.created_session
     _wait_until(lambda: fake_request.runs)
     N_PREDS = SPLIT_SIZE * NUM_REPETITIONS
-    _wait_until(lambda: len(ordering_of_stuff) == N_PREDS * 2)
+    _wait_until(lambda: len(ordering_of_stuff) == N_PREDS * 5)
     _wait_until(lambda: slow_index is not None)
     # Want it to be interleaved
     assert ordering_of_stuff != ["predict"] * N_PREDS + ["evaluate"] * N_PREDS
     assert ordering_of_stuff[:N_PREDS] != ["predict"] * N_PREDS

-    # It's delayed, so it'll be the penultimate event
-    assert slow_index == (N_PREDS * 2) - 2
+    # Will run all other preds and evals, then this, then the last eval
+    assert slow_index == (N_PREDS - 1) * 5

     def score_value(run, example):
         return {"score": 0.7}
@@ -260,6 +275,22 @@ def score_value(run, example):
         assert r["run"].reference_example_id in dev_xample_ids
     assert not fake_request.should_fail

+    # Returning list of non-dicts not supported.
+    def bad_eval_list(run, example):
+        ordering_of_stuff.append("evaluate")
+        return ["foo", 1]
+
+    results = evaluate(
+        predict,
+        client=client,
+        data=dev_split,
+        evaluators=[bad_eval_list],
+        num_repetitions=NUM_REPETITIONS,
+        blocking=blocking,
+    )
+    for r in results:
+        assert r["evaluation_results"]["results"][0].extra == {"error": True}
+

 def test_evaluate_raises_for_async():
     async def my_func(inputs: dict):
@@ -366,11 +397,26 @@ async def score_value_first(run, example):
         ordering_of_stuff.append("evaluate")
         return {"score": 0.3}

+    async def eval_float(run, example):
+        ordering_of_stuff.append("evaluate")
+        return 0.2
+
+    async def eval_str(run, example):
+        ordering_of_stuff.append("evaluate")
+        return "good"
+
+    async def eval_list(run, example):
+        ordering_of_stuff.append("evaluate")
+        return [
+            {"score": True, "key": "list_eval_bool"},
+            {"score": 1, "key": "list_eval_int"},
+        ]
+
     results = await aevaluate(
         predict,
         client=client,
         data=dev_split,
-        evaluators=[score_value_first],
+        evaluators=[score_value_first, eval_float, eval_str, eval_list],
         num_repetitions=NUM_REPETITIONS,
         blocking=blocking,
     )
@@ -406,14 +452,14 @@ async def score_value_first(run, example):
     assert fake_request.created_session
     _wait_until(lambda: fake_request.runs)
     N_PREDS = SPLIT_SIZE * NUM_REPETITIONS
-    _wait_until(lambda: len(ordering_of_stuff) == N_PREDS * 2)
+    _wait_until(lambda: len(ordering_of_stuff) == N_PREDS * 5)
     _wait_until(lambda: slow_index is not None)
     # Want it to be interleaved
     assert ordering_of_stuff != ["predict"] * N_PREDS + ["evaluate"] * N_PREDS
     assert ordering_of_stuff[:N_PREDS] != ["predict"] * N_PREDS
     assert slow_index is not None
-    # It's delayed, so it'll be the penultimate event
-    assert slow_index == (N_PREDS * 2) - 2
+    # Will run all other preds and evals, then this, then the last eval
+    assert slow_index == (N_PREDS - 1) * 5

     assert fake_request.created_session["name"]

@@ -431,3 +477,19 @@ async def score_value(run, example):
         assert r["evaluation_results"]["results"][0].score == 0.7
         assert r["run"].reference_example_id in dev_xample_ids
     assert not fake_request.should_fail
+    # Returning list of non-dicts not supported.
+
+    async def bad_eval_list(run, example):
+        ordering_of_stuff.append("evaluate")
+        return ["foo", 1]
+
+    results = await aevaluate(
+        predict,
+        client=client,
+        data=dev_split,
+        evaluators=[bad_eval_list],
+        num_repetitions=NUM_REPETITIONS,
+        blocking=blocking,
+    )
+    async for r in results:
+        assert r["evaluation_results"]["results"][0].extra == {"error": True}
