Support evaluate on existing experiment (#1000)
hinthornw authored Sep 13, 2024
1 parent 30365bc commit de3fec5
Showing 4 changed files with 124 additions and 18 deletions.
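
The headline change: evaluate() and aevaluate() now accept an experiment argument (a schemas.TracerSession, an experiment name, or a UUID) so an evaluation run can extend an existing experiment instead of always creating a new one; it is mutually exclusive with experiment_prefix. A minimal sketch of the new call pattern, modeled on the tests added in this commit — the dataset name, target function, and evaluator below are placeholders, not part of the commit:

from langsmith import Client
from langsmith.evaluation import evaluate

client = Client()

def predict(inputs: dict) -> dict:
    # Placeholder target system; substitute the function under test.
    return {"answer": inputs["question"]}

def accuracy(run, example) -> dict:
    # Placeholder evaluator comparing the run output to the reference example.
    return {"key": "accuracy", "score": 1.0}

# The first call behaves as before and creates a fresh experiment.
results = evaluate(predict, data="my-dataset", evaluators=[accuracy])

# A second call can now extend that experiment, referenced by name,
# by UUID, or by the TracerSession returned from client.read_project.
evaluate(
    predict,
    data="my-dataset",
    evaluators=[accuracy],
    experiment=results.experiment_name,
)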
21 changes: 18 additions & 3 deletions python/langsmith/evaluation/_arunner.py
@@ -65,6 +65,7 @@ async def aevaluate(
num_repetitions: int = 1,
client: Optional[langsmith.Client] = None,
blocking: bool = True,
experiment: Optional[Union[schemas.TracerSession, str, uuid.UUID]] = None,
) -> AsyncExperimentResults:
r"""Evaluate an async target system or function on a given dataset.
@@ -90,6 +91,9 @@ async def aevaluate(
Defaults to None.
blocking (bool): Whether to block until the evaluation is complete.
Defaults to True.
experiment (Optional[schemas.TracerSession]): An existing experiment to
extend. If provided, experiment_prefix is ignored. For advanced
usage only.
Returns:
AsyncIterator[ExperimentResultRow]: An async iterator over the experiment results.
@@ -220,6 +224,12 @@ async def aevaluate(
... ) # doctest: +ELLIPSIS
View the evaluation results for experiment:...
""" # noqa: E501
if experiment and experiment_prefix:
raise ValueError(
"Expected at most one of 'experiment' or 'experiment_prefix',"
" but both were provided. "
f"Got: experiment={experiment}, experiment_prefix={experiment_prefix}"
)
return await _aevaluate(
target,
data=data,
@@ -232,11 +242,12 @@
num_repetitions=num_repetitions,
client=client,
blocking=blocking,
experiment=experiment,
)


async def aevaluate_existing(
experiment: Union[str, uuid.UUID],
experiment: Union[str, uuid.UUID, schemas.TracerSession],
/,
evaluators: Optional[Sequence[Union[EVALUATOR_T, AEVALUATOR_T]]] = None,
summary_evaluators: Optional[Sequence[SUMMARY_EVALUATOR_T]] = None,
@@ -314,7 +325,11 @@ async def aevaluate_existing(
""" # noqa: E501
client = client or langsmith.Client()
project = await aitertools.aio_to_thread(_load_experiment, experiment, client)
project = (
experiment
if isinstance(experiment, schemas.TracerSession)
else (await aitertools.aio_to_thread(_load_experiment, experiment, client))
)
runs = await aitertools.aio_to_thread(
_load_traces, experiment, client, load_nested=load_nested
)
@@ -346,7 +361,7 @@ async def _aevaluate(
num_repetitions: int = 1,
client: Optional[langsmith.Client] = None,
blocking: bool = True,
experiment: Optional[schemas.TracerSession] = None,
experiment: Optional[Union[schemas.TracerSession, str, uuid.UUID]] = None,
) -> AsyncExperimentResults:
is_async_target = asyncio.iscoroutinefunction(target) or (
hasattr(target, "__aiter__") and asyncio.iscoroutine(target.__aiter__())
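Taken together, the _arunner.py changes mirror the sync API: aevaluate() forwards the new experiment argument to _aevaluate(), and aevaluate_existing() now accepts a schemas.TracerSession directly in addition to a name or UUID, skipping the extra _load_experiment round trip. A rough async sketch under the same placeholder assumptions as above:

import asyncio
from langsmith.evaluation import aevaluate

async def apredict(inputs: dict) -> dict:
    # Placeholder async target.
    return {"answer": inputs["question"]}

def accuracy(run, example) -> dict:
    # Placeholder evaluator.
    return {"key": "accuracy", "score": 1.0}

async def main():
    # "my-experiment" stands in for the name of an existing experiment;
    # a uuid.UUID or schemas.TracerSession works as well.
    await aevaluate(
        apredict,
        data="my-dataset",
        evaluators=[accuracy],
        experiment="my-experiment",
    )

asyncio.run(main())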
57 changes: 43 additions & 14 deletions python/langsmith/evaluation/_runner.py
@@ -95,6 +95,7 @@ def evaluate(
num_repetitions: int = 1,
client: Optional[langsmith.Client] = None,
blocking: bool = True,
experiment: Optional[Union[schemas.TracerSession, str, uuid.UUID]] = None,
) -> ExperimentResults:
r"""Evaluate a target system or function on a given dataset.
@@ -120,6 +121,9 @@ def evaluate(
num_repetitions (int): The number of times to run the evaluation.
Each item in the dataset will be run and evaluated this many times.
Defaults to 1.
experiment (Optional[schemas.TracerSession]): An existing experiment to
extend. If provided, experiment_prefix is ignored. For advanced
usage only.
Returns:
ExperimentResults: The results of the evaluation.
@@ -248,6 +252,12 @@ def evaluate(
... ) # doctest: +ELLIPSIS
View the evaluation results for experiment:...
""" # noqa: E501
if experiment and experiment_prefix:
raise ValueError(
"Expected at most one of 'experiment' or 'experiment_prefix',"
" but both were provided. "
f"Got: experiment={experiment}, experiment_prefix={experiment_prefix}"
)
return _evaluate(
target,
data=data,
@@ -260,11 +270,12 @@
num_repetitions=num_repetitions,
client=client,
blocking=blocking,
experiment=experiment,
)


def evaluate_existing(
experiment: Union[str, uuid.UUID],
experiment: Union[str, uuid.UUID, schemas.TracerSession],
/,
evaluators: Optional[Sequence[EVALUATOR_T]] = None,
summary_evaluators: Optional[Sequence[SUMMARY_EVALUATOR_T]] = None,
@@ -336,7 +347,11 @@ def evaluate_existing(
View the evaluation results for experiment:...
""" # noqa: E501
client = client or langsmith.Client()
project = _load_experiment(experiment, client)
project = (
experiment
if isinstance(experiment, schemas.TracerSession)
else _load_experiment(experiment, client)
)
runs = _load_traces(experiment, client, load_nested=load_nested)
data_map = _load_examples_map(client, project)
data = [data_map[cast(uuid.UUID, run.reference_example_id)] for run in runs]
@@ -841,7 +856,7 @@ def _evaluate(
num_repetitions: int = 1,
client: Optional[langsmith.Client] = None,
blocking: bool = True,
experiment: Optional[schemas.TracerSession] = None,
experiment: Optional[Union[schemas.TracerSession, str, uuid.UUID]] = None,
) -> ExperimentResults:
# Initialize the experiment manager.
client = client or langsmith.Client()
@@ -903,14 +918,18 @@ def _load_experiment(


def _load_traces(
project: Union[str, uuid.UUID], client: langsmith.Client, load_nested: bool = False
project: Union[str, uuid.UUID, schemas.TracerSession],
client: langsmith.Client,
load_nested: bool = False,
) -> List[schemas.Run]:
"""Load nested traces for a given project."""
execution_order = None if load_nested else 1
if isinstance(project, uuid.UUID) or _is_uuid(project):
runs = client.list_runs(project_id=project, execution_order=execution_order)
is_root = None if load_nested else True
if isinstance(project, schemas.TracerSession):
runs = client.list_runs(project_id=project.id, is_root=is_root)
elif isinstance(project, uuid.UUID) or _is_uuid(project):
runs = client.list_runs(project_id=project, is_root=is_root)
else:
runs = client.list_runs(project_name=project, execution_order=execution_order)
runs = client.list_runs(project_name=project, is_root=is_root)
if not load_nested:
return list(runs)

@@ -1593,26 +1612,36 @@ def _ensure_traceable(


def _resolve_experiment(
experiment: Optional[schemas.TracerSession],
experiment: Optional[Union[schemas.TracerSession, str, uuid.UUID]],
runs: Optional[Iterable[schemas.Run]],
client: langsmith.Client,
) -> Tuple[
Optional[Union[schemas.TracerSession, str]], Optional[Iterable[schemas.Run]]
]:
# TODO: Remove this, handle outside the manager
if experiment is not None:
if not experiment.name:
if isinstance(experiment, schemas.TracerSession):
experiment_ = experiment
else:
experiment_ = _load_experiment(experiment, client)

if not experiment_.name:
raise ValueError("Experiment name must be defined if provided.")
return experiment, runs
if not experiment_.reference_dataset_id:
raise ValueError(
"Experiment must have an associated reference_dataset_id, "
"but none was provided."
)
return experiment_, runs
# If we have runs, that means the experiment was already started.
if runs is not None:
if runs is not None:
runs_, runs = itertools.tee(runs)
first_run = next(runs_)
experiment = client.read_project(project_id=first_run.session_id)
if not experiment.name:
experiment_ = client.read_project(project_id=first_run.session_id)
if not experiment_.name:
raise ValueError("Experiment name not found for provided runs.")
return experiment, runs
return experiment_, runs
return None, None


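Two guard clauses in the _runner.py changes are worth calling out: evaluate() raises a ValueError when both experiment and experiment_prefix are supplied, and _resolve_experiment() rejects an experiment that has no associated reference_dataset_id. Continuing the first sketch above (all names remain placeholders):

try:
    evaluate(
        predict,
        data="my-dataset",
        evaluators=[accuracy],
        experiment=results.experiment_name,
        experiment_prefix="my-prefix",  # conflicts with experiment=
    )
except ValueError as err:
    # "Expected at most one of 'experiment' or 'experiment_prefix', ..."
    print(err)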
2 changes: 1 addition & 1 deletion python/pyproject.toml
@@ -1,6 +1,6 @@
[tool.poetry]
name = "langsmith"
version = "0.1.119"
version = "0.1.120"
description = "Client library to connect to the LangSmith LLM Tracing and Evaluation Platform."
authors = ["LangChain <support@langchain.dev>"]
license = "MIT"
62 changes: 62 additions & 0 deletions python/tests/evaluation/test_evaluation.py
@@ -71,6 +71,37 @@ def predict(inputs: dict) -> dict:
for example in examples:
assert len([r for r in results if r["example"].id == example.id]) == 3

# Run it again with the existing project
results2 = evaluate(
predict,
data=dataset_name,
evaluators=[accuracy],
summary_evaluators=[precision],
experiment=results.experiment_name,
)
assert len(results2) == 10

# ... and again with the object
experiment = client.read_project(project_name=results.experiment_name)
results3 = evaluate(
predict,
data=dataset_name,
evaluators=[accuracy],
summary_evaluators=[precision],
experiment=experiment,
)
assert len(results3) == 10

# ... and again with the ID
results4 = evaluate(
predict,
data=dataset_name,
evaluators=[accuracy],
summary_evaluators=[precision],
experiment=str(experiment.id),
)
assert len(results4) == 10


async def test_aevaluate():
client = Client()
@@ -142,6 +173,37 @@ def check_run_count():
all_examples
), f"Expected {2 * len(all_examples)} runs, but got {len(final_runs)}"

# Run it again with the existing project
results2 = await aevaluate(
apredict,
data=dataset_name,
evaluators=[accuracy],
summary_evaluators=[precision],
experiment=results.experiment_name,
)
assert len(results2) == 10

# ... and again with the object
experiment = client.read_project(project_name=results.experiment_name)
results3 = await aevaluate(
apredict,
data=dataset_name,
evaluators=[accuracy],
summary_evaluators=[precision],
experiment=experiment,
)
assert len(results3) == 10

# ... and again with the ID
results4 = await aevaluate(
apredict,
data=dataset_name,
evaluators=[accuracy],
summary_evaluators=[precision],
experiment=str(experiment.id),
)
assert len(results4) == 10


@test
def test_foo():
