Support evaluate on existing experiment (#1000)
hinthornw authored Sep 13, 2024
1 parent 30365bc commit de3fec5
Showing 4 changed files with 124 additions and 18 deletions.
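
The headline change: evaluate() and aevaluate() now accept an experiment argument (a schemas.TracerSession, an experiment name, or a UUID) so an evaluation run can extend an existing experiment instead of always creating a new one; it is mutually exclusive with experiment_prefix. A minimal sketch of the new call pattern, modeled on the tests added in this commit — the dataset name, target function, and evaluator below are placeholders, not part of the commit:

from langsmith import Client
from langsmith.evaluation import evaluate

client = Client()

def predict(inputs: dict) -> dict:
    # Placeholder target system; substitute the function under test.
    return {"answer": inputs["question"]}

def accuracy(run, example) -> dict:
    # Placeholder evaluator comparing the run output to the reference example.
    return {"key": "accuracy", "score": 1.0}

# The first call behaves as before and creates a fresh experiment.
results = evaluate(predict, data="my-dataset", evaluators=[accuracy])

# A second call can now extend that experiment, referenced by name,
# by UUID, or by the TracerSession returned from client.read_project.
evaluate(
    predict,
    data="my-dataset",
    evaluators=[accuracy],
    experiment=results.experiment_name,
)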
21 changes: 18 additions & 3 deletions python/langsmith/evaluation/_arunner.py
@@ -65,6 +65,7 @@ async def aevaluate(
num_repetitions: int = 1,
client: Optional[langsmith.Client] = None,
blocking: bool = True,
experiment: Optional[Union[schemas.TracerSession, str, uuid.UUID]] = None,
) -> AsyncExperimentResults:
r"""Evaluate an async target system or function on a given dataset.
@@ -90,6 +91,9 @@ async def aevaluate(
Defaults to None.
blocking (bool): Whether to block until the evaluation is complete.
Defaults to True.
experiment (Optional[schemas.TracerSession]): An existing experiment to
extend. If provided, experiment_prefix is ignored. For advanced
usage only.
Returns:
AsyncIterator[ExperimentResultRow]: An async iterator over the experiment results.
@@ -220,6 +224,12 @@ async def aevaluate(
... ) # doctest: +ELLIPSIS
View the evaluation results for experiment:...
""" # noqa: E501
if experiment and experiment_prefix:
raise ValueError(
"Expected at most one of 'experiment' or 'experiment_prefix',"
" but both were provided. "
f"Got: experiment={experiment}, experiment_prefix={experiment_prefix}"
)
return await _aevaluate(
target,
data=data,
@@ -232,11 +242,12 @@
num_repetitions=num_repetitions,
client=client,
blocking=blocking,
experiment=experiment,
)


async def aevaluate_existing(
experiment: Union[str, uuid.UUID],
experiment: Union[str, uuid.UUID, schemas.TracerSession],
/,
evaluators: Optional[Sequence[Union[EVALUATOR_T, AEVALUATOR_T]]] = None,
summary_evaluators: Optional[Sequence[SUMMARY_EVALUATOR_T]] = None,
@@ -314,7 +325,11 @@ async def aevaluate_existing(
""" # noqa: E501
client = client or langsmith.Client()
project = await aitertools.aio_to_thread(_load_experiment, experiment, client)
project = (
experiment
if isinstance(experiment, schemas.TracerSession)
else (await aitertools.aio_to_thread(_load_experiment, experiment, client))
)
runs = await aitertools.aio_to_thread(
_load_traces, experiment, client, load_nested=load_nested
)
@@ -346,7 +361,7 @@ async def _aevaluate(
num_repetitions: int = 1,
client: Optional[langsmith.Client] = None,
blocking: bool = True,
experiment: Optional[schemas.TracerSession] = None,
experiment: Optional[Union[schemas.TracerSession, str, uuid.UUID]] = None,
) -> AsyncExperimentResults:
is_async_target = asyncio.iscoroutinefunction(target) or (
hasattr(target, "__aiter__") and asyncio.iscoroutine(target.__aiter__())
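Taken together, the _arunner.py changes mirror the sync API: aevaluate() forwards the new experiment argument to _aevaluate(), and aevaluate_existing() now accepts a schemas.TracerSession directly in addition to a name or UUID, skipping the extra _load_experiment round trip. A rough async sketch under the same placeholder assumptions as above:

import asyncio
from langsmith.evaluation import aevaluate

async def apredict(inputs: dict) -> dict:
    # Placeholder async target.
    return {"answer": inputs["question"]}

def accuracy(run, example) -> dict:
    # Placeholder evaluator.
    return {"key": "accuracy", "score": 1.0}

async def main():
    # "my-experiment" stands in for the name of an existing experiment;
    # a uuid.UUID or schemas.TracerSession works as well.
    await aevaluate(
        apredict,
        data="my-dataset",
        evaluators=[accuracy],
        experiment="my-experiment",
    )

asyncio.run(main())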
57 changes: 43 additions & 14 deletions python/langsmith/evaluation/_runner.py
@@ -95,6 +95,7 @@ def evaluate(
num_repetitions: int = 1,
client: Optional[langsmith.Client] = None,
blocking: bool = True,
experiment: Optional[Union[schemas.TracerSession, str, uuid.UUID]] = None,
) -> ExperimentResults:
r"""Evaluate a target system or function on a given dataset.
@@ -120,6 +121,9 @@ def evaluate(
num_repetitions (int): The number of times to run the evaluation.
Each item in the dataset will be run and evaluated this many times.
Defaults to 1.
experiment (Optional[schemas.TracerSession]): An existing experiment to
extend. If provided, experiment_prefix is ignored. For advanced
usage only.
Returns:
ExperimentResults: The results of the evaluation.
@@ -248,6 +252,12 @@ def evaluate(
... ) # doctest: +ELLIPSIS
View the evaluation results for experiment:...
""" # noqa: E501
if experiment and experiment_prefix:
raise ValueError(
"Expected at most one of 'experiment' or 'experiment_prefix',"
" but both were provided. "
f"Got: experiment={experiment}, experiment_prefix={experiment_prefix}"
)
return _evaluate(
target,
data=data,
@@ -260,11 +270,12 @@
num_repetitions=num_repetitions,
client=client,
blocking=blocking,
experiment=experiment,
)


def evaluate_existing(
experiment: Union[str, uuid.UUID],
experiment: Union[str, uuid.UUID, schemas.TracerSession],
/,
evaluators: Optional[Sequence[EVALUATOR_T]] = None,
summary_evaluators: Optional[Sequence[SUMMARY_EVALUATOR_T]] = None,
@@ -336,7 +347,11 @@ def evaluate_existing(
View the evaluation results for experiment:...
""" # noqa: E501
client = client or langsmith.Client()
project = _load_experiment(experiment, client)
project = (
experiment
if isinstance(experiment, schemas.TracerSession)
else _load_experiment(experiment, client)
)
runs = _load_traces(experiment, client, load_nested=load_nested)
data_map = _load_examples_map(client, project)
data = [data_map[cast(uuid.UUID, run.reference_example_id)] for run in runs]
@@ -841,7 +856,7 @@ def _evaluate(
num_repetitions: int = 1,
client: Optional[langsmith.Client] = None,
blocking: bool = True,
experiment: Optional[schemas.TracerSession] = None,
experiment: Optional[Union[schemas.TracerSession, str, uuid.UUID]] = None,
) -> ExperimentResults:
# Initialize the experiment manager.
client = client or langsmith.Client()
@@ -903,14 +918,18 @@ def _load_experiment(


def _load_traces(
project: Union[str, uuid.UUID], client: langsmith.Client, load_nested: bool = False
project: Union[str, uuid.UUID, schemas.TracerSession],
client: langsmith.Client,
load_nested: bool = False,
) -> List[schemas.Run]:
"""Load nested traces for a given project."""
execution_order = None if load_nested else 1
if isinstance(project, uuid.UUID) or _is_uuid(project):
runs = client.list_runs(project_id=project, execution_order=execution_order)
is_root = None if load_nested else True
if isinstance(project, schemas.TracerSession):
runs = client.list_runs(project_id=project.id, is_root=is_root)
elif isinstance(project, uuid.UUID) or _is_uuid(project):
runs = client.list_runs(project_id=project, is_root=is_root)
else:
runs = client.list_runs(project_name=project, execution_order=execution_order)
runs = client.list_runs(project_name=project, is_root=is_root)
if not load_nested:
return list(runs)

@@ -1593,26 +1612,36 @@ def _ensure_traceable(


def _resolve_experiment(
experiment: Optional[schemas.TracerSession],
experiment: Optional[Union[schemas.TracerSession, str, uuid.UUID]],
runs: Optional[Iterable[schemas.Run]],
client: langsmith.Client,
) -> Tuple[
Optional[Union[schemas.TracerSession, str]], Optional[Iterable[schemas.Run]]
]:
# TODO: Remove this, handle outside the manager
if experiment is not None:
if not experiment.name:
if isinstance(experiment, schemas.TracerSession):
experiment_ = experiment
else:
experiment_ = _load_experiment(experiment, client)

if not experiment_.name:
raise ValueError("Experiment name must be defined if provided.")
return experiment, runs
if not experiment_.reference_dataset_id:
raise ValueError(
"Experiment must have an associated reference_dataset_id, "
"but none was provided."
)
return experiment_, runs
# If we have runs, that means the experiment was already started.
if runs is not None:
if runs is not None:
runs_, runs = itertools.tee(runs)
first_run = next(runs_)
experiment = client.read_project(project_id=first_run.session_id)
if not experiment.name:
experiment_ = client.read_project(project_id=first_run.session_id)
if not experiment_.name:
raise ValueError("Experiment name not found for provided runs.")
return experiment, runs
return experiment_, runs
return None, None


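Two guard clauses in the _runner.py changes are worth calling out: evaluate() raises a ValueError when both experiment and experiment_prefix are supplied, and _resolve_experiment() rejects an experiment that has no associated reference_dataset_id. Continuing the first sketch above (all names remain placeholders):

try:
    evaluate(
        predict,
        data="my-dataset",
        evaluators=[accuracy],
        experiment=results.experiment_name,
        experiment_prefix="my-prefix",  # conflicts with experiment=
    )
except ValueError as err:
    # "Expected at most one of 'experiment' or 'experiment_prefix', ..."
    print(err)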
2 changes: 1 addition & 1 deletion python/pyproject.toml
@@ -1,6 +1,6 @@
[tool.poetry]
name = "langsmith"
version = "0.1.119"
version = "0.1.120"
description = "Client library to connect to the LangSmith LLM Tracing and Evaluation Platform."
authors = ["LangChain <support@langchain.dev>"]
license = "MIT"
62 changes: 62 additions & 0 deletions python/tests/evaluation/test_evaluation.py
@@ -71,6 +71,37 @@ def predict(inputs: dict) -> dict:
for example in examples:
assert len([r for r in results if r["example"].id == example.id]) == 3

# Run it again with the existing project
results2 = evaluate(
predict,
data=dataset_name,
evaluators=[accuracy],
summary_evaluators=[precision],
experiment=results.experiment_name,
)
assert len(results2) == 10

# ... and again with the object
experiment = client.read_project(project_name=results.experiment_name)
results3 = evaluate(
predict,
data=dataset_name,
evaluators=[accuracy],
summary_evaluators=[precision],
experiment=experiment,
)
assert len(results3) == 10

# ... and again with the ID
results4 = evaluate(
predict,
data=dataset_name,
evaluators=[accuracy],
summary_evaluators=[precision],
experiment=str(experiment.id),
)
assert len(results4) == 10


async def test_aevaluate():
client = Client()
@@ -142,6 +173,37 @@ def check_run_count():
all_examples
), f"Expected {2 * len(all_examples)} runs, but got {len(final_runs)}"

# Run it again with the existing project
results2 = await aevaluate(
apredict,
data=dataset_name,
evaluators=[accuracy],
summary_evaluators=[precision],
experiment=results.experiment_name,
)
assert len(results2) == 10

# ... and again with the object
experiment = client.read_project(project_name=results.experiment_name)
results3 = await aevaluate(
apredict,
data=dataset_name,
evaluators=[accuracy],
summary_evaluators=[precision],
experiment=experiment,
)
assert len(results3) == 10

# ... and again with the ID
results4 = await aevaluate(
apredict,
data=dataset_name,
evaluators=[accuracy],
summary_evaluators=[precision],
experiment=str(experiment.id),
)
assert len(results4) == 10


@test
def test_foo():
