From b775912fb0720d3f7d79fb8e0407d31424a832f0 Mon Sep 17 00:00:00 2001
From: SN <6432132+samnoyes@users.noreply.github.com>
Date: Thu, 22 Aug 2024 13:45:52 -0700
Subject: [PATCH] allow filtering datasets by metadata

---
 js/package.json                               |  2 +-
 js/src/client.ts                              |  8 ++++++++
 js/src/index.ts                               |  2 +-
 js/src/tests/client.int.test.ts               |  7 +++++++
 python/langsmith/client.py                    |  7 +++++++
 python/pyproject.toml                         |  2 +-
 python/tests/integration_tests/test_client.py | 11 ++++++++++-
 7 files changed, 35 insertions(+), 4 deletions(-)

diff --git a/js/package.json b/js/package.json
index 9c10136d3..b0c1d8fc4 100644
--- a/js/package.json
+++ b/js/package.json
@@ -1,6 +1,6 @@
 {
   "name": "langsmith",
-  "version": "0.1.44",
+  "version": "0.1.45",
   "description": "Client library to connect to the LangSmith LLM Tracing and Evaluation Platform.",
   "packageManager": "yarn@1.22.19",
   "files": [
diff --git a/js/src/client.ts b/js/src/client.ts
index 6f39194b4..a0cc62510 100644
--- a/js/src/client.ts
+++ b/js/src/client.ts
@@ -1910,16 +1910,19 @@ export class Client {
       dataType,
       inputsSchema,
       outputsSchema,
+      metadata,
     }: {
       description?: string;
       dataType?: DataType;
       inputsSchema?: KVMap;
       outputsSchema?: KVMap;
+      metadata?: RecordStringAny;
     } = {}
   ): Promise<Dataset> {
     const body: KVMap = {
       name,
       description,
+      extra: metadata ? { metadata } : undefined,
     };
     if (dataType) {
       body.data_type = dataType;
@@ -2065,12 +2068,14 @@ export class Client {
     datasetIds,
     datasetName,
     datasetNameContains,
+    metadata,
   }: {
     limit?: number;
     offset?: number;
     datasetIds?: string[];
     datasetName?: string;
     datasetNameContains?: string;
+    metadata?: RecordStringAny;
   } = {}): AsyncIterable<Dataset> {
     const path = "/datasets";
     const params = new URLSearchParams({
@@ -2088,6 +2093,9 @@ export class Client {
     if (datasetNameContains !== undefined) {
       params.append("name_contains", datasetNameContains);
     }
+    if (metadata !== undefined) {
+      params.append("metadata", JSON.stringify(metadata));
+    }
     for await (const datasets of this._getPaginated<Dataset>(path, params)) {
       yield* datasets;
     }
diff --git a/js/src/index.ts b/js/src/index.ts
index c9c91bb3a..84c7000a1 100644
--- a/js/src/index.ts
+++ b/js/src/index.ts
@@ -12,4 +12,4 @@ export type {
 export { RunTree, type RunTreeConfig } from "./run_trees.js";
 
 // Update using yarn bump-version
-export const __version__ = "0.1.44";
+export const __version__ = "0.1.45";
diff --git a/js/src/tests/client.int.test.ts b/js/src/tests/client.int.test.ts
index 1608079af..e9343a238 100644
--- a/js/src/tests/client.int.test.ts
+++ b/js/src/tests/client.int.test.ts
@@ -183,6 +183,7 @@ test.concurrent(
     });
     const dataset = await langchainClient.createDataset(datasetName, {
       dataType: "llm",
+      metadata: { key: "valuefoo" },
     });
     await langchainClient.createExample(
       { input: "hello world" },
@@ -193,6 +194,12 @@ test.concurrent(
     );
     const loadedDataset = await langchainClient.readDataset({ datasetName });
     expect(loadedDataset.data_type).toEqual("llm");
+
+    const datasetsByMetadata = await toArray(
+      langchainClient.listDatasets({ metadata: { key: "valuefoo" } })
+    );
+    expect(datasetsByMetadata.length).toEqual(1);
+    expect(datasetsByMetadata.map((d) => d.id)).toContain(dataset.id);
     await langchainClient.deleteDataset({ datasetName });
   },
   180_000
diff --git a/python/langsmith/client.py b/python/langsmith/client.py
index 82fff681c..3d6e7f134 100644
--- a/python/langsmith/client.py
+++ b/python/langsmith/client.py
@@ -2504,6 +2504,7 @@ def create_dataset(
         data_type: ls_schemas.DataType = ls_schemas.DataType.kv,
         inputs_schema: Optional[Dict[str, Any]] = None,
         outputs_schema: Optional[Dict[str, Any]] = None,
+        metadata: Optional[dict] = None,
     ) -> ls_schemas.Dataset:
         """Create a dataset in the LangSmith API.
 
@@ -2515,6 +2516,8 @@ def create_dataset(
             The description of the dataset.
         data_type : DataType or None, default=DataType.kv
             The data type of the dataset.
+        metadata: dict or None, default=None
+            Additional metadata to associate with the dataset.
 
         Returns:
         -------
@@ -2525,6 +2528,7 @@ def create_dataset(
             "name": dataset_name,
             "data_type": data_type.value,
             "created_at": datetime.datetime.now().isoformat(),
+            "extra": {"metadata": metadata} if metadata else None,
         }
         if description is not None:
             dataset["description"] = description
@@ -2737,6 +2741,7 @@ def list_datasets(
         data_type: Optional[str] = None,
         dataset_name: Optional[str] = None,
         dataset_name_contains: Optional[str] = None,
+        metadata: Optional[Dict[str, Any]] = None,
         limit: Optional[int] = None,
     ) -> Iterator[ls_schemas.Dataset]:
         """List the datasets on the LangSmith API.
@@ -2757,6 +2762,8 @@ def list_datasets(
             params["name"] = dataset_name
         if dataset_name_contains is not None:
             params["name_contains"] = dataset_name_contains
+        if metadata is not None:
+            params["metadata"] = json.dumps(metadata)
         for i, dataset in enumerate(
             self._get_paginated_list("/datasets", params=params)
         ):
diff --git a/python/pyproject.toml b/python/pyproject.toml
index 54cb2be9e..4f4ef8a5f 100644
--- a/python/pyproject.toml
+++ b/python/pyproject.toml
@@ -1,6 +1,6 @@
 [tool.poetry]
 name = "langsmith"
-version = "0.1.102"
+version = "0.1.103"
 description = "Client library to connect to the LangSmith LLM Tracing and Evaluation Platform."
 authors = ["LangChain <support@langchain.dev>"]
 license = "MIT"
diff --git a/python/tests/integration_tests/test_client.py b/python/tests/integration_tests/test_client.py
index bd7b583b5..57dffd963 100644
--- a/python/tests/integration_tests/test_client.py
+++ b/python/tests/integration_tests/test_client.py
@@ -461,7 +461,9 @@ def test_list_datasets(langchain_client: Client) -> None:
     ds1n = "__test_list_datasets1" + uuid4().hex[:4]
     ds2n = "__test_list_datasets2" + uuid4().hex[:4]
     try:
-        dataset1 = langchain_client.create_dataset(ds1n, data_type=DataType.llm)
+        dataset1 = langchain_client.create_dataset(
+            ds1n, data_type=DataType.llm, metadata={"foo": "barqux"}
+        )
         dataset2 = langchain_client.create_dataset(ds2n, data_type=DataType.kv)
         assert dataset1.url is not None
         assert dataset2.url is not None
@@ -484,6 +486,13 @@ def test_list_datasets(langchain_client: Client) -> None:
             )
         )
         assert len(datasets) == 1
+        # Sub-filter on metadata
+        datasets = list(
+            langchain_client.list_datasets(
+                dataset_ids=[dataset1.id, dataset2.id], metadata={"foo": "barqux"}
+            )
+        )
+        assert len(datasets) == 1
     finally:
         # Delete datasets
         for name in [ds1n, ds2n]:
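
Usage sketch (reviewer note, not part of the patch): a minimal Python example of the new `metadata` parameter, assuming LANGSMITH_API_KEY is set in the environment and langsmith >= 0.1.103 is installed; the dataset name and metadata values below are hypothetical.

    from langsmith import Client

    client = Client()

    # Metadata passed at creation time is stored under the dataset's
    # extra["metadata"] field (see the create_dataset change above).
    dataset = client.create_dataset(
        "metadata-filter-demo",  # hypothetical dataset name
        description="Dataset used to demo metadata filtering.",
        metadata={"team": "search", "env": "staging"},
    )

    # list_datasets JSON-encodes the dict and forwards it as the
    # `metadata` query parameter; matching happens server-side.
    for ds in client.list_datasets(metadata={"team": "search"}):
        print(ds.id, ds.name)

The JS client mirrors this via `client.createDataset(name, { metadata })` and `client.listDatasets({ metadata })`, as exercised in the integration test above.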