-
Notifications
You must be signed in to change notification settings - Fork 15.4k
Commit
This commit does not belong to any branch on this repository, and may belong to a fork outside of the repository.
Co-authored-by: lu-cashmoney <lucas.corley@gmail.com>
- Loading branch information
1 parent
eff5eed
commit 410bf37
Showing
4 changed files
with
311 additions
and
0 deletions.
There are no files selected for viewing
202 changes: 202 additions & 0 deletions
202
docs/modules/indexes/document_loaders/examples/bigquery.ipynb
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,202 @@ | ||
{ | ||
"cells": [ | ||
{ | ||
"cell_type": "markdown", | ||
"metadata": {}, | ||
"source": [ | ||
"# BigQuery Loader\n", | ||
"\n", | ||
"Load a BigQuery query with one document per row." | ||
] | ||
}, | ||
{ | ||
"cell_type": "code", | ||
"execution_count": 1, | ||
"metadata": {}, | ||
"outputs": [], | ||
"source": [ | ||
"from langchain.document_loaders import BigQueryLoader" | ||
] | ||
}, | ||
{ | ||
"cell_type": "code", | ||
"execution_count": 3, | ||
"metadata": {}, | ||
"outputs": [], | ||
"source": [ | ||
"BASE_QUERY = '''\n", | ||
"SELECT\n", | ||
" id,\n", | ||
" dna_sequence,\n", | ||
" organism\n", | ||
"FROM (\n", | ||
" SELECT\n", | ||
" ARRAY (\n", | ||
" SELECT\n", | ||
" AS STRUCT 1 AS id, \"ATTCGA\" AS dna_sequence, \"Lokiarchaeum sp. (strain GC14_75).\" AS organism\n", | ||
" UNION ALL\n", | ||
" SELECT\n", | ||
" AS STRUCT 2 AS id, \"AGGCGA\" AS dna_sequence, \"Heimdallarchaeota archaeon (strain LC_2).\" AS organism\n", | ||
" UNION ALL\n", | ||
" SELECT\n", | ||
" AS STRUCT 3 AS id, \"TCCGGA\" AS dna_sequence, \"Acidianus hospitalis (strain W1).\" AS organism) AS new_array),\n", | ||
" UNNEST(new_array)\n", | ||
"'''" | ||
] | ||
}, | ||
{ | ||
"cell_type": "markdown", | ||
"metadata": {}, | ||
"source": [ | ||
"## Basic Usage" | ||
] | ||
}, | ||
{ | ||
"cell_type": "code", | ||
"execution_count": 6, | ||
"metadata": {}, | ||
"outputs": [], | ||
"source": [ | ||
"loader = BigQueryLoader(BASE_QUERY)\n", | ||
"\n", | ||
"data = loader.load()" | ||
] | ||
}, | ||
{ | ||
"cell_type": "code", | ||
"execution_count": 7, | ||
"metadata": {}, | ||
"outputs": [ | ||
{ | ||
"name": "stdout", | ||
"output_type": "stream", | ||
"text": [ | ||
"[Document(page_content='id: 1\\ndna_sequence: ATTCGA\\norganism: Lokiarchaeum sp. (strain GC14_75).', lookup_str='', metadata={}, lookup_index=0), Document(page_content='id: 2\\ndna_sequence: AGGCGA\\norganism: Heimdallarchaeota archaeon (strain LC_2).', lookup_str='', metadata={}, lookup_index=0), Document(page_content='id: 3\\ndna_sequence: TCCGGA\\norganism: Acidianus hospitalis (strain W1).', lookup_str='', metadata={}, lookup_index=0)]\n" | ||
] | ||
} | ||
], | ||
"source": [ | ||
"print(data)" | ||
] | ||
}, | ||
{ | ||
"cell_type": "markdown", | ||
"metadata": {}, | ||
"source": [ | ||
"## Specifying Which Columns are Content vs Metadata" | ||
] | ||
}, | ||
{ | ||
"cell_type": "code", | ||
"execution_count": 8, | ||
"metadata": {}, | ||
"outputs": [], | ||
"source": [ | ||
"loader = BigQueryLoader(BASE_QUERY, page_content_columns=[\"dna_sequence\", \"organism\"], metadata_columns=[\"id\"])\n", | ||
"\n", | ||
"data = loader.load()" | ||
] | ||
}, | ||
{ | ||
"cell_type": "code", | ||
"execution_count": 9, | ||
"metadata": {}, | ||
"outputs": [ | ||
{ | ||
"name": "stdout", | ||
"output_type": "stream", | ||
"text": [ | ||
"[Document(page_content='dna_sequence: ATTCGA\\norganism: Lokiarchaeum sp. (strain GC14_75).', lookup_str='', metadata={'id': 1}, lookup_index=0), Document(page_content='dna_sequence: AGGCGA\\norganism: Heimdallarchaeota archaeon (strain LC_2).', lookup_str='', metadata={'id': 2}, lookup_index=0), Document(page_content='dna_sequence: TCCGGA\\norganism: Acidianus hospitalis (strain W1).', lookup_str='', metadata={'id': 3}, lookup_index=0)]\n" | ||
] | ||
} | ||
], | ||
"source": [ | ||
"print(data)" | ||
] | ||
}, | ||
{ | ||
"cell_type": "markdown", | ||
"metadata": {}, | ||
"source": [ | ||
"## Adding Source to Metadata" | ||
] | ||
}, | ||
{ | ||
"cell_type": "code", | ||
"execution_count": 18, | ||
"metadata": {}, | ||
"outputs": [], | ||
"source": [ | ||
"# Note that the `id` column is being returned twice, with one instance aliased as `source`\n", | ||
"ALIASED_QUERY = '''\n", | ||
"SELECT\n", | ||
" id,\n", | ||
" dna_sequence,\n", | ||
" organism,\n", | ||
" id as source\n", | ||
"FROM (\n", | ||
" SELECT\n", | ||
" ARRAY (\n", | ||
" SELECT\n", | ||
" AS STRUCT 1 AS id, \"ATTCGA\" AS dna_sequence, \"Lokiarchaeum sp. (strain GC14_75).\" AS organism\n", | ||
" UNION ALL\n", | ||
" SELECT\n", | ||
" AS STRUCT 2 AS id, \"AGGCGA\" AS dna_sequence, \"Heimdallarchaeota archaeon (strain LC_2).\" AS organism\n", | ||
" UNION ALL\n", | ||
" SELECT\n", | ||
" AS STRUCT 3 AS id, \"TCCGGA\" AS dna_sequence, \"Acidianus hospitalis (strain W1).\" AS organism) AS new_array),\n", | ||
" UNNEST(new_array)\n", | ||
"'''" | ||
] | ||
}, | ||
{ | ||
"cell_type": "code", | ||
"execution_count": 19, | ||
"metadata": {}, | ||
"outputs": [], | ||
"source": [ | ||
"loader = BigQueryLoader(ALIASED_QUERY, metadata_columns=[\"source\"])\n", | ||
"\n", | ||
"data = loader.load()" | ||
] | ||
}, | ||
{ | ||
"cell_type": "code", | ||
"execution_count": 20, | ||
"metadata": {}, | ||
"outputs": [ | ||
{ | ||
"name": "stdout", | ||
"output_type": "stream", | ||
"text": [ | ||
"[Document(page_content='id: 1\\ndna_sequence: ATTCGA\\norganism: Lokiarchaeum sp. (strain GC14_75).\\nsource: 1', lookup_str='', metadata={'source': 1}, lookup_index=0), Document(page_content='id: 2\\ndna_sequence: AGGCGA\\norganism: Heimdallarchaeota archaeon (strain LC_2).\\nsource: 2', lookup_str='', metadata={'source': 2}, lookup_index=0), Document(page_content='id: 3\\ndna_sequence: TCCGGA\\norganism: Acidianus hospitalis (strain W1).\\nsource: 3', lookup_str='', metadata={'source': 3}, lookup_index=0)]\n" | ||
] | ||
} | ||
], | ||
"source": [ | ||
"print(data)" | ||
] | ||
} | ||
], | ||
"metadata": { | ||
"kernelspec": { | ||
"display_name": "Python 3 (ipykernel)", | ||
"language": "python", | ||
"name": "python3" | ||
}, | ||
"language_info": { | ||
"codemirror_mode": { | ||
"name": "ipython", | ||
"version": 3 | ||
}, | ||
"file_extension": ".py", | ||
"mimetype": "text/x-python", | ||
"name": "python", | ||
"nbconvert_exporter": "python", | ||
"pygments_lexer": "ipython3", | ||
"version": "3.9.1" | ||
} | ||
}, | ||
"nbformat": 4, | ||
"nbformat_minor": 2 | ||
} |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,57 @@ | ||
from typing import List, Optional | ||
|
||
from langchain.docstore.document import Document | ||
from langchain.document_loaders.base import BaseLoader | ||
|
||
|
||
class BigQueryLoader(BaseLoader):
    """Run a BigQuery query and turn every result row into a ``Document``.

    Columns named in ``page_content_columns`` are rendered as ``name: value``
    lines in the document's ``page_content``; columns named in
    ``metadata_columns`` are copied into its ``metadata`` dict. When neither
    is given, every column goes into ``page_content`` and ``metadata`` is
    left empty.
    """

    def __init__(
        self,
        query: str,
        project: Optional[str] = None,
        page_content_columns: Optional[List[str]] = None,
        metadata_columns: Optional[List[str]] = None,
    ):
        """Remember the query and column selection; nothing is executed here.

        Args:
            query: Query text handed to the BigQuery client by ``load``.
            project: Passed as the first argument to ``bigquery.Client``.
            page_content_columns: Columns written into ``page_content``;
                ``None`` selects every column of the result schema.
            metadata_columns: Columns written into ``metadata``; ``None``
                selects no columns.
        """
        self.query = query
        self.project = project
        self.page_content_columns = page_content_columns
        self.metadata_columns = metadata_columns

    def load(self) -> List[Document]:
        """Execute the stored query and build one document per result row.

        Returns:
            A list of documents in the order the rows are yielded by the
            query result. Within a document, content lines follow the order
            of ``row.items()``, not the order of ``page_content_columns``.

        Raises:
            ValueError: If ``google.cloud.bigquery`` cannot be imported.
        """
        # Imported lazily so the dependency is only needed when loading.
        try:
            from google.cloud import bigquery
        except ImportError as ex:
            raise ValueError(
                "Could not import google-cloud-bigquery python package. "
                "Please install it with `pip install google-cloud-bigquery`."
            ) from ex

        client = bigquery.Client(self.project)
        rows = client.query(self.query).result()

        # Default selection: all schema columns as content, none as metadata.
        content_names = self.page_content_columns
        if content_names is None:
            content_names = [field.name for field in rows.schema]
        metadata_names = self.metadata_columns
        if metadata_names is None:
            metadata_names = []

        documents: List[Document] = []
        for record in rows:
            content_lines = []
            extra = {}
            # A column may appear in both selections, so test each one
            # independently rather than with if/elif.
            for name, value in record.items():
                if name in content_names:
                    content_lines.append(f"{name}: {value}")
                if name in metadata_names:
                    extra[name] = value
            documents.append(
                Document(page_content="\n".join(content_lines), metadata=extra)
            )

        return documents
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,50 @@ | ||
import pytest | ||
|
||
from langchain.document_loaders.bigquery import BigQueryLoader | ||
|
||
# Probe for the optional BigQuery client so the tests below can be skipped
# when it is not available.
try:
    from google.cloud import bigquery  # noqa: F401
except ImportError:
    bigquery_installed = False
else:
    bigquery_installed = True
|
||
|
||
@pytest.mark.skipif(not bigquery_installed, reason="bigquery not installed")
def test_bigquery_loader_no_options() -> None:
    """With no column options, every column lands in ``page_content``."""
    documents = BigQueryLoader("SELECT 1 AS a, 2 AS b").load()

    assert len(documents) == 1
    only_doc = documents[0]
    assert only_doc.page_content == "a: 1\nb: 2"
    assert only_doc.metadata == {}
|
||
|
||
@pytest.mark.skipif(not bigquery_installed, reason="bigquery not installed")
def test_bigquery_loader_page_content_columns() -> None:
    """Only columns named in ``page_content_columns`` appear in the content."""
    two_row_query = "SELECT 1 AS a, 2 AS b UNION ALL SELECT 3 AS a, 4 AS b"
    documents = BigQueryLoader(two_row_query, page_content_columns=["a"]).load()

    assert len(documents) == 2

    first_doc = documents[0]
    assert first_doc.page_content == "a: 1"
    assert first_doc.metadata == {}

    second_doc = documents[1]
    assert second_doc.page_content == "a: 3"
    assert second_doc.metadata == {}
|
||
|
||
@pytest.mark.skipif(not bigquery_installed, reason="bigquery not installed")
def test_bigquery_loader_metadata_columns() -> None:
    """Columns named in ``metadata_columns`` are copied into ``metadata``."""
    documents = BigQueryLoader(
        "SELECT 1 AS a, 2 AS b",
        page_content_columns=["a"],
        metadata_columns=["b"],
    ).load()

    assert len(documents) == 1
    only_doc = documents[0]
    assert only_doc.page_content == "a: 1"
    assert only_doc.metadata == {"b": 2}