-
Notifications
You must be signed in to change notification settings - Fork 15.2k
Commit
This commit does not belong to any branch on this repository, and may belong to a fork outside of the repository.
Works just like the GenericLoader, but loads and parses documents concurrently, for those who want to optimize their workflow. @rlancemartin @eyurtsev --------- Co-authored-by: Harrison Chase <hw.chase.17@gmail.com>
- Loading branch information
Showing
4 changed files
with
211 additions
and
0 deletions.
There are no files selected for viewing
94 changes: 94 additions & 0 deletions
94
docs/extras/integrations/document_loaders/concurrent.ipynb
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,94 @@ | ||
{ | ||
"cells": [ | ||
{ | ||
"cell_type": "markdown", | ||
"id": "23c6e167", | ||
"metadata": {}, | ||
"source": [ | ||
"# Concurrent Loader\n", | ||
"\n", | ||
"Works just like the GenericLoader but concurrently for those who choose to optimize their workflow.\n" | ||
] | ||
}, | ||
{ | ||
"cell_type": "code", | ||
"execution_count": 3, | ||
"id": "6ff3fb1f", | ||
"metadata": {}, | ||
"outputs": [], | ||
"source": [ | ||
"from langchain.document_loaders import ConcurrentLoader" | ||
] | ||
}, | ||
{ | ||
"cell_type": "code", | ||
"execution_count": 8, | ||
"id": "ce96fa20", | ||
"metadata": {}, | ||
"outputs": [], | ||
"source": [ | ||
"loader = ConcurrentLoader.from_filesystem('example_data/', glob=\"**/*.txt\")" | ||
] | ||
}, | ||
{ | ||
"cell_type": "code", | ||
"execution_count": 9, | ||
"id": "06a6cf5d", | ||
"metadata": {}, | ||
"outputs": [], | ||
"source": [ | ||
"files = loader.load()" | ||
] | ||
}, | ||
{ | ||
"cell_type": "code", | ||
"execution_count": 12, | ||
"id": "b87d3e58", | ||
"metadata": {}, | ||
"outputs": [ | ||
{ | ||
"data": { | ||
"text/plain": [ | ||
"2" | ||
] | ||
}, | ||
"execution_count": 12, | ||
"metadata": {}, | ||
"output_type": "execute_result" | ||
} | ||
], | ||
"source": [ | ||
"len(files)" | ||
] | ||
}, | ||
{ | ||
"cell_type": "code", | ||
"execution_count": null, | ||
"id": "668f1ee5", | ||
"metadata": {}, | ||
"outputs": [], | ||
"source": [] | ||
} | ||
], | ||
"metadata": { | ||
"kernelspec": { | ||
"display_name": "Python 3 (ipykernel)", | ||
"language": "python", | ||
"name": "python3" | ||
}, | ||
"language_info": { | ||
"codemirror_mode": { | ||
"name": "ipython", | ||
"version": 3 | ||
}, | ||
"file_extension": ".py", | ||
"mimetype": "text/x-python", | ||
"name": "python", | ||
"nbconvert_exporter": "python", | ||
"pygments_lexer": "ipython3", | ||
"version": "3.9.1" | ||
} | ||
}, | ||
"nbformat": 4, | ||
"nbformat_minor": 5 | ||
} |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,65 @@ | ||
from __future__ import annotations | ||
|
||
import concurrent.futures | ||
from pathlib import Path | ||
from typing import Iterator, Literal, Optional, Sequence, Union | ||
|
||
from langchain.document_loaders.base import BaseBlobParser | ||
from langchain.document_loaders.blob_loaders import BlobLoader, FileSystemBlobLoader | ||
from langchain.document_loaders.generic import GenericLoader | ||
from langchain.document_loaders.parsers.registry import get_parser | ||
from langchain.schema import Document | ||
|
||
_PathLike = Union[str, Path] | ||
|
||
DEFAULT = Literal["default"] | ||
|
||
|
||
class ConcurrentLoader(GenericLoader):
    """A generic document loader that loads and parses documents concurrently.

    Every blob yielded by the blob loader is handed to a thread pool, and each
    worker runs the blob parser on one blob. Documents are yielded as blobs
    finish parsing, so their order is not guaranteed to match the order in
    which the blob loader produced the blobs.
    """

    def __init__(
        self, blob_loader: BlobLoader, blob_parser: BaseBlobParser, num_workers: int = 4
    ) -> None:
        """Initialize the loader.

        Args:
            blob_loader: Source of the blobs to parse.
            blob_parser: Parser applied to every blob.
            num_workers: Maximum number of worker threads used for parsing.
        """
        super().__init__(blob_loader, blob_parser)
        self.num_workers = num_workers

    def lazy_load(
        self,
    ) -> Iterator[Document]:
        """Load documents lazily with concurrent parsing.

        All blobs are submitted to the pool up front; the documents of each
        blob are yielded as soon as that blob has been parsed.

        Yields:
            The documents produced by the blob parser, grouped per blob, in
            completion order.

        Raises:
            Whatever the blob parser raises for a blob; the exception is
            re-raised here when that blob's result is collected.
        """
        parse = self.blob_parser.lazy_parse
        with concurrent.futures.ThreadPoolExecutor(
            max_workers=self.num_workers
        ) as executor:
            # ``lazy_parse`` returns an iterator and is usually a generator
            # function: calling it only creates the generator, it does not
            # parse anything. Draining it with ``list`` inside the worker is
            # what makes the parsing itself run on the pool rather than in
            # the thread that consumes this generator.
            futures = {
                executor.submit(lambda b: list(parse(b)), blob)
                for blob in self.blob_loader.yield_blobs()
            }
            for future in concurrent.futures.as_completed(futures):
                yield from future.result()

    @classmethod
    def from_filesystem(
        cls,
        path: _PathLike,
        *,
        glob: str = "**/[!.]*",
        suffixes: Optional[Sequence[str]] = None,
        show_progress: bool = False,
        parser: Union[DEFAULT, BaseBlobParser] = "default",
        num_workers: int = 4,
    ) -> ConcurrentLoader:
        """Create a concurrent generic document loader using a filesystem blob loader.

        Args:
            path: Location handed to ``FileSystemBlobLoader``.
            glob: Glob pattern forwarded to ``FileSystemBlobLoader``.
            suffixes: Suffixes forwarded to ``FileSystemBlobLoader``.
            show_progress: Progress flag forwarded to ``FileSystemBlobLoader``.
            parser: Either a ready-made blob parser, or a string name that is
                resolved through ``get_parser``.
            num_workers: Maximum number of worker threads used for parsing.

        Returns:
            A loader of type ``cls`` wired to the filesystem blob loader.
        """
        blob_loader = FileSystemBlobLoader(
            path, glob=glob, suffixes=suffixes, show_progress=show_progress
        )
        if isinstance(parser, str):
            # A string names a registered parser; look it up.
            blob_parser = get_parser(parser)
        else:
            blob_parser = parser
        return cls(blob_loader, blob_parser, num_workers)
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters