Skip to content

Commit

Permalink
Harrison/unstructured io (langchain-ai#1200)
Browse files: browse the repository at this point in the history
  • Loading branch information
hwchase17 authored and zachschillaci27 committed Mar 8, 2023
1 parent f2628ba commit 0ba7d77
Show file tree
Hide file tree
Showing 3 changed files with 51 additions and 10 deletions.
2 changes: 1 addition & 1 deletion docs/modules/indexes/chain_examples/chat_vector_db.ipynb
Original file line number Diff line number Diff line change
Expand Up @@ -225,7 +225,7 @@
},
{
"cell_type": "code",
"execution_count": 14,
"execution_count": 5,
"id": "562769c6",
"metadata": {},
"outputs": [],
Expand Down
6 changes: 5 additions & 1 deletion langchain/document_loaders/__init__.py
Original file line number Diff line number Diff line change
Expand Up @@ -28,13 +28,17 @@
from langchain.document_loaders.srt import SRTLoader
from langchain.document_loaders.telegram import TelegramChatLoader
from langchain.document_loaders.text import TextLoader
from langchain.document_loaders.unstructured import UnstructuredFileLoader
from langchain.document_loaders.unstructured import (
UnstructuredFileIOLoader,
UnstructuredFileLoader,
)
from langchain.document_loaders.url import UnstructuredURLLoader
from langchain.document_loaders.web_base import WebBaseLoader
from langchain.document_loaders.youtube import YoutubeLoader

__all__ = [
"UnstructuredFileLoader",
"UnstructuredFileIOLoader",
"UnstructuredURLLoader",
"DirectoryLoader",
"NotionDirectoryLoader",
Expand Down
53 changes: 45 additions & 8 deletions langchain/document_loaders/unstructured.py
Original file line number Diff line number Diff line change
@@ -1,14 +1,15 @@
"""Loader that uses unstructured to load files."""
from typing import List
from abc import ABC, abstractmethod
from typing import IO, List

from langchain.docstore.document import Document
from langchain.document_loaders.base import BaseLoader


class UnstructuredFileLoader(BaseLoader):
class UnstructuredBaseLoader(BaseLoader, ABC):
"""Loader that uses unstructured to load files."""

def __init__(self, file_path: str, mode: str = "single"):
def __init__(self, mode: str = "single"):
"""Initialize with file path."""
try:
import unstructured # noqa:F401
Expand All @@ -22,21 +23,23 @@ def __init__(self, file_path: str, mode: str = "single"):
raise ValueError(
f"Got {mode} for `mode`, but should be one of `{_valid_modes}`"
)
self.file_path = file_path
self.mode = mode

@abstractmethod
def _get_elements(self) -> List:
from unstructured.partition.auto import partition
"""Get elements."""

return partition(filename=self.file_path)
@abstractmethod
def _get_metadata(self) -> dict:
"""Get metadata."""

def load(self) -> List[Document]:
"""Load file."""
elements = self._get_elements()
if self.mode == "elements":
docs: List[Document] = list()
for element in elements:
metadata = {"source": self.file_path}
metadata = self._get_metadata()
# NOTE(MthwRobinson) - the attribute check is for backward compatibility
# with unstructured<0.4.9. The metadata attributed was added in 0.4.9.
if hasattr(element, "metadata"):
Expand All @@ -45,9 +48,43 @@ def load(self) -> List[Document]:
metadata["category"] = element.category
docs.append(Document(page_content=str(element), metadata=metadata))
elif self.mode == "single":
metadata = {"source": self.file_path}
metadata = self._get_metadata()
text = "\n\n".join([str(el) for el in elements])
docs = [Document(page_content=text, metadata=metadata)]
else:
raise ValueError(f"mode of {self.mode} not supported.")
return docs


class UnstructuredFileLoader(UnstructuredBaseLoader):
    """Unstructured-backed loader that reads a document from a path on disk."""

    def __init__(self, file_path: str, mode: str = "single"):
        """Remember the path to load, then let the base class handle ``mode``.

        Args:
            file_path: Path of the file handed to ``partition`` as ``filename``.
            mode: Loading mode forwarded unchanged to the base loader.
        """
        self.file_path = file_path
        super().__init__(mode=mode)

    def _get_elements(self) -> List:
        """Partition the file at ``self.file_path`` and return its elements."""
        # Imported lazily so the module can be imported without `unstructured`.
        from unstructured.partition.auto import partition

        elements = partition(filename=self.file_path)
        return elements

    def _get_metadata(self) -> dict:
        """Return a fresh metadata dict recording the file path as the source."""
        metadata = {"source": self.file_path}
        return metadata


class UnstructuredFileIOLoader(UnstructuredBaseLoader):
    """Loader that uses unstructured to load file IO objects."""

    def __init__(self, file: IO, mode: str = "single"):
        """Initialize with a file object (not a file path).

        Args:
            file: File object passed to ``partition`` as its ``file`` argument.
            mode: Loading mode forwarded unchanged to the base loader.
        """
        self.file = file
        super().__init__(mode=mode)

    def _get_elements(self) -> List:
        """Partition the stored file object and return the resulting elements."""
        # Imported lazily so the module can be imported without `unstructured`.
        from unstructured.partition.auto import partition

        return partition(file=self.file)

    def _get_metadata(self) -> dict:
        """Return an empty metadata dict.

        Unlike the path-based loader, no "source" entry is recorded here.
        """
        return {}

0 comments on commit 0ba7d77

Please sign in to comment.