Harrison/unstructured io #1200

Merged: 8 commits, Feb 21, 2023
2 changes: 1 addition & 1 deletion docs/ecosystem/runhouse.md
@@ -26,6 +26,6 @@ the `SelfHostedEmbedding` class.
from langchain.llms import SelfHostedPipeline, SelfHostedHuggingFaceLLM
```

For a more detailed walkthrough of the Self-hosted Embeddings, see [this notebook](../modules/utils/combine_docs_examples/embeddings.ipynb)
For a more detailed walkthrough of the Self-hosted Embeddings, see [this notebook](../modules/indexes/examples/embeddings.ipynb)

##
4 changes: 2 additions & 2 deletions docs/modules/chains/async_chain.ipynb
@@ -9,7 +9,7 @@
"\n",
"LangChain provides async support for Chains by leveraging the [asyncio](https://docs.python.org/3/library/asyncio.html) library.\n",
"\n",
"Async methods are currently supported in `LLMChain` (through `arun`, `apredict`, `acall`) and `LLMMathChain` (through `arun` and `acall`), `ChatVectorDBChain`, and [QA chains](https://langchain.readthedocs.io/en/latest/modules/chains/combine_docs_examples/question_answering.html). Async support for other chains is on the roadmap."
"Async methods are currently supported in `LLMChain` (through `arun`, `apredict`, `acall`) and `LLMMathChain` (through `arun` and `acall`), `ChatVectorDBChain`, and [QA chains](../indexes/chain_examples/question_answering.html). Async support for other chains is on the roadmap."
]
},
{
@@ -124,7 +124,7 @@
"name": "python",
"nbconvert_exporter": "python",
"pygments_lexer": "ipython3",
"version": "3.10.9"
"version": "3.9.1"
}
},
"nbformat": 4,
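For readers skimming this diff, a minimal sketch of the async usage the notebook text describes. The `OpenAI` model, the prompt, and reliance on an `OPENAI_API_KEY` environment variable are illustrative assumptions, not part of this PR:

```python
import asyncio

from langchain.chains import LLMChain
from langchain.llms import OpenAI
from langchain.prompts import PromptTemplate


async def main() -> None:
    # Assumes OPENAI_API_KEY is set in the environment.
    llm = OpenAI(temperature=0.9)
    prompt = PromptTemplate(
        input_variables=["product"],
        template="What is a good name for a company that makes {product}?",
    )
    chain = LLMChain(llm=llm, prompt=prompt)
    # `arun` is the async counterpart of `run`; `apredict` and `acall`
    # likewise mirror `predict` and `__call__`.
    name = await chain.arun(product="colorful socks")
    print(name)


asyncio.run(main())
```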
26 changes: 16 additions & 10 deletions docs/modules/document_loaders/examples/gitbook.ipynb
@@ -1,7 +1,6 @@
{
"cells": [
{
"attachments": {},
"cell_type": "markdown",
"id": "4babfba5",
"metadata": {},
@@ -31,7 +30,6 @@
]
},
{
"attachments": {},
"cell_type": "markdown",
"id": "65d5ddce",
"metadata": {},
@@ -71,18 +69,17 @@
]
},
{
"attachments": {},
"cell_type": "markdown",
"id": "c325048c",
"metadata": {},
"source": [
"### Load from all paths in a given GitBook\n",
"For this to work, the GitbookLoader needs to be initialized with the root path (`https://docs.gitbook.com` in this example)."
"For this to work, the GitbookLoader needs to be initialized with the root path (`https://docs.gitbook.com` in this example) and have `load_all_paths` set to `True`."
]
},
{
"cell_type": "code",
"execution_count": 5,
"execution_count": 6,
"id": "938ff4ee",
"metadata": {},
"outputs": [
@@ -122,12 +119,13 @@
}
],
"source": [
"all_pages_data = loader.load_from_all_paths()"
"loader = GitbookLoader(\"https://docs.gitbook.com\", load_all_paths=True)\n",
"all_pages_data = loader.load()"
]
},
{
"cell_type": "code",
"execution_count": 6,
"execution_count": 7,
"id": "db92fc39",
"metadata": {},
"outputs": [
@@ -144,7 +142,7 @@
"Document(page_content=\"Import\\nFind out how to easily migrate your existing documentation and which formats are supported.\\nThe import function allows you to migrate and unify existing documentation in GitBook. You can choose to import single or multiple pages although limits apply. \\nPermissions\\nAll members with editor permission or above can use the import feature.\\nSupported formats\\nGitBook supports imports from websites or files that are:\\nMarkdown (.md or .markdown)\\nHTML (.html)\\nMicrosoft Word (.docx).\\nWe also support import from:\\nConfluence\\nNotion\\nGitHub Wiki\\nQuip\\nDropbox Paper\\nGoogle Docs\\nYou can also upload a ZIP\\n \\ncontaining HTML or Markdown files when \\nimporting multiple pages.\\nNote: this feature is in beta.\\nFeel free to suggest import sources we don't support yet and \\nlet us know\\n if you have any issues.\\nImport panel\\nWhen you create a new space, you'll have the option to import content straight away:\\nThe new page menu\\nImport a page or subpage by selecting \\nImport Page\\n from the New Page menu, or \\nImport Subpage\\n in the page action menu, found in the table of contents:\\nImport from the page action menu\\nWhen you choose your input source, instructions will explain how to proceed.\\nAlthough GitBook supports importing content from different kinds of sources, the end result might be different from your source due to differences in product features and document format.\\nLimits\\nGitBook currently has the following limits for imported content:\\nThe maximum number of pages that can be uploaded in a single import is \\n20.\\nThe maximum number of files (images etc.) that can be uploaded in a single import is \\n20.\\nGetting started - \\nPrevious\\nOverview\\nNext\\n - Getting started\\nGit Sync\\nLast modified \\n4mo ago\", lookup_str='', metadata={'source': 'https://docs.gitbook.com/getting-started/import', 'title': 'Import'}, lookup_index=0)"
]
},
"execution_count": 6,
"execution_count": 7,
"metadata": {},
"output_type": "execute_result"
}
@@ -154,11 +152,19 @@
"# show second document\n",
"all_pages_data[2]"
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "92cb3eda",
"metadata": {},
"outputs": [],
"source": []
}
],
"metadata": {
"kernelspec": {
"display_name": ".venv",
"display_name": "Python 3 (ipykernel)",
"language": "python",
"name": "python3"
},
Expand All @@ -172,7 +178,7 @@
"name": "python",
"nbconvert_exporter": "python",
"pygments_lexer": "ipython3",
"version": "3.10.9"
"version": "3.9.1"
},
"vscode": {
"interpreter": {
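As a usage sketch of what the updated notebook documents: default single-page loading versus crawling the whole GitBook with the new `load_all_paths` flag. Network access and the example URL from the notebook are assumed:

```python
from langchain.document_loaders.gitbook import GitbookLoader

# Default: fetch only the page the loader was initialized with.
single_page_loader = GitbookLoader("https://docs.gitbook.com")
page_data = single_page_loader.load()  # list with one Document

# New in this PR: initialize with the GitBook root path and set
# load_all_paths=True to load every relative path found in the navbar.
all_pages_loader = GitbookLoader("https://docs.gitbook.com", load_all_paths=True)
all_pages_data = all_pages_loader.load()
print(len(all_pages_data))
```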
2 changes: 1 addition & 1 deletion docs/modules/indexes/chain_examples/chat_vector_db.ipynb
@@ -225,7 +225,7 @@
},
{
"cell_type": "code",
"execution_count": 14,
"execution_count": 5,
"id": "562769c6",
"metadata": {},
"outputs": [],
2 changes: 1 addition & 1 deletion docs/use_cases/combine_docs.md
@@ -82,7 +82,7 @@ for language models.
## Augmenting
So you've fetched your relevant data - now what? How do you pass them to the language model in a format it can understand?
For a detailed overview of the different ways of doing so, and the tradeoffs between them, please see
[this documentation](../modules/chains/combine_docs.md)
[this documentation](../modules/indexes/combine_docs.md)

## Use Cases
LangChain supports the above three methods of augmenting LLMs with external data.
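To make the augmenting step concrete, a hedged sketch of one documented approach: a "stuff"-type question-answering chain that passes fetched documents straight into the prompt. The sample documents and the `OpenAI` model are placeholders for this example:

```python
from langchain.chains.question_answering import load_qa_chain
from langchain.docstore.document import Document
from langchain.llms import OpenAI

# Pretend these came back from a fetching step (search, retrieval, etc.).
docs = [
    Document(page_content="LangChain chains can run asynchronously."),
    Document(page_content="GitbookLoader can load every page of a GitBook."),
]

# "stuff" concatenates all documents into a single prompt for the LLM.
chain = load_qa_chain(OpenAI(temperature=0), chain_type="stuff")
answer = chain.run(input_documents=docs, question="What can GitbookLoader do?")
print(answer)
```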
10 changes: 5 additions & 5 deletions docs/use_cases/question_answering.md
@@ -12,8 +12,8 @@ chain.run(input_documents=docs, question=query)
```

The following resources exist:
- [Question Answering Notebook](/modules/chains/combine_docs_examples/question_answering.ipynb): A notebook walking through how to accomplish this task.
- [VectorDB Question Answering Notebook](/modules/chains/combine_docs_examples/vector_db_qa.ipynb): A notebook walking through how to do question answering over a vector database. This can often be useful for when you have a LOT of documents, and you don't want to pass them all to the LLM, but rather first want to do some semantic search over embeddings.
- [Question Answering Notebook](/modules/indexes/chain_examples/question_answering.ipynb): A notebook walking through how to accomplish this task.
- [VectorDB Question Answering Notebook](/modules/indexes/chain_examples/vector_db_qa.ipynb): A notebook walking through how to do question answering over a vector database. This can often be useful for when you have a LOT of documents, and you don't want to pass them all to the LLM, but rather first want to do some semantic search over embeddings.

### Adding in sources

@@ -28,12 +28,12 @@ chain({"input_documents": docs, "question": query}, return_only_outputs=True)
```

The following resources exist:
- [QA With Sources Notebook](/modules/chains/combine_docs_examples/qa_with_sources.ipynb): A notebook walking through how to accomplish this task.
- [VectorDB QA With Sources Notebook](/modules/chains/combine_docs_examples/vector_db_qa_with_sources.ipynb): A notebook walking through how to do question answering with sources over a vector database. This can often be useful for when you have a LOT of documents, and you don't want to pass them all to the LLM, but rather first want to do some semantic search over embeddings.
- [QA With Sources Notebook](/modules/indexes/chain_examples/qa_with_sources.ipynb): A notebook walking through how to accomplish this task.
- [VectorDB QA With Sources Notebook](/modules/indexes/chain_examples/vector_db_qa_with_sources.ipynb): A notebook walking through how to do question answering with sources over a vector database. This can often be useful for when you have a LOT of documents, and you don't want to pass them all to the LLM, but rather first want to do some semantic search over embeddings.

### Additional Related Resources

Additional related resources include:
- [Utilities for working with Documents](/modules/utils/how_to_guides.rst): Guides on how to use several of the utilities which will prove helpful for this task, including Text Splitters (for splitting up long documents) and Embeddings & Vectorstores (useful for the above Vector DB example).
- [CombineDocuments Chains](/modules/chains/combine_docs.md): A conceptual overview of specific types of chains by which you can accomplish this task.
- [CombineDocuments Chains](/modules/indexes/combine_docs.md): A conceptual overview of specific types of chains by which you can accomplish this task.
- [Data Augmented Generation](combine_docs.md): An overview of data augmented generation, which is the general concept of combining external data with LLMs (of which this is a subset).
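For reference alongside the snippets above, a hedged sketch of the "with sources" variant. The documents, their `source` metadata values, and the `OpenAI` model are illustrative:

```python
from langchain.chains.qa_with_sources import load_qa_with_sources_chain
from langchain.docstore.document import Document
from langchain.llms import OpenAI

docs = [
    Document(
        page_content="UnstructuredFileIOLoader loads file-like objects.",
        metadata={"source": "release-notes-1"},
    ),
    Document(
        page_content="GitbookLoader gained a load_all_paths flag.",
        metadata={"source": "release-notes-2"},
    ),
]

chain = load_qa_with_sources_chain(OpenAI(temperature=0), chain_type="stuff")
result = chain(
    {"input_documents": docs, "question": "Which loader handles file objects?"},
    return_only_outputs=True,
)
print(result["output_text"])  # answer text followed by a SOURCES section
```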
4 changes: 2 additions & 2 deletions docs/use_cases/summarization.md
@@ -12,9 +12,9 @@ chain.run(docs)
```

The following resources exist:
- [Summarization Notebook](../modules/chains/combine_docs_examples/summarize.ipynb): A notebook walking through how to accomplish this task.
- [Summarization Notebook](../modules/indexes/chain_examples/summarize.ipynb): A notebook walking through how to accomplish this task.

Additional related resources include:
- [Utilities for working with Documents](../modules/utils/how_to_guides.rst): Guides on how to use several of the utilities which will prove helpful for this task, including Text Splitters (for splitting up long documents).
- [CombineDocuments Chains](../modules/chains/combine_docs.md): A conceptual overview of specific types of chains by which you can accomplish this task.
- [CombineDocuments Chains](../modules/indexes/combine_docs.md): A conceptual overview of specific types of chains by which you can accomplish this task.
- [Data Augmented Generation](./combine_docs.md): An overview of data augmented generation, which is the general concept of combining external data with LLMs (of which this is a subset).
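Likewise, a hedged sketch of the summarization entry point these resources describe. The pre-split documents and the `OpenAI` model are placeholders:

```python
from langchain.chains.summarize import load_summarize_chain
from langchain.docstore.document import Document
from langchain.llms import OpenAI

# In practice these would come from a text splitter over a long document.
docs = [
    Document(page_content="LangChain integrates with the unstructured library."),
    Document(page_content="New loaders accept both file paths and file objects."),
]

# "map_reduce" summarizes each document, then summarizes the summaries.
chain = load_summarize_chain(OpenAI(temperature=0), chain_type="map_reduce")
print(chain.run(docs))
```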
2 changes: 0 additions & 2 deletions langchain/__init__.py
@@ -28,7 +28,6 @@
Cohere,
ForefrontAI,
GooseAI,
HuggingFaceEndpoint,
HuggingFaceHub,
OpenAI,
Petals,
@@ -77,7 +76,6 @@
"PromptTemplate",
"ReActChain",
"Wikipedia",
"HuggingFaceEndpoint",
"HuggingFaceHub",
"HuggingFacePipeline",
"SQLDatabase",
6 changes: 5 additions & 1 deletion langchain/document_loaders/__init__.py
@@ -28,13 +28,17 @@
from langchain.document_loaders.srt import SRTLoader
from langchain.document_loaders.telegram import TelegramChatLoader
from langchain.document_loaders.text import TextLoader
from langchain.document_loaders.unstructured import UnstructuredFileLoader
from langchain.document_loaders.unstructured import (
UnstructuredFileIOLoader,
UnstructuredFileLoader,
)
from langchain.document_loaders.url import UnstructuredURLLoader
from langchain.document_loaders.web_base import WebBaseLoader
from langchain.document_loaders.youtube import YoutubeLoader

__all__ = [
"UnstructuredFileLoader",
"UnstructuredFileIOLoader",
"UnstructuredURLLoader",
"DirectoryLoader",
"NotionDirectoryLoader",
37 changes: 19 additions & 18 deletions langchain/document_loaders/gitbook.py
@@ -12,25 +12,26 @@ class GitbookLoader(WebBaseLoader):
2. load all (relative) paths in the navbar.
"""

def load(self, custom_web_path: Optional[str] = None) -> List[Document]:
def __init__(self, web_page: str, load_all_paths: bool = False):
"""Initialize with web page and whether to load all paths."""
super().__init__(web_page)
self.load_all_paths = load_all_paths

def load(self) -> List[Document]:
"""Fetch text from one single GitBook page."""
soup_info = self.scrape(custom_web_path)
url = custom_web_path if custom_web_path else self.web_path
return [self._get_document(soup_info, url)]

def load_from_all_paths(self) -> List[Document]:
"""Fetch text from all pages in the navbar.

Make sure the initialized web_path is the root of the GitBook
"""
soup_info = self.scrape()
relative_paths = self._get_paths(soup_info)
documents = []
for path in relative_paths:
url = self.web_path + path
print(f"Fetching text from {url}")
documents += self.load(url)
return documents
if self.load_all_paths:
soup_info = self.scrape()
relative_paths = self._get_paths(soup_info)
documents = []
for path in relative_paths:
url = self.web_path + path
print(f"Fetching text from {url}")
soup_info = self._scrape(url)
documents.append(self._get_document(soup_info, url))
return documents
else:
soup_info = self.scrape()
return [self._get_document(soup_info, self.web_path)]

def _get_document(self, soup: Any, custom_url: Optional[str] = None) -> Document:
"""Fetch content from page and return Document."""
53 changes: 45 additions & 8 deletions langchain/document_loaders/unstructured.py
@@ -1,14 +1,15 @@
"""Loader that uses unstructured to load files."""
from typing import List
from abc import ABC, abstractmethod
from typing import IO, List

from langchain.docstore.document import Document
from langchain.document_loaders.base import BaseLoader


class UnstructuredFileLoader(BaseLoader):
class UnstructuredBaseLoader(BaseLoader, ABC):
"""Loader that uses unstructured to load files."""

def __init__(self, file_path: str, mode: str = "single"):
def __init__(self, mode: str = "single"):
"""Initialize with file path."""
try:
import unstructured # noqa:F401
@@ -22,21 +22,23 @@ def __init__(self, file_path: str, mode: str = "single"):
raise ValueError(
f"Got {mode} for `mode`, but should be one of `{_valid_modes}`"
)
self.file_path = file_path
self.mode = mode

@abstractmethod
def _get_elements(self) -> List:
from unstructured.partition.auto import partition
"""Get elements."""

return partition(filename=self.file_path)
@abstractmethod
def _get_metadata(self) -> dict:
"""Get metadata."""

def load(self) -> List[Document]:
"""Load file."""
elements = self._get_elements()
if self.mode == "elements":
docs: List[Document] = list()
for element in elements:
metadata = {"source": self.file_path}
metadata = self._get_metadata()
# NOTE(MthwRobinson) - the attribute check is for backward compatibility
# with unstructured<0.4.9. The metadata attributed was added in 0.4.9.
if hasattr(element, "metadata"):
@@ -45,9 +48,43 @@ def load(self) -> List[Document]:
metadata["category"] = element.category
docs.append(Document(page_content=str(element), metadata=metadata))
elif self.mode == "single":
metadata = {"source": self.file_path}
metadata = self._get_metadata()
text = "\n\n".join([str(el) for el in elements])
docs = [Document(page_content=text, metadata=metadata)]
else:
raise ValueError(f"mode of {self.mode} not supported.")
return docs


class UnstructuredFileLoader(UnstructuredBaseLoader):
"""Loader that uses unstructured to load files."""

def __init__(self, file_path: str, mode: str = "single"):
"""Initialize with file path."""
self.file_path = file_path
super().__init__(mode=mode)

def _get_elements(self) -> List:
from unstructured.partition.auto import partition

return partition(filename=self.file_path)

def _get_metadata(self) -> dict:
return {"source": self.file_path}


class UnstructuredFileIOLoader(UnstructuredBaseLoader):
"""Loader that uses unstructured to load file IO objects."""

def __init__(self, file: IO, mode: str = "single"):
"""Initialize with file path."""
self.file = file
super().__init__(mode=mode)

def _get_elements(self) -> List:
from unstructured.partition.auto import partition

return partition(file=self.file)

def _get_metadata(self) -> dict:
return {}
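For orientation, a usage sketch of the two concrete loaders defined above. It assumes the `unstructured` package is installed and that a local `state_of_the_union.txt` file exists:

```python
from langchain.document_loaders import (
    UnstructuredFileIOLoader,
    UnstructuredFileLoader,
)

# Path-based loading; mode="elements" yields one Document per detected element.
loader = UnstructuredFileLoader("state_of_the_union.txt", mode="elements")
docs = loader.load()

# New in this PR: loading from an already-open file object.
with open("state_of_the_union.txt", "rb") as f:
    io_loader = UnstructuredFileIOLoader(f, mode="single")
    io_docs = io_loader.load()

print(len(docs), len(io_docs))
```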
11 changes: 7 additions & 4 deletions langchain/document_loaders/web_base.py
@@ -1,5 +1,5 @@
"""Web base loader class."""
from typing import Any, List, Optional
from typing import Any, List

import requests

@@ -14,15 +14,18 @@ def __init__(self, web_path: str):
"""Initialize with webpage path."""
self.web_path = web_path

def scrape(self, custom_web_path: Optional[str] = None) -> Any:
"""Scrape data from webpage and return it in BeautifulSoup format."""
@staticmethod
def _scrape(url: str) -> Any:
from bs4 import BeautifulSoup

url = custom_web_path if custom_web_path else self.web_path
html_doc = requests.get(url)
soup = BeautifulSoup(html_doc.text, "html.parser")
return soup

def scrape(self) -> Any:
"""Scrape data from webpage and return it in BeautifulSoup format."""
return self._scrape(self.web_path)

def load(self) -> List[Document]:
"""Load data into document objects."""
soup = self.scrape()
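A short usage sketch of the loader this refactor touches, assuming `beautifulsoup4` is installed and the URL (borrowed from the GitBook example) is reachable:

```python
from langchain.document_loaders.web_base import WebBaseLoader

loader = WebBaseLoader("https://docs.gitbook.com")
# scrape() returns the parsed BeautifulSoup object; load() wraps the page
# text in a single Document (with the source URL in its metadata, per the
# class at the time).
docs = loader.load()
print(docs[0].metadata)
```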
2 changes: 1 addition & 1 deletion langchain/llms/huggingface_endpoint.py
@@ -23,7 +23,7 @@ class HuggingFaceEndpoint(LLM, BaseModel):
Example:
.. code-block:: python

from langchain.llms.huggingface_endpoint import HuggingFaceEndpoint
from langchain.llms import HuggingFaceEndpoint
endpoint_url = (
"https://abcdefghijklmnop.us-east-1.aws.endpoints.huggingface.cloud"
)
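For completeness, a hedged continuation of the docstring example above. The parameter names beyond `endpoint_url` (`huggingfacehub_api_token`, `task`) and the sample prompt are assumptions about the class at the time, not something this diff confirms:

```python
from langchain.llms import HuggingFaceEndpoint

endpoint_url = (
    "https://abcdefghijklmnop.us-east-1.aws.endpoints.huggingface.cloud"
)
llm = HuggingFaceEndpoint(
    endpoint_url=endpoint_url,
    huggingfacehub_api_token="my-api-key",  # placeholder token
    task="text-generation",
)
print(llm("What is LangChain?"))
```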