Add lazy iteration interface to document loaders (#3659)

Adds a lazy iteration interface for document loaders, following the plan in #2833. The `load` method is kept as is for backwards compatibility: it returns a materialized list of documents, and downstream users may rely on that fact. A new method that returns an iterable is introduced to handle lazy loading. --------- Co-authored-by: Zander Chase <130414180+vowelparrot@users.noreply.github.com>
langchain-ai · Apr 27, 2023 · 2052e70 · 2052e70
1 parent 8a54217
commit 2052e70
Showing 1 changed file with 23 additions and 3 deletions.
diff --git a/langchain/document_loaders/base.py b/langchain/document_loaders/base.py
@@ -1,15 +1,25 @@
-"""Base loader class."""
+"""Abstract interface for document loader implementations."""
 
 from abc import ABC, abstractmethod
-from typing import List, Optional
+from typing import Iterable, List, Optional
 
 from langchain.docstore.document import Document
 from langchain.text_splitter import RecursiveCharacterTextSplitter, TextSplitter
 
 
 class BaseLoader(ABC):
-    """Base loader class."""
+    """Interface for loading documents.
 
+    Implementations should implement the lazy-loading method using generators
+    to avoid loading all documents into memory at once.
+
+    The `load` method will remain as is for backwards compatibility, but its
+    implementation should be just `list(self.lazy_load())`.
+    """
+
+    # Sub-classes should implement this method
+    # as `return list(self.lazy_load())`.
+    # This method returns a List, which is fully materialized in memory.
     @abstractmethod
     def load(self) -> List[Document]:
         """Load data into document objects."""
@@ -24,3 +34,13 @@ def load_and_split(
             _text_splitter = text_splitter
         docs = self.load()
         return _text_splitter.split_documents(docs)
+
+    # Attention: This method will be upgraded into an abstractmethod once it's
+    #            implemented in all the existing subclasses.
+    def lazy_load(
+        self,
+    ) -> Iterable[Document]:
+        """Lazily yield documents; the base implementation always raises."""
+        raise NotImplementedError(
+            f"{self.__class__.__name__} does not implement lazy_load()"
+        )