From eef18dec442eabb2c2532bd67cc2efa12a43d406 Mon Sep 17 00:00:00 2001 From: ccurme Date: Thu, 19 Sep 2024 14:40:25 -0400 Subject: [PATCH] unstructured[patch]: support loading URLs (#26670) `unstructured.partition.auto.partition` supports a `url` kwarg, but `url` in `UnstructuredLoader.__init__` is reserved for the server URL. Here we add a `web_url` kwarg that is passed to the partition kwargs: ```python self.unstructured_kwargs["url"] = web_url ``` --- .../document_loaders/unstructured_file.ipynb | 45 ++++++++++++++++++- .../document_loaders.py | 22 +++++++++ .../test_document_loaders.py | 11 +++++ 3 files changed, 76 insertions(+), 2 deletions(-) diff --git a/docs/docs/integrations/document_loaders/unstructured_file.ipynb b/docs/docs/integrations/document_loaders/unstructured_file.ipynb index b376721eb8193..02ff6a7c799d2 100644 --- a/docs/docs/integrations/document_loaders/unstructured_file.ipynb +++ b/docs/docs/integrations/document_loaders/unstructured_file.ipynb @@ -16,7 +16,7 @@ "\n", "| Class | Package | Local | Serializable | [JS support](https://js.langchain.com/docs/integrations/document_loaders/file_loaders/unstructured/)|\n", "| :--- | :--- | :---: | :---: | :---: |\n", - "| [UnstructuredLoader](https://python.langchain.com/api_reference/unstructured/document_loaders/langchain_unstructured.document_loaders.UnstructuredLoader.html) | [langchain_community](https://python.langchain.com/api_reference/unstructured/index.html) | ✅ | ❌ | ✅ | \n", + "| [UnstructuredLoader](https://python.langchain.com/api_reference/unstructured/document_loaders/langchain_unstructured.document_loaders.UnstructuredLoader.html) | [langchain_unstructured](https://python.langchain.com/api_reference/unstructured/index.html) | ✅ | ❌ | ✅ | \n", "### Loader features\n", "| Source | Document Lazy Loading | Native Async Support\n", "| :---: | :---: | :---: | \n", @@ -519,6 +519,47 @@ "print(\"Length of text in the document:\", len(docs[0].page_content))" ] }, + { + "cell_type": "markdown", + "id": "3ec3c22d-02cd-498b-921f-b839d1404f32", + "metadata": {}, + "source": [ + "## Loading web pages\n", + "\n", + "`UnstructuredLoader` accepts a `web_url` kwarg when run locally that populates the `url` parameter of the underlying Unstructured [partition](https://docs.unstructured.io/open-source/core-functionality/partitioning). This allows for the parsing of remotely hosted documents, such as HTML web pages.\n", + "\n", + "Example usage:" + ] + }, + { + "cell_type": "code", + "execution_count": 6, + "id": "bf9a8546-659d-4861-bff2-fdf1ad93ac65", + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "page_content='Example Domain' metadata={'category_depth': 0, 'languages': ['eng'], 'filetype': 'text/html', 'url': 'https://www.example.com', 'category': 'Title', 'element_id': 'fdaa78d856f9d143aeeed85bf23f58f8'}\n", + "\n", + "page_content='This domain is for use in illustrative examples in documents. You may use this domain in literature without prior coordination or asking for permission.' metadata={'languages': ['eng'], 'parent_id': 'fdaa78d856f9d143aeeed85bf23f58f8', 'filetype': 'text/html', 'url': 'https://www.example.com', 'category': 'NarrativeText', 'element_id': '3652b8458b0688639f973fe36253c992'}\n", + "\n", + "page_content='More information...' metadata={'category_depth': 0, 'link_texts': ['More information...'], 'link_urls': ['https://www.iana.org/domains/example'], 'languages': ['eng'], 'filetype': 'text/html', 'url': 'https://www.example.com', 'category': 'Title', 'element_id': '793ab98565d6f6d6f3a6d614e3ace2a9'}\n", + "\n" + ] + } + ], + "source": [ + "from langchain_unstructured import UnstructuredLoader\n", + "\n", + "loader = UnstructuredLoader(web_url=\"https://www.example.com\")\n", + "docs = loader.load()\n", + "\n", + "for doc in docs:\n", + " print(f\"{doc}\\n\")" + ] + }, { "cell_type": "markdown", "id": "ce01aa40", @@ -546,7 +587,7 @@ "name": "python", "nbconvert_exporter": "python", "pygments_lexer": "ipython3", - "version": "3.10.13" + "version": "3.10.4" } }, "nbformat": 4, diff --git a/libs/partners/unstructured/langchain_unstructured/document_loaders.py b/libs/partners/unstructured/langchain_unstructured/document_loaders.py index bd3bcd6dbbe63..aef5135f2ef72 100644 --- a/libs/partners/unstructured/langchain_unstructured/document_loaders.py +++ b/libs/partners/unstructured/langchain_unstructured/document_loaders.py @@ -76,6 +76,25 @@ class UnstructuredLoader(BaseLoader): {'source': './example_data/layout-parser-paper.pdf', 'coordinates': {'points': ((16.34, 213.36), (16.34, 253.36), (36.34, 253.36), (36.34, 213.36)), 'system': 'PixelSpace', 'layout_width': 612, 'layout_height': 792}, 'file_directory': './example_data', 'filename': 'layout-parser-paper.pdf', 'languages': ['eng'], 'last_modified': '2024-07-25T21:28:58', 'page_number': 1, 'filetype': 'application/pdf', 'category': 'UncategorizedText', 'element_id': 'd3ce55f220dfb75891b4394a18bcb973'} + Load URL: + .. code-block:: python + + loader = UnstructuredLoader(web_url="https://www.example.com/") + print(docs[0]) + + .. code-block:: none + + page_content='Example Domain' metadata={'category_depth': 0, 'languages': ['eng'], 'filetype': 'text/html', 'url': 'https://www.example.com/', 'category': 'Title', 'element_id': 'fdaa78d856f9d143aeeed85bf23f58f8'} + + .. code-block:: python + + print(docs[1]) + + .. code-block:: none + + page_content='This domain is for use in illustrative examples in documents. You may use this domain in literature without prior coordination or asking for permission.' metadata={'languages': ['eng'], 'parent_id': 'fdaa78d856f9d143aeeed85bf23f58f8', 'filetype': 'text/html', 'url': 'https://www.example.com/', 'category': 'NarrativeText', 'element_id': '3652b8458b0688639f973fe36253c992'} + + References ---------- https://docs.unstructured.io/api-reference/api-services/sdk @@ -95,6 +114,7 @@ def __init__( api_key: Optional[str] = None, client: Optional[UnstructuredClient] = None, url: Optional[str] = None, + web_url: Optional[str] = None, **kwargs: Any, ): """Initialize loader.""" @@ -124,6 +144,8 @@ def __init__( self.partition_via_api = partition_via_api self.post_processors = post_processors self.unstructured_kwargs = kwargs + if web_url: + self.unstructured_kwargs["url"] = web_url def lazy_load(self) -> Iterator[Document]: """Load file(s) to the _UnstructuredBaseLoader.""" diff --git a/libs/partners/unstructured/tests/integration_tests/test_document_loaders.py b/libs/partners/unstructured/tests/integration_tests/test_document_loaders.py index f27ddf718670e..3b1824f3b4b5b 100644 --- a/libs/partners/unstructured/tests/integration_tests/test_document_loaders.py +++ b/libs/partners/unstructured/tests/integration_tests/test_document_loaders.py @@ -128,6 +128,17 @@ def test_loader_partitions_locally_and_applies_post_processors( assert docs[0].page_content.endswith("THE END!") +@pytest.mark.local +def test_url_loader() -> None: + docs = UnstructuredLoader(web_url="https://www.example.com/").load() + + for doc in docs: + assert doc.page_content + assert doc.metadata["filetype"] == "text/html" + assert doc.metadata["url"] == "https://www.example.com/" + assert doc.metadata["category"] + + # -- API partition --