-
Notifications
You must be signed in to change notification settings - Fork 15.4k
Commit
This commit does not belong to any branch on this repository, and may belong to a fork outside of the repository.
feat: document loader for MS Word documents (#1282)
### Summary Adds a document loader for MS Word Documents. Works with both `.docx` and `.doc` files as long as the user has installed `unstructured>=0.4.11`. ### Testing The following workflow tests the loader for both `.doc` and `.docx` files using example docs from the `unstructured` repo. #### `.docx` ```python from langchain.document_loaders import UnstructuredWordDocumentLoader filename = "../unstructured/example-docs/fake.docx" loader = UnstructuredWordDocumentLoader(filename) loader.load() ``` #### `.doc` ```python from langchain.document_loaders import UnstructuredWordDocumentLoader filename = "../unstructured/example-docs/fake.doc" loader = UnstructuredWordDocumentLoader(filename) loader.load() ```
- Loading branch information
1 parent
96db6ed
commit 2f15c11
Showing
3 changed files
with
182 additions
and
0 deletions.
There are no files selected for viewing
137 changes: 137 additions & 0 deletions
137
docs/modules/document_loaders/examples/word_document.ipynb
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,137 @@ | ||
{ | ||
"cells": [ | ||
{ | ||
"cell_type": "markdown", | ||
"id": "39af9ecd", | ||
"metadata": {}, | ||
"source": [ | ||
"# Word Documents\n", | ||
"\n", | ||
"This covers how to load Word documents into a document format that we can use downstream." | ||
] | ||
}, | ||
{ | ||
"cell_type": "code", | ||
"execution_count": 1, | ||
"id": "721c48aa", | ||
"metadata": {}, | ||
"outputs": [], | ||
"source": [ | ||
"from langchain.document_loaders import UnstructuredWordDocumentLoader" | ||
] | ||
}, | ||
{ | ||
"cell_type": "code", | ||
"execution_count": 2, | ||
"id": "9d3d0e35", | ||
"metadata": {}, | ||
"outputs": [], | ||
"source": [ | ||
"loader = UnstructuredWordDocumentLoader(\"fake.docx\")" | ||
] | ||
}, | ||
{ | ||
"cell_type": "code", | ||
"execution_count": 3, | ||
"id": "06073f91", | ||
"metadata": {}, | ||
"outputs": [], | ||
"source": [ | ||
"data = loader.load()" | ||
] | ||
}, | ||
{ | ||
"cell_type": "code", | ||
"execution_count": 4, | ||
"id": "c9adc5cb", | ||
"metadata": {}, | ||
"outputs": [ | ||
{ | ||
"data": { | ||
"text/plain": [ | ||
"[Document(page_content='Lorem ipsum dolor sit amet.', lookup_str='', metadata={'source': 'fake.docx'}, lookup_index=0)]" | ||
] | ||
}, | ||
"execution_count": 4, | ||
"metadata": {}, | ||
"output_type": "execute_result" | ||
} | ||
], | ||
"source": [ | ||
"data" | ||
] | ||
}, | ||
{ | ||
"cell_type": "markdown", | ||
"id": "525d6b67", | ||
"metadata": {}, | ||
"source": [ | ||
"## Retain Elements\n", | ||
"\n", | ||
"Under the hood, Unstructured creates different \"elements\" for different chunks of text. By default we combine those together, but you can easily keep that separation by specifying `mode=\"elements\"`." | ||
] | ||
}, | ||
{ | ||
"cell_type": "code", | ||
"execution_count": 5, | ||
"id": "064f9162", | ||
"metadata": {}, | ||
"outputs": [], | ||
"source": [ | ||
"loader = UnstructuredWordDocumentLoader(\"fake.docx\", mode=\"elements\")" | ||
] | ||
}, | ||
{ | ||
"cell_type": "code", | ||
"execution_count": 6, | ||
"id": "abefbbdb", | ||
"metadata": {}, | ||
"outputs": [], | ||
"source": [ | ||
"data = loader.load()" | ||
] | ||
}, | ||
{ | ||
"cell_type": "code", | ||
"execution_count": 7, | ||
"id": "a547c534", | ||
"metadata": {}, | ||
"outputs": [ | ||
{ | ||
"data": { | ||
"text/plain": [ | ||
"Document(page_content='Lorem ipsum dolor sit amet.', lookup_str='', metadata={'source': 'fake.docx', 'filename': 'fake.docx', 'category': 'Title'}, lookup_index=0)" | ||
] | ||
}, | ||
"execution_count": 7, | ||
"metadata": {}, | ||
"output_type": "execute_result" | ||
} | ||
], | ||
"source": [ | ||
"data[0]" | ||
] | ||
} | ||
], | ||
"metadata": { | ||
"kernelspec": { | ||
"display_name": "Python 3 (ipykernel)", | ||
"language": "python", | ||
"name": "python3" | ||
}, | ||
"language_info": { | ||
"codemirror_mode": { | ||
"name": "ipython", | ||
"version": 3 | ||
}, | ||
"file_extension": ".py", | ||
"mimetype": "text/x-python", | ||
"name": "python", | ||
"nbconvert_exporter": "python", | ||
"pygments_lexer": "ipython3", | ||
"version": "3.8.13" | ||
} | ||
}, | ||
"nbformat": 4, | ||
"nbformat_minor": 5 | ||
} |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,43 @@ | ||
"""Loader that loads word documents.""" | ||
import os | ||
from typing import List | ||
|
||
from langchain.document_loaders.unstructured import UnstructuredFileLoader | ||
|
||
|
||
class UnstructuredWordDocumentLoader(UnstructuredFileLoader):
    """Loader that uses unstructured to load word documents.

    Handles both ``.docx`` files and legacy ``.doc`` files. Partitioning
    ``.doc`` files is only supported in ``unstructured>=0.4.11``.
    """

    def _get_elements(self) -> List:
        """Partition the Word document at ``self.file_path`` into elements.

        Returns:
            The result of ``partition_doc`` when the file is detected as a
            ``.doc`` file, otherwise the result of ``partition_docx``.

        Raises:
            ValueError: If the file is a ``.doc`` file and the installed
                unstructured version is older than 0.4.11.
        """
        from unstructured.__version__ import __version__ as __unstructured_version__
        from unstructured.file_utils.filetype import FileType, detect_filetype

        # Pre-release builds are versioned like "0.4.17-dev1". Drop the suffix
        # so that every remaining component parses as an integer instead of
        # raising ValueError on "17-dev1".
        release_version = __unstructured_version__.split("-")[0]
        unstructured_version = tuple(int(x) for x in release_version.split("."))

        # NOTE(MthwRobinson) - magic will raise an import error if the libmagic
        # system dependency isn't installed. If it's not installed, we'll just
        # check the file extension
        try:
            import magic  # noqa: F401

            is_doc = detect_filetype(self.file_path) == FileType.DOC
        except ImportError:
            _, extension = os.path.splitext(self.file_path)
            # Compare case-insensitively so "REPORT.DOC" is still treated as
            # a legacy .doc file rather than being sent to the .docx parser.
            is_doc = extension.lower() == ".doc"

        if is_doc and unstructured_version < (0, 4, 11):
            raise ValueError(
                f"You are on unstructured version {__unstructured_version__}. "
                "Partitioning .doc files is only supported in unstructured>=0.4.11. "
                "Please upgrade the unstructured package and try again."
            )

        if is_doc:
            from unstructured.partition.doc import partition_doc

            return partition_doc(filename=self.file_path)

        from unstructured.partition.docx import partition_docx

        return partition_docx(filename=self.file_path)