Skip to content

Commit

Permalink
Add use multimodal feature at llama parse (#868)
Browse files Browse the repository at this point in the history
* create _add_multimodal_params function

* Add Use Multimodal docs
  • Loading branch information
bwook00 authored Oct 20, 2024
1 parent 411174d commit 1c9b4eb
Show file tree
Hide file tree
Showing 6 changed files with 271 additions and 1 deletion.
80 changes: 79 additions & 1 deletion autorag/data/parse/llamaparse.py
Original file line number Diff line number Diff line change
@@ -1,3 +1,4 @@
import os
from typing import List, Tuple
from itertools import chain

Expand All @@ -9,7 +10,13 @@

@parser_node
def llama_parse(
data_path_list: List[str], batch: int = 8, **kwargs
data_path_list: List[str],
batch: int = 8,
use_vendor_multimodal_model: bool = False,
vendor_multimodal_model_name: str = "openai-gpt4o",
use_own_key: bool = False,
vendor_multimodal_api_key: str = None,
**kwargs,
) -> Tuple[List[str], List[str], List[int]]:
"""
Parse documents to use llama_parse.
Expand All @@ -18,9 +25,22 @@ def llama_parse(
:param data_path_list: The list of data paths to parse.
:param batch: The batch size for parse documents. Default is 8.
:param use_vendor_multimodal_model: Whether to use the vendor multimodal model. Default is False.
:param vendor_multimodal_model_name: The name of the vendor multimodal model. Default is "openai-gpt4o".
:param use_own_key: Whether to use the own API key. Default is False.
:param vendor_multimodal_api_key: The API key for the vendor multimodal model.
:param kwargs: The extra parameters for creating the llama_parse instance.
:return: tuple of lists containing the parsed texts, path and pages.
"""
if use_vendor_multimodal_model:
kwargs = _add_multimodal_params(
kwargs,
use_vendor_multimodal_model,
vendor_multimodal_model_name,
use_own_key,
vendor_multimodal_api_key,
)

parse_instance = LlamaParse(**kwargs)

tasks = [
Expand All @@ -46,3 +66,61 @@ async def llama_parse_pure(
pages = list(range(1, len(documents) + 1))

return texts, path, pages


def _add_multimodal_params(
kwargs,
use_vendor_multimodal_model,
vendor_multimodal_model_name,
use_own_key,
vendor_multimodal_api_key,
) -> dict:
kwargs["use_vendor_multimodal_model"] = use_vendor_multimodal_model
kwargs["vendor_multimodal_model_name"] = vendor_multimodal_model_name

def set_multimodal_api_key(
multimodal_model_name: str = "openai-gpt4o", _api_key: str = None
) -> str:
if multimodal_model_name in ["openai-gpt4o", "openai-gpt-4o-mini"]:
_api_key = (
os.getenv("OPENAI_API_KEY", None) if _api_key is None else _api_key
)
if _api_key is None:
raise KeyError(
"Please set the OPENAI_API_KEY in the environment variable OPENAI_API_KEY "
"or directly set it on the config YAML file."
)
elif multimodal_model_name in ["anthropic-sonnet-3.5"]:
_api_key = (
os.getenv("ANTHROPIC_API_KEY", None) if _api_key is None else _api_key
)
if _api_key is None:
raise KeyError(
"Please set the ANTHROPIC_API_KEY in the environment variable ANTHROPIC_API_KEY "
"or directly set it on the config YAML file."
)
elif multimodal_model_name in ["gemini-1.5-flash", "gemini-1.5-pro"]:
_api_key = (
os.getenv("GEMINI_API_KEY", None) if _api_key is None else _api_key
)
if _api_key is None:
raise KeyError(
"Please set the GEMINI_API_KEY in the environment variable GEMINI_API_KEY "
"or directly set it on the config YAML file."
)
elif multimodal_model_name in ["custom-azure-model"]:
raise NotImplementedError(
"Custom Azure multimodal model is not supported yet."
)
else:
raise ValueError("Invalid multimodal model name.")

return _api_key

if use_own_key:
api_key = set_multimodal_api_key(
vendor_multimodal_model_name, vendor_multimodal_api_key
)
kwargs["vendor_multimodal_api_key"] = api_key

return kwargs
67 changes: 67 additions & 0 deletions docs/source/data_creation/parse/llama_parse.md
Original file line number Diff line number Diff line change
Expand Up @@ -3,6 +3,11 @@
Parse raw documents to use
[Llama Parse](https://github.com/run-llama/llama_parse).

## Set Environment Variables
You need to set the `LLAMA_CLOUD_API_KEY` environment variable to use Llama Parse.

You can get an API key [here](https://docs.cloud.llamaindex.ai/llamaparse/getting_started/get_an_api_key).

## Language Support

You can find more information about the supported languages at
Expand All @@ -19,11 +24,73 @@ If you have tables in your raw document, set `result_type: markdown` to convert
- markdown
- json

## Use Multimodal Model

You can find more information about the multimodal models at [Multimodal Parsing](https://docs.cloud.llamaindex.ai/llamaparse/features/multimodal).

If you want to use multimodal model, you need to set the following parameters.

1. `use_vendor_multimodal_model`: Whether to use the vendor multimodal model. If you want to use multimodal model, set it to True. Default is False.
2. `vendor_multimodal_model_name`: The name of the vendor multimodal model. Default is "openai-gpt4o".
- You can find the list of available multimodal models [here](https://docs.cloud.llamaindex.ai/llamaparse/features/multimodal).

3. `use_own_key`: Whether to use your own API key. Default is False.
   - If this is set to False, the [Basic Plan](https://docs.cloud.llamaindex.ai/llamaparse/features/multimodal) provided by llama parse will be used.
   - If set to False, setting only the `LLAMA_CLOUD_API_KEY` environment variable is required.
   - If set to True, you will need to provide an API key as described below.
- There are two ways to set up an API key.
- Putting `vendor_multimodal_api_key` directly into the YAML File
- Put the API Key in an environment variable based on `vendor_multimodal_model_name`.
- `vendor_multimodal_model_name`: `openai-gpt4o` or `openai-gpt-4o-mini`
- Set `OPENAI_API_KEY` environment variable
- `vendor_multimodal_model_name`: `anthropic-sonnet-3.5`
- Set `ANTHROPIC_API_KEY` environment variable
- `vendor_multimodal_model_name`: `gemini-1.5-flash` or `gemini-1.5-pro`
- Set the `GEMINI_API_KEY` environment variable


```{note}
vendor_multimodal_model_name: "custom-azure-model" is not supported in this module.
```


## Example YAML

- Not use multimodal model

```yaml
modules:
  - module_type: llamaparse
result_type: markdown
language: en
```
- Use multimodal model with [Basic Plan](https://docs.cloud.llamaindex.ai/llamaparse/features/multimodal) provided by llama parse
```yaml
modules:
- module_type: llamaparse
result_type: markdown
use_vendor_multimodal_model: true
vendor_multimodal_model_name: openai-gpt-4o-mini
```
- Use multimodal model with own API Key
```yaml
modules:
- module_type: llamaparse
result_type: markdown
use_vendor_multimodal_model: true
vendor_multimodal_model_name: openai-gpt-4o-mini
use_own_key: true
vendor_multimodal_api_key: YOUR_OPENAI_API_KEY
```
- Use multimodal model with own API Key (Environment Variable)
```yaml
modules:
- module_type: llamaparse
result_type: markdown
use_vendor_multimodal_model: true
vendor_multimodal_model_name: openai-gpt-4o-mini
use_own_key: true
```
1 change: 1 addition & 0 deletions requirements.txt
Original file line number Diff line number Diff line change
Expand Up @@ -19,6 +19,7 @@ aiohttp # for async http requests
voyageai # for voyageai reranker
mixedbread-ai # for mixedbread-ai reranker
llama-index-llms-bedrock
scikit-learn

### API server ###
quart
Expand Down
2 changes: 2 additions & 0 deletions sample_config/parse/parse_full.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -11,6 +11,8 @@ modules:
- module_type: llamaparse
result_type: markdown
language: ko
use_vendor_multimodal_model: true
vendor_multimodal_model_name: openai-gpt-4o-mini
- module_type: table_hybrid_parse
text_parse_module: langchain_parse
text_params:
Expand Down
7 changes: 7 additions & 0 deletions sample_config/parse/parse_multimodal.yaml
Original file line number Diff line number Diff line change
@@ -0,0 +1,7 @@
modules:
- module_type: llamaparse
result_type: markdown
language: ko
use_vendor_multimodal_model: true
vendor_multimodal_model_name: openai-gpt-4o-mini
use_own_key: true
115 changes: 115 additions & 0 deletions tests/autorag/data/parse/test_llamaparse.py
Original file line number Diff line number Diff line change
@@ -1,3 +1,4 @@
import os
from typing import List
from unittest.mock import patch

Expand Down Expand Up @@ -63,3 +64,117 @@ def test_llama_parse_multiple_pdf_node():
)
assert result_df["page"].tolist() == [1, 1]
assert result_df["texts"].tolist() == ["I love AutoRAG", "I love AutoRAG"]


@patch.object(llama_parse.base.LlamaParse, "aload_data", mock_llama_parse_aload_data)
def test_llama_parse_multimodal():
    """Undecorated parser with multimodal options returns the mocked texts/pages."""
    parse_fn = llamaparse.__wrapped__
    texts, paths, pages = parse_fn(
        multiple_pdf_data_list,
        url="mock_url",
        api_key="mock_api_key",
        use_vendor_multimodal_model=True,
        vendor_multimodal_model_name="openai-gpt-4o-mini",
    )
    check_parse_result(texts, paths, "multiple_pdf")
    assert pages == [1, 1]
    assert texts == ["I love AutoRAG", "I love AutoRAG"]


@patch.object(llama_parse.base.LlamaParse, "aload_data", mock_llama_parse_aload_data)
def test_llama_parse_multimodal_node():
    """Decorated node with multimodal options yields a DataFrame with mocked rows."""
    parsed = llamaparse(
        eng_text_glob,
        url="mock_url",
        api_key="mock_api_key",
        use_vendor_multimodal_model=True,
        vendor_multimodal_model_name="openai-gpt-4o-mini",
    )
    texts = parsed["texts"].tolist()
    paths = parsed["path"].tolist()
    check_parse_result(texts, paths, "multiple_pdf")
    assert parsed["page"].tolist() == [1, 1]
    assert texts == ["I love AutoRAG", "I love AutoRAG"]


@patch.object(llama_parse.base.LlamaParse, "aload_data", mock_llama_parse_aload_data)
def test_llama_parse_multimodal_use_env_key():
    """With use_own_key=True and no explicit key, the key is read from OPENAI_API_KEY."""
    with patch.dict(os.environ, {"OPENAI_API_KEY": "mock_openai_api_key"}):
        parse_fn = llamaparse.__wrapped__
        texts, paths, pages = parse_fn(
            multiple_pdf_data_list,
            url="mock_url",
            api_key="mock_api_key",
            use_vendor_multimodal_model=True,
            vendor_multimodal_model_name="openai-gpt-4o-mini",
            use_own_key=True,
        )
    # Assertions run outside the env patch; the call above already consumed the key.
    check_parse_result(texts, paths, "multiple_pdf")
    assert pages == [1, 1]
    assert texts == ["I love AutoRAG", "I love AutoRAG"]


@patch.object(llama_parse.base.LlamaParse, "aload_data", mock_llama_parse_aload_data)
def test_llama_parse_multimodal_use_env_key_node():
    """Node-level variant of the env-key test: OPENAI_API_KEY supplies the vendor key."""
    with patch.dict(os.environ, {"OPENAI_API_KEY": "mock_openai_api_key"}):
        parsed = llamaparse(
            eng_text_glob,
            url="mock_url",
            api_key="mock_api_key",
            use_vendor_multimodal_model=True,
            vendor_multimodal_model_name="openai-gpt-4o-mini",
            use_own_key=True,
        )
    texts = parsed["texts"].tolist()
    paths = parsed["path"].tolist()
    check_parse_result(texts, paths, "multiple_pdf")
    assert parsed["page"].tolist() == [1, 1]
    assert texts == ["I love AutoRAG", "I love AutoRAG"]


@patch.object(llama_parse.base.LlamaParse, "aload_data", mock_llama_parse_aload_data)
def test_llama_parse_multimodal_use_own_key():
    """Passing vendor_multimodal_api_key directly exercises the own-key path.

    Fix: the original test omitted ``use_own_key=True``, so
    ``_add_multimodal_params`` never attached the explicitly passed key and the
    own-key branch was untested (its ``_node`` sibling below does pass it).
    """
    llama_parse_original = llamaparse.__wrapped__
    texts, path, pages = llama_parse_original(
        multiple_pdf_data_list,
        url="mock_url",
        api_key="mock_api_key",
        use_vendor_multimodal_model=True,
        vendor_multimodal_model_name="openai-gpt-4o-mini",
        use_own_key=True,  # required: without it the explicit key below is ignored
        vendor_multimodal_api_key="mock_openai_api_key",
    )
    check_parse_result(texts, path, "multiple_pdf")
    assert pages == [1, 1]
    assert texts == ["I love AutoRAG", "I love AutoRAG"]


@patch.object(llama_parse.base.LlamaParse, "aload_data", mock_llama_parse_aload_data)
def test_llama_parse_multimodal_use_own_key_node():
    """Node-level variant: explicit vendor key together with use_own_key=True."""
    parsed = llamaparse(
        eng_text_glob,
        url="mock_url",
        api_key="mock_api_key",
        use_vendor_multimodal_model=True,
        vendor_multimodal_model_name="openai-gpt-4o-mini",
        use_own_key=True,
        vendor_multimodal_api_key="mock_openai_api_key",
    )
    texts = parsed["texts"].tolist()
    paths = parsed["path"].tolist()
    check_parse_result(texts, paths, "multiple_pdf")
    assert parsed["page"].tolist() == [1, 1]
    assert texts == ["I love AutoRAG", "I love AutoRAG"]

0 comments on commit 1c9b4eb

Please sign in to comment.