Skip to content

Commit

Permalink
Add use multimodal feature at llama parse (#868)
Browse files Browse the repository at this point in the history
* create _add_multimodal_params function

* Add Use Multimodal docs
  • Loading branch information
bwook00 authored Oct 20, 2024
1 parent 411174d commit 1c9b4eb
Show file tree
Hide file tree
Showing 6 changed files with 271 additions and 1 deletion.
80 changes: 79 additions & 1 deletion autorag/data/parse/llamaparse.py
Original file line number Diff line number Diff line change
@@ -1,3 +1,4 @@
import os
from typing import List, Tuple
from itertools import chain

Expand All @@ -9,7 +10,13 @@

@parser_node
def llama_parse(
data_path_list: List[str], batch: int = 8, **kwargs
data_path_list: List[str],
batch: int = 8,
use_vendor_multimodal_model: bool = False,
vendor_multimodal_model_name: str = "openai-gpt4o",
use_own_key: bool = False,
vendor_multimodal_api_key: str = None,
**kwargs,
) -> Tuple[List[str], List[str], List[int]]:
"""
Parse documents to use llama_parse.
Expand All @@ -18,9 +25,22 @@ def llama_parse(
:param data_path_list: The list of data paths to parse.
:param batch: The batch size for parse documents. Default is 8.
:param use_vendor_multimodal_model: Whether to use the vendor multimodal model. Default is False.
:param vendor_multimodal_model_name: The name of the vendor multimodal model. Default is "openai-gpt4o".
:param use_own_key: Whether to use the own API key. Default is False.
:param vendor_multimodal_api_key: The API key for the vendor multimodal model.
:param kwargs: The extra parameters for creating the llama_parse instance.
:return: tuple of lists containing the parsed texts, path and pages.
"""
if use_vendor_multimodal_model:
kwargs = _add_multimodal_params(
kwargs,
use_vendor_multimodal_model,
vendor_multimodal_model_name,
use_own_key,
vendor_multimodal_api_key,
)

parse_instance = LlamaParse(**kwargs)

tasks = [
Expand All @@ -46,3 +66,61 @@ async def llama_parse_pure(
pages = list(range(1, len(documents) + 1))

return texts, path, pages


def _add_multimodal_params(
kwargs,
use_vendor_multimodal_model,
vendor_multimodal_model_name,
use_own_key,
vendor_multimodal_api_key,
) -> dict:
kwargs["use_vendor_multimodal_model"] = use_vendor_multimodal_model
kwargs["vendor_multimodal_model_name"] = vendor_multimodal_model_name

def set_multimodal_api_key(
multimodal_model_name: str = "openai-gpt4o", _api_key: str = None
) -> str:
if multimodal_model_name in ["openai-gpt4o", "openai-gpt-4o-mini"]:
_api_key = (
os.getenv("OPENAI_API_KEY", None) if _api_key is None else _api_key
)
if _api_key is None:
raise KeyError(
"Please set the OPENAI_API_KEY in the environment variable OPENAI_API_KEY "
"or directly set it on the config YAML file."
)
elif multimodal_model_name in ["anthropic-sonnet-3.5"]:
_api_key = (
os.getenv("ANTHROPIC_API_KEY", None) if _api_key is None else _api_key
)
if _api_key is None:
raise KeyError(
"Please set the ANTHROPIC_API_KEY in the environment variable ANTHROPIC_API_KEY "
"or directly set it on the config YAML file."
)
elif multimodal_model_name in ["gemini-1.5-flash", "gemini-1.5-pro"]:
_api_key = (
os.getenv("GEMINI_API_KEY", None) if _api_key is None else _api_key
)
if _api_key is None:
raise KeyError(
"Please set the GEMINI_API_KEY in the environment variable GEMINI_API_KEY "
"or directly set it on the config YAML file."
)
elif multimodal_model_name in ["custom-azure-model"]:
raise NotImplementedError(
"Custom Azure multimodal model is not supported yet."
)
else:
raise ValueError("Invalid multimodal model name.")

return _api_key

if use_own_key:
api_key = set_multimodal_api_key(
vendor_multimodal_model_name, vendor_multimodal_api_key
)
kwargs["vendor_multimodal_api_key"] = api_key

return kwargs
67 changes: 67 additions & 0 deletions docs/source/data_creation/parse/llama_parse.md
Original file line number Diff line number Diff line change
Expand Up @@ -3,6 +3,11 @@
Parse raw documents to use
[Llama Parse](https://github.com/run-llama/llama_parse).

## Set Environment Variables
You need to set the `LLAMA_CLOUD_API_KEY` environment variable to use Llama Parse.

You can get an API key [here](https://docs.cloud.llamaindex.ai/llamaparse/getting_started/get_an_api_key).

## Language Support

You can find more information about the supported languages at
Expand All @@ -19,11 +24,73 @@ If you have tables in your raw document, set `result_type: markdown` to convert
- markdown
- json

## Use Multimodal Model

You can find more information about the multimodal models at [Multimodal Parsing](https://docs.cloud.llamaindex.ai/llamaparse/features/multimodal).

If you want to use multimodal model, you need to set the following parameters.

1. `use_vendor_multimodal_model`: Whether to use the vendor multimodal model. If you want to use multimodal model, set it to True. Default is False.
2. `vendor_multimodal_model_name`: The name of the vendor multimodal model. Default is "openai-gpt4o".
- You can find the list of available multimodal models [here](https://docs.cloud.llamaindex.ai/llamaparse/features/multimodal).

3. `use_own_key`: Whether to use your own API key. Default is False.
   - If this is set to False, the [Basic Plan](https://docs.cloud.llamaindex.ai/llamaparse/features/multimodal) provided by llama parse will be used.
   - If set to False, setting only the `LLAMA_CLOUD_API_KEY` environment variable is required.
   - If set to True, you will need to provide an API key as described below.
- There are two ways to set up an API key.
- Putting `vendor_multimodal_api_key` directly into the YAML File
- Put the API Key in an environment variable based on `vendor_multimodal_model_name`.
- `vendor_multimodal_model_name`: `openai-gpt4o` or `openai-gpt-4o-mini`
- Set `OPENAI_API_KEY` environment variable
- `vendor_multimodal_model_name`: `anthropic-sonnet-3.5`
- Set `ANTHROPIC_API_KEY` environment variable
- `vendor_multimodal_model_name`: `gemini-1.5-flash` or `gemini-1.5-pro`
- Set the `GEMINI_API_KEY` environment variable


```{note}
vendor_multimodal_model_name: "custom-azure-model" is not supported in this module.
```


## Example YAML

- Not use multimodal model

```yaml
modules:
  - module_type: llamaparse
result_type: markdown
language: en
```
- Use multimodal model with [Basic Plan](https://docs.cloud.llamaindex.ai/llamaparse/features/multimodal) provided by llama parse
```yaml
modules:
- module_type: llamaparse
result_type: markdown
use_vendor_multimodal_model: true
vendor_multimodal_model_name: openai-gpt-4o-mini
```
- Use multimodal model with own API Key
```yaml
modules:
- module_type: llamaparse
result_type: markdown
use_vendor_multimodal_model: true
vendor_multimodal_model_name: openai-gpt-4o-mini
use_own_key: true
vendor_multimodal_api_key: YOUR_OPENAI_API_KEY
```
- Use multimodal model with own API Key (Environment Variable)
```yaml
modules:
- module_type: llamaparse
result_type: markdown
use_vendor_multimodal_model: true
vendor_multimodal_model_name: openai-gpt-4o-mini
use_own_key: true
```
1 change: 1 addition & 0 deletions requirements.txt
Original file line number Diff line number Diff line change
Expand Up @@ -19,6 +19,7 @@ aiohttp # for async http requests
voyageai # for voyageai reranker
mixedbread-ai # for mixedbread-ai reranker
llama-index-llms-bedrock
scikit-learn

### API server ###
quart
Expand Down
2 changes: 2 additions & 0 deletions sample_config/parse/parse_full.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -11,6 +11,8 @@ modules:
- module_type: llamaparse
result_type: markdown
language: ko
use_vendor_multimodal_model: true
vendor_multimodal_model_name: openai-gpt-4o-mini
- module_type: table_hybrid_parse
text_parse_module: langchain_parse
text_params:
Expand Down
7 changes: 7 additions & 0 deletions sample_config/parse/parse_multimodal.yaml
Original file line number Diff line number Diff line change
@@ -0,0 +1,7 @@
modules:
- module_type: llamaparse
result_type: markdown
language: ko
use_vendor_multimodal_model: true
vendor_multimodal_model_name: openai-gpt-4o-mini
use_own_key: true
115 changes: 115 additions & 0 deletions tests/autorag/data/parse/test_llamaparse.py
Original file line number Diff line number Diff line change
@@ -1,3 +1,4 @@
import os
from typing import List
from unittest.mock import patch

Expand Down Expand Up @@ -63,3 +64,117 @@ def test_llama_parse_multiple_pdf_node():
)
assert result_df["page"].tolist() == [1, 1]
assert result_df["texts"].tolist() == ["I love AutoRAG", "I love AutoRAG"]


@patch.object(llama_parse.base.LlamaParse, "aload_data", mock_llama_parse_aload_data)
def test_llama_parse_multimodal():
    """Undecorated parser with multimodal options returns the mocked texts/pages."""
    parse_fn = llamaparse.__wrapped__
    texts, paths, pages = parse_fn(
        multiple_pdf_data_list,
        url="mock_url",
        api_key="mock_api_key",
        use_vendor_multimodal_model=True,
        vendor_multimodal_model_name="openai-gpt-4o-mini",
    )
    check_parse_result(texts, paths, "multiple_pdf")
    assert pages == [1, 1]
    assert texts == ["I love AutoRAG", "I love AutoRAG"]


@patch.object(llama_parse.base.LlamaParse, "aload_data", mock_llama_parse_aload_data)
def test_llama_parse_multimodal_node():
    """Decorated node with multimodal options yields a DataFrame with mocked rows."""
    parsed = llamaparse(
        eng_text_glob,
        url="mock_url",
        api_key="mock_api_key",
        use_vendor_multimodal_model=True,
        vendor_multimodal_model_name="openai-gpt-4o-mini",
    )
    texts = parsed["texts"].tolist()
    paths = parsed["path"].tolist()
    check_parse_result(texts, paths, "multiple_pdf")
    assert parsed["page"].tolist() == [1, 1]
    assert texts == ["I love AutoRAG", "I love AutoRAG"]


@patch.object(llama_parse.base.LlamaParse, "aload_data", mock_llama_parse_aload_data)
def test_llama_parse_multimodal_use_env_key():
    """With use_own_key=True and no explicit key, the key is read from OPENAI_API_KEY."""
    with patch.dict(os.environ, {"OPENAI_API_KEY": "mock_openai_api_key"}):
        parse_fn = llamaparse.__wrapped__
        texts, paths, pages = parse_fn(
            multiple_pdf_data_list,
            url="mock_url",
            api_key="mock_api_key",
            use_vendor_multimodal_model=True,
            vendor_multimodal_model_name="openai-gpt-4o-mini",
            use_own_key=True,
        )
    # Assertions run outside the env patch; the call above already consumed the key.
    check_parse_result(texts, paths, "multiple_pdf")
    assert pages == [1, 1]
    assert texts == ["I love AutoRAG", "I love AutoRAG"]


@patch.object(llama_parse.base.LlamaParse, "aload_data", mock_llama_parse_aload_data)
def test_llama_parse_multimodal_use_env_key_node():
    """Node-level variant of the env-key test: OPENAI_API_KEY supplies the vendor key."""
    with patch.dict(os.environ, {"OPENAI_API_KEY": "mock_openai_api_key"}):
        parsed = llamaparse(
            eng_text_glob,
            url="mock_url",
            api_key="mock_api_key",
            use_vendor_multimodal_model=True,
            vendor_multimodal_model_name="openai-gpt-4o-mini",
            use_own_key=True,
        )
    texts = parsed["texts"].tolist()
    paths = parsed["path"].tolist()
    check_parse_result(texts, paths, "multiple_pdf")
    assert parsed["page"].tolist() == [1, 1]
    assert texts == ["I love AutoRAG", "I love AutoRAG"]


@patch.object(llama_parse.base.LlamaParse, "aload_data", mock_llama_parse_aload_data)
def test_llama_parse_multimodal_use_own_key():
    """Passing vendor_multimodal_api_key directly exercises the own-key path.

    Fix: the original test omitted ``use_own_key=True``, so
    ``_add_multimodal_params`` never attached the explicitly passed key and the
    own-key branch was untested (its ``_node`` sibling below does pass it).
    """
    llama_parse_original = llamaparse.__wrapped__
    texts, path, pages = llama_parse_original(
        multiple_pdf_data_list,
        url="mock_url",
        api_key="mock_api_key",
        use_vendor_multimodal_model=True,
        vendor_multimodal_model_name="openai-gpt-4o-mini",
        use_own_key=True,  # required: without it the explicit key below is ignored
        vendor_multimodal_api_key="mock_openai_api_key",
    )
    check_parse_result(texts, path, "multiple_pdf")
    assert pages == [1, 1]
    assert texts == ["I love AutoRAG", "I love AutoRAG"]


@patch.object(llama_parse.base.LlamaParse, "aload_data", mock_llama_parse_aload_data)
def test_llama_parse_multimodal_use_own_key_node():
    """Node-level variant: explicit vendor key together with use_own_key=True."""
    parsed = llamaparse(
        eng_text_glob,
        url="mock_url",
        api_key="mock_api_key",
        use_vendor_multimodal_model=True,
        vendor_multimodal_model_name="openai-gpt-4o-mini",
        use_own_key=True,
        vendor_multimodal_api_key="mock_openai_api_key",
    )
    texts = parsed["texts"].tolist()
    paths = parsed["path"].tolist()
    check_parse_result(texts, paths, "multiple_pdf")
    assert parsed["page"].tolist() == [1, 1]
    assert texts == ["I love AutoRAG", "I love AutoRAG"]

0 comments on commit 1c9b4eb

Please sign in to comment.