Skip to content

Commit

Permalink
Personal/ranxia/fix image readme (#155)
Browse files Browse the repository at this point in the history
* fix multi_modal and readme

* fix multi_modal and readme

* fix multi_modal and readme
  • Loading branch information
Ceceliachenen authored Aug 15, 2024
1 parent b93d7ee commit 684b896
Show file tree
Hide file tree
Showing 7 changed files with 27 additions and 39 deletions.
2 changes: 2 additions & 0 deletions README.md
Original file line number Diff line number Diff line change
Expand Up @@ -55,6 +55,8 @@ PAI-RAG is an easy-to-use opensource framework for modular RAG (Retrieval-Augmen
conda activate rag_env
```

To convert PDF pages to PIL Image objects, this project uses pdf2image, which requires Poppler. Please install Poppler first by following the instructions at: https://github.com/Belval/pdf2image

- (1) CPU

Use poetry to install project dependency packages directly:
Expand Down
2 changes: 2 additions & 0 deletions README_zh.md
Original file line number Diff line number Diff line change
Expand Up @@ -50,6 +50,8 @@ PAI-RAG 是一个易于使用的模块化 RAG(检索增强生成)开源框
conda activate rag_env
```

为了将 PDF 转换为图片，本项目使用 pdf2image，它依赖 Poppler。请先参考以下链接安装 Poppler：https://github.com/Belval/pdf2image

- (1) CPU环境

直接使用poetry安装项目依赖包:
Expand Down
17 changes: 9 additions & 8 deletions src/pai_rag/app/web/tabs/upload_tab.py
Original file line number Diff line number Diff line change
Expand Up @@ -14,7 +14,7 @@ def upload_knowledge(
chunk_overlap,
enable_qa_extraction,
enable_raptor,
enable_ocr,
enable_multimodal,
enable_table_summary,
):
if not upload_files:
Expand All @@ -25,7 +25,7 @@ def upload_knowledge(
{
"chunk_size": chunk_size,
"chunk_overlap": chunk_overlap,
"enable_ocr": enable_ocr,
"enable_multimodal": enable_multimodal,
"enable_table_summary": enable_table_summary,
}
)
Expand Down Expand Up @@ -116,10 +116,11 @@ def create_upload_tab() -> Dict[str, Any]:
info="Process with Raptor Node Enhancement",
elem_id="enable_raptor",
)
enable_ocr = gr.Checkbox(
enable_multimodal = gr.Checkbox(
label="Yes",
info="Process with OCR",
elem_id="enable_ocr",
info="Process with MultiModal",
elem_id="enable_multimodal",
visible=False,
)
enable_table_summary = gr.Checkbox(
label="Yes",
Expand Down Expand Up @@ -152,7 +153,7 @@ def create_upload_tab() -> Dict[str, Any]:
chunk_overlap,
enable_qa_extraction,
enable_raptor,
enable_ocr,
enable_multimodal,
enable_table_summary,
],
outputs=[upload_file_state_df, upload_file_state],
Expand All @@ -172,7 +173,7 @@ def create_upload_tab() -> Dict[str, Any]:
chunk_overlap,
enable_qa_extraction,
enable_raptor,
enable_ocr,
enable_multimodal,
enable_table_summary,
],
outputs=[upload_dir_state_df, upload_dir_state],
Expand All @@ -189,6 +190,6 @@ def create_upload_tab() -> Dict[str, Any]:
chunk_overlap.elem_id: chunk_overlap,
enable_qa_extraction.elem_id: enable_qa_extraction,
enable_raptor.elem_id: enable_raptor,
enable_ocr.elem_id: enable_ocr,
enable_multimodal.elem_id: enable_multimodal,
enable_table_summary.elem_id: enable_table_summary,
}
10 changes: 5 additions & 5 deletions src/pai_rag/app/web/view_model.py
Original file line number Diff line number Diff line change
Expand Up @@ -52,7 +52,7 @@ class ViewModel(BaseModel):
reader_type: str = "SimpleDirectoryReader"
enable_qa_extraction: bool = False
enable_raptor: bool = False
enable_ocr: bool = False
enable_multimodal: bool = False
enable_table_summary: bool = False

config_file: str = None
Expand Down Expand Up @@ -251,8 +251,8 @@ def from_app_config(config):
view_model.enable_raptor = config["data_reader"].get(
"enable_raptor", view_model.enable_raptor
)
view_model.enable_ocr = config["data_reader"].get(
"enable_ocr", view_model.enable_ocr
view_model.enable_multimodal = config["data_reader"].get(
"enable_multimodal", view_model.enable_multimodal
)
view_model.enable_table_summary = config["data_reader"].get(
"enable_table_summary", view_model.enable_table_summary
Expand Down Expand Up @@ -326,7 +326,7 @@ def to_app_config(self):

config["data_reader"]["enable_qa_extraction"] = self.enable_qa_extraction
config["data_reader"]["enable_raptor"] = self.enable_raptor
config["data_reader"]["enable_ocr"] = self.enable_ocr
config["data_reader"]["enable_multimodal"] = self.enable_multimodal
config["data_reader"]["enable_table_summary"] = self.enable_table_summary
config["data_reader"]["type"] = self.reader_type

Expand Down Expand Up @@ -504,7 +504,7 @@ def to_component_settings(self) -> Dict[str, Dict[str, Any]]:
settings["chunk_overlap"] = {"value": self.chunk_overlap}
settings["enable_qa_extraction"] = {"value": self.enable_qa_extraction}
settings["enable_raptor"] = {"value": self.enable_raptor}
settings["enable_ocr"] = {"value": self.enable_ocr}
settings["enable_multimodal"] = {"value": self.enable_multimodal}
settings["enable_table_summary"] = {"value": self.enable_table_summary}

# retrieval and rerank
Expand Down
29 changes: 7 additions & 22 deletions src/pai_rag/integrations/readers/pai_pdf_reader.py
Original file line number Diff line number Diff line change
Expand Up @@ -16,9 +16,7 @@
)
import pdfplumber
from pdf2image import convert_from_bytes
import easyocr
from llama_index.core import Settings
from pai_rag.utils.constants import DEFAULT_MODEL_DIR
import json
import unicodedata
import logging
Expand Down Expand Up @@ -49,44 +47,31 @@ class PaiPDFReader(BaseReader):
"""Read PDF files including texts, tables, images.
Args:
enable_image_ocr (bool): whether load ocr model to process images
model_dir: (str): ocr model path
enable_multimodal (bool): whether to use multimodal to process images
"""

def __init__(
self,
enable_image_ocr: bool = False,
enable_multimodal: bool = False,
enable_table_summary: bool = False,
model_dir: str = DEFAULT_MODEL_DIR,
oss_cache: Any = None,
) -> None:
self.enable_image_ocr = enable_image_ocr
self.enable_table_summary = enable_table_summary
self.enable_multimodal = enable_multimodal
self._oss_cache = oss_cache
if self.enable_table_summary:
logger.info("process with table summary")
if self.enable_image_ocr:
self.model_dir = model_dir or os.path.join(DEFAULT_MODEL_DIR, "easyocr")
logger.info("start loading ocr model")
self.image_reader = easyocr.Reader(
["ch_sim", "en"],
model_storage_directory=self.model_dir,
download_enabled=True,
detector=True,
recognizer=True,
)
logger.info("finished loading ocr model")

def process_pdf_image(self, element: LTFigure, page_object: PageObject) -> str:
"""
Processes an image element from a PDF, crops it out, and performs OCR on the result.
Processes an image element from a PDF, crops it out, and returns a URL for the cropped image.
Args:
element (LTFigure): An LTFigure object representing the image in the PDF, containing its coordinates.
page_object (PageObject): A PageObject representing the page in the PDF to be cropped.
Returns:
str: The OCR-processed text from the cropped image.
str: The image_url from the cropped image.
"""
assert (
self._oss_cache is not None
Expand All @@ -102,7 +87,7 @@ def process_pdf_image(self, element: LTFigure, page_object: PageObject) -> str:
# Adjust the page's media box to crop the image based on the coordinates
page_object.mediabox.lower_left = (image_left, image_bottom)
page_object.mediabox.upper_right = (image_right, image_top)
# Save the cropped page as a new PDF file and perform OCR
# Save the cropped page as a new PDF file and get url
cropped_pdf_writer = PyPDF2.PdfWriter()
cropped_pdf_stream = BytesIO()

Expand Down Expand Up @@ -402,7 +387,7 @@ def load(
if isinstance(element, LTTextBoxHorizontal):
text_elements.append(element)

elif isinstance(element, LTFigure):
elif isinstance(element, LTFigure) and self.enable_multimodal:
image_url = self.process_pdf_image(element, page_object)
if image_url:
image_cnt += 1
Expand Down
3 changes: 1 addition & 2 deletions src/pai_rag/modules/datareader/datareader_factory.py
Original file line number Diff line number Diff line change
Expand Up @@ -30,11 +30,10 @@ def _create_new_instance(self, new_params: Dict[str, Any]):
".html": HtmlReader(),
".htm": HtmlReader(),
".pdf": PaiPDFReader(
enable_image_ocr=self.reader_config.get("enable_ocr", False),
enable_multimodal=self.reader_config.get("enable_multimodal", False),
enable_table_summary=self.reader_config.get(
"enable_table_summary", False
),
model_dir=self.reader_config.get("easyocr_model_dir", None),
oss_cache=self.oss_cache,
),
".csv": PaiPandasCSVReader(
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -17,8 +17,7 @@ def test_pai_pdf_reader():
input_dir="tests/testdata/data/pdf_data",
file_extractor={
".pdf": PaiPDFReader(
enable_image_ocr=reader_config.get("enable_image_ocr", False),
model_dir=reader_config.get("easyocr_model_dir", None),
enable_multimodal=reader_config.get("enable_multimodal", False)
)
},
)
Expand Down

0 comments on commit 684b896

Please sign in to comment.