From 684b89668a172513dc73e7a8b49cd0d424ea12d3 Mon Sep 17 00:00:00 2001 From: Ceceliachenen Date: Thu, 15 Aug 2024 11:59:49 +0800 Subject: [PATCH] Personal/ranxia/fix image readme (#155) * fix multi_modal and readme * fix multi_modal and readme * fix multi_modal and readme --- README.md | 2 ++ README_zh.md | 2 ++ src/pai_rag/app/web/tabs/upload_tab.py | 17 ++++++----- src/pai_rag/app/web/view_model.py | 10 +++---- .../integrations/readers/pai_pdf_reader.py | 29 +++++-------------- .../modules/datareader/datareader_factory.py | 3 +- ...syocr_pdf_reader.py => test_pdf_reader.py} | 3 +- 7 files changed, 27 insertions(+), 39 deletions(-) rename tests/data_readers/{test_easyocr_pdf_reader.py => test_pdf_reader.py} (91%) diff --git a/README.md b/README.md index d6fa4672..091ce8c9 100644 --- a/README.md +++ b/README.md @@ -55,6 +55,8 @@ PAI-RAG is an easy-to-use opensource framework for modular RAG (Retrieval-Augmen conda activate rag_env ``` + In order to convert PDF to a PIL Image object, you need to use pdf2image. 
Please refer to this link to download Poppler first: https://github.com/Belval/pdf2image + - (1) CPU Use poetry to install project dependency packages directly: diff --git a/README_zh.md b/README_zh.md index db37aa22..2fcbc6e9 100644 --- a/README_zh.md +++ b/README_zh.md @@ -50,6 +50,8 @@ PAI-RAG 是一个易于使用的模块化 RAG(检索增强生成)开源框 conda activate rag_env ``` + 为了将pdf转化为图片,你需要使用pdf2image,请先参考这个链接下载poppler: https://github.com/Belval/pdf2image + - (1) CPU环境 直接使用poetry安装项目依赖包: diff --git a/src/pai_rag/app/web/tabs/upload_tab.py b/src/pai_rag/app/web/tabs/upload_tab.py index 1d423c52..cf68e122 100644 --- a/src/pai_rag/app/web/tabs/upload_tab.py +++ b/src/pai_rag/app/web/tabs/upload_tab.py @@ -14,7 +14,7 @@ def upload_knowledge( chunk_overlap, enable_qa_extraction, enable_raptor, - enable_ocr, + enable_multimodal, enable_table_summary, ): if not upload_files: @@ -25,7 +25,7 @@ { "chunk_size": chunk_size, "chunk_overlap": chunk_overlap, - "enable_ocr": enable_ocr, + "enable_multimodal": enable_multimodal, "enable_table_summary": enable_table_summary, } ) @@ -116,10 +116,11 @@ def create_upload_tab() -> Dict[str, Any]: info="Process with Raptor Node Enhancement", elem_id="enable_raptor", ) - enable_ocr = gr.Checkbox( + enable_multimodal = gr.Checkbox( label="Yes", - info="Process with OCR", - elem_id="enable_ocr", + info="Process with MultiModal", + elem_id="enable_multimodal", + visible=False, ) enable_table_summary = gr.Checkbox( label="Yes", @@ -152,7 +153,7 @@ def create_upload_tab() -> Dict[str, Any]: chunk_overlap, enable_qa_extraction, enable_raptor, - enable_ocr, + enable_multimodal, enable_table_summary, ], outputs=[upload_file_state_df, upload_file_state], @@ -172,7 +173,7 @@ def create_upload_tab() -> Dict[str, Any]: chunk_overlap, enable_qa_extraction, enable_raptor, - enable_ocr, + enable_multimodal, enable_table_summary, ], outputs=[upload_dir_state_df, upload_dir_state], @@ -189,6 +190,6 @@ 
chunk_overlap.elem_id: chunk_overlap, enable_qa_extraction.elem_id: enable_qa_extraction, enable_raptor.elem_id: enable_raptor, - enable_ocr.elem_id: enable_ocr, + enable_multimodal.elem_id: enable_multimodal, enable_table_summary.elem_id: enable_table_summary, } diff --git a/src/pai_rag/app/web/view_model.py b/src/pai_rag/app/web/view_model.py index d573476e..93f7ed1a 100644 --- a/src/pai_rag/app/web/view_model.py +++ b/src/pai_rag/app/web/view_model.py @@ -52,7 +52,7 @@ class ViewModel(BaseModel): reader_type: str = "SimpleDirectoryReader" enable_qa_extraction: bool = False enable_raptor: bool = False - enable_ocr: bool = False + enable_multimodal: bool = False enable_table_summary: bool = False config_file: str = None @@ -251,8 +251,8 @@ def from_app_config(config): view_model.enable_raptor = config["data_reader"].get( "enable_raptor", view_model.enable_raptor ) - view_model.enable_ocr = config["data_reader"].get( - "enable_ocr", view_model.enable_ocr + view_model.enable_multimodal = config["data_reader"].get( + "enable_multimodal", view_model.enable_multimodal ) view_model.enable_table_summary = config["data_reader"].get( "enable_table_summary", view_model.enable_table_summary @@ -326,7 +326,7 @@ def to_app_config(self): config["data_reader"]["enable_qa_extraction"] = self.enable_qa_extraction config["data_reader"]["enable_raptor"] = self.enable_raptor - config["data_reader"]["enable_ocr"] = self.enable_ocr + config["data_reader"]["enable_multimodal"] = self.enable_multimodal config["data_reader"]["enable_table_summary"] = self.enable_table_summary config["data_reader"]["type"] = self.reader_type @@ -504,7 +504,7 @@ def to_component_settings(self) -> Dict[str, Dict[str, Any]]: settings["chunk_overlap"] = {"value": self.chunk_overlap} settings["enable_qa_extraction"] = {"value": self.enable_qa_extraction} settings["enable_raptor"] = {"value": self.enable_raptor} - settings["enable_ocr"] = {"value": self.enable_ocr} + settings["enable_multimodal"] = {"value": 
self.enable_multimodal} settings["enable_table_summary"] = {"value": self.enable_table_summary} # retrieval and rerank diff --git a/src/pai_rag/integrations/readers/pai_pdf_reader.py b/src/pai_rag/integrations/readers/pai_pdf_reader.py index 1d972279..078dbbad 100644 --- a/src/pai_rag/integrations/readers/pai_pdf_reader.py +++ b/src/pai_rag/integrations/readers/pai_pdf_reader.py @@ -16,9 +16,7 @@ ) import pdfplumber from pdf2image import convert_from_bytes -import easyocr from llama_index.core import Settings -from pai_rag.utils.constants import DEFAULT_MODEL_DIR import json import unicodedata import logging @@ -49,44 +47,31 @@ class PaiPDFReader(BaseReader): """Read PDF files including texts, tables, images. Args: - enable_image_ocr (bool): whether load ocr model to process images - model_dir: (str): ocr model path + enable_multimodal (bool): whether to use multimodal to process images """ def __init__( self, - enable_image_ocr: bool = False, + enable_multimodal: bool = False, enable_table_summary: bool = False, - model_dir: str = DEFAULT_MODEL_DIR, oss_cache: Any = None, ) -> None: - self.enable_image_ocr = enable_image_ocr self.enable_table_summary = enable_table_summary + self.enable_multimodal = enable_multimodal self._oss_cache = oss_cache if self.enable_table_summary: logger.info("process with table summary") - if self.enable_image_ocr: - self.model_dir = model_dir or os.path.join(DEFAULT_MODEL_DIR, "easyocr") - logger.info("start loading ocr model") - self.image_reader = easyocr.Reader( - ["ch_sim", "en"], - model_storage_directory=self.model_dir, - download_enabled=True, - detector=True, - recognizer=True, - ) - logger.info("finished loading ocr model") def process_pdf_image(self, element: LTFigure, page_object: PageObject) -> str: """ - Processes an image element from a PDF, crops it out, and performs OCR on the result. + Processes an image element from a PDF, crops it out, and performs multimodal processing on the result. 
Args: element (LTFigure): An LTFigure object representing the image in the PDF, containing its coordinates. page_object (PageObject): A PageObject representing the page in the PDF to be cropped. Returns: - str: The OCR-processed text from the cropped image. + str: The image_url from the cropped image. """ assert ( self._oss_cache is not None @@ -102,7 +87,7 @@ def process_pdf_image(self, element: LTFigure, page_object: PageObject) -> str: # Adjust the page's media box to crop the image based on the coordinates page_object.mediabox.lower_left = (image_left, image_bottom) page_object.mediabox.upper_right = (image_right, image_top) - # Save the cropped page as a new PDF file and perform OCR + # Save the cropped page as a new PDF file and get url cropped_pdf_writer = PyPDF2.PdfWriter() cropped_pdf_stream = BytesIO() @@ -402,7 +387,7 @@ def load( if isinstance(element, LTTextBoxHorizontal): text_elements.append(element) - elif isinstance(element, LTFigure): + elif isinstance(element, LTFigure) and self.enable_multimodal: image_url = self.process_pdf_image(element, page_object) if image_url: image_cnt += 1 diff --git a/src/pai_rag/modules/datareader/datareader_factory.py b/src/pai_rag/modules/datareader/datareader_factory.py index 68ed4841..4688d150 100644 --- a/src/pai_rag/modules/datareader/datareader_factory.py +++ b/src/pai_rag/modules/datareader/datareader_factory.py @@ -30,11 +30,10 @@ def _create_new_instance(self, new_params: Dict[str, Any]): ".html": HtmlReader(), ".htm": HtmlReader(), ".pdf": PaiPDFReader( - enable_image_ocr=self.reader_config.get("enable_ocr", False), + enable_multimodal=self.reader_config.get("enable_multimodal", False), enable_table_summary=self.reader_config.get( "enable_table_summary", False ), - model_dir=self.reader_config.get("easyocr_model_dir", None), oss_cache=self.oss_cache, ), ".csv": PaiPandasCSVReader( diff --git a/tests/data_readers/test_easyocr_pdf_reader.py b/tests/data_readers/test_pdf_reader.py similarity index 91% rename 
from tests/data_readers/test_easyocr_pdf_reader.py rename to tests/data_readers/test_pdf_reader.py index 446627bc..c2e0de59 100644 --- a/tests/data_readers/test_easyocr_pdf_reader.py +++ b/tests/data_readers/test_pdf_reader.py @@ -17,8 +17,7 @@ def test_pai_pdf_reader(): input_dir="tests/testdata/data/pdf_data", file_extractor={ ".pdf": PaiPDFReader( - enable_image_ocr=reader_config.get("enable_image_ocr", False), - model_dir=reader_config.get("easyocr_model_dir", None), + enable_multimodal=reader_config.get("enable_multimodal", False) ) }, )