From 9e2a69139f87910ad75ea7e60ad88db1a1a6020a Mon Sep 17 00:00:00 2001
From: w5688414
Date: Tue, 11 Oct 2022 11:45:15 +0000
Subject: [PATCH] Add multi-file-type index update example for pipelines

---
 pipelines/examples/semantic-search/README.md | 12 +++-
 .../pipelines/nodes/file_converter/docx.py   |  3 +-
 pipelines/pipelines/utils/preprocessing.py   | 71 +++++++++++--------
 pipelines/utils/offline_ann.py               |  7 +-
 4 files changed, 61 insertions(+), 32 deletions(-)

diff --git a/pipelines/examples/semantic-search/README.md b/pipelines/examples/semantic-search/README.md
index 302a2209678e..22d849741b69 100644
--- a/pipelines/examples/semantic-search/README.md
+++ b/pipelines/examples/semantic-search/README.md
@@ -161,7 +161,17 @@ sh examples/semantic-search/run_search_web.sh
 
 #### 3.4.5 Data Update
 
-There are two ways to update the data. The first uses the `utils/offline_ann.py` script described above; the second uploads files through the web front end. Supported formats are txt, pdf, image, and word. Taking a txt file as an example, paragraphs must be separated by blank lines: the program splits the text on blank lines and indexes each paragraph. Sample data (demo.txt):
+There are two ways to update the data. The first uses the `utils/offline_ann.py` script described above; the second uploads files through the upload widget on the left side of the web front end. The script can index several file types in one pass. The example command below builds an index from three formats: images (currently all text detected in an image is merged and indexed as a single document), docx (text and images are supported; paragraphs are split on blank lines), and txt (paragraphs are split on blank lines):
+
+```
+python utils/offline_ann.py --index_name dureader_robust_query_encoder \
+                            --doc_dir data/file_example \
+                            --port 9200 \
+                            --search_engine elastic \
+                            --delete_index
+```
+
+The web front end supports txt, pdf, image, and word formats. Taking a txt file as an example, paragraphs must be separated by blank lines: the program splits the text on blank lines and indexes each paragraph. Sample data (demo.txt):
 
 ```
 兴证策略认为,最恐慌的时候已经过去,未来一个月市场迎来阶段性修复窗口。
diff --git a/pipelines/pipelines/nodes/file_converter/docx.py b/pipelines/pipelines/nodes/file_converter/docx.py
index 3d036a3ada39..d580d7eaa9f8 100644
--- a/pipelines/pipelines/nodes/file_converter/docx.py
+++ b/pipelines/pipelines/nodes/file_converter/docx.py
@@ -126,7 +126,8 @@ def convert(
             if (raw_text == ''):
                 continue
             meta_data = {}
-            meta_data['name'] = meta['name']
+            if (meta is not None and 'name' in meta):
+                meta_data['name'] = meta['name']
             meta_data['images'] = text_dict['images']
             document = {
                 "content": raw_text,
diff --git a/pipelines/pipelines/utils/preprocessing.py b/pipelines/pipelines/utils/preprocessing.py
index 29c3bb290427..5493ff400127 100644
--- a/pipelines/pipelines/utils/preprocessing.py
+++ b/pipelines/pipelines/utils/preprocessing.py
@@ -18,7 +18,7 @@ import logging
 from pathlib import Path
 
-from pipelines.nodes.file_converter import BaseConverter, DocxToTextConverter, PDFToTextConverter, TextConverter
+from pipelines.nodes.file_converter import BaseConverter, DocxToTextConverter, PDFToTextConverter, TextConverter, ImageToTextConverter
 
 logger = logging.getLogger(__name__)
 
@@ -39,7 +39,7 @@ def convert_files_to_dicts(dir_path: str,
     :param encoding: character encoding to use when converting pdf documents.
""" file_paths = [p for p in Path(dir_path).glob("**/*")] - allowed_suffixes = [".pdf", ".txt", ".docx"] + allowed_suffixes = [".pdf", ".txt", ".docx", ".png", '.jpg'] suffix2converter: Dict[str, BaseConverter] = {} suffix2paths: Dict[str, List[Path]] = {} @@ -63,6 +63,8 @@ def convert_files_to_dicts(dir_path: str, suffix2converter[file_suffix] = TextConverter() if file_suffix == ".docx": suffix2converter[file_suffix] = DocxToTextConverter() + if file_suffix == ".png" or file_suffix == ".jpg": + suffix2converter[file_suffix] = ImageToTextConverter() documents = [] for suffix, paths in suffix2paths.items(): @@ -70,39 +72,52 @@ def convert_files_to_dicts(dir_path: str, if encoding is None and suffix == ".pdf": encoding = "Latin1" logger.info("Converting {}".format(path)) - document = suffix2converter[suffix].convert( + list_documents = suffix2converter[suffix].convert( file_path=path, meta=None, encoding=encoding, - )[0] # PDFToTextConverter, TextConverter, and DocxToTextConverter return a list containing a single dict - text = document["content"] + ) # PDFToTextConverter, TextConverter, ImageToTextConverter and DocxToTextConverter return a list containing a single dict + for document in list_documents: + text = document["content"] - if clean_func: - text = clean_func(text) + if clean_func: + text = clean_func(text) - if split_paragraphs: - for para in text.split("\n"): - if not para.strip(): # skip empty paragraphs - continue - if (split_answers): - query, answer = para.split('\t') - documents.append({ - "content": query, - "meta": { + if split_paragraphs: + for para in text.split("\n"): + if not para.strip(): # skip empty paragraphs + continue + if (split_answers): + query, answer = para.split('\t') + meta_data = {"name": path.name, "answer": answer} + # Add image list parsed from docx into meta + if (document['meta'] is not None + and 'images' in document['meta']): + meta_data['images'] = document['meta']['images'] + + documents.append({ + "content": query, + "meta": meta_data + }) + else: + meta_data = { "name": path.name, - "answer": answer, - } - }) - else: - documents.append({ - "content": para, - "meta": { - "name": path.name } - }) - else: - documents.append({"content": text, "meta": {"name": path.name}}) - + # Add image list parsed from docx into meta + if (document['meta'] is not None + and 'images' in document['meta']): + meta_data['images'] = document['meta']['images'] + documents.append({ + "content": para, + "meta": meta_data + }) + else: + documents.append({ + "content": text, + "meta": document['meta'] if 'meta' in document else { + "name": path.name + } + }) return documents diff --git a/pipelines/utils/offline_ann.py b/pipelines/utils/offline_ann.py index 8b1c6d0fabe2..a48ddb60e81e 100644 --- a/pipelines/utils/offline_ann.py +++ b/pipelines/utils/offline_ann.py @@ -24,9 +24,12 @@ data_dict = { 'data/dureader_dev': "https://paddlenlp.bj.bcebos.com/applications/dureader_dev.zip", - "data/baike": "https://paddlenlp.bj.bcebos.com/applications/baike.zip", + "data/baike": + "https://paddlenlp.bj.bcebos.com/applications/baike.zip", "data/insurance": - "https://paddlenlp.bj.bcebos.com/applications/insurance.zip" + "https://paddlenlp.bj.bcebos.com/applications/insurance.zip", + "data/file_example": + "https://paddlenlp.bj.bcebos.com/pipelines/file_examples.zip" } parser = argparse.ArgumentParser()