Add multi type files index update example for pipelines #3439

Merged: 2 commits, Oct 11, 2022
12 changes: 11 additions & 1 deletion pipelines/examples/semantic-search/README.md
@@ -161,7 +161,17 @@ sh examples/semantic-search/run_search_web.sh

#### 3.4.5 Updating the data

-There are two ways to update the data. The first is to run the aforementioned `utils/offline_ann.py`; the other is to upload files through the front-end interface. Supported formats are txt, pdf, image and word. Taking a txt file as an example, each passage must be separated by a blank line; the program segments the text at blank lines and builds the index accordingly. Sample data is shown below (demo.txt):
+There are two ways to update the data. The first is to run the aforementioned `utils/offline_ann.py`; the second is to upload files through the front-end interface (the upload widget on the left side of the page). With the first, script-based approach, several file types can be indexed in one pass. An example command for updating the files and rebuilding the index is shown below; it covers three formats: images (currently supported only by merging all text recognised in the picture into one index entry), docx (text and images supported, paragraphs split at blank lines) and txt (paragraphs split at blank lines):

+```
+python utils/offline_ann.py --index_name dureader_robust_query_encoder \
+                            --doc_dir data/file_example \
+                            --port 9200 \
+                            --search_engine elastic \
+                            --delete_index
+```

+The second, interface-based approach supports txt, pdf, image and word formats. Taking a txt file as an example, each passage must be separated by a blank line; the program segments the text at blank lines and builds the index accordingly. Sample data is shown below (demo.txt):

```
兴证策略认为,最恐慌的时候已经过去,未来一个月市场迎来阶段性修复窗口。
...
```
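
The README change above notes that image files are currently indexed only by merging all text recognised in the picture into a single entry. Below is a rough sketch of that idea, assuming a PaddleOCR backend and an illustrative file name; the pipeline's actual ImageToTextConverter may use a different backend or result layout.

```
# Illustration only: "index an image" here means OCR the picture and
# concatenate every recognised line into one document.
from paddleocr import PaddleOCR

ocr = PaddleOCR(lang="ch")                      # Chinese + English recognition
result = ocr.ocr("data/file_example/sample.png")
lines = result[0]                               # recent PaddleOCR versions nest results per image
text = "\n".join(item[1][0] for item in lines)  # item == [box, (text, score)]
document = {"content": text, "meta": {"name": "sample.png"}}
print(document["content"][:200])
```

In the pipeline, this merged text then flows through the same paragraph-splitting and indexing path as txt and docx content (see the preprocessing.py changes below).
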
3 changes: 2 additions & 1 deletion pipelines/pipelines/nodes/file_converter/docx.py
@@ -126,7 +126,8 @@ def convert(
            if (raw_text == ''):
                continue
            meta_data = {}
-            meta_data['name'] = meta['name']
+            if (meta is not None and 'name' in meta):
+                meta_data['name'] = meta['name']
            meta_data['images'] = text_dict['images']
            document = {
                "content": raw_text,
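
The docx.py change above guards the `meta['name']` lookup because `convert_files_to_dicts` (shown below) calls every converter with `meta=None`, and subscripting `None` would raise a `TypeError`. A minimal sketch of the guarded behaviour, using a hypothetical helper name:

```
# Hypothetical helper, for illustration only: mirrors the guarded meta handling above.
def build_meta(meta, images):
    meta_data = {}
    if meta is not None and 'name' in meta:
        meta_data['name'] = meta['name']
    meta_data['images'] = images
    return meta_data

print(build_meta(None, ['docx_img_0.png']))     # {'images': ['docx_img_0.png']}
print(build_meta({'name': 'report.docx'}, []))  # {'name': 'report.docx', 'images': []}
```
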
71 changes: 43 additions & 28 deletions pipelines/pipelines/utils/preprocessing.py
@@ -18,7 +18,7 @@
import logging
from pathlib import Path

-from pipelines.nodes.file_converter import BaseConverter, DocxToTextConverter, PDFToTextConverter, TextConverter
+from pipelines.nodes.file_converter import BaseConverter, DocxToTextConverter, PDFToTextConverter, TextConverter, ImageToTextConverter

logger = logging.getLogger(__name__)

@@ -39,7 +39,7 @@ def convert_files_to_dicts(dir_path: str,
    :param encoding: character encoding to use when converting pdf documents.
    """
    file_paths = [p for p in Path(dir_path).glob("**/*")]
-    allowed_suffixes = [".pdf", ".txt", ".docx"]
+    allowed_suffixes = [".pdf", ".txt", ".docx", ".png", '.jpg']
    suffix2converter: Dict[str, BaseConverter] = {}

    suffix2paths: Dict[str, List[Path]] = {}
@@ -63,46 +63,61 @@
            suffix2converter[file_suffix] = TextConverter()
        if file_suffix == ".docx":
            suffix2converter[file_suffix] = DocxToTextConverter()
+        if file_suffix == ".png" or file_suffix == ".jpg":
+            suffix2converter[file_suffix] = ImageToTextConverter()

    documents = []
    for suffix, paths in suffix2paths.items():
        for path in paths:
            if encoding is None and suffix == ".pdf":
                encoding = "Latin1"
            logger.info("Converting {}".format(path))
-            document = suffix2converter[suffix].convert(
+            list_documents = suffix2converter[suffix].convert(
                file_path=path,
                meta=None,
                encoding=encoding,
-            )[0] # PDFToTextConverter, TextConverter, and DocxToTextConverter return a list containing a single dict
-            text = document["content"]
+            ) # PDFToTextConverter, TextConverter, ImageToTextConverter and DocxToTextConverter return a list containing a single dict
+            for document in list_documents:
+                text = document["content"]

-            if clean_func:
-                text = clean_func(text)
+                if clean_func:
+                    text = clean_func(text)

-            if split_paragraphs:
-                for para in text.split("\n"):
-                    if not para.strip(): # skip empty paragraphs
-                        continue
-                    if (split_answers):
-                        query, answer = para.split('\t')
-                        documents.append({
-                            "content": query,
-                            "meta": {
+                if split_paragraphs:
+                    for para in text.split("\n"):
+                        if not para.strip(): # skip empty paragraphs
+                            continue
+                        if (split_answers):
+                            query, answer = para.split('\t')
+                            meta_data = {"name": path.name, "answer": answer}
+                            # Add image list parsed from docx into meta
+                            if (document['meta'] is not None
+                                and 'images' in document['meta']):
+                                meta_data['images'] = document['meta']['images']
+
+                            documents.append({
+                                "content": query,
+                                "meta": meta_data
+                            })
+                        else:
+                            meta_data = {
                                "name": path.name,
-                                "answer": answer,
                            }
-                        })
-                    else:
-                        documents.append({
-                            "content": para,
-                            "meta": {
-                                "name": path.name
-                            }
-                        })
-            else:
-                documents.append({"content": text, "meta": {"name": path.name}})

+                            # Add image list parsed from docx into meta
+                            if (document['meta'] is not None
+                                and 'images' in document['meta']):
+                                meta_data['images'] = document['meta']['images']
+                            documents.append({
+                                "content": para,
+                                "meta": meta_data
+                            })
+                else:
+                    documents.append({
+                        "content": text,
+                        "meta": document['meta'] if 'meta' in document else {
+                            "name": path.name
+                        }
+                    })
    return documents


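
A hedged usage sketch of the updated `convert_files_to_dicts`: one call over a mixed folder of txt, docx, pdf and png/jpg files now returns a single list of document dicts, and entries derived from docx or image files may carry an `images` list in their meta. The directory name and printout below are illustrative:

```
# Illustrative only: exercise the updated converter dispatch over a mixed folder.
from pipelines.utils.preprocessing import convert_files_to_dicts

docs = convert_files_to_dicts(dir_path="data/file_example", split_paragraphs=True)
for doc in docs[:5]:
    meta = doc.get("meta") or {}
    # "images" is only present for formats that extract pictures (docx, png/jpg).
    print(meta.get("name"), len(doc["content"]), meta.get("images", []))
```
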
7 changes: 5 additions & 2 deletions pipelines/utils/offline_ann.py
@@ -24,9 +24,12 @@
data_dict = {
    'data/dureader_dev':
    "https://paddlenlp.bj.bcebos.com/applications/dureader_dev.zip",
-    "data/baike": "https://paddlenlp.bj.bcebos.com/applications/baike.zip",
+    "data/baike":
+    "https://paddlenlp.bj.bcebos.com/applications/baike.zip",
    "data/insurance":
-    "https://paddlenlp.bj.bcebos.com/applications/insurance.zip"
+    "https://paddlenlp.bj.bcebos.com/applications/insurance.zip",
+    "data/file_example":
+    "https://paddlenlp.bj.bcebos.com/pipelines/file_examples.zip"
}

parser = argparse.ArgumentParser()
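
The new `data/file_example` entry points at the sample archive used by the README command above. To fetch such an archive by hand rather than through `offline_ann.py`, a plain-Python sketch (not the script's own download logic) could look like this:

```
# Standalone illustration, not offline_ann.py's actual download code.
import os
import zipfile
from urllib.request import urlretrieve

url = "https://paddlenlp.bj.bcebos.com/pipelines/file_examples.zip"
target_dir = "data/file_example"
os.makedirs(target_dir, exist_ok=True)
archive, _ = urlretrieve(url, os.path.join(target_dir, "file_examples.zip"))
with zipfile.ZipFile(archive) as zf:
    zf.extractall(target_dir)  # unpack the sample txt / docx / png files
print(os.listdir(target_dir))
```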