From 9e2a69139f87910ad75ea7e60ad88db1a1a6020a Mon Sep 17 00:00:00 2001
From: w5688414
Date: Tue, 11 Oct 2022 11:45:15 +0000
Subject: [PATCH] Add multi-file-type index update example for pipelines

---
 pipelines/examples/semantic-search/README.md | 12 +++-
 .../pipelines/nodes/file_converter/docx.py   |  3 +-
 pipelines/pipelines/utils/preprocessing.py   | 71 +++++++++++--------
 pipelines/utils/offline_ann.py               |  7 +-
 4 files changed, 61 insertions(+), 32 deletions(-)

diff --git a/pipelines/examples/semantic-search/README.md b/pipelines/examples/semantic-search/README.md
index 302a2209678e..22d849741b69 100644
--- a/pipelines/examples/semantic-search/README.md
+++ b/pipelines/examples/semantic-search/README.md
@@ -161,7 +161,17 @@ sh examples/semantic-search/run_search_web.sh
 
 #### 3.4.5 Data Update
 
-There are two ways to update the data. The first uses the `utils/offline_ann.py` script described above; the second uploads files through the web front end. Supported formats are txt, pdf, image, and word. Taking a txt file as an example, paragraphs must be separated by blank lines: the program splits the text on blank lines and indexes each paragraph. Sample data (demo.txt):
+There are two ways to update the data. The first uses the `utils/offline_ann.py` script described above; the second uploads files through the upload widget on the left side of the web front end. The script can index several file types in one pass. The example command below builds an index from three formats: images (currently all text detected in an image is merged and indexed as a single document), docx (text and images are supported; paragraphs are split on blank lines), and txt (paragraphs are split on blank lines):
+
+```
+python utils/offline_ann.py --index_name dureader_robust_query_encoder \
+                            --doc_dir data/file_example \
+                            --port 9200 \
+                            --search_engine elastic \
+                            --delete_index
+```
+
+The web front end supports txt, pdf, image, and word formats. Taking a txt file as an example, paragraphs must be separated by blank lines: the program splits the text on blank lines and indexes each paragraph. Sample data (demo.txt):
 
 ```
 兴证策略认为,最恐慌的时候已经过去,未来一个月市场迎来阶段性修复窗口。
diff --git a/pipelines/pipelines/nodes/file_converter/docx.py b/pipelines/pipelines/nodes/file_converter/docx.py
index 3d036a3ada39..d580d7eaa9f8 100644
--- a/pipelines/pipelines/nodes/file_converter/docx.py
+++ b/pipelines/pipelines/nodes/file_converter/docx.py
@@ -126,7 +126,8 @@ def convert(
             if (raw_text == ''):
                 continue
             meta_data = {}
-            meta_data['name'] = meta['name']
+            if (meta is not None and 'name' in meta):
+                meta_data['name'] = meta['name']
             meta_data['images'] = text_dict['images']
             document = {
                 "content": raw_text,
diff --git a/pipelines/pipelines/utils/preprocessing.py b/pipelines/pipelines/utils/preprocessing.py
index 29c3bb290427..5493ff400127 100644
--- a/pipelines/pipelines/utils/preprocessing.py
+++ b/pipelines/pipelines/utils/preprocessing.py
@@ -18,7 +18,7 @@ import logging
 from pathlib import Path
 
-from pipelines.nodes.file_converter import BaseConverter, DocxToTextConverter, PDFToTextConverter, TextConverter
+from pipelines.nodes.file_converter import BaseConverter, DocxToTextConverter, PDFToTextConverter, TextConverter, ImageToTextConverter
 
 logger = logging.getLogger(__name__)
 
@@ -39,7 +39,7 @@ def convert_files_to_dicts(dir_path: str,
     :param encoding: character encoding to use when converting pdf documents.
""" file_paths = [p for p in Path(dir_path).glob("**/*")] - allowed_suffixes = [".pdf", ".txt", ".docx"] + allowed_suffixes = [".pdf", ".txt", ".docx", ".png", '.jpg'] suffix2converter: Dict[str, BaseConverter] = {} suffix2paths: Dict[str, List[Path]] = {} @@ -63,6 +63,8 @@ def convert_files_to_dicts(dir_path: str, suffix2converter[file_suffix] = TextConverter() if file_suffix == ".docx": suffix2converter[file_suffix] = DocxToTextConverter() + if file_suffix == ".png" or file_suffix == ".jpg": + suffix2converter[file_suffix] = ImageToTextConverter() documents = [] for suffix, paths in suffix2paths.items(): @@ -70,39 +72,52 @@ def convert_files_to_dicts(dir_path: str, if encoding is None and suffix == ".pdf": encoding = "Latin1" logger.info("Converting {}".format(path)) - document = suffix2converter[suffix].convert( + list_documents = suffix2converter[suffix].convert( file_path=path, meta=None, encoding=encoding, - )[0] # PDFToTextConverter, TextConverter, and DocxToTextConverter return a list containing a single dict - text = document["content"] + ) # PDFToTextConverter, TextConverter, ImageToTextConverter and DocxToTextConverter return a list containing a single dict + for document in list_documents: + text = document["content"] - if clean_func: - text = clean_func(text) + if clean_func: + text = clean_func(text) - if split_paragraphs: - for para in text.split("\n"): - if not para.strip(): # skip empty paragraphs - continue - if (split_answers): - query, answer = para.split('\t') - documents.append({ - "content": query, - "meta": { + if split_paragraphs: + for para in text.split("\n"): + if not para.strip(): # skip empty paragraphs + continue + if (split_answers): + query, answer = para.split('\t') + meta_data = {"name": path.name, "answer": answer} + # Add image list parsed from docx into meta + if (document['meta'] is not None + and 'images' in document['meta']): + meta_data['images'] = document['meta']['images'] + + documents.append({ + "content": query, + "meta": meta_data + }) + else: + meta_data = { "name": path.name, - "answer": answer, - } - }) - else: - documents.append({ - "content": para, - "meta": { - "name": path.name } - }) - else: - documents.append({"content": text, "meta": {"name": path.name}}) - + # Add image list parsed from docx into meta + if (document['meta'] is not None + and 'images' in document['meta']): + meta_data['images'] = document['meta']['images'] + documents.append({ + "content": para, + "meta": meta_data + }) + else: + documents.append({ + "content": text, + "meta": document['meta'] if 'meta' in document else { + "name": path.name + } + }) return documents diff --git a/pipelines/utils/offline_ann.py b/pipelines/utils/offline_ann.py index 8b1c6d0fabe2..a48ddb60e81e 100644 --- a/pipelines/utils/offline_ann.py +++ b/pipelines/utils/offline_ann.py @@ -24,9 +24,12 @@ data_dict = { 'data/dureader_dev': "https://paddlenlp.bj.bcebos.com/applications/dureader_dev.zip", - "data/baike": "https://paddlenlp.bj.bcebos.com/applications/baike.zip", + "data/baike": + "https://paddlenlp.bj.bcebos.com/applications/baike.zip", "data/insurance": - "https://paddlenlp.bj.bcebos.com/applications/insurance.zip" + "https://paddlenlp.bj.bcebos.com/applications/insurance.zip", + "data/file_example": + "https://paddlenlp.bj.bcebos.com/pipelines/file_examples.zip" } parser = argparse.ArgumentParser()