Add multi type files index update example for pipelines #3439

Merged: 2 commits, Oct 11, 2022
12 changes: 11 additions & 1 deletion pipelines/examples/semantic-search/README.md
@@ -161,7 +161,17 @@ sh examples/semantic-search/run_search_web.sh

#### 3.4.5 Updating the data

-There are two ways to update the data. The first is to run the aforementioned `utils/offline_ann.py`; the other is to upload files through the front-end interface. Supported formats are txt, pdf, image and word. Taking a txt file as an example, each passage must be separated by a blank line; the program segments the text at blank lines and builds the index accordingly. Sample data is shown below (demo.txt):
+There are two ways to update the data. The first is to run the aforementioned `utils/offline_ann.py`; the second is to upload files through the front-end interface (the upload widget on the left side of the page). With the first, script-based approach, several file types can be indexed in one pass. An example command for updating the files and rebuilding the index is shown below; it covers three formats: images (currently supported only by merging all text recognised in the picture into one index entry), docx (text and images supported, paragraphs split at blank lines) and txt (paragraphs split at blank lines):

+```
+python utils/offline_ann.py --index_name dureader_robust_query_encoder \
+                            --doc_dir data/file_example \
+                            --port 9200 \
+                            --search_engine elastic \
+                            --delete_index
+```

+The second, interface-based approach supports txt, pdf, image and word formats. Taking a txt file as an example, each passage must be separated by a blank line; the program segments the text at blank lines and builds the index accordingly. Sample data is shown below (demo.txt):

```
兴证策略认为,最恐慌的时候已经过去,未来一个月市场迎来阶段性修复窗口。
...
```
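
The README change above notes that image files are currently indexed only by merging all text recognised in the picture into a single entry. Below is a rough sketch of that idea, assuming a PaddleOCR backend and an illustrative file name; the pipeline's actual ImageToTextConverter may use a different backend or result layout.

```
# Illustration only: "index an image" here means OCR the picture and
# concatenate every recognised line into one document.
from paddleocr import PaddleOCR

ocr = PaddleOCR(lang="ch")                      # Chinese + English recognition
result = ocr.ocr("data/file_example/sample.png")
lines = result[0]                               # recent PaddleOCR versions nest results per image
text = "\n".join(item[1][0] for item in lines)  # item == [box, (text, score)]
document = {"content": text, "meta": {"name": "sample.png"}}
print(document["content"][:200])
```

In the pipeline, this merged text then flows through the same paragraph-splitting and indexing path as txt and docx content (see the preprocessing.py changes below).
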
3 changes: 2 additions & 1 deletion pipelines/pipelines/nodes/file_converter/docx.py
@@ -126,7 +126,8 @@ def convert(
            if (raw_text == ''):
                continue
            meta_data = {}
-            meta_data['name'] = meta['name']
+            if (meta is not None and 'name' in meta):
+                meta_data['name'] = meta['name']
            meta_data['images'] = text_dict['images']
            document = {
                "content": raw_text,
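
The docx.py change above guards the `meta['name']` lookup because `convert_files_to_dicts` (shown below) calls every converter with `meta=None`, and subscripting `None` would raise a `TypeError`. A minimal sketch of the guarded behaviour, using a hypothetical helper name:

```
# Hypothetical helper, for illustration only: mirrors the guarded meta handling above.
def build_meta(meta, images):
    meta_data = {}
    if meta is not None and 'name' in meta:
        meta_data['name'] = meta['name']
    meta_data['images'] = images
    return meta_data

print(build_meta(None, ['docx_img_0.png']))     # {'images': ['docx_img_0.png']}
print(build_meta({'name': 'report.docx'}, []))  # {'name': 'report.docx', 'images': []}
```
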
71 changes: 43 additions & 28 deletions pipelines/pipelines/utils/preprocessing.py
@@ -18,7 +18,7 @@
import logging
from pathlib import Path

-from pipelines.nodes.file_converter import BaseConverter, DocxToTextConverter, PDFToTextConverter, TextConverter
+from pipelines.nodes.file_converter import BaseConverter, DocxToTextConverter, PDFToTextConverter, TextConverter, ImageToTextConverter

logger = logging.getLogger(__name__)

@@ -39,7 +39,7 @@ def convert_files_to_dicts(dir_path: str,
    :param encoding: character encoding to use when converting pdf documents.
    """
    file_paths = [p for p in Path(dir_path).glob("**/*")]
-    allowed_suffixes = [".pdf", ".txt", ".docx"]
+    allowed_suffixes = [".pdf", ".txt", ".docx", ".png", '.jpg']
    suffix2converter: Dict[str, BaseConverter] = {}

    suffix2paths: Dict[str, List[Path]] = {}
@@ -63,46 +63,61 @@
            suffix2converter[file_suffix] = TextConverter()
        if file_suffix == ".docx":
            suffix2converter[file_suffix] = DocxToTextConverter()
+        if file_suffix == ".png" or file_suffix == ".jpg":
+            suffix2converter[file_suffix] = ImageToTextConverter()

    documents = []
    for suffix, paths in suffix2paths.items():
        for path in paths:
            if encoding is None and suffix == ".pdf":
                encoding = "Latin1"
            logger.info("Converting {}".format(path))
-            document = suffix2converter[suffix].convert(
+            list_documents = suffix2converter[suffix].convert(
                file_path=path,
                meta=None,
                encoding=encoding,
-            )[0] # PDFToTextConverter, TextConverter, and DocxToTextConverter return a list containing a single dict
-            text = document["content"]
+            ) # PDFToTextConverter, TextConverter, ImageToTextConverter and DocxToTextConverter return a list containing a single dict
+            for document in list_documents:
+                text = document["content"]

-            if clean_func:
-                text = clean_func(text)
+                if clean_func:
+                    text = clean_func(text)

-            if split_paragraphs:
-                for para in text.split("\n"):
-                    if not para.strip(): # skip empty paragraphs
-                        continue
-                    if (split_answers):
-                        query, answer = para.split('\t')
-                        documents.append({
-                            "content": query,
-                            "meta": {
+                if split_paragraphs:
+                    for para in text.split("\n"):
+                        if not para.strip(): # skip empty paragraphs
+                            continue
+                        if (split_answers):
+                            query, answer = para.split('\t')
+                            meta_data = {"name": path.name, "answer": answer}
+                            # Add image list parsed from docx into meta
+                            if (document['meta'] is not None
+                                and 'images' in document['meta']):
+                                meta_data['images'] = document['meta']['images']
+
+                            documents.append({
+                                "content": query,
+                                "meta": meta_data
+                            })
+                        else:
+                            meta_data = {
                                "name": path.name,
-                                "answer": answer,
                            }
-                        })
-                    else:
-                        documents.append({
-                            "content": para,
-                            "meta": {
-                                "name": path.name
-                            }
-                        })
-            else:
-                documents.append({"content": text, "meta": {"name": path.name}})

+                            # Add image list parsed from docx into meta
+                            if (document['meta'] is not None
+                                and 'images' in document['meta']):
+                                meta_data['images'] = document['meta']['images']
+                            documents.append({
+                                "content": para,
+                                "meta": meta_data
+                            })
+                else:
+                    documents.append({
+                        "content": text,
+                        "meta": document['meta'] if 'meta' in document else {
+                            "name": path.name
+                        }
+                    })
    return documents


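
A hedged usage sketch of the updated `convert_files_to_dicts`: one call over a mixed folder of txt, docx, pdf and png/jpg files now returns a single list of document dicts, and entries derived from docx or image files may carry an `images` list in their meta. The directory name and printout below are illustrative:

```
# Illustrative only: exercise the updated converter dispatch over a mixed folder.
from pipelines.utils.preprocessing import convert_files_to_dicts

docs = convert_files_to_dicts(dir_path="data/file_example", split_paragraphs=True)
for doc in docs[:5]:
    meta = doc.get("meta") or {}
    # "images" is only present for formats that extract pictures (docx, png/jpg).
    print(meta.get("name"), len(doc["content"]), meta.get("images", []))
```
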
7 changes: 5 additions & 2 deletions pipelines/utils/offline_ann.py
@@ -24,9 +24,12 @@
data_dict = {
    'data/dureader_dev':
    "https://paddlenlp.bj.bcebos.com/applications/dureader_dev.zip",
-    "data/baike": "https://paddlenlp.bj.bcebos.com/applications/baike.zip",
+    "data/baike":
+    "https://paddlenlp.bj.bcebos.com/applications/baike.zip",
    "data/insurance":
-    "https://paddlenlp.bj.bcebos.com/applications/insurance.zip"
+    "https://paddlenlp.bj.bcebos.com/applications/insurance.zip",
+    "data/file_example":
+    "https://paddlenlp.bj.bcebos.com/pipelines/file_examples.zip"
}

parser = argparse.ArgumentParser()
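
The new `data/file_example` entry points at the sample archive used by the README command above. To fetch such an archive by hand rather than through `offline_ann.py`, a plain-Python sketch (not the script's own download logic) could look like this:

```
# Standalone illustration, not offline_ann.py's actual download code.
import os
import zipfile
from urllib.request import urlretrieve

url = "https://paddlenlp.bj.bcebos.com/pipelines/file_examples.zip"
target_dir = "data/file_example"
os.makedirs(target_dir, exist_ok=True)
archive, _ = urlretrieve(url, os.path.join(target_dir, "file_examples.zip"))
with zipfile.ZipFile(archive) as zf:
    zf.extractall(target_dir)  # unpack the sample txt / docx / png files
print(os.listdir(target_dir))
```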