Add ERNIE 3.0-based RocketQA into pipelines (#3078)
* Add ERNIE 3.0-based RocketQA into pipelines

* Add a create-index bash script

* Remove unused code
w5688414 authored Aug 18, 2022
1 parent a549c89 commit 8b692d0
Showing 9 changed files with 59 additions and 25 deletions.
3 changes: 2 additions & 1 deletion paddlenlp/transformers/semantic_search/modeling.py
```diff
@@ -27,7 +27,8 @@ def __init__(self, ernie, dropout=None, num_classes=2):
         super(ErnieEncoder, self).__init__()
         self.ernie = ernie  # allow ernie to be config
         self.dropout = nn.Dropout(dropout if dropout is not None else 0.1)
-        self.classifier = nn.Linear(768, num_classes)
+        self.classifier = nn.Linear(self.ernie.config["hidden_size"],
+                                    num_classes)
         self.apply(self.init_weights)

     def init_weights(self, layer):
```
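Why this matters: the classifier width was hard-coded to 768, which only matches base-sized ERNIE encoders. The ERNIE 3.0 nano encoders introduced elsewhere in this commit emit 312-dimensional hidden states (hence the `embedding_dim: 312` defaults below), so the layer is now sized from the backbone's config. A minimal sketch of the idea, using an illustrative stub in place of the real backbone:

```python
import paddle.nn as nn

# Illustrative stub, not the PaddleNLP class: a backbone whose config
# reports hidden_size=312, as the ERNIE 3.0 nano encoders do.
class NanoBackboneStub:
    config = {"hidden_size": 312}

backbone = NanoBackboneStub()
num_classes = 2

# The old nn.Linear(768, num_classes) would receive 312-dim features and
# fail with a shape mismatch; sizing from the config keeps shapes aligned.
classifier = nn.Linear(backbone.config["hidden_size"], num_classes)
print(classifier.weight.shape)  # [312, 2]; paddle stores Linear weights as [in, out]
```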
4 changes: 3 additions & 1 deletion pipelines/examples/semantic-search/README.md
````diff
@@ -154,6 +154,8 @@ xpack.security.enabled: false
 3. Check that the ES service started successfully
 ```bash
 curl http://localhost:9200/_aliases?pretty=true
+# Print a few documents
+curl http://localhost:9200/dureader_robust_query_encoder/_search
 ```
 Note: the ES service listens on port 9200 by default

@@ -186,7 +188,7 @@ sh scripts/run_search_server.sh
 After startup, you can use curl to verify that the service is running:

 ```
-curl -X POST -k http://localhost:8891/query -H 'Content-Type: application/json' -d '{"query": "亚马逊河流的介绍","params": {"Retriever": {"top_k": 5}, "Ranker":{"top_k": 5}}}'
+curl -X POST -k http://localhost:8891/query -H 'Content-Type: application/json' -d '{"query": "衡量酒水的价格的因素有哪些?","params": {"Retriever": {"top_k": 5}, "Ranker":{"top_k": 5}}}'
 ```
 #### 3.4.4 Start the WebUI
````
6 changes: 4 additions & 2 deletions pipelines/examples/semantic-search/docker/Dockerfile
```diff
@@ -1,6 +1,8 @@
-FROM w5688414/pipelines-cpu:1.2
+# linux
+FROM w5688414/pipelines-cpu-linux:1.3
+# FROM w5688414/pipelines-cpu-win:1.3
 COPY start.sh /root/start.sh
-COPY build_index.sh /root/PaddleNLP/applications/experimental/pipelines
+COPY create_index.sh /root/PaddleNLP/applications/experimental/pipelines
 COPY run_server.sh /root/PaddleNLP/applications/experimental/pipelines
 COPY run_client.sh /root/PaddleNLP/applications/experimental/pipelines
 RUN chmod +x /root/start.sh
```
4 changes: 2 additions & 2 deletions pipelines/examples/semantic-search/docker/Dockerfile-GPU
```diff
@@ -1,6 +1,6 @@
-FROM w5688414/pipelines:1.3
+FROM w5688414/pipelines-cpu-linux:1.3
 COPY start.sh /root/start.sh
-COPY build_index.sh /root/PaddleNLP/applications/experimental/pipelines
+COPY create_index.sh /root/PaddleNLP/applications/experimental/pipelines
 COPY run_server.sh /root/PaddleNLP/applications/experimental/pipelines
 COPY run_client.sh /root/PaddleNLP/applications/experimental/pipelines
 RUN chmod +x /root/start.sh
```
pipelines/examples/semantic-search/docker/create_index.sh
@@ -1,6 +1,12 @@ (updated script shown below)
```bash
unset http_proxy && unset https_proxy
export CUDA_VISIBLE_DEVICES=0
# linux
python utils/offline_ann.py --index_name dureader_robust_query_encoder \
                            --doc_dir data/dureader_dev \
                            --port 9200 \
                            --host localhost
# windows
# python utils/offline_ann.py --index_name dureader_robust_query_encoder \
#                             --doc_dir data/dureader_dev \
#                             --port 9200 \
#                             --host host.docker.internal
```
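The Windows variant targets `host.docker.internal`, the hostname Docker Desktop exposes so a process inside a container can reach services (here, Elasticsearch) running on the host machine; the Linux variant assumes Elasticsearch is reachable at `localhost` from wherever the script runs.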
4 changes: 1 addition & 3 deletions pipelines/examples/semantic-search/docker/start.sh
```diff
@@ -1,7 +1,5 @@
 #!/bin/bash
 cd /root/PaddleNLP/applications/experimental/pipelines/
-sh build_index.sh
+sh create_index.sh
 nohup sh run_server.sh > server.log 2>&1 &
-sleep 10
-lsof -i:8899
 nohup sh run_client.sh > client.log 2>&1 &
```
7 changes: 6 additions & 1 deletion pipelines/pipelines/nodes/file_converter/image.py
```diff
@@ -18,6 +18,7 @@
 import logging
 import subprocess
 from pathlib import Path
+import paddle
 from paddleocr import PaddleOCR
 try:
     from PIL.PpmImagePlugin import PpmImageFile
@@ -57,7 +58,11 @@ def __init__(
         # save init parameters to enable export of component config as YAML
         self.set_config(remove_numeric_tables=remove_numeric_tables,
                         valid_languages=valid_languages)
-        self.recognize = PaddleOCR(use_angle_cls=True, lang='ch')
+        use_gpu = True if 'gpu' in paddle.device.get_device() else False
+        self.recognize = PaddleOCR(use_angle_cls=True,
+                                   lang='ch',
+                                   use_gpu=use_gpu)
+
         super().__init__(remove_numeric_tables=remove_numeric_tables,
                          valid_languages=valid_languages)
```
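The converter previously always ran PaddleOCR on CPU; it now forwards a `use_gpu` flag derived from the active Paddle device. A hedged usage sketch of the same selection logic (the image path is a placeholder, not a file from this repo):

```python
import paddle
from paddleocr import PaddleOCR

# Mirror the converter's device selection: get_device() returns e.g. 'cpu'
# or 'gpu:0', so the substring test enables GPU inference when available.
use_gpu = 'gpu' in paddle.device.get_device()
ocr = PaddleOCR(use_angle_cls=True, lang='ch', use_gpu=use_gpu)

# 'scanned_page.png' is a placeholder path for illustration.
result = ocr.ocr('scanned_page.png')
for line in result:
    print(line)
```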
7 changes: 4 additions & 3 deletions pipelines/rest_api/pipeline/semantic_search.yaml
```diff
@@ -7,18 +7,19 @@ components:    # define all the building-blocks for Pipeline
       host: localhost
       port: 9200
       index: dureader_robust_query_encoder
+      embedding_dim: 312
   - name: Retriever
     type: DensePassageRetriever
     params:
       document_store: DocumentStore    # params can reference other components defined in the YAML
       top_k: 10
-      query_embedding_model: rocketqa-zh-dureader-query-encoder
-      passage_embedding_model: rocketqa-zh-dureader-query-encoder
+      query_embedding_model: rocketqa-zh-nano-query-encoder
+      passage_embedding_model: rocketqa-zh-nano-para-encoder
       embed_title: False
   - name: Ranker    # custom-name for the component; helpful for visualization & debugging
     type: ErnieRanker    # pipelines Class name for the component
     params:
-      model_name_or_path: rocketqa-zh-dureader-cross-encoder
+      model_name_or_path: rocketqa-nano-cross-encoder
       top_k: 3
   - name: TextFileConverter
     type: TextConverter
```
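One consistency requirement worth noting: `embedding_dim` must match the output width of the configured encoders (312 for the nano models here, versus 768 for the previous dureader models). A sketch of loading and querying the updated definition; the import path, the `load_from_yaml` call, and the `query` pipeline name are assumptions about the surrounding pipelines API, not code from this commit:

```python
from pathlib import Path

from pipelines import Pipeline  # import path is an assumption

# 'query' as the pipeline name is an assumption about the YAML's
# pipelines section, which is not shown in this diff.
pipeline = Pipeline.load_from_yaml(Path("rest_api/pipeline/semantic_search.yaml"),
                                   pipeline_name="query")

# The same parameters the curl example in the README sends to the REST API.
prediction = pipeline.run(query="衡量酒水的价格的因素有哪些?",
                          params={"Retriever": {"top_k": 5},
                                  "Ranker": {"top_k": 5}})
print(prediction)
```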
43 changes: 31 additions & 12 deletions pipelines/utils/offline_ann.py
```diff
@@ -32,6 +32,21 @@
                     default="9200",
                     help='port of elastic search')

+parser.add_argument("--embedding_dim",
+                    default=312,
+                    type=int,
+                    help="The embedding_dim of index")
+
+parser.add_argument("--query_embedding_model",
+                    default="rocketqa-zh-nano-query-encoder",
+                    type=str,
+                    help="The query_embedding_model path")
+
+parser.add_argument("--passage_embedding_model",
+                    default="rocketqa-zh-nano-para-encoder",
+                    type=str,
+                    help="The passage_embedding_model path")
+
 parser.add_argument(
     '--delete_index',
     action='store_true',
@@ -44,11 +59,13 @@ def offline_ann(index_name, doc_dir):

     launch_es()

-    document_store = ElasticsearchDocumentStore(host=args.host,
-                                                port=args.port,
-                                                username="",
-                                                password="",
-                                                index=index_name)
+    document_store = ElasticsearchDocumentStore(
+        host=args.host,
+        port=args.port,
+        username="",
+        password="",
+        embedding_dim=args.embedding_dim,
+        index=index_name)
     # Split each document into paragraphs
     dicts = convert_files_to_dicts(dir_path=doc_dir,
                                    split_paragraphs=True,
@@ -62,8 +79,8 @@
     ### Semantic indexing model
     retriever = DensePassageRetriever(
         document_store=document_store,
-        query_embedding_model="rocketqa-zh-dureader-query-encoder",
-        passage_embedding_model="rocketqa-zh-dureader-query-encoder",
+        query_embedding_model=args.query_embedding_model,
+        passage_embedding_model=args.passage_embedding_model,
         max_seq_len_query=64,
         max_seq_len_passage=256,
         batch_size=16,
@@ -76,11 +93,13 @@


 def delete_data(index_name):
-    document_store = ElasticsearchDocumentStore(host=args.host,
-                                                port=args.port,
-                                                username="",
-                                                password="",
-                                                index=index_name)
+    document_store = ElasticsearchDocumentStore(
+        host=args.host,
+        port=args.port,
+        username="",
+        password="",
+        embedding_dim=args.embedding_dim,
+        index=index_name)

     document_store.delete_index(index_name)
     print('Delete an existing elasticsearch index {} Done.'.format(index_name))
```
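With defaults pointing at the nano models, the `create_index.sh` invocation shown earlier needs no extra arguments. To index with the previous base-sized models instead, something like `python utils/offline_ann.py --index_name dureader_robust_query_encoder --doc_dir data/dureader_dev --embedding_dim 768 --query_embedding_model rocketqa-zh-dureader-query-encoder --passage_embedding_model rocketqa-zh-dureader-query-encoder` should work, since all three flags are defined above (768 as the dureader hidden size matches the old hard-coded classifier width, but this combination is an assumption, not part of the commit). Passing `embedding_dim` in both `offline_ann` and `delete_data` keeps the index mapping consistent with the retriever's output width.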
