Add ERNIE 3.0-based RocketQA into pipelines (#3078)
* Add ERNIE 3.0-based RocketQA into pipelines

* Add a create-index bash script

* Remove unused code
w5688414 authored Aug 18, 2022
1 parent a549c89 commit 8b692d0
Showing 9 changed files with 59 additions and 25 deletions.
3 changes: 2 additions & 1 deletion paddlenlp/transformers/semantic_search/modeling.py
```diff
@@ -27,7 +27,8 @@ def __init__(self, ernie, dropout=None, num_classes=2):
         super(ErnieEncoder, self).__init__()
         self.ernie = ernie  # allow ernie to be config
         self.dropout = nn.Dropout(dropout if dropout is not None else 0.1)
-        self.classifier = nn.Linear(768, num_classes)
+        self.classifier = nn.Linear(self.ernie.config["hidden_size"],
+                                    num_classes)
         self.apply(self.init_weights)

     def init_weights(self, layer):
```
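Why this matters: the classifier width was hard-coded to 768, which only matches base-sized ERNIE encoders. The ERNIE 3.0 nano encoders introduced elsewhere in this commit emit 312-dimensional hidden states (hence the `embedding_dim: 312` defaults below), so the layer is now sized from the backbone's config. A minimal sketch of the idea, using an illustrative stub in place of the real backbone:

```python
import paddle.nn as nn

# Illustrative stub, not the PaddleNLP class: a backbone whose config
# reports hidden_size=312, as the ERNIE 3.0 nano encoders do.
class NanoBackboneStub:
    config = {"hidden_size": 312}

backbone = NanoBackboneStub()
num_classes = 2

# The old nn.Linear(768, num_classes) would receive 312-dim features and
# fail with a shape mismatch; sizing from the config keeps shapes aligned.
classifier = nn.Linear(backbone.config["hidden_size"], num_classes)
print(classifier.weight.shape)  # [312, 2]; paddle stores Linear weights as [in, out]
```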
4 changes: 3 additions & 1 deletion pipelines/examples/semantic-search/README.md
````diff
@@ -154,6 +154,8 @@ xpack.security.enabled: false
 3. Check that the ES service started successfully
 ```bash
 curl http://localhost:9200/_aliases?pretty=true
+# Print a few documents
+curl http://localhost:9200/dureader_robust_query_encoder/_search
 ```
 Note: the ES service listens on port 9200 by default

@@ -186,7 +188,7 @@ sh scripts/run_search_server.sh
 After startup, you can use curl to verify that the service is running:

 ```
-curl -X POST -k http://localhost:8891/query -H 'Content-Type: application/json' -d '{"query": "亚马逊河流的介绍","params": {"Retriever": {"top_k": 5}, "Ranker":{"top_k": 5}}}'
+curl -X POST -k http://localhost:8891/query -H 'Content-Type: application/json' -d '{"query": "衡量酒水的价格的因素有哪些?","params": {"Retriever": {"top_k": 5}, "Ranker":{"top_k": 5}}}'
 ```
 #### 3.4.4 Start the WebUI
````
6 changes: 4 additions & 2 deletions pipelines/examples/semantic-search/docker/Dockerfile
```diff
@@ -1,6 +1,8 @@
-FROM w5688414/pipelines-cpu:1.2
+# linux
+FROM w5688414/pipelines-cpu-linux:1.3
+# FROM w5688414/pipelines-cpu-win:1.3
 COPY start.sh /root/start.sh
-COPY build_index.sh /root/PaddleNLP/applications/experimental/pipelines
+COPY create_index.sh /root/PaddleNLP/applications/experimental/pipelines
 COPY run_server.sh /root/PaddleNLP/applications/experimental/pipelines
 COPY run_client.sh /root/PaddleNLP/applications/experimental/pipelines
 RUN chmod +x /root/start.sh
```
4 changes: 2 additions & 2 deletions pipelines/examples/semantic-search/docker/Dockerfile-GPU
```diff
@@ -1,6 +1,6 @@
-FROM w5688414/pipelines:1.3
+FROM w5688414/pipelines-cpu-linux:1.3
 COPY start.sh /root/start.sh
-COPY build_index.sh /root/PaddleNLP/applications/experimental/pipelines
+COPY create_index.sh /root/PaddleNLP/applications/experimental/pipelines
 COPY run_server.sh /root/PaddleNLP/applications/experimental/pipelines
 COPY run_client.sh /root/PaddleNLP/applications/experimental/pipelines
 RUN chmod +x /root/start.sh
```
pipelines/examples/semantic-search/docker/create_index.sh
@@ -1,6 +1,12 @@ (updated script shown below)
```bash
unset http_proxy && unset https_proxy
export CUDA_VISIBLE_DEVICES=0
# linux
python utils/offline_ann.py --index_name dureader_robust_query_encoder \
                            --doc_dir data/dureader_dev \
                            --port 9200 \
                            --host localhost
# windows
# python utils/offline_ann.py --index_name dureader_robust_query_encoder \
#                             --doc_dir data/dureader_dev \
#                             --port 9200 \
#                             --host host.docker.internal
```
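The Windows variant targets `host.docker.internal`, the hostname Docker Desktop exposes so a process inside a container can reach services (here, Elasticsearch) running on the host machine; the Linux variant assumes Elasticsearch is reachable at `localhost` from wherever the script runs.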
4 changes: 1 addition & 3 deletions pipelines/examples/semantic-search/docker/start.sh
```diff
@@ -1,7 +1,5 @@
 #!/bin/bash
 cd /root/PaddleNLP/applications/experimental/pipelines/
-sh build_index.sh
+sh create_index.sh
 nohup sh run_server.sh > server.log 2>&1 &
-sleep 10
-lsof -i:8899
 nohup sh run_client.sh > client.log 2>&1 &
```
7 changes: 6 additions & 1 deletion pipelines/pipelines/nodes/file_converter/image.py
```diff
@@ -18,6 +18,7 @@
 import logging
 import subprocess
 from pathlib import Path
+import paddle
 from paddleocr import PaddleOCR
 try:
     from PIL.PpmImagePlugin import PpmImageFile
@@ -57,7 +58,11 @@ def __init__(
         # save init parameters to enable export of component config as YAML
         self.set_config(remove_numeric_tables=remove_numeric_tables,
                         valid_languages=valid_languages)
-        self.recognize = PaddleOCR(use_angle_cls=True, lang='ch')
+        use_gpu = True if 'gpu' in paddle.device.get_device() else False
+        self.recognize = PaddleOCR(use_angle_cls=True,
+                                   lang='ch',
+                                   use_gpu=use_gpu)
+
         super().__init__(remove_numeric_tables=remove_numeric_tables,
                          valid_languages=valid_languages)
```
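The converter previously always ran PaddleOCR on CPU; it now forwards a `use_gpu` flag derived from the active Paddle device. A hedged usage sketch of the same selection logic (the image path is a placeholder, not a file from this repo):

```python
import paddle
from paddleocr import PaddleOCR

# Mirror the converter's device selection: get_device() returns e.g. 'cpu'
# or 'gpu:0', so the substring test enables GPU inference when available.
use_gpu = 'gpu' in paddle.device.get_device()
ocr = PaddleOCR(use_angle_cls=True, lang='ch', use_gpu=use_gpu)

# 'scanned_page.png' is a placeholder path for illustration.
result = ocr.ocr('scanned_page.png')
for line in result:
    print(line)
```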
7 changes: 4 additions & 3 deletions pipelines/rest_api/pipeline/semantic_search.yaml
```diff
@@ -7,18 +7,19 @@ components:    # define all the building-blocks for Pipeline
       host: localhost
       port: 9200
       index: dureader_robust_query_encoder
+      embedding_dim: 312
   - name: Retriever
     type: DensePassageRetriever
     params:
       document_store: DocumentStore    # params can reference other components defined in the YAML
       top_k: 10
-      query_embedding_model: rocketqa-zh-dureader-query-encoder
-      passage_embedding_model: rocketqa-zh-dureader-query-encoder
+      query_embedding_model: rocketqa-zh-nano-query-encoder
+      passage_embedding_model: rocketqa-zh-nano-para-encoder
       embed_title: False
   - name: Ranker    # custom-name for the component; helpful for visualization & debugging
     type: ErnieRanker    # pipelines Class name for the component
     params:
-      model_name_or_path: rocketqa-zh-dureader-cross-encoder
+      model_name_or_path: rocketqa-nano-cross-encoder
       top_k: 3
   - name: TextFileConverter
     type: TextConverter
```
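One consistency requirement worth noting: `embedding_dim` must match the output width of the configured encoders (312 for the nano models here, versus 768 for the previous dureader models). A sketch of loading and querying the updated definition; the import path, the `load_from_yaml` call, and the `query` pipeline name are assumptions about the surrounding pipelines API, not code from this commit:

```python
from pathlib import Path

from pipelines import Pipeline  # import path is an assumption

# 'query' as the pipeline name is an assumption about the YAML's
# pipelines section, which is not shown in this diff.
pipeline = Pipeline.load_from_yaml(Path("rest_api/pipeline/semantic_search.yaml"),
                                   pipeline_name="query")

# The same parameters the curl example in the README sends to the REST API.
prediction = pipeline.run(query="衡量酒水的价格的因素有哪些?",
                          params={"Retriever": {"top_k": 5},
                                  "Ranker": {"top_k": 5}})
print(prediction)
```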
43 changes: 31 additions & 12 deletions pipelines/utils/offline_ann.py
```diff
@@ -32,6 +32,21 @@
                     default="9200",
                     help='port of elastic search')

+parser.add_argument("--embedding_dim",
+                    default=312,
+                    type=int,
+                    help="The embedding_dim of index")
+
+parser.add_argument("--query_embedding_model",
+                    default="rocketqa-zh-nano-query-encoder",
+                    type=str,
+                    help="The query_embedding_model path")
+
+parser.add_argument("--passage_embedding_model",
+                    default="rocketqa-zh-nano-para-encoder",
+                    type=str,
+                    help="The passage_embedding_model path")
+
 parser.add_argument(
     '--delete_index',
     action='store_true',
@@ -44,11 +59,13 @@ def offline_ann(index_name, doc_dir):

     launch_es()

-    document_store = ElasticsearchDocumentStore(host=args.host,
-                                                port=args.port,
-                                                username="",
-                                                password="",
-                                                index=index_name)
+    document_store = ElasticsearchDocumentStore(
+        host=args.host,
+        port=args.port,
+        username="",
+        password="",
+        embedding_dim=args.embedding_dim,
+        index=index_name)
     # Split each document into paragraphs
     dicts = convert_files_to_dicts(dir_path=doc_dir,
                                    split_paragraphs=True,
@@ -62,8 +79,8 @@
     ### Semantic indexing model
     retriever = DensePassageRetriever(
         document_store=document_store,
-        query_embedding_model="rocketqa-zh-dureader-query-encoder",
-        passage_embedding_model="rocketqa-zh-dureader-query-encoder",
+        query_embedding_model=args.query_embedding_model,
+        passage_embedding_model=args.passage_embedding_model,
         max_seq_len_query=64,
         max_seq_len_passage=256,
         batch_size=16,
@@ -76,11 +93,13 @@


 def delete_data(index_name):
-    document_store = ElasticsearchDocumentStore(host=args.host,
-                                                port=args.port,
-                                                username="",
-                                                password="",
-                                                index=index_name)
+    document_store = ElasticsearchDocumentStore(
+        host=args.host,
+        port=args.port,
+        username="",
+        password="",
+        embedding_dim=args.embedding_dim,
+        index=index_name)

     document_store.delete_index(index_name)
     print('Delete an existing elasticsearch index {} Done.'.format(index_name))
```
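With defaults pointing at the nano models, the `create_index.sh` invocation shown earlier needs no extra arguments. To index with the previous base-sized models instead, something like `python utils/offline_ann.py --index_name dureader_robust_query_encoder --doc_dir data/dureader_dev --embedding_dim 768 --query_embedding_model rocketqa-zh-dureader-query-encoder --passage_embedding_model rocketqa-zh-dureader-query-encoder` should work, since all three flags are defined above (768 as the dureader hidden size matches the old hard-coded classifier width, but this combination is an assumption, not part of the commit). Passing `embedding_dim` in both `offline_ann` and `delete_data` keeps the index mapping consistent with the retriever's output width.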
