From 16a7a941a19b4085f9b1457d120f74357c853772 Mon Sep 17 00:00:00 2001
From: w5688414
Date: Thu, 4 Aug 2022 16:25:38 +0800
Subject: [PATCH 1/3] Fix faiss index batch_size bug on python3.7 and update
 es config for pipelines

---
 .../pipelines/examples/question-answering/README.md |  8 +++++++-
 .../examples/question-answering/dense_qa_example.py |  4 +++-
 .../pipelines/examples/semantic-search/README.md    |  5 +++++
 .../semantic-search/semantic_search_example.py      |  4 +++-
 .../pipelines/pipelines/document_stores/faiss.py    | 10 +++++-----
 5 files changed, 23 insertions(+), 8 deletions(-)

diff --git a/applications/experimental/pipelines/examples/question-answering/README.md b/applications/experimental/pipelines/examples/question-answering/README.md
index 4305a2c10701..5ef924672187 100644
--- a/applications/experimental/pipelines/examples/question-answering/README.md
+++ b/applications/experimental/pipelines/examples/question-answering/README.md
@@ -73,8 +73,14 @@ python examples/question-answering/dense_qa_example.py --device cpu
 整个 Web 可视化问答系统主要包含 3 大组件: 1. 基于 ElasticSearch 的 ANN 服务 2. 基于 RestAPI 构建模型服务 3. 基于 Streamlit 构建 WebUI。接下来我们依次搭建这 3 个服务并串联构成可视化的问答系统
 
 #### 3.4.1 启动 ANN 服务
-1. 参考官方文档下载安装 [elasticsearch-8.1.2](https://www.elastic.co/cn/downloads/elasticsearch) 并解压。
+1. 参考官方文档下载安装 [elasticsearch-8.3.2](https://www.elastic.co/cn/downloads/elasticsearch) 并解压。
 2. 启动 ES 服务
+首先修改`config/elasticsearch.yml`的配置:
+```
+xpack.security.enabled: false
+```
+然后启动:
+
 ```bash
 ./bin/elasticsearch
 ```
diff --git a/applications/experimental/pipelines/examples/question-answering/dense_qa_example.py b/applications/experimental/pipelines/examples/question-answering/dense_qa_example.py
index d9ce9ed47520..208b4fb927f1 100644
--- a/applications/experimental/pipelines/examples/question-answering/dense_qa_example.py
+++ b/applications/experimental/pipelines/examples/question-answering/dense_qa_example.py
@@ -15,6 +15,7 @@
 parser.add_argument("--max_seq_len_query", default=64, type=int, help="The maximum total length of query after tokenization.")
 parser.add_argument("--max_seq_len_passage", default=256, type=int, help="The maximum total length of passage after tokenization.")
 parser.add_argument("--retriever_batch_size", default=16, type=int, help="The batch size of retriever to extract passage embedding for building ANN index.")
+parser.add_argument("--update_batch_size", default=100, type=int, help="The batch size of document_store to update passage embedding for building ANN index.")
 args = parser.parse_args()
 # yapf: enable
 
@@ -66,7 +67,8 @@ def dense_qa_pipeline():
     )
 
     # update Embedding
-    document_store.update_embeddings(retriever)
+    document_store.update_embeddings(retriever,
+                                     batch_size=args.update_batch_size)
 
     # save index
     document_store.save(args.index_name)
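The `--update_batch_size` flag added above only controls how many passages `document_store.update_embeddings` encodes and writes back per round. As a self-contained illustration of that batching pattern (the helper names below are invented for the sketch and are not the pipelines API):

```python
# Illustrative sketch of batched embedding updates: only one chunk of
# embeddings is encoded and flushed at a time, so memory use stays bounded.
from typing import Callable, List, Sequence


def update_embeddings(docs: Sequence[str],
                      encode: Callable[[List[str]], List[List[float]]],
                      write_back: Callable[[List[str], List[List[float]]], None],
                      batch_size: int = 100) -> None:
    for start in range(0, len(docs), batch_size):
        chunk = list(docs[start:start + batch_size])
        write_back(chunk, encode(chunk))  # flush each chunk immediately


# Usage with dummy encoder/storage, batching two documents at a time:
store = {}
update_embeddings(
    ["doc one", "doc two", "doc three"],
    encode=lambda texts: [[float(len(t))] for t in texts],   # stand-in encoder
    write_back=lambda texts, vecs: store.update(zip(texts, vecs)),
    batch_size=2,
)
print(store)  # {'doc one': [7.0], 'doc two': [7.0], 'doc three': [9.0]}
```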
diff --git a/applications/experimental/pipelines/examples/semantic-search/README.md b/applications/experimental/pipelines/examples/semantic-search/README.md
index 93fa775c3344..48932a46ae80 100644
--- a/applications/experimental/pipelines/examples/semantic-search/README.md
+++ b/applications/experimental/pipelines/examples/semantic-search/README.md
@@ -78,6 +78,11 @@ python examples/semantic-search/semantic_search_example.py --device cpu
 #### 3.4.1 启动 ANN 服务
 1. 参考官方文档下载安装 [elasticsearch-8.3.2](https://www.elastic.co/cn/downloads/elasticsearch) 并解压。
 2. 启动 ES 服务
+首先修改`config/elasticsearch.yml`的配置:
+```
+xpack.security.enabled: false
+```
+然后启动:
 ```bash
 ./bin/elasticsearch
 ```
diff --git a/applications/experimental/pipelines/examples/semantic-search/semantic_search_example.py b/applications/experimental/pipelines/examples/semantic-search/semantic_search_example.py
index 6900dde13ab6..0ab818d2a1ce 100644
--- a/applications/experimental/pipelines/examples/semantic-search/semantic_search_example.py
+++ b/applications/experimental/pipelines/examples/semantic-search/semantic_search_example.py
@@ -13,6 +13,7 @@
 parser.add_argument("--max_seq_len_query", default=64, type=int, help="The maximum total length of query after tokenization.")
 parser.add_argument("--max_seq_len_passage", default=256, type=int, help="The maximum total length of passage after tokenization.")
 parser.add_argument("--retriever_batch_size", default=16, type=int, help="The batch size of retriever to extract passage embedding for building ANN index.")
+parser.add_argument("--update_batch_size", default=100, type=int, help="The batch size of document_store to update passage embedding for building ANN index.")
 args = parser.parse_args()
 # yapf: enable
 
@@ -65,7 +66,8 @@ def semantic_search_tutorial():
     )
 
     # update Embedding
-    document_store.update_embeddings(retriever)
+    document_store.update_embeddings(retriever,
+                                     batch_size=args.update_batch_size)
 
     # save index
     document_store.save(args.index_name)
diff --git a/applications/experimental/pipelines/pipelines/document_stores/faiss.py b/applications/experimental/pipelines/pipelines/document_stores/faiss.py
index 426abd348d2c..85bc686453c3 100644
--- a/applications/experimental/pipelines/pipelines/document_stores/faiss.py
+++ b/applications/experimental/pipelines/pipelines/document_stores/faiss.py
@@ -243,7 +243,7 @@ def write_documents(
         self,
         documents: Union[List[dict], List[Document]],
         index: Optional[str] = None,
-        batch_size: int = 10_000,
+        batch_size: int = 10000,
         duplicate_documents: Optional[str] = None,
         headers: Optional[Dict[str, str]] = None,
     ) -> None:
@@ -349,7 +349,7 @@ def update_embeddings(
         filters: Optional[Dict[
             str,
             Any]] = None,  # TODO: Adapt type once we allow extended filters in FAISSDocStore
-        batch_size: int = 10_000,
+        batch_size: int = 10000,
     ):
         """
         Updates the embeddings in the the document store using the encoding model specified in the retriever.
@@ -432,7 +432,7 @@ def get_all_documents(
             str,
             Any]] = None,  # TODO: Adapt type once we allow extended filters in FAISSDocStore
         return_embedding: Optional[bool] = None,
-        batch_size: int = 10_000,
+        batch_size: int = 10000,
         headers: Optional[Dict[str, str]] = None,
     ) -> List[Document]:
         if headers:
@@ -454,7 +454,7 @@ def get_all_documents_generator(
             str,
             Any]] = None,  # TODO: Adapt type once we allow extended filters in FAISSDocStore
         return_embedding: Optional[bool] = None,
-        batch_size: int = 10_000,
+        batch_size: int = 10000,
         headers: Optional[Dict[str, str]] = None,
     ) -> Generator[Document, None, None]:
         """
@@ -493,7 +493,7 @@ def get_documents_by_id(
         self,
         ids: List[str],
         index: Optional[str] = None,
-        batch_size: int = 10_000,
+        batch_size: int = 10000,
         headers: Optional[Dict[str, str]] = None,
     ) -> List[Document]:
         if headers:

From df8bae6a848a2acbcb3eca905256230d94c2e98a Mon Sep 17 00:00:00 2001
From: w5688414
Date: Fri, 5 Aug 2022 19:23:38 +0800
Subject: [PATCH 2/3] Fix the nltk download bug and Add FAQ for mac support

---
 .../question-answering/Install_windows.md |  4 ++-
 .../examples/question-answering/README.md |  3 +-
 .../semantic-search/Install_windows.md    |  5 ++--
 .../examples/semantic-search/README.md    | 24 +++++++++++++++
 .../semantic_search_example.py            |  2 +-
 .../pipelines/utils/offline_ann.py        | 30 +++++++++++++++----
 6 files changed, 58 insertions(+), 10 deletions(-)

diff --git a/applications/experimental/pipelines/examples/question-answering/Install_windows.md b/applications/experimental/pipelines/examples/question-answering/Install_windows.md
index a2b158335807..f899ea6a4026 100644
--- a/applications/experimental/pipelines/examples/question-answering/Install_windows.md
+++ b/applications/experimental/pipelines/examples/question-answering/Install_windows.md
@@ -52,12 +52,14 @@ xpack.security.enabled: false
 
 # 以百科城市数据为例建立 ANN 索引库
 python utils/offline_ann.py --index_name baike_cities --doc_dir data/baike
 ```
-
 参数含义说明
 * `index_name`: 索引的名称
 * `doc_dir`: txt文本数据的路径
+* `host`: Elasticsearch的IP地址
+* `port`: Elasticsearch的端口号
 * `delete_index`: 是否删除现有的索引和数据,用于清空es的数据,默认为false
+
 运行成功后会输出如下的日志:
 ```
 INFO - pipelines.utils.logger - Logged parameters:
diff --git a/applications/experimental/pipelines/examples/question-answering/README.md b/applications/experimental/pipelines/examples/question-answering/README.md
index 5ef924672187..fefd7e4f45db 100644
--- a/applications/experimental/pipelines/examples/question-answering/README.md
+++ b/applications/experimental/pipelines/examples/question-answering/README.md
@@ -97,10 +97,11 @@ python utils/offline_ann.py --index_name baike_cities \
                             --doc_dir data/baike \
                             --delete_index
 ```
-
 参数含义说明
 * `index_name`: 索引的名称
 * `doc_dir`: txt文本数据的路径
+* `host`: Elasticsearch的IP地址
+* `port`: Elasticsearch的端口号
 * `delete_index`: 是否删除现有的索引和数据,用于清空es的数据,默认为false
 
 运行成功后会输出如下的日志:
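The new `host`/`port` options documented above map directly onto the `ElasticsearchDocumentStore` constructor used by `utils/offline_ann.py` (see the diff of that file later in this series). A minimal sketch of that wiring; the `connect()` wrapper is only for illustration and is not code from the repository:

```python
# Sketch of how the documented offline_ann.py options reach Elasticsearch.
# host, port, username, password and index mirror the offline_ann.py change
# in this patch series; everything else here is illustrative.
from pipelines.document_stores import ElasticsearchDocumentStore


def connect(index_name: str, host: str = "127.0.0.1", port: str = "9200"):
    # --host/--port select the Elasticsearch instance, --index_name the index inside it.
    return ElasticsearchDocumentStore(host=host,
                                      port=port,
                                      username="",
                                      password="",
                                      index=index_name)


document_store = connect("baike_cities", host="127.0.0.1", port="9200")
```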
diff --git a/applications/experimental/pipelines/examples/semantic-search/Install_windows.md b/applications/experimental/pipelines/examples/semantic-search/Install_windows.md
index 7e82145d802f..cca6be7ba210 100644
--- a/applications/experimental/pipelines/examples/semantic-search/Install_windows.md
+++ b/applications/experimental/pipelines/examples/semantic-search/Install_windows.md
@@ -47,12 +47,13 @@ xpack.security.enabled: false
 
 #### 1.4.2 文档数据写入 ANN 索引库
 ```
 # 以DuReader-Robust 数据集为例建立 ANN 索引库
-python utils/offline_ann.py --index_name dureader_robust_query_encoder --doc_dir data/dureader_robust_processed
+python utils/offline_ann.py --index_name dureader_robust_query_encoder --doc_dir data/dureader_dev
 ```
-
 参数含义说明
 * `index_name`: 索引的名称
 * `doc_dir`: txt文本数据的路径
+* `host`: Elasticsearch的IP地址
+* `port`: Elasticsearch的端口号
 * `delete_index`: 是否删除现有的索引和数据,用于清空es的数据,默认为false
 
diff --git a/applications/experimental/pipelines/examples/semantic-search/README.md b/applications/experimental/pipelines/examples/semantic-search/README.md
index 48932a46ae80..445fae4ec87b 100644
--- a/applications/experimental/pipelines/examples/semantic-search/README.md
+++ b/applications/experimental/pipelines/examples/semantic-search/README.md
@@ -102,6 +102,8 @@ python utils/offline_ann.py --index_name dureader_robust_query_encoder \
 参数含义说明
 * `index_name`: 索引的名称
 * `doc_dir`: txt文本数据的路径
+* `host`: Elasticsearch的IP地址
+* `port`: Elasticsearch的端口号
 * `delete_index`: 是否删除现有的索引和数据,用于清空es的数据,默认为false
 
 #### 3.4.3 启动 RestAPI 模型服务
@@ -116,7 +118,12 @@ Linux 用户推荐采用 Shell 脚本来启动服务::
 ```bash
 sh scripts/run_search_server.sh
 ```
+启动后可以使用curl命令验证是否成功运行:
+
+```
+curl -X POST -k http://localhost:8891/query -H 'Content-Type: application/json' -d '{"query": "亚马逊河流的介绍","params": {"Retriever": {"top_k": 5}, "Ranker":{"top_k": 5}}}'
+
+```
 
 #### 3.4.4 启动 WebUI
 ```bash
 # 配置模型服务地址
@@ -174,6 +181,23 @@ elasticsearch默认达到95%就全都设置只读,可以腾出一部分空
 cluster.routing.allocation.disk.threshold_enabled: false
 ```
 
+#### nltk_data加载失败的错误 `[nltk_data] Error loading punkt: [Errno 60] Operation timed out`
+
+在命令行里面输入python,然后输入下面的命令进行下载:
+
+```
+import nltk
+nltk.download('punkt')
+```
+如果下载还是很慢,可以手动[下载](https://github.com/nltk/nltk_data/tree/gh-pages/packages/tokenizers),然后放入本地的`~/nltk_data/tokenizers`进行解压即可。
+
+#### 服务端运行报端口占用的错误 `[Errno 48] error while attempting to bind on address ('0.0.0.0',8891): address already in use`
+
+```
+lsof -i:8891
+kill -9 PID # PID为8891端口的进程
+```
+
 ## Reference
 [1]Y. Sun et al., “[ERNIE 3.0: Large-scale Knowledge Enhanced Pre-training for Language Understanding and Generation](https://arxiv.org/pdf/2107.02137.pdf),” arXiv:2107.02137 [cs], Jul. 2021, Accessed: Jan. 17, 2022. [Online].
 Available: http://arxiv.org/abs/2107.02137
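The same health check as the curl command added in the README hunk above, issued from Python. Port 8891, the `/query` path and the request body are taken from the patch; the structure of the JSON response is an assumption to inspect against the running service rather than rely on:

```python
# Python equivalent of the curl verification command from the README above.
import requests

payload = {
    "query": "亚马逊河流的介绍",
    "params": {"Retriever": {"top_k": 5}, "Ranker": {"top_k": 5}},
}
resp = requests.post("http://localhost:8891/query", json=payload, timeout=30)
resp.raise_for_status()          # non-2xx means the REST service is not healthy
print(resp.json())               # inspect the returned documents/answers manually
```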
diff --git a/applications/experimental/pipelines/examples/semantic-search/semantic_search_example.py b/applications/experimental/pipelines/examples/semantic-search/semantic_search_example.py
index 0ab818d2a1ce..bdf709fae723 100644
--- a/applications/experimental/pipelines/examples/semantic-search/semantic_search_example.py
+++ b/applications/experimental/pipelines/examples/semantic-search/semantic_search_example.py
@@ -37,7 +37,7 @@ def semantic_search_tutorial():
             embed_title=False,
         )
     else:
-        doc_dir = "data/dureader_robust_processed"
+        doc_dir = "data/dureader_dev"
         dureader_data = "https://paddlenlp.bj.bcebos.com/applications/dureader_dev.zip"
 
         fetch_archive_from_http(url=dureader_data, output_dir=doc_dir)
diff --git a/applications/experimental/pipelines/utils/offline_ann.py b/applications/experimental/pipelines/utils/offline_ann.py
index 14e6f40f245c..485b11051a1f 100644
--- a/applications/experimental/pipelines/utils/offline_ann.py
+++ b/applications/experimental/pipelines/utils/offline_ann.py
@@ -1,11 +1,17 @@
 import argparse
 
 import paddle
-from pipelines.utils import convert_files_to_dicts
+from pipelines.utils import convert_files_to_dicts, fetch_archive_from_http
 from pipelines.document_stores import ElasticsearchDocumentStore
 from pipelines.nodes import DensePassageRetriever
 from pipelines.utils import launch_es
 
+data_dict = {
+    'data/dureader_dev':
+    "https://paddlenlp.bj.bcebos.com/applications/dureader_dev.zip",
+    "data/baike": "https://paddlenlp.bj.bcebos.com/applications/baike.zip"
+}
+
 parser = argparse.ArgumentParser()
 parser.add_argument("--index_name",
                     default='baike_cities',
@@ -15,6 +21,17 @@
                     default='data/baike/',
                     type=str,
                     help="The doc path of the corpus")
+
+parser.add_argument('--host',
+                    type=str,
+                    default="127.0.0.1",
+                    help='host ip of elastic search')
+
+parser.add_argument('--port',
+                    type=str,
+                    default="9200",
+                    help='port of elastic search')
+
 parser.add_argument(
     '--delete_index',
     action='store_true',
@@ -27,8 +44,8 @@
 
 def offline_ann(index_name, doc_dir):
     launch_es()
-    document_store = ElasticsearchDocumentStore(host="127.0.0.1",
-                                                port="9200",
+    document_store = ElasticsearchDocumentStore(host=args.host,
+                                                port=args.port,
                                                 username="",
                                                 password="",
                                                 index=index_name)
@@ -59,8 +76,8 @@
 
 
 def delete_data(index_name):
-    document_store = ElasticsearchDocumentStore(host="127.0.0.1",
-                                                port="9200",
+    document_store = ElasticsearchDocumentStore(host=args.host,
+                                                port=args.port,
                                                 username="",
                                                 password="",
                                                 index=index_name)
@@ -70,6 +87,9 @@
 
 
 if __name__ == "__main__":
+    if (args.doc_dir in data_dict):
+        fetch_archive_from_http(url=data_dict[args.doc_dir],
+                                output_dir=args.doc_dir)
     if (args.delete_index):
         delete_data(args.index_name)
     offline_ann(args.index_name, args.doc_dir)
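Taken together, the `offline_ann.py` change makes the script run in three steps: fetch the corpus archive if `--doc_dir` names one of the known datasets, optionally clear the old index, then rebuild it. A condensed sketch of that flow; the two stub functions only mark where the script's real implementations go, and only the call order and `fetch_archive_from_http` usage are taken from the patch:

```python
# Condensed control flow of utils/offline_ann.py after this commit.
from pipelines.utils import fetch_archive_from_http

DATA_DICT = {
    "data/dureader_dev": "https://paddlenlp.bj.bcebos.com/applications/dureader_dev.zip",
    "data/baike": "https://paddlenlp.bj.bcebos.com/applications/baike.zip",
}


def delete_data(index_name: str) -> None:
    """Stub: drops the existing Elasticsearch index (see the real script)."""


def offline_ann(index_name: str, doc_dir: str) -> None:
    """Stub: converts the txt corpus and writes it into the index (see the real script)."""


def main(index_name: str, doc_dir: str, delete_index: bool = False) -> None:
    if doc_dir in DATA_DICT:
        # Known corpora are downloaded and unpacked on first use, so the
        # script works from a clean checkout without manual data preparation.
        fetch_archive_from_http(url=DATA_DICT[doc_dir], output_dir=doc_dir)
    if delete_index:
        delete_data(index_name)
    offline_ann(index_name, doc_dir)
```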
From 79061a6e1bd96191401e3a9d05b1126bc2f3e2c6 Mon Sep 17 00:00:00 2001
From: w5688414
Date: Tue, 9 Aug 2022 17:04:50 +0800
Subject: [PATCH 3/3] Remove update_batch_size for faiss

---
 .../pipelines/examples/question-answering/dense_qa_example.py | 4 +---
 .../examples/semantic-search/semantic_search_example.py       | 4 +---
 .../experimental/pipelines/pipelines/document_stores/faiss.py | 2 +-
 3 files changed, 3 insertions(+), 7 deletions(-)

diff --git a/applications/experimental/pipelines/examples/question-answering/dense_qa_example.py b/applications/experimental/pipelines/examples/question-answering/dense_qa_example.py
index 208b4fb927f1..d9ce9ed47520 100644
--- a/applications/experimental/pipelines/examples/question-answering/dense_qa_example.py
+++ b/applications/experimental/pipelines/examples/question-answering/dense_qa_example.py
@@ -15,7 +15,6 @@
 parser.add_argument("--max_seq_len_query", default=64, type=int, help="The maximum total length of query after tokenization.")
 parser.add_argument("--max_seq_len_passage", default=256, type=int, help="The maximum total length of passage after tokenization.")
 parser.add_argument("--retriever_batch_size", default=16, type=int, help="The batch size of retriever to extract passage embedding for building ANN index.")
-parser.add_argument("--update_batch_size", default=100, type=int, help="The batch size of document_store to update passage embedding for building ANN index.")
 args = parser.parse_args()
 # yapf: enable
 
@@ -67,8 +66,7 @@ def dense_qa_pipeline():
     )
 
     # update Embedding
-    document_store.update_embeddings(retriever,
-                                     batch_size=args.update_batch_size)
+    document_store.update_embeddings(retriever)
 
     # save index
     document_store.save(args.index_name)
diff --git a/applications/experimental/pipelines/examples/semantic-search/semantic_search_example.py b/applications/experimental/pipelines/examples/semantic-search/semantic_search_example.py
index bdf709fae723..390408198a14 100644
--- a/applications/experimental/pipelines/examples/semantic-search/semantic_search_example.py
+++ b/applications/experimental/pipelines/examples/semantic-search/semantic_search_example.py
@@ -13,7 +13,6 @@
 parser.add_argument("--max_seq_len_query", default=64, type=int, help="The maximum total length of query after tokenization.")
 parser.add_argument("--max_seq_len_passage", default=256, type=int, help="The maximum total length of passage after tokenization.")
 parser.add_argument("--retriever_batch_size", default=16, type=int, help="The batch size of retriever to extract passage embedding for building ANN index.")
-parser.add_argument("--update_batch_size", default=100, type=int, help="The batch size of document_store to update passage embedding for building ANN index.")
 args = parser.parse_args()
 # yapf: enable
 
@@ -66,8 +65,7 @@ def semantic_search_tutorial():
     )
 
     # update Embedding
-    document_store.update_embeddings(retriever,
-                                     batch_size=args.update_batch_size)
+    document_store.update_embeddings(retriever)
 
     # save index
     document_store.save(args.index_name)
diff --git a/applications/experimental/pipelines/pipelines/document_stores/faiss.py b/applications/experimental/pipelines/pipelines/document_stores/faiss.py
index 85bc686453c3..2a59e72d877a 100644
--- a/applications/experimental/pipelines/pipelines/document_stores/faiss.py
+++ b/applications/experimental/pipelines/pipelines/document_stores/faiss.py
@@ -243,7 +243,7 @@ def write_documents(
         self,
         documents: Union[List[dict], List[Document]],
         index: Optional[str] = None,
-        batch_size: int = 10000,
+        batch_size: int = 1000,
         duplicate_documents: Optional[str] = None,
         headers: Optional[Dict[str, str]] = None,
     ) -> None:
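After this final commit the batch size is no longer a CLI flag: callers rely on the document store defaults (1000 for `write_documents`, 10000 for `update_embeddings`) or pass `batch_size` explicitly per call. A minimal sketch of how those defaults are exercised; the constructor argument, the document field name and the commented-out overrides are placeholders rather than values taken from this patch:

```python
# Where the batching defaults land after PATCH 3/3: write_documents flushes in
# chunks of 1000, update_embeddings in chunks of 10000, and both still accept
# an explicit batch_size.
from pipelines.document_stores import FAISSDocumentStore

document_store = FAISSDocumentStore(embedding_dim=768)   # placeholder dimension
docs = [{"content": "北京是中国的首都。"},
        {"content": "上海位于长江入海口。"}]              # field name assumed

document_store.write_documents(docs)                      # default batch_size=1000
# document_store.write_documents(docs, batch_size=256)    # explicit per-call override

# Embedding updates follow the same pattern once a retriever is constructed:
#     document_store.update_embeddings(retriever)                  # batch_size=10000
#     document_store.update_embeddings(retriever, batch_size=100)  # explicit override
```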