From 16a7a941a19b4085f9b1457d120f74357c853772 Mon Sep 17 00:00:00 2001
From: w5688414
Date: Thu, 4 Aug 2022 16:25:38 +0800
Subject: [PATCH 1/3] Fix faiss index batch_size bug on python3.7 and update
 es config for pipelines

---
 .../pipelines/examples/question-answering/README.md |  8 +++++++-
 .../examples/question-answering/dense_qa_example.py |  4 +++-
 .../pipelines/examples/semantic-search/README.md    |  5 +++++
 .../semantic-search/semantic_search_example.py      |  4 +++-
 .../pipelines/pipelines/document_stores/faiss.py    | 10 +++++-----
 5 files changed, 23 insertions(+), 8 deletions(-)

diff --git a/applications/experimental/pipelines/examples/question-answering/README.md b/applications/experimental/pipelines/examples/question-answering/README.md
index 4305a2c10701..5ef924672187 100644
--- a/applications/experimental/pipelines/examples/question-answering/README.md
+++ b/applications/experimental/pipelines/examples/question-answering/README.md
@@ -73,8 +73,14 @@ python examples/question-answering/dense_qa_example.py --device cpu
 整个 Web 可视化问答系统主要包含 3 大组件: 1. 基于 ElasticSearch 的 ANN 服务 2. 基于 RestAPI 构建模型服务 3. 基于 Streamlit 构建 WebUI。接下来我们依次搭建这 3 个服务并串联构成可视化的问答系统
 
 #### 3.4.1 启动 ANN 服务
-1. 参考官方文档下载安装 [elasticsearch-8.1.2](https://www.elastic.co/cn/downloads/elasticsearch) 并解压。
+1. 参考官方文档下载安装 [elasticsearch-8.3.2](https://www.elastic.co/cn/downloads/elasticsearch) 并解压。
 2. 启动 ES 服务
+首先修改`config/elasticsearch.yml`的配置:
+```
+xpack.security.enabled: false
+```
+然后启动:
+
 ```bash
 ./bin/elasticsearch
 ```
diff --git a/applications/experimental/pipelines/examples/question-answering/dense_qa_example.py b/applications/experimental/pipelines/examples/question-answering/dense_qa_example.py
index d9ce9ed47520..208b4fb927f1 100644
--- a/applications/experimental/pipelines/examples/question-answering/dense_qa_example.py
+++ b/applications/experimental/pipelines/examples/question-answering/dense_qa_example.py
@@ -15,6 +15,7 @@
 parser.add_argument("--max_seq_len_query", default=64, type=int, help="The maximum total length of query after tokenization.")
 parser.add_argument("--max_seq_len_passage", default=256, type=int, help="The maximum total length of passage after tokenization.")
 parser.add_argument("--retriever_batch_size", default=16, type=int, help="The batch size of retriever to extract passage embedding for building ANN index.")
+parser.add_argument("--update_batch_size", default=100, type=int, help="The batch size of document_store to update passage embedding for building ANN index.")
 args = parser.parse_args()
 # yapf: enable
 
@@ -66,7 +67,8 @@ def dense_qa_pipeline():
     )
 
     # update Embedding
-    document_store.update_embeddings(retriever)
+    document_store.update_embeddings(retriever,
+                                     batch_size=args.update_batch_size)
 
     # save index
     document_store.save(args.index_name)
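The `--update_batch_size` flag added above only controls how many passages `document_store.update_embeddings` encodes and writes back per round. As a self-contained illustration of that batching pattern (the helper names below are invented for the sketch and are not the pipelines API):

```python
# Illustrative sketch of batched embedding updates: only one chunk of
# embeddings is encoded and flushed at a time, so memory use stays bounded.
from typing import Callable, List, Sequence


def update_embeddings(docs: Sequence[str],
                      encode: Callable[[List[str]], List[List[float]]],
                      write_back: Callable[[List[str], List[List[float]]], None],
                      batch_size: int = 100) -> None:
    for start in range(0, len(docs), batch_size):
        chunk = list(docs[start:start + batch_size])
        write_back(chunk, encode(chunk))  # flush each chunk immediately


# Usage with dummy encoder/storage, batching two documents at a time:
store = {}
update_embeddings(
    ["doc one", "doc two", "doc three"],
    encode=lambda texts: [[float(len(t))] for t in texts],   # stand-in encoder
    write_back=lambda texts, vecs: store.update(zip(texts, vecs)),
    batch_size=2,
)
print(store)  # {'doc one': [7.0], 'doc two': [7.0], 'doc three': [9.0]}
```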
diff --git a/applications/experimental/pipelines/examples/semantic-search/README.md b/applications/experimental/pipelines/examples/semantic-search/README.md
index 93fa775c3344..48932a46ae80 100644
--- a/applications/experimental/pipelines/examples/semantic-search/README.md
+++ b/applications/experimental/pipelines/examples/semantic-search/README.md
@@ -78,6 +78,11 @@ python examples/semantic-search/semantic_search_example.py --device cpu
 #### 3.4.1 启动 ANN 服务
 1. 参考官方文档下载安装 [elasticsearch-8.3.2](https://www.elastic.co/cn/downloads/elasticsearch) 并解压。
 2. 启动 ES 服务
+首先修改`config/elasticsearch.yml`的配置:
+```
+xpack.security.enabled: false
+```
+然后启动:
 ```bash
 ./bin/elasticsearch
 ```
diff --git a/applications/experimental/pipelines/examples/semantic-search/semantic_search_example.py b/applications/experimental/pipelines/examples/semantic-search/semantic_search_example.py
index 6900dde13ab6..0ab818d2a1ce 100644
--- a/applications/experimental/pipelines/examples/semantic-search/semantic_search_example.py
+++ b/applications/experimental/pipelines/examples/semantic-search/semantic_search_example.py
@@ -13,6 +13,7 @@
 parser.add_argument("--max_seq_len_query", default=64, type=int, help="The maximum total length of query after tokenization.")
 parser.add_argument("--max_seq_len_passage", default=256, type=int, help="The maximum total length of passage after tokenization.")
 parser.add_argument("--retriever_batch_size", default=16, type=int, help="The batch size of retriever to extract passage embedding for building ANN index.")
+parser.add_argument("--update_batch_size", default=100, type=int, help="The batch size of document_store to update passage embedding for building ANN index.")
 args = parser.parse_args()
 # yapf: enable
 
@@ -65,7 +66,8 @@ def semantic_search_tutorial():
     )
 
     # update Embedding
-    document_store.update_embeddings(retriever)
+    document_store.update_embeddings(retriever,
+                                     batch_size=args.update_batch_size)
 
     # save index
     document_store.save(args.index_name)
diff --git a/applications/experimental/pipelines/pipelines/document_stores/faiss.py b/applications/experimental/pipelines/pipelines/document_stores/faiss.py
index 426abd348d2c..85bc686453c3 100644
--- a/applications/experimental/pipelines/pipelines/document_stores/faiss.py
+++ b/applications/experimental/pipelines/pipelines/document_stores/faiss.py
@@ -243,7 +243,7 @@ def write_documents(
         self,
         documents: Union[List[dict], List[Document]],
         index: Optional[str] = None,
-        batch_size: int = 10_000,
+        batch_size: int = 10000,
         duplicate_documents: Optional[str] = None,
         headers: Optional[Dict[str, str]] = None,
     ) -> None:
@@ -349,7 +349,7 @@ def update_embeddings(
         filters: Optional[Dict[
             str,
             Any]] = None,  # TODO: Adapt type once we allow extended filters in FAISSDocStore
-        batch_size: int = 10_000,
+        batch_size: int = 10000,
     ):
         """
         Updates the embeddings in the the document store using the encoding model specified in the retriever.
@@ -432,7 +432,7 @@ def get_all_documents(
             str,
             Any]] = None,  # TODO: Adapt type once we allow extended filters in FAISSDocStore
         return_embedding: Optional[bool] = None,
-        batch_size: int = 10_000,
+        batch_size: int = 10000,
         headers: Optional[Dict[str, str]] = None,
     ) -> List[Document]:
         if headers:
@@ -454,7 +454,7 @@ def get_all_documents_generator(
             str,
             Any]] = None,  # TODO: Adapt type once we allow extended filters in FAISSDocStore
         return_embedding: Optional[bool] = None,
-        batch_size: int = 10_000,
+        batch_size: int = 10000,
         headers: Optional[Dict[str, str]] = None,
     ) -> Generator[Document, None, None]:
         """
@@ -493,7 +493,7 @@ def get_documents_by_id(
         self,
         ids: List[str],
         index: Optional[str] = None,
-        batch_size: int = 10_000,
+        batch_size: int = 10000,
         headers: Optional[Dict[str, str]] = None,
     ) -> List[Document]:
         if headers:

From df8bae6a848a2acbcb3eca905256230d94c2e98a Mon Sep 17 00:00:00 2001
From: w5688414
Date: Fri, 5 Aug 2022 19:23:38 +0800
Subject: [PATCH 2/3] Fix the nltk download bug and Add FAQ for mac support

---
 .../question-answering/Install_windows.md |  4 ++-
 .../examples/question-answering/README.md |  3 +-
 .../semantic-search/Install_windows.md    |  5 ++--
 .../examples/semantic-search/README.md    | 24 +++++++++++++++
 .../semantic_search_example.py            |  2 +-
 .../pipelines/utils/offline_ann.py        | 30 +++++++++++++++----
 6 files changed, 58 insertions(+), 10 deletions(-)

diff --git a/applications/experimental/pipelines/examples/question-answering/Install_windows.md b/applications/experimental/pipelines/examples/question-answering/Install_windows.md
index a2b158335807..f899ea6a4026 100644
--- a/applications/experimental/pipelines/examples/question-answering/Install_windows.md
+++ b/applications/experimental/pipelines/examples/question-answering/Install_windows.md
@@ -52,12 +52,14 @@ xpack.security.enabled: false
 
 # 以百科城市数据为例建立 ANN 索引库
 python utils/offline_ann.py --index_name baike_cities --doc_dir data/baike
 ```
-
 参数含义说明
 * `index_name`: 索引的名称
 * `doc_dir`: txt文本数据的路径
+* `host`: Elasticsearch的IP地址
+* `port`: Elasticsearch的端口号
 * `delete_index`: 是否删除现有的索引和数据,用于清空es的数据,默认为false
+
 运行成功后会输出如下的日志:
 ```
 INFO - pipelines.utils.logger - Logged parameters:
diff --git a/applications/experimental/pipelines/examples/question-answering/README.md b/applications/experimental/pipelines/examples/question-answering/README.md
index 5ef924672187..fefd7e4f45db 100644
--- a/applications/experimental/pipelines/examples/question-answering/README.md
+++ b/applications/experimental/pipelines/examples/question-answering/README.md
@@ -97,10 +97,11 @@ python utils/offline_ann.py --index_name baike_cities \
                             --doc_dir data/baike \
                             --delete_index
 ```
-
 参数含义说明
 * `index_name`: 索引的名称
 * `doc_dir`: txt文本数据的路径
+* `host`: Elasticsearch的IP地址
+* `port`: Elasticsearch的端口号
 * `delete_index`: 是否删除现有的索引和数据,用于清空es的数据,默认为false
 
 运行成功后会输出如下的日志:
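The new `host`/`port` options documented above map directly onto the `ElasticsearchDocumentStore` constructor used by `utils/offline_ann.py` (see the diff of that file later in this series). A minimal sketch of that wiring; the `connect()` wrapper is only for illustration and is not code from the repository:

```python
# Sketch of how the documented offline_ann.py options reach Elasticsearch.
# host, port, username, password and index mirror the offline_ann.py change
# in this patch series; everything else here is illustrative.
from pipelines.document_stores import ElasticsearchDocumentStore


def connect(index_name: str, host: str = "127.0.0.1", port: str = "9200"):
    # --host/--port select the Elasticsearch instance, --index_name the index inside it.
    return ElasticsearchDocumentStore(host=host,
                                      port=port,
                                      username="",
                                      password="",
                                      index=index_name)


document_store = connect("baike_cities", host="127.0.0.1", port="9200")
```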
diff --git a/applications/experimental/pipelines/examples/semantic-search/Install_windows.md b/applications/experimental/pipelines/examples/semantic-search/Install_windows.md
index 7e82145d802f..cca6be7ba210 100644
--- a/applications/experimental/pipelines/examples/semantic-search/Install_windows.md
+++ b/applications/experimental/pipelines/examples/semantic-search/Install_windows.md
@@ -47,12 +47,13 @@ xpack.security.enabled: false
 
 #### 1.4.2 文档数据写入 ANN 索引库
 ```
 # 以DuReader-Robust 数据集为例建立 ANN 索引库
-python utils/offline_ann.py --index_name dureader_robust_query_encoder --doc_dir data/dureader_robust_processed
+python utils/offline_ann.py --index_name dureader_robust_query_encoder --doc_dir data/dureader_dev
 ```
-
 参数含义说明
 * `index_name`: 索引的名称
 * `doc_dir`: txt文本数据的路径
+* `host`: Elasticsearch的IP地址
+* `port`: Elasticsearch的端口号
 * `delete_index`: 是否删除现有的索引和数据,用于清空es的数据,默认为false
 
diff --git a/applications/experimental/pipelines/examples/semantic-search/README.md b/applications/experimental/pipelines/examples/semantic-search/README.md
index 48932a46ae80..445fae4ec87b 100644
--- a/applications/experimental/pipelines/examples/semantic-search/README.md
+++ b/applications/experimental/pipelines/examples/semantic-search/README.md
@@ -102,6 +102,8 @@ python utils/offline_ann.py --index_name dureader_robust_query_encoder \
 参数含义说明
 * `index_name`: 索引的名称
 * `doc_dir`: txt文本数据的路径
+* `host`: Elasticsearch的IP地址
+* `port`: Elasticsearch的端口号
 * `delete_index`: 是否删除现有的索引和数据,用于清空es的数据,默认为false
 
 #### 3.4.3 启动 RestAPI 模型服务
@@ -116,7 +118,12 @@ Linux 用户推荐采用 Shell 脚本来启动服务::
 ```bash
 sh scripts/run_search_server.sh
 ```
+启动后可以使用curl命令验证是否成功运行:
+
+```
+curl -X POST -k http://localhost:8891/query -H 'Content-Type: application/json' -d '{"query": "亚马逊河流的介绍","params": {"Retriever": {"top_k": 5}, "Ranker":{"top_k": 5}}}'
+
+```
 
 #### 3.4.4 启动 WebUI
 ```bash
 # 配置模型服务地址
@@ -174,6 +181,23 @@ elasticsearch默认达到95%就全都设置只读,可以腾出一部分空
 cluster.routing.allocation.disk.threshold_enabled: false
 ```
 
+#### nltk_data加载失败的错误 `[nltk_data] Error loading punkt: [Errno 60] Operation timed out`
+
+在命令行里面输入python,然后输入下面的命令进行下载:
+
+```
+import nltk
+nltk.download('punkt')
+```
+如果下载还是很慢,可以手动[下载](https://github.com/nltk/nltk_data/tree/gh-pages/packages/tokenizers),然后放入本地的`~/nltk_data/tokenizers`进行解压即可。
+
+#### 服务端运行报端口占用的错误 `[Errno 48] error while attempting to bind on address ('0.0.0.0',8891): address already in use`
+
+```
+lsof -i:8891
+kill -9 PID # PID为8891端口的进程
+```
+
 ## Reference
 [1]Y. Sun et al., “[ERNIE 3.0: Large-scale Knowledge Enhanced Pre-training for Language Understanding and Generation](https://arxiv.org/pdf/2107.02137.pdf),” arXiv:2107.02137 [cs], Jul. 2021, Accessed: Jan. 17, 2022. [Online].
 Available: http://arxiv.org/abs/2107.02137
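The same health check as the curl command added in the README hunk above, issued from Python. Port 8891, the `/query` path and the request body are taken from the patch; the structure of the JSON response is an assumption to inspect against the running service rather than rely on:

```python
# Python equivalent of the curl verification command from the README above.
import requests

payload = {
    "query": "亚马逊河流的介绍",
    "params": {"Retriever": {"top_k": 5}, "Ranker": {"top_k": 5}},
}
resp = requests.post("http://localhost:8891/query", json=payload, timeout=30)
resp.raise_for_status()          # non-2xx means the REST service is not healthy
print(resp.json())               # inspect the returned documents/answers manually
```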
diff --git a/applications/experimental/pipelines/examples/semantic-search/semantic_search_example.py b/applications/experimental/pipelines/examples/semantic-search/semantic_search_example.py
index 0ab818d2a1ce..bdf709fae723 100644
--- a/applications/experimental/pipelines/examples/semantic-search/semantic_search_example.py
+++ b/applications/experimental/pipelines/examples/semantic-search/semantic_search_example.py
@@ -37,7 +37,7 @@ def semantic_search_tutorial():
             embed_title=False,
         )
     else:
-        doc_dir = "data/dureader_robust_processed"
+        doc_dir = "data/dureader_dev"
         dureader_data = "https://paddlenlp.bj.bcebos.com/applications/dureader_dev.zip"
 
         fetch_archive_from_http(url=dureader_data, output_dir=doc_dir)
diff --git a/applications/experimental/pipelines/utils/offline_ann.py b/applications/experimental/pipelines/utils/offline_ann.py
index 14e6f40f245c..485b11051a1f 100644
--- a/applications/experimental/pipelines/utils/offline_ann.py
+++ b/applications/experimental/pipelines/utils/offline_ann.py
@@ -1,11 +1,17 @@
 import argparse
 
 import paddle
-from pipelines.utils import convert_files_to_dicts
+from pipelines.utils import convert_files_to_dicts, fetch_archive_from_http
 from pipelines.document_stores import ElasticsearchDocumentStore
 from pipelines.nodes import DensePassageRetriever
 from pipelines.utils import launch_es
 
+data_dict = {
+    'data/dureader_dev':
+    "https://paddlenlp.bj.bcebos.com/applications/dureader_dev.zip",
+    "data/baike": "https://paddlenlp.bj.bcebos.com/applications/baike.zip"
+}
+
 parser = argparse.ArgumentParser()
 parser.add_argument("--index_name",
                     default='baike_cities',
@@ -15,6 +21,17 @@
                     default='data/baike/',
                     type=str,
                     help="The doc path of the corpus")
+
+parser.add_argument('--host',
+                    type=str,
+                    default="127.0.0.1",
+                    help='host ip of elastic search')
+
+parser.add_argument('--port',
+                    type=str,
+                    default="9200",
+                    help='port of elastic search')
+
 parser.add_argument(
     '--delete_index',
     action='store_true',
@@ -27,8 +44,8 @@
 
 def offline_ann(index_name, doc_dir):
     launch_es()
-    document_store = ElasticsearchDocumentStore(host="127.0.0.1",
-                                                port="9200",
+    document_store = ElasticsearchDocumentStore(host=args.host,
+                                                port=args.port,
                                                 username="",
                                                 password="",
                                                 index=index_name)
@@ -59,8 +76,8 @@
 
 
 def delete_data(index_name):
-    document_store = ElasticsearchDocumentStore(host="127.0.0.1",
-                                                port="9200",
+    document_store = ElasticsearchDocumentStore(host=args.host,
+                                                port=args.port,
                                                 username="",
                                                 password="",
                                                 index=index_name)
@@ -70,6 +87,9 @@
 
 
 if __name__ == "__main__":
+    if (args.doc_dir in data_dict):
+        fetch_archive_from_http(url=data_dict[args.doc_dir],
+                                output_dir=args.doc_dir)
     if (args.delete_index):
         delete_data(args.index_name)
     offline_ann(args.index_name, args.doc_dir)
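Taken together, the `offline_ann.py` change makes the script run in three steps: fetch the corpus archive if `--doc_dir` names one of the known datasets, optionally clear the old index, then rebuild it. A condensed sketch of that flow; the two stub functions only mark where the script's real implementations go, and only the call order and `fetch_archive_from_http` usage are taken from the patch:

```python
# Condensed control flow of utils/offline_ann.py after this commit.
from pipelines.utils import fetch_archive_from_http

DATA_DICT = {
    "data/dureader_dev": "https://paddlenlp.bj.bcebos.com/applications/dureader_dev.zip",
    "data/baike": "https://paddlenlp.bj.bcebos.com/applications/baike.zip",
}


def delete_data(index_name: str) -> None:
    """Stub: drops the existing Elasticsearch index (see the real script)."""


def offline_ann(index_name: str, doc_dir: str) -> None:
    """Stub: converts the txt corpus and writes it into the index (see the real script)."""


def main(index_name: str, doc_dir: str, delete_index: bool = False) -> None:
    if doc_dir in DATA_DICT:
        # Known corpora are downloaded and unpacked on first use, so the
        # script works from a clean checkout without manual data preparation.
        fetch_archive_from_http(url=DATA_DICT[doc_dir], output_dir=doc_dir)
    if delete_index:
        delete_data(index_name)
    offline_ann(index_name, doc_dir)
```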
From 79061a6e1bd96191401e3a9d05b1126bc2f3e2c6 Mon Sep 17 00:00:00 2001
From: w5688414
Date: Tue, 9 Aug 2022 17:04:50 +0800
Subject: [PATCH 3/3] Remove update_batch_size for faiss

---
 .../pipelines/examples/question-answering/dense_qa_example.py | 4 +---
 .../examples/semantic-search/semantic_search_example.py       | 4 +---
 .../experimental/pipelines/pipelines/document_stores/faiss.py | 2 +-
 3 files changed, 3 insertions(+), 7 deletions(-)

diff --git a/applications/experimental/pipelines/examples/question-answering/dense_qa_example.py b/applications/experimental/pipelines/examples/question-answering/dense_qa_example.py
index 208b4fb927f1..d9ce9ed47520 100644
--- a/applications/experimental/pipelines/examples/question-answering/dense_qa_example.py
+++ b/applications/experimental/pipelines/examples/question-answering/dense_qa_example.py
@@ -15,7 +15,6 @@
 parser.add_argument("--max_seq_len_query", default=64, type=int, help="The maximum total length of query after tokenization.")
 parser.add_argument("--max_seq_len_passage", default=256, type=int, help="The maximum total length of passage after tokenization.")
 parser.add_argument("--retriever_batch_size", default=16, type=int, help="The batch size of retriever to extract passage embedding for building ANN index.")
-parser.add_argument("--update_batch_size", default=100, type=int, help="The batch size of document_store to update passage embedding for building ANN index.")
 args = parser.parse_args()
 # yapf: enable
 
@@ -67,8 +66,7 @@ def dense_qa_pipeline():
     )
 
     # update Embedding
-    document_store.update_embeddings(retriever,
-                                     batch_size=args.update_batch_size)
+    document_store.update_embeddings(retriever)
 
     # save index
     document_store.save(args.index_name)
diff --git a/applications/experimental/pipelines/examples/semantic-search/semantic_search_example.py b/applications/experimental/pipelines/examples/semantic-search/semantic_search_example.py
index bdf709fae723..390408198a14 100644
--- a/applications/experimental/pipelines/examples/semantic-search/semantic_search_example.py
+++ b/applications/experimental/pipelines/examples/semantic-search/semantic_search_example.py
@@ -13,7 +13,6 @@
 parser.add_argument("--max_seq_len_query", default=64, type=int, help="The maximum total length of query after tokenization.")
 parser.add_argument("--max_seq_len_passage", default=256, type=int, help="The maximum total length of passage after tokenization.")
 parser.add_argument("--retriever_batch_size", default=16, type=int, help="The batch size of retriever to extract passage embedding for building ANN index.")
-parser.add_argument("--update_batch_size", default=100, type=int, help="The batch size of document_store to update passage embedding for building ANN index.")
 args = parser.parse_args()
 # yapf: enable
 
@@ -66,8 +65,7 @@ def semantic_search_tutorial():
     )
 
     # update Embedding
-    document_store.update_embeddings(retriever,
-                                     batch_size=args.update_batch_size)
+    document_store.update_embeddings(retriever)
 
     # save index
     document_store.save(args.index_name)
diff --git a/applications/experimental/pipelines/pipelines/document_stores/faiss.py b/applications/experimental/pipelines/pipelines/document_stores/faiss.py
index 85bc686453c3..2a59e72d877a 100644
--- a/applications/experimental/pipelines/pipelines/document_stores/faiss.py
+++ b/applications/experimental/pipelines/pipelines/document_stores/faiss.py
@@ -243,7 +243,7 @@ def write_documents(
         self,
         documents: Union[List[dict], List[Document]],
         index: Optional[str] = None,
-        batch_size: int = 10000,
+        batch_size: int = 1000,
         duplicate_documents: Optional[str] = None,
         headers: Optional[Dict[str, str]] = None,
     ) -> None:
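After this final commit the batch size is no longer a CLI flag: callers rely on the document store defaults (1000 for `write_documents`, 10000 for `update_embeddings`) or pass `batch_size` explicitly per call. A minimal sketch of how those defaults are exercised; the constructor argument, the document field name and the commented-out overrides are placeholders rather than values taken from this patch:

```python
# Where the batching defaults land after PATCH 3/3: write_documents flushes in
# chunks of 1000, update_embeddings in chunks of 10000, and both still accept
# an explicit batch_size.
from pipelines.document_stores import FAISSDocumentStore

document_store = FAISSDocumentStore(embedding_dim=768)   # placeholder dimension
docs = [{"content": "北京是中国的首都。"},
        {"content": "上海位于长江入海口。"}]              # field name assumed

document_store.write_documents(docs)                      # default batch_size=1000
# document_store.write_documents(docs, batch_size=256)    # explicit per-call override

# Embedding updates follow the same pattern once a retriever is constructed:
#     document_store.update_embeddings(retriever)                  # batch_size=10000
#     document_store.update_embeddings(retriever, batch_size=100)  # explicit override
```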