Add docprompt example in pipelines (#3534)

* add_docprompt_example_in_pipelines
PaddlePaddle · Oct 24, 2022 · 97fe71d · 97fe71d
1 parent 1bad949
commit 97fe71d
Show file tree

Hide file tree

Showing 16 changed files with 1,007 additions and 3 deletions.
diff --git a/pipelines/examples/document-intelligence/README.md b/pipelines/examples/document-intelligence/README.md
@@ -0,0 +1,88 @@
+# 端到端开放文档抽取问答系统
+
+## 1. 系统介绍
+
+开放文档抽取问答主要指对于网页、数字文档或扫描文档所包含的文本以及丰富的排版格式等信息，通过人工智能技术进行理解、分类、提取以及信息归纳的过程。开放文档抽取问答技术广泛应用于金融、保险、能源、物流、医疗等行业，常见的应用场景包括财务报销单、招聘简历、企业财报、合同文书、动产登记证、法律判决书、物流单据等多模态文档的关键信息抽取、问题回答等。
+
+本项目提供了低成本搭建端到端开放文档抽取问答系统的能力。用户只需要处理好自己的业务数据，就可以使用本项目预置的开放文档抽取问答系统模型(文档OCR预处理模型、文档抽取问答模型)快速搭建一个针对自己业务数据的文档抽取问答系统，并提供基于[Gradio](https://gradio.app/) 的 Web 可视化服务。
+
+
+## 2. 快速开始
+
+以下是针对mac和linux的安装流程：
+
+
+### 2.1 运行环境
+
+**安装PaddlePaddle：**
+
+ 环境中paddlepaddle-gpu或paddlepaddle版本应大于或等于2.3, 请参见[飞桨快速安装](https://www.paddlepaddle.org.cn/install/quick?docurl=/documentation/docs/zh/install/pip/linux-pip.html)根据自己需求选择合适的PaddlePaddle下载命令。
+
+**安装Paddle-Pipelines：**
+
+```bash
+# pip 一键安装
+pip install --upgrade paddle-pipelines -i https://pypi.tuna.tsinghua.edu.cn/simple
+# 或者源码进行安装最新版本
+cd ${HOME}/PaddleNLP/pipelines/
+pip install -r requirements.txt -i https://pypi.tuna.tsinghua.edu.cn/simple
+python setup.py install
+```
+
+**安装OpenCV：**
+```bash
+pip install opencv-python==4.6.0.66
+```
+
+【注意】以下所有流程都只需要在 `pipelines` 根目录下进行，不需要跳转目录
+
+### 2.2 一键体验问答系统
+您可以通过如下命令快速体验开放文档抽取问答系统的效果。
+
+
+```bash
+# 我们建议在 GPU 环境下运行本示例，运行速度较快
+# 设置 1 个空闲的 GPU 卡，此处假设 0 卡为空闲 GPU
+export CUDA_VISIBLE_DEVICES=0
+python examples/document-intelligence/docprompt_example.py --device gpu
+# 如果只有 CPU 机器，可以通过 --device 参数指定 cpu 即可, 运行耗时较长
+unset CUDA_VISIBLE_DEVICES
+python examples/document-intelligence/docprompt_example.py --device cpu
+```
+
+### 2.3 构建 Web 可视化开放文档抽取问答系统
+
+整个 Web 可视化问答系统主要包含两大组件:  1. 基于 RestAPI 构建模型服务 2. 基于 Gradio 构建 WebUI。接下来我们依次搭建这 2 个服务并串联构成可视化的开放文档抽取问答系统。
+
+#### 2.3.1 启动 RestAPI 模型服务
+```bash
+# 指定开放文档抽取问答系统的Yaml配置文件
+export PIPELINE_YAML_PATH=rest_api/pipeline/docprompt.yaml
+export QUERY_PIPELINE_NAME=query_documents
+# 使用端口号 8891 启动模型服务
+python rest_api/application.py 8891
+```
+Linux 用户推荐采用 Shell 脚本来启动服务：
+
+```bash
+sh examples/document-intelligence/run_docprompt_server.sh
+```
+启动后可以使用curl命令验证是否成功运行：
+
+```
+curl --request POST --url 'http://0.0.0.0:8891/query_documents' -H "Content-Type: application/json"  --data '{"meta": {"doc": "https://bj.bcebos.com/paddlenlp/taskflow/document_intelligence/images/invoice.jpg", "prompt": ["发票号码是多少?", "校验码是多少?"]}}'
+```
+
+#### 2.3.2 启动 WebUI
+
+```bash
+python ui/webapp_docprompt_gradio.py
+```
+
+Linux 用户推荐采用 Shell 脚本来启动服务：
+
+```bash
+sh examples/document-intelligence/run_docprompt_web.sh
+```
+
+到这里您就可以打开浏览器访问 http://127.0.0.1:7860 地址体验开放文档抽取问答系统服务了。
diff --git a/pipelines/examples/document-intelligence/docprompt_example.py b/pipelines/examples/document-intelligence/docprompt_example.py
@@ -0,0 +1,52 @@
+# Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+import argparse
+import logging
+import os
+
+import paddle
+from pipelines.nodes import DocOCRProcessor, DocPrompter
+from pipelines import DocPipeline
+
+# yapf: disable
+# Command-line options for this example. They are parsed at module level, so
+# `args` is a module global that docprompt_pipeline() reads directly.
+parser = argparse.ArgumentParser()
+parser.add_argument('--device', choices=['cpu', 'gpu'], default="gpu", help="Select which device to run docprompt system, defaults to gpu.")
+parser.add_argument("--batch_size", default=4, type=int, help="The batch size of prompt for one image.")
+args = parser.parse_args()
+# yapf: enable
+
+
+def docprompt_pipeline():
+
+    use_gpu = True if args.device == 'gpu' else False
+
+    preprocessor = DocOCRProcessor(use_gpu=use_gpu)
+    docprompter = DocPrompter(use_gpu=use_gpu, batch_size=args.batch_size)
+    pipe = DocPipeline(preprocessor=preprocessor, modelrunner=docprompter)
+    # image link input
+    meta = {
+        "doc":
+        "https://bj.bcebos.com/paddlenlp/taskflow/document_intelligence/images/invoice.jpg",
+        "prompt": ["发票号码是多少?", "校验码是多少?"]
+    }
+    # image local path input
+    # meta = {"doc": "./invoice.jpg", "prompt": ["发票号码是多少?", "校验码是多少?"]}
+
+    prediction = pipe.run(meta=meta)
+    print(prediction["results"][0])
+
+
+# Run the demo only when this file is executed as a script.
+if __name__ == "__main__":
+    docprompt_pipeline()
diff --git a/pipelines/examples/document-intelligence/requirements.txt b/pipelines/examples/document-intelligence/requirements.txt
@@ -0,0 +1 @@
+opencv-python
diff --git a/pipelines/examples/document-intelligence/run_docprompt_server.sh b/pipelines/examples/document-intelligence/run_docprompt_server.sh
@@ -0,0 +1,19 @@
+# Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved.
+# 
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+# 
+#     http://www.apache.org/licenses/LICENSE-2.0
+# 
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+# 指定语义检索系统的Yaml配置文件
+export CUDA_VISIBLE_DEVICES=0
+export PIPELINE_YAML_PATH=rest_api/pipeline/docprompt.yaml
+# 使用端口号 8891 启动模型服务
+python rest_api/application.py 8891
diff --git a/pipelines/examples/document-intelligence/run_docprompt_web.sh b/pipelines/examples/document-intelligence/run_docprompt_web.sh
@@ -0,0 +1,16 @@
+# Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved.
+# 
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+# 
+#     http://www.apache.org/licenses/LICENSE-2.0
+# 
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+# Clear proxy settings from the environment before starting the WebUI.
+unset http_proxy https_proxy
+# Launch the Gradio WebUI for the docprompt system
+python ui/webapp_docprompt_gradio.py
diff --git a/pipelines/pipelines/__init__.py b/pipelines/pipelines/__init__.py
@@ -40,8 +40,8 @@
 from pipelines.pipelines.standard_pipelines import (BaseStandardPipeline,
                                                     ExtractiveQAPipeline,
                                                     SemanticSearchPipeline,
+                                                    DocPipeline,
                                                     TextToImagePipeline)
-
 import pandas as pd
 
 pd.options.display.max_colwidth = 80

diff --git a/pipelines/pipelines/nodes/__init__.py b/pipelines/pipelines/nodes/__init__.py
@@ -29,4 +29,5 @@
 from pipelines.nodes.ranker import BaseRanker, ErnieRanker
 from pipelines.nodes.reader import BaseReader, ErnieReader
 from pipelines.nodes.retriever import BaseRetriever, DensePassageRetriever
+from pipelines.nodes.document import DocOCRProcessor, DocPrompter
 from pipelines.nodes.text_to_image_generator import ErnieTextToImageGenerator
diff --git a/pipelines/pipelines/nodes/document/__init__.py b/pipelines/pipelines/nodes/document/__init__.py
@@ -0,0 +1,16 @@
+# Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+from pipelines.nodes.document.document_preprocessor import DocOCRProcessor
+from pipelines.nodes.document.document_intelligence import DocPrompter