From 5f2862a347493188bff2a53190495f2e0b57cf62 Mon Sep 17 00:00:00 2001
From: Linjie Chen <40840292+linjieccc@users.noreply.github.com>
Date: Fri, 17 Dec 2021 11:20:10 +0800
Subject: [PATCH] Add plato-mini for taskflow (#1383)

* Add plato-mini for taskflow

* Update README.md

* Update README.md

* Update README.md

* Update model generation

* Update task display

* Update interactive mode

* Update usage
---
 docs/model_zoo/taskflow.md     |  46 ++++-
 paddlenlp/taskflow/dialogue.py | 353 +++++++++++++++++++++++++++++++++
 paddlenlp/taskflow/task.py     |   4 +-
 paddlenlp/taskflow/taskflow.py |  25 ++-
 4 files changed, 423 insertions(+), 5 deletions(-)
 create mode 100644 paddlenlp/taskflow/dialogue.py

diff --git a/docs/model_zoo/taskflow.md b/docs/model_zoo/taskflow.md
index 30470ef70574..c92cd3624b5b 100644
--- a/docs/model_zoo/taskflow.md
+++ b/docs/model_zoo/taskflow.md
@@ -16,6 +16,7 @@
   - [知识挖掘-名词短语标注](#知识挖掘-名词短语标注)
   - [生成式问答](#生成式问答)
   - [智能写诗](#智能写诗)
+  - [开放域对话](#开放域对话)
 - [FAQ](#FAQ)
 
 ## 介绍
@@ -28,8 +29,8 @@
 | :------------ | ---- |
 | 中文分词 | 生成式问答 |
 | 词性标注 | 智能写诗 |
-| 命名实体识别 | 文本翻译(TODO) |
-| 文本纠错 | 开放域对话(TODO) |
+| 命名实体识别 | 开放域对话 |
+| 文本纠错 | 文本翻译(TODO) |
 | 句法分析 | 自动对联(TODO) |
 | 情感分析 |  |
 | 文本相似度 |  |
@@ -452,6 +453,47 @@ poetry(["林密不见人", "举头邀明月"])
 
 * `batch_size`:批处理大小,请结合机器情况进行调整,默认为1。
 
+### 开放域对话
+
+非交互模式:
+```python
+from paddlenlp import Taskflow
+
+dialogue = Taskflow("dialogue")
+dialogue(["吃饭了吗"])
+>>> ['刚吃完饭,你在干什么呢?']
+
+dialogue(["你好", "吃饭了吗"], ["你是谁?"])
+>>> ['吃过了,你呢', '我是李明啊']
+```
+
+可配置参数:
+
+* `batch_size`:批处理大小,请结合机器情况进行调整,默认为1。
+* `max_seq_len`:最大序列长度,默认为512。
+
+交互模式:
+```python
+from paddlenlp import Taskflow
+
+dialogue = Taskflow("dialogue")
+# 输入`exit`可退出交互模式
+dialogue.interactive_mode(max_turn=3)
+
+'''
+[Human]:你好
+[Bot]:你好,很高兴认识你,我想问你一下,你喜欢运动吗?
+[Human]:喜欢
+[Bot]:那你喜欢什么运动啊?
+[Human]:篮球,你喜欢篮球吗
+[Bot]:当然了,我很喜欢打篮球的。
+'''
+```
+
+交互模式参数:
+
+* `max_turn`:任务能记忆的对话轮数,当`max_turn`为1时,模型只能记住当前对话,无法获知之前的对话内容。
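+
+可配置参数使用示例(以下代码仅为示意,演示上述`batch_size`、`max_seq_len`、`max_turn`参数的传入方式):
+```python
+from paddlenlp import Taskflow
+
+# 自定义批处理大小与最大序列长度
+dialogue = Taskflow("dialogue", batch_size=2, max_seq_len=512)
+# max_turn=1时模型只根据当前输入生成回复
+dialogue.interactive_mode(max_turn=1)
+```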
+
 ## FAQ
 
 ### Q1 Taskflow如何修改任务保存路径?
diff --git a/paddlenlp/taskflow/dialogue.py b/paddlenlp/taskflow/dialogue.py
new file mode 100644
index 000000000000..56828e415f31
--- /dev/null
+++ b/paddlenlp/taskflow/dialogue.py
@@ -0,0 +1,353 @@
+# Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+import contextlib
+from collections import deque
+
+import numpy as np
+import paddle
+
+from ..transformers import UnifiedTransformerLMHeadModel, UnifiedTransformerTokenizer
+from ..data import Pad
+from .task import Task
+
+usage = r"""
+    from paddlenlp import Taskflow
+
+    # 非交互模式
+    dialogue = Taskflow("dialogue")
+    dialogue(["吃饭了吗"])
+    '''
+    ['刚吃完饭,你在干什么呢?']
+    '''
+    dialogue(["你好", "吃饭了吗"], ["你是谁?"])
+    '''
+    ['吃过了,你呢', '我是李明啊']
+    '''
+
+    dialogue = Taskflow("dialogue")
+    # 进入交互模式(输入exit退出)
+    dialogue.interactive_mode(max_turn=3)
+
+    '''
+    [Human]:你好
+    [Bot]:你好,很高兴认识你,我想问你一下,你喜欢运动吗?
+    [Human]:喜欢
+    [Bot]:那你喜欢什么运动啊?
+    [Human]:篮球,你喜欢篮球吗
+    [Bot]:当然了,我很喜欢打篮球的。
+    '''
+    """
+
+
+class DialogueTask(Task):
+    """
+    Task of Chinese open domain dialogue.
+
+    Args:
+        task (string): The name of the task.
+        model (string): The model name in the task.
+        kwargs (dict, optional): Additional keyword arguments passed along to the specific task.
+    """
+
+    def __init__(self,
+                 task,
+                 model,
+                 batch_size=1,
+                 max_seq_len=512,
+                 **kwargs):
+        super().__init__(task=task, model=model, **kwargs)
+        self._static_mode = False
+        self._usage = usage
+        self._construct_tokenizer(model)
+        self._batch_size = batch_size
+        self._max_seq_len = max_seq_len
+        self._interactive_mode = False
+        if self._static_mode:
+            self._get_inference_model()
+        else:
+            self._construct_model(model)
+
+    def _construct_input_spec(self):
+        """
+        Construct the input spec for converting the dygraph model to a static model.
+        """
+        self._input_spec = [
+            paddle.static.InputSpec(
+                shape=[None, None], dtype="int64", name='input_ids'),
+            paddle.static.InputSpec(
+                shape=[None, None], dtype="int64", name='token_type_ids'),
+        ]
+
+    def _construct_model(self, model):
+        """
+        Construct the inference model for the predictor.
+        """
+        self._model = UnifiedTransformerLMHeadModel.from_pretrained(model)
+        self._model.eval()
+
+    def _construct_tokenizer(self, model):
+        """
+        Construct the tokenizer for the predictor.
+        """
+        self._tokenizer = UnifiedTransformerTokenizer.from_pretrained(model)
+
+    def _batchify_fn(self, batch_examples):
+        pad_func = Pad(pad_val=self._tokenizer.pad_token_id,
+                       pad_right=False,
+                       dtype='int64')
+
+        def pad_mask(batch_attention_mask):
+            batch_size = len(batch_attention_mask)
+            max_len = max(map(len, batch_attention_mask))
+            attention_mask = np.ones(
+                (batch_size, max_len, max_len), dtype='float32') * -1e9
+            for i, mask_data in enumerate(attention_mask):
+                seq_len = len(batch_attention_mask[i])
+                mask_data[-seq_len:, -seq_len:] = np.array(
+                    batch_attention_mask[i], dtype='float32')
+            # In order to ensure the correct broadcasting mechanism, expand one
+            # dimension to the second dimension (the n_head dimension of the
+            # Transformer).
+            attention_mask = np.expand_dims(attention_mask, axis=1)
+            return attention_mask
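+        # Shape sketch (illustrative): for a batch of 2 sequences padded to
+        # length 16, `pad_mask` returns a float32 array of shape (2, 1, 16, 16);
+        # the singleton axis broadcasts over the n_head dimension of the
+        # attention scores, and padding positions keep the large negative
+        # value (-1e9) so they are masked out of the softmax.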
+ """ + padding = False if batch_size == 1 else True + pad_func = Pad(pad_val=self._tokenizer.pad_token_id, pad_right=False, dtype=np.int64) + + def pad_mask(batch_attention_mask): + batch_size = len(batch_attention_mask) + max_len = max(map(len, batch_attention_mask)) + attention_mask = np.ones((batch_size, max_len, max_len), dtype='float32') * -1e9 + for i, mask_data in enumerate(attention_mask): + seq_len = len(batch_attention_mask[i]) + mask_data[-seq_len:, -seq_len:] = np.array(batch_attention_mask[i], dtype='float32') + # In order to ensure the correct broadcasting mechanism, expand one + # dimension to the second dimension (n_head of Transformer). + attention_mask = np.expand_dims(attention_mask, axis=1) + return attention_mask + + def _parse_batch(batch_examples): + if padding: + input_ids = pad_func([example['input_ids'] for example in batch_examples]) + token_type_ids = pad_func([example['token_type_ids'] for example in batch_examples]) + position_ids = pad_func([example['position_ids'] for example in batch_examples]) + attention_mask = pad_mask([example['attention_mask'] for example in batch_examples]) + else: + input_ids = np.asarray([example['input_ids'] for example in batch_examples], dtype=np.int64) + token_type_ids = np.asarray([example['token_type_ids'] for example in batch_examples], dtype=np.int64) + position_ids = np.asarray([example['position_ids'] for example in batch_examples], dtype=np.int64) + attention_mask = np.asarray([example['attention_mask'] for example in batch_examples]) + attention_mask = np.expand_dims(attention_mask, 0) + + return input_ids, token_type_ids, position_ids, attention_mask + + examples = [] + for texts in data: + examples.append(self._convert_text_to_input(texts, max_seq_len)) + + # Seperates data into some batches. + one_batch = [] + for example in examples: + one_batch.append(example) + if len(one_batch) == batch_size: + yield _parse_batch(one_batch) + one_batch = [] + if one_batch: + yield _parse_batch(one_batch) + + def _convert_text_to_input(self, texts, max_seq_len): + """ + Convert input strings to tokens. + """ + return self._tokenizer.dialogue_encode( + texts, max_seq_len=max_seq_len, add_start_token_as_response=True, is_split_into_words=False) + + def _preprocess(self, inputs): + """ + Transform the raw text to the model inputs, two steps involved: + 1) Transform the raw text to token ids. + 2) Generate the other model inputs from the raw text and token ids. + """ + inputs = self._check_input_text(inputs) + # Get the config from the kwargs + num_workers = self.kwargs[ + 'num_workers'] if 'num_workers' in self.kwargs else 0 + lazy_load = self.kwargs[ + 'lazy_load'] if 'lazy_load' in self.kwargs else False + + batches = self._batchify(inputs, self._max_seq_len, self._batch_size) + + outputs = {} + outputs['batches'] = batches + outputs['text'] = inputs + return outputs + + def _run_model(self, inputs): + """ + Run the task model from the outputs of the `_tokenize` function. 
+ """ + all_ids = [] + all_scores = [] + + for batch in inputs["batches"]: + input_ids, token_type_ids, position_ids, attention_mask = map(paddle.to_tensor, batch) + ids, scores = self._model.generate(input_ids=input_ids, + token_type_ids=token_type_ids, + position_ids=position_ids, + attention_mask=attention_mask, + max_length=64, + min_length=1, + decode_strategy='sampling', + temperature=1.0, + top_k=5, + top_p=1.0, + num_beams=0, + length_penalty=1.0, + early_stopping=False, + use_faster=False, + num_return_sequences=1) + all_ids.extend([ids]) + all_scores.extend([scores]) + inputs['ids'] = all_ids + inputs['scores'] = all_scores + return inputs + + def _post_process_response(self, token_ids, tokenizer): + ''' + Post-process the decoded sequence. Truncate from the first . + ''' + eos_pos = len(token_ids) + for i, tok_id in enumerate(token_ids): + if tok_id == tokenizer.sep_token_id: + eos_pos = i + break + token_ids = token_ids[:eos_pos] + tokens = tokenizer.convert_ids_to_tokens(token_ids) + tokens = tokenizer.merge_subword(tokens) + return token_ids, tokens + + @contextlib.contextmanager + def interactive_mode(self, max_turn=3): + """ + Enter the interactive mode. + """ + self._interactive_mode = True + self.max_turn = max_turn + self.context = deque(maxlen=self.max_turn) + yield + self.context.clear() + self._interactive_mode = False + + def _get_in_turn_repetition(self, pred, is_cn=False): + ''' + Get in-turn repetition. + ''' + if len(pred) == 0: + return 1.0 + if isinstance(pred[0], str): + pred = [tok.lower() for tok in pred] + if is_cn: + pred = "".join(pred) + tri_grams = set() + for i in range(len(pred) - 2): + tri_gram = tuple(pred[i:i + 3]) + if tri_gram in tri_grams: + return True + tri_grams.add(tri_gram) + return False + + def _select_response(self, + ids, + scores, + tokenizer, + max_dec_len=None, + num_return_sequences=1, + keep_space=True): + ''' + Select response with the highest score. 
+ ''' + ids = ids.numpy().tolist() + scores = scores.numpy() + + if len(ids) != len(scores) or (len(ids) % num_return_sequences) != 0: + raise ValueError("the length of `ids` is {}, but the `num_return_sequences` is {}".format( + len(ids), num_return_sequences)) + + group = [] + tmp = [] + for pred, score in zip(ids, scores): + pred_token_ids, pred_tokens = self._post_process_response(pred, tokenizer) + num_token = len(pred_token_ids) + if keep_space: + response = " ".join(pred_tokens) + else: + response = "".join(pred_tokens) + + in_turn_repetition = self._get_in_turn_repetition(pred_tokens, True) \ + or self._get_in_turn_repetition(pred_token_ids) + # not ending + if max_dec_len is not None and num_token >= max_dec_len: + score -= 1e3 + elif in_turn_repetition: + score -= 1e3 + + tmp.append([response, score]) + if len(tmp) == num_return_sequences: + group.append(tmp) + tmp = [] + + results = [] + for preds in group: + preds = sorted(preds, key=lambda x: -x[1]) + results.append(preds[0][0]) + return results + + def _postprocess(self, inputs): + all_ids = inputs['ids'] + all_scores = inputs['scores'] + texts = inputs['text'] + + results = [] + for ids, scores, text in zip(all_ids, all_scores, texts): + results.extend( + self._select_response(ids, + scores, + self._tokenizer, + num_return_sequences=1, + keep_space=False)) + + if self._interactive_mode: + self.context.append(results[0].strip()) + return results \ No newline at end of file diff --git a/paddlenlp/taskflow/task.py b/paddlenlp/taskflow/task.py index 70cb4cd81631..d4fbca4a92a6 100644 --- a/paddlenlp/taskflow/task.py +++ b/paddlenlp/taskflow/task.py @@ -86,8 +86,8 @@ def _postprocess(self, inputs): @abstractmethod def _construct_input_spec(self): """ - Construct the input spec for the convert dygraph model to static model. - """ + Construct the input spec for the convert dygraph model to static model. + """ def _prepare_static_mode(self): """ diff --git a/paddlenlp/taskflow/taskflow.py b/paddlenlp/taskflow/taskflow.py index bc39c25581b8..e67a19644840 100644 --- a/paddlenlp/taskflow/taskflow.py +++ b/paddlenlp/taskflow/taskflow.py @@ -13,6 +13,8 @@ # See the License for the specific language governing permissions and # limitations under the License. +import contextlib +from collections import deque import warnings import paddle from ..utils.tools import get_env_device @@ -29,6 +31,7 @@ from .dependency_parsing import DDParserTask from .text_correction import CSCTask from .text_similarity import TextSimilarityTask +from .dialogue import DialogueTask warnings.simplefilter(action='ignore', category=Warning, lineno=0, append=False) @@ -178,6 +181,17 @@ "model": "simbert-base-chinese" } }, + 'dialogue': { + "models": { + "plato-mini": { + "task_class": DialogueTask, + "task_flag": "dialogue-plato-mini" + }, + }, + "default": { + "model": "plato-mini" + } + }, } @@ -246,4 +260,13 @@ def tasks(): def from_segments(self, *inputs): results = self.task_instance.from_segments(inputs) - return results \ No newline at end of file + return results + + def interactive_mode(self, max_turn): + with self.task_instance.interactive_mode(max_turn=3): + while True: + human = input("[Human]:").strip() + if human.lower() == "exit": + exit() + robot = self.task_instance(human)[0] + print("[Bot]:%s"%robot)