diff --git a/docs/model_zoo/taskflow.md b/docs/model_zoo/taskflow.md
index 30470ef70574..c92cd3624b5b 100644
--- a/docs/model_zoo/taskflow.md
+++ b/docs/model_zoo/taskflow.md
@@ -16,6 +16,7 @@
   - [知识挖掘-名词短语标注](#知识挖掘-名词短语标注)
   - [生成式问答](#生成式问答)
   - [智能写诗](#智能写诗)
+  - [开放域对话](#开放域对话)
 - [FAQ](#FAQ)
 
 ## 介绍
@@ -28,8 +29,8 @@
 | :------------ | ---- |
 | 中文分词 | 生成式问答 |
 | 词性标注 | 智能写诗 |
-| 命名实体识别 | 文本翻译(TODO) |
-| 文本纠错 | 开放域对话(TODO) |
+| 命名实体识别 | 开放域对话 |
+| 文本纠错 | 文本翻译(TODO) |
 | 句法分析 | 自动对联(TODO) |
 | 情感分析 | |
 | 文本相似度 | |
@@ -452,6 +453,47 @@ poetry(["林密不见人", "举头邀明月"])
 
 * `batch_size`:批处理大小,请结合机器情况进行调整,默认为1。
 
+### 开放域对话
+
+非交互模式:
+```python
+from paddlenlp import Taskflow
+
+dialogue = Taskflow("dialogue")
+dialogue(["吃饭了吗"])
+>>> ['刚吃完饭,你在干什么呢?']
+
+dialogue(["你好", "吃饭了吗"], ["你是谁?"])
+>>> ['吃过了,你呢', '我是李明啊']
+```
+
+可配置参数:
+
+* `batch_size`:批处理大小,请结合机器情况进行调整,默认为1。
+* `max_seq_len`:最大序列长度,默认为512。
+
+交互模式:
+```python
+from paddlenlp import Taskflow
+
+dialogue = Taskflow("dialogue")
+# 输入`exit`可退出交互模式
+dialogue.interactive_mode(max_turn=3)
+
+'''
+[Human]:你好
+[Bot]:你好,很高兴认识你,我想问你一下,你喜欢运动吗?
+[Human]:喜欢
+[Bot]:那你喜欢什么运动啊?
+[Human]:篮球,你喜欢篮球吗
+[Bot]:当然了,我很喜欢打篮球的。
+'''
+```
+
+交互模式参数:
+
+* `max_turn`:任务能记忆的对话轮数,当max_turn为1时,模型只能记住当前对话,无法获知之前的对话内容。
+
 ## FAQ
 
 ### Q1 Taskflow如何修改任务保存路径?
diff --git a/paddlenlp/taskflow/dialogue.py b/paddlenlp/taskflow/dialogue.py
new file mode 100644
index 000000000000..56828e415f31
--- /dev/null
+++ b/paddlenlp/taskflow/dialogue.py
@@ -0,0 +1,353 @@
+# Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+import contextlib
+from collections import deque
+
+import numpy as np
+import paddle
+
+from ..transformers import UnifiedTransformerLMHeadModel, UnifiedTransformerTokenizer
+from ..data import Pad
+from .task import Task
+
+usage = r"""
+    from paddlenlp import Taskflow
+
+    # 非交互模式
+    dialogue = Taskflow("dialogue")
+    dialogue(["吃饭了吗"])
+    '''
+    ['刚吃完饭,你在干什么呢?']
+    '''
+    dialogue(["你好", "吃饭了吗"], ["你是谁?"])
+    '''
+    ['吃过了,你呢', '我是李明啊']
+    '''
+
+    dialogue = Taskflow("dialogue")
+    # 进入交互模式 (输入exit退出)
+    dialogue.interactive_mode(max_turn=3)
+
+    '''
+    [Human]:你好
+    [Bot]:你好,很高兴认识你,我想问你一下,你喜欢运动吗?
+    [Human]:喜欢
+    [Bot]:那你喜欢什么运动啊?
+    [Human]:篮球,你喜欢篮球吗
+    [Bot]:当然了,我很喜欢打篮球的。
+    '''
+    """
+
+
+class DialogueTask(Task):
+    """
+    Task of Chinese open domain dialogue.
+
+    Args:
+        task(string): The name of task.
+        model(string): The model name in the task.
+        kwargs (dict, optional): Additional keyword arguments passed along to the specific task.
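+        batch_size(int, optional): The batch size used in prediction; defaults to 1.
+        max_seq_len(int, optional): The maximum length of the encoded dialogue input; defaults to 512.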
+    """
+
+    def __init__(self,
+                 task,
+                 model,
+                 batch_size=1,
+                 max_seq_len=512,
+                 **kwargs):
+        super().__init__(task=task, model=model, **kwargs)
+        self._static_mode = False
+        self._usage = usage
+        self._construct_tokenizer(model)
+        self._batch_size = batch_size
+        self._max_seq_len = max_seq_len
+        self._interactive_mode = False
+        if self._static_mode:
+            self._get_inference_model()
+        else:
+            self._construct_model(model)
+
+    def _construct_input_spec(self):
+        """
+        Construct the input spec for converting the dygraph model to a static model.
+        """
+        self._input_spec = [
+            paddle.static.InputSpec(
+                shape=[None, None], dtype="int64", name='input_ids'),
+            paddle.static.InputSpec(
+                shape=[None], dtype="int64", name='token_type_ids'),
+        ]
+
+    def _construct_model(self, model):
+        """
+        Construct the inference model for the predictor.
+        """
+        self._model = UnifiedTransformerLMHeadModel.from_pretrained(model)
+        self._model.eval()
+
+    def _construct_tokenizer(self, model):
+        """
+        Construct the tokenizer for the predictor.
+        """
+        self._tokenizer = UnifiedTransformerTokenizer.from_pretrained(model)
+
+    def _batchify_fn(self, batch_examples):
+        pad_func = Pad(
+            pad_val=self._tokenizer.pad_token_id, pad_right=False, dtype='int64')
+
+        def pad_mask(batch_attention_mask):
+            batch_size = len(batch_attention_mask)
+            max_len = max(map(len, batch_attention_mask))
+            attention_mask = np.ones(
+                (batch_size, max_len, max_len), dtype='float32') * -1e9
+            for i, mask_data in enumerate(attention_mask):
+                seq_len = len(batch_attention_mask[i])
+                mask_data[-seq_len:, -seq_len:] = np.array(
+                    batch_attention_mask[i], dtype='float32')
+            # In order to ensure the correct broadcasting mechanism, expand one
+            # dimension to the second dimension (n_head of Transformer).
+            attention_mask = np.expand_dims(attention_mask, axis=1)
+            return attention_mask
+
+        input_ids = pad_func(
+            [example['input_ids'] for example in batch_examples])
+        token_type_ids = pad_func(
+            [example['token_type_ids'] for example in batch_examples])
+        position_ids = pad_func(
+            [example['position_ids'] for example in batch_examples])
+        attention_mask = pad_mask(
+            [example['attention_mask'] for example in batch_examples])
+
+        return input_ids, token_type_ids, position_ids, attention_mask
+
+    def _check_input_text(self, inputs):
+        if self._interactive_mode:
+            if isinstance(inputs, str):
+                self.context.append(inputs.strip())
+                inputs = [list(self.context)]
+                return inputs
+            else:
+                raise ValueError(
+                    "In the interactive mode, the input data should be a string.")
+        elif not isinstance(inputs[0], list):
+            raise ValueError(
+                "If not in the interactive mode, the input data should be a list of lists.")
+        return inputs
+
+    def _batchify(self, data, max_seq_len, batch_size):
+        """
+        Generate input batches.
+        """
+        padding = False if batch_size == 1 else True
+        pad_func = Pad(
+            pad_val=self._tokenizer.pad_token_id, pad_right=False, dtype=np.int64)
+
+        def pad_mask(batch_attention_mask):
+            batch_size = len(batch_attention_mask)
+            max_len = max(map(len, batch_attention_mask))
+            attention_mask = np.ones(
+                (batch_size, max_len, max_len), dtype='float32') * -1e9
+            for i, mask_data in enumerate(attention_mask):
+                seq_len = len(batch_attention_mask[i])
+                mask_data[-seq_len:, -seq_len:] = np.array(
+                    batch_attention_mask[i], dtype='float32')
+            # In order to ensure the correct broadcasting mechanism, expand one
+            # dimension to the second dimension (n_head of Transformer).
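+            # Resulting shape: (batch_size, 1, max_len, max_len).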
+            attention_mask = np.expand_dims(attention_mask, axis=1)
+            return attention_mask
+
+        def _parse_batch(batch_examples):
+            if padding:
+                input_ids = pad_func(
+                    [example['input_ids'] for example in batch_examples])
+                token_type_ids = pad_func(
+                    [example['token_type_ids'] for example in batch_examples])
+                position_ids = pad_func(
+                    [example['position_ids'] for example in batch_examples])
+                attention_mask = pad_mask(
+                    [example['attention_mask'] for example in batch_examples])
+            else:
+                input_ids = np.asarray(
+                    [example['input_ids'] for example in batch_examples],
+                    dtype=np.int64)
+                token_type_ids = np.asarray(
+                    [example['token_type_ids'] for example in batch_examples],
+                    dtype=np.int64)
+                position_ids = np.asarray(
+                    [example['position_ids'] for example in batch_examples],
+                    dtype=np.int64)
+                attention_mask = np.asarray(
+                    [example['attention_mask'] for example in batch_examples])
+                attention_mask = np.expand_dims(attention_mask, 0)
+
+            return input_ids, token_type_ids, position_ids, attention_mask
+
+        examples = []
+        for texts in data:
+            examples.append(self._convert_text_to_input(texts, max_seq_len))
+
+        # Separates data into some batches.
+        one_batch = []
+        for example in examples:
+            one_batch.append(example)
+            if len(one_batch) == batch_size:
+                yield _parse_batch(one_batch)
+                one_batch = []
+        if one_batch:
+            yield _parse_batch(one_batch)
+
+    def _convert_text_to_input(self, texts, max_seq_len):
+        """
+        Convert input strings to tokens.
+        """
+        return self._tokenizer.dialogue_encode(
+            texts,
+            max_seq_len=max_seq_len,
+            add_start_token_as_response=True,
+            is_split_into_words=False)
+
+    def _preprocess(self, inputs):
+        """
+        Transform the raw text to the model inputs, two steps involved:
+        1) Transform the raw text to token ids.
+        2) Generate the other model inputs from the raw text and token ids.
+        """
+        inputs = self._check_input_text(inputs)
+        batches = self._batchify(inputs, self._max_seq_len, self._batch_size)
+
+        outputs = {}
+        outputs['batches'] = batches
+        outputs['text'] = inputs
+        return outputs
+
+    def _run_model(self, inputs):
+        """
+        Run the task model on the outputs of the `_preprocess` function.
+        """
+        all_ids = []
+        all_scores = []
+
+        for batch in inputs["batches"]:
+            input_ids, token_type_ids, position_ids, attention_mask = map(
+                paddle.to_tensor, batch)
+            ids, scores = self._model.generate(
+                input_ids=input_ids,
+                token_type_ids=token_type_ids,
+                position_ids=position_ids,
+                attention_mask=attention_mask,
+                max_length=64,
+                min_length=1,
+                decode_strategy='sampling',
+                temperature=1.0,
+                top_k=5,
+                top_p=1.0,
+                num_beams=0,
+                length_penalty=1.0,
+                early_stopping=False,
+                use_faster=False,
+                num_return_sequences=1)
+            all_ids.append(ids)
+            all_scores.append(scores)
+        inputs['ids'] = all_ids
+        inputs['scores'] = all_scores
+        return inputs
+
+    def _post_process_response(self, token_ids, tokenizer):
+        """
+        Post-process the decoded sequence. Truncate from the first `[SEP]` token.
+        """
+        eos_pos = len(token_ids)
+        for i, tok_id in enumerate(token_ids):
+            if tok_id == tokenizer.sep_token_id:
+                eos_pos = i
+                break
+        token_ids = token_ids[:eos_pos]
+        tokens = tokenizer.convert_ids_to_tokens(token_ids)
+        tokens = tokenizer.merge_subword(tokens)
+        return token_ids, tokens
+
+    @contextlib.contextmanager
+    def interactive_mode(self, max_turn=3):
+        """
+        Enter the interactive mode.
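+
+        Args:
+            max_turn(int): The number of latest utterances (from both user and
+                bot) kept as dialogue history; older utterances are discarded.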
+        """
+        self._interactive_mode = True
+        self.max_turn = max_turn
+        self.context = deque(maxlen=self.max_turn)
+        try:
+            yield
+        finally:
+            self.context.clear()
+            self._interactive_mode = False
+
+    def _get_in_turn_repetition(self, pred, is_cn=False):
+        """
+        Check whether the prediction repeats a tri-gram within the turn.
+        """
+        if len(pred) == 0:
+            return True
+        if isinstance(pred[0], str):
+            pred = [tok.lower() for tok in pred]
+            if is_cn:
+                pred = "".join(pred)
+        tri_grams = set()
+        for i in range(len(pred) - 2):
+            tri_gram = tuple(pred[i:i + 3])
+            if tri_gram in tri_grams:
+                return True
+            tri_grams.add(tri_gram)
+        return False
+
+    def _select_response(self,
+                         ids,
+                         scores,
+                         tokenizer,
+                         max_dec_len=None,
+                         num_return_sequences=1,
+                         keep_space=True):
+        """
+        Select the response with the highest score from each group of candidates.
+        """
+        ids = ids.numpy().tolist()
+        scores = scores.numpy()
+
+        if len(ids) != len(scores) or (len(ids) % num_return_sequences) != 0:
+            raise ValueError(
+                "The length of `ids` ({}) must equal the length of `scores` and "
+                "be a multiple of `num_return_sequences` ({}).".format(
+                    len(ids), num_return_sequences))
+
+        group = []
+        tmp = []
+        for pred, score in zip(ids, scores):
+            pred_token_ids, pred_tokens = self._post_process_response(pred,
+                                                                      tokenizer)
+            num_token = len(pred_token_ids)
+            if keep_space:
+                response = " ".join(pred_tokens)
+            else:
+                response = "".join(pred_tokens)
+
+            in_turn_repetition = self._get_in_turn_repetition(pred_tokens, True) \
+                or self._get_in_turn_repetition(pred_token_ids)
+            # Penalize responses that hit the length limit (no natural ending)
+            # or that repeat themselves within the turn.
+            if max_dec_len is not None and num_token >= max_dec_len:
+                score -= 1e3
+            elif in_turn_repetition:
+                score -= 1e3
+
+            tmp.append([response, score])
+            if len(tmp) == num_return_sequences:
+                group.append(tmp)
+                tmp = []
+
+        results = []
+        for preds in group:
+            preds = sorted(preds, key=lambda x: -x[1])
+            results.append(preds[0][0])
+        return results
+
+    def _postprocess(self, inputs):
+        all_ids = inputs['ids']
+        all_scores = inputs['scores']
+
+        results = []
+        for ids, scores in zip(all_ids, all_scores):
+            results.extend(
+                self._select_response(
+                    ids,
+                    scores,
+                    self._tokenizer,
+                    num_return_sequences=1,
+                    keep_space=False))
+
+        if self._interactive_mode:
+            self.context.append(results[0].strip())
+        return results
diff --git a/paddlenlp/taskflow/task.py b/paddlenlp/taskflow/task.py
index 70cb4cd81631..d4fbca4a92a6 100644
--- a/paddlenlp/taskflow/task.py
+++ b/paddlenlp/taskflow/task.py
@@ -86,8 +86,8 @@ def _postprocess(self, inputs):
     @abstractmethod
     def _construct_input_spec(self):
         """
-       Construct the input spec for the convert dygraph model to static model.
-       """
+        Construct the input spec for converting the dygraph model to a static model.
+        """
 
     def _prepare_static_mode(self):
         """
diff --git a/paddlenlp/taskflow/taskflow.py b/paddlenlp/taskflow/taskflow.py
index bc39c25581b8..e67a19644840 100644
--- a/paddlenlp/taskflow/taskflow.py
+++ b/paddlenlp/taskflow/taskflow.py
@@ -29,6 +29,7 @@
 from .dependency_parsing import DDParserTask
 from .text_correction import CSCTask
 from .text_similarity import TextSimilarityTask
+from .dialogue import DialogueTask
 
 warnings.simplefilter(action='ignore', category=Warning, lineno=0, append=False)
@@ -178,6 +179,17 @@
             "model": "simbert-base-chinese"
         }
     },
+    'dialogue': {
+        "models": {
+            "plato-mini": {
+                "task_class": DialogueTask,
+                "task_flag": "dialogue-plato-mini"
+            },
+        },
+        "default": {
+            "model": "plato-mini"
+        }
+    },
 }
@@ -246,4 +258,13 @@ def tasks():
 
     def from_segments(self, *inputs):
         results = self.task_instance.from_segments(inputs)
-        return results
\ No newline at end of file
+        return results
+
+    def interactive_mode(self, max_turn):
+        with self.task_instance.interactive_mode(max_turn=max_turn):
+            while True:
+                human = input("[Human]:").strip()
+                if human.lower() == "exit":
+                    break
+                robot = self.task_instance(human)[0]
+                print("[Bot]:%s" % robot)