From edf6fef0009638c304cf3fc5b13d2aaec90d3d8c Mon Sep 17 00:00:00 2001 From: Sijun He Date: Fri, 24 Feb 2023 15:12:04 +0800 Subject: [PATCH] [CodeStyle] fix taskflow (#4982) * dependency parsing * fix lexical * fix lexical * fix lexical * poetry * poetry * qa * text correction * text generation * seg --- paddlenlp/taskflow/dependency_parsing.py | 24 +++++++-------- paddlenlp/taskflow/lexical_analysis.py | 26 +++++------------ .../taskflow/models/lexical_analysis_model.py | 5 ++-- paddlenlp/taskflow/poetry_generation.py | 12 +------- paddlenlp/taskflow/pos_tagging.py | 15 ++-------- paddlenlp/taskflow/question_answering.py | 13 ++------- paddlenlp/taskflow/text_correction.py | 29 +++++++------------ paddlenlp/taskflow/text_generation.py | 23 +++------------ paddlenlp/taskflow/word_segmentation.py | 18 +++--------- 9 files changed, 43 insertions(+), 122 deletions(-) diff --git a/paddlenlp/taskflow/dependency_parsing.py b/paddlenlp/taskflow/dependency_parsing.py index 58974c4e03cf..e72c470ad227 100644 --- a/paddlenlp/taskflow/dependency_parsing.py +++ b/paddlenlp/taskflow/dependency_parsing.py @@ -15,17 +15,17 @@ import copy import os -import itertools import numpy as np import paddle -from ..data import Vocab, Pad -from .utils import download_file, dygraph_mode_guard -from .task import Task + +from ..data import Pad, Vocab from .models import BiAffineParser +from .task import Task +from .utils import download_file usage = r""" - from paddlenlp import Taskflow + from paddlenlp import Taskflow ddp = Taskflow("dependency_parsing") ddp("三亚是一座美丽的城市") @@ -35,7 +35,7 @@ ddp(["三亚是一座美丽的城市", "他送了一本书"]) ''' [{'word': ['三亚', '是', '一座', '美丽', '的', '城市'], 'head': [2, 0, 6, 6, 4, 2], 'deprel': ['SBV', 'HED', 'ATT', 'ATT', 'MT', 'VOB']}, {'word': ['他', '送', '了', '一本', '书'], 'head': [2, 0, 2, 5, 2], 'deprel': ['SBV', 'HED', 'MT', 'ATT', 'VOB']}] - ''' + ''' ddp = Taskflow("dependency_parsing", prob=True, use_pos=True) ddp("三亚是一座美丽的城市") @@ -64,7 +64,7 @@ ddp.from_segments([['三亚', '是', '一座', '美丽', '的', '城市'], ['他', '送', '了', '一本', '书']]) ''' [{'word': ['三亚', '是', '一座', '美丽', '的', '城市'], 'head': [2, 0, 6, 6, 4, 2], 'deprel': ['SBV', 'HED', 'ATT', 'ATT', 'MT', 'VOB']}, {'word': ['他', '送', '了', '一本', '书'], 'head': [2, 0, 2, 5, 2], 'deprel': ['SBV', 'HED', 'MT', 'ATT', 'VOB']}] - ''' + ''' """ @@ -181,7 +181,7 @@ def __init__( try: from LAC import LAC - except: + except Exception: raise ImportError("Please install the dependencies first, pip install LAC --upgrade") self.use_cuda = use_cuda @@ -274,10 +274,6 @@ def _preprocess(self, inputs): 2) Generate the other model inputs from the raw text and token ids. """ - # Get the config from the kwargs - num_workers = self.kwargs["num_workers"] if "num_workers" in self.kwargs else 0 - lazy_load = self.kwargs["lazy_load"] if "lazy_load" in self.kwargs else False - outputs = {} lac_results = [] @@ -368,9 +364,9 @@ def _visualize(self, data): data: a numpy array, use cv2.imshow to show it or cv2.imwrite to save it. """ try: - import matplotlib.pyplot as plt import matplotlib.font_manager as font_manager - except: + import matplotlib.pyplot as plt + except Exception: raise ImportError("Please install the dependencies first, pip install matplotlib --upgrade") self.plt = plt diff --git a/paddlenlp/taskflow/lexical_analysis.py b/paddlenlp/taskflow/lexical_analysis.py index fd07af5b8a4a..92786e13d27c 100644 --- a/paddlenlp/taskflow/lexical_analysis.py +++ b/paddlenlp/taskflow/lexical_analysis.py @@ -13,26 +13,18 @@ # See the License for the specific language governing permissions and # limitations under the License. -import glob -import json -import math import os -import copy -import itertools -import numpy as np import paddle -import paddle.nn as nn -import paddle.nn.functional as F -from ..datasets import load_dataset, MapDataset -from ..data import Stack, Pad, Tuple, Vocab, JiebaTokenizer -from .utils import download_file, add_docstrings, static_mode_guard, dygraph_mode_guard -from .utils import Customization -from .task import Task + +from ..data import Pad, Stack, Tuple +from ..datasets import load_dataset from .models import BiGruCrf +from .task import Task +from .utils import Customization usage = r""" - from paddlenlp import Taskflow + from paddlenlp import Taskflow lac = Taskflow("lexical_analysis") lac("LAC是个优秀的分词工具") @@ -42,7 +34,7 @@ lac(["LAC是个优秀的分词工具", "三亚是一个美丽的城市"]) ''' - [{'text': 'LAC是个优秀的分词工具', 'segs': ['LAC', '是', '个', '优秀', '的', '分词', '工具'], 'tags': ['nz', 'v', 'q', 'a', 'u', 'n', 'n']}, + [{'text': 'LAC是个优秀的分词工具', 'segs': ['LAC', '是', '个', '优秀', '的', '分词', '工具'], 'tags': ['nz', 'v', 'q', 'a', 'u', 'n', 'n']}, {'text': '三亚是一个美丽的城市', 'segs': ['三亚', '是', '一个', '美丽', '的', '城市'], 'tags': ['LOC', 'v', 'm', 'a', 'u', 'n']} ] ''' @@ -60,7 +52,7 @@ def load_vocab(dict_path): for i, line in enumerate(fin): terms = line.strip("\n").split("\t") if len(terms) == 2: - if reverse == None: + if reverse is None: reverse = True if terms[0].isdigit() else False if reverse: value, key = terms @@ -174,7 +166,6 @@ def _preprocess(self, inputs, padding=True, add_special_tokens=True): batch_size = self.kwargs["batch_size"] if "batch_size" in self.kwargs else 1 num_workers = self.kwargs["num_workers"] if "num_workers" in self.kwargs else 0 self._split_sentence = self.kwargs["split_sentence"] if "split_sentence" in self.kwargs else False - infer_data = [] oov_token_id = self._word_vocab.get("OOV") filter_inputs = [] @@ -238,7 +229,6 @@ def _postprocess(self, inputs): """ The model output is the tag ids, this function will convert the model output to raw text. """ - batch_out = [] lengths = inputs["lens"] preds = inputs["result"] sents = inputs["text"] diff --git a/paddlenlp/taskflow/models/lexical_analysis_model.py b/paddlenlp/taskflow/models/lexical_analysis_model.py index e3f768851066..32f711020916 100644 --- a/paddlenlp/taskflow/models/lexical_analysis_model.py +++ b/paddlenlp/taskflow/models/lexical_analysis_model.py @@ -15,13 +15,12 @@ import paddle import paddle.nn as nn -import paddle.nn.functional as F + from paddlenlp.layers.crf import LinearChainCrf, LinearChainCrfLoss -from paddlenlp.utils.tools import compare_version try: from paddle.text import ViterbiDecoder -except: +except Exception: raise ImportError( "Taskflow requires paddle version >= 2.2.0, but current paddle version is {}".format( paddle.version.full_version diff --git a/paddlenlp/taskflow/poetry_generation.py b/paddlenlp/taskflow/poetry_generation.py index d6c34339905d..7678ea232edd 100644 --- a/paddlenlp/taskflow/poetry_generation.py +++ b/paddlenlp/taskflow/poetry_generation.py @@ -13,20 +13,10 @@ # See the License for the specific language governing permissions and # limitations under the License. -import glob -import json -import math -import os -import copy -import itertools - -import numpy as np -from .utils import download_file from .text_generation import TextGenerationTask -from .task import Task usage = r""" - from paddlenlp import Taskflow + from paddlenlp import Taskflow poetry = Taskflow("poetry_generation") poetry("林密不见人") diff --git a/paddlenlp/taskflow/pos_tagging.py b/paddlenlp/taskflow/pos_tagging.py index 4359a1d07c2c..6d7a309112b8 100644 --- a/paddlenlp/taskflow/pos_tagging.py +++ b/paddlenlp/taskflow/pos_tagging.py @@ -13,19 +13,10 @@ # See the License for the specific language governing permissions and # limitations under the License. -import glob -import json -import math -import os -import copy -import itertools - -import numpy as np -from .utils import download_file -from .lexical_analysis import load_vocab, LacTask +from .lexical_analysis import LacTask usage = r""" - from paddlenlp import Taskflow + from paddlenlp import Taskflow pos = Taskflow("pos_tagging") pos("第十四届全运会在西安举办") @@ -56,13 +47,11 @@ def _postprocess(self, inputs): """ The model output is the tag ids, this function will convert the model output to raw text. """ - batch_out = [] lengths = inputs["lens"] preds = inputs["result"] sents = inputs["text"] final_results = [] for sent_index in range(len(lengths)): - single_result = {} tags = [self._id2tag_dict[str(index)] for index in preds[sent_index][: lengths[sent_index]]] sent = sents[sent_index] if self._custom: diff --git a/paddlenlp/taskflow/question_answering.py b/paddlenlp/taskflow/question_answering.py index 84a11ea4787b..c00d1d3e743f 100644 --- a/paddlenlp/taskflow/question_answering.py +++ b/paddlenlp/taskflow/question_answering.py @@ -13,19 +13,10 @@ # See the License for the specific language governing permissions and # limitations under the License. -import glob -import json -import math -import os -import copy -import itertools - -import numpy as np -from .utils import download_file from .text_generation import TextGenerationTask usage = r""" - from paddlenlp import Taskflow + from paddlenlp import Taskflow qa = Taskflow("question_answering") qa("中国的国土面积有多大?") @@ -37,7 +28,7 @@ ''' [{'text': '中国国土面积有多大?', 'answer': '960万平方公里。'}, {'text': '中国的首都在哪里?', 'answer': '北京。'}] ''' - + """ URLS = { diff --git a/paddlenlp/taskflow/text_correction.py b/paddlenlp/taskflow/text_correction.py index 932512c77b3a..d7fd11c6248f 100644 --- a/paddlenlp/taskflow/text_correction.py +++ b/paddlenlp/taskflow/text_correction.py @@ -13,24 +13,15 @@ # See the License for the specific language governing permissions and # limitations under the License. -import glob -import json -import math import os -import copy -import itertools -import numpy as np import paddle -import paddle.nn as nn -import paddle.nn.functional as F -from ..transformers import ErnieTokenizer, ErnieModel -from ..transformers import is_chinese_char -from ..datasets import load_dataset -from ..data import Stack, Pad, Tuple, Vocab -from .utils import download_file, add_docstrings, static_mode_guard + +from ..data import Pad, Stack, Tuple, Vocab +from ..transformers import ErnieModel, ErnieTokenizer, is_chinese_char from .models import ErnieForCSC from .task import Task +from .utils import static_mode_guard usage = r""" from paddlenlp import Taskflow @@ -47,11 +38,11 @@ text_correction(['遇到逆竟时,我们必须勇于面对,而且要愈挫愈勇,这样我们才能朝著成功之路前进。', '人生就是如此,经过磨练才能让自己更加拙壮,才能使自己更加乐观。']) ''' - [{'source': '遇到逆竟时,我们必须勇于面对,而且要愈挫愈勇,这样我们才能朝著成功之路前进。', - 'target': '遇到逆境时,我们必须勇于面对,而且要愈挫愈勇,这样我们才能朝著成功之路前进。', - 'errors': [{'position': 3, 'correction': {'竟': '境'}}]}, - {'source': '人生就是如此,经过磨练才能让自己更加拙壮,才能使自己更加乐观。', - 'target': '人生就是如此,经过磨练才能让自己更加茁壮,才能使自己更加乐观。', + [{'source': '遇到逆竟时,我们必须勇于面对,而且要愈挫愈勇,这样我们才能朝著成功之路前进。', + 'target': '遇到逆境时,我们必须勇于面对,而且要愈挫愈勇,这样我们才能朝著成功之路前进。', + 'errors': [{'position': 3, 'correction': {'竟': '境'}}]}, + {'source': '人生就是如此,经过磨练才能让自己更加拙壮,才能使自己更加乐观。', + 'target': '人生就是如此,经过磨练才能让自己更加茁壮,才能使自己更加乐观。', 'errors': [{'position': 18, 'correction': {'拙': '茁'}}]} ] ''' @@ -93,7 +84,7 @@ def __init__(self, task, model, **kwargs): self._construct_tokenizer(model) try: import pypinyin - except: + except ImportError: raise ImportError("Please install the dependencies first, pip install pypinyin --upgrade") self._pypinyin = pypinyin self._batchify_fn = lambda samples, fn=Tuple( diff --git a/paddlenlp/taskflow/text_generation.py b/paddlenlp/taskflow/text_generation.py index 4be8ac9e2895..0eeaacf4580d 100644 --- a/paddlenlp/taskflow/text_generation.py +++ b/paddlenlp/taskflow/text_generation.py @@ -13,23 +13,12 @@ # See the License for the specific language governing permissions and # limitations under the License. -import glob -import json -import math -import os -import copy -import itertools - -import numpy as np import paddle -import paddle.nn as nn -import paddle.nn.functional as F -from ..transformers import GPTForGreedyGeneration -from ..transformers import GPTChineseTokenizer, GPTTokenizer -from ..datasets import load_dataset -from ..data import Stack, Pad, Tuple -from .utils import download_file, add_docstrings, static_mode_guard, dygraph_mode_guard + +from ..data import Pad, Stack, Tuple +from ..transformers import GPTChineseTokenizer, GPTForGreedyGeneration, GPTTokenizer from .task import Task +from .utils import download_file, static_mode_guard usage = r""" """ @@ -99,9 +88,7 @@ def _preprocess(self, inputs, padding=True, add_special_tokens=True): inputs = self._check_input_text(inputs) # Get the config from the kwargs batch_size = self.kwargs["batch_size"] if "batch_size" in self.kwargs else 1 - num_workers = self.kwargs["num_workers"] if "num_workers" in self.kwargs else 0 generation_task = self.kwargs["generation_task"] if "generation_task" in self.kwargs else "question_answering" - max_seq_len = 32 def select_few_shot_input(model_name, generation_task): pre_input = "" @@ -116,8 +103,6 @@ def select_few_shot_input(model_name, generation_task): pre_input = select_few_shot_input(self.model, generation_task) - infer_data = [] - examples = [] filter_inputs = [] for input_text in inputs: diff --git a/paddlenlp/taskflow/word_segmentation.py b/paddlenlp/taskflow/word_segmentation.py index 254c72973e1f..1bb3974b97a5 100644 --- a/paddlenlp/taskflow/word_segmentation.py +++ b/paddlenlp/taskflow/word_segmentation.py @@ -13,22 +13,14 @@ # See the License for the specific language governing permissions and # limitations under the License. -import glob -import json -import math -import os -import copy -import itertools - -import numpy as np import jieba -from .utils import download_file -from .task import Task -from .lexical_analysis import load_vocab, LacTask + +from .lexical_analysis import LacTask from .named_entity_recognition import NERWordTagTask +from .task import Task usage = r""" - from paddlenlp import Taskflow + from paddlenlp import Taskflow # Taskflow base模式 seg = Taskflow("word_segmentation") @@ -124,13 +116,11 @@ def _postprocess(self, inputs): """ The model output is the tag ids, this function will convert the model output to raw text. """ - batch_out = [] lengths = inputs["lens"] preds = inputs["result"] sents = inputs["text"] final_results = [] for sent_index in range(len(lengths)): - single_result = {} tags = [self._id2tag_dict[str(index)] for index in preds[sent_index][: lengths[sent_index]]] sent = sents[sent_index] if self._custom: