
Commit

[CodeStyle] fix taskflow (#4982)
* dependency parsing

* fix lexical

* fix lexical

* fix lexical

* poetry

* poetry

* qa

* text correction

* text generation

* seg
sijunhe authored Feb 24, 2023
1 parent 1a0af2b commit edf6fef
Showing 9 changed files with 43 additions and 122 deletions.
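The bulk of this commit reorders imports and drops unused ones; the grouping the hunks below converge on matches what isort produces by default (standard library, then third-party, then first-party, alphabetized within each group and separated by blank lines). A minimal sketch of that layout, assuming paddle, numpy, and paddlenlp are installed:

    # Import layout assumed by this commit's style fixes (isort defaults):
    import copy            # --- standard library ---
    import os

    import numpy as np     # --- third-party ---
    import paddle

    import paddlenlp       # --- first-party ---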
24 changes: 10 additions & 14 deletions paddlenlp/taskflow/dependency_parsing.py
@@ -15,17 +15,17 @@

import copy
import os
import itertools

import numpy as np
import paddle
from ..data import Vocab, Pad
from .utils import download_file, dygraph_mode_guard
from .task import Task

from ..data import Pad, Vocab
from .models import BiAffineParser
from .task import Task
from .utils import download_file

usage = r"""
from paddlenlp import Taskflow
from paddlenlp import Taskflow
ddp = Taskflow("dependency_parsing")
ddp("三亚是一座美丽的城市")
@@ -35,7 +35,7 @@
ddp(["三亚是一座美丽的城市", "他送了一本书"])
'''
[{'word': ['三亚', '是', '一座', '美丽', '的', '城市'], 'head': [2, 0, 6, 6, 4, 2], 'deprel': ['SBV', 'HED', 'ATT', 'ATT', 'MT', 'VOB']}, {'word': ['他', '送', '了', '一本', '书'], 'head': [2, 0, 2, 5, 2], 'deprel': ['SBV', 'HED', 'MT', 'ATT', 'VOB']}]
'''
'''
ddp = Taskflow("dependency_parsing", prob=True, use_pos=True)
ddp("三亚是一座美丽的城市")
@@ -64,7 +64,7 @@
ddp.from_segments([['三亚', '是', '一座', '美丽', '的', '城市'], ['他', '送', '了', '一本', '书']])
'''
[{'word': ['三亚', '是', '一座', '美丽', '的', '城市'], 'head': [2, 0, 6, 6, 4, 2], 'deprel': ['SBV', 'HED', 'ATT', 'ATT', 'MT', 'VOB']}, {'word': ['他', '送', '了', '一本', '书'], 'head': [2, 0, 2, 5, 2], 'deprel': ['SBV', 'HED', 'MT', 'ATT', 'VOB']}]
'''
'''
"""


@@ -181,7 +181,7 @@ def __init__(

try:
from LAC import LAC
except:
except Exception:
raise ImportError("Please install the dependencies first, pip install LAC --upgrade")

self.use_cuda = use_cuda
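The hunk above narrows a bare `except:` around the optional LAC import to `except Exception:`, which keeps KeyboardInterrupt and SystemExit propagating while still turning a missing dependency into an actionable message. A small self-contained sketch of that guard pattern (the optional_import helper is illustrative, not part of this commit):

    import importlib

    def optional_import(module_name, hint):
        """Import an optional dependency, or raise an actionable ImportError."""
        try:
            return importlib.import_module(module_name)
        except ImportError as exc:  # narrower than a bare "except:"
            raise ImportError(hint) from exc

    # Hypothetical usage mirroring the guard above:
    # lac = optional_import("LAC", "Please install the dependencies first, pip install LAC --upgrade")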
@@ -274,10 +274,6 @@ def _preprocess(self, inputs):
2) Generate the other model inputs from the raw text and token ids.
"""

# Get the config from the kwargs
num_workers = self.kwargs["num_workers"] if "num_workers" in self.kwargs else 0
lazy_load = self.kwargs["lazy_load"] if "lazy_load" in self.kwargs else False

outputs = {}

lac_results = []
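The deleted lines above were config lookups whose results were never used; where such a value is actually needed, dict.get with a default is the shorter idiom. A standalone illustration, not taken from the commit:

    def read_config(kwargs):
        # dict.get() with a default is the short form of the removed
        # "x if key in kwargs else default" lookups.
        num_workers = kwargs.get("num_workers", 0)
        lazy_load = kwargs.get("lazy_load", False)
        return num_workers, lazy_load

    print(read_config({"num_workers": 4}))  # -> (4, False)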
@@ -368,9 +364,9 @@ def _visualize(self, data):
data: a numpy array, use cv2.imshow to show it or cv2.imwrite to save it.
"""
try:
import matplotlib.pyplot as plt
import matplotlib.font_manager as font_manager
except:
import matplotlib.pyplot as plt
except Exception:
raise ImportError("Please install the dependencies first, pip install matplotlib --upgrade")

self.plt = plt
26 changes: 8 additions & 18 deletions paddlenlp/taskflow/lexical_analysis.py
@@ -13,26 +13,18 @@
# See the License for the specific language governing permissions and
# limitations under the License.

import glob
import json
import math
import os
import copy
import itertools

import numpy as np
import paddle
import paddle.nn as nn
import paddle.nn.functional as F
from ..datasets import load_dataset, MapDataset
from ..data import Stack, Pad, Tuple, Vocab, JiebaTokenizer
from .utils import download_file, add_docstrings, static_mode_guard, dygraph_mode_guard
from .utils import Customization
from .task import Task

from ..data import Pad, Stack, Tuple
from ..datasets import load_dataset
from .models import BiGruCrf
from .task import Task
from .utils import Customization

usage = r"""
from paddlenlp import Taskflow
from paddlenlp import Taskflow
lac = Taskflow("lexical_analysis")
lac("LAC是个优秀的分词工具")
@@ -42,7 +34,7 @@
lac(["LAC是个优秀的分词工具", "三亚是一个美丽的城市"])
'''
[{'text': 'LAC是个优秀的分词工具', 'segs': ['LAC', '是', '个', '优秀', '的', '分词', '工具'], 'tags': ['nz', 'v', 'q', 'a', 'u', 'n', 'n']},
[{'text': 'LAC是个优秀的分词工具', 'segs': ['LAC', '是', '个', '优秀', '的', '分词', '工具'], 'tags': ['nz', 'v', 'q', 'a', 'u', 'n', 'n']},
{'text': '三亚是一个美丽的城市', 'segs': ['三亚', '是', '一个', '美丽', '的', '城市'], 'tags': ['LOC', 'v', 'm', 'a', 'u', 'n']}
]
'''
Expand All @@ -60,7 +52,7 @@ def load_vocab(dict_path):
for i, line in enumerate(fin):
terms = line.strip("\n").split("\t")
if len(terms) == 2:
if reverse == None:
if reverse is None:
reverse = True if terms[0].isdigit() else False
if reverse:
value, key = terms
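Above, `reverse == None` becomes `reverse is None`: None is a singleton, so the identity check is the idiomatic comparison and cannot be fooled by a custom `__eq__`. A tiny standalone illustration:

    class AlwaysEqual:
        def __eq__(self, other):  # a pathological __eq__ to show the difference
            return True

    obj = AlwaysEqual()
    print(obj == None)  # True  -- __eq__ can lie
    print(obj is None)  # False -- identity cannot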
Expand Down Expand Up @@ -174,7 +166,6 @@ def _preprocess(self, inputs, padding=True, add_special_tokens=True):
batch_size = self.kwargs["batch_size"] if "batch_size" in self.kwargs else 1
num_workers = self.kwargs["num_workers"] if "num_workers" in self.kwargs else 0
self._split_sentence = self.kwargs["split_sentence"] if "split_sentence" in self.kwargs else False
infer_data = []
oov_token_id = self._word_vocab.get("OOV")

filter_inputs = []
@@ -238,7 +229,6 @@ def _postprocess(self, inputs):
"""
The model output is the tag ids, this function will convert the model output to raw text.
"""
batch_out = []
lengths = inputs["lens"]
preds = inputs["result"]
sents = inputs["text"]
5 changes: 2 additions & 3 deletions paddlenlp/taskflow/models/lexical_analysis_model.py
@@ -15,13 +15,12 @@

import paddle
import paddle.nn as nn
import paddle.nn.functional as F

from paddlenlp.layers.crf import LinearChainCrf, LinearChainCrfLoss
from paddlenlp.utils.tools import compare_version

try:
from paddle.text import ViterbiDecoder
except:
except Exception:
raise ImportError(
"Taskflow requires paddle version >= 2.2.0, but current paddle version is {}".format(
paddle.version.full_version
12 changes: 1 addition & 11 deletions paddlenlp/taskflow/poetry_generation.py
@@ -13,20 +13,10 @@
# See the License for the specific language governing permissions and
# limitations under the License.

import glob
import json
import math
import os
import copy
import itertools

import numpy as np
from .utils import download_file
from .text_generation import TextGenerationTask
from .task import Task

usage = r"""
from paddlenlp import Taskflow
from paddlenlp import Taskflow
poetry = Taskflow("poetry_generation")
poetry("林密不见人")
15 changes: 2 additions & 13 deletions paddlenlp/taskflow/pos_tagging.py
@@ -13,19 +13,10 @@
# See the License for the specific language governing permissions and
# limitations under the License.

import glob
import json
import math
import os
import copy
import itertools

import numpy as np
from .utils import download_file
from .lexical_analysis import load_vocab, LacTask
from .lexical_analysis import LacTask

usage = r"""
from paddlenlp import Taskflow
from paddlenlp import Taskflow
pos = Taskflow("pos_tagging")
pos("第十四届全运会在西安举办")
@@ -56,13 +47,11 @@ def _postprocess(self, inputs):
"""
The model output is the tag ids, this function will convert the model output to raw text.
"""
batch_out = []
lengths = inputs["lens"]
preds = inputs["result"]
sents = inputs["text"]
final_results = []
for sent_index in range(len(lengths)):
single_result = {}
tags = [self._id2tag_dict[str(index)] for index in preds[sent_index][: lengths[sent_index]]]
sent = sents[sent_index]
if self._custom:
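Several hunks in this file and the ones below simply delete locals such as batch_out and single_result that were assigned but never read (the pattern flake8 reports as F841). A short illustrative sketch, not from the commit itself:

    def postprocess(lengths):
        # batch_out = []      # assigned but never read (flake8 F841) -> removed
        final_results = []    # actually consumed below, so it stays
        for n in lengths:
            final_results.append(n * 2)
        return final_results

    print(postprocess([1, 2, 3]))  # [2, 4, 6]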
13 changes: 2 additions & 11 deletions paddlenlp/taskflow/question_answering.py
@@ -13,19 +13,10 @@
# See the License for the specific language governing permissions and
# limitations under the License.

import glob
import json
import math
import os
import copy
import itertools

import numpy as np
from .utils import download_file
from .text_generation import TextGenerationTask

usage = r"""
from paddlenlp import Taskflow
from paddlenlp import Taskflow
qa = Taskflow("question_answering")
qa("中国的国土面积有多大?")
@@ -37,7 +28,7 @@
'''
[{'text': '中国国土面积有多大?', 'answer': '960万平方公里。'}, {'text': '中国的首都在哪里?', 'answer': '北京。'}]
'''
"""

URLS = {
29 changes: 10 additions & 19 deletions paddlenlp/taskflow/text_correction.py
@@ -13,24 +13,15 @@
# See the License for the specific language governing permissions and
# limitations under the License.

import glob
import json
import math
import os
import copy
import itertools

import numpy as np
import paddle
import paddle.nn as nn
import paddle.nn.functional as F
from ..transformers import ErnieTokenizer, ErnieModel
from ..transformers import is_chinese_char
from ..datasets import load_dataset
from ..data import Stack, Pad, Tuple, Vocab
from .utils import download_file, add_docstrings, static_mode_guard

from ..data import Pad, Stack, Tuple, Vocab
from ..transformers import ErnieModel, ErnieTokenizer, is_chinese_char
from .models import ErnieForCSC
from .task import Task
from .utils import static_mode_guard

usage = r"""
from paddlenlp import Taskflow
@@ -47,11 +38,11 @@
text_correction(['遇到逆竟时,我们必须勇于面对,而且要愈挫愈勇,这样我们才能朝著成功之路前进。',
'人生就是如此,经过磨练才能让自己更加拙壮,才能使自己更加乐观。'])
'''
[{'source': '遇到逆竟时,我们必须勇于面对,而且要愈挫愈勇,这样我们才能朝著成功之路前进。',
'target': '遇到逆境时,我们必须勇于面对,而且要愈挫愈勇,这样我们才能朝著成功之路前进。',
'errors': [{'position': 3, 'correction': {'竟': '境'}}]},
{'source': '人生就是如此,经过磨练才能让自己更加拙壮,才能使自己更加乐观。',
'target': '人生就是如此,经过磨练才能让自己更加茁壮,才能使自己更加乐观。',
[{'source': '遇到逆竟时,我们必须勇于面对,而且要愈挫愈勇,这样我们才能朝著成功之路前进。',
'target': '遇到逆境时,我们必须勇于面对,而且要愈挫愈勇,这样我们才能朝著成功之路前进。',
'errors': [{'position': 3, 'correction': {'竟': '境'}}]},
{'source': '人生就是如此,经过磨练才能让自己更加拙壮,才能使自己更加乐观。',
'target': '人生就是如此,经过磨练才能让自己更加茁壮,才能使自己更加乐观。',
'errors': [{'position': 18, 'correction': {'拙': '茁'}}]}
]
'''
@@ -93,7 +84,7 @@ def __init__(self, task, model, **kwargs):
self._construct_tokenizer(model)
try:
import pypinyin
except:
except ImportError:
raise ImportError("Please install the dependencies first, pip install pypinyin --upgrade")
self._pypinyin = pypinyin
self._batchify_fn = lambda samples, fn=Tuple(
23 changes: 4 additions & 19 deletions paddlenlp/taskflow/text_generation.py
@@ -13,23 +13,12 @@
# See the License for the specific language governing permissions and
# limitations under the License.

import glob
import json
import math
import os
import copy
import itertools

import numpy as np
import paddle
import paddle.nn as nn
import paddle.nn.functional as F
from ..transformers import GPTForGreedyGeneration
from ..transformers import GPTChineseTokenizer, GPTTokenizer
from ..datasets import load_dataset
from ..data import Stack, Pad, Tuple
from .utils import download_file, add_docstrings, static_mode_guard, dygraph_mode_guard

from ..data import Pad, Stack, Tuple
from ..transformers import GPTChineseTokenizer, GPTForGreedyGeneration, GPTTokenizer
from .task import Task
from .utils import download_file, static_mode_guard

usage = r"""
"""
@@ -99,9 +88,7 @@ def _preprocess(self, inputs, padding=True, add_special_tokens=True):
inputs = self._check_input_text(inputs)
# Get the config from the kwargs
batch_size = self.kwargs["batch_size"] if "batch_size" in self.kwargs else 1
num_workers = self.kwargs["num_workers"] if "num_workers" in self.kwargs else 0
generation_task = self.kwargs["generation_task"] if "generation_task" in self.kwargs else "question_answering"
max_seq_len = 32

def select_few_shot_input(model_name, generation_task):
pre_input = ""
@@ -116,8 +103,6 @@ def select_few_shot_input(model_name, generation_task):

pre_input = select_few_shot_input(self.model, generation_task)

infer_data = []

examples = []
filter_inputs = []
for input_text in inputs:
18 changes: 4 additions & 14 deletions paddlenlp/taskflow/word_segmentation.py
@@ -13,22 +13,14 @@
# See the License for the specific language governing permissions and
# limitations under the License.

import glob
import json
import math
import os
import copy
import itertools

import numpy as np
import jieba
from .utils import download_file
from .task import Task
from .lexical_analysis import load_vocab, LacTask

from .lexical_analysis import LacTask
from .named_entity_recognition import NERWordTagTask
from .task import Task

usage = r"""
from paddlenlp import Taskflow
from paddlenlp import Taskflow
# Taskflow base模式
seg = Taskflow("word_segmentation")
@@ -124,13 +116,11 @@ def _postprocess(self, inputs):
"""
The model output is the tag ids, this function will convert the model output to raw text.
"""
batch_out = []
lengths = inputs["lens"]
preds = inputs["result"]
sents = inputs["text"]
final_results = []
for sent_index in range(len(lengths)):
single_result = {}
tags = [self._id2tag_dict[str(index)] for index in preds[sent_index][: lengths[sent_index]]]
sent = sents[sent_index]
if self._custom: