Skip to content

Commit

Permalink
Merge branch 'PaddlePaddle:develop' into log
Browse files Browse the repository at this point in the history
  • Loading branch information
lugimzzz authored Feb 28, 2023
2 parents 9010358 + 00d842b commit 4858d7d
Show file tree
Hide file tree
Showing 7 changed files with 103 additions and 31 deletions.
18 changes: 14 additions & 4 deletions paddlenlp/taskflow/dialogue.py
Original file line number Diff line number Diff line change
Expand Up @@ -74,23 +74,33 @@ class DialogueTask(Task):
"https://bj.bcebos.com/paddlenlp/taskflow/dialogue/plato-mini/model_config.json",
"5e853fda9a9b573815ad112e494a65af",
],
}
},
"__internal_testing__/tiny-random-plato": {
"model_state": [
"https://bj.bcebos.com/paddlenlp/models/community/__internal_testing__/tiny-random-plato/model_state.pdparams",
"fda5d068908505cf0c3a46125eb4d39e",
],
"model_config": [
"https://bj.bcebos.com/paddlenlp/models/community/__internal_testing__/tiny-random-plato/config.json",
"3664e658d5273a132f2e7345a8cafa53",
],
},
}

def __init__(self, task, model, batch_size=1, max_seq_len=512, **kwargs):
super().__init__(task=task, model=model, **kwargs)
self._static_mode = False
self._usage = usage
if not self.from_hf_hub:
if not self._custom_model:
self._check_task_files()
self._construct_tokenizer(self._task_path if self.from_hf_hub else model)
self._construct_tokenizer(self._task_path if self._custom_model else model)
self._batch_size = batch_size
self._max_seq_len = max_seq_len
self._interactive_mode = False
if self._static_mode:
self._get_inference_model()
else:
self._construct_model(self._task_path if self.from_hf_hub else model)
self._construct_model(self._task_path if self._custom_model else model)

def _construct_input_spec(self):
"""
Expand Down
48 changes: 43 additions & 5 deletions paddlenlp/taskflow/feature_extraction.py
Original file line number Diff line number Diff line change
Expand Up @@ -180,16 +180,44 @@ class MultimodalFeatureExtractionTask(Task):
"573ba0466e15cdb5bd423ff7010735ce",
],
},
"__internal_testing__/tiny-random-ernievil2": {
"model_state": [
"https://paddlenlp.bj.bcebos.com/models/community/__internal_testing__/tiny-random-ernievil2/model_state.pdparams",
"771c844e7b75f61123d9606c8c17b1d6",
],
"config": [
"https://paddlenlp.bj.bcebos.com/models/community/__internal_testing__/tiny-random-ernievil2/config.json",
"ae27a68336ccec6d3ffd14b48a6d1f25",
],
"vocab_file": [
"https://paddlenlp.bj.bcebos.com/models/community/__internal_testing__/tiny-random-ernievil2/vocab.txt",
"1c1c1f4fd93c5bed3b4eebec4de976a8",
],
"preprocessor_config": [
"https://paddlenlp.bj.bcebos.com/models/community/__internal_testing__/tiny-random-ernievil2/preprocessor_config.json",
"9a2e8da9f41896fedb86756b79355ee2",
],
"special_tokens_map": [
"https://paddlenlp.bj.bcebos.com/models/community/__internal_testing__/tiny-random-ernievil2/special_tokens_map.json",
"8b3fb1023167bb4ab9d70708eb05f6ec",
],
"tokenizer_config": [
"https://paddlenlp.bj.bcebos.com/models/community/__internal_testing__/tiny-random-ernievil2/tokenizer_config.json",
"2333f189cad8dd559de61bbff4d4a789",
],
},
}

def __init__(self, task, model, batch_size=1, is_static_model=True, return_tensors="pd", **kwargs):
def __init__(self, task, model, batch_size=1, is_static_model=True, max_length=128, return_tensors="pd", **kwargs):
super().__init__(task=task, model=model, **kwargs)
self._seed = None
# we do not use batch
self.export_type = "text"
self._batch_size = batch_size
self.return_tensors = return_tensors
self._check_task_files()
if not self.from_hf_hub:
self._check_task_files()
self._max_length = max_length
self._construct_tokenizer()
self.is_static_model = is_static_model
self._config_map = {}
Expand All @@ -214,7 +242,7 @@ def _construct_tokenizer(self):
"""
Construct the tokenizer for the predictor.
"""
self._processor = AutoProcessor.from_pretrained(self.model)
self._processor = AutoProcessor.from_pretrained(self._task_path)

def _batchify(self, data, batch_size):
"""
Expand All @@ -231,12 +259,22 @@ def _parse_batch(batch_examples):
if self.is_static_model:
# The input of static model is numpy array
tokenized_inputs = self._processor(
text=batch_texts, images=batch_images, return_tensors="np", padding="max_length", truncation=True
text=batch_texts,
images=batch_images,
return_tensors="np",
padding="max_length",
max_length=self._max_length,
truncation=True,
)
else:
# The input of dygraph model is paddle.Tensor
tokenized_inputs = self._processor(
text=batch_texts, images=batch_images, return_tensors="pd", padding="max_length", truncation=True
text=batch_texts,
images=batch_images,
return_tensors="pd",
padding="max_length",
max_length=self._max_length,
truncation=True,
)
return tokenized_inputs

Expand Down
13 changes: 13 additions & 0 deletions paddlenlp/taskflow/taskflow.py
Original file line number Diff line number Diff line change
Expand Up @@ -71,6 +71,10 @@
"dialogue": {
"models": {
"plato-mini": {"task_class": DialogueTask, "task_flag": "dialogue-plato-mini"},
"__internal_testing__/tiny-random-plato": {
"task_class": DialogueTask,
"task_flag": "dialogue-tiny-random-plato",
},
},
"default": {
"model": "plato-mini",
Expand Down Expand Up @@ -235,6 +239,10 @@
"task_class": TextSimilarityTask,
"task_flag": "text_similarity-rocketqa-nano-cross-encoder",
},
"__internal_testing__/tiny-random-bert": {
"task_class": TextSimilarityTask,
"task_flag": "text_similarity-tiny-random-bert",
},
},
"default": {"model": "simbert-base-chinese"},
},
Expand Down Expand Up @@ -597,6 +605,11 @@
"task_flag": "feature_extraction-openai/clip-rn50x4",
"task_priority_path": "openai/clip-rn50x4",
},
"__internal_testing__/tiny-random-ernievil2": {
"task_class": MultimodalFeatureExtractionTask,
"task_flag": "feature_extraction-tiny-random-ernievil2",
"task_priority_path": "__internal_testing__/tiny-random-ernievil2",
},
},
"default": {"model": "PaddlePaddle/ernie_vil-2.0-base-zh"},
},
Expand Down
24 changes: 17 additions & 7 deletions paddlenlp/taskflow/text_similarity.py
Original file line number Diff line number Diff line change
Expand Up @@ -122,20 +122,30 @@ class TextSimilarityTask(Task):
"dcff14cd671e1064be2c5d63734098bb",
],
},
"__internal_testing__/tiny-random-bert": {
"model_state": [
"https://bj.bcebos.com/paddlenlp/models/community/__internal_testing__/tiny-random-bert/model_state.pdparams",
"a7a54deee08235fc6ae454f5def2d663",
],
"model_config": [
"https://bj.bcebos.com/paddlenlp/models/community/__internal_testing__/tiny-random-bert/config.json",
"bfaa763f77da7cc796de4e0ad4b389e9",
],
},
}

def __init__(self, task, model, batch_size=1, max_seq_len=384, **kwargs):
def __init__(self, task, model, batch_size=1, max_length=384, **kwargs):
super().__init__(task=task, model=model, **kwargs)
self._static_mode = True
self._check_task_files()
self._get_inference_model()
if not self.from_hf_hub:
self._check_task_files()
if self._static_mode:
self._get_inference_model()
else:
self._construct_model(model)
self._construct_tokenizer(model)
self._batch_size = batch_size
self._max_seq_len = max_seq_len
self._max_length = max_length
self._usage = usage
self.model_name = model

Expand Down Expand Up @@ -185,16 +195,16 @@ def _preprocess(self, inputs):
for data in inputs:
text1, text2 = data[0], data[1]
if "rocketqa" in self.model_name:
encoded_inputs = self._tokenizer(text=text1, text_pair=text2, max_seq_len=self._max_seq_len)
encoded_inputs = self._tokenizer(text=text1, text_pair=text2, max_length=self._max_length)
ids = encoded_inputs["input_ids"]
segment_ids = encoded_inputs["token_type_ids"]
examples.append((ids, segment_ids))
else:
text1_encoded_inputs = self._tokenizer(text=text1, max_seq_len=self._max_seq_len)
text1_encoded_inputs = self._tokenizer(text=text1, max_length=self._max_length)
text1_input_ids = text1_encoded_inputs["input_ids"]
text1_token_type_ids = text1_encoded_inputs["token_type_ids"]

text2_encoded_inputs = self._tokenizer(text=text2, max_seq_len=self._max_seq_len)
text2_encoded_inputs = self._tokenizer(text=text2, max_length=self._max_length)
text2_input_ids = text2_encoded_inputs["input_ids"]
text2_token_type_ids = text2_encoded_inputs["token_type_ids"]

Expand Down
2 changes: 1 addition & 1 deletion tests/taskflow/test_dialogue.py
Original file line number Diff line number Diff line change
Expand Up @@ -22,7 +22,7 @@ class TestDialogueTask(unittest.TestCase):
def setUpClass(cls):
cls.dialogue = Taskflow(
task="dialogue",
task_path="__internal_testing__/tiny-random-plato",
model="__internal_testing__/tiny-random-plato",
)
cls.max_turn = 3

Expand Down
26 changes: 14 additions & 12 deletions tests/taskflow/test_feature_extraction.py
Original file line number Diff line number Diff line change
Expand Up @@ -32,28 +32,28 @@ def setUpClass(cls):
cls.max_resolution = 40
cls.min_resolution = 30
cls.num_channels = 3
cls.model = "__internal_testing__/tiny-random-ernievil2"
cls.max_length = 30

@classmethod
def tearDownClass(cls):
cls.temp_dir.cleanup()

def test_model_np(self):
feature_extractor = Taskflow(
model="PaddlePaddle/ernie_vil-2.0-base-zh",
model="__internal_testing__/tiny-random-ernievil2",
task="feature_extraction",
task_path=self.model,
return_tensors="np",
max_length=self.max_length,
)
outputs = feature_extractor("This is a test")
self.assertEqual(outputs["features"].shape, (1, 768))
self.assertEqual(outputs["features"].shape, (1, 32))

def test_return_tensors(self):
feature_extractor = Taskflow(
model="PaddlePaddle/ernie_vil-2.0-base-zh",
model="__internal_testing__/tiny-random-ernievil2",
task="feature_extraction",
task_path=self.model,
return_tensors="pd",
max_length=self.max_length,
)
outputs = feature_extractor(
"This is a test",
Expand Down Expand Up @@ -94,23 +94,23 @@ def test_feature_extraction_task(self):
input_text = (["这是一只猫", "这是一只狗"],)
# dygraph text test
dygraph_taskflow = MultimodalFeatureExtractionTask(
model="PaddlePaddle/ernie_vil-2.0-base-zh",
model="__internal_testing__/tiny-random-ernievil2",
task="feature_extraction",
task_path=self.model,
is_static_model=False,
return_tensors="np",
max_length=self.max_length,
)
dygraph_results = dygraph_taskflow(input_text)
shape = dygraph_results["features"].shape
self.assertEqual(shape[0], 2)
# static text test
static_taskflow = MultimodalFeatureExtractionTask(
model="PaddlePaddle/ernie_vil-2.0-base-zh",
model="__internal_testing__/tiny-random-ernievil2",
task="feature_extraction",
task_path=self.model,
is_static_model=True,
return_tensors="np",
device_id=0,
max_length=self.max_length,
)
static_results = static_taskflow(input_text)
self.assertEqual(static_results["features"].shape[0], 2)
Expand All @@ -137,21 +137,23 @@ def test_taskflow_task(self):

# dygraph test
dygraph_taskflow = Taskflow(
model="__internal_testing__/tiny-random-ernievil2",
task="feature_extraction",
task_path=self.model,
is_static_model=False,
return_tensors="np",
max_length=self.max_length,
)
dygraph_results = dygraph_taskflow(input_text)
shape = dygraph_results["features"].shape

self.assertEqual(shape[0], 2)
# static test
static_taskflow = Taskflow(
model="__internal_testing__/tiny-random-ernievil2",
task="feature_extraction",
task_path=self.model,
is_static_model=True,
return_tensors="np",
max_length=self.max_length,
)
static_results = static_taskflow(input_text)
self.assertEqual(static_results["features"].shape[0], 2)
Expand Down
3 changes: 1 addition & 2 deletions tests/taskflow/test_text_similarity.py
Original file line number Diff line number Diff line change
Expand Up @@ -21,8 +21,7 @@ class TestTextSimilarityTask(unittest.TestCase):
def test_bert_model(self):
similarity = Taskflow(
task="text_similarity",
model="simbert-base-chinese",
task_path="__internal_testing__/tiny-random-bert",
model="__internal_testing__/tiny-random-bert",
)
results = similarity([["世界上什么东西最小", "世界上什么东西最小?"]])
self.assertTrue(len(results) == 1)
Expand Down

0 comments on commit 4858d7d

Please sign in to comment.