
Commit

Fix type promotion problem. (#8414)
* fix type promotion problem.
zxcd authored May 11, 2024
1 parent c6e5459 commit 99fbc41
Showing 12 changed files with 27 additions and 23 deletions.
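
Every hunk below applies the same fix pattern: where a binary op previously mixed tensor dtypes (float scores added to or multiplied by integer masks and lengths, or int32 combined with int64 indices), the second operand is now cast explicitly to the first operand's dtype via .astype(...) or wrapped with paddle.to_tensor(..., dtype=...), so the result no longer depends on implicit type promotion. A minimal sketch of the pattern; tensor names and shapes here are illustrative, not taken from the diff:

import paddle

# An attention-style update: float weights plus an integer mask.
attn_weights = paddle.randn([2, 4, 8, 8], dtype="float32")
attention_mask = paddle.zeros([2, 1, 1, 8], dtype="int64")

# Before: mixed-dtype add, left to implicit promotion rules.
# attn_weights = attn_weights + attention_mask

# After: cast explicitly so both operands share attn_weights' dtype.
attn_weights = attn_weights + attention_mask.astype(attn_weights.dtype)

# Same idea for a Python scalar, as in update_scores_for_generation:
# wrap it in a tensor carrying the scores' dtype before the arithmetic.
scores = paddle.randn([2, 1], dtype="float32")
next_scores = paddle.randn([2, 1], dtype="float32")
length_t = paddle.to_tensor(5, dtype=scores.dtype)
scores = (scores * length_t + next_scores) / (length_t + 1)
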
4 changes: 3 additions & 1 deletion paddlenlp/generation/utils.py
@@ -511,7 +511,9 @@ def update_model_kwargs_for_generation(outputs, model_kwargs, is_encoder_decoder
     def update_scores_for_generation(scores, next_scores, length, unfinished_flag):
         # update scores

-        unfinished_scores = (scores * length + next_scores) / (length + 1)
+        unfinished_scores = (scores * paddle.to_tensor(length, dtype=scores.dtype) + next_scores) / (
+            paddle.to_tensor(length, dtype=scores.dtype) + 1
+        )
         scores = paddle.where(unfinished_flag, unfinished_scores, scores)
         return scores

2 changes: 1 addition & 1 deletion paddlenlp/layers/crf.py
@@ -165,7 +165,7 @@ def _point_score(self, inputs, labels, lengths):
         flattened_inputs = inputs.reshape([-1])
         offsets = paddle.unsqueeze(self._get_batch_index(batch_size) * seq_len * n_labels, 1)
         offsets += paddle.unsqueeze(self._get_seq_index(seq_len) * n_labels, 0)
-        flattened_tag_indices = paddle.reshape(offsets + labels, [-1])
+        flattened_tag_indices = paddle.reshape(offsets + labels.astype(offsets.dtype), [-1])

         scores = paddle.gather(flattened_inputs, flattened_tag_indices).reshape([batch_size, seq_len])

2 changes: 1 addition & 1 deletion paddlenlp/metrics/perplexity.py
@@ -92,7 +92,7 @@ def compute(self, pred, label, seq_mask=None):
         ce = F.cross_entropy(input=pred, label=label, reduction="none", soft_label=False)
         ce = paddle.squeeze(ce, axis=[2])
         if seq_mask is not None:
-            ce = ce * seq_mask
+            ce = ce * seq_mask.astype(ce.dtype)
             word_num = paddle.sum(seq_mask)
             return ce, word_num
         return ce
2 changes: 1 addition & 1 deletion paddlenlp/prompt/verbalizer.py
@@ -162,7 +162,7 @@ def aggregate(self, outputs: Tensor, mask: Tensor, atype: str):
         Aggregate multiple tokens/words for each word/label.
         """
         if atype == "mean":
-            outputs = outputs * mask
+            outputs = outputs * mask.astype(outputs.dtype)
             outputs = outputs.sum(axis=-1) / (mask.sum(axis=-1) + 1e-15)
         elif atype == "max":
             outputs = (outputs - 1e4 * (1 - mask)).max(axis=-1)
4 changes: 3 additions & 1 deletion paddlenlp/transformers/convbert/modeling.py
@@ -1137,7 +1137,9 @@ def update_inputs(self, sequence, updates, positions):
         N = positions.shape[1]
         assert N == L, "the dimension of inputs and mask should be same as [batch_size, sequence_length]"

-        updated_sequence = ((paddle.ones_like(sequence) - positions) * sequence) + (positions * updates)
+        updated_sequence = ((paddle.ones_like(sequence) - positions) * sequence) + (
+            positions * updates.astype(positions.dtype)
+        )

         return updated_sequence

8 changes: 6 additions & 2 deletions paddlenlp/transformers/electra/modeling.py
@@ -1051,7 +1051,9 @@ def get_discriminator_inputs(self, inputs, raw_inputs, generator_logits, generat
         mask_positions = paddle.where(generator_labels == -100, umask_positions, mask_positions)
         updated_inputs = self.update_inputs(inputs, sampled_tokids, mask_positions)
         # use inputs and updated_input to get discriminator labels
-        labels = mask_positions * (paddle.ones_like(inputs) - paddle.equal(updated_inputs, raw_inputs).astype("int64"))
+        labels = mask_positions * (
+            paddle.ones_like(inputs) - paddle.equal(updated_inputs, raw_inputs).astype(raw_inputs.dtype)
+        )
         return updated_inputs, labels, sampled_tokids

     def sample_from_softmax(self, logits, use_softmax_sample=True):
@@ -1073,7 +1075,9 @@ def update_inputs(self, sequence, updates, positions):
         N = positions.shape[1]
         assert N == L, "the dimension of inputs and mask should be same as [B, L]"

-        updated_sequence = ((paddle.ones_like(sequence) - positions) * sequence) + (positions * updates)
+        updated_sequence = ((paddle.ones_like(sequence) - positions) * sequence) + (
+            positions * updates.astype(positions.dtype)
+        )

         return updated_sequence

4 changes: 2 additions & 2 deletions paddlenlp/transformers/funnel/modeling.py
@@ -519,7 +519,7 @@ def relative_positional_attention(self, position_embeds, q_head, context_len, cl
             positional_attn = _relative_shift_gather(positional_attn, context_len, shift)

         if cls_mask is not None:
-            positional_attn *= cls_mask
+            positional_attn *= cls_mask.astype(positional_attn.dtype)
         return positional_attn

     def relative_token_type_attention(self, token_type_mat, q_head, cls_mask=None):
@@ -547,7 +547,7 @@ def relative_token_type_attention(self, token_type_mat, q_head, cls_mask=None):
         )

         if cls_mask is not None:
-            token_type_attn *= cls_mask
+            token_type_attn *= cls_mask.astype(token_type_attn.dtype)
         return token_type_attn

     def forward(self, query, key, value, attention_inputs, output_attentions=False):
2 changes: 1 addition & 1 deletion paddlenlp/transformers/gptj/modeling.py
@@ -158,7 +158,7 @@ def _attn(

         if attention_mask is not None:
             # Apply the attention mask
-            attn_weights = attn_weights + attention_mask
+            attn_weights = attn_weights + attention_mask.astype(attn_weights.dtype)

         attn_weights = paddle.nn.functional.softmax(attn_weights, axis=-1)
         attn_weights = attn_weights.astype(value.dtype)
2 changes: 1 addition & 1 deletion paddlenlp/transformers/mbart/modeling.py
@@ -63,7 +63,7 @@ def shift_tokens_right(input_ids, pad_token_id):
     batch_size, seq_length = shifted_input_ids.shape
     index = paddle.arange(0, batch_size, 1, dtype="int32") * seq_length
     index_of_eos = paddle.cast(shifted_input_ids != pad_token_id, dtype="int32").sum(axis=-1) - 1
-    decoder_start_tokens = paddle.gather(input_flat, index + index_of_eos)
+    decoder_start_tokens = paddle.gather(input_flat, index + index_of_eos.astype(index.dtype))
     shifted_input_ids[:, 1:] = shifted_input_ids[:, :-1].clone()
     shifted_input_ids[:, 0] = decoder_start_tokens
     return shifted_input_ids
2 changes: 1 addition & 1 deletion paddlenlp/transformers/megatronbert/modeling.py
@@ -171,7 +171,7 @@ def forward(self, hidden_states, attention_mask=None):
         attention_scores = attention_scores / math.sqrt(self.attention_head_size)
         if attention_mask is not None:
             # Apply the attention mask is (precomputed for all layers in MegatronBertModel forward() function)
-            attention_scores = attention_scores + attention_mask
+            attention_scores = attention_scores + attention_mask.astype(attention_scores.dtype)

         # Normalize the attention scores to probabilities.
         attention_probs = nn.functional.softmax(attention_scores, axis=-1)
16 changes: 6 additions & 10 deletions paddlenlp/transformers/prophetnet/modeling.py
@@ -71,12 +71,9 @@ def compute_relative_buckets(num_buckets, max_distance, relative_positions, is_b
         )
         inv_relative_positions = paddle.abs(inv_relative_positions)
     else:
-        inv_relative_positions = (
-            paddle.cast(
-                paddle.less_than(paddle.zeros_like(inv_relative_positions), inv_relative_positions), dtype=paddle.int32
-            )
-            * inv_relative_positions
-        )
+        inv_relative_positions = paddle.cast(
+            paddle.less_than(paddle.zeros_like(inv_relative_positions), inv_relative_positions), dtype=paddle.int32
+        ) * inv_relative_positions.astype(paddle.int32)

     max_exact = num_buckets // 2
     is_small = paddle.less_than(inv_relative_positions, paddle.to_tensor(max_exact).cast(dtype=paddle.int32))
@@ -85,10 +82,9 @@ def compute_relative_buckets(num_buckets, max_distance, relative_positions, is_b
     ) / math.log(max_distance / max_exact) * (num_buckets - max_exact)
     val_if_large_num_buckets = paddle.ones_like(val_if_large) * (num_buckets - 1)
     val_if_large_lt = paddle.cast(paddle.less_than(val_if_large, val_if_large_num_buckets), dtype=paddle.int32)
-    val_if_large = (
-        paddle.cast(val_if_large_lt * val_if_large, dtype=paddle.int32)
-        + (1 - val_if_large_lt) * val_if_large_num_buckets
-    )
+    val_if_large = val_if_large_lt * val_if_large.astype(val_if_large_lt.dtype) + (
+        1 - val_if_large_lt
+    ) * val_if_large_num_buckets.astype(val_if_large_lt.dtype)
     rel_positions_bucket = rel_positions_bucket + paddle.where(
         is_small, paddle.cast(inv_relative_positions, dtype=paddle.int32), val_if_large
     )
2 changes: 1 addition & 1 deletion paddlenlp/transformers/rembert/modeling.py
@@ -150,7 +150,7 @@ def forward(self, hidden_states, attention_mask=None):
         attention_scores = attention_scores / math.sqrt(self.attention_head_size)
         if attention_mask is not None:
             # Apply the attention mask is (precomputed for all layers in RemBertModel forward() function)
-            attention_scores = attention_scores + attention_mask
+            attention_scores = attention_scores + attention_mask.astype(attention_scores.dtype)

         # Normalize the attention scores to probabilities.
         attention_probs = F.softmax(attention_scores, axis=-1)
