From 36ab9a7b6552e6a2affe9f033bdc49c2e32769fc Mon Sep 17 00:00:00 2001 From: DrownFish19 Date: Tue, 16 Apr 2024 06:48:15 +0000 Subject: [PATCH 01/41] add Qwen2Moe --- llm/data.py | 4 +- llm/qwen2moe/lora_argument.json | 32 + llm/qwen2moe/sft_argument.json | 30 + paddlenlp/transformers/auto/modeling.py | 1 + paddlenlp/transformers/qwen2moe/__init__.py | 16 + .../transformers/qwen2moe/configuration.py | 203 +++ paddlenlp/transformers/qwen2moe/modeling.py | 1589 +++++++++++++++++ tests/transformers/qwen2moe/__init__.py | 13 + tests/transformers/qwen2moe/test_modeling.py | 318 ++++ 9 files changed, 2204 insertions(+), 2 deletions(-) create mode 100644 llm/qwen2moe/lora_argument.json create mode 100644 llm/qwen2moe/sft_argument.json create mode 100644 paddlenlp/transformers/qwen2moe/__init__.py create mode 100644 paddlenlp/transformers/qwen2moe/configuration.py create mode 100644 paddlenlp/transformers/qwen2moe/modeling.py create mode 100644 tests/transformers/qwen2moe/__init__.py create mode 100644 tests/transformers/qwen2moe/test_modeling.py diff --git a/llm/data.py b/llm/data.py index 5d44c72c8abd..a92b087e591f 100644 --- a/llm/data.py +++ b/llm/data.py @@ -44,11 +44,11 @@ def get_convert_example(model): if base_model_prefix == "chatglm": return convert_example_chatglm - elif base_model_prefix in ["chatglm_v2", "llama", "bloom", "opt", "qwen", "mixtral"]: + elif base_model_prefix in ["chatglm_v2", "llama", "bloom", "opt", "qwen", "mixtral", "qwen2moe"]: return convert_example_common else: raise ValueError( - f"Unknown base_model_prefix: {model.base_model_prefix}. Supported base_model_prefix list: chatglm, bloom, llama, qwen, mixtral" + f"Unknown base_model_prefix: {model.base_model_prefix}. Supported base_model_prefix list: chatglm, bloom, llama, qwen, mixtral, qwen2moe" ) diff --git a/llm/qwen2moe/lora_argument.json b/llm/qwen2moe/lora_argument.json new file mode 100644 index 000000000000..ef832a113f46 --- /dev/null +++ b/llm/qwen2moe/lora_argument.json @@ -0,0 +1,32 @@ +{ + "model_name_or_path": "Qwen/Qwen1.5-MoE-A2.7B", + "dataset_name_or_path": "./data", + "output_dir": "./checkpoints/qwen2moe_lora_ckpts", + "per_device_train_batch_size": 4, + "gradient_accumulation_steps": 4, + "per_device_eval_batch_size": 8, + "eval_accumulation_steps":16, + "num_train_epochs": 3, + "learning_rate": 3e-04, + "warmup_steps": 30, + "logging_steps": 1, + "evaluation_strategy": "epoch", + "save_strategy": "epoch", + "src_length": 1024, + "max_length": 2048, + "fp16": true, + "fp16_opt_level": "O2", + "do_train": true, + "do_eval": true, + "disable_tqdm": true, + "load_best_model_at_end": true, + "eval_with_do_generation": false, + "metric_for_best_model": "accuracy", + "recompute": true, + "save_total_limit": 1, + "tensor_parallel_degree": 8, + "pipeline_parallel_degree": 1, + "lora": true, + "zero_padding": false, + "use_flash_attention": false + } \ No newline at end of file diff --git a/llm/qwen2moe/sft_argument.json b/llm/qwen2moe/sft_argument.json new file mode 100644 index 000000000000..7bb8547803dc --- /dev/null +++ b/llm/qwen2moe/sft_argument.json @@ -0,0 +1,30 @@ +{ + "model_name_or_path": "Qwen/Qwen1.5-MoE-A2.7B", + "dataset_name_or_path": "./data", + "output_dir": "./checkpoints/qwen2moe_sft_ckpts", + "per_device_train_batch_size": 4, + "gradient_accumulation_steps": 4, + "per_device_eval_batch_size": 8, + "eval_accumulation_steps":16, + "num_train_epochs": 3, + "learning_rate": 3e-05, + "warmup_steps": 30, + "logging_steps": 1, + "evaluation_strategy": "epoch", + "save_strategy": "epoch", + 
"src_length": 1024, + "max_length": 2048, + "bf16": true, + "fp16_opt_level": "O2", + "do_train": true, + "do_eval": true, + "disable_tqdm": true, + "load_best_model_at_end": true, + "eval_with_do_generation": false, + "metric_for_best_model": "accuracy", + "recompute": true, + "save_total_limit": 1, + "tensor_parallel_degree": 8, + "sharding": "stage2", + "pipeline_parallel_degree": 1 +} \ No newline at end of file diff --git a/paddlenlp/transformers/auto/modeling.py b/paddlenlp/transformers/auto/modeling.py index 9e291f83aa56..4e014a11844a 100644 --- a/paddlenlp/transformers/auto/modeling.py +++ b/paddlenlp/transformers/auto/modeling.py @@ -118,6 +118,7 @@ ("Bloom", "bloom"), ("QWen", "qwen"), ("Mixtral", "mixtral"), + ("QWen2Moe", "qwen2moe"), ] ) diff --git a/paddlenlp/transformers/qwen2moe/__init__.py b/paddlenlp/transformers/qwen2moe/__init__.py new file mode 100644 index 000000000000..622f373aab5e --- /dev/null +++ b/paddlenlp/transformers/qwen2moe/__init__.py @@ -0,0 +1,16 @@ +# Copyright (c) 2023 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +from .configuration import Qwen2MoeConfig +from .modeling import Qwen2MoeForCausalLM diff --git a/paddlenlp/transformers/qwen2moe/configuration.py b/paddlenlp/transformers/qwen2moe/configuration.py new file mode 100644 index 000000000000..38ec512a078e --- /dev/null +++ b/paddlenlp/transformers/qwen2moe/configuration.py @@ -0,0 +1,203 @@ +# coding=utf-8 +# Copyright 2024 The Qwen team, Alibaba Group and the HuggingFace Inc. team. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +""" Qwen2MoE model configuration""" + +from paddlenlp.transformers.configuration_utils import PretrainedConfig + +__all__ = [ + "Qwen2MoeConfig", +] + + +class Qwen2MoeConfig(PretrainedConfig): + r""" + This is the configuration class to store the configuration of a [`Qwen2MoeModel`]. It is used to instantiate a + Qwen2MoE model according to the specified arguments, defining the model architecture. Instantiating a configuration + with the defaults will yield a similar configuration to that of + Qwen1.5-MoE-A2.7B" [Qwen/Qwen1.5-MoE-A2.7B"](https://huggingface.co/Qwen/Qwen1.5-MoE-A2.7B"). + + Configuration objects inherit from [`PretrainedConfig`] and can be used to control the model outputs. Read the + documentation from [`PretrainedConfig`] for more information. + + + Args: + vocab_size (`int`, *optional*, defaults to 151936): + Vocabulary size of the Qwen2MoE model. 
Defines the number of different tokens that can be represented by the + `inputs_ids` passed when calling [`Qwen2MoeModel`] + hidden_size (`int`, *optional*, defaults to 2048): + Dimension of the hidden representations. + intermediate_size (`int`, *optional*, defaults to 5632): + Dimension of the MLP representations. + num_hidden_layers (`int`, *optional*, defaults to 24): + Number of hidden layers in the Transformer encoder. + num_attention_heads (`int`, *optional*, defaults to 16): + Number of attention heads for each attention layer in the Transformer encoder. + num_key_value_heads (`int`, *optional*, defaults to 16): + This is the number of key_value heads that should be used to implement Grouped Query Attention. If + `num_key_value_heads=num_attention_heads`, the model will use Multi Head Attention (MHA), if + `num_key_value_heads=1 the model will use Multi Query Attention (MQA) otherwise GQA is used. When + converting a multi-head checkpoint to a GQA checkpoint, each group key and value head should be constructed + by meanpooling all the original heads within that group. For more details checkout [this + paper](https://arxiv.org/pdf/2305.13245.pdf). If it is not specified, will default to `32`. + hidden_act (`str` or `function`, *optional*, defaults to `"silu"`): + The non-linear activation function (function or string) in the decoder. + max_position_embeddings (`int`, *optional*, defaults to 32768): + The maximum sequence length that this model might ever be used with. + initializer_range (`float`, *optional*, defaults to 0.02): + The standard deviation of the truncated_normal_initializer for initializing all weight matrices. + rms_norm_eps (`float`, *optional*, defaults to 1e-06): + The epsilon used by the rms normalization layers. + use_cache (`bool`, *optional*, defaults to `True`): + Whether or not the model should return the last key/values attentions (not used by all models). Only + relevant if `config.is_decoder=True`. + tie_word_embeddings (`bool`, *optional*, defaults to `False`): + Whether the model's input and output word embeddings should be tied. + rope_theta (`float`, *optional*, defaults to 10000.0): + The base period of the RoPE embeddings. + use_sliding_window (`bool`, *optional*, defaults to `False`): + Whether to use sliding window attention. + sliding_window (`int`, *optional*, defaults to 4096): + Sliding window attention (SWA) window size. If not specified, will default to `4096`. + max_window_layers (`int`, *optional*, defaults to 28): + The number of layers that use SWA (Sliding Window Attention). The bottom layers use SWA while the top use full attention. + attention_dropout (`float`, *optional*, defaults to 0.0): + The dropout ratio for the attention probabilities. + decoder_sparse_step (`int`, *optional*, defaults to 1): + The frequency of the MoE layer. + moe_intermediate_size (`int`, *optional*, defaults to 1408): + Intermediate size of the routed expert. + shared_expert_intermediate_size (`int`, *optional*, defaults to 5632): + Intermediate size of the shared expert. + num_experts_per_tok (`int`, *optional*, defaults to 4): + Number of selected experts. + num_experts (`int`, *optional*, defaults to 60): + Number of routed experts. + norm_topk_prob (`bool`, *optional*, defaults to `False`): + Whether to normalize the topk probabilities. + output_router_logits (`bool`, *optional*, defaults to `False`): + Whether or not the router logits should be returned by the model. 
Enabeling this will also + allow the model to output the auxiliary loss, including load balancing loss and router z-loss. + router_aux_loss_coef (`float`, *optional*, defaults to 0.001): + The aux loss factor for the total loss. + + ```python + >>> from paddlenlp.transformers import Qwen2MoeModel, Qwen2MoeConfig + + >>> # Initializing a Qwen2MoE style configuration + >>> configuration = Qwen2MoeConfig() + + >>> # Initializing a model from the Qwen1.5-MoE-A2.7B" style configuration + >>> model = Qwen2MoeModel(configuration) + + >>> # Accessing the model configuration + >>> configuration = model.config + ```""" + + model_type = "qwen2_moe" + keys_to_ignore_at_inference = ["past_key_values"] + + def __init__( + self, + vocab_size=151936, + hidden_size=2048, + intermediate_size=5632, + num_hidden_layers=24, + num_attention_heads=16, + num_key_value_heads=16, + hidden_act="silu", + max_position_embeddings=32768, + seq_length=2048, + initializer_range=0.02, + rms_norm_eps=1e-6, + use_cache=True, + use_recompute=False, + recompute_granularity="full", + no_recompute_layers=None, + use_flash_attention=False, + attention_dropout=0.0, + use_fused_rope=False, + rope_theta=10000.0, + tensor_parallel_output=True, + sequence_parallel=False, + fuse_sequence_parallel_allreduce=False, + pad_token_id=0, + bos_token_id=1, + eos_token_id=2, + tie_word_embeddings=False, + use_sliding_window=False, + sliding_window=4096, + max_window_layers=28, + decoder_sparse_step=1, + moe_intermediate_size=1408, + shared_expert_intermediate_size=5632, + num_experts_per_tok=4, + num_experts=60, + norm_topk_prob=False, + output_router_logits=False, + router_aux_loss_coef=0.001, + **kwargs, + ): + self.vocab_size = vocab_size + self.max_position_embeddings = max_position_embeddings + self.seq_length = seq_length + self.hidden_size = hidden_size + self.intermediate_size = intermediate_size + self.num_hidden_layers = num_hidden_layers + self.num_attention_heads = num_attention_heads + self.use_sliding_window = use_sliding_window + self.sliding_window = sliding_window + self.max_window_layers = max_window_layers + + self.num_key_value_heads = num_key_value_heads + self.hidden_act = hidden_act + + self.initializer_range = initializer_range + self.rms_norm_eps = rms_norm_eps + + self.use_cache = use_cache + self.use_recompute = use_recompute + self.recompute_granularity = recompute_granularity + self.no_recompute_layers = no_recompute_layers + self.use_flash_attention = use_flash_attention + self.tensor_parallel_output = tensor_parallel_output + self.sequence_parallel = sequence_parallel + self.fuse_sequence_parallel_allreduce = fuse_sequence_parallel_allreduce + + self.pad_token_id = pad_token_id + self.bos_token_id = bos_token_id + self.eos_token_id = eos_token_id + + self.use_fused_rope = use_fused_rope + self.rope_theta = rope_theta + self.attention_dropout = attention_dropout + + # MoE arguments + self.decoder_sparse_step = decoder_sparse_step + self.moe_intermediate_size = moe_intermediate_size + self.shared_expert_intermediate_size = shared_expert_intermediate_size + self.num_experts_per_tok = num_experts_per_tok + self.num_experts = num_experts + self.norm_topk_prob = norm_topk_prob + self.output_router_logits = output_router_logits + self.router_aux_loss_coef = router_aux_loss_coef + + super().__init__( + pad_token_id=pad_token_id, + bos_token_id=bos_token_id, + eos_token_id=eos_token_id, + tie_word_embeddings=tie_word_embeddings, + tensor_parallel_output=tensor_parallel_output, + **kwargs, + ) diff --git 
a/paddlenlp/transformers/qwen2moe/modeling.py b/paddlenlp/transformers/qwen2moe/modeling.py new file mode 100644 index 000000000000..689e03ef8286 --- /dev/null +++ b/paddlenlp/transformers/qwen2moe/modeling.py @@ -0,0 +1,1589 @@ +# Copyright (c) 2023 PaddlePaddle Authors. All Rights Reserved. +# Copyright 2024 The Qwen team, Alibaba Group and the HuggingFace Inc. team. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +""" Paddle Qwen2Moe model.""" +from __future__ import annotations + +import math +import warnings +from functools import partial +from typing import Optional, Tuple + +import paddle +import paddle.distributed.fleet.meta_parallel as mpu +import paddle.nn.functional as F +from paddle import Tensor, nn +from paddle.distributed import fleet +from paddle.distributed.fleet.meta_parallel import get_rng_state_tracker +from paddle.distributed.fleet.utils import recompute + +try: + from paddle.incubate.nn.functional import fused_rotary_position_embedding +except ImportError: + fused_rotary_position_embedding = None +from paddle.distributed.fleet.utils.sequence_parallel_utils import ( + ColumnSequenceParallelLinear, + GatherOp, + RowSequenceParallelLinear, + ScatterOp, + mark_as_sequence_parallel_parameter, +) + +from paddlenlp.transformers.conversion_utils import ( + StateDictNameMapping, + init_name_mappings, +) +from paddlenlp.transformers.model_outputs import ( + MoECausalLMOutputWithPast, + MoEModelOutputWithPast, +) +from paddlenlp.transformers.model_utils import PretrainedModel, register_base_model +from paddlenlp.utils.log import logger + +from ..activations import ACT2FN +from .configuration import Qwen2MoeConfig + +try: + from paddle.nn.functional.flash_attention import flash_attention +except: + flash_attention = None + +__all__ = [ + "Qwen2MoeModel", + "Qwen2MoePretrainedModel", + "Qwen2MoeForCausalLM", + "Qwen2MoePretrainingCriterion", +] + + +def load_balancing_loss_func(gate_logits, num_experts, top_k=2, attention_mask=None): + """ + Computes auxiliary load balancing loss as in Switch Transformer - implemented in Paddle. + See Switch Transformer (https://arxiv.org/abs/2101.03961) for more details. This function implements the loss + function presented in equations (4) - (6) of the paper. It aims at penalizing cases where the routing between + experts is too unbalanced. + Args: + gate_logits (Union[`paddle.Tensor`, Tuple[paddle.Tensor]): + Logits from the `gate`, should be a tuple of model.config.num_hidden_layers tensors of + shape [batch_size X sequence_length, num_experts]. + num_experts (`int`): + Number of experts. + top_k (`int`): + Number of top k experts to be considered for the loss computation. + attention_mask (`paddle.Tensor`, None): + The attention_mask used in forward function + shape [batch_size X sequence_length] if not None. + Returns: + The auxiliary loss. 
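+        Example (illustrative only; the layer, token and expert counts below are arbitrary):
+            >>> # router logits from 2 layers, 8 flattened tokens, 4 experts
+            >>> gate_logits = tuple(paddle.randn([8, 4]) for _ in range(2))
+            >>> aux_loss = load_balancing_loss_func(gate_logits, num_experts=4, top_k=2)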
+ """ + if gate_logits is None or not isinstance(gate_logits, tuple): + return 0 + + if isinstance(gate_logits, tuple): + concatenated_gate_logits = paddle.concat( + gate_logits, axis=0 + ) # [num_hidden_layers X batch_size X sequence_length, num_experts] + + routing_weights = F.softmax(concatenated_gate_logits, axis=-1) + _, selected_experts = paddle.topk(routing_weights, top_k, axis=-1) + expert_mask = F.one_hot( + selected_experts, num_classes=num_experts + ) # [num_hidden_layers X batch_size X sequence_length, top_k, num_experts] + + if attention_mask is None or len(attention_mask.shape) == 4: + # Only intokens strategy has 4-D attention_mask, we currently do not support excluding padding tokens. + # Compute the percentage of tokens routed to each experts + tokens_per_expert = paddle.mean(expert_mask.astype("float32"), axis=0) + + # Compute the average probability of routing to these experts + router_prob_per_expert = paddle.mean(routing_weights, axis=0) + else: + # Exclude the load balancing loss of padding tokens. + if len(attention_mask.shape) == 2: + batch_size, sequence_length = attention_mask.shape + num_hidden_layers = concatenated_gate_logits.shape[0] // (batch_size * sequence_length) + + # Compute the mask that masks all padding tokens as 0 with the same shape of expert_mask + expert_attention_mask = ( + attention_mask[None, :, :, None, None] + .expand((num_hidden_layers, batch_size, sequence_length, top_k, num_experts)) + .reshape([-1, top_k, num_experts]) + ) # [num_hidden_layers * batch_size * sequence_length, top_k, num_experts] + + # Compute the percentage of tokens routed to each experts + tokens_per_expert = paddle.sum(expert_mask.astype("float32") * expert_attention_mask, axis=0) / paddle.sum( + expert_attention_mask, axis=0 + ) + + # Compute the mask that masks all padding tokens as 0 with the same shape of tokens_per_expert + router_per_expert_attention_mask = ( + attention_mask[None, :, :, None] + .expand((num_hidden_layers, batch_size, sequence_length, num_experts)) + .reshape([-1, num_experts]) + ) + + # Compute the average probability of routing to these experts + router_prob_per_expert = paddle.sum( + routing_weights * router_per_expert_attention_mask, axis=0 + ) / paddle.sum(router_per_expert_attention_mask, axis=0) + + overall_loss = paddle.sum(tokens_per_expert * router_prob_per_expert.unsqueeze(0)) + return overall_loss * num_experts + + +def get_triangle_upper_mask(x, mask=None): + if mask is not None: + return mask + # [bsz, n_head, q_len, kv_seq_len] + shape = x.shape + # [bsz, 1, q_len, kv_seq_len] + shape[1] = 1 + mask = paddle.full(shape, paddle.finfo(x.dtype).min, dtype=x.dtype) + mask = paddle.triu(mask, diagonal=1) + mask.stop_gradient = True + return mask + + +def assign_kv_heads(num_kv_heads: int, num_gpus: int): + # Initialize the assignment list + """ + Assign kv heads to different GPUs in the Tensor Parallel Setup + + Examples: + assign_kv_heads(num_kv_heads=1, num_gpus=2): [[0], [0]] + assign_kv_heads(num_kv_heads=2, num_gpus=2): [[0], [1]] + assign_kv_heads(num_kv_heads=4, num_gpus=2): [[0,1], [2,3]] + assign_kv_heads(num_kv_heads=1, num_gpus=4): [[0],[0],[0],[0]] + assign_kv_heads(num_kv_heads=2, num_gpus=4): [[0],[0],[1],[1]] + assign_kv_heads(num_kv_heads=4, num_gpus=4): [[0],[1],[2],[3]] + """ + assignment_list = [[] for _ in range(num_gpus)] + # Case 1: more heads than cards + if num_kv_heads > num_gpus: + num_heads_per_card = num_kv_heads // num_gpus + for i in range(num_gpus): + for j in range(num_heads_per_card): + 
assignment_list[i].append(i * num_heads_per_card + j) + # Case 2: more cards than heads. each card get only 1 head. + else: + num_card_per_heads = num_gpus // num_kv_heads + for i in range(num_kv_heads): + for j in range(num_card_per_heads): + assignment_list[i * num_card_per_heads + j].append(i) + return assignment_list + + +def parallel_matmul(x: Tensor, y: Tensor, tensor_parallel_output=True): + is_fleet_init = True + tensor_parallel_degree = 1 + try: + hcg = fleet.get_hybrid_communicate_group() + model_parallel_group = hcg.get_model_parallel_group() + tensor_parallel_degree = hcg.get_model_parallel_world_size() + except: + is_fleet_init = False + + if paddle.in_dynamic_mode(): + y_is_distributed = y.is_distributed + else: + y_is_distributed = tensor_parallel_degree > 1 + + if is_fleet_init and tensor_parallel_degree > 1 and y_is_distributed: + # if not running under distributed.launch, it will raise AttributeError: 'Fleet' object has no attribute '_hcg' + input_parallel = paddle.distributed.collective._c_identity(x, group=model_parallel_group) + logits = paddle.matmul(input_parallel, y, transpose_y=False) + + if tensor_parallel_output: + return logits + + return paddle.distributed.collective._c_concat(logits, group=model_parallel_group) + + else: + logits = paddle.matmul(x, y, transpose_y=False) + return logits + + +def scaled_dot_product_attention( + query_states, + config, + key_states, + value_states, + attention_mask, + output_attentions, + training=True, + sequence_parallel=False, +): + bsz, q_len, num_heads, head_dim = query_states.shape + _, kv_seq_len, _, _ = value_states.shape + + if config.use_flash_attention and flash_attention: + # Paddle Flash Attention input [ bz, seqlen, nhead, head_dim] + # Torch Flash Attention input [ bz, nhead, seqlen, head_dim] + + version = paddle.version.full_version + if version != "0.0.0" and version <= "2.5.2": + attn_output, attn_weights = flash_attention( + query_states, + key_states, + value_states, + causal=True, + return_softmax=output_attentions, + ) + else: + attn_output = F.scaled_dot_product_attention( + query_states, + key_states, + value_states, + attn_mask=attention_mask, + is_causal=attention_mask is None, + dropout_p=config.attention_dropout if training else 0.0, + training=training, + ) + attn_weights = None + + if sequence_parallel: + attn_output = attn_output.reshape([bsz * q_len, head_dim * num_heads]) + else: + attn_output = attn_output.reshape([bsz, q_len, head_dim * num_heads]) + return (attn_output, attn_weights) if output_attentions else attn_output + else: + # [ bz, seqlen, nhead, head_dim] -> [bs, nhead, seq_len, head_dim] + query_states = paddle.transpose(query_states, [0, 2, 1, 3]) + # merge with the next tranpose + key_states = paddle.transpose(key_states, [0, 2, 1, 3]) + value_states = paddle.transpose(value_states, [0, 2, 1, 3]) + + # matmul and devide by sqrt(head_dim) + attn_weights = paddle.matmul(query_states / math.sqrt(head_dim), key_states.transpose([0, 1, 3, 2])) + + if attn_weights.shape != [bsz, num_heads, q_len, kv_seq_len]: + raise ValueError( + f"Attention weights should be of shape {(bsz, num_heads, q_len, kv_seq_len)}, but is" + f" {attn_weights.shape}" + ) + + if attention_mask is None: + attention_mask = get_triangle_upper_mask(attn_weights) + attention_mask = attention_mask.reshape([bsz, 1, q_len, kv_seq_len]) + if attention_mask.shape != [bsz, 1, q_len, kv_seq_len]: + raise ValueError( + f"Attention mask should be of shape {(bsz, 1, q_len, kv_seq_len)}, but is {attention_mask.shape}" + ) + + 
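+        # Add the additive attention mask to the raw scores, run the softmax in float32 for numerical stability, then apply attention dropout.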
attn_weights = attn_weights + attention_mask + if not paddle.in_dynamic_mode(): + attn_weights = F.softmax(attn_weights, axis=-1, dtype="float32").astype(query_states.dtype) + else: + with paddle.amp.auto_cast(False): + attn_weights = F.softmax(attn_weights, axis=-1, dtype="float32").astype(query_states.dtype) + + attn_weights = F.dropout(attn_weights, p=config.attention_dropout, training=training) + + attn_output = paddle.matmul(attn_weights, value_states) + attn_output = attn_output.transpose([0, 2, 1, 3]) + + if sequence_parallel: + attn_output = attn_output.reshape([bsz * q_len, head_dim * num_heads]) + else: + attn_output = attn_output.reshape([bsz, q_len, head_dim * num_heads]) + return (attn_output, attn_weights) if output_attentions else attn_output + + +def masked_fill(x, mask, value): + y = paddle.full(x.shape, value, x.dtype) + return paddle.where(mask, y, x) + + +def is_casual_mask(attention_mask): + """ + Upper triangular of attention_mask equals to attention_mask is casual + """ + return (paddle.triu(attention_mask) == attention_mask).all().item() + + +def _make_causal_mask(input_ids_shape, past_key_values_length): + """ + Make causal mask used for self-attention + """ + batch_size, target_length = input_ids_shape # target_length: seq_len + + mask = paddle.tril(paddle.ones((target_length, target_length), dtype="bool")) + + if past_key_values_length > 0: + # [tgt_len, tgt_len + past_len] + mask = paddle.concat([paddle.ones([target_length, past_key_values_length], dtype="bool"), mask], axis=-1) + + # [bs, 1, tgt_len, tgt_len + past_len] + return mask[None, None, :, :].expand([batch_size, 1, target_length, target_length + past_key_values_length]) + + +def _expand_2d_mask(mask, dtype, tgt_length): + """ + Expands attention_mask from `[batch_size, src_length]` to `[batch_size, 1, tgt_length, src_length]`. + """ + batch_size, src_length = mask.shape[0], mask.shape[-1] + tgt_length = tgt_length if tgt_length is not None else src_length + + mask = mask[:, None, None, :].astype("bool") + mask.stop_gradient = True + expanded_mask = mask.expand([batch_size, 1, tgt_length, src_length]) + + return expanded_mask + + +class Qwen2MoeRMSNorm(nn.Layer): + def __init__(self, config, eps=1e-6): + super().__init__() + self.hidden_size = config.hidden_size + self.weight = paddle.create_parameter( + shape=[self.hidden_size], + dtype=paddle.get_default_dtype(), + default_initializer=nn.initializer.Constant(1.0), + ) + self.variance_epsilon = config.rms_norm_eps + self.config = config + + if config.sequence_parallel: + mark_as_sequence_parallel_parameter(self.weight) + + def forward(self, hidden_states): + if paddle.in_dynamic_mode(): + with paddle.amp.auto_cast(False): + hidden_states = hidden_states.astype("float32") + variance = hidden_states.pow(2).mean(-1, keepdim=True) + hidden_states = paddle.rsqrt(variance + self.variance_epsilon) * hidden_states + else: + hidden_states = hidden_states.astype("float32") + variance = hidden_states.pow(2).mean(-1, keepdim=True) + hidden_states = paddle.rsqrt(variance + self.variance_epsilon) * hidden_states + + if self.weight.dtype in [paddle.float16, paddle.bfloat16]: + hidden_states = paddle.cast(hidden_states, self.weight.dtype) + return hidden_states * self.weight + + +def repeat_kv(hidden_states: paddle.Tensor, n_rep: int) -> paddle.Tensor: + """ + This is the equivalent of paddle.repeat_interleave(hidden_states, n_rep, axis=1). 
The hidden states go from (batch, + num_key_value_heads, seqlen, head_dim) to (batch, num_attention_heads, seqlen, head_dim) + """ + batch, slen, num_key_value_heads, head_dim = hidden_states.shape + if n_rep == 1: + return hidden_states + + hidden_states = hidden_states.unsqueeze(-2).tile([1, 1, 1, n_rep, 1]) + return hidden_states.reshape([batch, slen, num_key_value_heads * n_rep, head_dim]) + + +class Qwen2MoeRotaryEmbedding(nn.Layer): + def __init__(self, dim, max_position_embeddings=2048, base=10000): + super().__init__() + self.dim = dim + self.max_position_embeddings = max_position_embeddings + self.base = base + # [dim / 2] + self.inv_freq = 1.0 / (self.base ** (paddle.cast(paddle.arange(0, self.dim, 2), dtype="float32") / self.dim)) + self._set_cos_sin_cache(seq_len=max_position_embeddings) + + def _set_cos_sin_cache(self, seq_len): + self.max_seq_len_cached = seq_len + # [seq_len] + t = paddle.arange(seq_len, dtype="float32") + # [seq_len, dim/2] + freqs = paddle.einsum("i,j->ij", t, self.inv_freq) + # Different from paper, but it uses a different permutation in order to obtain the same calculation + # [seq_len, dim] + emb = paddle.concat([freqs, freqs], axis=-1) + # [1, seqlen, 1, dim] + self.cos_cached = emb.cos()[None, :, None, :] + self.sin_cached = emb.sin()[None, :, None, :] + + def forward(self, x, seq_len=None): + # x: [bs, num_attention_heads, seq_len, head_size] + if seq_len > self.max_seq_len_cached: + self._set_cos_sin_cache(seq_len) + cos = self.cos_cached[:, :seq_len, :, :] + sin = self.sin_cached[:, :seq_len, :, :] + return ( + cos.cast(x.dtype) if cos.dtype != x.dtype else cos, + sin.cast(x.dtype) if sin.dtype != x.dtype else sin, + ) + + +def rotate_half(x): + """Rotates half the hidden dims of the input.""" + x1 = x[..., : x.shape[-1] // 2] + x2 = x[..., x.shape[-1] // 2 :] + return paddle.concat([-x2, x1], axis=-1) # shape is the same as x + + +def apply_rotary_pos_emb(q, k, cos, sin, position_ids): + if position_ids is None: + # Note: Only for MixtralForCausalLMPipe model pretraining + cos = cos[:, : q.shape[1], :, :] # [bs, seq_len, 1, dim] + sin = sin[:, : q.shape[1], :, :] # [bs, seq_len, 1, dim] + else: + cos = cos.squeeze(axis=[0, 2]) # [seq_len, dim] + sin = sin.squeeze(axis=[0, 2]) # [seq_len, dim] + cos = cos[position_ids].unsqueeze(2) # [bs, seq_len, 1, dim] + sin = sin[position_ids].unsqueeze(2) # [bs, seq_len, 1, dim] + q_embed = (q * cos) + (rotate_half(q) * sin) + k_embed = (k * cos) + (rotate_half(k) * sin) + return q_embed, k_embed + + +class Qwen2MoeMLP(nn.Layer): + def __init__(self, config): + super().__init__() + self.hidden_size = config.hidden_size + self.intermediate_size = config.intermediate_size + self.tensor_parallel_degree = config.tensor_parallel_degree + + self.gate_proj = nn.Linear(self.hidden_size, self.intermediate_size, bias=False) # w1 + self.down_proj = nn.Linear(self.intermediate_size, self.hidden_size, bias=False) # w2 + self.up_proj = nn.Linear(self.hidden_size, self.intermediate_size, bias=False) # w3 + + if config.sequence_parallel: + ColumnParallelLinear = ColumnSequenceParallelLinear + RowParallelLinear = RowSequenceParallelLinear + else: + ColumnParallelLinear = fleet.meta_parallel.ColumnParallelLinear + RowParallelLinear = fleet.meta_parallel.RowParallelLinear + + if config.tensor_parallel_degree > 1: + self.gate_proj = ColumnParallelLinear( + self.hidden_size, + self.intermediate_size, + gather_output=False, + has_bias=False, + ) + self.up_proj = ColumnParallelLinear( + self.hidden_size, + self.intermediate_size, + 
gather_output=False, + has_bias=False, + ) + self.down_proj = RowParallelLinear( + self.intermediate_size, + self.hidden_size, + input_is_parallel=True, + has_bias=False, + ) + else: + self.gate_proj = nn.Linear(self.hidden_size, self.intermediate_size, bias_attr=False) + self.down_proj = nn.Linear(self.intermediate_size, self.hidden_size, bias_attr=False) + self.up_proj = nn.Linear(self.hidden_size, self.intermediate_size, bias_attr=False) + + self.act_fn = ACT2FN[config.hidden_act] + + def forward(self, x): + return self.down_proj(self.act_fn(self.gate_proj(x)) * self.up_proj(x)) + + +class Qwen2MoeSparseMoeBlock(nn.Layer): + def __init__(self, config: Qwen2MoeConfig): + super().__init__() + self.hidden_dim = config.hidden_size + self.ffn_dim = config.intermediate_size + self.num_experts = config.num_local_experts + self.top_k = config.num_experts_per_tok + self.gate = nn.Linear(self.hidden_dim, self.num_experts, bias_attr=False) + self.experts = nn.LayerList([Qwen2MoeMLP(config) for _ in range(self.num_experts)]) + + def forward(self, hidden_states): + batch_size, seq_len, hidden_dim = hidden_states.shape + hidden_states = hidden_states.reshape([-1, hidden_dim]) + # router_logits: [batch_size * seq_len, num_experts] + router_logits = self.gate(hidden_states) + + with paddle.amp.auto_cast(False): + routing_weights = F.softmax(router_logits.astype("float32"), axis=1) + routing_weights, selected_experts = paddle.topk(routing_weights, self.top_k, axis=-1) + routing_weights /= routing_weights.sum(axis=-1, keepdim=True) + # we cast back to input dtype + routing_weights = routing_weights.astype(hidden_states.dtype) + + final_hidden_states = paddle.zeros( + [batch_size * seq_len, hidden_dim], + dtype=hidden_states.dtype, + ) + + # One hot encode the selected experts to create an expert mask + # this will be used to easily index which expert is going to be sollicitated. + # shape: [num_experts, top_k, batch_size * seq_len] + expert_mask = F.one_hot(selected_experts, num_classes=self.num_experts).transpose([2, 1, 0]) + + # Loop over all available experts in the model and perform the computation on each expert. + for expert_id in range(self.num_experts): + expert_layer = self.experts[expert_id] + idx, top_x = paddle.where(expert_mask[expert_id]) + + if top_x.shape[0] == 0: + continue + + current_state = paddle.gather(hidden_states, top_x.squeeze()) + current_hidden_states = expert_layer(current_state) * routing_weights[top_x, idx] + + top_x = top_x.squeeze() + if top_x.shape == []: + top_x = paddle.to_tensor([top_x.item()]) + final_hidden_states.index_add_(top_x, 0, current_hidden_states.astype(hidden_states.dtype)) + + final_hidden_states = final_hidden_states.reshape([batch_size, seq_len, hidden_dim]) + return final_hidden_states, router_logits + + +class Qwen2MoeAttention(nn.Module): + """ + Multi-headed attention from 'Attention Is All You Need' paper. + Modified to use sliding window attention: Longformer and "Generating Long Sequences with Sparse Transformers". + """ + + def __init__(self, config: Qwen2MoeConfig, layer_idx: Optional[int] = None, layerwise_recompute: bool = True): + super().__init__() + + self.config = config + self.layer_idx = layer_idx + if layer_idx is None: + logger.warning_once( + f"Instantiating {self.__class__.__name__} without passing `layer_idx` is not recommended and will " + "to errors during the forward call, if caching is used. Please make sure to provide a `layer_idx` " + "when creating this class." 
+ ) + + self.hidden_size = config.hidden_size + self.num_heads = config.num_attention_heads + + self.head_dim = self.hidden_size // config.num_attention_heads + + self.num_key_value_heads = config.num_key_value_heads + assert config.num_attention_heads // config.num_key_value_heads + self.num_key_value_groups = config.num_attention_heads // config.num_key_value_heads + self.gqa_or_mqa = config.num_attention_heads != config.num_key_value_heads + self.rope_theta = config.rope_theta + self.max_position_embeddings = config.max_position_embeddings + self.is_causal = True + self.attention_dropout = config.attention_dropout + + self.seq_length = config.seq_length + self.sequence_parallel = config.sequence_parallel + + # Note that we will actually perform a recompute only if both enable_recompute and layerwise_recompute are set to True + # Enable_recompute defaults to False and is controlled by Trainer + self.enable_recompute = False + self.layerwise_recompute = layerwise_recompute + self.recompute_granularity = config.recompute_granularity + if config.tensor_parallel_degree > 1: + assert ( + self.num_heads % config.tensor_parallel_degree == 0 + ), f"num_heads: {self.num_heads}, tensor_parallel_degree: {config.tensor_parallel_degree}" + self.num_heads = self.num_heads // config.tensor_parallel_degree + + assert ( + self.num_key_value_heads % config.tensor_parallel_degree == 0 + ), f"num_key_value_heads: {self.num_key_value_heads}, tensor_parallel_degree: {config.tensor_parallel_degree}" + self.num_key_value_heads = self.num_key_value_heads // config.tensor_parallel_degree + + self.use_fused_rope = config.use_fused_rope + if self.use_fused_rope: + if "gpu" not in paddle.device.get_device() or fused_rotary_position_embedding is None: + warnings.warn( + "Enable fuse rope in the config, but fuse rope is not available. " + "Will disable fuse rope. Try using latest gpu version of Paddle." 
+ ) + self.use_fused_rope = False + + if config.sequence_parallel: + ColumnParallelLinear = ColumnSequenceParallelLinear + RowParallelLinear = RowSequenceParallelLinear + else: + ColumnParallelLinear = fleet.meta_parallel.ColumnParallelLinear + RowParallelLinear = fleet.meta_parallel.RowParallelLinear + + if config.tensor_parallel_degree > 1: + self.q_proj = ColumnParallelLinear( + self.hidden_size, + self.hidden_size, + has_bias=True, + gather_output=False, + ) + self.k_proj = ColumnParallelLinear( + self.hidden_size, + self.config.num_key_value_heads * self.head_dim, + has_bias=True, + gather_output=False, + ) + self.v_proj = ColumnParallelLinear( + self.hidden_size, + self.config.num_key_value_heads * self.head_dim, + has_bias=True, + gather_output=False, + ) + else: + self.q_proj = nn.Linear( + self.hidden_size, + self.hidden_size, + bias_attr=True, + ) + self.k_proj = nn.Linear( + self.hidden_size, + self.config.num_key_value_heads * self.head_dim, + bias_attr=True, + ) + self.v_proj = nn.Linear( + self.hidden_size, + self.config.num_key_value_heads * self.head_dim, + bias_attr=True, + ) + + if config.tensor_parallel_degree > 1: + self.o_proj = RowParallelLinear( + self.hidden_size, + self.hidden_size, + has_bias=False, + input_is_parallel=True, + ) + else: + self.o_proj = nn.Linear( + self.hidden_size, + self.hidden_size, + bias_attr=False, + ) + + self.rotary_emb = Qwen2MoeRotaryEmbedding( + self.head_dim, + max_position_embeddings=self.max_position_embeddings, + base=self.rope_theta, + ) + + def forward( + self, + hidden_states, + position_ids: Optional[Tuple[paddle.Tensor]] = None, + past_key_value: Optional[Tuple[paddle.Tensor]] = None, + attention_mask: Optional[paddle.Tensor] = None, + output_attentions: bool = False, + use_cache: bool = False, + **kwargs, + ) -> Tuple[paddle.Tensor, Optional[paddle.Tensor], Optional[Tuple[paddle.Tensor]]]: + """Input shape: Batch x Time x Channel""" + # [bs, seq_len, num_head * head_dim] -> [seq_len / n, bs, num_head * head_dim] (n is model parallelism) + + query_states = self.q_proj(hidden_states) + key_states = self.k_proj(hidden_states) + value_states = self.v_proj(hidden_states) + + if self.sequence_parallel: + target_query_shape = [-1, self.seq_length, self.num_heads, self.head_dim] + target_key_value_shape = [-1, self.seq_length, self.num_key_value_heads, self.head_dim] + else: + target_query_shape = [0, 0, self.num_heads, self.head_dim] + target_key_value_shape = [0, 0, self.num_key_value_heads, self.head_dim] + query_states = query_states.reshape(shape=target_query_shape) + key_states = key_states.reshape(shape=target_key_value_shape) + value_states = value_states.reshape(shape=target_key_value_shape) + + kv_seq_len = key_states.shape[-3] + + if past_key_value is not None: + if self.layer_idx is None: + raise ValueError( + f"The cache structure has changed since version v4.36. If you are using {self.__class__.__name__} " + "for auto-regressive decoding with k/v caching, please make sure to initialize the attention class " + "with a layer index." 
+ ) + # TODO layer_idx + # kv_seq_len += past_key_value.get_usable_length(kv_seq_len, self.layer_idx) + kv_seq_len += past_key_value[0].shape[-3] + + if self.use_fused_rope: + assert past_key_value is None, "fuse rotary not support cache kv for now" + cos, sin = self.rotary_emb(value_states, seq_len=kv_seq_len) + query_states, key_states, _ = fused_rotary_position_embedding( + query_states, + key_states, + v=None, + sin=sin, + cos=cos, + position_ids=position_ids, + use_neox_rotary_style=False, + ) + else: + cos, sin = self.rotary_emb(value_states, seq_len=kv_seq_len) + query_states, key_states = apply_rotary_pos_emb(query_states, key_states, cos, sin, position_ids) + + # [bs, seq_len, num_head, head_dim] + if past_key_value is not None: + # cache_kwargs = {"sin": sin, "cos": cos} # Specific to RoPE models + # key_states, value_states = past_key_value.update(key_states, value_states, self.layer_idx, cache_kwargs) + # reuse k, v, self_attention + key_states = paddle.concat([past_key_value[0], key_states], axis=1) + value_states = paddle.concat([past_key_value[1], value_states], axis=1) + + past_key_value = (key_states, value_states) if use_cache else None + + # TODO(wj-Mcat): use broadcast strategy when n_kv_heads = 1 + # repeat k/v heads if n_kv_heads < n_heads + key_states = repeat_kv(key_states, self.num_key_value_groups) + value_states = repeat_kv(value_states, self.num_key_value_groups) + + has_gradient = not (query_states.stop_gradient and key_states.stop_gradient and value_states.stop_gradient) + if ( + self.enable_recompute + and self.layerwise_recompute + and has_gradient + and self.recompute_granularity == "core_attn" + ): + outputs = recompute( + scaled_dot_product_attention, + query_states, + self.config, + key_states, + value_states, + attention_mask, + output_attentions, + self.training, + self.sequence_parallel, + use_reentrant=self.config.recompute_use_reentrant, + ) + else: + outputs = scaled_dot_product_attention( + query_states, + self.config, + key_states, + value_states, + attention_mask, + output_attentions, + self.training, + self.sequence_parallel, + ) + if output_attentions: + attn_output, attn_weights = outputs + else: + attn_output = outputs + + # if sequence_parallel is true, out shape are [q_len / n, bs, num_head * head_dim] + # else their shape are [bs, q_len, num_head * head_dim], n is mp parallelism. 
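+        # Output projection back to hidden_size; when tensor parallelism is enabled, the row-parallel o_proj also combines the per-rank partial results.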
+ attn_output = self.o_proj(attn_output) + + if not output_attentions: + attn_weights = None + + outputs = (attn_output,) + + if output_attentions: + outputs += (attn_weights,) + + if use_cache: + outputs += (past_key_value,) + + if type(outputs) is tuple and len(outputs) == 1: + outputs = outputs[0] + + return outputs + + +class Qwen2MoeDecoderLayer(nn.Module): + def __init__(self, config: Qwen2MoeConfig, layer_idx: int, layerwise_recompute: bool = False): + super().__init__() + self.config = config + self.hidden_size = config.hidden_size + + self.self_attn = Qwen2MoeAttention(config, layer_idx) + + if config.num_experts > 0 and (layer_idx + 1) % config.decoder_sparse_step == 0: + self.mlp = Qwen2MoeSparseMoeBlock(config) + else: + # num_experts == 0 or this layer is not sparse layer + self.mlp = Qwen2MoeMLP(config, intermediate_size=config.intermediate_size) + + self.input_layernorm = Qwen2MoeRMSNorm(config.hidden_size, eps=config.rms_norm_eps) + self.post_attention_layernorm = Qwen2MoeRMSNorm(config.hidden_size, eps=config.rms_norm_eps) + + self.sequence_parallel = config.sequence_parallel + # Note that we will actually perform a recompute only if both enable_recompute and layerwise_recompute are set to True + # Enable_recompute defaults to False and is controlled by Trainer + self.enable_recompute = False + self.layerwise_recompute = layerwise_recompute + self.recompute_granularity = config.recompute_granularity + + def forward( + self, + hidden_states: paddle.Tensor, + position_ids: Optional[Tuple[paddle.Tensor]] = None, + attention_mask: Optional[paddle.Tensor] = None, + output_attentions: Optional[bool] = False, + output_router_logits: Optional[bool] = False, + past_key_value: Optional[Tuple[paddle.Tensor]] = None, + use_cache: Optional[bool] = False, + **kwargs, + ) -> Tuple[paddle.Tensor, Optional[Tuple[paddle.Tensor, paddle.Tensor]]]: + """ + Args: + hidden_states (`paddle.Tensor`): input to the layer of shape `(batch, seq_len, embed_dim)` + attention_mask (`paddle.Tensor`, *optional*): attention mask of size + `(batch, sequence_length)` where padding elements are indicated by 0. + output_attentions (`bool`, *optional*): + Whether or not to return the attentions tensors of all attention layers. See `attentions` under + returned tensors for more detail. + output_router_logits (`bool`, *optional*): + Whether or not to return the logits of all the routers. They are useful for computing the router loss, and + should not be returned during inference. + use_cache (`bool`, *optional*): + If set to `True`, `past_key_values` key value states are returned and can be used to speed up decoding + (see `past_key_values`). 
+ past_key_value (`Tuple(paddle.Tensor)`, *optional*): cached past key and value projection states + """ + + # [bs * seq_len, embed_dim] -> [seq_len * bs / n, embed_dim] (sequence_parallel) + residual = hidden_states + hidden_states = self.input_layernorm(hidden_states) + + # Self Attention + has_gradient = not hidden_states.stop_gradient + if ( + self.enable_recompute + and self.layerwise_recompute + and has_gradient + and self.recompute_granularity == "full_attn" + ): + outputs = recompute( + self.self_attn, + hidden_states, + position_ids, + past_key_value, + attention_mask, + output_attentions, + use_cache, + use_reentrant=self.config.recompute_use_reentrant, + ) + else: + outputs = self.self_attn( + hidden_states, + position_ids, + past_key_value, + attention_mask, + output_attentions, + use_cache, + ) + + if type(outputs) is tuple: + hidden_states = outputs[0] + else: + hidden_states = outputs + + if output_attentions: + self_attn_weights = outputs[1] + + if use_cache: + present_key_value = outputs[2 if output_attentions else 1] + + hidden_states = residual + hidden_states + + # Fully Connected + residual = hidden_states + hidden_states = self.post_attention_layernorm(hidden_states) + hidden_states, router_logits = self.block_sparse_moe(hidden_states) + hidden_states = residual + hidden_states + + outputs = (hidden_states,) + + if output_attentions: + outputs += (self_attn_weights,) + + if use_cache: + outputs += (present_key_value,) + + if output_router_logits: + outputs += (router_logits,) + + if type(outputs) is tuple and len(outputs) == 1: + outputs = outputs[0] + + return outputs + + +class Qwen2MoePretrainedModel(PretrainedModel): + config_class = Qwen2MoeConfig + base_model_prefix = "model" + supports_gradient_checkpointing = True + _keys_to_ignore_on_load_unexpected = [r"self_attn.rotary_emb.inv_freq"] + + @classmethod + def _get_name_mappings(cls, config: Qwen2MoeConfig) -> list[StateDictNameMapping]: + mappings: list[StateDictNameMapping] = [] + model_mappings = [ + ["embed_tokens.weight"], + ["norm.weight"], + ] + for layer_index in range(config.num_hidden_layers): + layer_mappings = [ + [f"layers.{layer_index}.self_attn.q_proj.weight", None, "transpose"], + [f"layers.{layer_index}.self_attn.k_proj.weight", None, "transpose"], + [f"layers.{layer_index}.self_attn.v_proj.weight", None, "transpose"], + [f"layers.{layer_index}.self_attn.o_proj.weight", None, "transpose"], + [f"layers.{layer_index}.self_attn.rotary_emb.inv_freq"], + [f"layers.{layer_index}.input_layernorm.weight"], + [f"layers.{layer_index}.post_attention_layernorm.weight"], + ] + model_mappings.extend(layer_mappings) + + for expert_idx in range(config.num_local_experts): + expert_mappings = [ + [f"layers.{layer_index}.block_sparse_moe.experts.{expert_idx}.w1.weight", None, "transpose"], + [f"layers.{layer_index}.block_sparse_moe.experts.{expert_idx}.w2.weight", None, "transpose"], + [f"layers.{layer_index}.block_sparse_moe.experts.{expert_idx}.w3.weight", None, "transpose"], + ] + model_mappings.extend(expert_mappings) + model_mappings.append([f"layers.{layer_index}.block_sparse_moe.gate.weight", None, "transpose"]) + + init_name_mappings(mappings=model_mappings) + # base-model prefix "MixtralModel" + if "MixtralModel" not in config.architectures: + for mapping in model_mappings: + mapping[0] = "model." + mapping[0] + mapping[1] = "mixtral." 
+ mapping[1] + model_mappings.append(["lm_head.weight", "lm_head.weight", "transpose"]) + + mappings = [StateDictNameMapping(*mapping, index=index) for index, mapping in enumerate(model_mappings)] + return mappings + + @classmethod + def _get_tensor_parallel_mappings(cls, config: Qwen2MoeConfig, is_split=True): + from paddlenlp.transformers.conversion_utils import split_or_merge_func + + fn = split_or_merge_func( + is_split=is_split, + tensor_parallel_degree=config.tensor_parallel_degree, + tensor_parallel_rank=config.tensor_parallel_rank, + num_attention_heads=config.num_attention_heads, + ) + + def get_tensor_parallel_split_mappings(num_layers, num_local_experts): + final_actions = {} + + base_actions = { + "lm_head.weight": partial(fn, is_column=True), + # Row Linear + "embed_tokens.weight": partial(fn, is_column=False), + "layers.0.self_attn.o_proj.weight": partial(fn, is_column=False), + } + + if not config.vocab_size % config.tensor_parallel_degree == 0: + base_actions.pop("lm_head.weight") + base_actions.pop("embed_tokens.weight") + + # Column Linear + base_actions["layers.0.self_attn.q_proj.weight"] = partial(fn, is_column=True) + # if we have enough num_key_value_heads to split, then split it. + if config.num_key_value_heads % config.tensor_parallel_degree == 0: + base_actions["layers.0.self_attn.k_proj.weight"] = partial(fn, is_column=True) + base_actions["layers.0.self_attn.v_proj.weight"] = partial(fn, is_column=True) + + for key, action in base_actions.items(): + if "layers.0." in key: + for i in range(num_layers): + final_actions[key.replace("layers.0.", f"layers.{i}.")] = action + final_actions[key] = action + + # Add tp split for expert params. + base_actions = { + "layers.0.block_sparse_moe.experts.0.w1.weight": partial(fn, is_column=True), + "layers.0.block_sparse_moe.experts.0.w2.weight": partial(fn, is_column=False), + "layers.0.block_sparse_moe.experts.0.w3.weight": partial(fn, is_column=True), + } + for key, action in base_actions.items(): + for i in range(num_layers): + newkey = key.replace("layers.0.", f"layers.{i}.") + for j in range(num_local_experts): + newkey2 = newkey.replace("experts.0.", f"experts.{j}.") + final_actions[newkey2] = action + + return final_actions + + mappings = get_tensor_parallel_split_mappings(config.num_hidden_layers, config.num_local_experts) + + return mappings + + def _init_weights(self, layer): + """Initialization hook""" + if self.config.tensor_parallel_degree > 1: + rng_tracker = get_rng_state_tracker().rng_state + if isinstance( + layer, + ( + nn.Linear, + nn.Embedding, + mpu.VocabParallelEmbedding, + mpu.ColumnParallelLinear, + mpu.RowParallelLinear, + Qwen2MoeLMHead, + ColumnSequenceParallelLinear, + RowSequenceParallelLinear, + ), + ): + # In the dygraph mode, use the `set_value` to reset the parameter directly, + # and reset the `state_dict` to update parameter in static mode. 
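+                # Sharded (is_distributed) weights are re-initialized under the tensor-parallel RNG state tracker; non-distributed weights use the default generator.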
+ if isinstance(layer.weight, paddle.Tensor): + if layer.weight.is_distributed: + with rng_tracker(): + layer.weight.set_value( + paddle.tensor.normal( + mean=0.0, + std=self.config.initializer_range + if hasattr(self.config, "initializer_range") + else self.mixtral.config.initializer_range, + shape=layer.weight.shape, + ) + ) + else: + layer.weight.set_value( + paddle.tensor.normal( + mean=0.0, + std=self.config.initializer_range + if hasattr(self.config, "initializer_range") + else self.mixtral.config.initializer_range, + shape=layer.weight.shape, + ) + ) + # Layer.apply is DFS https://github.com/PaddlePaddle/Paddle/blob/a6f5021fcc58b21f4414bae6bf4731ef6971582c/python/paddle/nn/layer/layers.py#L527-L530 + # sublayer is init first + # scale RowParallelLinear weight + with paddle.no_grad(): + if isinstance(layer, Qwen2MoeMLP): + factor = 1 / math.sqrt(2 * self.config.num_hidden_layers) + layer.w2.weight.scale_(factor) + if isinstance(layer, Qwen2MoeAttention): + factor = 1 / math.sqrt(2 * self.config.num_hidden_layers) + layer.o_proj.weight.scale_(factor) + + +@register_base_model +class Qwen2MoeModel(Qwen2MoePretrainedModel): + """ + Transformer decoder consisting of *config.num_hidden_layers* layers. Each layer is a [`Qwen2MoeDecoderLayer`] + Args: + config: Qwen2MoeConfig + """ + + def __init__(self, config: Qwen2MoeConfig): + super().__init__(config) + self.padding_idx = config.pad_token_id + self.vocab_size = config.vocab_size + self.hidden_size = config.hidden_size + self.sequence_parallel = config.sequence_parallel + self.recompute_granularity = config.recompute_granularity + self.no_recompute_layers = config.no_recompute_layers if config.no_recompute_layers is not None else [] + + # Recompute defaults to False and is controlled by Trainer + self.enable_recompute = False + if config.tensor_parallel_degree > 1 and config.vocab_size % config.tensor_parallel_degree == 0: + self.embed_tokens = mpu.VocabParallelEmbedding( + self.vocab_size, + self.hidden_size, + weight_attr=paddle.ParamAttr(initializer=nn.initializer.XavierNormal()), + ) + else: + self.embed_tokens = nn.Embedding( + self.vocab_size, + self.hidden_size, + ) + + self.layers = nn.LayerList( + [ + Qwen2MoeDecoderLayer(config, layer_idx not in self.no_recompute_layers) + for layer_idx in range(config.num_hidden_layers) + ] + ) + self._attn_implementation = config._attn_implementation + self.norm = Qwen2MoeRMSNorm(config.hidden_size, eps=config.rms_norm_eps) + + def get_input_embeddings(self): + return self.embed_tokens + + def set_input_embeddings(self, value): + self.embed_tokens = value + + @staticmethod + def _prepare_decoder_attention_mask(attention_mask, input_shape, past_key_values_length, dtype): + if attention_mask is not None: + # [bsz, seq_len] -> [bsz, 1, tgt_seq_len, src_seq_len] + if len(attention_mask.shape) == 2: + expanded_attn_mask = _expand_2d_mask(attention_mask, dtype, tgt_length=input_shape[-1]) + # For decoding phase in generation, seq_length = 1, we don't need to add causal mask + if input_shape[-1] > 1: + combined_attention_mask = _make_causal_mask( + input_shape, + past_key_values_length=past_key_values_length, + ) + expanded_attn_mask = expanded_attn_mask & combined_attention_mask + # [bsz, seq_len, seq_len] -> [bsz, 1, seq_len, seq_len] + elif len(attention_mask.shape) == 3: + expanded_attn_mask = attention_mask.unsqueeze(1).astype("bool") + # if attention_mask is already 4-D, do nothing + else: + expanded_attn_mask = attention_mask + else: + expanded_attn_mask = _make_causal_mask( + input_shape, + 
past_key_values_length=past_key_values_length, + ) + # Convert bool attention_mask to float attention mask, which will be added to attention_scores later + expanded_attn_mask = paddle.where(expanded_attn_mask, 0.0, paddle.finfo(dtype).min).astype(dtype) + return expanded_attn_mask + + @paddle.jit.not_to_static + def recompute_training_full( + self, + layer_module: nn.Layer, + hidden_states: Tensor, + position_ids: Optional[Tensor], + attention_mask: Tensor, + output_attentions: bool, + output_router_logits: bool, + past_key_value: Tensor, + use_cache: bool, + ): + def create_custom_forward(module): + def custom_forward(*inputs): + return module(*inputs) + + return custom_forward + + hidden_states = recompute( + create_custom_forward(layer_module), + hidden_states, + position_ids, + attention_mask, + output_attentions, + output_router_logits, + past_key_value, + use_cache, + use_reentrant=self.config.recompute_use_reentrant, + ) + + return hidden_states + + def forward( + self, + input_ids=None, + position_ids=None, + attention_mask=None, + inputs_embeds=None, + use_cache=None, + past_key_values=None, + output_attentions=False, + output_hidden_states=None, + output_router_logits: Optional[bool] = None, + return_dict=False, + **kwargs, + ): + if self.sequence_parallel and use_cache: + raise ValueError("We currently only support sequence parallel without cache.") + + output_attentions = output_attentions if output_attentions is not None else self.config.output_attentions + output_hidden_states = ( + output_hidden_states if output_hidden_states is not None else self.config.output_hidden_states + ) + output_router_logits = ( + output_router_logits if output_router_logits is not None else self.config.output_router_logits + ) + use_cache = use_cache if use_cache is not None else self.config.use_cache + + return_dict = return_dict if return_dict is not None else self.config.use_return_dict + + # retrieve input_ids and inputs_embeds + if input_ids is not None and inputs_embeds is not None: + raise ValueError("You cannot specify both decoder_input_ids and decoder_inputs_embeds at the same time") + elif input_ids is not None: + batch_size, seq_length = input_ids.shape + elif inputs_embeds is not None: + batch_size, seq_length, _ = inputs_embeds.shape + else: + raise ValueError("You have to specify either decoder_input_ids or decoder_inputs_embeds") + + past_key_values_length = 0 + + if past_key_values is None: + past_key_values = tuple([None] * len(self.layers)) + # NOTE: to make cache can be clear in-time + past_key_values = list(past_key_values) + + seq_length_with_past = seq_length + cache_length = 0 + if past_key_values[0] is not None: + cache_length = paddle.shape(past_key_values[0][0])[1] + seq_length_with_past += cache_length + + # if use_cache: + # use_legacy_cache = not isinstance(past_key_values, Cache) + # if use_legacy_cache: + # past_key_values = DynamicCache.from_legacy_cache(past_key_values) + # past_key_values_length = past_key_values.get_usable_length(seq_length) + + if position_ids is None: + device = input_ids.device if input_ids is not None else inputs_embeds.device + position_ids = paddle.arange( + past_key_values_length, seq_length + past_key_values_length, dtype=paddle.long, device=device + ) + position_ids = position_ids.unsqueeze(0).view(-1, seq_length) + else: + position_ids = position_ids.view(-1, seq_length).long() + + if inputs_embeds is None: + inputs_embeds = self.embed_tokens(input_ids) + + if self.sequence_parallel: + # [bs, seq_len, num_head * head_dim] -> [bs * 
seq_len, num_head * head_dim] + bs, seq_len, hidden_size = inputs_embeds.shape + inputs_embeds = paddle.reshape_(inputs_embeds, [bs * seq_len, hidden_size]) + # [seq_len * bs / n, num_head * head_dim] (n is mp parallelism) + inputs_embeds = ScatterOp.apply(inputs_embeds) + + # embed positions + if attention_mask is None: + # [bs, seq_len] + attention_mask = paddle.ones((batch_size, seq_length_with_past), dtype=paddle.bool) + + if position_ids is None: + position_ids = paddle.arange(seq_length, dtype="int64").expand((batch_size, seq_length)) + + attention_mask = self._prepare_decoder_attention_mask( + attention_mask, (batch_size, seq_length), cache_length, inputs_embeds.dtype + ) # [bs, 1, seq_len, seq_len] + if self.config.use_flash_attention: + is_casual = is_casual_mask(attention_mask) + if is_casual: + attention_mask = None + hidden_states = inputs_embeds + + # decoder layers + all_hidden_states = () if output_hidden_states else None + all_self_attns = () if output_attentions else None + all_router_logits = () if output_router_logits else None + next_decoder_cache = () if use_cache else None + + for idx, (decoder_layer) in enumerate(self.layers): + if output_hidden_states: + all_hidden_states += (hidden_states,) + past_key_value = past_key_values[idx] if past_key_values is not None else None + + has_gradient = not hidden_states.stop_gradient + if ( + self.enable_recompute + and idx not in self.no_recompute_layers + and has_gradient + and self.recompute_granularity == "full" + ): + layer_outputs = self.recompute_training_full( + decoder_layer, + hidden_states, + position_ids, + attention_mask, + output_attentions, + output_router_logits, + past_key_value, + use_cache, + ) + else: + layer_outputs = decoder_layer( + hidden_states, + position_ids, + attention_mask, + output_attentions, + output_router_logits, + past_key_value, + use_cache, + ) + + # NOTE: clear outdate cache after it has been used for memory saving + past_key_value = past_key_values[idx] = None + if type(layer_outputs) is tuple: + hidden_states = layer_outputs[0] + else: + hidden_states = layer_outputs + + if output_attentions: + all_self_attns += (layer_outputs[1],) + + if use_cache: + next_decoder_cache += (layer_outputs[2 if output_attentions else 1],) + + if output_router_logits: + all_router_logits += (layer_outputs[-1],) + + hidden_states = self.norm(hidden_states) + + # add hidden states from the last decoder layer + if output_hidden_states: + all_hidden_states += (hidden_states,) + + next_cache = next_decoder_cache if use_cache else None + + if not return_dict: + return tuple( + v + for v in [hidden_states, next_cache, all_hidden_states, all_self_attns, all_router_logits] + if v is not None + ) + return MoEModelOutputWithPast( + last_hidden_state=hidden_states, + past_key_values=next_cache, + hidden_states=all_hidden_states, + attentions=all_self_attns, + router_logits=all_router_logits, + ) + + +class Qwen2MoePretrainingCriterion(nn.Layer): + """ + Criterion for Mixtral. + It calculates the final loss. 
+ """ + + def __init__(self, config): + super(Qwen2MoePretrainingCriterion, self).__init__() + self.ignore_index = getattr(config, "ignore_index", -100) + self.config = config + self.enable_parallel_cross_entropy = config.tensor_parallel_degree > 1 and config.tensor_parallel_output + + if self.enable_parallel_cross_entropy: # and False: # and lm_head is distributed + self.loss_func = mpu.ParallelCrossEntropy(ignore_index=self.ignore_index) + else: + self.loss_func = paddle.nn.CrossEntropyLoss(reduction="none", ignore_index=self.ignore_index) + + def forward(self, prediction_scores, masked_lm_labels): + if self.enable_parallel_cross_entropy: + if prediction_scores.shape[-1] == self.config.vocab_size: + warnings.warn( + f"enable_parallel_cross_entropy, the vocab_size should be splited: {prediction_scores.shape[-1]}, {self.config.vocab_size}" + ) + self.loss_func = paddle.nn.CrossEntropyLoss(reduction="none", ignore_index=self.ignore_index) + + with paddle.amp.auto_cast(False): + masked_lm_loss = self.loss_func(prediction_scores.astype("float32"), masked_lm_labels.unsqueeze(2)) + + # skip ignore_index which loss == 0 + masked_lm_loss = masked_lm_loss[masked_lm_loss > 0] + loss = paddle.mean(masked_lm_loss) + + return loss + + +class Qwen2MoeLMHead(nn.Layer): + def __init__(self, config: Qwen2MoeConfig): + super(Qwen2MoeLMHead, self).__init__() + self.config = config + if config.tensor_parallel_degree > 1 and config.vocab_size % config.tensor_parallel_degree == 0: + vocab_size = config.vocab_size // config.tensor_parallel_degree + else: + vocab_size = config.vocab_size + + self.weight = self.create_parameter( + shape=[config.hidden_size, vocab_size], + dtype=paddle.get_default_dtype(), + ) + # Must set distributed attr for Tensor Parallel ! + self.weight.is_distributed = True if (vocab_size != config.vocab_size) else False + if self.weight.is_distributed: + self.weight.split_axis = 1 + + def forward(self, hidden_states, tensor_parallel_output=None): + if self.config.sequence_parallel: + hidden_states = GatherOp.apply(hidden_states) + seq_length = self.config.seq_length + hidden_states = paddle.reshape_(hidden_states, [-1, seq_length, self.config.hidden_size]) + + if tensor_parallel_output is None: + tensor_parallel_output = self.config.tensor_parallel_output + + logits = parallel_matmul(hidden_states, self.weight, tensor_parallel_output=tensor_parallel_output) + return logits + + +class Qwen2MoeForCausalLM(Qwen2MoePretrainedModel): + enable_to_static_method = True + _tied_weights_keys = ["lm_head.weight"] + + def __init__(self, config): + super().__init__(config) + self.config = config + + self.model = Qwen2MoeModel(config) + self.lm_head = Qwen2MoeLMHead(config) + self.criterion = Qwen2MoePretrainingCriterion(config) + self.router_aux_loss_coef = config.router_aux_loss_coef + self.num_experts = config.num_experts + self.num_experts_per_tok = config.num_experts_per_tok + # Initialize weights and apply final processing + self.post_init() + + def get_input_embeddings(self): + return self.model.embed_tokens + + def set_input_embeddings(self, value): + self.model.embed_tokens = value + + def get_output_embeddings(self): + return self.lm_head + + def set_output_embeddings(self, new_embeddings): + self.lm_head = new_embeddings + + def set_decoder(self, decoder): + self.model = decoder + + def get_decoder(self): + return self.model + + def prepare_inputs_for_generation( + self, + input_ids, + use_cache=False, + past_key_values=None, + attention_mask=None, + inputs_embeds=None, + 
output_router_logits=False, + **kwargs + ): + batch_size, seq_length = input_ids.shape + position_ids = kwargs.get("position_ids", paddle.arange(seq_length).expand((batch_size, seq_length))) + attention_mask = kwargs.get("attention_mask", None) + if past_key_values: + input_ids = input_ids[:, -1].unsqueeze(axis=-1) + position_ids = position_ids[:, -1].unsqueeze(-1) + + # if `inputs_embeds` are passed, we only want to use them in the 1st generation step + if inputs_embeds is not None and past_key_values is None: + model_inputs = {"inputs_embeds": inputs_embeds} + else: + model_inputs = {"input_ids": input_ids} + + model_inputs.update( + { + "position_ids": position_ids, + "past_key_values": past_key_values, + "use_cache": use_cache, + "attention_mask": attention_mask, + "output_router_logits": output_router_logits, + } + ) + return model_inputs + + def _get_model_inputs_spec(self, dtype: str): + return { + "input_ids": paddle.static.InputSpec(shape=[None, None], dtype="int64"), + "attention_mask": paddle.static.InputSpec(shape=[None, None], dtype="int64"), + "position_ids": paddle.static.InputSpec(shape=[None, None], dtype="int64"), + } + + @staticmethod + def update_model_kwargs_for_generation(outputs, model_kwargs, is_encoder_decoder=False): + # update cache + if isinstance(outputs, tuple) and len(outputs) > 1 and not isinstance(outputs[1], paddle.Tensor): + model_kwargs["past_key_values"] = outputs[1] + + if isinstance(outputs, MoECausalLMOutputWithPast) and "past_key_values" in outputs: + model_kwargs["past_key_values"] = outputs.past_key_values + + # update position_ids + if "position_ids" in model_kwargs and model_kwargs["position_ids"] is not None: + position_ids = model_kwargs["position_ids"] + model_kwargs["position_ids"] = paddle.concat([position_ids, position_ids[..., -1:] + 1], axis=-1) + + if not is_encoder_decoder and "attention_mask" in model_kwargs: + attention_mask = model_kwargs["attention_mask"] + model_kwargs["attention_mask"] = paddle.concat( + [attention_mask, paddle.ones([attention_mask.shape[0], 1], dtype=attention_mask.dtype)], axis=-1 + ) + + return model_kwargs + + def forward( + self, + input_ids=None, + position_ids=None, + attention_mask=None, + inputs_embeds=None, + labels=None, + use_cache=False, + past_key_values=None, + output_attentions=None, + output_hidden_states=None, + output_router_logits: Optional[bool] = None, + return_dict=None, + ): + output_attentions = output_attentions if output_attentions is not None else self.config.output_attentions + output_hidden_states = ( + output_hidden_states if output_hidden_states is not None else self.config.output_hidden_states + ) + output_router_logits = ( + output_router_logits if output_router_logits is not None else self.config.output_router_logits + ) + return_dict = return_dict if return_dict is not None else self.config.use_return_dict + + # decoder outputs consists of (dec_features, layer_state, dec_hidden, dec_attn) + outputs = self.model( + input_ids=input_ids, # [bs, seq_len] + position_ids=position_ids, + attention_mask=attention_mask, + inputs_embeds=inputs_embeds, + use_cache=use_cache, + past_key_values=past_key_values, + output_attentions=output_attentions, + output_hidden_states=output_hidden_states, + output_router_logits=output_router_logits, + return_dict=return_dict, + ) + + hidden_states = outputs[0] # [bs, seq_len, dim] + + # if labels is None,means we need full output, instead of tensor_parallel_output + # tensor_parallel_output is togather with ParallelCrossEntropy + tensor_parallel_output 
= ( + self.config.tensor_parallel_output and labels is not None and self.config.tensor_parallel_degree > 1 + ) + + logits = self.lm_head(hidden_states, tensor_parallel_output=tensor_parallel_output) + + loss = None + if labels is not None: + loss = self.criterion(logits, labels) + + aux_loss = None + if output_router_logits: + aux_loss = load_balancing_loss_func( + outputs.router_logits if return_dict else outputs[-1], + self.num_experts, + self.num_experts_per_tok, + attention_mask, + ) + if labels is not None: + loss += self.router_aux_loss_coef * aux_loss + + if not return_dict: + output = (logits,) + outputs[1:] + if output_router_logits: + output = (aux_loss,) + output + return (loss,) + output if loss is not None else output + + return MoECausalLMOutputWithPast( + loss=loss, + aux_loss=aux_loss, + logits=logits, + past_key_values=outputs.past_key_values, + hidden_states=outputs.hidden_states, + attentions=outputs.attentions, + router_logits=outputs.router_logits, + ) diff --git a/tests/transformers/qwen2moe/__init__.py b/tests/transformers/qwen2moe/__init__.py new file mode 100644 index 000000000000..595add0aed9e --- /dev/null +++ b/tests/transformers/qwen2moe/__init__.py @@ -0,0 +1,13 @@ +# Copyright (c) 2023 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. diff --git a/tests/transformers/qwen2moe/test_modeling.py b/tests/transformers/qwen2moe/test_modeling.py new file mode 100644 index 000000000000..cf8e8a564785 --- /dev/null +++ b/tests/transformers/qwen2moe/test_modeling.py @@ -0,0 +1,318 @@ +# Copyright (c) 2023 PaddlePaddle Authors. All Rights Reserved. +# Copyright 2020 The HuggingFace Team. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+from __future__ import annotations + +import unittest + +import paddle + +from paddlenlp.transformers import Qwen2MoeConfig, Qwen2MoeForCausalLM, Qwen2MoeModel +from tests.transformers.test_configuration_common import ConfigTester +from tests.transformers.test_generation_utils import GenerationTesterMixin +from tests.transformers.test_modeling_common import ( + ModelTesterMixin, + ids_tensor, + random_attention_mask, +) + + +class Qwen2MoeModelTester: + def __init__( + self, + parent, + vocab_size=32000, + hidden_size=64, + num_hidden_layers=2, + num_attention_heads=8, + masked_softmax_fusion=True, + layer_norm_epsilon=1e-5, + initializer_range=0.02, + is_training=True, + use_cache=False, + bos_token_id=1, + eos_token_id=2, + apply_residual_connection_post_layernorm=False, + hidden_dropout=0.0, + attention_dropout=0.0, + attention_softmax_in_fp32=True, + pretraining_tp=1, # TP rank used when training with megatron + dtype="bfloat16", + slow_but_exact=False, + batch_size: int = 2, + seq_length: int = 10, + type_sequence_label_size=2, + activation_function="gelu", + num_labels=3, + num_choices=4, + scope=None, + dropout=0.56, + use_input_mask: bool = False, + use_labels: bool = False, + return_dict=False, + ): + self.parent: Qwen2MoeModelTest = parent + self.vocab_size = vocab_size + self.hidden_size = hidden_size + self.num_hidden_layers = num_hidden_layers + self.num_attention_heads = num_attention_heads + self.masked_softmax_fusion = masked_softmax_fusion + self.layer_norm_epsilon = layer_norm_epsilon + self.initializer_range = initializer_range + self.is_training = is_training + self.use_cache = use_cache + self.bos_token_id = bos_token_id + self.eos_token_id = eos_token_id + self.apply_residual_connection_post_layernorm = apply_residual_connection_post_layernorm + self.hidden_dropout = hidden_dropout + self.attention_dropout = attention_dropout + self.attention_softmax_in_fp32 = attention_softmax_in_fp32 + self.pretraining_tp = pretraining_tp + self.dtype = dtype + self.slow_but_exact = slow_but_exact + + self.batch_size = batch_size + self.seq_length = seq_length + self.type_sequence_label_size = type_sequence_label_size + self.activation_function = activation_function + self.num_labels = num_labels + self.num_choices = num_choices + self.scope = scope + self.dropout = dropout + + self.use_input_mask = use_input_mask + self.use_labels = use_labels + self.return_dict = return_dict + + def prepare_config_and_inputs(self): + input_ids = ids_tensor([self.batch_size, self.seq_length], self.vocab_size, dtype=paddle.int64) + + input_mask = None + if self.use_input_mask: + input_mask = random_attention_mask([self.batch_size, self.seq_length]) + + sequence_labels = None + token_labels = None + choice_labels = None + if self.use_labels: + sequence_labels = ids_tensor([self.batch_size], self.type_sequence_label_size) + token_labels = ids_tensor([self.batch_size, self.seq_length], self.num_labels) + choice_labels = ids_tensor([self.batch_size], self.num_choices) + + config = self.get_config() + return config, input_ids, input_mask, sequence_labels, token_labels, choice_labels + + def get_config(self) -> Qwen2MoeConfig: + return Qwen2MoeConfig( + vocab_size=self.vocab_size, + hidden_size=self.hidden_size, + num_hidden_layers=self.num_hidden_layers, + num_attention_heads=self.num_attention_heads, + masked_softmax_fusion=self.masked_softmax_fusion, + layer_norm_epsilon=self.layer_norm_epsilon, + initializer_range=self.initializer_range, + use_cache=self.use_cache, + bos_token_id=self.bos_token_id, + 
eos_token_id=self.eos_token_id, + apply_residual_connection_post_layernorm=self.apply_residual_connection_post_layernorm, + hidden_dropout=self.hidden_dropout, + attention_dropout=self.attention_dropout, + attention_softmax_in_fp32=self.attention_softmax_in_fp32, + pretraining_tp=self.pretraining_tp, + dtype=self.dtype, + slow_but_exact=self.slow_but_exact, + activation_function=self.activation_function, + ) + + def create_and_check_model( + self, config: Qwen2MoeConfig, input_ids, input_mask, sequence_labels, token_labels, choice_labels + ): + model = Qwen2MoeModel(config) + model.eval() + result = model(input_ids) + self.parent.assertEqual(result[0].shape, [self.batch_size, self.seq_length, self.hidden_size]) + + def create_and_check_model_attention_mask( + self, config: Qwen2MoeConfig, input_ids, input_mask, sequence_labels, token_labels, choice_labels + ): + model = Qwen2MoeModel(config) + model.eval() + attn_mask_2d = random_attention_mask([self.batch_size, self.seq_length]) + result_2d = model(input_ids, attention_mask=attn_mask_2d)[0] + batch, seq_length = input_ids.shape + causal_mask = paddle.tril(paddle.ones((batch, seq_length, seq_length), dtype=attn_mask_2d.dtype)) + attn_mask_3d = causal_mask & attn_mask_2d.unsqueeze(-1) + result_3d = model(input_ids, attention_mask=attn_mask_3d)[0] + attn_mask_4d = attn_mask_3d.unsqueeze(1) + result_4d = model(input_ids, attention_mask=attn_mask_4d)[0] + result_no_attention_mask = model(input_ids, attention_mask=None)[0] + # Assert non-padding tokens have the same logits with different attention_mask shape + self.parent.assertTrue((result_2d[attn_mask_2d] == result_3d[attn_mask_2d]).all()) + self.parent.assertTrue((result_2d[attn_mask_2d] == result_4d[attn_mask_2d]).all()) + self.parent.assertTrue((result_2d[attn_mask_2d] == result_no_attention_mask[attn_mask_2d]).all()) + + def create_and_check_model_past_large_inputs( + self, + config: Qwen2MoeConfig, + input_ids, + input_mask, + sequence_labels, + token_labels, + choice_labels, + ): + model = Qwen2MoeModel(config) + model.eval() + + # first forward pass + outputs = model(input_ids, attention_mask=input_mask, use_cache=True, return_dict=self.return_dict) + past_key_values = outputs.past_key_values if self.return_dict else outputs[2] + + # create hypothetical multiple next token and extent to next_input_ids + next_tokens = ids_tensor((self.batch_size, 3), self.vocab_size) + next_mask = ids_tensor((self.batch_size, 3), vocab_size=2) + + # append to next input_ids and + next_input_ids = paddle.concat([input_ids, next_tokens], axis=-1) + next_attention_mask = paddle.concat([input_mask, next_mask], axis=-1) + + outputs = model( + next_input_ids, attention_mask=next_attention_mask, output_hidden_states=True, return_dict=self.return_dict + ) + + output_from_no_past = outputs[2][0] + + outputs = model( + next_tokens, + attention_mask=next_attention_mask, + past_key_values=past_key_values, + output_hidden_states=True, + return_dict=self.return_dict, + ) + + output_from_past = outputs[2][0] + + # select random slice + random_slice_idx = ids_tensor((1,), output_from_past.shape[-1]).item() + output_from_no_past_slice = output_from_no_past[:, -3:, random_slice_idx].detach() + output_from_past_slice = output_from_past[:, :, random_slice_idx].detach() + + self.parent.assertTrue(output_from_past_slice.shape[1] == next_tokens.shape[1]) + + # test that outputs are equal for slice + self.parent.assertTrue(paddle.allclose(output_from_past_slice, output_from_no_past_slice, atol=1e-3)) + + def 
prepare_config_and_inputs_for_common(self): + config_and_inputs = self.prepare_config_and_inputs() + ( + config, + input_ids, + input_mask, + sequence_labels, + token_labels, + choice_labels, + ) = config_and_inputs + inputs_dict = {"input_ids": input_ids, "attention_mask": input_mask} + return config, inputs_dict + + def create_and_check_lm_head_model(self, config, input_ids, input_mask, *args): + model = Qwen2MoeForCausalLM(config) + model.eval() + + result = model( + input_ids, + use_cache=True, + labels=input_ids if self.parent.use_labels else None, + return_dict=self.parent.return_dict, + ) + if self.parent.use_labels: + self.parent.assertIsInstance(result[0].item(), float) + self.parent.assertEqual(result[1].shape, [self.batch_size, self.seq_length, self.vocab_size]) + else: + self.parent.assertEqual(result[0].shape, [self.batch_size, self.seq_length, self.vocab_size]) + + def check_model_position_ids(self, config, input_ids, input_mask, *args): + model = Qwen2MoeForCausalLM(config) + model.eval() + + result_no_position_id = model( + input_ids, + labels=input_ids if self.parent.use_labels else None, + return_dict=self.parent.return_dict, + ) + batch_size, seq_len = input_ids.shape + position_ids = paddle.arange(seq_len).expand((batch_size, seq_len)) + result_position_id = model( + input_ids, + position_ids, + labels=input_ids if self.parent.use_labels else None, + return_dict=self.parent.return_dict, + ) + if self.parent.use_labels: + self.parent.assertTrue((result_position_id[1] == result_no_position_id[1]).all()) + else: + self.parent.assertTrue((result_position_id[0] == result_no_position_id[0]).all()) + + +class Qwen2MoeModelTest(ModelTesterMixin, GenerationTesterMixin, unittest.TestCase): + base_model_class = Qwen2MoeModel + return_dict = False + use_labels = False + use_test_model_name_list = False + + all_model_classes = (Qwen2MoeModel, Qwen2MoeForCausalLM) + all_generative_model_classes = {Qwen2MoeForCausalLM: (Qwen2MoeModel, "mixtral")} + + def setUp(self): + super().setUp() + + self.model_tester = Qwen2MoeModelTester(self) + self.config_tester = ConfigTester(self, config_class=Qwen2MoeConfig, vocab_size=256, hidden_size=24) + + def _get_input_ids_and_config(self): + config, inputs_dict = self.model_tester.prepare_config_and_inputs_for_common() + + input_ids = inputs_dict[self.input_name] + attention_mask = paddle.ones_like(input_ids, dtype=paddle.int64) + + max_batch_size = 2 + sequence_length = input_ids.shape[-1] // 2 + input_ids = input_ids[:max_batch_size, :sequence_length] + attention_mask = attention_mask[:max_batch_size, :sequence_length] + max_length = 3 + + return config, input_ids, attention_mask, max_length + + def test_model(self): + config_and_inputs = self.model_tester.prepare_config_and_inputs() + self.model_tester.create_and_check_model(*config_and_inputs) + + def test_model_attention_mask(self): + config_and_inputs = self.model_tester.prepare_config_and_inputs() + self.model_tester.create_and_check_model_attention_mask(*config_and_inputs) + + def test_model_position_ids(self): + config_and_inputs = self.model_tester.prepare_config_and_inputs() + self.model_tester.check_model_position_ids(*config_and_inputs) + + def test_generate_without_input_ids(self): + # this requires 4-D attention mask logic, which is not supported yet + pass + + def test_mixtral_lm_head_model(self): + config_and_inputs = self.model_tester.prepare_config_and_inputs() + self.model_tester.create_and_check_lm_head_model(*config_and_inputs) + + +if __name__ == "__main__": + unittest.main() 
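
[editor's note, not part of the patch] For reviewers who want to exercise the classes added in this first patch end to end, below is a rough usage sketch. It is an assumption-laden illustration rather than something the diff guarantees: the checkpoint id is taken from llm/qwen2moe/sft_argument.json, and it assumes that from_pretrained can resolve that id for both the weights and a compatible tokenizer (AutoTokenizer here is an assumption, since this patch does not add a tokenizer), and that generation goes through PaddleNLP's usual GenerationMixin.

import paddle
from paddlenlp.transformers import AutoTokenizer, Qwen2MoeForCausalLM

# Checkpoint id taken from llm/qwen2moe/sft_argument.json; resolving it through
# from_pretrained (weights and tokenizer) is an assumption, not something this
# patch itself guarantees.
model_name = "Qwen/Qwen1.5-MoE-A2.7B"
tokenizer = AutoTokenizer.from_pretrained(model_name)
model = Qwen2MoeForCausalLM.from_pretrained(model_name, dtype="bfloat16")
model.eval()

inputs = tokenizer("Mixture-of-experts models are", return_tensors="pd")
with paddle.no_grad():
    # PaddleNLP's generate returns a (token_ids, scores) tuple.
    out_ids, _ = model.generate(
        input_ids=inputs["input_ids"],
        decode_strategy="greedy_search",
        max_length=32,
    )
print(tokenizer.batch_decode(out_ids, skip_special_tokens=True))

The same classes are what the unit tests above drive with a tiny random-weight config, so a quick local smoke test can also follow tests/transformers/qwen2moe/test_modeling.py instead of downloading the full Qwen1.5-MoE-A2.7B checkpoint.
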
From 3913e115b1d6dbe630a91689530071e7a24ffd67 Mon Sep 17 00:00:00 2001 From: DrownFish19 Date: Wed, 17 Apr 2024 03:26:14 +0000 Subject: [PATCH 02/41] update default config --- paddlenlp/transformers/qwen2moe/configuration.py | 14 +++++++------- 1 file changed, 7 insertions(+), 7 deletions(-) diff --git a/paddlenlp/transformers/qwen2moe/configuration.py b/paddlenlp/transformers/qwen2moe/configuration.py index 38ec512a078e..b344d70f7b94 100644 --- a/paddlenlp/transformers/qwen2moe/configuration.py +++ b/paddlenlp/transformers/qwen2moe/configuration.py @@ -17,11 +17,11 @@ from paddlenlp.transformers.configuration_utils import PretrainedConfig __all__ = [ - "Qwen2MoeConfig", + "QWen2MoeConfig", ] -class Qwen2MoeConfig(PretrainedConfig): +class QWen2MoeConfig(PretrainedConfig): r""" This is the configuration class to store the configuration of a [`Qwen2MoeModel`]. It is used to instantiate a Qwen2MoE model according to the specified arguments, defining the model architecture. Instantiating a configuration @@ -117,7 +117,7 @@ def __init__( num_attention_heads=16, num_key_value_heads=16, hidden_act="silu", - max_position_embeddings=32768, + max_position_embeddings=8192, seq_length=2048, initializer_range=0.02, rms_norm_eps=1e-6, @@ -128,16 +128,16 @@ def __init__( use_flash_attention=False, attention_dropout=0.0, use_fused_rope=False, - rope_theta=10000.0, + rope_theta=1000000.0, tensor_parallel_output=True, sequence_parallel=False, fuse_sequence_parallel_allreduce=False, pad_token_id=0, - bos_token_id=1, - eos_token_id=2, + bos_token_id=151643, + eos_token_id=151643, tie_word_embeddings=False, use_sliding_window=False, - sliding_window=4096, + sliding_window=32768, max_window_layers=28, decoder_sparse_step=1, moe_intermediate_size=1408, From a29e90d542597b519e0c77421ecaff24d07b0a15 Mon Sep 17 00:00:00 2001 From: DrownFish19 Date: Thu, 18 Apr 2024 08:00:42 +0000 Subject: [PATCH 03/41] update QWen2Moe modeling --- paddlenlp/transformers/qwen2moe/__init__.py | 4 +- paddlenlp/transformers/qwen2moe/modeling.py | 108 ++++++++++---------- 2 files changed, 56 insertions(+), 56 deletions(-) diff --git a/paddlenlp/transformers/qwen2moe/__init__.py b/paddlenlp/transformers/qwen2moe/__init__.py index 622f373aab5e..617ea4a8ec47 100644 --- a/paddlenlp/transformers/qwen2moe/__init__.py +++ b/paddlenlp/transformers/qwen2moe/__init__.py @@ -12,5 +12,5 @@ # See the License for the specific language governing permissions and # limitations under the License. -from .configuration import Qwen2MoeConfig -from .modeling import Qwen2MoeForCausalLM +from .configuration import QWen2MoeConfig +from .modeling import QWen2MoeForCausalLM diff --git a/paddlenlp/transformers/qwen2moe/modeling.py b/paddlenlp/transformers/qwen2moe/modeling.py index 689e03ef8286..0e46c554ef91 100644 --- a/paddlenlp/transformers/qwen2moe/modeling.py +++ b/paddlenlp/transformers/qwen2moe/modeling.py @@ -12,7 +12,7 @@ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. 
-""" Paddle Qwen2Moe model.""" +""" Paddle QWen2Moe model.""" from __future__ import annotations import math @@ -52,7 +52,7 @@ from paddlenlp.utils.log import logger from ..activations import ACT2FN -from .configuration import Qwen2MoeConfig +from .configuration import QWen2MoeConfig try: from paddle.nn.functional.flash_attention import flash_attention @@ -60,10 +60,10 @@ flash_attention = None __all__ = [ - "Qwen2MoeModel", - "Qwen2MoePretrainedModel", - "Qwen2MoeForCausalLM", - "Qwen2MoePretrainingCriterion", + "QWen2MoeModel", + "QWen2MoePretrainedModel", + "QWen2MoeForCausalLM", + "QWen2MoePretrainingCriterion", ] @@ -342,8 +342,8 @@ def _expand_2d_mask(mask, dtype, tgt_length): return expanded_mask -class Qwen2MoeRMSNorm(nn.Layer): - def __init__(self, config, eps=1e-6): +class QWen2MoeRMSNorm(nn.Layer): + def __init__(self, config): super().__init__() self.hidden_size = config.hidden_size self.weight = paddle.create_parameter( @@ -386,7 +386,7 @@ def repeat_kv(hidden_states: paddle.Tensor, n_rep: int) -> paddle.Tensor: return hidden_states.reshape([batch, slen, num_key_value_heads * n_rep, head_dim]) -class Qwen2MoeRotaryEmbedding(nn.Layer): +class QWen2MoeRotaryEmbedding(nn.Layer): def __init__(self, dim, max_position_embeddings=2048, base=10000): super().__init__() self.dim = dim @@ -443,16 +443,16 @@ def apply_rotary_pos_emb(q, k, cos, sin, position_ids): return q_embed, k_embed -class Qwen2MoeMLP(nn.Layer): +class QWen2MoeMLP(nn.Layer): def __init__(self, config): super().__init__() self.hidden_size = config.hidden_size self.intermediate_size = config.intermediate_size self.tensor_parallel_degree = config.tensor_parallel_degree - self.gate_proj = nn.Linear(self.hidden_size, self.intermediate_size, bias=False) # w1 - self.down_proj = nn.Linear(self.intermediate_size, self.hidden_size, bias=False) # w2 - self.up_proj = nn.Linear(self.hidden_size, self.intermediate_size, bias=False) # w3 + self.gate_proj = nn.Linear(self.hidden_size, self.intermediate_size, bias_attr=False) # w1 + self.down_proj = nn.Linear(self.intermediate_size, self.hidden_size, bias_attr=False) # w2 + self.up_proj = nn.Linear(self.hidden_size, self.intermediate_size, bias_attr=False) # w3 if config.sequence_parallel: ColumnParallelLinear = ColumnSequenceParallelLinear @@ -491,15 +491,15 @@ def forward(self, x): return self.down_proj(self.act_fn(self.gate_proj(x)) * self.up_proj(x)) -class Qwen2MoeSparseMoeBlock(nn.Layer): - def __init__(self, config: Qwen2MoeConfig): +class QWen2MoeSparseMoeBlock(nn.Layer): + def __init__(self, config: QWen2MoeConfig): super().__init__() self.hidden_dim = config.hidden_size self.ffn_dim = config.intermediate_size - self.num_experts = config.num_local_experts + self.num_experts = config.num_experts self.top_k = config.num_experts_per_tok self.gate = nn.Linear(self.hidden_dim, self.num_experts, bias_attr=False) - self.experts = nn.LayerList([Qwen2MoeMLP(config) for _ in range(self.num_experts)]) + self.experts = nn.LayerList([QWen2MoeMLP(config) for _ in range(self.num_experts)]) def forward(self, hidden_states): batch_size, seq_len, hidden_dim = hidden_states.shape @@ -544,13 +544,13 @@ def forward(self, hidden_states): return final_hidden_states, router_logits -class Qwen2MoeAttention(nn.Module): +class QWen2MoeAttention(nn.Layer): """ Multi-headed attention from 'Attention Is All You Need' paper. Modified to use sliding window attention: Longformer and "Generating Long Sequences with Sparse Transformers". 
""" - def __init__(self, config: Qwen2MoeConfig, layer_idx: Optional[int] = None, layerwise_recompute: bool = True): + def __init__(self, config: QWen2MoeConfig, layer_idx: Optional[int] = None, layerwise_recompute: bool = True): super().__init__() self.config = config @@ -661,7 +661,7 @@ def __init__(self, config: Qwen2MoeConfig, layer_idx: Optional[int] = None, laye bias_attr=False, ) - self.rotary_emb = Qwen2MoeRotaryEmbedding( + self.rotary_emb = QWen2MoeRotaryEmbedding( self.head_dim, max_position_embeddings=self.max_position_embeddings, base=self.rope_theta, @@ -794,22 +794,22 @@ def forward( return outputs -class Qwen2MoeDecoderLayer(nn.Module): - def __init__(self, config: Qwen2MoeConfig, layer_idx: int, layerwise_recompute: bool = False): +class QWen2MoeDecoderLayer(nn.Layer): + def __init__(self, config: QWen2MoeConfig, layer_idx: int, layerwise_recompute: bool = False): super().__init__() self.config = config self.hidden_size = config.hidden_size - self.self_attn = Qwen2MoeAttention(config, layer_idx) + self.self_attn = QWen2MoeAttention(config, layer_idx) if config.num_experts > 0 and (layer_idx + 1) % config.decoder_sparse_step == 0: - self.mlp = Qwen2MoeSparseMoeBlock(config) + self.mlp = QWen2MoeSparseMoeBlock(config) else: # num_experts == 0 or this layer is not sparse layer - self.mlp = Qwen2MoeMLP(config, intermediate_size=config.intermediate_size) + self.mlp = QWen2MoeMLP(config, intermediate_size=config.intermediate_size) - self.input_layernorm = Qwen2MoeRMSNorm(config.hidden_size, eps=config.rms_norm_eps) - self.post_attention_layernorm = Qwen2MoeRMSNorm(config.hidden_size, eps=config.rms_norm_eps) + self.input_layernorm = QWen2MoeRMSNorm(config) + self.post_attention_layernorm = QWen2MoeRMSNorm(config) self.sequence_parallel = config.sequence_parallel # Note that we will actually perform a recompute only if both enable_recompute and layerwise_recompute are set to True @@ -914,14 +914,14 @@ def forward( return outputs -class Qwen2MoePretrainedModel(PretrainedModel): - config_class = Qwen2MoeConfig - base_model_prefix = "model" +class QWen2MoePretrainedModel(PretrainedModel): + config_class = QWen2MoeConfig + base_model_prefix = "qwen2moe" supports_gradient_checkpointing = True _keys_to_ignore_on_load_unexpected = [r"self_attn.rotary_emb.inv_freq"] @classmethod - def _get_name_mappings(cls, config: Qwen2MoeConfig) -> list[StateDictNameMapping]: + def _get_name_mappings(cls, config: QWen2MoeConfig) -> list[StateDictNameMapping]: mappings: list[StateDictNameMapping] = [] model_mappings = [ ["embed_tokens.weight"], @@ -939,7 +939,7 @@ def _get_name_mappings(cls, config: Qwen2MoeConfig) -> list[StateDictNameMapping ] model_mappings.extend(layer_mappings) - for expert_idx in range(config.num_local_experts): + for expert_idx in range(config.num_experts): expert_mappings = [ [f"layers.{layer_index}.block_sparse_moe.experts.{expert_idx}.w1.weight", None, "transpose"], [f"layers.{layer_index}.block_sparse_moe.experts.{expert_idx}.w2.weight", None, "transpose"], @@ -960,7 +960,7 @@ def _get_name_mappings(cls, config: Qwen2MoeConfig) -> list[StateDictNameMapping return mappings @classmethod - def _get_tensor_parallel_mappings(cls, config: Qwen2MoeConfig, is_split=True): + def _get_tensor_parallel_mappings(cls, config: QWen2MoeConfig, is_split=True): from paddlenlp.transformers.conversion_utils import split_or_merge_func fn = split_or_merge_func( @@ -970,7 +970,7 @@ def _get_tensor_parallel_mappings(cls, config: Qwen2MoeConfig, is_split=True): 
num_attention_heads=config.num_attention_heads, ) - def get_tensor_parallel_split_mappings(num_layers, num_local_experts): + def get_tensor_parallel_split_mappings(num_layers, num_experts): final_actions = {} base_actions = { @@ -1006,13 +1006,13 @@ def get_tensor_parallel_split_mappings(num_layers, num_local_experts): for key, action in base_actions.items(): for i in range(num_layers): newkey = key.replace("layers.0.", f"layers.{i}.") - for j in range(num_local_experts): + for j in range(num_experts): newkey2 = newkey.replace("experts.0.", f"experts.{j}.") final_actions[newkey2] = action return final_actions - mappings = get_tensor_parallel_split_mappings(config.num_hidden_layers, config.num_local_experts) + mappings = get_tensor_parallel_split_mappings(config.num_hidden_layers, config.num_experts) return mappings @@ -1028,7 +1028,7 @@ def _init_weights(self, layer): mpu.VocabParallelEmbedding, mpu.ColumnParallelLinear, mpu.RowParallelLinear, - Qwen2MoeLMHead, + QWen2MoeLMHead, ColumnSequenceParallelLinear, RowSequenceParallelLinear, ), @@ -1061,23 +1061,23 @@ def _init_weights(self, layer): # sublayer is init first # scale RowParallelLinear weight with paddle.no_grad(): - if isinstance(layer, Qwen2MoeMLP): + if isinstance(layer, QWen2MoeMLP): factor = 1 / math.sqrt(2 * self.config.num_hidden_layers) layer.w2.weight.scale_(factor) - if isinstance(layer, Qwen2MoeAttention): + if isinstance(layer, QWen2MoeAttention): factor = 1 / math.sqrt(2 * self.config.num_hidden_layers) layer.o_proj.weight.scale_(factor) @register_base_model -class Qwen2MoeModel(Qwen2MoePretrainedModel): +class QWen2MoeModel(QWen2MoePretrainedModel): """ - Transformer decoder consisting of *config.num_hidden_layers* layers. Each layer is a [`Qwen2MoeDecoderLayer`] + Transformer decoder consisting of *config.num_hidden_layers* layers. Each layer is a [`QWen2MoeDecoderLayer`] Args: - config: Qwen2MoeConfig + config: QWen2MoeConfig """ - def __init__(self, config: Qwen2MoeConfig): + def __init__(self, config: QWen2MoeConfig): super().__init__(config) self.padding_idx = config.pad_token_id self.vocab_size = config.vocab_size @@ -1102,12 +1102,12 @@ def __init__(self, config: Qwen2MoeConfig): self.layers = nn.LayerList( [ - Qwen2MoeDecoderLayer(config, layer_idx not in self.no_recompute_layers) + QWen2MoeDecoderLayer(config, layer_idx not in self.no_recompute_layers) for layer_idx in range(config.num_hidden_layers) ] ) self._attn_implementation = config._attn_implementation - self.norm = Qwen2MoeRMSNorm(config.hidden_size, eps=config.rms_norm_eps) + self.norm = QWen2MoeRMSNorm(config) def get_input_embeddings(self): return self.embed_tokens @@ -1346,14 +1346,14 @@ def forward( ) -class Qwen2MoePretrainingCriterion(nn.Layer): +class QWen2MoePretrainingCriterion(nn.Layer): """ Criterion for Mixtral. It calculates the final loss. 
""" def __init__(self, config): - super(Qwen2MoePretrainingCriterion, self).__init__() + super(QWen2MoePretrainingCriterion, self).__init__() self.ignore_index = getattr(config, "ignore_index", -100) self.config = config self.enable_parallel_cross_entropy = config.tensor_parallel_degree > 1 and config.tensor_parallel_output @@ -1381,9 +1381,9 @@ def forward(self, prediction_scores, masked_lm_labels): return loss -class Qwen2MoeLMHead(nn.Layer): - def __init__(self, config: Qwen2MoeConfig): - super(Qwen2MoeLMHead, self).__init__() +class QWen2MoeLMHead(nn.Layer): + def __init__(self, config: QWen2MoeConfig): + super(QWen2MoeLMHead, self).__init__() self.config = config if config.tensor_parallel_degree > 1 and config.vocab_size % config.tensor_parallel_degree == 0: vocab_size = config.vocab_size // config.tensor_parallel_degree @@ -1412,7 +1412,7 @@ def forward(self, hidden_states, tensor_parallel_output=None): return logits -class Qwen2MoeForCausalLM(Qwen2MoePretrainedModel): +class QWen2MoeForCausalLM(QWen2MoePretrainedModel): enable_to_static_method = True _tied_weights_keys = ["lm_head.weight"] @@ -1420,9 +1420,9 @@ def __init__(self, config): super().__init__(config) self.config = config - self.model = Qwen2MoeModel(config) - self.lm_head = Qwen2MoeLMHead(config) - self.criterion = Qwen2MoePretrainingCriterion(config) + self.model = QWen2MoeModel(config) + self.lm_head = QWen2MoeLMHead(config) + self.criterion = QWen2MoePretrainingCriterion(config) self.router_aux_loss_coef = config.router_aux_loss_coef self.num_experts = config.num_experts self.num_experts_per_tok = config.num_experts_per_tok From d514dff888abab517e38e4222075f1abc9428270 Mon Sep 17 00:00:00 2001 From: DrownFish19 Date: Thu, 18 Apr 2024 13:11:33 +0000 Subject: [PATCH 04/41] update modeling --- paddlenlp/transformers/qwen2moe/modeling.py | 330 +++++++++----------- 1 file changed, 148 insertions(+), 182 deletions(-) diff --git a/paddlenlp/transformers/qwen2moe/modeling.py b/paddlenlp/transformers/qwen2moe/modeling.py index 0e46c554ef91..52d67c20e94f 100644 --- a/paddlenlp/transformers/qwen2moe/modeling.py +++ b/paddlenlp/transformers/qwen2moe/modeling.py @@ -342,6 +342,7 @@ def _expand_2d_mask(mask, dtype, tgt_length): return expanded_mask +# Copied from transformers.models.llama.modeling_llama.LlamaRMSNorm with Llama->Qwen2Moe class QWen2MoeRMSNorm(nn.Layer): def __init__(self, config): super().__init__() @@ -373,19 +374,7 @@ def forward(self, hidden_states): return hidden_states * self.weight -def repeat_kv(hidden_states: paddle.Tensor, n_rep: int) -> paddle.Tensor: - """ - This is the equivalent of paddle.repeat_interleave(hidden_states, n_rep, axis=1). The hidden states go from (batch, - num_key_value_heads, seqlen, head_dim) to (batch, num_attention_heads, seqlen, head_dim) - """ - batch, slen, num_key_value_heads, head_dim = hidden_states.shape - if n_rep == 1: - return hidden_states - - hidden_states = hidden_states.unsqueeze(-2).tile([1, 1, 1, n_rep, 1]) - return hidden_states.reshape([batch, slen, num_key_value_heads * n_rep, head_dim]) - - +# Copied from transformers.models.mistral.modeling_mistral.MistralRotaryEmbedding with Mistral->Qwen2Moe class QWen2MoeRotaryEmbedding(nn.Layer): def __init__(self, dim, max_position_embeddings=2048, base=10000): super().__init__() @@ -429,8 +418,28 @@ def rotate_half(x): def apply_rotary_pos_emb(q, k, cos, sin, position_ids): + """Applies Rotary Position Embedding to the query and key tensors. + + Args: + q (`torch.Tensor`): The query tensor. 
+ k (`torch.Tensor`): The key tensor. + cos (`torch.Tensor`): The cosine part of the rotary embedding. + sin (`torch.Tensor`): The sine part of the rotary embedding. + position_ids (`torch.Tensor`): + The position indices of the tokens corresponding to the query and key tensors. For example, this can be + used to pass offsetted position ids when working with a KV-cache. + unsqueeze_dim (`int`, *optional*, defaults to 1): + The 'unsqueeze_dim' argument specifies the dimension along which to unsqueeze cos[position_ids] and + sin[position_ids] so that they can be properly broadcasted to the dimensions of q and k. For example, note + that cos[position_ids] and sin[position_ids] have the shape [batch_size, seq_len, head_dim]. Then, if q and + k have the shape [batch_size, heads, seq_len, head_dim], then setting unsqueeze_dim=1 makes + cos[position_ids] and sin[position_ids] broadcastable to the shapes of q and k. Similarly, if q and k have + the shape [batch_size, seq_len, heads, head_dim], then set unsqueeze_dim=2. + Returns: + `tuple(torch.Tensor)` comprising of the query and key tensors rotated using the Rotary Position Embedding. + """ if position_ids is None: - # Note: Only for MixtralForCausalLMPipe model pretraining + # Note: Only for QWen2MoeForCausalLMPipe model pretraining cos = cos[:, : q.shape[1], :, :] # [bs, seq_len, 1, dim] sin = sin[:, : q.shape[1], :, :] # [bs, seq_len, 1, dim] else: @@ -443,17 +452,14 @@ def apply_rotary_pos_emb(q, k, cos, sin, position_ids): return q_embed, k_embed +# Modified from transformers.models.mistral.modeling_mistral.MistralMLP with Mistral->Qwen2Moe class QWen2MoeMLP(nn.Layer): - def __init__(self, config): + def __init__(self, config, is_shared=False): super().__init__() self.hidden_size = config.hidden_size - self.intermediate_size = config.intermediate_size + self.intermediate_size = config.intermediate_size if not is_shared else config.shared_expert_intermediate_size self.tensor_parallel_degree = config.tensor_parallel_degree - self.gate_proj = nn.Linear(self.hidden_size, self.intermediate_size, bias_attr=False) # w1 - self.down_proj = nn.Linear(self.intermediate_size, self.hidden_size, bias_attr=False) # w2 - self.up_proj = nn.Linear(self.hidden_size, self.intermediate_size, bias_attr=False) # w3 - if config.sequence_parallel: ColumnParallelLinear = ColumnSequenceParallelLinear RowParallelLinear = RowSequenceParallelLinear @@ -481,9 +487,9 @@ def __init__(self, config): has_bias=False, ) else: - self.gate_proj = nn.Linear(self.hidden_size, self.intermediate_size, bias_attr=False) - self.down_proj = nn.Linear(self.intermediate_size, self.hidden_size, bias_attr=False) - self.up_proj = nn.Linear(self.hidden_size, self.intermediate_size, bias_attr=False) + self.gate_proj = nn.Linear(self.hidden_size, self.intermediate_size, bias_attr=False) # w1 + self.up_proj = nn.Linear(self.hidden_size, self.intermediate_size, bias_attr=False) # w3 + self.down_proj = nn.Linear(self.intermediate_size, self.hidden_size, bias_attr=False) # w2 self.act_fn = ACT2FN[config.hidden_act] @@ -491,77 +497,31 @@ def forward(self, x): return self.down_proj(self.act_fn(self.gate_proj(x)) * self.up_proj(x)) -class QWen2MoeSparseMoeBlock(nn.Layer): - def __init__(self, config: QWen2MoeConfig): - super().__init__() - self.hidden_dim = config.hidden_size - self.ffn_dim = config.intermediate_size - self.num_experts = config.num_experts - self.top_k = config.num_experts_per_tok - self.gate = nn.Linear(self.hidden_dim, self.num_experts, bias_attr=False) - self.experts = 
nn.LayerList([QWen2MoeMLP(config) for _ in range(self.num_experts)]) - - def forward(self, hidden_states): - batch_size, seq_len, hidden_dim = hidden_states.shape - hidden_states = hidden_states.reshape([-1, hidden_dim]) - # router_logits: [batch_size * seq_len, num_experts] - router_logits = self.gate(hidden_states) - - with paddle.amp.auto_cast(False): - routing_weights = F.softmax(router_logits.astype("float32"), axis=1) - routing_weights, selected_experts = paddle.topk(routing_weights, self.top_k, axis=-1) - routing_weights /= routing_weights.sum(axis=-1, keepdim=True) - # we cast back to input dtype - routing_weights = routing_weights.astype(hidden_states.dtype) - - final_hidden_states = paddle.zeros( - [batch_size * seq_len, hidden_dim], - dtype=hidden_states.dtype, - ) - - # One hot encode the selected experts to create an expert mask - # this will be used to easily index which expert is going to be sollicitated. - # shape: [num_experts, top_k, batch_size * seq_len] - expert_mask = F.one_hot(selected_experts, num_classes=self.num_experts).transpose([2, 1, 0]) - - # Loop over all available experts in the model and perform the computation on each expert. - for expert_id in range(self.num_experts): - expert_layer = self.experts[expert_id] - idx, top_x = paddle.where(expert_mask[expert_id]) - - if top_x.shape[0] == 0: - continue - - current_state = paddle.gather(hidden_states, top_x.squeeze()) - current_hidden_states = expert_layer(current_state) * routing_weights[top_x, idx] - - top_x = top_x.squeeze() - if top_x.shape == []: - top_x = paddle.to_tensor([top_x.item()]) - final_hidden_states.index_add_(top_x, 0, current_hidden_states.astype(hidden_states.dtype)) +# Copied from transformers.models.llama.modeling_llama.repeat_kv +def repeat_kv(hidden_states: paddle.Tensor, n_rep: int) -> paddle.Tensor: + """ + This is the equivalent of paddle.repeat_interleave(hidden_states, n_rep, axis=1). The hidden states go from (batch, + num_key_value_heads, seqlen, head_dim) to (batch, num_attention_heads, seqlen, head_dim) + """ + batch, slen, num_key_value_heads, head_dim = hidden_states.shape + if n_rep == 1: + return hidden_states - final_hidden_states = final_hidden_states.reshape([batch_size, seq_len, hidden_dim]) - return final_hidden_states, router_logits + hidden_states = hidden_states.unsqueeze(-2).tile([1, 1, 1, n_rep, 1]) + return hidden_states.reshape([batch, slen, num_key_value_heads * n_rep, head_dim]) +# Copied from transformers.models.qwen2.modeling_qwen2.Qwen2Attention with Qwen2->Qwen2Moe class QWen2MoeAttention(nn.Layer): """ - Multi-headed attention from 'Attention Is All You Need' paper. - Modified to use sliding window attention: Longformer and "Generating Long Sequences with Sparse Transformers". + Multi-headed attention from 'Attention Is All You Need' paper. Modified to use sliding window attention: Longformer + and "Generating Long Sequences with Sparse Transformers". """ - def __init__(self, config: QWen2MoeConfig, layer_idx: Optional[int] = None, layerwise_recompute: bool = True): + def __init__(self, config: QWen2MoeConfig, layerwise_recompute: bool = True): super().__init__() self.config = config - self.layer_idx = layer_idx - if layer_idx is None: - logger.warning_once( - f"Instantiating {self.__class__.__name__} without passing `layer_idx` is not recommended and will " - "to errors during the forward call, if caching is used. Please make sure to provide a `layer_idx` " - "when creating this class." 
- ) - self.hidden_size = config.hidden_size self.num_heads = config.num_attention_heads @@ -571,8 +531,8 @@ def __init__(self, config: QWen2MoeConfig, layer_idx: Optional[int] = None, laye assert config.num_attention_heads // config.num_key_value_heads self.num_key_value_groups = config.num_attention_heads // config.num_key_value_heads self.gqa_or_mqa = config.num_attention_heads != config.num_key_value_heads - self.rope_theta = config.rope_theta self.max_position_embeddings = config.max_position_embeddings + self.rope_theta = config.rope_theta self.is_causal = True self.attention_dropout = config.attention_dropout @@ -612,54 +572,19 @@ def __init__(self, config: QWen2MoeConfig, layer_idx: Optional[int] = None, laye RowParallelLinear = fleet.meta_parallel.RowParallelLinear if config.tensor_parallel_degree > 1: - self.q_proj = ColumnParallelLinear( - self.hidden_size, - self.hidden_size, - has_bias=True, - gather_output=False, - ) + self.q_proj = ColumnParallelLinear(self.hidden_size, self.hidden_size, has_bias=True, gather_output=False) self.k_proj = ColumnParallelLinear( - self.hidden_size, - self.config.num_key_value_heads * self.head_dim, - has_bias=True, - gather_output=False, + self.hidden_size, self.num_key_value_heads * self.head_dim, has_bias=True, gather_output=False ) self.v_proj = ColumnParallelLinear( - self.hidden_size, - self.config.num_key_value_heads * self.head_dim, - has_bias=True, - gather_output=False, - ) - else: - self.q_proj = nn.Linear( - self.hidden_size, - self.hidden_size, - bias_attr=True, - ) - self.k_proj = nn.Linear( - self.hidden_size, - self.config.num_key_value_heads * self.head_dim, - bias_attr=True, - ) - self.v_proj = nn.Linear( - self.hidden_size, - self.config.num_key_value_heads * self.head_dim, - bias_attr=True, - ) - - if config.tensor_parallel_degree > 1: - self.o_proj = RowParallelLinear( - self.hidden_size, - self.hidden_size, - has_bias=False, - input_is_parallel=True, + self.hidden_size, self.num_key_value_heads * self.head_dim, has_bias=True, gather_output=False ) + self.o_proj = RowParallelLinear(self.hidden_size, self.hidden_size, has_bias=False, input_is_parallel=True) else: - self.o_proj = nn.Linear( - self.hidden_size, - self.hidden_size, - bias_attr=False, - ) + self.q_proj = nn.Linear(self.hidden_size, self.hidden_size, bias_attr=True) + self.k_proj = nn.Linear(self.hidden_size, self.num_key_value_heads * self.head_dim, bias_attr=True) + self.v_proj = nn.Linear(self.hidden_size, self.num_key_value_heads * self.head_dim, bias_attr=True) + self.o_proj = nn.Linear(self.hidden_size, self.hidden_size, bias_attr=False) self.rotary_emb = QWen2MoeRotaryEmbedding( self.head_dim, @@ -680,6 +605,8 @@ def forward( """Input shape: Batch x Time x Channel""" # [bs, seq_len, num_head * head_dim] -> [seq_len / n, bs, num_head * head_dim] (n is model parallelism) + batch_size, seq_len, _ = hidden_states.shape + query_states = self.q_proj(hidden_states) key_states = self.k_proj(hidden_states) value_states = self.v_proj(hidden_states) @@ -697,14 +624,6 @@ def forward( kv_seq_len = key_states.shape[-3] if past_key_value is not None: - if self.layer_idx is None: - raise ValueError( - f"The cache structure has changed since version v4.36. If you are using {self.__class__.__name__} " - "for auto-regressive decoding with k/v caching, please make sure to initialize the attention class " - "with a layer index." 
- ) - # TODO layer_idx - # kv_seq_len += past_key_value.get_usable_length(kv_seq_len, self.layer_idx) kv_seq_len += past_key_value[0].shape[-3] if self.use_fused_rope: @@ -725,12 +644,8 @@ def forward( # [bs, seq_len, num_head, head_dim] if past_key_value is not None: - # cache_kwargs = {"sin": sin, "cos": cos} # Specific to RoPE models - # key_states, value_states = past_key_value.update(key_states, value_states, self.layer_idx, cache_kwargs) - # reuse k, v, self_attention key_states = paddle.concat([past_key_value[0], key_states], axis=1) value_states = paddle.concat([past_key_value[1], value_states], axis=1) - past_key_value = (key_states, value_states) if use_cache else None # TODO(wj-Mcat): use broadcast strategy when n_kv_heads = 1 @@ -794,19 +709,80 @@ def forward( return outputs +class QWen2MoeSparseMoeBlock(nn.Layer): + def __init__(self, config: QWen2MoeConfig): + super().__init__() + self.num_experts = config.num_experts + self.top_k = config.num_experts_per_tok + self.norm_topk_prob = config.norm_topk_prob + + self.gate = nn.Linear(config.hidden_size, self.num_experts, bias_attr=False) + self.experts = nn.LayerList([QWen2MoeMLP(config) for _ in range(self.num_experts)]) + + self.shared_expert = QWen2MoeMLP(config, is_shared=True) + self.shared_expert_gate = nn.Linear(config.hidden_size, 1, bias_attr=False) + + def forward(self, hidden_states): + batch_size, seq_len, hidden_dim = hidden_states.shape + hidden_states = hidden_states.reshape([-1, hidden_dim]) + # router_logits: [batch_size * seq_len, num_experts] + router_logits = self.gate(hidden_states) + + with paddle.amp.auto_cast(False): + routing_weights = F.softmax(router_logits.astype("float32"), axis=1) + routing_weights, selected_experts = paddle.topk(routing_weights, self.top_k, axis=-1) + if self.norm_topk_prob: # Note: Mixtral is set norm as default, QWen2MoE is set to no norm + routing_weights /= routing_weights.sum(axis=-1, keepdim=True) + # we cast back to input dtype + routing_weights = routing_weights.astype(hidden_states.dtype) + + final_hidden_states = paddle.zeros( + [batch_size * seq_len, hidden_dim], + dtype=hidden_states.dtype, + ) + + # One hot encode the selected experts to create an expert mask + # this will be used to easily index which expert is going to be sollicitated. + # shape: [num_experts, top_k, batch_size * seq_len] + expert_mask = F.one_hot(selected_experts, num_classes=self.num_experts).transpose([2, 1, 0]) + + # Loop over all available experts in the model and perform the computation on each expert. 
+ for expert_id in range(self.num_experts): + expert_layer = self.experts[expert_id] + idx, top_x = paddle.where(expert_mask[expert_id]) + + if top_x.shape[0] == 0: + continue + + current_state = paddle.gather(hidden_states, top_x.squeeze()) + current_hidden_states = expert_layer(current_state) * routing_weights[top_x, idx] + + top_x = top_x.squeeze() + if top_x.shape == []: + top_x = paddle.to_tensor([top_x.item()]) + final_hidden_states.index_add_(top_x, 0, current_hidden_states.astype(hidden_states.dtype)) + + shared_expert_output = self.shared_expert(hidden_states) + shared_expert_output = F.sigmoid(self.shared_expert_gate(hidden_states)) * shared_expert_output + + final_hidden_states = final_hidden_states + shared_expert_output + + final_hidden_states = final_hidden_states.reshape([batch_size, seq_len, hidden_dim]) + return final_hidden_states, router_logits + + class QWen2MoeDecoderLayer(nn.Layer): - def __init__(self, config: QWen2MoeConfig, layer_idx: int, layerwise_recompute: bool = False): + def __init__(self, config: QWen2MoeConfig, layerwise_recompute: bool = False): super().__init__() self.config = config - self.hidden_size = config.hidden_size - self.self_attn = QWen2MoeAttention(config, layer_idx) + self.self_attn = QWen2MoeAttention(config, layerwise_recompute) - if config.num_experts > 0 and (layer_idx + 1) % config.decoder_sparse_step == 0: + if config.num_experts > 0: self.mlp = QWen2MoeSparseMoeBlock(config) else: # num_experts == 0 or this layer is not sparse layer - self.mlp = QWen2MoeMLP(config, intermediate_size=config.intermediate_size) + self.mlp = QWen2MoeMLP(config) self.input_layernorm = QWen2MoeRMSNorm(config) self.post_attention_layernorm = QWen2MoeRMSNorm(config) @@ -848,6 +824,7 @@ def forward( # [bs * seq_len, embed_dim] -> [seq_len * bs / n, embed_dim] (sequence_parallel) residual = hidden_states + hidden_states = self.input_layernorm(hidden_states) # Self Attention @@ -894,7 +871,12 @@ def forward( # Fully Connected residual = hidden_states hidden_states = self.post_attention_layernorm(hidden_states) - hidden_states, router_logits = self.block_sparse_moe(hidden_states) + hidden_states = self.mlp(hidden_states) + if isinstance(hidden_states, tuple): + hidden_states, router_logits = hidden_states + else: + router_logits = None + hidden_states = residual + hidden_states outputs = (hidden_states,) @@ -917,7 +899,6 @@ def forward( class QWen2MoePretrainedModel(PretrainedModel): config_class = QWen2MoeConfig base_model_prefix = "qwen2moe" - supports_gradient_checkpointing = True _keys_to_ignore_on_load_unexpected = [r"self_attn.rotary_emb.inv_freq"] @classmethod @@ -941,19 +922,19 @@ def _get_name_mappings(cls, config: QWen2MoeConfig) -> list[StateDictNameMapping for expert_idx in range(config.num_experts): expert_mappings = [ - [f"layers.{layer_index}.block_sparse_moe.experts.{expert_idx}.w1.weight", None, "transpose"], - [f"layers.{layer_index}.block_sparse_moe.experts.{expert_idx}.w2.weight", None, "transpose"], - [f"layers.{layer_index}.block_sparse_moe.experts.{expert_idx}.w3.weight", None, "transpose"], + [f"layers.{layer_index}.mlp.experts.{expert_idx}.gate_proj.weight", None, "transpose"], + [f"layers.{layer_index}.mlp.experts.{expert_idx}.down_proj.weight", None, "transpose"], + [f"layers.{layer_index}.mlp.experts.{expert_idx}.up_proj.weight", None, "transpose"], ] model_mappings.extend(expert_mappings) model_mappings.append([f"layers.{layer_index}.block_sparse_moe.gate.weight", None, "transpose"]) init_name_mappings(mappings=model_mappings) - # 
base-model prefix "MixtralModel" - if "MixtralModel" not in config.architectures: + # base-model prefix "QWen2MoeModel" + if "QWen2MoeModel" not in config.architectures: for mapping in model_mappings: mapping[0] = "model." + mapping[0] - mapping[1] = "mixtral." + mapping[1] + mapping[1] = "qwen2moe." + mapping[1] model_mappings.append(["lm_head.weight", "lm_head.weight", "transpose"]) mappings = [StateDictNameMapping(*mapping, index=index) for index, mapping in enumerate(model_mappings)] @@ -999,9 +980,9 @@ def get_tensor_parallel_split_mappings(num_layers, num_experts): # Add tp split for expert params. base_actions = { - "layers.0.block_sparse_moe.experts.0.w1.weight": partial(fn, is_column=True), - "layers.0.block_sparse_moe.experts.0.w2.weight": partial(fn, is_column=False), - "layers.0.block_sparse_moe.experts.0.w3.weight": partial(fn, is_column=True), + "layers.0.mlp.experts.0.gate_proj.weight": partial(fn, is_column=True), + "layers.0.mlp.experts.0.down_proj.weight": partial(fn, is_column=False), + "layers.0.mlp.experts.0.up_proj.weight": partial(fn, is_column=True), } for key, action in base_actions.items(): for i in range(num_layers): @@ -1063,7 +1044,7 @@ def _init_weights(self, layer): with paddle.no_grad(): if isinstance(layer, QWen2MoeMLP): factor = 1 / math.sqrt(2 * self.config.num_hidden_layers) - layer.w2.weight.scale_(factor) + layer.down_proj.weight.scale_(factor) if isinstance(layer, QWen2MoeAttention): factor = 1 / math.sqrt(2 * self.config.num_hidden_layers) layer.o_proj.weight.scale_(factor) @@ -1102,11 +1083,10 @@ def __init__(self, config: QWen2MoeConfig): self.layers = nn.LayerList( [ - QWen2MoeDecoderLayer(config, layer_idx not in self.no_recompute_layers) + QWen2MoeDecoderLayer(config, layerwise_recompute=layer_idx not in self.no_recompute_layers) for layer_idx in range(config.num_hidden_layers) ] ) - self._attn_implementation = config._attn_implementation self.norm = QWen2MoeRMSNorm(config) def get_input_embeddings(self): @@ -1193,12 +1173,13 @@ def forward( raise ValueError("We currently only support sequence parallel without cache.") output_attentions = output_attentions if output_attentions is not None else self.config.output_attentions - output_hidden_states = ( - output_hidden_states if output_hidden_states is not None else self.config.output_hidden_states - ) + output_router_logits = ( output_router_logits if output_router_logits is not None else self.config.output_router_logits ) + output_hidden_states = ( + output_hidden_states if output_hidden_states is not None else self.config.output_hidden_states + ) use_cache = use_cache if use_cache is not None else self.config.use_cache return_dict = return_dict if return_dict is not None else self.config.use_return_dict @@ -1213,8 +1194,6 @@ def forward( else: raise ValueError("You have to specify either decoder_input_ids or decoder_inputs_embeds") - past_key_values_length = 0 - if past_key_values is None: past_key_values = tuple([None] * len(self.layers)) # NOTE: to make cache can be clear in-time @@ -1223,24 +1202,8 @@ def forward( seq_length_with_past = seq_length cache_length = 0 if past_key_values[0] is not None: - cache_length = paddle.shape(past_key_values[0][0])[1] + cache_length = past_key_values[0][0].shape[1] seq_length_with_past += cache_length - - # if use_cache: - # use_legacy_cache = not isinstance(past_key_values, Cache) - # if use_legacy_cache: - # past_key_values = DynamicCache.from_legacy_cache(past_key_values) - # past_key_values_length = past_key_values.get_usable_length(seq_length) - - if 
position_ids is None: - device = input_ids.device if input_ids is not None else inputs_embeds.device - position_ids = paddle.arange( - past_key_values_length, seq_length + past_key_values_length, dtype=paddle.long, device=device - ) - position_ids = position_ids.unsqueeze(0).view(-1, seq_length) - else: - position_ids = position_ids.view(-1, seq_length).long() - if inputs_embeds is None: inputs_embeds = self.embed_tokens(input_ids) @@ -1427,7 +1390,10 @@ def __init__(self, config): self.num_experts = config.num_experts self.num_experts_per_tok = config.num_experts_per_tok # Initialize weights and apply final processing - self.post_init() + + if config.sliding_window: + self.config.sliding_window = False + logger.warning("We do not support sliding window attention for now.") def get_input_embeddings(self): return self.model.embed_tokens From 1e9832325f65055745acfdcfd0e8eb28ed40ff09 Mon Sep 17 00:00:00 2001 From: DrownFish19 Date: Fri, 19 Apr 2024 09:42:52 +0000 Subject: [PATCH 05/41] update ckpt name --- paddlenlp/transformers/qwen2moe/modeling.py | 12 ++++++------ 1 file changed, 6 insertions(+), 6 deletions(-) diff --git a/paddlenlp/transformers/qwen2moe/modeling.py b/paddlenlp/transformers/qwen2moe/modeling.py index 52d67c20e94f..470c4cc1b9b4 100644 --- a/paddlenlp/transformers/qwen2moe/modeling.py +++ b/paddlenlp/transformers/qwen2moe/modeling.py @@ -1383,7 +1383,7 @@ def __init__(self, config): super().__init__(config) self.config = config - self.model = QWen2MoeModel(config) + self.qwen2moe = QWen2MoeModel(config) self.lm_head = QWen2MoeLMHead(config) self.criterion = QWen2MoePretrainingCriterion(config) self.router_aux_loss_coef = config.router_aux_loss_coef @@ -1396,10 +1396,10 @@ def __init__(self, config): logger.warning("We do not support sliding window attention for now.") def get_input_embeddings(self): - return self.model.embed_tokens + return self.qwen2moe.embed_tokens def set_input_embeddings(self, value): - self.model.embed_tokens = value + self.qwen2moe.embed_tokens = value def get_output_embeddings(self): return self.lm_head @@ -1408,10 +1408,10 @@ def set_output_embeddings(self, new_embeddings): self.lm_head = new_embeddings def set_decoder(self, decoder): - self.model = decoder + self.qwen2moe = decoder def get_decoder(self): - return self.model + return self.qwen2moe def prepare_inputs_for_generation( self, @@ -1500,7 +1500,7 @@ def forward( return_dict = return_dict if return_dict is not None else self.config.use_return_dict # decoder outputs consists of (dec_features, layer_state, dec_hidden, dec_attn) - outputs = self.model( + outputs = self.qwen2moe( input_ids=input_ids, # [bs, seq_len] position_ids=position_ids, attention_mask=attention_mask, From 37dd2d5a8582465a335269a9a412ce5aa0abc897 Mon Sep 17 00:00:00 2001 From: DrownFish19 Date: Thu, 25 Apr 2024 03:34:58 +0000 Subject: [PATCH 06/41] support same prefix model name for auto modeling --- paddlenlp/transformers/auto/modeling.py | 9 +++++++-- 1 file changed, 7 insertions(+), 2 deletions(-) diff --git a/paddlenlp/transformers/auto/modeling.py b/paddlenlp/transformers/auto/modeling.py index 449d214f36c7..248fe2cbcd0d 100644 --- a/paddlenlp/transformers/auto/modeling.py +++ b/paddlenlp/transformers/auto/modeling.py @@ -216,15 +216,20 @@ def _get_model_class_from_config(cls, pretrained_model_name_or_path, config_file else: init_class = config.pop("init_class", None) init_class = init_class[:-5] if init_class is not None and init_class.endswith("Model") else init_class + + # Sort the MAPPING_NAMES to reorder the 
model class names with longest-first rule + # thus the names with same prefix can be correctly inferred + # such as QWen and QWen2MOE, QWen2MOE is the longest prefix of QWen2MOEModel model_name = None + SORTED_MAPPING_NAMES = dict(sorted(MAPPING_NAMES.items(), key=lambda x: len(x[0]), reverse=True)) if init_class: - for model_flag, name in MAPPING_NAMES.items(): + for model_flag, name in SORTED_MAPPING_NAMES.items(): if model_flag in init_class: model_name = model_flag + "Model" break else: # From pretrained_model_name_or_path - for model_flag, name in MAPPING_NAMES.items(): + for model_flag, name in SORTED_MAPPING_NAMES.items(): if name in pretrained_model_name_or_path.lower(): model_name = model_flag + "Model" break From d12938a2302f721f1f3b17c1a02782d65df5c7ee Mon Sep 17 00:00:00 2001 From: DrownFish19 Date: Thu, 25 Apr 2024 04:17:02 +0000 Subject: [PATCH 07/41] update qwen2moe testing --- tests/transformers/qwen2moe/test_modeling.py | 40 ++++++++++---------- 1 file changed, 20 insertions(+), 20 deletions(-) diff --git a/tests/transformers/qwen2moe/test_modeling.py b/tests/transformers/qwen2moe/test_modeling.py index cf8e8a564785..26c5597fd231 100644 --- a/tests/transformers/qwen2moe/test_modeling.py +++ b/tests/transformers/qwen2moe/test_modeling.py @@ -18,7 +18,7 @@ import paddle -from paddlenlp.transformers import Qwen2MoeConfig, Qwen2MoeForCausalLM, Qwen2MoeModel +from paddlenlp.transformers import QWen2MoeConfig, QWen2MoeForCausalLM, QWen2MoeModel from tests.transformers.test_configuration_common import ConfigTester from tests.transformers.test_generation_utils import GenerationTesterMixin from tests.transformers.test_modeling_common import ( @@ -28,7 +28,7 @@ ) -class Qwen2MoeModelTester: +class QWen2MoeModelTester: def __init__( self, parent, @@ -62,7 +62,7 @@ def __init__( use_labels: bool = False, return_dict=False, ): - self.parent: Qwen2MoeModelTest = parent + self.parent: QWen2MoeModelTest = parent self.vocab_size = vocab_size self.hidden_size = hidden_size self.num_hidden_layers = num_hidden_layers @@ -113,8 +113,8 @@ def prepare_config_and_inputs(self): config = self.get_config() return config, input_ids, input_mask, sequence_labels, token_labels, choice_labels - def get_config(self) -> Qwen2MoeConfig: - return Qwen2MoeConfig( + def get_config(self) -> QWen2MoeConfig: + return QWen2MoeConfig( vocab_size=self.vocab_size, hidden_size=self.hidden_size, num_hidden_layers=self.num_hidden_layers, @@ -136,17 +136,17 @@ def get_config(self) -> Qwen2MoeConfig: ) def create_and_check_model( - self, config: Qwen2MoeConfig, input_ids, input_mask, sequence_labels, token_labels, choice_labels + self, config: QWen2MoeConfig, input_ids, input_mask, sequence_labels, token_labels, choice_labels ): - model = Qwen2MoeModel(config) + model = QWen2MoeModel(config) model.eval() result = model(input_ids) self.parent.assertEqual(result[0].shape, [self.batch_size, self.seq_length, self.hidden_size]) def create_and_check_model_attention_mask( - self, config: Qwen2MoeConfig, input_ids, input_mask, sequence_labels, token_labels, choice_labels + self, config: QWen2MoeConfig, input_ids, input_mask, sequence_labels, token_labels, choice_labels ): - model = Qwen2MoeModel(config) + model = QWen2MoeModel(config) model.eval() attn_mask_2d = random_attention_mask([self.batch_size, self.seq_length]) result_2d = model(input_ids, attention_mask=attn_mask_2d)[0] @@ -164,14 +164,14 @@ def create_and_check_model_attention_mask( def create_and_check_model_past_large_inputs( self, - config: Qwen2MoeConfig, + config: 
QWen2MoeConfig, input_ids, input_mask, sequence_labels, token_labels, choice_labels, ): - model = Qwen2MoeModel(config) + model = QWen2MoeModel(config) model.eval() # first forward pass @@ -226,7 +226,7 @@ def prepare_config_and_inputs_for_common(self): return config, inputs_dict def create_and_check_lm_head_model(self, config, input_ids, input_mask, *args): - model = Qwen2MoeForCausalLM(config) + model = QWen2MoeForCausalLM(config) model.eval() result = model( @@ -242,7 +242,7 @@ def create_and_check_lm_head_model(self, config, input_ids, input_mask, *args): self.parent.assertEqual(result[0].shape, [self.batch_size, self.seq_length, self.vocab_size]) def check_model_position_ids(self, config, input_ids, input_mask, *args): - model = Qwen2MoeForCausalLM(config) + model = QWen2MoeForCausalLM(config) model.eval() result_no_position_id = model( @@ -264,20 +264,20 @@ def check_model_position_ids(self, config, input_ids, input_mask, *args): self.parent.assertTrue((result_position_id[0] == result_no_position_id[0]).all()) -class Qwen2MoeModelTest(ModelTesterMixin, GenerationTesterMixin, unittest.TestCase): - base_model_class = Qwen2MoeModel +class QWen2MoeModelTest(ModelTesterMixin, GenerationTesterMixin, unittest.TestCase): + base_model_class = QWen2MoeModel return_dict = False use_labels = False use_test_model_name_list = False - all_model_classes = (Qwen2MoeModel, Qwen2MoeForCausalLM) - all_generative_model_classes = {Qwen2MoeForCausalLM: (Qwen2MoeModel, "mixtral")} + all_model_classes = (QWen2MoeModel, QWen2MoeForCausalLM) + all_generative_model_classes = {QWen2MoeForCausalLM: (QWen2MoeModel, "qwen2moe")} def setUp(self): super().setUp() - self.model_tester = Qwen2MoeModelTester(self) - self.config_tester = ConfigTester(self, config_class=Qwen2MoeConfig, vocab_size=256, hidden_size=24) + self.model_tester = QWen2MoeModelTester(self) + self.config_tester = ConfigTester(self, config_class=QWen2MoeConfig, vocab_size=256, hidden_size=24) def _get_input_ids_and_config(self): config, inputs_dict = self.model_tester.prepare_config_and_inputs_for_common() @@ -309,7 +309,7 @@ def test_generate_without_input_ids(self): # this requires 4-D attention mask logic, which is not supported yet pass - def test_mixtral_lm_head_model(self): + def test_qwen2moe_lm_head_model(self): config_and_inputs = self.model_tester.prepare_config_and_inputs() self.model_tester.create_and_check_lm_head_model(*config_and_inputs) From 8cc49fc65c61ca36ed5ff799c52a4bf0fce5853f Mon Sep 17 00:00:00 2001 From: DrownFish19 Date: Thu, 25 Apr 2024 04:17:56 +0000 Subject: [PATCH 08/41] update qwen2moe modeling and config --- paddlenlp/transformers/qwen2moe/configuration.py | 12 ++++++------ paddlenlp/transformers/qwen2moe/modeling.py | 10 +++++----- 2 files changed, 11 insertions(+), 11 deletions(-) diff --git a/paddlenlp/transformers/qwen2moe/configuration.py b/paddlenlp/transformers/qwen2moe/configuration.py index b344d70f7b94..08666c44d6e4 100644 --- a/paddlenlp/transformers/qwen2moe/configuration.py +++ b/paddlenlp/transformers/qwen2moe/configuration.py @@ -23,7 +23,7 @@ class QWen2MoeConfig(PretrainedConfig): r""" - This is the configuration class to store the configuration of a [`Qwen2MoeModel`]. It is used to instantiate a + This is the configuration class to store the configuration of a [`QWen2MoeModel`]. It is used to instantiate a Qwen2MoE model according to the specified arguments, defining the model architecture. 
Instantiating a configuration with the defaults will yield a similar configuration to that of Qwen1.5-MoE-A2.7B" [Qwen/Qwen1.5-MoE-A2.7B"](https://huggingface.co/Qwen/Qwen1.5-MoE-A2.7B"). @@ -35,7 +35,7 @@ class QWen2MoeConfig(PretrainedConfig): Args: vocab_size (`int`, *optional*, defaults to 151936): Vocabulary size of the Qwen2MoE model. Defines the number of different tokens that can be represented by the - `inputs_ids` passed when calling [`Qwen2MoeModel`] + `inputs_ids` passed when calling [`QWen2MoeModel`] hidden_size (`int`, *optional*, defaults to 2048): Dimension of the hidden representations. intermediate_size (`int`, *optional*, defaults to 5632): @@ -93,19 +93,19 @@ class QWen2MoeConfig(PretrainedConfig): The aux loss factor for the total loss. ```python - >>> from paddlenlp.transformers import Qwen2MoeModel, Qwen2MoeConfig + >>> from paddlenlp.transformers import QWen2MoeModel, QWen2MoeConfig >>> # Initializing a Qwen2MoE style configuration - >>> configuration = Qwen2MoeConfig() + >>> configuration = QWen2MoeConfig() >>> # Initializing a model from the Qwen1.5-MoE-A2.7B" style configuration - >>> model = Qwen2MoeModel(configuration) + >>> model = QWen2MoeModel(configuration) >>> # Accessing the model configuration >>> configuration = model.config ```""" - model_type = "qwen2_moe" + model_type = "qwen2moe" keys_to_ignore_at_inference = ["past_key_values"] def __init__( diff --git a/paddlenlp/transformers/qwen2moe/modeling.py b/paddlenlp/transformers/qwen2moe/modeling.py index 470c4cc1b9b4..b865c421dfd8 100644 --- a/paddlenlp/transformers/qwen2moe/modeling.py +++ b/paddlenlp/transformers/qwen2moe/modeling.py @@ -342,7 +342,7 @@ def _expand_2d_mask(mask, dtype, tgt_length): return expanded_mask -# Copied from transformers.models.llama.modeling_llama.LlamaRMSNorm with Llama->Qwen2Moe +# Copied from transformers.models.llama.modeling_llama.LlamaRMSNorm with Llama->QWen2Moe class QWen2MoeRMSNorm(nn.Layer): def __init__(self, config): super().__init__() @@ -374,7 +374,7 @@ def forward(self, hidden_states): return hidden_states * self.weight -# Copied from transformers.models.mistral.modeling_mistral.MistralRotaryEmbedding with Mistral->Qwen2Moe +# Copied from transformers.models.mistral.modeling_mistral.MistralRotaryEmbedding with Mistral->QWen2Moe class QWen2MoeRotaryEmbedding(nn.Layer): def __init__(self, dim, max_position_embeddings=2048, base=10000): super().__init__() @@ -452,7 +452,7 @@ def apply_rotary_pos_emb(q, k, cos, sin, position_ids): return q_embed, k_embed -# Modified from transformers.models.mistral.modeling_mistral.MistralMLP with Mistral->Qwen2Moe +# Modified from transformers.models.mistral.modeling_mistral.MistralMLP with Mistral->QWen2Moe class QWen2MoeMLP(nn.Layer): def __init__(self, config, is_shared=False): super().__init__() @@ -511,7 +511,7 @@ def repeat_kv(hidden_states: paddle.Tensor, n_rep: int) -> paddle.Tensor: return hidden_states.reshape([batch, slen, num_key_value_heads * n_rep, head_dim]) -# Copied from transformers.models.qwen2.modeling_qwen2.Qwen2Attention with Qwen2->Qwen2Moe +# Copied from transformers.models.qwen2.modeling_qwen2.Qwen2Attention with Qwen2->QWen2Moe class QWen2MoeAttention(nn.Layer): """ Multi-headed attention from 'Attention Is All You Need' paper. 
Modified to use sliding window attention: Longformer @@ -927,7 +927,7 @@ def _get_name_mappings(cls, config: QWen2MoeConfig) -> list[StateDictNameMapping [f"layers.{layer_index}.mlp.experts.{expert_idx}.up_proj.weight", None, "transpose"], ] model_mappings.extend(expert_mappings) - model_mappings.append([f"layers.{layer_index}.block_sparse_moe.gate.weight", None, "transpose"]) + model_mappings.append([f"layers.{layer_index}.mlp.gate.weight", None, "transpose"]) init_name_mappings(mappings=model_mappings) # base-model prefix "QWen2MoeModel" From 9c8222e1600dbf39d753a13cb0236e5ecb330f4a Mon Sep 17 00:00:00 2001 From: DrownFish19 Date: Thu, 25 Apr 2024 05:13:36 +0000 Subject: [PATCH 09/41] update qwen2moe import --- paddlenlp/transformers/__init__.py | 2 ++ 1 file changed, 2 insertions(+) diff --git a/paddlenlp/transformers/__init__.py b/paddlenlp/transformers/__init__.py index bd743c59f934..982257d222ff 100644 --- a/paddlenlp/transformers/__init__.py +++ b/paddlenlp/transformers/__init__.py @@ -294,6 +294,8 @@ from .deberta_v2.modeling import * from .deberta_v2.tokenizer import * from .deberta_v2.configuration import * +from .qwen2moe.modeling import * +from .qwen2moe.configuration import * # For faster tokenizer from ..utils.import_utils import is_fast_tokenizer_available From 4d6ff8750f205d999439658d3ae0a8b82b237d9f Mon Sep 17 00:00:00 2001 From: DrownFish19 Date: Thu, 25 Apr 2024 06:01:20 +0000 Subject: [PATCH 10/41] fix mlp hidden_size --- paddlenlp/transformers/qwen2moe/modeling.py | 9 ++++++++- 1 file changed, 8 insertions(+), 1 deletion(-) diff --git a/paddlenlp/transformers/qwen2moe/modeling.py b/paddlenlp/transformers/qwen2moe/modeling.py index b865c421dfd8..75ed8d3fa262 100644 --- a/paddlenlp/transformers/qwen2moe/modeling.py +++ b/paddlenlp/transformers/qwen2moe/modeling.py @@ -457,7 +457,9 @@ class QWen2MoeMLP(nn.Layer): def __init__(self, config, is_shared=False): super().__init__() self.hidden_size = config.hidden_size - self.intermediate_size = config.intermediate_size if not is_shared else config.shared_expert_intermediate_size + self.intermediate_size = ( + config.moe_intermediate_size if not is_shared else config.shared_expert_intermediate_size + ) self.tensor_parallel_degree = config.tensor_parallel_degree if config.sequence_parallel: @@ -929,6 +931,11 @@ def _get_name_mappings(cls, config: QWen2MoeConfig) -> list[StateDictNameMapping model_mappings.extend(expert_mappings) model_mappings.append([f"layers.{layer_index}.mlp.gate.weight", None, "transpose"]) + model_mappings.append([f"layers.{layer_index}.mlp.shared_expert.gate_proj.weight", None, "transpose"]) + model_mappings.append([f"layers.{layer_index}.mlp.shared_expert.down_proj.weight", None, "transpose"]) + model_mappings.append([f"layers.{layer_index}.mlp.shared_expert.up_proj.weight", None, "transpose"]) + model_mappings.append([f"layers.{layer_index}.mlp.shared_expert_gate.weight", None, "transpose"]) + init_name_mappings(mappings=model_mappings) # base-model prefix "QWen2MoeModel" if "QWen2MoeModel" not in config.architectures: From f350a2fe7000ede57bd073c7fa9b3eae48cb339b Mon Sep 17 00:00:00 2001 From: DrownFish19 Date: Thu, 25 Apr 2024 07:22:28 +0000 Subject: [PATCH 11/41] update qkv bias convert --- paddlenlp/transformers/qwen2moe/modeling.py | 3 +++ 1 file changed, 3 insertions(+) diff --git a/paddlenlp/transformers/qwen2moe/modeling.py b/paddlenlp/transformers/qwen2moe/modeling.py index 75ed8d3fa262..702e170e7482 100644 --- a/paddlenlp/transformers/qwen2moe/modeling.py +++ 
b/paddlenlp/transformers/qwen2moe/modeling.py @@ -974,10 +974,13 @@ def get_tensor_parallel_split_mappings(num_layers, num_experts): # Column Linear base_actions["layers.0.self_attn.q_proj.weight"] = partial(fn, is_column=True) + base_actions["layers.0.self_attn.q_proj.bias"] = partial(fn, is_column=True) # if we have enough num_key_value_heads to split, then split it. if config.num_key_value_heads % config.tensor_parallel_degree == 0: base_actions["layers.0.self_attn.k_proj.weight"] = partial(fn, is_column=True) base_actions["layers.0.self_attn.v_proj.weight"] = partial(fn, is_column=True) + base_actions["layers.0.self_attn.k_proj.bias"] = partial(fn, is_column=True) + base_actions["layers.0.self_attn.v_proj.bias"] = partial(fn, is_column=True) for key, action in base_actions.items(): if "layers.0." in key: From c53690de4adf2fa1ce4f28712ced808fdca1851e Mon Sep 17 00:00:00 2001 From: DrownFish19 Date: Thu, 25 Apr 2024 07:36:59 +0000 Subject: [PATCH 12/41] update modeling init_weight --- paddlenlp/transformers/qwen2moe/modeling.py | 6 ++++-- 1 file changed, 4 insertions(+), 2 deletions(-) diff --git a/paddlenlp/transformers/qwen2moe/modeling.py b/paddlenlp/transformers/qwen2moe/modeling.py index 702e170e7482..fbb394f0f8f3 100644 --- a/paddlenlp/transformers/qwen2moe/modeling.py +++ b/paddlenlp/transformers/qwen2moe/modeling.py @@ -1034,7 +1034,7 @@ def _init_weights(self, layer): mean=0.0, std=self.config.initializer_range if hasattr(self.config, "initializer_range") - else self.mixtral.config.initializer_range, + else self.qwen2moe.config.initializer_range, shape=layer.weight.shape, ) ) @@ -1044,10 +1044,12 @@ def _init_weights(self, layer): mean=0.0, std=self.config.initializer_range if hasattr(self.config, "initializer_range") - else self.mixtral.config.initializer_range, + else self.qwen2moe.config.initializer_range, shape=layer.weight.shape, ) ) + if isinstance(layer.bias, paddle.Tensor): + layer.bias.set_value(paddle.zeros_like(layer.bias.shape)) # Layer.apply is DFS https://github.com/PaddlePaddle/Paddle/blob/a6f5021fcc58b21f4414bae6bf4731ef6971582c/python/paddle/nn/layer/layers.py#L527-L530 # sublayer is init first # scale RowParallelLinear weight From 9d12995489d7e9bc042958b39870a48b07fcb9b7 Mon Sep 17 00:00:00 2001 From: DrownFish19 Date: Thu, 25 Apr 2024 07:42:26 +0000 Subject: [PATCH 13/41] update _get_name_mappings --- paddlenlp/transformers/qwen2moe/modeling.py | 3 +++ 1 file changed, 3 insertions(+) diff --git a/paddlenlp/transformers/qwen2moe/modeling.py b/paddlenlp/transformers/qwen2moe/modeling.py index fbb394f0f8f3..fafcb36c0313 100644 --- a/paddlenlp/transformers/qwen2moe/modeling.py +++ b/paddlenlp/transformers/qwen2moe/modeling.py @@ -915,6 +915,9 @@ def _get_name_mappings(cls, config: QWen2MoeConfig) -> list[StateDictNameMapping [f"layers.{layer_index}.self_attn.q_proj.weight", None, "transpose"], [f"layers.{layer_index}.self_attn.k_proj.weight", None, "transpose"], [f"layers.{layer_index}.self_attn.v_proj.weight", None, "transpose"], + [f"layers.{layer_index}.self_attn.q_proj.bias", None, "split", 1], + [f"layers.{layer_index}.self_attn.k_proj.bias", None, "split", 1], + [f"layers.{layer_index}.self_attn.v_proj.bias", None, "split", 1], [f"layers.{layer_index}.self_attn.o_proj.weight", None, "transpose"], [f"layers.{layer_index}.self_attn.rotary_emb.inv_freq"], [f"layers.{layer_index}.input_layernorm.weight"], From dba0f74c1ad229913c7035a0f4c5989efce86a8b Mon Sep 17 00:00:00 2001 From: DrownFish19 Date: Thu, 25 Apr 2024 08:32:13 +0000 Subject: [PATCH 14/41] update 
_get_name_mappings and _init_weight --- paddlenlp/transformers/qwen2moe/modeling.py | 10 +++++----- 1 file changed, 5 insertions(+), 5 deletions(-) diff --git a/paddlenlp/transformers/qwen2moe/modeling.py b/paddlenlp/transformers/qwen2moe/modeling.py index fafcb36c0313..a1505d87c32b 100644 --- a/paddlenlp/transformers/qwen2moe/modeling.py +++ b/paddlenlp/transformers/qwen2moe/modeling.py @@ -915,9 +915,9 @@ def _get_name_mappings(cls, config: QWen2MoeConfig) -> list[StateDictNameMapping [f"layers.{layer_index}.self_attn.q_proj.weight", None, "transpose"], [f"layers.{layer_index}.self_attn.k_proj.weight", None, "transpose"], [f"layers.{layer_index}.self_attn.v_proj.weight", None, "transpose"], - [f"layers.{layer_index}.self_attn.q_proj.bias", None, "split", 1], - [f"layers.{layer_index}.self_attn.k_proj.bias", None, "split", 1], - [f"layers.{layer_index}.self_attn.v_proj.bias", None, "split", 1], + [f"layers.{layer_index}.self_attn.q_proj.bias", None], + [f"layers.{layer_index}.self_attn.k_proj.bias", None], + [f"layers.{layer_index}.self_attn.v_proj.bias", None], [f"layers.{layer_index}.self_attn.o_proj.weight", None, "transpose"], [f"layers.{layer_index}.self_attn.rotary_emb.inv_freq"], [f"layers.{layer_index}.input_layernorm.weight"], @@ -1051,8 +1051,8 @@ def _init_weights(self, layer): shape=layer.weight.shape, ) ) - if isinstance(layer.bias, paddle.Tensor): - layer.bias.set_value(paddle.zeros_like(layer.bias.shape)) + if hasattr(layer, "bias") and isinstance(layer.bias, paddle.Tensor): + layer.bias.set_value(paddle.zeros_like(layer.bias)) # Layer.apply is DFS https://github.com/PaddlePaddle/Paddle/blob/a6f5021fcc58b21f4414bae6bf4731ef6971582c/python/paddle/nn/layer/layers.py#L527-L530 # sublayer is init first # scale RowParallelLinear weight From e487606523a28725cd11982787d760433299f44f Mon Sep 17 00:00:00 2001 From: DrownFish19 Date: Fri, 26 Apr 2024 06:05:52 +0000 Subject: [PATCH 15/41] add tokenizer --- paddlenlp/transformers/__init__.py | 1 + paddlenlp/transformers/auto/tokenizer.py | 1 + paddlenlp/transformers/qwen2moe/__init__.py | 1 + paddlenlp/transformers/qwen2moe/modeling.py | 12 + paddlenlp/transformers/qwen2moe/tokenizer.py | 341 +++++++++++++++++++ 5 files changed, 356 insertions(+) create mode 100644 paddlenlp/transformers/qwen2moe/tokenizer.py diff --git a/paddlenlp/transformers/__init__.py b/paddlenlp/transformers/__init__.py index 982257d222ff..b3bc4e66d3a1 100644 --- a/paddlenlp/transformers/__init__.py +++ b/paddlenlp/transformers/__init__.py @@ -296,6 +296,7 @@ from .deberta_v2.configuration import * from .qwen2moe.modeling import * from .qwen2moe.configuration import * +from .qwen2moe.tokenizer import * # For faster tokenizer from ..utils.import_utils import is_fast_tokenizer_available diff --git a/paddlenlp/transformers/auto/tokenizer.py b/paddlenlp/transformers/auto/tokenizer.py index 1e6e1215fe5d..3f92f76caee5 100644 --- a/paddlenlp/transformers/auto/tokenizer.py +++ b/paddlenlp/transformers/auto/tokenizer.py @@ -97,6 +97,7 @@ ("BloomTokenizer", "bloom"), ("SpeechT5Tokenizer", "speecht5"), ("QWenTokenizer", "qwen"), + ("QWen2MoeTokenizer", "qwen2moe"), ("GemmaTokenizer", "gemma"), ] ) diff --git a/paddlenlp/transformers/qwen2moe/__init__.py b/paddlenlp/transformers/qwen2moe/__init__.py index 617ea4a8ec47..c8ba37240327 100644 --- a/paddlenlp/transformers/qwen2moe/__init__.py +++ b/paddlenlp/transformers/qwen2moe/__init__.py @@ -14,3 +14,4 @@ from .configuration import QWen2MoeConfig from .modeling import QWen2MoeForCausalLM +from .tokenizer import 
QWen2MoeTokenizer diff --git a/paddlenlp/transformers/qwen2moe/modeling.py b/paddlenlp/transformers/qwen2moe/modeling.py index a1505d87c32b..d993a44d7dd9 100644 --- a/paddlenlp/transformers/qwen2moe/modeling.py +++ b/paddlenlp/transformers/qwen2moe/modeling.py @@ -1004,6 +1004,18 @@ def get_tensor_parallel_split_mappings(num_layers, num_experts): newkey2 = newkey.replace("experts.0.", f"experts.{j}.") final_actions[newkey2] = action + # Add tp split for shared expert params. + base_actions = { + "layers.0.mlp.shared_expert.gate_proj.weight": partial(fn, is_column=True), + "layers.0.mlp.shared_expert.up_proj.weight": partial(fn, is_column=True), + "layers.0.mlp.shared_expert.down_proj.weight": partial(fn, is_column=True), + } + for key, action in base_actions.items(): + if "layers.0." in key: + for i in range(num_layers): + final_actions[key.replace("layers.0.", f"layers.{i}.")] = action + final_actions[key] = action + return final_actions mappings = get_tensor_parallel_split_mappings(config.num_hidden_layers, config.num_experts) diff --git a/paddlenlp/transformers/qwen2moe/tokenizer.py b/paddlenlp/transformers/qwen2moe/tokenizer.py new file mode 100644 index 000000000000..980f521b15bc --- /dev/null +++ b/paddlenlp/transformers/qwen2moe/tokenizer.py @@ -0,0 +1,341 @@ +# Copyright (c) 2023 PaddlePaddle Authors. All Rights Reserved. +# Copyright 2024 The Qwen team, Alibaba Group and The HuggingFace Inc. team. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +"""Tokenization classes for QWen2.""" + +import json +import os +import unicodedata +from functools import lru_cache +from typing import Optional, Tuple + +import regex as re + +from .. import AddedToken, PretrainedTokenizer + +VOCAB_FILES_NAMES = { + "vocab_file": "vocab.json", + "merges_file": "merges.txt", +} + +__all__ = ["QWen2MoeTokenizer"] + +MAX_MODEL_INPUT_SIZES = {"qwen/qwen-tokenizer": 32768} + +PRETOKENIZE_REGEX = r"""(?i:'s|'t|'re|'ve|'m|'ll|'d)|[^\r\n\p{L}\p{N}]?\p{L}+|\p{N}| ?[^\s\p{L}\p{N}]+[\r\n]*|\s*[\r\n]+|\s+(?!\S)|\s+""" + + +@lru_cache() +# Copied from transformers.models.gpt2.tokenization_gpt2.bytes_to_unicode +def bytes_to_unicode(): + """ + Returns list of utf-8 byte and a mapping to unicode strings. We specifically avoids mapping to whitespace/control + characters the bpe code barfs on. + + The reversible bpe codes work on unicode strings. This means you need a large # of unicode characters in your vocab + if you want to avoid UNKs. When you're at something like a 10B token dataset you end up needing around 5K for + decent coverage. This is a significant percentage of your normal, say, 32K bpe vocab. To avoid that, we want lookup + tables between utf-8 bytes and unicode strings. 
+ """ + bs = ( + list(range(ord("!"), ord("~") + 1)) + list(range(ord("¡"), ord("¬") + 1)) + list(range(ord("®"), ord("ÿ") + 1)) + ) + cs = bs[:] + n = 0 + for b in range(2**8): + if b not in bs: + bs.append(b) + cs.append(2**8 + n) + n += 1 + cs = [chr(n) for n in cs] + return dict(zip(bs, cs)) + + +# Copied from transformers.models.gpt2.tokenization_gpt2.get_pairs +def get_pairs(word): + """ + Return set of symbol pairs in a word. + + Word is represented as tuple of symbols (symbols being variable-length strings). + """ + pairs = set() + prev_char = word[0] + for char in word[1:]: + pairs.add((prev_char, char)) + prev_char = char + return pairs + + +class QWen2MoeTokenizer(PretrainedTokenizer): + """ + Construct a Qwen2 tokenizer. Based on byte-level Byte-Pair-Encoding. + + Same with GPT2Tokenizer, this tokenizer has been trained to treat spaces like parts of the tokens so a word will + be encoded differently whether it is at the beginning of the sentence (without space) or not: + + ```python + >>> from transformers import Qwen2Tokenizer + + >>> tokenizer = Qwen2Tokenizer.from_pretrained("Qwen/Qwen-tokenizer") + >>> tokenizer("Hello world")["input_ids"] + [9707, 1879] + + >>> tokenizer(" Hello world")["input_ids"] + [21927, 1879] + ``` + This is expected. + + You should not use GPT2Tokenizer instead, because of the different pretokenization rules. + + This tokenizer inherits from [`PreTrainedTokenizer`] which contains most of the main methods. Users should refer to + this superclass for more information regarding those methods. + + Args: + vocab_file (`str`): + Path to the vocabulary file. + merges_file (`str`): + Path to the merges file. + errors (`str`, *optional*, defaults to `"replace"`): + Paradigm to follow when decoding bytes to UTF-8. See + [bytes.decode](https://docs.python.org/3/library/stdtypes.html#bytes.decode) for more information. + unk_token (`str`, *optional*, defaults to `"<|endoftext|>"`): + The unknown token. A token that is not in the vocabulary cannot be converted to an ID and is set to be this + token instead. + bos_token (`str`, *optional*): + The beginning of sequence token. Not applicable for this tokenizer. + eos_token (`str`, *optional*, defaults to `"<|endoftext|>"`): + The end of sequence token. + pad_token (`str`, *optional*, defaults to `"<|endoftext|>"`): + The token used for padding, for example when batching sequences of different lengths. + clean_up_tokenization_spaces (`bool`, *optional*, defaults to `False`): + Whether or not the model should cleanup the spaces that were added when splitting the input text during the + tokenization process. Not applicable to this tokenizer, since tokenization does not add spaces. + split_special_tokens (`bool`, *optional*, defaults to `False`): + Whether or not the special tokens should be split during the tokenization process. The default behavior is + to not split special tokens. This means that if `<|endoftext|>` is the `eos_token`, then `tokenizer.tokenize("<|endoftext|>") = + ['<|endoftext|>`]. Otherwise, if `split_special_tokens=True`, then `tokenizer.tokenize("<|endoftext|>")` will be give `['<', + '|', 'endo', 'ft', 'ext', '|', '>']`. This argument is only supported for `slow` tokenizers for the moment. 
+ """ + + resource_files_names = VOCAB_FILES_NAMES + model_input_names = ["input_ids", "attention_mask"] + max_model_input_sizes = MAX_MODEL_INPUT_SIZES + + def __init__( + self, + vocab_file, + merges_file, + errors="replace", + unk_token="<|endoftext|>", + bos_token=None, + eos_token="<|endoftext|>", + pad_token="<|endoftext|>", + clean_up_tokenization_spaces=False, + split_special_tokens=False, + **kwargs, + ): + with open(vocab_file, encoding="utf-8") as vocab_handle: + self.encoder = json.load(vocab_handle) + self.decoder = {v: k for k, v in self.encoder.items()} + self.errors = errors # how to handle errors in decoding + self.byte_encoder = bytes_to_unicode() + self.byte_decoder = {v: k for k, v in self.byte_encoder.items()} + bpe_merges = [] + with open(merges_file, encoding="utf-8") as merges_handle: + for i, line in enumerate(merges_handle): + line = line.strip() + if (i == 0 and line.startswith("#version:")) or not line: + continue + bpe_merges.append(tuple(line.split())) + self.bpe_ranks = dict(zip(bpe_merges, range(len(bpe_merges)))) + # NOTE: the cache can grow without bound and will get really large for long running processes + # (esp. for texts of language that do not use space between word, e.g. Chinese); technically + # not a memory leak but appears as one. + # GPT2Tokenizer has the same problem, so let's be consistent. + self.cache = {} + + self.pat = re.compile(PRETOKENIZE_REGEX) + + # if kwargs.get("add_prefix_space", False): + # logger.warning_once( + # f"{self.__class__.__name} does not support `add_prefix_space`, setting it to True has no effect." + # ) + + # Qwen vocab does not contain control tokens; added tokens need to be special + bos_token = ( + AddedToken(bos_token, lstrip=False, rstrip=False, special=True, normalized=False) + if isinstance(bos_token, str) + else bos_token + ) + eos_token = ( + AddedToken(eos_token, lstrip=False, rstrip=False, special=True, normalized=False) + if isinstance(eos_token, str) + else eos_token + ) + unk_token = ( + AddedToken(unk_token, lstrip=False, rstrip=False, special=True, normalized=False) + if isinstance(unk_token, str) + else unk_token + ) + pad_token = ( + AddedToken(pad_token, lstrip=False, rstrip=False, special=True, normalized=False) + if isinstance(pad_token, str) + else pad_token + ) + + super().__init__( + errors=errors, + bos_token=bos_token, + eos_token=eos_token, + pad_token=pad_token, + unk_token=unk_token, + clean_up_tokenization_spaces=clean_up_tokenization_spaces, + split_special_tokens=split_special_tokens, + **kwargs, + ) + if "pad_token_id" in kwargs: + self.pad_token_id = kwargs["pad_token_id"] + if "eos_token_id" in kwargs: + self.eos_token_id = kwargs["eos_token_id"] + + @property + def vocab_size(self) -> int: + return len(self.encoder) + + # Copied from transformers.models.gpt2.tokenization_gpt2.GPT2Tokenizer.get_vocab + def get_vocab(self): + return dict(self.encoder, **self.added_tokens_encoder) + + # Copied from transformers.models.gpt2.tokenization_gpt2.GPT2Tokenizer.bpe + def bpe(self, token): + if token in self.cache: + return self.cache[token] + word = tuple(token) + pairs = get_pairs(word) + + if not pairs: + return token + + while True: + bigram = min(pairs, key=lambda pair: self.bpe_ranks.get(pair, float("inf"))) + if bigram not in self.bpe_ranks: + break + first, second = bigram + new_word = [] + i = 0 + while i < len(word): + try: + j = word.index(first, i) + except ValueError: + new_word.extend(word[i:]) + break + else: + new_word.extend(word[i:j]) + i = j + + if word[i] == first and i < 
len(word) - 1 and word[i + 1] == second: + new_word.append(first + second) + i += 2 + else: + new_word.append(word[i]) + i += 1 + new_word = tuple(new_word) + word = new_word + if len(word) == 1: + break + else: + pairs = get_pairs(word) + word = " ".join(word) + self.cache[token] = word + return word + + # Copied from transformers.models.gpt2.tokenization_gpt2.GPT2Tokenizer._tokenize + def _tokenize(self, text): + """Tokenize a string.""" + bpe_tokens = [] + for token in re.findall(self.pat, text): + token = "".join( + self.byte_encoder[b] for b in token.encode("utf-8") + ) # Maps all our bytes to unicode strings, avoiding control tokens of the BPE (spaces in our case) + bpe_tokens.extend(bpe_token for bpe_token in self.bpe(token).split(" ")) + return bpe_tokens + + # Copied from transformers.models.gpt2.tokenization_gpt2.GPT2Tokenizer._convert_token_to_id + def _convert_token_to_id(self, token): + """Converts a token (str) in an id using the vocab.""" + return self.encoder.get(token, self.encoder.get(self.unk_token)) + + # Copied from transformers.models.gpt2.tokenization_gpt2.GPT2Tokenizer._convert_id_to_token + def _convert_id_to_token(self, index): + """Converts an index (integer) in a token (str) using the vocab.""" + return self.decoder.get(index) + + # Copied from transformers.models.gpt2.tokenization_gpt2.GPT2Tokenizer.convert_tokens_to_string + def convert_tokens_to_string(self, tokens): + """Converts a sequence of tokens (string) in a single string.""" + text = "".join(tokens) + text = bytearray([self.byte_decoder[c] for c in text]).decode("utf-8", errors=self.errors) + return text + + def _decode( + self, + token_ids, + skip_special_tokens: bool = False, + clean_up_tokenization_spaces: Optional[bool] = False, + spaces_between_special_tokens: bool = False, + **kwargs, + ) -> str: + # `spaces_between_special_tokens` defaults to True for _decode in slow tokenizers + # and cannot be configured elsewhere, but it should default to False for Qwen2Tokenizer + return super()._decode( + token_ids, + skip_special_tokens=skip_special_tokens, + clean_up_tokenization_spaces=clean_up_tokenization_spaces, + spaces_between_special_tokens=spaces_between_special_tokens, + **kwargs, + ) + + # Copied from transformers.models.gpt2.tokenization_gpt2.GPT2Tokenizer.save_vocabulary + def save_vocabulary(self, save_directory: str, filename_prefix: Optional[str] = None) -> Tuple[str]: + # if not os.path.isdir(save_directory): + # logger.error(f"Vocabulary path ({save_directory}) should be a directory") + # return + vocab_file = os.path.join( + save_directory, (filename_prefix + "-" if filename_prefix else "") + VOCAB_FILES_NAMES["vocab_file"] + ) + merge_file = os.path.join( + save_directory, (filename_prefix + "-" if filename_prefix else "") + VOCAB_FILES_NAMES["merges_file"] + ) + + with open(vocab_file, "w", encoding="utf-8") as f: + f.write(json.dumps(self.encoder, indent=2, sort_keys=True, ensure_ascii=False) + "\n") + + index = 0 + with open(merge_file, "w", encoding="utf-8") as writer: + writer.write("#version: 0.2\n") + for bpe_tokens, token_index in sorted(self.bpe_ranks.items(), key=lambda kv: kv[1]): + if index != token_index: + # logger.warning( + # f"Saving vocabulary to {merge_file}: BPE merge indices are not consecutive." + # " Please check that the tokenizer is not corrupted!" 
+ # ) + index = token_index + writer.write(" ".join(bpe_tokens) + "\n") + index += 1 + + return vocab_file, merge_file + + def prepare_for_tokenization(self, text, **kwargs): + text = unicodedata.normalize("NFC", text) + return (text, kwargs) From cd9c753d4e0536d78df14b958cdd3d6aaa82cd40 Mon Sep 17 00:00:00 2001 From: DrownFish19 Date: Fri, 26 Apr 2024 06:11:38 +0000 Subject: [PATCH 16/41] update modeling --- paddlenlp/transformers/qwen2moe/modeling.py | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/paddlenlp/transformers/qwen2moe/modeling.py b/paddlenlp/transformers/qwen2moe/modeling.py index d993a44d7dd9..d911d7bb7128 100644 --- a/paddlenlp/transformers/qwen2moe/modeling.py +++ b/paddlenlp/transformers/qwen2moe/modeling.py @@ -576,16 +576,16 @@ def __init__(self, config: QWen2MoeConfig, layerwise_recompute: bool = True): if config.tensor_parallel_degree > 1: self.q_proj = ColumnParallelLinear(self.hidden_size, self.hidden_size, has_bias=True, gather_output=False) self.k_proj = ColumnParallelLinear( - self.hidden_size, self.num_key_value_heads * self.head_dim, has_bias=True, gather_output=False + self.hidden_size, self.config.num_key_value_heads * self.head_dim, has_bias=True, gather_output=False ) self.v_proj = ColumnParallelLinear( - self.hidden_size, self.num_key_value_heads * self.head_dim, has_bias=True, gather_output=False + self.hidden_size, self.config.num_key_value_heads * self.head_dim, has_bias=True, gather_output=False ) self.o_proj = RowParallelLinear(self.hidden_size, self.hidden_size, has_bias=False, input_is_parallel=True) else: self.q_proj = nn.Linear(self.hidden_size, self.hidden_size, bias_attr=True) - self.k_proj = nn.Linear(self.hidden_size, self.num_key_value_heads * self.head_dim, bias_attr=True) - self.v_proj = nn.Linear(self.hidden_size, self.num_key_value_heads * self.head_dim, bias_attr=True) + self.k_proj = nn.Linear(self.hidden_size, self.config.num_key_value_heads * self.head_dim, bias_attr=True) + self.v_proj = nn.Linear(self.hidden_size, self.config.num_key_value_heads * self.head_dim, bias_attr=True) self.o_proj = nn.Linear(self.hidden_size, self.hidden_size, bias_attr=False) self.rotary_emb = QWen2MoeRotaryEmbedding( From 10407c40d2fea7b21c9e39814550a0540388ec29 Mon Sep 17 00:00:00 2001 From: DrownFish19 Date: Fri, 26 Apr 2024 06:13:58 +0000 Subject: [PATCH 17/41] update modeling --- paddlenlp/transformers/qwen2moe/modeling.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/paddlenlp/transformers/qwen2moe/modeling.py b/paddlenlp/transformers/qwen2moe/modeling.py index d911d7bb7128..944757e93ca4 100644 --- a/paddlenlp/transformers/qwen2moe/modeling.py +++ b/paddlenlp/transformers/qwen2moe/modeling.py @@ -1008,7 +1008,7 @@ def get_tensor_parallel_split_mappings(num_layers, num_experts): base_actions = { "layers.0.mlp.shared_expert.gate_proj.weight": partial(fn, is_column=True), "layers.0.mlp.shared_expert.up_proj.weight": partial(fn, is_column=True), - "layers.0.mlp.shared_expert.down_proj.weight": partial(fn, is_column=True), + "layers.0.mlp.shared_expert.down_proj.weight": partial(fn, is_column=False), } for key, action in base_actions.items(): if "layers.0." 
in key: From beb0f4c66f58135df74411065d3a4f0063f40d7f Mon Sep 17 00:00:00 2001 From: DrownFish19 Date: Fri, 26 Apr 2024 09:48:17 +0000 Subject: [PATCH 18/41] update tokenizer --- paddlenlp/transformers/qwen2moe/tokenizer.py | 55 ++++++++++---------- 1 file changed, 28 insertions(+), 27 deletions(-) diff --git a/paddlenlp/transformers/qwen2moe/tokenizer.py b/paddlenlp/transformers/qwen2moe/tokenizer.py index 980f521b15bc..06115b8fd6b1 100644 --- a/paddlenlp/transformers/qwen2moe/tokenizer.py +++ b/paddlenlp/transformers/qwen2moe/tokenizer.py @@ -145,6 +145,29 @@ def __init__( split_special_tokens=False, **kwargs, ): + super().__init__(**kwargs) + # Qwen vocab does not contain control tokens; added tokens need to be special + bos_token = ( + AddedToken(bos_token, lstrip=False, rstrip=False, special=True, normalized=False) + if isinstance(bos_token, str) + else bos_token + ) + eos_token = ( + AddedToken(eos_token, lstrip=False, rstrip=False, special=True, normalized=False) + if isinstance(eos_token, str) + else eos_token + ) + unk_token = ( + AddedToken(unk_token, lstrip=False, rstrip=False, special=True, normalized=False) + if isinstance(unk_token, str) + else unk_token + ) + pad_token = ( + AddedToken(pad_token, lstrip=False, rstrip=False, special=True, normalized=False) + if isinstance(pad_token, str) + else pad_token + ) + with open(vocab_file, encoding="utf-8") as vocab_handle: self.encoder = json.load(vocab_handle) self.decoder = {v: k for k, v in self.encoder.items()} @@ -171,28 +194,10 @@ def __init__( # logger.warning_once( # f"{self.__class__.__name} does not support `add_prefix_space`, setting it to True has no effect." # ) - - # Qwen vocab does not contain control tokens; added tokens need to be special - bos_token = ( - AddedToken(bos_token, lstrip=False, rstrip=False, special=True, normalized=False) - if isinstance(bos_token, str) - else bos_token - ) - eos_token = ( - AddedToken(eos_token, lstrip=False, rstrip=False, special=True, normalized=False) - if isinstance(eos_token, str) - else eos_token - ) - unk_token = ( - AddedToken(unk_token, lstrip=False, rstrip=False, special=True, normalized=False) - if isinstance(unk_token, str) - else unk_token - ) - pad_token = ( - AddedToken(pad_token, lstrip=False, rstrip=False, special=True, normalized=False) - if isinstance(pad_token, str) - else pad_token - ) + self.bos_token_id = kwargs["bos_token_id"] if "bos_token_id" in kwargs else None + self.eos_token_id = kwargs["eos_token_id"] if "eos_token_id" in kwargs else None + self.unk_token_id = kwargs["unk_token_id"] if "unk_token_id" in kwargs else None + self.pad_token_id = kwargs["pad_token_id"] if "pad_token_id" in kwargs else None super().__init__( errors=errors, @@ -204,10 +209,6 @@ def __init__( split_special_tokens=split_special_tokens, **kwargs, ) - if "pad_token_id" in kwargs: - self.pad_token_id = kwargs["pad_token_id"] - if "eos_token_id" in kwargs: - self.eos_token_id = kwargs["eos_token_id"] @property def vocab_size(self) -> int: @@ -274,7 +275,7 @@ def _tokenize(self, text): # Copied from transformers.models.gpt2.tokenization_gpt2.GPT2Tokenizer._convert_token_to_id def _convert_token_to_id(self, token): """Converts a token (str) in an id using the vocab.""" - return self.encoder.get(token, self.encoder.get(self.unk_token)) + return self.encoder.get(token, len(self.encoder)) # Copied from transformers.models.gpt2.tokenization_gpt2.GPT2Tokenizer._convert_id_to_token def _convert_id_to_token(self, index): From beefee9675db9d4eb6acc629e1f9f635717f0bbd Mon Sep 17 00:00:00 2001 
From: DrownFish19 Date: Sun, 28 Apr 2024 04:22:03 +0000 Subject: [PATCH 19/41] update modeling and tokenizer --- paddlenlp/transformers/qwen2moe/modeling.py | 4 +++- paddlenlp/transformers/qwen2moe/tokenizer.py | 4 ++-- 2 files changed, 5 insertions(+), 3 deletions(-) diff --git a/paddlenlp/transformers/qwen2moe/modeling.py b/paddlenlp/transformers/qwen2moe/modeling.py index 944757e93ca4..7b7b276b68e2 100644 --- a/paddlenlp/transformers/qwen2moe/modeling.py +++ b/paddlenlp/transformers/qwen2moe/modeling.py @@ -762,7 +762,9 @@ def forward(self, hidden_states): top_x = top_x.squeeze() if top_x.shape == []: top_x = paddle.to_tensor([top_x.item()]) - final_hidden_states.index_add_(top_x, 0, current_hidden_states.astype(hidden_states.dtype)) + final_hidden_states = paddle.index_add( + final_hidden_states, top_x, 0, current_hidden_states.astype(hidden_states.dtype) + ) shared_expert_output = self.shared_expert(hidden_states) shared_expert_output = F.sigmoid(self.shared_expert_gate(hidden_states)) * shared_expert_output diff --git a/paddlenlp/transformers/qwen2moe/tokenizer.py b/paddlenlp/transformers/qwen2moe/tokenizer.py index 06115b8fd6b1..67e92609f3ef 100644 --- a/paddlenlp/transformers/qwen2moe/tokenizer.py +++ b/paddlenlp/transformers/qwen2moe/tokenizer.py @@ -275,12 +275,12 @@ def _tokenize(self, text): # Copied from transformers.models.gpt2.tokenization_gpt2.GPT2Tokenizer._convert_token_to_id def _convert_token_to_id(self, token): """Converts a token (str) in an id using the vocab.""" - return self.encoder.get(token, len(self.encoder)) + return self.encoder.get(token, self.added_tokens_encoder.get(token, len(self.encoder))) # Copied from transformers.models.gpt2.tokenization_gpt2.GPT2Tokenizer._convert_id_to_token def _convert_id_to_token(self, index): """Converts an index (integer) in a token (str) using the vocab.""" - return self.decoder.get(index) + return self.decoder.get(index, self.added_tokens_decoder.get(index, self.unk_token)) # Copied from transformers.models.gpt2.tokenization_gpt2.GPT2Tokenizer.convert_tokens_to_string def convert_tokens_to_string(self, tokens): From 82ba345be352b64b62a66e2298698d477d3fad66 Mon Sep 17 00:00:00 2001 From: DrownFish19 Date: Sun, 28 Apr 2024 06:15:16 +0000 Subject: [PATCH 20/41] fix index_add_ error --- paddlenlp/transformers/qwen2moe/modeling.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/paddlenlp/transformers/qwen2moe/modeling.py b/paddlenlp/transformers/qwen2moe/modeling.py index 7b7b276b68e2..b233fe81d651 100644 --- a/paddlenlp/transformers/qwen2moe/modeling.py +++ b/paddlenlp/transformers/qwen2moe/modeling.py @@ -762,7 +762,7 @@ def forward(self, hidden_states): top_x = top_x.squeeze() if top_x.shape == []: top_x = paddle.to_tensor([top_x.item()]) - final_hidden_states = paddle.index_add( + final_hidden_states = paddle.index_add_( final_hidden_states, top_x, 0, current_hidden_states.astype(hidden_states.dtype) ) From 4a1b2e3836de4f8bd2c549f302fa9b82f16b1ee0 Mon Sep 17 00:00:00 2001 From: DrownFish19 Date: Sun, 28 Apr 2024 11:08:56 +0000 Subject: [PATCH 21/41] fix --- llm/qwen2moe/lora_argument.json | 6 +++--- llm/qwen2moe/sft_argument.json | 4 ++-- tests/transformers/qwen2moe/test_modeling.py | 3 +++ 3 files changed, 8 insertions(+), 5 deletions(-) diff --git a/llm/qwen2moe/lora_argument.json b/llm/qwen2moe/lora_argument.json index ef832a113f46..9e5e10600ccd 100644 --- a/llm/qwen2moe/lora_argument.json +++ b/llm/qwen2moe/lora_argument.json @@ -1,5 +1,5 @@ { - "model_name_or_path": "Qwen/Qwen1.5-MoE-A2.7B", + 
"model_name_or_path": "qwen/Qwen1.5-MoE-A2.7B", "dataset_name_or_path": "./data", "output_dir": "./checkpoints/qwen2moe_lora_ckpts", "per_device_train_batch_size": 4, @@ -13,8 +13,8 @@ "evaluation_strategy": "epoch", "save_strategy": "epoch", "src_length": 1024, - "max_length": 2048, - "fp16": true, + "max_length": 32768, + "bf16": true, "fp16_opt_level": "O2", "do_train": true, "do_eval": true, diff --git a/llm/qwen2moe/sft_argument.json b/llm/qwen2moe/sft_argument.json index 7bb8547803dc..9a6a5a8388c4 100644 --- a/llm/qwen2moe/sft_argument.json +++ b/llm/qwen2moe/sft_argument.json @@ -1,5 +1,5 @@ { - "model_name_or_path": "Qwen/Qwen1.5-MoE-A2.7B", + "model_name_or_path": "qwen/Qwen1.5-MoE-A2.7B", "dataset_name_or_path": "./data", "output_dir": "./checkpoints/qwen2moe_sft_ckpts", "per_device_train_batch_size": 4, @@ -13,7 +13,7 @@ "evaluation_strategy": "epoch", "save_strategy": "epoch", "src_length": 1024, - "max_length": 2048, + "max_length": 32768, "bf16": true, "fp16_opt_level": "O2", "do_train": true, diff --git a/tests/transformers/qwen2moe/test_modeling.py b/tests/transformers/qwen2moe/test_modeling.py index 26c5597fd231..120e9c4bd30c 100644 --- a/tests/transformers/qwen2moe/test_modeling.py +++ b/tests/transformers/qwen2moe/test_modeling.py @@ -36,6 +36,7 @@ def __init__( hidden_size=64, num_hidden_layers=2, num_attention_heads=8, + num_key_value_heads=8, masked_softmax_fusion=True, layer_norm_epsilon=1e-5, initializer_range=0.02, @@ -67,6 +68,7 @@ def __init__( self.hidden_size = hidden_size self.num_hidden_layers = num_hidden_layers self.num_attention_heads = num_attention_heads + self.num_key_value_heads = num_key_value_heads self.masked_softmax_fusion = masked_softmax_fusion self.layer_norm_epsilon = layer_norm_epsilon self.initializer_range = initializer_range @@ -119,6 +121,7 @@ def get_config(self) -> QWen2MoeConfig: hidden_size=self.hidden_size, num_hidden_layers=self.num_hidden_layers, num_attention_heads=self.num_attention_heads, + num_key_value_heads=self.num_key_value_heads, masked_softmax_fusion=self.masked_softmax_fusion, layer_norm_epsilon=self.layer_norm_epsilon, initializer_range=self.initializer_range, From 2bb3aba68dae61c01c39edab0279cc7962b260eb Mon Sep 17 00:00:00 2001 From: DrownFish19 Date: Mon, 6 May 2024 03:19:40 +0000 Subject: [PATCH 22/41] update comments --- paddlenlp/transformers/qwen2moe/modeling.py | 4 ---- paddlenlp/transformers/qwen2moe/tokenizer.py | 9 --------- 2 files changed, 13 deletions(-) diff --git a/paddlenlp/transformers/qwen2moe/modeling.py b/paddlenlp/transformers/qwen2moe/modeling.py index b233fe81d651..6ddb5f07f847 100644 --- a/paddlenlp/transformers/qwen2moe/modeling.py +++ b/paddlenlp/transformers/qwen2moe/modeling.py @@ -342,7 +342,6 @@ def _expand_2d_mask(mask, dtype, tgt_length): return expanded_mask -# Copied from transformers.models.llama.modeling_llama.LlamaRMSNorm with Llama->QWen2Moe class QWen2MoeRMSNorm(nn.Layer): def __init__(self, config): super().__init__() @@ -374,7 +373,6 @@ def forward(self, hidden_states): return hidden_states * self.weight -# Copied from transformers.models.mistral.modeling_mistral.MistralRotaryEmbedding with Mistral->QWen2Moe class QWen2MoeRotaryEmbedding(nn.Layer): def __init__(self, dim, max_position_embeddings=2048, base=10000): super().__init__() @@ -499,7 +497,6 @@ def forward(self, x): return self.down_proj(self.act_fn(self.gate_proj(x)) * self.up_proj(x)) -# Copied from transformers.models.llama.modeling_llama.repeat_kv def repeat_kv(hidden_states: paddle.Tensor, n_rep: int) -> 
paddle.Tensor: """ This is the equivalent of paddle.repeat_interleave(hidden_states, n_rep, axis=1). The hidden states go from (batch, @@ -513,7 +510,6 @@ def repeat_kv(hidden_states: paddle.Tensor, n_rep: int) -> paddle.Tensor: return hidden_states.reshape([batch, slen, num_key_value_heads * n_rep, head_dim]) -# Copied from transformers.models.qwen2.modeling_qwen2.Qwen2Attention with Qwen2->QWen2Moe class QWen2MoeAttention(nn.Layer): """ Multi-headed attention from 'Attention Is All You Need' paper. Modified to use sliding window attention: Longformer diff --git a/paddlenlp/transformers/qwen2moe/tokenizer.py b/paddlenlp/transformers/qwen2moe/tokenizer.py index 67e92609f3ef..10421fea8851 100644 --- a/paddlenlp/transformers/qwen2moe/tokenizer.py +++ b/paddlenlp/transformers/qwen2moe/tokenizer.py @@ -37,7 +37,6 @@ @lru_cache() -# Copied from transformers.models.gpt2.tokenization_gpt2.bytes_to_unicode def bytes_to_unicode(): """ Returns list of utf-8 byte and a mapping to unicode strings. We specifically avoids mapping to whitespace/control @@ -62,7 +61,6 @@ def bytes_to_unicode(): return dict(zip(bs, cs)) -# Copied from transformers.models.gpt2.tokenization_gpt2.get_pairs def get_pairs(word): """ Return set of symbol pairs in a word. @@ -214,11 +212,9 @@ def __init__( def vocab_size(self) -> int: return len(self.encoder) - # Copied from transformers.models.gpt2.tokenization_gpt2.GPT2Tokenizer.get_vocab def get_vocab(self): return dict(self.encoder, **self.added_tokens_encoder) - # Copied from transformers.models.gpt2.tokenization_gpt2.GPT2Tokenizer.bpe def bpe(self, token): if token in self.cache: return self.cache[token] @@ -261,7 +257,6 @@ def bpe(self, token): self.cache[token] = word return word - # Copied from transformers.models.gpt2.tokenization_gpt2.GPT2Tokenizer._tokenize def _tokenize(self, text): """Tokenize a string.""" bpe_tokens = [] @@ -272,17 +267,14 @@ def _tokenize(self, text): bpe_tokens.extend(bpe_token for bpe_token in self.bpe(token).split(" ")) return bpe_tokens - # Copied from transformers.models.gpt2.tokenization_gpt2.GPT2Tokenizer._convert_token_to_id def _convert_token_to_id(self, token): """Converts a token (str) in an id using the vocab.""" return self.encoder.get(token, self.added_tokens_encoder.get(token, len(self.encoder))) - # Copied from transformers.models.gpt2.tokenization_gpt2.GPT2Tokenizer._convert_id_to_token def _convert_id_to_token(self, index): """Converts an index (integer) in a token (str) using the vocab.""" return self.decoder.get(index, self.added_tokens_decoder.get(index, self.unk_token)) - # Copied from transformers.models.gpt2.tokenization_gpt2.GPT2Tokenizer.convert_tokens_to_string def convert_tokens_to_string(self, tokens): """Converts a sequence of tokens (string) in a single string.""" text = "".join(tokens) @@ -307,7 +299,6 @@ def _decode( **kwargs, ) - # Copied from transformers.models.gpt2.tokenization_gpt2.GPT2Tokenizer.save_vocabulary def save_vocabulary(self, save_directory: str, filename_prefix: Optional[str] = None) -> Tuple[str]: # if not os.path.isdir(save_directory): # logger.error(f"Vocabulary path ({save_directory}) should be a directory") From f2039839a67548e482b88efadc7e65e06bdc36b7 Mon Sep 17 00:00:00 2001 From: DrownFish19 Date: Fri, 10 May 2024 03:34:49 +0000 Subject: [PATCH 23/41] update lora weights --- llm/utils.py | 10 ++++++++++ 1 file changed, 10 insertions(+) diff --git a/llm/utils.py b/llm/utils.py index 8bcc52ae33ab..70482a75c3db 100644 --- a/llm/utils.py +++ b/llm/utils.py @@ -159,6 +159,16 @@ def 
get_lora_target_modules(model): ".*w2.*", ".*w3.*", ] + elif model.base_model_prefix == "qwen2moe": + target_modules = [ + ".*q_proj.*", + ".*k_proj.*", + ".*v_proj.*", + ".*o_proj.*", + ".*gate_proj.*", + ".*up_proj.*", + ".*down_proj.*", + ] else: raise ValueError(f"Unknown base_model_prefix: {model.base_model_prefix}.") return target_modules From 58af3ecce6889125b52298fc6b2808a5b93a60ae Mon Sep 17 00:00:00 2001 From: DrownFish19 Date: Fri, 10 May 2024 03:57:53 +0000 Subject: [PATCH 24/41] add todo --- llm/utils.py | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/llm/utils.py b/llm/utils.py index 70482a75c3db..ac3a5c1bb9a6 100644 --- a/llm/utils.py +++ b/llm/utils.py @@ -41,7 +41,6 @@ def compute_metrics(eval_preds): - flattened_preds = np.array(eval_preds.predictions).flatten() flattened_labels = np.array(eval_preds.label_ids).flatten() filtered_preds = flattened_preds[flattened_labels != -100] @@ -155,6 +154,7 @@ def get_lora_target_modules(model): ".*k_proj.*", ".*v_proj.*", ".*o_proj.*", + # ".*gate.*", # TODO(DrownFish19): Does the gate weight require training? ".*w1.*", ".*w2.*", ".*w3.*", @@ -165,6 +165,7 @@ def get_lora_target_modules(model): ".*k_proj.*", ".*v_proj.*", ".*o_proj.*", + # ".*gate.*", # TODO(DrownFish19): Does the gate weight require training? ".*gate_proj.*", ".*up_proj.*", ".*down_proj.*", From 5ddc326159b7b58041ae9f90109189b78dd5b4f9 Mon Sep 17 00:00:00 2001 From: DrownFish19 Date: Wed, 29 May 2024 02:52:46 +0000 Subject: [PATCH 25/41] update Copyright --- paddlenlp/transformers/qwen2moe/__init__.py | 2 +- paddlenlp/transformers/qwen2moe/configuration.py | 2 +- paddlenlp/transformers/qwen2moe/tokenizer.py | 2 +- 3 files changed, 3 insertions(+), 3 deletions(-) diff --git a/paddlenlp/transformers/qwen2moe/__init__.py b/paddlenlp/transformers/qwen2moe/__init__.py index c8ba37240327..65528189e888 100644 --- a/paddlenlp/transformers/qwen2moe/__init__.py +++ b/paddlenlp/transformers/qwen2moe/__init__.py @@ -1,4 +1,4 @@ -# Copyright (c) 2023 PaddlePaddle Authors. All Rights Reserved. +# Copyright (c) 2024 PaddlePaddle Authors. All Rights Reserved. # # Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License. diff --git a/paddlenlp/transformers/qwen2moe/configuration.py b/paddlenlp/transformers/qwen2moe/configuration.py index 08666c44d6e4..47ea56c21a7e 100644 --- a/paddlenlp/transformers/qwen2moe/configuration.py +++ b/paddlenlp/transformers/qwen2moe/configuration.py @@ -1,4 +1,4 @@ -# coding=utf-8 +# Copyright (c) 2024 PaddlePaddle Authors. All Rights Reserved. # Copyright 2024 The Qwen team, Alibaba Group and the HuggingFace Inc. team. All rights reserved. # # Licensed under the Apache License, Version 2.0 (the "License"); diff --git a/paddlenlp/transformers/qwen2moe/tokenizer.py b/paddlenlp/transformers/qwen2moe/tokenizer.py index 10421fea8851..5f78eb5457ad 100644 --- a/paddlenlp/transformers/qwen2moe/tokenizer.py +++ b/paddlenlp/transformers/qwen2moe/tokenizer.py @@ -1,4 +1,4 @@ -# Copyright (c) 2023 PaddlePaddle Authors. All Rights Reserved. +# Copyright (c) 2024 PaddlePaddle Authors. All Rights Reserved. # Copyright 2024 The Qwen team, Alibaba Group and The HuggingFace Inc. team. All rights reserved. 
# # Licensed under the Apache License, Version 2.0 (the "License"); From de1db677ed1e3cd42fd44da3c59bc172e2133eac Mon Sep 17 00:00:00 2001 From: DrownFish19 Date: Wed, 29 May 2024 03:45:31 +0000 Subject: [PATCH 26/41] update Moe to MoE --- paddlenlp/transformers/auto/modeling.py | 2 +- paddlenlp/transformers/auto/tokenizer.py | 2 +- paddlenlp/transformers/qwen2moe/__init__.py | 6 +- .../transformers/qwen2moe/configuration.py | 14 +-- paddlenlp/transformers/qwen2moe/modeling.py | 100 +++++++++--------- paddlenlp/transformers/qwen2moe/tokenizer.py | 4 +- tests/transformers/qwen2moe/test_modeling.py | 42 ++++---- 7 files changed, 83 insertions(+), 87 deletions(-) diff --git a/paddlenlp/transformers/auto/modeling.py b/paddlenlp/transformers/auto/modeling.py index 248fe2cbcd0d..e84a62986d51 100644 --- a/paddlenlp/transformers/auto/modeling.py +++ b/paddlenlp/transformers/auto/modeling.py @@ -118,7 +118,7 @@ ("Bloom", "bloom"), ("QWen", "qwen"), ("Mixtral", "mixtral"), - ("QWen2Moe", "qwen2moe"), + ("QWen2MoE", "qwen2moe"), ("Gemma", "gemma"), ] ) diff --git a/paddlenlp/transformers/auto/tokenizer.py b/paddlenlp/transformers/auto/tokenizer.py index 27f840d48c6b..694b1c5ee35d 100644 --- a/paddlenlp/transformers/auto/tokenizer.py +++ b/paddlenlp/transformers/auto/tokenizer.py @@ -97,7 +97,7 @@ ("BloomTokenizer", "bloom"), ("SpeechT5Tokenizer", "speecht5"), ("QWenTokenizer", "qwen"), - ("QWen2MoeTokenizer", "qwen2moe"), + ("QWen2MoETokenizer", "qwen2moe"), ("GemmaTokenizer", "gemma"), ] ) diff --git a/paddlenlp/transformers/qwen2moe/__init__.py b/paddlenlp/transformers/qwen2moe/__init__.py index 65528189e888..92f7ec32de90 100644 --- a/paddlenlp/transformers/qwen2moe/__init__.py +++ b/paddlenlp/transformers/qwen2moe/__init__.py @@ -12,6 +12,6 @@ # See the License for the specific language governing permissions and # limitations under the License. -from .configuration import QWen2MoeConfig -from .modeling import QWen2MoeForCausalLM -from .tokenizer import QWen2MoeTokenizer +from .configuration import QWen2MoEConfig +from .modeling import QWen2MoEForCausalLM +from .tokenizer import QWen2MoETokenizer diff --git a/paddlenlp/transformers/qwen2moe/configuration.py b/paddlenlp/transformers/qwen2moe/configuration.py index 47ea56c21a7e..636298d04bb6 100644 --- a/paddlenlp/transformers/qwen2moe/configuration.py +++ b/paddlenlp/transformers/qwen2moe/configuration.py @@ -17,13 +17,13 @@ from paddlenlp.transformers.configuration_utils import PretrainedConfig __all__ = [ - "QWen2MoeConfig", + "QWen2MoEConfig", ] -class QWen2MoeConfig(PretrainedConfig): +class QWen2MoEConfig(PretrainedConfig): r""" - This is the configuration class to store the configuration of a [`QWen2MoeModel`]. It is used to instantiate a + This is the configuration class to store the configuration of a [`QWen2MoEModel`]. It is used to instantiate a Qwen2MoE model according to the specified arguments, defining the model architecture. Instantiating a configuration with the defaults will yield a similar configuration to that of Qwen1.5-MoE-A2.7B" [Qwen/Qwen1.5-MoE-A2.7B"](https://huggingface.co/Qwen/Qwen1.5-MoE-A2.7B"). @@ -35,7 +35,7 @@ class QWen2MoeConfig(PretrainedConfig): Args: vocab_size (`int`, *optional*, defaults to 151936): Vocabulary size of the Qwen2MoE model. Defines the number of different tokens that can be represented by the - `inputs_ids` passed when calling [`QWen2MoeModel`] + `inputs_ids` passed when calling [`QWen2MoEModel`] hidden_size (`int`, *optional*, defaults to 2048): Dimension of the hidden representations. 
intermediate_size (`int`, *optional*, defaults to 5632): @@ -93,13 +93,13 @@ class QWen2MoeConfig(PretrainedConfig): The aux loss factor for the total loss. ```python - >>> from paddlenlp.transformers import QWen2MoeModel, QWen2MoeConfig + >>> from paddlenlp.transformers import QWen2MoEModel, QWen2MoEConfig >>> # Initializing a Qwen2MoE style configuration - >>> configuration = QWen2MoeConfig() + >>> configuration = QWen2MoEConfig() >>> # Initializing a model from the Qwen1.5-MoE-A2.7B" style configuration - >>> model = QWen2MoeModel(configuration) + >>> model = QWen2MoEModel(configuration) >>> # Accessing the model configuration >>> configuration = model.config diff --git a/paddlenlp/transformers/qwen2moe/modeling.py b/paddlenlp/transformers/qwen2moe/modeling.py index 6ddb5f07f847..bcb93ed86ae3 100644 --- a/paddlenlp/transformers/qwen2moe/modeling.py +++ b/paddlenlp/transformers/qwen2moe/modeling.py @@ -1,4 +1,4 @@ -# Copyright (c) 2023 PaddlePaddle Authors. All Rights Reserved. +# Copyright (c) 2024 PaddlePaddle Authors. All Rights Reserved. # Copyright 2024 The Qwen team, Alibaba Group and the HuggingFace Inc. team. All rights reserved. # # Licensed under the Apache License, Version 2.0 (the "License"); @@ -12,7 +12,7 @@ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. -""" Paddle QWen2Moe model.""" +""" Paddle QWen2MoE model.""" from __future__ import annotations import math @@ -52,7 +52,7 @@ from paddlenlp.utils.log import logger from ..activations import ACT2FN -from .configuration import QWen2MoeConfig +from .configuration import QWen2MoEConfig try: from paddle.nn.functional.flash_attention import flash_attention @@ -60,10 +60,10 @@ flash_attention = None __all__ = [ - "QWen2MoeModel", - "QWen2MoePretrainedModel", - "QWen2MoeForCausalLM", - "QWen2MoePretrainingCriterion", + "QWen2MoEModel", + "QWen2MoEPretrainedModel", + "QWen2MoEForCausalLM", + "QWen2MoEPretrainingCriterion", ] @@ -342,7 +342,7 @@ def _expand_2d_mask(mask, dtype, tgt_length): return expanded_mask -class QWen2MoeRMSNorm(nn.Layer): +class QWen2MoERMSNorm(nn.Layer): def __init__(self, config): super().__init__() self.hidden_size = config.hidden_size @@ -373,7 +373,7 @@ def forward(self, hidden_states): return hidden_states * self.weight -class QWen2MoeRotaryEmbedding(nn.Layer): +class QWen2MoERotaryEmbedding(nn.Layer): def __init__(self, dim, max_position_embeddings=2048, base=10000): super().__init__() self.dim = dim @@ -437,7 +437,7 @@ def apply_rotary_pos_emb(q, k, cos, sin, position_ids): `tuple(torch.Tensor)` comprising of the query and key tensors rotated using the Rotary Position Embedding. 
""" if position_ids is None: - # Note: Only for QWen2MoeForCausalLMPipe model pretraining + # Note: Only for QWen2MoEForCausalLMPipe model pretraining cos = cos[:, : q.shape[1], :, :] # [bs, seq_len, 1, dim] sin = sin[:, : q.shape[1], :, :] # [bs, seq_len, 1, dim] else: @@ -450,8 +450,8 @@ def apply_rotary_pos_emb(q, k, cos, sin, position_ids): return q_embed, k_embed -# Modified from transformers.models.mistral.modeling_mistral.MistralMLP with Mistral->QWen2Moe -class QWen2MoeMLP(nn.Layer): +# Modified from transformers.models.mistral.modeling_mistral.MistralMLP with Mistral->QWen2MoE +class QWen2MoEMLP(nn.Layer): def __init__(self, config, is_shared=False): super().__init__() self.hidden_size = config.hidden_size @@ -510,13 +510,13 @@ def repeat_kv(hidden_states: paddle.Tensor, n_rep: int) -> paddle.Tensor: return hidden_states.reshape([batch, slen, num_key_value_heads * n_rep, head_dim]) -class QWen2MoeAttention(nn.Layer): +class QWen2MoEAttention(nn.Layer): """ Multi-headed attention from 'Attention Is All You Need' paper. Modified to use sliding window attention: Longformer and "Generating Long Sequences with Sparse Transformers". """ - def __init__(self, config: QWen2MoeConfig, layerwise_recompute: bool = True): + def __init__(self, config: QWen2MoEConfig, layerwise_recompute: bool = True): super().__init__() self.config = config @@ -584,7 +584,7 @@ def __init__(self, config: QWen2MoeConfig, layerwise_recompute: bool = True): self.v_proj = nn.Linear(self.hidden_size, self.config.num_key_value_heads * self.head_dim, bias_attr=True) self.o_proj = nn.Linear(self.hidden_size, self.hidden_size, bias_attr=False) - self.rotary_emb = QWen2MoeRotaryEmbedding( + self.rotary_emb = QWen2MoERotaryEmbedding( self.head_dim, max_position_embeddings=self.max_position_embeddings, base=self.rope_theta, @@ -707,17 +707,17 @@ def forward( return outputs -class QWen2MoeSparseMoeBlock(nn.Layer): - def __init__(self, config: QWen2MoeConfig): +class QWen2MoESparseMoEBlock(nn.Layer): + def __init__(self, config: QWen2MoEConfig): super().__init__() self.num_experts = config.num_experts self.top_k = config.num_experts_per_tok self.norm_topk_prob = config.norm_topk_prob self.gate = nn.Linear(config.hidden_size, self.num_experts, bias_attr=False) - self.experts = nn.LayerList([QWen2MoeMLP(config) for _ in range(self.num_experts)]) + self.experts = nn.LayerList([QWen2MoEMLP(config) for _ in range(self.num_experts)]) - self.shared_expert = QWen2MoeMLP(config, is_shared=True) + self.shared_expert = QWen2MoEMLP(config, is_shared=True) self.shared_expert_gate = nn.Linear(config.hidden_size, 1, bias_attr=False) def forward(self, hidden_states): @@ -771,21 +771,21 @@ def forward(self, hidden_states): return final_hidden_states, router_logits -class QWen2MoeDecoderLayer(nn.Layer): - def __init__(self, config: QWen2MoeConfig, layerwise_recompute: bool = False): +class QWen2MoEDecoderLayer(nn.Layer): + def __init__(self, config: QWen2MoEConfig, layerwise_recompute: bool = False): super().__init__() self.config = config - self.self_attn = QWen2MoeAttention(config, layerwise_recompute) + self.self_attn = QWen2MoEAttention(config, layerwise_recompute) if config.num_experts > 0: - self.mlp = QWen2MoeSparseMoeBlock(config) + self.mlp = QWen2MoESparseMoEBlock(config) else: # num_experts == 0 or this layer is not sparse layer - self.mlp = QWen2MoeMLP(config) + self.mlp = QWen2MoEMLP(config) - self.input_layernorm = QWen2MoeRMSNorm(config) - self.post_attention_layernorm = QWen2MoeRMSNorm(config) + self.input_layernorm = 
QWen2MoERMSNorm(config) + self.post_attention_layernorm = QWen2MoERMSNorm(config) self.sequence_parallel = config.sequence_parallel # Note that we will actually perform a recompute only if both enable_recompute and layerwise_recompute are set to True @@ -896,13 +896,13 @@ def forward( return outputs -class QWen2MoePretrainedModel(PretrainedModel): - config_class = QWen2MoeConfig +class QWen2MoEPretrainedModel(PretrainedModel): + config_class = QWen2MoEConfig base_model_prefix = "qwen2moe" _keys_to_ignore_on_load_unexpected = [r"self_attn.rotary_emb.inv_freq"] @classmethod - def _get_name_mappings(cls, config: QWen2MoeConfig) -> list[StateDictNameMapping]: + def _get_name_mappings(cls, config: QWen2MoEConfig) -> list[StateDictNameMapping]: mappings: list[StateDictNameMapping] = [] model_mappings = [ ["embed_tokens.weight"], @@ -938,8 +938,8 @@ def _get_name_mappings(cls, config: QWen2MoeConfig) -> list[StateDictNameMapping model_mappings.append([f"layers.{layer_index}.mlp.shared_expert_gate.weight", None, "transpose"]) init_name_mappings(mappings=model_mappings) - # base-model prefix "QWen2MoeModel" - if "QWen2MoeModel" not in config.architectures: + # base-model prefix "QWen2MoEModel" + if "QWen2MoEModel" not in config.architectures: for mapping in model_mappings: mapping[0] = "model." + mapping[0] mapping[1] = "qwen2moe." + mapping[1] @@ -949,7 +949,7 @@ def _get_name_mappings(cls, config: QWen2MoeConfig) -> list[StateDictNameMapping return mappings @classmethod - def _get_tensor_parallel_mappings(cls, config: QWen2MoeConfig, is_split=True): + def _get_tensor_parallel_mappings(cls, config: QWen2MoEConfig, is_split=True): from paddlenlp.transformers.conversion_utils import split_or_merge_func fn = split_or_merge_func( @@ -1032,7 +1032,7 @@ def _init_weights(self, layer): mpu.VocabParallelEmbedding, mpu.ColumnParallelLinear, mpu.RowParallelLinear, - QWen2MoeLMHead, + QWen2MoELMHead, ColumnSequenceParallelLinear, RowSequenceParallelLinear, ), @@ -1067,23 +1067,23 @@ def _init_weights(self, layer): # sublayer is init first # scale RowParallelLinear weight with paddle.no_grad(): - if isinstance(layer, QWen2MoeMLP): + if isinstance(layer, QWen2MoEMLP): factor = 1 / math.sqrt(2 * self.config.num_hidden_layers) layer.down_proj.weight.scale_(factor) - if isinstance(layer, QWen2MoeAttention): + if isinstance(layer, QWen2MoEAttention): factor = 1 / math.sqrt(2 * self.config.num_hidden_layers) layer.o_proj.weight.scale_(factor) @register_base_model -class QWen2MoeModel(QWen2MoePretrainedModel): +class QWen2MoEModel(QWen2MoEPretrainedModel): """ - Transformer decoder consisting of *config.num_hidden_layers* layers. Each layer is a [`QWen2MoeDecoderLayer`] + Transformer decoder consisting of *config.num_hidden_layers* layers. 
Each layer is a [`QWen2MoEDecoderLayer`] Args: - config: QWen2MoeConfig + config: QWen2MoEConfig """ - def __init__(self, config: QWen2MoeConfig): + def __init__(self, config: QWen2MoEConfig): super().__init__(config) self.padding_idx = config.pad_token_id self.vocab_size = config.vocab_size @@ -1108,11 +1108,11 @@ def __init__(self, config: QWen2MoeConfig): self.layers = nn.LayerList( [ - QWen2MoeDecoderLayer(config, layerwise_recompute=layer_idx not in self.no_recompute_layers) + QWen2MoEDecoderLayer(config, layerwise_recompute=layer_idx not in self.no_recompute_layers) for layer_idx in range(config.num_hidden_layers) ] ) - self.norm = QWen2MoeRMSNorm(config) + self.norm = QWen2MoERMSNorm(config) def get_input_embeddings(self): return self.embed_tokens @@ -1334,14 +1334,14 @@ def forward( ) -class QWen2MoePretrainingCriterion(nn.Layer): +class QWen2MoEPretrainingCriterion(nn.Layer): """ Criterion for Mixtral. It calculates the final loss. """ def __init__(self, config): - super(QWen2MoePretrainingCriterion, self).__init__() + super(QWen2MoEPretrainingCriterion, self).__init__() self.ignore_index = getattr(config, "ignore_index", -100) self.config = config self.enable_parallel_cross_entropy = config.tensor_parallel_degree > 1 and config.tensor_parallel_output @@ -1369,9 +1369,9 @@ def forward(self, prediction_scores, masked_lm_labels): return loss -class QWen2MoeLMHead(nn.Layer): - def __init__(self, config: QWen2MoeConfig): - super(QWen2MoeLMHead, self).__init__() +class QWen2MoELMHead(nn.Layer): + def __init__(self, config: QWen2MoEConfig): + super(QWen2MoELMHead, self).__init__() self.config = config if config.tensor_parallel_degree > 1 and config.vocab_size % config.tensor_parallel_degree == 0: vocab_size = config.vocab_size // config.tensor_parallel_degree @@ -1400,7 +1400,7 @@ def forward(self, hidden_states, tensor_parallel_output=None): return logits -class QWen2MoeForCausalLM(QWen2MoePretrainedModel): +class QWen2MoEForCausalLM(QWen2MoEPretrainedModel): enable_to_static_method = True _tied_weights_keys = ["lm_head.weight"] @@ -1408,9 +1408,9 @@ def __init__(self, config): super().__init__(config) self.config = config - self.qwen2moe = QWen2MoeModel(config) - self.lm_head = QWen2MoeLMHead(config) - self.criterion = QWen2MoePretrainingCriterion(config) + self.qwen2moe = QWen2MoEModel(config) + self.lm_head = QWen2MoELMHead(config) + self.criterion = QWen2MoEPretrainingCriterion(config) self.router_aux_loss_coef = config.router_aux_loss_coef self.num_experts = config.num_experts self.num_experts_per_tok = config.num_experts_per_tok diff --git a/paddlenlp/transformers/qwen2moe/tokenizer.py b/paddlenlp/transformers/qwen2moe/tokenizer.py index 5f78eb5457ad..ddc475c2721f 100644 --- a/paddlenlp/transformers/qwen2moe/tokenizer.py +++ b/paddlenlp/transformers/qwen2moe/tokenizer.py @@ -29,7 +29,7 @@ "merges_file": "merges.txt", } -__all__ = ["QWen2MoeTokenizer"] +__all__ = ["QWen2MoETokenizer"] MAX_MODEL_INPUT_SIZES = {"qwen/qwen-tokenizer": 32768} @@ -75,7 +75,7 @@ def get_pairs(word): return pairs -class QWen2MoeTokenizer(PretrainedTokenizer): +class QWen2MoETokenizer(PretrainedTokenizer): """ Construct a Qwen2 tokenizer. Based on byte-level Byte-Pair-Encoding. 
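A quick sanity check of the renamed public API (a sketch, not part of the patch; it assumes the "qwen/qwen1.5-moe-a2.7b" weights resolve to these classes through the auto mappings updated above):

```python
# Sketch only: exercises the renamed QWen2MoE* classes end to end.
import paddle
from paddlenlp.transformers import QWen2MoEForCausalLM, QWen2MoETokenizer

tokenizer = QWen2MoETokenizer.from_pretrained("qwen/qwen1.5-moe-a2.7b")
model = QWen2MoEForCausalLM.from_pretrained("qwen/qwen1.5-moe-a2.7b", dtype="bfloat16")
model.eval()

inputs = tokenizer("Write one sentence about mixture-of-experts models.", return_tensors="pd")
with paddle.no_grad():
    output_ids, _ = model.generate(input_ids=inputs["input_ids"], max_length=64)
print(tokenizer.decode(output_ids[0], skip_special_tokens=True))
```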
diff --git a/tests/transformers/qwen2moe/test_modeling.py b/tests/transformers/qwen2moe/test_modeling.py index 120e9c4bd30c..8f74f3f92743 100644 --- a/tests/transformers/qwen2moe/test_modeling.py +++ b/tests/transformers/qwen2moe/test_modeling.py @@ -18,7 +18,7 @@ import paddle -from paddlenlp.transformers import QWen2MoeConfig, QWen2MoeForCausalLM, QWen2MoeModel +from paddlenlp.transformers import QWen2MoEConfig, QWen2MoEForCausalLM, QWen2MoEModel from tests.transformers.test_configuration_common import ConfigTester from tests.transformers.test_generation_utils import GenerationTesterMixin from tests.transformers.test_modeling_common import ( @@ -28,7 +28,7 @@ ) -class QWen2MoeModelTester: +class QWen2MoEModelTester: def __init__( self, parent, @@ -63,7 +63,7 @@ def __init__( use_labels: bool = False, return_dict=False, ): - self.parent: QWen2MoeModelTest = parent + self.parent: QWen2MoEModelTest = parent self.vocab_size = vocab_size self.hidden_size = hidden_size self.num_hidden_layers = num_hidden_layers @@ -115,8 +115,8 @@ def prepare_config_and_inputs(self): config = self.get_config() return config, input_ids, input_mask, sequence_labels, token_labels, choice_labels - def get_config(self) -> QWen2MoeConfig: - return QWen2MoeConfig( + def get_config(self) -> QWen2MoEConfig: + return QWen2MoEConfig( vocab_size=self.vocab_size, hidden_size=self.hidden_size, num_hidden_layers=self.num_hidden_layers, @@ -139,17 +139,17 @@ def get_config(self) -> QWen2MoeConfig: ) def create_and_check_model( - self, config: QWen2MoeConfig, input_ids, input_mask, sequence_labels, token_labels, choice_labels + self, config: QWen2MoEConfig, input_ids, input_mask, sequence_labels, token_labels, choice_labels ): - model = QWen2MoeModel(config) + model = QWen2MoEModel(config) model.eval() result = model(input_ids) self.parent.assertEqual(result[0].shape, [self.batch_size, self.seq_length, self.hidden_size]) def create_and_check_model_attention_mask( - self, config: QWen2MoeConfig, input_ids, input_mask, sequence_labels, token_labels, choice_labels + self, config: QWen2MoEConfig, input_ids, input_mask, sequence_labels, token_labels, choice_labels ): - model = QWen2MoeModel(config) + model = QWen2MoEModel(config) model.eval() attn_mask_2d = random_attention_mask([self.batch_size, self.seq_length]) result_2d = model(input_ids, attention_mask=attn_mask_2d)[0] @@ -167,14 +167,14 @@ def create_and_check_model_attention_mask( def create_and_check_model_past_large_inputs( self, - config: QWen2MoeConfig, + config: QWen2MoEConfig, input_ids, input_mask, sequence_labels, token_labels, choice_labels, ): - model = QWen2MoeModel(config) + model = QWen2MoEModel(config) model.eval() # first forward pass @@ -229,7 +229,7 @@ def prepare_config_and_inputs_for_common(self): return config, inputs_dict def create_and_check_lm_head_model(self, config, input_ids, input_mask, *args): - model = QWen2MoeForCausalLM(config) + model = QWen2MoEForCausalLM(config) model.eval() result = model( @@ -245,7 +245,7 @@ def create_and_check_lm_head_model(self, config, input_ids, input_mask, *args): self.parent.assertEqual(result[0].shape, [self.batch_size, self.seq_length, self.vocab_size]) def check_model_position_ids(self, config, input_ids, input_mask, *args): - model = QWen2MoeForCausalLM(config) + model = QWen2MoEForCausalLM(config) model.eval() result_no_position_id = model( @@ -267,20 +267,20 @@ def check_model_position_ids(self, config, input_ids, input_mask, *args): self.parent.assertTrue((result_position_id[0] == 
result_no_position_id[0]).all()) -class QWen2MoeModelTest(ModelTesterMixin, GenerationTesterMixin, unittest.TestCase): - base_model_class = QWen2MoeModel +class QWen2MoEModelTest(ModelTesterMixin, GenerationTesterMixin, unittest.TestCase): + base_model_class = QWen2MoEModel return_dict = False use_labels = False use_test_model_name_list = False - all_model_classes = (QWen2MoeModel, QWen2MoeForCausalLM) - all_generative_model_classes = {QWen2MoeForCausalLM: (QWen2MoeModel, "qwen2moe")} + all_model_classes = (QWen2MoEModel, QWen2MoEForCausalLM) + all_generative_model_classes = {QWen2MoEForCausalLM: (QWen2MoEModel, "qwen2moe")} def setUp(self): super().setUp() - self.model_tester = QWen2MoeModelTester(self) - self.config_tester = ConfigTester(self, config_class=QWen2MoeConfig, vocab_size=256, hidden_size=24) + self.model_tester = QWen2MoEModelTester(self) + self.config_tester = ConfigTester(self, config_class=QWen2MoEConfig, vocab_size=256, hidden_size=24) def _get_input_ids_and_config(self): config, inputs_dict = self.model_tester.prepare_config_and_inputs_for_common() @@ -315,7 +315,3 @@ def test_generate_without_input_ids(self): def test_qwen2moe_lm_head_model(self): config_and_inputs = self.model_tester.prepare_config_and_inputs() self.model_tester.create_and_check_lm_head_model(*config_and_inputs) - - -if __name__ == "__main__": - unittest.main() From 87f02765be4ff535a69cf2f7dcd4f939db5ab846 Mon Sep 17 00:00:00 2001 From: DrownFish19 Date: Thu, 30 May 2024 08:40:44 +0000 Subject: [PATCH 27/41] update comment --- paddlenlp/transformers/qwen2moe/modeling.py | 20 -------------------- 1 file changed, 20 deletions(-) diff --git a/paddlenlp/transformers/qwen2moe/modeling.py b/paddlenlp/transformers/qwen2moe/modeling.py index bcb93ed86ae3..ea920a3a241a 100644 --- a/paddlenlp/transformers/qwen2moe/modeling.py +++ b/paddlenlp/transformers/qwen2moe/modeling.py @@ -416,26 +416,6 @@ def rotate_half(x): def apply_rotary_pos_emb(q, k, cos, sin, position_ids): - """Applies Rotary Position Embedding to the query and key tensors. - - Args: - q (`torch.Tensor`): The query tensor. - k (`torch.Tensor`): The key tensor. - cos (`torch.Tensor`): The cosine part of the rotary embedding. - sin (`torch.Tensor`): The sine part of the rotary embedding. - position_ids (`torch.Tensor`): - The position indices of the tokens corresponding to the query and key tensors. For example, this can be - used to pass offsetted position ids when working with a KV-cache. - unsqueeze_dim (`int`, *optional*, defaults to 1): - The 'unsqueeze_dim' argument specifies the dimension along which to unsqueeze cos[position_ids] and - sin[position_ids] so that they can be properly broadcasted to the dimensions of q and k. For example, note - that cos[position_ids] and sin[position_ids] have the shape [batch_size, seq_len, head_dim]. Then, if q and - k have the shape [batch_size, heads, seq_len, head_dim], then setting unsqueeze_dim=1 makes - cos[position_ids] and sin[position_ids] broadcastable to the shapes of q and k. Similarly, if q and k have - the shape [batch_size, seq_len, heads, head_dim], then set unsqueeze_dim=2. - Returns: - `tuple(torch.Tensor)` comprising of the query and key tensors rotated using the Rotary Position Embedding. 
- """ if position_ids is None: # Note: Only for QWen2MoEForCausalLMPipe model pretraining cos = cos[:, : q.shape[1], :, :] # [bs, seq_len, 1, dim] From 8d9970b3df8ba2a512ea3595f3828806d7760aa3 Mon Sep 17 00:00:00 2001 From: DrownFish19 Date: Fri, 31 May 2024 07:33:42 +0000 Subject: [PATCH 28/41] update Copyright --- tests/transformers/qwen2moe/__init__.py | 2 +- tests/transformers/qwen2moe/test_modeling.py | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/tests/transformers/qwen2moe/__init__.py b/tests/transformers/qwen2moe/__init__.py index 595add0aed9e..fd05a9208165 100644 --- a/tests/transformers/qwen2moe/__init__.py +++ b/tests/transformers/qwen2moe/__init__.py @@ -1,4 +1,4 @@ -# Copyright (c) 2023 PaddlePaddle Authors. All Rights Reserved. +# Copyright (c) 2024 PaddlePaddle Authors. All Rights Reserved. # # Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License. diff --git a/tests/transformers/qwen2moe/test_modeling.py b/tests/transformers/qwen2moe/test_modeling.py index 8f74f3f92743..850cefc25011 100644 --- a/tests/transformers/qwen2moe/test_modeling.py +++ b/tests/transformers/qwen2moe/test_modeling.py @@ -1,4 +1,4 @@ -# Copyright (c) 2023 PaddlePaddle Authors. All Rights Reserved. +# Copyright (c) 2024 PaddlePaddle Authors. All Rights Reserved. # Copyright 2020 The HuggingFace Team. All rights reserved. # # Licensed under the Apache License, Version 2.0 (the "License"); From d57a5b10ec7c4d83afbd174cf287ea48c0e66812 Mon Sep 17 00:00:00 2001 From: DrownFish19 Date: Mon, 3 Jun 2024 07:54:58 +0000 Subject: [PATCH 29/41] update readme and json --- llm/qwen/README.md | 8 +++++++- .../lora_argument_qwen1.5moe.json} | 2 +- .../sft_argument_qwen1.5moe.json} | 2 +- 3 files changed, 9 insertions(+), 3 deletions(-) rename llm/{qwen2moe/lora_argument.json => qwen/lora_argument_qwen1.5moe.json} (94%) rename llm/{qwen2moe/sft_argument.json => qwen/sft_argument_qwen1.5moe.json} (94%) diff --git a/llm/qwen/README.md b/llm/qwen/README.md index d923a4ac3677..97dc43549375 100644 --- a/llm/qwen/README.md +++ b/llm/qwen/README.md @@ -13,7 +13,13 @@ | qwen/qwen-14b-chat| | qwen/qwen-72b | | qwen/qwen-72b-chat| -| qwen/qwen1.5-moe-a2.7b| + +[通义千问(Qwen1.5-MoE)](https://qwenlm.github.io/blog/qwen-moe/) 是阿里云研发的通义千问MoE模型。Qwen1.5-MoE基于Transformer架构,采用了专家混合(MoE)架构,这些模型通过密集型语言模型升级改造而来。例如,Qwen1.5-MoE-A2.7B就是从Qwen-1.8B升级改造而来的。它总共有143亿个参数,但在运行时仅激活27亿个参数,却实现了与Qwen1.5-7B相近的性能,而训练资源仅为其25%。 + +**支持模型权重:** +| Model (qwen-1.5) | +|------------------------| +| qwen/qwen1.5-moe-a2.7b | ## 2. 
模型精调 请参考[LLM全流程工具介绍](../README.md) diff --git a/llm/qwen2moe/lora_argument.json b/llm/qwen/lora_argument_qwen1.5moe.json similarity index 94% rename from llm/qwen2moe/lora_argument.json rename to llm/qwen/lora_argument_qwen1.5moe.json index 9e5e10600ccd..c511e578e56b 100644 --- a/llm/qwen2moe/lora_argument.json +++ b/llm/qwen/lora_argument_qwen1.5moe.json @@ -1,5 +1,5 @@ { - "model_name_or_path": "qwen/Qwen1.5-MoE-A2.7B", + "model_name_or_path": "qwen/qwen1.5-moe-a2.7b", "dataset_name_or_path": "./data", "output_dir": "./checkpoints/qwen2moe_lora_ckpts", "per_device_train_batch_size": 4, diff --git a/llm/qwen2moe/sft_argument.json b/llm/qwen/sft_argument_qwen1.5moe.json similarity index 94% rename from llm/qwen2moe/sft_argument.json rename to llm/qwen/sft_argument_qwen1.5moe.json index 9a6a5a8388c4..d3d048f423d3 100644 --- a/llm/qwen2moe/sft_argument.json +++ b/llm/qwen/sft_argument_qwen1.5moe.json @@ -1,5 +1,5 @@ { - "model_name_or_path": "qwen/Qwen1.5-MoE-A2.7B", + "model_name_or_path": "qwen/qwen1.5-moe-a2.7b", "dataset_name_or_path": "./data", "output_dir": "./checkpoints/qwen2moe_sft_ckpts", "per_device_train_batch_size": 4, From bfb65a16e89eeea0376cc403d60f3979c55703de Mon Sep 17 00:00:00 2001 From: DrownFish19 Date: Mon, 3 Jun 2024 08:00:27 +0000 Subject: [PATCH 30/41] update __init__.py --- paddlenlp/transformers/__init__.py | 4 +--- paddlenlp/transformers/qwen2moe/__init__.py | 6 +++--- 2 files changed, 4 insertions(+), 6 deletions(-) diff --git a/paddlenlp/transformers/__init__.py b/paddlenlp/transformers/__init__.py index b3bc4e66d3a1..f9a996093b67 100644 --- a/paddlenlp/transformers/__init__.py +++ b/paddlenlp/transformers/__init__.py @@ -294,9 +294,7 @@ from .deberta_v2.modeling import * from .deberta_v2.tokenizer import * from .deberta_v2.configuration import * -from .qwen2moe.modeling import * -from .qwen2moe.configuration import * -from .qwen2moe.tokenizer import * +from .qwen2moe import * # For faster tokenizer from ..utils.import_utils import is_fast_tokenizer_available diff --git a/paddlenlp/transformers/qwen2moe/__init__.py b/paddlenlp/transformers/qwen2moe/__init__.py index 92f7ec32de90..e64672df2941 100644 --- a/paddlenlp/transformers/qwen2moe/__init__.py +++ b/paddlenlp/transformers/qwen2moe/__init__.py @@ -12,6 +12,6 @@ # See the License for the specific language governing permissions and # limitations under the License. 
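The switch to wildcard imports below relies on each submodule defining `__all__`; a small sketch (not part of the patch) of what ends up re-exported:

```python
# Sketch: with `from .modeling import *` etc., only names listed in each
# submodule's __all__ are re-exported, so the public surface matches the
# previous explicit imports plus the remaining __all__ entries.
import paddlenlp.transformers.qwen2moe as qwen2moe

print(sorted(name for name in dir(qwen2moe) if name.startswith("QWen2MoE")))
# Expected, given the __all__ lists in this patch:
# ['QWen2MoEConfig', 'QWen2MoEForCausalLM', 'QWen2MoEModel',
#  'QWen2MoEPretrainedModel', 'QWen2MoEPretrainingCriterion', 'QWen2MoETokenizer']
```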
-from .configuration import QWen2MoEConfig -from .modeling import QWen2MoEForCausalLM -from .tokenizer import QWen2MoETokenizer +from .configuration import * +from .modeling import * +from .tokenizer import * From 4b96dd078c85241b41d21df365915525abff4ed6 Mon Sep 17 00:00:00 2001 From: DrownFish19 Date: Tue, 4 Jun 2024 08:07:53 +0000 Subject: [PATCH 31/41] add qwen-1.5 --- paddlenlp/transformers/qwen2/__init__.py | 18 + paddlenlp/transformers/qwen2/configuration.py | 164 ++ paddlenlp/transformers/qwen2/modeling.py | 1521 +++++++++++++++++ paddlenlp/transformers/qwen2/modeling_pp.py | 232 +++ .../{qwen2moe => qwen2}/tokenizer.py | 0 paddlenlp/transformers/qwen2moe/modeling.py | 8 +- 6 files changed, 1939 insertions(+), 4 deletions(-) create mode 100644 paddlenlp/transformers/qwen2/__init__.py create mode 100644 paddlenlp/transformers/qwen2/configuration.py create mode 100644 paddlenlp/transformers/qwen2/modeling.py create mode 100644 paddlenlp/transformers/qwen2/modeling_pp.py rename paddlenlp/transformers/{qwen2moe => qwen2}/tokenizer.py (100%) diff --git a/paddlenlp/transformers/qwen2/__init__.py b/paddlenlp/transformers/qwen2/__init__.py new file mode 100644 index 000000000000..6e9485ea762a --- /dev/null +++ b/paddlenlp/transformers/qwen2/__init__.py @@ -0,0 +1,18 @@ +# Copyright (c) 2024 PaddlePaddle Authors. All Rights Reserved. +# Copyright 2024 The Qwen Team and The HuggingFace Inc. team. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +from .configuration import * +from .modeling import * +from .tokenizer import * diff --git a/paddlenlp/transformers/qwen2/configuration.py b/paddlenlp/transformers/qwen2/configuration.py new file mode 100644 index 000000000000..5330d81df4bf --- /dev/null +++ b/paddlenlp/transformers/qwen2/configuration.py @@ -0,0 +1,164 @@ +# Copyright (c) 2024 PaddlePaddle Authors. All Rights Reserved. +# Copyright 2024 The Qwen team, Alibaba Group and the HuggingFace Inc. team. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +"""Qwen2 model configuration""" + +from ..configuration_utils import PretrainedConfig + +__all__ = [ + "QWen2Config", +] + + +class QWen2Config(PretrainedConfig): + r""" + This is the configuration class to store the configuration of a [`Qwen2Model`]. It is used to instantiate a + Qwen2 model according to the specified arguments, defining the model architecture. 
Instantiating a configuration + with the defaults will yield a similar configuration to that of + Qwen2-7B-beta [Qwen/Qwen2-7B-beta](https://huggingface.co/Qwen/Qwen2-7B-beta). + + Configuration objects inherit from [`PretrainedConfig`] and can be used to control the model outputs. Read the + documentation from [`PretrainedConfig`] for more information. + + + Args: + vocab_size (`int`, *optional*, defaults to 151936): + Vocabulary size of the Qwen2 model. Defines the number of different tokens that can be represented by the + `inputs_ids` passed when calling [`Qwen2Model`] + hidden_size (`int`, *optional*, defaults to 4096): + Dimension of the hidden representations. + intermediate_size (`int`, *optional*, defaults to 22016): + Dimension of the MLP representations. + num_hidden_layers (`int`, *optional*, defaults to 32): + Number of hidden layers in the Transformer encoder. + num_attention_heads (`int`, *optional*, defaults to 32): + Number of attention heads for each attention layer in the Transformer encoder. + num_key_value_heads (`int`, *optional*, defaults to 32): + This is the number of key_value heads that should be used to implement Grouped Query Attention. If + `num_key_value_heads=num_attention_heads`, the model will use Multi Head Attention (MHA), if + `num_key_value_heads=1 the model will use Multi Query Attention (MQA) otherwise GQA is used. When + converting a multi-head checkpoint to a GQA checkpoint, each group key and value head should be constructed + by meanpooling all the original heads within that group. For more details checkout [this + paper](https://arxiv.org/pdf/2305.13245.pdf). If it is not specified, will default to `32`. + hidden_act (`str` or `function`, *optional*, defaults to `"silu"`): + The non-linear activation function (function or string) in the decoder. + max_position_embeddings (`int`, *optional*, defaults to 32768): + The maximum sequence length that this model might ever be used with. + initializer_range (`float`, *optional*, defaults to 0.02): + The standard deviation of the truncated_normal_initializer for initializing all weight matrices. + rms_norm_eps (`float`, *optional*, defaults to 1e-06): + The epsilon used by the rms normalization layers. + use_cache (`bool`, *optional*, defaults to `True`): + Whether or not the model should return the last key/values attentions (not used by all models). Only + relevant if `config.is_decoder=True`. + tie_word_embeddings (`bool`, *optional*, defaults to `False`): + Whether the model's input and output word embeddings should be tied. + rope_theta (`float`, *optional*, defaults to 10000.0): + The base period of the RoPE embeddings. + use_sliding_window (`bool`, *optional*, defaults to `False`): + Whether to use sliding window attention. + sliding_window (`int`, *optional*, defaults to 4096): + Sliding window attention (SWA) window size. If not specified, will default to `4096`. + max_window_layers (`int`, *optional*, defaults to 28): + The number of layers that use SWA (Sliding Window Attention). The bottom layers use SWA while the top use full attention. + attention_dropout (`float`, *optional*, defaults to 0.0): + The dropout ratio for the attention probabilities. 
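In addition to the upstream Qwen2 fields documented in the argument list above, the constructor adds PaddleNLP-specific switches for recompute, parallelism, and flash attention; a hedged sketch with illustrative values (it assumes QWen2Config is exported from paddlenlp.transformers like the other architectures in this patch):

```python
# Sketch with illustrative values: PaddleNLP-specific switches layered on top
# of the upstream Qwen2 fields.
from paddlenlp.transformers import QWen2Config

config = QWen2Config(
    vocab_size=151936,
    hidden_size=4096,
    num_hidden_layers=32,
    num_attention_heads=32,
    num_key_value_heads=32,
    use_flash_attention=True,      # route attention through the flash kernel when available
    use_recompute=True,            # trade compute for memory during training
    recompute_granularity="full",  # "full", "full_attn" or "core_attn", as checked in modeling.py
    sequence_parallel=False,
    tensor_parallel_output=True,   # keep logits sharded when tensor parallelism is enabled
)
```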
+ + ```python + >>> from transformers import Qwen2Model, Qwen2Config + + >>> # Initializing a Qwen2 style configuration + >>> configuration = Qwen2Config() + + >>> # Initializing a model from the Qwen2-7B style configuration + >>> model = Qwen2Model(configuration) + + >>> # Accessing the model configuration + >>> configuration = model.config + ```""" + + model_type = "qwen2" + keys_to_ignore_at_inference = ["past_key_values"] + + def __init__( + self, + vocab_size=151936, + hidden_size=4096, + intermediate_size=22016, + num_hidden_layers=32, + num_attention_heads=32, + num_key_value_heads=32, + hidden_act="silu", + max_position_embeddings=32768, + seq_length=32768, + initializer_range=0.02, + rms_norm_eps=1e-6, + use_cache=True, + use_recompute=False, + recompute_granularity="full", + no_recompute_layers=None, + use_flash_attention=False, + tie_word_embeddings=False, + rope_theta=10000.0, + tensor_parallel_output=True, + sequence_parallel=False, + fuse_sequence_parallel_allreduce=False, + pad_token_id=0, + bos_token_id=151643, + eos_token_id=151643, + use_sliding_window=False, + sliding_window=4096, + max_window_layers=28, + attention_dropout=0.0, + **kwargs, + ): + self.vocab_size = vocab_size + self.max_position_embeddings = max_position_embeddings + self.seq_length = seq_length + self.hidden_size = hidden_size + self.intermediate_size = intermediate_size + self.num_hidden_layers = num_hidden_layers + self.num_attention_heads = num_attention_heads + self.use_sliding_window = use_sliding_window + self.sliding_window = sliding_window + self.max_window_layers = max_window_layers + + # for backward compatibility + if num_key_value_heads is None: + num_key_value_heads = num_attention_heads + + self.num_key_value_heads = num_key_value_heads + self.hidden_act = hidden_act + self.initializer_range = initializer_range + self.rms_norm_eps = rms_norm_eps + self.use_cache = use_cache + self.rope_theta = rope_theta + self.attention_dropout = attention_dropout + + self.use_recompute = use_recompute + self.recompute_granularity = recompute_granularity + self.no_recompute_layers = no_recompute_layers + self.use_flash_attention = use_flash_attention + self.tensor_parallel_output = tensor_parallel_output + self.sequence_parallel = sequence_parallel + self.fuse_sequence_parallel_allreduce = fuse_sequence_parallel_allreduce + + self.pad_token_id = pad_token_id + self.bos_token_id = bos_token_id + self.eos_token_id = eos_token_id + + super().__init__( + tie_word_embeddings=tie_word_embeddings, + **kwargs, + ) diff --git a/paddlenlp/transformers/qwen2/modeling.py b/paddlenlp/transformers/qwen2/modeling.py new file mode 100644 index 000000000000..7f3c2fbc53c4 --- /dev/null +++ b/paddlenlp/transformers/qwen2/modeling.py @@ -0,0 +1,1521 @@ +# Copyright (c) 2024 PaddlePaddle Authors. All Rights Reserved. +# Copyright 2024 The Qwen team, Alibaba Group and the HuggingFace Inc. team. All rights reserved. +# +# This code is based on EleutherAI's GPT-NeoX library and the GPT-NeoX +# and OPT implementations in this library. It has been modified from its +# original forms to accommodate minor architectural differences compared +# to GPT-NeoX and OPT used by the Meta AI team that trained the model. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. 
+# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +"""Paddle QWen2 model.""" + +import math +import warnings +from functools import partial +from typing import List, Optional, Tuple, Union + +import paddle +import paddle.distributed.fleet.meta_parallel as mpu +import paddle.nn.functional as F +from paddle import Tensor, nn +from paddle.distributed import fleet +from paddle.distributed.fleet.meta_parallel import get_rng_state_tracker +from paddle.distributed.fleet.utils import recompute +from paddle.distributed.fleet.utils.sequence_parallel_utils import ( + ColumnSequenceParallelLinear, + GatherOp, + RowSequenceParallelLinear, + ScatterOp, + mark_as_sequence_parallel_parameter, +) + +from ..activations import ACT2FN +from ..conversion_utils import StateDictNameMapping, init_name_mappings +from ..model_outputs import ( + BaseModelOutputWithPast, + CausalLMOutputWithPast, + SequenceClassifierOutputWithPast, + TokenClassifierOutput, +) +from ..model_utils import PretrainedModel, register_base_model +from .configuration import QWen2Config + +try: + from paddle.incubate.nn.functional import fused_rotary_position_embedding +except ImportError: + fused_rotary_position_embedding = None + +try: + from paddle.nn.functional.flash_attention import flash_attention +except: + flash_attention = None + + +__all__ = [ + "QWen2Model", + "QWen2PretrainedModel", + "QWen2ForCausalLM", + "QWen2PretrainingCriterion", +] + + +def get_triangle_upper_mask(x, mask=None): + if mask is not None: + return mask + # [bsz, n_head, q_len, kv_seq_len] + shape = x.shape + # [bsz, 1, q_len, kv_seq_len] + shape[1] = 1 + mask = paddle.full(shape, paddle.finfo(x.dtype).min, dtype=x.dtype) + mask = paddle.triu(mask, diagonal=1) + mask.stop_gradient = True + return mask + + +def assign_kv_heads(num_kv_heads: int, num_gpus: int): + # Initialize the assignment list + """ + Assign kv heads to different GPUs in the Tensor Parallel Setup + + Examples: + assign_kv_heads(num_kv_heads=1, num_gpus=2): [[0], [0]] + assign_kv_heads(num_kv_heads=2, num_gpus=2): [[0], [1]] + assign_kv_heads(num_kv_heads=4, num_gpus=2): [[0,1], [2,3]] + assign_kv_heads(num_kv_heads=1, num_gpus=4): [[0],[0],[0],[0]] + assign_kv_heads(num_kv_heads=2, num_gpus=4): [[0],[0],[1],[1]] + assign_kv_heads(num_kv_heads=4, num_gpus=4): [[0],[1],[2],[3]] + """ + assignment_list = [[] for _ in range(num_gpus)] + # Case 1: more heads than cards + if num_kv_heads > num_gpus: + num_heads_per_card = num_kv_heads // num_gpus + for i in range(num_gpus): + for j in range(num_heads_per_card): + assignment_list[i].append(i * num_heads_per_card + j) + # Case 2: more cards than heads. each card get only 1 head. 
+ else: + num_card_per_heads = num_gpus // num_kv_heads + for i in range(num_kv_heads): + for j in range(num_card_per_heads): + assignment_list[i * num_card_per_heads + j].append(i) + return assignment_list + + +def parallel_matmul(x: Tensor, y: Tensor, tensor_parallel_output=True): + is_fleet_init = True + tensor_parallel_degree = 1 + try: + hcg = fleet.get_hybrid_communicate_group() + model_parallel_group = hcg.get_model_parallel_group() + tensor_parallel_degree = hcg.get_model_parallel_world_size() + except: + is_fleet_init = False + + if paddle.in_dynamic_mode(): + y_is_distributed = y.is_distributed + else: + y_is_distributed = tensor_parallel_degree > 1 + + if is_fleet_init and tensor_parallel_degree > 1 and y_is_distributed: + # if not running under distributed.launch, it will raise AttributeError: 'Fleet' object has no attribute '_hcg' + input_parallel = paddle.distributed.collective._c_identity(x, group=model_parallel_group) + logits = paddle.matmul(input_parallel, y, transpose_y=False) + + if tensor_parallel_output: + return logits + + return paddle.distributed.collective._c_concat(logits, group=model_parallel_group) + + else: + logits = paddle.matmul(x, y, transpose_y=False) + return logits + + +def scaled_dot_product_attention( + query_states, + config, + key_states, + value_states, + attention_mask, + output_attentions, + training=True, + sequence_parallel=False, +): + bsz, q_len, num_heads, head_dim = query_states.shape + _, kv_seq_len, _, _ = value_states.shape + + if config.use_flash_attention and flash_attention: + # Paddle Flash Attention input [ bz, seqlen, nhead, head_dim] + # Torch Flash Attention input [ bz, nhead, seqlen, head_dim] + + version = paddle.version.full_version + if version != "0.0.0" and version <= "2.5.2": + attn_output, attn_weights = flash_attention( + query_states, + key_states, + value_states, + causal=True, + return_softmax=output_attentions, + ) + else: + attn_output = F.scaled_dot_product_attention( + query_states, + key_states, + value_states, + attn_mask=attention_mask, + is_causal=attention_mask is None, + dropout_p=config.attention_dropout if training else 0.0, + training=training, + ) + attn_weights = None + + if sequence_parallel: + attn_output = attn_output.reshape([bsz * q_len, head_dim * num_heads]) + else: + attn_output = attn_output.reshape([bsz, q_len, head_dim * num_heads]) + return (attn_output, attn_weights) if output_attentions else attn_output + else: + # [ bz, seqlen, nhead, head_dim] -> [bs, nhead, seq_len, head_dim] + query_states = paddle.transpose(query_states, [0, 2, 1, 3]) + # merge with the next tranpose + key_states = paddle.transpose(key_states, [0, 2, 1, 3]) + value_states = paddle.transpose(value_states, [0, 2, 1, 3]) + + # matmul and devide by sqrt(head_dim) + attn_weights = paddle.matmul(query_states / math.sqrt(head_dim), key_states.transpose([0, 1, 3, 2])) + + if attn_weights.shape != [bsz, num_heads, q_len, kv_seq_len]: + raise ValueError( + f"Attention weights should be of shape {(bsz, num_heads, q_len, kv_seq_len)}, but is" + f" {attn_weights.shape}" + ) + + if attention_mask is None: + attention_mask = get_triangle_upper_mask(attn_weights) + attention_mask = attention_mask.reshape([bsz, 1, q_len, kv_seq_len]) + if attention_mask.shape != [bsz, 1, q_len, kv_seq_len]: + raise ValueError( + f"Attention mask should be of shape {(bsz, 1, q_len, kv_seq_len)}, but is {attention_mask.shape}" + ) + + attn_weights = attn_weights + attention_mask + if not paddle.in_dynamic_mode(): + attn_weights = F.softmax(attn_weights, 
axis=-1, dtype="float32").astype(query_states.dtype) + else: + with paddle.amp.auto_cast(False): + attn_weights = F.softmax(attn_weights, axis=-1, dtype="float32").astype(query_states.dtype) + + attn_weights = F.dropout(attn_weights, p=config.attention_dropout, training=training) + + attn_output = paddle.matmul(attn_weights, value_states) + attn_output = attn_output.transpose([0, 2, 1, 3]) + + if sequence_parallel: + attn_output = attn_output.reshape([bsz * q_len, head_dim * num_heads]) + else: + attn_output = attn_output.reshape([bsz, q_len, head_dim * num_heads]) + return (attn_output, attn_weights) if output_attentions else attn_output + + +def masked_fill(x, mask, value): + y = paddle.full(x.shape, value, x.dtype) + return paddle.where(mask, y, x) + + +def is_casual_mask(attention_mask): + """ + Upper triangular of attention_mask equals to attention_mask is casual + """ + return (paddle.triu(attention_mask) == attention_mask).all().item() + + +def _make_causal_mask(input_ids_shape, past_key_values_length): + """ + Make causal mask used for self-attention + """ + batch_size, target_length = input_ids_shape # target_length: seq_len + + mask = paddle.tril(paddle.ones((target_length, target_length), dtype="bool")) + + if past_key_values_length > 0: + # [tgt_len, tgt_len + past_len] + mask = paddle.concat([paddle.ones([target_length, past_key_values_length], dtype="bool"), mask], axis=-1) + + # [bs, 1, tgt_len, tgt_len + past_len] + return mask[None, None, :, :].expand([batch_size, 1, target_length, target_length + past_key_values_length]) + + +def _expand_2d_mask(mask, dtype, tgt_length): + """ + Expands attention_mask from `[batch_size, src_length]` to `[batch_size, 1, tgt_length, src_length]`. + """ + batch_size, src_length = mask.shape[0], mask.shape[-1] + tgt_length = tgt_length if tgt_length is not None else src_length + + mask = mask[:, None, None, :].astype("bool") + mask.stop_gradient = True + expanded_mask = mask.expand([batch_size, 1, tgt_length, src_length]) + + return expanded_mask + + +class QWen2RMSNorm(nn.Layer): + def __init__(self, config: QWen2Config): + """ + QWen2RMSNorm is equivalent to T5LayerNorm + """ + super().__init__() + self.hidden_size = config.hidden_size + self.weight = paddle.create_parameter( + shape=[self.hidden_size], + dtype=paddle.get_default_dtype(), + default_initializer=nn.initializer.Constant(1.0), + ) + self.variance_epsilon = config.rms_norm_eps + self.config = config + + if config.sequence_parallel: + mark_as_sequence_parallel_parameter(self.weight) + + def forward(self, hidden_states): + if paddle.in_dynamic_mode(): + with paddle.amp.auto_cast(False): + hidden_states = hidden_states.astype("float32") + variance = hidden_states.pow(2).mean(-1, keepdim=True) + hidden_states = paddle.rsqrt(variance + self.variance_epsilon) * hidden_states + else: + hidden_states = hidden_states.astype("float32") + variance = hidden_states.pow(2).mean(-1, keepdim=True) + hidden_states = paddle.rsqrt(variance + self.variance_epsilon) * hidden_states + + if self.weight.dtype in [paddle.float16, paddle.bfloat16]: + hidden_states = paddle.cast(hidden_states, self.weight.dtype) + return hidden_states * self.weight + + +class QWen2RotaryEmbedding(nn.Layer): + def __init__(self, dim, max_position_embeddings=2048, base=10000): + super().__init__() + self.dim = dim + self.max_position_embeddings = max_position_embeddings + self.base = base + # [dim / 2] + self.inv_freq = 1.0 / (self.base ** (paddle.cast(paddle.arange(0, self.dim, 2), dtype="float32") / self.dim)) + 
self._set_cos_sin_cache(seq_len=max_position_embeddings) + + def _set_cos_sin_cache(self, seq_len): + self.max_seq_len_cached = seq_len + # [seq_len] + t = paddle.arange(seq_len, dtype="float32") + # [seq_len, dim/2] + freqs = paddle.einsum("i,j->ij", t, self.inv_freq) + # Different from paper, but it uses a different permutation in order to obtain the same calculation + # [seq_len, dim] + emb = paddle.concat([freqs, freqs], axis=-1) + # [1, seqlen, 1, dim] + self.cos_cached = emb.cos()[None, :, None, :] + self.sin_cached = emb.sin()[None, :, None, :] + + def forward(self, x, seq_len=None): + # x: [bs, num_attention_heads, seq_len, head_size] + if seq_len > self.max_seq_len_cached: + self._set_cos_sin_cache(seq_len) + cos = self.cos_cached[:, :seq_len, :, :] + sin = self.sin_cached[:, :seq_len, :, :] + return ( + cos.cast(x.dtype) if cos.dtype != x.dtype else cos, + sin.cast(x.dtype) if sin.dtype != x.dtype else sin, + ) + + +def rotate_half(x): + """Rotates half the hidden dims of the input.""" + x1 = x[..., : x.shape[-1] // 2] + x2 = x[..., x.shape[-1] // 2 :] + return paddle.concat([-x2, x1], axis=-1) # shape is the same as x + + +def apply_rotary_pos_emb(q, k, cos, sin, position_ids): + if position_ids is None: + # Note: Only for QWen2MoEForCausalLMPipe model pretraining + cos = cos[:, : q.shape[1], :, :] # [bs, seq_len, 1, dim] + sin = sin[:, : q.shape[1], :, :] # [bs, seq_len, 1, dim] + else: + cos = cos.squeeze(axis=[0, 2]) # [seq_len, dim] + sin = sin.squeeze(axis=[0, 2]) # [seq_len, dim] + cos = cos[position_ids].unsqueeze(2) # [bs, seq_len, 1, dim] + sin = sin[position_ids].unsqueeze(2) # [bs, seq_len, 1, dim] + q_embed = (q * cos) + (rotate_half(q) * sin) + k_embed = (k * cos) + (rotate_half(k) * sin) + return q_embed, k_embed + + +class QWen2MLP(nn.Layer): + def __init__(self, config, is_shared=False): + super().__init__() + self.hidden_size = config.hidden_size + self.intermediate_size = ( + config.moe_intermediate_size if not is_shared else config.shared_expert_intermediate_size + ) + self.tensor_parallel_degree = config.tensor_parallel_degree + + if config.sequence_parallel: + ColumnParallelLinear = ColumnSequenceParallelLinear + RowParallelLinear = RowSequenceParallelLinear + else: + ColumnParallelLinear = fleet.meta_parallel.ColumnParallelLinear + RowParallelLinear = fleet.meta_parallel.RowParallelLinear + + if config.tensor_parallel_degree > 1: + self.gate_proj = ColumnParallelLinear( + self.hidden_size, + self.intermediate_size, + gather_output=False, + has_bias=False, + ) + self.up_proj = ColumnParallelLinear( + self.hidden_size, + self.intermediate_size, + gather_output=False, + has_bias=False, + ) + self.down_proj = RowParallelLinear( + self.intermediate_size, + self.hidden_size, + input_is_parallel=True, + has_bias=False, + ) + else: + self.gate_proj = nn.Linear(self.hidden_size, self.intermediate_size, bias_attr=False) # w1 + self.up_proj = nn.Linear(self.hidden_size, self.intermediate_size, bias_attr=False) # w3 + self.down_proj = nn.Linear(self.intermediate_size, self.hidden_size, bias_attr=False) # w2 + + self.act_fn = ACT2FN[config.hidden_act] + + def forward(self, x): + return self.down_proj(self.act_fn(self.gate_proj(x)) * self.up_proj(x)) + + +def repeat_kv(hidden_states: paddle.Tensor, n_rep: int) -> paddle.Tensor: + """ + This is the equivalent of paddle.repeat_interleave(hidden_states, n_rep, axis=1). 
The hidden states go from (batch, + num_key_value_heads, seqlen, head_dim) to (batch, num_attention_heads, seqlen, head_dim) + """ + batch, slen, num_key_value_heads, head_dim = hidden_states.shape + if n_rep == 1: + return hidden_states + + hidden_states = hidden_states.unsqueeze(-2).tile([1, 1, 1, n_rep, 1]) + return hidden_states.reshape([batch, slen, num_key_value_heads * n_rep, head_dim]) + + +class QWen2Attention(nn.Layer): + """ + Multi-headed attention from 'Attention Is All You Need' paper. Modified to use sliding window attention: Longformer + and "Generating Long Sequences with Sparse Transformers". + """ + + def __init__(self, config: QWen2Config, layerwise_recompute: bool = True): + super().__init__() + + self.config = config + self.hidden_size = config.hidden_size + self.num_heads = config.num_attention_heads + + self.head_dim = self.hidden_size // config.num_attention_heads + + self.num_key_value_heads = config.num_key_value_heads + assert config.num_attention_heads // config.num_key_value_heads + self.num_key_value_groups = config.num_attention_heads // config.num_key_value_heads + self.gqa_or_mqa = config.num_attention_heads != config.num_key_value_heads + self.max_position_embeddings = config.max_position_embeddings + self.rope_theta = config.rope_theta + self.is_causal = True + self.attention_dropout = config.attention_dropout + + self.seq_length = config.seq_length + self.sequence_parallel = config.sequence_parallel + + # Note that we will actually perform a recompute only if both enable_recompute and layerwise_recompute are set to True + # Enable_recompute defaults to False and is controlled by Trainer + self.enable_recompute = False + self.layerwise_recompute = layerwise_recompute + self.recompute_granularity = config.recompute_granularity + if config.tensor_parallel_degree > 1: + assert ( + self.num_heads % config.tensor_parallel_degree == 0 + ), f"num_heads: {self.num_heads}, tensor_parallel_degree: {config.tensor_parallel_degree}" + self.num_heads = self.num_heads // config.tensor_parallel_degree + + assert ( + self.num_key_value_heads % config.tensor_parallel_degree == 0 + ), f"num_key_value_heads: {self.num_key_value_heads}, tensor_parallel_degree: {config.tensor_parallel_degree}" + self.num_key_value_heads = self.num_key_value_heads // config.tensor_parallel_degree + + self.use_fused_rope = config.use_fused_rope + if self.use_fused_rope: + if "gpu" not in paddle.device.get_device() or fused_rotary_position_embedding is None: + warnings.warn( + "Enable fuse rope in the config, but fuse rope is not available. " + "Will disable fuse rope. Try using latest gpu version of Paddle." 
+ ) + self.use_fused_rope = False + + if config.sequence_parallel: + ColumnParallelLinear = ColumnSequenceParallelLinear + RowParallelLinear = RowSequenceParallelLinear + else: + ColumnParallelLinear = fleet.meta_parallel.ColumnParallelLinear + RowParallelLinear = fleet.meta_parallel.RowParallelLinear + + if config.tensor_parallel_degree > 1: + self.q_proj = ColumnParallelLinear(self.hidden_size, self.hidden_size, has_bias=True, gather_output=False) + self.k_proj = ColumnParallelLinear( + self.hidden_size, self.config.num_key_value_heads * self.head_dim, has_bias=True, gather_output=False + ) + self.v_proj = ColumnParallelLinear( + self.hidden_size, self.config.num_key_value_heads * self.head_dim, has_bias=True, gather_output=False + ) + self.o_proj = RowParallelLinear(self.hidden_size, self.hidden_size, has_bias=False, input_is_parallel=True) + else: + self.q_proj = nn.Linear(self.hidden_size, self.hidden_size, bias_attr=True) + self.k_proj = nn.Linear(self.hidden_size, self.config.num_key_value_heads * self.head_dim, bias_attr=True) + self.v_proj = nn.Linear(self.hidden_size, self.config.num_key_value_heads * self.head_dim, bias_attr=True) + self.o_proj = nn.Linear(self.hidden_size, self.hidden_size, bias_attr=False) + + self.rotary_emb = QWen2RotaryEmbedding( + self.head_dim, + max_position_embeddings=self.max_position_embeddings, + base=self.rope_theta, + ) + + def forward( + self, + hidden_states, + position_ids: Optional[Tuple[paddle.Tensor]] = None, + past_key_value: Optional[Tuple[paddle.Tensor]] = None, + attention_mask: Optional[paddle.Tensor] = None, + output_attentions: bool = False, + use_cache: bool = False, + **kwargs, + ) -> Tuple[paddle.Tensor, Optional[paddle.Tensor], Optional[Tuple[paddle.Tensor]]]: + """Input shape: Batch x Time x Channel""" + # [bs, seq_len, num_head * head_dim] -> [seq_len / n, bs, num_head * head_dim] (n is model parallelism) + + batch_size, seq_len, _ = hidden_states.shape + + query_states = self.q_proj(hidden_states) + key_states = self.k_proj(hidden_states) + value_states = self.v_proj(hidden_states) + + if self.sequence_parallel: + target_query_shape = [-1, self.seq_length, self.num_heads, self.head_dim] + target_key_value_shape = [-1, self.seq_length, self.num_key_value_heads, self.head_dim] + else: + target_query_shape = [0, 0, self.num_heads, self.head_dim] + target_key_value_shape = [0, 0, self.num_key_value_heads, self.head_dim] + query_states = query_states.reshape(shape=target_query_shape) + key_states = key_states.reshape(shape=target_key_value_shape) + value_states = value_states.reshape(shape=target_key_value_shape) + + kv_seq_len = key_states.shape[-3] + if past_key_value is not None: + kv_seq_len += past_key_value[0].shape[-3] + if self.use_fused_rope: + assert past_key_value is None, "fuse rotary not support cache kv for now" + cos, sin = self.rotary_emb(value_states, seq_len=kv_seq_len) + query_states, key_states, _ = fused_rotary_position_embedding( + query_states, + key_states, + v=None, + sin=sin, + cos=cos, + position_ids=position_ids, + use_neox_rotary_style=False, + ) + else: + cos, sin = self.rotary_emb(value_states, seq_len=kv_seq_len) + query_states, key_states = apply_rotary_pos_emb(query_states, key_states, cos, sin, position_ids) + + # [bs, seq_len, num_head, head_dim] + if past_key_value is not None: + key_states = paddle.concat([past_key_value[0], key_states], axis=1) + value_states = paddle.concat([past_key_value[1], value_states], axis=1) + past_key_value = (key_states, value_states) if use_cache else None + + # 
TODO(wj-Mcat): use broadcast strategy when n_kv_heads = 1 + # repeat k/v heads if n_kv_heads < n_heads + key_states = repeat_kv(key_states, self.num_key_value_groups) + value_states = repeat_kv(value_states, self.num_key_value_groups) + + has_gradient = not (query_states.stop_gradient and key_states.stop_gradient and value_states.stop_gradient) + if ( + self.enable_recompute + and self.layerwise_recompute + and has_gradient + and self.recompute_granularity == "core_attn" + ): + outputs = recompute( + scaled_dot_product_attention, + query_states, + self.config, + key_states, + value_states, + attention_mask, + output_attentions, + self.training, + self.sequence_parallel, + use_reentrant=self.config.recompute_use_reentrant, + ) + else: + outputs = scaled_dot_product_attention( + query_states, + self.config, + key_states, + value_states, + attention_mask, + output_attentions, + self.training, + self.sequence_parallel, + ) + if output_attentions: + attn_output, attn_weights = outputs + else: + attn_output = outputs + + # if sequence_parallel is true, out shape are [q_len / n, bs, num_head * head_dim] + # else their shape are [bs, q_len, num_head * head_dim], n is mp parallelism. + attn_output = self.o_proj(attn_output) + + if not output_attentions: + attn_weights = None + + outputs = (attn_output,) + + if output_attentions: + outputs += (attn_weights,) + + if use_cache: + outputs += (past_key_value,) + + if type(outputs) is tuple and len(outputs) == 1: + outputs = outputs[0] + + return outputs + + +class QWen2DecoderLayer(nn.Layer): + def __init__(self, config: QWen2Config, layerwise_recompute: bool = False): + super().__init__() + self.hidden_size = config.hidden_size + self.self_attn = QWen2Attention(config, layerwise_recompute) + + self.mlp = QWen2MLP(config) + self.input_layernorm = QWen2RMSNorm(config) + self.post_attention_layernorm = QWen2RMSNorm(config) + + # Note that we will actually perform a recompute only if both enable_recompute and layerwise_recompute are set to True + # Enable_recompute defaults to False and is controlled by Trainer + self.enable_recompute = False + self.layerwise_recompute = layerwise_recompute + self.recompute_granularity = config.recompute_granularity + + def forward( + self, + hidden_states: paddle.Tensor, + position_ids: Optional[paddle.Tensor] = None, + attention_mask: Optional[paddle.Tensor] = None, + output_attentions: Optional[bool] = False, + past_key_value: Optional[Tuple[paddle.Tensor]] = None, + use_cache: Optional[bool] = False, + **kwargs, + ) -> Tuple[paddle.Tensor, Optional[Tuple[paddle.Tensor, paddle.Tensor]]]: + """ + Args: + hidden_states (`paddle.Tensor`): input to the layer of shape `(batch, seq_len, embed_dim)` + attention_mask (`paddle.Tensor`, *optional*): attention mask of size + `(batch, sequence_length)` where padding elements are indicated by 0. + output_attentions (`bool`, *optional*): + Whether or not to return the attentions tensors of all attention layers. See `attentions` under + returned tensors for more detail. + use_cache (`bool`, *optional*): + If set to `True`, `past_key_values` key value states are returned and can be used to speed up decoding + (see `past_key_values`). 
+ past_key_value (`Tuple(paddle.Tensor)`, *optional*): cached past key and value projection states + """ + + # [bs * seq_len, embed_dim] -> [seq_len * bs / n, embed_dim] (sequence_parallel) + residual = hidden_states + + hidden_states = self.input_layernorm(hidden_states) + + # Self Attention + has_gradient = not hidden_states.stop_gradient + if ( + self.enable_recompute + and self.layerwise_recompute + and has_gradient + and self.recompute_granularity == "full_attn" + ): + outputs = recompute( + self.self_attn, + hidden_states, + position_ids, + past_key_value, + attention_mask, + output_attentions, + use_cache, + use_reentrant=self.config.recompute_use_reentrant, + ) + else: + outputs = self.self_attn( + hidden_states, + position_ids, + past_key_value, + attention_mask, + output_attentions, + use_cache, + ) + + if type(outputs) is tuple: + hidden_states = outputs[0] + else: + hidden_states = outputs + + if output_attentions: + self_attn_weights = outputs[1] + + if use_cache: + present_key_value = outputs[2 if output_attentions else 1] + + hidden_states = residual + hidden_states + + # Fully Connected + residual = hidden_states + hidden_states = self.post_attention_layernorm(hidden_states) + hidden_states = self.mlp(hidden_states) + + hidden_states = residual + hidden_states + + outputs = (hidden_states,) + + if output_attentions: + outputs += (self_attn_weights,) + + if use_cache: + outputs += (present_key_value,) + + if type(outputs) is tuple and len(outputs) == 1: + outputs = outputs[0] + + return outputs + + +class QWen2PretrainedModel(PretrainedModel): + config_class = QWen2Config + base_model_prefix = "qwen2" + _keys_to_ignore_on_load_unexpected = [r"self_attn.rotary_emb.inv_freq"] + + @classmethod + def _get_name_mappings(cls, config: QWen2Config) -> list[StateDictNameMapping]: + mappings: list[StateDictNameMapping] = [] + model_mappings = [ + ["embed_tokens.weight"], + ["norm.weight"], + ] + for layer_index in range(config.num_hidden_layers): + layer_mappings = [ + [f"layers.{layer_index}.self_attn.q_proj.weight", None, "transpose"], + [f"layers.{layer_index}.self_attn.k_proj.weight", None, "transpose"], + [f"layers.{layer_index}.self_attn.v_proj.weight", None, "transpose"], + [f"layers.{layer_index}.self_attn.q_proj.bias", None], + [f"layers.{layer_index}.self_attn.k_proj.bias", None], + [f"layers.{layer_index}.self_attn.v_proj.bias", None], + [f"layers.{layer_index}.self_attn.o_proj.weight", None, "transpose"], + [f"layers.{layer_index}.self_attn.rotary_emb.inv_freq"], + [f"layers.{layer_index}.input_layernorm.weight"], + [f"layers.{layer_index}.post_attention_layernorm.weight"], + ] + model_mappings.extend(layer_mappings) + + init_name_mappings(mappings=model_mappings) + # base-model prefix "QWen2MoEModel" + if "QWen2Model" not in config.architectures: + for mapping in model_mappings: + mapping[0] = "model." + mapping[0] + mapping[1] = "qwen2." 
+ mapping[1] + model_mappings.append(["lm_head.weight", "lm_head.weight", "transpose"]) + + mappings = [StateDictNameMapping(*mapping, index=index) for index, mapping in enumerate(model_mappings)] + return mappings + + @classmethod + def _get_tensor_parallel_mappings(cls, config: QWen2Config, is_split=True): + from paddlenlp.transformers.conversion_utils import split_or_merge_func + + fn = split_or_merge_func( + is_split=is_split, + tensor_parallel_degree=config.tensor_parallel_degree, + tensor_parallel_rank=config.tensor_parallel_rank, + num_attention_heads=config.num_attention_heads, + ) + + def get_tensor_parallel_split_mappings(num_layers, num_experts): + final_actions = {} + + base_actions = { + "lm_head.weight": partial(fn, is_column=True), + # Row Linear + "embed_tokens.weight": partial(fn, is_column=False), + "layers.0.self_attn.o_proj.weight": partial(fn, is_column=False), + } + + if not config.vocab_size % config.tensor_parallel_degree == 0: + base_actions.pop("lm_head.weight") + base_actions.pop("embed_tokens.weight") + + # Column Linear + base_actions["layers.0.self_attn.q_proj.weight"] = partial(fn, is_column=True) + base_actions["layers.0.self_attn.q_proj.bias"] = partial(fn, is_column=True) + # if we have enough num_key_value_heads to split, then split it. + if config.num_key_value_heads % config.tensor_parallel_degree == 0: + base_actions["layers.0.self_attn.k_proj.weight"] = partial(fn, is_column=True) + base_actions["layers.0.self_attn.v_proj.weight"] = partial(fn, is_column=True) + base_actions["layers.0.self_attn.k_proj.bias"] = partial(fn, is_column=True) + base_actions["layers.0.self_attn.v_proj.bias"] = partial(fn, is_column=True) + + for key, action in base_actions.items(): + if "layers.0." in key: + for i in range(num_layers): + final_actions[key.replace("layers.0.", f"layers.{i}.")] = action + final_actions[key] = action + + return final_actions + + mappings = get_tensor_parallel_split_mappings(config.num_hidden_layers, config.num_experts) + + return mappings + + def _init_weights(self, layer): + """Initialization hook""" + if self.config.tensor_parallel_degree > 1: + rng_tracker = get_rng_state_tracker().rng_state + if isinstance( + layer, + ( + nn.Linear, + nn.Embedding, + mpu.VocabParallelEmbedding, + mpu.ColumnParallelLinear, + mpu.RowParallelLinear, + QWen2LMHead, + ColumnSequenceParallelLinear, + RowSequenceParallelLinear, + ), + ): + # In the dygraph mode, use the `set_value` to reset the parameter directly, + # and reset the `state_dict` to update parameter in static mode. 
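+            # Weights are re-drawn from N(0, initializer_range); distributed (tensor-parallel) weights
+            # are re-drawn under the model-parallel rng tracker so sharded initialization stays
+            # coordinated across ranks.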
+ if isinstance(layer.weight, paddle.Tensor): + if layer.weight.is_distributed: + with rng_tracker(): + layer.weight.set_value( + paddle.tensor.normal( + mean=0.0, + std=self.config.initializer_range + if hasattr(self.config, "initializer_range") + else self.qwen2moe.config.initializer_range, + shape=layer.weight.shape, + ) + ) + else: + layer.weight.set_value( + paddle.tensor.normal( + mean=0.0, + std=self.config.initializer_range + if hasattr(self.config, "initializer_range") + else self.qwen2moe.config.initializer_range, + shape=layer.weight.shape, + ) + ) + if hasattr(layer, "bias") and isinstance(layer.bias, paddle.Tensor): + layer.bias.set_value(paddle.zeros_like(layer.bias)) + # Layer.apply is DFS https://github.com/PaddlePaddle/Paddle/blob/a6f5021fcc58b21f4414bae6bf4731ef6971582c/python/paddle/nn/layer/layers.py#L527-L530 + # sublayer is init first + # scale RowParallelLinear weight + with paddle.no_grad(): + if isinstance(layer, QWen2MLP): + factor = 1 / math.sqrt(2 * self.config.num_hidden_layers) + layer.down_proj.weight.scale_(factor) + if isinstance(layer, QWen2Attention): + factor = 1 / math.sqrt(2 * self.config.num_hidden_layers) + layer.o_proj.weight.scale_(factor) + + +@register_base_model +class QWen2Model(QWen2PretrainedModel): + """ + Transformer decoder consisting of *config.num_hidden_layers* layers. Each layer is a [`QWen2DecoderLayer`] + + Args: + config: QWen2Config + """ + + def __init__(self, config: QWen2Config): + super().__init__(config) + self.padding_idx = config.pad_token_id + self.vocab_size = config.vocab_size + + self.hidden_size = config.hidden_size + self.sequence_parallel = config.sequence_parallel + self.recompute_granularity = config.recompute_granularity + self.no_recompute_layers = config.no_recompute_layers if config.no_recompute_layers is not None else [] + + # Recompute defaults to False and is controlled by Trainer + self.enable_recompute = False + if config.tensor_parallel_degree > 1 and config.vocab_size % config.tensor_parallel_degree == 0: + self.embed_tokens = mpu.VocabParallelEmbedding( + self.vocab_size, + self.hidden_size, + weight_attr=paddle.ParamAttr(initializer=nn.initializer.XavierNormal()), + ) + else: + self.embed_tokens = nn.Embedding( + self.vocab_size, + self.hidden_size, + ) + + self.layers = nn.LayerList( + [ + QWen2DecoderLayer(config, layerwise_recompute=layer_idx not in self.no_recompute_layers) + for layer_idx in range(config.num_hidden_layers) + ] + ) + self.norm = QWen2RMSNorm(config.hidden_size, eps=config.rms_norm_eps) + + def get_input_embeddings(self): + return self.embed_tokens + + def set_input_embeddings(self, value): + self.embed_tokens = value + + @staticmethod + def _prepare_decoder_attention_mask(attention_mask, input_shape, past_key_values_length, dtype): + if attention_mask is not None: + # [bsz, seq_len] -> [bsz, 1, tgt_seq_len, src_seq_len] + if len(attention_mask.shape) == 2: + expanded_attn_mask = _expand_2d_mask(attention_mask, dtype, tgt_length=input_shape[-1]) + # For decoding phase in generation, seq_length = 1, we don't need to add causal mask + if input_shape[-1] > 1: + combined_attention_mask = _make_causal_mask( + input_shape, + past_key_values_length=past_key_values_length, + ) + expanded_attn_mask = expanded_attn_mask & combined_attention_mask + # [bsz, seq_len, seq_len] -> [bsz, 1, seq_len, seq_len] + elif len(attention_mask.shape) == 3: + expanded_attn_mask = attention_mask.unsqueeze(1).astype("bool") + # if attention_mask is already 4-D, do nothing + else: + expanded_attn_mask = 
attention_mask + else: + expanded_attn_mask = _make_causal_mask( + input_shape, + past_key_values_length=past_key_values_length, + ) + # Convert bool attention_mask to float attention mask, which will be added to attention_scores later + expanded_attn_mask = paddle.where(expanded_attn_mask, 0.0, paddle.finfo(dtype).min).astype(dtype) + return expanded_attn_mask + + @paddle.jit.not_to_static + def recompute_training_full( + self, + layer_module: nn.Layer, + hidden_states: Tensor, + position_ids: Optional[Tensor], + attention_mask: Tensor, + output_attentions: bool, + past_key_value: Tensor, + use_cache: bool, + ): + def create_custom_forward(module): + def custom_forward(*inputs): + return module(*inputs) + + return custom_forward + + hidden_states = recompute( + create_custom_forward(layer_module), + hidden_states, + position_ids, + attention_mask, + output_attentions, + past_key_value, + use_cache, + use_reentrant=self.config.recompute_use_reentrant, + ) + + return hidden_states + + def forward( + self, + input_ids: paddle.Tensor = None, + position_ids: Optional[paddle.Tensor] = None, + attention_mask: Optional[paddle.Tensor] = None, + inputs_embeds: Optional[paddle.Tensor] = None, + use_cache: Optional[bool] = None, + past_key_values: Optional[List[paddle.Tensor]] = None, + output_attentions: Optional[bool] = None, + output_hidden_states: Optional[bool] = None, + return_dict: Optional[bool] = None, + ) -> Union[Tuple, BaseModelOutputWithPast]: + output_attentions = output_attentions if output_attentions is not None else self.config.output_attentions + output_hidden_states = ( + output_hidden_states if output_hidden_states is not None else self.config.output_hidden_states + ) + use_cache = use_cache if use_cache is not None else self.config.use_cache + + return_dict = return_dict if return_dict is not None else self.config.use_return_dict + + # retrieve input_ids and inputs_embeds + if input_ids is not None and inputs_embeds is not None: + raise ValueError("You cannot specify both decoder_input_ids and decoder_inputs_embeds at the same time") + elif input_ids is not None: + batch_size, seq_length = input_ids.shape + elif inputs_embeds is not None: + batch_size, seq_length, _ = inputs_embeds.shape + else: + raise ValueError("You have to specify either decoder_input_ids or decoder_inputs_embeds") + + if past_key_values is None: + past_key_values = tuple([None] * len(self.layers)) + # NOTE: to make cache can be clear in-time + past_key_values = list(past_key_values) + + seq_length_with_past = seq_length + cache_length = 0 + if past_key_values[0] is not None: + cache_length = past_key_values[0][0].shape[1] + seq_length_with_past += cache_length + if inputs_embeds is None: + inputs_embeds = self.embed_tokens(input_ids) + + if self.sequence_parallel: + # [bs, seq_len, num_head * head_dim] -> [bs * seq_len, num_head * head_dim] + bs, seq_len, hidden_size = inputs_embeds.shape + inputs_embeds = paddle.reshape_(inputs_embeds, [bs * seq_len, hidden_size]) + # [seq_len * bs / n, num_head * head_dim] (n is mp parallelism) + inputs_embeds = ScatterOp.apply(inputs_embeds) + + # embed positions + if attention_mask is None: + # [bs, seq_len] + attention_mask = paddle.ones((batch_size, seq_length_with_past), dtype=paddle.bool) + + if position_ids is None: + position_ids = paddle.arange(seq_length, dtype="int64").expand((batch_size, seq_length)) + + attention_mask = self._prepare_decoder_attention_mask( + attention_mask, (batch_size, seq_length), cache_length, inputs_embeds.dtype + ) # [bs, 1, seq_len, 
seq_len] + if self.config.use_flash_attention: + is_casual = is_casual_mask(attention_mask) + if is_casual: + attention_mask = None + hidden_states = inputs_embeds + + # decoder layers + all_hidden_states = () if output_hidden_states else None + all_self_attns = () if output_attentions else None + next_decoder_cache = () if use_cache else None + + for idx, (decoder_layer) in enumerate(self.layers): + if output_hidden_states: + all_hidden_states += (hidden_states,) + past_key_value = past_key_values[idx] if past_key_values is not None else None + + has_gradient = not hidden_states.stop_gradient + if ( + self.enable_recompute + and idx not in self.no_recompute_layers + and has_gradient + and self.recompute_granularity == "full" + ): + layer_outputs = self.recompute_training_full( + decoder_layer, + hidden_states, + position_ids, + attention_mask, + output_attentions, + past_key_value, + use_cache, + ) + else: + layer_outputs = decoder_layer( + hidden_states, + position_ids, + attention_mask, + output_attentions, + past_key_value, + use_cache, + ) + + # NOTE: clear outdate cache after it has been used for memory saving + past_key_value = past_key_values[idx] = None + if type(layer_outputs) is tuple: + hidden_states = layer_outputs[0] + else: + hidden_states = layer_outputs + + if output_attentions: + all_self_attns += (layer_outputs[1],) + + if use_cache: + next_decoder_cache += (layer_outputs[2 if output_attentions else 1],) + + hidden_states = self.norm(hidden_states) + + # add hidden states from the last decoder layer + if output_hidden_states: + all_hidden_states += (hidden_states,) + + next_cache = next_decoder_cache if use_cache else None + + if not return_dict: + return tuple(v for v in [hidden_states, next_cache, all_hidden_states, all_self_attns] if v is not None) + return BaseModelOutputWithPast( + last_hidden_state=hidden_states, + past_key_values=next_cache, + hidden_states=all_hidden_states, + attentions=all_self_attns, + ) + + +class QWen2PretrainingCriterion(nn.Layer): + """ + Criterion for Mixtral. + It calculates the final loss. 
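+    When tensor parallelism is enabled together with `tensor_parallel_output`, the loss is computed with
+    mpu.ParallelCrossEntropy over the vocabulary shards; otherwise a plain CrossEntropyLoss with
+    `ignore_index` is used and averaged over the non-ignored tokens.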
+ """ + + def __init__(self, config): + super(QWen2PretrainingCriterion, self).__init__() + self.ignore_index = getattr(config, "ignore_index", -100) + self.config = config + self.enable_parallel_cross_entropy = config.tensor_parallel_degree > 1 and config.tensor_parallel_output + + if self.enable_parallel_cross_entropy: # and False: # and lm_head is distributed + self.loss_func = mpu.ParallelCrossEntropy(ignore_index=self.ignore_index) + else: + self.loss_func = paddle.nn.CrossEntropyLoss(reduction="none", ignore_index=self.ignore_index) + + def forward(self, prediction_scores, masked_lm_labels): + if self.enable_parallel_cross_entropy: + if prediction_scores.shape[-1] == self.config.vocab_size: + warnings.warn( + f"enable_parallel_cross_entropy, the vocab_size should be splited: {prediction_scores.shape[-1]}, {self.config.vocab_size}" + ) + self.loss_func = paddle.nn.CrossEntropyLoss(reduction="none", ignore_index=self.ignore_index) + + with paddle.amp.auto_cast(False): + masked_lm_loss = self.loss_func(prediction_scores.astype("float32"), masked_lm_labels.unsqueeze(2)) + + # skip ignore_index which loss == 0 + masked_lm_loss = masked_lm_loss[masked_lm_loss > 0] + loss = paddle.mean(masked_lm_loss) + + return loss + + +class QWen2LMHead(nn.Layer): + def __init__(self, config: QWen2Config): + super(QWen2LMHead, self).__init__() + self.config = config + if config.tensor_parallel_degree > 1 and config.vocab_size % config.tensor_parallel_degree == 0: + vocab_size = config.vocab_size // config.tensor_parallel_degree + else: + vocab_size = config.vocab_size + + self.weight = self.create_parameter( + shape=[config.hidden_size, vocab_size], + dtype=paddle.get_default_dtype(), + ) + # Must set distributed attr for Tensor Parallel ! + self.weight.is_distributed = True if (vocab_size != config.vocab_size) else False + if self.weight.is_distributed: + self.weight.split_axis = 1 + + def forward(self, hidden_states, tensor_parallel_output=None): + if self.config.sequence_parallel: + hidden_states = GatherOp.apply(hidden_states) + seq_length = self.config.seq_length + hidden_states = paddle.reshape_(hidden_states, [-1, seq_length, self.config.hidden_size]) + + if tensor_parallel_output is None: + tensor_parallel_output = self.config.tensor_parallel_output + + logits = parallel_matmul(hidden_states, self.weight, tensor_parallel_output=tensor_parallel_output) + return logits + + +class QWen2ForCausalLM(QWen2PretrainedModel): + enable_to_static_method = True + _tied_weights_keys = ["lm_head.weight"] + + def __init__(self, config: QWen2Config): + super().__init__(config) + self.qwen2 = QWen2Model(config) + self.lm_head = QWen2LMHead(config) + self.criterion = QWen2PretrainingCriterion(config) + self.vocab_size = config.vocab_size + + def get_input_embeddings(self): + return self.qwen2.embed_tokens + + def set_input_embeddings(self, value): + self.qwen2.embed_tokens = value + + def get_output_embeddings(self): + return self.lm_head + + def set_output_embeddings(self, new_embeddings): + self.lm_head = new_embeddings + + def set_decoder(self, decoder): + self.qwen2 = decoder + + def get_decoder(self): + return self.qwen2 + + def prepare_inputs_for_generation( + self, + input_ids, + use_cache=False, + past_key_values=None, + attention_mask=None, + inputs_embeds=None, + output_router_logits=False, + **kwargs + ): + batch_size, seq_length = input_ids.shape + position_ids = kwargs.get("position_ids", paddle.arange(seq_length).expand((batch_size, seq_length))) + attention_mask = kwargs.get("attention_mask", None) 
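+        # With a populated KV cache, only the most recent token (and its position id) is fed to the
+        # model; earlier positions are served from past_key_values.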
+ if past_key_values: + input_ids = input_ids[:, -1].unsqueeze(axis=-1) + position_ids = position_ids[:, -1].unsqueeze(-1) + + # if `inputs_embeds` are passed, we only want to use them in the 1st generation step + if inputs_embeds is not None and past_key_values is None: + model_inputs = {"inputs_embeds": inputs_embeds} + else: + model_inputs = {"input_ids": input_ids} + + model_inputs.update( + { + "position_ids": position_ids, + "past_key_values": past_key_values, + "use_cache": use_cache, + "attention_mask": attention_mask, + "output_router_logits": output_router_logits, + } + ) + return model_inputs + + def _get_model_inputs_spec(self, dtype: str): + return { + "input_ids": paddle.static.InputSpec(shape=[None, None], dtype="int64"), + "attention_mask": paddle.static.InputSpec(shape=[None, None], dtype="int64"), + "position_ids": paddle.static.InputSpec(shape=[None, None], dtype="int64"), + } + + @staticmethod + def update_model_kwargs_for_generation(outputs, model_kwargs, is_encoder_decoder=False): + # update cache + if isinstance(outputs, tuple) and len(outputs) > 1 and not isinstance(outputs[1], paddle.Tensor): + model_kwargs["past_key_values"] = outputs[1] + + if isinstance(outputs, CausalLMOutputWithPast) and "past_key_values" in outputs: + model_kwargs["past_key_values"] = outputs.past_key_values + + # update position_ids + if "position_ids" in model_kwargs and model_kwargs["position_ids"] is not None: + position_ids = model_kwargs["position_ids"] + model_kwargs["position_ids"] = paddle.concat([position_ids, position_ids[..., -1:] + 1], axis=-1) + + if not is_encoder_decoder and "attention_mask" in model_kwargs: + attention_mask = model_kwargs["attention_mask"] + model_kwargs["attention_mask"] = paddle.concat( + [attention_mask, paddle.ones([attention_mask.shape[0], 1], dtype=attention_mask.dtype)], axis=-1 + ) + + return model_kwargs + + def forward( + self, + input_ids: paddle.Tensor = None, + position_ids: Optional[paddle.Tensor] = None, + attention_mask: Optional[paddle.Tensor] = None, + inputs_embeds: Optional[paddle.Tensor] = None, + labels: Optional[paddle.Tensor] = None, + use_cache: Optional[bool] = None, + past_key_values: Optional[List[paddle.Tensor]] = None, + output_attentions: Optional[bool] = None, + output_hidden_states: Optional[bool] = None, + return_dict: Optional[bool] = None, + ) -> Union[Tuple, CausalLMOutputWithPast]: + r""" + Args: + labels (`paddle.Tensor` of shape `(batch_size, sequence_length)`, *optional*): + Labels for computing the masked language modeling loss. Indices should either be in `[0, ..., + config.vocab_size]` or -100 (see `input_ids` docstring). Tokens with indices set to `-100` are ignored + (masked), the loss is only computed for the tokens with labels in `[0, ..., config.vocab_size]`. + + Returns: + + Example: + + ```python + >>> from transformers import AutoTokenizer, QWen2ForCausalLM + + >>> model = QWen2ForCausalLM.from_pretrained(PATH_TO_CONVERTED_WEIGHTS) + >>> tokenizer = AutoTokenizer.from_pretrained(PATH_TO_CONVERTED_TOKENIZER) + + >>> prompt = "Hey, are you conscious? Can you talk to me?" + >>> inputs = tokenizer(prompt, return_tensors="pt") + + >>> # Generate + >>> generate_ids = model.generate(inputs.input_ids, max_length=30) + >>> tokenizer.batch_decode(generate_ids, skip_special_tokens=True, clean_up_tokenization_spaces=False)[0] + "Hey, are you conscious? Can you talk to me?\nI'm not conscious, but I can talk to you." 
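+        >>> # Note: with PaddleNLP these classes are imported from paddlenlp.transformers and the
+        >>> # tokenizer is called with return_tensors="pd" to obtain Paddle tensors.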
+ ```""" + + output_attentions = output_attentions if output_attentions is not None else self.config.output_attentions + output_hidden_states = ( + output_hidden_states if output_hidden_states is not None else self.config.output_hidden_states + ) + return_dict = return_dict if return_dict is not None else self.config.use_return_dict + + # decoder outputs consists of (dec_features, layer_state, dec_hidden, dec_attn) + outputs = self.qwen2( + input_ids=input_ids, + position_ids=position_ids, + attention_mask=attention_mask, + inputs_embeds=inputs_embeds, + use_cache=use_cache, + past_key_values=past_key_values, + output_attentions=output_attentions, + output_hidden_states=output_hidden_states, + return_dict=return_dict, + ) + + hidden_states = outputs[0] + + # if labels is None,means we need full output, instead of tensor_parallel_output + # tensor_parallel_output is togather with ParallelCrossEntropy + tensor_parallel_output = ( + self.config.tensor_parallel_output and labels is not None and self.config.tensor_parallel_degree > 1 + ) + + logits = self.lm_head(hidden_states, tensor_parallel_output=tensor_parallel_output) + + loss = None + if labels is not None: + loss = self.criterion(logits, labels) + + if not return_dict: + output = (logits,) + outputs[1:] + return (loss,) + output if loss is not None else output + + return CausalLMOutputWithPast( + loss=loss, + logits=logits, + past_key_values=outputs.past_key_values, + hidden_states=outputs.hidden_states, + attentions=outputs.attentions, + ) + + +class QWen2ForSequenceClassification(QWen2PretrainedModel): + def __init__(self, config: QWen2Config): + super().__init__(config) + self.num_labels = config.num_labels + self.qwen2 = QWen2Model(config) + self.score = nn.Linear(config.hidden_size, self.num_labels, bias=False) + + def get_input_embeddings(self): + return self.qwen2.embed_tokens + + def set_input_embeddings(self, value): + self.qwen2.embed_tokens = value + + def forward( + self, + input_ids: paddle.Tensor = None, + position_ids: Optional[paddle.Tensor] = None, + attention_mask: Optional[paddle.Tensor] = None, + inputs_embeds: Optional[paddle.Tensor] = None, + past_key_values: Optional[List[paddle.Tensor]] = None, + labels: Optional[paddle.Tensor] = None, + use_cache: Optional[bool] = None, + output_attentions: Optional[bool] = None, + output_hidden_states: Optional[bool] = None, + return_dict: Optional[bool] = None, + ) -> Union[Tuple, SequenceClassifierOutputWithPast]: + r""" + labels (`paddle.Tensor` of shape `(batch_size,)`, *optional*): + Labels for computing the sequence classification/regression loss. Indices should be in `[0, ..., + config.num_labels - 1]`. If `config.num_labels == 1` a regression loss is computed (Mean-Square loss), If + `config.num_labels > 1` a classification loss is computed (Cross-Entropy). 
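+
+        The head scores every position and then pools the logits at the last non-padding token of each
+        sequence, located via `config.pad_token_id` (or the last token when no padding token is set).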
+ """ + return_dict = return_dict if return_dict is not None else self.config.use_return_dict + + transformer_outputs = self.qwen2( + input_ids, + attention_mask=attention_mask, + position_ids=position_ids, + past_key_values=past_key_values, + inputs_embeds=inputs_embeds, + use_cache=use_cache, + output_attentions=output_attentions, + output_hidden_states=output_hidden_states, + return_dict=return_dict, + ) + hidden_states = transformer_outputs[0] + logits = self.score(hidden_states) + + if input_ids is not None: + batch_size = input_ids.shape[0] + else: + batch_size = inputs_embeds.shape[0] + + if self.config.pad_token_id is None and batch_size != 1: + raise ValueError("Cannot handle batch sizes > 1 if no padding token is defined.") + if self.config.pad_token_id is None: + sequence_lengths = -1 + else: + if input_ids is not None: + # if no pad token found, use modulo instead of reverse indexing for ONNX compatibility + sequence_lengths = paddle.eq(input_ids, self.config.pad_token_id).int().argmax(-1) - 1 + sequence_lengths = sequence_lengths % input_ids.shape[-1] + sequence_lengths = sequence_lengths.to(logits.device) + else: + sequence_lengths = -1 + + pooled_logits = logits[paddle.arange(batch_size, device=logits.device), sequence_lengths] + + loss = None + if labels is not None: + labels = labels.to(logits.device) + if self.config.problem_type is None: + if self.num_labels == 1: + self.config.problem_type = "regression" + elif self.num_labels > 1 and (labels.dtype == paddle.int64 or labels.dtype == paddle.int32): + self.config.problem_type = "single_label_classification" + else: + self.config.problem_type = "multi_label_classification" + + if self.config.problem_type == "regression": + loss_fct = nn.MSELoss() + if self.num_labels == 1: + loss = loss_fct(pooled_logits.squeeze(), labels.squeeze()) + else: + loss = loss_fct(pooled_logits, labels) + elif self.config.problem_type == "single_label_classification": + loss_fct = nn.CrossEntropyLoss() + loss = loss_fct(pooled_logits.reshape([-1, self.num_labels]), labels.reshape([-1])) + elif self.config.problem_type == "multi_label_classification": + loss_fct = nn.BCEWithLogitsLoss() + loss = loss_fct(pooled_logits, labels) + if not return_dict: + output = (pooled_logits,) + transformer_outputs[1:] + return ((loss,) + output) if loss is not None else output + + return SequenceClassifierOutputWithPast( + loss=loss, + logits=pooled_logits, + past_key_values=transformer_outputs.past_key_values, + hidden_states=transformer_outputs.hidden_states, + attentions=transformer_outputs.attentions, + ) + + +# Copied from transformers.models.llama.modeling_llama.LlamaForTokenClassification with Llama->QWen2, LLAMA->QWEN2 +class QWen2ForTokenClassification(QWen2PretrainedModel): + def __init__(self, config: QWen2Config): + super().__init__(config) + self.num_labels = config.num_labels + self.qwen2 = QWen2Model(config) + if getattr(config, "classifier_dropout", None) is not None: + classifier_dropout = config.classifier_dropout + elif getattr(config, "hidden_dropout", None) is not None: + classifier_dropout = config.hidden_dropout + else: + classifier_dropout = 0.1 + self.dropout = nn.Dropout(classifier_dropout) + self.score = nn.Linear(config.hidden_size, config.num_labels) + + def get_input_embeddings(self): + return self.qwen2.embed_tokens + + def set_input_embeddings(self, value): + self.qwen2.embed_tokens = value + + def forward( + self, + input_ids: paddle.Tensor = None, + attention_mask: Optional[paddle.Tensor] = None, + position_ids: 
Optional[paddle.Tensor] = None, + past_key_values: Optional[List[paddle.Tensor]] = None, + inputs_embeds: Optional[paddle.Tensor] = None, + labels: Optional[paddle.Tensor] = None, + use_cache: Optional[bool] = None, + output_attentions: Optional[bool] = None, + output_hidden_states: Optional[bool] = None, + return_dict: Optional[bool] = None, + ) -> Union[Tuple, SequenceClassifierOutputWithPast]: + r""" + labels (`paddle.Tensor` of shape `(batch_size,)`, *optional*): + Labels for computing the sequence classification/regression loss. Indices should be in `[0, ..., + config.num_labels - 1]`. If `config.num_labels == 1` a regression loss is computed (Mean-Square loss), If + `config.num_labels > 1` a classification loss is computed (Cross-Entropy). + """ + return_dict = return_dict if return_dict is not None else self.config.use_return_dict + + outputs = self.qwen2( + input_ids, + attention_mask=attention_mask, + position_ids=position_ids, + past_key_values=past_key_values, + inputs_embeds=inputs_embeds, + use_cache=use_cache, + output_attentions=output_attentions, + output_hidden_states=output_hidden_states, + return_dict=return_dict, + ) + sequence_output = outputs[0] + sequence_output = self.dropout(sequence_output) + logits = self.score(sequence_output) + + loss = None + if labels is not None: + loss_fct = nn.CrossEntropyLoss() + loss = loss_fct(logits.reshape([-1, self.num_labels]), labels.reshape([-1])) + + if not return_dict: + output = (logits,) + outputs[2:] + return ((loss,) + output) if loss is not None else output + + return TokenClassifierOutput( + loss=loss, + logits=logits, + hidden_states=outputs.hidden_states, + attentions=outputs.attentions, + ) diff --git a/paddlenlp/transformers/qwen2/modeling_pp.py b/paddlenlp/transformers/qwen2/modeling_pp.py new file mode 100644 index 000000000000..2965de45c4cd --- /dev/null +++ b/paddlenlp/transformers/qwen2/modeling_pp.py @@ -0,0 +1,232 @@ +# Copyright (c) 2023 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
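+#
+# Pipeline-parallel wrappers for the Qwen2 architecture: the decoder is flattened into a sequence
+# of stages (embedding, decoder layers, final RMSNorm, LM head) that fleet's PipelineLayer can
+# split across pipeline ranks.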
+ +import paddle +import paddle.distributed.fleet as fleet +import paddle.nn as nn +from paddle.distributed.fleet.meta_parallel import LayerDesc, PipelineLayer +from paddle.distributed.fleet.utils import recompute + +from ...utils.tools import get_env_device +from ..model_utils import PipelinePretrainedModel +from .modeling import ( + QWen2Config, + QWen2DecoderLayer, + QWen2LMHead, + QWen2Model, + QWen2PretrainedModel, + QWen2PretrainingCriterion, + QWen2RMSNorm, +) + +__all__ = [ + "QWenForCausalLMPipe", +] + + +def parse_args(args): + if isinstance(args, tuple): + if len(args) == 3: + hidden_states, attention_mask, position_ids = args + elif len(args) == 2: + hidden_states, attention_mask = args + position_ids = None + elif len(args) == 1: + hidden_states = args + attention_mask, position_ids = None, None + else: + hidden_states = args + attention_mask, position_ids = None, None + + if position_ids is not None: + position_ids.stop_gradient = True + + if attention_mask is not None: + attention_mask.stop_gradient = True + + return hidden_states, attention_mask, position_ids + + +def return_args(hidden_states, attention_mask=None, position_ids=None): + ret = (hidden_states,) + + if attention_mask is not None: + ret += (attention_mask.clone(),) + if position_ids is not None: + ret += (position_ids.clone(),) + if len(ret) == 1: + ret = ret[0] + + return ret + + +class QWen2EmbeddingPipe(nn.Layer): + """Extends QWenEmbeddings to forward attention_mask through the pipeline.""" + + def __init__(self, config: QWen2Config): + super().__init__() + self.hidden_size = config.hidden_size + if config.tensor_parallel_degree > 1: + self.embed_tokens = fleet.meta_parallel.VocabParallelEmbedding( + config.vocab_size, + config.hidden_size, + weight_attr=paddle.ParamAttr(initializer=nn.initializer.XavierNormal()), + ) + else: + self.embed_tokens = nn.Embedding(config.vocab_size, config.hidden_size) + + def forward(self, args): + """_summary_ + + Args: + input (_type_): _description_ + + Returns: + _type_: _description_ + """ + input_ids, attention_mask, position_ids = parse_args(args) + input_embeds = self.embed_tokens(input_ids) + if self.sequence_parallel: + from paddlenlp.transformers import ScatterOp + + # [bs, seq_len, num_head * head_dim] -> [bs * seq_len, num_head * head_dim] + bs, seq_len, hidden_size = input_embeds.shape + input_embeds = paddle.reshape_(input_embeds, [bs * seq_len, hidden_size]) + # [seq_len * bs / n, num_head * head_dim] (n is mp parallelism) + input_embeds = ScatterOp.apply(input_embeds) + + batch_size, seq_length = input_ids.shape + + if attention_mask is not None: + attention_mask = QWen2Model._prepare_decoder_attention_mask( + attention_mask, (batch_size, seq_length), 0, input_embeds.dtype + ) + attention_mask.stop_gradient = True + if get_env_device() == "npu": + attention_mask = attention_mask.astype("bool") + elif get_env_device() == "npu": + attention_mask = paddle.tril(paddle.ones((seq_length, seq_length), dtype="bool")) + attention_mask.stop_gradient = True + + return return_args(input_embeds, attention_mask, position_ids) + + +class QWen2DecoderLayerPipe(QWen2DecoderLayer): + def forward(self, args): + hidden_states, attention_mask, position_ids = parse_args(args) + + has_gradient = not hidden_states.stop_gradient + + if self.enable_recompute and self.config.recompute_granularity == "full" and has_gradient: + if attention_mask is not None: + hidden_states = recompute( + super().forward, hidden_states, attention_mask=attention_mask, use_reentrant=False + ) + else: + # 
for pretrain + hidden_states = recompute( + super().forward, hidden_states, use_reentrant=self.config.recompute_use_reentrant + ) + else: + hidden_states = super().forward(hidden_states, attention_mask=attention_mask) + + return return_args(hidden_states, attention_mask, position_ids) + + +class QWen2RMSNormPipe(QWen2RMSNorm): + def __init__(self, config: QWen2Config): + super().__init__() + self.norm = QWen2RMSNorm(config) + + def forward(self, args): + hidden_states, attention_mask, position_ids = parse_args(args) + return self.norm(hidden_states) + + +class QWenForCausalLMPipe(PipelinePretrainedModel, PipelineLayer): + """QWenForPretraining adapted for pipeline parallelism. + + The largest change is flattening the QWenModel class so we can express it as a + sequence of layers including embedding, transformer layers, and output. + """ + + config_class = QWen2Config + + _get_tensor_parallel_mappings = QWen2PretrainedModel._get_tensor_parallel_mappings + _init_weights = QWen2PretrainedModel._init_weights + _keys_to_ignore_on_load_unexpected = QWen2PretrainedModel._keys_to_ignore_on_load_unexpected + + # DONOT Add base_model_prefix !!!! + + def __init__(self, config: QWen2Config): + self.config = config + + self.use_recompute = self.config.use_recompute + self.recompute_granularity = self.config.recompute_granularity + self.pp_recompute_interval = self.config.pp_recompute_interval + self.no_recompute_layers = config.no_recompute_layers if config.no_recompute_layers is not None else [] + if self.recompute_granularity == "full": + assert len(self.no_recompute_layers) == 0, "for pp with full recompute, no_recompute_layers is not support" + + virtual_pp_degree = getattr(self.config, "virtual_pp_degree", 1) + + def get_hcg(): + return fleet.get_hybrid_communicate_group() + + hcg = get_hcg() + tensor_parallel_degree = max(hcg.get_model_parallel_world_size(), 1) + tensor_parallel_rank = max(hcg.get_model_parallel_rank(), 0) + + # TODO: fix tensor_parallel_degree rewrite in here + config.tensor_parallel_degree = tensor_parallel_degree + config.tensor_parallel_rank = tensor_parallel_rank + + self.add_sequential_layer(LayerDesc(QWen2EmbeddingPipe, config=config), "qwen2") + for i in range(config.num_hidden_layers): + self.add_sequential_layer( + LayerDesc(QWen2DecoderLayerPipe, config=config, layerwise_recompute=i not in self.no_recompute_layers), + f"qwen2.layers.{i}", + ) + self.add_sequential_layer(LayerDesc(QWen2RMSNormPipe, config=config), "norm") + self.add_sequential_layer(LayerDesc(QWen2LMHead, config=config), "lm_head") + + recompute_interval = 0 + if self.use_recompute and self.recompute_granularity == "full": + assert self.config.pp_recompute_interval <= config.num_hidden_layers // ( + virtual_pp_degree * get_hcg().topology().get_dim_size("pipe") + ), "pp recompute interval should smaller than num layers of each pp chunk" + recompute_interval = self.config.pp_recompute_interval + + seg_method = "layer:QWen2DecoderLayer" + if config.num_hidden_layers % get_hcg().topology().get_dim_size("pipe") != 0: + seg_method = "uniform" + + PipelineLayer.__init__( + self, + layers=self.get_sequential_layers(), + loss_fn=QWen2PretrainingCriterion(config), + topology=get_hcg().topology(), + seg_method=seg_method, + recompute_interval=recompute_interval, + recompute_ctx={ + "mp_group": get_hcg().get_model_parallel_group(), + "offload": False, + "partition": False, + }, + num_virtual_pipeline_stages=virtual_pp_degree, + ) + # You should call init here, since there is a diamond inheritance problem + 
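+        # `apply` traverses the pipeline stages registered above and runs `_init_weights` on every sublayer.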
self.apply(self._init_weights) + # DON'T init PipelinePretrainedModel + # PipelinePretrainedModel.__init__(self.super(), config=config) diff --git a/paddlenlp/transformers/qwen2moe/tokenizer.py b/paddlenlp/transformers/qwen2/tokenizer.py similarity index 100% rename from paddlenlp/transformers/qwen2moe/tokenizer.py rename to paddlenlp/transformers/qwen2/tokenizer.py diff --git a/paddlenlp/transformers/qwen2moe/modeling.py b/paddlenlp/transformers/qwen2moe/modeling.py index ea920a3a241a..8e3ed555ff81 100644 --- a/paddlenlp/transformers/qwen2moe/modeling.py +++ b/paddlenlp/transformers/qwen2moe/modeling.py @@ -343,7 +343,7 @@ def _expand_2d_mask(mask, dtype, tgt_length): class QWen2MoERMSNorm(nn.Layer): - def __init__(self, config): + def __init__(self, config: QWen2MoEConfig): super().__init__() self.hidden_size = config.hidden_size self.weight = paddle.create_parameter( @@ -432,7 +432,7 @@ def apply_rotary_pos_emb(q, k, cos, sin, position_ids): # Modified from transformers.models.mistral.modeling_mistral.MistralMLP with Mistral->QWen2MoE class QWen2MoEMLP(nn.Layer): - def __init__(self, config, is_shared=False): + def __init__(self, config: QWen2MoEConfig, is_shared=False): super().__init__() self.hidden_size = config.hidden_size self.intermediate_size = ( @@ -1320,7 +1320,7 @@ class QWen2MoEPretrainingCriterion(nn.Layer): It calculates the final loss. """ - def __init__(self, config): + def __init__(self, config: QWen2MoEConfig): super(QWen2MoEPretrainingCriterion, self).__init__() self.ignore_index = getattr(config, "ignore_index", -100) self.config = config @@ -1384,7 +1384,7 @@ class QWen2MoEForCausalLM(QWen2MoEPretrainedModel): enable_to_static_method = True _tied_weights_keys = ["lm_head.weight"] - def __init__(self, config): + def __init__(self, config: QWen2MoEConfig): super().__init__(config) self.config = config From b274f12d24fc722ff437703cc4a6d0b457e55589 Mon Sep 17 00:00:00 2001 From: DrownFish19 Date: Wed, 5 Jun 2024 06:21:06 +0000 Subject: [PATCH 32/41] update QWen to Qwen --- paddlenlp/transformers/__init__.py | 1 + paddlenlp/transformers/auto/tokenizer.py | 1 - paddlenlp/transformers/qwen2/configuration.py | 26 +++- paddlenlp/transformers/qwen2/modeling.py | 126 +++++++++--------- paddlenlp/transformers/qwen2/modeling_pp.py | 50 +++---- paddlenlp/transformers/qwen2/tokenizer.py | 6 +- paddlenlp/transformers/qwen2moe/__init__.py | 2 +- 7 files changed, 116 insertions(+), 96 deletions(-) diff --git a/paddlenlp/transformers/__init__.py b/paddlenlp/transformers/__init__.py index f9a996093b67..61242951b24f 100644 --- a/paddlenlp/transformers/__init__.py +++ b/paddlenlp/transformers/__init__.py @@ -294,6 +294,7 @@ from .deberta_v2.modeling import * from .deberta_v2.tokenizer import * from .deberta_v2.configuration import * +from .qwen2 import * from .qwen2moe import * # For faster tokenizer diff --git a/paddlenlp/transformers/auto/tokenizer.py b/paddlenlp/transformers/auto/tokenizer.py index 694b1c5ee35d..6a960c6f7fac 100644 --- a/paddlenlp/transformers/auto/tokenizer.py +++ b/paddlenlp/transformers/auto/tokenizer.py @@ -97,7 +97,6 @@ ("BloomTokenizer", "bloom"), ("SpeechT5Tokenizer", "speecht5"), ("QWenTokenizer", "qwen"), - ("QWen2MoETokenizer", "qwen2moe"), ("GemmaTokenizer", "gemma"), ] ) diff --git a/paddlenlp/transformers/qwen2/configuration.py b/paddlenlp/transformers/qwen2/configuration.py index 5330d81df4bf..71eb7ee2f2d2 100644 --- a/paddlenlp/transformers/qwen2/configuration.py +++ b/paddlenlp/transformers/qwen2/configuration.py @@ -17,11 +17,11 @@ from 
..configuration_utils import PretrainedConfig __all__ = [ - "QWen2Config", + "Qwen2Config", ] -class QWen2Config(PretrainedConfig): +class Qwen2Config(PretrainedConfig): r""" This is the configuration class to store the configuration of a [`Qwen2Model`]. It is used to instantiate a Qwen2 model according to the specified arguments, defining the model architecture. Instantiating a configuration @@ -107,13 +107,19 @@ def __init__( use_cache=True, use_recompute=False, recompute_granularity="full", + pp_recompute_interval=1, no_recompute_layers=None, + fuse_attention_qkv=False, + fuse_attention_ffn=False, use_flash_attention=False, - tie_word_embeddings=False, - rope_theta=10000.0, + use_fused_rms_norm=False, + use_fused_rope=False, tensor_parallel_output=True, sequence_parallel=False, fuse_sequence_parallel_allreduce=False, + virtual_pp_degree=1, + tie_word_embeddings=False, + rope_theta=10000.0, pad_token_id=0, bos_token_id=151643, eos_token_id=151643, @@ -121,6 +127,8 @@ def __init__( sliding_window=4096, max_window_layers=28, attention_dropout=0.0, + rope_scaling_factor=1.0, + rope_scaling_type=None, **kwargs, ): self.vocab_size = vocab_size @@ -146,13 +154,23 @@ def __init__( self.rope_theta = rope_theta self.attention_dropout = attention_dropout + self.use_cache = use_cache self.use_recompute = use_recompute self.recompute_granularity = recompute_granularity self.no_recompute_layers = no_recompute_layers + self.pp_recompute_interval = pp_recompute_interval + self.fuse_attention_qkv = fuse_attention_qkv self.use_flash_attention = use_flash_attention + self.fuse_attention_ffn = fuse_attention_ffn + self.use_fused_rms_norm = use_fused_rms_norm self.tensor_parallel_output = tensor_parallel_output self.sequence_parallel = sequence_parallel self.fuse_sequence_parallel_allreduce = fuse_sequence_parallel_allreduce + self.virtual_pp_degree = virtual_pp_degree + + self.use_fused_rope = use_fused_rope + self.rope_scaling_factor = rope_scaling_factor + self.rope_scaling_type = rope_scaling_type self.pad_token_id = pad_token_id self.bos_token_id = bos_token_id diff --git a/paddlenlp/transformers/qwen2/modeling.py b/paddlenlp/transformers/qwen2/modeling.py index 7f3c2fbc53c4..2de89eb944c2 100644 --- a/paddlenlp/transformers/qwen2/modeling.py +++ b/paddlenlp/transformers/qwen2/modeling.py @@ -17,7 +17,7 @@ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. 
-"""Paddle QWen2 model.""" +"""Paddle Qwen2 model.""" import math import warnings @@ -48,7 +48,7 @@ TokenClassifierOutput, ) from ..model_utils import PretrainedModel, register_base_model -from .configuration import QWen2Config +from .configuration import Qwen2Config try: from paddle.incubate.nn.functional import fused_rotary_position_embedding @@ -62,10 +62,10 @@ __all__ = [ - "QWen2Model", - "QWen2PretrainedModel", - "QWen2ForCausalLM", - "QWen2PretrainingCriterion", + "Qwen2Model", + "Qwen2PretrainedModel", + "Qwen2ForCausalLM", + "Qwen2PretrainingCriterion", ] @@ -269,10 +269,10 @@ def _expand_2d_mask(mask, dtype, tgt_length): return expanded_mask -class QWen2RMSNorm(nn.Layer): - def __init__(self, config: QWen2Config): +class Qwen2RMSNorm(nn.Layer): + def __init__(self, config: Qwen2Config): """ - QWen2RMSNorm is equivalent to T5LayerNorm + Qwen2RMSNorm is equivalent to T5LayerNorm """ super().__init__() self.hidden_size = config.hidden_size @@ -303,7 +303,7 @@ def forward(self, hidden_states): return hidden_states * self.weight -class QWen2RotaryEmbedding(nn.Layer): +class Qwen2RotaryEmbedding(nn.Layer): def __init__(self, dim, max_position_embeddings=2048, base=10000): super().__init__() self.dim = dim @@ -347,7 +347,7 @@ def rotate_half(x): def apply_rotary_pos_emb(q, k, cos, sin, position_ids): if position_ids is None: - # Note: Only for QWen2MoEForCausalLMPipe model pretraining + # Note: Only for Qwen2MoEForCausalLMPipe model pretraining cos = cos[:, : q.shape[1], :, :] # [bs, seq_len, 1, dim] sin = sin[:, : q.shape[1], :, :] # [bs, seq_len, 1, dim] else: @@ -360,13 +360,12 @@ def apply_rotary_pos_emb(q, k, cos, sin, position_ids): return q_embed, k_embed -class QWen2MLP(nn.Layer): - def __init__(self, config, is_shared=False): +class Qwen2MLP(nn.Layer): + def __init__(self, config: Qwen2Config, is_shared=False): super().__init__() self.hidden_size = config.hidden_size - self.intermediate_size = ( - config.moe_intermediate_size if not is_shared else config.shared_expert_intermediate_size - ) + self.intermediate_size = config.intermediate_size + self.tensor_parallel_degree = config.tensor_parallel_degree if config.sequence_parallel: @@ -419,13 +418,13 @@ def repeat_kv(hidden_states: paddle.Tensor, n_rep: int) -> paddle.Tensor: return hidden_states.reshape([batch, slen, num_key_value_heads * n_rep, head_dim]) -class QWen2Attention(nn.Layer): +class Qwen2Attention(nn.Layer): """ Multi-headed attention from 'Attention Is All You Need' paper. Modified to use sliding window attention: Longformer and "Generating Long Sequences with Sparse Transformers". 
""" - def __init__(self, config: QWen2Config, layerwise_recompute: bool = True): + def __init__(self, config: Qwen2Config, layerwise_recompute: bool = True): super().__init__() self.config = config @@ -493,7 +492,7 @@ def __init__(self, config: QWen2Config, layerwise_recompute: bool = True): self.v_proj = nn.Linear(self.hidden_size, self.config.num_key_value_heads * self.head_dim, bias_attr=True) self.o_proj = nn.Linear(self.hidden_size, self.hidden_size, bias_attr=False) - self.rotary_emb = QWen2RotaryEmbedding( + self.rotary_emb = Qwen2RotaryEmbedding( self.head_dim, max_position_embeddings=self.max_position_embeddings, base=self.rope_theta, @@ -614,15 +613,15 @@ def forward( return outputs -class QWen2DecoderLayer(nn.Layer): - def __init__(self, config: QWen2Config, layerwise_recompute: bool = False): +class Qwen2DecoderLayer(nn.Layer): + def __init__(self, config: Qwen2Config, layerwise_recompute: bool = False): super().__init__() self.hidden_size = config.hidden_size - self.self_attn = QWen2Attention(config, layerwise_recompute) + self.self_attn = Qwen2Attention(config, layerwise_recompute) - self.mlp = QWen2MLP(config) - self.input_layernorm = QWen2RMSNorm(config) - self.post_attention_layernorm = QWen2RMSNorm(config) + self.mlp = Qwen2MLP(config) + self.input_layernorm = Qwen2RMSNorm(config) + self.post_attention_layernorm = Qwen2RMSNorm(config) # Note that we will actually perform a recompute only if both enable_recompute and layerwise_recompute are set to True # Enable_recompute defaults to False and is controlled by Trainer @@ -721,13 +720,13 @@ def forward( return outputs -class QWen2PretrainedModel(PretrainedModel): - config_class = QWen2Config +class Qwen2PretrainedModel(PretrainedModel): + config_class = Qwen2Config base_model_prefix = "qwen2" _keys_to_ignore_on_load_unexpected = [r"self_attn.rotary_emb.inv_freq"] @classmethod - def _get_name_mappings(cls, config: QWen2Config) -> list[StateDictNameMapping]: + def _get_name_mappings(cls, config: Qwen2Config) -> list[StateDictNameMapping]: mappings: list[StateDictNameMapping] = [] model_mappings = [ ["embed_tokens.weight"], @@ -742,6 +741,9 @@ def _get_name_mappings(cls, config: QWen2Config) -> list[StateDictNameMapping]: [f"layers.{layer_index}.self_attn.k_proj.bias", None], [f"layers.{layer_index}.self_attn.v_proj.bias", None], [f"layers.{layer_index}.self_attn.o_proj.weight", None, "transpose"], + [f"layers.{layer_index}.mlp.up_proj.weight", None, "transpose"], + [f"layers.{layer_index}.mlp.gate_proj.weight", None, "transpose"], + [f"layers.{layer_index}.mlp.down_proj.weight", None, "transpose"], [f"layers.{layer_index}.self_attn.rotary_emb.inv_freq"], [f"layers.{layer_index}.input_layernorm.weight"], [f"layers.{layer_index}.post_attention_layernorm.weight"], @@ -749,8 +751,8 @@ def _get_name_mappings(cls, config: QWen2Config) -> list[StateDictNameMapping]: model_mappings.extend(layer_mappings) init_name_mappings(mappings=model_mappings) - # base-model prefix "QWen2MoEModel" - if "QWen2Model" not in config.architectures: + # base-model prefix "Qwen2MoEModel" + if "Qwen2Model" not in config.architectures: for mapping in model_mappings: mapping[0] = "model." + mapping[0] mapping[1] = "qwen2." 
+ mapping[1] @@ -760,7 +762,7 @@ def _get_name_mappings(cls, config: QWen2Config) -> list[StateDictNameMapping]: return mappings @classmethod - def _get_tensor_parallel_mappings(cls, config: QWen2Config, is_split=True): + def _get_tensor_parallel_mappings(cls, config: Qwen2Config, is_split=True): from paddlenlp.transformers.conversion_utils import split_or_merge_func fn = split_or_merge_func( @@ -818,7 +820,7 @@ def _init_weights(self, layer): mpu.VocabParallelEmbedding, mpu.ColumnParallelLinear, mpu.RowParallelLinear, - QWen2LMHead, + Qwen2LMHead, ColumnSequenceParallelLinear, RowSequenceParallelLinear, ), @@ -833,7 +835,7 @@ def _init_weights(self, layer): mean=0.0, std=self.config.initializer_range if hasattr(self.config, "initializer_range") - else self.qwen2moe.config.initializer_range, + else self.qwen2.config.initializer_range, shape=layer.weight.shape, ) ) @@ -843,7 +845,7 @@ def _init_weights(self, layer): mean=0.0, std=self.config.initializer_range if hasattr(self.config, "initializer_range") - else self.qwen2moe.config.initializer_range, + else self.qwen2.config.initializer_range, shape=layer.weight.shape, ) ) @@ -853,24 +855,24 @@ def _init_weights(self, layer): # sublayer is init first # scale RowParallelLinear weight with paddle.no_grad(): - if isinstance(layer, QWen2MLP): + if isinstance(layer, Qwen2MLP): factor = 1 / math.sqrt(2 * self.config.num_hidden_layers) layer.down_proj.weight.scale_(factor) - if isinstance(layer, QWen2Attention): + if isinstance(layer, Qwen2Attention): factor = 1 / math.sqrt(2 * self.config.num_hidden_layers) layer.o_proj.weight.scale_(factor) @register_base_model -class QWen2Model(QWen2PretrainedModel): +class Qwen2Model(Qwen2PretrainedModel): """ - Transformer decoder consisting of *config.num_hidden_layers* layers. Each layer is a [`QWen2DecoderLayer`] + Transformer decoder consisting of *config.num_hidden_layers* layers. Each layer is a [`Qwen2DecoderLayer`] Args: - config: QWen2Config + config: Qwen2Config """ - def __init__(self, config: QWen2Config): + def __init__(self, config: Qwen2Config): super().__init__(config) self.padding_idx = config.pad_token_id self.vocab_size = config.vocab_size @@ -896,11 +898,11 @@ def __init__(self, config: QWen2Config): self.layers = nn.LayerList( [ - QWen2DecoderLayer(config, layerwise_recompute=layer_idx not in self.no_recompute_layers) + Qwen2DecoderLayer(config, layerwise_recompute=layer_idx not in self.no_recompute_layers) for layer_idx in range(config.num_hidden_layers) ] ) - self.norm = QWen2RMSNorm(config.hidden_size, eps=config.rms_norm_eps) + self.norm = Qwen2RMSNorm(config) def get_input_embeddings(self): return self.embed_tokens @@ -1100,14 +1102,14 @@ def forward( ) -class QWen2PretrainingCriterion(nn.Layer): +class Qwen2PretrainingCriterion(nn.Layer): """ Criterion for Mixtral. It calculates the final loss. 
""" - def __init__(self, config): - super(QWen2PretrainingCriterion, self).__init__() + def __init__(self, config: Qwen2Config): + super(Qwen2PretrainingCriterion, self).__init__() self.ignore_index = getattr(config, "ignore_index", -100) self.config = config self.enable_parallel_cross_entropy = config.tensor_parallel_degree > 1 and config.tensor_parallel_output @@ -1135,9 +1137,9 @@ def forward(self, prediction_scores, masked_lm_labels): return loss -class QWen2LMHead(nn.Layer): - def __init__(self, config: QWen2Config): - super(QWen2LMHead, self).__init__() +class Qwen2LMHead(nn.Layer): + def __init__(self, config: Qwen2Config): + super(Qwen2LMHead, self).__init__() self.config = config if config.tensor_parallel_degree > 1 and config.vocab_size % config.tensor_parallel_degree == 0: vocab_size = config.vocab_size // config.tensor_parallel_degree @@ -1166,15 +1168,15 @@ def forward(self, hidden_states, tensor_parallel_output=None): return logits -class QWen2ForCausalLM(QWen2PretrainedModel): +class Qwen2ForCausalLM(Qwen2PretrainedModel): enable_to_static_method = True _tied_weights_keys = ["lm_head.weight"] - def __init__(self, config: QWen2Config): + def __init__(self, config: Qwen2Config): super().__init__(config) - self.qwen2 = QWen2Model(config) - self.lm_head = QWen2LMHead(config) - self.criterion = QWen2PretrainingCriterion(config) + self.qwen2 = Qwen2Model(config) + self.lm_head = Qwen2LMHead(config) + self.criterion = Qwen2PretrainingCriterion(config) self.vocab_size = config.vocab_size def get_input_embeddings(self): @@ -1283,9 +1285,9 @@ def forward( Example: ```python - >>> from transformers import AutoTokenizer, QWen2ForCausalLM + >>> from transformers import AutoTokenizer, Qwen2ForCausalLM - >>> model = QWen2ForCausalLM.from_pretrained(PATH_TO_CONVERTED_WEIGHTS) + >>> model = Qwen2ForCausalLM.from_pretrained(PATH_TO_CONVERTED_WEIGHTS) >>> tokenizer = AutoTokenizer.from_pretrained(PATH_TO_CONVERTED_TOKENIZER) >>> prompt = "Hey, are you conscious? Can you talk to me?" 
@@ -1343,11 +1345,11 @@ def forward( ) -class QWen2ForSequenceClassification(QWen2PretrainedModel): - def __init__(self, config: QWen2Config): +class Qwen2ForSequenceClassification(Qwen2PretrainedModel): + def __init__(self, config: Qwen2Config): super().__init__(config) self.num_labels = config.num_labels - self.qwen2 = QWen2Model(config) + self.qwen2 = Qwen2Model(config) self.score = nn.Linear(config.hidden_size, self.num_labels, bias=False) def get_input_embeddings(self): @@ -1447,12 +1449,12 @@ def forward( ) -# Copied from transformers.models.llama.modeling_llama.LlamaForTokenClassification with Llama->QWen2, LLAMA->QWEN2 -class QWen2ForTokenClassification(QWen2PretrainedModel): - def __init__(self, config: QWen2Config): +# Copied from transformers.models.llama.modeling_llama.LlamaForTokenClassification with Llama->Qwen2, LLAMA->QWEN2 +class Qwen2ForTokenClassification(Qwen2PretrainedModel): + def __init__(self, config: Qwen2Config): super().__init__(config) self.num_labels = config.num_labels - self.qwen2 = QWen2Model(config) + self.qwen2 = Qwen2Model(config) if getattr(config, "classifier_dropout", None) is not None: classifier_dropout = config.classifier_dropout elif getattr(config, "hidden_dropout", None) is not None: diff --git a/paddlenlp/transformers/qwen2/modeling_pp.py b/paddlenlp/transformers/qwen2/modeling_pp.py index 2965de45c4cd..952daf419cbd 100644 --- a/paddlenlp/transformers/qwen2/modeling_pp.py +++ b/paddlenlp/transformers/qwen2/modeling_pp.py @@ -21,13 +21,13 @@ from ...utils.tools import get_env_device from ..model_utils import PipelinePretrainedModel from .modeling import ( - QWen2Config, - QWen2DecoderLayer, - QWen2LMHead, - QWen2Model, - QWen2PretrainedModel, - QWen2PretrainingCriterion, - QWen2RMSNorm, + Qwen2Config, + Qwen2DecoderLayer, + Qwen2LMHead, + Qwen2Model, + Qwen2PretrainedModel, + Qwen2PretrainingCriterion, + Qwen2RMSNorm, ) __all__ = [ @@ -71,10 +71,10 @@ def return_args(hidden_states, attention_mask=None, position_ids=None): return ret -class QWen2EmbeddingPipe(nn.Layer): +class Qwen2EmbeddingPipe(nn.Layer): """Extends QWenEmbeddings to forward attention_mask through the pipeline.""" - def __init__(self, config: QWen2Config): + def __init__(self, config: Qwen2Config): super().__init__() self.hidden_size = config.hidden_size if config.tensor_parallel_degree > 1: @@ -109,7 +109,7 @@ def forward(self, args): batch_size, seq_length = input_ids.shape if attention_mask is not None: - attention_mask = QWen2Model._prepare_decoder_attention_mask( + attention_mask = Qwen2Model._prepare_decoder_attention_mask( attention_mask, (batch_size, seq_length), 0, input_embeds.dtype ) attention_mask.stop_gradient = True @@ -122,7 +122,7 @@ def forward(self, args): return return_args(input_embeds, attention_mask, position_ids) -class QWen2DecoderLayerPipe(QWen2DecoderLayer): +class Qwen2DecoderLayerPipe(Qwen2DecoderLayer): def forward(self, args): hidden_states, attention_mask, position_ids = parse_args(args) @@ -144,10 +144,10 @@ def forward(self, args): return return_args(hidden_states, attention_mask, position_ids) -class QWen2RMSNormPipe(QWen2RMSNorm): - def __init__(self, config: QWen2Config): +class Qwen2RMSNormPipe(Qwen2RMSNorm): + def __init__(self, config: Qwen2Config): super().__init__() - self.norm = QWen2RMSNorm(config) + self.norm = Qwen2RMSNorm(config) def forward(self, args): hidden_states, attention_mask, position_ids = parse_args(args) @@ -161,15 +161,15 @@ class QWenForCausalLMPipe(PipelinePretrainedModel, PipelineLayer): sequence of layers including 
embedding, transformer layers, and output. """ - config_class = QWen2Config + config_class = Qwen2Config - _get_tensor_parallel_mappings = QWen2PretrainedModel._get_tensor_parallel_mappings - _init_weights = QWen2PretrainedModel._init_weights - _keys_to_ignore_on_load_unexpected = QWen2PretrainedModel._keys_to_ignore_on_load_unexpected + _get_tensor_parallel_mappings = Qwen2PretrainedModel._get_tensor_parallel_mappings + _init_weights = Qwen2PretrainedModel._init_weights + _keys_to_ignore_on_load_unexpected = Qwen2PretrainedModel._keys_to_ignore_on_load_unexpected # DONOT Add base_model_prefix !!!! - def __init__(self, config: QWen2Config): + def __init__(self, config: Qwen2Config): self.config = config self.use_recompute = self.config.use_recompute @@ -192,14 +192,14 @@ def get_hcg(): config.tensor_parallel_degree = tensor_parallel_degree config.tensor_parallel_rank = tensor_parallel_rank - self.add_sequential_layer(LayerDesc(QWen2EmbeddingPipe, config=config), "qwen2") + self.add_sequential_layer(LayerDesc(Qwen2EmbeddingPipe, config=config), "qwen2") for i in range(config.num_hidden_layers): self.add_sequential_layer( - LayerDesc(QWen2DecoderLayerPipe, config=config, layerwise_recompute=i not in self.no_recompute_layers), + LayerDesc(Qwen2DecoderLayerPipe, config=config, layerwise_recompute=i not in self.no_recompute_layers), f"qwen2.layers.{i}", ) - self.add_sequential_layer(LayerDesc(QWen2RMSNormPipe, config=config), "norm") - self.add_sequential_layer(LayerDesc(QWen2LMHead, config=config), "lm_head") + self.add_sequential_layer(LayerDesc(Qwen2RMSNormPipe, config=config), "norm") + self.add_sequential_layer(LayerDesc(Qwen2LMHead, config=config), "lm_head") recompute_interval = 0 if self.use_recompute and self.recompute_granularity == "full": @@ -208,14 +208,14 @@ def get_hcg(): ), "pp recompute interval should smaller than num layers of each pp chunk" recompute_interval = self.config.pp_recompute_interval - seg_method = "layer:QWen2DecoderLayer" + seg_method = "layer:Qwen2DecoderLayer" if config.num_hidden_layers % get_hcg().topology().get_dim_size("pipe") != 0: seg_method = "uniform" PipelineLayer.__init__( self, layers=self.get_sequential_layers(), - loss_fn=QWen2PretrainingCriterion(config), + loss_fn=Qwen2PretrainingCriterion(config), topology=get_hcg().topology(), seg_method=seg_method, recompute_interval=recompute_interval, diff --git a/paddlenlp/transformers/qwen2/tokenizer.py b/paddlenlp/transformers/qwen2/tokenizer.py index ddc475c2721f..f637e4a10706 100644 --- a/paddlenlp/transformers/qwen2/tokenizer.py +++ b/paddlenlp/transformers/qwen2/tokenizer.py @@ -12,7 +12,7 @@ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. -"""Tokenization classes for QWen2.""" +"""Tokenization classes for Qwen2.""" import json import os @@ -29,7 +29,7 @@ "merges_file": "merges.txt", } -__all__ = ["QWen2MoETokenizer"] +__all__ = ["Qwen2Tokenizer"] MAX_MODEL_INPUT_SIZES = {"qwen/qwen-tokenizer": 32768} @@ -75,7 +75,7 @@ def get_pairs(word): return pairs -class QWen2MoETokenizer(PretrainedTokenizer): +class Qwen2Tokenizer(PretrainedTokenizer): """ Construct a Qwen2 tokenizer. Based on byte-level Byte-Pair-Encoding. 
diff --git a/paddlenlp/transformers/qwen2moe/__init__.py b/paddlenlp/transformers/qwen2moe/__init__.py index e64672df2941..2f2acfa9b339 100644 --- a/paddlenlp/transformers/qwen2moe/__init__.py +++ b/paddlenlp/transformers/qwen2moe/__init__.py @@ -12,6 +12,6 @@ # See the License for the specific language governing permissions and # limitations under the License. +from ..qwen2.tokenizer import * from .configuration import * from .modeling import * -from .tokenizer import * From 1054f06985c9683858424d6ff2e58f13ad8778b5 Mon Sep 17 00:00:00 2001 From: DrownFish19 Date: Wed, 5 Jun 2024 08:02:55 +0000 Subject: [PATCH 33/41] update Qwen2MoE to Qwen2Moe --- paddlenlp/transformers/__init__.py | 2 +- .../{qwen2moe => qwen2_moe}/__init__.py | 0 .../{qwen2moe => qwen2_moe}/configuration.py | 22 +-- .../{qwen2moe => qwen2_moe}/modeling.py | 133 +++++++++--------- 4 files changed, 75 insertions(+), 82 deletions(-) rename paddlenlp/transformers/{qwen2moe => qwen2_moe}/__init__.py (100%) rename paddlenlp/transformers/{qwen2moe => qwen2_moe}/configuration.py (94%) rename paddlenlp/transformers/{qwen2moe => qwen2_moe}/modeling.py (95%) diff --git a/paddlenlp/transformers/__init__.py b/paddlenlp/transformers/__init__.py index 61242951b24f..324ec2a09e4f 100644 --- a/paddlenlp/transformers/__init__.py +++ b/paddlenlp/transformers/__init__.py @@ -295,7 +295,7 @@ from .deberta_v2.tokenizer import * from .deberta_v2.configuration import * from .qwen2 import * -from .qwen2moe import * +from .qwen2_moe import * # For faster tokenizer from ..utils.import_utils import is_fast_tokenizer_available diff --git a/paddlenlp/transformers/qwen2moe/__init__.py b/paddlenlp/transformers/qwen2_moe/__init__.py similarity index 100% rename from paddlenlp/transformers/qwen2moe/__init__.py rename to paddlenlp/transformers/qwen2_moe/__init__.py diff --git a/paddlenlp/transformers/qwen2moe/configuration.py b/paddlenlp/transformers/qwen2_moe/configuration.py similarity index 94% rename from paddlenlp/transformers/qwen2moe/configuration.py rename to paddlenlp/transformers/qwen2_moe/configuration.py index 636298d04bb6..7fa5f91b2b93 100644 --- a/paddlenlp/transformers/qwen2moe/configuration.py +++ b/paddlenlp/transformers/qwen2_moe/configuration.py @@ -12,19 +12,19 @@ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. -""" Qwen2MoE model configuration""" +""" Qwen2Moe model configuration""" from paddlenlp.transformers.configuration_utils import PretrainedConfig __all__ = [ - "QWen2MoEConfig", + "Qwen2MoeConfig", ] -class QWen2MoEConfig(PretrainedConfig): +class Qwen2MoeConfig(PretrainedConfig): r""" - This is the configuration class to store the configuration of a [`QWen2MoEModel`]. It is used to instantiate a - Qwen2MoE model according to the specified arguments, defining the model architecture. Instantiating a configuration + This is the configuration class to store the configuration of a [`Qwen2MoeModel`]. It is used to instantiate a + Qwen2Moe model according to the specified arguments, defining the model architecture. Instantiating a configuration with the defaults will yield a similar configuration to that of Qwen1.5-MoE-A2.7B" [Qwen/Qwen1.5-MoE-A2.7B"](https://huggingface.co/Qwen/Qwen1.5-MoE-A2.7B"). @@ -34,8 +34,8 @@ class QWen2MoEConfig(PretrainedConfig): Args: vocab_size (`int`, *optional*, defaults to 151936): - Vocabulary size of the Qwen2MoE model. 
Defines the number of different tokens that can be represented by the - `inputs_ids` passed when calling [`QWen2MoEModel`] + Vocabulary size of the Qwen2Moe model. Defines the number of different tokens that can be represented by the + `inputs_ids` passed when calling [`Qwen2MoeModel`] hidden_size (`int`, *optional*, defaults to 2048): Dimension of the hidden representations. intermediate_size (`int`, *optional*, defaults to 5632): @@ -93,13 +93,13 @@ class QWen2MoEConfig(PretrainedConfig): The aux loss factor for the total loss. ```python - >>> from paddlenlp.transformers import QWen2MoEModel, QWen2MoEConfig + >>> from paddlenlp.transformers import Qwen2MoeModel, Qwen2MoeConfig - >>> # Initializing a Qwen2MoE style configuration - >>> configuration = QWen2MoEConfig() + >>> # Initializing a Qwen2Moe style configuration + >>> configuration = Qwen2MoeConfig() >>> # Initializing a model from the Qwen1.5-MoE-A2.7B" style configuration - >>> model = QWen2MoEModel(configuration) + >>> model = Qwen2MoeModel(configuration) >>> # Accessing the model configuration >>> configuration = model.config diff --git a/paddlenlp/transformers/qwen2moe/modeling.py b/paddlenlp/transformers/qwen2_moe/modeling.py similarity index 95% rename from paddlenlp/transformers/qwen2moe/modeling.py rename to paddlenlp/transformers/qwen2_moe/modeling.py index 8e3ed555ff81..c44e8bc0da88 100644 --- a/paddlenlp/transformers/qwen2moe/modeling.py +++ b/paddlenlp/transformers/qwen2_moe/modeling.py @@ -12,7 +12,7 @@ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. -""" Paddle QWen2MoE model.""" +""" Paddle Qwen2Moe model.""" from __future__ import annotations import math @@ -27,11 +27,6 @@ from paddle.distributed import fleet from paddle.distributed.fleet.meta_parallel import get_rng_state_tracker from paddle.distributed.fleet.utils import recompute - -try: - from paddle.incubate.nn.functional import fused_rotary_position_embedding -except ImportError: - fused_rotary_position_embedding = None from paddle.distributed.fleet.utils.sequence_parallel_utils import ( ColumnSequenceParallelLinear, GatherOp, @@ -40,19 +35,17 @@ mark_as_sequence_parallel_parameter, ) -from paddlenlp.transformers.conversion_utils import ( - StateDictNameMapping, - init_name_mappings, -) -from paddlenlp.transformers.model_outputs import ( - MoECausalLMOutputWithPast, - MoEModelOutputWithPast, -) -from paddlenlp.transformers.model_utils import PretrainedModel, register_base_model -from paddlenlp.utils.log import logger - +from ...utils.log import logger from ..activations import ACT2FN -from .configuration import QWen2MoEConfig +from ..conversion_utils import StateDictNameMapping, init_name_mappings +from ..model_outputs import MoECausalLMOutputWithPast, MoEModelOutputWithPast +from ..model_utils import PretrainedModel, register_base_model +from .configuration import Qwen2MoeConfig + +try: + from paddle.incubate.nn.functional import fused_rotary_position_embedding +except ImportError: + fused_rotary_position_embedding = None try: from paddle.nn.functional.flash_attention import flash_attention @@ -60,10 +53,10 @@ flash_attention = None __all__ = [ - "QWen2MoEModel", - "QWen2MoEPretrainedModel", - "QWen2MoEForCausalLM", - "QWen2MoEPretrainingCriterion", + "Qwen2MoeModel", + "Qwen2MoePretrainedModel", + "Qwen2MoeForCausalLM", + "Qwen2MoePretrainingCriterion", ] @@ -342,8 +335,8 @@ def _expand_2d_mask(mask, dtype, tgt_length): return 
expanded_mask -class QWen2MoERMSNorm(nn.Layer): - def __init__(self, config: QWen2MoEConfig): +class Qwen2MoeRMSNorm(nn.Layer): + def __init__(self, config: Qwen2MoeConfig): super().__init__() self.hidden_size = config.hidden_size self.weight = paddle.create_parameter( @@ -373,7 +366,7 @@ def forward(self, hidden_states): return hidden_states * self.weight -class QWen2MoERotaryEmbedding(nn.Layer): +class Qwen2MoeRotaryEmbedding(nn.Layer): def __init__(self, dim, max_position_embeddings=2048, base=10000): super().__init__() self.dim = dim @@ -417,7 +410,7 @@ def rotate_half(x): def apply_rotary_pos_emb(q, k, cos, sin, position_ids): if position_ids is None: - # Note: Only for QWen2MoEForCausalLMPipe model pretraining + # Note: Only for Qwen2MoeForCausalLMPipe model pretraining cos = cos[:, : q.shape[1], :, :] # [bs, seq_len, 1, dim] sin = sin[:, : q.shape[1], :, :] # [bs, seq_len, 1, dim] else: @@ -430,9 +423,9 @@ def apply_rotary_pos_emb(q, k, cos, sin, position_ids): return q_embed, k_embed -# Modified from transformers.models.mistral.modeling_mistral.MistralMLP with Mistral->QWen2MoE -class QWen2MoEMLP(nn.Layer): - def __init__(self, config: QWen2MoEConfig, is_shared=False): +# Modified from transformers.models.mistral.modeling_mistral.MistralMLP with Mistral->Qwen2Moe +class Qwen2MoeMLP(nn.Layer): + def __init__(self, config: Qwen2MoeConfig, is_shared=False): super().__init__() self.hidden_size = config.hidden_size self.intermediate_size = ( @@ -490,13 +483,13 @@ def repeat_kv(hidden_states: paddle.Tensor, n_rep: int) -> paddle.Tensor: return hidden_states.reshape([batch, slen, num_key_value_heads * n_rep, head_dim]) -class QWen2MoEAttention(nn.Layer): +class Qwen2MoeAttention(nn.Layer): """ Multi-headed attention from 'Attention Is All You Need' paper. Modified to use sliding window attention: Longformer and "Generating Long Sequences with Sparse Transformers". 
""" - def __init__(self, config: QWen2MoEConfig, layerwise_recompute: bool = True): + def __init__(self, config: Qwen2MoeConfig, layerwise_recompute: bool = True): super().__init__() self.config = config @@ -564,7 +557,7 @@ def __init__(self, config: QWen2MoEConfig, layerwise_recompute: bool = True): self.v_proj = nn.Linear(self.hidden_size, self.config.num_key_value_heads * self.head_dim, bias_attr=True) self.o_proj = nn.Linear(self.hidden_size, self.hidden_size, bias_attr=False) - self.rotary_emb = QWen2MoERotaryEmbedding( + self.rotary_emb = Qwen2MoeRotaryEmbedding( self.head_dim, max_position_embeddings=self.max_position_embeddings, base=self.rope_theta, @@ -687,17 +680,17 @@ def forward( return outputs -class QWen2MoESparseMoEBlock(nn.Layer): - def __init__(self, config: QWen2MoEConfig): +class Qwen2MoeSparseMoEBlock(nn.Layer): + def __init__(self, config: Qwen2MoeConfig): super().__init__() self.num_experts = config.num_experts self.top_k = config.num_experts_per_tok self.norm_topk_prob = config.norm_topk_prob self.gate = nn.Linear(config.hidden_size, self.num_experts, bias_attr=False) - self.experts = nn.LayerList([QWen2MoEMLP(config) for _ in range(self.num_experts)]) + self.experts = nn.LayerList([Qwen2MoeMLP(config) for _ in range(self.num_experts)]) - self.shared_expert = QWen2MoEMLP(config, is_shared=True) + self.shared_expert = Qwen2MoeMLP(config, is_shared=True) self.shared_expert_gate = nn.Linear(config.hidden_size, 1, bias_attr=False) def forward(self, hidden_states): @@ -709,7 +702,7 @@ def forward(self, hidden_states): with paddle.amp.auto_cast(False): routing_weights = F.softmax(router_logits.astype("float32"), axis=1) routing_weights, selected_experts = paddle.topk(routing_weights, self.top_k, axis=-1) - if self.norm_topk_prob: # Note: Mixtral is set norm as default, QWen2MoE is set to no norm + if self.norm_topk_prob: # Note: Mixtral is set norm as default, Qwen2Moe is set to no norm routing_weights /= routing_weights.sum(axis=-1, keepdim=True) # we cast back to input dtype routing_weights = routing_weights.astype(hidden_states.dtype) @@ -751,21 +744,21 @@ def forward(self, hidden_states): return final_hidden_states, router_logits -class QWen2MoEDecoderLayer(nn.Layer): - def __init__(self, config: QWen2MoEConfig, layerwise_recompute: bool = False): +class Qwen2MoeDecoderLayer(nn.Layer): + def __init__(self, config: Qwen2MoeConfig, layerwise_recompute: bool = False): super().__init__() self.config = config - self.self_attn = QWen2MoEAttention(config, layerwise_recompute) + self.self_attn = Qwen2MoeAttention(config, layerwise_recompute) if config.num_experts > 0: - self.mlp = QWen2MoESparseMoEBlock(config) + self.mlp = Qwen2MoeSparseMoEBlock(config) else: # num_experts == 0 or this layer is not sparse layer - self.mlp = QWen2MoEMLP(config) + self.mlp = Qwen2MoeMLP(config) - self.input_layernorm = QWen2MoERMSNorm(config) - self.post_attention_layernorm = QWen2MoERMSNorm(config) + self.input_layernorm = Qwen2MoeRMSNorm(config) + self.post_attention_layernorm = Qwen2MoeRMSNorm(config) self.sequence_parallel = config.sequence_parallel # Note that we will actually perform a recompute only if both enable_recompute and layerwise_recompute are set to True @@ -876,13 +869,13 @@ def forward( return outputs -class QWen2MoEPretrainedModel(PretrainedModel): - config_class = QWen2MoEConfig +class Qwen2MoePretrainedModel(PretrainedModel): + config_class = Qwen2MoeConfig base_model_prefix = "qwen2moe" _keys_to_ignore_on_load_unexpected = [r"self_attn.rotary_emb.inv_freq"] @classmethod - 
def _get_name_mappings(cls, config: QWen2MoEConfig) -> list[StateDictNameMapping]: + def _get_name_mappings(cls, config: Qwen2MoeConfig) -> list[StateDictNameMapping]: mappings: list[StateDictNameMapping] = [] model_mappings = [ ["embed_tokens.weight"], @@ -918,8 +911,8 @@ def _get_name_mappings(cls, config: QWen2MoEConfig) -> list[StateDictNameMapping model_mappings.append([f"layers.{layer_index}.mlp.shared_expert_gate.weight", None, "transpose"]) init_name_mappings(mappings=model_mappings) - # base-model prefix "QWen2MoEModel" - if "QWen2MoEModel" not in config.architectures: + # base-model prefix "Qwen2MoeModel" + if "Qwen2MoeModel" not in config.architectures: for mapping in model_mappings: mapping[0] = "model." + mapping[0] mapping[1] = "qwen2moe." + mapping[1] @@ -929,7 +922,7 @@ def _get_name_mappings(cls, config: QWen2MoEConfig) -> list[StateDictNameMapping return mappings @classmethod - def _get_tensor_parallel_mappings(cls, config: QWen2MoEConfig, is_split=True): + def _get_tensor_parallel_mappings(cls, config: Qwen2MoeConfig, is_split=True): from paddlenlp.transformers.conversion_utils import split_or_merge_func fn = split_or_merge_func( @@ -1012,7 +1005,7 @@ def _init_weights(self, layer): mpu.VocabParallelEmbedding, mpu.ColumnParallelLinear, mpu.RowParallelLinear, - QWen2MoELMHead, + Qwen2MoeLMHead, ColumnSequenceParallelLinear, RowSequenceParallelLinear, ), @@ -1047,23 +1040,23 @@ def _init_weights(self, layer): # sublayer is init first # scale RowParallelLinear weight with paddle.no_grad(): - if isinstance(layer, QWen2MoEMLP): + if isinstance(layer, Qwen2MoeMLP): factor = 1 / math.sqrt(2 * self.config.num_hidden_layers) layer.down_proj.weight.scale_(factor) - if isinstance(layer, QWen2MoEAttention): + if isinstance(layer, Qwen2MoeAttention): factor = 1 / math.sqrt(2 * self.config.num_hidden_layers) layer.o_proj.weight.scale_(factor) @register_base_model -class QWen2MoEModel(QWen2MoEPretrainedModel): +class Qwen2MoeModel(Qwen2MoePretrainedModel): """ - Transformer decoder consisting of *config.num_hidden_layers* layers. Each layer is a [`QWen2MoEDecoderLayer`] + Transformer decoder consisting of *config.num_hidden_layers* layers. Each layer is a [`Qwen2MoeDecoderLayer`] Args: - config: QWen2MoEConfig + config: Qwen2MoeConfig """ - def __init__(self, config: QWen2MoEConfig): + def __init__(self, config: Qwen2MoeConfig): super().__init__(config) self.padding_idx = config.pad_token_id self.vocab_size = config.vocab_size @@ -1088,11 +1081,11 @@ def __init__(self, config: QWen2MoEConfig): self.layers = nn.LayerList( [ - QWen2MoEDecoderLayer(config, layerwise_recompute=layer_idx not in self.no_recompute_layers) + Qwen2MoeDecoderLayer(config, layerwise_recompute=layer_idx not in self.no_recompute_layers) for layer_idx in range(config.num_hidden_layers) ] ) - self.norm = QWen2MoERMSNorm(config) + self.norm = Qwen2MoeRMSNorm(config) def get_input_embeddings(self): return self.embed_tokens @@ -1314,14 +1307,14 @@ def forward( ) -class QWen2MoEPretrainingCriterion(nn.Layer): +class Qwen2MoePretrainingCriterion(nn.Layer): """ Criterion for Mixtral. It calculates the final loss. 
""" - def __init__(self, config: QWen2MoEConfig): - super(QWen2MoEPretrainingCriterion, self).__init__() + def __init__(self, config: Qwen2MoeConfig): + super(Qwen2MoePretrainingCriterion, self).__init__() self.ignore_index = getattr(config, "ignore_index", -100) self.config = config self.enable_parallel_cross_entropy = config.tensor_parallel_degree > 1 and config.tensor_parallel_output @@ -1349,9 +1342,9 @@ def forward(self, prediction_scores, masked_lm_labels): return loss -class QWen2MoELMHead(nn.Layer): - def __init__(self, config: QWen2MoEConfig): - super(QWen2MoELMHead, self).__init__() +class Qwen2MoeLMHead(nn.Layer): + def __init__(self, config: Qwen2MoeConfig): + super(Qwen2MoeLMHead, self).__init__() self.config = config if config.tensor_parallel_degree > 1 and config.vocab_size % config.tensor_parallel_degree == 0: vocab_size = config.vocab_size // config.tensor_parallel_degree @@ -1380,17 +1373,17 @@ def forward(self, hidden_states, tensor_parallel_output=None): return logits -class QWen2MoEForCausalLM(QWen2MoEPretrainedModel): +class Qwen2MoeForCausalLM(Qwen2MoePretrainedModel): enable_to_static_method = True _tied_weights_keys = ["lm_head.weight"] - def __init__(self, config: QWen2MoEConfig): + def __init__(self, config: Qwen2MoeConfig): super().__init__(config) self.config = config - self.qwen2moe = QWen2MoEModel(config) - self.lm_head = QWen2MoELMHead(config) - self.criterion = QWen2MoEPretrainingCriterion(config) + self.qwen2moe = Qwen2MoeModel(config) + self.lm_head = Qwen2MoeLMHead(config) + self.criterion = Qwen2MoePretrainingCriterion(config) self.router_aux_loss_coef = config.router_aux_loss_coef self.num_experts = config.num_experts self.num_experts_per_tok = config.num_experts_per_tok From 056b04c69f81a722e3dc81c358ff4c2072aae73a Mon Sep 17 00:00:00 2001 From: DrownFish19 Date: Wed, 5 Jun 2024 08:25:48 +0000 Subject: [PATCH 34/41] update readme --- llm/qwen/README.md | 43 +++++++++++++++++++++++++++++++------------ 1 file changed, 31 insertions(+), 12 deletions(-) diff --git a/llm/qwen/README.md b/llm/qwen/README.md index 97dc43549375..b02f0ccb2178 100644 --- a/llm/qwen/README.md +++ b/llm/qwen/README.md @@ -5,21 +5,40 @@ [通义千问(Qwen)](https://arxiv.org/abs/2205.01068) 是阿里云研发的通义千问大模型系列的模型, 有 70 亿和 140 亿两个规模。Qwen是基于Transformer的大语言模型, 在超大规模的预训练数据上进行训练得到。预训练数据类型多样,覆盖广泛,包括大量网络文本、专业书籍、代码等。 **支持模型权重:** -| Model | -|-------------------| -| qwen/qwen-7b | -| qwen/qwen-7b-chat | -| qwen/qwen-14b | -| qwen/qwen-14b-chat| -| qwen/qwen-72b | -| qwen/qwen-72b-chat| +| Model | +|--------------------| +| qwen/qwen-7b | +| qwen/qwen-7b-chat | +| qwen/qwen-14b | +| qwen/qwen-14b-chat | +| qwen/qwen-72b | +| qwen/qwen-72b-chat | -[通义千问(Qwen1.5-MoE)](https://qwenlm.github.io/blog/qwen-moe/) 是阿里云研发的通义千问MoE模型。Qwen1.5-MoE基于Transformer架构,采用了专家混合(MoE)架构,这些模型通过密集型语言模型升级改造而来。例如,Qwen1.5-MoE-A2.7B就是从Qwen-1.8B升级改造而来的。它总共有143亿个参数,但在运行时仅激活27亿个参数,却实现了与Qwen1.5-7B相近的性能,而训练资源仅为其25%。 + + +[通义千问(Qwen1.5)](https://qwenlm.github.io/blog/qwen1.5/) 是阿里云研发的通义千问系列模型升级版。Qwen1.5包括0.5B、1.8B、4B、7B、14B、32B、72B、110B和MoE共计9个不同规模的Base和Chat模型。 **支持模型权重:** -| Model (qwen-1.5) | -|------------------------| -| qwen/qwen1.5-moe-a2.7b | +| Model (qwen-1.5) | +|-----------------------------| +| Qwen/Qwen1.5-0.5B | +| Qwen/Qwen1.5-0.5B-Chat | +| Qwen/Qwen1.5-1.8B | +| Qwen/Qwen1.5-1.8B-Chat | +| Qwen/Qwen1.5-4B | +| Qwen/Qwen1.5-4B-Chat | +| Qwen/Qwen1.5-7B | +| Qwen/Qwen1.5-7B-Chat | +| Qwen/Qwen1.5-14B | +| Qwen/Qwen1.5-14B-Chat | +| Qwen/Qwen1.5-32B | +| Qwen/Qwen1.5-32B-Chat | +| Qwen/Qwen1.5-72B | +| 
Qwen/Qwen1.5-72B-Chat | +| Qwen/Qwen1.5-110B | +| Qwen/Qwen1.5-110B-Chat | +| Qwen/Qwen1.5-MoE-A2.7B | +| Qwen/Qwen1.5-MoE-A2.7B-Chat | ## 2. 模型精调 请参考[LLM全流程工具介绍](../README.md) From ab08c174a799aa6c8ff1f35caddcc137ae4e01a4 Mon Sep 17 00:00:00 2001 From: DrownFish19 Date: Wed, 5 Jun 2024 08:27:07 +0000 Subject: [PATCH 35/41] update qwen2moe sft and lora json --- ...ora_argument_qwen1.5moe.json => lora_argument_qwen2moe.json} | 2 +- ...{sft_argument_qwen1.5moe.json => sft_argument_qwen2moe.json} | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) rename llm/qwen/{lora_argument_qwen1.5moe.json => lora_argument_qwen2moe.json} (94%) rename llm/qwen/{sft_argument_qwen1.5moe.json => sft_argument_qwen2moe.json} (94%) diff --git a/llm/qwen/lora_argument_qwen1.5moe.json b/llm/qwen/lora_argument_qwen2moe.json similarity index 94% rename from llm/qwen/lora_argument_qwen1.5moe.json rename to llm/qwen/lora_argument_qwen2moe.json index c511e578e56b..0344e3885ba0 100644 --- a/llm/qwen/lora_argument_qwen1.5moe.json +++ b/llm/qwen/lora_argument_qwen2moe.json @@ -1,5 +1,5 @@ { - "model_name_or_path": "qwen/qwen1.5-moe-a2.7b", + "model_name_or_path": "Qwen/Qwen1.5-MoE-A2.7B", "dataset_name_or_path": "./data", "output_dir": "./checkpoints/qwen2moe_lora_ckpts", "per_device_train_batch_size": 4, diff --git a/llm/qwen/sft_argument_qwen1.5moe.json b/llm/qwen/sft_argument_qwen2moe.json similarity index 94% rename from llm/qwen/sft_argument_qwen1.5moe.json rename to llm/qwen/sft_argument_qwen2moe.json index d3d048f423d3..75d3a93500f5 100644 --- a/llm/qwen/sft_argument_qwen1.5moe.json +++ b/llm/qwen/sft_argument_qwen2moe.json @@ -1,5 +1,5 @@ { - "model_name_or_path": "qwen/qwen1.5-moe-a2.7b", + "model_name_or_path": "Qwen/Qwen1.5-MoE-A2.7B", "dataset_name_or_path": "./data", "output_dir": "./checkpoints/qwen2moe_sft_ckpts", "per_device_train_batch_size": 4, From ad02fdc9da2cd03b0b95640ffe35bc173256c7ac Mon Sep 17 00:00:00 2001 From: DrownFish19 Date: Wed, 5 Jun 2024 08:28:52 +0000 Subject: [PATCH 36/41] update qwen2moe base name --- llm/data.py | 4 +- llm/utils.py | 2 +- paddlenlp/transformers/auto/modeling.py | 2 +- .../transformers/qwen2_moe/configuration.py | 2 +- paddlenlp/transformers/qwen2_moe/modeling.py | 20 +++++----- tests/transformers/qwen2moe/test_modeling.py | 38 +++++++++---------- 6 files changed, 34 insertions(+), 34 deletions(-) diff --git a/llm/data.py b/llm/data.py index addba7115b2f..f2c0112ab7e8 100644 --- a/llm/data.py +++ b/llm/data.py @@ -44,12 +44,12 @@ def get_convert_example(model): if base_model_prefix == "chatglm": return convert_example_chatglm - elif base_model_prefix in ["chatglm_v2", "llama", "bloom", "opt", "qwen", "mixtral", "gemma", "qwen2moe"]: + elif base_model_prefix in ["chatglm_v2", "llama", "bloom", "opt", "qwen", "mixtral", "gemma", "qwen2_moe"]: return convert_example_common else: raise ValueError( f"Unknown base_model_prefix: {model.base_model_prefix}. 
Supported base_model_prefix list: chatglm, bloom, llama, qwen, mixtral, gemma", - "qwen2moe", + "qwen2_moe", ) diff --git a/llm/utils.py b/llm/utils.py index 1d1098022135..6e623c38ca00 100644 --- a/llm/utils.py +++ b/llm/utils.py @@ -161,7 +161,7 @@ def get_lora_target_modules(model): ".*w2.*", ".*w3.*", ] - elif model.base_model_prefix == "qwen2moe": + elif model.base_model_prefix == "qwen2_moe": target_modules = [ ".*q_proj.*", ".*k_proj.*", diff --git a/paddlenlp/transformers/auto/modeling.py b/paddlenlp/transformers/auto/modeling.py index e84a62986d51..5e6efb5a3619 100644 --- a/paddlenlp/transformers/auto/modeling.py +++ b/paddlenlp/transformers/auto/modeling.py @@ -118,7 +118,7 @@ ("Bloom", "bloom"), ("QWen", "qwen"), ("Mixtral", "mixtral"), - ("QWen2MoE", "qwen2moe"), + ("Qwen2Moe", "qwen2_moe"), ("Gemma", "gemma"), ] ) diff --git a/paddlenlp/transformers/qwen2_moe/configuration.py b/paddlenlp/transformers/qwen2_moe/configuration.py index 7fa5f91b2b93..46fe1f91437d 100644 --- a/paddlenlp/transformers/qwen2_moe/configuration.py +++ b/paddlenlp/transformers/qwen2_moe/configuration.py @@ -105,7 +105,7 @@ class Qwen2MoeConfig(PretrainedConfig): >>> configuration = model.config ```""" - model_type = "qwen2moe" + model_type = "qwen2_moe" keys_to_ignore_at_inference = ["past_key_values"] def __init__( diff --git a/paddlenlp/transformers/qwen2_moe/modeling.py b/paddlenlp/transformers/qwen2_moe/modeling.py index c44e8bc0da88..da993c319c98 100644 --- a/paddlenlp/transformers/qwen2_moe/modeling.py +++ b/paddlenlp/transformers/qwen2_moe/modeling.py @@ -871,7 +871,7 @@ def forward( class Qwen2MoePretrainedModel(PretrainedModel): config_class = Qwen2MoeConfig - base_model_prefix = "qwen2moe" + base_model_prefix = "qwen2_moe" _keys_to_ignore_on_load_unexpected = [r"self_attn.rotary_emb.inv_freq"] @classmethod @@ -915,7 +915,7 @@ def _get_name_mappings(cls, config: Qwen2MoeConfig) -> list[StateDictNameMapping if "Qwen2MoeModel" not in config.architectures: for mapping in model_mappings: mapping[0] = "model." + mapping[0] - mapping[1] = "qwen2moe." + mapping[1] + mapping[1] = "qwen2_moe." 
+ mapping[1] model_mappings.append(["lm_head.weight", "lm_head.weight", "transpose"]) mappings = [StateDictNameMapping(*mapping, index=index) for index, mapping in enumerate(model_mappings)] @@ -1020,7 +1020,7 @@ def _init_weights(self, layer): mean=0.0, std=self.config.initializer_range if hasattr(self.config, "initializer_range") - else self.qwen2moe.config.initializer_range, + else self.qwen2_moe.config.initializer_range, shape=layer.weight.shape, ) ) @@ -1030,7 +1030,7 @@ def _init_weights(self, layer): mean=0.0, std=self.config.initializer_range if hasattr(self.config, "initializer_range") - else self.qwen2moe.config.initializer_range, + else self.qwen2_moe.config.initializer_range, shape=layer.weight.shape, ) ) @@ -1381,7 +1381,7 @@ def __init__(self, config: Qwen2MoeConfig): super().__init__(config) self.config = config - self.qwen2moe = Qwen2MoeModel(config) + self.qwen2_moe = Qwen2MoeModel(config) self.lm_head = Qwen2MoeLMHead(config) self.criterion = Qwen2MoePretrainingCriterion(config) self.router_aux_loss_coef = config.router_aux_loss_coef @@ -1394,10 +1394,10 @@ def __init__(self, config: Qwen2MoeConfig): logger.warning("We do not support sliding window attention for now.") def get_input_embeddings(self): - return self.qwen2moe.embed_tokens + return self.qwen2_moe.embed_tokens def set_input_embeddings(self, value): - self.qwen2moe.embed_tokens = value + self.qwen2_moe.embed_tokens = value def get_output_embeddings(self): return self.lm_head @@ -1406,10 +1406,10 @@ def set_output_embeddings(self, new_embeddings): self.lm_head = new_embeddings def set_decoder(self, decoder): - self.qwen2moe = decoder + self.qwen2_moe = decoder def get_decoder(self): - return self.qwen2moe + return self.qwen2_moe def prepare_inputs_for_generation( self, @@ -1498,7 +1498,7 @@ def forward( return_dict = return_dict if return_dict is not None else self.config.use_return_dict # decoder outputs consists of (dec_features, layer_state, dec_hidden, dec_attn) - outputs = self.qwen2moe( + outputs = self.qwen2_moe( input_ids=input_ids, # [bs, seq_len] position_ids=position_ids, attention_mask=attention_mask, diff --git a/tests/transformers/qwen2moe/test_modeling.py b/tests/transformers/qwen2moe/test_modeling.py index 850cefc25011..d2b11e76f6b0 100644 --- a/tests/transformers/qwen2moe/test_modeling.py +++ b/tests/transformers/qwen2moe/test_modeling.py @@ -18,7 +18,7 @@ import paddle -from paddlenlp.transformers import QWen2MoEConfig, QWen2MoEForCausalLM, QWen2MoEModel +from paddlenlp.transformers import Qwen2MoeConfig, Qwen2MoeForCausalLM, Qwen2MoeModel from tests.transformers.test_configuration_common import ConfigTester from tests.transformers.test_generation_utils import GenerationTesterMixin from tests.transformers.test_modeling_common import ( @@ -28,7 +28,7 @@ ) -class QWen2MoEModelTester: +class Qwen2MoeModelTester: def __init__( self, parent, @@ -63,7 +63,7 @@ def __init__( use_labels: bool = False, return_dict=False, ): - self.parent: QWen2MoEModelTest = parent + self.parent: Qwen2MoeModelTest = parent self.vocab_size = vocab_size self.hidden_size = hidden_size self.num_hidden_layers = num_hidden_layers @@ -115,8 +115,8 @@ def prepare_config_and_inputs(self): config = self.get_config() return config, input_ids, input_mask, sequence_labels, token_labels, choice_labels - def get_config(self) -> QWen2MoEConfig: - return QWen2MoEConfig( + def get_config(self) -> Qwen2MoeConfig: + return Qwen2MoeConfig( vocab_size=self.vocab_size, hidden_size=self.hidden_size, num_hidden_layers=self.num_hidden_layers, @@ 
-139,17 +139,17 @@ def get_config(self) -> QWen2MoEConfig: ) def create_and_check_model( - self, config: QWen2MoEConfig, input_ids, input_mask, sequence_labels, token_labels, choice_labels + self, config: Qwen2MoeConfig, input_ids, input_mask, sequence_labels, token_labels, choice_labels ): - model = QWen2MoEModel(config) + model = Qwen2MoeModel(config) model.eval() result = model(input_ids) self.parent.assertEqual(result[0].shape, [self.batch_size, self.seq_length, self.hidden_size]) def create_and_check_model_attention_mask( - self, config: QWen2MoEConfig, input_ids, input_mask, sequence_labels, token_labels, choice_labels + self, config: Qwen2MoeConfig, input_ids, input_mask, sequence_labels, token_labels, choice_labels ): - model = QWen2MoEModel(config) + model = Qwen2MoeModel(config) model.eval() attn_mask_2d = random_attention_mask([self.batch_size, self.seq_length]) result_2d = model(input_ids, attention_mask=attn_mask_2d)[0] @@ -167,14 +167,14 @@ def create_and_check_model_attention_mask( def create_and_check_model_past_large_inputs( self, - config: QWen2MoEConfig, + config: Qwen2MoeConfig, input_ids, input_mask, sequence_labels, token_labels, choice_labels, ): - model = QWen2MoEModel(config) + model = Qwen2MoeModel(config) model.eval() # first forward pass @@ -229,7 +229,7 @@ def prepare_config_and_inputs_for_common(self): return config, inputs_dict def create_and_check_lm_head_model(self, config, input_ids, input_mask, *args): - model = QWen2MoEForCausalLM(config) + model = Qwen2MoeForCausalLM(config) model.eval() result = model( @@ -245,7 +245,7 @@ def create_and_check_lm_head_model(self, config, input_ids, input_mask, *args): self.parent.assertEqual(result[0].shape, [self.batch_size, self.seq_length, self.vocab_size]) def check_model_position_ids(self, config, input_ids, input_mask, *args): - model = QWen2MoEForCausalLM(config) + model = Qwen2MoeForCausalLM(config) model.eval() result_no_position_id = model( @@ -267,20 +267,20 @@ def check_model_position_ids(self, config, input_ids, input_mask, *args): self.parent.assertTrue((result_position_id[0] == result_no_position_id[0]).all()) -class QWen2MoEModelTest(ModelTesterMixin, GenerationTesterMixin, unittest.TestCase): - base_model_class = QWen2MoEModel +class Qwen2MoeModelTest(ModelTesterMixin, GenerationTesterMixin, unittest.TestCase): + base_model_class = Qwen2MoeModel return_dict = False use_labels = False use_test_model_name_list = False - all_model_classes = (QWen2MoEModel, QWen2MoEForCausalLM) - all_generative_model_classes = {QWen2MoEForCausalLM: (QWen2MoEModel, "qwen2moe")} + all_model_classes = (Qwen2MoeModel, Qwen2MoeForCausalLM) + all_generative_model_classes = {Qwen2MoeForCausalLM: (Qwen2MoeModel, "qwen2_moe")} def setUp(self): super().setUp() - self.model_tester = QWen2MoEModelTester(self) - self.config_tester = ConfigTester(self, config_class=QWen2MoEConfig, vocab_size=256, hidden_size=24) + self.model_tester = Qwen2MoeModelTester(self) + self.config_tester = ConfigTester(self, config_class=Qwen2MoeConfig, vocab_size=256, hidden_size=24) def _get_input_ids_and_config(self): config, inputs_dict = self.model_tester.prepare_config_and_inputs_for_common() From 23e39fc02d3d2da91ce077382bd3baec1ca881ab Mon Sep 17 00:00:00 2001 From: DrownFish19 Date: Fri, 7 Jun 2024 10:03:35 +0000 Subject: [PATCH 37/41] update qwen2 --- llm/data.py | 13 ++++++++++++- paddlenlp/transformers/auto/modeling.py | 1 + paddlenlp/transformers/qwen2/modeling.py | 8 ++++++-- 3 files changed, 19 insertions(+), 3 deletions(-) diff --git 
a/llm/data.py b/llm/data.py index f2c0112ab7e8..32aa0f2a4cdc 100644 --- a/llm/data.py +++ b/llm/data.py @@ -44,11 +44,22 @@ def get_convert_example(model): if base_model_prefix == "chatglm": return convert_example_chatglm - elif base_model_prefix in ["chatglm_v2", "llama", "bloom", "opt", "qwen", "mixtral", "gemma", "qwen2_moe"]: + elif base_model_prefix in [ + "chatglm_v2", + "llama", + "bloom", + "opt", + "qwen", + "mixtral", + "gemma", + "qwen2", + "qwen2_moe", + ]: return convert_example_common else: raise ValueError( f"Unknown base_model_prefix: {model.base_model_prefix}. Supported base_model_prefix list: chatglm, bloom, llama, qwen, mixtral, gemma", + "qwen2", "qwen2_moe", ) diff --git a/paddlenlp/transformers/auto/modeling.py b/paddlenlp/transformers/auto/modeling.py index 5e6efb5a3619..f22126f76c8a 100644 --- a/paddlenlp/transformers/auto/modeling.py +++ b/paddlenlp/transformers/auto/modeling.py @@ -118,6 +118,7 @@ ("Bloom", "bloom"), ("QWen", "qwen"), ("Mixtral", "mixtral"), + ("Qwen2", "qwen2"), ("Qwen2Moe", "qwen2_moe"), ("Gemma", "gemma"), ] diff --git a/paddlenlp/transformers/qwen2/modeling.py b/paddlenlp/transformers/qwen2/modeling.py index 2de89eb944c2..6cc4b83a359a 100644 --- a/paddlenlp/transformers/qwen2/modeling.py +++ b/paddlenlp/transformers/qwen2/modeling.py @@ -772,7 +772,7 @@ def _get_tensor_parallel_mappings(cls, config: Qwen2Config, is_split=True): num_attention_heads=config.num_attention_heads, ) - def get_tensor_parallel_split_mappings(num_layers, num_experts): + def get_tensor_parallel_split_mappings(num_layers): final_actions = {} base_actions = { @@ -796,6 +796,10 @@ def get_tensor_parallel_split_mappings(num_layers, num_experts): base_actions["layers.0.self_attn.k_proj.bias"] = partial(fn, is_column=True) base_actions["layers.0.self_attn.v_proj.bias"] = partial(fn, is_column=True) + base_actions["layers.0.mlp.up_proj.weight"] = partial(fn, is_column=True) + base_actions["layers.0.mlp.gate_proj.weight"] = partial(fn, is_column=True) + base_actions["layers.0.mlp.down_proj.weight"] = partial(fn, is_column=False) + for key, action in base_actions.items(): if "layers.0." in key: for i in range(num_layers): @@ -804,7 +808,7 @@ def get_tensor_parallel_split_mappings(num_layers, num_experts): return final_actions - mappings = get_tensor_parallel_split_mappings(config.num_hidden_layers, config.num_experts) + mappings = get_tensor_parallel_split_mappings(config.num_hidden_layers) return mappings From 36b38974d1d614f3509c1f5d184d96debd6eff69 Mon Sep 17 00:00:00 2001 From: DrownFish19 Date: Fri, 7 Jun 2024 10:04:35 +0000 Subject: [PATCH 38/41] update --- llm/data.py | 4 +--- 1 file changed, 1 insertion(+), 3 deletions(-) diff --git a/llm/data.py b/llm/data.py index 32aa0f2a4cdc..7c711485af3b 100644 --- a/llm/data.py +++ b/llm/data.py @@ -58,9 +58,7 @@ def get_convert_example(model): return convert_example_common else: raise ValueError( - f"Unknown base_model_prefix: {model.base_model_prefix}. Supported base_model_prefix list: chatglm, bloom, llama, qwen, mixtral, gemma", - "qwen2", - "qwen2_moe", + f"Unknown base_model_prefix: {model.base_model_prefix}. 
Supported base_model_prefix list: chatglm, bloom, llama, qwen, mixtral, gemma, qwen2, qwen2_moe", ) From b140df651c76faf7f323a2bd109db3638335995c Mon Sep 17 00:00:00 2001 From: DrownFish19 Date: Tue, 11 Jun 2024 08:14:09 +0000 Subject: [PATCH 39/41] update readme --- llm/qwen/README.md | 17 +++++++++++++++++ 1 file changed, 17 insertions(+) diff --git a/llm/qwen/README.md b/llm/qwen/README.md index b02f0ccb2178..eea78a84a50b 100644 --- a/llm/qwen/README.md +++ b/llm/qwen/README.md @@ -40,5 +40,22 @@ | Qwen/Qwen1.5-MoE-A2.7B | | Qwen/Qwen1.5-MoE-A2.7B-Chat | + +[通义千问(Qwen2)](https://qwenlm.github.io/blog/qwen1.5/) 是阿里云研发的通义千问系列模型升级版。Qwen1.5包括0.5B、1.8B、4B、7B、14B、32B、72B、110B和MoE共计9个不同规模的Base和Chat模型。 +**支持模型权重:** +| Model (qwen2) | +|------------------------------| +| Qwen/Qwen2-0.5B | +| Qwen/Qwen2-0.5B-Instruct | +| Qwen/Qwen2-1.5B | +| Qwen/Qwen2-1.5B-Instruct | +| Qwen/Qwen2-7B | +| Qwen/Qwen2-7B-Instruct | +| Qwen/Qwen2-72B | +| Qwen/Qwen2-72B-Instruct | +| Qwen/Qwen2-57B-A14B | +| Qwen/Qwen2-57B-A14B-Instruct | + + ## 2. 模型精调 请参考[LLM全流程工具介绍](../README.md) From e6de5f3429bcb6d1bfcb945b9e40e3d2d45db847 Mon Sep 17 00:00:00 2001 From: DrownFish19 Date: Tue, 11 Jun 2024 08:16:17 +0000 Subject: [PATCH 40/41] update readme --- llm/qwen/README.md | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/llm/qwen/README.md b/llm/qwen/README.md index eea78a84a50b..3d60f2ec2006 100644 --- a/llm/qwen/README.md +++ b/llm/qwen/README.md @@ -41,7 +41,7 @@ | Qwen/Qwen1.5-MoE-A2.7B-Chat | -[通义千问(Qwen2)](https://qwenlm.github.io/blog/qwen1.5/) 是阿里云研发的通义千问系列模型升级版。Qwen1.5包括0.5B、1.8B、4B、7B、14B、32B、72B、110B和MoE共计9个不同规模的Base和Chat模型。 +[通义千问(Qwen2)](https://qwenlm.github.io/blog/qwen1.5/) 是阿里云研发的通义千问系列模型升级版。Qwen2包括0.5B、1.5B、7B、72B和MoE共计5个不同规模的Base和Chat模型。 **支持模型权重:** | Model (qwen2) | |------------------------------| From 48ae2ab9b8746d10d0328af5d87192a55da87894 Mon Sep 17 00:00:00 2001 From: DrownFish19 Date: Tue, 11 Jun 2024 08:17:15 +0000 Subject: [PATCH 41/41] update readme --- llm/qwen/README.md | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/llm/qwen/README.md b/llm/qwen/README.md index 3d60f2ec2006..22ac37c19e17 100644 --- a/llm/qwen/README.md +++ b/llm/qwen/README.md @@ -41,7 +41,7 @@ | Qwen/Qwen1.5-MoE-A2.7B-Chat | -[通义千问(Qwen2)](https://qwenlm.github.io/blog/qwen1.5/) 是阿里云研发的通义千问系列模型升级版。Qwen2包括0.5B、1.5B、7B、72B和MoE共计5个不同规模的Base和Chat模型。 +[通义千问(Qwen2)](https://qwenlm.github.io/blog/qwen2/) 是阿里云研发的通义千问系列模型升级版。Qwen2包括0.5B、1.5B、7B、72B和MoE共计5个不同规模的Base和Chat模型。 **支持模型权重:** | Model (qwen2) | |------------------------------|
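
For reference, a minimal end-to-end sketch of exercising the Qwen2Moe classes introduced in this series. This is illustrative only and not part of the patch: it assumes a PaddleNLP build containing these commits, access to the public `Qwen/Qwen1.5-MoE-A2.7B` weights, and the usual PaddleNLP `generate()` API (which returns a `(token_ids, scores)` tuple); the exact keyword arguments may differ across PaddleNLP versions.

```python
# Hedged usage sketch for the newly registered Qwen2Moe model (assumptions noted above).
from paddlenlp.transformers import AutoTokenizer, Qwen2MoeForCausalLM

# Load the tokenizer (re-exported from qwen2) and the MoE causal LM in bf16.
tokenizer = AutoTokenizer.from_pretrained("Qwen/Qwen1.5-MoE-A2.7B")
model = Qwen2MoeForCausalLM.from_pretrained("Qwen/Qwen1.5-MoE-A2.7B", dtype="bfloat16")
model.eval()

# Encode a prompt to Paddle tensors and generate a short continuation.
input_ids = tokenizer("Hey, are you conscious? Can you talk to me?", return_tensors="pd")["input_ids"]
generated_ids, _ = model.generate(input_ids=input_ids, max_new_tokens=64)

# generate() returns only the newly generated tokens; decode the first sequence.
print(tokenizer.decode(generated_ids[0].tolist(), skip_special_tokens=True))
```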