diff --git a/paddlenlp/trainer/trainer.py b/paddlenlp/trainer/trainer.py
index dc17f092397a..b2f72de84060 100644
--- a/paddlenlp/trainer/trainer.py
+++ b/paddlenlp/trainer/trainer.py
@@ -573,6 +573,21 @@ def train(
             # so, the trainable numel is a little bigger than real.
             logger.info(f" Number of trainable parameters = {trainable_numel:,} (all devices, roughly)")
 
+        model = paddle.jit.to_static(
+            model,
+            input_spec=[
+                paddle.static.InputSpec(name="input_ids", shape=[-1, -1], dtype="int64"),  # input_ids
+                None,  # position_ids
+                None,  # attention_mask
+                None,  # inputs_embeds
+                paddle.static.InputSpec(name="labels", shape=[-1, -1], dtype="int64"),  # labels
+                False,  # use_cache
+                None,  # past_key_values
+                None,  # output_attentions
+                None,  # output_hidden_states
+                None,  # return_dict
+            ],
+        )
         start_time = time.time()
         self._globalstep_last_start_time = time.time()
         self.state.epoch = 0
diff --git a/paddlenlp/transformers/llama/modeling.py b/paddlenlp/transformers/llama/modeling.py
index b870706cbf4a..ec9cbfd8baa9 100644
--- a/paddlenlp/transformers/llama/modeling.py
+++ b/paddlenlp/transformers/llama/modeling.py
@@ -233,8 +233,8 @@ def scaled_dot_product_attention(
             )
         attn_weights = attn_weights + attention_mask
 
-        with paddle.amp.auto_cast(False):
-            attn_weights = F.softmax(attn_weights, axis=-1, dtype="float32").astype(query_states.dtype)
+        # with paddle.amp.auto_cast(False):
+        attn_weights = F.softmax(attn_weights, axis=-1, dtype="float32").astype(query_states.dtype)
 
         attn_output = paddle.matmul(attn_weights, value_states)
         attn_output = attn_output.transpose([0, 2, 1, 3])
@@ -299,9 +299,9 @@ def forward(self, hidden_states):
         if self.config.use_fused_rms_norm:
             return rms_norm_fused(hidden_states, self.weight, self.variance_epsilon)
 
-        with paddle.amp.auto_cast(False):
-            variance = hidden_states.astype("float32").pow(2).mean(-1, keepdim=True)
-            hidden_states = paddle.rsqrt(variance + self.variance_epsilon) * hidden_states
+        # with paddle.amp.auto_cast(False):
+        variance = hidden_states.astype("float32").pow(2).mean(-1, keepdim=True)
+        hidden_states = paddle.rsqrt(variance + self.variance_epsilon) * hidden_states
 
         if self.weight.dtype in [paddle.float16, paddle.bfloat16]:
             hidden_states = paddle.cast(hidden_states, self.weight.dtype)
@@ -1129,11 +1129,11 @@ def forward(self, prediction_scores, masked_lm_labels):
             prediction_scores = prediction_scores[..., :-1, :]
             masked_lm_labels = masked_lm_labels[..., 1:]
 
-        with paddle.amp.auto_cast(False):
-            masked_lm_loss = self.loss_func(prediction_scores.astype("float32"), masked_lm_labels.unsqueeze(2))
-            # skip ignore_index which loss == 0
-            masked_lm_loss = masked_lm_loss[masked_lm_loss > 0].astype("float32")
-            loss = paddle.mean(masked_lm_loss)
+        # with paddle.amp.auto_cast(False):
+        masked_lm_loss = self.loss_func(prediction_scores.astype("float32"), masked_lm_labels.unsqueeze(2))
+        # skip ignore_index which loss == 0
+        masked_lm_loss = masked_lm_loss[masked_lm_loss > 0].astype("float32")
+        loss = paddle.mean(masked_lm_loss)
 
         return loss
 
diff --git a/tests/test_tipc/benchmark/options.py b/tests/test_tipc/benchmark/options.py
index 5c287833e9c6..993117c5659d 100644
--- a/tests/test_tipc/benchmark/options.py
+++ b/tests/test_tipc/benchmark/options.py
@@ -32,6 +32,8 @@
     from .modules.stablediffusion import StableDiffusionBenchmark
 except Exception:
     StableDiffusionBenchmark = None
+from paddlenlp.trainer.argparser import strtobool
+
 from .modules.t5_for_conditional_generation import T5ForConditionalGenerationBenchmark
 from .modules.xlnet import XLNetBenchmark
 
@@ -156,6 +158,7 @@ def get_parser():
         help='The option of profiler, which should be in format "key1=value1;key2=value2;key3=value3".',
     )
     parser.add_argument("--save_model", type=str, default=None, help="Directory to save models. ")
+    parser.add_argument("--use_nsys", type=strtobool, default=False, help="Enable nsys.")
 
     return parser
diff --git a/tests/test_tipc/train.py b/tests/test_tipc/train.py
index fadc23131d1a..b798c5aee6f5 100644
--- a/tests/test_tipc/train.py
+++ b/tests/test_tipc/train.py
@@ -25,6 +25,7 @@
 from benchmark.modules.benchmark_utils import clone_inputs
 from benchmark.options import LR_SCHEDULER_REGISTRY, MODEL_REGISTRY, OPTIMIZER_REGISTRY
 from benchmark.utils.record import AverageStatistical
+from paddle.fluid import core
 
 from paddlenlp.utils import profiler
 from paddlenlp.utils.log import logger
@@ -223,6 +224,17 @@ def do_train(args):
         input_spec = benchmark_model.create_input_specs()
         model = paddle.jit.to_static(model, input_spec=input_spec)
         logger.info("Successfully to apply @to_static with specs: {}".format(input_spec))
+        # paddle.jit.save(model, "/data/run_model/ernie/ernie", input_spec=input_spec)
+        # composite_program = model.forward.get_concrete_program(**tmp_input_data)[1].train_program
+        # for op in composite_program.block(0).ops:
+        #     print(op)
+        #     # print(op.type)
+        # build_strategy = paddle.static.BuildStrategy()
+        # build_strategy.build_cinn_pass = True
+        # build_strategy.debug_graphviz_path = "/data/run_model/ernie/paddle_to_cinn_graph/"
+        # program = paddle.static.CompiledProgram(composite_program, build_strategy=build_strategy)
+        # program._compile(paddle.fluid.executor.global_scope(), paddle.CUDAPlace(0))
+        # import pdb; pdb.set_trace()
 
     if args.lr_scheduler is not None:
         benchmark_lr_scheduler = LR_SCHEDULER_REGISTRY[args.lr_scheduler]()
@@ -255,6 +267,18 @@ def do_train(args):
         batch_id = 0
         batch_start = time.time()
         for input_data in train_loader:
+            if args.use_nsys:
+                iter_id = step_id
+                if iter_id == 100:
+                    core.nvprof_start()
+                    core.nvprof_enable_record_event()
+                    core.nvprof_nvtx_push(str(iter_id))
+                if iter_id == 110:
+                    core.nvprof_nvtx_pop()
+                    core.nvprof_stop()
+                if iter_id > 100 and iter_id < 110:
+                    core.nvprof_nvtx_pop()
+                    core.nvprof_nvtx_push(str(iter_id))
             train_reader_cost = time.time() - batch_start
             if args.use_amp:
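
Note on the trainer hunk: the new `paddle.jit.to_static(...)` call converts the dygraph model to a static graph before the training loop starts, with `InputSpec` entries marking `input_ids` and `labels` as dynamically shaped and the remaining optional forward arguments pinned to `None`/`False`. The sketch below shows the same pattern on a toy module; `TinyLM`, its sizes, and the sample tensors are made up for illustration and are not part of the patch.

```python
import paddle
from paddle.static import InputSpec

# Toy stand-in for the LLaMA model in the patch; names/sizes are illustrative only.
class TinyLM(paddle.nn.Layer):
    def __init__(self, vocab_size=100, hidden_size=16):
        super().__init__()
        self.embed = paddle.nn.Embedding(vocab_size, hidden_size)
        self.head = paddle.nn.Linear(hidden_size, vocab_size)

    def forward(self, input_ids, labels=None):
        logits = self.head(self.embed(input_ids))  # [batch, seq, vocab]
        if labels is None:
            return logits
        return paddle.nn.functional.cross_entropy(logits, labels)

model = TinyLM()
# shape=[-1, -1] leaves batch size and sequence length dynamic, mirroring the
# [-1, -1] specs used for input_ids and labels in the trainer hunk.
static_model = paddle.jit.to_static(
    model,
    input_spec=[
        InputSpec(name="input_ids", shape=[-1, -1], dtype="int64"),
        InputSpec(name="labels", shape=[-1, -1], dtype="int64"),
    ],
)
input_ids = paddle.randint(0, 100, shape=[2, 8], dtype="int64")
labels = paddle.randint(0, 100, shape=[2, 8], dtype="int64")
loss = static_model(input_ids, labels)  # first call traces and builds the static program
print(float(loss))
```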
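
Note on the `--use_nsys` path in train.py: the loop brackets iterations 100-110 with NVTX ranges through `paddle.fluid.core`, one range per step, so an Nsight Systems capture can be narrowed to that window (as I understand it, `nvprof_start`/`nvprof_stop` map to `cudaProfilerStart`/`cudaProfilerStop`, so the benchmark would typically be launched under something like `nsys profile -t cuda,nvtx --capture-range=cudaProfilerApi ... --use_nsys true`). Below is a minimal sketch of the same pattern; the `profile_window` helper and the toy loop are hypothetical stand-ins for the benchmark loop, and a CUDA build of Paddle is assumed.

```python
import paddle
from paddle.fluid import core  # same helpers the diff imports in train.py

def profile_window(step_id, start=100, stop=110):
    """Hypothetical helper mirroring the --use_nsys branch: one NVTX range per step."""
    if step_id == start:
        core.nvprof_start()                  # open the profiler capture range
        core.nvprof_enable_record_event()    # emit framework events as NVTX markers
        core.nvprof_nvtx_push(str(step_id))  # start the range for this step
    elif step_id == stop:
        core.nvprof_nvtx_pop()               # close the previous step's range
        core.nvprof_stop()                   # end the profiler capture range
    elif start < step_id < stop:
        core.nvprof_nvtx_pop()               # close the previous step's range
        core.nvprof_nvtx_push(str(step_id))  # start the range for this step

# Toy loop standing in for `for input_data in train_loader:` in the benchmark.
linear = paddle.nn.Linear(8, 8)
x = paddle.randn([4, 8])
for step_id in range(120):
    profile_window(step_id)
    y = linear(x)
```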