diff --git a/model_zoo/gpt-3/ppfleetx/models/language_model/gpt/dygraph/hybrid_model.py b/model_zoo/gpt-3/ppfleetx/models/language_model/gpt/dygraph/hybrid_model.py
index f47d800c5f15..f4c1ee8d46a7 100644
--- a/model_zoo/gpt-3/ppfleetx/models/language_model/gpt/dygraph/hybrid_model.py
+++ b/model_zoo/gpt-3/ppfleetx/models/language_model/gpt/dygraph/hybrid_model.py
@@ -48,13 +48,16 @@
     MinLengthLogitsProcessor,
     RepetitionPenaltyLogitsProcessor,
 )
-from paddle.distributed.fleet.utils.sequence_parallel_utils import (
-    ColumnSequenceParallelLinear,
-    GatherOp,
-    RowSequenceParallelLinear,
-    ScatterOp,
-    mark_as_sequence_parallel_parameter,
-)
+try:
+    from paddle.distributed.fleet.utils.sequence_parallel_utils import (
+        ColumnSequenceParallelLinear,
+        GatherOp,
+        RowSequenceParallelLinear,
+        ScatterOp,
+        mark_as_sequence_parallel_parameter,
+    )
+except ImportError:
+    pass
 from paddlenlp.transformers.segment_parallel_utils import ReshardLayer
diff --git a/model_zoo/gpt-3/ppfleetx/models/language_model/language_module.py b/model_zoo/gpt-3/ppfleetx/models/language_model/language_module.py
index 1a73a35982ff..c86fa300e352 100644
--- a/model_zoo/gpt-3/ppfleetx/models/language_model/language_module.py
+++ b/model_zoo/gpt-3/ppfleetx/models/language_model/language_module.py
@@ -24,9 +24,12 @@
 from ppfleetx.core.module.basic_module import BasicModule
 from ppfleetx.data.tokenizers import GPTTokenizer
 from ppfleetx.distributed.apis import env
-from paddle.distributed.fleet.utils.sequence_parallel_utils import (
-    register_sequence_parallel_allreduce_hooks,
-)
+try:
+    from paddle.distributed.fleet.utils.sequence_parallel_utils import (
+        register_sequence_parallel_allreduce_hooks,
+    )
+except ImportError:
+    pass
 from ppfleetx.utils.log import logger
 
 # TODO(haohongxiang): to solve the problem of cross-reference
diff --git a/paddlenlp/transformers/__init__.py b/paddlenlp/transformers/__init__.py
index 2ee9d7733f41..05fa5775399e 100644
--- a/paddlenlp/transformers/__init__.py
+++ b/paddlenlp/transformers/__init__.py
@@ -29,16 +29,20 @@
 from .feature_extraction_utils import BatchFeature, FeatureExtractionMixin
 from .image_processing_utils import ImageProcessingMixin
 from .attention_utils import create_bigbird_rand_mask_idx_list
-from paddle.distributed.fleet.utils.sequence_parallel_utils import (
-    GatherOp,
-    ScatterOp,
-    AllGatherOp,
-    ReduceScatterOp,
-    ColumnSequenceParallelLinear,
-    RowSequenceParallelLinear,
-    mark_as_sequence_parallel_parameter,
-    register_sequence_parallel_allreduce_hooks,
-)
+
+try:
+    from paddle.distributed.fleet.utils.sequence_parallel_utils import (
+        GatherOp,
+        ScatterOp,
+        AllGatherOp,
+        ReduceScatterOp,
+        ColumnSequenceParallelLinear,
+        RowSequenceParallelLinear,
+        mark_as_sequence_parallel_parameter,
+        register_sequence_parallel_allreduce_hooks,
+    )
+except ImportError:
+    pass
 from .export import export_model
 
 # isort: split
diff --git a/paddlenlp/transformers/gpt/modeling.py b/paddlenlp/transformers/gpt/modeling.py
index 50cfc892d336..8c066431979f 100644
--- a/paddlenlp/transformers/gpt/modeling.py
+++ b/paddlenlp/transformers/gpt/modeling.py
@@ -29,13 +29,17 @@
 from paddle.distributed import fleet
 from paddle.distributed.fleet.meta_parallel import get_rng_state_tracker
 from paddle.distributed.fleet.utils import recompute
-from paddle.distributed.fleet.utils.sequence_parallel_utils import (
-    ColumnSequenceParallelLinear,
-    GatherOp,
-    RowSequenceParallelLinear,
-    ScatterOp,
-    mark_as_sequence_parallel_parameter,
-)
+
+try:
+    from paddle.distributed.fleet.utils.sequence_parallel_utils import (
+        ColumnSequenceParallelLinear,
+        GatherOp,
+        RowSequenceParallelLinear,
+        ScatterOp,
+        mark_as_sequence_parallel_parameter,
+    )
+except ImportError:
+    pass
 from paddle.nn import BCEWithLogitsLoss, CrossEntropyLoss, MSELoss
 from paddle.utils import try_import
diff --git a/paddlenlp/transformers/gpt/modeling_auto.py b/paddlenlp/transformers/gpt/modeling_auto.py
index 255763be395f..2e508339ab39 100644
--- a/paddlenlp/transformers/gpt/modeling_auto.py
+++ b/paddlenlp/transformers/gpt/modeling_auto.py
@@ -30,10 +30,14 @@
 from paddle.distributed import fleet
 from paddle.distributed.fleet.meta_parallel import get_rng_state_tracker
 from paddle.distributed.fleet.utils import recompute
-from paddle.distributed.fleet.utils.sequence_parallel_utils import (
-    ScatterOp,
-    mark_as_sequence_parallel_parameter,
-)
+
+try:
+    from paddle.distributed.fleet.utils.sequence_parallel_utils import (
+        ScatterOp,
+        mark_as_sequence_parallel_parameter,
+    )
+except ImportError:
+    pass
 from ...utils.converter import StateDictNameMapping
 from .. import PretrainedModel, register_base_model
diff --git a/paddlenlp/transformers/gpt/modeling_pp.py b/paddlenlp/transformers/gpt/modeling_pp.py
index 3ec6b004edee..cd3dce018378 100644
--- a/paddlenlp/transformers/gpt/modeling_pp.py
+++ b/paddlenlp/transformers/gpt/modeling_pp.py
@@ -19,9 +19,13 @@
     SharedLayerDesc,
 )
 from paddle.distributed.fleet.utils import recompute
-from paddle.distributed.fleet.utils.sequence_parallel_utils import (
-    mark_as_sequence_parallel_parameter,
-)
+
+try:
+    from paddle.distributed.fleet.utils.sequence_parallel_utils import (
+        mark_as_sequence_parallel_parameter,
+    )
+except ImportError:
+    pass
 from paddlenlp.transformers.model_utils import PipelinePretrainedModel
diff --git a/paddlenlp/transformers/llama/modeling.py b/paddlenlp/transformers/llama/modeling.py
index d4da1b195a94..b0b08c30241a 100755
--- a/paddlenlp/transformers/llama/modeling.py
+++ b/paddlenlp/transformers/llama/modeling.py
@@ -45,13 +45,16 @@ def swiglu(x, y=None):
         return F.silu(x) * y
 
-from paddle.distributed.fleet.utils.sequence_parallel_utils import (
-    ColumnSequenceParallelLinear,
-    GatherOp,
-    RowSequenceParallelLinear,
-    ScatterOp,
-    mark_as_sequence_parallel_parameter,
-)
+try:
+    from paddle.distributed.fleet.utils.sequence_parallel_utils import (
+        ColumnSequenceParallelLinear,
+        GatherOp,
+        RowSequenceParallelLinear,
+        ScatterOp,
+        mark_as_sequence_parallel_parameter,
+    )
+except ImportError:
+    pass
 from paddle.utils import try_import
 
 from paddlenlp.transformers.conversion_utils import (
diff --git a/paddlenlp/transformers/mc2_seqence_parallel_linear.py b/paddlenlp/transformers/mc2_seqence_parallel_linear.py
index 7d669833e690..c39a78cc6252 100644
--- a/paddlenlp/transformers/mc2_seqence_parallel_linear.py
+++ b/paddlenlp/transformers/mc2_seqence_parallel_linear.py
@@ -23,10 +23,14 @@
 from paddle import distributed as dist
 from paddle.autograd import PyLayer
-from paddle.distributed.fleet.utils.sequence_parallel_utils import (
-    ColumnSequenceParallelLinear,
-    RowSequenceParallelLinear,
-)
+
+try:
+    from paddle.distributed.fleet.utils.sequence_parallel_utils import (
+        ColumnSequenceParallelLinear,
+        RowSequenceParallelLinear,
+    )
+except ImportError:
+    pass
 
 __all_gather_recomputation__ = False
 if int(os.getenv("MC2_Recompute", 0)):
diff --git a/paddlenlp/transformers/mixtral/modeling.py b/paddlenlp/transformers/mixtral/modeling.py
index 592f9a47847a..7a8254d6877c 100644
--- a/paddlenlp/transformers/mixtral/modeling.py
+++ b/paddlenlp/transformers/mixtral/modeling.py
@@ -33,13 +33,16 @@
 except ImportError:
     fused_rotary_position_embedding = None
 
-from paddle.distributed.fleet.utils.sequence_parallel_utils import (
-    ColumnSequenceParallelLinear,
-    GatherOp,
-    RowSequenceParallelLinear,
-    ScatterOp,
-    mark_as_sequence_parallel_parameter,
-)
+try:
+    from paddle.distributed.fleet.utils.sequence_parallel_utils import (
+        ColumnSequenceParallelLinear,
+        GatherOp,
+        RowSequenceParallelLinear,
+        ScatterOp,
+        mark_as_sequence_parallel_parameter,
+    )
+except ImportError:
+    pass
 from paddlenlp.transformers.conversion_utils import (
     StateDictNameMapping,
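
Every hunk above applies the same optional-import guard: paddle.distributed.fleet.utils.sequence_parallel_utils only exists in sufficiently recent Paddle builds, so each import is wrapped in try/except and importing these modules no longer fails outright on older installs. Below is a minimal sketch of the idiom, assuming a hypothetical HAS_SEQUENCE_PARALLEL flag (not part of this diff) that makes the fallback path explicit:

    # Guarded optional import, the same pattern used throughout this diff.
    # HAS_SEQUENCE_PARALLEL is a hypothetical convenience flag, not part of
    # the PR; it lets callers test availability up front instead of hitting
    # a NameError the first time one of the guarded symbols is referenced.
    try:
        from paddle.distributed.fleet.utils.sequence_parallel_utils import (
            ColumnSequenceParallelLinear,
            RowSequenceParallelLinear,
        )

        HAS_SEQUENCE_PARALLEL = True
    except ImportError:
        # Older Paddle releases do not ship sequence_parallel_utils, so
        # sequence-parallel code paths must be skipped in this case.
        HAS_SEQUENCE_PARALLEL = False

Catching only ImportError keeps the guard from silently masking unrelated errors raised while the module is imported, and code that later uses the guarded symbols should first confirm they were actually bound, for example through a flag like the one sketched above.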