Fix row-parallel LoRA layer parameter initialization bug #9427

Open · wants to merge 2 commits into base: develop
78 changes: 44 additions & 34 deletions paddlenlp/peft/lora/lora_layers.py
@@ -13,6 +13,7 @@
 # limitations under the License.

 import math
+from contextlib import nullcontext
 from typing import Optional

 import paddle
@@ -22,6 +23,7 @@
 from paddle.distributed.fleet.meta_parallel import (
     ColumnParallelLinear,
     RowParallelLinear,
+    get_rng_state_tracker,
 )

 from ...transformers import linear_utils
@@ -50,6 +52,10 @@
 from .lora_quick_layers import quick_lora


+def rng_ctx(is_mp: bool, in_dynamic_mode: bool):
+    return get_rng_state_tracker().rng_state() if (is_mp and in_dynamic_mode) else nullcontext()
+
+
 class LoRALinear(nn.Linear):
     # LoRA implemented in a dense layer
     def __init__(
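The helper added above is just a switch between Paddle's model-parallel RNG tracker and a no-op context. A minimal usage sketch, assuming only the imports already shown in this diff (the `randn` call is a hypothetical stand-in for parameter creation):

```python
# Hedged sketch, not part of the PR: with is_mp=False the helper degrades to a
# plain nullcontext, so wrapped code behaves exactly as before; only under
# model parallelism (and dynamic mode) does the tracked RNG state take over.
from contextlib import nullcontext

import paddle
from paddle.distributed.fleet.meta_parallel import get_rng_state_tracker


def rng_ctx(is_mp: bool, in_dynamic_mode: bool):
    return get_rng_state_tracker().rng_state() if (is_mp and in_dynamic_mode) else nullcontext()


with rng_ctx(is_mp=False, in_dynamic_mode=paddle.in_dynamic_mode()):
    w = paddle.randn([4, 8])  # stand-in for creating a LoRA weight slice
```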
@@ -198,14 +204,15 @@
         self.name = self._name

         # Actual trainable parameters
-        self.lora_A = self.create_parameter(
-            shape=[self.input_size_per_partition, r],
-            dtype=self._dtype,
-            is_bias=False,
-            attr=paddle.ParamAttr(
-                initializer=nn.initializer.KaimingUniform(negative_slope=math.sqrt(5), nonlinearity="leaky_relu")
-            ),
-        )
+        with rng_ctx(self.is_mp, paddle.in_dynamic_mode()):
+            self.lora_A = self.create_parameter(
+                shape=[self.input_size_per_partition, r],
+                dtype=self._dtype,
+                is_bias=False,
+                attr=paddle.ParamAttr(
+                    initializer=nn.initializer.KaimingUniform(negative_slope=math.sqrt(5), nonlinearity="leaky_relu")
+                ),
+            )
         self.lora_B = self.create_parameter(
             shape=[r, self.out_features],
             dtype=self._dtype,
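For context, the hunk above keeps the usual LoRA initialization: lora_A is drawn from a Kaiming-uniform distribution, while lora_B is zero-initialized (the Constant(0.0) initializer is visible in the later hunks), so the adapter contributes nothing until training updates lora_B. A standalone sketch with made-up sizes:

```python
# Standalone sketch of the LoRA init convention used above (sizes are assumptions).
import math

import paddle
import paddle.nn as nn

in_features, r, out_features = 64, 8, 32

lora_A = paddle.create_parameter(
    shape=[in_features, r],
    dtype="float32",
    attr=paddle.ParamAttr(
        initializer=nn.initializer.KaimingUniform(negative_slope=math.sqrt(5), nonlinearity="leaky_relu")
    ),
)
lora_B = paddle.create_parameter(
    shape=[r, out_features],
    dtype="float32",
    attr=paddle.ParamAttr(initializer=nn.initializer.Constant(value=0.0)),
)

delta_w = lora_A @ lora_B          # [in_features, out_features]
print(float(delta_w.abs().max()))  # 0.0: the LoRA update starts at zero
```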
@@ -345,14 +352,15 @@
         self.name = self._name

         # Actual trainable parameters
-        self.lora_A = self.create_parameter(
-            shape=[self.input_size_per_partition, r],
-            dtype=self._dtype,
-            is_bias=False,
-            attr=paddle.ParamAttr(
-                initializer=nn.initializer.KaimingUniform(negative_slope=math.sqrt(5), nonlinearity="leaky_relu")
-            ),
-        )
+        with rng_ctx(self.is_mp, paddle.in_dynamic_mode()):
+            self.lora_A = self.create_parameter(
+                shape=[self.input_size_per_partition, r],
+                dtype=self._dtype,
+                is_bias=False,
+                attr=paddle.ParamAttr(
+                    initializer=nn.initializer.KaimingUniform(negative_slope=math.sqrt(5), nonlinearity="leaky_relu")
+                ),
+            )
         self.lora_B = self.create_parameter(
             shape=[r, self.out_features],
             dtype=self._dtype,
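Both row-parallel hunks shard lora_A along the input dimension (each rank only allocates input_size_per_partition rows), which is presumably why its creation now runs under the model-parallel RNG context. Shape arithmetic only, with assumed sizes:

```python
# Shape arithmetic only; all numbers are assumptions for illustration.
in_features, r, mp_degree = 4096, 8, 4

input_size_per_partition = in_features // mp_degree   # rows of lora_A held by one rank
per_rank_lora_A_shape = [input_size_per_partition, r]
full_lora_A_shape = [in_features, r]

print(per_rank_lora_A_shape, full_lora_A_shape)        # [1024, 8] [4096, 8]
```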
@@ -468,15 +476,16 @@
             attr=lora_A_weight_attr,
         )
         self.lora_A.is_distributed = False
-        self.lora_B = self.create_parameter(
-            shape=[r, self.output_size_per_partition],
-            dtype=self._dtype,
-            is_bias=False,
-            attr=paddle.ParamAttr(
-                initializer=paddle.nn.initializer.Constant(value=0.0),
-                learning_rate=lora_plus_scale,
-            ),
-        )
+        with rng_ctx(self.is_mp, paddle.in_dynamic_mode()):
+            self.lora_B = self.create_parameter(
+                shape=[r, self.output_size_per_partition],
+                dtype=self._dtype,
+                is_bias=False,
+                attr=paddle.ParamAttr(
+                    initializer=paddle.nn.initializer.Constant(value=0.0),
+                    learning_rate=lora_plus_scale,
+                ),
+            )

         self.lora_B.is_distributed = True
         self.lora_B.split_axis = 1
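This hunk and the next wrap lora_B instead, the parameter sharded along the output dimension here (split_axis = 1). Its ParamAttr also carries a per-parameter learning-rate multiplier, lora_plus_scale, which appears to follow the LoRA+ idea of training B with a larger step size than A. A hedged sketch of that pattern, with an assumed scale value:

```python
# Sketch of the lora_B ParamAttr pattern shown above (values are assumptions).
import paddle
import paddle.nn as nn

r, output_size_per_partition = 8, 1024
lora_plus_scale = 16.0  # illustrative; the real layer receives this as an argument

lora_B = paddle.create_parameter(
    shape=[r, output_size_per_partition],
    dtype="float32",
    attr=paddle.ParamAttr(
        initializer=nn.initializer.Constant(value=0.0),
        learning_rate=lora_plus_scale,  # scales the optimizer lr for this parameter only
    ),
)
print(float(lora_B.abs().sum()))  # 0.0 at initialization, matching the diff
```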
@@ -599,15 +608,16 @@
         self.lora_A.is_distributed = False
         mark_as_sequence_parallel_parameter(self.lora_A)

-        self.lora_B = self.create_parameter(
-            shape=[r, self.output_size_per_partition],
-            dtype=self._dtype,
-            is_bias=False,
-            attr=paddle.ParamAttr(
-                initializer=paddle.nn.initializer.Constant(value=0.0),
-                learning_rate=lora_plus_scale,
-            ),
-        )
+        with rng_ctx(self.is_mp, paddle.in_dynamic_mode()):
+            self.lora_B = self.create_parameter(
+                shape=[r, self.output_size_per_partition],
+                dtype=self._dtype,
+                is_bias=False,
+                attr=paddle.ParamAttr(
+                    initializer=paddle.nn.initializer.Constant(value=0.0),
+                    learning_rate=lora_plus_scale,
+                ),
+            )

         self.lora_B.is_distributed = True
         self.lora_B.split_axis = 1