Merge pull request #26 from sljlp/moe
add moe module
lilong12 authored Dec 16, 2021
2 parents 7fea284 + 81d3159 commit 7540865
Showing 9 changed files with 717 additions and 1 deletion.
3 changes: 2 additions & 1 deletion python/paddle/distributed/__init__.py
@@ -54,7 +54,7 @@

from . import cloud_utils # noqa: F401
from . import utils # noqa: F401

from .model import moe

__all__ = [ # noqa
"spawn",
@@ -85,4 +85,5 @@
"wait",
"get_rank",
"ProbabilityEntry",
"moe"
]
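The two additions above wire the new package into the public namespace: paddle.distributed re-exports moe and lists it in __all__. A minimal import sketch (assuming a Paddle build that includes this commit):

import paddle.distributed as dist

moe = dist.moe  # re-exported via `from .model import moe`
from paddle.distributed.model.moe.gate import NaiveGate  # direct path into the package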
15 changes: 15 additions & 0 deletions python/paddle/distributed/model/moe/__init__.py
@@ -0,0 +1,15 @@
# Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

from .moe_layer import *
17 changes: 17 additions & 0 deletions python/paddle/distributed/model/moe/gate/__init__.py
@@ -0,0 +1,17 @@
# Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

from .gshard_gate import GShardGate
from .switch_gate import SwitchGate
from .naive_gate import NaiveGate
36 changes: 36 additions & 0 deletions python/paddle/distributed/model/moe/gate/base_gate.py
@@ -0,0 +1,36 @@
# Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

import paddle.nn as nn


class BaseGate(nn.Layer):
    """Common bookkeeping for MoE gates: expert counts plus an auxiliary-loss slot."""

    def __init__(self, num_expert, world_size):
        super().__init__()
        self.world_size = world_size
        self.num_expert = num_expert
        # total number of experts across all ranks
        self.tot_expert = world_size * num_expert
self.loss = None

def forward(self, x):
raise NotImplementedError("Please implement the forward function.")

def set_loss(self, loss):
self.loss = loss

def get_loss(self, clear=True):
loss = self.loss
if clear:
self.loss = None
return loss
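BaseGate fixes only the expert bookkeeping and the loss get/set protocol; forward is left abstract. A toy, hypothetical subclass showing the contract a concrete gate must satisfy (the class name and routing rule are made up for illustration):

import paddle
from paddle.distributed.model.moe.gate.base_gate import BaseGate


class RandomGate(BaseGate):
    """Toy gate: sends each token to one uniformly random expert."""

    def forward(self, x):
        num_tokens = x.shape[0]
        idx = paddle.randint(0, self.tot_expert, shape=[num_tokens, 1])
        val = paddle.ones([num_tokens, 1], dtype="float32")
        self.set_loss(paddle.zeros([1]))  # no balance loss to report
        return val, idx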
67 changes: 67 additions & 0 deletions python/paddle/distributed/model/moe/gate/gshard_gate.py
@@ -0,0 +1,67 @@
# Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

import math
import paddle
import paddle.nn.functional as F
from .naive_gate import NaiveGate
from ..utils import limit_by_capacity


class GShardGate(NaiveGate):
def __init__(self,
d_model,
num_expert,
world_size,
topk=2,
capacity=(1.2, 2.4),
random_routing=True,
group=None):
assert topk == 2, "topk should be 2 in gshard"
super().__init__(d_model, num_expert, world_size)
self.capacity = capacity
self.random_routing = random_routing
self.group = group

def forward(self, x):
topk_val, topk_idx, gate_score = super().forward(
x, return_all_scores=True)
        s = gate_score.shape[0]
        top1_idx = topk_idx.flatten()
        # c_e: fraction of routing assignments that land on expert e
        c_e = paddle.scatter(
            paddle.zeros(shape=[self.tot_expert]),
            top1_idx,
            paddle.ones_like(
                top1_idx, dtype="float32"),
            overwrite=False) / s
        # m_e: mean softmax probability the gate assigns to expert e
        m_e = paddle.mean(F.softmax(gate_score, axis=1), axis=0)
        # GShard-style auxiliary loss encouraging a balanced expert load
        loss = paddle.mean(c_e * m_e) * (self.num_expert**2)
        self.set_loss(loss)

        # capacity rate: self.capacity[0] while training, self.capacity[1] at eval
        cap_rate = self.capacity[0 if self.training else 1]
        capacity = math.ceil(cap_rate * x.shape[0])
_new_lec, _new_gec, topk_idx = limit_by_capacity(
topk_idx,
self.num_expert,
self.world_size,
capacity,
group=self.group)

if self.random_routing:
rand_routing_prob = paddle.rand(
shape=[gate_score.shape[0]], dtype="float32")
topk_idx = paddle.distributed.utils.random_routing(
topk_idx, topk_val, rand_routing_prob)
return topk_val, topk_idx
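The auxiliary loss above follows GShard's load-balancing term: c_e counts how often each expert is picked, m_e is the gate's mean probability per expert, and their product penalizes routing mass concentrating on a few experts. A standalone single-process sketch of the same computation (toy shapes, world_size=1, top-1 routing for brevity):

import paddle
import paddle.nn.functional as F

num_expert, num_tokens = 4, 8
gate_score = paddle.rand([num_tokens, num_expert])  # stand-in for the linear gate output
top1_idx = paddle.argmax(gate_score, axis=1)

# c_e: fraction of tokens whose top choice is expert e
c_e = paddle.scatter(
    paddle.zeros([num_expert]),
    top1_idx,
    paddle.ones_like(top1_idx, dtype="float32"),
    overwrite=False) / num_tokens
# m_e: mean routing probability assigned to expert e
m_e = paddle.mean(F.softmax(gate_score, axis=1), axis=0)
loss = paddle.mean(c_e * m_e) * num_expert**2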
37 changes: 37 additions & 0 deletions python/paddle/distributed/model/moe/gate/naive_gate.py
@@ -0,0 +1,37 @@
# Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

from .base_gate import BaseGate

import paddle
import paddle.nn as nn


class NaiveGate(BaseGate):
def __init__(self, d_model, num_expert, world_size, topk=2):
super().__init__(num_expert, world_size)
        self.gate = nn.Linear(d_model, self.tot_expert)
        # Prefix the parameter names so the gate's weights are easy to identify
        self.gate.weight.name = "gate_" + self.gate.weight.name
        self.gate.bias.name = "gate_" + self.gate.bias.name
self.top_k = topk

def forward(self, inp, return_all_scores=False):
gate = self.gate(inp)
gate_top_k_val, gate_top_k_idx = paddle.topk(
gate, k=self.top_k, axis=-1, largest=True, sorted=False)

if return_all_scores:
return gate_top_k_val, gate_top_k_idx, gate
return gate_top_k_val, gate_top_k_idx
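NaiveGate is just a linear scorer plus top-k selection, so it runs fine in a single process. A minimal usage sketch (world_size=1, so tot_expert == num_expert; assumes a Paddle build with this commit):

import paddle
from paddle.distributed.model.moe.gate import NaiveGate

gate = NaiveGate(d_model=16, num_expert=4, world_size=1, topk=2)
x = paddle.rand([8, 16])      # 8 token embeddings
val, idx = gate(x)            # both [8, 2]: top-2 scores and expert ids per token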
69 changes: 69 additions & 0 deletions python/paddle/distributed/model/moe/gate/switch_gate.py
@@ -0,0 +1,69 @@
# Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

import math
import paddle
import paddle.nn.functional as F
from .naive_gate import NaiveGate
from ..utils import limit_by_capacity


class SwitchGate(NaiveGate):
def __init__(self,
d_model,
num_expert,
world_size,
topk=1,
switch_eps=.1,
capacity=(1.2, 2.4),
group=None):
assert topk == 1, "topk should be 1 in switch"
super().__init__(d_model, num_expert, world_size, topk=1)
self.switch_eps = switch_eps
self.capacity = capacity
self.group = group

def forward(self, inp):
score = self.gate(inp)

        if self.training:
            # Jitter the logits with uniform noise in [1 - switch_eps, 1 + switch_eps]
            noise = paddle.rand(shape=score.shape)
            noise = noise * 2 * self.switch_eps + 1.0 - self.switch_eps
            score += noise

score = F.softmax(score, axis=-1)
top1_score, top1_idx = paddle.topk(score, k=1, axis=-1, largest=True)

cap_rate = self.capacity[0 if self.training else 1]
capacity = math.ceil(cap_rate * inp.shape[0])
_new_lec, _new_gec, top1_idx = limit_by_capacity(
top1_idx,
self.num_expert,
self.world_size,
capacity,
group=self.group)
        # Tokens dropped by the capacity limit are marked with index -1
        valid_idx = top1_idx[top1_idx > -1]
        valid_idx_tmp = paddle.reshape(valid_idx, shape=[len(valid_idx), 1])
        # fraction_expert: share of surviving tokens routed to each expert
        fraction_expert = paddle.scatter_nd_add(
            x=paddle.zeros(shape=[self.tot_expert]),
            index=valid_idx_tmp,
            updates=paddle.ones_like(
                valid_idx, dtype=paddle.float32).reshape(
                    shape=[len(valid_idx)]), ) / valid_idx.numel()
        # prob_expert: average gate probability mass per expert
        prob_expert = score.sum(axis=0) / valid_idx.numel()
        # Switch Transformer load-balance loss
        loss = (fraction_expert * prob_expert).sum() * self.tot_expert
        self.set_loss(loss)

return top1_score, top1_idx
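Both GShardGate and SwitchGate derive their routing budget from the two-element capacity tuple: element 0 applies while training, element 1 at evaluation, each scaled by the incoming token count and passed to limit_by_capacity. A quick sketch of that arithmetic, using the default rates from the signatures above:

import math

capacity = (1.2, 2.4)  # (train, eval) capacity rates
num_tokens = 100
train_cap = math.ceil(capacity[0] * num_tokens)  # 120 slots while training
eval_cap = math.ceil(capacity[1] * num_tokens)   # 240 slots at eval time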