Merge pull request #26 from sljlp/moe
add moe module
lilong12 committed Dec 16, 2021
2 parents 7fea284 + 81d3159 commit 7540865
Showing 9 changed files with 717 additions and 1 deletion.
3 changes: 2 additions & 1 deletion python/paddle/distributed/__init__.py
@@ -54,7 +54,7 @@

from . import cloud_utils # noqa: F401
from . import utils # noqa: F401

from .model import moe

__all__ = [ # noqa
"spawn",
@@ -85,4 +85,5 @@
"wait",
"get_rank",
"ProbabilityEntry",
"moe"
]
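
With this change the new moe package becomes part of the public paddle.distributed namespace: it is re-exported from paddle.distributed.model and listed in __all__. A minimal sketch of the resulting import path, assuming a build that includes this commit:

from paddle.distributed import moe  # resolves to paddle.distributed.model.moe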
15 changes: 15 additions & 0 deletions python/paddle/distributed/model/moe/__init__.py
@@ -0,0 +1,15 @@
# Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

from .moe_layer import *
17 changes: 17 additions & 0 deletions python/paddle/distributed/model/moe/gate/__init__.py
@@ -0,0 +1,17 @@
# Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

from .gshard_gate import GShardGate
from .switch_gate import SwitchGate
from .naive_gate import NaiveGate
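
This package __init__ re-exports the three gate implementations added below, so callers only need one import line. For example, assuming a build that includes this commit:

from paddle.distributed.model.moe.gate import GShardGate, SwitchGate, NaiveGate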
36 changes: 36 additions & 0 deletions python/paddle/distributed/model/moe/gate/base_gate.py
@@ -0,0 +1,36 @@
# Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

import paddle.nn as nn


class BaseGate(nn.Layer):
def __init__(self, num_expert, world_size):
super().__init__()
self.world_size = world_size
self.num_expert = num_expert
self.tot_expert = world_size * num_expert
self.loss = None

def forward(self, x):
raise NotImplementedError("Please implement the forward function.")

def set_loss(self, loss):
self.loss = loss

def get_loss(self, clear=True):
loss = self.loss
if clear:
self.loss = None
return loss
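
BaseGate only fixes the shared bookkeeping (per-rank num_expert, world_size, and their product tot_expert) plus the set_loss/get_loss protocol through which a training loop collects the auxiliary balance loss after each forward pass; subclasses must implement forward themselves. A minimal sketch of that contract, using a hypothetical subclass that is not part of this commit:

import paddle
from paddle.distributed.model.moe.gate.base_gate import BaseGate

class ConstantGate(BaseGate):
    # Hypothetical example: send every token to expert 0 and record a zero loss.
    def forward(self, x):
        idx = paddle.zeros(shape=[x.shape[0]], dtype="int64")
        self.set_loss(paddle.zeros(shape=[1]))  # held until get_loss(clear=True) resets it
        return idx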
67 changes: 67 additions & 0 deletions python/paddle/distributed/model/moe/gate/gshard_gate.py
@@ -0,0 +1,67 @@
# Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

import math
import paddle
import paddle.nn.functional as F
import numpy as np
from .naive_gate import NaiveGate
from ..utils import limit_by_capacity


class GShardGate(NaiveGate):
def __init__(self,
d_model,
num_expert,
world_size,
topk=2,
capacity=(1.2, 2.4),
random_routing=True,
group=None):
assert topk == 2, "topk should be 2 in gshard"
super().__init__(d_model, num_expert, world_size)
self.capacity = capacity
self.random_routing = random_routing
self.group = group

def forward(self, x):
topk_val, topk_idx, gate_score = super().forward(
x, return_all_scores=True)
s = gate_score.shape[0]
top1_idx = topk_idx.flatten()
c_e = paddle.scatter(
paddle.zeros(shape=[self.tot_expert]),
top1_idx,
paddle.ones_like(
top1_idx, dtype="float32"),
overwrite=False) / s
m_e = paddle.mean(F.softmax(gate_score, axis=1), axis=0)
loss = paddle.mean(c_e * m_e) * (self.num_expert**2)
self.set_loss(loss)

cap_rate = self.capacity[0 if self.training else 1]
capacity = math.ceil(cap_rate * x.shape[0])
_new_lec, _new_gec, topk_idx = limit_by_capacity(
topk_idx,
self.num_expert,
self.world_size,
capacity,
group=self.group)

if self.random_routing:
rand_routing_prob = paddle.rand(
shape=[gate_score.shape[0]], dtype="float32")
topk_idx = paddle.distributed.utils.random_routing(
topk_idx, topk_val, rand_routing_prob)
return topk_val, topk_idx
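
Taken together, the forward pass does three things: it builds the GShard auxiliary balance loss from c_e (how often each expert is actually picked) and m_e (the mean softmax score each expert receives), it caps the number of tokens per expert through limit_by_capacity, and it can randomly re-route tokens between their top-2 experts via paddle.distributed.utils.random_routing. A usage sketch, assuming a single rank (world_size=1) with random routing disabled; limit_by_capacity lives in the moe utils module (not loaded in this view) and may still expect an initialized distributed context, so treat this as illustrative:

import paddle
from paddle.distributed.model.moe.gate import GShardGate

# Illustrative single-rank run; with several ranks, limit_by_capacity
# communicates across the process group passed as `group`.
gate = GShardGate(d_model=16, num_expert=4, world_size=1, random_routing=False)
x = paddle.randn(shape=[8, 16])  # 8 tokens, d_model=16
topk_val, topk_idx = gate(x)     # each of shape [8, 2]; topk is fixed at 2
aux_loss = gate.get_loss()       # balance loss recorded by set_loss, cleared once read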
37 changes: 37 additions & 0 deletions python/paddle/distributed/model/moe/gate/naive_gate.py
@@ -0,0 +1,37 @@
# Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

from .base_gate import BaseGate

import paddle
import paddle.nn as nn
import paddle.nn.functional as F


class NaiveGate(BaseGate):
def __init__(self, d_model, num_expert, world_size, topk=2):
super().__init__(num_expert, world_size)
self.gate = nn.Linear(d_model, self.tot_expert)
self.gate.weight.name = "gate_" + self.gate.weight.name
self.gate.bias.name = "gate_" + self.gate.bias.name
self.top_k = topk

def forward(self, inp, return_all_scores=False):
gate = self.gate(inp)
gate_top_k_val, gate_top_k_idx = paddle.topk(
gate, k=self.top_k, axis=-1, largest=True, sorted=False)

if return_all_scores:
return gate_top_k_val, gate_top_k_idx, gate
return gate_top_k_val, gate_top_k_idx
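
NaiveGate is the plain learned router that the other gates build on: a single nn.Linear scores each token against all world_size * num_expert experts, and paddle.topk keeps the k best, with no noise, capacity limit, or auxiliary loss. A quick sketch of the shapes involved (all sizes here are illustrative):

import paddle
from paddle.distributed.model.moe.gate import NaiveGate

gate = NaiveGate(d_model=16, num_expert=4, world_size=2)  # tot_expert = 8
inp = paddle.randn(shape=[10, 16])                        # 10 tokens
val, idx = gate(inp)                                      # both [10, 2] (topk defaults to 2)
val, idx, scores = gate(inp, return_all_scores=True)      # scores: [10, 8], one column per expert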
69 changes: 69 additions & 0 deletions python/paddle/distributed/model/moe/gate/switch_gate.py
@@ -0,0 +1,69 @@
# Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

import math
import paddle
import paddle.nn as nn
import paddle.nn.functional as F
from .naive_gate import NaiveGate
from ..utils import limit_by_capacity


class SwitchGate(NaiveGate):
def __init__(self,
d_model,
num_expert,
world_size,
topk=1,
switch_eps=.1,
capacity=(1.2, 2.4),
group=None):
assert topk == 1, "topk should be 1 in switch"
super().__init__(d_model, num_expert, world_size, topk=1)
self.switch_eps = switch_eps
self.capacity = capacity
self.group = group

def forward(self, inp):
score = self.gate(inp)

if self.training:
noise = paddle.rand(shape=score.shape)
noise = noise * 2 * self.switch_eps + 1.0 - self.switch_eps
score += noise

score = F.softmax(score, axis=-1)
top1_score, top1_idx = paddle.topk(score, k=1, axis=-1, largest=True)

cap_rate = self.capacity[0 if self.training else 1]
capacity = math.ceil(cap_rate * inp.shape[0])
_new_lec, _new_gec, top1_idx = limit_by_capacity(
top1_idx,
self.num_expert,
self.world_size,
capacity,
group=self.group)
valid_idx = top1_idx[top1_idx > -1]
valid_idx_tmp = paddle.reshape(valid_idx, shape=[len(valid_idx), 1])
fraction_expert = paddle.scatter_nd_add(
x=paddle.zeros(shape=[self.tot_expert]),
index=valid_idx_tmp,
updates=paddle.ones_like(
valid_idx, dtype=paddle.float32).reshape(
shape=[len(valid_idx)]), ) / valid_idx.numel()
prob_expert = score.sum(axis=0) / valid_idx.numel()
loss = (fraction_expert * prob_expert).sum() * self.tot_expert
self.set_loss(loss)

return top1_score, top1_idx
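
The loss computed at the end is the Switch Transformer balance term: fraction_expert is the share of capacity-surviving tokens routed to each expert, prob_expert is the router's probability mass per expert (normalized by the number of valid tokens), and their dot product scaled by tot_expert is smallest when routing is even. A small numeric check of that formula with hypothetical values, independent of the gate itself:

import paddle

tot_expert = 2
# Balanced case: tokens and router mass split evenly across 2 experts.
fraction_expert = paddle.to_tensor([0.5, 0.5])
prob_expert = paddle.to_tensor([0.5, 0.5])
print(float((fraction_expert * prob_expert).sum() * tot_expert))  # 1.0, the balanced value

# Skewed case: most tokens and mass land on expert 0, so the loss grows.
fraction_expert = paddle.to_tensor([0.9, 0.1])
prob_expert = paddle.to_tensor([0.9, 0.1])
print(float((fraction_expert * prob_expert).sum() * tot_expert))  # ~1.64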
(Diffs for the remaining 2 changed files, totaling 474 additions, did not load and are not shown.)
