mosaicml · vchiley · Mar 10, 2023 · Mar 9, 2023 · Mar 9, 2023 · Mar 10, 2023
diff --git a/examples/llm/src/models/param_init_fns.py b/examples/llm/src/models/param_init_fns.py
@@ -3,6 +3,7 @@
 
 import math
 import warnings
+from collections.abc import Sequence
 from functools import partial
 
 import torch
@@ -68,7 +69,44 @@ def generic_param_init_fn_(module, cfg, init_fn_):
 
     elif isinstance(module, nn.Embedding):
         # Embedding
-        init_fn_(module.weight)
+        # Optional embedding-specific overrides, checked in priority order:
+        # `emb_init_std` (normal), then `emb_init_uniform_lim` (uniform);
+        # with neither set, the shared `init_fn_` is used.
+        if cfg.get('emb_init_std') is not None:
+            std = cfg.get('emb_init_std')
+            if std == 0:
+                warnings.warn(f'Embedding layer initialized to 0.')
+            emb_init_fn_ = partial(torch.nn.init.normal_, mean=0.0, std=std)
+            if cfg.get('verbose', 0) > 1:
+                warnings.warn(
+                    f'Embedding layer initialized using normal distribution with mean=0 and {std=}.'
+                )
+        elif cfg.get('emb_init_uniform_lim') is not None:
+            lim = cfg.get('emb_init_uniform_lim')
+            if isinstance(lim, Sequence):
+                # Exactly two limits are needed: a shorter sequence would
+                # otherwise fail below with an opaque IndexError.
+                if len(lim) != 2:
+                    raise ValueError(
+                        f'Uniform init requires a min and a max limit. User input: {lim}.'
+                    )
+                if lim[0] == lim[1]:
+                    warnings.warn(f'Embedding layer initialized to {lim[0]}.')
+            else:
+                # A scalar limit means the symmetric range [-lim, lim].
+                if lim == 0:
+                    warnings.warn(f'Embedding layer initialized to 0.')
+                lim = [-lim, lim]
+            a, b = lim
+            emb_init_fn_ = partial(torch.nn.init.uniform_, a=a, b=b)
+            if cfg.get('verbose', 0) > 1:
+                warnings.warn(
+                    f'Embedding layer initialized using uniform distribution in range {lim}.'
+                )
+        else:
+            emb_init_fn_ = init_fn_
+
+        emb_init_fn_(module.weight)
 
     elif isinstance(module, nn.LayerNorm):
         # LayerNorm

diff --git a/examples/llm/tests/test_init_fn.py b/examples/llm/tests/test_init_fn.py
@@ -2,6 +2,8 @@
 # SPDX-License-Identifier: Apache-2.0
 
 import math
+from collections import OrderedDict
+from collections.abc import Sequence
 from functools import partial
 
 import pytest
@@ -10,7 +12,8 @@
 from omegaconf import OmegaConf as om
 from torch import nn
 
-from examples.llm.src.models.param_init_fns import generic_param_init_fn_
+from examples.llm.src.models.param_init_fns import (MODEL_INIT_REGISTRY,
+                                                    generic_param_init_fn_)
 
 
 class MLP(nn.Module):
@@ -121,3 +124,61 @@ def max_fill_init_(weight):
             assert (p == 0).all()
         elif n == 'weight':
             assert (p == fill_val).all()
+
+
+@pytest.mark.parametrize('emb_init_cfg', [
+    None, ('emb_init_std', 5), ('emb_init_std', 0), ('emb_init_uniform_lim', 2),
+    ('emb_init_uniform_lim', [-1, 4]), ('emb_init_uniform_lim', 0),
+    ('emb_init_uniform_lim', [1, 1])
+])
+def test_emb_init(emb_init_cfg):
+    """Check that the embedding-specific init options are applied to nn.Embedding.
+
+    ``emb_init_cfg`` is either None (no embedding override is set) or a
+    ``(config_key, value)`` pair that is written into the init config.
+    """
+    reproducibility.seed_all(7)
+
+    cfg = {
+        'vocab_size': 64,
+        'in_features': 16,
+        'out_features': 32,
+        'n_layers': 2,
+    }
+    if emb_init_cfg is not None:
+        cfg[emb_init_cfg[0]] = emb_init_cfg[1]
+    cfg = om.create(cfg)
+
+    model = nn.Sequential(
+        OrderedDict([
+            ('emb', nn.Embedding(cfg.vocab_size, cfg.in_features)),
+            ('fc1', nn.Linear(cfg.in_features, cfg.out_features, bias=True)),
+            ('ln1', nn.LayerNorm(cfg.out_features)),
+            ('act1', nn.ReLU()),
+            ('fc2', nn.Linear(cfg.out_features, cfg.out_features, bias=True)),
+        ]))
+
+    model.apply(partial(MODEL_INIT_REGISTRY['kaiming_normal_'], cfg=cfg))
+
+    emb_weight = model.emb.weight  # type: ignore
+    if cfg.get('emb_init_std') is not None:
+        emb_init_std = cfg.get('emb_init_std')
+        if emb_init_std == 0:
+            assert (emb_weight == 0).all()
+        else:
+            # 64 * 16 samples: the sample std should land well within 20% of
+            # the requested std; the seed above keeps this deterministic.
+            assert math.isclose(emb_weight.std().item(),
+                                emb_init_std,
+                                rel_tol=0.2)
+    elif cfg.get('emb_init_uniform_lim') is not None:
+        emb_init_uniform_lim = cfg.get('emb_init_uniform_lim')
+        if isinstance(emb_init_uniform_lim, Sequence):
+            assert len(emb_init_uniform_lim) == 2
+            low, high = emb_init_uniform_lim
+        else:
+            # A scalar limit is interpreted as the symmetric range [-lim, lim].
+            low, high = -emb_init_uniform_lim, emb_init_uniform_lim
+        # Every weight must lie in [low, high]; when low == high (e.g. a limit
+        # of 0 or [1, 1]) this pins every weight to that single value.
+        assert (emb_weight >= low).all() and (emb_weight <= high).all()