Feat/layernorm #36

Merged: 70 commits, Sep 15, 2022
Changes from 68 commits

Commits (70)
a45997c
feat: add attention
gaetansnl Aug 11, 2022
0745dd2
fix: use tuple in triton
gaetansnl Aug 11, 2022
d06fb8a
docs: attention
gaetansnl Aug 11, 2022
2c4f4bf
feat: add torchdynamo end to end fusion
gaetansnl Aug 12, 2022
0644ae6
feat: causal masked attention
gaetansnl Aug 12, 2022
e6e359b
feat: benchmark dynamo backends
gaetansnl Aug 12, 2022
2ba239c
Merge branch 'main' into feat/torchdynamo-fused
gaetansnl Aug 16, 2022
189ca47
fix: renaming
gaetansnl Aug 16, 2022
0e13b3c
feat: add support for arbitrary stride
gaetansnl Aug 16, 2022
a0f880d
fix: move output outside kernel
gaetansnl Aug 16, 2022
a6dcc70
feat: module replacement example
gaetansnl Aug 17, 2022
b142c1b
fix: missing benchmark for masked attention
gaetansnl Aug 18, 2022
6aceec7
feat: add pattern and fix fx bug
gaetansnl Aug 19, 2022
a4f920f
fix: refactoring
gaetansnl Aug 19, 2022
5157a8b
feat: add layer_norm
gaetansnl Aug 19, 2022
90f989c
fix: show speedup in benchmark display
gaetansnl Aug 23, 2022
f7a0ffa
fix: update torchdynamo and matcher
gaetansnl Aug 24, 2022
bf5ecfa
fix: update matcher
gaetansnl Aug 24, 2022
ce73e78
fix: cuda graph
gaetansnl Aug 25, 2022
9590e78
fix: small seq_length
gaetansnl Aug 25, 2022
3534cb5
feat: viz server
gaetansnl Aug 26, 2022
26e5720
fix: bug in matcher and add complete graph report
gaetansnl Aug 26, 2022
1576ef5
fix: compatibility with pytorch stable
gaetansnl Aug 31, 2022
f5729df
fix: add credit, rename variables, add doc
gaetansnl Aug 31, 2022
be04b88
fix: add test for shape change
gaetansnl Aug 31, 2022
7e2cb62
fix: attention renaming
gaetansnl Aug 31, 2022
672ea04
fix: add license
gaetansnl Aug 31, 2022
f5c1ae2
feat: add stride management on linear layer + replace cuda graph + ne…
pommedeterresautee Sep 3, 2022
c8bb059
feat: remove M, N masking
pommedeterresautee Sep 3, 2022
3998bc9
feat: improve autotune
pommedeterresautee Sep 3, 2022
dbf7c93
fix: command line
pommedeterresautee Sep 5, 2022
95fac33
fix: refactoring benchmarks (fix cuda graphs API, tests as dict, add …
pommedeterresautee Sep 5, 2022
368f3e5
fix: linear layer is now working (remove trick on max contiguous), re…
pommedeterresautee Sep 5, 2022
804335b
fix: refactoring of the linear benchmark with sizes similar to bert o…
pommedeterresautee Sep 5, 2022
e93632b
feat: add more instructions
pommedeterresautee Sep 6, 2022
a392150
fix: restore cuda graphs warmup (and remove todo)
pommedeterresautee Sep 6, 2022
efdabc9
feat: change linear implementation
pommedeterresautee Sep 6, 2022
d464f6b
feat: change linear implementation
pommedeterresautee Sep 6, 2022
e9d6b04
feat: replace GELU + layernorm by more precise version, fix all preci…
pommedeterresautee Sep 6, 2022
a76b710
Merge remote-tracking branch 'origin/main' into fix/refactoring_bench…
pommedeterresautee Sep 6, 2022
b91456c
fix: plural
pommedeterresautee Sep 6, 2022
d20619c
fix: some doc
pommedeterresautee Sep 6, 2022
87cf360
fix: layer norm unit test
pommedeterresautee Sep 6, 2022
b112b4b
feat: add cuda graph layer norm unit test
pommedeterresautee Sep 6, 2022
479254f
fix: remove TODO
pommedeterresautee Sep 6, 2022
afc8d41
feat: add split K support
pommedeterresautee Sep 7, 2022
106cb24
Merge branch 'main' into feat/tools
pommedeterresautee Sep 8, 2022
ed6d1a1
Merge branch 'feat/tools' into fix/refactoring_benchmarks
pommedeterresautee Sep 8, 2022
df3daa8
fix: remove split k
pommedeterresautee Sep 8, 2022
cf8e7d0
Merge remote-tracking branch 'origin/fix/refactoring_benchmarks' into…
pommedeterresautee Sep 8, 2022
7d91384
feat: refactoring layernorm test
pommedeterresautee Sep 9, 2022
f2130a3
fix: add back bias and activation tests + refactoring
pommedeterresautee Sep 9, 2022
119e776
feat: make test display understandable
pommedeterresautee Sep 9, 2022
38ee01b
fix: remove benchmark display
gaetansnl Sep 9, 2022
94dda21
fix: remove benchmark display
gaetansnl Sep 9, 2022
4bffeef
Merge branch 'feat/tools' into fix/refactoring_benchmarks
pommedeterresautee Sep 9, 2022
8f91f79
fix: avoid OOM on reference implementation
pommedeterresautee Sep 9, 2022
61cbc1f
fix: get input
pommedeterresautee Sep 9, 2022
f39219c
Merge remote-tracking branch 'origin/fix/refactoring_benchmarks' into…
pommedeterresautee Sep 9, 2022
10703a9
fix: remove some OOM test for reference implementation
pommedeterresautee Sep 9, 2022
dabf536
fix: add tests
pommedeterresautee Sep 9, 2022
bd83caa
feat: new layernorm single pass variance computation implementation
pommedeterresautee Sep 12, 2022
85ae2d8
Merge branch 'main' into fix/refactoring_benchmarks
pommedeterresautee Sep 13, 2022
f3564b0
fix: rename variables
pommedeterresautee Sep 13, 2022
7b06621
Merge branch 'fix/refactoring_benchmarks' into feat/layernorm
pommedeterresautee Sep 13, 2022
c2b8ade
feat: add naive implem of layernorm
pommedeterresautee Sep 13, 2022
cca52e8
Merge branch 'main' into feat/layernorm
pommedeterresautee Sep 13, 2022
1100ef6
fix: store mean/var in layernorm (for bw pass)
pommedeterresautee Sep 13, 2022
2cedbfa
fix: following review comments
pommedeterresautee Sep 14, 2022
7f3aefa
fix: add manual seed
pommedeterresautee Sep 15, 2022
69 changes: 65 additions & 4 deletions implementations/layer_norm.py
@@ -2,12 +2,73 @@

import triton
import triton.language as tl
from triton import JITFunction


# CREDITS: Initially inspired by the Triton tutorial


@triton.jit
def _layer_norm_fwd_fused(
def _layer_norm_fwd_fused_single_pass(
Contributor: I think it's missing documentation and naming: what is A? What size? Stride of what, and which dimension?

Member Author: added

Out,
A,
Weight,
Bias,
Mean, Rstd,
stride, N, eps,
BLOCK_SIZE: tl.constexpr,
):
"""
Based on Welford's variance computation algorithm.
https://changyaochen.github.io/welford/
https://en.wikipedia.org/wiki/Algorithms_for_calculating_variance
"""
# position of elements processed by this program
row = tl.program_id(0)
Contributor: we have a naming convention for this (_id or _idx)

Member Author (@pommedeterresautee, Sep 14, 2022): from the original implementation

Member Author: changed

Out += row * stride
Contributor: this is a very bad practice IMO

Member Author: from the original implementation

Member Author: updated

A += row * stride
# compute mean
mean = 0.0
var = 0.0
for start in range(0, N, BLOCK_SIZE):
Contributor: IMO start and end should have more explicit names (like in the attention kernel)

Member Author: changed

end = min((start + BLOCK_SIZE), N)
nb_block_col = end - start
cols = start + tl.arange(0, BLOCK_SIZE)
Contributor: we call this offsets in other kernels

Member Author: changed

mask = cols < N
a = tl.load(A + cols, mask=mask, other=0., eviction_policy="evict_last").to(tl.float32)
Contributor: could you document why eviction_policy="evict_last"?

Member Author: added


block_mean = tl.sum(a, axis=0) / nb_block_col
# mean is 0 or has a mask applied to it, no need to mask delta_mean!
delta_mean = block_mean - mean
delta_mean_sqr = delta_mean * delta_mean

block_delta = tl.sum((a - block_mean) * a, axis=0)
# mean has a mask!
mean += tl.sum((a - mean) * mask, axis=0) / end
var += block_delta + delta_mean_sqr * (start * nb_block_col) / end

var = var / N
rstd = 1 / tl.sqrt(var + eps)
Contributor: what does rstd mean? root std?

Member Author: changed


# write-back mean/rstd
tl.store(Mean + row, mean)
Contributor: could you add why we do this (future backward pass)?

Member Author: added

tl.store(Rstd + row, rstd)

# multiply by weight and add bias
for off in range(0, N, BLOCK_SIZE):
cols = off + tl.arange(0, BLOCK_SIZE)
mask = cols < N
weight = tl.load(Weight + cols, mask=mask)
bias = tl.load(Bias + cols, mask=mask)
a = tl.load(A + cols, mask=mask, other=0., eviction_policy="evict_first").to(tl.float32)
a_hat = (a - mean) * rstd
out = a_hat * weight + bias
# write-back
tl.store(Out + cols, out, mask=mask)


@triton.jit
def _layer_norm_fwd_fused_multi_pass(
Out,
A,
Weight,
@@ -49,11 +110,11 @@ def _layer_norm_fwd_fused(
a = tl.load(A + cols, mask=mask, other=0., eviction_policy="evict_first").to(tl.float32)
a_hat = (a - mean) * rstd
out = a_hat * weight + bias
# # write-back
# write-back
tl.store(Out + cols, out, mask=mask)


def layer_norm_forward(a: torch.Tensor, weight: torch.Tensor, bias: torch.Tensor, eps: float):
def layer_norm_forward(a: torch.Tensor, weight: torch.Tensor, bias: torch.Tensor, eps: float, implementation: JITFunction = _layer_norm_fwd_fused_single_pass):
# allocate output
out = torch.empty_like(a)
# reshape input data into 2D tensor
@@ -69,7 +130,7 @@ def layer_norm_forward(a: torch.Tensor, weight: torch.Tensor, bias: torch.Tensor
# heuristics for number of warps
num_warps = min(max(BLOCK_SIZE // 256, 1), 8)
eps = min(eps, 1e-6) # >= 1e-5 may decrease Bert accuracy
_layer_norm_fwd_fused[(M,)](
implementation[(M,)](
out,
a_arg,
weight,
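For reference, here is a minimal plain-PyTorch sketch (not part of this PR) of the blocked Welford/Chan one-pass mean/variance merge that the single-pass kernel above is built on. The helper name, block_size value, and tolerance are illustrative assumptions; the Triton kernel's bookkeeping differs in its details but follows the same idea: each block contributes a partial mean and sum of squared deviations, and the running statistics are merged without a second pass over the row.

import torch


def one_pass_layer_norm(a: torch.Tensor, weight: torch.Tensor, bias: torch.Tensor,
                        eps: float = 1e-6, block_size: int = 1024) -> torch.Tensor:
    # Illustrative sketch only: names and defaults are not taken from the repo.
    # Assumes a 2D (rows, hidden) input.
    out = torch.empty_like(a)
    n = a.shape[-1]
    for row in range(a.shape[0]):
        x = a[row].float()
        mean, m2, count = 0.0, 0.0, 0
        # single pass over the row, one block at a time
        for start in range(0, n, block_size):
            blk = x[start:start + block_size]
            blk_count = blk.numel()
            blk_mean = blk.mean().item()
            blk_m2 = ((blk - blk_mean) ** 2).sum().item()
            # Chan et al. merge of the running (mean, m2, count) with the block statistics
            delta = blk_mean - mean
            total = count + blk_count
            mean += delta * blk_count / total
            m2 += blk_m2 + delta * delta * count * blk_count / total
            count = total
        var = m2 / n                      # population variance, as in layer norm
        rstd = 1.0 / (var + eps) ** 0.5   # reciprocal of the standard deviation
        out[row] = ((x - mean) * rstd) * weight.float() + bias.float()
    return out.to(a.dtype)


# quick check against PyTorch's reference implementation
x = torch.randn(4, 3072)
w, b = torch.ones(3072), torch.zeros(3072)
expected = torch.nn.functional.layer_norm(x, (3072,), w, b, 1e-6)
assert torch.allclose(one_pass_layer_norm(x, w, b), expected, atol=1e-4)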
23 changes: 18 additions & 5 deletions test/test_layer_norm.py
@@ -4,17 +4,30 @@
import pytest

from implementations.cuda_graph import cuda_graphs_wrapper
from implementations.layer_norm import layer_norm_forward
from implementations.layer_norm import layer_norm_forward, _layer_norm_fwd_fused_single_pass, \
_layer_norm_fwd_fused_multi_pass


def pytorch_naive(a: torch.Tensor, weight: torch.Tensor, bias: torch.Tensor, eps: float):
mean = a.mean(dim=-1, keepdim=True)
var = a.var(dim=-1, keepdim=True)
rstd = 1 / torch.sqrt(var + eps)
a_hat = (a - mean) * rstd
out = a_hat * weight + bias
return out


implementations: dict[str, Callable[[torch.Tensor, torch.Tensor, torch.Tensor, float], torch.Tensor]] = {
"pytorch": lambda x, weight, bias, eps: torch.nn.functional.layer_norm(x, weight.shape, weight, bias, eps),
"triton": lambda x, weight, bias, eps: layer_norm_forward(x, weight, bias, eps),
"triton_original": lambda x, weight, bias, eps: layer_norm_forward(x, weight, bias, eps, _layer_norm_fwd_fused_multi_pass),
"triton_improved": lambda x, weight, bias, eps: layer_norm_forward(x, weight, bias, eps, _layer_norm_fwd_fused_single_pass),
"pytorch_naive": lambda x, weight, bias, eps: pytorch_naive(x, weight, bias, eps),
}


@pytest.mark.parametrize("shape", [128, 512, 1024, 2048, 4096], ids=lambda x: f"shape={x}x{x}")
@pytest.mark.parametrize("shape", [128, 512, 1024, 2048, 4096, 8192], ids=lambda x: f"shape={x}x{x}")
@pytest.mark.parametrize("cuda_graphs", [True, False], ids=["cuda_graphs", "no_cuda_graphs"])
@pytest.mark.parametrize("implementation", ["pytorch", "triton"])
@pytest.mark.parametrize("implementation", ["triton_original", "triton_improved", "pytorch", "pytorch_naive"])
def test_benchmark_layer_norm(benchmark, shape: int, cuda_graphs: bool, implementation: str):
assert implementation in implementations, f"Unknown implementation: {implementation}"

@@ -39,4 +52,4 @@ def inference(x, *args, **kwargs):

value = benchmark(inference, x, weight, bias, eps)

assert torch.allclose(value, expected, atol=1e-2)
assert torch.allclose(value, expected, atol=1e-1)
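As a usage note, the new implementation argument of layer_norm_forward lets callers pick either Triton kernel directly, outside the benchmark. A small sketch, assuming a CUDA GPU with Triton installed; the shapes and tolerance below are chosen for illustration and mirror the tests rather than coming from the repository:

import torch
from implementations.layer_norm import (
    layer_norm_forward,
    _layer_norm_fwd_fused_single_pass,
    _layer_norm_fwd_fused_multi_pass,
)

x = torch.randn(8, 2048, device="cuda", dtype=torch.float16)
weight = torch.ones(2048, device="cuda", dtype=torch.float16)
bias = torch.zeros(2048, device="cuda", dtype=torch.float16)
eps = 1e-6

# the single-pass (Welford) kernel is the default, shown here explicitly
out_single = layer_norm_forward(x, weight, bias, eps, _layer_norm_fwd_fused_single_pass)
# the original multi-pass kernel remains selectable
out_multi = layer_norm_forward(x, weight, bias, eps, _layer_norm_fwd_fused_multi_pass)

expected = torch.nn.functional.layer_norm(x, weight.shape, weight, bias, eps)
assert torch.allclose(out_single, expected, atol=1e-1)
assert torch.allclose(out_multi, expected, atol=1e-1)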
1 change: 1 addition & 0 deletions test/test_linear_layer.py
@@ -40,6 +40,7 @@ def test_benchmark(benchmark, shape: Shape, bias: bool, activation: str, contigu
batch, M, N, K = dataclasses.astuple(shape)

# order of dimensions is wrong so we force contiguous call

a = torch.randn((batch, K, M), device='cuda', dtype=torch.float16, requires_grad=False)
a = a.mT
if contiguous:
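Since the comment in this hunk hinges on tensor strides, here is a small stride illustration (shapes are arbitrary, not taken from the test): .mT only swaps the strides of the last two dimensions and returns a non-contiguous view, which is why the test optionally calls .contiguous() afterwards.

import torch

a = torch.randn(4, 64, 32, dtype=torch.float16)  # (batch, K, M), row-major
a_t = a.mT                                       # view of shape (batch, 32, 64); no data is moved

print(a_t.shape, a_t.stride(), a_t.is_contiguous())  # strides of the last two dims are swapped -> False
a_c = a_t.contiguous()                               # materializes a row-major copy
print(a_c.stride(), a_c.is_contiguous())             # (2048, 64, 1) True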
26 changes: 14 additions & 12 deletions test/test_torchdynamo_bert.py
@@ -10,7 +10,7 @@


@pytest.fixture
def model_baseline_fp32():
def model_reference_fp32():
return get_model_baseline(float_16=False)


@@ -48,35 +48,37 @@ def get_input_non_causal(shape: (int, int)) -> Dict[str, torch.Tensor]:
}


@pytest.mark.parametrize("input_shape", [(1, 16), (1, 128), (1, 256), (1, 384), (1, 512),
(8, 16), (8, 128), (8, 256), (8, 384), (8, 512),
(32, 16), (32, 128), (32, 256),
], ids=lambda x: f"{x[0]}x{x[1]}")
@pytest.mark.parametrize("shape", [(1, 16), (1, 128), (1, 256), (1, 384), (1, 512),
(8, 16), (8, 128), (8, 256), (8, 384), (8, 512),
(32, 16), (32, 128), (32, 256),
], ids=lambda x: f"{x[0]}x{x[1]}")
@pytest.mark.parametrize("implementation", implementations.keys())
def test_benchmark_implementations(benchmark, model_baseline_fp32, input_shape: (int, int), implementation: str):
def test_benchmark_implementations(benchmark, model_reference_fp32, shape: (int, int), implementation: str):
torch.manual_seed(0)
assert implementation in implementations, f"unknown implementation: {implementation}"
model_tested = implementations[implementation]

inputs = get_input_causal(input_shape) if model_tested.is_causal else get_input_non_causal(input_shape)
inputs = get_input_causal(shape) if model_tested.is_causal else get_input_non_causal(shape)

with torch.inference_mode():
expected = model_baseline_fp32(**inputs)
expected = model_reference_fp32(**inputs)
model = model_tested.model()
value = benchmark(model, **inputs)

torchdynamo.reset()

assert torch.allclose(input=value["last_hidden_state"].float(), other=expected["last_hidden_state"], rtol=1e-1, atol=1e-1)
assert torch.allclose(input=value["last_hidden_state"].float(), other=expected["last_hidden_state"], rtol=1e-1,
atol=1e-1)
assert torch.allclose(input=value["pooler_output"].float(), other=expected["pooler_output"], rtol=1e-1, atol=1e-1)


def test_support_shape_change(model_baseline_fp32):
def test_support_shape_change(model_reference_fp32):
"""Test that the model can handle shape changes without being reloaded/rebuilt."""
for name, implementation in implementations.items():
model_tested = implementation.model()
for shape in [(1, 64), (8, 256), (16, 256), (16, 64)]:
pytorch_input = get_input_causal(shape) if implementation.is_causal else get_input_non_causal(shape)
expected = model_baseline_fp32(**pytorch_input)
expected = model_reference_fp32(**pytorch_input)
result = model_tested(**pytorch_input)
assert torch.allclose(result["last_hidden_state"].float(), expected["last_hidden_state"], atol=1e-1), f"failed on {name} with shape {shape}"
assert torch.allclose(result["last_hidden_state"].float(), expected["last_hidden_state"],
atol=1e-1), f"failed on {name} with shape {shape}"