
Feat/layernorm #36

Merged: 70 commits, merged Sep 15, 2022

Commits (70)
a45997c
feat: add attention
gaetansnl Aug 11, 2022
0745dd2
fix: use tuple in triton
gaetansnl Aug 11, 2022
d06fb8a
docs: attention
gaetansnl Aug 11, 2022
2c4f4bf
feat: add torchdynamo end to end fusion
gaetansnl Aug 12, 2022
0644ae6
feat: causal masked attention
gaetansnl Aug 12, 2022
e6e359b
feat: benchmark dynamo backends
gaetansnl Aug 12, 2022
2ba239c
Merge branch 'main' into feat/torchdynamo-fused
gaetansnl Aug 16, 2022
189ca47
fix: renaming
gaetansnl Aug 16, 2022
0e13b3c
feat: add support for arbitrary stride
gaetansnl Aug 16, 2022
a0f880d
fix: move output outside kernel
gaetansnl Aug 16, 2022
a6dcc70
feat: module replacement example
gaetansnl Aug 17, 2022
b142c1b
fix: missing benchmark for masked attention
gaetansnl Aug 18, 2022
6aceec7
feat: add pattern and fix fx bug
gaetansnl Aug 19, 2022
a4f920f
fix: refactoring
gaetansnl Aug 19, 2022
5157a8b
feat: add layer_norm
gaetansnl Aug 19, 2022
90f989c
fix: show speedup in benchmark display
gaetansnl Aug 23, 2022
f7a0ffa
fix: update torchdyname and matcher
gaetansnl Aug 24, 2022
bf5ecfa
fix: update matcher
gaetansnl Aug 24, 2022
ce73e78
fix: cuda graph
gaetansnl Aug 25, 2022
9590e78
fix: small seq_length
gaetansnl Aug 25, 2022
3534cb5
feat: viz server
gaetansnl Aug 26, 2022
26e5720
fix: bug in matcher and add complete graph report
gaetansnl Aug 26, 2022
1576ef5
fix: compatibility with pytorch stable
gaetansnl Aug 31, 2022
f5729df
fix: add credit, rename variables, add doc
gaetansnl Aug 31, 2022
be04b88
fix: add test for shape change
gaetansnl Aug 31, 2022
7e2cb62
fix: attention renaming
gaetansnl Aug 31, 2022
672ea04
fix: add license
gaetansnl Aug 31, 2022
f5c1ae2
feat: add stride management on linear layer + replace cuda graph + ne…
pommedeterresautee Sep 3, 2022
c8bb059
feat: remove M, N masking
pommedeterresautee Sep 3, 2022
3998bc9
feat: improve autotune
pommedeterresautee Sep 3, 2022
dbf7c93
fix: command line
pommedeterresautee Sep 5, 2022
95fac33
fix: refactoring benchmarks (fix cuda graphs API, tests as dict, add …
pommedeterresautee Sep 5, 2022
368f3e5
fix: linear layer is now working (remove trick on max contiguous), re…
pommedeterresautee Sep 5, 2022
804335b
fix: refactoring of the linear benchmark with sizes similar to bert o…
pommedeterresautee Sep 5, 2022
e93632b
feat: add more instructions
pommedeterresautee Sep 6, 2022
a392150
fix: restore cuda graphs warmup (and remove todo)
pommedeterresautee Sep 6, 2022
efdabc9
feat: change linear implementation
pommedeterresautee Sep 6, 2022
d464f6b
feat: change linear implementation
pommedeterresautee Sep 6, 2022
e9d6b04
feat: replace GELU + layernorm by more precise version, fix all preci…
pommedeterresautee Sep 6, 2022
a76b710
Merge remote-tracking branch 'origin/main' into fix/refactoring_bench…
pommedeterresautee Sep 6, 2022
b91456c
fix: plural
pommedeterresautee Sep 6, 2022
d20619c
fix: some doc
pommedeterresautee Sep 6, 2022
87cf360
fix: layer norm unit test
pommedeterresautee Sep 6, 2022
b112b4b
feat: add cuda graph layer norm unit test
pommedeterresautee Sep 6, 2022
479254f
fix: remove TODO
pommedeterresautee Sep 6, 2022
afc8d41
feat: add split K support
pommedeterresautee Sep 7, 2022
106cb24
Merge branch 'main' into feat/tools
pommedeterresautee Sep 8, 2022
ed6d1a1
Merge branch 'feat/tools' into fix/refactoring_benchmarks
pommedeterresautee Sep 8, 2022
df3daa8
fix: remove split k
pommedeterresautee Sep 8, 2022
cf8e7d0
Merge remote-tracking branch 'origin/fix/refactoring_benchmarks' into…
pommedeterresautee Sep 8, 2022
7d91384
feat: refactoring layernorm test
pommedeterresautee Sep 9, 2022
f2130a3
fix: add back bias and activation tests + refactoring
pommedeterresautee Sep 9, 2022
119e776
feat: make test display understandable
pommedeterresautee Sep 9, 2022
38ee01b
fix: remove benchmark display
gaetansnl Sep 9, 2022
94dda21
fix: remove benchmark display
gaetansnl Sep 9, 2022
4bffeef
Merge branch 'feat/tools' into fix/refactoring_benchmarks
pommedeterresautee Sep 9, 2022
8f91f79
fix: avoid OOM on reference implementation
pommedeterresautee Sep 9, 2022
61cbc1f
fix: get input
pommedeterresautee Sep 9, 2022
f39219c
Merge remote-tracking branch 'origin/fix/refactoring_benchmarks' into…
pommedeterresautee Sep 9, 2022
10703a9
fix: remove some OOM test for reference implementation
pommedeterresautee Sep 9, 2022
dabf536
fix: add tests
pommedeterresautee Sep 9, 2022
bd83caa
feat: new layernorm single pass variance computation implementation
pommedeterresautee Sep 12, 2022
85ae2d8
Merge branch 'main' into fix/refactoring_benchmarks
pommedeterresautee Sep 13, 2022
f3564b0
fix: rename variables
pommedeterresautee Sep 13, 2022
7b06621
Merge branch 'fix/refactoring_benchmarks' into feat/layernorm
pommedeterresautee Sep 13, 2022
c2b8ade
feat: add naive implem of layernorm
pommedeterresautee Sep 13, 2022
cca52e8
Merge branch 'main' into feat/layernorm
pommedeterresautee Sep 13, 2022
1100ef6
fix: store mean/var in layernorm (for bw pass)
pommedeterresautee Sep 13, 2022
2cedbfa
fix: following review comments
pommedeterresautee Sep 14, 2022
7f3aefa
fix: add manual seed
pommedeterresautee Sep 15, 2022
88 changes: 82 additions & 6 deletions implementations/layer_norm.py
@@ -2,12 +2,87 @@

import triton
import triton.language as tl
from triton import JITFunction


# CREDITS: Initially inspired by the Triton tutorial


@triton.jit
def _layer_norm_fwd_fused(
def _layer_norm_fwd_fused_single_pass(
Contributor:

I think it's missing documentation and naming: what is A? What size? Stride of what, and which dimension?

Member Author:

added

Out,
A,
Weight,
Bias,
Mean, std,
stride, N, eps,
BLOCK_SIZE: tl.constexpr,
):
"""
Layernorm based on Welford's variance computation algorithm.
https://changyaochen.github.io/welford/
https://en.wikipedia.org/wiki/Algorithms_for_calculating_variance

:param Out: output tensor
:param A: input tensor
:param Weight: weights applied to the normalized input
:param Bias: bias added to the normalized input
:param Mean: save mean tensor for backward
:param std: save reciprocal standard deviation (rstd) tensor for backward
:param stride: stride of the input tensor
:param N: number of elements per row in the input tensor
:param eps: epsilon value to avoid division by zero
:param BLOCK_SIZE: number of columns processed per loop iteration (Triton block size)
:return: None
"""
# position of elements processed by this program
_idx = tl.program_id(0)
out_ptr = Out + _idx * stride
a_ptr = A + _idx * stride
# compute mean
mean = 0.0
var = 0.0
for start_n_offset in range(0, N, BLOCK_SIZE):
end_n_offset = min((start_n_offset + BLOCK_SIZE), N)
nb_block_cols = end_n_offset - start_n_offset
column_offset = start_n_offset + tl.arange(0, BLOCK_SIZE)
mask = column_offset < N
# eviction policy below has little impact now because of the new implementation. Kept as is.
a = tl.load(a_ptr + column_offset, mask=mask, other=0., eviction_policy="evict_last").to(tl.float32)

block_mean = tl.sum(a, axis=0) / nb_block_cols
# mean is 0 or has a mask applied to it, no need to mask delta_mean!
delta_mean = block_mean - mean
delta_mean_sqr = delta_mean * delta_mean

block_delta = tl.sum((a - block_mean) * a, axis=0)
# mean has a mask!
mean += tl.sum((a - mean) * mask, axis=0) / end_n_offset
var += block_delta + delta_mean_sqr * (start_n_offset * nb_block_cols) / end_n_offset

var = var / N
rstd = 1 / tl.sqrt(var + eps)
Contributor:

What does rstd mean? Root std?

Member Author:

changed


# write-back mean/rstd for backward pass
tl.store(Mean + _idx, mean)
tl.store(std + _idx, rstd)

# multiply by weight and add bias
for off in range(0, N, BLOCK_SIZE):
column_offset = off + tl.arange(0, BLOCK_SIZE)
mask = column_offset < N
weight = tl.load(Weight + column_offset, mask=mask)
bias = tl.load(Bias + column_offset, mask=mask)
# eviction policy helps to keep weights in cache (reused by other threads)
a = tl.load(a_ptr + column_offset, mask=mask, other=0., eviction_policy="evict_first").to(tl.float32)
a_hat = (a - mean) * rstd
out = a_hat * weight + bias
# write-back
tl.store(out_ptr + column_offset, out, mask=mask)


@triton.jit
def _layer_norm_fwd_fused_multi_pass(
Out,
A,
Weight,
@@ -49,18 +124,19 @@ def _layer_norm_fwd_fused(
a = tl.load(A + cols, mask=mask, other=0., eviction_policy="evict_first").to(tl.float32)
a_hat = (a - mean) * rstd
out = a_hat * weight + bias
# # write-back
# write-back
tl.store(Out + cols, out, mask=mask)


def layer_norm_forward(a: torch.Tensor, weight: torch.Tensor, bias: torch.Tensor, eps: float):
def layer_norm_forward(a: torch.Tensor, weight: torch.Tensor, bias: torch.Tensor, eps: float, implementation: JITFunction = _layer_norm_fwd_fused_single_pass):
# allocate output
out = torch.empty_like(a)
# reshape input data into 2D tensor
a_arg = a.reshape(-1, a.shape[-1])
M, N = a_arg.shape
# tensors below for backward pass
mean = torch.empty((M,), dtype=torch.float32, device="cuda")
rstd = torch.empty((M,), dtype=torch.float32, device="cuda")
std = torch.empty((M,), dtype=torch.float32, device="cuda")
# Less than 64KB per feature: enqueue fused kernel
MAX_FUSED_SIZE = 65536 // a.element_size()
BLOCK_SIZE = min(MAX_FUSED_SIZE, triton.next_power_of_2(N))
@@ -69,12 +145,12 @@ def layer_norm_forward(a: torch.Tensor, weight: torch.Tensor, bias: torch.Tensor
# heuristics for number of warps
num_warps = min(max(BLOCK_SIZE // 256, 1), 8)
eps = min(eps, 1e-6) # >= 1e-5 may decrease Bert accuracy
_layer_norm_fwd_fused[(M,)](
implementation[(M,)](
out,
a_arg,
weight,
bias,
mean, rstd,
mean, std,
a_arg.stride(0), N, eps,
BLOCK_SIZE=BLOCK_SIZE,
num_warps=num_warps,
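As a note for readers of this diff: the chunked mean/variance update in _layer_norm_fwd_fused_single_pass is Chan's parallel formulation of Welford's algorithm, where each block contributes its own mean and sum of squared deviations (M2) and the running statistics are merged with a delta^2 * n_seen * n_block / n_total correction. Below is a minimal PyTorch sketch of that merge for a single row; it is illustrative only (not part of this PR), and block_size and the tolerances are arbitrary choices.

import torch

def chunked_mean_var(row: torch.Tensor, block_size: int = 128):
    """Single pass over `row` in chunks, merging per-chunk statistics (Welford/Chan)."""
    n_seen, mean, m2 = 0, 0.0, 0.0
    for start in range(0, row.numel(), block_size):
        block = row[start:start + block_size].double()
        n_block = block.numel()
        n_total = n_seen + n_block
        block_mean = block.mean()
        delta = block_mean - mean
        # Chan's merge. The kernel computes the block M2 as sum((a - block_mean) * a),
        # which equals sum((a - block_mean) ** 2) because sum(a - block_mean) == 0.
        m2 += ((block - block_mean) ** 2).sum() + delta * delta * n_seen * n_block / n_total
        mean = mean + delta * n_block / n_total
        n_seen = n_total
    return mean, m2 / row.numel()  # biased variance, as layer norm uses

row = torch.randn(4096)
mean, var = chunked_mean_var(row)
assert torch.allclose(mean.float(), row.mean(), atol=1e-4)
assert torch.allclose(var.float(), row.var(unbiased=False), atol=1e-4)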
23 changes: 18 additions & 5 deletions test/test_layer_norm.py
@@ -4,17 +4,30 @@
import pytest

from implementations.cuda_graph import cuda_graphs_wrapper
from implementations.layer_norm import layer_norm_forward
from implementations.layer_norm import layer_norm_forward, _layer_norm_fwd_fused_single_pass, \
_layer_norm_fwd_fused_multi_pass


def pytorch_naive(a: torch.Tensor, weight: torch.Tensor, bias: torch.Tensor, eps: float):
mean = a.mean(dim=-1, keepdim=True)
var = a.var(dim=-1, unbiased=False, keepdim=True)  # layer norm uses the biased variance
rstd = 1 / torch.sqrt(var + eps)
a_hat = (a - mean) * rstd
out = a_hat * weight + bias
return out


implementations: dict[str, Callable[[torch.Tensor, torch.Tensor, torch.Tensor, float], torch.Tensor]] = {
"pytorch": lambda x, weight, bias, eps: torch.nn.functional.layer_norm(x, weight.shape, weight, bias, eps),
"triton": lambda x, weight, bias, eps: layer_norm_forward(x, weight, bias, eps),
"triton_original": lambda x, weight, bias, eps: layer_norm_forward(x, weight, bias, eps, _layer_norm_fwd_fused_multi_pass),
"triton_improved": lambda x, weight, bias, eps: layer_norm_forward(x, weight, bias, eps, _layer_norm_fwd_fused_single_pass),
"pytorch_naive": lambda x, weight, bias, eps: pytorch_naive(x, weight, bias, eps),
}


@pytest.mark.parametrize("shape", [128, 512, 1024, 2048, 4096], ids=lambda x: f"shape={x}x{x}")
@pytest.mark.parametrize("shape", [128, 512, 1024, 2048, 4096, 8192], ids=lambda x: f"shape={x}x{x}")
@pytest.mark.parametrize("cuda_graphs", [True, False], ids=["cuda_graphs", "no_cuda_graphs"])
@pytest.mark.parametrize("implementation", ["pytorch", "triton"])
@pytest.mark.parametrize("implementation", ["triton_original", "triton_improved", "pytorch", "pytorch_naive"])
def test_benchmark_layer_norm(benchmark, shape: int, cuda_graphs: bool, implementation: str):
assert implementation in implementations, f"Unknown implementation: {implementation}"

@@ -39,4 +52,4 @@ def inference(x, *args, **kwargs):

value = benchmark(inference, x, weight, bias, eps)

assert torch.allclose(value, expected, atol=1e-2)
assert torch.allclose(value, expected, atol=1e-1)
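For a quick standalone check outside pytest, the two Triton variants can be exercised directly through layer_norm_forward; the sketch below assumes a CUDA device, the module layout of this repository, and reuses the 1e-1 fp16 tolerance from the test above.

import torch
from implementations.layer_norm import (
    layer_norm_forward,
    _layer_norm_fwd_fused_single_pass,
    _layer_norm_fwd_fused_multi_pass,
)

torch.manual_seed(0)
x = torch.randn((8, 512), device="cuda", dtype=torch.float16)
weight = torch.randn(512, device="cuda", dtype=torch.float16)
bias = torch.randn(512, device="cuda", dtype=torch.float16)
eps = 1e-5

expected = torch.nn.functional.layer_norm(x, weight.shape, weight, bias, eps)
for kernel in (_layer_norm_fwd_fused_single_pass, _layer_norm_fwd_fused_multi_pass):
    out = layer_norm_forward(x, weight, bias, eps, kernel)
    assert torch.allclose(out.float(), expected.float(), atol=1e-1)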
1 change: 1 addition & 0 deletions test/test_linear_layer.py
@@ -40,6 +40,7 @@ def test_benchmark(benchmark, shape: Shape, bias: bool, activation: str, contigu
batch, M, N, K = dataclasses.astuple(shape)

# order of dimensions is wrong so we force contiguous call

a = torch.randn((batch, K, M), device='cuda', dtype=torch.float16, requires_grad=False)
a = a.mT
if contiguous:
26 changes: 14 additions & 12 deletions test/test_torchdynamo_bert.py
@@ -10,7 +10,7 @@


@pytest.fixture
def model_baseline_fp32():
def model_reference_fp32():
return get_model_baseline(float_16=False)


@@ -48,35 +48,37 @@ def get_input_non_causal(shape: (int, int)) -> Dict[str, torch.Tensor]:
}


@pytest.mark.parametrize("input_shape", [(1, 16), (1, 128), (1, 256), (1, 384), (1, 512),
(8, 16), (8, 128), (8, 256), (8, 384), (8, 512),
(32, 16), (32, 128), (32, 256),
], ids=lambda x: f"{x[0]}x{x[1]}")
@pytest.mark.parametrize("shape", [(1, 16), (1, 128), (1, 256), (1, 384), (1, 512),
(8, 16), (8, 128), (8, 256), (8, 384), (8, 512),
(32, 16), (32, 128), (32, 256),
], ids=lambda x: f"{x[0]}x{x[1]}")
@pytest.mark.parametrize("implementation", implementations.keys())
def test_benchmark_implementations(benchmark, model_baseline_fp32, input_shape: (int, int), implementation: str):
def test_benchmark_implementations(benchmark, model_reference_fp32, shape: (int, int), implementation: str):
torch.manual_seed(0)
assert implementation in implementations, f"unknown implementation: {implementation}"
model_tested = implementations[implementation]

inputs = get_input_causal(input_shape) if model_tested.is_causal else get_input_non_causal(input_shape)
inputs = get_input_causal(shape) if model_tested.is_causal else get_input_non_causal(shape)

with torch.inference_mode():
expected = model_baseline_fp32(**inputs)
expected = model_reference_fp32(**inputs)
model = model_tested.model()
value = benchmark(model, **inputs)

torchdynamo.reset()

assert torch.allclose(input=value["last_hidden_state"].float(), other=expected["last_hidden_state"], rtol=1e-1, atol=1e-1)
assert torch.allclose(input=value["last_hidden_state"].float(), other=expected["last_hidden_state"], rtol=1e-1,
atol=1e-1)
assert torch.allclose(input=value["pooler_output"].float(), other=expected["pooler_output"], rtol=1e-1, atol=1e-1)


def test_support_shape_change(model_baseline_fp32):
def test_support_shape_change(model_reference_fp32):
"""Test that the model can handle shape changes without being reloaded/rebuilt."""
for name, implementation in implementations.items():
model_tested = implementation.model()
for shape in [(1, 64), (8, 256), (16, 256), (16, 64)]:
pytorch_input = get_input_causal(shape) if implementation.is_causal else get_input_non_causal(shape)
expected = model_baseline_fp32(**pytorch_input)
expected = model_reference_fp32(**pytorch_input)
result = model_tested(**pytorch_input)
assert torch.allclose(result["last_hidden_state"].float(), expected["last_hidden_state"], atol=1e-1), f"failed on {name} with shape {shape}"
assert torch.allclose(result["last_hidden_state"].float(), expected["last_hidden_state"],
atol=1e-1), f"failed on {name} with shape {shape}"