Merge branch 'main' into eigh_tridiagonal_paolot

graphcore-research · Oct 20, 2023 · 922e4d4 · 922e4d4
2 parents 8f63bd4 + 5f247c4
commit 922e4d4
Show file tree

Hide file tree

Showing 15 changed files with 475 additions and 175 deletions.
diff --git a/tessellate_ipu/core/__init__.py b/tessellate_ipu/core/__init__.py
@@ -45,6 +45,7 @@
     primitive_clone,
     primitive_num_inout_alias_args,
 )
+from .tile_interpreter_vertex_utils import make_ipu_vector1d_worker_offsets, make_ipu_vector1d_worker_offsets_and_sizes
 
 
 def tessellate_ipu_cleanup():

diff --git a/tessellate_ipu/core/tile_array.py b/tessellate_ipu/core/tile_array.py
@@ -1,8 +1,10 @@
 # Copyright (c) 2022 Graphcore Ltd. All rights reserved.
+import itertools
 from dataclasses import dataclass
 from typing import Any, Sequence, Tuple, Union
 
 import chex
+import jax.lax
 import numpy as np
 from jax.core import ShapedArray
 from jax.interpreters.xla import DeviceArray
@@ -185,6 +187,14 @@ def __getitem__(self, key: Union[SliceType, MultiSliceType]) -> "TileShardedArra
         check_tile_array_multi_slice(key, self.array.shape)
         return TileShardedArray(array=self.array[key], tiles=self.tiles[key[0]])  # type:ignore
 
+    @classmethod
+    def concatenate(cls, arrays: Sequence["TileShardedArray"]) -> "TileShardedArray":
+        """Concatenate tile sharded arrays along the first axis."""
+        assert all([isinstance(v, TileShardedArray) for v in arrays])
+        outarray = jax.lax.concatenate([v.array for v in arrays], dimension=0)
+        outtiles = tuple(itertools.chain(*[v.tiles for v in arrays]))
+        return TileShardedArray(array=outarray, tiles=outtiles)
+
 
 def tile_put_sharded(array: DeviceArray, tiles: Sequence[int]) -> TileShardedArray:
     """Shard a JAX array over tiles on the first axis.

diff --git a/tessellate_ipu/core/tile_interpreter_vertex_utils.py b/tessellate_ipu/core/tile_interpreter_vertex_utils.py
@@ -1,6 +1,6 @@
 # Copyright (c) 2022 Graphcore Ltd. All rights reserved.
 import math
-from typing import List
+from typing import List, Optional
 
 import numpy as np
 from numpy.typing import DTypeLike, NDArray
@@ -25,37 +25,105 @@ def make_num_elements_per_worker(N: int, num_workers: int) -> NDArray[np.int32]:
     return num_elements
 
 
+def make_ipu_vector1d_worker_offsets_and_sizes(
+    size: int,
+    vector_size: int = 2,
+    num_workers: int = 6,
+    wdtype: DTypeLike = np.uint16,
+    allow_overlap: bool = False,
+    grain_size: Optional[int] = None,
+) -> NDArray[np.int_]:
+    """Make worker sizes + offsets for a 1D array workload, i.e. how many
+    data vectors per worker thread (with starting offset)?
+
+    Args:
+        size: Size of the vector to divide.
+        vector_size: Vector size (2: float, 4: half).
+        num_workers: Number of workers.
+        wdtype: Worklists dtype.
+        allow_overlap: Allow overlap between workers. Makes it easier to deal with the remainder term.
+        grain_size: Optional grain size, i.e. the minimal size per thread. vector_size by default.
+    Returns:
+        (NUM_WORKERS, 2) data offset + size per worker thread.
+
+            NOTE: offsets and sizes are expressed in units of the vector size!
+    """
+    grain_size = grain_size or vector_size
+    grain_scale = grain_size // vector_size
+    # TODO: properly support odd sizes.
+    assert size % 2 == 0, "Not supporting odd sizing at the moment."
+    # Base checks!
+    assert grain_size % vector_size == 0
+    assert size >= grain_size, f"Requires at least a size of {grain_size}."
+    assert (
+        size % grain_size == 0 or allow_overlap
+    ), f"Requires the size, {size}, divisible by the grain size {grain_size} (or overlap allowed)."
+
+    # Offset+size array to build.
+    offset_size_arr = np.zeros((num_workers, 2), dtype=np.int32)
+
+    # Base worksize on the first few workers.
+    base_worksize: int = math.ceil(size / (grain_size * num_workers))
+    num_base_workers = size // (grain_size * base_worksize)
+    # Offsets + size
+    offset_size_arr[:num_base_workers, 0] = np.arange(num_base_workers) * base_worksize * grain_scale
+    offset_size_arr[:num_base_workers, 1] = base_worksize * grain_scale
+    if num_base_workers == num_workers:
+        return offset_size_arr.astype(wdtype)
+
+    # Remainder term, for the next thread => everything that is left, with potential overlap.
+    rem_worksize = size - base_worksize * grain_size * num_base_workers
+    rem_worksize = math.ceil(rem_worksize / grain_size)
+    offset_size_arr[num_base_workers, 0] = size / vector_size - rem_worksize * grain_scale
+    offset_size_arr[num_base_workers, 1] = rem_worksize * grain_scale
+    # Rest already filled with zeros...
+    return offset_size_arr.astype(wdtype)
+
+
 def make_ipu_vector1d_worker_offsets(
-    size: int, vector_size: int = 2, num_workers: int = 6, wdtype: DTypeLike = np.uint16
+    size: int,
+    vector_size: int = 2,
+    num_workers: int = 6,
+    wdtype: DTypeLike = np.uint16,
+    grain_size: Optional[int] = None,
 ) -> NDArray[np.int_]:
-    """Make the QR householder row update worker sizes, i.e. how many
+    """Make worker offsets (with additional padding) i.e. how many
     data vectors per worker thread?
 
     Args:
         size: Size of the vector to divide.
         vector_size: Vector size (2: float, 4: half).
         num_workers: Number of workers.
         wdtype: Worklists dtype.
+        grain_size: Optional grain size. vector_size by default.
     Returns:
-        (6,) number of data vectors per thread.
+        (NUM_WORKERS + 1,) data offset per worker thread.
     """
+    grain_size = grain_size or vector_size
+    grain_scale = grain_size // vector_size
 
     def make_offsets_fn(sizes):
         sizes = [0] + sizes
-        offsets = np.cumsum(np.array(sizes, wdtype), dtype=wdtype)
+        offsets = np.cumsum(np.array(sizes, wdtype) * grain_scale, dtype=wdtype)
         return offsets
 
-    assert size % vector_size == 0
+    # TODO: properly support odd sizes.
+    assert size % 2 == 0, "Not supporting odd sizing at the moment."
+    # Base checks!
+    assert grain_size % vector_size == 0
+    assert size >= grain_size, f"Requires at least a size of {grain_size}."
+    assert size % grain_size == 0, f"Requires the size, {size}, divisible by the grain size {grain_size}."
+
     # Base worksize on the first few workers.
-    base_worksize: int = math.ceil(size / (vector_size * num_workers))
-    num_base_workers = size // (vector_size * base_worksize)
+    base_worksize: int = math.ceil(size / (grain_size * num_workers))
+    num_base_workers = size // (grain_size * base_worksize)
     worker_sizes: List[int] = [base_worksize] * num_base_workers
     if num_base_workers == num_workers:
         return make_offsets_fn(worker_sizes)
 
     # Remainer term, for the next thread.
-    rem_worksize = size - base_worksize * vector_size * num_base_workers
-    rem_worksize = rem_worksize // vector_size
+    rem_worksize = size - base_worksize * grain_size * num_base_workers
+    rem_worksize = rem_worksize // grain_size
     worker_sizes += [rem_worksize]
     # Fill the rest with zeros.
     unused_workers = num_workers - num_base_workers - 1

diff --git a/tessellate_ipu/core/vertex/intrinsics_utils.hpp b/tessellate_ipu/core/vertex/intrinsics_utils.hpp
@@ -64,6 +64,7 @@ ALWAYS_INLINE T ipu_div_by_6(T n) noexcept {
  */
 ALWAYS_INLINE void __builtin_ipu_put_tas(float v) noexcept {
   // TAS register, used for __builtin_ipu_f32v2axpy.
+  // TODO: use `__builtin_ipu_uput`?
   asm volatile(
       R"l( uput $TAS, %[sv]
         )l"
@@ -72,6 +73,20 @@ ALWAYS_INLINE void __builtin_ipu_put_tas(float v) noexcept {
       :);
 }
 
+/**
+ * @brief Zero AACC registers.
+ */
+ALWAYS_INLINE void __builtin_ipu_aacc_zero() {
+  asm (R"(
+    setzi $a0, 0x8
+    uput $FP_CLR, $a0
+  )"
+      :
+      :
+      : "$a0");
+}
+
+
 /**
  * @brief IPU cmac f32 instruction.
  */
@@ -97,13 +112,6 @@ ALWAYS_INLINE float ld32(const T* address, unsigned offset) {
   return result;
 }
 
-struct __ipu_and_ipumodel_tas {
-  void put(float v) { __builtin_ipu_put_tas(v); }
-  float2 f32v2axpy(float2 const& x, float2 const& y) {
-    return __builtin_ipu_f32v2axpy(x, y);
-  }
-};
-
 #else
 
 #include <limits>
@@ -137,60 +145,20 @@ IpuVector<T, N> fma(IpuVector<T, N> const& x, IpuVector<T, N> const& y,
 
 }  // namespace ipu
 
-// Reflect IPU's AXPY semantics in a way that is IPUModel compatible
-// IPU-only usage:
-//   __builtin_ipu_put_tas(v);
-//   z_prev = __builtin_ipu_f32v2axpy(x, y)
-//
-// IPUModel-compatible usage:
-//   __ipu_and_ipumodel_tas tas;
-//   tas.put(v);
-//   z_prev = tas.f32v2axpy(x, y)
-//
-// https://docs.graphcore.ai/projects/poplar-api/en/latest/ipu_intrinsics/ipu_builtins.html#_CPPv423__builtin_ipu_f32v2axpy6float26float2
-struct __ipu_and_ipumodel_tas {
-  float tas;
-  float2 prev;
-
-  __ipu_and_ipumodel_tas() : tas{0}, prev{0, 0} {}
-
-  void put(float v) { tas = v; }
-
-  float2 f32v2axpy(float2 const& x, float2 const& y) {
-    const auto res = prev;
-    prev = float2{
-        // TODO: understand ordering!?
-        // tas * x[0] + y[0],
-        // tas * x[1] + y[1],
-        tas * y[0] + x[0],
-        tas * y[1] + x[1],
-    };
-    return res;
-  }
-};
-
-// And give useful error messages when people port from IPU to IPUModel, e.g.
-/* clang-format off */ // need these error messages on one line
-/*
-/workspaces/tessellate-ipu/tessellate/tile/vertex/intrinsics_utils.hpp:166:3: error: static_assert failed due to requirement '__ipu_false<IpuVector<float, 2>>()': *** Replace __builtin_ipu_f32v2axpy with __ipu_and_ipumodel_tas for TAS handling on IPUModel.
-  static_assert(__ipu_false<T>(), "*** Replace __builtin_ipu_f32v2axpy with __ipu_and_ipumodel_tas for TAS handling on IPUModel.");
-  ^             ~~~~~~~~~~~~~~~~
-/workspaces/tessellate-ipu/tessellate/tile/vertex/tile_qr_vertex.cpp:231:12: note: in instantiation of function template specialization '__builtin_ipu_f32v2axpy<IpuVector<float, 2>>' requested here
-    rout = __builtin_ipu_f32v2axpy(rtmp, rtmp);
-*/
+// And give useful error messages when people port from IPU to IPUModel.
 template <typename T>
 constexpr bool __ipu_false() {
   return !std::is_same<T, T>::value;
 }
 
 template <typename T>
 void __builtin_ipu_put_tas(T v) {
-  static_assert(__ipu_false<T>(), "*** Replace __builtin_ipu_put_tas with __ipu_and_ipumodel_tas for TAS handling on IPUModel.");
+  static_assert(__ipu_false<T>(), "*** Please use `ipu::AMP` class for TAS handling on IPUModel.");
 }
 
 template <typename T>
 T __builtin_ipu_f32v2axpy(T const& x, T const& y) {
-  static_assert(__ipu_false<T>(), "*** Replace __builtin_ipu_f32v2axpy with __ipu_and_ipumodel_tas for TAS handling on IPUModel.");
+  static_assert(__ipu_false<T>(), "*** Please use `ipu::AMP::axpy` for `f32v2axpy` intrinsic on IPUModel.");
   return T{};
 }
 // clang-format on

diff --git a/tessellate_ipu/core/vertex/ipu_amp.hpp b/tessellate_ipu/core/vertex/ipu_amp.hpp
@@ -0,0 +1,127 @@
+// Copyright (c) 2023 Graphcore Ltd. All rights reserved.
+#pragma once
+#include <type_traits>
+
+#include "intrinsics_utils.hpp"
+#include "ipu_model_types.hpp"
+
+namespace ipu {
+
+/**
+ * @brief Thin abstraction of the IPU AMP unit(s) and registers, making it
+ * possible to write generic code that compiles on IPU model and IPU hardware.
+ *
+ * NOTE: zero-cost abstraction on IPU hardware.
+ *
+ * The AMP class is modelling AACC registers as well as AMP unit instructions
+ * on the IPU model, reproducing the expected behaviour of the hardware.
+ */
+template <typename T>
+class AMP {
+ public:
+  // TODO: support half as well.
+  static_assert(std::is_same_v<T, float>);
+  using FPType = T;
+  /** Number of AACC registers available in hw. */
+  // TODO: use TFPU_AMP_UNITS_PER_SET and TFPU_AACC_PER_AMP_UNIT;
+  static constexpr unsigned NumAACC = 16;
+
+  // TODO: random initialization on IPU model of registers.
+  AMP() noexcept = default;
+  // No copy + no move allowed!
+  AMP(const AMP&) = delete;
+  AMP(AMP&&) = delete;
+
+  /**
+   * @brief Set the value of the TAS register, used in
+   * `axpy` operation.
+   */
+  ALWAYS_INLINE void tas(FPType val) noexcept {
+#ifdef __IPU__
+    __builtin_ipu_put_tas(val);
+#else
+    m_tas = val;
+#endif
+  }
+  /**
+   * @brief Zero AACC registers.
+   */
+  ALWAYS_INLINE void aaccZero() noexcept {
+#ifdef __IPU__
+    __builtin_ipu_aacc_zero();
+#else
+    for (unsigned idx = 0; idx < NumAACC; ++idx) {
+      m_aacc[idx] = 0;
+    }
+#endif
+  }
+
+  /**
+   * @brief Scaled-add `axpy` intrinsic. Only supported on FP32.
+   * NOTE: acts as a 1-stage pipeline, storing the result in AACC[0...2]
+   */
+  ALWAYS_INLINE float2 axpy(float2 x, float2 y) noexcept {
+    using T2 = float2;
+#ifdef __IPU__
+    // Weird ordering here? Bug in the intrinsic definition?
+    return __builtin_ipu_f32v2axpy(y, x);
+#else
+    // Simulating pipeline with storing in AACC[0] and AACC[2].
+    const auto res = T2{m_aacc[0], m_aacc[2]};
+    // FIXME/TODO: understand ordering!?
+    m_aacc[0] = m_tas * x[0] + y[0];
+    m_aacc[2] = m_tas * x[1] + y[1];
+    return res;
+#endif
+  }
+
+  /**
+   * @brief Outer-product `aop` intrinsic. Only supported on FP32.
+   * Storing results in AACC[0...6]
+   */
+  void aop(float2 x, float2 y) noexcept {
+#ifdef __IPU__
+    // Note: third argument not used by hw.
+    __builtin_ipu_f32v2aop(x, y, 0);
+#else
+    // Multiply + accumulate.
+    m_aacc[0] += x[0] * y[0];
+    m_aacc[2] += x[1] * y[0];
+    m_aacc[4] += x[0] * y[1];
+    m_aacc[6] += x[1] * y[1];
+#endif
+  }
+
+  /**
+   * @brief `gina` instruction: get AACC register + propagate.
+   * FIXME: support non-zero flag/index.
+   */
+  template <unsigned int FLAG>
+  float2 gina(float2 val) noexcept {
+    using T2 = float2;
+#ifdef __IPU__
+    return __builtin_ipu_f32v2gina(val, 0);
+#else
+    // TODO: implement GINA_IMMFLAGS__SET__GET
+    const auto res = T2{m_aacc[0], m_aacc[2]};
+    // Propagate accumulator states.
+    for (unsigned idx = 4; idx < NumAACC; idx += 4) {
+      m_aacc[idx - 4] = m_aacc[idx];
+      m_aacc[idx - 2] = m_aacc[idx + 2];
+    }
+    m_aacc[NumAACC - 4] = val[0];
+    m_aacc[NumAACC - 2] = val[1];
+    return res;
+#endif
+  }
+
+ private:
+#ifndef __IPU__
+  // Simulating AACC registers on IPU model.
+  FPType m_aacc[NumAACC];
+  // Simulating TAS register on IPU model.
+  FPType m_tas;
+#endif
+};
+
+}  // namespace ipu