Skip to content

Commit

Permalink
Merge branch 'main' into eigh_tridiagonal_paolot
Browse files Browse the repository at this point in the history
  • Loading branch information
paolot-gc committed Oct 20, 2023
2 parents 8f63bd4 + 5f247c4 commit 922e4d4
Show file tree
Hide file tree
Showing 15 changed files with 475 additions and 175 deletions.
1 change: 1 addition & 0 deletions tessellate_ipu/core/__init__.py
Original file line number Diff line number Diff line change
Expand Up @@ -45,6 +45,7 @@
primitive_clone,
primitive_num_inout_alias_args,
)
from .tile_interpreter_vertex_utils import make_ipu_vector1d_worker_offsets, make_ipu_vector1d_worker_offsets_and_sizes


def tessellate_ipu_cleanup():
Expand Down
10 changes: 10 additions & 0 deletions tessellate_ipu/core/tile_array.py
Original file line number Diff line number Diff line change
@@ -1,8 +1,10 @@
# Copyright (c) 2022 Graphcore Ltd. All rights reserved.
import itertools
from dataclasses import dataclass
from typing import Any, Sequence, Tuple, Union

import chex
import jax.lax
import numpy as np
from jax.core import ShapedArray
from jax.interpreters.xla import DeviceArray
Expand Down Expand Up @@ -185,6 +187,14 @@ def __getitem__(self, key: Union[SliceType, MultiSliceType]) -> "TileShardedArra
check_tile_array_multi_slice(key, self.array.shape)
return TileShardedArray(array=self.array[key], tiles=self.tiles[key[0]]) # type:ignore

@classmethod
def concatenate(cls, arrays: Sequence["TileShardedArray"]) -> "TileShardedArray":
    """Build a single tile sharded array by joining `arrays` on the first axis.

    The data of every input is concatenated along dimension 0, and the tiles
    of the inputs are chained in the same order.

    Args:
        arrays: Tile sharded arrays to join, in output order.
    Returns:
        Tile sharded array holding the joined data and the chained tiles.
    """
    assert all([isinstance(entry, TileShardedArray) for entry in arrays])
    joined_data = jax.lax.concatenate([entry.array for entry in arrays], dimension=0)
    joined_tiles = tuple(itertools.chain.from_iterable([entry.tiles for entry in arrays]))
    return TileShardedArray(array=joined_data, tiles=joined_tiles)


def tile_put_sharded(array: DeviceArray, tiles: Sequence[int]) -> TileShardedArray:
"""Shard a JAX array over tiles on the first axis.
Expand Down
88 changes: 78 additions & 10 deletions tessellate_ipu/core/tile_interpreter_vertex_utils.py
Original file line number Diff line number Diff line change
@@ -1,6 +1,6 @@
# Copyright (c) 2022 Graphcore Ltd. All rights reserved.
import math
from typing import List
from typing import List, Optional

import numpy as np
from numpy.typing import DTypeLike, NDArray
Expand All @@ -25,37 +25,105 @@ def make_num_elements_per_worker(N: int, num_workers: int) -> NDArray[np.int32]:
return num_elements


def make_ipu_vector1d_worker_offsets_and_sizes(
    size: int,
    vector_size: int = 2,
    num_workers: int = 6,
    wdtype: DTypeLike = np.uint16,
    allow_overlap: bool = False,
    grain_size: Optional[int] = None,
) -> NDArray[np.int_]:
    """Split a 1D array workload between worker threads, as (offset, size) pairs.

    Work is handed out in multiples of the grain size: the first workers all
    get the same share, the next worker gets whatever is left (rounded up to a
    full grain and anchored at the end of the array, hence possibly overlapping
    the previous worker), and any other worker keeps a zero offset and size.

    Args:
        size: Size of the vector to divide.
        vector_size: Vector size (2: float, 4: half).
        num_workers: Number of workers.
        wdtype: Worklists dtype.
        allow_overlap: Allow overlap between workers, which makes the remainder term easier to handle.
        grain_size: Optional grain size, i.e. minimal size per thread. `vector_size` by default.
    Returns:
        (NUM_WORKERS, 2) array with the data offset + size of every worker thread.
        NOTE: offsets and sizes are expressed in vector size unit!
    """
    if not grain_size:
        grain_size = vector_size
    vectors_per_grain = grain_size // vector_size
    # TODO: support properly odd size.
    assert size % 2 == 0, "Not supporting odd sizing at the moment."
    # Base checks!
    assert grain_size % vector_size == 0
    assert size >= grain_size, f"Requires at least a size of {grain_size}."
    assert (
        size % grain_size == 0 or allow_overlap
    ), f"Requires the size, {size}, divisible by the grain size {grain_size} (or overlap allowed)."

    # One (offset, size) row per worker; rows which are never written stay at zero.
    worklist = np.zeros((num_workers, 2), dtype=np.int32)

    # Workers with a full share: same number of grains each, laid out back to back.
    grains_per_worker: int = math.ceil(size / (grain_size * num_workers))
    num_full_workers = size // (grain_size * grains_per_worker)
    full_rows = worklist[:num_full_workers]
    full_rows[:, 0] = np.arange(num_full_workers) * grains_per_worker * vectors_per_grain
    full_rows[:, 1] = grains_per_worker * vectors_per_grain

    if num_full_workers != num_workers:
        # Remainder term, for the next thread => all which is left, with potential overlap.
        leftover = size - grains_per_worker * grain_size * num_full_workers
        leftover_grains = math.ceil(leftover / grain_size)
        worklist[num_full_workers, 0] = size / vector_size - leftover_grains * vectors_per_grain
        worklist[num_full_workers, 1] = leftover_grains * vectors_per_grain
    return worklist.astype(wdtype)


def make_ipu_vector1d_worker_offsets(
size: int, vector_size: int = 2, num_workers: int = 6, wdtype: DTypeLike = np.uint16
size: int,
vector_size: int = 2,
num_workers: int = 6,
wdtype: DTypeLike = np.uint16,
grain_size: Optional[int] = None,
) -> NDArray[np.int_]:
"""Make the QR householder row update worker sizes, i.e. how many
"""Make worker offsets (with additional padding) i.e. how many
data vectors per worker thread?
Args:
size: Size of the vector to divide.
vector_size: Vector size (2: float, 4: half).
num_workers: Number of workers.
wdtype: Worklists dtype.
grain_size: Optional grain size. vector_size by default.
Returns:
(6,) number of data vectors per thread.
(NUM_WORKERS + 1,) data offset per worker thread.
"""
grain_size = grain_size or vector_size
grain_scale = grain_size // vector_size

def make_offsets_fn(sizes):
sizes = [0] + sizes
offsets = np.cumsum(np.array(sizes, wdtype), dtype=wdtype)
offsets = np.cumsum(np.array(sizes, wdtype) * grain_scale, dtype=wdtype)
return offsets

assert size % vector_size == 0
# TODO: support properly odd size.
assert size % 2 == 0, "Not supporting odd sizing at the moment."
# Base checks!
assert grain_size % vector_size == 0
assert size >= grain_size, f"Requires at least a size of {grain_size}."
assert size % grain_size == 0, f"Requires the size, {size}, divisible by the grain size {grain_size}."

# Base worksize on the first few workers.
base_worksize: int = math.ceil(size / (vector_size * num_workers))
num_base_workers = size // (vector_size * base_worksize)
base_worksize: int = math.ceil(size / (grain_size * num_workers))
num_base_workers = size // (grain_size * base_worksize)
worker_sizes: List[int] = [base_worksize] * num_base_workers
if num_base_workers == num_workers:
return make_offsets_fn(worker_sizes)

# Remainder term, for the next thread.
rem_worksize = size - base_worksize * vector_size * num_base_workers
rem_worksize = rem_worksize // vector_size
rem_worksize = size - base_worksize * grain_size * num_base_workers
rem_worksize = rem_worksize // grain_size
worker_sizes += [rem_worksize]
# Fill the rest with zeros.
unused_workers = num_workers - num_base_workers - 1
Expand Down
68 changes: 18 additions & 50 deletions tessellate_ipu/core/vertex/intrinsics_utils.hpp
Original file line number Diff line number Diff line change
Expand Up @@ -64,6 +64,7 @@ ALWAYS_INLINE T ipu_div_by_6(T n) noexcept {
*/
ALWAYS_INLINE void __builtin_ipu_put_tas(float v) noexcept {
// TAS register, used for __builtin_ipu_f32v2axpy.
// TODO: use `__builtin_ipu_uput`?
asm volatile(
R"l( uput $TAS, %[sv]
)l"
Expand All @@ -72,6 +73,20 @@ ALWAYS_INLINE void __builtin_ipu_put_tas(float v) noexcept {
:);
}

/**
 * @brief Zero AACC registers.
 *
 * Loads the immediate 0x8 into `$a0` and writes it to `$FP_CLR`; `$a0` is
 * declared as clobbered so the compiler does not keep a live value in it.
 * NOTE(review): 0x8 is presumably the `$FP_CLR` bit selecting the AACC
 * registers - confirm against the ISA documentation.
 */
ALWAYS_INLINE void __builtin_ipu_aacc_zero() {
asm (R"(
setzi $a0, 0x8
uput $FP_CLR, $a0
)"
:
:
: "$a0");
}


/**
* @brief IPU cmac f32 instruction.
*/
Expand All @@ -97,13 +112,6 @@ ALWAYS_INLINE float ld32(const T* address, unsigned offset) {
return result;
}

// TAS + AXPY helper where both methods forward directly to the intrinsics.
// NOTE(review): presumably the IPU hardware variant, as it sits before the
// `#else` branch - confirm against the enclosing `#if`.
struct __ipu_and_ipumodel_tas {
// Write `v` to the TAS register through `__builtin_ipu_put_tas`.
void put(float v) { __builtin_ipu_put_tas(v); }
// Forward to the `f32v2axpy` intrinsic, keeping the (x, y) argument order.
float2 f32v2axpy(float2 const& x, float2 const& y) {
return __builtin_ipu_f32v2axpy(x, y);
}
};

#else

#include <limits>
Expand Down Expand Up @@ -137,60 +145,20 @@ IpuVector<T, N> fma(IpuVector<T, N> const& x, IpuVector<T, N> const& y,

} // namespace ipu

// Reflect IPU's AXPY semantics in a way that is IPUModel compatible
// IPU-only usage:
// __builtin_ipu_put_tas(v);
// z_prev = __builtin_ipu_f32v2axpy(x, y)
//
// IPUModel-compatible usage:
// __ipu_and_ipumodel_tas tas;
// tas.put(v);
// z_prev = tas.f32v2axpy(x, y)
//
// https://docs.graphcore.ai/projects/poplar-api/en/latest/ipu_intrinsics/ipu_builtins.html#_CPPv423__builtin_ipu_f32v2axpy6float26float2
// IPUModel-compatible variant: TAS is a plain member, and `f32v2axpy` hands
// back the value computed by the previous call.
struct __ipu_and_ipumodel_tas {
// Simulated TAS register (scaling factor used by `f32v2axpy`).
float tas;
// Value computed by the previous `f32v2axpy` call, returned by the next one.
float2 prev;

// Start with the TAS value and the pending result both at zero.
__ipu_and_ipumodel_tas() : tas{0}, prev{0, 0} {}

// Set the simulated TAS register.
void put(float v) { tas = v; }

// Return the value stored by the previous call, then store
// `tas * y + x` (element-wise) for the next call.
float2 f32v2axpy(float2 const& x, float2 const& y) {
// Read the pending result before overwriting it.
const auto res = prev;
prev = float2{
// TODO: understand ordering!?
// tas * x[0] + y[0],
// tas * x[1] + y[1],
tas * y[0] + x[0],
tas * y[1] + x[1],
};
return res;
}
};

// And give useful error messages when people port from IPU to IPUModel, e.g.
/* clang-format off */ // need these error messages on one line
/*
/workspaces/tessellate-ipu/tessellate/tile/vertex/intrinsics_utils.hpp:166:3: error: static_assert failed due to requirement '__ipu_false<IpuVector<float, 2>>()': *** Replace __builtin_ipu_f32v2axpy with __ipu_and_ipumodel_tas for TAS handling on IPUModel.
static_assert(__ipu_false<T>(), "*** Replace __builtin_ipu_f32v2axpy with __ipu_and_ipumodel_tas for TAS handling on IPUModel.");
^ ~~~~~~~~~~~~~~~~
/workspaces/tessellate-ipu/tessellate/tile/vertex/tile_qr_vertex.cpp:231:12: note: in instantiation of function template specialization '__builtin_ipu_f32v2axpy<IpuVector<float, 2>>' requested here
rout = __builtin_ipu_f32v2axpy(rtmp, rtmp);
*/
// And give useful error messages when people port from IPU to IPUModel.
// Always-false predicate which depends on `T`: a `static_assert` using it only
// fires when the enclosing template is actually instantiated.
template <typename T>
constexpr bool __ipu_false() {
// `std::is_same<T, T>` is always true, hence the negation is always false.
return !std::is_same<T, T>::value;
}

template <typename T>
void __builtin_ipu_put_tas(T v) {
static_assert(__ipu_false<T>(), "*** Replace __builtin_ipu_put_tas with __ipu_and_ipumodel_tas for TAS handling on IPUModel.");
static_assert(__ipu_false<T>(), "*** Please use `ipu::AMP` class for TAS handling on IPUModel.");
}

template <typename T>
T __builtin_ipu_f32v2axpy(T const& x, T const& y) {
static_assert(__ipu_false<T>(), "*** Replace __builtin_ipu_f32v2axpy with __ipu_and_ipumodel_tas for TAS handling on IPUModel.");
static_assert(__ipu_false<T>(), "*** Please use `ipu::AMP::axpy` for `f32v2axpy` intrinsic on IPUModel.");
return T{};
}
// clang-format on
Expand Down
127 changes: 127 additions & 0 deletions tessellate_ipu/core/vertex/ipu_amp.hpp
Original file line number Diff line number Diff line change
@@ -0,0 +1,127 @@
// Copyright (c) 2023 Graphcore Ltd. All rights reserved.
#pragma once
#include <type_traits>

#include "intrinsics_utils.hpp"
#include "ipu_model_types.hpp"

namespace ipu {

/**
 * @brief Thin abstraction of the IPU AMP unit(s) and registers, allowing
 * to write generic code compiling on IPU model and IPU hardware.
 *
 * NOTE: zero-cost abstraction on IPU hardware.
 *
 * The AMP class is modelling AACC registers as well as AMP unit instructions
 * on the IPU model, reproducing the expected behaviour of the hardware.
 * On the IPU model, the simulated registers are zero-initialized, such that
 * reading them before the first `tas`/`aaccZero` call is well defined.
 */
template <typename T>
class AMP {
 public:
  // TODO: support half as well.
  static_assert(std::is_same_v<T, float>);
  using FPType = T;
  /** Number of AACC register available in hw. */
  // TODO: use TFPU_AMP_UNITS_PER_SET and TFPU_AACC_PER_AMP_UNIT;
  static constexpr unsigned NumAACC = 16;

  // TODO: random initialization on IPU model of registers.
  AMP() noexcept = default;
  // No copy + no move allowed!
  AMP(const AMP&) = delete;
  AMP(AMP&&) = delete;

  /**
   * @brief Set the value of the TAS register, used in
   * `axpy` operation.
   * @param val New value of the TAS register.
   */
  ALWAYS_INLINE void tas(FPType val) noexcept {
#ifdef __IPU__
    __builtin_ipu_put_tas(val);
#else
    m_tas = val;
#endif
  }

  /**
   * @brief Zero AACC registers.
   */
  ALWAYS_INLINE void aaccZero() noexcept {
#ifdef __IPU__
    __builtin_ipu_aacc_zero();
#else
    for (unsigned idx = 0; idx < NumAACC; ++idx) {
      m_aacc[idx] = 0;
    }
#endif
  }

  /**
   * @brief Scaled-add `axpy` intrinsic. Only supported on FP32.
   * NOTE: act as 1 stage pipeline, storing result in AACC[0...2]
   * @return Result of the previous `axpy` call (content of AACC[0] and
   * AACC[2] on the IPU model).
   */
  ALWAYS_INLINE float2 axpy(float2 x, float2 y) noexcept {
#ifdef __IPU__
    // Weird ordering here? Bug in the intrinsic definition?
    return __builtin_ipu_f32v2axpy(y, x);
#else
    using T2 = float2;
    // Simulating pipeline with storing in AACC[0] and AACC[2]:
    // previous result is read first, before being overwritten.
    const auto res = T2{m_aacc[0], m_aacc[2]};
    // FIXME/TODO: understand ordering!?
    m_aacc[0] = m_tas * x[0] + y[0];
    m_aacc[2] = m_tas * x[1] + y[1];
    return res;
#endif
  }

  /**
   * @brief Outer-product `aop` intrinsic. Only supported on FP32.
   * Storing results in AACC[0...6]
   */
  ALWAYS_INLINE void aop(float2 x, float2 y) noexcept {
#ifdef __IPU__
    // Note: third argument not used by hw.
    __builtin_ipu_f32v2aop(x, y, 0);
#else
    // Multiply + accumulate.
    m_aacc[0] += x[0] * y[0];
    m_aacc[2] += x[1] * y[0];
    m_aacc[4] += x[0] * y[1];
    m_aacc[6] += x[1] * y[1];
#endif
  }

  /**
   * @brief `gina` instruction: get AACC register + propagate.
   * FIXME: support non-zero flag/index (`FLAG` is currently ignored, zero
   * is always used).
   * @param val Values pushed at the end of the accumulator chain.
   * @return Content of AACC[0] and AACC[2] before propagation (IPU model).
   */
  template <unsigned int FLAG>
  ALWAYS_INLINE float2 gina(float2 val) noexcept {
#ifdef __IPU__
    return __builtin_ipu_f32v2gina(val, 0);
#else
    using T2 = float2;
    // TODO: implement GINA_IMMFLAGS__SET__GET
    const auto res = T2{m_aacc[0], m_aacc[2]};
    // Propagate accumulator states: every even register pair moves down by 4.
    for (unsigned idx = 4; idx < NumAACC; idx += 4) {
      m_aacc[idx - 4] = m_aacc[idx];
      m_aacc[idx - 2] = m_aacc[idx + 2];
    }
    m_aacc[NumAACC - 4] = val[0];
    m_aacc[NumAACC - 2] = val[1];
    return res;
#endif
  }

 private:
#ifndef __IPU__
  // Simulating AACC registers on IPU model. Zero-initialized: reading an
  // uninitialized register would be undefined behaviour on the host.
  FPType m_aacc[NumAACC] = {};
  // Simulating TAS register on IPU model (zero-initialized as well).
  FPType m_tas = 0;
#endif
};

} // namespace ipu
Loading

0 comments on commit 922e4d4

Please sign in to comment.