Skip to content

Commit

Permalink
#11838: Update files
Browse files Browse the repository at this point in the history
  • Loading branch information
VirdhatchaniKN committed Sep 7, 2024
1 parent d8b9194 commit f2f3e06
Show file tree
Hide file tree
Showing 93 changed files with 133 additions and 133 deletions.
Original file line number Diff line number Diff line change
Expand Up @@ -61,7 +61,7 @@ execute, we receive back:

.. code-block:: cpp
auto [num_cores, all_cores, core_group_1, core_group_2, num_output_tiles_per_core_group_1, num_output_tiles_per_core_group_2] = split_work_to_cores(compute_with_storage_grid_size, num_output_tiles_total);
auto [num_cores, all_cores, core_group_1, core_group_2, num_output_tiles_per_core_group_1, num_output_tiles_per_core_group_2] = tt::tt_metal::split_work_to_cores(compute_with_storage_grid_size, num_output_tiles_total);
The reason why we may have two separate sets of cores and tile counts is
because depending on the grid size, it may not be possible to evenly distribute
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -27,7 +27,7 @@ using std::chrono::microseconds;
// This test measures the bandwidth of DRAM accesses of Tensix cores. It creates
// a bfloat16 format DRAM buffer of a given input size. Every Tensix cores read
// from or write to the buffer whrere the amount of each core accesses is
// determined by split_work_to_cores function.
// determined by tt::tt_metal::split_work_to_cores function.
//
// Disclaimer:
// - This benchmark is designed to support an input size larger than 4GB. But
Expand Down Expand Up @@ -176,7 +176,7 @@ int main(int argc, char **argv) {
uint32_t num_cores_y = compute_with_storage_grid_size.y;
auto
[num_cores, all_cores, core_group_1, core_group_2, num_tiles_per_core_group_1, num_tiles_per_core_group_2] =
split_work_to_cores(compute_with_storage_grid_size, num_tiles);
tt::tt_metal::split_work_to_cores(compute_with_storage_grid_size, num_tiles);

log_info(
LogTest,
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -29,7 +29,7 @@ using std::chrono::microseconds;
// This test measures the bandwidth of DRAM accesses of Tensix cores. It creates
// a bfloat16 format DRAM buffer of a given input size. Every Tensix cores read
// from or write to the buffer whrere the amount of each core accesses is
// determined by split_work_to_cores function.
// determined by tt::tt_metal::split_work_to_cores function.
//
// Disclaimer:
// - This benchmark is designed to support an input size larger than 4GB. But
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -38,7 +38,7 @@ inline std::tuple<uint32_t, uint32_t> get_max_cores_divisible_by_tiles_per_core_
}

// Finds the maximum even divisor of val starting at start_max_div and below
inline int find_max_divisor(uint32_t val, uint32_t start_max_div) {
inline int tt::tt_metal:: find_max_divisor(uint32_t val, uint32_t start_max_div) {
int result = 1;
for (int find_divisor = start_max_div; find_divisor >= 1; find_divisor--) {
if (find_divisor == 7 || find_divisor == 5)
Expand All @@ -51,7 +51,7 @@ inline int find_max_divisor(uint32_t val, uint32_t start_max_div) {
return result;
}

inline std::set<CoreRange> num_cores_to_corerange_set(
inline std::set<CoreRange> tt::tt_metal:: num_cores_to_corerange_set(
uint32_t target_num_cores, CoreCoord grid_size, bool row_wise = false) {
uint32_t num_cores_x = grid_size.x;
uint32_t num_cores_y = grid_size.y;
Expand Down Expand Up @@ -98,11 +98,11 @@ inline std::set<CoreRange> num_cores_to_corerange_set(
// evenly divided If it can be evenly divided, the second CoreRangeSet is the
// same as the first, and the last is empty The last 2 args are the units of
// work for the two core grids
inline std::tuple<uint32_t, CoreRangeSet, CoreRangeSet, CoreRangeSet, uint32_t, uint32_t> split_work_to_cores(
inline std::tuple<uint32_t, CoreRangeSet, CoreRangeSet, CoreRangeSet, uint32_t, uint32_t> tt::tt_metal::split_work_to_cores(
CoreCoord grid_size, uint32_t units_to_divide) {
uint32_t num_cores_x = grid_size.x, num_cores_y = grid_size.y;
auto target_num_cores = std::min(units_to_divide, num_cores_x * num_cores_y);
CoreRangeSet all_cores(num_cores_to_corerange_set(target_num_cores, grid_size));
CoreRangeSet all_cores(tt::tt_metal:: num_cores_to_corerange_set(target_num_cores, grid_size));

std::set<CoreRange> core_group_1_set;
std::set<CoreRange> core_group_2_set;
Expand All @@ -116,7 +116,7 @@ inline std::tuple<uint32_t, CoreRangeSet, CoreRangeSet, CoreRangeSet, uint32_t,
// full grid of cores which is implicitly assumed in the following logic
} else {
// Group of cores that do more work
core_group_2_set = num_cores_to_corerange_set(units_to_divide % target_num_cores, grid_size);
core_group_2_set = tt::tt_metal:: num_cores_to_corerange_set(units_to_divide % target_num_cores, grid_size);
auto last_block_group_2 = (*core_group_2_set.rbegin());
auto last_block_all_cores = (*all_cores.ranges().rbegin());
// Case where only the last column is divided between core group 1 and 2
Expand Down
18 changes: 9 additions & 9 deletions tt_metal/common/work_split.hpp
Original file line number Diff line number Diff line change
Expand Up @@ -50,7 +50,7 @@ inline std::tuple<uint32_t, uint32_t> get_max_cores_divisible_by_tiles_per_core_
}

// Finds the maximum even divisor of val starting at start_max_div and below
inline int find_max_divisor(uint32_t val, uint32_t start_max_div) {
inline int tt::tt_metal:: find_max_divisor(uint32_t val, uint32_t start_max_div) {
int result = 1;
for (int find_divisor = start_max_div; find_divisor >= 1; find_divisor--) {
if (find_divisor == 7 || find_divisor == 5)
Expand All @@ -74,7 +74,7 @@ inline int find_max_block_size(uint32_t val, uint32_t max_block_size = 8) {
return result;
}

inline std::set<CoreRange> num_cores_to_corerange_set(
inline std::set<CoreRange> tt::tt_metal:: num_cores_to_corerange_set(
const CoreCoord start_core,
const uint32_t target_num_cores,
const CoreCoord grid_size,
Expand Down Expand Up @@ -149,28 +149,28 @@ inline std::set<CoreRange> num_cores_to_corerange_set(
}

// TODO: Get rid of old function
inline std::set<CoreRange> num_cores_to_corerange_set(
inline std::set<CoreRange> tt::tt_metal:: num_cores_to_corerange_set(
const uint32_t target_num_cores, const CoreCoord grid_size, const bool row_wise = false) {
return num_cores_to_corerange_set({0, 0}, target_num_cores, grid_size, row_wise);
return tt::tt_metal:: num_cores_to_corerange_set({0, 0}, target_num_cores, grid_size, row_wise);
}

// TODO: Switch num_cores_to_corerange_set to always return CoreRangeSet
// TODO: Switch tt::tt_metal:: num_cores_to_corerange_set to always return CoreRangeSet
inline CoreRangeSet num_cores_to_core_range_set(
const uint32_t target_num_cores, const CoreCoord grid_size, const bool row_wise = false) {
return CoreRangeSet(num_cores_to_corerange_set({0, 0}, target_num_cores, grid_size, row_wise));
return CoreRangeSet(tt::tt_metal:: num_cores_to_corerange_set({0, 0}, target_num_cores, grid_size, row_wise));
}

// This function takes in the core grid size, as well as the number of units of work to divide between the cores
// This function returns the number of cores, the CoreRangeSet of all cores, and then the CoreRangeSet that does
// the greater amount of work, and the CoreRangeSet that does less work if work cannot be evenly divided
// If it can be evenly divided, the second CoreRangeSet is the same as the first, and the last is empty
// The last 2 args are the units of work for the two core grids
inline std::tuple<uint32_t, CoreRangeSet, CoreRangeSet, CoreRangeSet, uint32_t, uint32_t> split_work_to_cores(
inline std::tuple<uint32_t, CoreRangeSet, CoreRangeSet, CoreRangeSet, uint32_t, uint32_t> tt::tt_metal::split_work_to_cores(
const CoreCoord grid_size, const uint32_t units_to_divide, const bool row_wise = false) {
ZoneScoped;
uint32_t num_cores_x = grid_size.x, num_cores_y = grid_size.y;
auto target_num_cores = std::min(units_to_divide, num_cores_x * num_cores_y);
CoreRangeSet all_cores(num_cores_to_corerange_set(target_num_cores, grid_size, row_wise));
CoreRangeSet all_cores(tt::tt_metal:: num_cores_to_corerange_set(target_num_cores, grid_size, row_wise));

std::set<CoreRange> core_group_1_set;
std::set<CoreRange> core_group_2_set;
Expand All @@ -184,7 +184,7 @@ inline std::tuple<uint32_t, CoreRangeSet, CoreRangeSet, CoreRangeSet, uint32_t,
// which is implicitly assumed in the following logic
} else {
// Group of cores that do more work
core_group_1_set = num_cores_to_corerange_set(units_to_divide % target_num_cores, grid_size, row_wise);
core_group_1_set = tt::tt_metal:: num_cores_to_corerange_set(units_to_divide % target_num_cores, grid_size, row_wise);
auto last_block_group_1 = (*core_group_1_set.rbegin());
auto last_block_all_cores = (*all_cores.ranges().rbegin());
if (row_wise) {
Expand Down
2 changes: 1 addition & 1 deletion tt_metal/programming_examples/matmul_common/work_split.hpp
Original file line number Diff line number Diff line change
Expand Up @@ -106,7 +106,7 @@ inline std::set<CoreRange> num_cores_to_corerange_set(uint32_t target_num_cores,
// the greater amount of work, and the CoreRangeSet that does less work if work cannot be evenly divided
// If it can be evenly divided, the second CoreRangeSet is the same as the first, and the last is empty
// The last 2 args are the units of work for the two core grids
inline std::tuple<uint32_t, CoreRangeSet, CoreRangeSet, CoreRangeSet, uint32_t, uint32_t> split_work_to_cores(CoreCoord grid_size, uint32_t units_to_divide, bool row_wise = false) {
inline std::tuple<uint32_t, CoreRangeSet, CoreRangeSet, CoreRangeSet, uint32_t, uint32_t> tt::tt_metal::split_work_to_cores(CoreCoord grid_size, uint32_t units_to_divide, bool row_wise = false) {
uint32_t num_cores_x = grid_size.x, num_cores_y = grid_size.y;
auto target_num_cores = std::min(units_to_divide, num_cores_x * num_cores_y);
CoreRangeSet all_cores(num_cores_to_corerange_set(target_num_cores, grid_size, row_wise));
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -70,7 +70,7 @@ void matmul_multi_core(vector<bfloat16>& a, vector<bfloat16>& b, vector<bfloat16
* Use a helper function to deduce the splits needed to co-operatively do
* this matmul.
*/
auto [num_cores, all_cores, core_group_1, core_group_2, num_output_tiles_per_core_group_1, num_output_tiles_per_core_group_2] = split_work_to_cores(compute_with_storage_grid_size, num_output_tiles_total);
auto [num_cores, all_cores, core_group_1, core_group_2, num_output_tiles_per_core_group_1, num_output_tiles_per_core_group_2] = tt::tt_metal::split_work_to_cores(compute_with_storage_grid_size, num_output_tiles_total);

/*
* Extracting Matrix dimensions from input/output vectors
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -52,7 +52,7 @@ operation::ProgramWithCallbacks moreh_adam_(
const auto num_cores_y = grid.y;

auto [num_cores, all_cores, core_group_1, core_group_2, num_tiles_per_core_group_1, num_tiles_per_core_group_2] =
split_work_to_cores(grid, num_tiles);
tt::tt_metal::split_work_to_cores(grid, num_tiles);

auto arch = param_in.device()->arch();
auto [math_fidelity, math_approx_mode, fp32_dest_acc_en, packer_l1_acc] =
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -58,7 +58,7 @@ operation::ProgramWithCallbacks moreh_adamw_(

auto
[num_cores, all_cores, core_group_1, core_group_2, num_units_per_core_group_1, num_units_per_core_group_2] =
split_work_to_cores(core_range, num_units);
tt::tt_metal::split_work_to_cores(core_range, num_units);


auto arch = param_in.device()->arch();
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -33,7 +33,7 @@ operation::ProgramWithCallbacks moreh_arange_(
uint32_t core_h = core_range.end_coord.y - core_range.start_coord.y + 1;

auto [num_cores, all_cores, core_group_1, core_group_2, num_tiles_per_core_group_1, num_tiles_per_core_group_2] =
split_work_to_cores(core_range, units_to_divide);
tt::tt_metal::split_work_to_cores(core_range, units_to_divide);

auto element_size = output.element_size();

Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -60,7 +60,7 @@ operation::ProgramWithCallbacks moreh_clip_grad_norm_step1_impl(
core_group_1,
core_group_2,
num_inputs_per_core_group_1,
num_inputs_per_core_group_2] = split_work_to_cores(grid, num_inputs);
num_inputs_per_core_group_2] = tt::tt_metal::split_work_to_cores(grid, num_inputs);
TT_ASSERT(core_group_2.ranges().empty());
TT_ASSERT(num_inputs_per_core_group_1 == 1);
TT_ASSERT(num_inputs_per_core_group_2 == 0);
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -47,7 +47,7 @@ operation::ProgramWithCallbacks moreh_clip_grad_norm_step3_impl(
core_group_1,
core_group_2,
num_inputs_per_core_group_1,
num_inputs_per_core_group_2] = split_work_to_cores(grid, num_inputs);
num_inputs_per_core_group_2] = tt::tt_metal::split_work_to_cores(grid, num_inputs);
TT_ASSERT(core_group_2.ranges().empty());
TT_ASSERT(num_inputs_per_core_group_1 == 1);
TT_ASSERT(num_inputs_per_core_group_2 == 0);
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -70,7 +70,7 @@ operation::ProgramWithCallbacks moreh_cumsum_nc(
core_group_1,
core_group_2,
num_cols_per_core_group_1,
num_cols_per_core_group_2] = split_work_to_cores(grid, num_tiles_per_chip);
num_cols_per_core_group_2] = tt::tt_metal::split_work_to_cores(grid, num_tiles_per_chip);

////////////////////////////////////////////////////////////////////////////
// CircularBuffer Setup
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -88,7 +88,7 @@ operation::ProgramWithCallbacks moreh_getitem_rm(
uint32_t core_h = core_range.end_coord.y - core_range.start_coord.y + 1;

auto [num_cores, all_cores, core_group_1, core_group_2, num_units_per_core_group_1, num_units_per_core_group_2] =
split_work_to_cores(core_range, num_units);
tt::tt_metal::split_work_to_cores(core_range, num_units);

Program program = Program();

Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -110,7 +110,7 @@ operation::ProgramWithCallbacks moreh_getitem_tilized(

auto
[num_cores, all_cores, core_group_1, core_group_2, num_units_per_core_group_1, num_units_per_core_group_2] =
split_work_to_cores(core_range, num_units);
tt::tt_metal::split_work_to_cores(core_range, num_units);

Program program = Program();

Expand Down Expand Up @@ -388,7 +388,7 @@ operation::ProgramWithCallbacks moreh_getitem_tilized(

auto
[num_cores, all_cores, core_group_1, core_group_2, num_units_per_core_group_1, num_units_per_core_group_2] =
split_work_to_cores(core_range, num_units);
tt::tt_metal::split_work_to_cores(core_range, num_units);

Program program = Program();

Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -111,7 +111,7 @@ operation::ProgramWithCallbacks moreh_groupnorm_impl(
core_group_1,
core_group_2,
num_rows_per_core_group_1,
num_rows_per_core_group_2] = split_work_to_cores(grid, num_rows);
num_rows_per_core_group_2] = tt::tt_metal::split_work_to_cores(grid, num_rows);

log_debug(LogTest, fmt::format("num_cores_to_be_used: {}", num_cores_to_be_used).c_str());
log_debug(LogTest, fmt::format("num_rows_per_core_group_1: {}", num_rows_per_core_group_1).c_str());
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -85,7 +85,7 @@ operation::ProgramWithOptionalOutputTensors moreh_groupnorm_backward_gamma_beta_
core_group_1,
core_group_2,
num_channels_per_core_group_1,
num_channels_per_core_group_2] = split_work_to_cores(grid, num_channels);
num_channels_per_core_group_2] = tt::tt_metal::split_work_to_cores(grid, num_channels);

log_debug(LogTest, fmt::format("num_cores_to_be_used: {}", num_cores_to_be_used).c_str());
log_debug(LogTest, fmt::format("num_channels_per_core_group_1: {}", num_channels_per_core_group_1).c_str());
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -81,7 +81,7 @@ operation::ProgramWithCallbacks moreh_groupnorm_backward_input_grad_impl(
core_group_1,
core_group_2,
num_rows_per_core_group_1,
num_rows_per_core_group_2] = split_work_to_cores(grid, num_rows);
num_rows_per_core_group_2] = tt::tt_metal::split_work_to_cores(grid, num_rows);

log_debug(LogTest, fmt::format("num_cores_to_be_used: {}", num_cores_to_be_used).c_str());
log_debug(LogTest, fmt::format("num_rows_per_core_group_1: {}", num_rows_per_core_group_1).c_str());
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -50,7 +50,7 @@ std::tuple<CoreRangeSet, CoreRangeSet, CoreRangeSet> add_core_offset(
return std::make_tuple(new_all_cores, new_core_group_1, new_core_group_2);
}

std::tuple<uint32_t, CoreRangeSet, CoreRangeSet, CoreRangeSet, uint32_t, uint32_t> split_work_to_cores(
std::tuple<uint32_t, CoreRangeSet, CoreRangeSet, CoreRangeSet, uint32_t, uint32_t> tt::tt_metal::split_work_to_cores(
CoreRange core_range, uint32_t units_to_divide) {
uint32_t core_w = core_range.end_coord.x - core_range.start_coord.x + 1;
uint32_t core_h = core_range.end_coord.y - core_range.start_coord.y + 1;
Expand All @@ -61,7 +61,7 @@ std::tuple<uint32_t, CoreRangeSet, CoreRangeSet, CoreRangeSet, uint32_t, uint32_
core_group_1_t,
core_group_2_t,
num_tiles_per_core_group_1,
num_tiles_per_core_group_2] = split_work_to_cores(grid_size, units_to_divide);
num_tiles_per_core_group_2] = tt::tt_metal::split_work_to_cores(grid_size, units_to_divide);

auto core_x_offset = core_range.start_coord.x;
auto core_y_offset = core_range.start_coord.y;
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -60,7 +60,7 @@ inline bool is_same_shape(const Tensor &tensor_a, const Tensor &tensor_b) {
std::tuple<CoreRangeSet, CoreRangeSet, CoreRangeSet> add_core_offset(
CoreRangeSet all_cores, CoreRangeSet core_group_1, CoreRangeSet core_group_2, uint32_t offset_x, uint32_t offset_y);

std::tuple<uint32_t, CoreRangeSet, CoreRangeSet, CoreRangeSet, uint32_t, uint32_t> split_work_to_cores(
std::tuple<uint32_t, CoreRangeSet, CoreRangeSet, CoreRangeSet, uint32_t, uint32_t> tt::tt_metal::split_work_to_cores(
CoreRange core_range, uint32_t units_to_divide);

[[maybe_unused]] KernelHandle CreateReadKernel(
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -115,7 +115,7 @@ operation::ProgramWithCallbacks moreh_layernorm_impl(
core_group_1,
core_group_2,
num_rows_per_core_group_1,
num_rows_per_core_group_2] = split_work_to_cores(grid, num_outer);
num_rows_per_core_group_2] = tt::tt_metal::split_work_to_cores(grid, num_outer);

auto arch = input.device()->arch();
auto [math_fidelity, math_approx_mode, fp32_dest_acc_en, packer_l1_acc] =
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -79,7 +79,7 @@ operation::ProgramWithCallbacks moreh_layernorm_backward_gamma_beta_grad_impl(
core_group_1,
core_group_2,
num_cols_per_core_group_1,
num_cols_per_core_group_2] = split_work_to_cores(grid, num_inner);
num_cols_per_core_group_2] = tt::tt_metal::split_work_to_cores(grid, num_inner);

auto arch = input.device()->arch();
auto [math_fidelity, math_approx_mode, fp32_dest_acc_en, packer_l1_acc] =
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -90,7 +90,7 @@ operation::ProgramWithCallbacks moreh_layernorm_backward_input_grad_impl(
core_group_1,
core_group_2,
num_rows_per_core_group_1,
num_rows_per_core_group_2] = split_work_to_cores(grid, num_outer);
num_rows_per_core_group_2] = tt::tt_metal::split_work_to_cores(grid, num_outer);

auto arch = input.device()->arch();
auto [math_fidelity, math_approx_mode, fp32_dest_acc_en, packer_l1_acc] =
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -58,7 +58,7 @@ operation::ProgramWithCallbacks moreh_bias_backward_multi_core_h(const Tensor &o
core_group_1,
core_group_2,
num_cols_per_core_group_1,
num_cols_per_core_group_2] = split_work_to_cores(grid, Wt);
num_cols_per_core_group_2] = tt::tt_metal::split_work_to_cores(grid, Wt);

////////////////////////////////////////////////////////////////////////////
// CircularBuffer Setup
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -193,7 +193,7 @@ operation::ProgramWithCallbacks moreh_matmul_multi_core(
core_group_1,
core_group_2,
num_output_tiles_per_core_group_1,
num_output_tiles_per_core_group_2] = split_work_to_cores(grid, num_output_tiles);
num_output_tiles_per_core_group_2] = tt::tt_metal::split_work_to_cores(grid, num_output_tiles);

log_debug(LogOp, "{}:{} num_output_tiles: {}", __func__, __LINE__, num_output_tiles);
log_debug(LogOp, "{}:{} num_output_tiles_per_core_group1: {}, 2: {} ", __func__, __LINE__, num_output_tiles_per_core_group_1, num_output_tiles_per_core_group_2);
Expand Down
Loading

0 comments on commit f2f3e06

Please sign in to comment.