#11838: Update files

tenstorrent · Sep 7, 2024 · f2f3e06 · f2f3e06
1 parent d8b9194
commit f2f3e06
Show file tree

Hide file tree

Showing 93 changed files with 133 additions and 133 deletions.
diff --git a/docs/source/tt-metalium/tt_metal/examples/matmul_multi_core.rst b/docs/source/tt-metalium/tt_metal/examples/matmul_multi_core.rst
@@ -61,7 +61,7 @@ execute, we receive back:
 
 .. code-block:: cpp
 
-    auto [num_cores, all_cores, core_group_1, core_group_2, num_output_tiles_per_core_group_1, num_output_tiles_per_core_group_2] = split_work_to_cores(compute_with_storage_grid_size, num_output_tiles_total);
+    auto [num_cores, all_cores, core_group_1, core_group_2, num_output_tiles_per_core_group_1, num_output_tiles_per_core_group_2] = tt::tt_metal::split_work_to_cores(compute_with_storage_grid_size, num_output_tiles_total);
 
 The reason why we may have two separate sets of cores and tile counts is
 because depending on the grid size, it may not be possible to evenly distribute

diff --git a/tests/tt_metal/tt_metal/perf_microbenchmark/6_dram_offchip/test_dram_offchip.cpp b/tests/tt_metal/tt_metal/perf_microbenchmark/6_dram_offchip/test_dram_offchip.cpp
@@ -27,7 +27,7 @@ using std::chrono::microseconds;
 // This test measures the bandwidth of DRAM accesses of Tensix cores. It creates
 // a bfloat16 format DRAM buffer of a given input size. Every Tensix core reads
 // from or writes to the buffer, where the amount each core accesses is
-// determined by split_work_to_cores function.
+// determined by tt::tt_metal::split_work_to_cores function.
 //
 // Disclaimer:
 //   - This benchmark is designed to support an input size larger than 4GB. But
@@ -176,7 +176,7 @@ int main(int argc, char **argv) {
         uint32_t num_cores_y = compute_with_storage_grid_size.y;
         auto
             [num_cores, all_cores, core_group_1, core_group_2, num_tiles_per_core_group_1, num_tiles_per_core_group_2] =
-                split_work_to_cores(compute_with_storage_grid_size, num_tiles);
+                tt::tt_metal::split_work_to_cores(compute_with_storage_grid_size, num_tiles);
 
         log_info(
             LogTest,

diff --git a/tests/tt_metal/tt_metal/perf_microbenchmark/8_dram_adjacent_core_read/test_dram_read.cpp b/tests/tt_metal/tt_metal/perf_microbenchmark/8_dram_adjacent_core_read/test_dram_read.cpp
@@ -29,7 +29,7 @@ using std::chrono::microseconds;
 // This test measures the bandwidth of DRAM accesses of Tensix cores. It creates
 // a bfloat16 format DRAM buffer of a given input size. Every Tensix core reads
 // from or writes to the buffer, where the amount each core accesses is
-// determined by split_work_to_cores function.
+// determined by tt::tt_metal::split_work_to_cores function.
 //
 // Disclaimer:
 //   - This benchmark is designed to support an input size larger than 4GB. But

diff --git a/tests/tt_metal/tt_metal/perf_microbenchmark/common/work_split.hpp b/tests/tt_metal/tt_metal/perf_microbenchmark/common/work_split.hpp
@@ -38,7 +38,7 @@ inline std::tuple<uint32_t, uint32_t> get_max_cores_divisible_by_tiles_per_core_
 }
 
 // Finds the maximum even divisor of val starting at start_max_div and below
-inline int find_max_divisor(uint32_t val, uint32_t start_max_div) {
+inline int tt::tt_metal:: find_max_divisor(uint32_t val, uint32_t start_max_div) {
     int result = 1;
     for (int find_divisor = start_max_div; find_divisor >= 1; find_divisor--) {
         if (find_divisor == 7 || find_divisor == 5)
@@ -51,7 +51,7 @@ inline int find_max_divisor(uint32_t val, uint32_t start_max_div) {
     return result;
 }
 
-inline std::set<CoreRange> num_cores_to_corerange_set(
+inline std::set<CoreRange> tt::tt_metal:: num_cores_to_corerange_set(
     uint32_t target_num_cores, CoreCoord grid_size, bool row_wise = false) {
     uint32_t num_cores_x = grid_size.x;
     uint32_t num_cores_y = grid_size.y;
@@ -98,11 +98,11 @@ inline std::set<CoreRange> num_cores_to_corerange_set(
 // evenly divided. If it can be evenly divided, the second CoreRangeSet is the
 // same as the first, and the last is empty. The last 2 args are the units of
 // work for the two core grids
-inline std::tuple<uint32_t, CoreRangeSet, CoreRangeSet, CoreRangeSet, uint32_t, uint32_t> split_work_to_cores(
+inline std::tuple<uint32_t, CoreRangeSet, CoreRangeSet, CoreRangeSet, uint32_t, uint32_t> tt::tt_metal::split_work_to_cores(
     CoreCoord grid_size, uint32_t units_to_divide) {
     uint32_t num_cores_x = grid_size.x, num_cores_y = grid_size.y;
     auto target_num_cores = std::min(units_to_divide, num_cores_x * num_cores_y);
-    CoreRangeSet all_cores(num_cores_to_corerange_set(target_num_cores, grid_size));
+    CoreRangeSet all_cores(tt::tt_metal:: num_cores_to_corerange_set(target_num_cores, grid_size));
 
     std::set<CoreRange> core_group_1_set;
     std::set<CoreRange> core_group_2_set;
@@ -116,7 +116,7 @@ inline std::tuple<uint32_t, CoreRangeSet, CoreRangeSet, CoreRangeSet, uint32_t,
         // full grid of cores which is implicitly assumed in the following logic
     } else {
         // Group of cores that do more work
-        core_group_2_set = num_cores_to_corerange_set(units_to_divide % target_num_cores, grid_size);
+        core_group_2_set = tt::tt_metal:: num_cores_to_corerange_set(units_to_divide % target_num_cores, grid_size);
         auto last_block_group_2 = (*core_group_2_set.rbegin());
         auto last_block_all_cores = (*all_cores.ranges().rbegin());
         // Case where only the last column is divided between core group 1 and 2

diff --git a/tt_metal/common/work_split.hpp b/tt_metal/common/work_split.hpp
@@ -50,7 +50,7 @@ inline std::tuple<uint32_t, uint32_t> get_max_cores_divisible_by_tiles_per_core_
 }
 
 // Finds the maximum even divisor of val starting at start_max_div and below
-inline int find_max_divisor(uint32_t val, uint32_t start_max_div) {
+inline int tt::tt_metal:: find_max_divisor(uint32_t val, uint32_t start_max_div) {
     int result = 1;
     for (int find_divisor = start_max_div; find_divisor >= 1; find_divisor--) {
         if (find_divisor == 7 || find_divisor == 5)
@@ -74,7 +74,7 @@ inline int find_max_block_size(uint32_t val, uint32_t max_block_size = 8) {
     return result;
 }
 
-inline std::set<CoreRange> num_cores_to_corerange_set(
+inline std::set<CoreRange> tt::tt_metal:: num_cores_to_corerange_set(
     const CoreCoord start_core,
     const uint32_t target_num_cores,
     const CoreCoord grid_size,
@@ -149,28 +149,28 @@ inline std::set<CoreRange> num_cores_to_corerange_set(
 }
 
 // TODO: Get rid of old function
-inline std::set<CoreRange> num_cores_to_corerange_set(
+inline std::set<CoreRange> tt::tt_metal:: num_cores_to_corerange_set(
     const uint32_t target_num_cores, const CoreCoord grid_size, const bool row_wise = false) {
-    return num_cores_to_corerange_set({0, 0}, target_num_cores, grid_size, row_wise);
+    return tt::tt_metal:: num_cores_to_corerange_set({0, 0}, target_num_cores, grid_size, row_wise);
 }
 
-// TODO: Switch num_cores_to_corerange_set to always return CoreRangeSet
+// TODO: Switch tt::tt_metal:: num_cores_to_corerange_set to always return CoreRangeSet
 inline CoreRangeSet num_cores_to_core_range_set(
     const uint32_t target_num_cores, const CoreCoord grid_size, const bool row_wise = false) {
-    return CoreRangeSet(num_cores_to_corerange_set({0, 0}, target_num_cores, grid_size, row_wise));
+    return CoreRangeSet(tt::tt_metal:: num_cores_to_corerange_set({0, 0}, target_num_cores, grid_size, row_wise));
 }
 
 // This function takes in the core grid size, as well as the number of units of work to divide between the cores
 // This function returns the number of cores, the CoreRangeSet of all cores, and then the CoreRangeSet that does
 // the greater amount of work, and the CoreRangeSet that does less work if work cannot be evenly divided
 // If it can be evenly divided, the second CoreRangeSet is the same as the first, and the last is empty
 // The last 2 args are the units of work for the two core grids
-inline std::tuple<uint32_t, CoreRangeSet, CoreRangeSet, CoreRangeSet, uint32_t, uint32_t> split_work_to_cores(
+inline std::tuple<uint32_t, CoreRangeSet, CoreRangeSet, CoreRangeSet, uint32_t, uint32_t> tt::tt_metal::split_work_to_cores(
     const CoreCoord grid_size, const uint32_t units_to_divide, const bool row_wise = false) {
     ZoneScoped;
     uint32_t num_cores_x = grid_size.x, num_cores_y = grid_size.y;
     auto target_num_cores = std::min(units_to_divide, num_cores_x * num_cores_y);
-    CoreRangeSet all_cores(num_cores_to_corerange_set(target_num_cores, grid_size, row_wise));
+    CoreRangeSet all_cores(tt::tt_metal:: num_cores_to_corerange_set(target_num_cores, grid_size, row_wise));
 
     std::set<CoreRange> core_group_1_set;
     std::set<CoreRange> core_group_2_set;
@@ -184,7 +184,7 @@ inline std::tuple<uint32_t, CoreRangeSet, CoreRangeSet, CoreRangeSet, uint32_t,
         // which is implicitly assumed in the following logic
     } else {
         // Group of cores that do more work
-        core_group_1_set = num_cores_to_corerange_set(units_to_divide % target_num_cores, grid_size, row_wise);
+        core_group_1_set = tt::tt_metal:: num_cores_to_corerange_set(units_to_divide % target_num_cores, grid_size, row_wise);
         auto last_block_group_1 = (*core_group_1_set.rbegin());
         auto last_block_all_cores = (*all_cores.ranges().rbegin());
         if (row_wise) {

diff --git a/tt_metal/programming_examples/matmul_common/work_split.hpp b/tt_metal/programming_examples/matmul_common/work_split.hpp
@@ -106,7 +106,7 @@ inline std::set<CoreRange> num_cores_to_corerange_set(uint32_t target_num_cores,
 // the greater amount of work, and the CoreRangeSet that does less work if work cannot be evenly divided
 // If it can be evenly divided, the second CoreRangeSet is the same as the first, and the last is empty
 // The last 2 args are the units of work for the two core grids
-inline std::tuple<uint32_t, CoreRangeSet, CoreRangeSet, CoreRangeSet, uint32_t, uint32_t> split_work_to_cores(CoreCoord grid_size, uint32_t units_to_divide, bool row_wise = false) {
+inline std::tuple<uint32_t, CoreRangeSet, CoreRangeSet, CoreRangeSet, uint32_t, uint32_t> tt::tt_metal::split_work_to_cores(CoreCoord grid_size, uint32_t units_to_divide, bool row_wise = false) {
 	uint32_t num_cores_x = grid_size.x, num_cores_y = grid_size.y;
 	auto target_num_cores = std::min(units_to_divide, num_cores_x * num_cores_y);
 	CoreRangeSet all_cores(num_cores_to_corerange_set(target_num_cores, grid_size, row_wise));

diff --git a/tt_metal/programming_examples/matmul_multi_core/matmul_multi_core.cpp b/tt_metal/programming_examples/matmul_multi_core/matmul_multi_core.cpp
@@ -70,7 +70,7 @@ void matmul_multi_core(vector<bfloat16>& a, vector<bfloat16>& b, vector<bfloat16
      * Use a helper function to deduce the splits needed to co-operatively do
      * this matmul.
      */
-    auto [num_cores, all_cores, core_group_1, core_group_2, num_output_tiles_per_core_group_1, num_output_tiles_per_core_group_2] = split_work_to_cores(compute_with_storage_grid_size, num_output_tiles_total);
+    auto [num_cores, all_cores, core_group_1, core_group_2, num_output_tiles_per_core_group_1, num_output_tiles_per_core_group_2] = tt::tt_metal::split_work_to_cores(compute_with_storage_grid_size, num_output_tiles_total);
 
     /*
     * Extracting Matrix dimensions from input/output vectors

diff --git a/ttnn/cpp/ttnn/deprecated/tt_dnn/op_library/moreh_adam/moreh_adam.cpp b/ttnn/cpp/ttnn/deprecated/tt_dnn/op_library/moreh_adam/moreh_adam.cpp
@@ -52,7 +52,7 @@ operation::ProgramWithCallbacks moreh_adam_(
     const auto num_cores_y = grid.y;
 
     auto [num_cores, all_cores, core_group_1, core_group_2, num_tiles_per_core_group_1, num_tiles_per_core_group_2] =
-        split_work_to_cores(grid, num_tiles);
+        tt::tt_metal::split_work_to_cores(grid, num_tiles);
 
     auto arch = param_in.device()->arch();
     auto [math_fidelity, math_approx_mode, fp32_dest_acc_en, packer_l1_acc] =

diff --git a/ttnn/cpp/ttnn/deprecated/tt_dnn/op_library/moreh_adamw/moreh_adamw.cpp b/ttnn/cpp/ttnn/deprecated/tt_dnn/op_library/moreh_adamw/moreh_adamw.cpp
@@ -58,7 +58,7 @@ operation::ProgramWithCallbacks moreh_adamw_(
 
     auto
         [num_cores, all_cores, core_group_1, core_group_2, num_units_per_core_group_1, num_units_per_core_group_2] =
-            split_work_to_cores(core_range, num_units);
+            tt::tt_metal::split_work_to_cores(core_range, num_units);
 
 
     auto arch = param_in.device()->arch();

diff --git a/ttnn/cpp/ttnn/deprecated/tt_dnn/op_library/moreh_arange/moreh_arange_op.cpp b/ttnn/cpp/ttnn/deprecated/tt_dnn/op_library/moreh_arange/moreh_arange_op.cpp
@@ -33,7 +33,7 @@ operation::ProgramWithCallbacks moreh_arange_(
     uint32_t core_h = core_range.end_coord.y - core_range.start_coord.y + 1;
 
     auto [num_cores, all_cores, core_group_1, core_group_2, num_tiles_per_core_group_1, num_tiles_per_core_group_2] =
-        split_work_to_cores(core_range, units_to_divide);
+        tt::tt_metal::split_work_to_cores(core_range, units_to_divide);
 
     auto element_size = output.element_size();
 

diff --git a/...op_library/moreh_clip_grad_norm/moreh_clip_grad_norm_step1/moreh_clip_grad_norm_step1.cpp b/...op_library/moreh_clip_grad_norm/moreh_clip_grad_norm_step1/moreh_clip_grad_norm_step1.cpp
@@ -60,7 +60,7 @@ operation::ProgramWithCallbacks moreh_clip_grad_norm_step1_impl(
          core_group_1,
          core_group_2,
          num_inputs_per_core_group_1,
-         num_inputs_per_core_group_2] = split_work_to_cores(grid, num_inputs);
+         num_inputs_per_core_group_2] = tt::tt_metal::split_work_to_cores(grid, num_inputs);
     TT_ASSERT(core_group_2.ranges().empty());
     TT_ASSERT(num_inputs_per_core_group_1 == 1);
     TT_ASSERT(num_inputs_per_core_group_2 == 0);

diff --git a/...op_library/moreh_clip_grad_norm/moreh_clip_grad_norm_step3/moreh_clip_grad_norm_step3.cpp b/...op_library/moreh_clip_grad_norm/moreh_clip_grad_norm_step3/moreh_clip_grad_norm_step3.cpp
@@ -47,7 +47,7 @@ operation::ProgramWithCallbacks moreh_clip_grad_norm_step3_impl(
          core_group_1,
          core_group_2,
          num_inputs_per_core_group_1,
-         num_inputs_per_core_group_2] = split_work_to_cores(grid, num_inputs);
+         num_inputs_per_core_group_2] = tt::tt_metal::split_work_to_cores(grid, num_inputs);
     TT_ASSERT(core_group_2.ranges().empty());
     TT_ASSERT(num_inputs_per_core_group_1 == 1);
     TT_ASSERT(num_inputs_per_core_group_2 == 0);

diff --git a/ttnn/cpp/ttnn/deprecated/tt_dnn/op_library/moreh_cumsum/moreh_cumsum_nc/moreh_cumsum_nc.cpp b/ttnn/cpp/ttnn/deprecated/tt_dnn/op_library/moreh_cumsum/moreh_cumsum_nc/moreh_cumsum_nc.cpp
@@ -70,7 +70,7 @@ operation::ProgramWithCallbacks moreh_cumsum_nc(
          core_group_1,
          core_group_2,
          num_cols_per_core_group_1,
-         num_cols_per_core_group_2] = split_work_to_cores(grid, num_tiles_per_chip);
+         num_cols_per_core_group_2] = tt::tt_metal::split_work_to_cores(grid, num_tiles_per_chip);
 
     ////////////////////////////////////////////////////////////////////////////
     //                         CircularBuffer Setup

diff --git a/...cpp/ttnn/deprecated/tt_dnn/op_library/moreh_getitem/moreh_getitem_rm/moreh_getitem_rm.cpp b/...cpp/ttnn/deprecated/tt_dnn/op_library/moreh_getitem/moreh_getitem_rm/moreh_getitem_rm.cpp
@@ -88,7 +88,7 @@ operation::ProgramWithCallbacks moreh_getitem_rm(
     uint32_t core_h = core_range.end_coord.y - core_range.start_coord.y + 1;
 
     auto [num_cores, all_cores, core_group_1, core_group_2, num_units_per_core_group_1, num_units_per_core_group_2] =
-        split_work_to_cores(core_range, num_units);
+        tt::tt_metal::split_work_to_cores(core_range, num_units);
 
     Program program = Program();
 

diff --git a/...eprecated/tt_dnn/op_library/moreh_getitem/moreh_getitem_tilized/moreh_getitem_tilized.cpp b/...eprecated/tt_dnn/op_library/moreh_getitem/moreh_getitem_tilized/moreh_getitem_tilized.cpp
@@ -110,7 +110,7 @@ operation::ProgramWithCallbacks moreh_getitem_tilized(
 
         auto
             [num_cores, all_cores, core_group_1, core_group_2, num_units_per_core_group_1, num_units_per_core_group_2] =
-                split_work_to_cores(core_range, num_units);
+                tt::tt_metal::split_work_to_cores(core_range, num_units);
 
         Program program = Program();
 
@@ -388,7 +388,7 @@ operation::ProgramWithCallbacks moreh_getitem_tilized(
 
         auto
             [num_cores, all_cores, core_group_1, core_group_2, num_units_per_core_group_1, num_units_per_core_group_2] =
-                split_work_to_cores(core_range, num_units);
+                tt::tt_metal::split_work_to_cores(core_range, num_units);
 
         Program program = Program();
 

diff --git a/ttnn/cpp/ttnn/deprecated/tt_dnn/op_library/moreh_groupnorm/moreh_groupnorm.cpp b/ttnn/cpp/ttnn/deprecated/tt_dnn/op_library/moreh_groupnorm/moreh_groupnorm.cpp
@@ -111,7 +111,7 @@ operation::ProgramWithCallbacks moreh_groupnorm_impl(
          core_group_1,
          core_group_2,
          num_rows_per_core_group_1,
-         num_rows_per_core_group_2] = split_work_to_cores(grid, num_rows);
+         num_rows_per_core_group_2] = tt::tt_metal::split_work_to_cores(grid, num_rows);
 
     log_debug(LogTest, fmt::format("num_cores_to_be_used: {}", num_cores_to_be_used).c_str());
     log_debug(LogTest, fmt::format("num_rows_per_core_group_1: {}", num_rows_per_core_group_1).c_str());

diff --git a/...ary/moreh_groupnorm_backward/gamma_beta_grad/moreh_groupnorm_backward_gamma_beta_grad.cpp b/...ary/moreh_groupnorm_backward/gamma_beta_grad/moreh_groupnorm_backward_gamma_beta_grad.cpp
@@ -85,7 +85,7 @@ operation::ProgramWithOptionalOutputTensors moreh_groupnorm_backward_gamma_beta_
          core_group_1,
          core_group_2,
          num_channels_per_core_group_1,
-         num_channels_per_core_group_2] = split_work_to_cores(grid, num_channels);
+         num_channels_per_core_group_2] = tt::tt_metal::split_work_to_cores(grid, num_channels);
 
     log_debug(LogTest, fmt::format("num_cores_to_be_used: {}", num_cores_to_be_used).c_str());
     log_debug(LogTest, fmt::format("num_channels_per_core_group_1: {}", num_channels_per_core_group_1).c_str());

diff --git a/...nn/op_library/moreh_groupnorm_backward/input_grad/moreh_groupnorm_backward_input_grad.cpp b/...nn/op_library/moreh_groupnorm_backward/input_grad/moreh_groupnorm_backward_input_grad.cpp
@@ -81,7 +81,7 @@ operation::ProgramWithCallbacks moreh_groupnorm_backward_input_grad_impl(
          core_group_1,
          core_group_2,
          num_rows_per_core_group_1,
-         num_rows_per_core_group_2] = split_work_to_cores(grid, num_rows);
+         num_rows_per_core_group_2] = tt::tt_metal::split_work_to_cores(grid, num_rows);
 
     log_debug(LogTest, fmt::format("num_cores_to_be_used: {}", num_cores_to_be_used).c_str());
     log_debug(LogTest, fmt::format("num_rows_per_core_group_1: {}", num_rows_per_core_group_1).c_str());

diff --git a/ttnn/cpp/ttnn/deprecated/tt_dnn/op_library/moreh_helper_functions.cpp b/ttnn/cpp/ttnn/deprecated/tt_dnn/op_library/moreh_helper_functions.cpp
@@ -50,7 +50,7 @@ std::tuple<CoreRangeSet, CoreRangeSet, CoreRangeSet> add_core_offset(
     return std::make_tuple(new_all_cores, new_core_group_1, new_core_group_2);
 }
 
-std::tuple<uint32_t, CoreRangeSet, CoreRangeSet, CoreRangeSet, uint32_t, uint32_t> split_work_to_cores(
+std::tuple<uint32_t, CoreRangeSet, CoreRangeSet, CoreRangeSet, uint32_t, uint32_t> tt::tt_metal::split_work_to_cores(
     CoreRange core_range, uint32_t units_to_divide) {
     uint32_t core_w = core_range.end_coord.x - core_range.start_coord.x + 1;
     uint32_t core_h = core_range.end_coord.y - core_range.start_coord.y + 1;
@@ -61,7 +61,7 @@ std::tuple<uint32_t, CoreRangeSet, CoreRangeSet, CoreRangeSet, uint32_t, uint32_
          core_group_1_t,
          core_group_2_t,
          num_tiles_per_core_group_1,
-         num_tiles_per_core_group_2] = split_work_to_cores(grid_size, units_to_divide);
+         num_tiles_per_core_group_2] = tt::tt_metal::split_work_to_cores(grid_size, units_to_divide);
 
     auto core_x_offset = core_range.start_coord.x;
     auto core_y_offset = core_range.start_coord.y;

diff --git a/ttnn/cpp/ttnn/deprecated/tt_dnn/op_library/moreh_helper_functions.hpp b/ttnn/cpp/ttnn/deprecated/tt_dnn/op_library/moreh_helper_functions.hpp
@@ -60,7 +60,7 @@ inline bool is_same_shape(const Tensor &tensor_a, const Tensor &tensor_b) {
 std::tuple<CoreRangeSet, CoreRangeSet, CoreRangeSet> add_core_offset(
     CoreRangeSet all_cores, CoreRangeSet core_group_1, CoreRangeSet core_group_2, uint32_t offset_x, uint32_t offset_y);
 
-std::tuple<uint32_t, CoreRangeSet, CoreRangeSet, CoreRangeSet, uint32_t, uint32_t> split_work_to_cores(
+std::tuple<uint32_t, CoreRangeSet, CoreRangeSet, CoreRangeSet, uint32_t, uint32_t> tt::tt_metal::split_work_to_cores(
     CoreRange core_range, uint32_t units_to_divide);
 
 [[maybe_unused]] KernelHandle CreateReadKernel(

diff --git a/ttnn/cpp/ttnn/deprecated/tt_dnn/op_library/moreh_layernorm/moreh_layernorm_op.cpp b/ttnn/cpp/ttnn/deprecated/tt_dnn/op_library/moreh_layernorm/moreh_layernorm_op.cpp
@@ -115,7 +115,7 @@ operation::ProgramWithCallbacks moreh_layernorm_impl(
          core_group_1,
          core_group_2,
          num_rows_per_core_group_1,
-         num_rows_per_core_group_2] = split_work_to_cores(grid, num_outer);
+         num_rows_per_core_group_2] = tt::tt_metal::split_work_to_cores(grid, num_outer);
 
     auto arch = input.device()->arch();
     auto [math_fidelity, math_approx_mode, fp32_dest_acc_en, packer_l1_acc] =

diff --git a/...ary/moreh_layernorm_backward/gamma_beta_grad/moreh_layernorm_backward_gamma_beta_grad.cpp b/...ary/moreh_layernorm_backward/gamma_beta_grad/moreh_layernorm_backward_gamma_beta_grad.cpp
@@ -79,7 +79,7 @@ operation::ProgramWithCallbacks moreh_layernorm_backward_gamma_beta_grad_impl(
          core_group_1,
          core_group_2,
          num_cols_per_core_group_1,
-         num_cols_per_core_group_2] = split_work_to_cores(grid, num_inner);
+         num_cols_per_core_group_2] = tt::tt_metal::split_work_to_cores(grid, num_inner);
 
     auto arch = input.device()->arch();
     auto [math_fidelity, math_approx_mode, fp32_dest_acc_en, packer_l1_acc] =

diff --git a/...nn/op_library/moreh_layernorm_backward/input_grad/moreh_layernorm_backward_input_grad.cpp b/...nn/op_library/moreh_layernorm_backward/input_grad/moreh_layernorm_backward_input_grad.cpp
@@ -90,7 +90,7 @@ operation::ProgramWithCallbacks moreh_layernorm_backward_input_grad_impl(
          core_group_1,
          core_group_2,
          num_rows_per_core_group_1,
-         num_rows_per_core_group_2] = split_work_to_cores(grid, num_outer);
+         num_rows_per_core_group_2] = tt::tt_metal::split_work_to_cores(grid, num_outer);
 
     auto arch = input.device()->arch();
     auto [math_fidelity, math_approx_mode, fp32_dest_acc_en, packer_l1_acc] =

diff --git a/...dnn/op_library/moreh_linear_backward/bias_backward_h/moreh_bias_backward_multi_core_h.cpp b/...dnn/op_library/moreh_linear_backward/bias_backward_h/moreh_bias_backward_multi_core_h.cpp
@@ -58,7 +58,7 @@ operation::ProgramWithCallbacks moreh_bias_backward_multi_core_h(const Tensor &o
          core_group_1,
          core_group_2,
          num_cols_per_core_group_1,
-         num_cols_per_core_group_2] = split_work_to_cores(grid, Wt);
+         num_cols_per_core_group_2] = tt::tt_metal::split_work_to_cores(grid, Wt);
 
     ////////////////////////////////////////////////////////////////////////////
     //                         CircularBuffer Setup

diff --git a/.../ttnn/deprecated/tt_dnn/op_library/moreh_matmul/multi_core/moreh_matmul_op_multi_core.cpp b/.../ttnn/deprecated/tt_dnn/op_library/moreh_matmul/multi_core/moreh_matmul_op_multi_core.cpp
@@ -193,7 +193,7 @@ operation::ProgramWithCallbacks moreh_matmul_multi_core(
          core_group_1,
          core_group_2,
          num_output_tiles_per_core_group_1,
-         num_output_tiles_per_core_group_2] = split_work_to_cores(grid, num_output_tiles);
+         num_output_tiles_per_core_group_2] = tt::tt_metal::split_work_to_cores(grid, num_output_tiles);
 
     log_debug(LogOp, "{}:{} num_output_tiles: {}", __func__, __LINE__, num_output_tiles);
     log_debug(LogOp, "{}:{} num_output_tiles_per_core_group1: {}, 2: {} ", __func__, __LINE__, num_output_tiles_per_core_group_1, num_output_tiles_per_core_group_2);