Skip to content

Commit

Permalink
Add dynamic NoC support for GS, WH, and BH (#13376)
Browse files Browse the repository at this point in the history
* #0: add noc modes to kernel config

* #0: remove all hardcoded noc_index

* #0: add u-bench for read DRAM and write to remote L1

* #0: u-bench code clean up + add constexpr to cmd_bufs

* #0: rename to DM_DEDICATED_NOC and DM_DYNAMIC_NOC

* #0: fix soc descriptor for moving FD cores

* #0: add fix to fast div, change back soc desc

* #0: fix dram read l1 write for GS

* #0: fix dram read l1 write for BH

* #0: minor fix after rebase

* #0: re-calculate ret addr for atomic cmd

* #0: code clean up + add dynamic noc for eth

* #0: add read dram write l1 u-bench to CI

* #0: reduce code size, remove dynamic noc for eth, rename dynamic_noc_init

* #0: reduce code size by not using extern for noc_index when compile kernel

* #0: remove NOC_MODE FW define, move NOC_MODE outside of noc_nonblocking_api

* #0: reduce code size by remove else condition for noc_local_state_init
  • Loading branch information
yugaoTT authored Oct 8, 2024
1 parent 2564aeb commit 9e9dc00
Show file tree
Hide file tree
Showing 24 changed files with 1,721 additions and 260 deletions.
8 changes: 6 additions & 2 deletions tests/scripts/run_moreh_microbenchmark.sh
Original file line number Diff line number Diff line change
Expand Up @@ -33,8 +33,12 @@ run_profiling_test() {
pytest --capture=tee-sys $TT_METAL_HOME/tests/scripts/test_moreh_microbenchmark.py::test_matmul_l1 -k $ARCH_NAME

# Wormhole-B0-only microbenchmarks.
# (Fixed: the two pytest invocations were listed twice, running each test twice.)
if [[ "$ARCH_NAME" == "wormhole_b0" ]]; then
    pytest --capture=tee-sys $TT_METAL_HOME/tests/scripts/test_moreh_microbenchmark.py::test_matmul_single_core_sharded -k $ARCH_NAME
    pytest --capture=tee-sys $TT_METAL_HOME/tests/scripts/test_moreh_microbenchmark.py::test_dram_read_12_core -k $ARCH_NAME
fi
# bypass wh_b0 for now until we can move FD cores to last col
if [[ "$ARCH_NAME" != "wormhole_b0" ]]; then
    pytest --capture=tee-sys $TT_METAL_HOME/tests/scripts/test_moreh_microbenchmark.py::test_dram_read_l1_write_core -k $ARCH_NAME
fi
}

Expand Down
67 changes: 67 additions & 0 deletions tests/scripts/test_moreh_microbenchmark.py
Original file line number Diff line number Diff line change
Expand Up @@ -265,6 +265,28 @@ def run_dram_read_cmd(k, n, num_blocks, df, num_banks, bank_start_id):
run_moreh_single_test("DRAM BW test multi-core", command)


def run_dram_read_l1_write_cmd(k, n, num_blocks, df, num_banks, bank_start_id):
    """Launch the DRAM-read / remote-L1-write microbenchmark binary.

    Runs a single iteration (--num-tests 1) with result checking bypassed,
    under the device profiler (TT_METAL_DEVICE_PROFILER=1).
    """
    # Flag/value pairs in the order the binary expects them.
    cli_flags = [
        ("--k", k),
        ("--n", n),
        ("--num-blocks", num_blocks),
        ("--num-tests", 1),
        ("--data-type", df),
        ("--num-banks", num_banks),
        ("--bank-start-id", bank_start_id),
    ]
    command = "TT_METAL_DEVICE_PROFILER=1 ./build/test/tt_metal/perf_microbenchmark/9_dram_adjacent_read_remote_l1_write/test_dram_read_l1_write "
    for flag, value in cli_flags:
        command += " " + flag + " " + str(value)
    command += " --bypass-check "
    run_moreh_single_test("DRAM BW test multi-core", command)


# noc
def test_noc_local(r=9, c=12, nt=256, cb=1):
command = (
Expand Down Expand Up @@ -672,6 +694,51 @@ def test_dram_read_12_core(arch, freq, test_vector, num_tests, nblock, data_form
assert bw_bound <= throughput


@pytest.mark.parametrize(
    "arch, freq, test_vector, num_tests, nblock, data_format, num_banks, bank_start_id",
    [
        ("grayskull", 1202, np.array([32768 * 2, 8 * 128]), 1, 64, 1, 8, 0),
        ("wormhole_b0", 1000, np.array([32768 * 2, 12 * 128]), 1, 64, 1, 12, 0),
        ("blackhole", 800, np.array([32768 * 8, 8 * 128]), 1, 256, 1, 8, 0),
    ],
)
def test_dram_read_l1_write_core(arch, freq, test_vector, num_tests, nblock, data_format, num_banks, bank_start_id):
    """Measure DRAM-read + remote-L1-write throughput and assert it meets a
    per-architecture lower bound (GB/s).

    test_vector holds (k, n); data_format selects the bytes-per-tile factor
    used to compute the transferred size (0 -> 1088, 1 -> 2048 bytes per
    1024 elements -- presumably bfp8_b vs bfloat16; TODO confirm against the
    benchmark binary).
    """
    data = []
    cycle_list = []
    time_list = []
    throughput_list = []
    for _ in range(num_tests):
        k = int(test_vector[0])
        n = int(test_vector[1])
        # Fail fast on an unsupported format instead of hitting an
        # UnboundLocalError on input_size below.
        if data_format == 0:
            input_size = k * n * 1088 // 1024
        elif data_format == 1:
            input_size = k * n * 2048 // 1024
        else:
            raise ValueError(f"unsupported data_format: {data_format}")
        run_dram_read_l1_write_cmd(k, n, nblock, data_format, num_banks, bank_start_id)
        cycle = profile_results_kernel_duration()
        # cycles -> seconds (freq is in MHz); bytes/cycle * MHz -> GB/s.
        time = cycle / freq / 1000.0 / 1000.0
        throughput = input_size / cycle * freq / 1000.0
        cycle_list.append(cycle)
        time_list.append(time)
        throughput_list.append(throughput)
    # Average across the num_tests iterations.
    cycle = sum(cycle_list) / len(cycle_list)
    time = sum(time_list) / len(time_list)
    throughput = sum(throughput_list) / len(throughput_list)
    logger.info("DRAM read cycle: " + str(cycle))
    logger.info("DRAM read time: " + str(time))
    logger.info("DRAM read throughput: " + str(throughput))
    data.append([throughput])
    # check within range
    # NOTE(review): dev_freq is fetched but unused here (sibling tests follow
    # the same pattern) -- confirm whether the bound should scale with it.
    dev_freq = get_device_freq()
    # Per-arch minimum acceptable bandwidth; fail fast on an unknown arch
    # instead of an UnboundLocalError on bw_bound.
    bw_bounds = {"grayskull": 100.0, "wormhole_b0": 260.0, "blackhole": 340.0}
    if arch not in bw_bounds:
        raise ValueError(f"unknown arch: {arch}")
    bw_bound = bw_bounds[arch]
    assert bw_bound <= throughput


@pytest.mark.parametrize(
"arch, freq, r, c, test_vector_global, test_vector_local",
[
Expand Down
Original file line number Diff line number Diff line change
@@ -0,0 +1,117 @@
// SPDX-FileCopyrightText: © 2024 Tenstorrent Inc.
//
// SPDX-License-Identifier: Apache-2.0

#include <stdint.h>

#include "dataflow_api.h"

#include "debug/dprint.h"

// Issue one asynchronous NoC read of `page_size` bytes from a sharded DRAM
// bank into local L1 at `dest_addr`, via the NCRISC read command buffer.
//
// Template params:
//   bank_base_address - base address of the DRAM buffer within each bank
//   page_size         - bytes transferred per call
//   use_vc            - when true, pin the read to the static VC given by `vc`
// Args:
//   src_addr  - byte offset within the bank (added to bank_base_address)
//   dest_addr - local L1 destination address
//   bank_id   - DRAM bank index, used for the per-bank offset and NoC XY lookup
//   vc        - static virtual channel id (only read when use_vc is true)
template <uint32_t bank_base_address, uint32_t page_size, bool use_vc>
FORCE_INLINE
void noc_async_read_tile_dram_sharded(uint32_t src_addr, uint32_t dest_addr, uint32_t bank_id = 0, const uint32_t vc = 0) {
uint32_t src_addr_;
uint32_t src_noc_xy;

// Full source address = caller offset + buffer base + per-bank DRAM offset.
src_addr_ = src_addr + bank_base_address;
src_addr_ += bank_to_dram_offset[bank_id];
// NoC coordinates of the bank for the NoC this kernel is currently using.
src_noc_xy = dram_bank_to_noc_xy[noc_index][bank_id];

WAYPOINT("NRTW");
DEBUG_SANITIZE_NOC_READ_TRANSACTION(noc_index, get_noc_addr_helper(src_noc_xy, src_addr_), dest_addr, page_size);
// Spin until the read command buffer can accept a new command.
while (!noc_cmd_buf_ready(noc_index, NCRISC_RD_CMD_BUF));
WAYPOINT("NRTD");

// Optionally pin the transaction to a static virtual channel.
if constexpr(use_vc) {
uint32_t noc_rd_cmd_field = NOC_CMD_CPY | NOC_CMD_RD | NOC_CMD_RESP_MARKED | NOC_CMD_VC_STATIC | NOC_CMD_STATIC_VC(vc);
NOC_CMD_BUF_WRITE_REG(noc_index, NCRISC_RD_CMD_BUF, NOC_CTRL, noc_rd_cmd_field);
}

// Program the command-buffer registers and kick off the request.
// NOTE(review): the register write order matches the existing NoC API
// conventions -- do not reorder these writes.
NOC_CMD_BUF_WRITE_REG(noc_index, NCRISC_RD_CMD_BUF, NOC_RET_ADDR_LO, dest_addr);
NOC_CMD_BUF_WRITE_REG(noc_index, NCRISC_RD_CMD_BUF, NOC_TARG_ADDR_LO, src_addr_); // (uint32_t)src_addr
NOC_CMD_BUF_WRITE_REG(noc_index, NCRISC_RD_CMD_BUF, NOC_TARG_ADDR_COORDINATE, src_noc_xy); // src_addr >> 32
NOC_CMD_BUF_WRITE_REG(noc_index, NCRISC_RD_CMD_BUF, NOC_AT_LEN_BE, page_size); // len_bytes
NOC_CMD_BUF_WRITE_REG(noc_index, NCRISC_RD_CMD_BUF, NOC_CMD_CTRL, NOC_CTRL_SEND_REQ);
// Bookkeeping: count this read as issued on the current NoC.
noc_reads_num_issued[noc_index] += 1;
}

// Reader kernel: streams `num_blocks` blocks of `num_pages` pages each from a
// sharded DRAM bank into circular buffer 0.
//   - GRAYSKULL path: plain reads with a full read barrier per block.
//   - Other archs: reads are tagged with transaction ids (trids) so several
//     blocks can be in flight at once; completion is waited per-trid.
void kernel_main() {
// Compile-time configuration.
constexpr uint32_t input_addr = get_compile_time_arg_val(0);
constexpr uint32_t input_start_tile_id = get_compile_time_arg_val(1);
constexpr uint32_t num_blocks = get_compile_time_arg_val(2);
constexpr uint32_t num_pages = get_compile_time_arg_val(3);
constexpr uint32_t block_num_tiles = get_compile_time_arg_val(4);
constexpr uint32_t page_size = get_compile_time_arg_val(5);

constexpr uint32_t block_size_bytes = page_size * num_pages;

// Runtime args: which DRAM bank to read and which virtual channel to use.
const uint32_t bank_id = get_arg_val<uint32_t>(0);
const uint32_t vc = get_arg_val<uint32_t>(1);

constexpr uint32_t cb_id = 0;

// Pre-program the read command state once; only addresses vary per page.
uint32_t src_base_addr = noc_async_read_tile_dram_sharded_set_state<page_size, true>(input_addr, bank_id, vc);
uint32_t src_read_addr = 0;

#ifdef ARCH_GRAYSKULL
// Simple path: fill one CB block, barrier, publish, repeat.
for (uint32_t block = 0; block < num_blocks; ++block) {
// Operand 1
cb_reserve_back(cb_id, block_num_tiles);
auto l1_write_addr = get_write_ptr(cb_id);

for (uint32_t h = 0; h < num_pages; ++h) {
noc_async_read_tile_dram_sharded_with_state(src_base_addr, src_read_addr, l1_write_addr);
src_read_addr += page_size;
l1_write_addr += page_size;
}

noc_async_read_barrier();
cb_push_back(cb_id, block_num_tiles);
}
#else
// Pipelined path: the CB region holds 3 blocks; each in-flight block is
// tagged with a trid cycling 1..3 so we only wait on the oldest block.
constexpr uint32_t total_num_blocks_in_buffer = 3;
// NOTE(review): total_num_trid is declared but never used below -- the trids
// actually cycle through 1..total_num_blocks_in_buffer (1..3).
constexpr uint32_t total_num_trid = 4;
uint32_t num_free_blocks_in_buffer = total_num_blocks_in_buffer;
uint32_t curr_block_trid = 1;
uint32_t block_trid_to_wait = 1;

// Reserve the first block's worth of CB space up front.
cb_reserve_back(cb_id, block_num_tiles);
uint32_t l1_write_addr_offset = 0;
uint32_t l1_write_addr_start = get_write_ptr(cb_id);
uint32_t l1_write_addr = l1_write_addr_start;
for (uint32_t block = 0; block < num_blocks; ++block) {
// Tag all reads for this block with the current trid.
noc_async_read_tile_dram_sharded_set_trid(curr_block_trid);

for (uint32_t h = 0; h < num_pages; ++h) {
noc_async_read_tile_dram_sharded_with_state_with_trid(
src_base_addr, src_read_addr, l1_write_addr, curr_block_trid);
src_read_addr += page_size;
l1_write_addr += page_size;
}

// Once the pipeline is primed (only 2 free slots remain), retire the
// oldest in-flight block: wait for its trid, publish it to the consumer,
// and reserve space ahead for the next block.
if (num_free_blocks_in_buffer == 2) {
noc_async_read_barrier_with_trid(block_trid_to_wait);
cb_push_back(cb_id, block_num_tiles);
// wait for next block trid
block_trid_to_wait = block_trid_to_wait == 3 ? 1 : (block_trid_to_wait + 1);
// reserve for next block
cb_reserve_back(cb_id, block_num_tiles * 2);
} else {
num_free_blocks_in_buffer -= 1;
}

// Advance the write cursor, wrapping both the trid and the L1 offset
// after total_num_blocks_in_buffer blocks.
if (curr_block_trid == total_num_blocks_in_buffer) {
l1_write_addr_offset = 0;
curr_block_trid = 1;
} else {
l1_write_addr_offset += block_size_bytes;
curr_block_trid += 1;
}
l1_write_addr = l1_write_addr_start + l1_write_addr_offset;
}
// last block to wait
noc_async_read_barrier_with_trid(block_trid_to_wait);
cb_push_back(cb_id, block_num_tiles);
#endif
}
Original file line number Diff line number Diff line change
@@ -0,0 +1,50 @@
// SPDX-FileCopyrightText: © 2024 Tenstorrent Inc.
//
// SPDX-License-Identifier: Apache-2.0

#include <stdint.h>

#include "dataflow_api.h"

#include "debug/dprint.h"


void kernel_main() {
constexpr uint32_t num_blocks = get_compile_time_arg_val(0);
constexpr uint32_t num_pages = get_compile_time_arg_val(1);
constexpr uint32_t block_num_tiles = get_compile_time_arg_val(2);
constexpr uint32_t page_size = get_compile_time_arg_val(3);
constexpr uint32_t noc = get_compile_time_arg_val(4);

const uint32_t vc = get_arg_val<uint32_t>(0);
const uint32_t noc_x = get_arg_val<uint32_t>(1);
const uint32_t noc_y = get_arg_val<uint32_t>(2);

constexpr uint32_t cb_id = 0;

uint32_t l1_write_addr = get_write_ptr(cb_id);
const uint64_t l1_noc_write_addr = get_noc_addr(noc_x, noc_y, l1_write_addr, noc);

noc_async_write_one_packet_set_state(l1_noc_write_addr, page_size, noc, vc);

for (uint32_t block = 0; block < num_blocks; ++block) {

auto remote_l1_write_addr = l1_noc_write_addr;

cb_wait_front(cb_id, block_num_tiles);
auto l1_read_addr = get_read_ptr(cb_id);

for (uint32_t h = 0; h < num_pages; ++h) {
noc_async_write_one_packet_with_state(l1_read_addr, remote_l1_write_addr, noc);
l1_read_addr += page_size;
remote_l1_write_addr += page_size;
}

noc_async_write_barrier(noc);

cb_pop_front(cb_id, block_num_tiles);

}


}
Loading

0 comments on commit 9e9dc00

Please sign in to comment.