#0: Shard and Pad programming examples #12974

Merged (2 commits) on Sep 22, 2024
2 changes: 2 additions & 0 deletions tt_metal/programming_examples/CMakeLists.txt
@@ -12,6 +12,8 @@ set(PROGRAMMING_EXAMPLES_SRCS
    ${CMAKE_CURRENT_SOURCE_DIR}/matmul_multicore_reuse/matmul_multicore_reuse.cpp
    ${CMAKE_CURRENT_SOURCE_DIR}/matmul_multicore_reuse_mcast/matmul_multicore_reuse_mcast.cpp
    ${CMAKE_CURRENT_SOURCE_DIR}/matmul_single_core/matmul_single_core.cpp
    ${CMAKE_CURRENT_SOURCE_DIR}/pad/pad_multi_core.cpp
    ${CMAKE_CURRENT_SOURCE_DIR}/sharding/shard_data_rm.cpp
)

include(${PROJECT_SOURCE_DIR}/cmake/helper_functions.cmake)
55 changes: 55 additions & 0 deletions tt_metal/programming_examples/pad/kernels/pad_reader_dims_rm_interleaved.cpp
@@ -0,0 +1,55 @@
// SPDX-FileCopyrightText: © 2024 Tenstorrent Inc.
//
// SPDX-License-Identifier: Apache-2.0

#include <stdint.h>
#include "dataflow_api.h"

void kernel_main() {
    // runtime args: source and pad buffer addresses, this core's first source stick id,
    // row size difference and destination row width (both in packed sticks),
    // page size in bytes, and the number of rows this core handles
    const uint32_t src_addr = get_arg_val<uint32_t>(0);
    const uint32_t pad_addr = get_arg_val<uint32_t>(1);
    const uint32_t start_src_stick_id = get_arg_val<uint32_t>(2);
    const uint32_t row_size_diff = get_arg_val<uint32_t>(3);
    const uint32_t dst_N = get_arg_val<uint32_t>(4);
    const uint32_t data_size_bytes = get_arg_val<uint32_t>(5);
    const uint32_t num_rows_per_core = get_arg_val<uint32_t>(6);

    constexpr bool src_is_dram = get_compile_time_arg_val(0) == 1;
    constexpr bool pad_is_dram = get_compile_time_arg_val(1) == 1;
    constexpr uint32_t cb_id = tt::CB::c_in0;

    // address generators for the interleaved source and pad-value buffers
    const InterleavedAddrGen<src_is_dram> s0 = {
        .bank_base_address = src_addr,
        .page_size = data_size_bytes
    };
    const InterleavedAddrGen<pad_is_dram> s1 = {
        .bank_base_address = pad_addr,
        .page_size = data_size_bytes
    };

    // build each destination row page by page: pad sticks on the left and right,
    // original source sticks in the middle
    uint32_t src_stick_id = start_src_stick_id;
    uint32_t src_start_col_idx = row_size_diff / 2;
    uint32_t src_end_col_idx = dst_N - src_start_col_idx;
    for (uint32_t i = 0; i < num_rows_per_core; i++) {
        for (uint32_t dst_col_idx = 0; dst_col_idx < dst_N; dst_col_idx++) {
            cb_reserve_back(cb_id, 1);
            uint32_t l1_addr = get_write_ptr(cb_id);
            if (dst_col_idx < src_start_col_idx || dst_col_idx >= src_end_col_idx) {
                // add the pad value to the circular buffer
                uint64_t pad_noc_addr = get_noc_addr(0, s1);
                noc_async_read(pad_noc_addr, l1_addr, data_size_bytes);
            } else {
                // add original source data to the circular buffer
                uint64_t src_noc_addr = get_noc_addr(src_stick_id, s0);
                noc_async_read(src_noc_addr, l1_addr, data_size_bytes);
                src_stick_id++;
            }
            noc_async_read_barrier();
            cb_push_back(cb_id, 1);
        }
    }
}
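To make the reader's column indexing concrete, the following host-side sketch (an editor's illustration, not part of the PR) mirrors the same left/right split for a single row, using the 8x4 to 8x8 shapes and pad value 2 from pad_multi_core.cpp further down; the first source row {1, 2, 3, 4} becomes {2, 2, 1, 2, 3, 4, 2, 2}.

// Illustration only: reproduce the reader's per-row padding logic in plain C++.
#include <cstdio>
#include <vector>

int main() {
    const std::vector<float> src_row = {1, 2, 3, 4};  // one row of the 8x4 source
    const float pad_value = 2;
    const int dst_N = 8;                              // padded row width
    const int row_size_diff = dst_N - (int)src_row.size();
    const int src_start_col_idx = row_size_diff / 2;  // same split as the kernel
    const int src_end_col_idx = dst_N - src_start_col_idx;

    std::vector<float> dst_row;
    for (int col = 0; col < dst_N; ++col) {
        if (col < src_start_col_idx || col >= src_end_col_idx) {
            dst_row.push_back(pad_value);                         // pad stick
        } else {
            dst_row.push_back(src_row[col - src_start_col_idx]);  // source stick
        }
    }
    for (float v : dst_row) printf("%g ", v);  // prints: 2 2 1 2 3 4 2 2
    printf("\n");
    return 0;
}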
38 changes: 38 additions & 0 deletions tt_metal/programming_examples/pad/kernels/pad_writer_dims_rm_interleaved.cpp
@@ -0,0 +1,38 @@
// SPDX-FileCopyrightText: © 2024 Tenstorrent Inc.
//
// SPDX-License-Identifier: Apache-2.0

#include <stdint.h>
#include "dataflow_api.h"


void kernel_main() {
    // runtime args: destination buffer address, this core's first destination stick id,
    // destination row width in packed sticks, page size in bytes, and rows per core
    const uint32_t dst_addr = get_arg_val<uint32_t>(0);
    const uint32_t start_dst_stick_id = get_arg_val<uint32_t>(1);
    const uint32_t dst_N = get_arg_val<uint32_t>(2);
    const uint32_t data_size_bytes = get_arg_val<uint32_t>(3);
    const uint32_t num_rows_per_core = get_arg_val<uint32_t>(4);

    constexpr bool dst_is_dram = get_compile_time_arg_val(0) == 1;
    constexpr uint32_t cb_id = tt::CB::c_in0;

    // address generator for the interleaved destination buffer
    const InterleavedAddrGen<dst_is_dram> s0 = {
        .bank_base_address = dst_addr,
        .page_size = data_size_bytes
    };

    // drain the circular buffer one page at a time and write each page to the next destination stick
    uint32_t dst_stick_id = start_dst_stick_id;
    for (uint32_t row_idx = 0; row_idx < num_rows_per_core; row_idx++) {
        for (uint32_t dst_col_idx = 0; dst_col_idx < dst_N; dst_col_idx++) {
            cb_wait_front(cb_id, 1);
            uint32_t l1_addr = get_read_ptr(cb_id);
            uint64_t dst_noc_addr = get_noc_addr(dst_stick_id, s0);
            noc_async_write(l1_addr, dst_noc_addr, data_size_bytes);
            noc_async_write_barrier();
            dst_stick_id++;
            cb_pop_front(cb_id, 1);
        }
    }
}
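Since the writer simply increments dst_stick_id from the start_dst_stick_id it is given, the per-core split of the output relies entirely on the host's start_dst_idx bookkeeping. The small host-side sketch below (an editor's illustration, not part of the PR, using the concrete 8x8 output and 4-core split from pad_multi_core.cpp) shows which destination sticks each core ends up writing.

// Illustration only: destination stick ranges per core for the pad example below.
#include <cstdint>
#include <cstdio>

int main() {
    const uint32_t num_cores = 4;
    const uint32_t num_rows_per_core = 8 / num_cores;  // src_M / num_cores (M is 8 for both tensors)
    const uint32_t num_packed_row_dst = 8 / 2;         // dst_N / packing_ratio
    uint32_t start_dst_stick_id = 0;
    for (uint32_t core = 0; core < num_cores; ++core) {
        const uint32_t last = start_dst_stick_id + num_packed_row_dst * num_rows_per_core - 1;
        printf("Core (0,%u) writes sticks %u..%u\n", core, start_dst_stick_id, last);
        start_dst_stick_id += num_packed_row_dst * num_rows_per_core;  // same step as the host loop
    }
    return 0;
}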
178 changes: 178 additions & 0 deletions tt_metal/programming_examples/pad/pad_multi_core.cpp
@@ -0,0 +1,178 @@
// SPDX-FileCopyrightText: © 2024 Tenstorrent Inc.
//
// SPDX-License-Identifier: Apache-2.0

#include "tt_metal/host_api.hpp"
#include "tt_metal/common/constants.hpp"
#include "tt_metal/detail/util.hpp"
#include "tt_metal/common/bfloat16.hpp"
#include "tt_metal/impl/dispatch/command_queue.hpp"
#include "tt_metal/detail/tt_metal.hpp"
#include "tt_metal/impl/device/device.hpp"

using namespace tt;
using namespace tt::tt_metal;

int main(int argc, char **argv) {
    // get device and command queue, create program
    int device_id = 0;
    Device *device = CreateDevice(device_id);
    CommandQueue& cq = device->command_queue();
    Program program = CreateProgram();

    // initialize source data: an 8x4 tensor of bfloat16 values, packed two values per uint32_t
    constexpr uint32_t src_M = 8;
    constexpr uint32_t src_N = 4;
    constexpr uint32_t packed_data_size = sizeof(uint32_t);
    constexpr uint32_t unpacked_data_size = sizeof(bfloat16);
    constexpr uint32_t packing_ratio = packed_data_size / unpacked_data_size;
    uint32_t src_num_values_unpacked = src_M * src_N;
    uint32_t src_num_values_packed = src_num_values_unpacked / packing_ratio;
    std::vector<uint32_t> src_vec(src_num_values_packed, 0);
    // source vector = {1, 2, 3, ... , 30, 31, 32}
    for (uint32_t i = 0; i < src_vec.size(); i++) {
        bfloat16 bfloat_val1 = bfloat16(2 * i + 1);
        bfloat16 bfloat_val2 = bfloat16(2 * i + 2);
        src_vec[i] = pack_two_bfloat16_into_uint32(std::pair<bfloat16, bfloat16>(bfloat_val1, bfloat_val2));
    }

    // create pad vector: a single packed page holding two copies of the pad value
    bfloat16 pad_value = bfloat16(2);
    std::vector<uint32_t> pad_vec(1, pack_two_bfloat16_into_uint32(std::pair<bfloat16, bfloat16>(pad_value, pad_value)));

    // create destination vector for the padded 8x8 tensor
    constexpr uint32_t dst_M = 8;
    constexpr uint32_t dst_N = 8;
    uint32_t dst_num_values_unpacked = dst_M * dst_N;
    uint32_t dst_num_values_packed = dst_num_values_unpacked / packing_ratio;
    std::vector<uint32_t> dst_vec(dst_num_values_packed, 0);

    // designate cores and core specs: a 1x4 range of cores, (0,0) through (0,3)
    CoreCoord start_core = {0, 0};
    CoreCoord end_core = {0, 3};
    CoreRange cores(start_core, end_core);
    uint32_t num_cores = cores.size();

    // configure and create DRAM buffers for input, pad, output
    uint32_t src_buffer_size = packed_data_size * src_num_values_packed;
    tt_metal::InterleavedBufferConfig input_dram_config {
        .device = device,
        .size = src_buffer_size,
        .page_size = packed_data_size,
        .buffer_type = tt_metal::BufferType::DRAM
    };
    std::shared_ptr<tt::tt_metal::Buffer> src_buffer = CreateBuffer(input_dram_config);
    uint32_t src_addr = src_buffer->address();

    uint32_t pad_buffer_size = packed_data_size * pad_vec.size();
    tt_metal::InterleavedBufferConfig pad_dram_config {
        .device = device,
        .size = pad_buffer_size,
        .page_size = packed_data_size,
        .buffer_type = tt_metal::BufferType::DRAM
    };
    std::shared_ptr<tt::tt_metal::Buffer> pad_buffer = CreateBuffer(pad_dram_config);
    uint32_t pad_addr = pad_buffer->address();

    uint32_t dst_buffer_size = packed_data_size * dst_num_values_packed;
    tt_metal::InterleavedBufferConfig output_dram_config {
        .device = device,
        .size = dst_buffer_size,
        .page_size = packed_data_size,
        .buffer_type = tt_metal::BufferType::DRAM
    };
    std::shared_ptr<tt::tt_metal::Buffer> dst_buffer = CreateBuffer(output_dram_config);
    uint32_t dst_addr = dst_buffer->address();

    // configure and create circular buffer shared by the reader and writer kernels
    uint32_t cb_id = CB::c_in0;
    tt::DataFormat cb_data_format = tt::DataFormat::UInt32;
    CircularBufferConfig cb_config = tt::tt_metal::CircularBufferConfig(dst_N * packed_data_size * 2, {{cb_id, cb_data_format}})
        .set_page_size(cb_id, packed_data_size);
    auto cb_src = tt::tt_metal::CreateCircularBuffer(program, cores, cb_config);

    // specify compile time args
    bool src_is_dram = src_buffer->buffer_type() == BufferType::DRAM ? 1 : 0;
    bool pad_is_dram = pad_buffer->buffer_type() == BufferType::DRAM ? 1 : 0;
    bool dst_is_dram = dst_buffer->buffer_type() == BufferType::DRAM ? 1 : 0;
    std::vector<uint32_t> reader_compile_time_args = {(uint32_t) src_is_dram, (uint32_t) pad_is_dram};
    std::vector<uint32_t> writer_compile_time_args = {(uint32_t) dst_is_dram};

    // create the reader and writer data movement kernels
    KernelHandle reader_id = CreateKernel(
        program,
        "tt_metal/programming_examples/pad/kernels/pad_reader_dims_rm_interleaved.cpp",
        cores,
        tt_metal::DataMovementConfig{.processor = DataMovementProcessor::RISCV_0, .noc = NOC::RISCV_0_default, .compile_args = reader_compile_time_args});
    KernelHandle writer_id = CreateKernel(
        program,
        "tt_metal/programming_examples/pad/kernels/pad_writer_dims_rm_interleaved.cpp",
        cores,
        tt_metal::DataMovementConfig{.processor = DataMovementProcessor::RISCV_1, .noc = NOC::RISCV_1_default, .compile_args = writer_compile_time_args});

    // set kernel runtime arguments: each core handles a contiguous block of rows
    uint32_t start_src_idx = 0;
    uint32_t start_dst_idx = 0;
    uint32_t num_rows_per_core = src_M / num_cores;
    uint32_t row_size_diff = dst_N - src_N;
    uint32_t num_packed_row_src = src_N / packing_ratio;
    uint32_t num_packed_row_dst = dst_N / packing_ratio;
    uint32_t num_src_sticks_per_core = num_packed_row_src * num_rows_per_core;
    for (uint32_t core_idx = 0; core_idx < num_cores; core_idx++) {
        CoreCoord core = {0, core_idx};
        tt_metal::SetRuntimeArgs(
            program,
            reader_id,
            core,
            {src_addr,
             pad_addr,
             start_src_idx,
             row_size_diff / packing_ratio,
             num_packed_row_dst,
             packed_data_size,
             num_rows_per_core});
        tt_metal::SetRuntimeArgs(
            program,
            writer_id,
            core,
            {dst_addr,
             start_dst_idx,
             num_packed_row_dst,
             packed_data_size,
             num_rows_per_core});
        start_src_idx += num_src_sticks_per_core;
        start_dst_idx += num_packed_row_dst * num_rows_per_core;
    }

printf("Padding tensor of shape (%d, %d) to shape (%d, %d) with pad value: %d\n", src_M, src_N, dst_M, dst_N, pad_value.to_uint16());
printf("Original tensor with shape (%d, %d):\n", src_M, src_N);
for (uint32_t m = 0; m < src_M; m++) {
for (uint32_t n = 0; n < num_packed_row_src; n++) {
printf("%d ", (uint16_t)src_vec[m * num_packed_row_src + n]);
printf("%d ", (uint16_t)(src_vec[m * num_packed_row_src + n] >> 16));
}
printf("\n");
}
printf("\n");

// dispatch program to device for execution
EnqueueWriteBuffer(cq, src_buffer, src_vec.data(), false);
EnqueueWriteBuffer(cq, pad_buffer, pad_vec.data(), false);
EnqueueProgram(cq, program, false);
EnqueueReadBuffer(cq, dst_buffer, dst_vec.data(), false);
Finish(cq);

printf("Padded tensor with shape (%d, %d):\n", dst_M, dst_N);
for (uint32_t m = 0; m < dst_M; m++) {
for (uint32_t n = 0; n < num_packed_row_dst; n++) {
printf("%d ", (uint16_t)dst_vec[m * num_packed_row_dst + n]);
printf("%d ", (uint16_t)(dst_vec[m * num_packed_row_dst + n] >> 16));
}
printf("\n");
}

CloseDevice(device);
}
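As a sanity check on the work split above (an editor's worked example, not part of the diff), the snippet below recomputes the per-core constants with the example's concrete sizes and asserts the values the loop relies on: each of the four cores handles two rows, advancing start_src_idx by 4 sticks and start_dst_idx by 8 sticks per core, and the reader places one pad stick on each side of the two source sticks in every packed destination row.

// Editor's sanity check of the per-core work split in pad_multi_core.cpp.
#include <cassert>
#include <cstdint>

int main() {
    const uint32_t src_M = 8, src_N = 4, dst_N = 8, num_cores = 4;
    const uint32_t packing_ratio = sizeof(uint32_t) / 2;                    // two bfloat16 per uint32_t
    const uint32_t num_rows_per_core = src_M / num_cores;                   // 2
    const uint32_t num_packed_row_src = src_N / packing_ratio;              // 2 source sticks per row
    const uint32_t num_packed_row_dst = dst_N / packing_ratio;              // 4 destination sticks per row
    const uint32_t packed_row_size_diff = (dst_N - src_N) / packing_ratio;  // 2, passed to the reader
    assert(num_rows_per_core == 2);
    assert(num_packed_row_src * num_rows_per_core == 4);                    // start_src_idx step per core
    assert(num_packed_row_dst * num_rows_per_core == 8);                    // start_dst_idx step per core
    assert(packed_row_size_diff / 2 == 1);                                  // one pad stick on each side
    return 0;
}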
@@ -0,0 +1,42 @@
// SPDX-FileCopyrightText: © 2024 Tenstorrent Inc.
//
// SPDX-License-Identifier: Apache-2.0

#include <stdint.h>
#include "dataflow_api.h"

// export TT_METAL_DPRINT_CORES='(0,0)-(0,3)' in order to see DPRINT messages

void kernel_main() {
    const uint32_t src_addr = get_arg_val<uint32_t>(0);
    const uint32_t stick_size = get_arg_val<uint32_t>(1);
    const uint32_t shard_height = get_arg_val<uint32_t>(2);
    const uint32_t shard_width_bytes = get_arg_val<uint32_t>(3);
    const uint32_t padded_offset_bytes = get_arg_val<uint32_t>(4);
    const uint32_t start_id = get_arg_val<uint32_t>(5);
    const uint32_t current_core = get_arg_val<uint32_t>(6);

    constexpr uint32_t cb_id_in0 = get_compile_time_arg_val(0);
    constexpr bool src_is_dram = get_compile_time_arg_val(1) == 1;

    // address generator for the interleaved source buffer (one stick per page)
    const InterleavedAddrGen<src_is_dram> s0 = {
        .bank_base_address = src_addr,
        .page_size = stick_size
    };

    // read this core's shard (shard_height sticks) from DRAM into the circular buffer
    uint32_t stick_id = start_id;
    cb_reserve_back(cb_id_in0, shard_height);
    uint32_t l1_write_addr = get_write_ptr(cb_id_in0);
    DPRINT_DATA0(DPRINT << "Core (0," << current_core << "): ");
    for (uint32_t h = 0; h < shard_height; ++h) {
        uint64_t src_noc_addr = get_noc_addr(stick_id, s0);
        noc_async_read(src_noc_addr, l1_write_addr, stick_size);
        // wait for the read to land before inspecting the data in L1
        noc_async_read_barrier();
        // print both bfloat16 values that are packed into the page
        uint32_t* read_ptr = (uint32_t*)l1_write_addr;
        DPRINT_DATA0(DPRINT << (uint16_t)*read_ptr << " ");
        DPRINT_DATA0(DPRINT << (uint16_t)(*read_ptr >> 16) << " ");
        stick_id++;
        l1_write_addr += padded_offset_bytes;
    }
    DPRINT_DATA0(DPRINT << ENDL());
    cb_push_back(cb_id_in0, shard_height);
}
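The host program that drives this kernel (the sharding/shard_data_rm.cpp example added to the CMakeLists above) is not shown in this diff section, so the concrete shard sizes are not known here. Purely as an illustration of how start_id and shard_height relate, the sketch below (an editor's addition, assuming four cores as suggested by the DPRINT hint and an assumed 16-stick tensor) prints which sticks each core would read.

// Illustration only: stick assignment per core implied by start_id / shard_height.
#include <cstdint>
#include <cstdio>

int main() {
    const uint32_t num_cores = 4;      // cores (0,0) through (0,3), per the DPRINT hint above
    const uint32_t total_sticks = 16;  // assumed row count for this sketch, not a value from the PR
    const uint32_t shard_height = total_sticks / num_cores;
    for (uint32_t core = 0; core < num_cores; ++core) {
        const uint32_t start_id = core * shard_height;  // first stick this core reads
        printf("Core (0,%u) reads sticks %u..%u\n", core, start_id, start_id + shard_height - 1);
    }
    return 0;
}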