Skip to content

Commit

Permalink
#10228: Add support for profiling dispatch writes
Browse files: browse the repository at this point in the history
  • Loading branch information
tt-dma committed Aug 9, 2024
1 parent 119f7b9 commit 94af3ff
Show file tree
Hide file tree
Showing 10 changed files with 379 additions and 7 deletions.
1 change: 1 addition & 0 deletions tt_metal/common/tt_backend_api_types.hpp
Original file line number Diff line number Diff line change
Expand Up @@ -156,6 +156,7 @@ inline std::ostream& operator<<(std::ostream& os, const RISCV& riscv) {
case RISCV::TRISC0: os << "TRISC0"; break;
case RISCV::TRISC1: os << "TRISC1"; break;
case RISCV::TRISC2: os << "TRISC2"; break;
case RISCV::ERISC: os << "ERISC"; break;
case RISCV::COMPUTE: os << "COMPUTE"; break;
default: throw std::invalid_argument("Unknown format");
}
Expand Down
1 change: 1 addition & 0 deletions tt_metal/impl/CMakeLists.txt
Original file line number Diff line number Diff line change
Expand Up @@ -16,6 +16,7 @@ set(IMPL_SRC
${CMAKE_CURRENT_SOURCE_DIR}/dispatch/debug_tools.cpp
${CMAKE_CURRENT_SOURCE_DIR}/dispatch/command_queue.cpp
${CMAKE_CURRENT_SOURCE_DIR}/dispatch/worker_config_buffer.cpp
${CMAKE_CURRENT_SOURCE_DIR}/dispatch/data_collection.cpp
${CMAKE_CURRENT_SOURCE_DIR}/debug/dprint_server.cpp
${CMAKE_CURRENT_SOURCE_DIR}/debug/watcher_server.cpp
${CMAKE_CURRENT_SOURCE_DIR}/trace/trace.cpp
Expand Down
34 changes: 34 additions & 0 deletions tt_metal/impl/dispatch/command_queue.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -26,6 +26,7 @@
#include "tt_metal/impl/debug/dprint_server.hpp"
#include "tt_metal/impl/debug/watcher_server.hpp"
#include "tt_metal/impl/dispatch/cq_commands.hpp"
#include "tt_metal/impl/dispatch/data_collection.hpp"
#include "tt_metal/impl/dispatch/dispatch_core_manager.hpp"
#include "tt_metal/third_party/umd/device/tt_xy_pair.h"

Expand Down Expand Up @@ -567,6 +568,11 @@ void EnqueueProgramCommand::assemble_runtime_args_commands() {
false,
core_type == CoreType::WORKER ? DISPATCH_WRITE_OFFSET_TENSIX_L1_CONFIG_BASE
: DISPATCH_WRITE_OFFSET_ETH_L1_CONFIG_BASE);
for (auto &data_per_kernel : unique_rt_data_and_sizes) {
for (auto &data_and_sizes : data_per_kernel) {
RecordDispatchData(program, DISPATCH_DATA_RTARGS, std::get<1>(data_and_sizes));
}
}
unique_sub_cmds.clear();
unique_rt_data_and_sizes.clear();
unique_rt_args_data.clear();
Expand Down Expand Up @@ -648,6 +654,11 @@ void EnqueueProgramCommand::assemble_runtime_args_commands() {
common_sub_cmds);
}

for (auto& data_per_kernel : common_rt_data_and_sizes) {
for (auto& data_and_sizes : data_per_kernel) {
RecordDispatchData(program, DISPATCH_DATA_RTARGS, std::get<1>(data_and_sizes));
}
}
common_rt_data_and_sizes.clear();
common_rt_args_data.clear();
}
Expand Down Expand Up @@ -841,6 +852,11 @@ void EnqueueProgramCommand::assemble_device_commands(
noc_encoding, // noc_xy_addr
kg_transfer_info.dst_base_addrs[kernel_idx],
kg_transfer_info.lengths[kernel_idx]);
RecordDispatchData(
program,
DISPATCH_DATA_BINARY,
kg_transfer_info.lengths[kernel_idx],
kg_transfer_info.riscvs[kernel_idx]);
// Difference between prefetch total relayed pages and dispatch write linear
uint32_t relayed_bytes =
align(kg_transfer_info.lengths[kernel_idx], HostMemDeviceCommand::PROGRAM_PAGE_SIZE);
Expand Down Expand Up @@ -893,6 +909,8 @@ void EnqueueProgramCommand::assemble_device_commands(
.addr = dst_addr,
.length = (uint16_t)write_length,
.num_mcast_dests = (uint16_t)num_mcast_dests});
RecordDispatchData(
program, DISPATCH_DATA_BINARY, write_length, kg_transfer_info.riscvs[kernel_idx]);
dst_addr += write_length;

kernel_bins_prefetch_subcmds.back().emplace_back(CQPrefetchRelayPagedPackedSubCmd{
Expand Down Expand Up @@ -1007,6 +1025,9 @@ void EnqueueProgramCommand::assemble_device_commands(
this->packed_write_max_unicast_sub_cmds,
curr_sub_cmd_idx);
curr_sub_cmd_idx += num_sub_cmds_in_cmd;
for (auto &data_and_size : multicast_sem_data[i]) {
RecordDispatchData(program, DISPATCH_DATA_SEMAPHORE, data_and_size.second);
}
}
}

Expand All @@ -1024,6 +1045,9 @@ void EnqueueProgramCommand::assemble_device_commands(
this->packed_write_max_unicast_sub_cmds,
curr_sub_cmd_idx);
curr_sub_cmd_idx += num_sub_cmds_in_cmd;
for (auto &data_and_size : unicast_sem_data[i]) {
RecordDispatchData(program, DISPATCH_DATA_SEMAPHORE, data_and_size.second);
}
}
}

Expand All @@ -1043,7 +1067,11 @@ void EnqueueProgramCommand::assemble_device_commands(
multicast_cb_config_data,
this->packed_write_max_unicast_sub_cmds,
curr_sub_cmd_idx);
for (auto &data_and_size : multicast_cb_config_data) {
RecordDispatchData(program, DISPATCH_DATA_CB_CONFIG, data_and_size.second);
}
curr_sub_cmd_idx += num_sub_cmds_in_cmd;
RecordDispatchData(program, DISPATCH_DATA_CB_CONFIG, mcast_cb_payload_sizeB);
uint32_t curr_sub_cmd_data_offset_words =
(write_offset_bytes + (sizeof(CQPrefetchCmd) + sizeof(CQDispatchCmd)) +
align(num_sub_cmds_in_cmd * sizeof(CQDispatchWritePackedMulticastSubCmd), L1_ALIGNMENT)) /
Expand Down Expand Up @@ -1190,6 +1218,11 @@ void EnqueueProgramCommand::process() {
this->assemble_stall_commands(true);
// Runtime Args Command Sequence
this->assemble_runtime_args_commands();

// Record the kernel groups in this program; this only needs to be done once.
for (CoreType core_type : {CoreType::WORKER, CoreType::ETH}) {
RecordKernelGroups(program, core_type, program.get_kernel_groups(core_type));
}
} else {
static constexpr uint32_t wait_count_offset = (sizeof(CQPrefetchCmd) + offsetof(CQDispatchCmd, wait.count));
static constexpr uint32_t tensix_l1_write_offset_offset =
Expand All @@ -1208,6 +1241,7 @@ void EnqueueProgramCommand::process() {
this->cached_program_command_sequences[program.id].preamble_command_sequence.update_cmd_sequence(
eth_l1_write_offset_offset, &eth_l1_write_offset, sizeof(uint32_t));
}
RecordProgramRun(program);

// Main Command Sequence
this->assemble_device_commands(is_cached, tensix_l1_write_offset, eth_l1_write_offset);
Expand Down
Loading

0 comments on commit 94af3ff

Please sign in to comment.