diff --git a/tt_metal/common/tt_backend_api_types.hpp b/tt_metal/common/tt_backend_api_types.hpp index b326bf292ea..9df35e871d0 100644 --- a/tt_metal/common/tt_backend_api_types.hpp +++ b/tt_metal/common/tt_backend_api_types.hpp @@ -156,6 +156,7 @@ inline std::ostream& operator<<(std::ostream& os, const RISCV& riscv) { case RISCV::TRISC0: os << "TRISC0"; break; case RISCV::TRISC1: os << "TRISC1"; break; case RISCV::TRISC2: os << "TRISC2"; break; + case RISCV::ERISC: os << "ERISC"; break; case RISCV::COMPUTE: os << "COMPUTE"; break; default: throw std::invalid_argument("Unknown format"); } diff --git a/tt_metal/impl/CMakeLists.txt b/tt_metal/impl/CMakeLists.txt index 396e6d9b5a1..9a5ce41afba 100644 --- a/tt_metal/impl/CMakeLists.txt +++ b/tt_metal/impl/CMakeLists.txt @@ -16,6 +16,7 @@ set(IMPL_SRC ${CMAKE_CURRENT_SOURCE_DIR}/dispatch/debug_tools.cpp ${CMAKE_CURRENT_SOURCE_DIR}/dispatch/command_queue.cpp ${CMAKE_CURRENT_SOURCE_DIR}/dispatch/worker_config_buffer.cpp + ${CMAKE_CURRENT_SOURCE_DIR}/dispatch/data_collection.cpp ${CMAKE_CURRENT_SOURCE_DIR}/debug/dprint_server.cpp ${CMAKE_CURRENT_SOURCE_DIR}/debug/watcher_server.cpp ${CMAKE_CURRENT_SOURCE_DIR}/trace/trace.cpp diff --git a/tt_metal/impl/dispatch/command_queue.cpp b/tt_metal/impl/dispatch/command_queue.cpp index d980bbb2314..a6c2cd2100a 100644 --- a/tt_metal/impl/dispatch/command_queue.cpp +++ b/tt_metal/impl/dispatch/command_queue.cpp @@ -26,6 +26,7 @@ #include "tt_metal/impl/debug/dprint_server.hpp" #include "tt_metal/impl/debug/watcher_server.hpp" #include "tt_metal/impl/dispatch/cq_commands.hpp" +#include "tt_metal/impl/dispatch/data_collection.hpp" #include "tt_metal/impl/dispatch/dispatch_core_manager.hpp" #include "tt_metal/third_party/umd/device/tt_xy_pair.h" @@ -567,6 +568,11 @@ void EnqueueProgramCommand::assemble_runtime_args_commands() { false, core_type == CoreType::WORKER ? 
DISPATCH_WRITE_OFFSET_TENSIX_L1_CONFIG_BASE : DISPATCH_WRITE_OFFSET_ETH_L1_CONFIG_BASE); + for (auto &data_per_kernel : unique_rt_data_and_sizes) { + for (auto &data_and_sizes : data_per_kernel) { + RecordDispatchData(program, DISPATCH_DATA_RTARGS, std::get<1>(data_and_sizes)); + } + } unique_sub_cmds.clear(); unique_rt_data_and_sizes.clear(); unique_rt_args_data.clear(); @@ -648,6 +654,11 @@ void EnqueueProgramCommand::assemble_runtime_args_commands() { common_sub_cmds); } + for (auto& data_per_kernel : common_rt_data_and_sizes) { + for (auto& data_and_sizes : data_per_kernel) { + RecordDispatchData(program, DISPATCH_DATA_RTARGS, std::get<1>(data_and_sizes)); + } + } common_rt_data_and_sizes.clear(); common_rt_args_data.clear(); } @@ -841,6 +852,11 @@ void EnqueueProgramCommand::assemble_device_commands( noc_encoding, // noc_xy_addr kg_transfer_info.dst_base_addrs[kernel_idx], kg_transfer_info.lengths[kernel_idx]); + RecordDispatchData( + program, + DISPATCH_DATA_BINARY, + kg_transfer_info.lengths[kernel_idx], + kg_transfer_info.riscvs[kernel_idx]); // Difference between prefetch total relayed pages and dispatch write linear uint32_t relayed_bytes = align(kg_transfer_info.lengths[kernel_idx], HostMemDeviceCommand::PROGRAM_PAGE_SIZE); @@ -893,6 +909,8 @@ void EnqueueProgramCommand::assemble_device_commands( .addr = dst_addr, .length = (uint16_t)write_length, .num_mcast_dests = (uint16_t)num_mcast_dests}); + RecordDispatchData( + program, DISPATCH_DATA_BINARY, write_length, kg_transfer_info.riscvs[kernel_idx]); dst_addr += write_length; kernel_bins_prefetch_subcmds.back().emplace_back(CQPrefetchRelayPagedPackedSubCmd{ @@ -1007,6 +1025,9 @@ void EnqueueProgramCommand::assemble_device_commands( this->packed_write_max_unicast_sub_cmds, curr_sub_cmd_idx); curr_sub_cmd_idx += num_sub_cmds_in_cmd; + for (auto &data_and_size : multicast_sem_data[i]) { + RecordDispatchData(program, DISPATCH_DATA_SEMAPHORE, data_and_size.second); + } } } @@ -1024,6 +1045,9 @@ void 
EnqueueProgramCommand::assemble_device_commands( this->packed_write_max_unicast_sub_cmds, curr_sub_cmd_idx); curr_sub_cmd_idx += num_sub_cmds_in_cmd; + for (auto &data_and_size : unicast_sem_data[i]) { + RecordDispatchData(program, DISPATCH_DATA_SEMAPHORE, data_and_size.second); + } } } @@ -1043,7 +1067,11 @@ void EnqueueProgramCommand::assemble_device_commands( multicast_cb_config_data, this->packed_write_max_unicast_sub_cmds, curr_sub_cmd_idx); + for (auto &data_and_size : multicast_cb_config_data) { + RecordDispatchData(program, DISPATCH_DATA_CB_CONFIG, data_and_size.second); + } curr_sub_cmd_idx += num_sub_cmds_in_cmd; + RecordDispatchData(program, DISPATCH_DATA_CB_CONFIG, mcast_cb_payload_sizeB); uint32_t curr_sub_cmd_data_offset_words = (write_offset_bytes + (sizeof(CQPrefetchCmd) + sizeof(CQDispatchCmd)) + align(num_sub_cmds_in_cmd * sizeof(CQDispatchWritePackedMulticastSubCmd), L1_ALIGNMENT)) / @@ -1190,6 +1218,11 @@ void EnqueueProgramCommand::process() { this->assemble_stall_commands(true); // Runtime Args Command Sequence this->assemble_runtime_args_commands(); + + // Record kernel groups in this program, only need to do it once. 
// Running statistics over a set of dispatch writes: the largest and smallest transaction size seen,
// how many writes were recorded, and the total number of bytes written.
class DispatchStats {
public:
    uint32_t max_transaction_size = 0;
    // Starts at the largest representable value so the first recorded size always replaces it.
    uint32_t min_transaction_size = UINT32_MAX;
    uint32_t num_writes = 0;
    uint64_t total_write_size = 0;

    // Fold pre-aggregated values into this object.
    //   max_transaction_size / min_transaction_size - extremes of the batch being merged in.
    //   num_writes       - number of writes in the batch.
    //   total_write_size - sum of the sizes of all writes in the batch.
    void Update(
        uint32_t max_transaction_size, uint32_t min_transaction_size, uint32_t num_writes, uint64_t total_write_size) {
        this->max_transaction_size = std::max(this->max_transaction_size, max_transaction_size);
        this->min_transaction_size = std::min(this->min_transaction_size, min_transaction_size);
        this->num_writes += num_writes;
        this->total_write_size += total_write_size;
    }

    // Record `transaction_count` writes that all have size `transaction_size`.
    void Update(uint32_t transaction_size, uint32_t transaction_count) {
        // Widen before multiplying: both operands are 32-bit, so the product would otherwise wrap
        // at 2^32 before being converted to the 64-bit total.
        const uint64_t batch_bytes = static_cast<uint64_t>(transaction_count) * transaction_size;
        Update(transaction_size, transaction_size, transaction_count, batch_bytes);
    }

    // Merge another stats object into this one; `other` is not modified.
    void Update(const DispatchStats &other) {
        Update(other.max_transaction_size, other.min_transaction_size, other.num_writes, other.total_write_size);
    }

    // Write the four summary values followed by every "size:count" pair of `raw_data` to `outfile`.
    // `raw_data` may be any container of pairs (transaction size -> count); it is only read.
    template <typename SizeToCountMap>
    void Dump(std::ostream &outfile, const SizeToCountMap &raw_data) const {
        outfile << "\t\tmax_transaction_size = " << max_transaction_size << "\n";
        outfile << "\t\tmin_transaction_size = " << min_transaction_size << "\n";
        outfile << "\t\tnum_writes = " << num_writes << "\n";
        outfile << "\t\ttotal_write_size = " << total_write_size << "\n";
        outfile << "\t\ttransaction_counts = [";
        for (const auto &size_and_count : raw_data) {
            outfile << size_and_count.first << ":" << size_and_count.second << " ";
        }
        outfile << "]\n";
    }
};
other than binaries, just print once + if (type == DISPATCH_DATA_BINARY) + outfile << "\t Overall binaries data:\n"; + total_stats.Dump(outfile, total_data); + } + +private: + map> data; // RISCV -> transaction size -> count + data_collector_t type; +}; + +// Class to manage & dump dispatch data for each program +class DataCollector { +public: + // Single instance of the data collector + static DataCollector *inst; + + DataCollector() { + TT_ASSERT(inst == nullptr); + inst = this; + }; + ~DataCollector() { + inst = nullptr; + }; + + void RecordData(Program &program, data_collector_t type, uint32_t transaction_size, RISCV riscv); + void RecordKernelGroups(Program &program, CoreType core_type, vector &kernel_groups); + void RecordProgramRun(Program &program); + void DumpData(); + +private: + map> program_id_to_dispatch_data; + map>>> program_id_to_kernel_groups; + map program_id_to_call_count; +}; + +void DataCollector::RecordData(Program &program, data_collector_t type, uint32_t transaction_size, RISCV riscv) { + uint64_t program_id = program.get_id(); + if (program_id_to_dispatch_data.count(program_id) == 0) { + // If no existing data for this program, initialize starting values. + program_id_to_dispatch_data[program_id] = vector(); + for (int idx = 0; idx < DISPATCH_DATA_COUNT; idx++) { + data_collector_t curr_type = static_cast(idx); + DispatchData data(curr_type); + program_id_to_dispatch_data[program_id].push_back(data); + } + } + + program_id_to_dispatch_data[program_id].at(type).Update(transaction_size, riscv); +} + +void DataCollector::RecordKernelGroups(Program &program, CoreType core_type, vector &kernel_groups) { + uint64_t program_id = program.get_id(); + // Make a copy of relevant info, since user may destroy program before we dump. 
+ for (KernelGroup &kernel_group : kernel_groups) { + kernel_id_array_t watcher_kernel_ids; + for (int idx = 0; idx < kernel_group.kernel_ids.size(); idx++) { + if (kernel_group.kernel_ids[idx]) { + watcher_kernel_ids[idx] = program.get_kernel(*kernel_group.kernel_ids[idx])->get_watcher_kernel_id(); + } + } + program_id_to_kernel_groups[program_id][core_type].push_back({watcher_kernel_ids, kernel_group.core_ranges}); + } +} + +void DataCollector::RecordProgramRun(Program &program) { + uint64_t program_id = program.get_id(); + program_id_to_call_count[program_id]++; +} + +string DispatchClassToString(enum dispatch_core_processor_classes proc_class, CoreType core_type) { + switch (core_type) { + case CoreType::WORKER: + switch (proc_class) { + case DISPATCH_CLASS_TENSIX_DM0: + return "brisc:"; + case DISPATCH_CLASS_TENSIX_DM1: + return "ncrisc:"; + case DISPATCH_CLASS_TENSIX_COMPUTE: + return "trisc:"; + default: + return ""; + } + case CoreType::ETH: + if (proc_class == DISPATCH_CLASS_ETH_DM0) + return "erisc:"; + else + return ""; + default: + TT_FATAL("Incompatible core type: {}", core_type); + } + return ""; +} + +void DataCollector::DumpData() { + std::ofstream outfile = std::ofstream("dispatch_data.txt"); + + // Extra DispatchData objects to collect data across programs + vector cross_program_data; + for (int idx = 0; idx < DISPATCH_DATA_COUNT; idx++) { + cross_program_data.push_back(new DispatchData(idx)); + } + + // Go through all programs, and dump relevant data + for (auto &id_and_data : program_id_to_dispatch_data) { + uint64_t program_id = id_and_data.first; + outfile << fmt::format("Program {}: Ran {} time(s).\n", program_id, program_id_to_call_count[program_id]); + + // Dump kernel ids for each kernel group in this program + for (auto &core_type_and_kernel_groups : program_id_to_kernel_groups[program_id]) { + CoreType core_type = core_type_and_kernel_groups.first; + vector> &kernel_groups = core_type_and_kernel_groups.second; + outfile << 
fmt::format("\t{} Kernel Groups: {}\n", core_type, kernel_groups.size()); + for (auto &ids_and_ranges : kernel_groups) { + // Dump kernel ids in this group + outfile << "\t\t{"; + for (int i = 0; i < DISPATCH_CLASS_MAX; i++) { + outfile << DispatchClassToString(static_cast(i), core_type); + if (ids_and_ranges.first[i]) { + outfile << *ids_and_ranges.first[i]; + } + outfile << " "; + } + outfile << "} on cores "; + + // Dump the cores this kernel group contains + outfile << ids_and_ranges.second.str() << "\n"; + } + } + + // Dump dispatch write stats + for (int type_int = 0; type_int != DISPATCH_DATA_COUNT; type_int++) { + DispatchData &data = id_and_data.second.at(type_int); + cross_program_data[type_int]->Merge(data); + data.DumpStats(outfile); + } + } + + // Dump cross-program stats + outfile << "Cross-Program Data:\n"; + for (int type_int = 0; type_int != DISPATCH_DATA_COUNT; type_int++) { + cross_program_data[type_int]->DumpStats(outfile); + delete cross_program_data[type_int]; + } + outfile.close(); +} + +DataCollector* DataCollector::inst = nullptr; + +void DumpDispatchDataAndClose() { + DataCollector::inst->DumpData(); + delete DataCollector::inst; +} + +// Helper function to init the data collector if it isn't already up. +void InitDataCollector() { + if (DataCollector::inst == nullptr) { + new DataCollector(); + std::atexit(DumpDispatchDataAndClose); + } +} + +} // end anon namespae + +namespace tt { + +void RecordDispatchData(Program &program, data_collector_t type, uint32_t transaction_size, RISCV riscv) { + // Do nothing if we're not enabling data collection. + if (!tt::llrt::OptionsG.get_dispatch_data_collection_enabled()) + return; + + InitDataCollector(); + DataCollector::inst->RecordData(program, type, transaction_size, riscv); +} + +void RecordKernelGroups(Program &program, CoreType core_type, vector &kernel_groups) { + // Do nothing if we're not enabling data collection. 
+ if (!tt::llrt::OptionsG.get_dispatch_data_collection_enabled()) + return; + + InitDataCollector(); + DataCollector::inst->RecordKernelGroups(program, core_type, kernel_groups); +} + +void RecordProgramRun(Program &program) { + // Do nothing if we're not enabling data collection. + if (!tt::llrt::OptionsG.get_dispatch_data_collection_enabled()) + return; + + InitDataCollector(); + DataCollector::inst->RecordProgramRun(program); +} + +} // end namepsace tt diff --git a/tt_metal/impl/dispatch/data_collection.hpp b/tt_metal/impl/dispatch/data_collection.hpp new file mode 100644 index 00000000000..7377b6d592a --- /dev/null +++ b/tt_metal/impl/dispatch/data_collection.hpp @@ -0,0 +1,37 @@ +// SPDX-FileCopyrightText: © 2024 Tenstorrent Inc. +// +// SPDX-License-Identifier: Apache-2.0 + +#include "tt_metal/host_api.hpp" +#include "tt_metal/impl/device/device.hpp" +#include "command_queue_interface.hpp" + +namespace tt { + +typedef enum e_data_collector_t { + DISPATCH_DATA_CB_CONFIG, + DISPATCH_DATA_SEMAPHORE, + DISPATCH_DATA_RTARGS, + DISPATCH_DATA_BINARY, + DISPATCH_DATA_COUNT +} data_collector_t; + +/* Record a single dispatch write, to be dumped with stats on program exit. Should only be called once per transaction + * per program (if a program is enqueued multiple times, don't call this multiple times). + * + * Arguments: + * program - program this transaction is part of. + * type - what type of transaction this counts as, one of data_collector_t. + * transaction_size - size in bytes of this transaction. + * riscv - riscv core that this transaction is used for, only relevant for DISPATCH_DATA_BINARY transactions. + */ +void RecordDispatchData(Program &program, data_collector_t type, uint32_t transaction_size, RISCV riscv = RISCV::MAX); + +// Record the KernelGroups present in this program (per core type). Should only be called per program created, not +// program enqueued. 
+void RecordKernelGroups(Program &program, CoreType core_type, std::vector &kernel_groups); + +// Update stats with an enqueue of given program. +void RecordProgramRun(Program &program); + +} // end namespace tt diff --git a/tt_metal/impl/program/program.cpp b/tt_metal/impl/program/program.cpp index 11d500e680b..9901152963c 100644 --- a/tt_metal/impl/program/program.cpp +++ b/tt_metal/impl/program/program.cpp @@ -124,7 +124,7 @@ KernelGroup::KernelGroup() : core_ranges({}) {} KernelGroup::KernelGroup( const Program &program, CoreType core_type, - std::array, DISPATCH_CLASS_MAX> kernel_ids, + kernel_id_array_t kernel_ids, bool erisc_is_idle, int last_cb_index, const CoreRangeSet &new_ranges) : @@ -193,7 +193,7 @@ KernelGroup *Program::kernels_on_core(const CoreCoord &core, const CoreType &cor struct KernelGroupInt { bool valid; - std::array, DISPATCH_CLASS_MAX> kernel_ids; + kernel_id_array_t kernel_ids; bool operator==(const KernelGroupInt &b) const; void update(dispatch_core_processor_classes proc_class, size_t kernel_idx) { @@ -645,6 +645,7 @@ void Program::populate_dispatch_data(Device *device) { std::vector dst_base_addrs; std::vector page_offsets; std::vector lengths; + std::vector riscvs; uint32_t transfer_info_index = 0; for (size_t sub_kernel_index = 0; sub_kernel_index < binaries.size(); ++sub_kernel_index) { @@ -653,6 +654,7 @@ void Program::populate_dispatch_data(Device *device) { dst_base_addrs.resize(dst_base_addrs.size() + num_spans); page_offsets.resize(page_offsets.size() + num_spans); lengths.resize(lengths.size() + num_spans); + riscvs.resize(riscvs.size() + num_spans); kernel_bin.process_spans([&](vector::const_iterator mem_ptr, uint64_t dst, uint32_t len) { uint64_t relo_addr = @@ -662,6 +664,7 @@ void Program::populate_dispatch_data(Device *device) { page_offsets[transfer_info_index] = binaries_data.size() * sizeof(uint32_t) / HostMemDeviceCommand::PROGRAM_PAGE_SIZE; lengths[transfer_info_index] = len * sizeof(uint32_t); + 
riscvs[transfer_info_index] = sub_kernels[sub_kernel_index]; binaries_data.insert(binaries_data.end(), mem_ptr, mem_ptr + len); binaries_data.resize( @@ -670,7 +673,7 @@ void Program::populate_dispatch_data(Device *device) { }); } kernel_bins_transfer_info kb_transfer_info = { - .dst_base_addrs = dst_base_addrs, .page_offsets = page_offsets, .lengths = lengths}; + .dst_base_addrs = dst_base_addrs, .page_offsets = page_offsets, .lengths = lengths, .riscvs = riscvs}; kernel_transfer_info.insert({kernel_id, kb_transfer_info}); } } diff --git a/tt_metal/impl/program/program.hpp b/tt_metal/impl/program/program.hpp index 9524955c635..3643e6fabad 100644 --- a/tt_metal/impl/program/program.hpp +++ b/tt_metal/impl/program/program.hpp @@ -32,10 +32,12 @@ namespace detail{ void AddConfigBuffer(Program &program, std::shared_ptr config_buffer); } +typedef std::array, DISPATCH_CLASS_MAX> kernel_id_array_t; + struct KernelGroup { CoreType core_type; CoreRangeSet core_ranges; - std::array, DISPATCH_CLASS_MAX> kernel_ids; + kernel_id_array_t kernel_ids; uint32_t rta_sizes[DISPATCH_CLASS_MAX]; uint32_t total_rta_size; launch_msg_t launch_msg; @@ -44,7 +46,7 @@ struct KernelGroup { KernelGroup( const Program &program, CoreType core_type, - std::array, DISPATCH_CLASS_MAX> kernel_ids, + kernel_id_array_t kernel_ids, bool erisc_is_idle, int last_cb_index, const CoreRangeSet &new_ranges); @@ -143,6 +145,7 @@ class Program { bool is_finalized() const { return this->finalized_; } void finalize(); + std::shared_ptr get_kernel(KernelHandle kernel_id) const; void capture_multi_device_dependencies() { capture_multi_device_dependencies_ = true; } bool has_multi_device_dependencies() { return capture_multi_device_dependencies_; } @@ -221,7 +224,6 @@ class Program { friend uint32_t CreateSemaphore(Program &program, const std::variant &core_spec, uint32_t initial_value, CoreType core_type); KernelHandle add_kernel(std::shared_ptr kernel, const CoreType &core_type); - std::shared_ptr 
get_kernel(KernelHandle kernel_id) const; CBHandle add_circular_buffer(const CoreRangeSet &core_range_set, const CircularBufferConfig &config); std::shared_ptr get_circular_buffer(CBHandle cb_id) const; diff --git a/tt_metal/impl/program/program_device_map.hpp b/tt_metal/impl/program/program_device_map.hpp index 444c64bd0a5..7f75ddd4a8e 100644 --- a/tt_metal/impl/program/program_device_map.hpp +++ b/tt_metal/impl/program/program_device_map.hpp @@ -24,6 +24,7 @@ struct kernel_bins_transfer_info { std::vector dst_base_addrs; // BRISC, NCRISC, TRISC etc.. std::vector page_offsets; // offsets into paged buffer in DRAM std::vector lengths; // WriteLinear lengths + std::vector riscvs; // RISC that each span is targeted for, for binaries }; struct ProgramTransferInfo { diff --git a/tt_metal/llrt/rtoptions.cpp b/tt_metal/llrt/rtoptions.cpp index c4441e12dc2..d05ec3a7ddf 100644 --- a/tt_metal/llrt/rtoptions.cpp +++ b/tt_metal/llrt/rtoptions.cpp @@ -92,6 +92,11 @@ RunTimeOptions::RunTimeOptions() { TT_THROW("Invalid TT_METAL_GTEST_NUM_HW_CQS: {}", num_cqs); } } + + const char *dispatch_data_collection_str = std::getenv("TT_METAL_DISPATCH_DATA_COLLECTION"); + if (dispatch_data_collection_str != nullptr) { + enable_dispatch_data_collection = true; + } } const std::string &RunTimeOptions::get_root_dir() { diff --git a/tt_metal/llrt/rtoptions.hpp b/tt_metal/llrt/rtoptions.hpp index 2c8f0cea877..db652fdf3e9 100644 --- a/tt_metal/llrt/rtoptions.hpp +++ b/tt_metal/llrt/rtoptions.hpp @@ -99,6 +99,8 @@ class RunTimeOptions { bool validate_kernel_binaries = false; unsigned num_hw_cqs = 1; + bool enable_dispatch_data_collection = false; + public: RunTimeOptions(); @@ -239,7 +241,10 @@ class RunTimeOptions { inline void set_num_hw_cqs(unsigned num) { num_hw_cqs = num; } inline uint32_t get_watcher_debug_delay() { return watcher_debug_delay; } - void set_watcher_debug_delay(uint32_t delay) { watcher_debug_delay = delay; } + inline void set_watcher_debug_delay(uint32_t delay) { 
watcher_debug_delay = delay; } + + inline bool get_dispatch_data_collection_enabled() { return enable_dispatch_data_collection; } + inline void set_dispatch_data_collection_enabled(bool enable) { enable_dispatch_data_collection = enable; } private: // Helper functions to parse feature-specific environment vaiables.