diff --git a/tests/tt_metal/tt_metal/perf_microbenchmark/dispatch/test_dispatcher.cpp b/tests/tt_metal/tt_metal/perf_microbenchmark/dispatch/test_dispatcher.cpp index 145b95f8d335..1ffdd977c754 100644 --- a/tests/tt_metal/tt_metal/perf_microbenchmark/dispatch/test_dispatcher.cpp +++ b/tests/tt_metal/tt_metal/perf_microbenchmark/dispatch/test_dispatcher.cpp @@ -469,18 +469,13 @@ int main(int argc, char **argv) { 0, // prefetch_local_downstream_sem_addr 0, // prefetch_downstream_buffer_pages num_compute_cores, // max_write_packed_cores - true, // is_dram_variant - true, // is_host_variant - 0, - 0, 0, 0, 0, 0, 0, - 0, - 0, - 0 + true, // is_dram_variant + true, // is_host_variant }; std::vector spoof_prefetch_compile_args = {l1_buf_base, diff --git a/tests/tt_metal/tt_metal/perf_microbenchmark/dispatch/test_prefetcher.cpp b/tests/tt_metal/tt_metal/perf_microbenchmark/dispatch/test_prefetcher.cpp index a84a8204f804..3481745c2ee6 100644 --- a/tests/tt_metal/tt_metal/perf_microbenchmark/dispatch/test_prefetcher.cpp +++ b/tests/tt_metal/tt_metal/perf_microbenchmark/dispatch/test_prefetcher.cpp @@ -1640,13 +1640,11 @@ void configure_for_single_chip(Device *device, prefetch_downstream_cb_sem, // prefetch_d only dispatch_constants::PREFETCH_D_BUFFER_LOG_PAGE_SIZE, dispatch_constants::PREFETCH_D_BUFFER_BLOCKS, // prefetch_d only - true, - true, - 0, - 0, - 0, - 0, - 0 + 0, // unused: for prefetch_hd <--> dispatch_hd + 0, // unused: for prefetch_hd <--> dispatch_hd + 0, // unused: for prefetch_hd <--> dispatch_hd + 0, // unused: for prefetch_hd <--> dispatch_hd + 0 // unused: for prefetch_hd <--> dispatch_hd }; constexpr NOC my_noc_index = NOC::NOC_0; @@ -1665,8 +1663,6 @@ void configure_for_single_chip(Device *device, prefetch_compile_args[11] = prefetch_d_buffer_base; prefetch_compile_args[12] = prefetch_d_buffer_pages * (1 << dispatch_constants::PREFETCH_D_BUFFER_LOG_PAGE_SIZE); prefetch_compile_args[13] = scratch_db_base; - prefetch_compile_args[21] = true; - prefetch_compile_args[22] = false; CoreCoord phys_prefetch_d_upstream_core = packetized_path_en_g ? phys_prefetch_relay_demux_core : phys_prefetch_core_g; configure_kernel_variant(program, @@ -1690,8 +1686,6 @@ void configure_for_single_chip(Device *device, prefetch_compile_args[11] = cmddat_q_base; prefetch_compile_args[12] = cmddat_q_size_g; prefetch_compile_args[13] = 0; - prefetch_compile_args[21] = false; - prefetch_compile_args[22] = true; CoreCoord phys_prefetch_h_downstream_core = packetized_path_en_g ? phys_prefetch_relay_mux_core : phys_prefetch_d_core; configure_kernel_variant(program, @@ -1899,18 +1893,11 @@ void configure_for_single_chip(Device *device, prefetch_downstream_cb_sem, prefetch_downstream_buffer_pages, num_compute_cores, // max_write_packed_cores - true, - true, 0, 0, 0, 0, 0, - 0, - 0, - 0, - 0, - 0 }; CoreCoord phys_upstream_from_dispatch_core = split_prefetcher_g ? phys_prefetch_d_core : phys_prefetch_core_g; @@ -1924,8 +1911,6 @@ void configure_for_single_chip(Device *device, dispatch_compile_args[12] = dispatch_downstream_cb_sem; dispatch_compile_args[13] = dispatch_h_cb_sem; dispatch_compile_args[14] = dispatch_d_preamble_size; - dispatch_compile_args[20] = true; - dispatch_compile_args[21] = false; CoreCoord phys_dispatch_d_downstream_core = packetized_path_en_g ? phys_dispatch_relay_mux_core : phys_dispatch_h_core; configure_kernel_variant(program, @@ -1946,8 +1931,6 @@ void configure_for_single_chip(Device *device, dispatch_compile_args[12] = dispatch_h_cb_sem; dispatch_compile_args[13] = dispatch_downstream_cb_sem; dispatch_compile_args[14] = 0; // preamble size - dispatch_compile_args[20] = false; - dispatch_compile_args[21] = true; CoreCoord phys_dispatch_h_upstream_core = packetized_path_en_g ? phys_dispatch_relay_demux_core : phys_dispatch_core; configure_kernel_variant(program, @@ -2289,6 +2272,11 @@ void configure_for_multi_chip(Device *device, prefetch_downstream_cb_sem, // prefetch_d only dispatch_constants::PREFETCH_D_BUFFER_LOG_PAGE_SIZE, dispatch_constants::PREFETCH_D_BUFFER_BLOCKS, // prefetch_d only + 0, // unused: for prefetch_d <--> dispatch_d + 0, // unused: for prefetch_d <--> dispatch_d + 0, // unused: for prefetch_d <--> dispatch_d + 0, // unused: for prefetch_d <--> dispatch_d + 0 // unused: for prefetch_d <--> dispatch_d }; constexpr NOC my_noc_index = NOC::NOC_0; @@ -2623,7 +2611,12 @@ void configure_for_multi_chip(Device *device, NOC_XY_ENCODING(phys_prefetch_core_g.x, phys_prefetch_core_g.y), prefetch_downstream_cb_sem, prefetch_downstream_buffer_pages, - num_compute_cores + num_compute_cores, + 0, + 0, + 0, + 0, + 0, }; CoreCoord phys_upstream_from_dispatch_core = split_prefetcher_g ? phys_prefetch_d_core : phys_prefetch_core_g; diff --git a/tests/ttnn/unit_tests/test_single_device_events.py b/tests/ttnn/unit_tests/test_single_device_events.py index 3955d16d8a18..866a66204bd3 100644 --- a/tests/ttnn/unit_tests/test_single_device_events.py +++ b/tests/ttnn/unit_tests/test_single_device_events.py @@ -14,7 +14,7 @@ @pytest.mark.parametrize("shape", [(3, 1, 512, 512)]) -@pytest.mark.parametrize("device_params", [{"num_command_queues": 2}], indirect=True) +@pytest.mark.parametrize("device_params", [{"num_command_queues": 1}], indirect=True) def test_single_device_events(device, shape): pytest.skip("Needs Eth dispatch to run on WH") # Enable Program Cache and Async Mode diff --git a/tt_metal/common/core_descriptor.hpp b/tt_metal/common/core_descriptor.hpp index a7c50e77889a..964a37cb5c4a 100644 --- a/tt_metal/common/core_descriptor.hpp +++ b/tt_metal/common/core_descriptor.hpp @@ -154,4 +154,17 @@ inline CoreCoord get_physical_core_coordinate(const tt_cxy_pair &logical_locatio return soc_desc.get_physical_core_from_logical_core(CoreCoord(logical_location.x, logical_location.y), core_type); } +inline std::tuple get_physical_worker_grid_config(chip_id_t chip, uint8_t num_hw_cqs, CoreType dispatch_core_type) { + // Get logical compute grid dimensions and num workers + auto worker_grid = tt::get_compute_grid_size(chip, num_hw_cqs, dispatch_core_type); + std::size_t tensix_num_worker_cols = worker_grid.x; + std::size_t tensix_num_worker_rows = worker_grid.y; + uint32_t tensix_num_worker_cores = tensix_num_worker_cols * tensix_num_worker_rows; + const metal_SocDescriptor &soc_desc = tt::Cluster::instance().get_soc_desc(chip); + // Get physical compute grid range based on SOC Desc and Logical Coords + CoreCoord tensix_worker_start_phys = soc_desc.get_physical_core_from_logical_core(CoreCoord(0, 0), CoreType::WORKER); // Logical Worker Coords start at 0,0 + CoreCoord tensix_worker_end_phys = soc_desc.get_physical_core_from_logical_core(CoreCoord(tensix_num_worker_cols - 1, tensix_num_worker_rows - 1), CoreType::WORKER); + CoreRange tensix_worker_physical_grid = CoreRange(tensix_worker_start_phys, tensix_worker_end_phys); + return std::make_tuple(tensix_num_worker_cores, tensix_worker_physical_grid); +} } // namespace tt diff --git a/tt_metal/impl/device/device.cpp b/tt_metal/impl/device/device.cpp index 978ecfbec935..5064400ef548 100644 --- a/tt_metal/impl/device/device.cpp +++ b/tt_metal/impl/device/device.cpp @@ -714,13 +714,13 @@ void Device::update_workers_build_settings(std::vectorget_noc_unicast_encoding(NOC::NOC_0, dispatch_s_settings.worker_physical_core); + compile_args[21] = dispatch_s_buffer_base; + compile_args[22] = prefetch_d_settings.consumer_slave_semaphore_id; // Semaphore on prefetch to handshake with dispatch_s + compile_args[23] = dispatch_s_settings.producer_semaphore_id; // Semaphore on dispatch_s to handshake with prefetch + compile_args[24] = dispatch_constants::get(dispatch_core_type).dispatch_s_buffer_size(); + compile_args[25] = this->get_noc_unicast_encoding(NOC::NOC_0, dispatch_s_settings.worker_physical_core); + compile_args[26] = true; // is_dram_variant + compile_args[27] = false; // is_host_variant prefetch_d_idx++; // move on to next prefetcher } break; @@ -1169,13 +1170,18 @@ void Device::update_workers_build_settings(std::vector(device_worker_variants[DispatchWorkerType::PREFETCH_D][dispatch_d_idx]); // 1 to 1 mapping bw prefetch_d and dispatch_d auto dispatch_s_settings = std::get<1>(device_worker_variants[DispatchWorkerType::DISPATCH_S][dispatch_d_idx]); // 1 to 1 mapping bw dispatch_s and dispatch_d auto dispatch_core_type = dispatch_d_settings.dispatch_core_type; + // The NOC encoding for this should be passed through the settings + auto chip_id = core.chip; + auto [tensix_num_worker_cores, tensix_worker_physical_grid] = get_physical_worker_grid_config(chip_id, num_hw_cqs, dispatch_core_type); dispatch_d_settings.upstream_cores.push_back(prefetch_d_settings.worker_physical_core); dispatch_d_settings.downstream_cores.push_back(mux_d_settings.worker_physical_core); - dispatch_d_settings.compile_args.resize(35); + dispatch_d_settings.compile_args.resize(27); auto& compile_args = dispatch_d_settings.compile_args; compile_args[0] = dispatch_d_settings.cb_start_address; compile_args[1] = dispatch_d_settings.cb_log_page_size; @@ -1197,15 +1203,13 @@ void Device::update_workers_build_settings(std::vectorget_noc_unicast_encoding(NOC::NOC_0, dispatch_s_settings.worker_physical_core); + compile_args[20] = dispatch_s_settings.consumer_semaphore_id; + compile_args[21] = this->get_noc_multicast_encoding(NOC::NOC_0, tensix_worker_physical_grid); + compile_args[22] = tensix_worker_go_signal_addr; + compile_args[23] = eth_worker_go_signal_addr; + compile_args[24] = this->get_noc_unicast_encoding(NOC::NOC_0, dispatch_s_settings.worker_physical_core); + compile_args[25] = true; // is_dram_variant + compile_args[26] = false; // is_host_variant dispatch_d_idx++; // move on to next dispatcher } break; @@ -1223,15 +1227,8 @@ void Device::update_workers_build_settings(std::vectorget_noc_unicast_encoding(my_noc_index, dispatch_s_physical_core) + this->get_noc_unicast_encoding(my_noc_index, dispatch_s_physical_core), + true, // is_dram_variant + true // is_host_variant }; configure_kernel_variant( @@ -1755,12 +1752,7 @@ void Device::compile_command_queue_programs() { tt::tt_metal::CreateSemaphore(*command_queue_program_ptr, prefetch_core, dispatch_constants::get(dispatch_core_type).dispatch_buffer_pages(), dispatch_core_type); // prefetch_sem tt::tt_metal::CreateSemaphore(*command_queue_program_ptr, prefetch_core, dispatch_constants::get(dispatch_core_type).dispatch_s_buffer_pages(), dispatch_core_type); // sync with dispatch_s - std::size_t tensix_num_worker_cols = this->compute_with_storage_grid_size().x; - std::size_t tensix_num_worker_rows = this->compute_with_storage_grid_size().y; - uint32_t tensix_num_worker_cores = tensix_num_worker_cols * tensix_num_worker_rows; - CoreCoord tensix_worker_start_phys = this->physical_core_from_logical_core(CoreCoord(0, 0), CoreType::WORKER); // Logical Worker Coords start at 0,0 - CoreCoord tensix_worker_end_phys = this->physical_core_from_logical_core(CoreCoord(tensix_num_worker_cols - 1, tensix_num_worker_rows - 1), CoreType::WORKER); - CoreRange tensix_worker_physical_grid = CoreRange(tensix_worker_start_phys, tensix_worker_end_phys); + auto [tensix_num_worker_cores, tensix_worker_physical_grid] = get_physical_worker_grid_config(this->id(), num_hw_cqs, dispatch_core_type); uint32_t tensix_worker_go_signal_addr = hal.get_dev_addr(HalProgrammableCoreType::TENSIX, HalMemAddrType::LAUNCH) + sizeof(launch_msg_t) * launch_msg_buffer_num_entries; uint32_t eth_worker_go_signal_addr = 0; if (hal.get_programmable_core_type_index(HalProgrammableCoreType::ACTIVE_ETH) != -1) { @@ -1787,21 +1779,13 @@ void Device::compile_command_queue_programs() { 0, // unused prefetch_local_downstream_sem_addr 0, // unused prefetch_downstream_buffer_pages num_compute_cores, // max_write_packed_cores + dispatch_s_sync_sem_id, // used to notify dispatch_s that its safe to send a go signal + this->get_noc_multicast_encoding(NOC::NOC_0, tensix_worker_physical_grid), // used by dispatch_d to mcast go signals when dispatch_s is not enabled + tensix_worker_go_signal_addr, // used by dispatch_d to mcast go signals when dispatch_s is not enabled + eth_worker_go_signal_addr, // used by dispatch_d to mcast go signals when dispatch_s is not enabled + this->get_noc_unicast_encoding(my_noc_index, dispatch_s_physical_core), // dispatch_s core coords (should migrate to CTA) true, // is_dram_variant true, // is_host_variant - dispatch_s_buffer_base, - dispatch_s_sem, - prefetch_dispatch_s_sync_sem, - dispatch_s_sync_sem_id, - this->get_noc_multicast_encoding(NOC::NOC_0, tensix_worker_physical_grid), - tensix_num_worker_cores, - tensix_worker_go_signal_addr, - eth_worker_go_signal_addr, - dispatch_constants::get(dispatch_core_type).dispatch_s_buffer_size(), - this->get_noc_unicast_encoding(my_noc_index, dispatch_s_physical_core), - this->get_noc_unicast_encoding(NOC::NOC_1, dispatch_physical_core), - dispatch_core_type == CoreType::ETH, - DISPATCH_MESSAGE_ADDR }; configure_kernel_variant( @@ -2088,19 +2072,6 @@ void Device::compile_command_queue_programs() { } cq_id = 0; for (auto [dispatch_d_core, dispatch_d_settings] : device_worker_variants[DispatchWorkerType::DISPATCH_D]) { - std::size_t tensix_num_worker_cols = this->compute_with_storage_grid_size().x; - std::size_t tensix_num_worker_rows = this->compute_with_storage_grid_size().y; - uint32_t tensix_num_worker_cores = tensix_num_worker_cols * tensix_num_worker_rows; - CoreCoord tensix_worker_start_phys = this->physical_core_from_logical_core(CoreCoord(0, 0), CoreType::WORKER); // Logical Worker Coords start at 0,0 - CoreCoord tensix_worker_end_phys = this->physical_core_from_logical_core(CoreCoord(tensix_num_worker_cols - 1, tensix_num_worker_rows - 1), CoreType::WORKER); - CoreRange tensix_worker_physical_grid = CoreRange(tensix_worker_start_phys, tensix_worker_end_phys); - uint32_t tensix_worker_go_signal_addr = hal.get_dev_addr(HalProgrammableCoreType::TENSIX, HalMemAddrType::LAUNCH) + sizeof(launch_msg_t) * launch_msg_buffer_num_entries; - uint32_t eth_worker_go_signal_addr = hal.get_dev_addr(HalProgrammableCoreType::ACTIVE_ETH, HalMemAddrType::LAUNCH) + sizeof(launch_msg_t) * launch_msg_buffer_num_entries; - dispatch_d_settings.compile_args[26] = this->get_noc_multicast_encoding(NOC::NOC_0, tensix_worker_physical_grid); - dispatch_d_settings.compile_args[27] = tensix_num_worker_cores; - dispatch_d_settings.compile_args[28] = tensix_worker_go_signal_addr; - dispatch_d_settings.compile_args[29] = eth_worker_go_signal_addr; - for (auto sem : dispatch_d_settings.semaphores) { //size of semaphores vector is number of needed semaphores on the core. //Value of each vector entry is the initialization value for the semaphore. diff --git a/tt_metal/impl/dispatch/kernels/cq_dispatch.cpp b/tt_metal/impl/dispatch/kernels/cq_dispatch.cpp index ff34a73d008f..d2bbf2a2cdb4 100644 --- a/tt_metal/impl/dispatch/kernels/cq_dispatch.cpp +++ b/tt_metal/impl/dispatch/kernels/cq_dispatch.cpp @@ -42,14 +42,13 @@ constexpr uint32_t prefetch_h_noc_xy = get_compile_time_arg_val(16); constexpr uint32_t prefetch_h_local_downstream_sem_addr = get_compile_time_arg_val(17); constexpr uint32_t prefetch_h_max_credits = get_compile_time_arg_val(18); constexpr uint32_t packed_write_max_unicast_sub_cmds = get_compile_time_arg_val(19); // Number of cores in compute grid -constexpr uint32_t is_d_variant = get_compile_time_arg_val(20); -constexpr uint32_t is_h_variant = get_compile_time_arg_val(21); -constexpr uint32_t dispatch_s_sem_id = get_compile_time_arg_val(25); -constexpr uint32_t worker_mcast_grid = get_compile_time_arg_val(26); -constexpr uint32_t num_worker_cores_to_mcast = get_compile_time_arg_val(27); -constexpr uint32_t mcast_go_signal_addr = get_compile_time_arg_val(28); -constexpr uint32_t unicast_go_signal_addr = get_compile_time_arg_val(29); -constexpr uint32_t dispatch_s_noc_xy = get_compile_time_arg_val(31); // currently getting dispatch_s coords through RTAs. Migrate to CTAs. +constexpr uint32_t dispatch_s_sem_id = get_compile_time_arg_val(20); +constexpr uint32_t worker_mcast_grid = get_compile_time_arg_val(21); +constexpr uint32_t mcast_go_signal_addr = get_compile_time_arg_val(22); +constexpr uint32_t unicast_go_signal_addr = get_compile_time_arg_val(23); +constexpr uint32_t dispatch_s_noc_xy = get_compile_time_arg_val(24); // currently getting dispatch_s coords through RTAs. Migrate to CTAs. +constexpr uint32_t is_d_variant = get_compile_time_arg_val(25); +constexpr uint32_t is_h_variant = get_compile_time_arg_val(26); constexpr uint8_t upstream_noc_index = UPSTREAM_NOC_INDEX; constexpr uint32_t upstream_noc_xy = uint32_t(NOC_XY_ENCODING(UPSTREAM_NOC_X, UPSTREAM_NOC_Y)); @@ -828,7 +827,8 @@ void process_go_signal_mcast_cmd() { while (*worker_sem_addr < cmd->mcast.wait_count); if (cmd->mcast.mcast_flag & 0x1) { uint64_t dst = get_noc_addr_helper(worker_mcast_grid, mcast_go_signal_addr); - noc_async_write_multicast_one_packet((uint32_t)(&aligned_go_signal), dst, sizeof(uint32_t), num_worker_cores_to_mcast); + // packed_write_max_unicast_sub_cmds is the total number of compute cores (num_mcast_dests for this txn) + noc_async_write_multicast_one_packet((uint32_t)(&aligned_go_signal), dst, sizeof(uint32_t), packed_write_max_unicast_sub_cmds); } if (cmd->mcast.mcast_flag & 0x2) { for (int core_idx = 0; core_idx < num_unicast_cores; core_idx++) { diff --git a/tt_metal/impl/dispatch/kernels/cq_prefetch.cpp b/tt_metal/impl/dispatch/kernels/cq_prefetch.cpp index c782d964437d..2f3463cba9d4 100644 --- a/tt_metal/impl/dispatch/kernels/cq_prefetch.cpp +++ b/tt_metal/impl/dispatch/kernels/cq_prefetch.cpp @@ -45,14 +45,15 @@ constexpr uint32_t upstream_cb_sem_id = get_compile_time_arg_val(18); constexpr uint32_t cmddat_q_log_page_size = get_compile_time_arg_val(19); constexpr uint32_t cmddat_q_blocks = get_compile_time_arg_val(20); -constexpr uint32_t is_d_variant = get_compile_time_arg_val(21); -constexpr uint32_t is_h_variant = get_compile_time_arg_val(22); - -constexpr uint32_t dispatch_s_buffer_base = get_compile_time_arg_val(23); -constexpr uint32_t my_dispatch_s_cb_sem_id = get_compile_time_arg_val(24); -constexpr uint32_t downstream_dispatch_s_cb_sem_id = get_compile_time_arg_val(25); -constexpr uint32_t dispatch_s_buffer_size = get_compile_time_arg_val(26); -constexpr uint32_t dispatch_s_noc_xy = get_compile_time_arg_val(27); // currently getting dispatch_s coords through RTAs. Migrate to CTAs. +// used for prefetch_d <--> dispatch_s data path +constexpr uint32_t dispatch_s_buffer_base = get_compile_time_arg_val(21); +constexpr uint32_t my_dispatch_s_cb_sem_id = get_compile_time_arg_val(22); +constexpr uint32_t downstream_dispatch_s_cb_sem_id = get_compile_time_arg_val(23); +constexpr uint32_t dispatch_s_buffer_size = get_compile_time_arg_val(24); +constexpr uint32_t dispatch_s_noc_xy = get_compile_time_arg_val(25); // currently getting dispatch_s coords through RTAs. Migrate to CTAs. + +constexpr uint32_t is_d_variant = get_compile_time_arg_val(26); +constexpr uint32_t is_h_variant = get_compile_time_arg_val(27); constexpr uint8_t my_noc_index = NOC_INDEX; constexpr uint32_t my_noc_xy = uint32_t(NOC_XY_ENCODING(MY_NOC_X, MY_NOC_Y));