diff --git a/tests/tt_metal/tt_metal/perf_microbenchmark/dispatch/test_dispatcher.cpp b/tests/tt_metal/tt_metal/perf_microbenchmark/dispatch/test_dispatcher.cpp
index 145b95f8d335..1ffdd977c754 100644
--- a/tests/tt_metal/tt_metal/perf_microbenchmark/dispatch/test_dispatcher.cpp
+++ b/tests/tt_metal/tt_metal/perf_microbenchmark/dispatch/test_dispatcher.cpp
@@ -469,18 +469,13 @@ int main(int argc, char **argv) {
              0,    // prefetch_local_downstream_sem_addr
              0,    // prefetch_downstream_buffer_pages
              num_compute_cores, // max_write_packed_cores
-             true, // is_dram_variant
-             true, // is_host_variant
-             0,
-             0,
              0,
              0,
              0,
              0,
              0,
-             0,
-             0,
-             0
+             true, // is_dram_variant
+             true, // is_host_variant
             };
         std::vector<uint32_t> spoof_prefetch_compile_args =
             {l1_buf_base,
diff --git a/tests/tt_metal/tt_metal/perf_microbenchmark/dispatch/test_prefetcher.cpp b/tests/tt_metal/tt_metal/perf_microbenchmark/dispatch/test_prefetcher.cpp
index a84a8204f804..3481745c2ee6 100644
--- a/tests/tt_metal/tt_metal/perf_microbenchmark/dispatch/test_prefetcher.cpp
+++ b/tests/tt_metal/tt_metal/perf_microbenchmark/dispatch/test_prefetcher.cpp
@@ -1640,13 +1640,11 @@ void configure_for_single_chip(Device *device,
         prefetch_downstream_cb_sem, // prefetch_d only
         dispatch_constants::PREFETCH_D_BUFFER_LOG_PAGE_SIZE,
         dispatch_constants::PREFETCH_D_BUFFER_BLOCKS, // prefetch_d only
-        true,
-        true,
-        0,
-        0,
-        0,
-        0,
-        0
+        0, // unused: for prefetch_hd <--> dispatch_hd
+        0, // unused: for prefetch_hd <--> dispatch_hd
+        0, // unused: for prefetch_hd <--> dispatch_hd
+        0, // unused: for prefetch_hd <--> dispatch_hd
+        0  // unused: for prefetch_hd <--> dispatch_hd
     };
 
     constexpr NOC my_noc_index = NOC::NOC_0;
@@ -1665,8 +1663,6 @@ void configure_for_single_chip(Device *device,
         prefetch_compile_args[11] = prefetch_d_buffer_base;
         prefetch_compile_args[12] = prefetch_d_buffer_pages * (1 << dispatch_constants::PREFETCH_D_BUFFER_LOG_PAGE_SIZE);
         prefetch_compile_args[13] = scratch_db_base;
-        prefetch_compile_args[21] = true;
-        prefetch_compile_args[22] = false;
         CoreCoord phys_prefetch_d_upstream_core =
             packetized_path_en_g ? phys_prefetch_relay_demux_core : phys_prefetch_core_g;
         configure_kernel_variant<true, false>(program,
@@ -1690,8 +1686,6 @@ void configure_for_single_chip(Device *device,
         prefetch_compile_args[11] = cmddat_q_base;
         prefetch_compile_args[12] = cmddat_q_size_g;
         prefetch_compile_args[13] = 0;
-        prefetch_compile_args[21] = false;
-        prefetch_compile_args[22] = true;
         CoreCoord phys_prefetch_h_downstream_core =
             packetized_path_en_g ? phys_prefetch_relay_mux_core : phys_prefetch_d_core;
         configure_kernel_variant<false, true>(program,
@@ -1899,18 +1893,11 @@ void configure_for_single_chip(Device *device,
          prefetch_downstream_cb_sem,
          prefetch_downstream_buffer_pages,
          num_compute_cores, // max_write_packed_cores
-         true,
-         true,
          0,
          0,
          0,
          0,
          0,
-         0,
-         0,
-         0,
-         0,
-         0
     };
 
     CoreCoord phys_upstream_from_dispatch_core = split_prefetcher_g ? phys_prefetch_d_core : phys_prefetch_core_g;
@@ -1924,8 +1911,6 @@ void configure_for_single_chip(Device *device,
         dispatch_compile_args[12] = dispatch_downstream_cb_sem;
         dispatch_compile_args[13] = dispatch_h_cb_sem;
         dispatch_compile_args[14] = dispatch_d_preamble_size;
-        dispatch_compile_args[20] = true;
-        dispatch_compile_args[21] = false;
         CoreCoord phys_dispatch_d_downstream_core =
             packetized_path_en_g ? phys_dispatch_relay_mux_core : phys_dispatch_h_core;
         configure_kernel_variant<true, false>(program,
@@ -1946,8 +1931,6 @@ void configure_for_single_chip(Device *device,
         dispatch_compile_args[12] = dispatch_h_cb_sem;
         dispatch_compile_args[13] = dispatch_downstream_cb_sem;
         dispatch_compile_args[14] = 0; // preamble size
-        dispatch_compile_args[20] = false;
-        dispatch_compile_args[21] = true;
         CoreCoord phys_dispatch_h_upstream_core =
             packetized_path_en_g ? phys_dispatch_relay_demux_core : phys_dispatch_core;
         configure_kernel_variant<false, true>(program,
@@ -2289,6 +2272,11 @@ void configure_for_multi_chip(Device *device,
         prefetch_downstream_cb_sem, // prefetch_d only
         dispatch_constants::PREFETCH_D_BUFFER_LOG_PAGE_SIZE,
         dispatch_constants::PREFETCH_D_BUFFER_BLOCKS, // prefetch_d only
+        0, // unused: for prefetch_d <--> dispatch_d
+        0, // unused: for prefetch_d <--> dispatch_d
+        0, // unused: for prefetch_d <--> dispatch_d
+        0, // unused: for prefetch_d <--> dispatch_d
+        0  // unused: for prefetch_d <--> dispatch_d
     };
 
     constexpr NOC my_noc_index = NOC::NOC_0;
@@ -2623,7 +2611,12 @@ void configure_for_multi_chip(Device *device,
          NOC_XY_ENCODING(phys_prefetch_core_g.x, phys_prefetch_core_g.y),
          prefetch_downstream_cb_sem,
          prefetch_downstream_buffer_pages,
-         num_compute_cores
+         num_compute_cores,
+         0,
+         0,
+         0,
+         0,
+         0,
     };
 
     CoreCoord phys_upstream_from_dispatch_core = split_prefetcher_g ? phys_prefetch_d_core : phys_prefetch_core_g;
diff --git a/tests/ttnn/unit_tests/test_single_device_events.py b/tests/ttnn/unit_tests/test_single_device_events.py
index 3955d16d8a18..866a66204bd3 100644
--- a/tests/ttnn/unit_tests/test_single_device_events.py
+++ b/tests/ttnn/unit_tests/test_single_device_events.py
@@ -14,7 +14,7 @@
 
 
 @pytest.mark.parametrize("shape", [(3, 1, 512, 512)])
-@pytest.mark.parametrize("device_params", [{"num_command_queues": 2}], indirect=True)
+@pytest.mark.parametrize("device_params", [{"num_command_queues": 1}], indirect=True)
 def test_single_device_events(device, shape):
     pytest.skip("Needs Eth dispatch to run on WH")
     # Enable Program Cache and Async Mode
diff --git a/tt_metal/common/core_descriptor.hpp b/tt_metal/common/core_descriptor.hpp
index a7c50e77889a..964a37cb5c4a 100644
--- a/tt_metal/common/core_descriptor.hpp
+++ b/tt_metal/common/core_descriptor.hpp
@@ -154,4 +154,17 @@ inline CoreCoord get_physical_core_coordinate(const tt_cxy_pair &logical_locatio
     return soc_desc.get_physical_core_from_logical_core(CoreCoord(logical_location.x, logical_location.y), core_type);
 }
 
+inline std::tuple<uint32_t, CoreRange> get_physical_worker_grid_config(chip_id_t chip, uint8_t num_hw_cqs, CoreType dispatch_core_type) {
+    // Get logical compute grid dimensions and num workers
+    auto worker_grid = tt::get_compute_grid_size(chip, num_hw_cqs, dispatch_core_type);
+    std::size_t tensix_num_worker_cols = worker_grid.x;
+    std::size_t tensix_num_worker_rows = worker_grid.y;
+    uint32_t tensix_num_worker_cores = tensix_num_worker_cols * tensix_num_worker_rows;
+    const metal_SocDescriptor &soc_desc = tt::Cluster::instance().get_soc_desc(chip);
+    // Get physical compute grid range based on SOC Desc and Logical Coords
+    CoreCoord tensix_worker_start_phys = soc_desc.get_physical_core_from_logical_core(CoreCoord(0, 0), CoreType::WORKER); // Logical Worker Coords start at 0,0
+    CoreCoord tensix_worker_end_phys = soc_desc.get_physical_core_from_logical_core(CoreCoord(tensix_num_worker_cols - 1, tensix_num_worker_rows - 1), CoreType::WORKER);
+    CoreRange tensix_worker_physical_grid = CoreRange(tensix_worker_start_phys, tensix_worker_end_phys);
+    return std::make_tuple(tensix_num_worker_cores, tensix_worker_physical_grid);
+}
 }   // namespace tt
diff --git a/tt_metal/impl/device/device.cpp b/tt_metal/impl/device/device.cpp
index 978ecfbec935..5064400ef548 100644
--- a/tt_metal/impl/device/device.cpp
+++ b/tt_metal/impl/device/device.cpp
@@ -714,13 +714,13 @@ void Device::update_workers_build_settings(std::vector<std::vector<std::tuple<tt
                     compile_args[18] = 0; //prefetch_downstream_cb_sem, // prefetch_d only
                     compile_args[19] = dispatch_constants::PREFETCH_D_BUFFER_LOG_PAGE_SIZE;
                     compile_args[20] = dispatch_constants::PREFETCH_D_BUFFER_BLOCKS; // prefetch_d only
-                    compile_args[21] = false;  // is_dram_variant
-                    compile_args[22] = true;    // is_host_variant
-                    compile_args[23] = 0;
-                    compile_args[24] = 0;
-                    compile_args[25] = 0;
-                    compile_args[26] = 0;
-                    compile_args[27] = 0;
+                    compile_args[21] = 0; // unused: prefetch_d only
+                    compile_args[22] = 0; // unused: prefetch_d only
+                    compile_args[23] = 0; // unused: prefetch_d only
+                    compile_args[24] = 0; // unused: prefetch_d only
+                    compile_args[25] = 0; // unused: prefetch_d only
+                    compile_args[26] = false;  // is_dram_variant
+                    compile_args[27] = true;    // is_host_variant
                 }
                 break;
             }
@@ -920,7 +920,7 @@ void Device::update_workers_build_settings(std::vector<std::vector<std::tuple<tt
                     auto dispatch_core_type = settings.dispatch_core_type;
                     settings.upstream_cores.push_back(demux_settings.worker_physical_core);
                     settings.downstream_cores.push_back(tt_cxy_pair(0, 0, 0));
-                    settings.compile_args.resize(32);
+                    settings.compile_args.resize(27);
                     auto& compile_args = settings.compile_args;
                     compile_args[0] = settings.cb_start_address;
                     compile_args[1] = settings.cb_log_page_size;
@@ -942,13 +942,14 @@ void Device::update_workers_build_settings(std::vector<std::vector<std::tuple<tt
                     compile_args[17] = prefetch_h_settings.producer_semaphore_id, // sem_id on prefetch_h that dispatch_d is meant to increment, to resume sending of cmds post exec_buf stall
                     compile_args[18] = dispatch_constants::get(dispatch_core_type).mux_buffer_pages(num_hw_cqs), // XXXX should this be mux pages?
                     compile_args[19] = settings.num_compute_cores;
-                    compile_args[20] = false; // is_dram_variant
-                    compile_args[21] = true; // is_host_variant
-                    compile_args[22] = 0;
-                    compile_args[23] = 0;
-                    compile_args[24] = 0;
-                    compile_args[25] = 0;
-                    compile_args[31] = 0;
+                    compile_args[20] = 0; // unused: dispatch_d only
+                    compile_args[21] = 0; // unused: dispatch_d only
+                    compile_args[22] = 0; // unused: dispatch_d only
+                    compile_args[23] = 0; // unused: dispatch_d only
+                    compile_args[24] = 0; // unused: dispatch_d only
+                    compile_args[25] = false; // is_dram_variant
+                    compile_args[26] = true; // is_host_variant
+
                     dispatch_idx++;
                 }
                 break;
@@ -1149,13 +1150,13 @@ void Device::update_workers_build_settings(std::vector<std::vector<std::tuple<tt
                     compile_args[18] = demux_sem++; //prefetch_downstream_cb_sem, // prefetch_d only
                     compile_args[19] = prefetch_d_settings.cb_log_page_size;
                     compile_args[20] = dispatch_constants::PREFETCH_D_BUFFER_BLOCKS; // prefetch_d only
-                    compile_args[21] = true;  // is_dram_variant
-                    compile_args[22] = false; // is_host_variant
-                    compile_args[23] = dispatch_s_buffer_base;
-                    compile_args[24] = prefetch_d_settings.consumer_slave_semaphore_id; // Semaphore on prefetch to handshake with dispatch_s
-                    compile_args[25] = dispatch_s_settings.producer_semaphore_id; // Semaphore on dispatch_s to handshake with prefetch
-                    compile_args[26] = dispatch_constants::get(dispatch_core_type).dispatch_s_buffer_size();
-                    compile_args[27] = this->get_noc_unicast_encoding(NOC::NOC_0, dispatch_s_settings.worker_physical_core);
+                    compile_args[21] = dispatch_s_buffer_base;
+                    compile_args[22] = prefetch_d_settings.consumer_slave_semaphore_id; // Semaphore on prefetch to handshake with dispatch_s
+                    compile_args[23] = dispatch_s_settings.producer_semaphore_id; // Semaphore on dispatch_s to handshake with prefetch
+                    compile_args[24] = dispatch_constants::get(dispatch_core_type).dispatch_s_buffer_size();
+                    compile_args[25] = this->get_noc_unicast_encoding(NOC::NOC_0, dispatch_s_settings.worker_physical_core);
+                    compile_args[26] = true;  // is_dram_variant
+                    compile_args[27] = false; // is_host_variant
                     prefetch_d_idx++; // move on to next prefetcher
                 }
                 break;
@@ -1169,13 +1170,18 @@ void Device::update_workers_build_settings(std::vector<std::vector<std::tuple<tt
                 uint32_t sem = 0;
                 int dispatch_d_idx = 0;
                 uint32_t mux_sem = mux_d_settings.consumer_semaphore_id;
+                uint32_t tensix_worker_go_signal_addr = hal.get_dev_addr(HalProgrammableCoreType::TENSIX, HalMemAddrType::LAUNCH) + sizeof(launch_msg_t) * launch_msg_buffer_num_entries;
+                    uint32_t eth_worker_go_signal_addr = hal.get_dev_addr(HalProgrammableCoreType::ACTIVE_ETH, HalMemAddrType::LAUNCH) + sizeof(launch_msg_t) * launch_msg_buffer_num_entries;
                 for (auto&[core, dispatch_d_settings] : device_worker_variants[DispatchWorkerType::DISPATCH_D]) {
                     auto prefetch_d_settings = std::get<1>(device_worker_variants[DispatchWorkerType::PREFETCH_D][dispatch_d_idx]); // 1 to 1 mapping bw prefetch_d and dispatch_d
                     auto dispatch_s_settings = std::get<1>(device_worker_variants[DispatchWorkerType::DISPATCH_S][dispatch_d_idx]); // 1 to 1 mapping bw dispatch_s and dispatch_d
                     auto dispatch_core_type = dispatch_d_settings.dispatch_core_type;
+                    // The NOC encoding for this should be passed through the settings
+                    auto chip_id = core.chip;
+                    auto [tensix_num_worker_cores, tensix_worker_physical_grid] = get_physical_worker_grid_config(chip_id, num_hw_cqs, dispatch_core_type);
                     dispatch_d_settings.upstream_cores.push_back(prefetch_d_settings.worker_physical_core);
                     dispatch_d_settings.downstream_cores.push_back(mux_d_settings.worker_physical_core);
-                    dispatch_d_settings.compile_args.resize(35);
+                    dispatch_d_settings.compile_args.resize(27);
                     auto& compile_args = dispatch_d_settings.compile_args;
                     compile_args[0] = dispatch_d_settings.cb_start_address;
                     compile_args[1] = dispatch_d_settings.cb_log_page_size;
@@ -1197,15 +1203,13 @@ void Device::update_workers_build_settings(std::vector<std::vector<std::tuple<tt
                     compile_args[17] = 1; //prefetch_downstream_cb_sem,
                     compile_args[18] = dispatch_constants::get(dispatch_core_type).mux_buffer_pages(num_hw_cqs), // mux buffer size is a function of num_cqs
                     compile_args[19] = dispatch_d_settings.num_compute_cores;
-                    compile_args[20] = true; // is_dram_variant
-                    compile_args[21] = false; // is_host_variant
-                    compile_args[22] = 0; //dispatch_s_buffer_base;
-                    compile_args[23] = 0; // dispatch_d_settings.dispatch_s_semaphore_id;
-                    compile_args[24] = 0; //prefetch_d_settings.consumer_slave_semaphore_id;
-                    compile_args[25] = dispatch_s_settings.consumer_semaphore_id;
-                    //  need to add more compile args here
-                    compile_args[30] = 0; //dispatch_constants::get(dispatch_core_type).dispatch_s_buffer_size();
-                    compile_args[31] = this->get_noc_unicast_encoding(NOC::NOC_0, dispatch_s_settings.worker_physical_core);
+                    compile_args[20] = dispatch_s_settings.consumer_semaphore_id;
+                    compile_args[21] = this->get_noc_multicast_encoding(NOC::NOC_0, tensix_worker_physical_grid);
+                    compile_args[22] = tensix_worker_go_signal_addr;
+                    compile_args[23] = eth_worker_go_signal_addr;
+                    compile_args[24] = this->get_noc_unicast_encoding(NOC::NOC_0, dispatch_s_settings.worker_physical_core);
+                    compile_args[25] = true; // is_dram_variant
+                    compile_args[26] = false; // is_host_variant
                     dispatch_d_idx++; // move on to next dispatcher
                 }
                 break;
@@ -1223,15 +1227,8 @@ void Device::update_workers_build_settings(std::vector<std::vector<std::tuple<tt
                         dispatch_s_settings.downstream_cores.push_back(tt_cxy_pair(0, 0, 0));
                         auto dispatch_core_type = dispatch_s_settings.dispatch_core_type;
                         auto chip_id = core.chip;
-                        auto worker_grid = tt::get_compute_grid_size(chip_id, num_hw_cqs, dispatch_core_type);
-                        std::size_t tensix_num_worker_cols = worker_grid.x;
-                        std::size_t tensix_num_worker_rows = worker_grid.y;
-                        uint32_t tensix_num_worker_cores = tensix_num_worker_cols * tensix_num_worker_rows;
-                        const metal_SocDescriptor &soc_desc = tt::Cluster::instance().get_soc_desc(chip_id);
-                        CoreCoord tensix_worker_start_phys = soc_desc.get_physical_core_from_logical_core(CoreCoord(0, 0), CoreType::WORKER); // Logical Worker Coords start at 0,0
-                        CoreCoord tensix_worker_end_phys = soc_desc.get_physical_core_from_logical_core(CoreCoord(tensix_num_worker_cols - 1, tensix_num_worker_rows - 1), CoreType::WORKER);
-                        CoreRange tensix_worker_physical_grid = CoreRange(tensix_worker_start_phys, tensix_worker_end_phys);
-
+                        // The NOC encoding for this should be passed through the settings
+                        auto [tensix_num_worker_cores, tensix_worker_physical_grid] = get_physical_worker_grid_config(chip_id, num_hw_cqs, dispatch_core_type);
                         dispatch_s_settings.compile_args.resize(14);
                         auto& compile_args = dispatch_s_settings.compile_args;
                         compile_args[0] = dispatch_s_settings.cb_start_address;
@@ -1727,13 +1724,13 @@ void Device::compile_command_queue_programs() {
                 0, //prefetch_downstream_cb_sem, // prefetch_d only
                 dispatch_constants::PREFETCH_D_BUFFER_LOG_PAGE_SIZE,
                 dispatch_constants::PREFETCH_D_BUFFER_BLOCKS, // prefetch_d only
-                true,   // is_dram_variant
-                true,    // is_host_variant
                 dispatch_s_buffer_base,
                 prefetch_dispatch_s_sync_sem,
                 dispatch_s_sem,
                 dispatch_constants::get(dispatch_core_type).dispatch_s_buffer_size(),
-                this->get_noc_unicast_encoding(my_noc_index, dispatch_s_physical_core)
+                this->get_noc_unicast_encoding(my_noc_index, dispatch_s_physical_core),
+                true,   // is_dram_variant
+                true    // is_host_variant
             };
 
             configure_kernel_variant(
@@ -1755,12 +1752,7 @@ void Device::compile_command_queue_programs() {
             tt::tt_metal::CreateSemaphore(*command_queue_program_ptr, prefetch_core, dispatch_constants::get(dispatch_core_type).dispatch_buffer_pages(), dispatch_core_type); // prefetch_sem
             tt::tt_metal::CreateSemaphore(*command_queue_program_ptr, prefetch_core, dispatch_constants::get(dispatch_core_type).dispatch_s_buffer_pages(), dispatch_core_type); // sync with dispatch_s
 
-            std::size_t tensix_num_worker_cols = this->compute_with_storage_grid_size().x;
-            std::size_t tensix_num_worker_rows = this->compute_with_storage_grid_size().y;
-            uint32_t tensix_num_worker_cores = tensix_num_worker_cols * tensix_num_worker_rows;
-            CoreCoord tensix_worker_start_phys = this->physical_core_from_logical_core(CoreCoord(0, 0), CoreType::WORKER); // Logical Worker Coords start at 0,0
-            CoreCoord tensix_worker_end_phys = this->physical_core_from_logical_core(CoreCoord(tensix_num_worker_cols - 1, tensix_num_worker_rows - 1), CoreType::WORKER);
-            CoreRange tensix_worker_physical_grid = CoreRange(tensix_worker_start_phys, tensix_worker_end_phys);
+            auto [tensix_num_worker_cores, tensix_worker_physical_grid] = get_physical_worker_grid_config(this->id(), num_hw_cqs, dispatch_core_type);
             uint32_t tensix_worker_go_signal_addr = hal.get_dev_addr(HalProgrammableCoreType::TENSIX, HalMemAddrType::LAUNCH) + sizeof(launch_msg_t) * launch_msg_buffer_num_entries;
             uint32_t eth_worker_go_signal_addr = 0;
             if (hal.get_programmable_core_type_index(HalProgrammableCoreType::ACTIVE_ETH) != -1) {
@@ -1787,21 +1779,13 @@ void Device::compile_command_queue_programs() {
                 0,      // unused prefetch_local_downstream_sem_addr
                 0,      // unused prefetch_downstream_buffer_pages
                 num_compute_cores, // max_write_packed_cores
+                dispatch_s_sync_sem_id, // used to notify dispatch_s that its safe to send a go signal
+                this->get_noc_multicast_encoding(NOC::NOC_0, tensix_worker_physical_grid), // used by dispatch_d to mcast go signals when dispatch_s is not enabled
+                tensix_worker_go_signal_addr, // used by dispatch_d to mcast go signals when dispatch_s is not enabled
+                eth_worker_go_signal_addr, // used by dispatch_d to mcast go signals when dispatch_s is not enabled
+                this->get_noc_unicast_encoding(my_noc_index, dispatch_s_physical_core), // dispatch_s core coords (should migrate to CTA)
                 true,   // is_dram_variant
                 true,    // is_host_variant
-                dispatch_s_buffer_base,
-                dispatch_s_sem,
-                prefetch_dispatch_s_sync_sem,
-                dispatch_s_sync_sem_id,
-                this->get_noc_multicast_encoding(NOC::NOC_0, tensix_worker_physical_grid),
-                tensix_num_worker_cores,
-                tensix_worker_go_signal_addr,
-                eth_worker_go_signal_addr,
-                dispatch_constants::get(dispatch_core_type).dispatch_s_buffer_size(),
-                this->get_noc_unicast_encoding(my_noc_index, dispatch_s_physical_core),
-                this->get_noc_unicast_encoding(NOC::NOC_1, dispatch_physical_core),
-                dispatch_core_type == CoreType::ETH,
-                DISPATCH_MESSAGE_ADDR
             };
 
             configure_kernel_variant(
@@ -2088,19 +2072,6 @@ void Device::compile_command_queue_programs() {
         }
         cq_id = 0;
         for (auto [dispatch_d_core, dispatch_d_settings] : device_worker_variants[DispatchWorkerType::DISPATCH_D]) {
-            std::size_t tensix_num_worker_cols = this->compute_with_storage_grid_size().x;
-            std::size_t tensix_num_worker_rows = this->compute_with_storage_grid_size().y;
-            uint32_t tensix_num_worker_cores = tensix_num_worker_cols * tensix_num_worker_rows;
-            CoreCoord tensix_worker_start_phys = this->physical_core_from_logical_core(CoreCoord(0, 0), CoreType::WORKER); // Logical Worker Coords start at 0,0
-            CoreCoord tensix_worker_end_phys = this->physical_core_from_logical_core(CoreCoord(tensix_num_worker_cols - 1, tensix_num_worker_rows - 1), CoreType::WORKER);
-            CoreRange tensix_worker_physical_grid = CoreRange(tensix_worker_start_phys, tensix_worker_end_phys);
-            uint32_t tensix_worker_go_signal_addr = hal.get_dev_addr(HalProgrammableCoreType::TENSIX, HalMemAddrType::LAUNCH) + sizeof(launch_msg_t) * launch_msg_buffer_num_entries;
-            uint32_t eth_worker_go_signal_addr = hal.get_dev_addr(HalProgrammableCoreType::ACTIVE_ETH, HalMemAddrType::LAUNCH) + sizeof(launch_msg_t) * launch_msg_buffer_num_entries;
-            dispatch_d_settings.compile_args[26] = this->get_noc_multicast_encoding(NOC::NOC_0, tensix_worker_physical_grid);
-            dispatch_d_settings.compile_args[27] = tensix_num_worker_cores;
-            dispatch_d_settings.compile_args[28] = tensix_worker_go_signal_addr;
-            dispatch_d_settings.compile_args[29] = eth_worker_go_signal_addr;
-
             for (auto sem : dispatch_d_settings.semaphores) {
                 //size of semaphores vector is number of needed semaphores on the core.
                 //Value of each vector entry is the initialization value for the semaphore.
diff --git a/tt_metal/impl/dispatch/kernels/cq_dispatch.cpp b/tt_metal/impl/dispatch/kernels/cq_dispatch.cpp
index ff34a73d008f..d2bbf2a2cdb4 100644
--- a/tt_metal/impl/dispatch/kernels/cq_dispatch.cpp
+++ b/tt_metal/impl/dispatch/kernels/cq_dispatch.cpp
@@ -42,14 +42,13 @@ constexpr uint32_t prefetch_h_noc_xy = get_compile_time_arg_val(16);
 constexpr uint32_t prefetch_h_local_downstream_sem_addr = get_compile_time_arg_val(17);
 constexpr uint32_t prefetch_h_max_credits = get_compile_time_arg_val(18);
 constexpr uint32_t packed_write_max_unicast_sub_cmds = get_compile_time_arg_val(19); // Number of cores in compute grid
-constexpr uint32_t is_d_variant = get_compile_time_arg_val(20);
-constexpr uint32_t is_h_variant = get_compile_time_arg_val(21);
-constexpr uint32_t dispatch_s_sem_id = get_compile_time_arg_val(25);
-constexpr uint32_t worker_mcast_grid = get_compile_time_arg_val(26);
-constexpr uint32_t num_worker_cores_to_mcast = get_compile_time_arg_val(27);
-constexpr uint32_t mcast_go_signal_addr = get_compile_time_arg_val(28);
-constexpr uint32_t unicast_go_signal_addr = get_compile_time_arg_val(29);
-constexpr uint32_t dispatch_s_noc_xy = get_compile_time_arg_val(31); // currently getting dispatch_s coords through RTAs. Migrate to CTAs.
+constexpr uint32_t dispatch_s_sem_id = get_compile_time_arg_val(20);
+constexpr uint32_t worker_mcast_grid = get_compile_time_arg_val(21);
+constexpr uint32_t mcast_go_signal_addr = get_compile_time_arg_val(22);
+constexpr uint32_t unicast_go_signal_addr = get_compile_time_arg_val(23);
+constexpr uint32_t dispatch_s_noc_xy = get_compile_time_arg_val(24); // currently getting dispatch_s coords through RTAs. Migrate to CTAs.
+constexpr uint32_t is_d_variant = get_compile_time_arg_val(25);
+constexpr uint32_t is_h_variant = get_compile_time_arg_val(26);
 
 constexpr uint8_t upstream_noc_index = UPSTREAM_NOC_INDEX;
 constexpr uint32_t upstream_noc_xy = uint32_t(NOC_XY_ENCODING(UPSTREAM_NOC_X, UPSTREAM_NOC_Y));
@@ -828,7 +827,8 @@ void process_go_signal_mcast_cmd() {
     while (*worker_sem_addr < cmd->mcast.wait_count);
     if (cmd->mcast.mcast_flag & 0x1) {
         uint64_t dst = get_noc_addr_helper(worker_mcast_grid, mcast_go_signal_addr);
-        noc_async_write_multicast_one_packet((uint32_t)(&aligned_go_signal), dst, sizeof(uint32_t), num_worker_cores_to_mcast);
+        // packed_write_max_unicast_sub_cmds is the total number of compute cores (num_mcast_dests for this txn)
+        noc_async_write_multicast_one_packet((uint32_t)(&aligned_go_signal), dst, sizeof(uint32_t), packed_write_max_unicast_sub_cmds);
     }
     if (cmd->mcast.mcast_flag & 0x2) {
         for (int core_idx = 0; core_idx < num_unicast_cores; core_idx++) {
diff --git a/tt_metal/impl/dispatch/kernels/cq_prefetch.cpp b/tt_metal/impl/dispatch/kernels/cq_prefetch.cpp
index c782d964437d..2f3463cba9d4 100644
--- a/tt_metal/impl/dispatch/kernels/cq_prefetch.cpp
+++ b/tt_metal/impl/dispatch/kernels/cq_prefetch.cpp
@@ -45,14 +45,15 @@ constexpr uint32_t upstream_cb_sem_id = get_compile_time_arg_val(18);
 constexpr uint32_t cmddat_q_log_page_size = get_compile_time_arg_val(19);
 constexpr uint32_t cmddat_q_blocks = get_compile_time_arg_val(20);
 
-constexpr uint32_t is_d_variant = get_compile_time_arg_val(21);
-constexpr uint32_t is_h_variant = get_compile_time_arg_val(22);
-
-constexpr uint32_t dispatch_s_buffer_base = get_compile_time_arg_val(23);
-constexpr uint32_t my_dispatch_s_cb_sem_id = get_compile_time_arg_val(24);
-constexpr uint32_t downstream_dispatch_s_cb_sem_id = get_compile_time_arg_val(25);
-constexpr uint32_t dispatch_s_buffer_size = get_compile_time_arg_val(26);
-constexpr uint32_t dispatch_s_noc_xy = get_compile_time_arg_val(27); // currently getting dispatch_s coords through RTAs. Migrate to CTAs.
+// used for prefetch_d <--> dispatch_s data path
+constexpr uint32_t dispatch_s_buffer_base = get_compile_time_arg_val(21);
+constexpr uint32_t my_dispatch_s_cb_sem_id = get_compile_time_arg_val(22);
+constexpr uint32_t downstream_dispatch_s_cb_sem_id = get_compile_time_arg_val(23);
+constexpr uint32_t dispatch_s_buffer_size = get_compile_time_arg_val(24);
+constexpr uint32_t dispatch_s_noc_xy = get_compile_time_arg_val(25); // currently getting dispatch_s coords through RTAs. Migrate to CTAs.
+
+constexpr uint32_t is_d_variant = get_compile_time_arg_val(26);
+constexpr uint32_t is_h_variant = get_compile_time_arg_val(27);
 
 constexpr uint8_t my_noc_index = NOC_INDEX;
 constexpr uint32_t my_noc_xy = uint32_t(NOC_XY_ENCODING(MY_NOC_X, MY_NOC_Y));