From 26c2803fe341c72108301c595efe753245d3637d Mon Sep 17 00:00:00 2001
From: Sankar Manoj
Date: Mon, 9 Sep 2024 15:22:14 +0530
Subject: [PATCH 1/4] #12376: Support for non-32 Height in Width Sharded Conv2d

---
 tests/ttnn/unit_tests/operations/test_new_conv2d.py | 9 +++++++--
 ttnn/cpp/ttnn/operations/conv/conv2d/conv2d.cpp     | 7 +++----
 2 files changed, 10 insertions(+), 6 deletions(-)

diff --git a/tests/ttnn/unit_tests/operations/test_new_conv2d.py b/tests/ttnn/unit_tests/operations/test_new_conv2d.py
index 43533a8b7fd..a6826b633a9 100644
--- a/tests/ttnn/unit_tests/operations/test_new_conv2d.py
+++ b/tests/ttnn/unit_tests/operations/test_new_conv2d.py
@@ -322,13 +322,17 @@ def run_conv_with_split(
 @pytest.mark.parametrize(
     "output_channels, input_channels, input_height, input_width, filter_height, filter_width, pad_h, pad_w, act_block_w_div",
     (
+        (128, 128, 8, 8, 3, 3, 0, 0, 1),
         (128, 256, 8, 8, 3, 3, 1, 1, 1),
+        (576, 576, 8, 8, 3, 3, 0, 0, 1),
+        (960, 960, 4, 4, 3, 3, 0, 0, 1),
         (256, 2048, 8, 8, 3, 3, 1, 1, 8),
         (512, 2048, 16, 16, 3, 3, 1, 1, 4),
-        (768, 768, 8, 8, 3, 3, 1, 1, 1),
         (768, 768, 16, 16, 3, 3, 1, 1, 1),
         (1280, 1280, 16, 16, 3, 3, 1, 1, 1),
         (1280, 2560, 16, 16, 3, 3, 1, 1, 2),
+        (1280, 2560, 16, 16, 3, 3, 0, 0, 2),
+
     ),
 )
 @pytest.mark.parametrize(
@@ -384,7 +388,7 @@ def test_conv_ws(
     torch_input_tensor_nchw = torch_input_tensor_nchw.broadcast_to(conv_input_shape).float()
     torch_input_tensor = torch.permute(torch_input_tensor_nchw, (0, 2, 3, 1))
 
-    torch_weight_tensor = torch.randn(conv_weight_shape, dtype=torch.bfloat16).float()
+    torch_weight_tensor = torch.ones(conv_weight_shape, dtype=torch.bfloat16).float()
 
     tt_bias_tensor = None
     torch_bias_tensor = None
@@ -457,6 +461,7 @@ def test_conv_ws(
 
     # torch_output_tensor is in row major layout and NHWC shape
     # NHWC to NCHW
+    # torch_output_tensor = torch_output_tensor[:, :, : batch_size * out_height * out_width, :]
     torch_output_tensor = torch_output_tensor.reshape(batch_size, out_height, out_width, output_channels)
     torch_output_tensor = torch.permute(torch_output_tensor, (0, 3, 1, 2))
 
diff --git a/ttnn/cpp/ttnn/operations/conv/conv2d/conv2d.cpp b/ttnn/cpp/ttnn/operations/conv/conv2d/conv2d.cpp
index bdf570f175a..85f4534ee3c 100644
--- a/ttnn/cpp/ttnn/operations/conv/conv2d/conv2d.cpp
+++ b/ttnn/cpp/ttnn/operations/conv/conv2d/conv2d.cpp
@@ -229,13 +229,13 @@ OptimizedConvParallelizationConfig determine_conv_op_parallel_config_from_conv_o
     TT_ASSERT(conv_output_mem_config.shard_spec.has_value());
     const auto& shard_spec = conv_output_mem_config.shard_spec.value();
     const auto& shard_shape = shard_spec.shape;
-    TT_ASSERT(shard_shape[0] % 32 == 0);
+    // TT_ASSERT(shard_shape[0] % 32 == 0);
     TT_ASSERT(shard_shape[1] % 32 == 0);
     return {
         .grid_size = shard_spec.grid.bounding_box().grid_size(),
         .num_cores_nhw = num_cores_nhw,
         .num_cores_c = num_cores_c,
-        .per_core_out_matrix_height_ntiles = shard_shape[0] / 32,
+        .per_core_out_matrix_height_ntiles = tt::round_up(shard_shape[0], 32) / 32,
         .per_core_out_matrix_width_ntiles = shard_shape[1] / 32,
     };
 }
@@ -765,8 +765,7 @@ std::tuple
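Patch 1/4 relaxes the tile-alignment requirement on the shard height: instead of asserting that shard_shape[0] is a multiple of 32, the per-core output height is rounded up to the next 32-row tile. A minimal Python sketch of that computation (not the tt-metal code; the tile size of 32 is taken from the asserts above, the sample shard heights are illustrative only):

    # Python equivalent of tt::round_up(shard_shape[0], 32) / 32 from the patch.
    TILE_HEIGHT = 32

    def per_core_out_matrix_height_ntiles(shard_height: int) -> int:
        # Round the shard height up to the next tile edge, then count tiles.
        return (shard_height + TILE_HEIGHT - 1) // TILE_HEIGHT

    # Example: a batch-2, 8x8 output has 2 * 8 * 8 = 128 rows; with a 3x3
    # filter and no padding the output shrinks to 2 * 6 * 6 = 72 rows,
    # which is not a multiple of 32 and previously tripped the assert.
    for shard_height in (128, 72, 32):
        print(shard_height, "->", per_core_out_matrix_height_ntiles(shard_height))
    # 128 -> 4, 72 -> 3 (rounded up from 2.25), 32 -> 1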
From: Sankar Manoj
Date: Wed, 11 Sep 2024 11:05:45 +0530
Subject: [PATCH 2/4] #12376: Support for non-32 Height in Width Sharded Conv2d

---
 tests/ttnn/unit_tests/operations/test_new_conv2d.py | 8 ++++----
 1 file changed, 4 insertions(+), 4 deletions(-)

diff --git a/tests/ttnn/unit_tests/operations/test_new_conv2d.py b/tests/ttnn/unit_tests/operations/test_new_conv2d.py
index a6826b633a9..4ddf497dd97 100644
--- a/tests/ttnn/unit_tests/operations/test_new_conv2d.py
+++ b/tests/ttnn/unit_tests/operations/test_new_conv2d.py
@@ -328,8 +328,7 @@ def run_conv_with_split(
         (960, 960, 4, 4, 3, 3, 0, 0, 1),
         (256, 2048, 8, 8, 3, 3, 1, 1, 8),
         (512, 2048, 16, 16, 3, 3, 1, 1, 4),
-        (768, 768, 16, 16, 3, 3, 1, 1, 1),
-        (1280, 1280, 16, 16, 3, 3, 1, 1, 1),
+        (768, 768, 16, 16, 3, 3, 0, 0, 1),
         (1280, 2560, 16, 16, 3, 3, 1, 1, 2),
         (1280, 2560, 16, 16, 3, 3, 0, 0, 2),
 
@@ -388,7 +387,7 @@ def test_conv_ws(
     torch_input_tensor_nchw = torch_input_tensor_nchw.broadcast_to(conv_input_shape).float()
     torch_input_tensor = torch.permute(torch_input_tensor_nchw, (0, 2, 3, 1))
 
-    torch_weight_tensor = torch.ones(conv_weight_shape, dtype=torch.bfloat16).float()
+    torch_weight_tensor = torch.randn(conv_weight_shape, dtype=torch.bfloat16).float()
 
     tt_bias_tensor = None
     torch_bias_tensor = None
@@ -463,12 +462,13 @@ def test_conv_ws(
     # NHWC to NCHW
     # torch_output_tensor = torch_output_tensor[:, :, : batch_size * out_height * out_width, :]
     torch_output_tensor = torch_output_tensor.reshape(batch_size, out_height, out_width, output_channels)
-
+    logger.info(f"Output Shape : {torch_output_tensor.shape}")
    torch_output_tensor = torch.permute(torch_output_tensor, (0, 3, 1, 2))
 
     reader_patterns_cache.clear()
     pcc = 0.94
     passing, pcc_msg = check_with_pcc_without_tensor_printout(torch_output_tensor, torch_out_golden_tensor, pcc=pcc)
+    logger.info(f"{pcc_msg} Threshold : {pcc}")
     if not passing:
         logger.error("Fails with PCC ", pcc_msg)
     assert passing

From d9fe62a10f5c99a4cbd0abc8d7a370ab7e4c62b6 Mon Sep 17 00:00:00 2001
From: Sankar Manoj
Date: Wed, 11 Sep 2024 09:46:58 +0000
Subject: [PATCH 3/4] #0: Skipped WS Conv2d on Grayskull

---
 tests/ttnn/unit_tests/operations/test_new_conv2d.py | 8 +-------
 1 file changed, 1 insertion(+), 7 deletions(-)

diff --git a/tests/ttnn/unit_tests/operations/test_new_conv2d.py b/tests/ttnn/unit_tests/operations/test_new_conv2d.py
index 4ddf497dd97..33fd62f35fa 100644
--- a/tests/ttnn/unit_tests/operations/test_new_conv2d.py
+++ b/tests/ttnn/unit_tests/operations/test_new_conv2d.py
@@ -317,6 +317,7 @@ def run_conv_with_split(
     assert_with_pcc(torch_output_tensor, torch_out_golden_tensor, pcc=pcc)
 
 
+@skip_for_grayskull()
 @pytest.mark.parametrize("device_params", [{"l1_small_size": 16384}], indirect=True)
 @pytest.mark.parametrize("stride", [1, 2])
 @pytest.mark.parametrize(
@@ -331,7 +332,6 @@ def run_conv_with_split(
         (768, 768, 16, 16, 3, 3, 0, 0, 1),
         (1280, 2560, 16, 16, 3, 3, 1, 1, 2),
         (1280, 2560, 16, 16, 3, 3, 0, 0, 2),
-
     ),
 )
 @pytest.mark.parametrize(
@@ -363,12 +363,6 @@ def test_conv_ws(
     weights_dtype,
     activations_dtype,
 ):
-    if is_grayskull():
-        if input_channels >= 2048:
-            pytest.skip("Skipping on grayskull due to insufficient L1")
-        if input_channels >= 768 and input_height >= 10:
-            pytest.skip("Skipping on grayskull due to insufficient L1")
-
     stride_h = stride
     stride_w = stride
     batch_size = 2
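Patch 3/4 replaces the in-body pytest.skip branches with a single @skip_for_grayskull() marker, so the width-sharded cases are skipped at collection time on that architecture. A hedged sketch of what such a helper can look like (the real decorator lives in the tt-metal test utilities; the ARCH_NAME environment-variable check here is an assumption for illustration only):

    import os

    import pytest

    def is_grayskull() -> bool:
        # Assumption for this sketch: the target architecture is advertised
        # through the ARCH_NAME environment variable.
        return os.environ.get("ARCH_NAME", "").startswith("grayskull")

    def skip_for_grayskull(reason="WS conv2d needs more L1 than Grayskull provides"):
        # pytest.mark.skipif evaluates the condition once at collection time
        # and skips every parametrization of the decorated test when it holds.
        return pytest.mark.skipif(is_grayskull(), reason=reason)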
From 0441baeb8bfe626bc691c4ead259ff4457655382 Mon Sep 17 00:00:00 2001
From: Sankar Manoj
Date: Thu, 12 Sep 2024 10:10:18 +0530
Subject: [PATCH 4/4] #0: Bug Fix

---
 ttnn/cpp/ttnn/operations/conv/conv2d/conv2d.cpp | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/ttnn/cpp/ttnn/operations/conv/conv2d/conv2d.cpp b/ttnn/cpp/ttnn/operations/conv/conv2d/conv2d.cpp
index 85f4534ee3c..aa5daf1510e 100644
--- a/ttnn/cpp/ttnn/operations/conv/conv2d/conv2d.cpp
+++ b/ttnn/cpp/ttnn/operations/conv/conv2d/conv2d.cpp
@@ -229,7 +229,7 @@ OptimizedConvParallelizationConfig determine_conv_op_parallel_config_from_conv_o
     TT_ASSERT(conv_output_mem_config.shard_spec.has_value());
     const auto& shard_spec = conv_output_mem_config.shard_spec.value();
     const auto& shard_shape = shard_spec.shape;
-    // TT_ASSERT(shard_shape[0] % 32 == 0);
+    TT_ASSERT(conv_output_mem_config.memory_layout == TensorMemoryLayout::WIDTH_SHARDED || shard_shape[0] % 32 == 0);
     TT_ASSERT(shard_shape[1] % 32 == 0);
     return {
         .grid_size = shard_spec.grid.bounding_box().grid_size(),
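Patch 4/4 restores the shard-height check that patch 1/4 had commented out, but scopes it: only width-sharded outputs may carry a non-tile-multiple shard height, while every other layout still requires tile alignment. A minimal Python rendering of the relaxed invariant (layout names are simplified strings for the example, not the TensorMemoryLayout enum):

    # Sketch of the relaxed validation: shard height must be tile-aligned
    # unless the conv output is width sharded; shard width always must be.
    TILE = 32

    def validate_shard_shape(memory_layout: str, shard_shape: tuple) -> None:
        assert memory_layout == "WIDTH_SHARDED" or shard_shape[0] % TILE == 0, (
            "non-width-sharded conv2d output requires a tile-aligned shard height"
        )
        assert shard_shape[1] % TILE == 0, "shard width must stay tile-aligned"

    validate_shard_shape("WIDTH_SHARDED", (72, 128))   # ok: height rounded up later
    validate_shard_shape("HEIGHT_SHARDED", (64, 128))  # ok: height tile-aligned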