#12376: Support for non-32 Height in Width Sharded Conv2d #12382

Merged: 4 commits, Sep 12, 2024
19 changes: 9 additions & 10 deletions tests/ttnn/unit_tests/operations/test_new_conv2d.py
@@ -317,18 +317,21 @@ def run_conv_with_split(
assert_with_pcc(torch_output_tensor, torch_out_golden_tensor, pcc=pcc)


@skip_for_grayskull()
Contributor:
Do other tests cover these test scenarios on Grayskull?
If not, why would these test scenarios not be relevant for Grayskull?

Contributor (Author):

We are not planning to support these new features on Grayskull.

Contributor:

Are the cases in the already existing test (before this PR) supported on Grayskull?

Contributor (Author):

Most of them were skipped due to an OOM error. The width-sharded Conv2d exercised by this test was implemented to support a Wormhole-only model.

@pytest.mark.parametrize("device_params", [{"l1_small_size": 16384}], indirect=True)
@pytest.mark.parametrize("stride", [1, 2])
@pytest.mark.parametrize(
"output_channels, input_channels, input_height, input_width, filter_height, filter_width, pad_h, pad_w, act_block_w_div",
(
(128, 128, 8, 8, 3, 3, 0, 0, 1),
(128, 256, 8, 8, 3, 3, 1, 1, 1),
(576, 576, 8, 8, 3, 3, 0, 0, 1),
(960, 960, 4, 4, 3, 3, 0, 0, 1),
(256, 2048, 8, 8, 3, 3, 1, 1, 8),
(512, 2048, 16, 16, 3, 3, 1, 1, 4),
(768, 768, 8, 8, 3, 3, 1, 1, 1),
(768, 768, 16, 16, 3, 3, 1, 1, 1),
(1280, 1280, 16, 16, 3, 3, 1, 1, 1),
Contributor:

Why were these test cases removed?

(768, 768, 16, 16, 3, 3, 0, 0, 1),
(1280, 2560, 16, 16, 3, 3, 1, 1, 2),
(1280, 2560, 16, 16, 3, 3, 0, 0, 2),
),
)
@pytest.mark.parametrize(
@@ -360,12 +363,6 @@ def test_conv_ws(
weights_dtype,
activations_dtype,
):
if is_grayskull():
if input_channels >= 2048:
pytest.skip("Skipping on grayskull due to insufficient L1")
if input_channels >= 768 and input_height >= 10:
pytest.skip("Skipping on grayskull due to insufficient L1")

stride_h = stride
stride_w = stride
batch_size = 2
@@ -457,13 +454,15 @@ def test_conv_ws(

# torch_output_tensor is in row major layout and NHWC shape
# NHWC to NCHW
# torch_output_tensor = torch_output_tensor[:, :, : batch_size * out_height * out_width, :]
torch_output_tensor = torch_output_tensor.reshape(batch_size, out_height, out_width, output_channels)

logger.info(f"Output Shape : {torch_output_tensor.shape}")
torch_output_tensor = torch.permute(torch_output_tensor, (0, 3, 1, 2))
reader_patterns_cache.clear()

pcc = 0.94
passing, pcc_msg = check_with_pcc_without_tensor_printout(torch_output_tensor, torch_out_golden_tensor, pcc=pcc)
logger.info(f"{pcc_msg} Threshold : {pcc}")
if not passing:
logger.error("Fails with PCC ", pcc_msg)
assert passing
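For reference, a minimal sketch of why the width-sharded cases parametrized above produce shard heights that are not multiples of 32: in width sharding the channels are split across cores, so each core's shard holds the full batch * out_h * out_w row count, and for these shapes that row count is not a tile multiple. The helper name and constant below are illustrative, not from the codebase; the batch size of 2 and the shapes come from the test above.

```python
# Minimal sketch: output-row count seen by a width-sharded conv2d shard.
TILE_HEIGHT = 32  # tile height behind the multiple-of-32 assert in conv2d.cpp

def conv_output_rows(batch, in_h, in_w, k_h, k_w, pad_h, pad_w, stride):
    # Standard conv2d output geometry (dilation = 1 assumed).
    out_h = (in_h + 2 * pad_h - k_h) // stride + 1
    out_w = (in_w + 2 * pad_w - k_w) // stride + 1
    return batch * out_h * out_w

# First parametrized case above: batch_size = 2, 8x8 input, 3x3 filter, no padding, stride 1.
rows = conv_output_rows(2, 8, 8, 3, 3, 0, 0, 1)
print(rows, rows % TILE_HEIGHT)  # 72 8 -> shard height is not a multiple of 32
```

Under the previous assert in conv2d.cpp (`shard_shape[0] % 32 == 0`), such a shard height was rejected; the change below relaxes that requirement for the width-sharded layout.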
7 changes: 3 additions & 4 deletions ttnn/cpp/ttnn/operations/conv/conv2d/conv2d.cpp
@@ -229,13 +229,13 @@ OptimizedConvParallelizationConfig determine_conv_op_parallel_config_from_conv_o
TT_ASSERT(conv_output_mem_config.shard_spec.has_value());
const auto& shard_spec = conv_output_mem_config.shard_spec.value();
const auto& shard_shape = shard_spec.shape;
TT_ASSERT(shard_shape[0] % 32 == 0);
TT_ASSERT(conv_output_mem_config.memory_layout == TensorMemoryLayout::WIDTH_SHARDED || shard_shape[0] % 32 == 0);
TT_ASSERT(shard_shape[1] % 32 == 0);
return {
.grid_size = shard_spec.grid.bounding_box().grid_size(),
.num_cores_nhw = num_cores_nhw,
.num_cores_c = num_cores_c,
.per_core_out_matrix_height_ntiles = shard_shape[0] / 32,
.per_core_out_matrix_height_ntiles = tt::round_up(shard_shape[0], 32) / 32,
.per_core_out_matrix_width_ntiles = shard_shape[1] / 32,
};
}
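A minimal sketch of the relaxed height handling above, mirroring `tt::round_up(shard_shape[0], 32) / 32` in Python: a shard height that is not a multiple of 32 is padded up to the next tile boundary before computing the per-core tile count. The Python helper names are illustrative; only the rounding behavior is assumed to match `tt::round_up`.

```python
TILE_HEIGHT = 32

def round_up(value: int, multiple: int) -> int:
    # Assumed semantics of tt::round_up: smallest multiple of `multiple` that is >= value.
    return ((value + multiple - 1) // multiple) * multiple

def per_core_out_matrix_height_ntiles(shard_height: int) -> int:
    # Previously shard_height had to be a multiple of 32, so this was shard_height // 32.
    # With the width-sharded exception, a partial last tile is rounded up instead.
    return round_up(shard_height, TILE_HEIGHT) // TILE_HEIGHT

print(per_core_out_matrix_height_ntiles(72))  # 3  (72 is padded up to 96)
print(per_core_out_matrix_height_ntiles(64))  # 2  (tile multiples are unchanged)
```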
@@ -765,8 +765,7 @@ std::tuple<ttnn::Tensor, uint32_t, uint32_t, ttnn::Tensor, std::optional<ttnn::T
sliding_window_config.pad_hw.first==0 &&
sliding_window_config.pad_hw.second==0
);
if(bypass_halo)
{
if(bypass_halo) {
// call conv micro op
auto conv_output = optimized_conv_new(
input_tensor_post_tm,