microsoft · fs-eire · Sep 27, 2024 · Sep 14, 2024 · Sep 25, 2024 · Sep 25, 2024
diff --git a/onnxruntime/core/providers/webgpu/tensor/transpose.cc b/onnxruntime/core/providers/webgpu/tensor/transpose.cc
@@ -47,11 +47,11 @@ ONNX_OPERATOR_KERNEL_EX(
         .TypeConstraint("T", WebGpuSupportedNumberTypes()),
     Transpose);
 
-const std::string AppendPermFunction(gsl::span<const size_t> perm) {
+const std::string AppendPermFunction(gsl::span<const int64_t> perm) {  // Returns the WGSL source text of the index-mapping helper `perm`.
   std::ostringstream ss;
   ss.imbue(std::locale::classic());
-  ss << "fn perm(i: y_indices_t)->x_indices_t {\n"
-        "  var a: x_indices_t;\n";
+  ss << "fn perm(i: output_indices_t)->a_indices_t {\n"  // presumably the index aliases of the shader variables named "output" and "a" - confirm
+        "  var a: a_indices_t;\n";  // one `a[perm[k]] = i[k];` line is appended per permutation entry below
   for (size_t i = 0; i < perm.size(); ++i) {
     ss << "  a[" << perm[i] << "] = i[" << i << "];\n";
   }
@@ -60,21 +60,53 @@ const std::string AppendPermFunction(gsl::span<const size_t> perm) {
   return ss.str();
 }
 
+// Squeezes size-1 dimensions out of a transpose description. Every extent of `shape` that is
+// not 1 is appended to `new_shape`; every entry of `adjusted_perm` that names such an axis is
+// appended to `new_perm`, keeping its original axis number (entries are not renumbered).
+// Both outputs are appended to, not cleared. Requires adjusted_perm.size() >= shape.size().
+void SqueezeShape(const gsl::span<const int64_t>& shape, const gsl::span<const size_t>& adjusted_perm, InlinedVector<int64_t>& new_shape, InlinedVector<int64_t>& new_perm) {
+  for (size_t i = 0; i < shape.size(); ++i) {  // size_t index: no signed/unsigned comparison with size()
+    if (shape[i] != 1) new_shape.push_back(shape[i]);
+    if (shape[adjusted_perm[i]] != 1) new_perm.push_back(static_cast<int64_t>(adjusted_perm[i]));
+  }
+}
+
 Status TransposeProgram::GenerateShaderCode(ShaderHelper& shader) const {
-  const auto& input = shader.AddInput("x", ShaderUsage::UseUniform | ShaderUsage::UseIndicesTypeAlias);
-  const auto& output = shader.AddOutput("y", ShaderUsage::UseUniform | ShaderUsage::UseIndicesTypeAlias);
-  shader.AppendImplementation(AppendPermFunction(this->perm_));
-  shader.SetMainFunctionBody(shader.GuardAgainstOutOfBoundsWorkgroupSizes("uniforms.output_size"),
-                             "  let indices = ", output.OffsetToIndices("global_idx"),
-                             ";\n"
-                             "  let x_indices = perm(indices); \n"
-                             "  ",
-                             output.SetByOffset("global_idx", input.GetByIndices("x_indices")));
+  const auto& input = shader.AddInput("a", ShaderUsage::UseUniform | ShaderUsage::UseIndicesTypeAlias);
+  const auto& output = shader.AddOutput("output", ShaderUsage::UseUniform | ShaderUsage::UseIndicesTypeAlias | ShaderUsage::UseValueTypeAlias);
+
+  if (use_shared_) {  // 2-D transpose staged through a workgroup-memory tile: output[r][c] = a[c][r]
+    // `tile_size` in the WGSL text below is the shader constant declared via WEBGPU_PROGRAM_DEFINE_CONSTANTS.
+    shader.AppendImplementation("var<workgroup> tile : array<array<output_value_t, tile_size + 1>, tile_size>;\n");  // NOTE(review): the extra column is presumably padding against bank conflicts - confirm
+    shader.SetMainFunctionBody(
+        "  let stride = (uniforms.output_shape[1] - 1) / tile_size + 1;\n"  // number of tiles across the output columns
+        "  let workgroup_id_x = workgroup_idx % stride;\n"
+        "  let workgroup_id_y = workgroup_idx / stride;\n"
+        "  let input_col = workgroup_id_y * tile_size + local_id.x;\n"
+        "  let input_row = workgroup_id_x * tile_size + local_id.y;\n"
+        "  if (input_row < uniforms.a_shape[0] && input_col < uniforms.a_shape[1]) {\n"
+        "    tile[local_id.y][local_id.x] = " +
+        input.GetByIndices("a_indices_t(input_row, input_col)") +
+        ";\n"
+        "  }\n"
+        "  workgroupBarrier();\n"  // placed outside both bounds checks; separates the tile writes from the swapped reads
+        "  let output_col = workgroup_id_x * tile_size + local_id.x;\n"
+        "  let output_row = workgroup_id_y * tile_size + local_id.y;\n"
+        "  if (output_row < uniforms.output_shape[0] && output_col < uniforms.output_shape[1]) {\n    " +
+        output.SetByIndices("output_indices_t(output_row, output_col)", "tile[local_id.x][local_id.y]") + "\n  }");
+  } else {  // generic path: map each output offset back to input indices through perm()
+    shader.AppendImplementation(AppendPermFunction(this->perm_));
+    shader.SetMainFunctionBody(shader.GuardAgainstOutOfBoundsWorkgroupSizes("uniforms.output_size"),
+                               "  let indices = ", output.OffsetToIndices("global_idx"),
+                               ";\n"
+                               "  let x_indices = perm(indices);\n",
+                               "  ",
+                               output.SetByOffset("global_idx", input.GetByIndices("x_indices")));
+  }
   return Status::OK();
 }
 
 Status Transpose::ComputeInternal(ComputeContext& context) const {
-  // TODO: there is an optimized version of transpose to port.
   const auto* input_tensor = context.Input(0);
   const TensorShape& input_shape = input_tensor->Shape();
   int32_t rank = gsl::narrow_cast<int32_t>(input_shape.NumDimensions());
@@ -86,16 +118,43 @@ Status Transpose::ComputeInternal(ComputeContext& context) const {
   TensorShape output_shape(output_dims);
   auto* output_tensor = context.Output(0, output_shape);
 
+  InlinedVector<int64_t> new_shape{};
+  InlinedVector<int64_t> new_perm{};
+  SqueezeShape(input_shape.GetDims(), *p_perm, new_shape, new_perm);  // classify on the shape with size-1 dims removed
+  const bool channels_last = new_perm == InlinedVector<int64_t>({2, 3, 1});
+  const bool channels_first = new_perm == InlinedVector<int64_t>({3, 1, 2});
+  const bool use_shared = (new_shape.size() == 2 && new_perm[0] > new_perm[1]) || channels_last || channels_first;  // cases rewritten below as a 2-D transpose
+  auto new_input_shape = input_shape;
+  TensorShape new_output_shape(output_dims);
+  if (use_shared) {  // collapse to a 2-D input; the output is that shape with its two dims swapped
+    new_input_shape = channels_last
+                          ? TensorShape({new_shape[0], new_shape[1] * new_shape[2]})
+                      : channels_first
+                          ? TensorShape({new_shape[0] * new_shape[1], new_shape[2]})
+                          : new_shape;
+    new_output_shape = TensorShape({new_input_shape[1], new_input_shape[0]});
+  }
+
   uint32_t output_size = gsl::narrow_cast<int32_t>(input_tensor->Shape().Size());
-  TransposeProgram program{*p_perm};
+  TransposeProgram program{*p_perm, use_shared};
+  const auto tile_size = TransposeProgram::TILE_SIZE;
+  if (use_shared) {
+    program.SetWorkgroupSize(tile_size, tile_size, 1);
+  }
+
   program
       .CacheHint(absl::StrJoin(*p_perm, "-"))
-      .AddInputs({{input_tensor, ProgramTensorMetadataDependency::TypeAndRank}})
-      .AddOutputs({output_tensor})
-      .SetDispatchGroupSize((output_size + WORKGROUP_SIZE - 1) / WORKGROUP_SIZE)
+      .AddInputs({{input_tensor, ProgramTensorMetadataDependency::TypeAndRank, new_input_shape, 1}})
+      .AddOutputs({{output_tensor, ProgramTensorMetadataDependency::None, new_output_shape, 1}})
       .AddUniformVariables({
           {static_cast<uint32_t>(output_size)},
       });
+  if (use_shared) {  // tiled path: one workgroup per tile_size x tile_size tile of the 2-D output
+    program.SetDispatchGroupSize(static_cast<uint32_t>((new_output_shape[1] + tile_size - 1) / tile_size),
+                                 static_cast<uint32_t>((new_output_shape[0] + tile_size - 1) / tile_size));
+  } else {  // chosen here, not in the chain above, so new_output_shape[1] is never read when output_dims may have < 2 entries
+    program.SetDispatchGroupSize((output_size + WORKGROUP_SIZE - 1) / WORKGROUP_SIZE);
+  }
   return context.RunProgram(program);
 }
 

diff --git a/onnxruntime/core/providers/webgpu/tensor/transpose.h b/onnxruntime/core/providers/webgpu/tensor/transpose.h
@@ -13,16 +13,19 @@ namespace webgpu {
 
 class TransposeProgram final : public Program<TransposeProgram> {
  public:
-  TransposeProgram(const gsl::span<const size_t>& permutations)
-      : Program{"Transpose"}, perm_(permutations.begin(), permutations.end()) {
+  TransposeProgram(const gsl::span<const size_t>& permutations, bool use_shared)  // use_shared: pick the tiled shader body instead of the perm()-based one
+      : Program{"Transpose"}, perm_(permutations.begin(), permutations.end()), use_shared_(use_shared) {
   }
 
   Status GenerateShaderCode(ShaderHelper& sh) const override;
+  constexpr static const uint32_t TILE_SIZE = 16;  // tile edge length; also the x and y workgroup size on the tiled path
 
   WEBGPU_PROGRAM_DEFINE_UNIFORM_VARIABLES({"output_size", ProgramUniformVariableDataType::Uint32});
+  WEBGPU_PROGRAM_DEFINE_CONSTANTS({"tile_size", TILE_SIZE});  // makes TILE_SIZE visible to the shader text under the name "tile_size"
 
  private:
-  InlinedVector<size_t> perm_;
+  InlinedVector<int64_t> perm_;  // copy of the constructor's permutation, converted element-wise from size_t
+  const bool use_shared_;  // read by GenerateShaderCode to choose between the two shader bodies
 };
 
 class Transpose final : public WebGpuKernel, public TransposeBase {