[Adreno][OpenCL] Get rid of extra memory copy (#12286)
* Add annotation pass for device_copy where we get buffers but expect textures

* Fix issues with running device_copy

* Get rid of extra memory copy

* Fix build after cherry-picking

* Fix lint

* Fix CI

* Apply comments

Co-authored-by: Andrey Malyshev <elvin.nnov@gmail.com>
echuraev and elvin-n authored Aug 12, 2022
1 parent e8de88e commit 5deb95a
Showing 11 changed files with 312 additions and 63 deletions.
18 changes: 18 additions & 0 deletions python/tvm/relay/op/strategy/adreno.py
@@ -257,3 +257,21 @@ def schedule_pool_adreno(attrs, outs, target):
         if attrs.layout == "NCHW4c":
             return topi.adreno.schedule_pool(outs, attrs.layout)
         return topi.cuda.schedule_pool(outs, attrs.layout)
+
+
+@schedule_injective.register(["adreno"])
+def schedule_injective_adreno(attrs, outs, target):
+    """schedule injective ops for adreno"""
+    with target:
+        return topi.adreno.schedule_injective(outs)
+
+
+@concatenate_strategy.register(["adreno"])
+def concatenate_strategy_adreno(attrs, inputs, out_type, target):
+    strategy = _op.OpStrategy()
+    strategy.add_implementation(
+        wrap_compute_concat(topi.transform.concatenate),
+        wrap_topi_schedule(topi.adreno.schedule_injective),
+        name="concatenate.adreno",
+    )
+    return strategy
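
A minimal usage sketch of the new registration (not part of this commit's diff; the shapes, dtype, and target/host strings are illustrative assumptions):

import tvm
from tvm import relay

# Two tensors concatenated along the channel axis. Before this commit the
# injective schedule fell through to the generic GPU one; with the
# registrations above it routes to topi.adreno.schedule_injective.
x = relay.var("x", shape=(1, 32, 56, 56), dtype="float16")
y = relay.var("y", shape=(1, 32, 56, 56), dtype="float16")
mod = tvm.IRModule.from_expr(relay.concatenate([x, y], axis=1))

# "-device=adreno" selects the strategies registered for ["adreno"].
target = tvm.target.Target("opencl -device=adreno", host="llvm -mtriple=arm64-linux-android")
with tvm.transform.PassContext(opt_level=3):
    lib = relay.build(mod, target=target)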
1 change: 1 addition & 0 deletions python/tvm/topi/adreno/__init__.py
@@ -25,3 +25,4 @@
 from .conv2d_alter_op import *
 from .conv2d_nchw_winograd import *
 from .conv2d_nhwc_winograd import *
+from .injective import schedule_injective
29 changes: 18 additions & 11 deletions python/tvm/topi/adreno/conv2d_nchw.py
@@ -279,28 +279,35 @@ def schedule_conv2d_NCHWc_KCRSk(cfg, s, output):
     ):  # len(latest.op.axis) == 4:
         # manage scheduling of datacopy
         pad_data, kernel = s[conv].op.input_tensors
-        pack_data = pad_data.op.input_tensors[0]
-        bind_data_copy(s[pack_data])
+        if "pad_temp" in pad_data.op.name:
+            pack_data = pad_data.op.input_tensors[0]
+            bind_data_copy(s[pack_data])
+        else:
+            bind_data_copy(s[pad_data])
         bind_data_copy(s[kernel])

     pad_data, kernel = s[conv].op.input_tensors

-    s[pad_data].compute_inline()
-
-    s[conv].set_scope("local")
-    if latest_blocked == latest and output != latest:
-        s[output].compute_inline()
-
-    # create cache stage
-    AT = s.cache_read(pad_data, get_texture_storage(pad_data.shape), [conv])
-    bind_data_copy(s[AT])
     if (
         autotvm.GLOBAL_SCOPE.in_tuning
         or isinstance(kernel.op, tvm.te.ComputeOp)
         and "filter_pack" in kernel.op.tag
     ):
+        if "pad_temp" in pad_data.op.name:
+            s[pad_data].compute_inline()
+            AT = s.cache_read(pad_data, get_texture_storage(pad_data.shape), [conv])
+            bind_data_copy(s[AT])
         WT = s.cache_read(kernel, get_texture_storage(kernel.shape), [conv])
         bind_data_copy(s[WT])
+    elif "pad_temp" in pad_data.op.name:
+        s[pad_data].compute_inline()
+        # create cache stage
+        AT = s.cache_read(pad_data, get_texture_storage(pad_data.shape), [conv])
+        bind_data_copy(s[AT])
+
+    s[conv].set_scope("local")
+    if latest_blocked == latest and output != latest:
+        s[output].compute_inline()

     # tile and bind spatial axes
     n, fc, y, x, fb = s[latest_blocked].op.axis
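
The same rework is applied to all four conv/depthwise schedules in this commit. It hinges on add_pad (utils.py below) now returning its input unchanged when padding is all-zero: a pad stage may or may not exist, so the schedules key off the "pad_temp" op name. A standalone sketch of that detection pattern (shapes and the toy consumer are illustrative, not taken from the diff):

from tvm import te, topi

data = te.placeholder((1, 32, 56, 56), name="data")
# With nonzero padding a pad_temp stage appears between input and consumer.
padded = topi.nn.pad(data, [0, 0, 1, 1], [0, 0, 1, 1], name="pad_temp")
out = te.compute(padded.shape, lambda *i: padded(*i) * 2.0, name="out")

s = te.create_schedule(out.op)
(producer,) = s[out].op.input_tensors
if "pad_temp" in producer.op.name:
    s[producer].compute_inline()  # pad stage exists: fold it into the consumer
else:
    pass  # producer is the raw input; schedule its data copy directly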
29 changes: 18 additions & 11 deletions python/tvm/topi/adreno/conv2d_nhwc.py
@@ -275,28 +275,35 @@ def schedule_conv2d_NHWC(cfg, s, output):
     ):  # len(latest.op.axis) == 4:
         # manage scheduling of datacopy
         pad_data, kernel = s[conv].op.input_tensors
-        pack_data = pad_data.op.input_tensors[0]
-        bind_data_copy(s[pack_data])
+        if "pad_temp" in pad_data.op.name:
+            pack_data = pad_data.op.input_tensors[0]
+            bind_data_copy(s[pack_data])
+        else:
+            bind_data_copy(s[pad_data])
         bind_data_copy(s[kernel])

     pad_data, kernel = s[conv].op.input_tensors

-    s[pad_data].compute_inline()
-
-    s[conv].set_scope("local")
-    if latest_blocked == latest and output != latest:
-        s[output].compute_inline()
-
-    # create cache stage
-    AT = s.cache_read(pad_data, get_texture_storage(pad_data.shape), [conv])
-    bind_data_copy(s[AT])
     if (
         autotvm.GLOBAL_SCOPE.in_tuning
         or isinstance(kernel.op, tvm.te.ComputeOp)
         and "filter_pack" in kernel.op.tag
     ):
+        if "pad_temp" in pad_data.op.name:
+            s[pad_data].compute_inline()
+            AT = s.cache_read(pad_data, get_texture_storage(pad_data.shape), [conv])
+            bind_data_copy(s[AT])
         WT = s.cache_read(kernel, get_texture_storage(kernel.shape), [conv])
         bind_data_copy(s[WT])
+    elif "pad_temp" in pad_data.op.name:
+        s[pad_data].compute_inline()
+        # create cache stage
+        AT = s.cache_read(pad_data, get_texture_storage(pad_data.shape), [conv])
+        bind_data_copy(s[AT])
+
+    s[conv].set_scope("local")
+    if latest_blocked == latest and output != latest:
+        s[output].compute_inline()

     # tile and bind spatial axes
     n, y, x, fc, fb = s[latest_blocked].op.axis
10 changes: 7 additions & 3 deletions python/tvm/topi/adreno/depthwise_conv2d_nchw.py
@@ -253,13 +253,17 @@ def schedule_depthwise_conv2d_NCHWc_KCRSk(cfg, s, output):
     ):  # len(latest.op.axis) == 4:
         # manage scheduling of datacopy
         pad_data, kernel = s[conv].op.input_tensors
-        pack_data = pad_data.op.input_tensors[0]
-        bind_data_copy(s[pack_data])
+        if "pad_temp" in pad_data.op.name:
+            pack_data = pad_data.op.input_tensors[0]
+            bind_data_copy(s[pack_data])
+        else:
+            bind_data_copy(s[pad_data])
         bind_data_copy(s[kernel])

     pad_data, kernel = s[conv].op.input_tensors

-    s[pad_data].compute_inline()
+    if "pad_temp" in pad_data.op.name:
+        s[pad_data].compute_inline()

     s[conv].set_scope("local")
     if latest_blocked == latest and output != latest:
10 changes: 7 additions & 3 deletions python/tvm/topi/adreno/depthwise_conv2d_nhwc.py
@@ -247,13 +247,17 @@ def schedule_depthwise_conv2d_NHWC_HWOI(cfg, s, output):
     ):  # len(latest.op.axis) == 4:
         # manage scheduling of datacopy
         pad_data, kernel = s[conv].op.input_tensors
-        pack_data = pad_data.op.input_tensors[0]
-        bind_data_copy(s[pack_data])
+        if "pad_temp" in pad_data.op.name:
+            pack_data = pad_data.op.input_tensors[0]
+            bind_data_copy(s[pack_data])
+        else:
+            bind_data_copy(s[pad_data])
         bind_data_copy(s[kernel])

     pad_data, kernel = s[conv].op.input_tensors

-    s[pad_data].compute_inline()
+    if "pad_temp" in pad_data.op.name:
+        s[pad_data].compute_inline()

     s[conv].set_scope("local")
     if latest_blocked == latest and output != latest:
66 changes: 66 additions & 0 deletions python/tvm/topi/adreno/injective.py
@@ -0,0 +1,66 @@
+# Licensed to the Apache Software Foundation (ASF) under one
+# or more contributor license agreements.  See the NOTICE file
+# distributed with this work for additional information
+# regarding copyright ownership.  The ASF licenses this file
+# to you under the Apache License, Version 2.0 (the
+# "License"); you may not use this file except in compliance
+# with the License.  You may obtain a copy of the License at
+#
+#   http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing,
+# software distributed under the License is distributed on an
+# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+# KIND, either express or implied.  See the License for the
+# specific language governing permissions and limitations
+# under the License.
+# pylint: disable=invalid-name, unused-variable,
+"""Schedule for composition of injective operator"""
+import tvm
+from tvm import te
+from .utils import bind_data_copy
+from .. import utils
+
+
+def schedule_injective_from_existing(sch, out):
+    """Schedule for injective op from existing schedule.
+
+    Parameters
+    ----------
+    sch: Schedule
+        The schedule to update.
+    out: Tensor
+        The tensor representing the injective op.
+
+    Returns
+    -------
+    sch: Schedule
+        The updated schedule.
+    """
+
+    bind_data_copy(sch[out])
+    return sch
+
+
+def schedule_injective(outs):
+    """Schedule for injective op.
+
+    Parameters
+    ----------
+    outs: Array of Tensor
+        The computation graph description of injective in the format
+        of an array of tensors.
+
+    Returns
+    -------
+    sch: Schedule
+        The computation schedule for the op.
+    """
+    outs = [outs] if isinstance(outs, te.tensor.Tensor) else outs
+    s = te.create_schedule([x.op for x in outs])
+
+    tvm.te.schedule.AutoInlineInjective(s)
+    for out in outs:
+        if not utils.is_empty_shape(out.shape):
+            schedule_injective_from_existing(s, out)
+    return s
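
For reference, a standalone sketch of driving the new schedule straight from TOPI (the shape and the elementwise op are illustrative; building for "opencl" assumes an OpenCL-enabled TVM):

import tvm
from tvm import te, topi

A = te.placeholder((1, 32, 56, 56), name="A", dtype="float16")
B = topi.add(A, A)  # any injective op

s = topi.adreno.schedule_injective([B])  # fuses and binds via bind_data_copy
mod = tvm.build(s, [A, B], target="opencl")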
23 changes: 22 additions & 1 deletion python/tvm/topi/adreno/utils.py
@@ -474,7 +474,19 @@ def add_pad(
         pad_after[x_axis] -= in_width + pad_before[x_axis] + pad_after[x_axis] - input_latest_w
     if input_latest_h < in_height + pad_before[y_axis] + pad_after[y_axis]:
         pad_after[y_axis] -= in_height + pad_before[y_axis] + pad_after[y_axis] - input_latest_h
-    return nn.pad(data, pad_before, pad_after, name="pad_temp")
+    if (
+        pad_before[0] == 0
+        and pad_before[1] == 0
+        and pad_before[2] == 0
+        and pad_before[3] == 0
+        and pad_after[0] == 0
+        and pad_after[1] == 0
+        and pad_after[2] == 0
+        and pad_after[3] == 0
+    ):
+        return data
+    else:
+        return nn.pad(data, pad_before, pad_after, name="pad_temp")


 def bind_data_copy(stage, axis_to_vectorize=None):
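
The intent of the add_pad change, reduced to a runnable sketch (maybe_pad and the shape are hypothetical illustrations, not part of the diff): when padding is all-zero, returning data directly means no pad_temp stage, and therefore no extra copy, is ever created.

from tvm import te, topi

def maybe_pad(data, pad_before, pad_after):
    # Mirrors the new add_pad tail: skip nn.pad entirely for all-zero pads.
    if all(p == 0 for p in list(pad_before) + list(pad_after)):
        return data
    return topi.nn.pad(data, pad_before, pad_after, name="pad_temp")

data = te.placeholder((1, 32, 56, 56), name="data")
assert maybe_pad(data, [0, 0, 0, 0], [0, 0, 0, 0]) is data
assert "pad_temp" in maybe_pad(data, [0, 0, 1, 1], [0, 0, 1, 1]).op.name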
@@ -522,6 +534,15 @@ def bind_data_copy(stage, axis_to_vectorize=None):
             stage.bind(thread, te.thread_axis("threadIdx.x"))
             if shape[-1] == 4:
                 stage.vectorize(axes[-1])
+        # 1024 is the maximum work group size for Adreno devices.
+        # See: CL_DEVICE_MAX_WORK_GROUP_SIZE
+        elif shape[-1] > 1024:
+            ftc = numpy.prod(shape[:-1])
+            div = get_div(ftc, 1024)
+            by, ty = stage.split(axes[-1], factor=div)
+            stage.bind(fused, te.thread_axis("blockIdx.x"))
+            stage.bind(by, te.thread_axis("blockIdx.y"))
+            stage.bind(ty, te.thread_axis("threadIdx.y"))
         else:
             stage.bind(fused, te.thread_axis("blockIdx.x"))
             stage.bind(*axes[-1:], te.thread_axis("threadIdx.x"))
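
The new elif keeps a work group within 1024 threads (CL_DEVICE_MAX_WORK_GROUP_SIZE on typical Adreno parts) when the innermost dimension alone exceeds that limit. The same binding pattern in plain TE, with a fixed split factor standing in for get_div (shape and factor are illustrative):

from tvm import te

A = te.placeholder((16, 4096), name="A")
B = te.compute(A.shape, lambda i, j: A[i, j] + 1.0, name="B")
s = te.create_schedule(B.op)

i, j = s[B].op.axis
bo, to = s[B].split(j, factor=1024)  # cap threads per work group at 1024
s[B].bind(i, te.thread_axis("blockIdx.x"))
s[B].bind(bo, te.thread_axis("blockIdx.y"))
s[B].bind(to, te.thread_axis("threadIdx.y"))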