From 5deb95a9472002c9fa36a150f9e348f4276d63c5 Mon Sep 17 00:00:00 2001 From: Egor Churaev Date: Fri, 12 Aug 2022 05:53:39 +0300 Subject: [PATCH] [Adreno][OpenCL] Get rid of extra memory copy (#12286) * Add annotation pass for device_copy where we get buffers but expect textures * Fix issues with running device_copy * Get rid of extra memory copy * Fix build after cherry-picking * Fix lint * Fix CI * Apply comments Co-authored-by: Andrey Malyshev --- python/tvm/relay/op/strategy/adreno.py | 18 +++ python/tvm/topi/adreno/__init__.py | 1 + python/tvm/topi/adreno/conv2d_nchw.py | 29 ++-- python/tvm/topi/adreno/conv2d_nhwc.py | 29 ++-- .../tvm/topi/adreno/depthwise_conv2d_nchw.py | 10 +- .../tvm/topi/adreno/depthwise_conv2d_nhwc.py | 10 +- python/tvm/topi/adreno/injective.py | 66 ++++++++ python/tvm/topi/adreno/utils.py | 23 ++- .../transforms/annotate_texture_storage.cc | 152 +++++++++++++++--- src/runtime/opencl/opencl_device_api.cc | 29 +++- .../python/relay/test_conv2d_nchw_texture.py | 8 +- 11 files changed, 312 insertions(+), 63 deletions(-) create mode 100644 python/tvm/topi/adreno/injective.py diff --git a/python/tvm/relay/op/strategy/adreno.py b/python/tvm/relay/op/strategy/adreno.py index cc082c9d61c3..a537fa1e7b90 100644 --- a/python/tvm/relay/op/strategy/adreno.py +++ b/python/tvm/relay/op/strategy/adreno.py @@ -257,3 +257,21 @@ def schedule_pool_adreno(attrs, outs, target): if attrs.layout == "NCHW4c": return topi.adreno.schedule_pool(outs, attrs.layout) return topi.cuda.schedule_pool(outs, attrs.layout) + + +@schedule_injective.register(["adreno"]) +def schedule_injective_adreno(attrs, outs, target): + """schedule injective ops for adreno""" + with target: + return topi.adreno.schedule_injective(outs) + + +@concatenate_strategy.register(["adreno"]) +def concatenate_strategy_adreno(attrs, inputs, out_type, target): + strategy = _op.OpStrategy() + strategy.add_implementation( + wrap_compute_concat(topi.transform.concatenate), + wrap_topi_schedule(topi.adreno.schedule_injective), + name="concatenate.adreno", + ) + return strategy diff --git a/python/tvm/topi/adreno/__init__.py b/python/tvm/topi/adreno/__init__.py index 57a9013b1a2a..227ca6aa9a48 100644 --- a/python/tvm/topi/adreno/__init__.py +++ b/python/tvm/topi/adreno/__init__.py @@ -25,3 +25,4 @@ from .conv2d_alter_op import * from .conv2d_nchw_winograd import * from .conv2d_nhwc_winograd import * +from .injective import schedule_injective diff --git a/python/tvm/topi/adreno/conv2d_nchw.py b/python/tvm/topi/adreno/conv2d_nchw.py index 16ecaa84d040..65cd8e0150a8 100644 --- a/python/tvm/topi/adreno/conv2d_nchw.py +++ b/python/tvm/topi/adreno/conv2d_nchw.py @@ -279,28 +279,35 @@ def schedule_conv2d_NCHWc_KCRSk(cfg, s, output): ): # len(latest.op.axis) == 4: # manage scheduling of datacopy pad_data, kernel = s[conv].op.input_tensors - pack_data = pad_data.op.input_tensors[0] - bind_data_copy(s[pack_data]) + if "pad_temp" in pad_data.op.name: + pack_data = pad_data.op.input_tensors[0] + bind_data_copy(s[pack_data]) + else: + bind_data_copy(s[pad_data]) bind_data_copy(s[kernel]) pad_data, kernel = s[conv].op.input_tensors - s[pad_data].compute_inline() - - s[conv].set_scope("local") - if latest_blocked == latest and output != latest: - s[output].compute_inline() - - # create cache stage - AT = s.cache_read(pad_data, get_texture_storage(pad_data.shape), [conv]) - bind_data_copy(s[AT]) if ( autotvm.GLOBAL_SCOPE.in_tuning or isinstance(kernel.op, tvm.te.ComputeOp) and "filter_pack" in kernel.op.tag ): + if "pad_temp" in pad_data.op.name: 
+ s[pad_data].compute_inline() + AT = s.cache_read(pad_data, get_texture_storage(pad_data.shape), [conv]) + bind_data_copy(s[AT]) WT = s.cache_read(kernel, get_texture_storage(kernel.shape), [conv]) bind_data_copy(s[WT]) + elif "pad_temp" in pad_data.op.name: + s[pad_data].compute_inline() + # create cache stage + AT = s.cache_read(pad_data, get_texture_storage(pad_data.shape), [conv]) + bind_data_copy(s[AT]) + + s[conv].set_scope("local") + if latest_blocked == latest and output != latest: + s[output].compute_inline() # tile and bind spatial axes n, fc, y, x, fb = s[latest_blocked].op.axis diff --git a/python/tvm/topi/adreno/conv2d_nhwc.py b/python/tvm/topi/adreno/conv2d_nhwc.py index ce7bf0ccc956..b377169ca8c9 100644 --- a/python/tvm/topi/adreno/conv2d_nhwc.py +++ b/python/tvm/topi/adreno/conv2d_nhwc.py @@ -275,28 +275,35 @@ def schedule_conv2d_NHWC(cfg, s, output): ): # len(latest.op.axis) == 4: # manage scheduling of datacopy pad_data, kernel = s[conv].op.input_tensors - pack_data = pad_data.op.input_tensors[0] - bind_data_copy(s[pack_data]) + if "pad_temp" in pad_data.op.name: + pack_data = pad_data.op.input_tensors[0] + bind_data_copy(s[pack_data]) + else: + bind_data_copy(s[pad_data]) bind_data_copy(s[kernel]) pad_data, kernel = s[conv].op.input_tensors - s[pad_data].compute_inline() - - s[conv].set_scope("local") - if latest_blocked == latest and output != latest: - s[output].compute_inline() - - # create cache stage - AT = s.cache_read(pad_data, get_texture_storage(pad_data.shape), [conv]) - bind_data_copy(s[AT]) if ( autotvm.GLOBAL_SCOPE.in_tuning or isinstance(kernel.op, tvm.te.ComputeOp) and "filter_pack" in kernel.op.tag ): + if "pad_temp" in pad_data.op.name: + s[pad_data].compute_inline() + AT = s.cache_read(pad_data, get_texture_storage(pad_data.shape), [conv]) + bind_data_copy(s[AT]) WT = s.cache_read(kernel, get_texture_storage(kernel.shape), [conv]) bind_data_copy(s[WT]) + elif "pad_temp" in pad_data.op.name: + s[pad_data].compute_inline() + # create cache stage + AT = s.cache_read(pad_data, get_texture_storage(pad_data.shape), [conv]) + bind_data_copy(s[AT]) + + s[conv].set_scope("local") + if latest_blocked == latest and output != latest: + s[output].compute_inline() # tile and bind spatial axes n, y, x, fc, fb = s[latest_blocked].op.axis diff --git a/python/tvm/topi/adreno/depthwise_conv2d_nchw.py b/python/tvm/topi/adreno/depthwise_conv2d_nchw.py index a11c3f3d36b8..37713b4584b9 100644 --- a/python/tvm/topi/adreno/depthwise_conv2d_nchw.py +++ b/python/tvm/topi/adreno/depthwise_conv2d_nchw.py @@ -253,13 +253,17 @@ def schedule_depthwise_conv2d_NCHWc_KCRSk(cfg, s, output): ): # len(latest.op.axis) == 4: # manage scheduling of datacopy pad_data, kernel = s[conv].op.input_tensors - pack_data = pad_data.op.input_tensors[0] - bind_data_copy(s[pack_data]) + if "pad_temp" in pad_data.op.name: + pack_data = pad_data.op.input_tensors[0] + bind_data_copy(s[pack_data]) + else: + bind_data_copy(s[pad_data]) bind_data_copy(s[kernel]) pad_data, kernel = s[conv].op.input_tensors - s[pad_data].compute_inline() + if "pad_temp" in pad_data.op.name: + s[pad_data].compute_inline() s[conv].set_scope("local") if latest_blocked == latest and output != latest: diff --git a/python/tvm/topi/adreno/depthwise_conv2d_nhwc.py b/python/tvm/topi/adreno/depthwise_conv2d_nhwc.py index 117daf825d06..2b228b444fca 100644 --- a/python/tvm/topi/adreno/depthwise_conv2d_nhwc.py +++ b/python/tvm/topi/adreno/depthwise_conv2d_nhwc.py @@ -247,13 +247,17 @@ def schedule_depthwise_conv2d_NHWC_HWOI(cfg, s, output): 
): # len(latest.op.axis) == 4: # manage scheduling of datacopy pad_data, kernel = s[conv].op.input_tensors - pack_data = pad_data.op.input_tensors[0] - bind_data_copy(s[pack_data]) + if "pad_temp" in pad_data.op.name: + pack_data = pad_data.op.input_tensors[0] + bind_data_copy(s[pack_data]) + else: + bind_data_copy(s[pad_data]) bind_data_copy(s[kernel]) pad_data, kernel = s[conv].op.input_tensors - s[pad_data].compute_inline() + if "pad_temp" in pad_data.op.name: + s[pad_data].compute_inline() s[conv].set_scope("local") if latest_blocked == latest and output != latest: diff --git a/python/tvm/topi/adreno/injective.py b/python/tvm/topi/adreno/injective.py new file mode 100644 index 000000000000..52ab0eab33fb --- /dev/null +++ b/python/tvm/topi/adreno/injective.py @@ -0,0 +1,66 @@ +# Licensed to the Apache Software Foundation (ASF) under one +# or more contributor license agreements. See the NOTICE file +# distributed with this work for additional information +# regarding copyright ownership. The ASF licenses this file +# to you under the Apache License, Version 2.0 (the +# "License"); you may not use this file except in compliance +# with the License. You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, +# software distributed under the License is distributed on an +# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +# KIND, either express or implied. See the License for the +# specific language governing permissions and limitations +# under the License. +# pylint: disable=invalid-name, unused-variable, +"""Schedule for composition of injective operator""" +import tvm +from tvm import te +from .utils import bind_data_copy +from .. import utils + + +def schedule_injective_from_existing(sch, out): + """Schedule for injective op from existing schedule. + + Parameters + ---------- + sch: Schedule + The schedule to update. + out: Tensor + The tensor representing the injective op. + + Returns + ------- + sch: Schedule + The updated schedule. + """ + + bind_data_copy(sch[out]) + return sch + + +def schedule_injective(outs): + """Schedule for injective op. + + Parameters + ---------- + outs: Array of Tensor + The computation graph description of injective in the format + of an array of tensors. + + Returns + ------- + sch: Schedule + The computation schedule for the op. 
+ """ + outs = [outs] if isinstance(outs, te.tensor.Tensor) else outs + s = te.create_schedule([x.op for x in outs]) + + tvm.te.schedule.AutoInlineInjective(s) + for out in outs: + if not utils.is_empty_shape(out.shape): + schedule_injective_from_existing(s, out) + return s diff --git a/python/tvm/topi/adreno/utils.py b/python/tvm/topi/adreno/utils.py index ea19e7d77dad..6ad5271744b2 100644 --- a/python/tvm/topi/adreno/utils.py +++ b/python/tvm/topi/adreno/utils.py @@ -474,7 +474,19 @@ def add_pad( pad_after[x_axis] -= in_width + pad_before[x_axis] + pad_after[x_axis] - input_latest_w if input_latest_h < in_height + pad_before[y_axis] + pad_after[y_axis]: pad_after[y_axis] -= in_height + pad_before[y_axis] + pad_after[y_axis] - input_latest_h - return nn.pad(data, pad_before, pad_after, name="pad_temp") + if ( + pad_before[0] == 0 + and pad_before[1] == 0 + and pad_before[2] == 0 + and pad_before[3] == 0 + and pad_after[0] == 0 + and pad_after[1] == 0 + and pad_after[2] == 0 + and pad_after[3] == 0 + ): + return data + else: + return nn.pad(data, pad_before, pad_after, name="pad_temp") def bind_data_copy(stage, axis_to_vectorize=None): @@ -522,6 +534,15 @@ def bind_data_copy(stage, axis_to_vectorize=None): stage.bind(thread, te.thread_axis("threadIdx.x")) if shape[-1] == 4: stage.vectorize(axes[-1]) + # 1024 is the maximum work group size for Adreno devices. + # See: CL_DEVICE_MAX_WORK_GROUP_SIZE + elif shape[-1] > 1024: + ftc = numpy.prod(shape[:-1]) + div = get_div(ftc, 1024) + by, ty = stage.split(axes[-1], factor=div) + stage.bind(fused, te.thread_axis("blockIdx.x")) + stage.bind(by, te.thread_axis("blockIdx.y")) + stage.bind(ty, te.thread_axis("threadIdx.y")) else: stage.bind(fused, te.thread_axis("blockIdx.x")) stage.bind(*axes[-1:], te.thread_axis("threadIdx.x")) diff --git a/src/relay/transforms/annotate_texture_storage.cc b/src/relay/transforms/annotate_texture_storage.cc index 3dd918d962f5..b3ed28db4574 100644 --- a/src/relay/transforms/annotate_texture_storage.cc +++ b/src/relay/transforms/annotate_texture_storage.cc @@ -23,7 +23,7 @@ * storage scope related information. * * - CollectStorageInfo returns a mapping from relay expr - * to a list of output storage scopes for each output. + * to a map of storage scopes for each call argument. * These scopes are used during memory planning as well * as downstream when doing codegen and in the graph runtime when doing runtime dataspace * allocations. 
@@ -42,6 +42,8 @@
 #include <memory>
 #include <unordered_map>
 
+#include "../op/memory/device_copy.h"
+#include "../op/memory/memory.h"
 #include "../transforms/device_aware_visitors.h"
 
 namespace tvm {
@@ -55,15 +57,17 @@ class StorageInfo : private transform::DeviceAwareExprVisitor {
  public:
   StorageInfo() : transform::DeviceAwareExprVisitor(Optional<IRModule>()) {}
 
-  static Map<Expr, Array<String>> GetStorageMap(const Expr& expr) {
+  static Map<Expr, Map<Expr, Array<String>>> GetStorageMap(const Expr& expr) {
     StorageInfo storage_info;
     storage_info.VisitExpr(expr);
     storage_info.LegalizeProducerStorage();
-    Map<Expr, Array<String>> storage_map;
+    Map<Expr, Map<Expr, Array<String>>> storage_map = storage_info.accept_textures_;
     for (auto& kv : storage_info.storage_scope_) {
       std::vector<String> storage_scopes;
       std::copy(kv.second.begin(), kv.second.end(), std::back_inserter(storage_scopes));
-      storage_map.Set(GetRef<Expr>(kv.first), Array<String>{storage_scopes});
+      Map<Expr, Array<String>> ent;
+      ent.Set(Expr(), Array<String>{storage_scopes});
+      storage_map.Set(GetRef<Expr>(kv.first), ent);
     }
 
     // Filling the input arguments by "global" scope to handle PlanDevice algo which propagates
@@ -75,7 +79,9 @@ class StorageInfo : private transform::DeviceAwareExprVisitor {
       // even without verification of the consumer's outputs scope
       if (storage_info.CanConsumeTextures(cs.second) &&
           storage_map.find(GetRef<Expr>(cs.first)) == storage_map.end()) {
-        storage_map.Set(GetRef<Expr>(cs.first), Array<String>{"global"});
+        Map<Expr, Array<String>> ent;
+        ent.Set(Expr(), Array<String>{"global"});
+        storage_map.Set(GetRef<Expr>(cs.first), ent);
       }
     }
 
@@ -85,6 +91,25 @@ class StorageInfo : private transform::DeviceAwareExprVisitor {
       if (storage_map.count(a.first)) {
         for (const auto& v : a.second) {
           storage_map.Set(v, storage_map[a.first]);
+          if (storage_map[a.first][Expr()][0] == "global" &&
+              storage_info.accept_textures_.count(v)) {
+            Map<Expr, Array<String>> ent;
+            ent.Set(Expr(), storage_info.accept_textures_[v][Expr()]);
+            storage_map.Set(v, ent);
+            for (const auto& calls : storage_info.accept_textures_[v]) {
+              if (calls.first != Expr()) {
+                if (storage_map.count(a.first)) {
+                  Map<Expr, Array<String>> ent_call = storage_map[a.first];
+                  ent_call.Set(calls.first, calls.second);
+                  storage_map.Set(a.first, ent_call);
+                } else {
+                  Map<Expr, Array<String>> ent_call;
+                  ent_call.Set(calls.first, calls.second);
+                  storage_map.Set(a.first, ent_call);
+                }
+              }
+            }
+          }
         }
       }
     }
@@ -109,6 +134,18 @@ class StorageInfo : private transform::DeviceAwareExprVisitor {
 
   void VisitExpr_(const ConstantNode* cn) final { ApplyConsumerScopeToInputs(cn); }
 
+  void DeviceAwareVisitExpr_(const FunctionNode* function_node) final {
+    if (!function_node->HasNonzeroAttr(attr::kPrimitive)) {
+      for (auto&& param : function_node->params) {
+        auto virtual_device = GetVirtualDevice(param);
+        param->virtual_device_ =
+            VirtualDevice(virtual_device->device_type(), virtual_device->virtual_device_id,
+                          virtual_device->target, "global");
+      }
+    }
+    transform::DeviceAwareExprVisitor::DeviceAwareVisitExpr_(function_node);
+  }
+
   void DeviceAwareVisitExpr_(const CallNode* call) final {
     // Check the contents of this primitive function
     if (const auto* fn = call->op.as<FunctionNode>()) {
@@ -135,6 +172,23 @@ class StorageInfo : private transform::DeviceAwareExprVisitor {
       }
       for (size_t i = 0; i < fn->params.size(); i++) {
         args_to_vars_[call->args[i]].push_back(fn->params[i]);
+        // adding info about arguments if they can be converted to texture
+        for (const auto& ttype : FlattenTupleType(fn->params[i]->checked_type())) {
+          std::string scope = Scope(ttype->shape, GetVirtualDevice(GetRef<Expr>(call)));
+          if (scope.find("global.texture") != std::string::npos) {
+            if (accept_textures_.count(fn->params[i])) {
+              Map<Expr, Array<String>> ent = accept_textures_[fn->params[i]];
+              ent.Set(GetRef<Expr>(call), Array<String>{scope});
+              ent.Set(Expr(), Array<String>{scope});
+              accept_textures_.Set(fn->params[i], ent);
+            } else {
+              Map<Expr, Array<String>> ent;
+              ent.Set(GetRef<Expr>(call), Array<String>{scope});
+              ent.Set(Expr(), Array<String>{scope});
+              accept_textures_.Set(fn->params[i], ent);
+            }
+          }
+        }
       }
     }
     // Add consumer storage scope information for call arguments
@@ -164,11 +218,6 @@ class StorageInfo : private transform::DeviceAwareExprVisitor {
       if (consumer_storage_scopes_.count(arg.operator->()) &&
          GetConsumerScope(consumer_storage_scopes_[arg.operator->()]) != "global.texture") {
         storage_scope_.erase(arg.operator->());
-        if (const auto* cn = arg.as<CallNode>()) {
-          if (const auto* fn = cn->op.as<FunctionNode>()) {
-            storage_scope_.erase(fn->body.operator->());
-          }
-        }
       }
     }
   }
@@ -336,6 +385,16 @@ class StorageInfo : private transform::DeviceAwareExprVisitor {
       if (attrs->layout == "NCHW4c") {
         supports_texture_storage = true;
       }
+    } else if (const OpNode* opnode = call->op.as<OpNode>()) {
+      auto fpattern = Op::GetAttrMap<TOpPattern>("TOpPattern");
+      auto pattern = fpattern[GetRef<Op>(opnode)];
+      if (pattern <= kInjective) {
+        if (const auto* ttype = call->checked_type().as<TensorTypeNode>()) {
+          if (ttype->shape.size() == 5) {
+            supports_texture_storage = true;
+          }
+        }
+      }
     }
 
     return supports_texture_storage;
@@ -350,6 +409,8 @@ class StorageInfo : private transform::DeviceAwareExprVisitor {
   std::unordered_map<const ExprNode*, std::vector<std::string>> consumer_storage_scopes_;
   /*! \brief mapping of arguments to call to function variables*/
   std::unordered_map<Expr, std::vector<Var>, ObjectPtrHash, ObjectPtrEqual> args_to_vars_;
+  /*! \brief mapping of arguments that can be converted to texture*/
+  Map<Expr, Map<Expr, Array<String>>> accept_textures_;
 };
 
 }  // namespace
@@ -365,43 +426,84 @@ class RewriteVDStorageScopes : public transform::DeviceAwareExprMutator {
   using VarMap = std::unordered_map<Expr, Var, ObjectPtrHash, ObjectPtrEqual>;
 
  public:
-  explicit RewriteVDStorageScopes(const Map<Expr, Array<String>>& storage_scope)
+  explicit RewriteVDStorageScopes(const Map<Expr, Map<Expr, Array<String>>>& storage_scope)
       : transform::DeviceAwareExprMutator(Optional<IRModule>()), storage_scope_(storage_scope) {}
 
   Function Rewrite(const Expr& expr) { return Downcast<Function>(Mutate(expr)); }
 
   Expr VisitExpr_(const VarNode* vn) final {
     if (storage_scope_.find(GetRef<Expr>(vn)) != storage_scope_.end() &&
-        storage_scope_[GetRef<Expr>(vn)][0] != "global") {
+        storage_scope_[GetRef<Expr>(vn)].find(Expr()) != storage_scope_[GetRef<Expr>(vn)].end() &&
+        storage_scope_[GetRef<Expr>(vn)][Expr()][0] != "global") {
       Var c = Var(vn->vid, vn->type_annotation, vn->span);
       auto virtual_device = GetVirtualDevice(GetRef<Expr>(vn));
       c->virtual_device_ =
           VirtualDevice(virtual_device->device_type(), virtual_device->virtual_device_id,
-                        virtual_device->target, storage_scope_[GetRef<Expr>(vn)][0]);
+                        virtual_device->target, storage_scope_[GetRef<Expr>(vn)][Expr()][0]);
       return c;
     }
     return GetRef<Expr>(vn);
   }
 
   Expr VisitExpr_(const ConstantNode* vn) final {
-    if (storage_scope_.find(GetRef<Expr>(vn)) != storage_scope_.end()) {
+    if (storage_scope_.find(GetRef<Expr>(vn)) != storage_scope_.end() &&
+        storage_scope_[GetRef<Expr>(vn)].find(Expr()) != storage_scope_[GetRef<Expr>(vn)].end()) {
       Expr c = Constant(vn->data, vn->span);
       auto virtual_device = GetVirtualDevice(GetRef<Expr>(vn));
-      c = OnDevice(c,
-                   VirtualDevice(virtual_device->device_type(), virtual_device->virtual_device_id,
-                                 virtual_device->target, storage_scope_[GetRef<Expr>(vn)][0]),
-                   true);
+      c = OnDevice(
+          c,
+          VirtualDevice(virtual_device->device_type(), virtual_device->virtual_device_id,
+                        virtual_device->target, storage_scope_[GetRef<Expr>(vn)][Expr()][0]),
+          true);
       return c;
     }
     return GetRef<Expr>(vn);
   }
 
   Expr DeviceAwareVisitExpr_(const CallNode* call_node) final {
-    auto new_call = transform::DeviceAwareExprMutator::DeviceAwareVisitExpr_(call_node);
+    // we need to duplicate ExprMutator::VisitExpr_ to correct argument scopes and
+    // put device_copy
+    auto new_op = this->Mutate(call_node->op);
+
+    tvm::Array<Type> ty_args;
+    ty_args.reserve(call_node->type_args.size());
+
+    for (auto ty_arg : call_node->type_args) {
+      auto new_ty_arg = this->VisitType(ty_arg);
+      ty_args.push_back(new_ty_arg);
+    }
+
+    tvm::Array<Expr> call_args;
+    call_args.reserve(call_node->args.size());
+    for (auto arg : call_node->args) {
+      auto new_arg = this->Mutate(arg);
+      // verification if we need to put device_copy
+      if (storage_scope_.count(arg) && storage_scope_[arg].count(GetRef<Expr>(call_node))) {
+        auto virtual_device = GetVirtualDevice(GetRef<Expr>(call_node));
+        VirtualDevice virtual_device_from =
+            VirtualDevice(virtual_device->device_type(), virtual_device->virtual_device_id,
+                          virtual_device->target, virtual_device->memory_scope);
+        VirtualDevice virtual_device_to =
+            VirtualDevice(virtual_device->device_type(), virtual_device->virtual_device_id,
+                          virtual_device->target, storage_scope_[arg][GetRef<Expr>(call_node)][0]);
+        new_arg = DeviceCopy(new_arg, virtual_device_from, virtual_device_to);
+        new_arg = OnDevice(
+            new_arg,
+            VirtualDevice(virtual_device->device_type(), virtual_device->virtual_device_id,
+                          virtual_device->target, storage_scope_[arg][GetRef<Expr>(call_node)][0]),
+            true);
      }
+      call_args.push_back(new_arg);
+    }
+
+    auto new_call = WithFields(GetRef<Call>(call_node), new_op, call_args, {}, ty_args);
+
     auto virtual_device = GetVirtualDevice(GetRef<Expr>(call_node));
     std::string memory_scope = "";
-    if (storage_scope_.find(GetRef<Expr>(call_node)) != storage_scope_.end()) {
-      memory_scope = storage_scope_[GetRef<Expr>(call_node)][0];
+    if (storage_scope_.find(GetRef<Expr>(call_node)) != storage_scope_.end() &&
+        storage_scope_[GetRef<Expr>(call_node)].find(Expr()) !=
+            storage_scope_[GetRef<Expr>(call_node)].end()) {
+      memory_scope = storage_scope_[GetRef<Expr>(call_node)][Expr()][0];
     } else if (virtual_device->memory_scope != "") {
      memory_scope = virtual_device->memory_scope;
    } else if (!call_node->op.as<FunctionNode>()) {
@@ -418,12 +520,12 @@ class RewriteVDStorageScopes : public transform::DeviceAwareExprMutator {
   }
 
  private:
-  Map<Expr, Array<String>> storage_scope_;
+  Map<Expr, Map<Expr, Array<String>>> storage_scope_;
   VarMap new_vars_;
   Array<String> current_function_scope_;
 };
 
-Map<Expr, Array<String>> CollectTextureStorage(const Expr& expr) {
+Map<Expr, Map<Expr, Array<String>>> CollectTextureStorage(const Expr& expr) {
   return StorageInfo::GetStorageMap(expr);
 }
 
@@ -479,7 +581,7 @@ class CollectVirtualDevices : public transform::DeviceAwareExprVisitor {
  * \param expr The expression.
  * \return The device based storage mapping.
  */
-Map<Expr, Array<String>> CollectStorageInfo(const Expr& expr) {
+Map<Expr, Map<Expr, Array<String>>> CollectStorageInfo(const Expr& expr) {
   std::set<std::string> device_types = CollectVirtualDevices().GetDevices(expr);
   // TODO(amalyshe): current approach collects all targets withing graph and call the only
   // function corresponding to all these targets in alphabetic order
@@ -490,7 +592,7 @@ Map<Expr, Array<String>> CollectStorageInfo(const Expr& expr) {
     ftarget_prefix += (std::string(".") + dev_id);
   }
 
-  Map<Expr, Array<String>> storage_info = {};
+  Map<Expr, Map<Expr, Array<String>>> storage_info = {};
   if (const auto* f = runtime::Registry::Get(ftarget_prefix + "._CollectStorageInfo")) {
     storage_info = (*f)(expr);
   }
diff --git a/src/runtime/opencl/opencl_device_api.cc b/src/runtime/opencl/opencl_device_api.cc
index cea0acc07c0b..d67864287dbc 100644
--- a/src/runtime/opencl/opencl_device_api.cc
+++ b/src/runtime/opencl/opencl_device_api.cc
@@ -273,12 +273,31 @@ void OpenCLWorkspace::CopyDataFromTo(DLTensor* from, DLTensor* to, TVMStreamHand
   if (IsOpenCLDevice(from->device) && IsOpenCLDevice(to->device)) {
     const auto* from_desc = static_cast<const cl::BufferDescriptor*>(from->data);
-    ICHECK(from_desc->layout == cl::BufferDescriptor::MemoryLayout::kBuffer1D)
-        << "Device to device copying is currently only implemented for OpenCL buffer storage";
     auto* to_desc = static_cast<cl::BufferDescriptor*>(to->data);
-    OPENCL_CALL(clEnqueueCopyBuffer(this->GetQueue(to->device), from_desc->buffer, to_desc->buffer,
-                                    from->byte_offset, to->byte_offset, nbytes, 0, nullptr,
-                                    nullptr));
+    if (to_desc->layout == cl::BufferDescriptor::MemoryLayout::kBuffer1D &&
+        from_desc->layout == cl::BufferDescriptor::MemoryLayout::kBuffer1D) {
+      OPENCL_CALL(clEnqueueCopyBuffer(this->GetQueue(to->device), from_desc->buffer,
+                                      to_desc->buffer, from->byte_offset, to->byte_offset, nbytes,
+                                      0, nullptr, nullptr));
+    } else if (to_desc->layout != cl::BufferDescriptor::MemoryLayout::kBuffer1D &&
+               from_desc->layout == cl::BufferDescriptor::MemoryLayout::kBuffer1D) {
+      auto image_info = GetImageInfo(to_desc, to);
+      OPENCL_CALL(clEnqueueCopyBufferToImage(this->GetQueue(to->device), from_desc->buffer,
+                                             to_desc->buffer, from->byte_offset, image_info.origin,
+                                             image_info.region, 0, nullptr, nullptr));
+    } else if (to_desc->layout == cl::BufferDescriptor::MemoryLayout::kBuffer1D &&
+               from_desc->layout != cl::BufferDescriptor::MemoryLayout::kBuffer1D) {
+      auto image_info = GetImageInfo(from_desc, from);
+      OPENCL_CALL(clEnqueueCopyImageToBuffer(this->GetQueue(to->device), from_desc->buffer,
+                                             to_desc->buffer, image_info.origin, image_info.region,
+                                             to->byte_offset, 0, nullptr, nullptr));
+    } else {
+      auto to_image_info = GetImageInfo(to_desc, to);
+      auto from_image_info = GetImageInfo(from_desc, from);
+      OPENCL_CALL(clEnqueueCopyImage(this->GetQueue(to->device), from_desc->buffer, to_desc->buffer,
+                                     from_image_info.origin, to_image_info.origin,
+                                     to_image_info.region, 0, nullptr, nullptr));
+    }
   } else if (IsOpenCLDevice(from->device) && to->device.device_type == kDLCPU) {
     const auto* from_desc = static_cast<const cl::BufferDescriptor*>(from->data);
     switch (from_desc->layout) {
diff --git a/tests/python/relay/test_conv2d_nchw_texture.py b/tests/python/relay/test_conv2d_nchw_texture.py
index 58590998fdd2..6eadd8fc1c7a 100644
--- a/tests/python/relay/test_conv2d_nchw_texture.py
+++ b/tests/python/relay/test_conv2d_nchw_texture.py
@@ -590,8 +590,8 @@ def test_residual_block():
     }
 
     static_memory_scope = [
-        "",
         "global",
+        "global.texture",
         "global.texture-weight",
         "global.texture-weight",
         "global.texture",
@@ -830,8 +830,8 @@ def test_pooling_branching_texture_params():
     }
 
     static_memory_scope = [
-        "",
         "global",
+        "global.texture",
"global.texture-weight", "global.texture", "global.texture", @@ -957,8 +957,8 @@ def test_branching_texture_params(): } static_memory_scope = [ - "", "global", + "global.texture", "global.texture-weight", "global.texture", "global.texture-weight", @@ -1046,8 +1046,8 @@ def test_conv2d_different_lowering_same_op(): } static_memory_scope = [ - "", "global", + "global.texture", "global.texture-weight", "global.texture", "global.texture",