From 3731a8ca45e01865be78e4b98cc541af76d88d62 Mon Sep 17 00:00:00 2001 From: Luke Hutton Date: Thu, 4 Aug 2022 14:35:10 +0100 Subject: [PATCH] [ETHOSN] Get buffer sizes from the compiled network (#12160) The NPU support library compiler sometimes adds padding to input tensors which means the buffer sizes calculated at runtime can sometimes be smaller than necessary. Instead, buffer sizes are now collected at compile time and passed to the runtime so that they match the sizes expected by the compiled network. This was seen when running a fully connected operation with an input that is not a multiple of 1024, so testing has been added to cover this case. Additionally changed the fully connected test case to use pytest parameterization as part of a general cleanup, and fixed an issue with specifying a different output shape and weights with more than 1 output channel. Change-Id: Iad319d75326b9ac41950de982603660a084dc27b --- src/relay/backend/contrib/ethosn/codegen.cc | 17 ++++ .../backend/contrib/ethosn/codegen_ethosn.h | 13 +++ src/runtime/contrib/ethosn/ethosn_device.cc | 38 ++++---- src/runtime/contrib/ethosn/ethosn_device.h | 8 +- src/runtime/contrib/ethosn/ethosn_runtime.cc | 14 ++- src/runtime/contrib/ethosn/ethosn_runtime.h | 4 + .../test_ethosn/test_fullyconnected.py | 95 ++++++++++--------- .../contrib/test_ethosn/test_networks.py | 20 ++-- 8 files changed, 131 insertions(+), 78 deletions(-) diff --git a/src/relay/backend/contrib/ethosn/codegen.cc b/src/relay/backend/contrib/ethosn/codegen.cc index b09ed6844987..9fb8fcd4cdeb 100644 --- a/src/relay/backend/contrib/ethosn/codegen.cc +++ b/src/relay/backend/contrib/ethosn/codegen.cc @@ -629,6 +629,7 @@ runtime::ethosn::OrderedCompiledNetwork EthosnCompiler::CompileEthosnFunc(const // Determine the order that the inputs/outputs are in and how that corresponds to the // order that the TVM runtime will expect them in auto input_output_order = GetInputOutputOrder(network_with_ids, compiled_network); + auto io_sizes = GetIOSizes(compiled_network); // Use the order information to create an 'ordered' network with includes how to map // the inputs/outputs from the TVM runtime to the inputs/outputs of the compiled network runtime::ethosn::OrderedCompiledNetwork ordered_network; @@ -636,6 +637,8 @@ runtime::ethosn::OrderedCompiledNetwork EthosnCompiler::CompileEthosnFunc(const ordered_network.compiled_cmm = std::move(compiled_network); ordered_network.inputs = input_output_order.first; ordered_network.outputs = input_output_order.second; + ordered_network.input_sizes = io_sizes.first; + ordered_network.output_sizes = io_sizes.second; return ordered_network; } @@ -684,6 +687,20 @@ std::pair, std::vector> EthosnCompiler::GetInput return std::make_pair(input_order, output_order); } +std::pair, std::vector> EthosnCompiler::GetIOSizes( + const std::unique_ptr& compiled_network) { + std::vector input_sizes; + std::vector output_sizes; + for (const sl::InputBufferInfo info : compiled_network->GetInputBufferInfos()) { + input_sizes.push_back(info.m_Size); + } + for (const sl::OutputBufferInfo info : compiled_network->GetOutputBufferInfos()) { + output_sizes.push_back(info.m_Size); + } + + return std::make_pair(input_sizes, output_sizes); +} + std::unique_ptr EthosnCompiler::m_Queries; EthosnError EthosnCompiler::SupportedSetup() { diff --git a/src/relay/backend/contrib/ethosn/codegen_ethosn.h b/src/relay/backend/contrib/ethosn/codegen_ethosn.h index d2208bd3133c..6d26cc7daacc 100644 --- a/src/relay/backend/contrib/ethosn/codegen_ethosn.h +++ b/src/relay/backend/contrib/ethosn/codegen_ethosn.h @@ -348,6 +348,19 @@ class EthosnCompiler { static std::pair, std::vector> GetInputOutputOrder( NetworkWithIDs network, const std::unique_ptr& compiled_network); + /*! + * \brief Determine the input and output sizes of a compiled network. + * + * These need to be queried from the compiled network as the compiler can choose + * to add additional padding on the input/output in certain cases. + * + * \param compiled_network The network compiled by the NPU compiler. + * \return Pair of vectors of buffer sizes for both the inputs and outputs of the + * network. + */ + static std::pair, std::vector> GetIOSizes( + const std::unique_ptr& compiled_network); + /*! * \brief Query interface used to determine if the Ethos-N hardware supports an operation * with the supplied parameters. diff --git a/src/runtime/contrib/ethosn/ethosn_device.cc b/src/runtime/contrib/ethosn/ethosn_device.cc index 7aa8dac57e02..4adecfd4df07 100644 --- a/src/runtime/contrib/ethosn/ethosn_device.cc +++ b/src/runtime/contrib/ethosn/ethosn_device.cc @@ -95,28 +95,28 @@ void CopyOutput(dl::Buffer* source_buffers[], std::vector* outputs) { } void CreateBuffers(std::vector >* fm, - const std::vector& tensors, bool input) { - int index = 0; - for (auto buffer : tensors) { - auto* data = static_cast(buffer->data); - // The NPU only needs the size of the tensor * uint8_t. - auto data_size = static_cast(GetDataSize(*buffer)); + const std::vector& tensors, const std::vector& tensor_sizes, + bool input) { + for (size_t i = 0; i < tensors.size(); i++) { + auto* data = static_cast(tensors[i]->data); if (input) { - (*fm)[index++] = std::make_shared(data, data_size, dl::DataFormat::NHWC); + (*fm)[i] = std::make_shared(data, tensor_sizes[i], dl::DataFormat::NHWC); } else { - (*fm)[index++] = std::make_shared(data_size, dl::DataFormat::NHWC); + (*fm)[i] = std::make_shared(tensor_sizes[i], dl::DataFormat::NHWC); } } } #if _ETHOSN_API_VERSION_ <= 2102 bool Inference(tvm::runtime::TVMArgs args, sl::CompiledNetwork* network, - const std::vector& input_order, - const std::vector& output_order) { + const std::vector& input_order, const std::vector& output_order, + const std::vector& input_sizes, + const std::vector& output_sizes) { #else bool Inference(tvm::runtime::TVMArgs args, dl::Network* npu, - const std::vector& input_order, - const std::vector& output_order) { + const std::vector& input_order, const std::vector& output_order, + const std::vector& input_sizes, + const std::vector& output_sizes) { #endif // Unpack parameters uint8_t argc = 0; @@ -133,11 +133,11 @@ bool Inference(tvm::runtime::TVMArgs args, dl::Network* npu, // Set up input buffers std::vector > ifm(inputs.size()); - CreateBuffers(&ifm, inputs, true); + CreateBuffers(&ifm, inputs, input_sizes, true); // Set up output buffers std::vector > ofm(outputs.size()); - CreateBuffers(&ofm, outputs, false); + CreateBuffers(&ofm, outputs, output_sizes, false); // Raw pointers for the inference dl::Buffer* ifm_raw[inputs.size()]; @@ -231,12 +231,14 @@ TVM_REGISTER_GLOBAL("relay.ethos-n.test.infra.inference_result") // Allow the ethos-n support code to be tested without a device #if _ETHOSN_API_VERSION_ <= 2102 bool Inference(tvm::runtime::TVMArgs args, sl::CompiledNetwork* network, - const std::vector& input_order, - const std::vector& output_order) { + const std::vector& input_order, const std::vector& output_order, + const std::vector& input_sizes, + const std::vector& output_sizes) { #else bool Inference(tvm::runtime::TVMArgs args, dl::Network* /* npu */, - const std::vector& input_order, - const std::vector& output_order) { + const std::vector& input_order, const std::vector& output_order, + const std::vector& input_sizes, + const std::vector& output_sizes) { #endif std::vector outputs; for (int argc = input_order.size(); argc < args.size(); argc++) { diff --git a/src/runtime/contrib/ethosn/ethosn_device.h b/src/runtime/contrib/ethosn/ethosn_device.h index d69be62aa603..2d1e536ef8e7 100644 --- a/src/runtime/contrib/ethosn/ethosn_device.h +++ b/src/runtime/contrib/ethosn/ethosn_device.h @@ -41,10 +41,12 @@ using tvm::runtime::TVMArgs; #if _ETHOSN_API_VERSION_ <= 2102 bool Inference(TVMArgs args, sl::CompiledNetwork* npu, const std::vector& input_order, - const std::vector& output_order); + const std::vector& output_order, const std::vector& input_sizes, + const std::vector& output_sizes); #else -bool Inference(TVMArgs args, dl::Network* npu, const std::vector& input_order, - const std::vector& output_order); +bool Inference(tvm::runtime::TVMArgs args, dl::Network* npu, + const std::vector& input_order, const std::vector& output_order, + const std::vector& input_sizes, const std::vector& output_sizes); #endif } // namespace ethosn diff --git a/src/runtime/contrib/ethosn/ethosn_runtime.cc b/src/runtime/contrib/ethosn/ethosn_runtime.cc index 962d4db47eb9..295ff537b379 100644 --- a/src/runtime/contrib/ethosn/ethosn_runtime.cc +++ b/src/runtime/contrib/ethosn/ethosn_runtime.cc @@ -60,6 +60,8 @@ EthosnModule::EthosnModule(std::vector* cmms) { #endif network_map_[it.name].inputs = it.inputs; network_map_[it.name].outputs = it.outputs; + network_map_[it.name].input_sizes = it.input_sizes; + network_map_[it.name].output_sizes = it.output_sizes; } } @@ -69,10 +71,12 @@ PackedFunc EthosnModule::GetFunction(const std::string& name, return PackedFunc([sptr_to_self, this, name](TVMArgs args, TVMRetValue* rv) { #if _ETHOSN_API_VERSION_ <= 2102 *rv = Inference(args, network_map_[name].compiled_cmm.get(), network_map_[name].inputs, - network_map_[name].outputs); + network_map_[name].outputs, network_map_[name].input_sizes, + network_map_[name].output_sizes); #else *rv = Inference(args, network_map_[name].runtime_cmm.get(), network_map_[name].inputs, - network_map_[name].outputs); + network_map_[name].outputs, network_map_[name].input_sizes, + network_map_[name].output_sizes); #endif }); } else { @@ -90,8 +94,10 @@ void EthosnModule::SaveToBinary(dmlc::Stream* stream) { stream->Write(ss.str()); stream->Write(it.second.inputs.size()); stream->Write(&it.second.inputs[0], sizeof(uint32_t) * it.second.inputs.size()); + stream->Write(&it.second.input_sizes[0], sizeof(uint32_t) * it.second.input_sizes.size()); stream->Write(it.second.outputs.size()); stream->Write(&it.second.outputs[0], sizeof(uint32_t) * it.second.outputs.size()); + stream->Write(&it.second.output_sizes[0], sizeof(uint32_t) * it.second.output_sizes.size()); } } @@ -128,12 +134,16 @@ Module EthosnModule::LoadFromBinary(void* strm) { compiled.inputs.resize(size); // Read the order of inputs stream->Read(&compiled.inputs[0], sizeof(uint32_t) * size); + compiled.input_sizes.resize(size); + stream->Read(&compiled.input_sizes[0], sizeof(uint32_t) * size); // Read the number of outputs stream->Read(&output_size); size = static_cast(output_size); compiled.outputs.resize(size); // Read the order of outputs stream->Read(&compiled.outputs[0], sizeof(uint32_t) * size); + compiled.output_sizes.resize(size); + stream->Read(&compiled.output_sizes[0], sizeof(uint32_t) * size); } auto n = make_object(&cmms); return Module(n); diff --git a/src/runtime/contrib/ethosn/ethosn_runtime.h b/src/runtime/contrib/ethosn/ethosn_runtime.h index ed5d04143e8e..b60250754b31 100644 --- a/src/runtime/contrib/ethosn/ethosn_runtime.h +++ b/src/runtime/contrib/ethosn/ethosn_runtime.h @@ -52,6 +52,8 @@ struct OrderedCompiledNetwork { std::string name; std::vector inputs; std::vector outputs; + std::vector input_sizes; + std::vector output_sizes; }; class EthosnModule : public ModuleNode { @@ -88,8 +90,10 @@ class EthosnModule : public ModuleNode { * std::string : serialized command stream * size_t : number of inputs * std::vector : order of inputs + * std::vector : buffer sizes for inputs * size_t : number of outputs * std::vector : order of outputs + * std::vector : buffer sizes for outputs * ] * number of functions */ static Module LoadFromBinary(void* strm); diff --git a/tests/python/contrib/test_ethosn/test_fullyconnected.py b/tests/python/contrib/test_ethosn/test_fullyconnected.py index 4171c672721f..95f68622d912 100644 --- a/tests/python/contrib/test_ethosn/test_fullyconnected.py +++ b/tests/python/contrib/test_ethosn/test_fullyconnected.py @@ -42,9 +42,9 @@ def _get_model( units=weight_shape[0], out_dtype="int32", ) - b = tvm.nd.array(np.random.randint(0, high=255, size=(shape[0],), dtype="int32")) + b = tvm.nd.array(np.random.randint(0, high=255, size=(weight_shape[0],), dtype="int32")) biasc = relay.const(b, "int32") - bias = relay.nn.bias_add(fc, biasc, axis=0) + bias = relay.nn.bias_add(fc, biasc) req = relay.qnn.op.requantize( bias, relay.const(input_sc * kernel_sc, "float32"), # input zero scale @@ -58,55 +58,60 @@ def _get_model( @requires_ethosn -@pytest.mark.parametrize("dtype", ["uint8"]) -def test_fullyconnected(dtype): - zp_min = np.iinfo(dtype).min - zp_max = np.iinfo(dtype).max - trials = [ - ((1, 1024), zp_min + 71, 0.580, zp_max - 176, 1.498), - ((1, 4096), zp_min + 166, 1.724, zp_max - 138, 0.180), - ((1, 16384), zp_min + 101, 1.372, zp_max - 234, 1.346), - ] +@pytest.mark.parametrize( + "shape,out_channels", + [ + ((1, 1024), 64), + ((1, 16384), 1), + ((1, 1280), 1000), + ], +) +@pytest.mark.parametrize( + "dtype,input_zp,input_sc,kernel_zp,kernel_sc", + [ + ("uint8", 71, 0.580, 176, 1.498), + ("uint8", 166, 1.724, 138, 0.180), + ("int8", 71, 0.580, 0, 1.498), + ("int8", 120, 1.724, 0, 0.180), + ], +) +def test_fullyconnected(shape, out_channels, dtype, input_zp, input_sc, kernel_zp, kernel_sc): + """ + Test fully connected offloading. + """ np.random.seed(0) - for shape, input_zp, input_sc, kernel_zp, kernel_sc in trials: - kernel_zp = ( - 0 - if dtype == "int8" - else np.random.randint(np.iinfo(dtype).min, np.iinfo(dtype).max) + 1 - ) - inputs = { - "a": tvm.nd.array( - np.random.randint( - np.iinfo(dtype).min, np.iinfo(dtype).max + 1, size=shape, dtype=dtype - ) - ), - } - outputs = [] - output_zp, output_sc = tei.get_conv2d_qnn_params( - dtype, + inputs = { + "a": tvm.nd.array( + np.random.randint(np.iinfo(dtype).min, np.iinfo(dtype).max + 1, size=shape, dtype=dtype) + ), + } + + outputs = [] + output_zp, output_sc = tei.get_conv2d_qnn_params( + dtype, + input_zp, + input_sc, + kernel_zp, + kernel_sc, + shape[0], + shape[1], + 1, + ) + for npu in [False, True]: + model, params = _get_model( + shape, + (out_channels, shape[1]), input_zp, input_sc, kernel_zp, kernel_sc, - shape[0], - shape[1], - 1, + output_zp, + output_sc, + dtype, ) - for npu in [False, True]: - model, params = _get_model( - shape, - shape, - input_zp, - input_sc, # input zp, sc - kernel_zp, - kernel_sc, # kernel - output_zp, - output_sc, # output - dtype, - ) - mod = tei.make_module(model, params) - outputs.append(tei.build_and_run(mod, inputs, 1, params, npu=npu)) - tei.verify(outputs, dtype, 1) + mod = tei.make_module(model, params) + outputs.append(tei.build_and_run(mod, inputs, 1, params, npu=npu)) + tei.verify(outputs, dtype, 1) @requires_ethosn diff --git a/tests/python/contrib/test_ethosn/test_networks.py b/tests/python/contrib/test_ethosn/test_networks.py index 066b2e4a9f22..c23bd01e0960 100644 --- a/tests/python/contrib/test_ethosn/test_networks.py +++ b/tests/python/contrib/test_ethosn/test_networks.py @@ -124,9 +124,9 @@ def test_mobilenet_v1(): # version or a change in the Ethos-N codegen. To update this requires running # on hardware that isn't available in CI. if tei.get_ethosn_api_version() == 2205: - _compile_hash = {"cb12b5469d78af81f4704488e3857755"} + _compile_hash = {"50186822915909303e813205db80e032"} elif tei.get_ethosn_api_version() == 2111: - _compile_hash = {"5d1c6a6bd4df8963866cc90405bf92dd"} + _compile_hash = {"c523c3c2bb9add1fee508217eb73af1a"} elif tei.get_ethosn_api_version() == 2102: _compile_hash = {"46ccafc840633633aca441645e41b444"} if tei.get_ethosn_variant() == "Ethos-N78_1TOPS_2PLE_RATIO": @@ -154,9 +154,9 @@ def test_resnet_50_int8(): # version or a change in the Ethos-N codegen. To update this requires running # on hardware that isn't available in CI. if tei.get_ethosn_api_version() == 2205: - _compile_hash = {"c0a01c547ed1b2e3308094508fa1bfea", "64905a4ff2dbde08078ccc9f44ad711d"} + _compile_hash = {"60404ad60fc2bfbb68464d8a14cc0452", "4225fa951c145bb1e48e28cad6a3bdd4"} else: - _compile_hash = {"c0a01c547ed1b2e3308094508fa1bfea", "434f0c65c41e24d5482142c88b3438fe"} + _compile_hash = {"60404ad60fc2bfbb68464d8a14cc0452", "5b9d72b9accfea7ed89eb09ca0aa5487"} _test_image_network( model_url="https://raw.githubusercontent.com/dmlc/web-data/main/tensorflow/" "models/Quantized/resnet_50_quantized.tflite", @@ -177,9 +177,9 @@ def test_inception_v3(): # version or a change in the Ethos-N codegen. To update this requires running # on hardware that isn't available in CI. if tei.get_ethosn_api_version() == 2205: - _compile_hash = {"85ef702ad3628c598db8c72060c70a61"} + _compile_hash = {"a5a2b5d2b618de754bf9a01033a020c0"} elif tei.get_ethosn_api_version() == 2111: - _compile_hash = {"e6abe33a7bc4a4170da53eefa6577bba"} + _compile_hash = {"88db2c7928240be9833c1b5ef367de28"} elif tei.get_ethosn_api_version() == 2102: _compile_hash = {"43dc2097127eb224c0191b1a15f8acca"} if tei.get_ethosn_variant() == "Ethos-N78_1TOPS_2PLE_RATIO": @@ -206,9 +206,9 @@ def test_inception_v4(): # version or a change in the Ethos-N codegen. To update this requires running # on hardware that isn't available in CI. if tei.get_ethosn_api_version() == 2205: - _compile_hash = {"91a980eaf53881f4f109a1a7578e422b"} + _compile_hash = {"61b4ade41898d7cb2451dbdc3340aced"} elif tei.get_ethosn_api_version() == 2111: - _compile_hash = {"42e43c323ed8202f7b720ba9029bbcb7"} + _compile_hash = {"37648682f97cbbcecdc13945b7f2212f"} elif tei.get_ethosn_api_version() == 2102: _compile_hash = {"fab6c2297502f95d33079c6ce1a737f9"} if tei.get_ethosn_variant() == "Ethos-N78_1TOPS_2PLE_RATIO": @@ -235,9 +235,9 @@ def test_ssd_mobilenet_v1(): # version or a change in the Ethos-N codegen. To update this requires running # on hardware that isn't available in CI. if tei.get_ethosn_api_version() == 2205: - _compile_hash = {"d804ce3496a776c48f719b4062d5e5c3", "afb68ca8f452d1f4a674b457b5e30f59"} + _compile_hash = {"789906c7d8ac787809b303d82781fc9d", "6b699f94795785d31b39940a5cf84a81"} elif tei.get_ethosn_api_version() == 2111: - _compile_hash = {"a37f900601b9493bd142e8aed16205e5", "afb68ca8f452d1f4a674b457b5e30f59"} + _compile_hash = {"7b8b0a3ad7cfe1695dee187f21f03785", "6b699f94795785d31b39940a5cf84a81"} elif tei.get_ethosn_api_version() == 2102: _compile_hash = {"7795b6c67178da9d1f9b98063bad75b1", "10826406ae724e52f360a06c35ced09d"} if tei.get_ethosn_variant() == "Ethos-N78_1TOPS_2PLE_RATIO":