From 3731a8ca45e01865be78e4b98cc541af76d88d62 Mon Sep 17 00:00:00 2001
From: Luke Hutton <luke.hutton@arm.com>
Date: Thu, 4 Aug 2022 14:35:10 +0100
Subject: [PATCH] [ETHOSN] Get buffer sizes from the compiled network (#12160)

The NPU support library compiler sometimes adds padding to input
tensors, which means the buffer sizes calculated at runtime can be
smaller than the compiled network requires. Instead, buffer sizes are
now collected at compile time and passed to the runtime so that they
match the sizes expected by the compiled network. The problem was seen
when running a fully connected operation whose input size is not a
multiple of 1024, so testing has been added to cover this case.

Additionally, the fully connected test case now uses pytest
parameterization as part of a general cleanup, and an issue in the
test model with specifying a different output shape and weights with
more than one output channel has been fixed.

Change-Id: Iad319d75326b9ac41950de982603660a084dc27b
---
 src/relay/backend/contrib/ethosn/codegen.cc   | 17 ++++
 .../backend/contrib/ethosn/codegen_ethosn.h   | 13 +++
 src/runtime/contrib/ethosn/ethosn_device.cc   | 38 ++++----
 src/runtime/contrib/ethosn/ethosn_device.h    |  8 +-
 src/runtime/contrib/ethosn/ethosn_runtime.cc  | 14 ++-
 src/runtime/contrib/ethosn/ethosn_runtime.h   |  4 +
 .../test_ethosn/test_fullyconnected.py        | 95 ++++++++++---------
 .../contrib/test_ethosn/test_networks.py      | 20 ++--
 8 files changed, 131 insertions(+), 78 deletions(-)

diff --git a/src/relay/backend/contrib/ethosn/codegen.cc b/src/relay/backend/contrib/ethosn/codegen.cc
index b09ed6844987..9fb8fcd4cdeb 100644
--- a/src/relay/backend/contrib/ethosn/codegen.cc
+++ b/src/relay/backend/contrib/ethosn/codegen.cc
@@ -629,6 +629,7 @@ runtime::ethosn::OrderedCompiledNetwork EthosnCompiler::CompileEthosnFunc(const
   // Determine the order that the inputs/outputs are in and how that corresponds to the
   // order that the TVM runtime will expect them in
   auto input_output_order = GetInputOutputOrder(network_with_ids, compiled_network);
+  auto io_sizes = GetIOSizes(compiled_network);
   // Use the order information to create an 'ordered' network with includes how to map
   // the inputs/outputs from the TVM runtime to the inputs/outputs of the compiled network
   runtime::ethosn::OrderedCompiledNetwork ordered_network;
@@ -636,6 +637,8 @@ runtime::ethosn::OrderedCompiledNetwork EthosnCompiler::CompileEthosnFunc(const
   ordered_network.compiled_cmm = std::move(compiled_network);
   ordered_network.inputs = input_output_order.first;
   ordered_network.outputs = input_output_order.second;
+  ordered_network.input_sizes = io_sizes.first;
+  ordered_network.output_sizes = io_sizes.second;
   return ordered_network;
 }
 
@@ -684,6 +687,20 @@ std::pair<std::vector<uint32_t>, std::vector<uint32_t>> EthosnCompiler::GetInput
   return std::make_pair(input_order, output_order);
 }
 
+std::pair<std::vector<uint32_t>, std::vector<uint32_t>> EthosnCompiler::GetIOSizes(
+    const std::unique_ptr<sl::CompiledNetwork>& compiled_network) {
+  std::vector<uint32_t> input_sizes;
+  std::vector<uint32_t> output_sizes;
+  for (const sl::InputBufferInfo info : compiled_network->GetInputBufferInfos()) {
+    input_sizes.push_back(info.m_Size);
+  }
+  for (const sl::OutputBufferInfo info : compiled_network->GetOutputBufferInfos()) {
+    output_sizes.push_back(info.m_Size);
+  }
+
+  return std::make_pair(input_sizes, output_sizes);
+}
+
 std::unique_ptr<sl::SupportQueries> EthosnCompiler::m_Queries;
 
 EthosnError EthosnCompiler::SupportedSetup() {
diff --git a/src/relay/backend/contrib/ethosn/codegen_ethosn.h b/src/relay/backend/contrib/ethosn/codegen_ethosn.h
index d2208bd3133c..6d26cc7daacc 100644
--- a/src/relay/backend/contrib/ethosn/codegen_ethosn.h
+++ b/src/relay/backend/contrib/ethosn/codegen_ethosn.h
@@ -348,6 +348,19 @@ class EthosnCompiler {
   static std::pair<std::vector<uint32_t>, std::vector<uint32_t>> GetInputOutputOrder(
       NetworkWithIDs network, const std::unique_ptr<sl::CompiledNetwork>& compiled_network);
 
+  /*!
+   * \brief Determine the input and output sizes of a compiled network.
+   *
+   * These need to be queried from the compiled network as the compiler can choose
+   * to add additional padding on the input/output in certain cases.
+   *
+   * \param compiled_network The network compiled by the NPU compiler.
+   * \return Pair of vectors of buffer sizes for both the inputs and outputs of the
+   * network.
+   */
+  static std::pair<std::vector<uint32_t>, std::vector<uint32_t>> GetIOSizes(
+      const std::unique_ptr<sl::CompiledNetwork>& compiled_network);
+
   /*!
    * \brief Query interface used to determine if the Ethos-N hardware supports an operation
    * with the supplied parameters.
diff --git a/src/runtime/contrib/ethosn/ethosn_device.cc b/src/runtime/contrib/ethosn/ethosn_device.cc
index 7aa8dac57e02..4adecfd4df07 100644
--- a/src/runtime/contrib/ethosn/ethosn_device.cc
+++ b/src/runtime/contrib/ethosn/ethosn_device.cc
@@ -95,28 +95,28 @@ void CopyOutput(dl::Buffer* source_buffers[], std::vector<DLTensor*>* outputs) {
 }
 
 void CreateBuffers(std::vector<std::shared_ptr<dl::Buffer> >* fm,
-                   const std::vector<DLTensor*>& tensors, bool input) {
-  int index = 0;
-  for (auto buffer : tensors) {
-    auto* data = static_cast<uint8_t*>(buffer->data);
-    // The NPU only needs the size of the tensor * uint8_t.
-    auto data_size = static_cast<uint32_t>(GetDataSize(*buffer));
+                   const std::vector<DLTensor*>& tensors, const std::vector<uint32_t>& tensor_sizes,
+                   bool input) {
+  for (size_t i = 0; i < tensors.size(); i++) {
+    auto* data = static_cast<uint8_t*>(tensors[i]->data);
     if (input) {
-      (*fm)[index++] = std::make_shared<dl::Buffer>(data, data_size, dl::DataFormat::NHWC);
+      (*fm)[i] = std::make_shared<dl::Buffer>(data, tensor_sizes[i], dl::DataFormat::NHWC);
     } else {
-      (*fm)[index++] = std::make_shared<dl::Buffer>(data_size, dl::DataFormat::NHWC);
+      (*fm)[i] = std::make_shared<dl::Buffer>(tensor_sizes[i], dl::DataFormat::NHWC);
     }
   }
 }
 
 #if _ETHOSN_API_VERSION_ <= 2102
 bool Inference(tvm::runtime::TVMArgs args, sl::CompiledNetwork* network,
-               const std::vector<uint32_t>& input_order,
-               const std::vector<uint32_t>& output_order) {
+               const std::vector<uint32_t>& input_order, const std::vector<uint32_t>& output_order,
+               const std::vector<uint32_t>& input_sizes,
+               const std::vector<uint32_t>& output_sizes) {
 #else
 bool Inference(tvm::runtime::TVMArgs args, dl::Network* npu,
-               const std::vector<uint32_t>& input_order,
-               const std::vector<uint32_t>& output_order) {
+               const std::vector<uint32_t>& input_order, const std::vector<uint32_t>& output_order,
+               const std::vector<uint32_t>& input_sizes,
+               const std::vector<uint32_t>& output_sizes) {
 #endif
   // Unpack parameters
   uint8_t argc = 0;
@@ -133,11 +133,11 @@ bool Inference(tvm::runtime::TVMArgs args, dl::Network* npu,
 
   // Set up input buffers
   std::vector<std::shared_ptr<dl::Buffer> > ifm(inputs.size());
-  CreateBuffers(&ifm, inputs, true);
+  CreateBuffers(&ifm, inputs, input_sizes, true);
 
   // Set up output buffers
   std::vector<std::shared_ptr<dl::Buffer> > ofm(outputs.size());
-  CreateBuffers(&ofm, outputs, false);
+  CreateBuffers(&ofm, outputs, output_sizes, false);
 
   // Raw pointers for the inference
   dl::Buffer* ifm_raw[inputs.size()];
@@ -231,12 +231,14 @@ TVM_REGISTER_GLOBAL("relay.ethos-n.test.infra.inference_result")
 // Allow the ethos-n support code to be tested without a device
 #if _ETHOSN_API_VERSION_ <= 2102
 bool Inference(tvm::runtime::TVMArgs args, sl::CompiledNetwork* network,
-               const std::vector<uint32_t>& input_order,
-               const std::vector<uint32_t>& output_order) {
+               const std::vector<uint32_t>& input_order, const std::vector<uint32_t>& output_order,
+               const std::vector<uint32_t>& input_sizes,
+               const std::vector<uint32_t>& output_sizes) {
 #else
 bool Inference(tvm::runtime::TVMArgs args, dl::Network* /* npu */,
-               const std::vector<uint32_t>& input_order,
-               const std::vector<uint32_t>& output_order) {
+               const std::vector<uint32_t>& input_order, const std::vector<uint32_t>& output_order,
+               const std::vector<uint32_t>& input_sizes,
+               const std::vector<uint32_t>& output_sizes) {
 #endif
   std::vector<DLTensor*> outputs;
   for (int argc = input_order.size(); argc < args.size(); argc++) {
diff --git a/src/runtime/contrib/ethosn/ethosn_device.h b/src/runtime/contrib/ethosn/ethosn_device.h
index d69be62aa603..2d1e536ef8e7 100644
--- a/src/runtime/contrib/ethosn/ethosn_device.h
+++ b/src/runtime/contrib/ethosn/ethosn_device.h
@@ -41,10 +41,12 @@ using tvm::runtime::TVMArgs;
 
 #if _ETHOSN_API_VERSION_ <= 2102
 bool Inference(TVMArgs args, sl::CompiledNetwork* npu, const std::vector<uint32_t>& input_order,
-               const std::vector<uint32_t>& output_order);
+               const std::vector<uint32_t>& output_order, const std::vector<uint32_t>& input_sizes,
+               const std::vector<uint32_t>& output_sizes);
 #else
-bool Inference(TVMArgs args, dl::Network* npu, const std::vector<uint32_t>& input_order,
-               const std::vector<uint32_t>& output_order);
+bool Inference(tvm::runtime::TVMArgs args, dl::Network* npu,
+               const std::vector<uint32_t>& input_order, const std::vector<uint32_t>& output_order,
+               const std::vector<uint32_t>& input_sizes, const std::vector<uint32_t>& output_sizes);
 #endif
 
 }  // namespace ethosn
diff --git a/src/runtime/contrib/ethosn/ethosn_runtime.cc b/src/runtime/contrib/ethosn/ethosn_runtime.cc
index 962d4db47eb9..295ff537b379 100644
--- a/src/runtime/contrib/ethosn/ethosn_runtime.cc
+++ b/src/runtime/contrib/ethosn/ethosn_runtime.cc
@@ -60,6 +60,8 @@ EthosnModule::EthosnModule(std::vector<OrderedCompiledNetwork>* cmms) {
 #endif
     network_map_[it.name].inputs = it.inputs;
     network_map_[it.name].outputs = it.outputs;
+    network_map_[it.name].input_sizes = it.input_sizes;
+    network_map_[it.name].output_sizes = it.output_sizes;
   }
 }
 
@@ -69,10 +71,12 @@ PackedFunc EthosnModule::GetFunction(const std::string& name,
     return PackedFunc([sptr_to_self, this, name](TVMArgs args, TVMRetValue* rv) {
 #if _ETHOSN_API_VERSION_ <= 2102
       *rv = Inference(args, network_map_[name].compiled_cmm.get(), network_map_[name].inputs,
-                      network_map_[name].outputs);
+                      network_map_[name].outputs, network_map_[name].input_sizes,
+                      network_map_[name].output_sizes);
 #else
       *rv = Inference(args, network_map_[name].runtime_cmm.get(), network_map_[name].inputs,
-                      network_map_[name].outputs);
+                      network_map_[name].outputs, network_map_[name].input_sizes,
+                      network_map_[name].output_sizes);
 #endif
     });
   } else {
@@ -90,8 +94,10 @@ void EthosnModule::SaveToBinary(dmlc::Stream* stream) {
     stream->Write(ss.str());
     stream->Write(it.second.inputs.size());
     stream->Write(&it.second.inputs[0], sizeof(uint32_t) * it.second.inputs.size());
+    stream->Write(&it.second.input_sizes[0], sizeof(uint32_t) * it.second.input_sizes.size());
     stream->Write(it.second.outputs.size());
     stream->Write(&it.second.outputs[0], sizeof(uint32_t) * it.second.outputs.size());
+    stream->Write(&it.second.output_sizes[0], sizeof(uint32_t) * it.second.output_sizes.size());
   }
 }
 
@@ -128,12 +134,16 @@ Module EthosnModule::LoadFromBinary(void* strm) {
     compiled.inputs.resize(size);
     // Read the order of inputs
     stream->Read(&compiled.inputs[0], sizeof(uint32_t) * size);
+    compiled.input_sizes.resize(size);
+    stream->Read(&compiled.input_sizes[0], sizeof(uint32_t) * size);
     // Read the number of outputs
     stream->Read<uint64_t>(&output_size);
     size = static_cast<size_t>(output_size);
     compiled.outputs.resize(size);
     // Read the order of outputs
     stream->Read(&compiled.outputs[0], sizeof(uint32_t) * size);
+    compiled.output_sizes.resize(size);
+    stream->Read(&compiled.output_sizes[0], sizeof(uint32_t) * size);
   }
   auto n = make_object<EthosnModule>(&cmms);
   return Module(n);
diff --git a/src/runtime/contrib/ethosn/ethosn_runtime.h b/src/runtime/contrib/ethosn/ethosn_runtime.h
index ed5d04143e8e..b60250754b31 100644
--- a/src/runtime/contrib/ethosn/ethosn_runtime.h
+++ b/src/runtime/contrib/ethosn/ethosn_runtime.h
@@ -52,6 +52,8 @@ struct OrderedCompiledNetwork {
   std::string name;
   std::vector<uint32_t> inputs;
   std::vector<uint32_t> outputs;
+  std::vector<uint32_t> input_sizes;
+  std::vector<uint32_t> output_sizes;
 };
 
 class EthosnModule : public ModuleNode {
@@ -88,8 +90,10 @@ class EthosnModule : public ModuleNode {
    *         std::string : serialized command stream
    *         size_t      : number of inputs
    *         std::vector : order of inputs
+   *         std::vector : buffer sizes for inputs
    *         size_t      : number of outputs
    *         std::vector : order of outputs
+   *         std::vector : buffer sizes for outputs
    *       ] * number of functions
    */
   static Module LoadFromBinary(void* strm);
diff --git a/tests/python/contrib/test_ethosn/test_fullyconnected.py b/tests/python/contrib/test_ethosn/test_fullyconnected.py
index 4171c672721f..95f68622d912 100644
--- a/tests/python/contrib/test_ethosn/test_fullyconnected.py
+++ b/tests/python/contrib/test_ethosn/test_fullyconnected.py
@@ -42,9 +42,9 @@ def _get_model(
         units=weight_shape[0],
         out_dtype="int32",
     )
-    b = tvm.nd.array(np.random.randint(0, high=255, size=(shape[0],), dtype="int32"))
+    b = tvm.nd.array(np.random.randint(0, high=255, size=(weight_shape[0],), dtype="int32"))
     biasc = relay.const(b, "int32")
-    bias = relay.nn.bias_add(fc, biasc, axis=0)
+    bias = relay.nn.bias_add(fc, biasc)
     req = relay.qnn.op.requantize(
         bias,
         relay.const(input_sc * kernel_sc, "float32"),  # input zero scale
@@ -58,55 +58,60 @@ def _get_model(
 
 
 @requires_ethosn
-@pytest.mark.parametrize("dtype", ["uint8"])
-def test_fullyconnected(dtype):
-    zp_min = np.iinfo(dtype).min
-    zp_max = np.iinfo(dtype).max
-    trials = [
-        ((1, 1024), zp_min + 71, 0.580, zp_max - 176, 1.498),
-        ((1, 4096), zp_min + 166, 1.724, zp_max - 138, 0.180),
-        ((1, 16384), zp_min + 101, 1.372, zp_max - 234, 1.346),
-    ]
+@pytest.mark.parametrize(
+    "shape,out_channels",
+    [
+        ((1, 1024), 64),
+        ((1, 16384), 1),
+        ((1, 1280), 1000),
+    ],
+)
+@pytest.mark.parametrize(
+    "dtype,input_zp,input_sc,kernel_zp,kernel_sc",
+    [
+        ("uint8", 71, 0.580, 176, 1.498),
+        ("uint8", 166, 1.724, 138, 0.180),
+        ("int8", 71, 0.580, 0, 1.498),
+        ("int8", 120, 1.724, 0, 0.180),
+    ],
+)
+def test_fullyconnected(shape, out_channels, dtype, input_zp, input_sc, kernel_zp, kernel_sc):
+    """
+    Test fully connected offloading.
+    """
     np.random.seed(0)
-    for shape, input_zp, input_sc, kernel_zp, kernel_sc in trials:
-        kernel_zp = (
-            0
-            if dtype == "int8"
-            else np.random.randint(np.iinfo(dtype).min, np.iinfo(dtype).max) + 1
-        )
-        inputs = {
-            "a": tvm.nd.array(
-                np.random.randint(
-                    np.iinfo(dtype).min, np.iinfo(dtype).max + 1, size=shape, dtype=dtype
-                )
-            ),
-        }
-        outputs = []
-        output_zp, output_sc = tei.get_conv2d_qnn_params(
-            dtype,
+    inputs = {
+        "a": tvm.nd.array(
+            np.random.randint(np.iinfo(dtype).min, np.iinfo(dtype).max + 1, size=shape, dtype=dtype)
+        ),
+    }
+
+    outputs = []
+    output_zp, output_sc = tei.get_conv2d_qnn_params(
+        dtype,
+        input_zp,
+        input_sc,
+        kernel_zp,
+        kernel_sc,
+        shape[0],
+        shape[1],
+        1,
+    )
+    for npu in [False, True]:
+        model, params = _get_model(
+            shape,
+            (out_channels, shape[1]),
             input_zp,
             input_sc,
             kernel_zp,
             kernel_sc,
-            shape[0],
-            shape[1],
-            1,
+            output_zp,
+            output_sc,
+            dtype,
         )
-        for npu in [False, True]:
-            model, params = _get_model(
-                shape,
-                shape,
-                input_zp,
-                input_sc,  # input zp, sc
-                kernel_zp,
-                kernel_sc,  # kernel
-                output_zp,
-                output_sc,  # output
-                dtype,
-            )
-            mod = tei.make_module(model, params)
-            outputs.append(tei.build_and_run(mod, inputs, 1, params, npu=npu))
-        tei.verify(outputs, dtype, 1)
+        mod = tei.make_module(model, params)
+        outputs.append(tei.build_and_run(mod, inputs, 1, params, npu=npu))
+    tei.verify(outputs, dtype, 1)
 
 
 @requires_ethosn
diff --git a/tests/python/contrib/test_ethosn/test_networks.py b/tests/python/contrib/test_ethosn/test_networks.py
index 066b2e4a9f22..c23bd01e0960 100644
--- a/tests/python/contrib/test_ethosn/test_networks.py
+++ b/tests/python/contrib/test_ethosn/test_networks.py
@@ -124,9 +124,9 @@ def test_mobilenet_v1():
     # version or a change in the Ethos-N codegen. To update this requires running
     # on hardware that isn't available in CI.
     if tei.get_ethosn_api_version() == 2205:
-        _compile_hash = {"cb12b5469d78af81f4704488e3857755"}
+        _compile_hash = {"50186822915909303e813205db80e032"}
     elif tei.get_ethosn_api_version() == 2111:
-        _compile_hash = {"5d1c6a6bd4df8963866cc90405bf92dd"}
+        _compile_hash = {"c523c3c2bb9add1fee508217eb73af1a"}
     elif tei.get_ethosn_api_version() == 2102:
         _compile_hash = {"46ccafc840633633aca441645e41b444"}
         if tei.get_ethosn_variant() == "Ethos-N78_1TOPS_2PLE_RATIO":
@@ -154,9 +154,9 @@ def test_resnet_50_int8():
     # version or a change in the Ethos-N codegen. To update this requires running
     # on hardware that isn't available in CI.
     if tei.get_ethosn_api_version() == 2205:
-        _compile_hash = {"c0a01c547ed1b2e3308094508fa1bfea", "64905a4ff2dbde08078ccc9f44ad711d"}
+        _compile_hash = {"60404ad60fc2bfbb68464d8a14cc0452", "4225fa951c145bb1e48e28cad6a3bdd4"}
     else:
-        _compile_hash = {"c0a01c547ed1b2e3308094508fa1bfea", "434f0c65c41e24d5482142c88b3438fe"}
+        _compile_hash = {"60404ad60fc2bfbb68464d8a14cc0452", "5b9d72b9accfea7ed89eb09ca0aa5487"}
     _test_image_network(
         model_url="https://raw.githubusercontent.com/dmlc/web-data/main/tensorflow/"
         "models/Quantized/resnet_50_quantized.tflite",
@@ -177,9 +177,9 @@ def test_inception_v3():
     # version or a change in the Ethos-N codegen. To update this requires running
     # on hardware that isn't available in CI.
     if tei.get_ethosn_api_version() == 2205:
-        _compile_hash = {"85ef702ad3628c598db8c72060c70a61"}
+        _compile_hash = {"a5a2b5d2b618de754bf9a01033a020c0"}
     elif tei.get_ethosn_api_version() == 2111:
-        _compile_hash = {"e6abe33a7bc4a4170da53eefa6577bba"}
+        _compile_hash = {"88db2c7928240be9833c1b5ef367de28"}
     elif tei.get_ethosn_api_version() == 2102:
         _compile_hash = {"43dc2097127eb224c0191b1a15f8acca"}
         if tei.get_ethosn_variant() == "Ethos-N78_1TOPS_2PLE_RATIO":
@@ -206,9 +206,9 @@ def test_inception_v4():
     # version or a change in the Ethos-N codegen. To update this requires running
     # on hardware that isn't available in CI.
     if tei.get_ethosn_api_version() == 2205:
-        _compile_hash = {"91a980eaf53881f4f109a1a7578e422b"}
+        _compile_hash = {"61b4ade41898d7cb2451dbdc3340aced"}
     elif tei.get_ethosn_api_version() == 2111:
-        _compile_hash = {"42e43c323ed8202f7b720ba9029bbcb7"}
+        _compile_hash = {"37648682f97cbbcecdc13945b7f2212f"}
     elif tei.get_ethosn_api_version() == 2102:
         _compile_hash = {"fab6c2297502f95d33079c6ce1a737f9"}
         if tei.get_ethosn_variant() == "Ethos-N78_1TOPS_2PLE_RATIO":
@@ -235,9 +235,9 @@ def test_ssd_mobilenet_v1():
     # version or a change in the Ethos-N codegen. To update this requires running
     # on hardware that isn't available in CI.
     if tei.get_ethosn_api_version() == 2205:
-        _compile_hash = {"d804ce3496a776c48f719b4062d5e5c3", "afb68ca8f452d1f4a674b457b5e30f59"}
+        _compile_hash = {"789906c7d8ac787809b303d82781fc9d", "6b699f94795785d31b39940a5cf84a81"}
     elif tei.get_ethosn_api_version() == 2111:
-        _compile_hash = {"a37f900601b9493bd142e8aed16205e5", "afb68ca8f452d1f4a674b457b5e30f59"}
+        _compile_hash = {"7b8b0a3ad7cfe1695dee187f21f03785", "6b699f94795785d31b39940a5cf84a81"}
     elif tei.get_ethosn_api_version() == 2102:
         _compile_hash = {"7795b6c67178da9d1f9b98063bad75b1", "10826406ae724e52f360a06c35ced09d"}
         if tei.get_ethosn_variant() == "Ethos-N78_1TOPS_2PLE_RATIO":