[ETHOSN] Get buffer sizes from the compiled network (#12160)
The NPU support library compiler sometimes adds padding to input
tensors, which means that buffer sizes calculated at runtime from the
tensor shape can be smaller than the sizes the compiled network
expects. Buffer sizes are now collected at compile time instead and
passed to the runtime so that they match the compiled network. This
was seen when running a fully connected operation with an input whose
size is not a multiple of 1024, so a test has been added to cover
this case.
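
As a rough illustration of the mismatch (a hedged sketch only: the
variable names and the round-up-to-1024 rule are assumptions based on
the behaviour described above, not the support library's actual API):

    #include <cstdint>
    #include <iostream>

    int main() {
      // A fully connected input of shape (1, 1000) holding uint8 elements.
      uint32_t size_from_shape = 1 * 1000;  // what the runtime previously allocated
      // The compiled network may pad this buffer, e.g. up to a multiple of 1024.
      uint32_t size_from_compiled_network = ((size_from_shape + 1023) / 1024) * 1024;
      std::cout << "shape-derived size: " << size_from_shape
                << ", compiled network expects: " << size_from_compiled_network << "\n";
      // Sizing the buffer from the shape alone under-allocates by 24 bytes here,
      // which is why sizes are now taken from the compiled network instead.
      return 0;
    }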

Additionally, the fully connected test case was changed to use pytest
parameterization as part of a general cleanup, and an issue was fixed
with specifying a different output shape and weights with more than
one output channel.

Change-Id: Iad319d75326b9ac41950de982603660a084dc27b
lhutton1 authored Aug 4, 2022
1 parent 9f360a0 commit 3731a8c
Showing 8 changed files with 131 additions and 78 deletions.
17 changes: 17 additions & 0 deletions src/relay/backend/contrib/ethosn/codegen.cc
@@ -629,13 +629,16 @@ runtime::ethosn::OrderedCompiledNetwork EthosnCompiler::CompileEthosnFunc(const
// Determine the order that the inputs/outputs are in and how that corresponds to the
// order that the TVM runtime will expect them in
auto input_output_order = GetInputOutputOrder(network_with_ids, compiled_network);
auto io_sizes = GetIOSizes(compiled_network);
// Use the order information to create an 'ordered' network which includes how to map
// the inputs/outputs from the TVM runtime to the inputs/outputs of the compiled network
runtime::ethosn::OrderedCompiledNetwork ordered_network;
ordered_network.name = gvar->name_hint;
ordered_network.compiled_cmm = std::move(compiled_network);
ordered_network.inputs = input_output_order.first;
ordered_network.outputs = input_output_order.second;
ordered_network.input_sizes = io_sizes.first;
ordered_network.output_sizes = io_sizes.second;
return ordered_network;
}

@@ -684,6 +687,20 @@ std::pair<std::vector<uint32_t>, std::vector<uint32_t>> EthosnCompiler::GetInput
return std::make_pair(input_order, output_order);
}

std::pair<std::vector<uint32_t>, std::vector<uint32_t>> EthosnCompiler::GetIOSizes(
const std::unique_ptr<sl::CompiledNetwork>& compiled_network) {
std::vector<uint32_t> input_sizes;
std::vector<uint32_t> output_sizes;
for (const sl::InputBufferInfo info : compiled_network->GetInputBufferInfos()) {
input_sizes.push_back(info.m_Size);
}
for (const sl::OutputBufferInfo info : compiled_network->GetOutputBufferInfos()) {
output_sizes.push_back(info.m_Size);
}

return std::make_pair(input_sizes, output_sizes);
}

std::unique_ptr<sl::SupportQueries> EthosnCompiler::m_Queries;

EthosnError EthosnCompiler::SupportedSetup() {
13 changes: 13 additions & 0 deletions src/relay/backend/contrib/ethosn/codegen_ethosn.h
@@ -348,6 +348,19 @@ class EthosnCompiler {
static std::pair<std::vector<uint32_t>, std::vector<uint32_t>> GetInputOutputOrder(
NetworkWithIDs network, const std::unique_ptr<sl::CompiledNetwork>& compiled_network);

/*!
* \brief Determine the input and output sizes of a compiled network.
*
* These need to be queried from the compiled network as the compiler can choose
* to add additional padding on the input/output in certain cases.
*
* \param compiled_network The network compiled by the NPU compiler.
* \return Pair of vectors of buffer sizes for both the inputs and outputs of the
* network.
*/
static std::pair<std::vector<uint32_t>, std::vector<uint32_t>> GetIOSizes(
const std::unique_ptr<sl::CompiledNetwork>& compiled_network);

/*!
* \brief Query interface used to determine if the Ethos-N hardware supports an operation
* with the supplied parameters.
38 changes: 20 additions & 18 deletions src/runtime/contrib/ethosn/ethosn_device.cc
@@ -95,28 +95,28 @@ void CopyOutput(dl::Buffer* source_buffers[], std::vector<DLTensor*>* outputs) {
}

void CreateBuffers(std::vector<std::shared_ptr<dl::Buffer> >* fm,
const std::vector<DLTensor*>& tensors, bool input) {
int index = 0;
for (auto buffer : tensors) {
auto* data = static_cast<uint8_t*>(buffer->data);
// The NPU only needs the size of the tensor * uint8_t.
auto data_size = static_cast<uint32_t>(GetDataSize(*buffer));
const std::vector<DLTensor*>& tensors, const std::vector<uint32_t>& tensor_sizes,
bool input) {
for (size_t i = 0; i < tensors.size(); i++) {
auto* data = static_cast<uint8_t*>(tensors[i]->data);
if (input) {
(*fm)[index++] = std::make_shared<dl::Buffer>(data, data_size, dl::DataFormat::NHWC);
(*fm)[i] = std::make_shared<dl::Buffer>(data, tensor_sizes[i], dl::DataFormat::NHWC);
} else {
(*fm)[index++] = std::make_shared<dl::Buffer>(data_size, dl::DataFormat::NHWC);
(*fm)[i] = std::make_shared<dl::Buffer>(tensor_sizes[i], dl::DataFormat::NHWC);
}
}
}

#if _ETHOSN_API_VERSION_ <= 2102
bool Inference(tvm::runtime::TVMArgs args, sl::CompiledNetwork* network,
const std::vector<uint32_t>& input_order,
const std::vector<uint32_t>& output_order) {
const std::vector<uint32_t>& input_order, const std::vector<uint32_t>& output_order,
const std::vector<uint32_t>& input_sizes,
const std::vector<uint32_t>& output_sizes) {
#else
bool Inference(tvm::runtime::TVMArgs args, dl::Network* npu,
const std::vector<uint32_t>& input_order,
const std::vector<uint32_t>& output_order) {
const std::vector<uint32_t>& input_order, const std::vector<uint32_t>& output_order,
const std::vector<uint32_t>& input_sizes,
const std::vector<uint32_t>& output_sizes) {
#endif
// Unpack parameters
uint8_t argc = 0;
@@ -133,11 +133,11 @@ bool Inference(tvm::runtime::TVMArgs args, dl::Network* npu,

// Set up input buffers
std::vector<std::shared_ptr<dl::Buffer> > ifm(inputs.size());
CreateBuffers(&ifm, inputs, true);
CreateBuffers(&ifm, inputs, input_sizes, true);

// Set up output buffers
std::vector<std::shared_ptr<dl::Buffer> > ofm(outputs.size());
CreateBuffers(&ofm, outputs, false);
CreateBuffers(&ofm, outputs, output_sizes, false);

// Raw pointers for the inference
dl::Buffer* ifm_raw[inputs.size()];
@@ -231,12 +231,14 @@ TVM_REGISTER_GLOBAL("relay.ethos-n.test.infra.inference_result")
// Allow the ethos-n support code to be tested without a device
#if _ETHOSN_API_VERSION_ <= 2102
bool Inference(tvm::runtime::TVMArgs args, sl::CompiledNetwork* network,
const std::vector<uint32_t>& input_order,
const std::vector<uint32_t>& output_order) {
const std::vector<uint32_t>& input_order, const std::vector<uint32_t>& output_order,
const std::vector<uint32_t>& input_sizes,
const std::vector<uint32_t>& output_sizes) {
#else
bool Inference(tvm::runtime::TVMArgs args, dl::Network* /* npu */,
const std::vector<uint32_t>& input_order,
const std::vector<uint32_t>& output_order) {
const std::vector<uint32_t>& input_order, const std::vector<uint32_t>& output_order,
const std::vector<uint32_t>& input_sizes,
const std::vector<uint32_t>& output_sizes) {
#endif
std::vector<DLTensor*> outputs;
for (int argc = input_order.size(); argc < args.size(); argc++) {
8 changes: 5 additions & 3 deletions src/runtime/contrib/ethosn/ethosn_device.h
@@ -41,10 +41,12 @@ using tvm::runtime::TVMArgs;

#if _ETHOSN_API_VERSION_ <= 2102
bool Inference(TVMArgs args, sl::CompiledNetwork* npu, const std::vector<uint32_t>& input_order,
const std::vector<uint32_t>& output_order);
const std::vector<uint32_t>& output_order, const std::vector<uint32_t>& input_sizes,
const std::vector<uint32_t>& output_sizes);
#else
bool Inference(TVMArgs args, dl::Network* npu, const std::vector<uint32_t>& input_order,
const std::vector<uint32_t>& output_order);
bool Inference(tvm::runtime::TVMArgs args, dl::Network* npu,
const std::vector<uint32_t>& input_order, const std::vector<uint32_t>& output_order,
const std::vector<uint32_t>& input_sizes, const std::vector<uint32_t>& output_sizes);
#endif

} // namespace ethosn
14 changes: 12 additions & 2 deletions src/runtime/contrib/ethosn/ethosn_runtime.cc
@@ -60,6 +60,8 @@ EthosnModule::EthosnModule(std::vector<OrderedCompiledNetwork>* cmms) {
#endif
network_map_[it.name].inputs = it.inputs;
network_map_[it.name].outputs = it.outputs;
network_map_[it.name].input_sizes = it.input_sizes;
network_map_[it.name].output_sizes = it.output_sizes;
}
}

@@ -69,10 +71,12 @@ PackedFunc EthosnModule::GetFunction(const std::string& name,
return PackedFunc([sptr_to_self, this, name](TVMArgs args, TVMRetValue* rv) {
#if _ETHOSN_API_VERSION_ <= 2102
*rv = Inference(args, network_map_[name].compiled_cmm.get(), network_map_[name].inputs,
network_map_[name].outputs);
network_map_[name].outputs, network_map_[name].input_sizes,
network_map_[name].output_sizes);
#else
*rv = Inference(args, network_map_[name].runtime_cmm.get(), network_map_[name].inputs,
network_map_[name].outputs);
network_map_[name].outputs, network_map_[name].input_sizes,
network_map_[name].output_sizes);
#endif
});
} else {
@@ -90,8 +94,10 @@ void EthosnModule::SaveToBinary(dmlc::Stream* stream) {
stream->Write(ss.str());
stream->Write(it.second.inputs.size());
stream->Write(&it.second.inputs[0], sizeof(uint32_t) * it.second.inputs.size());
stream->Write(&it.second.input_sizes[0], sizeof(uint32_t) * it.second.input_sizes.size());
stream->Write(it.second.outputs.size());
stream->Write(&it.second.outputs[0], sizeof(uint32_t) * it.second.outputs.size());
stream->Write(&it.second.output_sizes[0], sizeof(uint32_t) * it.second.output_sizes.size());
}
}

@@ -128,12 +134,16 @@ Module EthosnModule::LoadFromBinary(void* strm) {
compiled.inputs.resize(size);
// Read the order of inputs
stream->Read(&compiled.inputs[0], sizeof(uint32_t) * size);
compiled.input_sizes.resize(size);
stream->Read(&compiled.input_sizes[0], sizeof(uint32_t) * size);
// Read the number of outputs
stream->Read<uint64_t>(&output_size);
size = static_cast<size_t>(output_size);
compiled.outputs.resize(size);
// Read the order of outputs
stream->Read(&compiled.outputs[0], sizeof(uint32_t) * size);
compiled.output_sizes.resize(size);
stream->Read(&compiled.output_sizes[0], sizeof(uint32_t) * size);
}
auto n = make_object<EthosnModule>(&cmms);
return Module(n);
4 changes: 4 additions & 0 deletions src/runtime/contrib/ethosn/ethosn_runtime.h
@@ -52,6 +52,8 @@ struct OrderedCompiledNetwork {
std::string name;
std::vector<uint32_t> inputs;
std::vector<uint32_t> outputs;
std::vector<uint32_t> input_sizes;
std::vector<uint32_t> output_sizes;
};

class EthosnModule : public ModuleNode {
@@ -88,8 +90,10 @@ class EthosnModule : public ModuleNode {
* std::string : serialized command stream
* size_t : number of inputs
* std::vector : order of inputs
* std::vector : buffer sizes for inputs
* size_t : number of outputs
* std::vector : order of outputs
* std::vector : buffer sizes for outputs
* ] * number of functions
*/
static Module LoadFromBinary(void* strm);
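Because LoadFromBinary must consume exactly what SaveToBinary wrote,
the new size vectors have to sit at the same positions in the record
on both paths. The following is a minimal, self-contained analogue of
that round trip, using std::stringstream in place of dmlc::Stream and
made-up sizes; it mirrors the count-then-payload pattern documented
above rather than reproducing the actual module code:

    #include <cstdint>
    #include <iostream>
    #include <sstream>
    #include <vector>

    int main() {
      std::vector<uint32_t> input_sizes = {1024, 2048};  // hypothetical padded sizes

      // Write: element count, then the raw uint32_t payload.
      std::stringstream stream;
      uint64_t count = input_sizes.size();
      stream.write(reinterpret_cast<const char*>(&count), sizeof(count));
      stream.write(reinterpret_cast<const char*>(input_sizes.data()),
                   sizeof(uint32_t) * input_sizes.size());

      // Read: consume the fields in the same order and with the same widths.
      uint64_t restored_count = 0;
      stream.read(reinterpret_cast<char*>(&restored_count), sizeof(restored_count));
      std::vector<uint32_t> restored(restored_count);
      stream.read(reinterpret_cast<char*>(restored.data()),
                  sizeof(uint32_t) * restored_count);

      std::cout << "restored " << restored_count
                << " sizes, first = " << restored[0] << "\n";  // 2, 1024
      return 0;
    }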
95 changes: 50 additions & 45 deletions tests/python/contrib/test_ethosn/test_fullyconnected.py
@@ -42,9 +42,9 @@ def _get_model(
units=weight_shape[0],
out_dtype="int32",
)
b = tvm.nd.array(np.random.randint(0, high=255, size=(shape[0],), dtype="int32"))
b = tvm.nd.array(np.random.randint(0, high=255, size=(weight_shape[0],), dtype="int32"))
biasc = relay.const(b, "int32")
bias = relay.nn.bias_add(fc, biasc, axis=0)
bias = relay.nn.bias_add(fc, biasc)
req = relay.qnn.op.requantize(
bias,
relay.const(input_sc * kernel_sc, "float32"), # input zero scale
@@ -58,55 +58,60 @@


@requires_ethosn
@pytest.mark.parametrize("dtype", ["uint8"])
def test_fullyconnected(dtype):
zp_min = np.iinfo(dtype).min
zp_max = np.iinfo(dtype).max
trials = [
((1, 1024), zp_min + 71, 0.580, zp_max - 176, 1.498),
((1, 4096), zp_min + 166, 1.724, zp_max - 138, 0.180),
((1, 16384), zp_min + 101, 1.372, zp_max - 234, 1.346),
]
@pytest.mark.parametrize(
"shape,out_channels",
[
((1, 1024), 64),
((1, 16384), 1),
((1, 1280), 1000),
],
)
@pytest.mark.parametrize(
"dtype,input_zp,input_sc,kernel_zp,kernel_sc",
[
("uint8", 71, 0.580, 176, 1.498),
("uint8", 166, 1.724, 138, 0.180),
("int8", 71, 0.580, 0, 1.498),
("int8", 120, 1.724, 0, 0.180),
],
)
def test_fullyconnected(shape, out_channels, dtype, input_zp, input_sc, kernel_zp, kernel_sc):
"""
Test fully connected offloading.
"""
np.random.seed(0)
for shape, input_zp, input_sc, kernel_zp, kernel_sc in trials:
kernel_zp = (
0
if dtype == "int8"
else np.random.randint(np.iinfo(dtype).min, np.iinfo(dtype).max) + 1
)
inputs = {
"a": tvm.nd.array(
np.random.randint(
np.iinfo(dtype).min, np.iinfo(dtype).max + 1, size=shape, dtype=dtype
)
),
}
outputs = []
output_zp, output_sc = tei.get_conv2d_qnn_params(
dtype,
inputs = {
"a": tvm.nd.array(
np.random.randint(np.iinfo(dtype).min, np.iinfo(dtype).max + 1, size=shape, dtype=dtype)
),
}

outputs = []
output_zp, output_sc = tei.get_conv2d_qnn_params(
dtype,
input_zp,
input_sc,
kernel_zp,
kernel_sc,
shape[0],
shape[1],
1,
)
for npu in [False, True]:
model, params = _get_model(
shape,
(out_channels, shape[1]),
input_zp,
input_sc,
kernel_zp,
kernel_sc,
shape[0],
shape[1],
1,
output_zp,
output_sc,
dtype,
)
for npu in [False, True]:
model, params = _get_model(
shape,
shape,
input_zp,
input_sc, # input zp, sc
kernel_zp,
kernel_sc, # kernel
output_zp,
output_sc, # output
dtype,
)
mod = tei.make_module(model, params)
outputs.append(tei.build_and_run(mod, inputs, 1, params, npu=npu))
tei.verify(outputs, dtype, 1)
mod = tei.make_module(model, params)
outputs.append(tei.build_and_run(mod, inputs, 1, params, npu=npu))
tei.verify(outputs, dtype, 1)


@requires_ethosn
20 changes: 10 additions & 10 deletions tests/python/contrib/test_ethosn/test_networks.py
@@ -124,9 +124,9 @@ def test_mobilenet_v1():
# version or a change in the Ethos-N codegen. To update this requires running
# on hardware that isn't available in CI.
if tei.get_ethosn_api_version() == 2205:
_compile_hash = {"cb12b5469d78af81f4704488e3857755"}
_compile_hash = {"50186822915909303e813205db80e032"}
elif tei.get_ethosn_api_version() == 2111:
_compile_hash = {"5d1c6a6bd4df8963866cc90405bf92dd"}
_compile_hash = {"c523c3c2bb9add1fee508217eb73af1a"}
elif tei.get_ethosn_api_version() == 2102:
_compile_hash = {"46ccafc840633633aca441645e41b444"}
if tei.get_ethosn_variant() == "Ethos-N78_1TOPS_2PLE_RATIO":
@@ -154,9 +154,9 @@ def test_resnet_50_int8():
# version or a change in the Ethos-N codegen. To update this requires running
# on hardware that isn't available in CI.
if tei.get_ethosn_api_version() == 2205:
_compile_hash = {"c0a01c547ed1b2e3308094508fa1bfea", "64905a4ff2dbde08078ccc9f44ad711d"}
_compile_hash = {"60404ad60fc2bfbb68464d8a14cc0452", "4225fa951c145bb1e48e28cad6a3bdd4"}
else:
_compile_hash = {"c0a01c547ed1b2e3308094508fa1bfea", "434f0c65c41e24d5482142c88b3438fe"}
_compile_hash = {"60404ad60fc2bfbb68464d8a14cc0452", "5b9d72b9accfea7ed89eb09ca0aa5487"}
_test_image_network(
model_url="https://raw.githubusercontent.com/dmlc/web-data/main/tensorflow/"
"models/Quantized/resnet_50_quantized.tflite",
@@ -177,9 +177,9 @@ def test_inception_v3():
# version or a change in the Ethos-N codegen. To update this requires running
# on hardware that isn't available in CI.
if tei.get_ethosn_api_version() == 2205:
_compile_hash = {"85ef702ad3628c598db8c72060c70a61"}
_compile_hash = {"a5a2b5d2b618de754bf9a01033a020c0"}
elif tei.get_ethosn_api_version() == 2111:
_compile_hash = {"e6abe33a7bc4a4170da53eefa6577bba"}
_compile_hash = {"88db2c7928240be9833c1b5ef367de28"}
elif tei.get_ethosn_api_version() == 2102:
_compile_hash = {"43dc2097127eb224c0191b1a15f8acca"}
if tei.get_ethosn_variant() == "Ethos-N78_1TOPS_2PLE_RATIO":
@@ -206,9 +206,9 @@ def test_inception_v4():
# version or a change in the Ethos-N codegen. To update this requires running
# on hardware that isn't available in CI.
if tei.get_ethosn_api_version() == 2205:
_compile_hash = {"91a980eaf53881f4f109a1a7578e422b"}
_compile_hash = {"61b4ade41898d7cb2451dbdc3340aced"}
elif tei.get_ethosn_api_version() == 2111:
_compile_hash = {"42e43c323ed8202f7b720ba9029bbcb7"}
_compile_hash = {"37648682f97cbbcecdc13945b7f2212f"}
elif tei.get_ethosn_api_version() == 2102:
_compile_hash = {"fab6c2297502f95d33079c6ce1a737f9"}
if tei.get_ethosn_variant() == "Ethos-N78_1TOPS_2PLE_RATIO":
@@ -235,9 +235,9 @@ def test_ssd_mobilenet_v1():
# version or a change in the Ethos-N codegen. To update this requires running
# on hardware that isn't available in CI.
if tei.get_ethosn_api_version() == 2205:
_compile_hash = {"d804ce3496a776c48f719b4062d5e5c3", "afb68ca8f452d1f4a674b457b5e30f59"}
_compile_hash = {"789906c7d8ac787809b303d82781fc9d", "6b699f94795785d31b39940a5cf84a81"}
elif tei.get_ethosn_api_version() == 2111:
_compile_hash = {"a37f900601b9493bd142e8aed16205e5", "afb68ca8f452d1f4a674b457b5e30f59"}
_compile_hash = {"7b8b0a3ad7cfe1695dee187f21f03785", "6b699f94795785d31b39940a5cf84a81"}
elif tei.get_ethosn_api_version() == 2102:
_compile_hash = {"7795b6c67178da9d1f9b98063bad75b1", "10826406ae724e52f360a06c35ced09d"}
if tei.get_ethosn_variant() == "Ethos-N78_1TOPS_2PLE_RATIO":
