diff --git a/src/layer/riscv/innerproduct_riscv.cpp b/src/layer/riscv/innerproduct_riscv.cpp index 0f90de5f7a13..8b89da90f41f 100644 --- a/src/layer/riscv/innerproduct_riscv.cpp +++ b/src/layer/riscv/innerproduct_riscv.cpp @@ -29,8 +29,8 @@ InnerProduct_riscv::InnerProduct_riscv() { #if __riscv_vector support_packing = true; -#if __riscv_zvfh - support_fp16_storage = true; +#if NCNN_ZVFH + support_fp16_storage = cpu_support_riscv_zvfh(); #endif #endif // __riscv_vector @@ -57,7 +57,7 @@ int InnerProduct_riscv::create_pipeline(const Option& opt) } #endif -#if __riscv_vector && __riscv_zvfh +#if NCNN_ZVFH if (opt.use_fp16_storage) { return create_pipeline_fp16s(opt); @@ -153,9 +153,9 @@ int InnerProduct_riscv::forward(const Mat& bottom_blob, Mat& top_blob, const Opt } #endif +#if NCNN_ZVFH int elembits = bottom_blob.elembits(); -#if __riscv_vector && __riscv_zvfh if (opt.use_fp16_storage && elembits == 16) { if (opt.use_fp16_arithmetic) @@ -524,572 +524,4 @@ int InnerProduct_riscv::forward(const Mat& bottom_blob, Mat& top_blob, const Opt return 0; } -#if __riscv_vector && __riscv_zvfh -int InnerProduct_riscv::create_pipeline_fp16s(const Option& opt) -{ - const int packn = csrr_vlenb() / 2; - - const int num_input = weight_data_size / num_output; - - int out_elempack = 1; - - if (opt.use_packing_layout) - { - out_elempack = num_output % packn == 0 ? packn : 1; - } - - // src = inch-outch - // dst = pb-inch-outch/pb - { - Mat weight_data_r2 = weight_data.reshape(num_input, num_output); - - weight_data_tm.create(num_input, num_output / out_elempack, (size_t)2u * out_elempack, out_elempack); - - for (int q = 0; q + (out_elempack - 1) < num_output; q += out_elempack) - { - __fp16* g0 = weight_data_tm.row<__fp16>(q / out_elempack); - - for (int p = 0; p < num_input; p++) - { - for (int j = 0; j < out_elempack; j++) - { - *g0++ = (__fp16)(weight_data_r2.row(q + j)[p]); - } - } - } - } - - ncnn::cast_float32_to_float16(bias_data, bias_data_fp16, opt); - - if (opt.lightmode) - weight_data.release(); - - return 0; -} - -int InnerProduct_riscv::forward_fp16s(const Mat& bottom_blob, Mat& top_blob, const Option& opt) const -{ - const int packn = csrr_vlenb() / 2; - - const int num_input = weight_data_size / num_output; - - if (bottom_blob.dims == 2 && bottom_blob.w == num_input) - { - // gemm - int h = bottom_blob.h; - size_t elemsize = bottom_blob.elemsize; - int elempack = bottom_blob.elempack; - - top_blob.create(num_output, h, elemsize, elempack, opt.blob_allocator); - if (top_blob.empty()) - return -100; - - int num_output_elempack = opt.use_packing_layout && num_output % packn == 0 ? 
packn : 1; - - #pragma omp parallel for num_threads(opt.num_threads) - for (int j = 0; j < h; j++) - { - if (elempack == packn && num_output_elempack == packn) - { - const size_t vl = __riscv_vsetvl_e16m1(packn); - - __fp16* outptr = top_blob.row<__fp16>(j); - - for (int p = 0; p < num_output / num_output_elempack; p++) - { - for (int l = 0; l < packn; l++) - { - const __fp16* kptr = (const __fp16*)weight_data_tm + num_input * p * packn + l; - const __fp16* m = bottom_blob.row(j); - - vfloat32m2_t _sum = __riscv_vfmv_v_f_f32m2(0.f, vl); - - if (bias_term) - { - _sum = __riscv_vfmv_v_f_f32m2(bias_data[p * packn + l], vl); - } - - int n = num_input; - while (n > 0) - { - vfloat32m2_t _val = __riscv_vfwcvt_f_f_v_f32m2(__riscv_vle16_v_f16m1(m, vl), vl); - - _sum = __riscv_vfmacc_vf_f32m2(_sum, *kptr, _val, vl); - - m += packn; - kptr += packn; - n -= 1; - } - - _sum = activation_ps(_sum, activation_type, activation_params, vl); - - __riscv_vse16_v_f16m1(outptr, __riscv_vfncvt_f_f_w_f16m1(_sum, vl), vl); - outptr += packn; - } - } - } - - if (elempack == 1 && num_output_elempack == packn) - { - const size_t vl = __riscv_vsetvl_e16m1(packn); - - __fp16* outptr = top_blob.row<__fp16>(j); - - for (int p = 0; p < num_output / num_output_elempack; p++) - { - const __fp16* kptr = (const __fp16*)weight_data_tm + num_input * p * packn; - const __fp16* m = bottom_blob.row(j); - - vfloat32m2_t _sum = __riscv_vfmv_v_f_f32m2(0.f, vl); - - if (bias_term) - { - _sum = __riscv_vle32_v_f32m2((const float*)bias_data + p * packn, vl); - } - - int n = num_input; - while (n > 0) - { - vfloat32m2_t _w = __riscv_vfwcvt_f_f_v_f32m2(__riscv_vle16_v_f16m1(kptr, vl), vl); - - _sum = __riscv_vfmacc_vf_f32m2(_sum, *m, _w, vl); - - m += 1; - kptr += packn; - n -= 1; - } - - _sum = activation_ps(_sum, activation_type, activation_params, vl); - - __riscv_vse16_v_f16m1(outptr, __riscv_vfncvt_f_f_w_f16m1(_sum, vl), vl); - outptr += packn; - } - } - - if (elempack == packn && num_output_elempack == 1) - { - const size_t vl = __riscv_vsetvl_e16m1(packn); - - __fp16* outptr = top_blob.row<__fp16>(j); - - for (int p = 0; p < num_output; p++) - { - const __fp16* kptr = (const __fp16*)weight_data_tm + num_input * p; - const __fp16* m = bottom_blob.row(j); - - vfloat32m2_t _sum = __riscv_vfmv_v_f_f32m2(0.f, vl); - - if (bias_term) - { - _sum = __riscv_vfmv_v_f_f32m2(bias_data[p], vl); - } - - int n = num_input; - while (n > 0) - { - vfloat32m2_t _val = __riscv_vfwcvt_f_f_v_f32m2(__riscv_vle16_v_f16m1(m, vl), vl); - - _sum = __riscv_vfmacc_vf_f32m2(_sum, *kptr, _val, vl); - - m += packn; - kptr += 1; - n -= 1; - } - - _sum = activation_ps(_sum, activation_type, activation_params, vl); - - __riscv_vse16_v_f16m1(outptr, __riscv_vfncvt_f_f_w_f16m1(_sum, vl), vl); - outptr += packn; - } - } - - if (elempack == 1 && num_output_elempack == 1) - { - __fp16* outptr = top_blob.row<__fp16>(j); - - for (int p = 0; p < num_output; p++) - { - const __fp16* kptr = (const __fp16*)weight_data_tm + num_input * p; - const __fp16* m = bottom_blob.row(j); - - float sum = 0.f; - - if (bias_term) - { - sum = bias_data[p]; - } - - for (int i = 0; i < num_input; i++) - { - sum += (float)m[i] * (float)kptr[i]; - } - - sum = activation_ss(sum, activation_type, activation_params); - - outptr[0] = (__fp16)sum; - outptr += 1; - } - } - } - - return 0; - } - - // flatten - Mat bottom_blob_flattened = bottom_blob; - if (bottom_blob.dims != 1) - { - Option opt_flatten = opt; - opt_flatten.blob_allocator = opt.workspace_allocator; - - flatten->forward(bottom_blob, 
bottom_blob_flattened, opt_flatten); - } - - size_t elemsize = bottom_blob_flattened.elemsize; - int elempack = bottom_blob_flattened.elempack; - - int out_elempack = opt.use_packing_layout && num_output % packn == 0 ? packn : 1; - size_t out_elemsize = elemsize / elempack * out_elempack; - - top_blob.create(num_output / out_elempack, out_elemsize, out_elempack, opt.blob_allocator); - if (top_blob.empty()) - return -100; - - if (out_elempack == packn) - { - // num_output - #pragma omp parallel for num_threads(opt.num_threads) - for (int p = 0; p < num_output / out_elempack; p++) - { - const size_t vl = __riscv_vsetvl_e16m1(packn); - vfloat32m2_t _sum = __riscv_vfmv_v_f_f32m2(0.f, vl); - - if (bias_term) - { - _sum = __riscv_vle32_v_f32m2((const float*)bias_data + p * packn, vl); - } - - const __fp16* kptr = weight_data_tm.row(p); - - const __fp16* sptr = bottom_blob_flattened; - - int n = num_input; - while (n > 0) - { - vfloat32m2_t _w = __riscv_vfwcvt_f_f_v_f32m2(__riscv_vle16_v_f16m1(kptr, vl), vl); - - _sum = __riscv_vfmacc_vf_f32m2(_sum, (float)(*sptr), _w, vl); - - sptr += 1; - kptr += packn; - n -= 1; - } - - _sum = activation_ps(_sum, activation_type, activation_params, vl); - - __fp16* outptr = (__fp16*)top_blob; - __riscv_vse16_v_f16m1(outptr + p * packn, __riscv_vfncvt_f_f_w_f16m1(_sum, vl), vl); - } - } - - if (out_elempack == 1) - { - // num_output - #pragma omp parallel for num_threads(opt.num_threads) - for (int p = 0; p < num_output; p++) - { - float sum = 0.f; - - if (bias_term) - sum = bias_data[p]; - - const __fp16* kptr = weight_data_tm.row<__fp16>(p); - - const __fp16* sptr = bottom_blob_flattened; - - int i = 0; - for (; i < num_input; i++) - { - float v = (float)(*sptr); - float k = (float)(*kptr); - - sum += v * k; - - sptr++; - kptr++; - } - - sum = activation_ss(sum, activation_type, activation_params); - - __fp16* outptr = (__fp16*)top_blob; - outptr[p] = (__fp16)sum; - } - } - - return 0; -} - -int InnerProduct_riscv::forward_fp16sa(const Mat& bottom_blob, Mat& top_blob, const Option& opt) const -{ - const int packn = csrr_vlenb() / 2; - - const int num_input = weight_data_size / num_output; - - if (bottom_blob.dims == 2 && bottom_blob.w == num_input) - { - // gemm - int h = bottom_blob.h; - size_t elemsize = bottom_blob.elemsize; - int elempack = bottom_blob.elempack; - - top_blob.create(num_output, h, elemsize, elempack, opt.blob_allocator); - if (top_blob.empty()) - return -100; - - int num_output_elempack = opt.use_packing_layout && num_output % packn == 0 ? 
packn : 1; - - #pragma omp parallel for num_threads(opt.num_threads) - for (int j = 0; j < h; j++) - { - if (elempack == packn && num_output_elempack == packn) - { - const size_t vl = __riscv_vsetvl_e16m1(packn); - - __fp16* outptr = top_blob.row<__fp16>(j); - - for (int p = 0; p < num_output / num_output_elempack; p++) - { - for (int l = 0; l < packn; l++) - { - const __fp16* kptr = (const __fp16*)weight_data_tm + num_input * p * packn + l; - const __fp16* m = bottom_blob.row(j); - - vfloat16m1_t _sum = __riscv_vfmv_v_f_f16m1((__fp16)0.f, vl); - - if (bias_term) - { - _sum = __riscv_vfmv_v_f_f16m1(((const __fp16*)bias_data_fp16)[p * packn + l], vl); - } - - int n = num_input; - while (n > 0) - { - vfloat16m1_t _val = __riscv_vle16_v_f16m1(m, vl); - - _sum = __riscv_vfmacc_vf_f16m1(_sum, *kptr, _val, vl); - - m += packn; - kptr += packn; - n -= 1; - } - - _sum = activation_ps(_sum, activation_type, activation_params, vl); - - __riscv_vse16_v_f16m1(outptr, _sum, vl); - outptr += packn; - } - } - } - - if (elempack == 1 && num_output_elempack == packn) - { - const size_t vl = __riscv_vsetvl_e16m1(packn); - - __fp16* outptr = top_blob.row<__fp16>(j); - - for (int p = 0; p < num_output / num_output_elempack; p++) - { - const __fp16* kptr = (const __fp16*)weight_data_tm + num_input * p * packn; - const __fp16* m = bottom_blob.row(j); - - vfloat16m1_t _sum = __riscv_vfmv_v_f_f16m1(0.f, vl); - - if (bias_term) - { - _sum = __riscv_vle16_v_f16m1((const __fp16*)bias_data_fp16 + p * packn, vl); - } - - int n = num_input; - while (n > 0) - { - vfloat16m1_t _w = __riscv_vle16_v_f16m1(kptr, vl); - - _sum = __riscv_vfmacc_vf_f16m1(_sum, *m, _w, vl); - - m += 1; - kptr += packn; - n -= 1; - } - - _sum = activation_ps(_sum, activation_type, activation_params, vl); - - __riscv_vse16_v_f16m1(outptr, _sum, vl); - outptr += packn; - } - } - - if (elempack == packn && num_output_elempack == 1) - { - const size_t vl = __riscv_vsetvl_e16m1(packn); - - __fp16* outptr = top_blob.row<__fp16>(j); - - for (int p = 0; p < num_output; p++) - { - const __fp16* kptr = (const __fp16*)weight_data_tm + num_input * p; - const __fp16* m = bottom_blob.row(j); - - vfloat16m1_t _sum = __riscv_vfmv_v_f_f16m1(0.f, vl); - - if (bias_term) - { - _sum = __riscv_vfmv_v_f_f16m1(((const __fp16*)bias_data_fp16)[p], vl); - } - - int n = num_input; - while (n > 0) - { - vfloat16m1_t _val = __riscv_vle16_v_f16m1(m, vl); - - _sum = __riscv_vfmacc_vf_f16m1(_sum, *kptr, _val, vl); - - m += packn; - kptr += 1; - n -= 1; - } - - _sum = activation_ps(_sum, activation_type, activation_params, vl); - - __riscv_vse16_v_f16m1(outptr, _sum, vl); - outptr += packn; - } - } - - if (elempack == 1 && num_output_elempack == 1) - { - __fp16* outptr = top_blob.row<__fp16>(j); - - for (int p = 0; p < num_output; p++) - { - const __fp16* kptr = (const __fp16*)weight_data_tm + num_input * p; - const __fp16* m = bottom_blob.row(j); - - float sum = 0.f; - - if (bias_term) - { - sum = bias_data[p]; - } - - for (int i = 0; i < num_input; i++) - { - sum += (float)(m[i] * kptr[i]); - } - - sum = activation_ss(sum, activation_type, activation_params); - - outptr[0] = (__fp16)sum; - outptr += 1; - } - } - } - - return 0; - } - - // flatten - Mat bottom_blob_flattened = bottom_blob; - if (bottom_blob.dims != 1) - { - Option opt_flatten = opt; - opt_flatten.blob_allocator = opt.workspace_allocator; - - flatten->forward(bottom_blob, bottom_blob_flattened, opt_flatten); - } - - size_t elemsize = bottom_blob_flattened.elemsize; - int elempack = 
bottom_blob_flattened.elempack; - - int out_elempack = opt.use_packing_layout && num_output % packn == 0 ? packn : 1; - size_t out_elemsize = elemsize / elempack * out_elempack; - - top_blob.create(num_output / out_elempack, out_elemsize, out_elempack, opt.blob_allocator); - if (top_blob.empty()) - return -100; - - if (out_elempack == packn) - { - // num_output - #pragma omp parallel for num_threads(opt.num_threads) - for (int p = 0; p < num_output / out_elempack; p++) - { - const size_t vl = __riscv_vsetvl_e16m1(packn); - vfloat16m1_t _sum = __riscv_vfmv_v_f_f16m1(0.f, vl); - - if (bias_term) - { - _sum = __riscv_vle16_v_f16m1((const __fp16*)bias_data_fp16 + p * packn, vl); - } - - const __fp16* kptr = weight_data_tm.row(p); - - const __fp16* sptr = bottom_blob_flattened; - - int n = num_input; - while (n > 0) - { - vfloat16m1_t _w = __riscv_vle16_v_f16m1(kptr, vl); - - _sum = __riscv_vfmacc_vf_f16m1(_sum, *sptr, _w, vl); - - sptr += 1; - kptr += packn; - n -= 1; - } - - _sum = activation_ps(_sum, activation_type, activation_params, vl); - - __fp16* outptr = (__fp16*)top_blob; - __riscv_vse16_v_f16m1(outptr + p * packn, _sum, vl); - } - } - - if (out_elempack == 1) - { - // num_output - #pragma omp parallel for num_threads(opt.num_threads) - for (int p = 0; p < num_output; p++) - { - float sum = 0.f; - - if (bias_term) - sum = bias_data[p]; - - const __fp16* kptr = weight_data_tm.row<__fp16>(p); - - const __fp16* sptr = bottom_blob_flattened; - - int i = 0; - for (; i < num_input; i++) - { - __fp16 v = *sptr; - __fp16 k = *kptr; - - sum += (float)(v * k); - - sptr++; - kptr++; - } - - sum = activation_ss(sum, activation_type, activation_params); - - __fp16* outptr = (__fp16*)top_blob; - outptr[p] = (__fp16)sum; - } - } - - return 0; -} -#endif // __riscv_vector && __riscv_zvfh - } // namespace ncnn diff --git a/src/layer/riscv/innerproduct_riscv.h b/src/layer/riscv/innerproduct_riscv.h index 0a75d79bbb51..4c6384a2e0da 100644 --- a/src/layer/riscv/innerproduct_riscv.h +++ b/src/layer/riscv/innerproduct_riscv.h @@ -30,7 +30,7 @@ class InnerProduct_riscv : public InnerProduct virtual int forward(const Mat& bottom_blob, Mat& top_blob, const Option& opt) const; protected: -#if __riscv_vector && __riscv_zvfh +#if NCNN_ZVFH int create_pipeline_fp16s(const Option& opt); int forward_fp16s(const Mat& bottom_blob, Mat& top_blob, const Option& opt) const; int forward_fp16sa(const Mat& bottom_blob, Mat& top_blob, const Option& opt) const; diff --git a/src/layer/riscv/innerproduct_riscv_zvfh.cpp b/src/layer/riscv/innerproduct_riscv_zvfh.cpp new file mode 100644 index 000000000000..86c32c0a9930 --- /dev/null +++ b/src/layer/riscv/innerproduct_riscv_zvfh.cpp @@ -0,0 +1,594 @@ +// Tencent is pleased to support the open source community by making ncnn available. +// +// Copyright (C) 2024 THL A29 Limited, a Tencent company. All rights reserved. +// +// Licensed under the BSD 3-Clause License (the "License"); you may not use this file except +// in compliance with the License. You may obtain a copy of the License at +// +// https://opensource.org/licenses/BSD-3-Clause +// +// Unless required by applicable law or agreed to in writing, software distributed +// under the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR +// CONDITIONS OF ANY KIND, either express or implied. See the License for the +// specific language governing permissions and limitations under the License. 
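+//
+// Note: the fp16 kernels below assume the Zvfh extension. The base layer in
+// innerproduct_riscv.cpp only dispatches here after cpu_support_riscv_zvfh()
+// reports runtime support (see the NCNN_ZVFH-gated calls in that file), so this
+// translation unit can presumably be compiled with Zvfh enabled on its own.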
+
+#include "innerproduct_riscv.h"
+
+#if __riscv_vector
+#include <riscv_vector.h>
+#endif // __riscv_vector
+
+#include "riscv_activation.h"
+#include "riscv_usability.h"
+
+namespace ncnn {
+
+#if __riscv_zvfh
+int InnerProduct_riscv::create_pipeline_fp16s(const Option& opt)
+{
+    const int packn = csrr_vlenb() / 2;
+
+    const int num_input = weight_data_size / num_output;
+
+    int out_elempack = 1;
+
+    if (opt.use_packing_layout)
+    {
+        out_elempack = num_output % packn == 0 ? packn : 1;
+    }
+
+    // src = inch-outch
+    // dst = pb-inch-outch/pb
+    {
+        Mat weight_data_r2 = weight_data.reshape(num_input, num_output);
+
+        weight_data_tm.create(num_input, num_output / out_elempack, (size_t)2u * out_elempack, out_elempack);
+
+        for (int q = 0; q + (out_elempack - 1) < num_output; q += out_elempack)
+        {
+            __fp16* g0 = weight_data_tm.row<__fp16>(q / out_elempack);
+
+            for (int p = 0; p < num_input; p++)
+            {
+                for (int j = 0; j < out_elempack; j++)
+                {
+                    *g0++ = (__fp16)(weight_data_r2.row(q + j)[p]);
+                }
+            }
+        }
+    }
+
+    ncnn::cast_float32_to_float16(bias_data, bias_data_fp16, opt);
+
+    if (opt.lightmode)
+        weight_data.release();
+
+    return 0;
+}
+
+int InnerProduct_riscv::forward_fp16s(const Mat& bottom_blob, Mat& top_blob, const Option& opt) const
+{
+    const int packn = csrr_vlenb() / 2;
+
+    const int num_input = weight_data_size / num_output;
+
+    if (bottom_blob.dims == 2 && bottom_blob.w == num_input)
+    {
+        // gemm
+        int h = bottom_blob.h;
+        size_t elemsize = bottom_blob.elemsize;
+        int elempack = bottom_blob.elempack;
+
+        top_blob.create(num_output, h, elemsize, elempack, opt.blob_allocator);
+        if (top_blob.empty())
+            return -100;
+
+        int num_output_elempack = opt.use_packing_layout && num_output % packn == 0 ? packn : 1;
+
+        #pragma omp parallel for num_threads(opt.num_threads)
+        for (int j = 0; j < h; j++)
+        {
+            if (elempack == packn && num_output_elempack == packn)
+            {
+                const size_t vl = __riscv_vsetvl_e16m1(packn);
+
+                __fp16* outptr = top_blob.row<__fp16>(j);
+
+                for (int p = 0; p < num_output / num_output_elempack; p++)
+                {
+                    for (int l = 0; l < packn; l++)
+                    {
+                        const __fp16* kptr = (const __fp16*)weight_data_tm + num_input * p * packn + l;
+                        const __fp16* m = bottom_blob.row<const __fp16>(j);
+
+                        vfloat32m2_t _sum = __riscv_vfmv_v_f_f32m2(0.f, vl);
+
+                        if (bias_term)
+                        {
+                            _sum = __riscv_vfmv_v_f_f32m2(bias_data[p * packn + l], vl);
+                        }
+
+                        int n = num_input;
+                        while (n > 0)
+                        {
+                            vfloat32m2_t _val = __riscv_vfwcvt_f_f_v_f32m2(__riscv_vle16_v_f16m1(m, vl), vl);
+
+                            _sum = __riscv_vfmacc_vf_f32m2(_sum, *kptr, _val, vl);
+
+                            m += packn;
+                            kptr += packn;
+                            n -= 1;
+                        }
+
+                        _sum = activation_ps(_sum, activation_type, activation_params, vl);
+
+                        __riscv_vse16_v_f16m1(outptr, __riscv_vfncvt_f_f_w_f16m1(_sum, vl), vl);
+                        outptr += packn;
+                    }
+                }
+            }
+
+            if (elempack == 1 && num_output_elempack == packn)
+            {
+                const size_t vl = __riscv_vsetvl_e16m1(packn);
+
+                __fp16* outptr = top_blob.row<__fp16>(j);
+
+                for (int p = 0; p < num_output / num_output_elempack; p++)
+                {
+                    const __fp16* kptr = (const __fp16*)weight_data_tm + num_input * p * packn;
+                    const __fp16* m = bottom_blob.row<const __fp16>(j);
+
+                    vfloat32m2_t _sum = __riscv_vfmv_v_f_f32m2(0.f, vl);
+
+                    if (bias_term)
+                    {
+                        _sum = __riscv_vle32_v_f32m2((const float*)bias_data + p * packn, vl);
+                    }
+
+                    int n = num_input;
+                    while (n > 0)
+                    {
+                        vfloat32m2_t _w = __riscv_vfwcvt_f_f_v_f32m2(__riscv_vle16_v_f16m1(kptr, vl), vl);
+
+                        _sum = __riscv_vfmacc_vf_f32m2(_sum, *m, _w, vl);
+
+                        m += 1;
+                        kptr += packn;
+                        n -= 1;
+                    }
+
+                    _sum = activation_ps(_sum, activation_type, activation_params,
vl); + + __riscv_vse16_v_f16m1(outptr, __riscv_vfncvt_f_f_w_f16m1(_sum, vl), vl); + outptr += packn; + } + } + + if (elempack == packn && num_output_elempack == 1) + { + const size_t vl = __riscv_vsetvl_e16m1(packn); + + __fp16* outptr = top_blob.row<__fp16>(j); + + for (int p = 0; p < num_output; p++) + { + const __fp16* kptr = (const __fp16*)weight_data_tm + num_input * p; + const __fp16* m = bottom_blob.row(j); + + vfloat32m2_t _sum = __riscv_vfmv_v_f_f32m2(0.f, vl); + + if (bias_term) + { + _sum = __riscv_vfmv_v_f_f32m2(bias_data[p], vl); + } + + int n = num_input; + while (n > 0) + { + vfloat32m2_t _val = __riscv_vfwcvt_f_f_v_f32m2(__riscv_vle16_v_f16m1(m, vl), vl); + + _sum = __riscv_vfmacc_vf_f32m2(_sum, *kptr, _val, vl); + + m += packn; + kptr += 1; + n -= 1; + } + + _sum = activation_ps(_sum, activation_type, activation_params, vl); + + __riscv_vse16_v_f16m1(outptr, __riscv_vfncvt_f_f_w_f16m1(_sum, vl), vl); + outptr += packn; + } + } + + if (elempack == 1 && num_output_elempack == 1) + { + __fp16* outptr = top_blob.row<__fp16>(j); + + for (int p = 0; p < num_output; p++) + { + const __fp16* kptr = (const __fp16*)weight_data_tm + num_input * p; + const __fp16* m = bottom_blob.row(j); + + float sum = 0.f; + + if (bias_term) + { + sum = bias_data[p]; + } + + for (int i = 0; i < num_input; i++) + { + sum += (float)m[i] * (float)kptr[i]; + } + + sum = activation_ss(sum, activation_type, activation_params); + + outptr[0] = (__fp16)sum; + outptr += 1; + } + } + } + + return 0; + } + + // flatten + Mat bottom_blob_flattened = bottom_blob; + if (bottom_blob.dims != 1) + { + Option opt_flatten = opt; + opt_flatten.blob_allocator = opt.workspace_allocator; + + flatten->forward(bottom_blob, bottom_blob_flattened, opt_flatten); + } + + size_t elemsize = bottom_blob_flattened.elemsize; + int elempack = bottom_blob_flattened.elempack; + + int out_elempack = opt.use_packing_layout && num_output % packn == 0 ? 
packn : 1; + size_t out_elemsize = elemsize / elempack * out_elempack; + + top_blob.create(num_output / out_elempack, out_elemsize, out_elempack, opt.blob_allocator); + if (top_blob.empty()) + return -100; + + if (out_elempack == packn) + { + // num_output + #pragma omp parallel for num_threads(opt.num_threads) + for (int p = 0; p < num_output / out_elempack; p++) + { + const size_t vl = __riscv_vsetvl_e16m1(packn); + vfloat32m2_t _sum = __riscv_vfmv_v_f_f32m2(0.f, vl); + + if (bias_term) + { + _sum = __riscv_vle32_v_f32m2((const float*)bias_data + p * packn, vl); + } + + const __fp16* kptr = weight_data_tm.row(p); + + const __fp16* sptr = bottom_blob_flattened; + + int n = num_input; + while (n > 0) + { + vfloat32m2_t _w = __riscv_vfwcvt_f_f_v_f32m2(__riscv_vle16_v_f16m1(kptr, vl), vl); + + _sum = __riscv_vfmacc_vf_f32m2(_sum, (float)(*sptr), _w, vl); + + sptr += 1; + kptr += packn; + n -= 1; + } + + _sum = activation_ps(_sum, activation_type, activation_params, vl); + + __fp16* outptr = (__fp16*)top_blob; + __riscv_vse16_v_f16m1(outptr + p * packn, __riscv_vfncvt_f_f_w_f16m1(_sum, vl), vl); + } + } + + if (out_elempack == 1) + { + // num_output + #pragma omp parallel for num_threads(opt.num_threads) + for (int p = 0; p < num_output; p++) + { + float sum = 0.f; + + if (bias_term) + sum = bias_data[p]; + + const __fp16* kptr = weight_data_tm.row<__fp16>(p); + + const __fp16* sptr = bottom_blob_flattened; + + int i = 0; + for (; i < num_input; i++) + { + float v = (float)(*sptr); + float k = (float)(*kptr); + + sum += v * k; + + sptr++; + kptr++; + } + + sum = activation_ss(sum, activation_type, activation_params); + + __fp16* outptr = (__fp16*)top_blob; + outptr[p] = (__fp16)sum; + } + } + + return 0; +} + +int InnerProduct_riscv::forward_fp16sa(const Mat& bottom_blob, Mat& top_blob, const Option& opt) const +{ + const int packn = csrr_vlenb() / 2; + + const int num_input = weight_data_size / num_output; + + if (bottom_blob.dims == 2 && bottom_blob.w == num_input) + { + // gemm + int h = bottom_blob.h; + size_t elemsize = bottom_blob.elemsize; + int elempack = bottom_blob.elempack; + + top_blob.create(num_output, h, elemsize, elempack, opt.blob_allocator); + if (top_blob.empty()) + return -100; + + int num_output_elempack = opt.use_packing_layout && num_output % packn == 0 ? 
packn : 1; + + #pragma omp parallel for num_threads(opt.num_threads) + for (int j = 0; j < h; j++) + { + if (elempack == packn && num_output_elempack == packn) + { + const size_t vl = __riscv_vsetvl_e16m1(packn); + + __fp16* outptr = top_blob.row<__fp16>(j); + + for (int p = 0; p < num_output / num_output_elempack; p++) + { + for (int l = 0; l < packn; l++) + { + const __fp16* kptr = (const __fp16*)weight_data_tm + num_input * p * packn + l; + const __fp16* m = bottom_blob.row(j); + + vfloat16m1_t _sum = __riscv_vfmv_v_f_f16m1((__fp16)0.f, vl); + + if (bias_term) + { + _sum = __riscv_vfmv_v_f_f16m1(((const __fp16*)bias_data_fp16)[p * packn + l], vl); + } + + int n = num_input; + while (n > 0) + { + vfloat16m1_t _val = __riscv_vle16_v_f16m1(m, vl); + + _sum = __riscv_vfmacc_vf_f16m1(_sum, *kptr, _val, vl); + + m += packn; + kptr += packn; + n -= 1; + } + + _sum = activation_ps(_sum, activation_type, activation_params, vl); + + __riscv_vse16_v_f16m1(outptr, _sum, vl); + outptr += packn; + } + } + } + + if (elempack == 1 && num_output_elempack == packn) + { + const size_t vl = __riscv_vsetvl_e16m1(packn); + + __fp16* outptr = top_blob.row<__fp16>(j); + + for (int p = 0; p < num_output / num_output_elempack; p++) + { + const __fp16* kptr = (const __fp16*)weight_data_tm + num_input * p * packn; + const __fp16* m = bottom_blob.row(j); + + vfloat16m1_t _sum = __riscv_vfmv_v_f_f16m1(0.f, vl); + + if (bias_term) + { + _sum = __riscv_vle16_v_f16m1((const __fp16*)bias_data_fp16 + p * packn, vl); + } + + int n = num_input; + while (n > 0) + { + vfloat16m1_t _w = __riscv_vle16_v_f16m1(kptr, vl); + + _sum = __riscv_vfmacc_vf_f16m1(_sum, *m, _w, vl); + + m += 1; + kptr += packn; + n -= 1; + } + + _sum = activation_ps(_sum, activation_type, activation_params, vl); + + __riscv_vse16_v_f16m1(outptr, _sum, vl); + outptr += packn; + } + } + + if (elempack == packn && num_output_elempack == 1) + { + const size_t vl = __riscv_vsetvl_e16m1(packn); + + __fp16* outptr = top_blob.row<__fp16>(j); + + for (int p = 0; p < num_output; p++) + { + const __fp16* kptr = (const __fp16*)weight_data_tm + num_input * p; + const __fp16* m = bottom_blob.row(j); + + vfloat16m1_t _sum = __riscv_vfmv_v_f_f16m1(0.f, vl); + + if (bias_term) + { + _sum = __riscv_vfmv_v_f_f16m1(((const __fp16*)bias_data_fp16)[p], vl); + } + + int n = num_input; + while (n > 0) + { + vfloat16m1_t _val = __riscv_vle16_v_f16m1(m, vl); + + _sum = __riscv_vfmacc_vf_f16m1(_sum, *kptr, _val, vl); + + m += packn; + kptr += 1; + n -= 1; + } + + _sum = activation_ps(_sum, activation_type, activation_params, vl); + + __riscv_vse16_v_f16m1(outptr, _sum, vl); + outptr += packn; + } + } + + if (elempack == 1 && num_output_elempack == 1) + { + __fp16* outptr = top_blob.row<__fp16>(j); + + for (int p = 0; p < num_output; p++) + { + const __fp16* kptr = (const __fp16*)weight_data_tm + num_input * p; + const __fp16* m = bottom_blob.row(j); + + float sum = 0.f; + + if (bias_term) + { + sum = bias_data[p]; + } + + for (int i = 0; i < num_input; i++) + { + sum += (float)(m[i] * kptr[i]); + } + + sum = activation_ss(sum, activation_type, activation_params); + + outptr[0] = (__fp16)sum; + outptr += 1; + } + } + } + + return 0; + } + + // flatten + Mat bottom_blob_flattened = bottom_blob; + if (bottom_blob.dims != 1) + { + Option opt_flatten = opt; + opt_flatten.blob_allocator = opt.workspace_allocator; + + flatten->forward(bottom_blob, bottom_blob_flattened, opt_flatten); + } + + size_t elemsize = bottom_blob_flattened.elemsize; + int elempack = 
bottom_blob_flattened.elempack; + + int out_elempack = opt.use_packing_layout && num_output % packn == 0 ? packn : 1; + size_t out_elemsize = elemsize / elempack * out_elempack; + + top_blob.create(num_output / out_elempack, out_elemsize, out_elempack, opt.blob_allocator); + if (top_blob.empty()) + return -100; + + if (out_elempack == packn) + { + // num_output + #pragma omp parallel for num_threads(opt.num_threads) + for (int p = 0; p < num_output / out_elempack; p++) + { + const size_t vl = __riscv_vsetvl_e16m1(packn); + vfloat16m1_t _sum = __riscv_vfmv_v_f_f16m1(0.f, vl); + + if (bias_term) + { + _sum = __riscv_vle16_v_f16m1((const __fp16*)bias_data_fp16 + p * packn, vl); + } + + const __fp16* kptr = weight_data_tm.row(p); + + const __fp16* sptr = bottom_blob_flattened; + + int n = num_input; + while (n > 0) + { + vfloat16m1_t _w = __riscv_vle16_v_f16m1(kptr, vl); + + _sum = __riscv_vfmacc_vf_f16m1(_sum, *sptr, _w, vl); + + sptr += 1; + kptr += packn; + n -= 1; + } + + _sum = activation_ps(_sum, activation_type, activation_params, vl); + + __fp16* outptr = (__fp16*)top_blob; + __riscv_vse16_v_f16m1(outptr + p * packn, _sum, vl); + } + } + + if (out_elempack == 1) + { + // num_output + #pragma omp parallel for num_threads(opt.num_threads) + for (int p = 0; p < num_output; p++) + { + float sum = 0.f; + + if (bias_term) + sum = bias_data[p]; + + const __fp16* kptr = weight_data_tm.row<__fp16>(p); + + const __fp16* sptr = bottom_blob_flattened; + + int i = 0; + for (; i < num_input; i++) + { + __fp16 v = *sptr; + __fp16 k = *kptr; + + sum += (float)(v * k); + + sptr++; + kptr++; + } + + sum = activation_ss(sum, activation_type, activation_params); + + __fp16* outptr = (__fp16*)top_blob; + outptr[p] = (__fp16)sum; + } + } + + return 0; +} +#endif // __riscv_zvfh + +} // namespace ncnn diff --git a/src/layer/riscv/interp_riscv.cpp b/src/layer/riscv/interp_riscv.cpp index fca4ba18ac3f..0d7e9d7cd2df 100644 --- a/src/layer/riscv/interp_riscv.cpp +++ b/src/layer/riscv/interp_riscv.cpp @@ -27,20 +27,14 @@ namespace ncnn { #if __riscv_vector #include "interp_bicubic_packn.h" #include "interp_bilinear_packn.h" -#if __riscv_zvfh -#include "interp_bicubic_fp16s.h" -#include "interp_bicubic_packn_fp16s.h" -#include "interp_bilinear_fp16s.h" -#include "interp_bilinear_packn_fp16s.h" -#endif #endif Interp_riscv::Interp_riscv() { #if __riscv_vector support_packing = true; -#if __riscv_zvfh - support_fp16_storage = true; +#if NCNN_ZVFH + support_fp16_storage = cpu_support_riscv_zvfh(); #endif #endif // __riscv_vector } @@ -51,9 +45,9 @@ int Interp_riscv::forward(const std::vector& bottom_blobs, std::vector const Mat& reference_blob = bottom_blobs[1]; Mat& top_blob = top_blobs[0]; +#if NCNN_ZVFH int elembits = bottom_blob.elembits(); -#if __riscv_vector && __riscv_zvfh if (opt.use_fp16_storage && elembits == 16) { if (opt.use_fp16_arithmetic) @@ -489,729 +483,4 @@ int Interp_riscv::forward(const std::vector& bottom_blobs, std::vector return 0; } -#if __riscv_vector && __riscv_zvfh -int Interp_riscv::forward_fp16s(const std::vector& bottom_blobs, std::vector& top_blobs, const Option& opt) const -{ - const int packn = csrr_vlenb() / 2; - - const Mat& bottom_blob = bottom_blobs[0]; - const Mat& reference_blob = bottom_blobs[1]; - Mat& top_blob = top_blobs[0]; - - int h = bottom_blob.h; - int w = bottom_blob.w; - int channels = bottom_blob.c; - int dims = bottom_blob.dims; - size_t elemsize = bottom_blob.elemsize; - int elempack = bottom_blob.elempack; - - int outw = reference_blob.w; - int outh = 
reference_blob.h; - - if (dims == 1) - { - top_blob.create(outw, outh, w, elemsize, elempack, opt.blob_allocator); - if (top_blob.empty()) - return -100; - - if (elempack == packn) - { - const size_t vl = __riscv_vsetvl_e16m1(packn); - - #pragma omp parallel for num_threads(opt.num_threads) - for (int q = 0; q < w; q++) - { - Mat top_blob_c = top_blob.channel(q); - vfloat16m1_t _v = __riscv_vle16_v_f16m1((const __fp16*)bottom_blob + q * packn, vl); - top_blob_c.fill(_v); - } - - return 0; - } - - #pragma omp parallel for num_threads(opt.num_threads) - for (int q = 0; q < w; q++) - { - Mat top_blob_c = top_blob.channel(q); - const __fp16* ptr = bottom_blob; - top_blob_c.fill(ptr[q]); - } - - return 0; - } - - if (dims == 2) - { - if (outw == w) - { - top_blob = bottom_blob; - return 0; - } - - top_blob.create(outw, h, elemsize, elempack, opt.blob_allocator); - if (top_blob.empty()) - return -100; - - if (elempack == packn) - { - if (resize_type == 1) // nearest - { - const size_t vl = __riscv_vsetvl_e16m1(packn); - - const float ws = output_width ? w / (float)outw : 1.f / width_scale; - - #pragma omp parallel for num_threads(opt.num_threads) - for (int y = 0; y < h; y++) - { - const __fp16* ptr = bottom_blob.row(y); - __fp16* outptr = top_blob.row<__fp16>(y); - for (int x = 0; x < outw; x++) - { - int in_x = std::min((int)(x * ws), (w - 1)); - - vfloat16m1_t _p = __riscv_vle16_v_f16m1(ptr + in_x * packn, vl); - __riscv_vse16_v_f16m1(outptr, _p, vl); - - outptr += packn; - } - } - } - - if (resize_type == 2) // bilinear - { - const size_t vl = __riscv_vsetvl_e16m1(packn); - - int* buf = new int[outw + outw * packn]; - - int* xofs = buf; - float* alpha = (float*)(buf + outw); - - linear_coeffs(w, outw, xofs, alpha, align_corner); - - #pragma omp parallel for num_threads(opt.num_threads) - for (int y = 0; y < h; y++) - { - const __fp16* ptr = bottom_blob.row(y); - __fp16* outptr = top_blob.row<__fp16>(y); - const float* alphap = alpha; - - for (int x = 0; x < outw; x++) - { - int sx = xofs[x] * packn; - const __fp16* Sp = ptr + sx; - - vfloat16m1_t _S0 = __riscv_vle16_v_f16m1(Sp, vl); - vfloat16m1_t _S1 = __riscv_vle16_v_f16m1(Sp + packn, vl); - vfloat32m2_t _p = __riscv_vfwmacc_vf_f32m2(__riscv_vfwmul_vf_f32m2(_S0, alphap[0], vl), alphap[1], _S1, vl); - - __riscv_vse16_v_f16m1(outptr, __riscv_vfncvt_f_f_w_f16m1(_p, vl), vl); - - alphap += 2; - outptr += packn; - } - } - - delete[] buf; - } - - if (resize_type == 3) // bicubic - { - const size_t vl = __riscv_vsetvl_e16m1(packn); - - int* buf = new int[outw + outw * packn]; - - int* xofs = buf; - float* alpha = (float*)(buf + outw); - - cubic_coeffs(w, outw, xofs, alpha, align_corner); - - #pragma omp parallel for num_threads(opt.num_threads) - for (int y = 0; y < h; y++) - { - const __fp16* ptr = bottom_blob.row(y); - __fp16* outptr = top_blob.row<__fp16>(y); - const float* alphap = alpha; - - for (int x = 0; x < outw; x++) - { - int sx = xofs[x] * packn; - const __fp16* Sp = ptr + sx; - - vfloat16m1_t _S0 = __riscv_vle16_v_f16m1(Sp - packn, vl); - vfloat16m1_t _S1 = __riscv_vle16_v_f16m1(Sp, vl); - vfloat16m1_t _S2 = __riscv_vle16_v_f16m1(Sp + packn, vl); - vfloat16m1_t _S3 = __riscv_vle16_v_f16m1(Sp + packn * 2, vl); - vfloat32m2_t _p = __riscv_vfwmacc_vf_f32m2(__riscv_vfwmacc_vf_f32m2(__riscv_vfwmacc_vf_f32m2(__riscv_vfwmul_vf_f32m2(_S0, alphap[0], vl), alphap[1], _S1, vl), alphap[2], _S2, vl), alphap[3], _S3, vl); - - __riscv_vse16_v_f16m1(outptr, __riscv_vfncvt_f_f_w_f16m1(_p, vl), vl); - - alphap += 4; - outptr += packn; - } - } - - 
delete[] buf; - } - - return 0; - } - - if (resize_type == 1) // nearest - { - const float ws = output_width ? w / (float)outw : 1.f / width_scale; - - #pragma omp parallel for num_threads(opt.num_threads) - for (int y = 0; y < h; y++) - { - const __fp16* ptr = bottom_blob.row(y); - __fp16* outptr = top_blob.row<__fp16>(y); - for (int x = 0; x < outw; x++) - { - int in_x = std::min((int)(x * ws), (w - 1)); - *outptr++ = ptr[in_x]; - } - } - } - - if (resize_type == 2) // bilinear - { - int* buf = new int[outw + outw * 2]; - - int* xofs = buf; - float* alpha = (float*)(buf + outw); - - linear_coeffs(w, outw, xofs, alpha, align_corner); - - #pragma omp parallel for num_threads(opt.num_threads) - for (int y = 0; y < h; y++) - { - const __fp16* ptr = bottom_blob.row(y); - __fp16* outptr = top_blob.row<__fp16>(y); - const float* alphap = alpha; - - for (int x = 0; x < outw; x++) - { - int sx = xofs[x]; - const __fp16* Sp = ptr + sx; - float a0 = alphap[0]; - float a1 = alphap[1]; - *outptr++ = (__fp16)((float)Sp[0] * a0 + (float)Sp[1] * a1); - alphap += 2; - } - } - - delete[] buf; - } - - if (resize_type == 3) // bicubic - { - int* buf = new int[outw + outw * 4]; - - int* xofs = buf; - float* alpha = (float*)(buf + outw); - - cubic_coeffs(w, outw, xofs, alpha, align_corner); - - #pragma omp parallel for num_threads(opt.num_threads) - for (int y = 0; y < h; y++) - { - const __fp16* ptr = bottom_blob.row(y); - __fp16* outptr = top_blob.row<__fp16>(y); - const float* alphap = alpha; - - for (int x = 0; x < outw; x++) - { - int sx = xofs[x]; - const __fp16* Sp = ptr + sx; - float a0 = alphap[0]; - float a1 = alphap[1]; - float a2 = alphap[2]; - float a3 = alphap[3]; - *outptr++ = (__fp16)((float)Sp[-1] * a0 + (float)Sp[0] * a1 + (float)Sp[1] * a2 + (float)Sp[2] * a3); - alphap += 4; - } - } - - delete[] buf; - } - - return 0; - } - - if (outw == w && outh == h) - { - top_blob = bottom_blob; - return 0; - } - - top_blob.create(outw, outh, channels, elemsize, elempack, opt.blob_allocator); - if (top_blob.empty()) - return -100; - - if (elempack == packn) - { - if (resize_type == 1) // nearest - { - const size_t vl = __riscv_vsetvl_e16m1(packn); - - const float hs = output_height ? h / (float)outh : 1.f / height_scale; - const float ws = output_width ? 
w / (float)outw : 1.f / width_scale; - - #pragma omp parallel for num_threads(opt.num_threads) - for (int q = 0; q < channels; q++) - { - const Mat src = bottom_blob.channel(q); - Mat dst = top_blob.channel(q); - - for (int y = 0; y < outh; y++) - { - int in_y = std::min((int)(y * hs), (h - 1)); - - const __fp16* ptr = src.row(in_y); - __fp16* outptr = dst.row<__fp16>(y); - for (int x = 0; x < outw; x++) - { - int in_x = std::min((int)(x * ws), (w - 1)); - - vfloat16m1_t _p = __riscv_vle16_v_f16m1(ptr + in_x * packn, vl); - __riscv_vse16_v_f16m1(outptr, _p, vl); - - outptr += packn; - } - } - } - } - - if (resize_type == 2) // bilinear - { - int* buf = new int[outw + outh + outw * 2 + outh * 2]; - - int* xofs = buf; //new int[outw]; - int* yofs = buf + outw; //new int[outh]; - - float* alpha = (float*)(buf + outw + outh); //new float[outw * 2]; - float* beta = (float*)(buf + outw + outh + outw * 2); //new float[outh * 2]; - - linear_coeffs(w, outw, xofs, alpha, align_corner); - linear_coeffs(h, outh, yofs, beta, align_corner); - - #pragma omp parallel for num_threads(opt.num_threads) - for (int q = 0; q < channels; q++) - { - const Mat src = bottom_blob.channel(q); - Mat dst = top_blob.channel(q); - - resize_bilinear_image_packn_fp16s(src, dst, alpha, xofs, beta, yofs); - } - - delete[] buf; - } - - if (resize_type == 3) // bicubic - { - int* buf = new int[outw + outh + outw * 4 + outh * 4]; - - int* xofs = buf; //new int[outw]; - int* yofs = buf + outw; //new int[outh]; - - float* alpha = (float*)(buf + outw + outh); //new float[outw * 4]; - float* beta = (float*)(buf + outw + outh + outw * 4); //new float[outh * 4]; - - cubic_coeffs(w, outw, xofs, alpha, align_corner); - cubic_coeffs(h, outh, yofs, beta, align_corner); - - #pragma omp parallel for num_threads(opt.num_threads) - for (int q = 0; q < channels; q++) - { - const Mat src = bottom_blob.channel(q); - Mat dst = top_blob.channel(q); - - resize_bicubic_image_packn_fp16s(src, dst, alpha, xofs, beta, yofs); - } - - delete[] buf; - } - - return 0; - } - - if (resize_type == 1) // nearest - { - const float hs = output_height ? h / (float)outh : 1.f / height_scale; - const float ws = output_width ? 
w / (float)outw : 1.f / width_scale; - - #pragma omp parallel for num_threads(opt.num_threads) - for (int q = 0; q < channels; q++) - { - const Mat src = bottom_blob.channel(q); - Mat dst = top_blob.channel(q); - - for (int y = 0; y < outh; y++) - { - int in_y = std::min((int)(y * hs), (h - 1)); - - const __fp16* ptr = src.row(in_y); - __fp16* outptr = dst.row<__fp16>(y); - for (int x = 0; x < outw; x++) - { - int in_x = std::min((int)(x * ws), (w - 1)); - *outptr++ = ptr[in_x]; - } - } - } - } - - if (resize_type == 2) // bilinear - { - int* buf = new int[outw + outh + outw * 2 + outh * 2]; - - int* xofs = buf; //new int[outw]; - int* yofs = buf + outw; //new int[outh]; - - float* alpha = (float*)(buf + outw + outh); //new float[outw * 2]; - float* beta = (float*)(buf + outw + outh + outw * 2); //new float[outh * 2]; - - linear_coeffs(w, outw, xofs, alpha, align_corner); - linear_coeffs(h, outh, yofs, beta, align_corner); - - #pragma omp parallel for num_threads(opt.num_threads) - for (int q = 0; q < channels; q++) - { - const Mat src = bottom_blob.channel(q); - Mat dst = top_blob.channel(q); - - resize_bilinear_image_fp16s(src, dst, alpha, xofs, beta, yofs); - } - - delete[] buf; - } - - if (resize_type == 3) // bicubic - { - int* buf = new int[outw + outh + outw * 4 + outh * 4]; - - int* xofs = buf; //new int[outw]; - int* yofs = buf + outw; //new int[outh]; - - float* alpha = (float*)(buf + outw + outh); //new float[outw * 4]; - float* beta = (float*)(buf + outw + outh + outw * 4); //new float[outh * 4]; - - cubic_coeffs(w, outw, xofs, alpha, align_corner); - cubic_coeffs(h, outh, yofs, beta, align_corner); - - #pragma omp parallel for num_threads(opt.num_threads) - for (int q = 0; q < channels; q++) - { - const Mat src = bottom_blob.channel(q); - Mat dst = top_blob.channel(q); - - resize_bicubic_image_fp16s(src, dst, alpha, xofs, beta, yofs); - } - - delete[] buf; - } - - return 0; -} - -int Interp_riscv::forward_fp16sa(const std::vector& bottom_blobs, std::vector& top_blobs, const Option& opt) const -{ - const int packn = csrr_vlenb() / 2; - - const Mat& bottom_blob = bottom_blobs[0]; - const Mat& reference_blob = bottom_blobs[1]; - Mat& top_blob = top_blobs[0]; - - int h = bottom_blob.h; - int w = bottom_blob.w; - int channels = bottom_blob.c; - int dims = bottom_blob.dims; - size_t elemsize = bottom_blob.elemsize; - int elempack = bottom_blob.elempack; - - int outw = reference_blob.w; - int outh = reference_blob.h; - - if (dims == 1 || resize_type == 1) // nearest - { - return forward_fp16s(bottom_blobs, top_blobs, opt); - } - - if (dims == 2) - { - if (outw == w) - { - top_blob = bottom_blob; - return 0; - } - - top_blob.create(outw, h, elemsize, elempack, opt.blob_allocator); - if (top_blob.empty()) - return -100; - - if (elempack == packn) - { - if (resize_type == 2) // bilinear - { - const size_t vl = __riscv_vsetvl_e16m1(packn); - - int* buf = new int[outw + outw * packn]; - - int* xofs = buf; - __fp16* alpha = (__fp16*)(buf + outw); - - linear_coeffs_fp16sa(w, outw, xofs, alpha, align_corner); - - #pragma omp parallel for num_threads(opt.num_threads) - for (int y = 0; y < h; y++) - { - const __fp16* ptr = bottom_blob.row(y); - __fp16* outptr = top_blob.row<__fp16>(y); - const __fp16* alphap = alpha; - - for (int x = 0; x < outw; x++) - { - int sx = xofs[x] * packn; - const __fp16* Sp = ptr + sx; - - vfloat16m1_t _S0 = __riscv_vle16_v_f16m1(Sp, vl); - vfloat16m1_t _S1 = __riscv_vle16_v_f16m1(Sp + packn, vl); - vfloat16m1_t _p = 
__riscv_vfmacc_vf_f16m1(__riscv_vfmul_vf_f16m1(_S0, alphap[0], vl), alphap[1], _S1, vl); - - __riscv_vse16_v_f16m1(outptr, _p, vl); - - alphap += 2; - outptr += packn; - } - } - - delete[] buf; - } - - if (resize_type == 3) // bicubic - { - const size_t vl = __riscv_vsetvl_e16m1(packn); - - int* buf = new int[outw + outw * packn]; - - int* xofs = buf; - __fp16* alpha = (__fp16*)(buf + outw); - - cubic_coeffs_fp16sa(w, outw, xofs, alpha, align_corner); - - #pragma omp parallel for num_threads(opt.num_threads) - for (int y = 0; y < h; y++) - { - const __fp16* ptr = bottom_blob.row(y); - __fp16* outptr = top_blob.row<__fp16>(y); - const __fp16* alphap = alpha; - - for (int x = 0; x < outw; x++) - { - int sx = xofs[x] * packn; - const __fp16* Sp = ptr + sx; - - vfloat16m1_t _S0 = __riscv_vle16_v_f16m1(Sp - packn, vl); - vfloat16m1_t _S1 = __riscv_vle16_v_f16m1(Sp, vl); - vfloat16m1_t _S2 = __riscv_vle16_v_f16m1(Sp + packn, vl); - vfloat16m1_t _S3 = __riscv_vle16_v_f16m1(Sp + packn * 2, vl); - vfloat16m1_t _p = __riscv_vfmacc_vf_f16m1(__riscv_vfmacc_vf_f16m1(__riscv_vfmacc_vf_f16m1(__riscv_vfmul_vf_f16m1(_S0, alphap[0], vl), alphap[1], _S1, vl), alphap[2], _S2, vl), alphap[3], _S3, vl); - - __riscv_vse16_v_f16m1(outptr, _p, vl); - - alphap += 4; - outptr += packn; - } - } - - delete[] buf; - } - - return 0; - } - - if (resize_type == 2) // bilinear - { - int* buf = new int[outw + outw * 2]; - - int* xofs = buf; - __fp16* alpha = (__fp16*)(buf + outw); - - linear_coeffs_fp16sa(w, outw, xofs, alpha, align_corner); - - #pragma omp parallel for num_threads(opt.num_threads) - for (int y = 0; y < h; y++) - { - const __fp16* ptr = bottom_blob.row(y); - __fp16* outptr = top_blob.row<__fp16>(y); - const __fp16* alphap = alpha; - - for (int x = 0; x < outw; x++) - { - int sx = xofs[x]; - const __fp16* Sp = ptr + sx; - __fp16 a0 = alphap[0]; - __fp16 a1 = alphap[1]; - *outptr++ = Sp[0] * a0 + Sp[1] * a1; - alphap += 2; - } - } - - delete[] buf; - } - - if (resize_type == 3) // bicubic - { - int* buf = new int[outw + outw * 4]; - - int* xofs = buf; - __fp16* alpha = (__fp16*)(buf + outw); - - cubic_coeffs_fp16sa(w, outw, xofs, alpha, align_corner); - - #pragma omp parallel for num_threads(opt.num_threads) - for (int y = 0; y < h; y++) - { - const __fp16* ptr = bottom_blob.row(y); - __fp16* outptr = top_blob.row<__fp16>(y); - const __fp16* alphap = alpha; - - for (int x = 0; x < outw; x++) - { - int sx = xofs[x]; - const __fp16* Sp = ptr + sx; - __fp16 a0 = alphap[0]; - __fp16 a1 = alphap[1]; - __fp16 a2 = alphap[2]; - __fp16 a3 = alphap[3]; - *outptr++ = Sp[-1] * a0 + Sp[0] * a1 + Sp[1] * a2 + Sp[2] * a3; - alphap += 4; - } - } - - delete[] buf; - } - - return 0; - } - - if (outw == w && outh == h) - { - top_blob = bottom_blob; - return 0; - } - - top_blob.create(outw, outh, channels, elemsize, elempack, opt.blob_allocator); - if (top_blob.empty()) - return -100; - - if (elempack == packn) - { - if (resize_type == 2) // bilinear - { - int* buf = new int[outw + outh + outw * 2 + outh * 2]; - - int* xofs = buf; //new int[outw]; - int* yofs = buf + outw; //new int[outh]; - - __fp16* alpha = (__fp16*)(buf + outw + outh); //new __fp16[outw * 2]; - __fp16* beta = (__fp16*)(buf + outw + outh + outw * 2); //new __fp16[outh * 2]; - - linear_coeffs_fp16sa(w, outw, xofs, alpha, align_corner); - linear_coeffs_fp16sa(h, outh, yofs, beta, align_corner); - - #pragma omp parallel for num_threads(opt.num_threads) - for (int q = 0; q < channels; q++) - { - const Mat src = bottom_blob.channel(q); - Mat dst = 
top_blob.channel(q);
-
-                resize_bilinear_image_packn_fp16sa(src, dst, alpha, xofs, beta, yofs);
-            }
-
-            delete[] buf;
-        }
-
-        if (resize_type == 3) // bicubic
-        {
-            int* buf = new int[outw + outh + outw * 4 + outh * 4];
-
-            int* xofs = buf;        //new int[outw];
-            int* yofs = buf + outw; //new int[outh];
-
-            __fp16* alpha = (__fp16*)(buf + outw + outh);           //new __fp16[outw * 4];
-            __fp16* beta = (__fp16*)(buf + outw + outh + outw * 4); //new __fp16[outh * 4];
-
-            cubic_coeffs_fp16sa(w, outw, xofs, alpha, align_corner);
-            cubic_coeffs_fp16sa(h, outh, yofs, beta, align_corner);
-
-            #pragma omp parallel for num_threads(opt.num_threads)
-            for (int q = 0; q < channels; q++)
-            {
-                const Mat src = bottom_blob.channel(q);
-                Mat dst = top_blob.channel(q);
-
-                resize_bicubic_image_packn_fp16sa(src, dst, alpha, xofs, beta, yofs);
-            }
-
-            delete[] buf;
-        }
-
-        return 0;
-    }
-
-    if (resize_type == 2) // bilinear
-    {
-        int* buf = new int[outw + outh + outw * 2 + outh * 2];
-
-        int* xofs = buf;        //new int[outw];
-        int* yofs = buf + outw; //new int[outh];
-
-        __fp16* alpha = (__fp16*)(buf + outw + outh);           //new __fp16[outw * 2];
-        __fp16* beta = (__fp16*)(buf + outw + outh + outw * 2); //new __fp16[outh * 2];
-
-        linear_coeffs_fp16sa(w, outw, xofs, alpha, align_corner);
-        linear_coeffs_fp16sa(h, outh, yofs, beta, align_corner);
-
-        #pragma omp parallel for num_threads(opt.num_threads)
-        for (int q = 0; q < channels; q++)
-        {
-            const Mat src = bottom_blob.channel(q);
-            Mat dst = top_blob.channel(q);
-
-            resize_bilinear_image_fp16sa(src, dst, alpha, xofs, beta, yofs);
-        }
-
-        delete[] buf;
-    }
-
-    if (resize_type == 3) // bicubic
-    {
-        int* buf = new int[outw + outh + outw * 4 + outh * 4];
-
-        int* xofs = buf;        //new int[outw];
-        int* yofs = buf + outw; //new int[outh];
-
-        __fp16* alpha = (__fp16*)(buf + outw + outh);           //new __fp16[outw * 4];
-        __fp16* beta = (__fp16*)(buf + outw + outh + outw * 4); //new __fp16[outh * 4];
-
-        cubic_coeffs_fp16sa(w, outw, xofs, alpha, align_corner);
-        cubic_coeffs_fp16sa(h, outh, yofs, beta, align_corner);
-
-        #pragma omp parallel for num_threads(opt.num_threads)
-        for (int q = 0; q < channels; q++)
-        {
-            const Mat src = bottom_blob.channel(q);
-            Mat dst = top_blob.channel(q);
-
-            resize_bicubic_image_fp16sa(src, dst, alpha, xofs, beta, yofs);
-        }
-
-        delete[] buf;
-    }
-
-    return 0;
-}
-#endif // __riscv_vector && __riscv_zvfh
-
 } // namespace ncnn
diff --git a/src/layer/riscv/interp_riscv.h b/src/layer/riscv/interp_riscv.h
index c361c0b6454f..4b79c755edcb 100644
--- a/src/layer/riscv/interp_riscv.h
+++ b/src/layer/riscv/interp_riscv.h
@@ -27,7 +27,7 @@ class Interp_riscv : public Interp
     virtual int forward(const std::vector<Mat>& bottom_blobs, std::vector<Mat>& top_blobs, const Option& opt) const;
 
 protected:
-#if __riscv_vector && __riscv_zvfh
+#if NCNN_ZVFH
     int forward_fp16s(const std::vector<Mat>& bottom_blobs, std::vector<Mat>& top_blobs, const Option& opt) const;
     int forward_fp16sa(const std::vector<Mat>& bottom_blobs, std::vector<Mat>& top_blobs, const Option& opt) const;
 #endif
diff --git a/src/layer/riscv/interp_riscv_zvfh.cpp b/src/layer/riscv/interp_riscv_zvfh.cpp
new file mode 100644
index 000000000000..7ebbcc236aff
--- /dev/null
+++ b/src/layer/riscv/interp_riscv_zvfh.cpp
@@ -0,0 +1,761 @@
+// Tencent is pleased to support the open source community by making ncnn available.
+//
+// Copyright (C) 2024 THL A29 Limited, a Tencent company. All rights reserved.
+//
+// Licensed under the BSD 3-Clause License (the "License"); you may not use this file except
+// in compliance with the License. You may obtain a copy of the License at
+//
+// https://opensource.org/licenses/BSD-3-Clause
+//
+// Unless required by applicable law or agreed to in writing, software distributed
+// under the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR
+// CONDITIONS OF ANY KIND, either express or implied. See the License for the
+// specific language governing permissions and limitations under the License.
+
+#include "interp_riscv.h"
+
+#if __riscv_vector
+#include <riscv_vector.h>
+#include "riscv_usability.h"
+#endif // __riscv_vector
+
+namespace ncnn {
+
+#include "interp_bicubic.h"
+#include "interp_bilinear.h"
+
+#if __riscv_vector
+#if __riscv_zvfh
+#include "interp_bicubic_fp16s.h"
+#include "interp_bicubic_packn_fp16s.h"
+#include "interp_bilinear_fp16s.h"
+#include "interp_bilinear_packn_fp16s.h"
+#endif
+#endif
+
+#if __riscv_zvfh
+int Interp_riscv::forward_fp16s(const std::vector<Mat>& bottom_blobs, std::vector<Mat>& top_blobs, const Option& opt) const
+{
+    const int packn = csrr_vlenb() / 2;
+
+    const Mat& bottom_blob = bottom_blobs[0];
+    const Mat& reference_blob = bottom_blobs[1];
+    Mat& top_blob = top_blobs[0];
+
+    int h = bottom_blob.h;
+    int w = bottom_blob.w;
+    int channels = bottom_blob.c;
+    int dims = bottom_blob.dims;
+    size_t elemsize = bottom_blob.elemsize;
+    int elempack = bottom_blob.elempack;
+
+    int outw = reference_blob.w;
+    int outh = reference_blob.h;
+
+    if (dims == 1)
+    {
+        top_blob.create(outw, outh, w, elemsize, elempack, opt.blob_allocator);
+        if (top_blob.empty())
+            return -100;
+
+        if (elempack == packn)
+        {
+            const size_t vl = __riscv_vsetvl_e16m1(packn);
+
+            #pragma omp parallel for num_threads(opt.num_threads)
+            for (int q = 0; q < w; q++)
+            {
+                Mat top_blob_c = top_blob.channel(q);
+                vfloat16m1_t _v = __riscv_vle16_v_f16m1((const __fp16*)bottom_blob + q * packn, vl);
+                top_blob_c.fill(_v);
+            }
+
+            return 0;
+        }
+
+        #pragma omp parallel for num_threads(opt.num_threads)
+        for (int q = 0; q < w; q++)
+        {
+            Mat top_blob_c = top_blob.channel(q);
+            const __fp16* ptr = bottom_blob;
+            top_blob_c.fill(ptr[q]);
+        }
+
+        return 0;
+    }
+
+    if (dims == 2)
+    {
+        if (outw == w)
+        {
+            top_blob = bottom_blob;
+            return 0;
+        }
+
+        top_blob.create(outw, h, elemsize, elempack, opt.blob_allocator);
+        if (top_blob.empty())
+            return -100;
+
+        if (elempack == packn)
+        {
+            if (resize_type == 1) // nearest
+            {
+                const size_t vl = __riscv_vsetvl_e16m1(packn);
+
+                const float ws = output_width ?
w / (float)outw : 1.f / width_scale; + + #pragma omp parallel for num_threads(opt.num_threads) + for (int y = 0; y < h; y++) + { + const __fp16* ptr = bottom_blob.row(y); + __fp16* outptr = top_blob.row<__fp16>(y); + for (int x = 0; x < outw; x++) + { + int in_x = std::min((int)(x * ws), (w - 1)); + + vfloat16m1_t _p = __riscv_vle16_v_f16m1(ptr + in_x * packn, vl); + __riscv_vse16_v_f16m1(outptr, _p, vl); + + outptr += packn; + } + } + } + + if (resize_type == 2) // bilinear + { + const size_t vl = __riscv_vsetvl_e16m1(packn); + + int* buf = new int[outw + outw * packn]; + + int* xofs = buf; + float* alpha = (float*)(buf + outw); + + linear_coeffs(w, outw, xofs, alpha, align_corner); + + #pragma omp parallel for num_threads(opt.num_threads) + for (int y = 0; y < h; y++) + { + const __fp16* ptr = bottom_blob.row(y); + __fp16* outptr = top_blob.row<__fp16>(y); + const float* alphap = alpha; + + for (int x = 0; x < outw; x++) + { + int sx = xofs[x] * packn; + const __fp16* Sp = ptr + sx; + + vfloat16m1_t _S0 = __riscv_vle16_v_f16m1(Sp, vl); + vfloat16m1_t _S1 = __riscv_vle16_v_f16m1(Sp + packn, vl); + vfloat32m2_t _p = __riscv_vfwmacc_vf_f32m2(__riscv_vfwmul_vf_f32m2(_S0, alphap[0], vl), alphap[1], _S1, vl); + + __riscv_vse16_v_f16m1(outptr, __riscv_vfncvt_f_f_w_f16m1(_p, vl), vl); + + alphap += 2; + outptr += packn; + } + } + + delete[] buf; + } + + if (resize_type == 3) // bicubic + { + const size_t vl = __riscv_vsetvl_e16m1(packn); + + int* buf = new int[outw + outw * packn]; + + int* xofs = buf; + float* alpha = (float*)(buf + outw); + + cubic_coeffs(w, outw, xofs, alpha, align_corner); + + #pragma omp parallel for num_threads(opt.num_threads) + for (int y = 0; y < h; y++) + { + const __fp16* ptr = bottom_blob.row(y); + __fp16* outptr = top_blob.row<__fp16>(y); + const float* alphap = alpha; + + for (int x = 0; x < outw; x++) + { + int sx = xofs[x] * packn; + const __fp16* Sp = ptr + sx; + + vfloat16m1_t _S0 = __riscv_vle16_v_f16m1(Sp - packn, vl); + vfloat16m1_t _S1 = __riscv_vle16_v_f16m1(Sp, vl); + vfloat16m1_t _S2 = __riscv_vle16_v_f16m1(Sp + packn, vl); + vfloat16m1_t _S3 = __riscv_vle16_v_f16m1(Sp + packn * 2, vl); + vfloat32m2_t _p = __riscv_vfwmacc_vf_f32m2(__riscv_vfwmacc_vf_f32m2(__riscv_vfwmacc_vf_f32m2(__riscv_vfwmul_vf_f32m2(_S0, alphap[0], vl), alphap[1], _S1, vl), alphap[2], _S2, vl), alphap[3], _S3, vl); + + __riscv_vse16_v_f16m1(outptr, __riscv_vfncvt_f_f_w_f16m1(_p, vl), vl); + + alphap += 4; + outptr += packn; + } + } + + delete[] buf; + } + + return 0; + } + + if (resize_type == 1) // nearest + { + const float ws = output_width ? 
w / (float)outw : 1.f / width_scale; + + #pragma omp parallel for num_threads(opt.num_threads) + for (int y = 0; y < h; y++) + { + const __fp16* ptr = bottom_blob.row(y); + __fp16* outptr = top_blob.row<__fp16>(y); + for (int x = 0; x < outw; x++) + { + int in_x = std::min((int)(x * ws), (w - 1)); + *outptr++ = ptr[in_x]; + } + } + } + + if (resize_type == 2) // bilinear + { + int* buf = new int[outw + outw * 2]; + + int* xofs = buf; + float* alpha = (float*)(buf + outw); + + linear_coeffs(w, outw, xofs, alpha, align_corner); + + #pragma omp parallel for num_threads(opt.num_threads) + for (int y = 0; y < h; y++) + { + const __fp16* ptr = bottom_blob.row(y); + __fp16* outptr = top_blob.row<__fp16>(y); + const float* alphap = alpha; + + for (int x = 0; x < outw; x++) + { + int sx = xofs[x]; + const __fp16* Sp = ptr + sx; + float a0 = alphap[0]; + float a1 = alphap[1]; + *outptr++ = (__fp16)((float)Sp[0] * a0 + (float)Sp[1] * a1); + alphap += 2; + } + } + + delete[] buf; + } + + if (resize_type == 3) // bicubic + { + int* buf = new int[outw + outw * 4]; + + int* xofs = buf; + float* alpha = (float*)(buf + outw); + + cubic_coeffs(w, outw, xofs, alpha, align_corner); + + #pragma omp parallel for num_threads(opt.num_threads) + for (int y = 0; y < h; y++) + { + const __fp16* ptr = bottom_blob.row(y); + __fp16* outptr = top_blob.row<__fp16>(y); + const float* alphap = alpha; + + for (int x = 0; x < outw; x++) + { + int sx = xofs[x]; + const __fp16* Sp = ptr + sx; + float a0 = alphap[0]; + float a1 = alphap[1]; + float a2 = alphap[2]; + float a3 = alphap[3]; + *outptr++ = (__fp16)((float)Sp[-1] * a0 + (float)Sp[0] * a1 + (float)Sp[1] * a2 + (float)Sp[2] * a3); + alphap += 4; + } + } + + delete[] buf; + } + + return 0; + } + + if (outw == w && outh == h) + { + top_blob = bottom_blob; + return 0; + } + + top_blob.create(outw, outh, channels, elemsize, elempack, opt.blob_allocator); + if (top_blob.empty()) + return -100; + + if (elempack == packn) + { + if (resize_type == 1) // nearest + { + const size_t vl = __riscv_vsetvl_e16m1(packn); + + const float hs = output_height ? h / (float)outh : 1.f / height_scale; + const float ws = output_width ? 
w / (float)outw : 1.f / width_scale; + + #pragma omp parallel for num_threads(opt.num_threads) + for (int q = 0; q < channels; q++) + { + const Mat src = bottom_blob.channel(q); + Mat dst = top_blob.channel(q); + + for (int y = 0; y < outh; y++) + { + int in_y = std::min((int)(y * hs), (h - 1)); + + const __fp16* ptr = src.row(in_y); + __fp16* outptr = dst.row<__fp16>(y); + for (int x = 0; x < outw; x++) + { + int in_x = std::min((int)(x * ws), (w - 1)); + + vfloat16m1_t _p = __riscv_vle16_v_f16m1(ptr + in_x * packn, vl); + __riscv_vse16_v_f16m1(outptr, _p, vl); + + outptr += packn; + } + } + } + } + + if (resize_type == 2) // bilinear + { + int* buf = new int[outw + outh + outw * 2 + outh * 2]; + + int* xofs = buf; //new int[outw]; + int* yofs = buf + outw; //new int[outh]; + + float* alpha = (float*)(buf + outw + outh); //new float[outw * 2]; + float* beta = (float*)(buf + outw + outh + outw * 2); //new float[outh * 2]; + + linear_coeffs(w, outw, xofs, alpha, align_corner); + linear_coeffs(h, outh, yofs, beta, align_corner); + + #pragma omp parallel for num_threads(opt.num_threads) + for (int q = 0; q < channels; q++) + { + const Mat src = bottom_blob.channel(q); + Mat dst = top_blob.channel(q); + + resize_bilinear_image_packn_fp16s(src, dst, alpha, xofs, beta, yofs); + } + + delete[] buf; + } + + if (resize_type == 3) // bicubic + { + int* buf = new int[outw + outh + outw * 4 + outh * 4]; + + int* xofs = buf; //new int[outw]; + int* yofs = buf + outw; //new int[outh]; + + float* alpha = (float*)(buf + outw + outh); //new float[outw * 4]; + float* beta = (float*)(buf + outw + outh + outw * 4); //new float[outh * 4]; + + cubic_coeffs(w, outw, xofs, alpha, align_corner); + cubic_coeffs(h, outh, yofs, beta, align_corner); + + #pragma omp parallel for num_threads(opt.num_threads) + for (int q = 0; q < channels; q++) + { + const Mat src = bottom_blob.channel(q); + Mat dst = top_blob.channel(q); + + resize_bicubic_image_packn_fp16s(src, dst, alpha, xofs, beta, yofs); + } + + delete[] buf; + } + + return 0; + } + + if (resize_type == 1) // nearest + { + const float hs = output_height ? h / (float)outh : 1.f / height_scale; + const float ws = output_width ? 
+
+        #pragma omp parallel for num_threads(opt.num_threads)
+        for (int q = 0; q < channels; q++)
+        {
+            const Mat src = bottom_blob.channel(q);
+            Mat dst = top_blob.channel(q);
+
+            for (int y = 0; y < outh; y++)
+            {
+                int in_y = std::min((int)(y * hs), (h - 1));
+
+                const __fp16* ptr = src.row<const __fp16>(in_y);
+                __fp16* outptr = dst.row<__fp16>(y);
+                for (int x = 0; x < outw; x++)
+                {
+                    int in_x = std::min((int)(x * ws), (w - 1));
+                    *outptr++ = ptr[in_x];
+                }
+            }
+        }
+    }
+
+    if (resize_type == 2) // bilinear
+    {
+        int* buf = new int[outw + outh + outw * 2 + outh * 2];
+
+        int* xofs = buf; //new int[outw];
+        int* yofs = buf + outw; //new int[outh];
+
+        float* alpha = (float*)(buf + outw + outh); //new float[outw * 2];
+        float* beta = (float*)(buf + outw + outh + outw * 2); //new float[outh * 2];
+
+        linear_coeffs(w, outw, xofs, alpha, align_corner);
+        linear_coeffs(h, outh, yofs, beta, align_corner);
+
+        #pragma omp parallel for num_threads(opt.num_threads)
+        for (int q = 0; q < channels; q++)
+        {
+            const Mat src = bottom_blob.channel(q);
+            Mat dst = top_blob.channel(q);
+
+            resize_bilinear_image_fp16s(src, dst, alpha, xofs, beta, yofs);
+        }
+
+        delete[] buf;
+    }
+
+    if (resize_type == 3) // bicubic
+    {
+        int* buf = new int[outw + outh + outw * 4 + outh * 4];
+
+        int* xofs = buf; //new int[outw];
+        int* yofs = buf + outw; //new int[outh];
+
+        float* alpha = (float*)(buf + outw + outh); //new float[outw * 4];
+        float* beta = (float*)(buf + outw + outh + outw * 4); //new float[outh * 4];
+
+        cubic_coeffs(w, outw, xofs, alpha, align_corner);
+        cubic_coeffs(h, outh, yofs, beta, align_corner);
+
+        #pragma omp parallel for num_threads(opt.num_threads)
+        for (int q = 0; q < channels; q++)
+        {
+            const Mat src = bottom_blob.channel(q);
+            Mat dst = top_blob.channel(q);
+
+            resize_bicubic_image_fp16s(src, dst, alpha, xofs, beta, yofs);
+        }
+
+        delete[] buf;
+    }
+
+    return 0;
+}
+
+int Interp_riscv::forward_fp16sa(const std::vector<Mat>& bottom_blobs, std::vector<Mat>& top_blobs, const Option& opt) const
+{
+    const int packn = csrr_vlenb() / 2;
+
+    const Mat& bottom_blob = bottom_blobs[0];
+    const Mat& reference_blob = bottom_blobs[1];
+    Mat& top_blob = top_blobs[0];
+
+    int h = bottom_blob.h;
+    int w = bottom_blob.w;
+    int channels = bottom_blob.c;
+    int dims = bottom_blob.dims;
+    size_t elemsize = bottom_blob.elemsize;
+    int elempack = bottom_blob.elempack;
+
+    int outw = reference_blob.w;
+    int outh = reference_blob.h;
+
+    if (dims == 1 || resize_type == 1) // nearest
+    {
+        return forward_fp16s(bottom_blobs, top_blobs, opt);
+    }
+
+    if (dims == 2)
+    {
+        if (outw == w)
+        {
+            top_blob = bottom_blob;
+            return 0;
+        }
+
+        top_blob.create(outw, h, elemsize, elempack, opt.blob_allocator);
+        if (top_blob.empty())
+            return -100;
+
+        if (elempack == packn)
+        {
+            if (resize_type == 2) // bilinear
+            {
+                const size_t vl = __riscv_vsetvl_e16m1(packn);
+
+                int* buf = new int[outw + outw * packn];
+
+                int* xofs = buf;
+                __fp16* alpha = (__fp16*)(buf + outw);
+
+                linear_coeffs_fp16sa(w, outw, xofs, alpha, align_corner);
+
+                #pragma omp parallel for num_threads(opt.num_threads)
+                for (int y = 0; y < h; y++)
+                {
+                    const __fp16* ptr = bottom_blob.row<const __fp16>(y);
+                    __fp16* outptr = top_blob.row<__fp16>(y);
+                    const __fp16* alphap = alpha;
+
+                    for (int x = 0; x < outw; x++)
+                    {
+                        int sx = xofs[x] * packn;
+                        const __fp16* Sp = ptr + sx;
+
+                        vfloat16m1_t _S0 = __riscv_vle16_v_f16m1(Sp, vl);
+                        vfloat16m1_t _S1 = __riscv_vle16_v_f16m1(Sp + packn, vl);
+                        vfloat16m1_t _p = __riscv_vfmacc_vf_f16m1(__riscv_vfmul_vf_f16m1(_S0, alphap[0], vl), alphap[1], _S1, vl);
+
+                        __riscv_vse16_v_f16m1(outptr, _p, vl);
+
+                        alphap += 2;
+                        outptr += packn;
+                    }
+                }
+
+                delete[] buf;
+            }
+
+            if (resize_type == 3) // bicubic
+            {
+                const size_t vl = __riscv_vsetvl_e16m1(packn);
+
+                int* buf = new int[outw + outw * packn];
+
+                int* xofs = buf;
+                __fp16* alpha = (__fp16*)(buf + outw);
+
+                cubic_coeffs_fp16sa(w, outw, xofs, alpha, align_corner);
+
+                #pragma omp parallel for num_threads(opt.num_threads)
+                for (int y = 0; y < h; y++)
+                {
+                    const __fp16* ptr = bottom_blob.row<const __fp16>(y);
+                    __fp16* outptr = top_blob.row<__fp16>(y);
+                    const __fp16* alphap = alpha;
+
+                    for (int x = 0; x < outw; x++)
+                    {
+                        int sx = xofs[x] * packn;
+                        const __fp16* Sp = ptr + sx;
+
+                        vfloat16m1_t _S0 = __riscv_vle16_v_f16m1(Sp - packn, vl);
+                        vfloat16m1_t _S1 = __riscv_vle16_v_f16m1(Sp, vl);
+                        vfloat16m1_t _S2 = __riscv_vle16_v_f16m1(Sp + packn, vl);
+                        vfloat16m1_t _S3 = __riscv_vle16_v_f16m1(Sp + packn * 2, vl);
+                        vfloat16m1_t _p = __riscv_vfmacc_vf_f16m1(__riscv_vfmacc_vf_f16m1(__riscv_vfmacc_vf_f16m1(__riscv_vfmul_vf_f16m1(_S0, alphap[0], vl), alphap[1], _S1, vl), alphap[2], _S2, vl), alphap[3], _S3, vl);
+
+                        __riscv_vse16_v_f16m1(outptr, _p, vl);
+
+                        alphap += 4;
+                        outptr += packn;
+                    }
+                }
+
+                delete[] buf;
+            }
+
+            return 0;
+        }
+
+        if (resize_type == 2) // bilinear
+        {
+            int* buf = new int[outw + outw * 2];
+
+            int* xofs = buf;
+            __fp16* alpha = (__fp16*)(buf + outw);
+
+            linear_coeffs_fp16sa(w, outw, xofs, alpha, align_corner);
+
+            #pragma omp parallel for num_threads(opt.num_threads)
+            for (int y = 0; y < h; y++)
+            {
+                const __fp16* ptr = bottom_blob.row<const __fp16>(y);
+                __fp16* outptr = top_blob.row<__fp16>(y);
+                const __fp16* alphap = alpha;
+
+                for (int x = 0; x < outw; x++)
+                {
+                    int sx = xofs[x];
+                    const __fp16* Sp = ptr + sx;
+                    __fp16 a0 = alphap[0];
+                    __fp16 a1 = alphap[1];
+                    *outptr++ = Sp[0] * a0 + Sp[1] * a1;
+                    alphap += 2;
+                }
+            }
+
+            delete[] buf;
+        }
+
+        if (resize_type == 3) // bicubic
+        {
+            int* buf = new int[outw + outw * 4];
+
+            int* xofs = buf;
+            __fp16* alpha = (__fp16*)(buf + outw);
+
+            cubic_coeffs_fp16sa(w, outw, xofs, alpha, align_corner);
+
+            #pragma omp parallel for num_threads(opt.num_threads)
+            for (int y = 0; y < h; y++)
+            {
+                const __fp16* ptr = bottom_blob.row<const __fp16>(y);
+                __fp16* outptr = top_blob.row<__fp16>(y);
+                const __fp16* alphap = alpha;
+
+                for (int x = 0; x < outw; x++)
+                {
+                    int sx = xofs[x];
+                    const __fp16* Sp = ptr + sx;
+                    __fp16 a0 = alphap[0];
+                    __fp16 a1 = alphap[1];
+                    __fp16 a2 = alphap[2];
+                    __fp16 a3 = alphap[3];
+                    *outptr++ = Sp[-1] * a0 + Sp[0] * a1 + Sp[1] * a2 + Sp[2] * a3;
+                    alphap += 4;
+                }
+            }
+
+            delete[] buf;
+        }
+
+        return 0;
+    }
+
+    if (outw == w && outh == h)
+    {
+        top_blob = bottom_blob;
+        return 0;
+    }
+
+    top_blob.create(outw, outh, channels, elemsize, elempack, opt.blob_allocator);
+    if (top_blob.empty())
+        return -100;
+
+    if (elempack == packn)
+    {
+        if (resize_type == 2) // bilinear
+        {
+            int* buf = new int[outw + outh + outw * 2 + outh * 2];
+
+            int* xofs = buf; //new int[outw];
+            int* yofs = buf + outw; //new int[outh];
+
+            __fp16* alpha = (__fp16*)(buf + outw + outh); //new __fp16[outw * 2];
+            __fp16* beta = (__fp16*)(buf + outw + outh + outw * 2); //new __fp16[outh * 2];
+
+            linear_coeffs_fp16sa(w, outw, xofs, alpha, align_corner);
+            linear_coeffs_fp16sa(h, outh, yofs, beta, align_corner);
+
+            #pragma omp parallel for num_threads(opt.num_threads)
+            for (int q = 0; q < channels; q++)
+            {
+                const Mat src = bottom_blob.channel(q);
+                Mat dst = top_blob.channel(q);
+
+                resize_bilinear_image_packn_fp16sa(src, dst, alpha, xofs, beta, yofs);
+            }
+
+            delete[] buf;
+        }
+
+        if (resize_type == 3) // bicubic
+        {
+            int* buf = new int[outw + outh + outw * 4 + outh * 4];
+
+            int* xofs = buf; //new int[outw];
+            int* yofs = buf + outw; //new int[outh];
+
+            __fp16* alpha = (__fp16*)(buf + outw + outh); //new __fp16[outw * 4];
+            __fp16* beta = (__fp16*)(buf + outw + outh + outw * 4); //new __fp16[outh * 4];
+
+            cubic_coeffs_fp16sa(w, outw, xofs, alpha, align_corner);
+            cubic_coeffs_fp16sa(h, outh, yofs, beta, align_corner);
+
+            #pragma omp parallel for num_threads(opt.num_threads)
+            for (int q = 0; q < channels; q++)
+            {
+                const Mat src = bottom_blob.channel(q);
+                Mat dst = top_blob.channel(q);
+
+                resize_bicubic_image_packn_fp16sa(src, dst, alpha, xofs, beta, yofs);
+            }
+
+            delete[] buf;
+        }
+
+        return 0;
+    }
+
+    if (resize_type == 2) // bilinear
+    {
+        int* buf = new int[outw + outh + outw * 2 + outh * 2];
+
+        int* xofs = buf; //new int[outw];
+        int* yofs = buf + outw; //new int[outh];
+
+        __fp16* alpha = (__fp16*)(buf + outw + outh); //new __fp16[outw * 2];
+        __fp16* beta = (__fp16*)(buf + outw + outh + outw * 2); //new __fp16[outh * 2];
+
+        linear_coeffs_fp16sa(w, outw, xofs, alpha, align_corner);
+        linear_coeffs_fp16sa(h, outh, yofs, beta, align_corner);
+
+        #pragma omp parallel for num_threads(opt.num_threads)
+        for (int q = 0; q < channels; q++)
+        {
+            const Mat src = bottom_blob.channel(q);
+            Mat dst = top_blob.channel(q);
+
+            resize_bilinear_image_fp16sa(src, dst, alpha, xofs, beta, yofs);
+        }
+
+        delete[] buf;
+    }
+
+    if (resize_type == 3) // bicubic
+    {
+        int* buf = new int[outw + outh + outw * 4 + outh * 4];
+
+        int* xofs = buf; //new int[outw];
+        int* yofs = buf + outw; //new int[outh];
+
+        __fp16* alpha = (__fp16*)(buf + outw + outh); //new __fp16[outw * 4];
+        __fp16* beta = (__fp16*)(buf + outw + outh + outw * 4); //new __fp16[outh * 4];
+
+        cubic_coeffs_fp16sa(w, outw, xofs, alpha, align_corner);
+        cubic_coeffs_fp16sa(h, outh, yofs, beta, align_corner);
+
+        #pragma omp parallel for num_threads(opt.num_threads)
+        for (int q = 0; q < channels; q++)
+        {
+            const Mat src = bottom_blob.channel(q);
+            Mat dst = top_blob.channel(q);
+
+            resize_bicubic_image_fp16sa(src, dst, alpha, xofs, beta, yofs);
+        }
+
+        delete[] buf;
+    }
+
+    return 0;
+}
+#endif // __riscv_zvfh
+
+} // namespace ncnn
diff --git a/src/layer/riscv/pooling_riscv.cpp b/src/layer/riscv/pooling_riscv.cpp
index 0f0c8e3d2da5..9091e3e8b2ad 100644
--- a/src/layer/riscv/pooling_riscv.cpp
+++ b/src/layer/riscv/pooling_riscv.cpp
@@ -28,8 +28,8 @@ Pooling_riscv::Pooling_riscv()
 {
 #if __riscv_vector
     support_packing = true;
-#if __riscv_zvfh
-    support_fp16_storage = true;
+#if NCNN_ZVFH
+    support_fp16_storage = cpu_support_riscv_zvfh();
 #endif
 #endif // __riscv_vector
 }
@@ -55,9 +55,9 @@ int Pooling_riscv::forward(const Mat& bottom_blob, Mat& top_blob, const Option&
         return Pooling::forward(bottom_blob, top_blob, opt);
     }
 
+#if NCNN_ZVFH
     int elembits = bottom_blob.elembits();
 
-#if __riscv_vector && __riscv_zvfh
     if (opt.use_fp16_storage && elembits == 16)
     {
         if (opt.use_fp16_arithmetic)
@@ -308,655 +308,4 @@ int Pooling_riscv::forward(const Mat& bottom_blob, Mat& top_blob, const Option&
     return Pooling::forward(bottom_blob, top_blob, opt);
 }
 
-#if __riscv_vector && __riscv_zvfh
-int Pooling_riscv::forward_fp16s(const Mat& bottom_blob, Mat& top_blob, const Option& opt) const
-{
-    // max value in NxN window
-    // avg value in NxN window
-
-    const int packn = csrr_vlenb() / 2;
-    const size_t vl = __riscv_vsetvl_e16m1(packn);
-
-    int w = bottom_blob.w;
-    int h = bottom_blob.h;
-    int channels = bottom_blob.c;
-    size_t elemsize = bottom_blob.elemsize;
-    int elempack = bottom_blob.elempack;
-
-    // NCNN_LOGE("Pooling input %d x %d pad = %d %d %d %d ksize=%d %d stride=%d %d", w, h, pad_left, pad_right, pad_top, pad_bottom, kernel_w, kernel_h, stride_w, stride_h);
-
-    if (global_pooling)
-    {
-        top_blob.create(channels, elemsize, elempack, opt.blob_allocator);
-        if (top_blob.empty())
-            return -100;
-
-        int size = w * h;
-
-        if (pooling_type == PoolMethod_MAX)
-        {
-            if (elempack == packn)
-            {
-                #pragma omp parallel for num_threads(opt.num_threads)
-                for (int q = 0; q < channels; q++)
-                {
-                    const __fp16* ptr = bottom_blob.channel(q);
-
-                    vfloat16m1_t _max = __riscv_vfmv_v_f_f16m1((__fp16)-FLT_MAX, vl);
-                    for (int i = 0; i < size; i++)
-                    {
-                        vfloat16m1_t _val = __riscv_vle16_v_f16m1(ptr, vl);
-                        _max = __riscv_vfmax_vv_f16m1(_max, _val, vl);
-                        ptr += packn;
-                    }
-
-                    __fp16* outptr = top_blob;
-                    __riscv_vse16_v_f16m1(outptr + q * packn, _max, vl);
-                }
-            }
-
-            if (elempack == 1)
-            {
-                #pragma omp parallel for num_threads(opt.num_threads)
-                for (int q = 0; q < channels; q++)
-                {
-                    const __fp16* ptr = bottom_blob.channel(q);
-
-                    __fp16 max = (__fp16)-FLT_MAX;
-                    for (int i = 0; i < size; i++)
-                    {
-                        max = std::max(max, ptr[i]);
-                    }
-
-                    __fp16* outptr = top_blob;
-                    outptr[q] = max;
-                }
-            }
-        }
-
-        if (pooling_type == PoolMethod_AVE)
-        {
-            if (elempack == packn)
-            {
-                #pragma omp parallel for num_threads(opt.num_threads)
-                for (int q = 0; q < channels; q++)
-                {
-                    const __fp16* ptr = bottom_blob.channel(q);
-
-                    vfloat32m2_t _sum = __riscv_vfmv_v_f_f32m2(0.f, vl);
-                    for (int i = 0; i < size; i++)
-                    {
-                        vfloat32m2_t _val = __riscv_vfwcvt_f_f_v_f32m2(__riscv_vle16_v_f16m1(ptr, vl), vl);
-                        _sum = __riscv_vfadd_vv_f32m2(_sum, _val, vl);
-                        ptr += packn;
-                    }
-
-                    vfloat32m2_t _avg = __riscv_vfmul_vf_f32m2(_sum, 1.f / size, vl);
-
-                    __fp16* outptr = top_blob;
-                    __riscv_vse16_v_f16m1(outptr + q * packn, __riscv_vfncvt_f_f_w_f16m1(_avg, vl), vl);
-                }
-            }
-
-            if (elempack == 1)
-            {
-                #pragma omp parallel for num_threads(opt.num_threads)
-                for (int q = 0; q < channels; q++)
-                {
-                    const __fp16* ptr = bottom_blob.channel(q);
-
-                    float sum = 0.f;
-                    for (int i = 0; i < size; i++)
-                    {
-                        sum += (float)ptr[i];
-                    }
-
-                    __fp16* outptr = top_blob;
-                    outptr[q] = (__fp16)(sum / size);
-                }
-            }
-        }
-
-        return 0;
-    }
-
-    Mat bottom_blob_bordered;
-    make_padding(bottom_blob, bottom_blob_bordered, opt);
-    if (bottom_blob_bordered.empty())
-        return -100;
-
-    w = bottom_blob_bordered.w;
-    h = bottom_blob_bordered.h;
-
-    int outw = (w - kernel_w) / stride_w + 1;
-    int outh = (h - kernel_h) / stride_h + 1;
-
-    top_blob.create(outw, outh, channels, elemsize, elempack, opt.blob_allocator);
-    if (top_blob.empty())
-        return -100;
-
-    const int maxk = kernel_w * kernel_h;
-
-    // kernel offsets
-    std::vector<int> _space_ofs(maxk);
-    int* space_ofs = &_space_ofs[0];
-    {
-        int p1 = 0;
-        int p2 = 0;
-        int gap = w - kernel_w;
-        for (int i = 0; i < kernel_h; i++)
-        {
-            for (int j = 0; j < kernel_w; j++)
-            {
-                space_ofs[p1] = p2;
-                p1++;
-                p2++;
-            }
-            p2 += gap;
-        }
-    }
-
-    if (pooling_type == PoolMethod_MAX)
-    {
-        if (elempack == packn)
-        {
-            #pragma omp parallel for num_threads(opt.num_threads)
-            for (int q = 0; q < channels; q++)
-            {
-                const Mat m = bottom_blob_bordered.channel(q);
-                __fp16* outptr = top_blob.channel(q);
-
-                for (int i = 0; i < outh; i++)
-                {
-                    for (int j = 0; j < outw; j++)
-                    {
-                        const __fp16* sptr = m.row<const __fp16>(i * stride_h) + j * stride_w * packn;
-
-                        vfloat16m1_t _max = __riscv_vfmv_v_f_f16m1((__fp16)-FLT_MAX, vl);
-
-                        for (int k = 0; k < maxk; k++)
-                        {
-                            vfloat16m1_t _val = __riscv_vle16_v_f16m1(sptr + space_ofs[k] * packn, vl);
-                            _max = __riscv_vfmax_vv_f16m1(_max, _val, vl);
-                        }
-
-                        __riscv_vse16_v_f16m1(outptr + j * packn, _max, vl);
-                    }
-
-                    outptr += outw * packn;
-                }
-            }
-        }
-
-        if (elempack == 1)
-        {
-            #pragma omp parallel for num_threads(opt.num_threads)
-            for (int q = 0; q < channels; q++)
-            {
-                const Mat m = bottom_blob_bordered.channel(q);
-                __fp16* outptr = top_blob.channel(q);
-
-                for (int i = 0; i < outh; i++)
-                {
-                    for (int j = 0; j < outw; j++)
-                    {
-                        const __fp16* sptr = m.row<const __fp16>(i * stride_h) + j * stride_w;
-
-                        __fp16 max = (__fp16)-FLT_MAX;
-
-                        for (int k = 0; k < maxk; k++)
-                        {
-                            __fp16 val = sptr[space_ofs[k]];
-                            max = std::max(max, val);
-                        }
-
-                        outptr[j] = max;
-                    }
-
-                    outptr += outw;
-                }
-            }
-        }
-    }
-
-    if (pooling_type == PoolMethod_AVE)
-    {
-        if (avgpool_count_include_pad == 0)
-        {
-            int wtailpad = 0;
-            int htailpad = 0;
-
-            if (pad_mode == 0) // full padding
-            {
-                wtailpad = bottom_blob_bordered.w - bottom_blob.w - pad_left - pad_right;
-                htailpad = bottom_blob_bordered.h - bottom_blob.h - pad_top - pad_bottom;
-            }
-
-            if (elempack == packn)
-            {
-                #pragma omp parallel for num_threads(opt.num_threads)
-                for (int q = 0; q < channels; q++)
-                {
-                    const Mat m = bottom_blob_bordered.channel(q);
-                    __fp16* outptr = top_blob.channel(q);
-
-                    for (int i = 0; i < outh; i++)
-                    {
-                        int sy0 = i * stride_h;
-
-                        for (int j = 0; j < outw; j++)
-                        {
-                            int sx0 = j * stride_w;
-
-                            vfloat32m2_t _sum = __riscv_vfmv_v_f_f32m2(0.f, vl);
-                            int area = 0;
-
-                            for (int ki = 0; ki < kernel_h; ki++)
-                            {
-                                int sy = sy0 + ki;
-
-                                if (sy < pad_top)
-                                    continue;
-
-                                if (sy >= h - pad_bottom - htailpad)
-                                    break;
-
-                                for (int kj = 0; kj < kernel_w; kj++)
-                                {
-                                    int sx = sx0 + kj;
-
-                                    if (sx < pad_left)
-                                        continue;
-
-                                    if (sx >= w - pad_right - wtailpad)
-                                        break;
-
-                                    vfloat32m2_t _val = __riscv_vfwcvt_f_f_v_f32m2(__riscv_vle16_v_f16m1(m.row<const __fp16>(sy) + sx * packn, vl), vl);
-                                    _sum = __riscv_vfadd_vv_f32m2(_sum, _val, vl);
-                                    area += 1;
-                                }
-                            }
-
-                            vfloat32m2_t _avg = __riscv_vfmul_vf_f32m2(_sum, 1.f / area, vl);
-                            __riscv_vse16_v_f16m1(outptr + j * packn, __riscv_vfncvt_f_f_w_f16m1(_avg, vl), vl);
-                        }
-
-                        outptr += outw * packn;
-                    }
-                }
-            }
-
-            if (elempack == 1)
-            {
-                #pragma omp parallel for num_threads(opt.num_threads)
-                for (int q = 0; q < channels; q++)
-                {
-                    const Mat m = bottom_blob_bordered.channel(q);
-                    __fp16* outptr = top_blob.channel(q);
-
-                    for (int i = 0; i < outh; i++)
-                    {
-                        int sy0 = i * stride_h;
-
-                        for (int j = 0; j < outw; j++)
-                        {
-                            int sx0 = j * stride_w;
-
-                            float sum = 0.f;
-                            int area = 0;
-
-                            for (int ki = 0; ki < kernel_h; ki++)
-                            {
-                                int sy = sy0 + ki;
-
-                                if (sy < pad_top)
-                                    continue;
-
-                                if (sy >= h - pad_bottom - htailpad)
-                                    break;
-
-                                for (int kj = 0; kj < kernel_w; kj++)
-                                {
-                                    int sx = sx0 + kj;
-
-                                    if (sx < pad_left)
-                                        continue;
-
-                                    if (sx >= w - pad_right - wtailpad)
-                                        break;
-
-                                    float val = (float)(m.row<const __fp16>(sy)[sx]);
-                                    sum += val;
-                                    area += 1;
-                                }
-                            }
-
-                            outptr[j] = (__fp16)(sum / area);
-                        }
-
-                        outptr += outw;
-                    }
-                }
-            }
-        }
-
-        if (avgpool_count_include_pad == 1)
-        {
-            if (elempack == packn)
-            {
-                #pragma omp parallel for num_threads(opt.num_threads)
-                for (int q = 0; q < channels; q++)
-                {
-                    const Mat m = bottom_blob_bordered.channel(q);
-                    __fp16* outptr = top_blob.channel(q);
-
-                    const float inv_maxk = 1.f / maxk;
-
-                    for (int i = 0; i < outh; i++)
-                    {
-                        for (int j = 0; j < outw; j++)
-                        {
-                            const __fp16* sptr = m.row<const __fp16>(i * stride_h) + j * stride_w * packn;
-
-                            vfloat32m2_t _sum = __riscv_vfmv_v_f_f32m2(0.f, vl);
-
-                            for (int k = 0; k < maxk; k++)
-                            {
-                                vfloat32m2_t _val = __riscv_vfwcvt_f_f_v_f32m2(__riscv_vle16_v_f16m1(sptr + space_ofs[k] * packn, vl), vl);
-                                _sum = __riscv_vfadd_vv_f32m2(_sum, _val, vl);
-                            }
-
-                            vfloat32m2_t _avg = __riscv_vfmul_vf_f32m2(_sum, inv_maxk, vl);
-                            __riscv_vse16_v_f16m1(outptr + j * packn, __riscv_vfncvt_f_f_w_f16m1(_avg, vl), vl);
-                        }
-
-                        outptr += outw * packn;
-                    }
-                }
-            }
-
-            if (elempack == 1)
-            {
-                #pragma omp parallel for num_threads(opt.num_threads)
-                for (int q = 0; q < channels; q++)
-                {
-                    const Mat m = bottom_blob_bordered.channel(q);
-                    __fp16* outptr = top_blob.channel(q);
-
-                    for (int i = 0; i < outh; i++)
-                    {
-                        for (int j = 0; j < outw; j++)
-                        {
-                            const __fp16* sptr = m.row<const __fp16>(i * stride_h) + j * stride_w;
-
-                            float sum = 0.f;
-
-                            for (int k = 0; k < maxk; k++)
-                            {
-                                float val = (float)(sptr[space_ofs[k]]);
-                                sum += val;
-                            }
-
-                            outptr[j] = (__fp16)(sum / maxk);
-                        }
-
-                        outptr += outw;
-                    }
-                }
-            }
-        }
-    }
-
-    return 0;
-}
-
-int Pooling_riscv::forward_fp16sa(const Mat& bottom_blob, Mat& top_blob, const Option& opt) const
-{
-    // max value in NxN window
-    // avg value in NxN window
-
-    if (pooling_type == PoolMethod_MAX || global_pooling)
-    {
-        return forward_fp16s(bottom_blob, top_blob, opt);
-    }
-
-    const int packn = csrr_vlenb() / 2;
-    const size_t vl = __riscv_vsetvl_e16m1(packn);
-
-    int w = bottom_blob.w;
-    int h = bottom_blob.h;
-    int channels = bottom_blob.c;
-    size_t elemsize = bottom_blob.elemsize;
-    int elempack = bottom_blob.elempack;
-
-    // NCNN_LOGE("Pooling input %d x %d pad = %d %d %d %d ksize=%d %d stride=%d %d", w, h, pad_left, pad_right, pad_top, pad_bottom, kernel_w, kernel_h, stride_w, stride_h);
-
-    Mat bottom_blob_bordered;
-    make_padding(bottom_blob, bottom_blob_bordered, opt);
-    if (bottom_blob_bordered.empty())
-        return -100;
-
-    w = bottom_blob_bordered.w;
-    h = bottom_blob_bordered.h;
-
-    int outw = (w - kernel_w) / stride_w + 1;
-    int outh = (h - kernel_h) / stride_h + 1;
-
-    top_blob.create(outw, outh, channels, elemsize, elempack, opt.blob_allocator);
-    if (top_blob.empty())
-        return -100;
-
-    const int maxk = kernel_w * kernel_h;
-
-    // kernel offsets
-    std::vector<int> _space_ofs(maxk);
-    int* space_ofs = &_space_ofs[0];
-    {
-        int p1 = 0;
-        int p2 = 0;
-        int gap = w - kernel_w;
-        for (int i = 0; i < kernel_h; i++)
-        {
-            for (int j = 0; j < kernel_w; j++)
-            {
-                space_ofs[p1] = p2;
-                p1++;
-                p2++;
-            }
-            p2 += gap;
-        }
-    }
-
-    if (pooling_type == PoolMethod_AVE)
-    {
-        if (avgpool_count_include_pad == 0)
-        {
-            int wtailpad = 0;
-            int htailpad = 0;
-
-            if (pad_mode == 0) // full padding
-            {
-                wtailpad = bottom_blob_bordered.w - bottom_blob.w - pad_left - pad_right;
-                htailpad = bottom_blob_bordered.h - bottom_blob.h - pad_top - pad_bottom;
-            }
-
-            if (elempack == packn)
-            {
-                #pragma omp parallel for num_threads(opt.num_threads)
-                for (int q = 0; q < channels; q++)
-                {
-                    const Mat m = bottom_blob_bordered.channel(q);
-                    __fp16* outptr = top_blob.channel(q);
-
-                    for (int i = 0; i < outh; i++)
-                    {
-                        int sy0 = i * stride_h;
-
-                        for (int j = 0; j < outw; j++)
-                        {
-                            int sx0 = j * stride_w;
-
-                            vfloat16m1_t _sum = __riscv_vfmv_v_f_f16m1(0.f, vl);
-                            int area = 0;
-
-                            for (int ki = 0; ki < kernel_h; ki++)
-                            {
-                                int sy = sy0 + ki;
-
-                                if (sy < pad_top)
-                                    continue;
-
-                                if (sy >= h - pad_bottom - htailpad)
-                                    break;
-
-                                for (int kj = 0; kj < kernel_w; kj++)
-                                {
-                                    int sx = sx0 + kj;
-
-                                    if (sx < pad_left)
-                                        continue;
-
-                                    if (sx >= w - pad_right - wtailpad)
-                                        break;
-
-                                    vfloat16m1_t _val = __riscv_vle16_v_f16m1(m.row<const __fp16>(sy) + sx * packn, vl);
-                                    _sum = __riscv_vfadd_vv_f16m1(_sum, _val, vl);
-                                    area += 1;
-                                }
-                            }
-
-                            vfloat16m1_t _avg = __riscv_vfmul_vf_f16m1(_sum, (__fp16)(1.f / area), vl);
-                            __riscv_vse16_v_f16m1(outptr + j * packn, _avg, vl);
-                        }
-
-                        outptr += outw * packn;
-                    }
-                }
-            }
-
-            if (elempack == 1)
-            {
-                #pragma omp parallel for num_threads(opt.num_threads)
-                for (int q = 0; q < channels; q++)
-                {
-                    const Mat m = bottom_blob_bordered.channel(q);
-                    __fp16* outptr = top_blob.channel(q);
-
-                    for (int i = 0; i < outh; i++)
-                    {
-                        int sy0 = i * stride_h;
-
-                        for (int j = 0; j < outw; j++)
-                        {
-                            int sx0 = j * stride_w;
-
-                            __fp16 sum = (__fp16)0.f;
-                            int area = 0;
-
-                            for (int ki = 0; ki < kernel_h; ki++)
-                            {
-                                int sy = sy0 + ki;
-
-                                if (sy < pad_top)
-                                    continue;
-
-                                if (sy >= h - pad_bottom - htailpad)
-                                    break;
-
-                                for (int kj = 0; kj < kernel_w; kj++)
-                                {
-                                    int sx = sx0 + kj;
-
-                                    if (sx < pad_left)
-                                        continue;
-
-                                    if (sx >= w - pad_right - wtailpad)
-                                        break;
-
-                                    __fp16 val = m.row<const __fp16>(sy)[sx];
-                                    sum += val;
-                                    area += 1;
-                                }
-                            }
-
-                            outptr[j] = sum / area;
-                        }
-
-                        outptr += outw;
-                    }
-                }
-            }
-        }
-
-        if (avgpool_count_include_pad == 1)
-        {
-            if (elempack == packn)
-            {
-                #pragma omp parallel for num_threads(opt.num_threads)
-                for (int q = 0; q < channels; q++)
-                {
-                    const Mat m = bottom_blob_bordered.channel(q);
-                    __fp16* outptr = top_blob.channel(q);
-
-                    const __fp16 inv_maxk = (__fp16)(1.f / maxk);
-
-                    for (int i = 0; i < outh; i++)
-                    {
-                        for (int j = 0; j < outw; j++)
-                        {
-                            const __fp16* sptr = m.row<const __fp16>(i * stride_h) + j * stride_w * packn;
-
-                            vfloat16m1_t _sum = __riscv_vfmv_v_f_f16m1(0.f, vl);
-
-                            for (int k = 0; k < maxk; k++)
-                            {
-                                vfloat16m1_t _val = __riscv_vle16_v_f16m1(sptr + space_ofs[k] * packn, vl);
-                                _sum = __riscv_vfadd_vv_f16m1(_sum, _val, vl);
-                            }
-
-                            vfloat16m1_t _avg = __riscv_vfmul_vf_f16m1(_sum, inv_maxk, vl);
-                            __riscv_vse16_v_f16m1(outptr + j * packn, _avg, vl);
-                        }
-
-                        outptr += outw * packn;
-                    }
-                }
-            }
-
-            if (elempack == 1)
-            {
-                #pragma omp parallel for num_threads(opt.num_threads)
-                for (int q = 0; q < channels; q++)
-                {
-                    const Mat m = bottom_blob_bordered.channel(q);
-                    __fp16* outptr = top_blob.channel(q);
-
-                    for (int i = 0; i < outh; i++)
-                    {
-                        for (int j = 0; j < outw; j++)
-                        {
-                            const __fp16* sptr = m.row<const __fp16>(i * stride_h) + j * stride_w;
-
-                            __fp16 sum = (__fp16)0.f;
-
-                            for (int k = 0; k < maxk; k++)
-                            {
-                                __fp16 val = sptr[space_ofs[k]];
-                                sum += val;
-                            }
-
-                            outptr[j] = sum / maxk;
-                        }
-
-                        outptr += outw;
-                    }
-                }
-            }
-        }
-    }
-
-    return 0;
-}
-#endif // __riscv_vector && __riscv_zvfh
-
 } // namespace ncnn
diff --git a/src/layer/riscv/pooling_riscv.h b/src/layer/riscv/pooling_riscv.h
index 9521c7eef91e..4c14577b3ca6 100644
--- a/src/layer/riscv/pooling_riscv.h
+++ b/src/layer/riscv/pooling_riscv.h
@@ -28,7 +28,7 @@ class Pooling_riscv : public Pooling
     virtual int forward(const Mat& bottom_blob, Mat& top_blob, const Option& opt) const;
 
 protected:
-#if __riscv_vector && __riscv_zvfh
+#if NCNN_ZVFH
     int forward_fp16s(const Mat& bottom_blob, Mat& top_blob, const Option& opt) const;
    int forward_fp16sa(const Mat& bottom_blob, Mat& top_blob, const Option& opt) const;
 #endif
diff --git a/src/layer/riscv/pooling_riscv_zvfh.cpp b/src/layer/riscv/pooling_riscv_zvfh.cpp
new file mode 100644
index 000000000000..214c56975a0d
--- /dev/null
+++ b/src/layer/riscv/pooling_riscv_zvfh.cpp
@@ -0,0 +1,678 @@
+// Tencent is pleased to support the open source community by making ncnn available.
+//
+// Copyright (C) 2024 THL A29 Limited, a Tencent company. All rights reserved.
+//
+// Licensed under the BSD 3-Clause License (the "License"); you may not use this file except
+// in compliance with the License. You may obtain a copy of the License at
+//
+// https://opensource.org/licenses/BSD-3-Clause
+//
+// Unless required by applicable law or agreed to in writing, software distributed
+// under the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR
+// CONDITIONS OF ANY KIND, either express or implied. See the License for the
+// specific language governing permissions and limitations under the License.
+
+#include "pooling_riscv.h"
+
+#include <float.h>
+
+#if __riscv_vector
+#include <riscv_vector.h>
+#endif // __riscv_vector
+
+#include "riscv_usability.h"
+
+namespace ncnn {
+
+#if __riscv_zvfh
+int Pooling_riscv::forward_fp16s(const Mat& bottom_blob, Mat& top_blob, const Option& opt) const
+{
+    // max value in NxN window
+    // avg value in NxN window
+
+    const int packn = csrr_vlenb() / 2;
+    const size_t vl = __riscv_vsetvl_e16m1(packn);
+
+    int w = bottom_blob.w;
+    int h = bottom_blob.h;
+    int channels = bottom_blob.c;
+    size_t elemsize = bottom_blob.elemsize;
+    int elempack = bottom_blob.elempack;
+
+    // NCNN_LOGE("Pooling input %d x %d pad = %d %d %d %d ksize=%d %d stride=%d %d", w, h, pad_left, pad_right, pad_top, pad_bottom, kernel_w, kernel_h, stride_w, stride_h);
+
+    if (global_pooling)
+    {
+        top_blob.create(channels, elemsize, elempack, opt.blob_allocator);
+        if (top_blob.empty())
+            return -100;
+
+        int size = w * h;
+
+        if (pooling_type == PoolMethod_MAX)
+        {
+            if (elempack == packn)
+            {
+                #pragma omp parallel for num_threads(opt.num_threads)
+                for (int q = 0; q < channels; q++)
+                {
+                    const __fp16* ptr = bottom_blob.channel(q);
+
+                    vfloat16m1_t _max = __riscv_vfmv_v_f_f16m1((__fp16)-FLT_MAX, vl);
+                    for (int i = 0; i < size; i++)
+                    {
+                        vfloat16m1_t _val = __riscv_vle16_v_f16m1(ptr, vl);
+                        _max = __riscv_vfmax_vv_f16m1(_max, _val, vl);
+                        ptr += packn;
+                    }
+
+                    __fp16* outptr = top_blob;
+                    __riscv_vse16_v_f16m1(outptr + q * packn, _max, vl);
+                }
+            }
+
+            if (elempack == 1)
+            {
+                #pragma omp parallel for num_threads(opt.num_threads)
+                for (int q = 0; q < channels; q++)
+                {
+                    const __fp16* ptr = bottom_blob.channel(q);
+
+                    __fp16 max = (__fp16)-FLT_MAX;
+                    for (int i = 0; i < size; i++)
+                    {
+                        max = std::max(max, ptr[i]);
+                    }
+
+                    __fp16* outptr = top_blob;
+                    outptr[q] = max;
+                }
+            }
+        }
+
+        if (pooling_type == PoolMethod_AVE)
+        {
+            if (elempack == packn)
+            {
+                #pragma omp parallel for num_threads(opt.num_threads)
+                for (int q = 0; q < channels; q++)
+                {
+                    const __fp16* ptr = bottom_blob.channel(q);
+
+                    vfloat32m2_t _sum = __riscv_vfmv_v_f_f32m2(0.f, vl);
+                    for (int i = 0; i < size; i++)
+                    {
+                        vfloat32m2_t _val = __riscv_vfwcvt_f_f_v_f32m2(__riscv_vle16_v_f16m1(ptr, vl), vl);
+                        _sum = __riscv_vfadd_vv_f32m2(_sum, _val, vl);
+                        ptr += packn;
+                    }
+
+                    vfloat32m2_t _avg = __riscv_vfmul_vf_f32m2(_sum, 1.f / size, vl);
+
+                    __fp16* outptr = top_blob;
+                    __riscv_vse16_v_f16m1(outptr + q * packn, __riscv_vfncvt_f_f_w_f16m1(_avg, vl), vl);
+                }
+            }
+
+            if (elempack == 1)
+            {
+                #pragma omp parallel for num_threads(opt.num_threads)
+                for (int q = 0; q < channels; q++)
+                {
+                    const __fp16* ptr = bottom_blob.channel(q);
+
+                    float sum = 0.f;
+                    for (int i = 0; i < size; i++)
+                    {
+                        sum += (float)ptr[i];
+                    }
+
+                    __fp16* outptr = top_blob;
+                    outptr[q] = (__fp16)(sum / size);
+                }
+            }
+        }
+
+        return 0;
+    }
+
+    Mat bottom_blob_bordered;
+    make_padding(bottom_blob, bottom_blob_bordered, opt);
+    if (bottom_blob_bordered.empty())
+        return -100;
+
+    w = bottom_blob_bordered.w;
+    h = bottom_blob_bordered.h;
+
+    int outw = (w - kernel_w) / stride_w + 1;
+    int outh = (h - kernel_h) / stride_h + 1;
+
+    top_blob.create(outw, outh, channels, elemsize, elempack, opt.blob_allocator);
+    if (top_blob.empty())
+        return -100;
+
+    const int maxk = kernel_w * kernel_h;
+
+    // kernel offsets
+    std::vector<int> _space_ofs(maxk);
+    int* space_ofs = &_space_ofs[0];
+    {
+        int p1 = 0;
+        int p2 = 0;
+        int gap = w - kernel_w;
+        for (int i = 0; i < kernel_h; i++)
+        {
+            for (int j = 0; j < kernel_w; j++)
+            {
+                space_ofs[p1] = p2;
+                p1++;
+                p2++;
+            }
+            p2 += gap;
+        }
+    }
+
+    if (pooling_type == PoolMethod_MAX)
+    {
+        if (elempack == packn)
+        {
+            #pragma omp parallel for num_threads(opt.num_threads)
+            for (int q = 0; q < channels; q++)
+            {
+                const Mat m = bottom_blob_bordered.channel(q);
+                __fp16* outptr = top_blob.channel(q);
+
+                for (int i = 0; i < outh; i++)
+                {
+                    for (int j = 0; j < outw; j++)
+                    {
+                        const __fp16* sptr = m.row<const __fp16>(i * stride_h) + j * stride_w * packn;
+
+                        vfloat16m1_t _max = __riscv_vfmv_v_f_f16m1((__fp16)-FLT_MAX, vl);
+
+                        for (int k = 0; k < maxk; k++)
+                        {
+                            vfloat16m1_t _val = __riscv_vle16_v_f16m1(sptr + space_ofs[k] * packn, vl);
+                            _max = __riscv_vfmax_vv_f16m1(_max, _val, vl);
+                        }
+
+                        __riscv_vse16_v_f16m1(outptr + j * packn, _max, vl);
+                    }
+
+                    outptr += outw * packn;
+                }
+            }
+        }
+
+        if (elempack == 1)
+        {
+            #pragma omp parallel for num_threads(opt.num_threads)
+            for (int q = 0; q < channels; q++)
+            {
+                const Mat m = bottom_blob_bordered.channel(q);
+                __fp16* outptr = top_blob.channel(q);
+
+                for (int i = 0; i < outh; i++)
+                {
+                    for (int j = 0; j < outw; j++)
+                    {
+                        const __fp16* sptr = m.row<const __fp16>(i * stride_h) + j * stride_w;
+
+                        __fp16 max = (__fp16)-FLT_MAX;
+
+                        for (int k = 0; k < maxk; k++)
+                        {
+                            __fp16 val = sptr[space_ofs[k]];
+                            max = std::max(max, val);
+                        }
+
+                        outptr[j] = max;
+                    }
+
+                    outptr += outw;
+                }
+            }
+        }
+    }
+
+    if (pooling_type == PoolMethod_AVE)
+    {
+        if (avgpool_count_include_pad == 0)
+        {
+            int wtailpad = 0;
+            int htailpad = 0;
+
+            if (pad_mode == 0) // full padding
+            {
+                wtailpad = bottom_blob_bordered.w - bottom_blob.w - pad_left - pad_right;
+                htailpad = bottom_blob_bordered.h - bottom_blob.h - pad_top - pad_bottom;
+            }
+
+            if (elempack == packn)
+            {
+                #pragma omp parallel for num_threads(opt.num_threads)
+                for (int q = 0; q < channels; q++)
+                {
+                    const Mat m = bottom_blob_bordered.channel(q);
+                    __fp16* outptr = top_blob.channel(q);
+
+                    for (int i = 0; i < outh; i++)
+                    {
+                        int sy0 = i * stride_h;
+
+                        for (int j = 0; j < outw; j++)
+                        {
+                            int sx0 = j * stride_w;
+
+                            vfloat32m2_t _sum = __riscv_vfmv_v_f_f32m2(0.f, vl);
+                            int area = 0;
+
+                            for (int ki = 0; ki < kernel_h; ki++)
+                            {
+                                int sy = sy0 + ki;
+
+                                if (sy < pad_top)
+                                    continue;
+
+                                if (sy >= h - pad_bottom - htailpad)
+                                    break;
+
+                                for (int kj = 0; kj < kernel_w; kj++)
+                                {
+                                    int sx = sx0 + kj;
+
+                                    if (sx < pad_left)
+                                        continue;
+
+                                    if (sx >= w - pad_right - wtailpad)
+                                        break;
+
+                                    vfloat32m2_t _val = __riscv_vfwcvt_f_f_v_f32m2(__riscv_vle16_v_f16m1(m.row<const __fp16>(sy) + sx * packn, vl), vl);
+                                    _sum = __riscv_vfadd_vv_f32m2(_sum, _val, vl);
+                                    area += 1;
+                                }
+                            }
+
+                            vfloat32m2_t _avg = __riscv_vfmul_vf_f32m2(_sum, 1.f / area, vl);
+                            __riscv_vse16_v_f16m1(outptr + j * packn, __riscv_vfncvt_f_f_w_f16m1(_avg, vl), vl);
+                        }
+
+                        outptr += outw * packn;
+                    }
+                }
+            }
+
+            if (elempack == 1)
+            {
+                #pragma omp parallel for num_threads(opt.num_threads)
+                for (int q = 0; q < channels; q++)
+                {
+                    const Mat m = bottom_blob_bordered.channel(q);
+                    __fp16* outptr = top_blob.channel(q);
+
+                    for (int i = 0; i < outh; i++)
+                    {
+                        int sy0 = i * stride_h;
+
+                        for (int j = 0; j < outw; j++)
+                        {
+                            int sx0 = j * stride_w;
+
+                            float sum = 0.f;
+                            int area = 0;
+
+                            for (int ki = 0; ki < kernel_h; ki++)
+                            {
+                                int sy = sy0 + ki;
+
+                                if (sy < pad_top)
+                                    continue;
+
+                                if (sy >= h - pad_bottom - htailpad)
+                                    break;
+
+                                for (int kj = 0; kj < kernel_w; kj++)
+                                {
+                                    int sx = sx0 + kj;
+
+                                    if (sx < pad_left)
+                                        continue;
+
+                                    if (sx >= w - pad_right - wtailpad)
+                                        break;
+
+                                    float val = (float)(m.row<const __fp16>(sy)[sx]);
+                                    sum += val;
+                                    area += 1;
+                                }
+                            }
+
+                            outptr[j] = (__fp16)(sum / area);
+                        }
+
+                        outptr += outw;
+                    }
+                }
+            }
+        }
+
+        if (avgpool_count_include_pad == 1)
+        {
+            if (elempack == packn)
+            {
+                #pragma omp parallel for num_threads(opt.num_threads)
+                for (int q = 0; q < channels; q++)
+                {
+                    const Mat m = bottom_blob_bordered.channel(q);
+                    __fp16* outptr = top_blob.channel(q);
+
+                    const float inv_maxk = 1.f / maxk;
+
+                    for (int i = 0; i < outh; i++)
+                    {
+                        for (int j = 0; j < outw; j++)
+                        {
+                            const __fp16* sptr = m.row<const __fp16>(i * stride_h) + j * stride_w * packn;
+
+                            vfloat32m2_t _sum = __riscv_vfmv_v_f_f32m2(0.f, vl);
+
+                            for (int k = 0; k < maxk; k++)
+                            {
+                                vfloat32m2_t _val = __riscv_vfwcvt_f_f_v_f32m2(__riscv_vle16_v_f16m1(sptr + space_ofs[k] * packn, vl), vl);
+                                _sum = __riscv_vfadd_vv_f32m2(_sum, _val, vl);
+                            }
+
+                            vfloat32m2_t _avg = __riscv_vfmul_vf_f32m2(_sum, inv_maxk, vl);
+                            __riscv_vse16_v_f16m1(outptr + j * packn, __riscv_vfncvt_f_f_w_f16m1(_avg, vl), vl);
+                        }
+
+                        outptr += outw * packn;
+                    }
+                }
+            }
+
+            if (elempack == 1)
+            {
+                #pragma omp parallel for num_threads(opt.num_threads)
+                for (int q = 0; q < channels; q++)
+                {
+                    const Mat m = bottom_blob_bordered.channel(q);
+                    __fp16* outptr = top_blob.channel(q);
+
+                    for (int i = 0; i < outh; i++)
+                    {
+                        for (int j = 0; j < outw; j++)
+                        {
+                            const __fp16* sptr = m.row<const __fp16>(i * stride_h) + j * stride_w;
+
+                            float sum = 0.f;
+
+                            for (int k = 0; k < maxk; k++)
+                            {
+                                float val = (float)(sptr[space_ofs[k]]);
+                                sum += val;
+                            }
+
+                            outptr[j] = (__fp16)(sum / maxk);
+                        }
+
+                        outptr += outw;
+                    }
+                }
+            }
+        }
+    }
+
+    return 0;
+}
+
+int Pooling_riscv::forward_fp16sa(const Mat& bottom_blob, Mat& top_blob, const Option& opt) const
+{
+    // max value in NxN window
+    // avg value in NxN window
+
+    if (pooling_type == PoolMethod_MAX || global_pooling)
+    {
+        return forward_fp16s(bottom_blob, top_blob, opt);
+    }
+
+    const int packn = csrr_vlenb() / 2;
+    const size_t vl = __riscv_vsetvl_e16m1(packn);
+
+    int w = bottom_blob.w;
+    int h = bottom_blob.h;
+    int channels = bottom_blob.c;
+    size_t elemsize = bottom_blob.elemsize;
+    int elempack = bottom_blob.elempack;
+
+    // NCNN_LOGE("Pooling input %d x %d pad = %d %d %d %d ksize=%d %d stride=%d %d", w, h, pad_left, pad_right, pad_top, pad_bottom, kernel_w, kernel_h, stride_w, stride_h);
+
+    Mat bottom_blob_bordered;
+    make_padding(bottom_blob, bottom_blob_bordered, opt);
+    if (bottom_blob_bordered.empty())
+        return -100;
+
+    w = bottom_blob_bordered.w;
+    h = bottom_blob_bordered.h;
+
+    int outw = (w - kernel_w) / stride_w + 1;
+    int outh = (h - kernel_h) / stride_h + 1;
+
+    top_blob.create(outw, outh, channels, elemsize, elempack, opt.blob_allocator);
+    if (top_blob.empty())
+        return -100;
+
+    const int maxk = kernel_w * kernel_h;
+
+    // kernel offsets
+    std::vector<int> _space_ofs(maxk);
+    int* space_ofs = &_space_ofs[0];
+    {
+        int p1 = 0;
+        int p2 = 0;
+        int gap = w - kernel_w;
+        for (int i = 0; i < kernel_h; i++)
+        {
+            for (int j = 0; j < kernel_w; j++)
+            {
+                space_ofs[p1] = p2;
+                p1++;
+                p2++;
+            }
+            p2 += gap;
+        }
+    }
+
+    if (pooling_type == PoolMethod_AVE)
+    {
+        if (avgpool_count_include_pad == 0)
+        {
+            int wtailpad = 0;
+            int htailpad = 0;
+
+            if (pad_mode == 0) // full padding
+            {
+                wtailpad = bottom_blob_bordered.w - bottom_blob.w - pad_left - pad_right;
+                htailpad = bottom_blob_bordered.h - bottom_blob.h - pad_top - pad_bottom;
+            }
+
+            if (elempack == packn)
+            {
+                #pragma omp parallel for num_threads(opt.num_threads)
+                for (int q = 0; q < channels; q++)
+                {
+                    const Mat m = bottom_blob_bordered.channel(q);
+                    __fp16* outptr = top_blob.channel(q);
+
+                    for (int i = 0; i < outh; i++)
+                    {
+                        int sy0 = i * stride_h;
+
+                        for (int j = 0; j < outw; j++)
+                        {
+                            int sx0 = j * stride_w;
+
+                            vfloat16m1_t _sum = __riscv_vfmv_v_f_f16m1(0.f, vl);
+                            int area = 0;
+
+                            for (int ki = 0; ki < kernel_h; ki++)
+                            {
+                                int sy = sy0 + ki;
+
+                                if (sy < pad_top)
+                                    continue;
+
+                                if (sy >= h - pad_bottom - htailpad)
+                                    break;
+
+                                for (int kj = 0; kj < kernel_w; kj++)
+                                {
+                                    int sx = sx0 + kj;
+
+                                    if (sx < pad_left)
+                                        continue;
+
+                                    if (sx >= w - pad_right - wtailpad)
+                                        break;
+
+                                    vfloat16m1_t _val = __riscv_vle16_v_f16m1(m.row<const __fp16>(sy) + sx * packn, vl);
+                                    _sum = __riscv_vfadd_vv_f16m1(_sum, _val, vl);
+                                    area += 1;
+                                }
+                            }
+
+                            vfloat16m1_t _avg = __riscv_vfmul_vf_f16m1(_sum, (__fp16)(1.f / area), vl);
+                            __riscv_vse16_v_f16m1(outptr + j * packn, _avg, vl);
+                        }
+
+                        outptr += outw * packn;
+                    }
+                }
+            }
+
+            if (elempack == 1)
+            {
+                #pragma omp parallel for num_threads(opt.num_threads)
+                for (int q = 0; q < channels; q++)
+                {
+                    const Mat m = bottom_blob_bordered.channel(q);
+                    __fp16* outptr = top_blob.channel(q);
+
+                    for (int i = 0; i < outh; i++)
+                    {
+                        int sy0 = i * stride_h;
+
+                        for (int j = 0; j < outw; j++)
+                        {
+                            int sx0 = j * stride_w;
+
+                            __fp16 sum = (__fp16)0.f;
+                            int area = 0;
+
+                            for (int ki = 0; ki < kernel_h; ki++)
+                            {
+                                int sy = sy0 + ki;
+
+                                if (sy < pad_top)
+                                    continue;
+
+                                if (sy >= h - pad_bottom - htailpad)
+                                    break;
+
+                                for (int kj = 0; kj < kernel_w; kj++)
+                                {
+                                    int sx = sx0 + kj;
+
+                                    if (sx < pad_left)
+                                        continue;
+
+                                    if (sx >= w - pad_right - wtailpad)
+                                        break;
+
+                                    __fp16 val = m.row<const __fp16>(sy)[sx];
+                                    sum += val;
+                                    area += 1;
+                                }
+                            }
+
+                            outptr[j] = sum / area;
+                        }
+
+                        outptr += outw;
+                    }
+                }
+            }
+        }
+
+        if (avgpool_count_include_pad == 1)
+        {
+            if (elempack == packn)
+            {
+                #pragma omp parallel for num_threads(opt.num_threads)
+                for (int q = 0; q < channels; q++)
+                {
+                    const Mat m = bottom_blob_bordered.channel(q);
+                    __fp16* outptr = top_blob.channel(q);
+
+                    const __fp16 inv_maxk = (__fp16)(1.f / maxk);
+
+                    for (int i = 0; i < outh; i++)
+                    {
+                        for (int j = 0; j < outw; j++)
+                        {
+                            const __fp16* sptr = m.row<const __fp16>(i * stride_h) + j * stride_w * packn;
+
+                            vfloat16m1_t _sum = __riscv_vfmv_v_f_f16m1(0.f, vl);
+
+                            for (int k = 0; k < maxk; k++)
+                            {
+                                vfloat16m1_t _val = __riscv_vle16_v_f16m1(sptr + space_ofs[k] * packn, vl);
+                                _sum = __riscv_vfadd_vv_f16m1(_sum, _val, vl);
+                            }
+
+                            vfloat16m1_t _avg = __riscv_vfmul_vf_f16m1(_sum, inv_maxk, vl);
+                            __riscv_vse16_v_f16m1(outptr + j * packn, _avg, vl);
+                        }
+
+                        outptr += outw * packn;
+                    }
+                }
+            }
+
+            if (elempack == 1)
+            {
+                #pragma omp parallel for num_threads(opt.num_threads)
+                for (int q = 0; q < channels; q++)
+                {
+                    const Mat m = bottom_blob_bordered.channel(q);
+                    __fp16* outptr = top_blob.channel(q);
+
+                    for (int i = 0; i < outh; i++)
+                    {
+                        for (int j = 0; j < outw; j++)
+                        {
+                            const __fp16* sptr = m.row<const __fp16>(i * stride_h) + j * stride_w;
+
+                            __fp16 sum = (__fp16)0.f;
+
+                            for (int k = 0; k < maxk; k++)
+                            {
+                                __fp16 val = sptr[space_ofs[k]];
+                                sum += val;
+                            }
+
+                            outptr[j] = sum / maxk;
+                        }
+
+                        outptr += outw;
+                    }
+                }
+            }
+        }
+    }
+
+    return 0;
+}
+#endif // __riscv_zvfh
+
+} // namespace ncnn