diff --git a/src/layer/riscv/innerproduct_riscv.cpp b/src/layer/riscv/innerproduct_riscv.cpp index 0f90de5f7a13..8b89da90f41f 100644 --- a/src/layer/riscv/innerproduct_riscv.cpp +++ b/src/layer/riscv/innerproduct_riscv.cpp @@ -29,8 +29,8 @@ InnerProduct_riscv::InnerProduct_riscv() { #if __riscv_vector support_packing = true; -#if __riscv_zvfh - support_fp16_storage = true; +#if NCNN_ZVFH + support_fp16_storage = cpu_support_riscv_zvfh(); #endif #endif // __riscv_vector @@ -57,7 +57,7 @@ int InnerProduct_riscv::create_pipeline(const Option& opt) } #endif -#if __riscv_vector && __riscv_zvfh +#if NCNN_ZVFH if (opt.use_fp16_storage) { return create_pipeline_fp16s(opt); @@ -153,9 +153,9 @@ int InnerProduct_riscv::forward(const Mat& bottom_blob, Mat& top_blob, const Opt } #endif +#if NCNN_ZVFH int elembits = bottom_blob.elembits(); -#if __riscv_vector && __riscv_zvfh if (opt.use_fp16_storage && elembits == 16) { if (opt.use_fp16_arithmetic) @@ -524,572 +524,4 @@ int InnerProduct_riscv::forward(const Mat& bottom_blob, Mat& top_blob, const Opt return 0; } -#if __riscv_vector && __riscv_zvfh -int InnerProduct_riscv::create_pipeline_fp16s(const Option& opt) -{ - const int packn = csrr_vlenb() / 2; - - const int num_input = weight_data_size / num_output; - - int out_elempack = 1; - - if (opt.use_packing_layout) - { - out_elempack = num_output % packn == 0 ? packn : 1; - } - - // src = inch-outch - // dst = pb-inch-outch/pb - { - Mat weight_data_r2 = weight_data.reshape(num_input, num_output); - - weight_data_tm.create(num_input, num_output / out_elempack, (size_t)2u * out_elempack, out_elempack); - - for (int q = 0; q + (out_elempack - 1) < num_output; q += out_elempack) - { - __fp16* g0 = weight_data_tm.row<__fp16>(q / out_elempack); - - for (int p = 0; p < num_input; p++) - { - for (int j = 0; j < out_elempack; j++) - { - *g0++ = (__fp16)(weight_data_r2.row(q + j)[p]); - } - } - } - } - - ncnn::cast_float32_to_float16(bias_data, bias_data_fp16, opt); - - if (opt.lightmode) - weight_data.release(); - - return 0; -} - -int InnerProduct_riscv::forward_fp16s(const Mat& bottom_blob, Mat& top_blob, const Option& opt) const -{ - const int packn = csrr_vlenb() / 2; - - const int num_input = weight_data_size / num_output; - - if (bottom_blob.dims == 2 && bottom_blob.w == num_input) - { - // gemm - int h = bottom_blob.h; - size_t elemsize = bottom_blob.elemsize; - int elempack = bottom_blob.elempack; - - top_blob.create(num_output, h, elemsize, elempack, opt.blob_allocator); - if (top_blob.empty()) - return -100; - - int num_output_elempack = opt.use_packing_layout && num_output % packn == 0 ? 
packn : 1; - - #pragma omp parallel for num_threads(opt.num_threads) - for (int j = 0; j < h; j++) - { - if (elempack == packn && num_output_elempack == packn) - { - const size_t vl = __riscv_vsetvl_e16m1(packn); - - __fp16* outptr = top_blob.row<__fp16>(j); - - for (int p = 0; p < num_output / num_output_elempack; p++) - { - for (int l = 0; l < packn; l++) - { - const __fp16* kptr = (const __fp16*)weight_data_tm + num_input * p * packn + l; - const __fp16* m = bottom_blob.row(j); - - vfloat32m2_t _sum = __riscv_vfmv_v_f_f32m2(0.f, vl); - - if (bias_term) - { - _sum = __riscv_vfmv_v_f_f32m2(bias_data[p * packn + l], vl); - } - - int n = num_input; - while (n > 0) - { - vfloat32m2_t _val = __riscv_vfwcvt_f_f_v_f32m2(__riscv_vle16_v_f16m1(m, vl), vl); - - _sum = __riscv_vfmacc_vf_f32m2(_sum, *kptr, _val, vl); - - m += packn; - kptr += packn; - n -= 1; - } - - _sum = activation_ps(_sum, activation_type, activation_params, vl); - - __riscv_vse16_v_f16m1(outptr, __riscv_vfncvt_f_f_w_f16m1(_sum, vl), vl); - outptr += packn; - } - } - } - - if (elempack == 1 && num_output_elempack == packn) - { - const size_t vl = __riscv_vsetvl_e16m1(packn); - - __fp16* outptr = top_blob.row<__fp16>(j); - - for (int p = 0; p < num_output / num_output_elempack; p++) - { - const __fp16* kptr = (const __fp16*)weight_data_tm + num_input * p * packn; - const __fp16* m = bottom_blob.row(j); - - vfloat32m2_t _sum = __riscv_vfmv_v_f_f32m2(0.f, vl); - - if (bias_term) - { - _sum = __riscv_vle32_v_f32m2((const float*)bias_data + p * packn, vl); - } - - int n = num_input; - while (n > 0) - { - vfloat32m2_t _w = __riscv_vfwcvt_f_f_v_f32m2(__riscv_vle16_v_f16m1(kptr, vl), vl); - - _sum = __riscv_vfmacc_vf_f32m2(_sum, *m, _w, vl); - - m += 1; - kptr += packn; - n -= 1; - } - - _sum = activation_ps(_sum, activation_type, activation_params, vl); - - __riscv_vse16_v_f16m1(outptr, __riscv_vfncvt_f_f_w_f16m1(_sum, vl), vl); - outptr += packn; - } - } - - if (elempack == packn && num_output_elempack == 1) - { - const size_t vl = __riscv_vsetvl_e16m1(packn); - - __fp16* outptr = top_blob.row<__fp16>(j); - - for (int p = 0; p < num_output; p++) - { - const __fp16* kptr = (const __fp16*)weight_data_tm + num_input * p; - const __fp16* m = bottom_blob.row(j); - - vfloat32m2_t _sum = __riscv_vfmv_v_f_f32m2(0.f, vl); - - if (bias_term) - { - _sum = __riscv_vfmv_v_f_f32m2(bias_data[p], vl); - } - - int n = num_input; - while (n > 0) - { - vfloat32m2_t _val = __riscv_vfwcvt_f_f_v_f32m2(__riscv_vle16_v_f16m1(m, vl), vl); - - _sum = __riscv_vfmacc_vf_f32m2(_sum, *kptr, _val, vl); - - m += packn; - kptr += 1; - n -= 1; - } - - _sum = activation_ps(_sum, activation_type, activation_params, vl); - - __riscv_vse16_v_f16m1(outptr, __riscv_vfncvt_f_f_w_f16m1(_sum, vl), vl); - outptr += packn; - } - } - - if (elempack == 1 && num_output_elempack == 1) - { - __fp16* outptr = top_blob.row<__fp16>(j); - - for (int p = 0; p < num_output; p++) - { - const __fp16* kptr = (const __fp16*)weight_data_tm + num_input * p; - const __fp16* m = bottom_blob.row(j); - - float sum = 0.f; - - if (bias_term) - { - sum = bias_data[p]; - } - - for (int i = 0; i < num_input; i++) - { - sum += (float)m[i] * (float)kptr[i]; - } - - sum = activation_ss(sum, activation_type, activation_params); - - outptr[0] = (__fp16)sum; - outptr += 1; - } - } - } - - return 0; - } - - // flatten - Mat bottom_blob_flattened = bottom_blob; - if (bottom_blob.dims != 1) - { - Option opt_flatten = opt; - opt_flatten.blob_allocator = opt.workspace_allocator; - - flatten->forward(bottom_blob, 
bottom_blob_flattened, opt_flatten); - } - - size_t elemsize = bottom_blob_flattened.elemsize; - int elempack = bottom_blob_flattened.elempack; - - int out_elempack = opt.use_packing_layout && num_output % packn == 0 ? packn : 1; - size_t out_elemsize = elemsize / elempack * out_elempack; - - top_blob.create(num_output / out_elempack, out_elemsize, out_elempack, opt.blob_allocator); - if (top_blob.empty()) - return -100; - - if (out_elempack == packn) - { - // num_output - #pragma omp parallel for num_threads(opt.num_threads) - for (int p = 0; p < num_output / out_elempack; p++) - { - const size_t vl = __riscv_vsetvl_e16m1(packn); - vfloat32m2_t _sum = __riscv_vfmv_v_f_f32m2(0.f, vl); - - if (bias_term) - { - _sum = __riscv_vle32_v_f32m2((const float*)bias_data + p * packn, vl); - } - - const __fp16* kptr = weight_data_tm.row(p); - - const __fp16* sptr = bottom_blob_flattened; - - int n = num_input; - while (n > 0) - { - vfloat32m2_t _w = __riscv_vfwcvt_f_f_v_f32m2(__riscv_vle16_v_f16m1(kptr, vl), vl); - - _sum = __riscv_vfmacc_vf_f32m2(_sum, (float)(*sptr), _w, vl); - - sptr += 1; - kptr += packn; - n -= 1; - } - - _sum = activation_ps(_sum, activation_type, activation_params, vl); - - __fp16* outptr = (__fp16*)top_blob; - __riscv_vse16_v_f16m1(outptr + p * packn, __riscv_vfncvt_f_f_w_f16m1(_sum, vl), vl); - } - } - - if (out_elempack == 1) - { - // num_output - #pragma omp parallel for num_threads(opt.num_threads) - for (int p = 0; p < num_output; p++) - { - float sum = 0.f; - - if (bias_term) - sum = bias_data[p]; - - const __fp16* kptr = weight_data_tm.row<__fp16>(p); - - const __fp16* sptr = bottom_blob_flattened; - - int i = 0; - for (; i < num_input; i++) - { - float v = (float)(*sptr); - float k = (float)(*kptr); - - sum += v * k; - - sptr++; - kptr++; - } - - sum = activation_ss(sum, activation_type, activation_params); - - __fp16* outptr = (__fp16*)top_blob; - outptr[p] = (__fp16)sum; - } - } - - return 0; -} - -int InnerProduct_riscv::forward_fp16sa(const Mat& bottom_blob, Mat& top_blob, const Option& opt) const -{ - const int packn = csrr_vlenb() / 2; - - const int num_input = weight_data_size / num_output; - - if (bottom_blob.dims == 2 && bottom_blob.w == num_input) - { - // gemm - int h = bottom_blob.h; - size_t elemsize = bottom_blob.elemsize; - int elempack = bottom_blob.elempack; - - top_blob.create(num_output, h, elemsize, elempack, opt.blob_allocator); - if (top_blob.empty()) - return -100; - - int num_output_elempack = opt.use_packing_layout && num_output % packn == 0 ? 
packn : 1; - - #pragma omp parallel for num_threads(opt.num_threads) - for (int j = 0; j < h; j++) - { - if (elempack == packn && num_output_elempack == packn) - { - const size_t vl = __riscv_vsetvl_e16m1(packn); - - __fp16* outptr = top_blob.row<__fp16>(j); - - for (int p = 0; p < num_output / num_output_elempack; p++) - { - for (int l = 0; l < packn; l++) - { - const __fp16* kptr = (const __fp16*)weight_data_tm + num_input * p * packn + l; - const __fp16* m = bottom_blob.row(j); - - vfloat16m1_t _sum = __riscv_vfmv_v_f_f16m1((__fp16)0.f, vl); - - if (bias_term) - { - _sum = __riscv_vfmv_v_f_f16m1(((const __fp16*)bias_data_fp16)[p * packn + l], vl); - } - - int n = num_input; - while (n > 0) - { - vfloat16m1_t _val = __riscv_vle16_v_f16m1(m, vl); - - _sum = __riscv_vfmacc_vf_f16m1(_sum, *kptr, _val, vl); - - m += packn; - kptr += packn; - n -= 1; - } - - _sum = activation_ps(_sum, activation_type, activation_params, vl); - - __riscv_vse16_v_f16m1(outptr, _sum, vl); - outptr += packn; - } - } - } - - if (elempack == 1 && num_output_elempack == packn) - { - const size_t vl = __riscv_vsetvl_e16m1(packn); - - __fp16* outptr = top_blob.row<__fp16>(j); - - for (int p = 0; p < num_output / num_output_elempack; p++) - { - const __fp16* kptr = (const __fp16*)weight_data_tm + num_input * p * packn; - const __fp16* m = bottom_blob.row(j); - - vfloat16m1_t _sum = __riscv_vfmv_v_f_f16m1(0.f, vl); - - if (bias_term) - { - _sum = __riscv_vle16_v_f16m1((const __fp16*)bias_data_fp16 + p * packn, vl); - } - - int n = num_input; - while (n > 0) - { - vfloat16m1_t _w = __riscv_vle16_v_f16m1(kptr, vl); - - _sum = __riscv_vfmacc_vf_f16m1(_sum, *m, _w, vl); - - m += 1; - kptr += packn; - n -= 1; - } - - _sum = activation_ps(_sum, activation_type, activation_params, vl); - - __riscv_vse16_v_f16m1(outptr, _sum, vl); - outptr += packn; - } - } - - if (elempack == packn && num_output_elempack == 1) - { - const size_t vl = __riscv_vsetvl_e16m1(packn); - - __fp16* outptr = top_blob.row<__fp16>(j); - - for (int p = 0; p < num_output; p++) - { - const __fp16* kptr = (const __fp16*)weight_data_tm + num_input * p; - const __fp16* m = bottom_blob.row(j); - - vfloat16m1_t _sum = __riscv_vfmv_v_f_f16m1(0.f, vl); - - if (bias_term) - { - _sum = __riscv_vfmv_v_f_f16m1(((const __fp16*)bias_data_fp16)[p], vl); - } - - int n = num_input; - while (n > 0) - { - vfloat16m1_t _val = __riscv_vle16_v_f16m1(m, vl); - - _sum = __riscv_vfmacc_vf_f16m1(_sum, *kptr, _val, vl); - - m += packn; - kptr += 1; - n -= 1; - } - - _sum = activation_ps(_sum, activation_type, activation_params, vl); - - __riscv_vse16_v_f16m1(outptr, _sum, vl); - outptr += packn; - } - } - - if (elempack == 1 && num_output_elempack == 1) - { - __fp16* outptr = top_blob.row<__fp16>(j); - - for (int p = 0; p < num_output; p++) - { - const __fp16* kptr = (const __fp16*)weight_data_tm + num_input * p; - const __fp16* m = bottom_blob.row(j); - - float sum = 0.f; - - if (bias_term) - { - sum = bias_data[p]; - } - - for (int i = 0; i < num_input; i++) - { - sum += (float)(m[i] * kptr[i]); - } - - sum = activation_ss(sum, activation_type, activation_params); - - outptr[0] = (__fp16)sum; - outptr += 1; - } - } - } - - return 0; - } - - // flatten - Mat bottom_blob_flattened = bottom_blob; - if (bottom_blob.dims != 1) - { - Option opt_flatten = opt; - opt_flatten.blob_allocator = opt.workspace_allocator; - - flatten->forward(bottom_blob, bottom_blob_flattened, opt_flatten); - } - - size_t elemsize = bottom_blob_flattened.elemsize; - int elempack = 
bottom_blob_flattened.elempack; - - int out_elempack = opt.use_packing_layout && num_output % packn == 0 ? packn : 1; - size_t out_elemsize = elemsize / elempack * out_elempack; - - top_blob.create(num_output / out_elempack, out_elemsize, out_elempack, opt.blob_allocator); - if (top_blob.empty()) - return -100; - - if (out_elempack == packn) - { - // num_output - #pragma omp parallel for num_threads(opt.num_threads) - for (int p = 0; p < num_output / out_elempack; p++) - { - const size_t vl = __riscv_vsetvl_e16m1(packn); - vfloat16m1_t _sum = __riscv_vfmv_v_f_f16m1(0.f, vl); - - if (bias_term) - { - _sum = __riscv_vle16_v_f16m1((const __fp16*)bias_data_fp16 + p * packn, vl); - } - - const __fp16* kptr = weight_data_tm.row(p); - - const __fp16* sptr = bottom_blob_flattened; - - int n = num_input; - while (n > 0) - { - vfloat16m1_t _w = __riscv_vle16_v_f16m1(kptr, vl); - - _sum = __riscv_vfmacc_vf_f16m1(_sum, *sptr, _w, vl); - - sptr += 1; - kptr += packn; - n -= 1; - } - - _sum = activation_ps(_sum, activation_type, activation_params, vl); - - __fp16* outptr = (__fp16*)top_blob; - __riscv_vse16_v_f16m1(outptr + p * packn, _sum, vl); - } - } - - if (out_elempack == 1) - { - // num_output - #pragma omp parallel for num_threads(opt.num_threads) - for (int p = 0; p < num_output; p++) - { - float sum = 0.f; - - if (bias_term) - sum = bias_data[p]; - - const __fp16* kptr = weight_data_tm.row<__fp16>(p); - - const __fp16* sptr = bottom_blob_flattened; - - int i = 0; - for (; i < num_input; i++) - { - __fp16 v = *sptr; - __fp16 k = *kptr; - - sum += (float)(v * k); - - sptr++; - kptr++; - } - - sum = activation_ss(sum, activation_type, activation_params); - - __fp16* outptr = (__fp16*)top_blob; - outptr[p] = (__fp16)sum; - } - } - - return 0; -} -#endif // __riscv_vector && __riscv_zvfh - } // namespace ncnn diff --git a/src/layer/riscv/innerproduct_riscv.h b/src/layer/riscv/innerproduct_riscv.h index 0a75d79bbb51..4c6384a2e0da 100644 --- a/src/layer/riscv/innerproduct_riscv.h +++ b/src/layer/riscv/innerproduct_riscv.h @@ -30,7 +30,7 @@ class InnerProduct_riscv : public InnerProduct virtual int forward(const Mat& bottom_blob, Mat& top_blob, const Option& opt) const; protected: -#if __riscv_vector && __riscv_zvfh +#if NCNN_ZVFH int create_pipeline_fp16s(const Option& opt); int forward_fp16s(const Mat& bottom_blob, Mat& top_blob, const Option& opt) const; int forward_fp16sa(const Mat& bottom_blob, Mat& top_blob, const Option& opt) const; diff --git a/src/layer/riscv/innerproduct_riscv_zvfh.cpp b/src/layer/riscv/innerproduct_riscv_zvfh.cpp new file mode 100644 index 000000000000..86c32c0a9930 --- /dev/null +++ b/src/layer/riscv/innerproduct_riscv_zvfh.cpp @@ -0,0 +1,594 @@ +// Tencent is pleased to support the open source community by making ncnn available. +// +// Copyright (C) 2024 THL A29 Limited, a Tencent company. All rights reserved. +// +// Licensed under the BSD 3-Clause License (the "License"); you may not use this file except +// in compliance with the License. You may obtain a copy of the License at +// +// https://opensource.org/licenses/BSD-3-Clause +// +// Unless required by applicable law or agreed to in writing, software distributed +// under the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR +// CONDITIONS OF ANY KIND, either express or implied. See the License for the +// specific language governing permissions and limitations under the License. 
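+//
+// Note: the fp16 kernels below assume the Zvfh extension. The base layer in
+// innerproduct_riscv.cpp only dispatches here after cpu_support_riscv_zvfh()
+// reports runtime support (see the NCNN_ZVFH-gated calls in that file), so this
+// translation unit can presumably be compiled with Zvfh enabled on its own.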
+
+#include "innerproduct_riscv.h"
+
+#if __riscv_vector
+#include <riscv_vector.h>
+#endif // __riscv_vector
+
+#include "riscv_activation.h"
+#include "riscv_usability.h"
+
+namespace ncnn {
+
+#if __riscv_zvfh
+int InnerProduct_riscv::create_pipeline_fp16s(const Option& opt)
+{
+    const int packn = csrr_vlenb() / 2;
+
+    const int num_input = weight_data_size / num_output;
+
+    int out_elempack = 1;
+
+    if (opt.use_packing_layout)
+    {
+        out_elempack = num_output % packn == 0 ? packn : 1;
+    }
+
+    // src = inch-outch
+    // dst = pb-inch-outch/pb
+    {
+        Mat weight_data_r2 = weight_data.reshape(num_input, num_output);
+
+        weight_data_tm.create(num_input, num_output / out_elempack, (size_t)2u * out_elempack, out_elempack);
+
+        for (int q = 0; q + (out_elempack - 1) < num_output; q += out_elempack)
+        {
+            __fp16* g0 = weight_data_tm.row<__fp16>(q / out_elempack);
+
+            for (int p = 0; p < num_input; p++)
+            {
+                for (int j = 0; j < out_elempack; j++)
+                {
+                    *g0++ = (__fp16)(weight_data_r2.row(q + j)[p]);
+                }
+            }
+        }
+    }
+
+    ncnn::cast_float32_to_float16(bias_data, bias_data_fp16, opt);
+
+    if (opt.lightmode)
+        weight_data.release();
+
+    return 0;
+}
+
+int InnerProduct_riscv::forward_fp16s(const Mat& bottom_blob, Mat& top_blob, const Option& opt) const
+{
+    const int packn = csrr_vlenb() / 2;
+
+    const int num_input = weight_data_size / num_output;
+
+    if (bottom_blob.dims == 2 && bottom_blob.w == num_input)
+    {
+        // gemm
+        int h = bottom_blob.h;
+        size_t elemsize = bottom_blob.elemsize;
+        int elempack = bottom_blob.elempack;
+
+        top_blob.create(num_output, h, elemsize, elempack, opt.blob_allocator);
+        if (top_blob.empty())
+            return -100;
+
+        int num_output_elempack = opt.use_packing_layout && num_output % packn == 0 ? packn : 1;
+
+        #pragma omp parallel for num_threads(opt.num_threads)
+        for (int j = 0; j < h; j++)
+        {
+            if (elempack == packn && num_output_elempack == packn)
+            {
+                const size_t vl = __riscv_vsetvl_e16m1(packn);
+
+                __fp16* outptr = top_blob.row<__fp16>(j);
+
+                for (int p = 0; p < num_output / num_output_elempack; p++)
+                {
+                    for (int l = 0; l < packn; l++)
+                    {
+                        const __fp16* kptr = (const __fp16*)weight_data_tm + num_input * p * packn + l;
+                        const __fp16* m = bottom_blob.row<const __fp16>(j);
+
+                        vfloat32m2_t _sum = __riscv_vfmv_v_f_f32m2(0.f, vl);
+
+                        if (bias_term)
+                        {
+                            _sum = __riscv_vfmv_v_f_f32m2(bias_data[p * packn + l], vl);
+                        }
+
+                        int n = num_input;
+                        while (n > 0)
+                        {
+                            vfloat32m2_t _val = __riscv_vfwcvt_f_f_v_f32m2(__riscv_vle16_v_f16m1(m, vl), vl);
+
+                            _sum = __riscv_vfmacc_vf_f32m2(_sum, *kptr, _val, vl);
+
+                            m += packn;
+                            kptr += packn;
+                            n -= 1;
+                        }
+
+                        _sum = activation_ps(_sum, activation_type, activation_params, vl);
+
+                        __riscv_vse16_v_f16m1(outptr, __riscv_vfncvt_f_f_w_f16m1(_sum, vl), vl);
+                        outptr += packn;
+                    }
+                }
+            }
+
+            if (elempack == 1 && num_output_elempack == packn)
+            {
+                const size_t vl = __riscv_vsetvl_e16m1(packn);
+
+                __fp16* outptr = top_blob.row<__fp16>(j);
+
+                for (int p = 0; p < num_output / num_output_elempack; p++)
+                {
+                    const __fp16* kptr = (const __fp16*)weight_data_tm + num_input * p * packn;
+                    const __fp16* m = bottom_blob.row<const __fp16>(j);
+
+                    vfloat32m2_t _sum = __riscv_vfmv_v_f_f32m2(0.f, vl);
+
+                    if (bias_term)
+                    {
+                        _sum = __riscv_vle32_v_f32m2((const float*)bias_data + p * packn, vl);
+                    }
+
+                    int n = num_input;
+                    while (n > 0)
+                    {
+                        vfloat32m2_t _w = __riscv_vfwcvt_f_f_v_f32m2(__riscv_vle16_v_f16m1(kptr, vl), vl);
+
+                        _sum = __riscv_vfmacc_vf_f32m2(_sum, *m, _w, vl);
+
+                        m += 1;
+                        kptr += packn;
+                        n -= 1;
+                    }
+
+                    _sum = activation_ps(_sum, activation_type, activation_params,
vl); + + __riscv_vse16_v_f16m1(outptr, __riscv_vfncvt_f_f_w_f16m1(_sum, vl), vl); + outptr += packn; + } + } + + if (elempack == packn && num_output_elempack == 1) + { + const size_t vl = __riscv_vsetvl_e16m1(packn); + + __fp16* outptr = top_blob.row<__fp16>(j); + + for (int p = 0; p < num_output; p++) + { + const __fp16* kptr = (const __fp16*)weight_data_tm + num_input * p; + const __fp16* m = bottom_blob.row(j); + + vfloat32m2_t _sum = __riscv_vfmv_v_f_f32m2(0.f, vl); + + if (bias_term) + { + _sum = __riscv_vfmv_v_f_f32m2(bias_data[p], vl); + } + + int n = num_input; + while (n > 0) + { + vfloat32m2_t _val = __riscv_vfwcvt_f_f_v_f32m2(__riscv_vle16_v_f16m1(m, vl), vl); + + _sum = __riscv_vfmacc_vf_f32m2(_sum, *kptr, _val, vl); + + m += packn; + kptr += 1; + n -= 1; + } + + _sum = activation_ps(_sum, activation_type, activation_params, vl); + + __riscv_vse16_v_f16m1(outptr, __riscv_vfncvt_f_f_w_f16m1(_sum, vl), vl); + outptr += packn; + } + } + + if (elempack == 1 && num_output_elempack == 1) + { + __fp16* outptr = top_blob.row<__fp16>(j); + + for (int p = 0; p < num_output; p++) + { + const __fp16* kptr = (const __fp16*)weight_data_tm + num_input * p; + const __fp16* m = bottom_blob.row(j); + + float sum = 0.f; + + if (bias_term) + { + sum = bias_data[p]; + } + + for (int i = 0; i < num_input; i++) + { + sum += (float)m[i] * (float)kptr[i]; + } + + sum = activation_ss(sum, activation_type, activation_params); + + outptr[0] = (__fp16)sum; + outptr += 1; + } + } + } + + return 0; + } + + // flatten + Mat bottom_blob_flattened = bottom_blob; + if (bottom_blob.dims != 1) + { + Option opt_flatten = opt; + opt_flatten.blob_allocator = opt.workspace_allocator; + + flatten->forward(bottom_blob, bottom_blob_flattened, opt_flatten); + } + + size_t elemsize = bottom_blob_flattened.elemsize; + int elempack = bottom_blob_flattened.elempack; + + int out_elempack = opt.use_packing_layout && num_output % packn == 0 ? 
packn : 1; + size_t out_elemsize = elemsize / elempack * out_elempack; + + top_blob.create(num_output / out_elempack, out_elemsize, out_elempack, opt.blob_allocator); + if (top_blob.empty()) + return -100; + + if (out_elempack == packn) + { + // num_output + #pragma omp parallel for num_threads(opt.num_threads) + for (int p = 0; p < num_output / out_elempack; p++) + { + const size_t vl = __riscv_vsetvl_e16m1(packn); + vfloat32m2_t _sum = __riscv_vfmv_v_f_f32m2(0.f, vl); + + if (bias_term) + { + _sum = __riscv_vle32_v_f32m2((const float*)bias_data + p * packn, vl); + } + + const __fp16* kptr = weight_data_tm.row(p); + + const __fp16* sptr = bottom_blob_flattened; + + int n = num_input; + while (n > 0) + { + vfloat32m2_t _w = __riscv_vfwcvt_f_f_v_f32m2(__riscv_vle16_v_f16m1(kptr, vl), vl); + + _sum = __riscv_vfmacc_vf_f32m2(_sum, (float)(*sptr), _w, vl); + + sptr += 1; + kptr += packn; + n -= 1; + } + + _sum = activation_ps(_sum, activation_type, activation_params, vl); + + __fp16* outptr = (__fp16*)top_blob; + __riscv_vse16_v_f16m1(outptr + p * packn, __riscv_vfncvt_f_f_w_f16m1(_sum, vl), vl); + } + } + + if (out_elempack == 1) + { + // num_output + #pragma omp parallel for num_threads(opt.num_threads) + for (int p = 0; p < num_output; p++) + { + float sum = 0.f; + + if (bias_term) + sum = bias_data[p]; + + const __fp16* kptr = weight_data_tm.row<__fp16>(p); + + const __fp16* sptr = bottom_blob_flattened; + + int i = 0; + for (; i < num_input; i++) + { + float v = (float)(*sptr); + float k = (float)(*kptr); + + sum += v * k; + + sptr++; + kptr++; + } + + sum = activation_ss(sum, activation_type, activation_params); + + __fp16* outptr = (__fp16*)top_blob; + outptr[p] = (__fp16)sum; + } + } + + return 0; +} + +int InnerProduct_riscv::forward_fp16sa(const Mat& bottom_blob, Mat& top_blob, const Option& opt) const +{ + const int packn = csrr_vlenb() / 2; + + const int num_input = weight_data_size / num_output; + + if (bottom_blob.dims == 2 && bottom_blob.w == num_input) + { + // gemm + int h = bottom_blob.h; + size_t elemsize = bottom_blob.elemsize; + int elempack = bottom_blob.elempack; + + top_blob.create(num_output, h, elemsize, elempack, opt.blob_allocator); + if (top_blob.empty()) + return -100; + + int num_output_elempack = opt.use_packing_layout && num_output % packn == 0 ? 
packn : 1; + + #pragma omp parallel for num_threads(opt.num_threads) + for (int j = 0; j < h; j++) + { + if (elempack == packn && num_output_elempack == packn) + { + const size_t vl = __riscv_vsetvl_e16m1(packn); + + __fp16* outptr = top_blob.row<__fp16>(j); + + for (int p = 0; p < num_output / num_output_elempack; p++) + { + for (int l = 0; l < packn; l++) + { + const __fp16* kptr = (const __fp16*)weight_data_tm + num_input * p * packn + l; + const __fp16* m = bottom_blob.row(j); + + vfloat16m1_t _sum = __riscv_vfmv_v_f_f16m1((__fp16)0.f, vl); + + if (bias_term) + { + _sum = __riscv_vfmv_v_f_f16m1(((const __fp16*)bias_data_fp16)[p * packn + l], vl); + } + + int n = num_input; + while (n > 0) + { + vfloat16m1_t _val = __riscv_vle16_v_f16m1(m, vl); + + _sum = __riscv_vfmacc_vf_f16m1(_sum, *kptr, _val, vl); + + m += packn; + kptr += packn; + n -= 1; + } + + _sum = activation_ps(_sum, activation_type, activation_params, vl); + + __riscv_vse16_v_f16m1(outptr, _sum, vl); + outptr += packn; + } + } + } + + if (elempack == 1 && num_output_elempack == packn) + { + const size_t vl = __riscv_vsetvl_e16m1(packn); + + __fp16* outptr = top_blob.row<__fp16>(j); + + for (int p = 0; p < num_output / num_output_elempack; p++) + { + const __fp16* kptr = (const __fp16*)weight_data_tm + num_input * p * packn; + const __fp16* m = bottom_blob.row(j); + + vfloat16m1_t _sum = __riscv_vfmv_v_f_f16m1(0.f, vl); + + if (bias_term) + { + _sum = __riscv_vle16_v_f16m1((const __fp16*)bias_data_fp16 + p * packn, vl); + } + + int n = num_input; + while (n > 0) + { + vfloat16m1_t _w = __riscv_vle16_v_f16m1(kptr, vl); + + _sum = __riscv_vfmacc_vf_f16m1(_sum, *m, _w, vl); + + m += 1; + kptr += packn; + n -= 1; + } + + _sum = activation_ps(_sum, activation_type, activation_params, vl); + + __riscv_vse16_v_f16m1(outptr, _sum, vl); + outptr += packn; + } + } + + if (elempack == packn && num_output_elempack == 1) + { + const size_t vl = __riscv_vsetvl_e16m1(packn); + + __fp16* outptr = top_blob.row<__fp16>(j); + + for (int p = 0; p < num_output; p++) + { + const __fp16* kptr = (const __fp16*)weight_data_tm + num_input * p; + const __fp16* m = bottom_blob.row(j); + + vfloat16m1_t _sum = __riscv_vfmv_v_f_f16m1(0.f, vl); + + if (bias_term) + { + _sum = __riscv_vfmv_v_f_f16m1(((const __fp16*)bias_data_fp16)[p], vl); + } + + int n = num_input; + while (n > 0) + { + vfloat16m1_t _val = __riscv_vle16_v_f16m1(m, vl); + + _sum = __riscv_vfmacc_vf_f16m1(_sum, *kptr, _val, vl); + + m += packn; + kptr += 1; + n -= 1; + } + + _sum = activation_ps(_sum, activation_type, activation_params, vl); + + __riscv_vse16_v_f16m1(outptr, _sum, vl); + outptr += packn; + } + } + + if (elempack == 1 && num_output_elempack == 1) + { + __fp16* outptr = top_blob.row<__fp16>(j); + + for (int p = 0; p < num_output; p++) + { + const __fp16* kptr = (const __fp16*)weight_data_tm + num_input * p; + const __fp16* m = bottom_blob.row(j); + + float sum = 0.f; + + if (bias_term) + { + sum = bias_data[p]; + } + + for (int i = 0; i < num_input; i++) + { + sum += (float)(m[i] * kptr[i]); + } + + sum = activation_ss(sum, activation_type, activation_params); + + outptr[0] = (__fp16)sum; + outptr += 1; + } + } + } + + return 0; + } + + // flatten + Mat bottom_blob_flattened = bottom_blob; + if (bottom_blob.dims != 1) + { + Option opt_flatten = opt; + opt_flatten.blob_allocator = opt.workspace_allocator; + + flatten->forward(bottom_blob, bottom_blob_flattened, opt_flatten); + } + + size_t elemsize = bottom_blob_flattened.elemsize; + int elempack = 
bottom_blob_flattened.elempack; + + int out_elempack = opt.use_packing_layout && num_output % packn == 0 ? packn : 1; + size_t out_elemsize = elemsize / elempack * out_elempack; + + top_blob.create(num_output / out_elempack, out_elemsize, out_elempack, opt.blob_allocator); + if (top_blob.empty()) + return -100; + + if (out_elempack == packn) + { + // num_output + #pragma omp parallel for num_threads(opt.num_threads) + for (int p = 0; p < num_output / out_elempack; p++) + { + const size_t vl = __riscv_vsetvl_e16m1(packn); + vfloat16m1_t _sum = __riscv_vfmv_v_f_f16m1(0.f, vl); + + if (bias_term) + { + _sum = __riscv_vle16_v_f16m1((const __fp16*)bias_data_fp16 + p * packn, vl); + } + + const __fp16* kptr = weight_data_tm.row(p); + + const __fp16* sptr = bottom_blob_flattened; + + int n = num_input; + while (n > 0) + { + vfloat16m1_t _w = __riscv_vle16_v_f16m1(kptr, vl); + + _sum = __riscv_vfmacc_vf_f16m1(_sum, *sptr, _w, vl); + + sptr += 1; + kptr += packn; + n -= 1; + } + + _sum = activation_ps(_sum, activation_type, activation_params, vl); + + __fp16* outptr = (__fp16*)top_blob; + __riscv_vse16_v_f16m1(outptr + p * packn, _sum, vl); + } + } + + if (out_elempack == 1) + { + // num_output + #pragma omp parallel for num_threads(opt.num_threads) + for (int p = 0; p < num_output; p++) + { + float sum = 0.f; + + if (bias_term) + sum = bias_data[p]; + + const __fp16* kptr = weight_data_tm.row<__fp16>(p); + + const __fp16* sptr = bottom_blob_flattened; + + int i = 0; + for (; i < num_input; i++) + { + __fp16 v = *sptr; + __fp16 k = *kptr; + + sum += (float)(v * k); + + sptr++; + kptr++; + } + + sum = activation_ss(sum, activation_type, activation_params); + + __fp16* outptr = (__fp16*)top_blob; + outptr[p] = (__fp16)sum; + } + } + + return 0; +} +#endif // __riscv_zvfh + +} // namespace ncnn diff --git a/src/layer/riscv/interp_riscv.cpp b/src/layer/riscv/interp_riscv.cpp index fca4ba18ac3f..0d7e9d7cd2df 100644 --- a/src/layer/riscv/interp_riscv.cpp +++ b/src/layer/riscv/interp_riscv.cpp @@ -27,20 +27,14 @@ namespace ncnn { #if __riscv_vector #include "interp_bicubic_packn.h" #include "interp_bilinear_packn.h" -#if __riscv_zvfh -#include "interp_bicubic_fp16s.h" -#include "interp_bicubic_packn_fp16s.h" -#include "interp_bilinear_fp16s.h" -#include "interp_bilinear_packn_fp16s.h" -#endif #endif Interp_riscv::Interp_riscv() { #if __riscv_vector support_packing = true; -#if __riscv_zvfh - support_fp16_storage = true; +#if NCNN_ZVFH + support_fp16_storage = cpu_support_riscv_zvfh(); #endif #endif // __riscv_vector } @@ -51,9 +45,9 @@ int Interp_riscv::forward(const std::vector& bottom_blobs, std::vector const Mat& reference_blob = bottom_blobs[1]; Mat& top_blob = top_blobs[0]; +#if NCNN_ZVFH int elembits = bottom_blob.elembits(); -#if __riscv_vector && __riscv_zvfh if (opt.use_fp16_storage && elembits == 16) { if (opt.use_fp16_arithmetic) @@ -489,729 +483,4 @@ int Interp_riscv::forward(const std::vector& bottom_blobs, std::vector return 0; } -#if __riscv_vector && __riscv_zvfh -int Interp_riscv::forward_fp16s(const std::vector& bottom_blobs, std::vector& top_blobs, const Option& opt) const -{ - const int packn = csrr_vlenb() / 2; - - const Mat& bottom_blob = bottom_blobs[0]; - const Mat& reference_blob = bottom_blobs[1]; - Mat& top_blob = top_blobs[0]; - - int h = bottom_blob.h; - int w = bottom_blob.w; - int channels = bottom_blob.c; - int dims = bottom_blob.dims; - size_t elemsize = bottom_blob.elemsize; - int elempack = bottom_blob.elempack; - - int outw = reference_blob.w; - int outh = 
reference_blob.h; - - if (dims == 1) - { - top_blob.create(outw, outh, w, elemsize, elempack, opt.blob_allocator); - if (top_blob.empty()) - return -100; - - if (elempack == packn) - { - const size_t vl = __riscv_vsetvl_e16m1(packn); - - #pragma omp parallel for num_threads(opt.num_threads) - for (int q = 0; q < w; q++) - { - Mat top_blob_c = top_blob.channel(q); - vfloat16m1_t _v = __riscv_vle16_v_f16m1((const __fp16*)bottom_blob + q * packn, vl); - top_blob_c.fill(_v); - } - - return 0; - } - - #pragma omp parallel for num_threads(opt.num_threads) - for (int q = 0; q < w; q++) - { - Mat top_blob_c = top_blob.channel(q); - const __fp16* ptr = bottom_blob; - top_blob_c.fill(ptr[q]); - } - - return 0; - } - - if (dims == 2) - { - if (outw == w) - { - top_blob = bottom_blob; - return 0; - } - - top_blob.create(outw, h, elemsize, elempack, opt.blob_allocator); - if (top_blob.empty()) - return -100; - - if (elempack == packn) - { - if (resize_type == 1) // nearest - { - const size_t vl = __riscv_vsetvl_e16m1(packn); - - const float ws = output_width ? w / (float)outw : 1.f / width_scale; - - #pragma omp parallel for num_threads(opt.num_threads) - for (int y = 0; y < h; y++) - { - const __fp16* ptr = bottom_blob.row(y); - __fp16* outptr = top_blob.row<__fp16>(y); - for (int x = 0; x < outw; x++) - { - int in_x = std::min((int)(x * ws), (w - 1)); - - vfloat16m1_t _p = __riscv_vle16_v_f16m1(ptr + in_x * packn, vl); - __riscv_vse16_v_f16m1(outptr, _p, vl); - - outptr += packn; - } - } - } - - if (resize_type == 2) // bilinear - { - const size_t vl = __riscv_vsetvl_e16m1(packn); - - int* buf = new int[outw + outw * packn]; - - int* xofs = buf; - float* alpha = (float*)(buf + outw); - - linear_coeffs(w, outw, xofs, alpha, align_corner); - - #pragma omp parallel for num_threads(opt.num_threads) - for (int y = 0; y < h; y++) - { - const __fp16* ptr = bottom_blob.row(y); - __fp16* outptr = top_blob.row<__fp16>(y); - const float* alphap = alpha; - - for (int x = 0; x < outw; x++) - { - int sx = xofs[x] * packn; - const __fp16* Sp = ptr + sx; - - vfloat16m1_t _S0 = __riscv_vle16_v_f16m1(Sp, vl); - vfloat16m1_t _S1 = __riscv_vle16_v_f16m1(Sp + packn, vl); - vfloat32m2_t _p = __riscv_vfwmacc_vf_f32m2(__riscv_vfwmul_vf_f32m2(_S0, alphap[0], vl), alphap[1], _S1, vl); - - __riscv_vse16_v_f16m1(outptr, __riscv_vfncvt_f_f_w_f16m1(_p, vl), vl); - - alphap += 2; - outptr += packn; - } - } - - delete[] buf; - } - - if (resize_type == 3) // bicubic - { - const size_t vl = __riscv_vsetvl_e16m1(packn); - - int* buf = new int[outw + outw * packn]; - - int* xofs = buf; - float* alpha = (float*)(buf + outw); - - cubic_coeffs(w, outw, xofs, alpha, align_corner); - - #pragma omp parallel for num_threads(opt.num_threads) - for (int y = 0; y < h; y++) - { - const __fp16* ptr = bottom_blob.row(y); - __fp16* outptr = top_blob.row<__fp16>(y); - const float* alphap = alpha; - - for (int x = 0; x < outw; x++) - { - int sx = xofs[x] * packn; - const __fp16* Sp = ptr + sx; - - vfloat16m1_t _S0 = __riscv_vle16_v_f16m1(Sp - packn, vl); - vfloat16m1_t _S1 = __riscv_vle16_v_f16m1(Sp, vl); - vfloat16m1_t _S2 = __riscv_vle16_v_f16m1(Sp + packn, vl); - vfloat16m1_t _S3 = __riscv_vle16_v_f16m1(Sp + packn * 2, vl); - vfloat32m2_t _p = __riscv_vfwmacc_vf_f32m2(__riscv_vfwmacc_vf_f32m2(__riscv_vfwmacc_vf_f32m2(__riscv_vfwmul_vf_f32m2(_S0, alphap[0], vl), alphap[1], _S1, vl), alphap[2], _S2, vl), alphap[3], _S3, vl); - - __riscv_vse16_v_f16m1(outptr, __riscv_vfncvt_f_f_w_f16m1(_p, vl), vl); - - alphap += 4; - outptr += packn; - } - } - - 
delete[] buf; - } - - return 0; - } - - if (resize_type == 1) // nearest - { - const float ws = output_width ? w / (float)outw : 1.f / width_scale; - - #pragma omp parallel for num_threads(opt.num_threads) - for (int y = 0; y < h; y++) - { - const __fp16* ptr = bottom_blob.row(y); - __fp16* outptr = top_blob.row<__fp16>(y); - for (int x = 0; x < outw; x++) - { - int in_x = std::min((int)(x * ws), (w - 1)); - *outptr++ = ptr[in_x]; - } - } - } - - if (resize_type == 2) // bilinear - { - int* buf = new int[outw + outw * 2]; - - int* xofs = buf; - float* alpha = (float*)(buf + outw); - - linear_coeffs(w, outw, xofs, alpha, align_corner); - - #pragma omp parallel for num_threads(opt.num_threads) - for (int y = 0; y < h; y++) - { - const __fp16* ptr = bottom_blob.row(y); - __fp16* outptr = top_blob.row<__fp16>(y); - const float* alphap = alpha; - - for (int x = 0; x < outw; x++) - { - int sx = xofs[x]; - const __fp16* Sp = ptr + sx; - float a0 = alphap[0]; - float a1 = alphap[1]; - *outptr++ = (__fp16)((float)Sp[0] * a0 + (float)Sp[1] * a1); - alphap += 2; - } - } - - delete[] buf; - } - - if (resize_type == 3) // bicubic - { - int* buf = new int[outw + outw * 4]; - - int* xofs = buf; - float* alpha = (float*)(buf + outw); - - cubic_coeffs(w, outw, xofs, alpha, align_corner); - - #pragma omp parallel for num_threads(opt.num_threads) - for (int y = 0; y < h; y++) - { - const __fp16* ptr = bottom_blob.row(y); - __fp16* outptr = top_blob.row<__fp16>(y); - const float* alphap = alpha; - - for (int x = 0; x < outw; x++) - { - int sx = xofs[x]; - const __fp16* Sp = ptr + sx; - float a0 = alphap[0]; - float a1 = alphap[1]; - float a2 = alphap[2]; - float a3 = alphap[3]; - *outptr++ = (__fp16)((float)Sp[-1] * a0 + (float)Sp[0] * a1 + (float)Sp[1] * a2 + (float)Sp[2] * a3); - alphap += 4; - } - } - - delete[] buf; - } - - return 0; - } - - if (outw == w && outh == h) - { - top_blob = bottom_blob; - return 0; - } - - top_blob.create(outw, outh, channels, elemsize, elempack, opt.blob_allocator); - if (top_blob.empty()) - return -100; - - if (elempack == packn) - { - if (resize_type == 1) // nearest - { - const size_t vl = __riscv_vsetvl_e16m1(packn); - - const float hs = output_height ? h / (float)outh : 1.f / height_scale; - const float ws = output_width ? 
w / (float)outw : 1.f / width_scale; - - #pragma omp parallel for num_threads(opt.num_threads) - for (int q = 0; q < channels; q++) - { - const Mat src = bottom_blob.channel(q); - Mat dst = top_blob.channel(q); - - for (int y = 0; y < outh; y++) - { - int in_y = std::min((int)(y * hs), (h - 1)); - - const __fp16* ptr = src.row(in_y); - __fp16* outptr = dst.row<__fp16>(y); - for (int x = 0; x < outw; x++) - { - int in_x = std::min((int)(x * ws), (w - 1)); - - vfloat16m1_t _p = __riscv_vle16_v_f16m1(ptr + in_x * packn, vl); - __riscv_vse16_v_f16m1(outptr, _p, vl); - - outptr += packn; - } - } - } - } - - if (resize_type == 2) // bilinear - { - int* buf = new int[outw + outh + outw * 2 + outh * 2]; - - int* xofs = buf; //new int[outw]; - int* yofs = buf + outw; //new int[outh]; - - float* alpha = (float*)(buf + outw + outh); //new float[outw * 2]; - float* beta = (float*)(buf + outw + outh + outw * 2); //new float[outh * 2]; - - linear_coeffs(w, outw, xofs, alpha, align_corner); - linear_coeffs(h, outh, yofs, beta, align_corner); - - #pragma omp parallel for num_threads(opt.num_threads) - for (int q = 0; q < channels; q++) - { - const Mat src = bottom_blob.channel(q); - Mat dst = top_blob.channel(q); - - resize_bilinear_image_packn_fp16s(src, dst, alpha, xofs, beta, yofs); - } - - delete[] buf; - } - - if (resize_type == 3) // bicubic - { - int* buf = new int[outw + outh + outw * 4 + outh * 4]; - - int* xofs = buf; //new int[outw]; - int* yofs = buf + outw; //new int[outh]; - - float* alpha = (float*)(buf + outw + outh); //new float[outw * 4]; - float* beta = (float*)(buf + outw + outh + outw * 4); //new float[outh * 4]; - - cubic_coeffs(w, outw, xofs, alpha, align_corner); - cubic_coeffs(h, outh, yofs, beta, align_corner); - - #pragma omp parallel for num_threads(opt.num_threads) - for (int q = 0; q < channels; q++) - { - const Mat src = bottom_blob.channel(q); - Mat dst = top_blob.channel(q); - - resize_bicubic_image_packn_fp16s(src, dst, alpha, xofs, beta, yofs); - } - - delete[] buf; - } - - return 0; - } - - if (resize_type == 1) // nearest - { - const float hs = output_height ? h / (float)outh : 1.f / height_scale; - const float ws = output_width ? 
w / (float)outw : 1.f / width_scale; - - #pragma omp parallel for num_threads(opt.num_threads) - for (int q = 0; q < channels; q++) - { - const Mat src = bottom_blob.channel(q); - Mat dst = top_blob.channel(q); - - for (int y = 0; y < outh; y++) - { - int in_y = std::min((int)(y * hs), (h - 1)); - - const __fp16* ptr = src.row(in_y); - __fp16* outptr = dst.row<__fp16>(y); - for (int x = 0; x < outw; x++) - { - int in_x = std::min((int)(x * ws), (w - 1)); - *outptr++ = ptr[in_x]; - } - } - } - } - - if (resize_type == 2) // bilinear - { - int* buf = new int[outw + outh + outw * 2 + outh * 2]; - - int* xofs = buf; //new int[outw]; - int* yofs = buf + outw; //new int[outh]; - - float* alpha = (float*)(buf + outw + outh); //new float[outw * 2]; - float* beta = (float*)(buf + outw + outh + outw * 2); //new float[outh * 2]; - - linear_coeffs(w, outw, xofs, alpha, align_corner); - linear_coeffs(h, outh, yofs, beta, align_corner); - - #pragma omp parallel for num_threads(opt.num_threads) - for (int q = 0; q < channels; q++) - { - const Mat src = bottom_blob.channel(q); - Mat dst = top_blob.channel(q); - - resize_bilinear_image_fp16s(src, dst, alpha, xofs, beta, yofs); - } - - delete[] buf; - } - - if (resize_type == 3) // bicubic - { - int* buf = new int[outw + outh + outw * 4 + outh * 4]; - - int* xofs = buf; //new int[outw]; - int* yofs = buf + outw; //new int[outh]; - - float* alpha = (float*)(buf + outw + outh); //new float[outw * 4]; - float* beta = (float*)(buf + outw + outh + outw * 4); //new float[outh * 4]; - - cubic_coeffs(w, outw, xofs, alpha, align_corner); - cubic_coeffs(h, outh, yofs, beta, align_corner); - - #pragma omp parallel for num_threads(opt.num_threads) - for (int q = 0; q < channels; q++) - { - const Mat src = bottom_blob.channel(q); - Mat dst = top_blob.channel(q); - - resize_bicubic_image_fp16s(src, dst, alpha, xofs, beta, yofs); - } - - delete[] buf; - } - - return 0; -} - -int Interp_riscv::forward_fp16sa(const std::vector& bottom_blobs, std::vector& top_blobs, const Option& opt) const -{ - const int packn = csrr_vlenb() / 2; - - const Mat& bottom_blob = bottom_blobs[0]; - const Mat& reference_blob = bottom_blobs[1]; - Mat& top_blob = top_blobs[0]; - - int h = bottom_blob.h; - int w = bottom_blob.w; - int channels = bottom_blob.c; - int dims = bottom_blob.dims; - size_t elemsize = bottom_blob.elemsize; - int elempack = bottom_blob.elempack; - - int outw = reference_blob.w; - int outh = reference_blob.h; - - if (dims == 1 || resize_type == 1) // nearest - { - return forward_fp16s(bottom_blobs, top_blobs, opt); - } - - if (dims == 2) - { - if (outw == w) - { - top_blob = bottom_blob; - return 0; - } - - top_blob.create(outw, h, elemsize, elempack, opt.blob_allocator); - if (top_blob.empty()) - return -100; - - if (elempack == packn) - { - if (resize_type == 2) // bilinear - { - const size_t vl = __riscv_vsetvl_e16m1(packn); - - int* buf = new int[outw + outw * packn]; - - int* xofs = buf; - __fp16* alpha = (__fp16*)(buf + outw); - - linear_coeffs_fp16sa(w, outw, xofs, alpha, align_corner); - - #pragma omp parallel for num_threads(opt.num_threads) - for (int y = 0; y < h; y++) - { - const __fp16* ptr = bottom_blob.row(y); - __fp16* outptr = top_blob.row<__fp16>(y); - const __fp16* alphap = alpha; - - for (int x = 0; x < outw; x++) - { - int sx = xofs[x] * packn; - const __fp16* Sp = ptr + sx; - - vfloat16m1_t _S0 = __riscv_vle16_v_f16m1(Sp, vl); - vfloat16m1_t _S1 = __riscv_vle16_v_f16m1(Sp + packn, vl); - vfloat16m1_t _p = 
__riscv_vfmacc_vf_f16m1(__riscv_vfmul_vf_f16m1(_S0, alphap[0], vl), alphap[1], _S1, vl); - - __riscv_vse16_v_f16m1(outptr, _p, vl); - - alphap += 2; - outptr += packn; - } - } - - delete[] buf; - } - - if (resize_type == 3) // bicubic - { - const size_t vl = __riscv_vsetvl_e16m1(packn); - - int* buf = new int[outw + outw * packn]; - - int* xofs = buf; - __fp16* alpha = (__fp16*)(buf + outw); - - cubic_coeffs_fp16sa(w, outw, xofs, alpha, align_corner); - - #pragma omp parallel for num_threads(opt.num_threads) - for (int y = 0; y < h; y++) - { - const __fp16* ptr = bottom_blob.row(y); - __fp16* outptr = top_blob.row<__fp16>(y); - const __fp16* alphap = alpha; - - for (int x = 0; x < outw; x++) - { - int sx = xofs[x] * packn; - const __fp16* Sp = ptr + sx; - - vfloat16m1_t _S0 = __riscv_vle16_v_f16m1(Sp - packn, vl); - vfloat16m1_t _S1 = __riscv_vle16_v_f16m1(Sp, vl); - vfloat16m1_t _S2 = __riscv_vle16_v_f16m1(Sp + packn, vl); - vfloat16m1_t _S3 = __riscv_vle16_v_f16m1(Sp + packn * 2, vl); - vfloat16m1_t _p = __riscv_vfmacc_vf_f16m1(__riscv_vfmacc_vf_f16m1(__riscv_vfmacc_vf_f16m1(__riscv_vfmul_vf_f16m1(_S0, alphap[0], vl), alphap[1], _S1, vl), alphap[2], _S2, vl), alphap[3], _S3, vl); - - __riscv_vse16_v_f16m1(outptr, _p, vl); - - alphap += 4; - outptr += packn; - } - } - - delete[] buf; - } - - return 0; - } - - if (resize_type == 2) // bilinear - { - int* buf = new int[outw + outw * 2]; - - int* xofs = buf; - __fp16* alpha = (__fp16*)(buf + outw); - - linear_coeffs_fp16sa(w, outw, xofs, alpha, align_corner); - - #pragma omp parallel for num_threads(opt.num_threads) - for (int y = 0; y < h; y++) - { - const __fp16* ptr = bottom_blob.row(y); - __fp16* outptr = top_blob.row<__fp16>(y); - const __fp16* alphap = alpha; - - for (int x = 0; x < outw; x++) - { - int sx = xofs[x]; - const __fp16* Sp = ptr + sx; - __fp16 a0 = alphap[0]; - __fp16 a1 = alphap[1]; - *outptr++ = Sp[0] * a0 + Sp[1] * a1; - alphap += 2; - } - } - - delete[] buf; - } - - if (resize_type == 3) // bicubic - { - int* buf = new int[outw + outw * 4]; - - int* xofs = buf; - __fp16* alpha = (__fp16*)(buf + outw); - - cubic_coeffs_fp16sa(w, outw, xofs, alpha, align_corner); - - #pragma omp parallel for num_threads(opt.num_threads) - for (int y = 0; y < h; y++) - { - const __fp16* ptr = bottom_blob.row(y); - __fp16* outptr = top_blob.row<__fp16>(y); - const __fp16* alphap = alpha; - - for (int x = 0; x < outw; x++) - { - int sx = xofs[x]; - const __fp16* Sp = ptr + sx; - __fp16 a0 = alphap[0]; - __fp16 a1 = alphap[1]; - __fp16 a2 = alphap[2]; - __fp16 a3 = alphap[3]; - *outptr++ = Sp[-1] * a0 + Sp[0] * a1 + Sp[1] * a2 + Sp[2] * a3; - alphap += 4; - } - } - - delete[] buf; - } - - return 0; - } - - if (outw == w && outh == h) - { - top_blob = bottom_blob; - return 0; - } - - top_blob.create(outw, outh, channels, elemsize, elempack, opt.blob_allocator); - if (top_blob.empty()) - return -100; - - if (elempack == packn) - { - if (resize_type == 2) // bilinear - { - int* buf = new int[outw + outh + outw * 2 + outh * 2]; - - int* xofs = buf; //new int[outw]; - int* yofs = buf + outw; //new int[outh]; - - __fp16* alpha = (__fp16*)(buf + outw + outh); //new __fp16[outw * 2]; - __fp16* beta = (__fp16*)(buf + outw + outh + outw * 2); //new __fp16[outh * 2]; - - linear_coeffs_fp16sa(w, outw, xofs, alpha, align_corner); - linear_coeffs_fp16sa(h, outh, yofs, beta, align_corner); - - #pragma omp parallel for num_threads(opt.num_threads) - for (int q = 0; q < channels; q++) - { - const Mat src = bottom_blob.channel(q); - Mat dst = 
top_blob.channel(q);
-
-                resize_bilinear_image_packn_fp16sa(src, dst, alpha, xofs, beta, yofs);
-            }
-
-            delete[] buf;
-        }
-
-        if (resize_type == 3) // bicubic
-        {
-            int* buf = new int[outw + outh + outw * 4 + outh * 4];
-
-            int* xofs = buf;        //new int[outw];
-            int* yofs = buf + outw; //new int[outh];
-
-            __fp16* alpha = (__fp16*)(buf + outw + outh);           //new __fp16[outw * 4];
-            __fp16* beta = (__fp16*)(buf + outw + outh + outw * 4); //new __fp16[outh * 4];
-
-            cubic_coeffs_fp16sa(w, outw, xofs, alpha, align_corner);
-            cubic_coeffs_fp16sa(h, outh, yofs, beta, align_corner);
-
-            #pragma omp parallel for num_threads(opt.num_threads)
-            for (int q = 0; q < channels; q++)
-            {
-                const Mat src = bottom_blob.channel(q);
-                Mat dst = top_blob.channel(q);
-
-                resize_bicubic_image_packn_fp16sa(src, dst, alpha, xofs, beta, yofs);
-            }
-
-            delete[] buf;
-        }
-
-        return 0;
-    }
-
-    if (resize_type == 2) // bilinear
-    {
-        int* buf = new int[outw + outh + outw * 2 + outh * 2];
-
-        int* xofs = buf;        //new int[outw];
-        int* yofs = buf + outw; //new int[outh];
-
-        __fp16* alpha = (__fp16*)(buf + outw + outh);           //new __fp16[outw * 2];
-        __fp16* beta = (__fp16*)(buf + outw + outh + outw * 2); //new __fp16[outh * 2];
-
-        linear_coeffs_fp16sa(w, outw, xofs, alpha, align_corner);
-        linear_coeffs_fp16sa(h, outh, yofs, beta, align_corner);
-
-        #pragma omp parallel for num_threads(opt.num_threads)
-        for (int q = 0; q < channels; q++)
-        {
-            const Mat src = bottom_blob.channel(q);
-            Mat dst = top_blob.channel(q);
-
-            resize_bilinear_image_fp16sa(src, dst, alpha, xofs, beta, yofs);
-        }
-
-        delete[] buf;
-    }
-
-    if (resize_type == 3) // bicubic
-    {
-        int* buf = new int[outw + outh + outw * 4 + outh * 4];
-
-        int* xofs = buf;        //new int[outw];
-        int* yofs = buf + outw; //new int[outh];
-
-        __fp16* alpha = (__fp16*)(buf + outw + outh);           //new __fp16[outw * 4];
-        __fp16* beta = (__fp16*)(buf + outw + outh + outw * 4); //new __fp16[outh * 4];
-
-        cubic_coeffs_fp16sa(w, outw, xofs, alpha, align_corner);
-        cubic_coeffs_fp16sa(h, outh, yofs, beta, align_corner);
-
-        #pragma omp parallel for num_threads(opt.num_threads)
-        for (int q = 0; q < channels; q++)
-        {
-            const Mat src = bottom_blob.channel(q);
-            Mat dst = top_blob.channel(q);
-
-            resize_bicubic_image_fp16sa(src, dst, alpha, xofs, beta, yofs);
-        }
-
-        delete[] buf;
-    }
-
-    return 0;
-}
-#endif // __riscv_vector && __riscv_zvfh
-
 } // namespace ncnn
diff --git a/src/layer/riscv/interp_riscv.h b/src/layer/riscv/interp_riscv.h
index c361c0b6454f..4b79c755edcb 100644
--- a/src/layer/riscv/interp_riscv.h
+++ b/src/layer/riscv/interp_riscv.h
@@ -27,7 +27,7 @@ class Interp_riscv : public Interp
     virtual int forward(const std::vector<Mat>& bottom_blobs, std::vector<Mat>& top_blobs, const Option& opt) const;
 
 protected:
-#if __riscv_vector && __riscv_zvfh
+#if NCNN_ZVFH
     int forward_fp16s(const std::vector<Mat>& bottom_blobs, std::vector<Mat>& top_blobs, const Option& opt) const;
     int forward_fp16sa(const std::vector<Mat>& bottom_blobs, std::vector<Mat>& top_blobs, const Option& opt) const;
 #endif
diff --git a/src/layer/riscv/interp_riscv_zvfh.cpp b/src/layer/riscv/interp_riscv_zvfh.cpp
new file mode 100644
index 000000000000..7ebbcc236aff
--- /dev/null
+++ b/src/layer/riscv/interp_riscv_zvfh.cpp
@@ -0,0 +1,761 @@
+// Tencent is pleased to support the open source community by making ncnn available.
+//
+// Copyright (C) 2024 THL A29 Limited, a Tencent company. All rights reserved.
+//
+// Licensed under the BSD 3-Clause License (the "License"); you may not use this file except
+// in compliance with the License. You may obtain a copy of the License at
+//
+// https://opensource.org/licenses/BSD-3-Clause
+//
+// Unless required by applicable law or agreed to in writing, software distributed
+// under the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR
+// CONDITIONS OF ANY KIND, either express or implied. See the License for the
+// specific language governing permissions and limitations under the License.
+
+#include "interp_riscv.h"
+
+#if __riscv_vector
+#include <riscv_vector.h>
+#include "riscv_usability.h"
+#endif // __riscv_vector
+
+namespace ncnn {
+
+#include "interp_bicubic.h"
+#include "interp_bilinear.h"
+
+#if __riscv_vector
+#if __riscv_zvfh
+#include "interp_bicubic_fp16s.h"
+#include "interp_bicubic_packn_fp16s.h"
+#include "interp_bilinear_fp16s.h"
+#include "interp_bilinear_packn_fp16s.h"
+#endif
+#endif
+
+#if __riscv_zvfh
+int Interp_riscv::forward_fp16s(const std::vector<Mat>& bottom_blobs, std::vector<Mat>& top_blobs, const Option& opt) const
+{
+    const int packn = csrr_vlenb() / 2;
+
+    const Mat& bottom_blob = bottom_blobs[0];
+    const Mat& reference_blob = bottom_blobs[1];
+    Mat& top_blob = top_blobs[0];
+
+    int h = bottom_blob.h;
+    int w = bottom_blob.w;
+    int channels = bottom_blob.c;
+    int dims = bottom_blob.dims;
+    size_t elemsize = bottom_blob.elemsize;
+    int elempack = bottom_blob.elempack;
+
+    int outw = reference_blob.w;
+    int outh = reference_blob.h;
+
+    if (dims == 1)
+    {
+        top_blob.create(outw, outh, w, elemsize, elempack, opt.blob_allocator);
+        if (top_blob.empty())
+            return -100;
+
+        if (elempack == packn)
+        {
+            const size_t vl = __riscv_vsetvl_e16m1(packn);
+
+            #pragma omp parallel for num_threads(opt.num_threads)
+            for (int q = 0; q < w; q++)
+            {
+                Mat top_blob_c = top_blob.channel(q);
+                vfloat16m1_t _v = __riscv_vle16_v_f16m1((const __fp16*)bottom_blob + q * packn, vl);
+                top_blob_c.fill(_v);
+            }
+
+            return 0;
+        }
+
+        #pragma omp parallel for num_threads(opt.num_threads)
+        for (int q = 0; q < w; q++)
+        {
+            Mat top_blob_c = top_blob.channel(q);
+            const __fp16* ptr = bottom_blob;
+            top_blob_c.fill(ptr[q]);
+        }
+
+        return 0;
+    }
+
+    if (dims == 2)
+    {
+        if (outw == w)
+        {
+            top_blob = bottom_blob;
+            return 0;
+        }
+
+        top_blob.create(outw, h, elemsize, elempack, opt.blob_allocator);
+        if (top_blob.empty())
+            return -100;
+
+        if (elempack == packn)
+        {
+            if (resize_type == 1) // nearest
+            {
+                const size_t vl = __riscv_vsetvl_e16m1(packn);
+
+                const float ws = output_width ?
w / (float)outw : 1.f / width_scale; + + #pragma omp parallel for num_threads(opt.num_threads) + for (int y = 0; y < h; y++) + { + const __fp16* ptr = bottom_blob.row(y); + __fp16* outptr = top_blob.row<__fp16>(y); + for (int x = 0; x < outw; x++) + { + int in_x = std::min((int)(x * ws), (w - 1)); + + vfloat16m1_t _p = __riscv_vle16_v_f16m1(ptr + in_x * packn, vl); + __riscv_vse16_v_f16m1(outptr, _p, vl); + + outptr += packn; + } + } + } + + if (resize_type == 2) // bilinear + { + const size_t vl = __riscv_vsetvl_e16m1(packn); + + int* buf = new int[outw + outw * packn]; + + int* xofs = buf; + float* alpha = (float*)(buf + outw); + + linear_coeffs(w, outw, xofs, alpha, align_corner); + + #pragma omp parallel for num_threads(opt.num_threads) + for (int y = 0; y < h; y++) + { + const __fp16* ptr = bottom_blob.row(y); + __fp16* outptr = top_blob.row<__fp16>(y); + const float* alphap = alpha; + + for (int x = 0; x < outw; x++) + { + int sx = xofs[x] * packn; + const __fp16* Sp = ptr + sx; + + vfloat16m1_t _S0 = __riscv_vle16_v_f16m1(Sp, vl); + vfloat16m1_t _S1 = __riscv_vle16_v_f16m1(Sp + packn, vl); + vfloat32m2_t _p = __riscv_vfwmacc_vf_f32m2(__riscv_vfwmul_vf_f32m2(_S0, alphap[0], vl), alphap[1], _S1, vl); + + __riscv_vse16_v_f16m1(outptr, __riscv_vfncvt_f_f_w_f16m1(_p, vl), vl); + + alphap += 2; + outptr += packn; + } + } + + delete[] buf; + } + + if (resize_type == 3) // bicubic + { + const size_t vl = __riscv_vsetvl_e16m1(packn); + + int* buf = new int[outw + outw * packn]; + + int* xofs = buf; + float* alpha = (float*)(buf + outw); + + cubic_coeffs(w, outw, xofs, alpha, align_corner); + + #pragma omp parallel for num_threads(opt.num_threads) + for (int y = 0; y < h; y++) + { + const __fp16* ptr = bottom_blob.row(y); + __fp16* outptr = top_blob.row<__fp16>(y); + const float* alphap = alpha; + + for (int x = 0; x < outw; x++) + { + int sx = xofs[x] * packn; + const __fp16* Sp = ptr + sx; + + vfloat16m1_t _S0 = __riscv_vle16_v_f16m1(Sp - packn, vl); + vfloat16m1_t _S1 = __riscv_vle16_v_f16m1(Sp, vl); + vfloat16m1_t _S2 = __riscv_vle16_v_f16m1(Sp + packn, vl); + vfloat16m1_t _S3 = __riscv_vle16_v_f16m1(Sp + packn * 2, vl); + vfloat32m2_t _p = __riscv_vfwmacc_vf_f32m2(__riscv_vfwmacc_vf_f32m2(__riscv_vfwmacc_vf_f32m2(__riscv_vfwmul_vf_f32m2(_S0, alphap[0], vl), alphap[1], _S1, vl), alphap[2], _S2, vl), alphap[3], _S3, vl); + + __riscv_vse16_v_f16m1(outptr, __riscv_vfncvt_f_f_w_f16m1(_p, vl), vl); + + alphap += 4; + outptr += packn; + } + } + + delete[] buf; + } + + return 0; + } + + if (resize_type == 1) // nearest + { + const float ws = output_width ? 
w / (float)outw : 1.f / width_scale; + + #pragma omp parallel for num_threads(opt.num_threads) + for (int y = 0; y < h; y++) + { + const __fp16* ptr = bottom_blob.row(y); + __fp16* outptr = top_blob.row<__fp16>(y); + for (int x = 0; x < outw; x++) + { + int in_x = std::min((int)(x * ws), (w - 1)); + *outptr++ = ptr[in_x]; + } + } + } + + if (resize_type == 2) // bilinear + { + int* buf = new int[outw + outw * 2]; + + int* xofs = buf; + float* alpha = (float*)(buf + outw); + + linear_coeffs(w, outw, xofs, alpha, align_corner); + + #pragma omp parallel for num_threads(opt.num_threads) + for (int y = 0; y < h; y++) + { + const __fp16* ptr = bottom_blob.row(y); + __fp16* outptr = top_blob.row<__fp16>(y); + const float* alphap = alpha; + + for (int x = 0; x < outw; x++) + { + int sx = xofs[x]; + const __fp16* Sp = ptr + sx; + float a0 = alphap[0]; + float a1 = alphap[1]; + *outptr++ = (__fp16)((float)Sp[0] * a0 + (float)Sp[1] * a1); + alphap += 2; + } + } + + delete[] buf; + } + + if (resize_type == 3) // bicubic + { + int* buf = new int[outw + outw * 4]; + + int* xofs = buf; + float* alpha = (float*)(buf + outw); + + cubic_coeffs(w, outw, xofs, alpha, align_corner); + + #pragma omp parallel for num_threads(opt.num_threads) + for (int y = 0; y < h; y++) + { + const __fp16* ptr = bottom_blob.row(y); + __fp16* outptr = top_blob.row<__fp16>(y); + const float* alphap = alpha; + + for (int x = 0; x < outw; x++) + { + int sx = xofs[x]; + const __fp16* Sp = ptr + sx; + float a0 = alphap[0]; + float a1 = alphap[1]; + float a2 = alphap[2]; + float a3 = alphap[3]; + *outptr++ = (__fp16)((float)Sp[-1] * a0 + (float)Sp[0] * a1 + (float)Sp[1] * a2 + (float)Sp[2] * a3); + alphap += 4; + } + } + + delete[] buf; + } + + return 0; + } + + if (outw == w && outh == h) + { + top_blob = bottom_blob; + return 0; + } + + top_blob.create(outw, outh, channels, elemsize, elempack, opt.blob_allocator); + if (top_blob.empty()) + return -100; + + if (elempack == packn) + { + if (resize_type == 1) // nearest + { + const size_t vl = __riscv_vsetvl_e16m1(packn); + + const float hs = output_height ? h / (float)outh : 1.f / height_scale; + const float ws = output_width ? 
w / (float)outw : 1.f / width_scale; + + #pragma omp parallel for num_threads(opt.num_threads) + for (int q = 0; q < channels; q++) + { + const Mat src = bottom_blob.channel(q); + Mat dst = top_blob.channel(q); + + for (int y = 0; y < outh; y++) + { + int in_y = std::min((int)(y * hs), (h - 1)); + + const __fp16* ptr = src.row(in_y); + __fp16* outptr = dst.row<__fp16>(y); + for (int x = 0; x < outw; x++) + { + int in_x = std::min((int)(x * ws), (w - 1)); + + vfloat16m1_t _p = __riscv_vle16_v_f16m1(ptr + in_x * packn, vl); + __riscv_vse16_v_f16m1(outptr, _p, vl); + + outptr += packn; + } + } + } + } + + if (resize_type == 2) // bilinear + { + int* buf = new int[outw + outh + outw * 2 + outh * 2]; + + int* xofs = buf; //new int[outw]; + int* yofs = buf + outw; //new int[outh]; + + float* alpha = (float*)(buf + outw + outh); //new float[outw * 2]; + float* beta = (float*)(buf + outw + outh + outw * 2); //new float[outh * 2]; + + linear_coeffs(w, outw, xofs, alpha, align_corner); + linear_coeffs(h, outh, yofs, beta, align_corner); + + #pragma omp parallel for num_threads(opt.num_threads) + for (int q = 0; q < channels; q++) + { + const Mat src = bottom_blob.channel(q); + Mat dst = top_blob.channel(q); + + resize_bilinear_image_packn_fp16s(src, dst, alpha, xofs, beta, yofs); + } + + delete[] buf; + } + + if (resize_type == 3) // bicubic + { + int* buf = new int[outw + outh + outw * 4 + outh * 4]; + + int* xofs = buf; //new int[outw]; + int* yofs = buf + outw; //new int[outh]; + + float* alpha = (float*)(buf + outw + outh); //new float[outw * 4]; + float* beta = (float*)(buf + outw + outh + outw * 4); //new float[outh * 4]; + + cubic_coeffs(w, outw, xofs, alpha, align_corner); + cubic_coeffs(h, outh, yofs, beta, align_corner); + + #pragma omp parallel for num_threads(opt.num_threads) + for (int q = 0; q < channels; q++) + { + const Mat src = bottom_blob.channel(q); + Mat dst = top_blob.channel(q); + + resize_bicubic_image_packn_fp16s(src, dst, alpha, xofs, beta, yofs); + } + + delete[] buf; + } + + return 0; + } + + if (resize_type == 1) // nearest + { + const float hs = output_height ? h / (float)outh : 1.f / height_scale; + const float ws = output_width ? 
+
+        #pragma omp parallel for num_threads(opt.num_threads)
+        for (int q = 0; q < channels; q++)
+        {
+            const Mat src = bottom_blob.channel(q);
+            Mat dst = top_blob.channel(q);
+
+            for (int y = 0; y < outh; y++)
+            {
+                int in_y = std::min((int)(y * hs), (h - 1));
+
+                const __fp16* ptr = src.row<const __fp16>(in_y);
+                __fp16* outptr = dst.row<__fp16>(y);
+                for (int x = 0; x < outw; x++)
+                {
+                    int in_x = std::min((int)(x * ws), (w - 1));
+                    *outptr++ = ptr[in_x];
+                }
+            }
+        }
+    }
+
+    if (resize_type == 2) // bilinear
+    {
+        int* buf = new int[outw + outh + outw * 2 + outh * 2];
+
+        int* xofs = buf; //new int[outw];
+        int* yofs = buf + outw; //new int[outh];
+
+        float* alpha = (float*)(buf + outw + outh); //new float[outw * 2];
+        float* beta = (float*)(buf + outw + outh + outw * 2); //new float[outh * 2];
+
+        linear_coeffs(w, outw, xofs, alpha, align_corner);
+        linear_coeffs(h, outh, yofs, beta, align_corner);
+
+        #pragma omp parallel for num_threads(opt.num_threads)
+        for (int q = 0; q < channels; q++)
+        {
+            const Mat src = bottom_blob.channel(q);
+            Mat dst = top_blob.channel(q);
+
+            resize_bilinear_image_fp16s(src, dst, alpha, xofs, beta, yofs);
+        }
+
+        delete[] buf;
+    }
+
+    if (resize_type == 3) // bicubic
+    {
+        int* buf = new int[outw + outh + outw * 4 + outh * 4];
+
+        int* xofs = buf; //new int[outw];
+        int* yofs = buf + outw; //new int[outh];
+
+        float* alpha = (float*)(buf + outw + outh); //new float[outw * 4];
+        float* beta = (float*)(buf + outw + outh + outw * 4); //new float[outh * 4];
+
+        cubic_coeffs(w, outw, xofs, alpha, align_corner);
+        cubic_coeffs(h, outh, yofs, beta, align_corner);
+
+        #pragma omp parallel for num_threads(opt.num_threads)
+        for (int q = 0; q < channels; q++)
+        {
+            const Mat src = bottom_blob.channel(q);
+            Mat dst = top_blob.channel(q);
+
+            resize_bicubic_image_fp16s(src, dst, alpha, xofs, beta, yofs);
+        }
+
+        delete[] buf;
+    }
+
+    return 0;
+}
+
+int Interp_riscv::forward_fp16sa(const std::vector<Mat>& bottom_blobs, std::vector<Mat>& top_blobs, const Option& opt) const
+{
+    const int packn = csrr_vlenb() / 2;
+
+    const Mat& bottom_blob = bottom_blobs[0];
+    const Mat& reference_blob = bottom_blobs[1];
+    Mat& top_blob = top_blobs[0];
+
+    int h = bottom_blob.h;
+    int w = bottom_blob.w;
+    int channels = bottom_blob.c;
+    int dims = bottom_blob.dims;
+    size_t elemsize = bottom_blob.elemsize;
+    int elempack = bottom_blob.elempack;
+
+    int outw = reference_blob.w;
+    int outh = reference_blob.h;
+
+    if (dims == 1 || resize_type == 1) // nearest
+    {
+        return forward_fp16s(bottom_blobs, top_blobs, opt);
+    }
+
+    if (dims == 2)
+    {
+        if (outw == w)
+        {
+            top_blob = bottom_blob;
+            return 0;
+        }
+
+        top_blob.create(outw, h, elemsize, elempack, opt.blob_allocator);
+        if (top_blob.empty())
+            return -100;
+
+        if (elempack == packn)
+        {
+            if (resize_type == 2) // bilinear
+            {
+                const size_t vl = __riscv_vsetvl_e16m1(packn);
+
+                int* buf = new int[outw + outw * packn];
+
+                int* xofs = buf;
+                __fp16* alpha = (__fp16*)(buf + outw);
+
+                linear_coeffs_fp16sa(w, outw, xofs, alpha, align_corner);
+
+                #pragma omp parallel for num_threads(opt.num_threads)
+                for (int y = 0; y < h; y++)
+                {
+                    const __fp16* ptr = bottom_blob.row<const __fp16>(y);
+                    __fp16* outptr = top_blob.row<__fp16>(y);
+                    const __fp16* alphap = alpha;
+
+                    for (int x = 0; x < outw; x++)
+                    {
+                        int sx = xofs[x] * packn;
+                        const __fp16* Sp = ptr + sx;
+
+                        vfloat16m1_t _S0 = __riscv_vle16_v_f16m1(Sp, vl);
+                        vfloat16m1_t _S1 = __riscv_vle16_v_f16m1(Sp + packn, vl);
+                        vfloat16m1_t _p = __riscv_vfmacc_vf_f16m1(__riscv_vfmul_vf_f16m1(_S0, alphap[0], vl), alphap[1], _S1, vl);
+
+                        __riscv_vse16_v_f16m1(outptr, _p, vl);
+
+                        alphap += 2;
+                        outptr += packn;
+                    }
+                }
+
+                delete[] buf;
+            }
+
+            if (resize_type == 3) // bicubic
+            {
+                const size_t vl = __riscv_vsetvl_e16m1(packn);
+
+                int* buf = new int[outw + outw * packn];
+
+                int* xofs = buf;
+                __fp16* alpha = (__fp16*)(buf + outw);
+
+                cubic_coeffs_fp16sa(w, outw, xofs, alpha, align_corner);
+
+                #pragma omp parallel for num_threads(opt.num_threads)
+                for (int y = 0; y < h; y++)
+                {
+                    const __fp16* ptr = bottom_blob.row<const __fp16>(y);
+                    __fp16* outptr = top_blob.row<__fp16>(y);
+                    const __fp16* alphap = alpha;
+
+                    for (int x = 0; x < outw; x++)
+                    {
+                        int sx = xofs[x] * packn;
+                        const __fp16* Sp = ptr + sx;
+
+                        vfloat16m1_t _S0 = __riscv_vle16_v_f16m1(Sp - packn, vl);
+                        vfloat16m1_t _S1 = __riscv_vle16_v_f16m1(Sp, vl);
+                        vfloat16m1_t _S2 = __riscv_vle16_v_f16m1(Sp + packn, vl);
+                        vfloat16m1_t _S3 = __riscv_vle16_v_f16m1(Sp + packn * 2, vl);
+                        vfloat16m1_t _p = __riscv_vfmacc_vf_f16m1(__riscv_vfmacc_vf_f16m1(__riscv_vfmacc_vf_f16m1(__riscv_vfmul_vf_f16m1(_S0, alphap[0], vl), alphap[1], _S1, vl), alphap[2], _S2, vl), alphap[3], _S3, vl);
+
+                        __riscv_vse16_v_f16m1(outptr, _p, vl);
+
+                        alphap += 4;
+                        outptr += packn;
+                    }
+                }
+
+                delete[] buf;
+            }
+
+            return 0;
+        }
+
+        if (resize_type == 2) // bilinear
+        {
+            int* buf = new int[outw + outw * 2];
+
+            int* xofs = buf;
+            __fp16* alpha = (__fp16*)(buf + outw);
+
+            linear_coeffs_fp16sa(w, outw, xofs, alpha, align_corner);
+
+            #pragma omp parallel for num_threads(opt.num_threads)
+            for (int y = 0; y < h; y++)
+            {
+                const __fp16* ptr = bottom_blob.row<const __fp16>(y);
+                __fp16* outptr = top_blob.row<__fp16>(y);
+                const __fp16* alphap = alpha;
+
+                for (int x = 0; x < outw; x++)
+                {
+                    int sx = xofs[x];
+                    const __fp16* Sp = ptr + sx;
+                    __fp16 a0 = alphap[0];
+                    __fp16 a1 = alphap[1];
+                    *outptr++ = Sp[0] * a0 + Sp[1] * a1;
+                    alphap += 2;
+                }
+            }
+
+            delete[] buf;
+        }
+
+        if (resize_type == 3) // bicubic
+        {
+            int* buf = new int[outw + outw * 4];
+
+            int* xofs = buf;
+            __fp16* alpha = (__fp16*)(buf + outw);
+
+            cubic_coeffs_fp16sa(w, outw, xofs, alpha, align_corner);
+
+            #pragma omp parallel for num_threads(opt.num_threads)
+            for (int y = 0; y < h; y++)
+            {
+                const __fp16* ptr = bottom_blob.row<const __fp16>(y);
+                __fp16* outptr = top_blob.row<__fp16>(y);
+                const __fp16* alphap = alpha;
+
+                for (int x = 0; x < outw; x++)
+                {
+                    int sx = xofs[x];
+                    const __fp16* Sp = ptr + sx;
+                    __fp16 a0 = alphap[0];
+                    __fp16 a1 = alphap[1];
+                    __fp16 a2 = alphap[2];
+                    __fp16 a3 = alphap[3];
+                    *outptr++ = Sp[-1] * a0 + Sp[0] * a1 + Sp[1] * a2 + Sp[2] * a3;
+                    alphap += 4;
+                }
+            }
+
+            delete[] buf;
+        }
+
+        return 0;
+    }
+
+    if (outw == w && outh == h)
+    {
+        top_blob = bottom_blob;
+        return 0;
+    }
+
+    top_blob.create(outw, outh, channels, elemsize, elempack, opt.blob_allocator);
+    if (top_blob.empty())
+        return -100;
+
+    if (elempack == packn)
+    {
+        if (resize_type == 2) // bilinear
+        {
+            int* buf = new int[outw + outh + outw * 2 + outh * 2];
+
+            int* xofs = buf; //new int[outw];
+            int* yofs = buf + outw; //new int[outh];
+
+            __fp16* alpha = (__fp16*)(buf + outw + outh); //new __fp16[outw * 2];
+            __fp16* beta = (__fp16*)(buf + outw + outh + outw * 2); //new __fp16[outh * 2];
+
+            linear_coeffs_fp16sa(w, outw, xofs, alpha, align_corner);
+            linear_coeffs_fp16sa(h, outh, yofs, beta, align_corner);
+
+            #pragma omp parallel for num_threads(opt.num_threads)
+            for (int q = 0; q < channels; q++)
+            {
+                const Mat src = bottom_blob.channel(q);
+                Mat dst = top_blob.channel(q);
+
+                resize_bilinear_image_packn_fp16sa(src, dst, alpha, xofs, beta, yofs);
+            }
+
+            delete[] buf;
+        }
+
+        if (resize_type == 3) // bicubic
+        {
+            int* buf = new int[outw + outh + outw * 4 + outh * 4];
+
+            int* xofs = buf; //new int[outw];
+            int* yofs = buf + outw; //new int[outh];
+
+            __fp16* alpha = (__fp16*)(buf + outw + outh); //new __fp16[outw * 4];
+            __fp16* beta = (__fp16*)(buf + outw + outh + outw * 4); //new __fp16[outh * 4];
+
+            cubic_coeffs_fp16sa(w, outw, xofs, alpha, align_corner);
+            cubic_coeffs_fp16sa(h, outh, yofs, beta, align_corner);
+
+            #pragma omp parallel for num_threads(opt.num_threads)
+            for (int q = 0; q < channels; q++)
+            {
+                const Mat src = bottom_blob.channel(q);
+                Mat dst = top_blob.channel(q);
+
+                resize_bicubic_image_packn_fp16sa(src, dst, alpha, xofs, beta, yofs);
+            }
+
+            delete[] buf;
+        }
+
+        return 0;
+    }
+
+    if (resize_type == 2) // bilinear
+    {
+        int* buf = new int[outw + outh + outw * 2 + outh * 2];
+
+        int* xofs = buf; //new int[outw];
+        int* yofs = buf + outw; //new int[outh];
+
+        __fp16* alpha = (__fp16*)(buf + outw + outh); //new __fp16[outw * 2];
+        __fp16* beta = (__fp16*)(buf + outw + outh + outw * 2); //new __fp16[outh * 2];
+
+        linear_coeffs_fp16sa(w, outw, xofs, alpha, align_corner);
+        linear_coeffs_fp16sa(h, outh, yofs, beta, align_corner);
+
+        #pragma omp parallel for num_threads(opt.num_threads)
+        for (int q = 0; q < channels; q++)
+        {
+            const Mat src = bottom_blob.channel(q);
+            Mat dst = top_blob.channel(q);
+
+            resize_bilinear_image_fp16sa(src, dst, alpha, xofs, beta, yofs);
+        }
+
+        delete[] buf;
+    }
+
+    if (resize_type == 3) // bicubic
+    {
+        int* buf = new int[outw + outh + outw * 4 + outh * 4];
+
+        int* xofs = buf; //new int[outw];
+        int* yofs = buf + outw; //new int[outh];
+
+        __fp16* alpha = (__fp16*)(buf + outw + outh); //new __fp16[outw * 4];
+        __fp16* beta = (__fp16*)(buf + outw + outh + outw * 4); //new __fp16[outh * 4];
+
+        cubic_coeffs_fp16sa(w, outw, xofs, alpha, align_corner);
+        cubic_coeffs_fp16sa(h, outh, yofs, beta, align_corner);
+
+        #pragma omp parallel for num_threads(opt.num_threads)
+        for (int q = 0; q < channels; q++)
+        {
+            const Mat src = bottom_blob.channel(q);
+            Mat dst = top_blob.channel(q);
+
+            resize_bicubic_image_fp16sa(src, dst, alpha, xofs, beta, yofs);
+        }
+
+        delete[] buf;
+    }
+
+    return 0;
+}
+#endif // __riscv_zvfh
+
+} // namespace ncnn
diff --git a/src/layer/riscv/pooling_riscv.cpp b/src/layer/riscv/pooling_riscv.cpp
index 0f0c8e3d2da5..9091e3e8b2ad 100644
--- a/src/layer/riscv/pooling_riscv.cpp
+++ b/src/layer/riscv/pooling_riscv.cpp
@@ -28,8 +28,8 @@ Pooling_riscv::Pooling_riscv()
 {
 #if __riscv_vector
     support_packing = true;
-#if __riscv_zvfh
-    support_fp16_storage = true;
+#if NCNN_ZVFH
+    support_fp16_storage = cpu_support_riscv_zvfh();
 #endif
 #endif // __riscv_vector
 }
@@ -55,9 +55,9 @@ int Pooling_riscv::forward(const Mat& bottom_blob, Mat& top_blob, const Option&
         return Pooling::forward(bottom_blob, top_blob, opt);
     }
 
+#if NCNN_ZVFH
     int elembits = bottom_blob.elembits();
 
-#if __riscv_vector && __riscv_zvfh
     if (opt.use_fp16_storage && elembits == 16)
     {
         if (opt.use_fp16_arithmetic)
@@ -308,655 +308,4 @@ int Pooling_riscv::forward(const Mat& bottom_blob, Mat& top_blob, const Option&
     return Pooling::forward(bottom_blob, top_blob, opt);
 }
 
-#if __riscv_vector && __riscv_zvfh
-int Pooling_riscv::forward_fp16s(const Mat& bottom_blob, Mat& top_blob, const Option& opt) const
-{
-    // max value in NxN window
-    // avg value in NxN window
-
-    const int packn = csrr_vlenb() / 2;
-    const size_t vl = __riscv_vsetvl_e16m1(packn);
-
-    int w = bottom_blob.w;
-    int h = bottom_blob.h;
-    int channels = bottom_blob.c;
-    size_t elemsize = bottom_blob.elemsize;
-    int elempack = bottom_blob.elempack;
-
-    // NCNN_LOGE("Pooling input %d x %d pad = %d %d %d %d ksize=%d %d stride=%d %d", w, h, pad_left, pad_right, pad_top, pad_bottom, kernel_w, kernel_h, stride_w, stride_h);
-
-    if (global_pooling)
-    {
-        top_blob.create(channels, elemsize, elempack, opt.blob_allocator);
-        if (top_blob.empty())
-            return -100;
-
-        int size = w * h;
-
-        if (pooling_type == PoolMethod_MAX)
-        {
-            if (elempack == packn)
-            {
-                #pragma omp parallel for num_threads(opt.num_threads)
-                for (int q = 0; q < channels; q++)
-                {
-                    const __fp16* ptr = bottom_blob.channel(q);
-
-                    vfloat16m1_t _max = __riscv_vfmv_v_f_f16m1((__fp16)-FLT_MAX, vl);
-                    for (int i = 0; i < size; i++)
-                    {
-                        vfloat16m1_t _val = __riscv_vle16_v_f16m1(ptr, vl);
-                        _max = __riscv_vfmax_vv_f16m1(_max, _val, vl);
-                        ptr += packn;
-                    }
-
-                    __fp16* outptr = top_blob;
-                    __riscv_vse16_v_f16m1(outptr + q * packn, _max, vl);
-                }
-            }
-
-            if (elempack == 1)
-            {
-                #pragma omp parallel for num_threads(opt.num_threads)
-                for (int q = 0; q < channels; q++)
-                {
-                    const __fp16* ptr = bottom_blob.channel(q);
-
-                    __fp16 max = (__fp16)-FLT_MAX;
-                    for (int i = 0; i < size; i++)
-                    {
-                        max = std::max(max, ptr[i]);
-                    }
-
-                    __fp16* outptr = top_blob;
-                    outptr[q] = max;
-                }
-            }
-        }
-
-        if (pooling_type == PoolMethod_AVE)
-        {
-            if (elempack == packn)
-            {
-                #pragma omp parallel for num_threads(opt.num_threads)
-                for (int q = 0; q < channels; q++)
-                {
-                    const __fp16* ptr = bottom_blob.channel(q);
-
-                    vfloat32m2_t _sum = __riscv_vfmv_v_f_f32m2(0.f, vl);
-                    for (int i = 0; i < size; i++)
-                    {
-                        vfloat32m2_t _val = __riscv_vfwcvt_f_f_v_f32m2(__riscv_vle16_v_f16m1(ptr, vl), vl);
-                        _sum = __riscv_vfadd_vv_f32m2(_sum, _val, vl);
-                        ptr += packn;
-                    }
-
-                    vfloat32m2_t _avg = __riscv_vfmul_vf_f32m2(_sum, 1.f / size, vl);
-
-                    __fp16* outptr = top_blob;
-                    __riscv_vse16_v_f16m1(outptr + q * packn, __riscv_vfncvt_f_f_w_f16m1(_avg, vl), vl);
-                }
-            }
-
-            if (elempack == 1)
-            {
-                #pragma omp parallel for num_threads(opt.num_threads)
-                for (int q = 0; q < channels; q++)
-                {
-                    const __fp16* ptr = bottom_blob.channel(q);
-
-                    float sum = 0.f;
-                    for (int i = 0; i < size; i++)
-                    {
-                        sum += (float)ptr[i];
-                    }
-
-                    __fp16* outptr = top_blob;
-                    outptr[q] = (__fp16)(sum / size);
-                }
-            }
-        }
-
-        return 0;
-    }
-
-    Mat bottom_blob_bordered;
-    make_padding(bottom_blob, bottom_blob_bordered, opt);
-    if (bottom_blob_bordered.empty())
-        return -100;
-
-    w = bottom_blob_bordered.w;
-    h = bottom_blob_bordered.h;
-
-    int outw = (w - kernel_w) / stride_w + 1;
-    int outh = (h - kernel_h) / stride_h + 1;
-
-    top_blob.create(outw, outh, channels, elemsize, elempack, opt.blob_allocator);
-    if (top_blob.empty())
-        return -100;
-
-    const int maxk = kernel_w * kernel_h;
-
-    // kernel offsets
-    std::vector<int> _space_ofs(maxk);
-    int* space_ofs = &_space_ofs[0];
-    {
-        int p1 = 0;
-        int p2 = 0;
-        int gap = w - kernel_w;
-        for (int i = 0; i < kernel_h; i++)
-        {
-            for (int j = 0; j < kernel_w; j++)
-            {
-                space_ofs[p1] = p2;
-                p1++;
-                p2++;
-            }
-            p2 += gap;
-        }
-    }
-
-    if (pooling_type == PoolMethod_MAX)
-    {
-        if (elempack == packn)
-        {
-            #pragma omp parallel for num_threads(opt.num_threads)
-            for (int q = 0; q < channels; q++)
-            {
-                const Mat m = bottom_blob_bordered.channel(q);
-                __fp16* outptr = top_blob.channel(q);
-
-                for (int i = 0; i < outh; i++)
-                {
-                    for (int j = 0; j < outw; j++)
-                    {
-                        const __fp16* sptr = m.row<const __fp16>(i * stride_h) + j * stride_w * packn;
-
-                        vfloat16m1_t _max = __riscv_vfmv_v_f_f16m1((__fp16)-FLT_MAX, vl);
-
-                        for (int k = 0; k < maxk; k++)
-                        {
-                            vfloat16m1_t _val = __riscv_vle16_v_f16m1(sptr + space_ofs[k] * packn, vl);
-                            _max = __riscv_vfmax_vv_f16m1(_max, _val, vl);
-                        }
-
-                        __riscv_vse16_v_f16m1(outptr + j * packn, _max, vl);
-                    }
-
-                    outptr += outw * packn;
-                }
-            }
-        }
-
-        if (elempack == 1)
-        {
-            #pragma omp parallel for num_threads(opt.num_threads)
-            for (int q = 0; q < channels; q++)
-            {
-                const Mat m = bottom_blob_bordered.channel(q);
-                __fp16* outptr = top_blob.channel(q);
-
-                for (int i = 0; i < outh; i++)
-                {
-                    for (int j = 0; j < outw; j++)
-                    {
-                        const __fp16* sptr = m.row<const __fp16>(i * stride_h) + j * stride_w;
-
-                        __fp16 max = (__fp16)-FLT_MAX;
-
-                        for (int k = 0; k < maxk; k++)
-                        {
-                            __fp16 val = sptr[space_ofs[k]];
-                            max = std::max(max, val);
-                        }
-
-                        outptr[j] = max;
-                    }
-
-                    outptr += outw;
-                }
-            }
-        }
-    }
-
-    if (pooling_type == PoolMethod_AVE)
-    {
-        if (avgpool_count_include_pad == 0)
-        {
-            int wtailpad = 0;
-            int htailpad = 0;
-
-            if (pad_mode == 0) // full padding
-            {
-                wtailpad = bottom_blob_bordered.w - bottom_blob.w - pad_left - pad_right;
-                htailpad = bottom_blob_bordered.h - bottom_blob.h - pad_top - pad_bottom;
-            }
-
-            if (elempack == packn)
-            {
-                #pragma omp parallel for num_threads(opt.num_threads)
-                for (int q = 0; q < channels; q++)
-                {
-                    const Mat m = bottom_blob_bordered.channel(q);
-                    __fp16* outptr = top_blob.channel(q);
-
-                    for (int i = 0; i < outh; i++)
-                    {
-                        int sy0 = i * stride_h;
-
-                        for (int j = 0; j < outw; j++)
-                        {
-                            int sx0 = j * stride_w;
-
-                            vfloat32m2_t _sum = __riscv_vfmv_v_f_f32m2(0.f, vl);
-                            int area = 0;
-
-                            for (int ki = 0; ki < kernel_h; ki++)
-                            {
-                                int sy = sy0 + ki;
-
-                                if (sy < pad_top)
-                                    continue;
-
-                                if (sy >= h - pad_bottom - htailpad)
-                                    break;
-
-                                for (int kj = 0; kj < kernel_w; kj++)
-                                {
-                                    int sx = sx0 + kj;
-
-                                    if (sx < pad_left)
-                                        continue;
-
-                                    if (sx >= w - pad_right - wtailpad)
-                                        break;
-
-                                    vfloat32m2_t _val = __riscv_vfwcvt_f_f_v_f32m2(__riscv_vle16_v_f16m1(m.row<const __fp16>(sy) + sx * packn, vl), vl);
-                                    _sum = __riscv_vfadd_vv_f32m2(_sum, _val, vl);
-                                    area += 1;
-                                }
-                            }
-
-                            vfloat32m2_t _avg = __riscv_vfmul_vf_f32m2(_sum, 1.f / area, vl);
-                            __riscv_vse16_v_f16m1(outptr + j * packn, __riscv_vfncvt_f_f_w_f16m1(_avg, vl), vl);
-                        }
-
-                        outptr += outw * packn;
-                    }
-                }
-            }
-
-            if (elempack == 1)
-            {
-                #pragma omp parallel for num_threads(opt.num_threads)
-                for (int q = 0; q < channels; q++)
-                {
-                    const Mat m = bottom_blob_bordered.channel(q);
-                    __fp16* outptr = top_blob.channel(q);
-
-                    for (int i = 0; i < outh; i++)
-                    {
-                        int sy0 = i * stride_h;
-
-                        for (int j = 0; j < outw; j++)
-                        {
-                            int sx0 = j * stride_w;
-
-                            float sum = 0.f;
-                            int area = 0;
-
-                            for (int ki = 0; ki < kernel_h; ki++)
-                            {
-                                int sy = sy0 + ki;
-
-                                if (sy < pad_top)
-                                    continue;
-
-                                if (sy >= h - pad_bottom - htailpad)
-                                    break;
-
-                                for (int kj = 0; kj < kernel_w; kj++)
-                                {
-                                    int sx = sx0 + kj;
-
-                                    if (sx < pad_left)
-                                        continue;
-
-                                    if (sx >= w - pad_right - wtailpad)
-                                        break;
-
-                                    float val = (float)(m.row<const __fp16>(sy)[sx]);
-                                    sum += val;
-                                    area += 1;
-                                }
-                            }
-
-                            outptr[j] = (__fp16)(sum / area);
-                        }
-
-                        outptr += outw;
-                    }
-                }
-            }
-        }
-
-        if (avgpool_count_include_pad == 1)
-        {
-            if (elempack == packn)
-            {
-                #pragma omp parallel for num_threads(opt.num_threads)
-                for (int q = 0; q < channels; q++)
-                {
-                    const Mat m = bottom_blob_bordered.channel(q);
-                    __fp16* outptr = top_blob.channel(q);
-
-                    const float inv_maxk = 1.f / maxk;
-
-                    for (int i = 0; i < outh; i++)
-                    {
-                        for (int j = 0; j < outw; j++)
-                        {
-                            const __fp16* sptr = m.row<const __fp16>(i * stride_h) + j * stride_w * packn;
-
-                            vfloat32m2_t _sum = __riscv_vfmv_v_f_f32m2(0.f, vl);
-
-                            for (int k = 0; k < maxk; k++)
-                            {
-                                vfloat32m2_t _val = __riscv_vfwcvt_f_f_v_f32m2(__riscv_vle16_v_f16m1(sptr + space_ofs[k] * packn, vl), vl);
-                                _sum = __riscv_vfadd_vv_f32m2(_sum, _val, vl);
-                            }
-
-                            vfloat32m2_t _avg = __riscv_vfmul_vf_f32m2(_sum, inv_maxk, vl);
-                            __riscv_vse16_v_f16m1(outptr + j * packn, __riscv_vfncvt_f_f_w_f16m1(_avg, vl), vl);
-                        }
-
-                        outptr += outw * packn;
-                    }
-                }
-            }
-
-            if (elempack == 1)
-            {
-                #pragma omp parallel for num_threads(opt.num_threads)
-                for (int q = 0; q < channels; q++)
-                {
-                    const Mat m = bottom_blob_bordered.channel(q);
-                    __fp16* outptr = top_blob.channel(q);
-
-                    for (int i = 0; i < outh; i++)
-                    {
-                        for (int j = 0; j < outw; j++)
-                        {
-                            const __fp16* sptr = m.row<const __fp16>(i * stride_h) + j * stride_w;
-
-                            float sum = 0.f;
-
-                            for (int k = 0; k < maxk; k++)
-                            {
-                                float val = (float)(sptr[space_ofs[k]]);
-                                sum += val;
-                            }
-
-                            outptr[j] = (__fp16)(sum / maxk);
-                        }
-
-                        outptr += outw;
-                    }
-                }
-            }
-        }
-    }
-
-    return 0;
-}
-
-int Pooling_riscv::forward_fp16sa(const Mat& bottom_blob, Mat& top_blob, const Option& opt) const
-{
-    // max value in NxN window
-    // avg value in NxN window
-
-    if (pooling_type == PoolMethod_MAX || global_pooling)
-    {
-        return forward_fp16s(bottom_blob, top_blob, opt);
-    }
-
-    const int packn = csrr_vlenb() / 2;
-    const size_t vl = __riscv_vsetvl_e16m1(packn);
-
-    int w = bottom_blob.w;
-    int h = bottom_blob.h;
-    int channels = bottom_blob.c;
-    size_t elemsize = bottom_blob.elemsize;
-    int elempack = bottom_blob.elempack;
-
-    // NCNN_LOGE("Pooling input %d x %d pad = %d %d %d %d ksize=%d %d stride=%d %d", w, h, pad_left, pad_right, pad_top, pad_bottom, kernel_w, kernel_h, stride_w, stride_h);
-
-    Mat bottom_blob_bordered;
-    make_padding(bottom_blob, bottom_blob_bordered, opt);
-    if (bottom_blob_bordered.empty())
-        return -100;
-
-    w = bottom_blob_bordered.w;
-    h = bottom_blob_bordered.h;
-
-    int outw = (w - kernel_w) / stride_w + 1;
-    int outh = (h - kernel_h) / stride_h + 1;
-
-    top_blob.create(outw, outh, channels, elemsize, elempack, opt.blob_allocator);
-    if (top_blob.empty())
-        return -100;
-
-    const int maxk = kernel_w * kernel_h;
-
-    // kernel offsets
-    std::vector<int> _space_ofs(maxk);
-    int* space_ofs = &_space_ofs[0];
-    {
-        int p1 = 0;
-        int p2 = 0;
-        int gap = w - kernel_w;
-        for (int i = 0; i < kernel_h; i++)
-        {
-            for (int j = 0; j < kernel_w; j++)
-            {
-                space_ofs[p1] = p2;
-                p1++;
-                p2++;
-            }
-            p2 += gap;
-        }
-    }
-
-    if (pooling_type == PoolMethod_AVE)
-    {
-        if (avgpool_count_include_pad == 0)
-        {
-            int wtailpad = 0;
-            int htailpad = 0;
-
-            if (pad_mode == 0) // full padding
-            {
-                wtailpad = bottom_blob_bordered.w - bottom_blob.w - pad_left - pad_right;
-                htailpad = bottom_blob_bordered.h - bottom_blob.h - pad_top - pad_bottom;
-            }
-
-            if (elempack == packn)
-            {
-                #pragma omp parallel for num_threads(opt.num_threads)
-                for (int q = 0; q < channels; q++)
-                {
-                    const Mat m = bottom_blob_bordered.channel(q);
-                    __fp16* outptr = top_blob.channel(q);
-
-                    for (int i = 0; i < outh; i++)
-                    {
-                        int sy0 = i * stride_h;
-
-                        for (int j = 0; j < outw; j++)
-                        {
-                            int sx0 = j * stride_w;
-
-                            vfloat16m1_t _sum = __riscv_vfmv_v_f_f16m1(0.f, vl);
-                            int area = 0;
-
-                            for (int ki = 0; ki < kernel_h; ki++)
-                            {
-                                int sy = sy0 + ki;
-
-                                if (sy < pad_top)
-                                    continue;
-
-                                if (sy >= h - pad_bottom - htailpad)
-                                    break;
-
-                                for (int kj = 0; kj < kernel_w; kj++)
-                                {
-                                    int sx = sx0 + kj;
-
-                                    if (sx < pad_left)
-                                        continue;
-
-                                    if (sx >= w - pad_right - wtailpad)
-                                        break;
-
-                                    vfloat16m1_t _val = __riscv_vle16_v_f16m1(m.row<const __fp16>(sy) + sx * packn, vl);
-                                    _sum = __riscv_vfadd_vv_f16m1(_sum, _val, vl);
-                                    area += 1;
-                                }
-                            }
-
-                            vfloat16m1_t _avg = __riscv_vfmul_vf_f16m1(_sum, (__fp16)(1.f / area), vl);
-                            __riscv_vse16_v_f16m1(outptr + j * packn, _avg, vl);
-                        }
-
-                        outptr += outw * packn;
-                    }
-                }
-            }
-
-            if (elempack == 1)
-            {
-                #pragma omp parallel for num_threads(opt.num_threads)
-                for (int q = 0; q < channels; q++)
-                {
-                    const Mat m = bottom_blob_bordered.channel(q);
-                    __fp16* outptr = top_blob.channel(q);
-
-                    for (int i = 0; i < outh; i++)
-                    {
-                        int sy0 = i * stride_h;
-
-                        for (int j = 0; j < outw; j++)
-                        {
-                            int sx0 = j * stride_w;
-
-                            __fp16 sum = (__fp16)0.f;
-                            int area = 0;
-
-                            for (int ki = 0; ki < kernel_h; ki++)
-                            {
-                                int sy = sy0 + ki;
-
-                                if (sy < pad_top)
-                                    continue;
-
-                                if (sy >= h - pad_bottom - htailpad)
-                                    break;
-
-                                for (int kj = 0; kj < kernel_w; kj++)
-                                {
-                                    int sx = sx0 + kj;
-
-                                    if (sx < pad_left)
-                                        continue;
-
-                                    if (sx >= w - pad_right - wtailpad)
-                                        break;
-
-                                    __fp16 val = m.row<const __fp16>(sy)[sx];
-                                    sum += val;
-                                    area += 1;
-                                }
-                            }
-
-                            outptr[j] = sum / area;
-                        }
-
-                        outptr += outw;
-                    }
-                }
-            }
-        }
-
-        if (avgpool_count_include_pad == 1)
-        {
-            if (elempack == packn)
-            {
-                #pragma omp parallel for num_threads(opt.num_threads)
-                for (int q = 0; q < channels; q++)
-                {
-                    const Mat m = bottom_blob_bordered.channel(q);
-                    __fp16* outptr = top_blob.channel(q);
-
-                    const __fp16 inv_maxk = (__fp16)(1.f / maxk);
-
-                    for (int i = 0; i < outh; i++)
-                    {
-                        for (int j = 0; j < outw; j++)
-                        {
-                            const __fp16* sptr = m.row<const __fp16>(i * stride_h) + j * stride_w * packn;
-
-                            vfloat16m1_t _sum = __riscv_vfmv_v_f_f16m1(0.f, vl);
-
-                            for (int k = 0; k < maxk; k++)
-                            {
-                                vfloat16m1_t _val = __riscv_vle16_v_f16m1(sptr + space_ofs[k] * packn, vl);
-                                _sum = __riscv_vfadd_vv_f16m1(_sum, _val, vl);
-                            }
-
-                            vfloat16m1_t _avg = __riscv_vfmul_vf_f16m1(_sum, inv_maxk, vl);
-                            __riscv_vse16_v_f16m1(outptr + j * packn, _avg, vl);
-                        }
-
-                        outptr += outw * packn;
-                    }
-                }
-            }
-
-            if (elempack == 1)
-            {
-                #pragma omp parallel for num_threads(opt.num_threads)
-                for (int q = 0; q < channels; q++)
-                {
-                    const Mat m = bottom_blob_bordered.channel(q);
-                    __fp16* outptr = top_blob.channel(q);
-
-                    for (int i = 0; i < outh; i++)
-                    {
-                        for (int j = 0; j < outw; j++)
-                        {
-                            const __fp16* sptr = m.row<const __fp16>(i * stride_h) + j * stride_w;
-
-                            __fp16 sum = (__fp16)0.f;
-
-                            for (int k = 0; k < maxk; k++)
-                            {
-                                __fp16 val = sptr[space_ofs[k]];
-                                sum += val;
-                            }
-
-                            outptr[j] = sum / maxk;
-                        }
-
-                        outptr += outw;
-                    }
-                }
-            }
-        }
-    }
-
-    return 0;
-}
-#endif // __riscv_vector && __riscv_zvfh
-
 } // namespace ncnn
diff --git a/src/layer/riscv/pooling_riscv.h b/src/layer/riscv/pooling_riscv.h
index 9521c7eef91e..4c14577b3ca6 100644
--- a/src/layer/riscv/pooling_riscv.h
+++ b/src/layer/riscv/pooling_riscv.h
@@ -28,7 +28,7 @@ class Pooling_riscv : public Pooling
     virtual int forward(const Mat& bottom_blob, Mat& top_blob, const Option& opt) const;
 
 protected:
-#if __riscv_vector && __riscv_zvfh
+#if NCNN_ZVFH
     int forward_fp16s(const Mat& bottom_blob, Mat& top_blob, const Option& opt) const;
    int forward_fp16sa(const Mat& bottom_blob, Mat& top_blob, const Option& opt) const;
 #endif
diff --git a/src/layer/riscv/pooling_riscv_zvfh.cpp b/src/layer/riscv/pooling_riscv_zvfh.cpp
new file mode 100644
index 000000000000..214c56975a0d
--- /dev/null
+++ b/src/layer/riscv/pooling_riscv_zvfh.cpp
@@ -0,0 +1,678 @@
+// Tencent is pleased to support the open source community by making ncnn available.
+//
+// Copyright (C) 2024 THL A29 Limited, a Tencent company. All rights reserved.
+//
+// Licensed under the BSD 3-Clause License (the "License"); you may not use this file except
+// in compliance with the License. You may obtain a copy of the License at
+//
+// https://opensource.org/licenses/BSD-3-Clause
+//
+// Unless required by applicable law or agreed to in writing, software distributed
+// under the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR
+// CONDITIONS OF ANY KIND, either express or implied. See the License for the
+// specific language governing permissions and limitations under the License.
+
+#include "pooling_riscv.h"
+
+#include <float.h>
+
+#if __riscv_vector
+#include <riscv_vector.h>
+#endif // __riscv_vector
+
+#include "riscv_usability.h"
+
+namespace ncnn {
+
+#if __riscv_zvfh
+int Pooling_riscv::forward_fp16s(const Mat& bottom_blob, Mat& top_blob, const Option& opt) const
+{
+    // max value in NxN window
+    // avg value in NxN window
+
+    const int packn = csrr_vlenb() / 2;
+    const size_t vl = __riscv_vsetvl_e16m1(packn);
+
+    int w = bottom_blob.w;
+    int h = bottom_blob.h;
+    int channels = bottom_blob.c;
+    size_t elemsize = bottom_blob.elemsize;
+    int elempack = bottom_blob.elempack;
+
+    // NCNN_LOGE("Pooling input %d x %d pad = %d %d %d %d ksize=%d %d stride=%d %d", w, h, pad_left, pad_right, pad_top, pad_bottom, kernel_w, kernel_h, stride_w, stride_h);
+
+    if (global_pooling)
+    {
+        top_blob.create(channels, elemsize, elempack, opt.blob_allocator);
+        if (top_blob.empty())
+            return -100;
+
+        int size = w * h;
+
+        if (pooling_type == PoolMethod_MAX)
+        {
+            if (elempack == packn)
+            {
+                #pragma omp parallel for num_threads(opt.num_threads)
+                for (int q = 0; q < channels; q++)
+                {
+                    const __fp16* ptr = bottom_blob.channel(q);
+
+                    vfloat16m1_t _max = __riscv_vfmv_v_f_f16m1((__fp16)-FLT_MAX, vl);
+                    for (int i = 0; i < size; i++)
+                    {
+                        vfloat16m1_t _val = __riscv_vle16_v_f16m1(ptr, vl);
+                        _max = __riscv_vfmax_vv_f16m1(_max, _val, vl);
+                        ptr += packn;
+                    }
+
+                    __fp16* outptr = top_blob;
+                    __riscv_vse16_v_f16m1(outptr + q * packn, _max, vl);
+                }
+            }
+
+            if (elempack == 1)
+            {
+                #pragma omp parallel for num_threads(opt.num_threads)
+                for (int q = 0; q < channels; q++)
+                {
+                    const __fp16* ptr = bottom_blob.channel(q);
+
+                    __fp16 max = (__fp16)-FLT_MAX;
+                    for (int i = 0; i < size; i++)
+                    {
+                        max = std::max(max, ptr[i]);
+                    }
+
+                    __fp16* outptr = top_blob;
+                    outptr[q] = max;
+                }
+            }
+        }
+
+        if (pooling_type == PoolMethod_AVE)
+        {
+            if (elempack == packn)
+            {
+                #pragma omp parallel for num_threads(opt.num_threads)
+                for (int q = 0; q < channels; q++)
+                {
+                    const __fp16* ptr = bottom_blob.channel(q);
+
+                    vfloat32m2_t _sum = __riscv_vfmv_v_f_f32m2(0.f, vl);
+                    for (int i = 0; i < size; i++)
+                    {
+                        vfloat32m2_t _val = __riscv_vfwcvt_f_f_v_f32m2(__riscv_vle16_v_f16m1(ptr, vl), vl);
+                        _sum = __riscv_vfadd_vv_f32m2(_sum, _val, vl);
+                        ptr += packn;
+                    }
+
+                    vfloat32m2_t _avg = __riscv_vfmul_vf_f32m2(_sum, 1.f / size, vl);
+
+                    __fp16* outptr = top_blob;
+                    __riscv_vse16_v_f16m1(outptr + q * packn, __riscv_vfncvt_f_f_w_f16m1(_avg, vl), vl);
+                }
+            }
+
+            if (elempack == 1)
+            {
+                #pragma omp parallel for num_threads(opt.num_threads)
+                for (int q = 0; q < channels; q++)
+                {
+                    const __fp16* ptr = bottom_blob.channel(q);
+
+                    float sum = 0.f;
+                    for (int i = 0; i < size; i++)
+                    {
+                        sum += (float)ptr[i];
+                    }
+
+                    __fp16* outptr = top_blob;
+                    outptr[q] = (__fp16)(sum / size);
+                }
+            }
+        }
+
+        return 0;
+    }
+
+    Mat bottom_blob_bordered;
+    make_padding(bottom_blob, bottom_blob_bordered, opt);
+    if (bottom_blob_bordered.empty())
+        return -100;
+
+    w = bottom_blob_bordered.w;
+    h = bottom_blob_bordered.h;
+
+    int outw = (w - kernel_w) / stride_w + 1;
+    int outh = (h - kernel_h) / stride_h + 1;
+
+    top_blob.create(outw, outh, channels, elemsize, elempack, opt.blob_allocator);
+    if (top_blob.empty())
+        return -100;
+
+    const int maxk = kernel_w * kernel_h;
+
+    // kernel offsets
+    std::vector<int> _space_ofs(maxk);
+    int* space_ofs = &_space_ofs[0];
+    {
+        int p1 = 0;
+        int p2 = 0;
+        int gap = w - kernel_w;
+        for (int i = 0; i < kernel_h; i++)
+        {
+            for (int j = 0; j < kernel_w; j++)
+            {
+                space_ofs[p1] = p2;
+                p1++;
+                p2++;
+            }
+            p2 += gap;
+        }
+    }
+
+    if (pooling_type == PoolMethod_MAX)
+    {
+        if (elempack == packn)
+        {
+            #pragma omp parallel for num_threads(opt.num_threads)
+            for (int q = 0; q < channels; q++)
+            {
+                const Mat m = bottom_blob_bordered.channel(q);
+                __fp16* outptr = top_blob.channel(q);
+
+                for (int i = 0; i < outh; i++)
+                {
+                    for (int j = 0; j < outw; j++)
+                    {
+                        const __fp16* sptr = m.row<const __fp16>(i * stride_h) + j * stride_w * packn;
+
+                        vfloat16m1_t _max = __riscv_vfmv_v_f_f16m1((__fp16)-FLT_MAX, vl);
+
+                        for (int k = 0; k < maxk; k++)
+                        {
+                            vfloat16m1_t _val = __riscv_vle16_v_f16m1(sptr + space_ofs[k] * packn, vl);
+                            _max = __riscv_vfmax_vv_f16m1(_max, _val, vl);
+                        }
+
+                        __riscv_vse16_v_f16m1(outptr + j * packn, _max, vl);
+                    }
+
+                    outptr += outw * packn;
+                }
+            }
+        }
+
+        if (elempack == 1)
+        {
+            #pragma omp parallel for num_threads(opt.num_threads)
+            for (int q = 0; q < channels; q++)
+            {
+                const Mat m = bottom_blob_bordered.channel(q);
+                __fp16* outptr = top_blob.channel(q);
+
+                for (int i = 0; i < outh; i++)
+                {
+                    for (int j = 0; j < outw; j++)
+                    {
+                        const __fp16* sptr = m.row<const __fp16>(i * stride_h) + j * stride_w;
+
+                        __fp16 max = (__fp16)-FLT_MAX;
+
+                        for (int k = 0; k < maxk; k++)
+                        {
+                            __fp16 val = sptr[space_ofs[k]];
+                            max = std::max(max, val);
+                        }
+
+                        outptr[j] = max;
+                    }
+
+                    outptr += outw;
+                }
+            }
+        }
+    }
+
+    if (pooling_type == PoolMethod_AVE)
+    {
+        if (avgpool_count_include_pad == 0)
+        {
+            int wtailpad = 0;
+            int htailpad = 0;
+
+            if (pad_mode == 0) // full padding
+            {
+                wtailpad = bottom_blob_bordered.w - bottom_blob.w - pad_left - pad_right;
+                htailpad = bottom_blob_bordered.h - bottom_blob.h - pad_top - pad_bottom;
+            }
+
+            if (elempack == packn)
+            {
+                #pragma omp parallel for num_threads(opt.num_threads)
+                for (int q = 0; q < channels; q++)
+                {
+                    const Mat m = bottom_blob_bordered.channel(q);
+                    __fp16* outptr = top_blob.channel(q);
+
+                    for (int i = 0; i < outh; i++)
+                    {
+                        int sy0 = i * stride_h;
+
+                        for (int j = 0; j < outw; j++)
+                        {
+                            int sx0 = j * stride_w;
+
+                            vfloat32m2_t _sum = __riscv_vfmv_v_f_f32m2(0.f, vl);
+                            int area = 0;
+
+                            for (int ki = 0; ki < kernel_h; ki++)
+                            {
+                                int sy = sy0 + ki;
+
+                                if (sy < pad_top)
+                                    continue;
+
+                                if (sy >= h - pad_bottom - htailpad)
+                                    break;
+
+                                for (int kj = 0; kj < kernel_w; kj++)
+                                {
+                                    int sx = sx0 + kj;
+
+                                    if (sx < pad_left)
+                                        continue;
+
+                                    if (sx >= w - pad_right - wtailpad)
+                                        break;
+
+                                    vfloat32m2_t _val = __riscv_vfwcvt_f_f_v_f32m2(__riscv_vle16_v_f16m1(m.row<const __fp16>(sy) + sx * packn, vl), vl);
+                                    _sum = __riscv_vfadd_vv_f32m2(_sum, _val, vl);
+                                    area += 1;
+                                }
+                            }
+
+                            vfloat32m2_t _avg = __riscv_vfmul_vf_f32m2(_sum, 1.f / area, vl);
+                            __riscv_vse16_v_f16m1(outptr + j * packn, __riscv_vfncvt_f_f_w_f16m1(_avg, vl), vl);
+                        }
+
+                        outptr += outw * packn;
+                    }
+                }
+            }
+
+            if (elempack == 1)
+            {
+                #pragma omp parallel for num_threads(opt.num_threads)
+                for (int q = 0; q < channels; q++)
+                {
+                    const Mat m = bottom_blob_bordered.channel(q);
+                    __fp16* outptr = top_blob.channel(q);
+
+                    for (int i = 0; i < outh; i++)
+                    {
+                        int sy0 = i * stride_h;
+
+                        for (int j = 0; j < outw; j++)
+                        {
+                            int sx0 = j * stride_w;
+
+                            float sum = 0.f;
+                            int area = 0;
+
+                            for (int ki = 0; ki < kernel_h; ki++)
+                            {
+                                int sy = sy0 + ki;
+
+                                if (sy < pad_top)
+                                    continue;
+
+                                if (sy >= h - pad_bottom - htailpad)
+                                    break;
+
+                                for (int kj = 0; kj < kernel_w; kj++)
+                                {
+                                    int sx = sx0 + kj;
+
+                                    if (sx < pad_left)
+                                        continue;
+
+                                    if (sx >= w - pad_right - wtailpad)
+                                        break;
+
+                                    float val = (float)(m.row<const __fp16>(sy)[sx]);
+                                    sum += val;
+                                    area += 1;
+                                }
+                            }
+
+                            outptr[j] = (__fp16)(sum / area);
+                        }
+
+                        outptr += outw;
+                    }
+                }
+            }
+        }
+
+        if (avgpool_count_include_pad == 1)
+        {
+            if (elempack == packn)
+            {
+                #pragma omp parallel for num_threads(opt.num_threads)
+                for (int q = 0; q < channels; q++)
+                {
+                    const Mat m = bottom_blob_bordered.channel(q);
+                    __fp16* outptr = top_blob.channel(q);
+
+                    const float inv_maxk = 1.f / maxk;
+
+                    for (int i = 0; i < outh; i++)
+                    {
+                        for (int j = 0; j < outw; j++)
+                        {
+                            const __fp16* sptr = m.row<const __fp16>(i * stride_h) + j * stride_w * packn;
+
+                            vfloat32m2_t _sum = __riscv_vfmv_v_f_f32m2(0.f, vl);
+
+                            for (int k = 0; k < maxk; k++)
+                            {
+                                vfloat32m2_t _val = __riscv_vfwcvt_f_f_v_f32m2(__riscv_vle16_v_f16m1(sptr + space_ofs[k] * packn, vl), vl);
+                                _sum = __riscv_vfadd_vv_f32m2(_sum, _val, vl);
+                            }
+
+                            vfloat32m2_t _avg = __riscv_vfmul_vf_f32m2(_sum, inv_maxk, vl);
+                            __riscv_vse16_v_f16m1(outptr + j * packn, __riscv_vfncvt_f_f_w_f16m1(_avg, vl), vl);
+                        }
+
+                        outptr += outw * packn;
+                    }
+                }
+            }
+
+            if (elempack == 1)
+            {
+                #pragma omp parallel for num_threads(opt.num_threads)
+                for (int q = 0; q < channels; q++)
+                {
+                    const Mat m = bottom_blob_bordered.channel(q);
+                    __fp16* outptr = top_blob.channel(q);
+
+                    for (int i = 0; i < outh; i++)
+                    {
+                        for (int j = 0; j < outw; j++)
+                        {
+                            const __fp16* sptr = m.row<const __fp16>(i * stride_h) + j * stride_w;
+
+                            float sum = 0.f;
+
+                            for (int k = 0; k < maxk; k++)
+                            {
+                                float val = (float)(sptr[space_ofs[k]]);
+                                sum += val;
+                            }
+
+                            outptr[j] = (__fp16)(sum / maxk);
+                        }
+
+                        outptr += outw;
+                    }
+                }
+            }
+        }
+    }
+
+    return 0;
+}
+
+int Pooling_riscv::forward_fp16sa(const Mat& bottom_blob, Mat& top_blob, const Option& opt) const
+{
+    // max value in NxN window
+    // avg value in NxN window
+
+    if (pooling_type == PoolMethod_MAX || global_pooling)
+    {
+        return forward_fp16s(bottom_blob, top_blob, opt);
+    }
+
+    const int packn = csrr_vlenb() / 2;
+    const size_t vl = __riscv_vsetvl_e16m1(packn);
+
+    int w = bottom_blob.w;
+    int h = bottom_blob.h;
+    int channels = bottom_blob.c;
+    size_t elemsize = bottom_blob.elemsize;
+    int elempack = bottom_blob.elempack;
+
+    // NCNN_LOGE("Pooling input %d x %d pad = %d %d %d %d ksize=%d %d stride=%d %d", w, h, pad_left, pad_right, pad_top, pad_bottom, kernel_w, kernel_h, stride_w, stride_h);
+
+    Mat bottom_blob_bordered;
+    make_padding(bottom_blob, bottom_blob_bordered, opt);
+    if (bottom_blob_bordered.empty())
+        return -100;
+
+    w = bottom_blob_bordered.w;
+    h = bottom_blob_bordered.h;
+
+    int outw = (w - kernel_w) / stride_w + 1;
+    int outh = (h - kernel_h) / stride_h + 1;
+
+    top_blob.create(outw, outh, channels, elemsize, elempack, opt.blob_allocator);
+    if (top_blob.empty())
+        return -100;
+
+    const int maxk = kernel_w * kernel_h;
+
+    // kernel offsets
+    std::vector<int> _space_ofs(maxk);
+    int* space_ofs = &_space_ofs[0];
+    {
+        int p1 = 0;
+        int p2 = 0;
+        int gap = w - kernel_w;
+        for (int i = 0; i < kernel_h; i++)
+        {
+            for (int j = 0; j < kernel_w; j++)
+            {
+                space_ofs[p1] = p2;
+                p1++;
+                p2++;
+            }
+            p2 += gap;
+        }
+    }
+
+    if (pooling_type == PoolMethod_AVE)
+    {
+        if (avgpool_count_include_pad == 0)
+        {
+            int wtailpad = 0;
+            int htailpad = 0;
+
+            if (pad_mode == 0) // full padding
+            {
+                wtailpad = bottom_blob_bordered.w - bottom_blob.w - pad_left - pad_right;
+                htailpad = bottom_blob_bordered.h - bottom_blob.h - pad_top - pad_bottom;
+            }
+
+            if (elempack == packn)
+            {
+                #pragma omp parallel for num_threads(opt.num_threads)
+                for (int q = 0; q < channels; q++)
+                {
+                    const Mat m = bottom_blob_bordered.channel(q);
+                    __fp16* outptr = top_blob.channel(q);
+
+                    for (int i = 0; i < outh; i++)
+                    {
+                        int sy0 = i * stride_h;
+
+                        for (int j = 0; j < outw; j++)
+                        {
+                            int sx0 = j * stride_w;
+
+                            vfloat16m1_t _sum = __riscv_vfmv_v_f_f16m1(0.f, vl);
+                            int area = 0;
+
+                            for (int ki = 0; ki < kernel_h; ki++)
+                            {
+                                int sy = sy0 + ki;
+
+                                if (sy < pad_top)
+                                    continue;
+
+                                if (sy >= h - pad_bottom - htailpad)
+                                    break;
+
+                                for (int kj = 0; kj < kernel_w; kj++)
+                                {
+                                    int sx = sx0 + kj;
+
+                                    if (sx < pad_left)
+                                        continue;
+
+                                    if (sx >= w - pad_right - wtailpad)
+                                        break;
+
+                                    vfloat16m1_t _val = __riscv_vle16_v_f16m1(m.row<const __fp16>(sy) + sx * packn, vl);
+                                    _sum = __riscv_vfadd_vv_f16m1(_sum, _val, vl);
+                                    area += 1;
+                                }
+                            }
+
+                            vfloat16m1_t _avg = __riscv_vfmul_vf_f16m1(_sum, (__fp16)(1.f / area), vl);
+                            __riscv_vse16_v_f16m1(outptr + j * packn, _avg, vl);
+                        }
+
+                        outptr += outw * packn;
+                    }
+                }
+            }
+
+            if (elempack == 1)
+            {
+                #pragma omp parallel for num_threads(opt.num_threads)
+                for (int q = 0; q < channels; q++)
+                {
+                    const Mat m = bottom_blob_bordered.channel(q);
+                    __fp16* outptr = top_blob.channel(q);
+
+                    for (int i = 0; i < outh; i++)
+                    {
+                        int sy0 = i * stride_h;
+
+                        for (int j = 0; j < outw; j++)
+                        {
+                            int sx0 = j * stride_w;
+
+                            __fp16 sum = (__fp16)0.f;
+                            int area = 0;
+
+                            for (int ki = 0; ki < kernel_h; ki++)
+                            {
+                                int sy = sy0 + ki;
+
+                                if (sy < pad_top)
+                                    continue;
+
+                                if (sy >= h - pad_bottom - htailpad)
+                                    break;
+
+                                for (int kj = 0; kj < kernel_w; kj++)
+                                {
+                                    int sx = sx0 + kj;
+
+                                    if (sx < pad_left)
+                                        continue;
+
+                                    if (sx >= w - pad_right - wtailpad)
+                                        break;
+
+                                    __fp16 val = m.row<const __fp16>(sy)[sx];
+                                    sum += val;
+                                    area += 1;
+                                }
+                            }
+
+                            outptr[j] = sum / area;
+                        }
+
+                        outptr += outw;
+                    }
+                }
+            }
+        }
+
+        if (avgpool_count_include_pad == 1)
+        {
+            if (elempack == packn)
+            {
+                #pragma omp parallel for num_threads(opt.num_threads)
+                for (int q = 0; q < channels; q++)
+                {
+                    const Mat m = bottom_blob_bordered.channel(q);
+                    __fp16* outptr = top_blob.channel(q);
+
+                    const __fp16 inv_maxk = (__fp16)(1.f / maxk);
+
+                    for (int i = 0; i < outh; i++)
+                    {
+                        for (int j = 0; j < outw; j++)
+                        {
+                            const __fp16* sptr = m.row<const __fp16>(i * stride_h) + j * stride_w * packn;
+
+                            vfloat16m1_t _sum = __riscv_vfmv_v_f_f16m1(0.f, vl);
+
+                            for (int k = 0; k < maxk; k++)
+                            {
+                                vfloat16m1_t _val = __riscv_vle16_v_f16m1(sptr + space_ofs[k] * packn, vl);
+                                _sum = __riscv_vfadd_vv_f16m1(_sum, _val, vl);
+                            }
+
+                            vfloat16m1_t _avg = __riscv_vfmul_vf_f16m1(_sum, inv_maxk, vl);
+                            __riscv_vse16_v_f16m1(outptr + j * packn, _avg, vl);
+                        }
+
+                        outptr += outw * packn;
+                    }
+                }
+            }
+
+            if (elempack == 1)
+            {
+                #pragma omp parallel for num_threads(opt.num_threads)
+                for (int q = 0; q < channels; q++)
+                {
+                    const Mat m = bottom_blob_bordered.channel(q);
+                    __fp16* outptr = top_blob.channel(q);
+
+                    for (int i = 0; i < outh; i++)
+                    {
+                        for (int j = 0; j < outw; j++)
+                        {
+                            const __fp16* sptr = m.row<const __fp16>(i * stride_h) + j * stride_w;
+
+                            __fp16 sum = (__fp16)0.f;
+
+                            for (int k = 0; k < maxk; k++)
+                            {
+                                __fp16 val = sptr[space_ofs[k]];
+                                sum += val;
+                            }
+
+                            outptr[j] = sum / maxk;
+                        }
+
+                        outptr += outw;
+                    }
+                }
+            }
+        }
+    }
+
+    return 0;
+}
+#endif // __riscv_zvfh
+
+} // namespace ncnn