From 3dd6ac24e330df3b87fab284b8689c87c88bd421 Mon Sep 17 00:00:00 2001 From: nihuini Date: Wed, 29 Nov 2023 11:51:45 +0800 Subject: [PATCH 1/5] detect avx512 isa with signal action on macos --- CMakeLists.txt | 2 +- src/cpu.cpp | 131 +++++++++++++++++++++++++++++++++++-------------- 2 files changed, 96 insertions(+), 37 deletions(-) diff --git a/CMakeLists.txt b/CMakeLists.txt index 86ca5f6e6e26..4ffd677bb331 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -149,7 +149,7 @@ endif() if((IOS AND CMAKE_OSX_ARCHITECTURES MATCHES "arm") OR (APPLE AND CMAKE_OSX_ARCHITECTURES MATCHES "arm64") OR (CMAKE_SYSTEM_PROCESSOR MATCHES "^(arm|aarch64)") - OR (CMAKE_CXX_COMPILER_ARCHITECTURE_ID MATCHES "ARM64") + OR (CMAKE_CXX_COMPILER_ARCHITECTURE_ID MATCHES "(ARMV7|ARM64)") OR ((CMAKE_CXX_COMPILER_ID MATCHES "MSVC" OR (CMAKE_CXX_COMPILER_ID MATCHES "Clang" AND CMAKE_CXX_SIMULATE_ID MATCHES "MSVC" AND CMAKE_CXX_COMPILER_FRONTEND_VARIANT MATCHES "MSVC")) AND (${CMAKE_GENERATOR_PLATFORM} MATCHES "^(arm|arm64)"))) set(NCNN_TARGET_ARCH arm) diff --git a/src/cpu.cpp b/src/cpu.cpp index 77efb2a71fb3..6264f860b982 100644 --- a/src/cpu.cpp +++ b/src/cpu.cpp @@ -232,8 +232,78 @@ static int detectisa(const void* some_inst) #endif #endif -#if __arm__ -#if __aarch64__ +#else // _WIN32 +#include + +static int g_sigill_caught = 0; +static sigjmp_buf g_jmpbuf; + +static void catch_sigill(int signo, siginfo_t* si, void *data) +{ + g_sigill_caught = 1; + siglongjmp(g_jmpbuf, -1); +} + +static int detectisa(void (*some_inst)()) +{ + g_sigill_caught = 0; + + struct sigaction sa = { 0 }; + struct sigaction old_sa; + sa.sa_flags = SA_ONSTACK | SA_RESTART | SA_SIGINFO; + sa.sa_sigaction = catch_sigill; + sigaction(SIGILL, &sa, &old_sa); + + if (sigsetjmp(g_jmpbuf, 1) == 0) + { + some_inst(); + } + + sigaction(SIGILL, &old_sa, NULL); + + return g_sigill_caught ? 0 : 1; +} + +#if defined(__i386__) || defined(__x86_64__) || defined(_M_IX86) || defined(_M_X64) +#define DEFINE_INSTCODE(name, ...) static void name() { asm volatile(".byte " #__VA_ARGS__ : : : ); }; +#elif __aarch64__ +#define DEFINE_INSTCODE(name, ...) static void name() { asm volatile(".word " #__VA_ARGS__ : : : ); }; +#elif __arm__ +#define DEFINE_INSTCODE(name, ...) static void name() { asm volatile(".word " #__VA_ARGS__ : : : ); }; +#endif + +#endif // _WIN32 + +#if defined(__i386__) || defined(__x86_64__) || defined(_M_IX86) || defined(_M_X64) +DEFINE_INSTCODE(some_mmx, 0x0f, 0xdb, 0xc0) // pand mm0,mm0 +DEFINE_INSTCODE(some_sse, 0x0f, 0x54, 0xc0) // andps xmm0,xmm0 +DEFINE_INSTCODE(some_sse2, 0x66, 0x0f, 0xfe, 0xc0) // paddd xmm0,xmm0 +DEFINE_INSTCODE(some_sse3, 0xf2, 0x0f, 0x7c, 0xc0) // haddps xmm0,xmm0 +DEFINE_INSTCODE(some_ssse3, 0x66, 0x0f, 0x38, 0x06, 0xc0) // phsubd xmm0,xmm0 +DEFINE_INSTCODE(some_sse41, 0x66, 0x0f, 0x38, 0x3d, 0xc0) // pmaxsd xmm0,xmm0 +DEFINE_INSTCODE(some_sse42, 0x66, 0x0f, 0x38, 0x37, 0xc0) // pcmpgtq xmm0,xmm0 +DEFINE_INSTCODE(some_sse4a, 0x66, 0x0f, 0x79, 0xc0) // extrq xmm0,xmm0 +DEFINE_INSTCODE(some_xop, 0x8f, 0xe8, 0x78, 0xb6, 0xc0, 0x00) // vpmadcswd %xmm0,%xmm0,%xmm0,%xmm0 +DEFINE_INSTCODE(some_avx, 0xc5, 0xfc, 0x54, 0xc0) // vandps ymm0,ymm0,ymm0 +DEFINE_INSTCODE(some_f16c, 0xc4, 0xe2, 0x7d, 0x13, 0xc0) // vcvtph2ps ymm0,xmm0 +DEFINE_INSTCODE(some_fma, 0xc4, 0xe2, 0x7d, 0x98, 0xc0) // vfmadd132ps ymm0,ymm0,ymm0 +DEFINE_INSTCODE(some_avx2, 0xc5, 0xfd, 0xfe, 0xc0) // vpaddd ymm0,ymm0,ymm0 +DEFINE_INSTCODE(some_avx512f, 0x62, 0xf1, 0x7c, 0x48, 0x58, 0xc0) // vaddps zmm0,zmm0,zmm0 +DEFINE_INSTCODE(some_avx512bw, 0x62, 0xf1, 0x7d, 0x48, 0xfd, 0xc0) // vpaddw zmm0,zmm0,zmm0 +DEFINE_INSTCODE(some_avx512cd, 0x62, 0xf2, 0xfd, 0x48, 0x44, 0xc0) // vplzcntq zmm0,zmm0 +DEFINE_INSTCODE(some_avx512dq, 0x62, 0xf1, 0x7c, 0x48, 0x54, 0xc0) // vandps zmm0,zmm0,zmm0 +DEFINE_INSTCODE(some_avx512vl, 0x62, 0xf2, 0xfd, 0x28, 0x1f, 0xc0) // vpabsq ymm0,ymm0 +DEFINE_INSTCODE(some_avx512vnni, 0x62, 0xf2, 0x7d, 0x48, 0x52, 0xc0) // vpdpwssd %zmm0,%zmm0,%zmm0 +DEFINE_INSTCODE(some_avx512bf16, 0x62, 0xf2, 0x7e, 0x48, 0x52, 0xc0) // vdpbf16ps %zmm0,%zmm0,%zmm0 +DEFINE_INSTCODE(some_avx512ifma, 0x62, 0xf2, 0xfd, 0x48, 0xb4, 0xc0) // vpmadd52luq %zmm0,%zmm0,%zmm0 +DEFINE_INSTCODE(some_avx512vbmi, 0x62, 0xf2, 0x7d, 0x48, 0x75, 0xc0) // vpermi2b %zmm0,%zmm0,%zmm0 +DEFINE_INSTCODE(some_avx512vbmi2, 0x62, 0xf2, 0x7d, 0x48, 0x71, 0xc0) // vpshldvd %zmm0,%zmm0,%zmm0 +DEFINE_INSTCODE(some_avx512fp16, 0x62, 0xf6, 0x7d, 0x48, 0x98, 0xc0) // vfmadd132ph %zmm0,%zmm0,%zmm0 +DEFINE_INSTCODE(some_avxvnni, 0x62, 0xf2, 0x7d, 0x28, 0x52, 0xc0) // vpdpwssd ymm0,ymm0,ymm0 +DEFINE_INSTCODE(some_avxvnniint8, 0xc4, 0xe2, 0x7f, 0x50, 0xc0) // vpdpbssd ymm0,ymm0,ymm0 +DEFINE_INSTCODE(some_avxifma, 0x62, 0xf2, 0xfd, 0x28, 0xb4, 0xc0) // vpmadd52luq %ymm0,%ymm0,%ymm0 + +#elif __aarch64__ DEFINE_INSTCODE(some_neon, 0x4e20d400) // fadd v0.4s,v0.4s,v0.4s DEFINE_INSTCODE(some_vfpv4, 0x0e216800) // fcvtn v0.4h,v0.4s DEFINE_INSTCODE(some_cpuid, 0xd5380000) // mrs x0,midr_el1 @@ -247,14 +317,13 @@ DEFINE_INSTCODE(some_sve2, 0x44405000) // smlslb z0.h,z0.b,z0.b DEFINE_INSTCODE(some_svebf16, 0x6460e400) // bfmmla z0.s,z0.h,z0.h DEFINE_INSTCODE(some_svei8mm, 0x45009800) // smmla z0.s,z0.b,z0.b DEFINE_INSTCODE(some_svef32mm, 0x64a0e400) // fmmla z0.s,z0.s,z0.s -#else + +#elif __arm__ DEFINE_INSTCODE(some_edsp, 0x0000fb20) // smlad r0,r0,r0,r0 DEFINE_INSTCODE(some_neon, 0x0d40ef00) // vadd.f32 q0,q0,q0 DEFINE_INSTCODE(some_vfpv4, 0x0600ffb6) // vcvt.f16.f32 d0,q0 -#endif // __aarch64__ -#endif // __arm__ -#endif // defined _WIN32 +#endif #if defined __ANDROID__ || defined __linux__ @@ -509,9 +578,6 @@ static inline int x86_get_xcr0() static int get_cpu_support_x86_avx() { -#if !NCNN_AVX - return 0; -#endif unsigned int cpu_info[4] = {0}; x86_cpuid(0, cpu_info); @@ -533,9 +599,6 @@ static int get_cpu_support_x86_avx() static int get_cpu_support_x86_fma() { -#if !NCNN_FMA - return 0; -#endif unsigned int cpu_info[4] = {0}; x86_cpuid(0, cpu_info); @@ -557,9 +620,6 @@ static int get_cpu_support_x86_fma() static int get_cpu_support_x86_xop() { -#if !NCNN_XOP - return 0; -#endif unsigned int cpu_info[4] = {0}; x86_cpuid(0x80000000, cpu_info); @@ -573,9 +633,6 @@ static int get_cpu_support_x86_xop() static int get_cpu_support_x86_f16c() { -#if !NCNN_F16C - return 0; -#endif unsigned int cpu_info[4] = {0}; x86_cpuid(0, cpu_info); @@ -590,9 +647,6 @@ static int get_cpu_support_x86_f16c() static int get_cpu_support_x86_avx2() { -#if !NCNN_AVX2 - return 0; -#endif unsigned int cpu_info[4] = {0}; x86_cpuid(0, cpu_info); @@ -615,9 +669,9 @@ static int get_cpu_support_x86_avx2() static int get_cpu_support_x86_avx_vnni() { -#if !NCNN_AVXVNNI - return 0; -#endif +#if __APPLE__ + return detectisa(some_avxvnni); +#else unsigned int cpu_info[4] = {0}; x86_cpuid(0, cpu_info); @@ -636,13 +690,14 @@ static int get_cpu_support_x86_avx_vnni() x86_cpuid_sublevel(7, 1, cpu_info); return cpu_info[0] & (1u << 4); +#endif } static int get_cpu_support_x86_avx512() { -#if !NCNN_AVX512 - return 0; -#endif +#if __APPLE__ + return detectisa(some_avx512f) && detectisa(some_avx512bw) && detectisa(some_avx512cd) && detectisa(some_avx512dq) && detectisa(some_avx512vl); +#else unsigned int cpu_info[4] = {0}; x86_cpuid(0, cpu_info); @@ -665,13 +720,14 @@ static int get_cpu_support_x86_avx512() x86_cpuid_sublevel(7, 0, cpu_info); return (cpu_info[1] & (1u << 16)) && (cpu_info[1] & (1u << 17)) && (cpu_info[1] & (1u << 28)) && (cpu_info[1] & (1u << 30)) && (cpu_info[1] & (1u << 31)); +#endif } static int get_cpu_support_x86_avx512_vnni() { -#if !NCNN_AVX512VNNI - return 0; -#endif +#if __APPLE__ + return detectisa(some_avx512vnni); +#else unsigned int cpu_info[4] = {0}; x86_cpuid(0, cpu_info); @@ -694,13 +750,14 @@ static int get_cpu_support_x86_avx512_vnni() x86_cpuid_sublevel(7, 0, cpu_info); return cpu_info[2] & (1u << 11); +#endif } static int get_cpu_support_x86_avx512_bf16() { -#if !NCNN_AVX512BF16 - return 0; -#endif +#if __APPLE__ + return detectisa(some_avx512bf16); +#else unsigned int cpu_info[4] = {0}; x86_cpuid(0, cpu_info); @@ -719,13 +776,14 @@ static int get_cpu_support_x86_avx512_bf16() x86_cpuid_sublevel(7, 1, cpu_info); return cpu_info[0] & (1u << 5); +#endif } static int get_cpu_support_x86_avx512_fp16() { -#if !NCNN_AVX512FP16 - return 0; -#endif +#if __APPLE__ + return detectisa(some_avx512fp16); +#else unsigned int cpu_info[4] = {0}; x86_cpuid(0, cpu_info); @@ -748,6 +806,7 @@ static int get_cpu_support_x86_avx512_fp16() x86_cpuid_sublevel(7, 0, cpu_info); return cpu_info[3] & (1u << 23); +#endif } #endif // defined(__i386__) || defined(__x86_64__) || defined(_M_IX86) || defined(_M_X64) From 282183daa0a3eacbb87042f314e58eb5b7db081a Mon Sep 17 00:00:00 2001 From: nihui Date: Wed, 29 Nov 2023 03:44:13 +0000 Subject: [PATCH 2/5] apply code-format changes --- src/cpu.cpp | 33 +++++++++++++++++++++++++++------ 1 file changed, 27 insertions(+), 6 deletions(-) diff --git a/src/cpu.cpp b/src/cpu.cpp index 6264f860b982..bac5093b284e 100644 --- a/src/cpu.cpp +++ b/src/cpu.cpp @@ -232,13 +232,13 @@ static int detectisa(const void* some_inst) #endif #endif -#else // _WIN32 +#else // _WIN32 #include static int g_sigill_caught = 0; static sigjmp_buf g_jmpbuf; -static void catch_sigill(int signo, siginfo_t* si, void *data) +static void catch_sigill(int signo, siginfo_t* si, void* data) { g_sigill_caught = 1; siglongjmp(g_jmpbuf, -1); @@ -248,7 +248,7 @@ static int detectisa(void (*some_inst)()) { g_sigill_caught = 0; - struct sigaction sa = { 0 }; + struct sigaction sa = {0}; struct sigaction old_sa; sa.sa_flags = SA_ONSTACK | SA_RESTART | SA_SIGINFO; sa.sa_sigaction = catch_sigill; @@ -265,11 +265,32 @@ static int detectisa(void (*some_inst)()) } #if defined(__i386__) || defined(__x86_64__) || defined(_M_IX86) || defined(_M_X64) -#define DEFINE_INSTCODE(name, ...) static void name() { asm volatile(".byte " #__VA_ARGS__ : : : ); }; +#define DEFINE_INSTCODE(name, ...) \ + static void name() \ + { \ + asm volatile(".byte " #__VA_ARGS__ \ + : \ + : \ + :); \ + }; #elif __aarch64__ -#define DEFINE_INSTCODE(name, ...) static void name() { asm volatile(".word " #__VA_ARGS__ : : : ); }; +#define DEFINE_INSTCODE(name, ...) \ + static void name() \ + { \ + asm volatile(".word " #__VA_ARGS__ \ + : \ + : \ + :); \ + }; #elif __arm__ -#define DEFINE_INSTCODE(name, ...) static void name() { asm volatile(".word " #__VA_ARGS__ : : : ); }; +#define DEFINE_INSTCODE(name, ...) \ + static void name() \ + { \ + asm volatile(".word " #__VA_ARGS__ \ + : \ + : \ + :); \ + }; #endif #endif // _WIN32 From 209b231baab5ef3ebc16992015c0f781cb419ce2 Mon Sep 17 00:00:00 2001 From: nihuini Date: Wed, 29 Nov 2023 11:59:24 +0800 Subject: [PATCH 3/5] fix include --- src/cpu.cpp | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/src/cpu.cpp b/src/cpu.cpp index bac5093b284e..6bd785d04810 100644 --- a/src/cpu.cpp +++ b/src/cpu.cpp @@ -17,6 +17,7 @@ #include "platform.h" #include +#include #include #include @@ -233,8 +234,6 @@ static int detectisa(const void* some_inst) #endif #else // _WIN32 -#include - static int g_sigill_caught = 0; static sigjmp_buf g_jmpbuf; From 496cdeb7555f30bd17466410d8a6b8d2795e79e4 Mon Sep 17 00:00:00 2001 From: nihuini Date: Wed, 29 Nov 2023 12:03:31 +0800 Subject: [PATCH 4/5] fix include --- src/cpu.cpp | 1 + 1 file changed, 1 insertion(+) diff --git a/src/cpu.cpp b/src/cpu.cpp index 6bd785d04810..2d3c6284ab5b 100644 --- a/src/cpu.cpp +++ b/src/cpu.cpp @@ -17,6 +17,7 @@ #include "platform.h" #include +#include #include #include #include From 7ebf67c1ec049bedbadb96d021778c908a32be65 Mon Sep 17 00:00:00 2001 From: nihuini Date: Wed, 29 Nov 2023 14:52:43 +0800 Subject: [PATCH 5/5] guard os support --- src/cpu.cpp | 6 ++++-- 1 file changed, 4 insertions(+), 2 deletions(-) diff --git a/src/cpu.cpp b/src/cpu.cpp index 2d3c6284ab5b..3bfbd5096188 100644 --- a/src/cpu.cpp +++ b/src/cpu.cpp @@ -234,7 +234,7 @@ static int detectisa(const void* some_inst) #endif #endif -#else // _WIN32 +#elif defined __ANDROID__ || defined __linux__ || defined __APPLE__ static int g_sigill_caught = 0; static sigjmp_buf g_jmpbuf; @@ -293,8 +293,9 @@ static int detectisa(void (*some_inst)()) }; #endif -#endif // _WIN32 +#endif // defined _WIN32 || defined __ANDROID__ || defined __linux__ || defined __APPLE__ +#if defined _WIN32 || defined __ANDROID__ || defined __linux__ || defined __APPLE__ #if defined(__i386__) || defined(__x86_64__) || defined(_M_IX86) || defined(_M_X64) DEFINE_INSTCODE(some_mmx, 0x0f, 0xdb, 0xc0) // pand mm0,mm0 DEFINE_INSTCODE(some_sse, 0x0f, 0x54, 0xc0) // andps xmm0,xmm0 @@ -345,6 +346,7 @@ DEFINE_INSTCODE(some_neon, 0x0d40ef00) // vadd.f32 q0,q0,q0 DEFINE_INSTCODE(some_vfpv4, 0x0600ffb6) // vcvt.f16.f32 d0,q0 #endif +#endif // defined _WIN32 || defined __ANDROID__ || defined __linux__ || defined __APPLE__ #if defined __ANDROID__ || defined __linux__