Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

detect avx512 isa with signal action on macos #5185

Merged
merged 5 commits into from
Nov 29, 2023
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
2 changes: 1 addition & 1 deletion CMakeLists.txt
Original file line number Diff line number Diff line change
Expand Up @@ -149,7 +149,7 @@ endif()
if((IOS AND CMAKE_OSX_ARCHITECTURES MATCHES "arm")
OR (APPLE AND CMAKE_OSX_ARCHITECTURES MATCHES "arm64")
OR (CMAKE_SYSTEM_PROCESSOR MATCHES "^(arm|aarch64)")
OR (CMAKE_CXX_COMPILER_ARCHITECTURE_ID MATCHES "ARM64")
OR (CMAKE_CXX_COMPILER_ARCHITECTURE_ID MATCHES "(ARMV7|ARM64)")
OR ((CMAKE_CXX_COMPILER_ID MATCHES "MSVC" OR (CMAKE_CXX_COMPILER_ID MATCHES "Clang" AND CMAKE_CXX_SIMULATE_ID MATCHES "MSVC" AND CMAKE_CXX_COMPILER_FRONTEND_VARIANT MATCHES "MSVC")) AND (${CMAKE_GENERATOR_PLATFORM} MATCHES "^(arm|arm64)")))
set(NCNN_TARGET_ARCH arm)

Expand Down
154 changes: 118 additions & 36 deletions src/cpu.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -17,6 +17,8 @@
#include "platform.h"

#include <limits.h>
#include <setjmp.h>
#include <signal.h>
#include <stdio.h>
#include <string.h>

Expand Down Expand Up @@ -232,8 +234,98 @@ static int detectisa(const void* some_inst)
#endif
#endif

#if __arm__
#if __aarch64__
#elif defined __ANDROID__ || defined __linux__ || defined __APPLE__
static int g_sigill_caught = 0;
static sigjmp_buf g_jmpbuf;

static void catch_sigill(int signo, siginfo_t* si, void* data)
{
g_sigill_caught = 1;
siglongjmp(g_jmpbuf, -1);
}

static int detectisa(void (*some_inst)())
{
g_sigill_caught = 0;

struct sigaction sa = {0};
struct sigaction old_sa;
sa.sa_flags = SA_ONSTACK | SA_RESTART | SA_SIGINFO;
sa.sa_sigaction = catch_sigill;
sigaction(SIGILL, &sa, &old_sa);

if (sigsetjmp(g_jmpbuf, 1) == 0)
{
some_inst();
}

sigaction(SIGILL, &old_sa, NULL);

return g_sigill_caught ? 0 : 1;
}

#if defined(__i386__) || defined(__x86_64__) || defined(_M_IX86) || defined(_M_X64)
#define DEFINE_INSTCODE(name, ...) \
static void name() \
{ \
asm volatile(".byte " #__VA_ARGS__ \
: \
: \
:); \
};
#elif __aarch64__
#define DEFINE_INSTCODE(name, ...) \
static void name() \
{ \
asm volatile(".word " #__VA_ARGS__ \
: \
: \
:); \
};
#elif __arm__
#define DEFINE_INSTCODE(name, ...) \
static void name() \
{ \
asm volatile(".word " #__VA_ARGS__ \
: \
: \
:); \
};
#endif

#endif // defined _WIN32 || defined __ANDROID__ || defined __linux__ || defined __APPLE__

#if defined _WIN32 || defined __ANDROID__ || defined __linux__ || defined __APPLE__
#if defined(__i386__) || defined(__x86_64__) || defined(_M_IX86) || defined(_M_X64)
DEFINE_INSTCODE(some_mmx, 0x0f, 0xdb, 0xc0) // pand mm0,mm0
DEFINE_INSTCODE(some_sse, 0x0f, 0x54, 0xc0) // andps xmm0,xmm0
DEFINE_INSTCODE(some_sse2, 0x66, 0x0f, 0xfe, 0xc0) // paddd xmm0,xmm0
DEFINE_INSTCODE(some_sse3, 0xf2, 0x0f, 0x7c, 0xc0) // haddps xmm0,xmm0
DEFINE_INSTCODE(some_ssse3, 0x66, 0x0f, 0x38, 0x06, 0xc0) // phsubd xmm0,xmm0
DEFINE_INSTCODE(some_sse41, 0x66, 0x0f, 0x38, 0x3d, 0xc0) // pmaxsd xmm0,xmm0
DEFINE_INSTCODE(some_sse42, 0x66, 0x0f, 0x38, 0x37, 0xc0) // pcmpgtq xmm0,xmm0
DEFINE_INSTCODE(some_sse4a, 0x66, 0x0f, 0x79, 0xc0) // extrq xmm0,xmm0
DEFINE_INSTCODE(some_xop, 0x8f, 0xe8, 0x78, 0xb6, 0xc0, 0x00) // vpmadcswd %xmm0,%xmm0,%xmm0,%xmm0
DEFINE_INSTCODE(some_avx, 0xc5, 0xfc, 0x54, 0xc0) // vandps ymm0,ymm0,ymm0
DEFINE_INSTCODE(some_f16c, 0xc4, 0xe2, 0x7d, 0x13, 0xc0) // vcvtph2ps ymm0,xmm0
DEFINE_INSTCODE(some_fma, 0xc4, 0xe2, 0x7d, 0x98, 0xc0) // vfmadd132ps ymm0,ymm0,ymm0
DEFINE_INSTCODE(some_avx2, 0xc5, 0xfd, 0xfe, 0xc0) // vpaddd ymm0,ymm0,ymm0
DEFINE_INSTCODE(some_avx512f, 0x62, 0xf1, 0x7c, 0x48, 0x58, 0xc0) // vaddps zmm0,zmm0,zmm0
DEFINE_INSTCODE(some_avx512bw, 0x62, 0xf1, 0x7d, 0x48, 0xfd, 0xc0) // vpaddw zmm0,zmm0,zmm0
DEFINE_INSTCODE(some_avx512cd, 0x62, 0xf2, 0xfd, 0x48, 0x44, 0xc0) // vplzcntq zmm0,zmm0
DEFINE_INSTCODE(some_avx512dq, 0x62, 0xf1, 0x7c, 0x48, 0x54, 0xc0) // vandps zmm0,zmm0,zmm0
DEFINE_INSTCODE(some_avx512vl, 0x62, 0xf2, 0xfd, 0x28, 0x1f, 0xc0) // vpabsq ymm0,ymm0
DEFINE_INSTCODE(some_avx512vnni, 0x62, 0xf2, 0x7d, 0x48, 0x52, 0xc0) // vpdpwssd %zmm0,%zmm0,%zmm0
DEFINE_INSTCODE(some_avx512bf16, 0x62, 0xf2, 0x7e, 0x48, 0x52, 0xc0) // vdpbf16ps %zmm0,%zmm0,%zmm0
DEFINE_INSTCODE(some_avx512ifma, 0x62, 0xf2, 0xfd, 0x48, 0xb4, 0xc0) // vpmadd52luq %zmm0,%zmm0,%zmm0
DEFINE_INSTCODE(some_avx512vbmi, 0x62, 0xf2, 0x7d, 0x48, 0x75, 0xc0) // vpermi2b %zmm0,%zmm0,%zmm0
DEFINE_INSTCODE(some_avx512vbmi2, 0x62, 0xf2, 0x7d, 0x48, 0x71, 0xc0) // vpshldvd %zmm0,%zmm0,%zmm0
DEFINE_INSTCODE(some_avx512fp16, 0x62, 0xf6, 0x7d, 0x48, 0x98, 0xc0) // vfmadd132ph %zmm0,%zmm0,%zmm0
DEFINE_INSTCODE(some_avxvnni, 0x62, 0xf2, 0x7d, 0x28, 0x52, 0xc0) // vpdpwssd ymm0,ymm0,ymm0
DEFINE_INSTCODE(some_avxvnniint8, 0xc4, 0xe2, 0x7f, 0x50, 0xc0) // vpdpbssd ymm0,ymm0,ymm0
DEFINE_INSTCODE(some_avxifma, 0x62, 0xf2, 0xfd, 0x28, 0xb4, 0xc0) // vpmadd52luq %ymm0,%ymm0,%ymm0

#elif __aarch64__
DEFINE_INSTCODE(some_neon, 0x4e20d400) // fadd v0.4s,v0.4s,v0.4s
DEFINE_INSTCODE(some_vfpv4, 0x0e216800) // fcvtn v0.4h,v0.4s
DEFINE_INSTCODE(some_cpuid, 0xd5380000) // mrs x0,midr_el1
Expand All @@ -247,14 +339,14 @@ DEFINE_INSTCODE(some_sve2, 0x44405000) // smlslb z0.h,z0.b,z0.b
DEFINE_INSTCODE(some_svebf16, 0x6460e400) // bfmmla z0.s,z0.h,z0.h
DEFINE_INSTCODE(some_svei8mm, 0x45009800) // smmla z0.s,z0.b,z0.b
DEFINE_INSTCODE(some_svef32mm, 0x64a0e400) // fmmla z0.s,z0.s,z0.s
#else

#elif __arm__
DEFINE_INSTCODE(some_edsp, 0x0000fb20) // smlad r0,r0,r0,r0
DEFINE_INSTCODE(some_neon, 0x0d40ef00) // vadd.f32 q0,q0,q0
DEFINE_INSTCODE(some_vfpv4, 0x0600ffb6) // vcvt.f16.f32 d0,q0

#endif // __aarch64__
#endif // __arm__
#endif // defined _WIN32
#endif
#endif // defined _WIN32 || defined __ANDROID__ || defined __linux__ || defined __APPLE__

#if defined __ANDROID__ || defined __linux__

Expand Down Expand Up @@ -509,9 +601,6 @@ static inline int x86_get_xcr0()

static int get_cpu_support_x86_avx()
{
#if !NCNN_AVX
return 0;
#endif
unsigned int cpu_info[4] = {0};
x86_cpuid(0, cpu_info);

Expand All @@ -533,9 +622,6 @@ static int get_cpu_support_x86_avx()

static int get_cpu_support_x86_fma()
{
#if !NCNN_FMA
return 0;
#endif
unsigned int cpu_info[4] = {0};
x86_cpuid(0, cpu_info);

Expand All @@ -557,9 +643,6 @@ static int get_cpu_support_x86_fma()

static int get_cpu_support_x86_xop()
{
#if !NCNN_XOP
return 0;
#endif
unsigned int cpu_info[4] = {0};
x86_cpuid(0x80000000, cpu_info);

Expand All @@ -573,9 +656,6 @@ static int get_cpu_support_x86_xop()

static int get_cpu_support_x86_f16c()
{
#if !NCNN_F16C
return 0;
#endif
unsigned int cpu_info[4] = {0};
x86_cpuid(0, cpu_info);

Expand All @@ -590,9 +670,6 @@ static int get_cpu_support_x86_f16c()

static int get_cpu_support_x86_avx2()
{
#if !NCNN_AVX2
return 0;
#endif
unsigned int cpu_info[4] = {0};
x86_cpuid(0, cpu_info);

Expand All @@ -615,9 +692,9 @@ static int get_cpu_support_x86_avx2()

static int get_cpu_support_x86_avx_vnni()
{
#if !NCNN_AVXVNNI
return 0;
#endif
#if __APPLE__
return detectisa(some_avxvnni);
#else
unsigned int cpu_info[4] = {0};
x86_cpuid(0, cpu_info);

Expand All @@ -636,13 +713,14 @@ static int get_cpu_support_x86_avx_vnni()

x86_cpuid_sublevel(7, 1, cpu_info);
return cpu_info[0] & (1u << 4);
#endif
}

static int get_cpu_support_x86_avx512()
{
#if !NCNN_AVX512
return 0;
#endif
#if __APPLE__
return detectisa(some_avx512f) && detectisa(some_avx512bw) && detectisa(some_avx512cd) && detectisa(some_avx512dq) && detectisa(some_avx512vl);
#else
unsigned int cpu_info[4] = {0};
x86_cpuid(0, cpu_info);

Expand All @@ -665,13 +743,14 @@ static int get_cpu_support_x86_avx512()

x86_cpuid_sublevel(7, 0, cpu_info);
return (cpu_info[1] & (1u << 16)) && (cpu_info[1] & (1u << 17)) && (cpu_info[1] & (1u << 28)) && (cpu_info[1] & (1u << 30)) && (cpu_info[1] & (1u << 31));
#endif
}

static int get_cpu_support_x86_avx512_vnni()
{
#if !NCNN_AVX512VNNI
return 0;
#endif
#if __APPLE__
return detectisa(some_avx512vnni);
#else
unsigned int cpu_info[4] = {0};
x86_cpuid(0, cpu_info);

Expand All @@ -694,13 +773,14 @@ static int get_cpu_support_x86_avx512_vnni()

x86_cpuid_sublevel(7, 0, cpu_info);
return cpu_info[2] & (1u << 11);
#endif
}

static int get_cpu_support_x86_avx512_bf16()
{
#if !NCNN_AVX512BF16
return 0;
#endif
#if __APPLE__
return detectisa(some_avx512bf16);
#else
unsigned int cpu_info[4] = {0};
x86_cpuid(0, cpu_info);

Expand All @@ -719,13 +799,14 @@ static int get_cpu_support_x86_avx512_bf16()

x86_cpuid_sublevel(7, 1, cpu_info);
return cpu_info[0] & (1u << 5);
#endif
}

static int get_cpu_support_x86_avx512_fp16()
{
#if !NCNN_AVX512FP16
return 0;
#endif
#if __APPLE__
return detectisa(some_avx512fp16);
#else
unsigned int cpu_info[4] = {0};
x86_cpuid(0, cpu_info);

Expand All @@ -748,6 +829,7 @@ static int get_cpu_support_x86_avx512_fp16()

x86_cpuid_sublevel(7, 0, cpu_info);
return cpu_info[3] & (1u << 23);
#endif
}
#endif // defined(__i386__) || defined(__x86_64__) || defined(_M_IX86) || defined(_M_X64)

Expand Down
Loading