From 082924837dec5df66eb08e19aec0bf81ac78c3be Mon Sep 17 00:00:00 2001 From: Elliot Saba Date: Tue, 1 Feb 2022 08:14:50 +0000 Subject: [PATCH] Add complex return style detection and `cblas_*_sub` workaround When using `libmkl_rt`, it tends to provide the intel interface rather than the gfortran interface, which causes `{c,z}dot{u,c}` to expect an implicit argument to store the return value in. We provide some autodetection of this, and build some wrappers to work around this for the eight functions affected. In MKL v2022.0, special ILP64-suffixed symbols are provided for the FORTRAN symbols, but not for the CBLAS ones. While we tend to use the FORTRAN symbols for most tasks, there are a few CBLAS symbols we use in the Julia world, presumably because of the above ABI problem. Now that that is fixed, we can likely stop using the CBLAS methods completely and use only the FORTRAN symbols, but for completeness, we detect missing ILP64-suffixed versions as well and forward them to the appropriate FORTRAN symbols. Note that a fully complete mapping of CBLAS -> FORTRAN symbols would be ideal here, however that is a large undertaking, and so we choose here to be selective in which functions we provide mappings for. --- README.md | 2 + src/Make.inc | 11 ++- src/Makefile | 9 +- src/autodetection.c | 63 +++++++++--- src/cblas_adapters.c | 107 +++++++++++++++++++++ src/complex_return_style_adapters.c | 123 ++++++++++++++++++++++++ src/config.c | 5 +- src/libblastrampoline.c | 104 +++++++++++++++++++- src/libblastrampoline.h | 25 ++++- src/libblastrampoline_cblasdata.h | 30 ++++++ src/libblastrampoline_complex_retdata.h | 63 ++++++++++++ src/libblastrampoline_internal.h | 9 +- src/mkl_v2022_adapters.c | 25 +++++ test/direct.jl | 30 ++++++ test/runtests.jl | 59 +++++------- test/utils.jl | 29 +++++- test/zdotc_test/Makefile | 15 +++ test/zdotc_test/zdotc_test.c | 42 ++++++++ 18 files changed, 689 insertions(+), 62 deletions(-) create mode 100644 src/cblas_adapters.c create mode 100644 src/complex_return_style_adapters.c create mode 100644 src/libblastrampoline_cblasdata.h create mode 100644 src/libblastrampoline_complex_retdata.h create mode 100644 src/mkl_v2022_adapters.c create mode 100644 test/zdotc_test/Makefile create mode 100644 test/zdotc_test/zdotc_test.c diff --git a/README.md b/README.md index 7e63f01..c5220fd 100644 --- a/README.md +++ b/README.md @@ -60,6 +60,8 @@ You can always tell if your system is limited in this fashion by calling `lbt_ge ### Version History +v5.0.0 - Add complex return value wrappers and CBLAS workaround. The complex return value wrapper ensures that all symbols maintain a standard ABI for returning complex numbers, and the CBLAS workaround maps CBLAS symbols to FORTRAN symbols when properly-suffixed CBLAS symbols do not exist, as is the case in MKL `v2022.0`. + v4.1.0 - Add `LBT_STRICT` environment variable that causes calling missing symbols to kill the process. v4.0.0 - Add `suffix_hint` parameter to `lbt_forward()` to allow overriding symbol suffix search order for dual-interface libraries, and allow loading the same library with multiple interfaces. diff --git a/src/Make.inc b/src/Make.inc index 5fe4636..252db76 100644 --- a/src/Make.inc +++ b/src/Make.inc @@ -22,8 +22,8 @@ ifneq (,$(findstring MINGW,$(OS))$(findstring MSYS,$(OS))$(findstring CYGWIN,$(O OS := WINNT endif -LBT_SOVERSION_MAJOR := 4 -LBT_SOVERSION_MINOR := 1 +LBT_SOVERSION_MAJOR := 5 +LBT_SOVERSION_MINOR := 0 LBT_SOVERSION_PATCH := 0 ifeq ($(OS), WINNT) @@ -82,6 +82,13 @@ ifneq (,$(filter $(ARCH), x86_64 i686)) F2C_AUTODETECTION := 1 endif +# If we're on x86_64 or i686, we may need to autodetect MKL v2022's incomplete ILP64 suffixing. +CBLAS_DIVERGENCE_AUTODETECTION := 0 +ifneq (,$(filter $(ARCH), x86_64 i686)) + LBT_CFLAGS += -DCBLAS_DIVERGENCE_AUTODETECTION + CBLAS_DIVERGENCE_AUTODETECTION := 1 +endif + ifeq ($(VERBOSE),0) ENDCOLOR := "\033[0m" diff --git a/src/Makefile b/src/Makefile index b8bc2ca..9726366 100644 --- a/src/Makefile +++ b/src/Makefile @@ -5,7 +5,10 @@ include $(LBT_ROOT)/src/Make.inc all: $(builddir)/libblastrampoline.$(SHLIB_EXT) # Objects we'll build -MAIN_OBJS := libblastrampoline.o dl_utils.o config.o autodetection.o threading.o deepbindless.o trampolines/trampolines_$(ARCH).o +MAIN_OBJS := libblastrampoline.o dl_utils.o config.o \ + autodetection.o \ + complex_return_style_adapters.o \ + threading.o deepbindless.o trampolines/trampolines_$(ARCH).o # Include win_utils.c on windws ifeq ($(OS),WINNT) @@ -17,6 +20,10 @@ ifeq ($(F2C_AUTODETECTION),1) MAIN_OBJS += f2c_adapters.o endif +ifeq ($(CBLAS_DIVERGENCE_AUTODETECTION),1) +MAIN_OBJS += cblas_adapters.o +endif + # Place the `.o` files into `$(builddir)` MAIN_OBJS := $(addprefix $(builddir)/,$(MAIN_OBJS)) diff --git a/src/autodetection.c b/src/autodetection.c index e2909d8..15ee9cb 100644 --- a/src/autodetection.c +++ b/src/autodetection.c @@ -1,4 +1,24 @@ #include "libblastrampoline_internal.h" +#include + +/* + * Search for a symbol that ends in one of the given suffixes. Returns NULL if not found. + */ +const char * symbol_suffix_search(void * handle, const char * symbol_name, const char ** suffixes, const int num_suffixes) { + char symbol_name_suffixed[MAX_SYMBOL_LEN]; + for (int suffix_idx=0; suffix_idx +#include "libblastrampoline_internal.h" + +/* + * Some libraries provide ILP64-suffixed FORTRAN symbols, but forgot the CBLAS ones. + * To allow Julia to still use `cblas_{c,z}dot{c,u}_sub` when linking against the + * explicitly ILP64-suffixed MKL libraries, we map the CBLAS forwards to the FORTRAN + * symbols, where appropriate. This effects MKL v2022.0, x-ref: + * - https://github.com/JuliaLinearAlgebra/libblastrampoline/issues/56 + */ + +extern double complex zdotc_(const int32_t *, + const double complex *, const int32_t *, + const double complex *, const int32_t *); +void lbt_cblas_zdotc_sub(const int32_t N, + const double complex *X, const int32_t incX, + const double complex *Y, const int32_t incY, + double complex * z) +{ + *z = zdotc_(&N, X, &incX, Y, &incY); +} + +extern double complex zdotc_64_(const int64_t *, + const double complex *, const int64_t *, + const double complex *, const int64_t *); +void lbt_cblas_zdotc_sub64_(const int64_t N, + const double complex *X, const int64_t incX, + const double complex *Y, const int64_t incY, + double complex * z) +{ + *z = zdotc_64_(&N, X, &incX, Y, &incY); +} + + +extern double complex zdotu_(const int32_t *, + const double complex *, const int32_t *, + const double complex *, const int32_t *); +void lbt_cblas_zdotu_sub(const int32_t N, + const double complex *X, const int32_t incX, + const double complex *Y, const int32_t incY, + double complex * z) +{ + *z = zdotu_(&N, X, &incX, Y, &incY); +} + +extern double complex zdotu_64_(const int64_t *, + const double complex *, const int64_t *, + const double complex *, const int64_t *); +void lbt_cblas_zdotu_sub64_(const int64_t N, + const double complex *X, const int64_t incX, + const double complex *Y, const int64_t incY, + double complex * z) +{ + *z = zdotu_64_(&N, X, &incX, Y, &incY); +} + + + + + + + + +extern float complex cdotc_(const int32_t *, + const float complex *, const int32_t *, + const float complex *, const int32_t *); +void lbt_cblas_cdotc_sub(const int32_t N, + const float complex *X, const int32_t incX, + const float complex *Y, const int32_t incY, + float complex * z) +{ + *z = cdotc_(&N, X, &incX, Y, &incY); +} + +extern float complex cdotc_64_(const int64_t *, + const float complex *, const int64_t *, + const float complex *, const int64_t *); +void lbt_cblas_cdotc_sub64_(const int64_t N, + const float complex *X, const int64_t incX, + const float complex *Y, const int64_t incY, + float complex * z) +{ + *z = cdotc_64_(&N, X, &incX, Y, &incY); +} + + +extern float complex cdotu_(const int32_t *, + const float complex *, const int32_t *, + const float complex *, const int32_t *); +void lbt_cblas_cdotu_sub(const int32_t N, + const float complex *X, const int32_t incX, + const float complex *Y, const int32_t incY, + float complex * z) +{ + *z = cdotu_(&N, X, &incX, Y, &incY); +} + +extern float complex cdotu_64_(const int64_t *, + const float complex *, const int64_t *, + const float complex *, const int64_t *); +void lbt_cblas_cdotu_sub64_(const int64_t N, + const float complex *X, const int64_t incX, + const float complex *Y, const int64_t incY, + float complex * z) +{ + *z = cdotu_64_(&N, X, &incX, Y, &incY); +} diff --git a/src/complex_return_style_adapters.c b/src/complex_return_style_adapters.c new file mode 100644 index 0000000..bb20c84 --- /dev/null +++ b/src/complex_return_style_adapters.c @@ -0,0 +1,123 @@ +#include +#include "libblastrampoline_internal.h" + +/* + * Some libraries use an argument-passing convention for returning complex numbers. + * We create wrappers to work around this behavior and provide a consistent ABI + * across all libraries. An example of this style of library is MKL, x-ref: + * - https://community.intel.com/t5/Intel-oneAPI-Math-Kernel-Library/ARPACK-with-MKL-crashes-when-calling-zdotc/td-p/1054316 + * - https://scicomp.stackexchange.com/questions/5380/intel-mkl-difference-between-mkl-intel-lp64-and-mkl-gf-lp64 + */ + + +// zdotc +extern void (*cmplxret_zdotc__addr)(double complex * z, + const int32_t *, + const double complex *, const int32_t *, + const double complex *, const int32_t *); +double complex cmplxret_zdotc_(const int32_t * N, + const double complex *X, const int32_t * incX, + const double complex *Y, const int32_t * incY) +{ + double complex c; + cmplxret_zdotc__addr(&c, N, X, incX, Y, incY); + return c; +} + +extern void (*cmplxret_zdotc_64__addr)(double complex * z, + const int64_t *, + const double complex *, const int64_t *, + const double complex *, const int64_t *); +double complex cmplxret_zdotc_64_(const int64_t * N, + const double complex *X, const int64_t * incX, + const double complex *Y, const int64_t * incY) +{ + double complex c; + cmplxret_zdotc_64__addr(&c, N, X, incX, Y, incY); + return c; +} + + +// zdotu +extern void (*zdotu__addr)(double complex * z, + const int32_t *, + const double complex *, const int32_t *, + const double complex *, const int32_t *); +double complex cmplxret_zdotu_(const int32_t * N, + const double complex *X, const int32_t * incX, + const double complex *Y, const int32_t * incY) +{ + double complex c; + zdotu__addr(&c, N, X, incX, Y, incY); + return c; +} + +extern void (*zdotu_64__addr)(double complex * z, + const int64_t *, + const double complex *, const int64_t *, + const double complex *, const int64_t *); +double complex cmplxret_zdotu_64_(const int64_t * N, + const double complex *X, const int64_t * incX, + const double complex *Y, const int64_t * incY) +{ + double complex c; + zdotu_64__addr(&c, N, X, incX, Y, incY); + return c; +} + + +// cdotc +extern void (*cdotc__addr)(float complex * z, + const int32_t *, + const float complex *, const int32_t *, + const float complex *, const int32_t *); +float complex cmplxret_cdotc_(const int32_t * N, + const float complex *X, const int32_t * incX, + const float complex *Y, const int32_t * incY) +{ + float complex c; + cdotc__addr(&c, N, X, incX, Y, incY); + return c; +} + +extern void cdotc_64__addr(float complex * z, + const int64_t *, + const float complex *, const int64_t *, + const float complex *, const int64_t *); +float complex cmplxret_cdotc_64_(const int64_t * N, + const float complex *X, const int64_t * incX, + const float complex *Y, const int64_t * incY) +{ + float complex c; + cdotc_64__addr(&c, N, X, incX, Y, incY); + return c; +} + + +// cdotu +extern void (*cdotu__addr)(float complex * z, + const int32_t *, + const float complex *, const int32_t *, + const float complex *, const int32_t *); +float complex cmplxret_cdotu_(const int32_t * N, + const float complex *X, const int32_t * incX, + const float complex *Y, const int32_t * incY) +{ + float complex c; + cdotu__addr(&c, N, X, incX, Y, incY); + return c; +} + +extern void (*cdotu_64__addr)(float complex * z, + const int64_t *, + const float complex *, const int64_t *, + const float complex *, const int64_t *); +float complex cmplxret_cdotu_64_(const int64_t * N, + const float complex *X, const int64_t * incX, + const float complex *Y, const int64_t * incY) +{ + float complex c; + cdotu_64__addr(&c, N, X, incX, Y, incY); + return c; +} + diff --git a/src/config.c b/src/config.c index 8b804af..e60f2c5 100644 --- a/src/config.c +++ b/src/config.c @@ -69,7 +69,8 @@ void clear_other_forwards(int skip_idx, uint8_t * forwards, int32_t interface) { } } -void record_library_load(const char * libname, void * handle, const char * suffix, uint8_t * forwards, int interface, int f2c) { +void record_library_load(const char * libname, void * handle, const char * suffix, uint8_t * forwards, + int32_t interface, int32_t complex_retstyle, int32_t f2c, int32_t cblas) { // Scan for the an empty slot, and also check to see if this library has been loaded before. int free_idx = -1; for (int idx=0; idxactive_forwards = (uint8_t *)malloc(sizeof(uint8_t)*(NUM_EXPORTED_FUNCS/8 + 1)); memcpy(new_libinfo->active_forwards, forwards, sizeof(uint8_t)*(NUM_EXPORTED_FUNCS/8 + 1)); new_libinfo->interface = interface; + new_libinfo->complex_retstyle = complex_retstyle; new_libinfo->f2c = f2c; + new_libinfo->cblas = cblas; lbt_config.loaded_libs[free_idx] = new_libinfo; diff --git a/src/libblastrampoline.c b/src/libblastrampoline.c index 93d6816..c7b3d4d 100644 --- a/src/libblastrampoline.c +++ b/src/libblastrampoline.c @@ -1,9 +1,13 @@ #include "libblastrampoline_internal.h" #include "libblastrampoline_trampdata.h" +#include "libblastrampoline_complex_retdata.h" #ifdef F2C_AUTODETECTION #include "libblastrampoline_f2cdata.h" #endif +#ifdef CBLAS_DIVERGENCE_AUTODETECTION +#include "libblastrampoline_cblasdata.h" +#endif // Sentinel to tell us if we've got a deepbindless workaround active or not #define DEEPBINDLESS_INTERFACE_LP64_LOADED 0x01 @@ -40,7 +44,7 @@ LBT_DLLEXPORT void lbt_set_default_func(const void * addr) { /* * Force a forward to a particular value. */ -int32_t set_forward_by_index(int32_t symbol_idx, const void * addr, int32_t interface, int32_t f2c, int32_t verbose) { +int32_t set_forward_by_index(int32_t symbol_idx, const void * addr, int32_t interface, int32_t complex_retstyle, int32_t f2c, int32_t verbose) { // Quit out immediately if this is not a interface setting if (interface != LBT_INTERFACE_LP64 && interface != LBT_INTERFACE_ILP64) { return -1; @@ -63,6 +67,30 @@ int32_t set_forward_by_index(int32_t symbol_idx, const void * addr, int32_t inte } } + if (complex_retstyle == LBT_COMPLEX_RETSTYLE_ARGUMENT) { + // Check to see if this symbol is one of the complex-returning functions + for (int complex_symbol_idx=0; cmplxret_func_idxs[complex_symbol_idx] != -1; ++complex_symbol_idx) { + // Skip any symbols that aren't ours + if (cmplxret_func_idxs[complex_symbol_idx] != symbol_idx) + continue; + + // Report to the user that we're cblas-wrapping this one + if (verbose) { + char exported_name[MAX_SYMBOL_LEN]; + sprintf(exported_name, "%s%s", exported_func_names[symbol_idx], interface == LBT_INTERFACE_ILP64 ? "64_" : ""); + printf(" - [%04d] complex(%s)\n", symbol_idx, exported_name); + } + + if (interface == LBT_INTERFACE_LP64) { + (*cmplxret_func32_addrs[complex_symbol_idx]) = (*exported_func32_addrs[symbol_idx]); + (*exported_func32_addrs[symbol_idx]) = cmplxret32_func_wrappers[complex_symbol_idx]; + } else { + (*cmplxret_func64_addrs[complex_symbol_idx]) = (*exported_func64_addrs[symbol_idx]); + (*exported_func64_addrs[symbol_idx]) = cmplxret64_func_wrappers[complex_symbol_idx]; + } + } + } + #ifdef F2C_AUTODETECTION if (f2c == LBT_F2C_REQUIRED) { // Check to see if this symbol is one of the f2c functions @@ -132,13 +160,13 @@ LBT_DLLEXPORT const void * lbt_get_forward(const char * symbol_name, int32_t int } } -LBT_DLLEXPORT int32_t lbt_set_forward(const char * symbol_name, const void * addr, int32_t interface, int32_t f2c, int32_t verbose) { +LBT_DLLEXPORT int32_t lbt_set_forward(const char * symbol_name, const void * addr, int32_t interface, int32_t complex_retstyle, int32_t f2c, int32_t verbose) { // Search symbol list for `symbol_name`, then sub off to `set_forward_by_index()` int32_t symbol_idx = find_symbol_idx(symbol_name); if (symbol_idx == -1) return -1; - int32_t ret = set_forward_by_index(symbol_idx, addr, interface, f2c, verbose); + int32_t ret = set_forward_by_index(symbol_idx, addr, interface, complex_retstyle, f2c, verbose); if (ret == 0) { // Un-mark this symbol as being provided by any of our libraries; // if you use the footgun API, you can keep track of who is providing what. @@ -187,6 +215,21 @@ LBT_DLLEXPORT int32_t lbt_forward(const char * libname, int32_t clear, int32_t v } } + // Next, let's figure out what the complex return style is: + int complex_retstyle = autodetect_complex_return_style(handle, lib_suffix); + if (complex_retstyle == LBT_COMPLEX_RETSTYLE_UNKNOWN) { + fprintf(stderr, "Unable to autodetect complex return style of \"%s\"\n", libname); + return 0; + } + if (verbose) { + if (complex_retstyle == LBT_COMPLEX_RETSTYLE_NORMAL) { + printf(" -> Autodetected normal complex return style\n"); + } + if (complex_retstyle == LBT_COMPLEX_RETSTYLE_ARGUMENT) { + printf(" -> Autodetected argument-passing complex return style\n"); + } + } + int f2c = LBT_F2C_PLAIN; #ifdef F2C_AUTODETECTION // Next, we need to probe to see if this is an f2c-style calling convention library @@ -206,6 +249,32 @@ LBT_DLLEXPORT int32_t lbt_forward(const char * libname, int32_t clear, int32_t v } #endif + int cblas = LBT_CBLAS_UNKNOWN; +#ifdef CBLAS_DIVERGENCE_AUTODETECTION + // Next, we need to probe to see if this is MKL v2022 with missing ILP64-suffixed + // CBLAS symbols, but only if it's an ILP64 library. + if (interface == LBT_INTERFACE_ILP64) { + cblas = autodetect_cblas_divergence(handle, lib_suffix); + if (verbose) { + switch (cblas) { + case LBT_CBLAS_CONFORMANT: + printf(" -> CBLAS detected\n"); + break; + case LBT_CBLAS_DIVERGENT: + printf(" -> Autodetected CBLAS-divergent library!\n"); + break; + case LBT_CBLAS_UNKNOWN: + printf(" -> CBLAS not found\n"); + break; + default: + printf(" -> ERROR: CBLAS DETECTION FAILED UNEXPECTEDLY\n"); + exit(1); + break; + } + } + } +#endif + /* * Now, if we are opening a 64-bit library with 32-bit names (e.g. suffix == ""), * we can handle that... as long as we're on a system where we can tell a library @@ -280,13 +349,38 @@ LBT_DLLEXPORT int32_t lbt_forward(const char * libname, int32_t clear, int32_t v sprintf(symbol_name, "%s%s", exported_func_names[symbol_idx], lib_suffix); void *addr = lookup_symbol(handle, symbol_name); if (addr != NULL) { - set_forward_by_index(symbol_idx, addr, interface, f2c, verbose); + set_forward_by_index(symbol_idx, addr, interface, complex_retstyle, f2c, verbose); BITFIELD_SET(forwards, symbol_idx); nforwards++; } } - record_library_load(libname, handle, lib_suffix, &forwards[0], interface, f2c); +#ifdef CBLAS_DIVERGENCE_AUTODETECTION + // If we're loading a divergent CBLAS library, we need to scan through all + // CBLAS symbols, and forward them to wrappers which will convert them to + // the FORTRAN equivalents. + if (cblas == LBT_CBLAS_DIVERGENT) { + int32_t cblas_symbol_idx = 0; + for (cblas_symbol_idx = 0; cblas_func_idxs[cblas_symbol_idx] != -1; cblas_symbol_idx += 1) { + int32_t symbol_idx = cblas_func_idxs[cblas_symbol_idx]; + + // Report to the user that we're cblas-wrapping this one + if (verbose) { + char exported_name[MAX_SYMBOL_LEN]; + sprintf(exported_name, "%s%s", exported_func_names[symbol_idx], interface == LBT_INTERFACE_ILP64 ? "64_" : ""); + printf(" - [%04d] cblas(%s)\n", symbol_idx, exported_name); + } + + if (interface == LBT_INTERFACE_LP64) { + (*exported_func32_addrs[symbol_idx]) = cblas32_func_wrappers[cblas_symbol_idx]; + } else { + (*exported_func64_addrs[symbol_idx]) = cblas64_func_wrappers[cblas_symbol_idx]; + } + } + } +#endif + + record_library_load(libname, handle, lib_suffix, &forwards[0], interface, complex_retstyle, f2c, cblas); if (verbose) { printf("Processed %d symbols; forwarded %d symbols with %d-bit interface and mangling to a suffix of \"%s\"\n", symbol_idx, nforwards, interface, lib_suffix); } diff --git a/src/libblastrampoline.h b/src/libblastrampoline.h index 217deb4..cb58d6b 100644 --- a/src/libblastrampoline.h +++ b/src/libblastrampoline.h @@ -57,10 +57,14 @@ typedef struct { // Note that if you use the footgun API (e.g. "lbt_set_forward()") these values will be // zeroed out and you must track them manually if you need to. uint8_t * active_forwards; - // The interface type as autodetected by `lbt_forward`, see `LBT_INTERFACE_XXX` below + // The interface type as autodetected by `lbt_forward`, see `LBT_INTERFACE_XXX` below int32_t interface; - // The `f2c` status as autodetected by `lbt_forward`, see `LBT_F2C_XXX` below + // The complex return style as autodetected by `lbt_forward`, see `LBT_COMPLEX_RETSTYLE_XXX` below + int32_t complex_retstyle; + // The `f2c` status as autodetected by `lbt_forward`, see `LBT_F2C_XXX` below int32_t f2c; + // The `cblas` status as autodetected by `lbt_forward`, see `LBT_CBLAS_XXX` below + int32_t cblas; } lbt_library_info_t; // Possible values for `interface` in `lbt_library_info_t` @@ -76,6 +80,21 @@ typedef struct { #define LBT_F2C_REQUIRED 1 #define LBT_F2C_UNKNOWN -1 +// Possible values for `retstyle` in `lbt_library_info_t` +// These describe whether a library is using "normal" return value passing (e.g. through +// the `XMM{0,1}` registers on x86_64, or the `ST{0,1}` floating-point registers on i686) +#define LBT_COMPLEX_RETSTYLE_NORMAL 0 +#define LBT_COMPLEX_RETSTYLE_ARGUMENT 1 +#define LBT_COMPLEX_RETSTYLE_UNKNOWN -1 + +// Possible values for `cblas` in `lbt_library_info_t` +// These describe whether a library has properly named CBLAS symbols, or as in the case of +// MKL v2022.0, the CBLAS symbols lack the ILP64 suffixes, and will need to be adapted to +// forward to the ILP64-suffixed FORTRAN symbols. +#define LBT_CBLAS_CONFORMANT 0 +#define LBT_CBLAS_DIVERGENT 1 +#define LBT_CBLAS_UNKNOWN -1 + // The config type you get back from `lbt_get_config()` typedef struct { // The NULL-terminated list of libraries loaded via `lbt_forward()`. @@ -208,7 +227,7 @@ LBT_DLLEXPORT const void * lbt_get_forward(const char * symbol_name, int32_t int * If `addr` is set to `NULL` it will be set as the default function, see `lbt_set_default_func()` * for how to set the default function pointer. */ -LBT_DLLEXPORT int32_t lbt_set_forward(const char * symbol_name, const void * addr, int32_t interface, int32_t f2c, int32_t verbose); +LBT_DLLEXPORT int32_t lbt_set_forward(const char * symbol_name, const void * addr, int32_t interface, int32_t complex_retstyle, int32_t f2c, int32_t verbose); #ifdef __cplusplus } // extern "C" diff --git a/src/libblastrampoline_cblasdata.h b/src/libblastrampoline_cblasdata.h new file mode 100644 index 0000000..47e42d8 --- /dev/null +++ b/src/libblastrampoline_cblasdata.h @@ -0,0 +1,30 @@ +#define XX(name, index) extern const void * lbt_##name ; +#define XX_64(name, index) extern const void * lbt_##name##64_ ; +CBLAS_SUB_FUNCS(XX) +CBLAS_SUB_FUNCS(XX_64) +#undef XX +#undef XX_64 + + +// Next, create an array that that points to all of our wrapper code +// locations, allowing a cblas index -> function lookup +#define XX(name, index) &lbt_##name, +#define XX_64(name, index) &lbt_##name##64_, +const void ** cblas32_func_wrappers[] = { + CBLAS_SUB_FUNCS(XX) + NULL +}; +const void ** cblas64_func_wrappers[] = { + CBLAS_SUB_FUNCS(XX_64) + NULL +}; +#undef XX +#undef XX_64 + +// Finally, an array that maps cblas index -> exported symbol index +#define XX(name, index) index, +const int cblas_func_idxs[] = { + CBLAS_SUB_FUNCS(XX) + -1 +}; +#undef XX \ No newline at end of file diff --git a/src/libblastrampoline_complex_retdata.h b/src/libblastrampoline_complex_retdata.h new file mode 100644 index 0000000..ea9ecc9 --- /dev/null +++ b/src/libblastrampoline_complex_retdata.h @@ -0,0 +1,63 @@ +// These hold the "true" address that the complex return wrapper invokes in turn +#define XX(name, idx) LBT_HIDDEN const void * cmplxret_##name##_addr; +#define XX_64(name, idx) LBT_HIDDEN const void * cmplxret_##name##64__addr; +COMPLEX64_FUNCS(XX) +COMPLEX64_FUNCS(XX_64) +COMPLEX128_FUNCS(XX) +COMPLEX128_FUNCS(XX_64) +#undef XX +#undef XX_64 + +// Build mapping from cmplxret-index to `_addr` instance +#define XX(name, index) &cmplxret_##name##_addr, +#define XX_64(name, index) &cmplxret_##name##64__addr, +const void ** cmplxret_func32_addrs[] = { + COMPLEX64_FUNCS(XX) + COMPLEX128_FUNCS(XX) + NULL +}; +const void ** cmplxret_func64_addrs[] = { + COMPLEX64_FUNCS(XX_64) + COMPLEX128_FUNCS(XX_64) + NULL +}; +#undef XX +#undef XX_64 + + +// Forward-declare some functions +#define XX(name, index) extern const void * cmplxret_##name ; +#define XX_64(name, index) extern const void * cmplxret_##name##64_ ; +COMPLEX64_FUNCS(XX) +COMPLEX128_FUNCS(XX) +COMPLEX64_FUNCS(XX_64) +COMPLEX128_FUNCS(XX_64) +#undef XX +#undef XX_64 + + +// Next, create an array that that points to all of our wrapper code +// locations, allowing a cblas index -> function lookup +#define XX(name, index) &cmplxret_##name, +#define XX_64(name, index) &cmplxret_##name##64_, +const void ** cmplxret32_func_wrappers[] = { + COMPLEX64_FUNCS(XX) + COMPLEX128_FUNCS(XX) + NULL +}; +const void ** cmplxret64_func_wrappers[] = { + COMPLEX64_FUNCS(XX_64) + COMPLEX128_FUNCS(XX_64) + NULL +}; +#undef XX +#undef XX_64 + +// Finally, an array that maps cblas index -> exported symbol index +#define XX(name, index) index, +const int cmplxret_func_idxs[] = { + COMPLEX64_FUNCS(XX) + COMPLEX128_FUNCS(XX) + -1 +}; +#undef XX \ No newline at end of file diff --git a/src/libblastrampoline_internal.h b/src/libblastrampoline_internal.h index 717ad0b..1de82b9 100644 --- a/src/libblastrampoline_internal.h +++ b/src/libblastrampoline_internal.h @@ -59,7 +59,8 @@ int32_t find_symbol_idx(const char * name); void init_config(); void clear_loaded_libraries(); void clear_forwarding_mark(int32_t symbol_idx, int32_t interface); -void record_library_load(const char * libname, void * handle, const char * suffix, uint8_t * forwards, int interface, int f2c); +void record_library_load(const char * libname, void * handle, const char * suffix, uint8_t * forwards, + int32_t interface, int32_t complex_retstyle, int32_t f2c, int32_t cblas); // Functions in `win_utils.c` #ifdef _OS_WINDOWS_ @@ -79,9 +80,13 @@ const char * autodetect_symbol_suffix(void * handle, const char * suffix_hint); int32_t autodetect_blas_interface(void * isamax_addr); int32_t autodetect_lapack_interface(void * dpotrf_addr); int32_t autodetect_interface(void * handle, const char * suffix); +int32_t autodetect_complex_return_style(void * handle, const char * suffix); #ifdef F2C_AUTODETECTION -int autodetect_f2c(void * handle, const char * suffix); +int32_t autodetect_f2c(void * handle, const char * suffix); +#endif +#ifdef CBLAS_DIVERGENCE_AUTODETECTION +int32_t autodetect_cblas_divergence(void * handle, const char * suffix); #endif // Functions in deepbindless_surrogates.c diff --git a/src/mkl_v2022_adapters.c b/src/mkl_v2022_adapters.c new file mode 100644 index 0000000..ed03747 --- /dev/null +++ b/src/mkl_v2022_adapters.c @@ -0,0 +1,25 @@ +#include +#include +#include "libblastrampoline_internal.h" + +/* + * In MKL v2022.0, a new ILP64 interface was released, but it sadly lacked a few symbol names. + * While waiting for a new release, the following intermediate + */ + +/* +double cblas_ddot_64(const int64_t N, const double *X, const int64_t incX, const double *Y, const int64_t incY) +{ + return ddot_64(&N, X, &incX, Y, &incY); +}*/ + +extern double complex (*mkl_cblas_zdotc_sub_64__addr)(const int64_t N, + const double complex *X, const int64_t incX, + const double complex *Y, const int64_t incY); +void mkl_cblas_zdotc_sub_64_(const int64_t N, + const double complex *X, const int64_t incX, + const double complex *Y, const int64_t incY, + double complex * z) +{ + *z = mkl_cblas_zdotc_sub_64__addr(N, X, incX, Y, incY); +} \ No newline at end of file diff --git a/test/direct.jl b/test/direct.jl index c95c500..8954667 100644 --- a/test/direct.jl +++ b/test/direct.jl @@ -69,6 +69,13 @@ lbt_handle = dlopen("$(lbt_prefix)/$(binlib)/lib$(lbt_link_name).$(shlib_ext)", @test libs[1].interface == LBT_INTERFACE_LP64 end @test libs[1].f2c == LBT_F2C_PLAIN + if Sys.ARCH == :x86_64 + @test libs[1].cblas == LBT_CBLAS_CONFORMANT + else + @test libs[1].cblas == LBT_CBLAS_UNKNOWN + end + @test libs[1].complex_retstyle == LBT_COMLPEX_RETSTYLE_NORMAL + @test bitfield_get(libs[1].active_forwards, dgemm_idx) != 0 # Next check OpenBLAS32_jll which is always LP64 @@ -76,6 +83,7 @@ lbt_handle = dlopen("$(lbt_prefix)/$(binlib)/lib$(lbt_link_name).$(shlib_ext)", @test libs[2].suffix == "" @test libs[2].interface == LBT_INTERFACE_LP64 @test libs[2].f2c == LBT_F2C_PLAIN + @test libs[2].complex_retstyle == LBT_COMLPEX_RETSTYLE_NORMAL # If OpenBLAS32 and OpenBLAS are the same interface (e.g. i686) # then libs[2].active_forwards should be all zero! @@ -219,4 +227,26 @@ if MKL_jll.is_available() && Sys.WORD_SIZE == 64 @test libs[2].libname == libmkl_rt @test libs[2].interface == LBT_INTERFACE_LP64 end + + @testset "MKL v2022 CBLAS workaround" begin + # Load ILP64 MKL + lbt_forward(lbt_handle, libmkl_rt; clear=true, suffix_hint = "64") + + config = lbt_get_config(lbt_handle) + libs = unpack_loaded_libraries(config) + @test length(libs) == 1 + @test libs[1].interface == LBT_INTERFACE_ILP64 + @test libs[1].cblas == LBT_CBLAS_DIVERGENT + @test libs[1].complex_retstyle == LBT_COMLPEX_RETSTYLE_ARGUMENT + + # Call cblas_zdotu_sub, showcasing that it doesn't work + empty!(stacktraces) + A = ComplexF64[3.1 + 1.4im, -1.0 + 1.2im] + B = ComplexF64[1.3 + 0.3im, -1.1 + -3.4im] + result = ComplexF64[0] + zdotu_fptr = dlsym(lbt_handle, :cblas_zdotc_sub64_) + ccall(zdotu_fptr, Cvoid, (Int64, Ptr{ComplexF64}, Int64, Ptr{ComplexF64}, Int64, Ptr{ComplexF64}), 2, A, 1, B, 1, result) + @test result[1] ≈ ComplexF64(1.47 + 3.83im) + @test isempty(stacktraces) + end end diff --git a/test/runtests.jl b/test/runtests.jl index cd613e6..78d5e3e 100644 --- a/test/runtests.jl +++ b/test/runtests.jl @@ -75,11 +75,22 @@ function run_test((test_name, test_expected_outputs, test_success), libblas_name end end -# our tests +# our tests, written in C, defined in subdirectories in `test/` dgemm = ("dgemm_test", ("||C||^2 is: 24.3384",), true) sgesv = ("sgesv_test", ("||b||^2 is: 3.0000",), true) sgesv_failure = ("sgesv_test", ("Error: no BLAS/LAPACK library loaded!",), false) -sdot = ("sdot_test", ("C is: 1.9900",), true) +sdot = ("sdot_test", ("C is: 1.9900",), true) +zdotc = ("zdotc_test", ( + "C (cblas) is: ( 1.4700, 3.8300)", + "C (fortran) is: ( 1.4700, 3.8300)", + ), true) + +# Helper function to run all the tests with the given arguments +function run_all_tests(args...; tests = [dgemm, sgesv, sdot, zdotc]) + for test in tests + run_test(test, args...) + end +end # Build version that links against vanilla OpenBLAS openblas_interface = :LP64 @@ -88,16 +99,12 @@ if Sys.WORD_SIZE == 64 && Sys.ARCH != :aarch64 end openblas_jll_libname = splitext(basename(OpenBLAS_jll.libopenblas_path)[4:end])[1] @testset "Vanilla OpenBLAS_jll ($(openblas_interface))" begin - run_test(dgemm, openblas_jll_libname, OpenBLAS_jll.LIBPATH_list, openblas_interface, "") - run_test(sgesv, openblas_jll_libname, OpenBLAS_jll.LIBPATH_list, openblas_interface, "") - run_test(sdot, openblas_jll_libname, OpenBLAS_jll.LIBPATH_list, openblas_interface, "") + run_all_tests(openblas_jll_libname, OpenBLAS_jll.LIBPATH_list, openblas_interface, "") end # Build version that links against vanilla OpenBLAS32 @testset "Vanilla OpenBLAS32_jll (LP64)" begin - run_test(dgemm, "openblas", OpenBLAS32_jll.LIBPATH_list, :LP64, "") - run_test(sgesv, "openblas", OpenBLAS32_jll.LIBPATH_list, :LP64, "") - run_test(sdot, "openblas", OpenBLAS32_jll.LIBPATH_list, :LP64, "") + run_all_tests("openblas", OpenBLAS32_jll.LIBPATH_list, :LP64, "") end # Next, build a version that links against `libblastrampoline`, and tell @@ -107,26 +114,20 @@ lbt_dir = joinpath(lbt_dir, binlib) @testset "LBT -> OpenBLAS_jll ($(openblas_interface))" begin libdirs = unique(vcat(lbt_dir, OpenBLAS_jll.LIBPATH_list..., CompilerSupportLibraries_jll.LIBPATH_list...)) - run_test(dgemm, lbt_link_name, libdirs, openblas_interface, OpenBLAS_jll.libopenblas_path) - run_test(sgesv, lbt_link_name, libdirs, openblas_interface, OpenBLAS_jll.libopenblas_path) - run_test(sdot, lbt_link_name, libdirs, openblas_interface, OpenBLAS_jll.libopenblas_path) + run_all_tests("blastrampoline", libdirs, openblas_interface, OpenBLAS_jll.libopenblas_path) end # And again, but this time with OpenBLAS32_jll @testset "LBT -> OpenBLAS32_jll (LP64)" begin libdirs = unique(vcat(lbt_dir, OpenBLAS32_jll.LIBPATH_list..., CompilerSupportLibraries_jll.LIBPATH_list...)) - run_test(dgemm, lbt_link_name, libdirs, :LP64, OpenBLAS32_jll.libopenblas_path) - run_test(sgesv, lbt_link_name, libdirs, :LP64, OpenBLAS32_jll.libopenblas_path) - run_test(sdot, lbt_link_name, libdirs, :LP64, OpenBLAS32_jll.libopenblas_path) + run_all_tests("blastrampoline", libdirs, :LP64, OpenBLAS32_jll.libopenblas_path) end # Test against MKL_jll using `libmkl_rt`, which is :LP64 by default if MKL_jll.is_available() @testset "LBT -> MKL_jll (LP64)" begin libdirs = unique(vcat(lbt_dir, MKL_jll.LIBPATH_list..., CompilerSupportLibraries_jll.LIBPATH_list...)) - run_test(dgemm, lbt_link_name, libdirs, :LP64, MKL_jll.libmkl_rt_path) - run_test(sgesv, lbt_link_name, libdirs, :LP64, MKL_jll.libmkl_rt_path) - run_test(sdot, lbt_link_name, libdirs, :LP64, MKL_jll.libmkl_rt_path) + run_all_tests("blastrampoline", libdirs, :LP64, MKL_jll.libmkl_rt_path) end # Test that we can set MKL's interface via an environment variable to select ILP64, and LBT detects it properly @@ -134,9 +135,7 @@ if MKL_jll.is_available() @testset "LBT -> MKL_jll (ILP64, via env)" begin withenv("MKL_INTERFACE_LAYER" => "ILP64") do libdirs = unique(vcat(lbt_dir, MKL_jll.LIBPATH_list..., CompilerSupportLibraries_jll.LIBPATH_list...)) - run_test(dgemm, lbt_link_name, libdirs, :ILP64, MKL_jll.libmkl_rt_path) - run_test(sgesv, lbt_link_name, libdirs, :ILP64, MKL_jll.libmkl_rt_path) - run_test(sdot, lbt_link_name, libdirs, :ILP64, MKL_jll.libmkl_rt_path) + run_all_tests("blastrampoline", libdirs, :ILP64, MKL_jll.libmkl_rt_path) end end end @@ -145,16 +144,15 @@ end # Do we have Accelerate available? veclib_blas_path = "/System/Library/Frameworks/Accelerate.framework/Versions/A/Frameworks/vecLib.framework/libBLAS.dylib" if isfile(veclib_blas_path) + # Test that we can run BLAS-only tests without LAPACK loaded (`sgesv` test requires LAPACK symbols) @testset "LBT -> vecLib/libBLAS" begin - run_test(dgemm, lbt_link_name, [lbt_dir], :LP64, veclib_blas_path) - run_test(sdot, lbt_link_name, [lbt_dir], :LP64, veclib_blas_path) + run_all_tests("blastrampoline", [lbt_dir], :LP64, veclib_blas_path; tests=[dgemm, sdot, zdotc]) end + # With LAPACK as well, run all tests veclib_lapack_path = "/System/Library/Frameworks/Accelerate.framework/Versions/A/Frameworks/vecLib.framework/libLAPACK.dylib" @testset "LBT -> vecLib/libLAPACK" begin - run_test(dgemm, lbt_link_name, [lbt_dir], :LP64, string(veclib_blas_path, ";", veclib_lapack_path)) - run_test(sgesv, lbt_link_name, [lbt_dir], :LP64, string(veclib_blas_path, ";", veclib_lapack_path)) - run_test(sdot, lbt_link_name, [lbt_dir], :LP64, string(veclib_blas_path, ";", veclib_lapack_path)) + run_all_tests("blastrampoline", [lbt_dir], :LP64, string(veclib_blas_path, ";", veclib_lapack_path)) end end @@ -162,21 +160,16 @@ end # Do we have a `blas64.so` somewhere? If so, test with that for fun blas64 = dlopen("libblas64", throw_error=false) if blas64 !== nothing + # Test that we can run BLAS-only tests without LAPACK loaded (`sgesv` test requires LAPACK symbols, blas64 doesn't have CBLAS) @testset "LBT -> libblas64 (ILP64, BLAS)" begin - run_test(dgemm, lbt_link_name, [lbt_dir], :ILP64, dlpath(blas64)) - run_test(sdot, lbt_link_name, [lbt_dir], :ILP64, dlpath(blas64)) - # Can't run `sgesv` here as we don't have LAPACK symbols in `libblas64.so` - - run_test(sgesv_failure, lbt_link_name, [lbt_dir], :ILP64, dlpath(blas64)) + run_all_tests("blastrampoline", [lbt_dir], :ILP64, dlpath(blas64); tests=[dgemm, sdot]) end # Check if we have a `liblapack` and if we do, run again, this time including `sgesv` lapack = dlopen("liblapack64", throw_error=false) if lapack !== nothing @testset "LBT -> libblas64 + liblapack64 (ILP64, BLAS+LAPACK)" begin - run_test(dgemm, lbt_link_name, [lbt_dir], :ILP64, "$(dlpath(blas64));$(dlpath(lapack))") - run_test(sgesv, lbt_link_name, [lbt_dir], :ILP64, "$(dlpath(blas64));$(dlpath(lapack))") - run_test(sdot, lbt_link_name, [lbt_dir], :ILP64, "$(dlpath(blas64));$(dlpath(lapack))") + run_all_tests("blastrampoline", [lbt_dir], :ILP64, "$(dlpath(blas64));$(dlpath(lapack))"; tests=[dgemm, sdot, sgesv]) end end end diff --git a/test/utils.jl b/test/utils.jl index 5991a5f..af0a7df 100644 --- a/test/utils.jl +++ b/test/utils.jl @@ -87,7 +87,9 @@ struct lbt_library_info_t suffix::Cstring active_forwards::Ptr{UInt8} interface::Int32 + complex_retstyle::Int32 f2c::Int32 + cblas::Int32 end struct LBTLibraryInfo libname::String @@ -95,13 +97,32 @@ struct LBTLibraryInfo suffix::String active_forwards::Vector{UInt8} interface::Int32 + complex_retstyle::Int32 f2c::Int32 - - LBTLibraryInfo(x::lbt_library_info_t, num_symbols::UInt32) = new(unsafe_string(x.libname), x.handle, unsafe_string(x.suffix), unsafe_wrap(Vector{UInt8}, x.active_forwards, div(num_symbols,8)+1), x.interface, x.f2c) + cblas::Int32 + + LBTLibraryInfo(x::lbt_library_info_t, num_symbols::UInt32) = new( + unsafe_string(x.libname), + x.handle, + unsafe_string(x.suffix), + unsafe_wrap(Vector{UInt8}, + x.active_forwards, + div(num_symbols,8)+1), + x.interface, + x.complex_retstyle, + x.f2c, + x.cblas + ) end const LBT_INTERFACE_LP64 = 32 const LBT_INTERFACE_ILP64 = 64 const LBT_F2C_PLAIN = 0 +const LBT_COMLPEX_RETSTYLE_NORMAL = 0 +const LBT_COMLPEX_RETSTYLE_ARGUMENT = 1 +const LBT_COMLPEX_RETSTYLE_UNKNOWN = -1 +const LBT_CBLAS_CONFORMANT = 0 +const LBT_CBLAS_DIVERGENT = 1 +const LBT_CBLAS_UNKNOWN = -1 struct lbt_config_t loaded_libs::Ptr{Ptr{lbt_library_info_t}} @@ -131,8 +152,8 @@ function lbt_get_forward(handle, symbol_name, interface, f2c = LBT_F2C_PLAIN) return ccall(dlsym(handle, :lbt_get_forward), Ptr{Cvoid}, (Cstring, Int32, Int32), symbol_name, interface, f2c) end -function lbt_set_forward(handle, symbol_name, addr, interface, f2c = LBT_F2C_PLAIN; verbose::Bool = false) - return ccall(dlsym(handle, :lbt_set_forward), Int32, (Cstring, Ptr{Cvoid}, Int32, Int32, Int32), symbol_name, addr, interface, f2c, verbose ? 1 : 0) +function lbt_set_forward(handle, symbol_name, addr, interface, complex_retstyle = LBT_COMLPEX_RETSTYLE_NORMAL, f2c = LBT_F2C_PLAIN; verbose::Bool = false) + return ccall(dlsym(handle, :lbt_set_forward), Int32, (Cstring, Ptr{Cvoid}, Int32, Int32, Int32, Int32), symbol_name, addr, interface, complex_retstyle, f2c, verbose ? 1 : 0) end function lbt_set_default_func(handle, addr) diff --git a/test/zdotc_test/Makefile b/test/zdotc_test/Makefile new file mode 100644 index 0000000..ba0b2d6 --- /dev/null +++ b/test/zdotc_test/Makefile @@ -0,0 +1,15 @@ +include ../../src/Make.inc + +all: $(prefix)/zdotc_test$(EXE) + +$(prefix): + @mkdir -p $@ + +$(prefix)/zdotc_test$(EXE): zdotc_test.c | $(prefix) + @$(CC) -o $@ $(CFLAGS) $^ $(LDFLAGS) + +clean: + @rm -f $(prefix)/zdotc_test$(EXE) + +run: $(prefix)/zdotc_test$(EXE) + @$(prefix)/zdotc_test$(EXE) diff --git a/test/zdotc_test/zdotc_test.c b/test/zdotc_test/zdotc_test.c new file mode 100644 index 0000000..205f6b6 --- /dev/null +++ b/test/zdotc_test/zdotc_test.c @@ -0,0 +1,42 @@ +#include +#include +#include + +#ifdef ILP64 +#define MANGLE(x) x##64_ +typedef int64_t blasint; +#else +#define MANGLE(x) x +typedef int32_t blasint; +#endif + +extern double complex MANGLE(cblas_zdotc_sub)(blasint, double complex *, blasint, double complex *, blasint, double complex *); +extern double complex MANGLE(zdotc_)(blasint *, double complex *, blasint *, double complex *, blasint *); + +#define N 2 +int main() +{ + double complex A[N], B[N]; + + // Initialize `A` with known values (transposed into FORTRAN ordering) + A[0] = 3.1 + 1.4*I; + A[1] = -1.0 + 1.2*I; + + // Initialize `b` with known values + B[0] = 1.3 + 0.3*I; + B[1] = -1.1 + -3.4*I; + + // Perform complex dot product + blasint len = N; + blasint inca = 1; + blasint incb = 1; + complex double C; + MANGLE(cblas_zdotc_sub)(len, &A[0], inca, &B[0], incb, &C); + + // Print out C + printf("C (cblas) is: (%8.4f, %8.4f)\n", creal(C), cimag(C)); + + // Do the same thing, but with the FORTRAN interface + C = MANGLE(zdotc_)(&len, &A[0], &inca, &B[0], &incb); + printf("C (fortran) is: (%8.4f, %8.4f)\n", creal(C), cimag(C)); +}