Add complex return style detection and cblas_*_sub workaround

When using `libmkl_rt`, it tends to provide the intel interface rather than the gfortran interface, which causes `{c,z}dot{u,c}` to expect an implicit argument to store the return value in. We provide some autodetection of this, and build some wrappers to work around this for the eight functions affected. In MKL v2022.0, special ILP64-suffixed symbols are provided for the FORTRAN symbols, but not for the CBLAS ones. While we tend to use the FORTRAN symbols for most tasks, there are a few CBLAS symbols we use in the Julia world, presumably because of the above ABI problem. Now that that is fixed, we can likely stop using the CBLAS methods completely and use only the FORTRAN symbols, but for completeness, we detect missing ILP64-suffixed versions as well and forward them to the appropriate FORTRAN symbols. Note that a fully complete mapping of CBLAS -> FORTRAN symbols would be ideal here, however that is a large undertaking, and so we choose here to be selective in which functions we provide mappings for.
JuliaLinearAlgebra · Feb 2, 2022 · 0829248 · 0829248
1 parent f64f859
commit 0829248
Show file tree

Hide file tree

Showing 18 changed files with 689 additions and 62 deletions.
diff --git a/README.md b/README.md
@@ -60,6 +60,8 @@ You can always tell if your system is limited in this fashion by calling `lbt_ge
 
 ### Version History
 
+v5.0.0 - Add complex return value wrappers and CBLAS workaround.  The complex return value wrapper ensures that all symbols maintain a standard ABI for returning complex numbers, and the CBLAS workaround maps CBLAS symbols to FORTRAN symbols when properly-suffixed CBLAS symbols do not exist, as is the case in MKL `v2022.0`.
+
 v4.1.0 - Add `LBT_STRICT` environment variable that causes calling missing symbols to kill the process.
 
 v4.0.0 - Add `suffix_hint` parameter to `lbt_forward()` to allow overriding symbol suffix search order for dual-interface libraries, and allow loading the same library with multiple interfaces.

diff --git a/src/Make.inc b/src/Make.inc
@@ -22,8 +22,8 @@ ifneq (,$(findstring MINGW,$(OS))$(findstring MSYS,$(OS))$(findstring CYGWIN,$(O
   OS := WINNT
 endif
 
-LBT_SOVERSION_MAJOR := 4
-LBT_SOVERSION_MINOR := 1
+LBT_SOVERSION_MAJOR := 5
+LBT_SOVERSION_MINOR := 0
 LBT_SOVERSION_PATCH := 0
 
 ifeq ($(OS), WINNT)
@@ -82,6 +82,13 @@ ifneq (,$(filter $(ARCH), x86_64 i686))
   F2C_AUTODETECTION := 1
 endif
 
+# If we're on x86_64 or i686, we may need to autodetect MKL v2022's incomplete ILP64 suffixing.
+CBLAS_DIVERGENCE_AUTODETECTION := 0
+ifneq (,$(filter $(ARCH), x86_64 i686))
+  LBT_CFLAGS += -DCBLAS_DIVERGENCE_AUTODETECTION
+  CBLAS_DIVERGENCE_AUTODETECTION := 1
+endif
+
 
 ifeq ($(VERBOSE),0)
 ENDCOLOR := "\033[0m"

diff --git a/src/Makefile b/src/Makefile
@@ -5,7 +5,10 @@ include $(LBT_ROOT)/src/Make.inc
 all: $(builddir)/libblastrampoline.$(SHLIB_EXT)
 
 # Objects we'll build
-MAIN_OBJS := libblastrampoline.o dl_utils.o config.o autodetection.o threading.o deepbindless.o trampolines/trampolines_$(ARCH).o
+MAIN_OBJS := libblastrampoline.o dl_utils.o config.o \
+			 autodetection.o \
+			 complex_return_style_adapters.o \
+			 threading.o deepbindless.o trampolines/trampolines_$(ARCH).o
 
 # Include win_utils.c on windws
 ifeq ($(OS),WINNT)
@@ -17,6 +20,10 @@ ifeq ($(F2C_AUTODETECTION),1)
 MAIN_OBJS += f2c_adapters.o
 endif
 
+ifeq ($(CBLAS_DIVERGENCE_AUTODETECTION),1)
+MAIN_OBJS += cblas_adapters.o
+endif
+
 # Place the `.o` files into `$(builddir)`
 MAIN_OBJS := $(addprefix $(builddir)/,$(MAIN_OBJS))
 

diff --git a/src/autodetection.c b/src/autodetection.c
@@ -1,4 +1,24 @@
 #include "libblastrampoline_internal.h"
+#include <complex.h>
+
+/*
+ * Search for a symbol that ends in one of the given suffixes.  Returns NULL if not found.
+ */
+const char * symbol_suffix_search(void * handle, const char * symbol_name, const char ** suffixes, const int num_suffixes) {
+    char symbol_name_suffixed[MAX_SYMBOL_LEN];
+    for (int suffix_idx=0; suffix_idx<num_suffixes; suffix_idx++) {
+        // Skip NULL suffixes (the case where we have no `suffix_hint`)
+        if (suffixes[suffix_idx] == NULL) {
+            continue;
+        }
+
+        sprintf(symbol_name_suffixed, "%s%s", symbol_name, suffixes[suffix_idx]);
+        if (lookup_symbol(handle, symbol_name_suffixed) != NULL) {
+            return suffixes[suffix_idx];
+        }
+    }
+    return NULL;
+}
 
 /*
  * Autodetect the name mangling suffix used by the given BLAS/LAPACK library.
@@ -28,18 +48,10 @@ const char * autodetect_symbol_suffix(void * handle, const char * suffix_hint) {
     };
 
     // If the suffix hint is NULL, just skip it when calling `lookup_symbol()`.
-    int suffix_start_idx = 0;
-    if (suffix_hint == NULL) {
-        suffix_start_idx = 1;
-    }
-
-    char symbol_name[MAX_SYMBOL_LEN];
     for (int symbol_idx=0; symbol_idx<sizeof(symbol_names)/sizeof(const char *); symbol_idx++) {
-        for (int suffix_idx=suffix_start_idx; suffix_idx<sizeof(suffixes)/sizeof(const char *); suffix_idx++) {
-            sprintf(symbol_name, "%s%s", symbol_names[symbol_idx], suffixes[suffix_idx]);
-            if (lookup_symbol(handle, symbol_name) != NULL) {
-                return suffixes[suffix_idx];
-            }
+        const char * suffix = symbol_suffix_search(handle, symbol_names[symbol_idx], suffixes, sizeof(suffixes)/sizeof(const char *));
+        if (suffix != NULL) {
+            return suffix;
         }
     }
     return NULL;
@@ -221,3 +233,32 @@ int32_t autodetect_f2c(void * handle, const char * suffix) {
     return LBT_F2C_UNKNOWN;
 }
 #endif
+
+#ifdef CBLAS_DIVERGENCE_AUTODETECTION
+int32_t autodetect_cblas_divergence(void * handle, const char * suffix) {
+    char symbol_name[MAX_SYMBOL_LEN];
+
+    sprintf(symbol_name, "zdotc_%s", suffix);
+    if (lookup_symbol(handle, symbol_name) != NULL ) {
+        // If we have both `zdotc_64` and `cblas_zdotc_sub64`, it's all good:
+        sprintf(symbol_name, "cblas_zdotc_sub%s", suffix);
+        if (lookup_symbol(handle, symbol_name) != NULL ) {
+            return LBT_CBLAS_CONFORMANT;
+        }
+
+        const char * lp64_suffixes[] = {
+            "", "_", "__",
+        };
+        const char * suffix = symbol_suffix_search(handle, "cblas_zdotc_sub", lp64_suffixes, sizeof(lp64_suffixes)/sizeof(const char *));
+
+        // If we have `zdotc_64`, but we don't have `cblas_zdotc_sub64`, AND we have an
+        // LP64-mangled `cblas_zdotc_sub`, then we know it's a library that has ILP64
+        // FORTRAN symbols, but not CBLAS ones, such as MKL v2022.0.
+        if (suffix != NULL) {
+            return LBT_CBLAS_DIVERGENT;
+        }
+    }
+    // If we can't even find `zdotc_64`, we don't know what this is.
+    return LBT_CBLAS_UNKNOWN;
+}
+#endif
diff --git a/src/cblas_adapters.c b/src/cblas_adapters.c
@@ -0,0 +1,107 @@
+#include <complex.h>
+#include "libblastrampoline_internal.h"
+
+/*
+ * Some libraries provide ILP64-suffixed FORTRAN symbols, but forgot the CBLAS ones.
+ * To allow Julia to still use `cblas_{c,z}dot{c,u}_sub` when linking against the
+ * explicitly ILP64-suffixed MKL libraries, we map the CBLAS forwards to the FORTRAN
+ * symbols, where appropriate.  This effects MKL v2022.0, x-ref:
+ *  - https://github.com/JuliaLinearAlgebra/libblastrampoline/issues/56
+ */
+
+extern double complex zdotc_(const int32_t *,
+                             const double complex *, const int32_t *,
+                             const double complex *, const int32_t *);
+void lbt_cblas_zdotc_sub(const int32_t N,
+                         const double complex *X, const int32_t incX,
+                         const double complex *Y, const int32_t incY,
+                         double complex * z)
+{
+   *z = zdotc_(&N, X, &incX, Y, &incY);
+}
+
+extern double complex zdotc_64_(const int64_t *,
+                                const double complex *, const int64_t *,
+                                const double complex *, const int64_t *);
+void lbt_cblas_zdotc_sub64_(const int64_t N,
+                            const double complex *X, const int64_t incX,
+                            const double complex *Y, const int64_t incY,
+                            double complex * z)
+{
+   *z = zdotc_64_(&N, X, &incX, Y, &incY);
+}
+
+
+extern double complex zdotu_(const int32_t *,
+                             const double complex *, const int32_t *,
+                             const double complex *, const int32_t *);
+void lbt_cblas_zdotu_sub(const int32_t N,
+                         const double complex *X, const int32_t incX,
+                         const double complex *Y, const int32_t incY,
+                         double complex * z)
+{
+   *z = zdotu_(&N, X, &incX, Y, &incY);
+}
+
+extern double complex zdotu_64_(const int64_t *,
+                                const double complex *, const int64_t *,
+                                const double complex *, const int64_t *);
+void lbt_cblas_zdotu_sub64_(const int64_t N,
+                            const double complex *X, const int64_t incX,
+                            const double complex *Y, const int64_t incY,
+                            double complex * z)
+{
+   *z = zdotu_64_(&N, X, &incX, Y, &incY);
+}
+
+
+
+
+
+
+
+
+extern float complex cdotc_(const int32_t *,
+                             const float complex *, const int32_t *,
+                             const float complex *, const int32_t *);
+void lbt_cblas_cdotc_sub(const int32_t N,
+                         const float complex *X, const int32_t incX,
+                         const float complex *Y, const int32_t incY,
+                         float complex * z)
+{
+   *z = cdotc_(&N, X, &incX, Y, &incY);
+}
+
+extern float complex cdotc_64_(const int64_t *,
+                                const float complex *, const int64_t *,
+                                const float complex *, const int64_t *);
+void lbt_cblas_cdotc_sub64_(const int64_t N,
+                            const float complex *X, const int64_t incX,
+                            const float complex *Y, const int64_t incY,
+                            float complex * z)
+{
+   *z = cdotc_64_(&N, X, &incX, Y, &incY);
+}
+
+
+extern float complex cdotu_(const int32_t *,
+                             const float complex *, const int32_t *,
+                             const float complex *, const int32_t *);
+void lbt_cblas_cdotu_sub(const int32_t N,
+                         const float complex *X, const int32_t incX,
+                         const float complex *Y, const int32_t incY,
+                         float complex * z)
+{
+   *z = cdotu_(&N, X, &incX, Y, &incY);
+}
+
+extern float complex cdotu_64_(const int64_t *,
+                                const float complex *, const int64_t *,
+                                const float complex *, const int64_t *);
+void lbt_cblas_cdotu_sub64_(const int64_t N,
+                            const float complex *X, const int64_t incX,
+                            const float complex *Y, const int64_t incY,
+                            float complex * z)
+{
+   *z = cdotu_64_(&N, X, &incX, Y, &incY);
+}
diff --git a/src/complex_return_style_adapters.c b/src/complex_return_style_adapters.c
@@ -0,0 +1,123 @@
+#include <complex.h>
+#include "libblastrampoline_internal.h"
+
+/*
+ * Some libraries use an argument-passing convention for returning complex numbers.
+ * We create wrappers to work around this behavior and provide a consistent ABI
+ * across all libraries.  An example of this style of library is MKL, x-ref:
+ *  - https://community.intel.com/t5/Intel-oneAPI-Math-Kernel-Library/ARPACK-with-MKL-crashes-when-calling-zdotc/td-p/1054316
+ *  - https://scicomp.stackexchange.com/questions/5380/intel-mkl-difference-between-mkl-intel-lp64-and-mkl-gf-lp64
+ */
+
+
+// zdotc
+extern void (*cmplxret_zdotc__addr)(double complex * z,
+                                    const int32_t *,
+                                    const double complex *, const int32_t *,
+                                    const double complex *, const int32_t *);
+double complex cmplxret_zdotc_(const int32_t * N,
+                               const double complex *X, const int32_t * incX,
+                               const double complex *Y, const int32_t * incY)
+{
+   double complex c;
+   cmplxret_zdotc__addr(&c, N, X, incX, Y, incY);
+   return c;
+}
+
+extern void (*cmplxret_zdotc_64__addr)(double complex * z,
+                                       const int64_t *,
+                                       const double complex *, const int64_t *,
+                                       const double complex *, const int64_t *);
+double complex cmplxret_zdotc_64_(const int64_t * N,
+                                  const double complex *X, const int64_t * incX,
+                                  const double complex *Y, const int64_t * incY)
+{
+   double complex c;
+   cmplxret_zdotc_64__addr(&c, N, X, incX, Y, incY);
+   return c;
+}
+
+
+// zdotu
+extern void (*zdotu__addr)(double complex * z,
+                           const int32_t *,
+                           const double complex *, const int32_t *,
+                           const double complex *, const int32_t *);
+double complex cmplxret_zdotu_(const int32_t * N,
+                               const double complex *X, const int32_t * incX,
+                               const double complex *Y, const int32_t * incY)
+{
+   double complex c;
+   zdotu__addr(&c, N, X, incX, Y, incY);
+   return c;
+}
+
+extern void (*zdotu_64__addr)(double complex * z,
+                              const int64_t *,
+                              const double complex *, const int64_t *,
+                              const double complex *, const int64_t *);
+double complex cmplxret_zdotu_64_(const int64_t * N,
+                                  const double complex *X, const int64_t * incX,
+                                  const double complex *Y, const int64_t * incY)
+{
+   double complex c;
+   zdotu_64__addr(&c, N, X, incX, Y, incY);
+   return c;
+}
+
+
+// cdotc
+extern void (*cdotc__addr)(float complex * z,
+                           const int32_t *,
+                           const float complex *, const int32_t *,
+                           const float complex *, const int32_t *);
+float complex cmplxret_cdotc_(const int32_t * N,
+                               const float complex *X, const int32_t * incX,
+                               const float complex *Y, const int32_t * incY)
+{
+   float complex c;
+   cdotc__addr(&c, N, X, incX, Y, incY);
+   return c;
+}
+
+extern void cdotc_64__addr(float complex * z,
+                           const int64_t *,
+                           const float complex *, const int64_t *,
+                           const float complex *, const int64_t *);
+float complex cmplxret_cdotc_64_(const int64_t * N,
+                                  const float complex *X, const int64_t * incX,
+                                  const float complex *Y, const int64_t * incY)
+{
+   float complex c;
+   cdotc_64__addr(&c, N, X, incX, Y, incY);
+   return c;
+}
+
+
+// cdotu
+extern void (*cdotu__addr)(float complex * z,
+                           const int32_t *,
+                           const float complex *, const int32_t *,
+                           const float complex *, const int32_t *);
+float complex cmplxret_cdotu_(const int32_t * N,
+                               const float complex *X, const int32_t * incX,
+                               const float complex *Y, const int32_t * incY)
+{
+   float complex c;
+   cdotu__addr(&c, N, X, incX, Y, incY);
+   return c;
+}
+
+extern void (*cdotu_64__addr)(float complex * z,
+                              const int64_t *,
+                              const float complex *, const int64_t *,
+                              const float complex *, const int64_t *);
+float complex cmplxret_cdotu_64_(const int64_t * N,
+                                  const float complex *X, const int64_t * incX,
+                                  const float complex *Y, const int64_t * incY)
+{
+   float complex c;
+   cdotu_64__addr(&c, N, X, incX, Y, incY);
+   return c;
+}
+
diff --git a/src/config.c b/src/config.c
@@ -69,7 +69,8 @@ void clear_other_forwards(int skip_idx, uint8_t * forwards, int32_t interface) {
     }
 }
 
-void record_library_load(const char * libname, void * handle, const char * suffix, uint8_t * forwards, int interface, int f2c) {
+void record_library_load(const char * libname, void * handle, const char * suffix, uint8_t * forwards,
+                         int32_t interface, int32_t complex_retstyle, int32_t f2c, int32_t cblas) {
     // Scan for the an empty slot, and also check to see if this library has been loaded before.
     int free_idx = -1;
     for (int idx=0; idx<MAX_TRACKED_LIBS; ++idx) {
@@ -101,7 +102,9 @@ void record_library_load(const char * libname, void * handle, const char * suffi
     new_libinfo->active_forwards = (uint8_t *)malloc(sizeof(uint8_t)*(NUM_EXPORTED_FUNCS/8 + 1));
     memcpy(new_libinfo->active_forwards, forwards, sizeof(uint8_t)*(NUM_EXPORTED_FUNCS/8 + 1));
     new_libinfo->interface = interface;
+    new_libinfo->complex_retstyle = complex_retstyle;
     new_libinfo->f2c = f2c;
+    new_libinfo->cblas = cblas;
 
     lbt_config.loaded_libs[free_idx] = new_libinfo;