Skip to content

Commit

Permalink
Add complex return style detection and cblas_*_sub workaround
Browse files Browse the repository at this point in the history
When using `libmkl_rt`, it tends to provide the intel interface rather
than the gfortran interface, which causes `{c,z}dot{u,c}` to expect an
implicit argument to store the return value in.  We provide some
autodetection of this, and build some wrappers to work around this for
the eight functions affected.

In MKL v2022.0, special ILP64-suffixed symbols are provided for the
FORTRAN symbols, but not for the CBLAS ones.  While we tend to use the
FORTRAN symbols for most tasks, there are a few CBLAS symbols we use in
the Julia world, presumably because of the above ABI problem.  Now that
that is fixed, we can likely stop using the CBLAS methods completely and
use only the FORTRAN symbols, but for completeness, we detect missing
ILP64-suffixed versions as well and forward them to the appropriate
FORTRAN symbols.

Note that a fully complete mapping of CBLAS -> FORTRAN symbols would be
ideal here, however that is a large undertaking, and so we choose here
to be selective in which functions we provide mappings for.
  • Loading branch information
staticfloat committed Feb 2, 2022
1 parent f64f859 commit 0829248
Show file tree
Hide file tree
Showing 18 changed files with 689 additions and 62 deletions.
2 changes: 2 additions & 0 deletions README.md
Original file line number Diff line number Diff line change
Expand Up @@ -60,6 +60,8 @@ You can always tell if your system is limited in this fashion by calling `lbt_ge

### Version History

v5.0.0 - Add complex return value wrappers and CBLAS workaround. The complex return value wrapper ensures that all symbols maintain a standard ABI for returning complex numbers, and the CBLAS workaround maps CBLAS symbols to FORTRAN symbols when properly-suffixed CBLAS symbols do not exist, as is the case in MKL `v2022.0`.

v4.1.0 - Add `LBT_STRICT` environment variable that causes calling missing symbols to kill the process.

v4.0.0 - Add `suffix_hint` parameter to `lbt_forward()` to allow overriding symbol suffix search order for dual-interface libraries, and allow loading the same library with multiple interfaces.
Expand Down
11 changes: 9 additions & 2 deletions src/Make.inc
Original file line number Diff line number Diff line change
Expand Up @@ -22,8 +22,8 @@ ifneq (,$(findstring MINGW,$(OS))$(findstring MSYS,$(OS))$(findstring CYGWIN,$(O
OS := WINNT
endif

LBT_SOVERSION_MAJOR := 4
LBT_SOVERSION_MINOR := 1
LBT_SOVERSION_MAJOR := 5
LBT_SOVERSION_MINOR := 0
LBT_SOVERSION_PATCH := 0

ifeq ($(OS), WINNT)
Expand Down Expand Up @@ -82,6 +82,13 @@ ifneq (,$(filter $(ARCH), x86_64 i686))
F2C_AUTODETECTION := 1
endif

# If we're on x86_64 or i686, we may need to autodetect MKL v2022's incomplete ILP64 suffixing.
CBLAS_DIVERGENCE_AUTODETECTION := 0
ifneq (,$(filter $(ARCH), x86_64 i686))
LBT_CFLAGS += -DCBLAS_DIVERGENCE_AUTODETECTION
CBLAS_DIVERGENCE_AUTODETECTION := 1
endif


ifeq ($(VERBOSE),0)
ENDCOLOR := "\033[0m"
Expand Down
9 changes: 8 additions & 1 deletion src/Makefile
Original file line number Diff line number Diff line change
Expand Up @@ -5,7 +5,10 @@ include $(LBT_ROOT)/src/Make.inc
all: $(builddir)/libblastrampoline.$(SHLIB_EXT)

# Objects we'll build
MAIN_OBJS := libblastrampoline.o dl_utils.o config.o autodetection.o threading.o deepbindless.o trampolines/trampolines_$(ARCH).o
MAIN_OBJS := libblastrampoline.o dl_utils.o config.o \
autodetection.o \
complex_return_style_adapters.o \
threading.o deepbindless.o trampolines/trampolines_$(ARCH).o

# Include win_utils.c on windws
ifeq ($(OS),WINNT)
Expand All @@ -17,6 +20,10 @@ ifeq ($(F2C_AUTODETECTION),1)
MAIN_OBJS += f2c_adapters.o
endif

ifeq ($(CBLAS_DIVERGENCE_AUTODETECTION),1)
MAIN_OBJS += cblas_adapters.o
endif

# Place the `.o` files into `$(builddir)`
MAIN_OBJS := $(addprefix $(builddir)/,$(MAIN_OBJS))

Expand Down
63 changes: 52 additions & 11 deletions src/autodetection.c
Original file line number Diff line number Diff line change
@@ -1,4 +1,24 @@
#include "libblastrampoline_internal.h"
#include <complex.h>

/*
* Search for a symbol that ends in one of the given suffixes. Returns NULL if not found.
*/
const char * symbol_suffix_search(void * handle, const char * symbol_name, const char ** suffixes, const int num_suffixes) {
char symbol_name_suffixed[MAX_SYMBOL_LEN];
for (int suffix_idx=0; suffix_idx<num_suffixes; suffix_idx++) {
// Skip NULL suffixes (the case where we have no `suffix_hint`)
if (suffixes[suffix_idx] == NULL) {
continue;
}

sprintf(symbol_name_suffixed, "%s%s", symbol_name, suffixes[suffix_idx]);
if (lookup_symbol(handle, symbol_name_suffixed) != NULL) {
return suffixes[suffix_idx];
}
}
return NULL;
}

/*
* Autodetect the name mangling suffix used by the given BLAS/LAPACK library.
Expand Down Expand Up @@ -28,18 +48,10 @@ const char * autodetect_symbol_suffix(void * handle, const char * suffix_hint) {
};

// If the suffix hint is NULL, just skip it when calling `lookup_symbol()`.
int suffix_start_idx = 0;
if (suffix_hint == NULL) {
suffix_start_idx = 1;
}

char symbol_name[MAX_SYMBOL_LEN];
for (int symbol_idx=0; symbol_idx<sizeof(symbol_names)/sizeof(const char *); symbol_idx++) {
for (int suffix_idx=suffix_start_idx; suffix_idx<sizeof(suffixes)/sizeof(const char *); suffix_idx++) {
sprintf(symbol_name, "%s%s", symbol_names[symbol_idx], suffixes[suffix_idx]);
if (lookup_symbol(handle, symbol_name) != NULL) {
return suffixes[suffix_idx];
}
const char * suffix = symbol_suffix_search(handle, symbol_names[symbol_idx], suffixes, sizeof(suffixes)/sizeof(const char *));
if (suffix != NULL) {
return suffix;
}
}
return NULL;
Expand Down Expand Up @@ -221,3 +233,32 @@ int32_t autodetect_f2c(void * handle, const char * suffix) {
return LBT_F2C_UNKNOWN;
}
#endif

#ifdef CBLAS_DIVERGENCE_AUTODETECTION
int32_t autodetect_cblas_divergence(void * handle, const char * suffix) {
char symbol_name[MAX_SYMBOL_LEN];

sprintf(symbol_name, "zdotc_%s", suffix);
if (lookup_symbol(handle, symbol_name) != NULL ) {
// If we have both `zdotc_64` and `cblas_zdotc_sub64`, it's all good:
sprintf(symbol_name, "cblas_zdotc_sub%s", suffix);
if (lookup_symbol(handle, symbol_name) != NULL ) {
return LBT_CBLAS_CONFORMANT;
}

const char * lp64_suffixes[] = {
"", "_", "__",
};
const char * suffix = symbol_suffix_search(handle, "cblas_zdotc_sub", lp64_suffixes, sizeof(lp64_suffixes)/sizeof(const char *));

// If we have `zdotc_64`, but we don't have `cblas_zdotc_sub64`, AND we have an
// LP64-mangled `cblas_zdotc_sub`, then we know it's a library that has ILP64
// FORTRAN symbols, but not CBLAS ones, such as MKL v2022.0.
if (suffix != NULL) {
return LBT_CBLAS_DIVERGENT;
}
}
// If we can't even find `zdotc_64`, we don't know what this is.
return LBT_CBLAS_UNKNOWN;
}
#endif
107 changes: 107 additions & 0 deletions src/cblas_adapters.c
Original file line number Diff line number Diff line change
@@ -0,0 +1,107 @@
#include <complex.h>
#include "libblastrampoline_internal.h"

/*
* Some libraries provide ILP64-suffixed FORTRAN symbols, but forgot the CBLAS ones.
* To allow Julia to still use `cblas_{c,z}dot{c,u}_sub` when linking against the
* explicitly ILP64-suffixed MKL libraries, we map the CBLAS forwards to the FORTRAN
* symbols, where appropriate. This effects MKL v2022.0, x-ref:
* - https://github.com/JuliaLinearAlgebra/libblastrampoline/issues/56
*/

extern double complex zdotc_(const int32_t *,
const double complex *, const int32_t *,
const double complex *, const int32_t *);
void lbt_cblas_zdotc_sub(const int32_t N,
const double complex *X, const int32_t incX,
const double complex *Y, const int32_t incY,
double complex * z)
{
*z = zdotc_(&N, X, &incX, Y, &incY);
}

extern double complex zdotc_64_(const int64_t *,
const double complex *, const int64_t *,
const double complex *, const int64_t *);
void lbt_cblas_zdotc_sub64_(const int64_t N,
const double complex *X, const int64_t incX,
const double complex *Y, const int64_t incY,
double complex * z)
{
*z = zdotc_64_(&N, X, &incX, Y, &incY);
}


extern double complex zdotu_(const int32_t *,
const double complex *, const int32_t *,
const double complex *, const int32_t *);
void lbt_cblas_zdotu_sub(const int32_t N,
const double complex *X, const int32_t incX,
const double complex *Y, const int32_t incY,
double complex * z)
{
*z = zdotu_(&N, X, &incX, Y, &incY);
}

extern double complex zdotu_64_(const int64_t *,
const double complex *, const int64_t *,
const double complex *, const int64_t *);
void lbt_cblas_zdotu_sub64_(const int64_t N,
const double complex *X, const int64_t incX,
const double complex *Y, const int64_t incY,
double complex * z)
{
*z = zdotu_64_(&N, X, &incX, Y, &incY);
}








extern float complex cdotc_(const int32_t *,
const float complex *, const int32_t *,
const float complex *, const int32_t *);
void lbt_cblas_cdotc_sub(const int32_t N,
const float complex *X, const int32_t incX,
const float complex *Y, const int32_t incY,
float complex * z)
{
*z = cdotc_(&N, X, &incX, Y, &incY);
}

extern float complex cdotc_64_(const int64_t *,
const float complex *, const int64_t *,
const float complex *, const int64_t *);
void lbt_cblas_cdotc_sub64_(const int64_t N,
const float complex *X, const int64_t incX,
const float complex *Y, const int64_t incY,
float complex * z)
{
*z = cdotc_64_(&N, X, &incX, Y, &incY);
}


extern float complex cdotu_(const int32_t *,
const float complex *, const int32_t *,
const float complex *, const int32_t *);
void lbt_cblas_cdotu_sub(const int32_t N,
const float complex *X, const int32_t incX,
const float complex *Y, const int32_t incY,
float complex * z)
{
*z = cdotu_(&N, X, &incX, Y, &incY);
}

extern float complex cdotu_64_(const int64_t *,
const float complex *, const int64_t *,
const float complex *, const int64_t *);
void lbt_cblas_cdotu_sub64_(const int64_t N,
const float complex *X, const int64_t incX,
const float complex *Y, const int64_t incY,
float complex * z)
{
*z = cdotu_64_(&N, X, &incX, Y, &incY);
}
123 changes: 123 additions & 0 deletions src/complex_return_style_adapters.c
Original file line number Diff line number Diff line change
@@ -0,0 +1,123 @@
#include <complex.h>
#include "libblastrampoline_internal.h"

/*
* Some libraries use an argument-passing convention for returning complex numbers.
* We create wrappers to work around this behavior and provide a consistent ABI
* across all libraries. An example of this style of library is MKL, x-ref:
* - https://community.intel.com/t5/Intel-oneAPI-Math-Kernel-Library/ARPACK-with-MKL-crashes-when-calling-zdotc/td-p/1054316
* - https://scicomp.stackexchange.com/questions/5380/intel-mkl-difference-between-mkl-intel-lp64-and-mkl-gf-lp64
*/


// zdotc
extern void (*cmplxret_zdotc__addr)(double complex * z,
const int32_t *,
const double complex *, const int32_t *,
const double complex *, const int32_t *);
double complex cmplxret_zdotc_(const int32_t * N,
const double complex *X, const int32_t * incX,
const double complex *Y, const int32_t * incY)
{
double complex c;
cmplxret_zdotc__addr(&c, N, X, incX, Y, incY);
return c;
}

extern void (*cmplxret_zdotc_64__addr)(double complex * z,
const int64_t *,
const double complex *, const int64_t *,
const double complex *, const int64_t *);
double complex cmplxret_zdotc_64_(const int64_t * N,
const double complex *X, const int64_t * incX,
const double complex *Y, const int64_t * incY)
{
double complex c;
cmplxret_zdotc_64__addr(&c, N, X, incX, Y, incY);
return c;
}


// zdotu
extern void (*zdotu__addr)(double complex * z,
const int32_t *,
const double complex *, const int32_t *,
const double complex *, const int32_t *);
double complex cmplxret_zdotu_(const int32_t * N,
const double complex *X, const int32_t * incX,
const double complex *Y, const int32_t * incY)
{
double complex c;
zdotu__addr(&c, N, X, incX, Y, incY);
return c;
}

extern void (*zdotu_64__addr)(double complex * z,
const int64_t *,
const double complex *, const int64_t *,
const double complex *, const int64_t *);
double complex cmplxret_zdotu_64_(const int64_t * N,
const double complex *X, const int64_t * incX,
const double complex *Y, const int64_t * incY)
{
double complex c;
zdotu_64__addr(&c, N, X, incX, Y, incY);
return c;
}


// cdotc
extern void (*cdotc__addr)(float complex * z,
const int32_t *,
const float complex *, const int32_t *,
const float complex *, const int32_t *);
float complex cmplxret_cdotc_(const int32_t * N,
const float complex *X, const int32_t * incX,
const float complex *Y, const int32_t * incY)
{
float complex c;
cdotc__addr(&c, N, X, incX, Y, incY);
return c;
}

extern void cdotc_64__addr(float complex * z,
const int64_t *,
const float complex *, const int64_t *,
const float complex *, const int64_t *);
float complex cmplxret_cdotc_64_(const int64_t * N,
const float complex *X, const int64_t * incX,
const float complex *Y, const int64_t * incY)
{
float complex c;
cdotc_64__addr(&c, N, X, incX, Y, incY);
return c;
}


// cdotu
extern void (*cdotu__addr)(float complex * z,
const int32_t *,
const float complex *, const int32_t *,
const float complex *, const int32_t *);
float complex cmplxret_cdotu_(const int32_t * N,
const float complex *X, const int32_t * incX,
const float complex *Y, const int32_t * incY)
{
float complex c;
cdotu__addr(&c, N, X, incX, Y, incY);
return c;
}

extern void (*cdotu_64__addr)(float complex * z,
const int64_t *,
const float complex *, const int64_t *,
const float complex *, const int64_t *);
float complex cmplxret_cdotu_64_(const int64_t * N,
const float complex *X, const int64_t * incX,
const float complex *Y, const int64_t * incY)
{
float complex c;
cdotu_64__addr(&c, N, X, incX, Y, incY);
return c;
}

5 changes: 4 additions & 1 deletion src/config.c
Original file line number Diff line number Diff line change
Expand Up @@ -69,7 +69,8 @@ void clear_other_forwards(int skip_idx, uint8_t * forwards, int32_t interface) {
}
}

void record_library_load(const char * libname, void * handle, const char * suffix, uint8_t * forwards, int interface, int f2c) {
void record_library_load(const char * libname, void * handle, const char * suffix, uint8_t * forwards,
int32_t interface, int32_t complex_retstyle, int32_t f2c, int32_t cblas) {
// Scan for the an empty slot, and also check to see if this library has been loaded before.
int free_idx = -1;
for (int idx=0; idx<MAX_TRACKED_LIBS; ++idx) {
Expand Down Expand Up @@ -101,7 +102,9 @@ void record_library_load(const char * libname, void * handle, const char * suffi
new_libinfo->active_forwards = (uint8_t *)malloc(sizeof(uint8_t)*(NUM_EXPORTED_FUNCS/8 + 1));
memcpy(new_libinfo->active_forwards, forwards, sizeof(uint8_t)*(NUM_EXPORTED_FUNCS/8 + 1));
new_libinfo->interface = interface;
new_libinfo->complex_retstyle = complex_retstyle;
new_libinfo->f2c = f2c;
new_libinfo->cblas = cblas;

lbt_config.loaded_libs[free_idx] = new_libinfo;

Expand Down
Loading

0 comments on commit 0829248

Please sign in to comment.