Commit b19a357

Merge pull request #36507 from JuliaLang/jn/relaxed
Annotate (most) Julia reference accesses as atomic unordered
vtjnash authored Jul 8, 2020
2 parents 5f21b52 + 2b510b4 commit b19a357
Showing 16 changed files with 352 additions and 233 deletions.
src/array.c (82 changes: 57 additions & 25 deletions)
@@ -20,6 +20,34 @@ extern "C" {

#define JL_ARRAY_ALIGN(jl_value, nbytes) LLT_ALIGN(jl_value, nbytes)

// this is a version of memcpy that preserves atomic memory ordering
// which makes it safe to use for objects that can contain memory references
// without risk of creating pointers out of thin air
// TODO: replace with LLVM's llvm.memmove.element.unordered.atomic.p0i8.p0i8.i32
// aka `__llvm_memmove_element_unordered_atomic_8` (for 64 bit)
void memmove_refs(void **dstp, void *const *srcp, size_t n) JL_NOTSAFEPOINT
{
size_t i;
if (dstp < srcp || dstp > srcp + n) {
for (i = 0; i < n; i++) {
jl_atomic_store_relaxed(dstp + i, jl_atomic_load_relaxed(srcp + i));
}
}
else {
for (i = 0; i < n; i++) {
jl_atomic_store_relaxed(dstp + n - i - 1, jl_atomic_load_relaxed(srcp + n - i - 1));
}
}
}

void memmove_safe(int hasptr, char *dst, const char *src, size_t nb) JL_NOTSAFEPOINT
{
if (hasptr)
memmove_refs((void**)dst, (void**)src, nb / sizeof(void*));
else
memmove(dst, src, nb);
}

// array constructors ---------------------------------------------------------
char *jl_array_typetagdata(jl_array_t *a) JL_NOTSAFEPOINT
{
@@ -542,10 +570,9 @@ JL_DLLEXPORT jl_value_t *jl_ptrarrayref(jl_array_t *a JL_PROPAGATES_ROOT, size_t
{
assert(i < jl_array_len(a));
assert(a->flags.ptrarray);
jl_value_t *elt = ((jl_value_t**)a->data)[i];
if (elt == NULL) {
jl_value_t *elt = jl_atomic_load_relaxed(((jl_value_t**)a->data) + i);
if (elt == NULL)
jl_throw(jl_undefref_exception);
}
return elt;
}

@@ -569,7 +596,7 @@ JL_DLLEXPORT jl_value_t *jl_arrayref(jl_array_t *a, size_t i)
JL_DLLEXPORT int jl_array_isassigned(jl_array_t *a, size_t i)
{
if (a->flags.ptrarray) {
return ((jl_value_t**)jl_array_data(a))[i] != NULL;
return jl_atomic_load_relaxed(((jl_value_t**)jl_array_data(a)) + i) != NULL;
}
else if (a->flags.hasptr) {
jl_datatype_t *eltype = (jl_datatype_t*)jl_tparam0(jl_typeof(a));
@@ -600,12 +627,17 @@ JL_DLLEXPORT void jl_arrayset(jl_array_t *a JL_ROOTING_ARGUMENT, jl_value_t *rhs
if (jl_is_datatype_singleton((jl_datatype_t*)jl_typeof(rhs)))
return;
}
jl_assign_bits(&((char*)a->data)[i * a->elsize], rhs);
if (a->flags.hasptr) {
memmove_refs((void**)&((char*)a->data)[i * a->elsize], (void**)rhs, a->elsize / sizeof(void*));
}
else {
jl_assign_bits(&((char*)a->data)[i * a->elsize], rhs);
}
if (a->flags.hasptr)
jl_gc_multi_wb(jl_array_owner(a), rhs);
}
else {
((jl_value_t**)a->data)[i] = rhs;
jl_atomic_store_relaxed(((jl_value_t**)a->data) + i, rhs);
jl_gc_wb(jl_array_owner(a), rhs);
}
}
@@ -615,7 +647,7 @@ JL_DLLEXPORT void jl_arrayunset(jl_array_t *a, size_t i)
if (i >= jl_array_len(a))
jl_bounds_error_int((jl_value_t*)a, i + 1);
if (a->flags.ptrarray)
((jl_value_t**)a->data)[i] = NULL;
jl_atomic_store_relaxed(((jl_value_t**)a->data) + i, NULL);
else if (a->flags.hasptr) {
size_t elsize = a->elsize;
jl_assume(elsize >= sizeof(void*) && elsize % sizeof(void*) == 0);
@@ -762,7 +794,7 @@ STATIC_INLINE void jl_array_grow_at_beg(jl_array_t *a, size_t idx, size_t inc,
if (isbitsunion) newtypetagdata = typetagdata - inc;
if (idx > 0) {
// inserting new elements after 1st element
memmove(newdata, data, idx * elsz);
memmove_safe(a->flags.hasptr, newdata, data, idx * elsz);
if (isbitsunion) {
memmove(newtypetagdata, typetagdata, idx);
memset(newtypetagdata + idx, 0, inc);
@@ -796,11 +828,11 @@ STATIC_INLINE void jl_array_grow_at_beg(jl_array_t *a, size_t idx, size_t inc,
// We could use memcpy if resizing allocates a new buffer,
// hopefully it's not a particularly important optimization.
if (idx > 0 && newdata < data) {
memmove(newdata, data, nb1);
memmove_safe(a->flags.hasptr, newdata, data, nb1);
}
memmove(newdata + nbinc + nb1, data + nb1, n * elsz - nb1);
memmove_safe(a->flags.hasptr, newdata + nbinc + nb1, data + nb1, n * elsz - nb1);
if (idx > 0 && newdata > data) {
memmove(newdata, data, nb1);
memmove_safe(a->flags.hasptr, newdata, data, nb1);
}
a->offset = newoffset;
}
@@ -810,16 +842,16 @@ STATIC_INLINE void jl_array_grow_at_beg(jl_array_t *a, size_t idx, size_t inc,
newdata = data - oldoffsnb + a->offset * elsz;
if (isbitsunion) newtypetagdata = newdata + (a->maxsize - a->offset) * elsz + a->offset;
if (idx > 0 && newdata < data) {
memmove(newdata, data, nb1);
memmove_safe(a->flags.hasptr, newdata, data, nb1);
if (isbitsunion) {
memmove(newtypetagdata, typetagdata, idx);
memset(newtypetagdata + idx, 0, inc);
}
}
memmove(newdata + nbinc + nb1, data + nb1, n * elsz - nb1);
memmove_safe(a->flags.hasptr, newdata + nbinc + nb1, data + nb1, n * elsz - nb1);
if (isbitsunion) memmove(newtypetagdata + idx + inc, typetagdata + idx, n - idx);
if (idx > 0 && newdata > data) {
memmove(newdata, data, nb1);
memmove_safe(a->flags.hasptr, newdata, data, nb1);
if (isbitsunion) {
memmove(newtypetagdata, typetagdata, idx);
memset(newtypetagdata + idx, 0, inc);
@@ -891,7 +923,7 @@ STATIC_INLINE void jl_array_grow_at_end(jl_array_t *a, size_t idx,
memmove(newtypetagdata, typetagdata, idx);
memset(newtypetagdata + idx, 0, inc);
}
if (has_gap) memmove(newdata + nb1 + nbinc, newdata + nb1, n * elsz - nb1);
if (has_gap) memmove_safe(a->flags.hasptr, newdata + nb1 + nbinc, newdata + nb1, n * elsz - nb1);
}
a->data = data = newdata;
}
@@ -901,7 +933,7 @@ STATIC_INLINE void jl_array_grow_at_end(jl_array_t *a, size_t idx,
memset(typetagdata + idx, 0, inc);
}
size_t nb1 = idx * elsz;
memmove(data + nb1 + inc * elsz, data + nb1, n * elsz - nb1);
memmove_safe(a->flags.hasptr, data + nb1 + inc * elsz, data + nb1, n * elsz - nb1);
}
else {
// there was enough room for requested growth already in a->maxsize
@@ -1036,12 +1068,12 @@ STATIC_INLINE void jl_array_del_at_beg(jl_array_t *a, size_t idx, size_t dec,
if (elsz == 1 && !isbitsunion)
nbtotal++;
if (idx > 0) {
memmove(newdata, olddata, nb1);
memmove_safe(a->flags.hasptr, newdata, olddata, nb1);
if (isbitsunion) memmove(newtypetagdata, typetagdata, idx);
}
// Move the rest of the data if the offset changed
if (newoffs != offset) {
memmove(newdata + nb1, olddata + nb1 + nbdec, nbtotal - nb1);
memmove_safe(a->flags.hasptr, newdata + nb1, olddata + nb1 + nbdec, nbtotal - nb1);
if (isbitsunion) memmove(newtypetagdata + idx, typetagdata + idx + dec, n - idx);
}
a->data = newdata;
@@ -1063,7 +1095,7 @@ STATIC_INLINE void jl_array_del_at_end(jl_array_t *a, size_t idx, size_t dec,
int isbitsunion = jl_array_isbitsunion(a);
size_t last = idx + dec;
if (n > last) {
memmove(data + idx * elsz, data + last * elsz, (n - last) * elsz);
memmove_safe(a->flags.hasptr, data + idx * elsz, data + last * elsz, (n - last) * elsz);
if (isbitsunion) {
char *typetagdata = jl_array_typetagdata(a);
memmove(typetagdata + idx, typetagdata + last, n - last);
@@ -1161,14 +1193,14 @@ JL_DLLEXPORT jl_array_t *jl_array_copy(jl_array_t *ary)
}

// Copy element by element until we hit a young object, at which point
// we can continue using `memmove`.
// we can finish by using `memmove`.
static NOINLINE ssize_t jl_array_ptr_copy_forward(jl_value_t *owner,
void **src_p, void **dest_p,
ssize_t n)
{
for (ssize_t i = 0; i < n; i++) {
void *val = src_p[i];
dest_p[i] = val;
void *val = jl_atomic_load_relaxed(src_p + i);
jl_atomic_store_relaxed(dest_p + i, val);
// `val` is young or old-unmarked
if (val && !(jl_astaggedvalue(val)->bits.gc & GC_MARKED)) {
jl_gc_queue_root(owner);
@@ -1183,8 +1215,8 @@ static NOINLINE ssize_t jl_array_ptr_copy_backward(jl_value_t *owner,
ssize_t n)
{
for (ssize_t i = 0; i < n; i++) {
void *val = src_p[n - i - 1];
dest_p[n - i - 1] = val;
void *val = jl_atomic_load_relaxed(src_p + n - i - 1);
jl_atomic_store_relaxed(dest_p + n - i - 1, val);
// `val` is young or old-unmarked
if (val && !(jl_astaggedvalue(val)->bits.gc & GC_MARKED)) {
jl_gc_queue_root(owner);
@@ -1218,7 +1250,7 @@ JL_DLLEXPORT void jl_array_ptr_copy(jl_array_t *dest, void **dest_p,
n -= done;
}
}
memmove(dest_p, src_p, n * sizeof(void*));
memmove_refs(dest_p, src_p, n);
}

JL_DLLEXPORT void jl_array_ptr_1d_push(jl_array_t *a, jl_value_t *item)
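The memmove_refs / memmove_safe helpers added above copy GC-visible pointer slots one element at a time with relaxed atomic loads and stores, so a concurrent reader can never observe a torn or half-written reference. A minimal standalone sketch of the same idea, using plain C11 atomics instead of Julia's jl_atomic_* wrappers (the names and the toy test in main are illustrative only, not part of this commit):

#include <stdatomic.h>
#include <stddef.h>
#include <stdio.h>

/* Copy n pointer-sized slots, preserving per-element atomicity.  Overlap is
 * handled like memmove: copy forward when the ranges do not overlap or the
 * destination precedes the source, backward when the destination lies
 * inside the source range. */
static void copy_refs_relaxed(void *_Atomic *dst, void *_Atomic *src, size_t n)
{
    if (dst < src || dst > src + n) {
        for (size_t i = 0; i < n; i++)
            atomic_store_explicit(dst + i,
                atomic_load_explicit(src + i, memory_order_relaxed),
                memory_order_relaxed);
    }
    else {
        for (size_t i = n; i-- > 0; )
            atomic_store_explicit(dst + i,
                atomic_load_explicit(src + i, memory_order_relaxed),
                memory_order_relaxed);
    }
}

int main(void)
{
    void *_Atomic buf[4] = {0};
    int a = 1, b = 2;
    atomic_store_explicit(&buf[0], &a, memory_order_relaxed);
    atomic_store_explicit(&buf[1], &b, memory_order_relaxed);
    copy_refs_relaxed(buf + 1, buf, 2);   /* overlapping shift, copied back to front */
    printf("%d\n", *(int *)atomic_load_explicit(&buf[2], memory_order_relaxed)); /* prints 2 */
    return 0;
}
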
src/atomics.h (8 changes: 7 additions & 1 deletion)
@@ -44,6 +44,8 @@
* specified.
*/
#if defined(__GNUC__)
# define jl_fence() __atomic_thread_fence(__ATOMIC_SEQ_CST)
# define jl_fence_release() __atomic_thread_fence(__ATOMIC_RELEASE)
# define jl_signal_fence() __atomic_signal_fence(__ATOMIC_SEQ_CST)
# define jl_atomic_fetch_add_relaxed(obj, arg) \
__atomic_fetch_add(obj, arg, __ATOMIC_RELAXED)
@@ -73,7 +75,7 @@
// TODO: Maybe add jl_atomic_compare_exchange_weak for spin lock
# define jl_atomic_store(obj, val) \
__atomic_store_n(obj, val, __ATOMIC_SEQ_CST)
# define jl_atomic_store_relaxed(obj, val) \
# define jl_atomic_store_relaxed(obj, val) \
__atomic_store_n(obj, val, __ATOMIC_RELAXED)
# if defined(__clang__) || defined(__ICC) || defined(__INTEL_COMPILER) || \
!(defined(_CPU_X86_) || defined(_CPU_X86_64_))
@@ -96,6 +98,9 @@
# define jl_atomic_load_relaxed(obj) \
__atomic_load_n(obj, __ATOMIC_RELAXED)
#elif defined(_COMPILER_MICROSOFT_)
// TODO: these only define compiler barriers, and aren't correct outside of x86
# define jl_fence() _ReadWriteBarrier()
# define jl_fence_release() _WriteBarrier()
# define jl_signal_fence() _ReadWriteBarrier()

// add
@@ -266,6 +271,7 @@ static inline void jl_atomic_store_release(volatile T *obj, T2 val)
jl_signal_fence();
*obj = (T)val;
}
template<typename T, typename T2>
static inline void jl_atomic_store_relaxed(volatile T *obj, T2 val)
{
*obj = (T)val;
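The new jl_fence() / jl_fence_release() macros, like the existing relaxed load/store macros, expand to the __atomic compiler builtins on GCC and Clang. A small hedged sketch of how such fences pair with relaxed accesses in the usual publication pattern (the variable and function names are illustrative, not Julia code; compile with GCC or Clang):

#include <stdio.h>

static int flag;      /* accessed only through __atomic builtins */
static int payload;   /* plain data published before the flag */

/* Writer: store the payload, issue a release fence, then a relaxed store of
 * the flag.  This is what jl_fence_release() plus jl_atomic_store_relaxed()
 * provide on the GCC/Clang path. */
void publish(int value)
{
    payload = value;
    __atomic_thread_fence(__ATOMIC_RELEASE);        /* jl_fence_release() */
    __atomic_store_n(&flag, 1, __ATOMIC_RELAXED);   /* jl_atomic_store_relaxed() */
}

/* Reader: relaxed load of the flag, then an acquire fence before touching
 * the payload; the fence pair supplies the needed happens-before edge. */
int try_consume(int *out)
{
    if (__atomic_load_n(&flag, __ATOMIC_RELAXED)) { /* jl_atomic_load_relaxed() */
        __atomic_thread_fence(__ATOMIC_ACQUIRE);
        *out = payload;
        return 1;
    }
    return 0;
}

int main(void)
{
    int v = 0;
    publish(42);
    if (try_consume(&v))
        printf("%d\n", v);  /* prints 42 */
    return 0;
}
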
src/ccall.cpp (37 changes: 20 additions & 17 deletions)
@@ -87,15 +87,15 @@ static Value *runtime_sym_lookup(
BasicBlock *dlsym_lookup = BasicBlock::Create(jl_LLVMContext, "dlsym");
BasicBlock *ccall_bb = BasicBlock::Create(jl_LLVMContext, "ccall");
Constant *initnul = ConstantPointerNull::get((PointerType*)T_pvoidfunc);
LoadInst *llvmf_orig = irbuilder.CreateAlignedLoad(llvmgv, sizeof(void*));
LoadInst *llvmf_orig = irbuilder.CreateAlignedLoad(T_pvoidfunc, llvmgv, sizeof(void*));
// This in principle needs a consume ordering so that load from
// this pointer sees a valid value. However, this is not supported by
// LLVM (or agreed on in the C/C++ standard FWIW) and should be
// almost impossible to happen on every platform we support since this
// ordering is enforced by the hardware and LLVM has to speculate an
// invalid load from the `cglobal` but doesn't depend on the `cglobal`
// value for this to happen.
// llvmf_orig->setAtomic(AtomicOrdering::Consume);
llvmf_orig->setAtomic(AtomicOrdering::Unordered);
irbuilder.CreateCondBr(
irbuilder.CreateICmpNE(llvmf_orig, initnul),
ccall_bb,
@@ -114,7 +114,7 @@ }
}
Value *llvmf = irbuilder.CreateCall(prepare_call_in(jl_builderModule(irbuilder), jldlsym_func),
{ libname, stringConstPtr(emission_context, irbuilder, f_name), libptrgv });
auto store = irbuilder.CreateAlignedStore(llvmf, llvmgv, sizeof(void*));
StoreInst *store = irbuilder.CreateAlignedStore(llvmf, llvmgv, sizeof(void*));
store->setAtomic(AtomicOrdering::Release);
irbuilder.CreateBr(ccall_bb);

@@ -169,7 +169,7 @@ static GlobalVariable *emit_plt_thunk(
IRBuilder<> irbuilder(b0);
Value *ptr = runtime_sym_lookup(emission_context, irbuilder, funcptype, f_lib, f_name, plt, libptrgv,
llvmgv, runtime_lib);
auto store = irbuilder.CreateAlignedStore(irbuilder.CreateBitCast(ptr, T_pvoidfunc), got, sizeof(void*));
StoreInst *store = irbuilder.CreateAlignedStore(irbuilder.CreateBitCast(ptr, T_pvoidfunc), got, sizeof(void*));
store->setAtomic(AtomicOrdering::Release);
SmallVector<Value*, 16> args;
for (Function::arg_iterator arg = plt->arg_begin(), arg_e = plt->arg_end(); arg != arg_e; ++arg)
@@ -234,7 +234,7 @@ static Value *emit_plt(
// consume ordering too. This is even less likely to cause issues though
// since the only thing we do to this loaded pointer is to call it
// immediately.
// got_val->setAtomic(AtomicOrdering::Consume);
got_val->setAtomic(AtomicOrdering::Unordered);
return ctx.builder.CreateBitCast(got_val, funcptype);
}

@@ -349,17 +349,19 @@ static Value *llvm_type_rewrite(
Value *from;
Value *to;
const DataLayout &DL = jl_data_layout;
unsigned align = std::max(DL.getPrefTypeAlignment(target_type), DL.getPrefTypeAlignment(from_type));
if (DL.getTypeAllocSize(target_type) >= DL.getTypeAllocSize(from_type)) {
to = emit_static_alloca(ctx, target_type);
cast<AllocaInst>(to)->setAlignment(Align(align));
from = emit_bitcast(ctx, to, from_type->getPointerTo());
}
else {
from = emit_static_alloca(ctx, from_type);
cast<AllocaInst>(from)->setAlignment(Align(align));
to = emit_bitcast(ctx, from, target_type->getPointerTo());
}
// XXX: deal with possible alignment issues
ctx.builder.CreateStore(v, from);
return ctx.builder.CreateLoad(to);
ctx.builder.CreateAlignedStore(v, from, align);
return ctx.builder.CreateAlignedLoad(to, align);
}

// --- argument passing and scratch space utilities ---
@@ -1576,9 +1578,9 @@ static jl_cgval_t emit_ccall(jl_codectx_t &ctx, jl_value_t **args, size_t nargs)
Value *ptls_i16 = emit_bitcast(ctx, ctx.ptlsStates, T_pint16);
const int tid_offset = offsetof(jl_tls_states_t, tid);
Value *ptid = ctx.builder.CreateGEP(ptls_i16, ConstantInt::get(T_size, tid_offset / 2));
return mark_or_box_ccall_result(ctx,
tbaa_decorate(tbaa_const, ctx.builder.CreateLoad(ptid)),
retboxed, rt, unionall, static_rt);
LoadInst *tid = ctx.builder.CreateAlignedLoad(ptid, sizeof(int16_t));
tbaa_decorate(tbaa_const, tid);
return mark_or_box_ccall_result(ctx, tid, retboxed, rt, unionall, static_rt);
}
else if (is_libjulia_func(jl_get_current_task)) {
assert(lrt == T_prjlvalue);
@@ -1587,9 +1589,9 @@ static jl_cgval_t emit_ccall(jl_codectx_t &ctx, jl_value_t **args, size_t nargs)
Value *ptls_pv = emit_bitcast(ctx, ctx.ptlsStates, T_pprjlvalue);
const int ct_offset = offsetof(jl_tls_states_t, current_task);
Value *pct = ctx.builder.CreateGEP(ptls_pv, ConstantInt::get(T_size, ct_offset / sizeof(void*)));
return mark_or_box_ccall_result(ctx,
tbaa_decorate(tbaa_const, ctx.builder.CreateLoad(pct)),
retboxed, rt, unionall, static_rt);
LoadInst *ct = ctx.builder.CreateAlignedLoad(pct, sizeof(void*));
tbaa_decorate(tbaa_const, ct);
return mark_or_box_ccall_result(ctx, ct, retboxed, rt, unionall, static_rt);
}
else if (is_libjulia_func(jl_set_next_task)) {
assert(lrt == T_void);
@@ -1608,8 +1610,7 @@ static jl_cgval_t emit_ccall(jl_codectx_t &ctx, jl_value_t **args, size_t nargs)
ctx.builder.CreateCall(prepare_call(gcroot_flush_func));
Value *pdefer_sig = emit_defer_signal(ctx);
Value *defer_sig = ctx.builder.CreateLoad(pdefer_sig);
defer_sig = ctx.builder.CreateAdd(defer_sig,
ConstantInt::get(T_sigatomic, 1));
defer_sig = ctx.builder.CreateAdd(defer_sig, ConstantInt::get(T_sigatomic, 1));
ctx.builder.CreateStore(defer_sig, pdefer_sig);
emit_signal_fence(ctx);
return ghostValue(jl_nothing_type);
@@ -1671,7 +1672,9 @@ static jl_cgval_t emit_ccall(jl_codectx_t &ctx, jl_value_t **args, size_t nargs)
idx = ctx.builder.CreateAdd(idx, ConstantInt::get(T_size, ((jl_datatype_t*)ety)->layout->first_ptr));
}
Value *slot_addr = ctx.builder.CreateInBoundsGEP(T_prjlvalue, arrayptr, idx);
Value *load = tbaa_decorate(tbaa_ptrarraybuf, ctx.builder.CreateLoad(T_prjlvalue, slot_addr));
LoadInst *load = ctx.builder.CreateAlignedLoad(T_prjlvalue, slot_addr, sizeof(void*));
load->setAtomic(AtomicOrdering::Unordered);
tbaa_decorate(tbaa_ptrarraybuf, load);
Value *res = ctx.builder.CreateZExt(ctx.builder.CreateICmpNE(load, Constant::getNullValue(T_prjlvalue)), T_int32);
JL_GC_POP();
return mark_or_box_ccall_result(ctx, res, retboxed, rt, unionall, static_rt);
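At the source level, the runtime_sym_lookup and emit_plt changes above implement a lazily initialized pointer cache: an atomic unordered load of the cached slot, a NULL check, and a release store once the slow path has resolved the symbol. A simplified, hedged C sketch of the same pattern with dlsym as the slow path (the real code emits this logic as LLVM IR and also routes calls through a PLT thunk; memory_order_relaxed stands in for LLVM's unordered ordering, which C11 does not expose directly):

#include <dlfcn.h>       /* may need -ldl when linking on older glibc */
#include <stdatomic.h>
#include <stdio.h>

typedef void (*fptr_t)(void);

/* One cache slot per (library, symbol) pair; starts out NULL. */
static _Atomic(fptr_t) cached;

fptr_t lookup_cached(void *lib, const char *name)
{
    fptr_t f = atomic_load_explicit(&cached, memory_order_relaxed);
    if (f != NULL)
        return f;                            /* fast path: already resolved */
    /* Slow path: resolve the symbol, then publish it with a release store,
     * matching the AtomicOrdering::Release store in the diff above. */
    f = (fptr_t)dlsym(lib, name);
    atomic_store_explicit(&cached, f, memory_order_release);
    return f;
}

int main(void)
{
    void *self = dlopen(NULL, RTLD_LAZY);    /* search the running program */
    printf("resolved: %s\n", lookup_cached(self, "printf") ? "yes" : "no");
    return 0;
}
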
(Diffs for the remaining 13 changed files are not shown here.)

2 comments on commit b19a357

@nanosoldier (Collaborator)


Executing the daily package evaluation, I will reply here when finished:

@nanosoldier runtests(ALL, isdaily = true)

@nanosoldier (Collaborator)


Your package evaluation job has completed - possible new issues were detected. A full report can be found here. cc @maleadt
