Skip to content

Commit

Permalink
Implement write barrier fastpath for sticky immix (#8)
Browse files Browse the repository at this point in the history
This PR implements the write barrier fastpath for sticky immix in both the runtime write barrier and the codegen write barrier. There are also a few other changes: 1. pass collection type to MMTk's `handle_user_collection_request`, 2. call MMTk in `jl_gc_notify_image_alloc`.
  • Loading branch information
qinsoon authored May 16, 2023
1 parent 620cb79 commit e7e43f1
Show file tree
Hide file tree
Showing 8 changed files with 192 additions and 32 deletions.
2 changes: 2 additions & 0 deletions src/jl_exported_funcs.inc
Original file line number Diff line number Diff line change
Expand Up @@ -188,6 +188,8 @@
XX(jl_gc_queue_root) \
XX(jl_gc_wb1_noinline) \
XX(jl_gc_wb2_noinline) \
XX(jl_gc_wb1_slow) \
XX(jl_gc_wb2_slow) \
XX(jl_gc_safepoint) \
XX(jl_gc_schedule_foreign_sweepfunc) \
XX(jl_gc_set_cb_notify_external_alloc) \
Expand Down
37 changes: 31 additions & 6 deletions src/julia.h
Original file line number Diff line number Diff line change
Expand Up @@ -961,23 +961,21 @@ STATIC_INLINE void jl_gc_multi_wb(const void *parent, const jl_value_t *ptr) JL_
}

#else // MMTK_GC
// MMTk's write barrier method. This is the full write barrier including fastpath and slowpath.
// TODO: We should inline fastpath in the following functions, and only call slowpath.
STATIC_INLINE void mmtk_gc_wb_full(const void *parent, const void *ptr) JL_NOTSAFEPOINT;
STATIC_INLINE void mmtk_gc_wb(const void *parent, const void *ptr) JL_NOTSAFEPOINT;

STATIC_INLINE void jl_gc_wb(const void *parent, const void *ptr) JL_NOTSAFEPOINT
{
mmtk_gc_wb_full(parent, ptr);
mmtk_gc_wb(parent, ptr);
}

STATIC_INLINE void jl_gc_wb_back(const void *ptr) JL_NOTSAFEPOINT // ptr isa jl_value_t*
{
mmtk_gc_wb_full(ptr, (void*)0);
mmtk_gc_wb(ptr, (void*)0);
}

STATIC_INLINE void jl_gc_multi_wb(const void *parent, const jl_value_t *ptr) JL_NOTSAFEPOINT
{
mmtk_gc_wb_full(parent, (void*)0);
mmtk_gc_wb(parent, (void*)0);
}
#endif // MMTK_GC

Expand Down Expand Up @@ -2284,12 +2282,39 @@ extern JL_DLLEXPORT int jl_default_debug_info_kind;

#ifdef MMTK_GC
extern void mmtk_object_reference_write_post(void* mutator, const void* parent, const void* ptr);
extern void mmtk_object_reference_write_slow(void* mutator, const void* parent, const void* ptr);
extern const uint8_t MMTK_NEEDS_WRITE_BARRIER;
extern const uint8_t OBJECT_BARRIER;
extern const void* MMTK_SIDE_LOG_BIT_BASE_ADDRESS;

// Unconditional MMTk write barrier (debugging only).
// Forwards straight to mmtk_object_reference_write_post using the current
// task's thread-local mutator; no inline check is performed here.
//   parent - source object of the write
//   ptr    - reference being stored, forwarded unchanged
STATIC_INLINE void mmtk_gc_wb_full(const void *parent, const void *ptr) JL_NOTSAFEPOINT
{
    jl_task_t *const task = jl_current_task;
    mmtk_object_reference_write_post(task->ptls->mmtk_mutator_ptr, parent, ptr);
}

// Inlined write barrier fastpath for the object barrier.
//
// Tests the side log bit that corresponds to `parent` and calls the
// out-of-line slowpath (mmtk_object_reference_write_slow) only when that bit
// is set. Does nothing when MMTK_NEEDS_WRITE_BARRIER is not OBJECT_BARRIER.
//
// Metadata layout implied by the arithmetic below: one bit per 8 bytes of
// address space (one metadata byte per 64 bytes), starting at
// MMTK_SIDE_LOG_BIT_BASE_ADDRESS.
//
//   parent - source object of the write; its address selects the log bit
//   ptr    - reference being stored; only forwarded to the slowpath
STATIC_INLINE void mmtk_gc_wb_fast(const void *parent, const void *ptr) JL_NOTSAFEPOINT
{
    if (MMTK_NEEDS_WRITE_BARRIER != OBJECT_BARRIER)
        return;

    // Use an unsigned address: right-shifting a negative intptr_t is
    // implementation-defined, whereas the codegen fastpath in
    // llvm-late-gc-lowering.cpp uses logical shifts (LShr). Unsigned
    // arithmetic keeps the two fastpaths in agreement for every address.
    const uintptr_t addr = (uintptr_t)parent;
    const uint8_t *meta_addr = (const uint8_t*)MMTK_SIDE_LOG_BIT_BASE_ADDRESS + (addr >> 6);
    // Plain literal instead of 0b111: binary literals are not standard C
    // before C23, and this header is also consumed as C.
    const unsigned shift = (unsigned)((addr >> 3) & 7);
    const uint8_t byte_val = *meta_addr;
    if (((byte_val >> shift) & 1) == 1) {
        // Log bit set: hand the write over to MMTk.
        jl_task_t *ct = jl_current_task;
        jl_ptls_t ptls = ct->ptls;
        mmtk_object_reference_write_slow(ptls->mmtk_mutator_ptr, parent, ptr);
    }
}

// Default MMTk write barrier used by the jl_gc_wb* helpers.
// Delegates to the inlined fastpath (mmtk_gc_wb_fast) with both arguments
// passed through unchanged.
STATIC_INLINE void mmtk_gc_wb(const void *parent, const void *ptr) JL_NOTSAFEPOINT
{
mmtk_gc_wb_fast(parent, ptr);
}
#endif

#ifdef __cplusplus
Expand Down
7 changes: 2 additions & 5 deletions src/julia_internal.h
Original file line number Diff line number Diff line change
Expand Up @@ -335,7 +335,6 @@ jl_value_t *jl_gc_big_alloc_noinline(jl_ptls_t ptls, size_t allocsz);
JL_DLLEXPORT jl_value_t *jl_mmtk_gc_alloc_default(jl_ptls_t ptls, int pool_offset, int osize, void* ty);
JL_DLLEXPORT jl_value_t *jl_mmtk_gc_alloc_big(jl_ptls_t ptls, size_t allocsz);
extern void post_alloc(void* mutator, void* obj, size_t bytes, int allocator);
extern uint8_t mmtk_needs_write_barrier(void);
#endif // MMTK_GC
JL_DLLEXPORT int jl_gc_classify_pools(size_t sz, int *osize) JL_NOTSAFEPOINT;
extern uv_mutex_t gc_perm_lock;
Expand Down Expand Up @@ -617,16 +616,14 @@ STATIC_INLINE void jl_gc_wb_buf(void *parent, void *bufptr, size_t minsz) JL_NOT

#else // MMTK_GC

// TODO: We should inline fastpath in the following functions, and only call slowpath.

STATIC_INLINE void jl_gc_wb_binding(jl_binding_t *bnd, void *val) JL_NOTSAFEPOINT // val isa jl_value_t*
{
mmtk_gc_wb_full(bnd, val);
mmtk_gc_wb(bnd, val);
}

STATIC_INLINE void jl_gc_wb_buf(void *parent, void *bufptr, size_t minsz) JL_NOTSAFEPOINT // parent isa jl_value_t*
{
mmtk_gc_wb_full(parent, (void*)0);
mmtk_gc_wb(parent, (void*)0);
}
#endif // MMTK_GC

Expand Down
35 changes: 32 additions & 3 deletions src/llvm-final-gc-lowering.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -51,6 +51,8 @@ struct FinalLowerGC: private JuliaPassContext {
#ifdef MMTK_GC
Function *writeBarrier1Func;
Function *writeBarrier2Func;
Function *writeBarrier1SlowFunc;
Function *writeBarrier2SlowFunc;
#endif
Instruction *pgcstack;

Expand Down Expand Up @@ -78,6 +80,8 @@ struct FinalLowerGC: private JuliaPassContext {
#ifdef MMTK_GC
Value *lowerWriteBarrier1(CallInst *target, Function &F);
Value *lowerWriteBarrier2(CallInst *target, Function &F);
Value *lowerWriteBarrier1Slow(CallInst *target, Function &F);
Value *lowerWriteBarrier2Slow(CallInst *target, Function &F);
#endif
};

Expand Down Expand Up @@ -227,6 +231,21 @@ Value *FinalLowerGC::lowerWriteBarrier2(CallInst *target, Function &F)
target->setCalledFunction(writeBarrier2Func);
return target;
}

// Lowers a call to the julia.write_barrier_1_slow intrinsic by retargeting the
// existing call instruction at writeBarrier1SlowFunc (the declaration of the
// runtime's jl_gc_wb1_slow obtained in doInitialization).
// The call must carry exactly one argument (checked by assert); F is unused.
// Returns the same, now retargeted, call instruction.
Value *FinalLowerGC::lowerWriteBarrier1Slow(CallInst *target, Function &F)
{
assert(target->arg_size() == 1);
target->setCalledFunction(writeBarrier1SlowFunc);
return target;
}

// Lowers a call to the julia.write_barrier_2_slow intrinsic by retargeting the
// existing call instruction at writeBarrier2SlowFunc (the declaration of the
// runtime's jl_gc_wb2_slow obtained in doInitialization).
// The call must carry exactly two arguments (checked by assert); F is unused.
// Returns the same, now retargeted, call instruction.
Value *FinalLowerGC::lowerWriteBarrier2Slow(CallInst *target, Function &F)
{
assert(target->arg_size() == 2);
target->setCalledFunction(writeBarrier2SlowFunc);
return target;
}

#endif

Value *FinalLowerGC::lowerGCAllocBytes(CallInst *target, Function &F)
Expand Down Expand Up @@ -339,7 +358,9 @@ bool FinalLowerGC::doInitialization(Module &M) {
#ifdef MMTK_GC
writeBarrier1Func = getOrDeclare(jl_well_known::GCWriteBarrier1);
writeBarrier2Func = getOrDeclare(jl_well_known::GCWriteBarrier2);
GlobalValue *functionList[] = {queueRootFunc, poolAllocFunc, bigAllocFunc, writeBarrier1Func, writeBarrier2Func};
writeBarrier1SlowFunc = getOrDeclare(jl_well_known::GCWriteBarrier1Slow);
writeBarrier2SlowFunc = getOrDeclare(jl_well_known::GCWriteBarrier2Slow);
GlobalValue *functionList[] = {queueRootFunc, poolAllocFunc, bigAllocFunc, writeBarrier1Func, writeBarrier2Func, writeBarrier1SlowFunc, writeBarrier2SlowFunc};
#else
GlobalValue *functionList[] = {queueRootFunc, poolAllocFunc, bigAllocFunc};
#endif
Expand All @@ -359,8 +380,8 @@ bool FinalLowerGC::doInitialization(Module &M) {
bool FinalLowerGC::doFinalization(Module &M)
{
#ifdef MMTK_GC
GlobalValue *functionList[] = {queueRootFunc, poolAllocFunc, bigAllocFunc, writeBarrier1Func, writeBarrier2Func};
queueRootFunc = poolAllocFunc = bigAllocFunc = writeBarrier1Func = writeBarrier2Func = nullptr;
GlobalValue *functionList[] = {queueRootFunc, poolAllocFunc, bigAllocFunc, writeBarrier1Func, writeBarrier2Func, writeBarrier1SlowFunc, writeBarrier2SlowFunc};
queueRootFunc = poolAllocFunc = bigAllocFunc = writeBarrier1Func = writeBarrier2Func = writeBarrier1SlowFunc = writeBarrier2SlowFunc = nullptr;
#else
GlobalValue *functionList[] = {queueRootFunc, poolAllocFunc, bigAllocFunc};
queueRootFunc = poolAllocFunc = bigAllocFunc = nullptr;
Expand Down Expand Up @@ -437,6 +458,8 @@ bool FinalLowerGC::runOnFunction(Function &F)
#ifdef MMTK_GC
auto writeBarrier1Func = getOrNull(jl_intrinsics::writeBarrier1);
auto writeBarrier2Func = getOrNull(jl_intrinsics::writeBarrier2);
auto writeBarrier1SlowFunc = getOrNull(jl_intrinsics::writeBarrier1Slow);
auto writeBarrier2SlowFunc = getOrNull(jl_intrinsics::writeBarrier2Slow);
#endif

// Lower all calls to supported intrinsics.
Expand Down Expand Up @@ -478,6 +501,12 @@ bool FinalLowerGC::runOnFunction(Function &F)
else if (callee == writeBarrier2Func) {
replaceInstruction(CI, lowerWriteBarrier2(CI, F), it);
}
else if (callee == writeBarrier1SlowFunc) {
replaceInstruction(CI, lowerWriteBarrier1Slow(CI, F), it);
}
else if (callee == writeBarrier2SlowFunc) {
replaceInstruction(CI, lowerWriteBarrier2Slow(CI, F), it);
}
#endif
else if (callee == safepointFunc) {
lowerSafepoint(CI, F);
Expand Down
57 changes: 43 additions & 14 deletions src/llvm-late-gc-lowering.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -2537,22 +2537,51 @@ bool LateLowerGCFrame::CleanupIR(Function &F, State *S, bool *CFGModified) {
assert(false);
}
#else
// FIXME: Currently we call write barrier with the src object (parent).
// This works fine for object barrier for generational plans (such as stickyimmix), which does not use the target object at all.
// But for other MMTk plans, we need to be careful.
const bool INLINE_WRITE_BARRIER = true;
if (CI->getCalledOperand() == write_barrier_func) {
// if (CI->arg_size() == 2) {
// // parent, target
// Function *wb_func = getOrDeclare(jl_intrinsics::writeBarrier2);
// builder.CreateCall(wb_func, { parent, CI->getArgOperand(1) }); // We need to be careful about arg1, which may not match the type for wb_func. We probably need a bitcast
// } else {
// // parent and many targets
// Function *wb_func = getOrDeclare(jl_intrinsics::writeBarrier1);
// builder.CreateCall(wb_func, { parent });
// }
auto barrier = mmtk_needs_write_barrier();
if (barrier == 1) {
// We only care about parent
Function *wb_func = getOrDeclare(jl_intrinsics::writeBarrier1);
builder.CreateCall(wb_func, { parent });
if (MMTK_NEEDS_WRITE_BARRIER == OBJECT_BARRIER) {
if (INLINE_WRITE_BARRIER) {
auto i8_ty = Type::getInt8Ty(F.getContext());
auto intptr_ty = T_size;

// intptr_t addr = (intptr_t) (void*) src;
// uint8_t* meta_addr = (uint8_t*) (SIDE_METADATA_BASE_ADDRESS + (addr >> 6));
intptr_t metadata_base_address = reinterpret_cast<intptr_t>(MMTK_SIDE_LOG_BIT_BASE_ADDRESS);
auto metadata_base_val = ConstantInt::get(intptr_ty, metadata_base_address);
auto metadata_base_ptr = ConstantExpr::getIntToPtr(metadata_base_val, PointerType::get(i8_ty, 0));

auto parent_val = builder.CreatePtrToInt(parent, intptr_ty);
auto shr = builder.CreateLShr(parent_val, ConstantInt::get(intptr_ty, 6));
auto metadata_ptr = builder.CreateGEP(i8_ty, metadata_base_ptr, shr);

// intptr_t shift = (addr >> 3) & 0b111;
auto shift = builder.CreateAnd(builder.CreateLShr(parent_val, ConstantInt::get(intptr_ty, 3)), ConstantInt::get(intptr_ty, 7));
auto shift_i8 = builder.CreateTruncOrBitCast(shift, i8_ty);

// uint8_t byte_val = *meta_addr;
auto load_i8 = builder.CreateAlignedLoad(i8_ty, metadata_ptr, Align());

// if (((byte_val >> shift) & 1) == 1) {
auto shifted_load_i8 = builder.CreateLShr(load_i8, shift_i8);
auto masked = builder.CreateAnd(shifted_load_i8, ConstantInt::get(i8_ty, 1));
auto is_unlogged = builder.CreateICmpEQ(masked, ConstantInt::get(i8_ty, 1));

// object_reference_write_slow_call((void*) src, (void*) slot, (void*) target);
MDBuilder MDB(F.getContext());
SmallVector<uint32_t, 2> Weights{1, 9};
auto mayTriggerSlowpath = SplitBlockAndInsertIfThen(is_unlogged, CI, false, MDB.createBranchWeights(Weights));
builder.SetInsertPoint(mayTriggerSlowpath);
builder.CreateCall(getOrDeclare(jl_intrinsics::writeBarrier1Slow), { parent });
} else {
Function *wb_func = getOrDeclare(jl_intrinsics::writeBarrier1);
builder.CreateCall(wb_func, { parent });
}
}
} else {
assert(false);
}
#endif

Expand Down
58 changes: 58 additions & 0 deletions src/llvm-pass-helpers.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -120,6 +120,8 @@ namespace jl_intrinsics {
#ifdef MMTK_GC
static const char *WRITE_BARRIER_1_NAME = "julia.write_barrier1_noinline";
static const char *WRITE_BARRIER_2_NAME = "julia.write_barrier2_noinline";
static const char *WRITE_BARRIER_1_SLOW_NAME = "julia.write_barrier_1_slow";
static const char *WRITE_BARRIER_2_SLOW_NAME = "julia.write_barrier_2_slow";
#endif

// Annotates a function with attributes suitable for GC allocation
Expand Down Expand Up @@ -255,6 +257,32 @@ namespace jl_intrinsics {
intrinsic->addFnAttr(Attribute::InaccessibleMemOrArgMemOnly);
return intrinsic;
});
// Intrinsic for the single-argument write barrier slowpath.
// Declares `void julia.write_barrier_1_slow(T_prjlvalue)` with external
// linkage and the InaccessibleMemOrArgMemOnly attribute.
const IntrinsicDescription writeBarrier1Slow(
    WRITE_BARRIER_1_SLOW_NAME,
    [](const JuliaPassContext &context) {
        auto *void_ty = Type::getVoidTy(context.getLLVMContext());
        auto *signature = FunctionType::get(void_ty, { context.T_prjlvalue }, false);
        auto *decl = Function::Create(
            signature,
            Function::ExternalLinkage,
            WRITE_BARRIER_1_SLOW_NAME);
        decl->addFnAttr(Attribute::InaccessibleMemOrArgMemOnly);
        return decl;
    });
// Intrinsic for the two-argument write barrier slowpath.
// Declares `void julia.write_barrier_2_slow(T_prjlvalue, T_prjlvalue)` with
// external linkage and the InaccessibleMemOrArgMemOnly attribute.
const IntrinsicDescription writeBarrier2Slow(
    WRITE_BARRIER_2_SLOW_NAME,
    [](const JuliaPassContext &context) {
        auto *void_ty = Type::getVoidTy(context.getLLVMContext());
        auto *signature = FunctionType::get(
            void_ty,
            { context.T_prjlvalue, context.T_prjlvalue },
            false);
        auto *decl = Function::Create(
            signature,
            Function::ExternalLinkage,
            WRITE_BARRIER_2_SLOW_NAME);
        decl->addFnAttr(Attribute::InaccessibleMemOrArgMemOnly);
        return decl;
    });
#endif
}

Expand All @@ -265,6 +293,8 @@ namespace jl_well_known {
#ifdef MMTK_GC
static const char *GC_WB_1_NAME = XSTR(jl_gc_wb1_noinline);
static const char *GC_WB_2_NAME = XSTR(jl_gc_wb2_noinline);
static const char *GC_WB_1_SLOW_NAME = XSTR(jl_gc_wb1_slow);
static const char *GC_WB_2_SLOW_NAME = XSTR(jl_gc_wb2_slow);
#endif

using jl_intrinsics::addGCAllocAttributes;
Expand Down Expand Up @@ -342,5 +372,33 @@ namespace jl_well_known {
func->addFnAttr(Attribute::InaccessibleMemOrArgMemOnly);
return func;
});

// Well-known runtime function backing the single-argument slowpath.
// Declares `void jl_gc_wb1_slow(T_prjlvalue)` with external linkage and the
// InaccessibleMemOrArgMemOnly attribute.
const WellKnownFunctionDescription GCWriteBarrier1Slow(
    GC_WB_1_SLOW_NAME,
    [](const JuliaPassContext &context) {
        auto *void_ty = Type::getVoidTy(context.getLLVMContext());
        auto *signature = FunctionType::get(void_ty, { context.T_prjlvalue }, false);
        auto *decl = Function::Create(
            signature,
            Function::ExternalLinkage,
            GC_WB_1_SLOW_NAME);
        decl->addFnAttr(Attribute::InaccessibleMemOrArgMemOnly);
        return decl;
    });

// Well-known runtime function backing the two-argument slowpath.
// Declares `void jl_gc_wb2_slow(T_prjlvalue, T_prjlvalue)` with external
// linkage and the InaccessibleMemOrArgMemOnly attribute.
const WellKnownFunctionDescription GCWriteBarrier2Slow(
    GC_WB_2_SLOW_NAME,
    [](const JuliaPassContext &context) {
        auto *void_ty = Type::getVoidTy(context.getLLVMContext());
        auto *signature = FunctionType::get(
            void_ty,
            { context.T_prjlvalue, context.T_prjlvalue },
            false);
        auto *decl = Function::Create(
            signature,
            Function::ExternalLinkage,
            GC_WB_2_SLOW_NAME);
        decl->addFnAttr(Attribute::InaccessibleMemOrArgMemOnly);
        return decl;
    });
#endif
}
4 changes: 4 additions & 0 deletions src/llvm-pass-helpers.h
Original file line number Diff line number Diff line change
Expand Up @@ -133,6 +133,8 @@ namespace jl_intrinsics {
#ifdef MMTK_GC
extern const IntrinsicDescription writeBarrier1;
extern const IntrinsicDescription writeBarrier2;
extern const IntrinsicDescription writeBarrier1Slow;
extern const IntrinsicDescription writeBarrier2Slow;
#endif
}

Expand All @@ -158,6 +160,8 @@ namespace jl_well_known {
#ifdef MMTK_GC
extern const WellKnownFunctionDescription GCWriteBarrier1;
extern const WellKnownFunctionDescription GCWriteBarrier2;
extern const WellKnownFunctionDescription GCWriteBarrier1Slow;
extern const WellKnownFunctionDescription GCWriteBarrier2Slow;
#endif
}

Expand Down
Loading

0 comments on commit e7e43f1

Please sign in to comment.