From e7e43f11687e7db62dc18e90f3ab9b24099539fd Mon Sep 17 00:00:00 2001 From: Yi Lin Date: Tue, 16 May 2023 16:07:44 +1200 Subject: [PATCH] Implement write barrier fastpath for sticky immix (#8) This PR implements the write barrier fastpath for sticky immix in both the runtime write barrier and the codegen write barrier. There are also a few other changes: 1. pass collection type to MMTk's `handle_user_collection_request`, 2. call MMTk in `jl_gc_notify_image_alloc`. --- src/jl_exported_funcs.inc | 2 ++ src/julia.h | 37 ++++++++++++++++++---- src/julia_internal.h | 7 ++-- src/llvm-final-gc-lowering.cpp | 35 ++++++++++++++++++-- src/llvm-late-gc-lowering.cpp | 57 +++++++++++++++++++++++++-------- src/llvm-pass-helpers.cpp | 58 ++++++++++++++++++++++++++++++++++ src/llvm-pass-helpers.h | 4 +++ src/mmtk-gc.c | 24 +++++++++++--- 8 files changed, 192 insertions(+), 32 deletions(-) diff --git a/src/jl_exported_funcs.inc b/src/jl_exported_funcs.inc index b51e55510e172..1f182f37f938f 100644 --- a/src/jl_exported_funcs.inc +++ b/src/jl_exported_funcs.inc @@ -188,6 +188,8 @@ XX(jl_gc_queue_root) \ XX(jl_gc_wb1_noinline) \ XX(jl_gc_wb2_noinline) \ + XX(jl_gc_wb1_slow) \ + XX(jl_gc_wb2_slow) \ XX(jl_gc_safepoint) \ XX(jl_gc_schedule_foreign_sweepfunc) \ XX(jl_gc_set_cb_notify_external_alloc) \ diff --git a/src/julia.h b/src/julia.h index 2396b7a38a00d..75ebab99dbbf7 100644 --- a/src/julia.h +++ b/src/julia.h @@ -961,23 +961,21 @@ STATIC_INLINE void jl_gc_multi_wb(const void *parent, const jl_value_t *ptr) JL_ } #else // MMTK_GC -// MMTk's write barrier method. This is the full write barier including fastpath and slowpath. -// TODO: We should inline fastpath in the following functions, and only call slowpath. 
-STATIC_INLINE void mmtk_gc_wb_full(const void *parent, const void *ptr) JL_NOTSAFEPOINT; +STATIC_INLINE void mmtk_gc_wb(const void *parent, const void *ptr) JL_NOTSAFEPOINT; STATIC_INLINE void jl_gc_wb(const void *parent, const void *ptr) JL_NOTSAFEPOINT { - mmtk_gc_wb_full(parent, ptr); + mmtk_gc_wb(parent, ptr); } STATIC_INLINE void jl_gc_wb_back(const void *ptr) JL_NOTSAFEPOINT // ptr isa jl_value_t* { - mmtk_gc_wb_full(ptr, (void*)0); + mmtk_gc_wb(ptr, (void*)0); } STATIC_INLINE void jl_gc_multi_wb(const void *parent, const jl_value_t *ptr) JL_NOTSAFEPOINT { - mmtk_gc_wb_full(parent, (void*)0); + mmtk_gc_wb(parent, (void*)0); } #endif // MMTK_GC @@ -2284,12 +2282,39 @@ extern JL_DLLEXPORT int jl_default_debug_info_kind; #ifdef MMTK_GC extern void mmtk_object_reference_write_post(void* mutator, const void* parent, const void* ptr); +extern void mmtk_object_reference_write_slow(void* mutator, const void* parent, const void* ptr); +extern const uint8_t MMTK_NEEDS_WRITE_BARRIER; +extern const uint8_t OBJECT_BARRIER; +extern const void* MMTK_SIDE_LOG_BIT_BASE_ADDRESS; + +// Directly call into MMTk for write barrier (debugging only) STATIC_INLINE void mmtk_gc_wb_full(const void *parent, const void *ptr) JL_NOTSAFEPOINT { jl_task_t *ct = jl_current_task; jl_ptls_t ptls = ct->ptls; mmtk_object_reference_write_post(ptls->mmtk_mutator_ptr, parent, ptr); } + +// Inlined fastpath +STATIC_INLINE void mmtk_gc_wb_fast(const void *parent, const void *ptr) JL_NOTSAFEPOINT +{ + if (MMTK_NEEDS_WRITE_BARRIER == OBJECT_BARRIER) { + intptr_t addr = (intptr_t) (void*) parent; + uint8_t* meta_addr = (uint8_t*) (MMTK_SIDE_LOG_BIT_BASE_ADDRESS) + (addr >> 6); + intptr_t shift = (addr >> 3) & 0b111; + uint8_t byte_val = *meta_addr; + if (((byte_val >> shift) & 1) == 1) { + jl_task_t *ct = jl_current_task; + jl_ptls_t ptls = ct->ptls; + mmtk_object_reference_write_slow(ptls->mmtk_mutator_ptr, parent, ptr); + } + } +} + +STATIC_INLINE void mmtk_gc_wb(const void *parent, const void *ptr) 
JL_NOTSAFEPOINT +{ + mmtk_gc_wb_fast(parent, ptr); +} #endif #ifdef __cplusplus diff --git a/src/julia_internal.h b/src/julia_internal.h index 6d456b470a116..fb939e81b4a69 100644 --- a/src/julia_internal.h +++ b/src/julia_internal.h @@ -335,7 +335,6 @@ jl_value_t *jl_gc_big_alloc_noinline(jl_ptls_t ptls, size_t allocsz); JL_DLLEXPORT jl_value_t *jl_mmtk_gc_alloc_default(jl_ptls_t ptls, int pool_offset, int osize, void* ty); JL_DLLEXPORT jl_value_t *jl_mmtk_gc_alloc_big(jl_ptls_t ptls, size_t allocsz); extern void post_alloc(void* mutator, void* obj, size_t bytes, int allocator); -extern uint8_t mmtk_needs_write_barrier(void); #endif // MMTK_GC JL_DLLEXPORT int jl_gc_classify_pools(size_t sz, int *osize) JL_NOTSAFEPOINT; extern uv_mutex_t gc_perm_lock; @@ -617,16 +616,14 @@ STATIC_INLINE void jl_gc_wb_buf(void *parent, void *bufptr, size_t minsz) JL_NOT #else // MMTK_GC -// TODO: We should inline fastpath in the following functions, and only call slowpath. - STATIC_INLINE void jl_gc_wb_binding(jl_binding_t *bnd, void *val) JL_NOTSAFEPOINT // val isa jl_value_t* { - mmtk_gc_wb_full(bnd, val); + mmtk_gc_wb(bnd, val); } STATIC_INLINE void jl_gc_wb_buf(void *parent, void *bufptr, size_t minsz) JL_NOTSAFEPOINT // parent isa jl_value_t* { - mmtk_gc_wb_full(parent, (void*)0); + mmtk_gc_wb(parent, (void*)0); } #endif // MMTK_GC diff --git a/src/llvm-final-gc-lowering.cpp b/src/llvm-final-gc-lowering.cpp index a41f69d74b1e5..d60a8e181177b 100644 --- a/src/llvm-final-gc-lowering.cpp +++ b/src/llvm-final-gc-lowering.cpp @@ -51,6 +51,8 @@ struct FinalLowerGC: private JuliaPassContext { #ifdef MMTK_GC Function *writeBarrier1Func; Function *writeBarrier2Func; + Function *writeBarrier1SlowFunc; + Function *writeBarrier2SlowFunc; #endif Instruction *pgcstack; @@ -78,6 +80,8 @@ struct FinalLowerGC: private JuliaPassContext { #ifdef MMTK_GC Value *lowerWriteBarrier1(CallInst *target, Function &F); Value *lowerWriteBarrier2(CallInst *target, Function &F); + Value 
*lowerWriteBarrier1Slow(CallInst *target, Function &F); + Value *lowerWriteBarrier2Slow(CallInst *target, Function &F); #endif }; @@ -227,6 +231,21 @@ Value *FinalLowerGC::lowerWriteBarrier2(CallInst *target, Function &F) target->setCalledFunction(writeBarrier2Func); return target; } + +Value *FinalLowerGC::lowerWriteBarrier1Slow(CallInst *target, Function &F) +{ + assert(target->arg_size() == 1); + target->setCalledFunction(writeBarrier1SlowFunc); + return target; +} + +Value *FinalLowerGC::lowerWriteBarrier2Slow(CallInst *target, Function &F) +{ + assert(target->arg_size() == 2); + target->setCalledFunction(writeBarrier2SlowFunc); + return target; +} + #endif Value *FinalLowerGC::lowerGCAllocBytes(CallInst *target, Function &F) @@ -339,7 +358,9 @@ bool FinalLowerGC::doInitialization(Module &M) { #ifdef MMTK_GC writeBarrier1Func = getOrDeclare(jl_well_known::GCWriteBarrier1); writeBarrier2Func = getOrDeclare(jl_well_known::GCWriteBarrier2); - GlobalValue *functionList[] = {queueRootFunc, poolAllocFunc, bigAllocFunc, writeBarrier1Func, writeBarrier2Func}; + writeBarrier1SlowFunc = getOrDeclare(jl_well_known::GCWriteBarrier1Slow); + writeBarrier2SlowFunc = getOrDeclare(jl_well_known::GCWriteBarrier2Slow); + GlobalValue *functionList[] = {queueRootFunc, poolAllocFunc, bigAllocFunc, writeBarrier1Func, writeBarrier2Func, writeBarrier1SlowFunc, writeBarrier2SlowFunc}; #else GlobalValue *functionList[] = {queueRootFunc, poolAllocFunc, bigAllocFunc}; #endif @@ -359,8 +380,8 @@ bool FinalLowerGC::doInitialization(Module &M) { bool FinalLowerGC::doFinalization(Module &M) { #ifdef MMTK_GC - GlobalValue *functionList[] = {queueRootFunc, poolAllocFunc, bigAllocFunc, writeBarrier1Func, writeBarrier2Func}; - queueRootFunc = poolAllocFunc = bigAllocFunc = writeBarrier1Func = writeBarrier2Func = nullptr; + GlobalValue *functionList[] = {queueRootFunc, poolAllocFunc, bigAllocFunc, writeBarrier1Func, writeBarrier2Func, writeBarrier1SlowFunc, writeBarrier2SlowFunc}; + queueRootFunc = 
poolAllocFunc = bigAllocFunc = writeBarrier1Func = writeBarrier2Func = writeBarrier1SlowFunc = writeBarrier2SlowFunc = nullptr; #else GlobalValue *functionList[] = {queueRootFunc, poolAllocFunc, bigAllocFunc}; queueRootFunc = poolAllocFunc = bigAllocFunc = nullptr; @@ -437,6 +458,8 @@ bool FinalLowerGC::runOnFunction(Function &F) #ifdef MMTK_GC auto writeBarrier1Func = getOrNull(jl_intrinsics::writeBarrier1); auto writeBarrier2Func = getOrNull(jl_intrinsics::writeBarrier2); + auto writeBarrier1SlowFunc = getOrNull(jl_intrinsics::writeBarrier1Slow); + auto writeBarrier2SlowFunc = getOrNull(jl_intrinsics::writeBarrier2Slow); #endif // Lower all calls to supported intrinsics. @@ -478,6 +501,12 @@ bool FinalLowerGC::runOnFunction(Function &F) else if (callee == writeBarrier2Func) { replaceInstruction(CI, lowerWriteBarrier2(CI, F), it); } + else if (callee == writeBarrier1SlowFunc) { + replaceInstruction(CI, lowerWriteBarrier1Slow(CI, F), it); + } + else if (callee == writeBarrier2SlowFunc) { + replaceInstruction(CI, lowerWriteBarrier2Slow(CI, F), it); + } #endif else if (callee == safepointFunc) { lowerSafepoint(CI, F); diff --git a/src/llvm-late-gc-lowering.cpp b/src/llvm-late-gc-lowering.cpp index 8a0210c626935..eec21c0c64010 100644 --- a/src/llvm-late-gc-lowering.cpp +++ b/src/llvm-late-gc-lowering.cpp @@ -2537,22 +2537,51 @@ bool LateLowerGCFrame::CleanupIR(Function &F, State *S, bool *CFGModified) { assert(false); } #else + // FIXME: Currently we call write barrier with the src object (parent). + // This works fine for object barrier for generational plans (such as stickyimmix), which does not use the target object at all. + // But for other MMTk plans, we need to be careful. 
+ const bool INLINE_WRITE_BARRIER = true; if (CI->getCalledOperand() == write_barrier_func) { - // if (CI->arg_size() == 2) { - // // parent, target - // Function *wb_func = getOrDeclare(jl_intrinsics::writeBarrier2); - // builder.CreateCall(wb_func, { parent, CI->getArgOperand(1) }); // We need to be careful about arg1, which may not match the type for wb_func. We probably need a bitcast - // } else { - // // parent and many targets - // Function *wb_func = getOrDeclare(jl_intrinsics::writeBarrier1); - // builder.CreateCall(wb_func, { parent }); - // } - auto barrier = mmtk_needs_write_barrier(); - if (barrier == 1) { - // We only care about parent - Function *wb_func = getOrDeclare(jl_intrinsics::writeBarrier1); - builder.CreateCall(wb_func, { parent }); + if (MMTK_NEEDS_WRITE_BARRIER == OBJECT_BARRIER) { + if (INLINE_WRITE_BARRIER) { + auto i8_ty = Type::getInt8Ty(F.getContext()); + auto intptr_ty = T_size; + + // intptr_t addr = (intptr_t) (void*) src; + // uint8_t* meta_addr = (uint8_t*) (SIDE_METADATA_BASE_ADDRESS + (addr >> 6)); + intptr_t metadata_base_address = reinterpret_cast<intptr_t>(MMTK_SIDE_LOG_BIT_BASE_ADDRESS); + auto metadata_base_val = ConstantInt::get(intptr_ty, metadata_base_address); + auto metadata_base_ptr = ConstantExpr::getIntToPtr(metadata_base_val, PointerType::get(i8_ty, 0)); + + auto parent_val = builder.CreatePtrToInt(parent, intptr_ty); + auto shr = builder.CreateLShr(parent_val, ConstantInt::get(intptr_ty, 6)); + auto metadata_ptr = builder.CreateGEP(i8_ty, metadata_base_ptr, shr); + + // intptr_t shift = (addr >> 3) & 0b111; + auto shift = builder.CreateAnd(builder.CreateLShr(parent_val, ConstantInt::get(intptr_ty, 3)), ConstantInt::get(intptr_ty, 7)); + auto shift_i8 = builder.CreateTruncOrBitCast(shift, i8_ty); + + // uint8_t byte_val = *meta_addr; + auto load_i8 = builder.CreateAlignedLoad(i8_ty, metadata_ptr, Align()); + + // if (((byte_val >> shift) & 1) == 1) { + auto shifted_load_i8 = builder.CreateLShr(load_i8, shift_i8); + auto 
masked = builder.CreateAnd(shifted_load_i8, ConstantInt::get(i8_ty, 1)); + auto is_unlogged = builder.CreateICmpEQ(masked, ConstantInt::get(i8_ty, 1)); + + // object_reference_write_slow_call((void*) src, (void*) slot, (void*) target); + MDBuilder MDB(F.getContext()); + SmallVector<uint32_t, 2> Weights{1, 9}; + auto mayTriggerSlowpath = SplitBlockAndInsertIfThen(is_unlogged, CI, false, MDB.createBranchWeights(Weights)); + builder.SetInsertPoint(mayTriggerSlowpath); + builder.CreateCall(getOrDeclare(jl_intrinsics::writeBarrier1Slow), { parent }); + } else { + Function *wb_func = getOrDeclare(jl_intrinsics::writeBarrier1); + builder.CreateCall(wb_func, { parent }); + } } + } else { + assert(false); } #endif diff --git a/src/llvm-pass-helpers.cpp b/src/llvm-pass-helpers.cpp index ff65ec7de3aab..1e1ae4bc7eada 100644 --- a/src/llvm-pass-helpers.cpp +++ b/src/llvm-pass-helpers.cpp @@ -120,6 +120,8 @@ namespace jl_intrinsics { #ifdef MMTK_GC static const char *WRITE_BARRIER_1_NAME = "julia.write_barrier1_noinline"; static const char *WRITE_BARRIER_2_NAME = "julia.write_barrier2_noinline"; + static const char *WRITE_BARRIER_1_SLOW_NAME = "julia.write_barrier_1_slow"; + static const char *WRITE_BARRIER_2_SLOW_NAME = "julia.write_barrier_2_slow"; #endif // Annotates a function with attributes suitable for GC allocation @@ -255,6 +257,32 @@ namespace jl_intrinsics { intrinsic->addFnAttr(Attribute::InaccessibleMemOrArgMemOnly); return intrinsic; }); + const IntrinsicDescription writeBarrier1Slow( + WRITE_BARRIER_1_SLOW_NAME, + [](const JuliaPassContext &context) { + auto intrinsic = Function::Create( + FunctionType::get( + Type::getVoidTy(context.getLLVMContext()), + { context.T_prjlvalue }, + false), + Function::ExternalLinkage, + WRITE_BARRIER_1_SLOW_NAME); + intrinsic->addFnAttr(Attribute::InaccessibleMemOrArgMemOnly); + return intrinsic; + }); + const IntrinsicDescription writeBarrier2Slow( + WRITE_BARRIER_2_SLOW_NAME, + [](const JuliaPassContext &context) { + auto intrinsic = 
Function::Create( + FunctionType::get( + Type::getVoidTy(context.getLLVMContext()), + { context.T_prjlvalue, context.T_prjlvalue }, + false), + Function::ExternalLinkage, + WRITE_BARRIER_2_SLOW_NAME); + intrinsic->addFnAttr(Attribute::InaccessibleMemOrArgMemOnly); + return intrinsic; + }); #endif } @@ -265,6 +293,8 @@ namespace jl_well_known { #ifdef MMTK_GC static const char *GC_WB_1_NAME = XSTR(jl_gc_wb1_noinline); static const char *GC_WB_2_NAME = XSTR(jl_gc_wb2_noinline); + static const char *GC_WB_1_SLOW_NAME = XSTR(jl_gc_wb1_slow); + static const char *GC_WB_2_SLOW_NAME = XSTR(jl_gc_wb2_slow); #endif using jl_intrinsics::addGCAllocAttributes; @@ -342,5 +372,33 @@ namespace jl_well_known { func->addFnAttr(Attribute::InaccessibleMemOrArgMemOnly); return func; }); + + const WellKnownFunctionDescription GCWriteBarrier1Slow( + GC_WB_1_SLOW_NAME, + [](const JuliaPassContext &context) { + auto func = Function::Create( + FunctionType::get( + Type::getVoidTy(context.getLLVMContext()), + { context.T_prjlvalue }, + false), + Function::ExternalLinkage, + GC_WB_1_SLOW_NAME); + func->addFnAttr(Attribute::InaccessibleMemOrArgMemOnly); + return func; + }); + + const WellKnownFunctionDescription GCWriteBarrier2Slow( + GC_WB_2_SLOW_NAME, + [](const JuliaPassContext &context) { + auto func = Function::Create( + FunctionType::get( + Type::getVoidTy(context.getLLVMContext()), + { context.T_prjlvalue, context.T_prjlvalue }, + false), + Function::ExternalLinkage, + GC_WB_2_SLOW_NAME); + func->addFnAttr(Attribute::InaccessibleMemOrArgMemOnly); + return func; + }); #endif } diff --git a/src/llvm-pass-helpers.h b/src/llvm-pass-helpers.h index 7f4d7646829f3..d6e4be7e05338 100644 --- a/src/llvm-pass-helpers.h +++ b/src/llvm-pass-helpers.h @@ -133,6 +133,8 @@ namespace jl_intrinsics { #ifdef MMTK_GC extern const IntrinsicDescription writeBarrier1; extern const IntrinsicDescription writeBarrier2; + extern const IntrinsicDescription writeBarrier1Slow; + extern const IntrinsicDescription 
writeBarrier2Slow; #endif } @@ -158,6 +160,8 @@ namespace jl_well_known { #ifdef MMTK_GC extern const WellKnownFunctionDescription GCWriteBarrier1; extern const WellKnownFunctionDescription GCWriteBarrier2; + extern const WellKnownFunctionDescription GCWriteBarrier1Slow; + extern const WellKnownFunctionDescription GCWriteBarrier2Slow; #endif } diff --git a/src/mmtk-gc.c b/src/mmtk-gc.c index b354d287baa14..a9feeb6ef4921 100644 --- a/src/mmtk-gc.c +++ b/src/mmtk-gc.c @@ -192,13 +192,13 @@ void jl_gc_free_array(jl_array_t *a) JL_NOTSAFEPOINT JL_DLLEXPORT void jl_gc_queue_root(const jl_value_t *ptr) { - /* TODO: not needed? */ + unreachable(); } // TODO: exported, but not MMTk-specific? JL_DLLEXPORT void jl_gc_queue_multiroot(const jl_value_t *parent, const jl_value_t *ptr) JL_NOTSAFEPOINT { - /* TODO: confirm not needed? */ + unreachable(); } @@ -207,11 +207,13 @@ JL_DLLEXPORT void jl_gc_queue_multiroot(const jl_value_t *parent, const jl_value JL_DLLEXPORT int jl_gc_mark_queue_obj(jl_ptls_t ptls, jl_value_t *obj) { + unreachable(); return 0; } JL_DLLEXPORT void jl_gc_mark_queue_objarray(jl_ptls_t ptls, jl_value_t *parent, jl_value_t **objs, size_t nobjs) { + unreachable(); } @@ -229,7 +231,7 @@ JL_DLLEXPORT void jl_gc_collect(jl_gc_collection_t collection) jl_atomic_fetch_add((_Atomic(uint64_t)*)&gc_num.deferred_alloc, localbytes); return; } - handle_user_collection_request(ptls); + handle_user_collection_request(ptls, collection); } // Per-thread initialization @@ -497,6 +499,20 @@ JL_DLLEXPORT void jl_gc_wb2_noinline(const void *parent, const void *ptr) JL_NOT jl_gc_wb(parent, ptr); } +JL_DLLEXPORT void jl_gc_wb1_slow(const void *parent) JL_NOTSAFEPOINT +{ + jl_task_t *ct = jl_current_task; + jl_ptls_t ptls = ct->ptls; + mmtk_object_reference_write_slow(ptls->mmtk_mutator_ptr, parent, (const void*) 0); +} + +JL_DLLEXPORT void jl_gc_wb2_slow(const void *parent, const void* ptr) JL_NOTSAFEPOINT +{ + jl_task_t *ct = jl_current_task; + jl_ptls_t ptls = ct->ptls; + 
mmtk_object_reference_write_slow(ptls->mmtk_mutator_ptr, parent, ptr); +} + void *jl_gc_perm_alloc_nolock(size_t sz, int zero, unsigned align, unsigned offset) { jl_ptls_t ptls = jl_current_task->ptls; @@ -516,7 +532,7 @@ void jl_gc_notify_image_load(const char* img_data, size_t len) void jl_gc_notify_image_alloc(char* img_data, size_t len) { - // TODO: We should call MMTk to bulk set object metadata for the image region + mmtk_immortal_region_post_alloc((void*)img_data, len); } #ifdef __cplusplus