From b484fe7a4fe580713a50b0ddaf86081e6a2e85db Mon Sep 17 00:00:00 2001 From: Eduardo Souza Date: Fri, 25 Oct 2024 14:54:12 +1100 Subject: [PATCH] Revert "Adding runtime option to inline allocation fastpath (#68)" (#69) This reverts commit 58d7d217ca47496d78fcdf7f106e0563aaf6c334. --- base/options.jl | 1 - src/jloptions.c | 26 ------ src/jloptions.h | 1 - src/llvm-late-gc-lowering.cpp | 153 +++++++++++++++++----------------- 4 files changed, 76 insertions(+), 105 deletions(-) diff --git a/base/options.jl b/base/options.jl index 4ef5be7c3d1e4..f535c27d99122 100644 --- a/base/options.jl +++ b/base/options.jl @@ -33,7 +33,6 @@ struct JLOptions warn_overwrite::Int8 can_inline::Int8 polly::Int8 - mmtk_inline_fastpath::Int8 trace_compile::Ptr{UInt8} trace_dispatch::Ptr{UInt8} fast_math::Int8 diff --git a/src/jloptions.c b/src/jloptions.c index d5adcfb21e6e2..35f0a76e3f6e7 100644 --- a/src/jloptions.c +++ b/src/jloptions.c @@ -76,11 +76,6 @@ JL_DLLEXPORT void jl_init_options(void) 0, // method overwrite warning 1, // can_inline JL_OPTIONS_POLLY_ON, // polly -#ifdef MMTK_GC - 1, // inline fastpath allocation for mmtk -#else - 0, -#endif NULL, // trace_compile NULL, // trace_dispatch JL_OPTIONS_FAST_MATH_DEFAULT, @@ -212,10 +207,6 @@ static const char opts[] = " --polly={yes*|no} Enable or disable the polyhedral optimizer Polly\n" " (overrides @polly declaration)\n" #endif -#ifdef MMTK_GC - " --inline-fastpath={yes*|no} Enable or disable inlining allocation fastpath for MMTk\n" - " during code generation.\n" -#endif // instrumentation options " --code-coverage[={none*|user|all}] Count executions of source lines (omitting setting is\n" @@ -302,7 +293,6 @@ JL_DLLEXPORT void jl_parse_opts(int *argcp, char ***argvp) opt_warn_scope, opt_inline, opt_polly, - opt_mmtk_inline_fastpath, opt_trace_compile, opt_trace_compile_timing, opt_trace_dispatch, @@ -382,7 +372,6 @@ JL_DLLEXPORT void jl_parse_opts(int *argcp, char ***argvp) { "warn-scope", required_argument, 0, opt_warn_scope }, { "inline", required_argument, 0, opt_inline }, { "polly", required_argument, 0, opt_polly }, - { "inline-fastpath", required_argument, 0, opt_mmtk_inline_fastpath }, { "trace-compile", required_argument, 0, opt_trace_compile }, { "trace-compile-timing", no_argument, 0, opt_trace_compile_timing }, { "trace-dispatch", required_argument, 0, opt_trace_dispatch }, @@ -834,21 +823,6 @@ JL_DLLEXPORT void jl_parse_opts(int *argcp, char ***argvp) jl_errorf("julia: invalid argument to --polly (%s)", optarg); } break; - case opt_mmtk_inline_fastpath: - if (!strcmp(optarg,"yes")) -#ifdef MMTK_GC - jl_options.mmtk_inline_fastpath = 1; -#else - // always set to 0 if not using MMTk - jl_options.mmtk_inline_fastpath = 0; - jl_printf(JL_STDERR, "WARNING: Attempting to set --inline-fastpath without using MMTk"); -#endif - else if (!strcmp(optarg,"no")) - jl_options.mmtk_inline_fastpath = 0; - else { - jl_errorf("julia: invalid argument to --inline-fastpath (%s)", optarg); - } - break; case opt_trace_compile: jl_options.trace_compile = strdup(optarg); if (!jl_options.trace_compile) diff --git a/src/jloptions.h b/src/jloptions.h index f49f8e3f60ea8..e58797caace3c 100644 --- a/src/jloptions.h +++ b/src/jloptions.h @@ -37,7 +37,6 @@ typedef struct { int8_t warn_overwrite; int8_t can_inline; int8_t polly; - int8_t mmtk_inline_fastpath; const char *trace_compile; const char *trace_dispatch; int8_t fast_math; diff --git a/src/llvm-late-gc-lowering.cpp b/src/llvm-late-gc-lowering.cpp index c3ea82b4d6b41..3201ae64cf984 100644 --- a/src/llvm-late-gc-lowering.cpp +++ b/src/llvm-late-gc-lowering.cpp @@ -2505,19 +2505,9 @@ void LateLowerGCFrame::PlaceRootsAndUpdateCalls(SmallVectorImpl &Colors, St } } +#ifdef MMTK_GC Value* LateLowerGCFrame::lowerGCAllocBytesLate(CallInst *target, Function &F) { -#ifndef MMTK_GC - // For Julia's stock GC, this option should always be 0 - assert(jl_options.mmtk_inline_fastpath == 0); -#endif - - // Setting --inline-fastpath=false with MMTk will increase allocation - // overhead a lot, and should only be used for debugging. - if (jl_options.mmtk_inline_fastpath == 0) { - return target; - } - assert(target->arg_size() == 3); IRBuilder<> builder(target); @@ -2535,72 +2525,78 @@ Value* LateLowerGCFrame::lowerGCAllocBytesLate(CallInst *target, Function &F) auto pool_osize_i32 = ConstantInt::get(Type::getInt32Ty(F.getContext()), osize); auto pool_osize = ConstantInt::get(Type::getInt64Ty(F.getContext()), osize); - // Assuming we use the first immix allocator. - // FIXME: We should get the allocator index and type from MMTk. - auto allocator_offset = offsetof(jl_tls_states_t, gc_tls) + offsetof(jl_gc_tls_states_t, mmtk_mutator) + offsetof(MMTkMutatorContext, allocators) + offsetof(Allocators, immix); - - auto cursor_pos = ConstantInt::get(Type::getInt64Ty(target->getContext()), allocator_offset + offsetof(ImmixAllocator, cursor)); - auto limit_pos = ConstantInt::get(Type::getInt64Ty(target->getContext()), allocator_offset + offsetof(ImmixAllocator, limit)); - - auto cursor_tls_i8 = builder.CreateGEP(Type::getInt8Ty(target->getContext()), ptls, cursor_pos); - auto cursor_ptr = builder.CreateBitCast(cursor_tls_i8, PointerType::get(Type::getInt64Ty(target->getContext()), 0), "cursor_ptr"); - auto cursor = builder.CreateLoad(Type::getInt64Ty(target->getContext()), cursor_ptr, "cursor"); - - // offset = 8 - auto delta_offset = builder.CreateNSWSub(ConstantInt::get(Type::getInt64Ty(target->getContext()), 0), ConstantInt::get(Type::getInt64Ty(target->getContext()), 8)); - auto delta_cursor = builder.CreateNSWSub(ConstantInt::get(Type::getInt64Ty(target->getContext()), 0), cursor); - auto delta_op = builder.CreateNSWAdd(delta_offset, delta_cursor); - // alignment 16 (15 = 16 - 1) - auto delta = builder.CreateAnd(delta_op, ConstantInt::get(Type::getInt64Ty(target->getContext()), 15), "delta"); - auto result = builder.CreateNSWAdd(cursor, delta, "result"); - - auto new_cursor = builder.CreateNSWAdd(result, pool_osize); - - auto limit_tls_i8 = builder.CreateGEP(Type::getInt8Ty(target->getContext()), ptls, limit_pos); - auto limit_ptr = builder.CreateBitCast(limit_tls_i8, PointerType::get(Type::getInt64Ty(target->getContext()), 0), "limit_ptr"); - auto limit = builder.CreateLoad(Type::getInt64Ty(target->getContext()), limit_ptr, "limit"); - - auto gt_limit = builder.CreateICmpSGT(new_cursor, limit); - - auto slowpath = BasicBlock::Create(target->getContext(), "slowpath", target->getFunction()); - auto fastpath = BasicBlock::Create(target->getContext(), "fastpath", target->getFunction()); - - auto next_instr = target->getNextNode(); - SmallVector Weights{1, 9}; - - MDBuilder MDB(F.getContext()); - SplitBlockAndInsertIfThenElse(gt_limit, next_instr, &slowpath, &fastpath, false, false, MDB.createBranchWeights(Weights)); - - builder.SetInsertPoint(next_instr); - auto phiNode = builder.CreatePHI(target->getCalledFunction()->getReturnType(), 2, "phi_fast_slow"); - - // slowpath - builder.SetInsertPoint(slowpath); - auto pool_offs = ConstantInt::get(Type::getInt32Ty(F.getContext()), 1); - auto new_call = builder.CreateCall(smallAllocFunc, { ptls, pool_offs, pool_osize_i32, type }); - new_call->setAttributes(new_call->getCalledFunction()->getAttributes()); - builder.CreateBr(next_instr->getParent()); - - // fastpath - builder.SetInsertPoint(fastpath); - builder.CreateStore(new_cursor, cursor_ptr); - - // ptls->gc_tls.gc_num.allocd += osize; - auto pool_alloc_pos = ConstantInt::get(Type::getInt64Ty(target->getContext()), offsetof(jl_tls_states_t, gc_tls_common) + offsetof(jl_gc_tls_states_common_t, gc_num)); - auto pool_alloc_i8 = builder.CreateGEP(Type::getInt8Ty(target->getContext()), ptls, pool_alloc_pos); - auto pool_alloc_tls = builder.CreateBitCast(pool_alloc_i8, PointerType::get(Type::getInt64Ty(target->getContext()), 0), "pool_alloc"); - auto pool_allocd = builder.CreateLoad(Type::getInt64Ty(target->getContext()), pool_alloc_tls); - auto pool_allocd_total = builder.CreateAdd(pool_allocd, pool_osize); - builder.CreateStore(pool_allocd_total, pool_alloc_tls); - - auto v_raw = builder.CreateNSWAdd(result, ConstantInt::get(Type::getInt64Ty(target->getContext()), sizeof(jl_taggedvalue_t))); - auto v_as_ptr = builder.CreateIntToPtr(v_raw, smallAllocFunc->getReturnType()); - builder.CreateBr(next_instr->getParent()); - - phiNode->addIncoming(new_call, slowpath); - phiNode->addIncoming(v_as_ptr, fastpath); - phiNode->takeName(target); - return phiNode; + // Should we generate fastpath allocation sequence here? We should always generate fastpath here for MMTk. + // Setting this to false will increase allocation overhead a lot, and should only be used for debugging. + const bool INLINE_FASTPATH_ALLOCATION = true; + + if (INLINE_FASTPATH_ALLOCATION) { + // Assuming we use the first immix allocator. + // FIXME: We should get the allocator index and type from MMTk. + auto allocator_offset = offsetof(jl_tls_states_t, gc_tls) + offsetof(jl_gc_tls_states_t, mmtk_mutator) + offsetof(MMTkMutatorContext, allocators) + offsetof(Allocators, immix); + + auto cursor_pos = ConstantInt::get(Type::getInt64Ty(target->getContext()), allocator_offset + offsetof(ImmixAllocator, cursor)); + auto limit_pos = ConstantInt::get(Type::getInt64Ty(target->getContext()), allocator_offset + offsetof(ImmixAllocator, limit)); + + auto cursor_tls_i8 = builder.CreateGEP(Type::getInt8Ty(target->getContext()), ptls, cursor_pos); + auto cursor_ptr = builder.CreateBitCast(cursor_tls_i8, PointerType::get(Type::getInt64Ty(target->getContext()), 0), "cursor_ptr"); + auto cursor = builder.CreateLoad(Type::getInt64Ty(target->getContext()), cursor_ptr, "cursor"); + + // offset = 8 + auto delta_offset = builder.CreateNSWSub(ConstantInt::get(Type::getInt64Ty(target->getContext()), 0), ConstantInt::get(Type::getInt64Ty(target->getContext()), 8)); + auto delta_cursor = builder.CreateNSWSub(ConstantInt::get(Type::getInt64Ty(target->getContext()), 0), cursor); + auto delta_op = builder.CreateNSWAdd(delta_offset, delta_cursor); + // alignment 16 (15 = 16 - 1) + auto delta = builder.CreateAnd(delta_op, ConstantInt::get(Type::getInt64Ty(target->getContext()), 15), "delta"); + auto result = builder.CreateNSWAdd(cursor, delta, "result"); + + auto new_cursor = builder.CreateNSWAdd(result, pool_osize); + + auto limit_tls_i8 = builder.CreateGEP(Type::getInt8Ty(target->getContext()), ptls, limit_pos); + auto limit_ptr = builder.CreateBitCast(limit_tls_i8, PointerType::get(Type::getInt64Ty(target->getContext()), 0), "limit_ptr"); + auto limit = builder.CreateLoad(Type::getInt64Ty(target->getContext()), limit_ptr, "limit"); + + auto gt_limit = builder.CreateICmpSGT(new_cursor, limit); + + auto slowpath = BasicBlock::Create(target->getContext(), "slowpath", target->getFunction()); + auto fastpath = BasicBlock::Create(target->getContext(), "fastpath", target->getFunction()); + + auto next_instr = target->getNextNode(); + SmallVector Weights{1, 9}; + + MDBuilder MDB(F.getContext()); + SplitBlockAndInsertIfThenElse(gt_limit, next_instr, &slowpath, &fastpath, false, false, MDB.createBranchWeights(Weights)); + + builder.SetInsertPoint(next_instr); + auto phiNode = builder.CreatePHI(target->getCalledFunction()->getReturnType(), 2, "phi_fast_slow"); + + // slowpath + builder.SetInsertPoint(slowpath); + auto pool_offs = ConstantInt::get(Type::getInt32Ty(F.getContext()), 1); + auto new_call = builder.CreateCall(smallAllocFunc, { ptls, pool_offs, pool_osize_i32, type }); + new_call->setAttributes(new_call->getCalledFunction()->getAttributes()); + builder.CreateBr(next_instr->getParent()); + + // fastpath + builder.SetInsertPoint(fastpath); + builder.CreateStore(new_cursor, cursor_ptr); + + // ptls->gc_tls.gc_num.allocd += osize; + auto pool_alloc_pos = ConstantInt::get(Type::getInt64Ty(target->getContext()), offsetof(jl_tls_states_t, gc_tls_common) + offsetof(jl_gc_tls_states_common_t, gc_num)); + auto pool_alloc_i8 = builder.CreateGEP(Type::getInt8Ty(target->getContext()), ptls, pool_alloc_pos); + auto pool_alloc_tls = builder.CreateBitCast(pool_alloc_i8, PointerType::get(Type::getInt64Ty(target->getContext()), 0), "pool_alloc"); + auto pool_allocd = builder.CreateLoad(Type::getInt64Ty(target->getContext()), pool_alloc_tls); + auto pool_allocd_total = builder.CreateAdd(pool_allocd, pool_osize); + builder.CreateStore(pool_allocd_total, pool_alloc_tls); + + auto v_raw = builder.CreateNSWAdd(result, ConstantInt::get(Type::getInt64Ty(target->getContext()), sizeof(jl_taggedvalue_t))); + auto v_as_ptr = builder.CreateIntToPtr(v_raw, smallAllocFunc->getReturnType()); + builder.CreateBr(next_instr->getParent()); + + phiNode->addIncoming(new_call, slowpath); + phiNode->addIncoming(v_as_ptr, fastpath); + phiNode->takeName(target); + return phiNode; + } } } return target; @@ -2620,6 +2616,7 @@ static void replaceInstruction( ++it; } } +#endif bool LateLowerGCFrame::runOnFunction(Function &F, bool *CFGModified) { initAll(*F.getParent()); @@ -2639,6 +2636,7 @@ bool LateLowerGCFrame::runOnFunction(Function &F, bool *CFGModified) { PlaceRootsAndUpdateCalls(Colors, S, CallFrames); CleanupIR(F, &S, CFGModified); +#ifdef MMTK_GC // We lower the julia.gc_alloc_bytes intrinsic in this pass to insert slowpath/fastpath blocks for MMTk for (BasicBlock &BB : F) { for (auto it = BB.begin(); it != BB.end();) { @@ -2660,6 +2658,7 @@ bool LateLowerGCFrame::runOnFunction(Function &F, bool *CFGModified) { ++it; } } +#endif return true; }