Skip to content

Commit

Permalink
Revert "Adding runtime option to inline allocation fastpath (#68)" (#69)
Browse files Browse the repository at this point in the history
This reverts commit 58d7d21.
  • Loading branch information
udesou authored Oct 25, 2024
1 parent 58d7d21 commit b484fe7
Show file tree
Hide file tree
Showing 4 changed files with 76 additions and 105 deletions.
1 change: 0 additions & 1 deletion base/options.jl
Original file line number Diff line number Diff line change
Expand Up @@ -33,7 +33,6 @@ struct JLOptions
warn_overwrite::Int8
can_inline::Int8
polly::Int8
mmtk_inline_fastpath::Int8
trace_compile::Ptr{UInt8}
trace_dispatch::Ptr{UInt8}
fast_math::Int8
Expand Down
26 changes: 0 additions & 26 deletions src/jloptions.c
Original file line number Diff line number Diff line change
Expand Up @@ -76,11 +76,6 @@ JL_DLLEXPORT void jl_init_options(void)
0, // method overwrite warning
1, // can_inline
JL_OPTIONS_POLLY_ON, // polly
#ifdef MMTK_GC
1, // inline fastpath allocation for mmtk
#else
0,
#endif
NULL, // trace_compile
NULL, // trace_dispatch
JL_OPTIONS_FAST_MATH_DEFAULT,
Expand Down Expand Up @@ -212,10 +207,6 @@ static const char opts[] =
" --polly={yes*|no} Enable or disable the polyhedral optimizer Polly\n"
" (overrides @polly declaration)\n"
#endif
#ifdef MMTK_GC
" --inline-fastpath={yes*|no} Enable or disable inlining allocation fastpath for MMTk\n"
" during code generation.\n"
#endif

// instrumentation options
" --code-coverage[={none*|user|all}] Count executions of source lines (omitting setting is\n"
Expand Down Expand Up @@ -302,7 +293,6 @@ JL_DLLEXPORT void jl_parse_opts(int *argcp, char ***argvp)
opt_warn_scope,
opt_inline,
opt_polly,
opt_mmtk_inline_fastpath,
opt_trace_compile,
opt_trace_compile_timing,
opt_trace_dispatch,
Expand Down Expand Up @@ -382,7 +372,6 @@ JL_DLLEXPORT void jl_parse_opts(int *argcp, char ***argvp)
{ "warn-scope", required_argument, 0, opt_warn_scope },
{ "inline", required_argument, 0, opt_inline },
{ "polly", required_argument, 0, opt_polly },
{ "inline-fastpath", required_argument, 0, opt_mmtk_inline_fastpath },
{ "trace-compile", required_argument, 0, opt_trace_compile },
{ "trace-compile-timing", no_argument, 0, opt_trace_compile_timing },
{ "trace-dispatch", required_argument, 0, opt_trace_dispatch },
Expand Down Expand Up @@ -834,21 +823,6 @@ JL_DLLEXPORT void jl_parse_opts(int *argcp, char ***argvp)
jl_errorf("julia: invalid argument to --polly (%s)", optarg);
}
break;
case opt_mmtk_inline_fastpath:
if (!strcmp(optarg,"yes"))
#ifdef MMTK_GC
jl_options.mmtk_inline_fastpath = 1;
#else
// always set to 0 if not using MMTk
jl_options.mmtk_inline_fastpath = 0;
jl_printf(JL_STDERR, "WARNING: Attempting to set --inline-fastpath without using MMTk");
#endif
else if (!strcmp(optarg,"no"))
jl_options.mmtk_inline_fastpath = 0;
else {
jl_errorf("julia: invalid argument to --inline-fastpath (%s)", optarg);
}
break;
case opt_trace_compile:
jl_options.trace_compile = strdup(optarg);
if (!jl_options.trace_compile)
Expand Down
1 change: 0 additions & 1 deletion src/jloptions.h
Original file line number Diff line number Diff line change
Expand Up @@ -37,7 +37,6 @@ typedef struct {
int8_t warn_overwrite;
int8_t can_inline;
int8_t polly;
int8_t mmtk_inline_fastpath;
const char *trace_compile;
const char *trace_dispatch;
int8_t fast_math;
Expand Down
153 changes: 76 additions & 77 deletions src/llvm-late-gc-lowering.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -2505,19 +2505,9 @@ void LateLowerGCFrame::PlaceRootsAndUpdateCalls(SmallVectorImpl<int> &Colors, St
}
}

#ifdef MMTK_GC
Value* LateLowerGCFrame::lowerGCAllocBytesLate(CallInst *target, Function &F)
{
#ifndef MMTK_GC
// For Julia's stock GC, this option should always be 0
assert(jl_options.mmtk_inline_fastpath == 0);
#endif

// Setting --inline-fastpath=no with MMTk will increase allocation
// overhead a lot, and should only be used for debugging.
if (jl_options.mmtk_inline_fastpath == 0) {
return target;
}

assert(target->arg_size() == 3);

IRBuilder<> builder(target);
Expand All @@ -2535,72 +2525,78 @@ Value* LateLowerGCFrame::lowerGCAllocBytesLate(CallInst *target, Function &F)
auto pool_osize_i32 = ConstantInt::get(Type::getInt32Ty(F.getContext()), osize);
auto pool_osize = ConstantInt::get(Type::getInt64Ty(F.getContext()), osize);

// Assuming we use the first immix allocator.
// FIXME: We should get the allocator index and type from MMTk.
auto allocator_offset = offsetof(jl_tls_states_t, gc_tls) + offsetof(jl_gc_tls_states_t, mmtk_mutator) + offsetof(MMTkMutatorContext, allocators) + offsetof(Allocators, immix);

auto cursor_pos = ConstantInt::get(Type::getInt64Ty(target->getContext()), allocator_offset + offsetof(ImmixAllocator, cursor));
auto limit_pos = ConstantInt::get(Type::getInt64Ty(target->getContext()), allocator_offset + offsetof(ImmixAllocator, limit));

auto cursor_tls_i8 = builder.CreateGEP(Type::getInt8Ty(target->getContext()), ptls, cursor_pos);
auto cursor_ptr = builder.CreateBitCast(cursor_tls_i8, PointerType::get(Type::getInt64Ty(target->getContext()), 0), "cursor_ptr");
auto cursor = builder.CreateLoad(Type::getInt64Ty(target->getContext()), cursor_ptr, "cursor");

// offset = 8
auto delta_offset = builder.CreateNSWSub(ConstantInt::get(Type::getInt64Ty(target->getContext()), 0), ConstantInt::get(Type::getInt64Ty(target->getContext()), 8));
auto delta_cursor = builder.CreateNSWSub(ConstantInt::get(Type::getInt64Ty(target->getContext()), 0), cursor);
auto delta_op = builder.CreateNSWAdd(delta_offset, delta_cursor);
// alignment 16 (15 = 16 - 1)
auto delta = builder.CreateAnd(delta_op, ConstantInt::get(Type::getInt64Ty(target->getContext()), 15), "delta");
auto result = builder.CreateNSWAdd(cursor, delta, "result");

auto new_cursor = builder.CreateNSWAdd(result, pool_osize);

auto limit_tls_i8 = builder.CreateGEP(Type::getInt8Ty(target->getContext()), ptls, limit_pos);
auto limit_ptr = builder.CreateBitCast(limit_tls_i8, PointerType::get(Type::getInt64Ty(target->getContext()), 0), "limit_ptr");
auto limit = builder.CreateLoad(Type::getInt64Ty(target->getContext()), limit_ptr, "limit");

auto gt_limit = builder.CreateICmpSGT(new_cursor, limit);

auto slowpath = BasicBlock::Create(target->getContext(), "slowpath", target->getFunction());
auto fastpath = BasicBlock::Create(target->getContext(), "fastpath", target->getFunction());

auto next_instr = target->getNextNode();
SmallVector<uint32_t, 2> Weights{1, 9};

MDBuilder MDB(F.getContext());
SplitBlockAndInsertIfThenElse(gt_limit, next_instr, &slowpath, &fastpath, false, false, MDB.createBranchWeights(Weights));

builder.SetInsertPoint(next_instr);
auto phiNode = builder.CreatePHI(target->getCalledFunction()->getReturnType(), 2, "phi_fast_slow");

// slowpath
builder.SetInsertPoint(slowpath);
auto pool_offs = ConstantInt::get(Type::getInt32Ty(F.getContext()), 1);
auto new_call = builder.CreateCall(smallAllocFunc, { ptls, pool_offs, pool_osize_i32, type });
new_call->setAttributes(new_call->getCalledFunction()->getAttributes());
builder.CreateBr(next_instr->getParent());

// fastpath
builder.SetInsertPoint(fastpath);
builder.CreateStore(new_cursor, cursor_ptr);

// ptls->gc_tls_common.gc_num.allocd += osize;
auto pool_alloc_pos = ConstantInt::get(Type::getInt64Ty(target->getContext()), offsetof(jl_tls_states_t, gc_tls_common) + offsetof(jl_gc_tls_states_common_t, gc_num));
auto pool_alloc_i8 = builder.CreateGEP(Type::getInt8Ty(target->getContext()), ptls, pool_alloc_pos);
auto pool_alloc_tls = builder.CreateBitCast(pool_alloc_i8, PointerType::get(Type::getInt64Ty(target->getContext()), 0), "pool_alloc");
auto pool_allocd = builder.CreateLoad(Type::getInt64Ty(target->getContext()), pool_alloc_tls);
auto pool_allocd_total = builder.CreateAdd(pool_allocd, pool_osize);
builder.CreateStore(pool_allocd_total, pool_alloc_tls);

auto v_raw = builder.CreateNSWAdd(result, ConstantInt::get(Type::getInt64Ty(target->getContext()), sizeof(jl_taggedvalue_t)));
auto v_as_ptr = builder.CreateIntToPtr(v_raw, smallAllocFunc->getReturnType());
builder.CreateBr(next_instr->getParent());

phiNode->addIncoming(new_call, slowpath);
phiNode->addIncoming(v_as_ptr, fastpath);
phiNode->takeName(target);
return phiNode;
// Should we generate fastpath allocation sequence here? We should always generate fastpath here for MMTk.
// Setting this to false will increase allocation overhead a lot, and should only be used for debugging.
const bool INLINE_FASTPATH_ALLOCATION = true;

if (INLINE_FASTPATH_ALLOCATION) {
// Assuming we use the first immix allocator.
// FIXME: We should get the allocator index and type from MMTk.
auto allocator_offset = offsetof(jl_tls_states_t, gc_tls) + offsetof(jl_gc_tls_states_t, mmtk_mutator) + offsetof(MMTkMutatorContext, allocators) + offsetof(Allocators, immix);

auto cursor_pos = ConstantInt::get(Type::getInt64Ty(target->getContext()), allocator_offset + offsetof(ImmixAllocator, cursor));
auto limit_pos = ConstantInt::get(Type::getInt64Ty(target->getContext()), allocator_offset + offsetof(ImmixAllocator, limit));

auto cursor_tls_i8 = builder.CreateGEP(Type::getInt8Ty(target->getContext()), ptls, cursor_pos);
auto cursor_ptr = builder.CreateBitCast(cursor_tls_i8, PointerType::get(Type::getInt64Ty(target->getContext()), 0), "cursor_ptr");
auto cursor = builder.CreateLoad(Type::getInt64Ty(target->getContext()), cursor_ptr, "cursor");

// offset = 8
auto delta_offset = builder.CreateNSWSub(ConstantInt::get(Type::getInt64Ty(target->getContext()), 0), ConstantInt::get(Type::getInt64Ty(target->getContext()), 8));
auto delta_cursor = builder.CreateNSWSub(ConstantInt::get(Type::getInt64Ty(target->getContext()), 0), cursor);
auto delta_op = builder.CreateNSWAdd(delta_offset, delta_cursor);
// alignment 16 (15 = 16 - 1)
auto delta = builder.CreateAnd(delta_op, ConstantInt::get(Type::getInt64Ty(target->getContext()), 15), "delta");
auto result = builder.CreateNSWAdd(cursor, delta, "result");

auto new_cursor = builder.CreateNSWAdd(result, pool_osize);

auto limit_tls_i8 = builder.CreateGEP(Type::getInt8Ty(target->getContext()), ptls, limit_pos);
auto limit_ptr = builder.CreateBitCast(limit_tls_i8, PointerType::get(Type::getInt64Ty(target->getContext()), 0), "limit_ptr");
auto limit = builder.CreateLoad(Type::getInt64Ty(target->getContext()), limit_ptr, "limit");

auto gt_limit = builder.CreateICmpSGT(new_cursor, limit);

auto slowpath = BasicBlock::Create(target->getContext(), "slowpath", target->getFunction());
auto fastpath = BasicBlock::Create(target->getContext(), "fastpath", target->getFunction());

auto next_instr = target->getNextNode();
SmallVector<uint32_t, 2> Weights{1, 9};

MDBuilder MDB(F.getContext());
SplitBlockAndInsertIfThenElse(gt_limit, next_instr, &slowpath, &fastpath, false, false, MDB.createBranchWeights(Weights));

builder.SetInsertPoint(next_instr);
auto phiNode = builder.CreatePHI(target->getCalledFunction()->getReturnType(), 2, "phi_fast_slow");

// slowpath
builder.SetInsertPoint(slowpath);
auto pool_offs = ConstantInt::get(Type::getInt32Ty(F.getContext()), 1);
auto new_call = builder.CreateCall(smallAllocFunc, { ptls, pool_offs, pool_osize_i32, type });
new_call->setAttributes(new_call->getCalledFunction()->getAttributes());
builder.CreateBr(next_instr->getParent());

// fastpath
builder.SetInsertPoint(fastpath);
builder.CreateStore(new_cursor, cursor_ptr);

// ptls->gc_tls_common.gc_num.allocd += osize;
auto pool_alloc_pos = ConstantInt::get(Type::getInt64Ty(target->getContext()), offsetof(jl_tls_states_t, gc_tls_common) + offsetof(jl_gc_tls_states_common_t, gc_num));
auto pool_alloc_i8 = builder.CreateGEP(Type::getInt8Ty(target->getContext()), ptls, pool_alloc_pos);
auto pool_alloc_tls = builder.CreateBitCast(pool_alloc_i8, PointerType::get(Type::getInt64Ty(target->getContext()), 0), "pool_alloc");
auto pool_allocd = builder.CreateLoad(Type::getInt64Ty(target->getContext()), pool_alloc_tls);
auto pool_allocd_total = builder.CreateAdd(pool_allocd, pool_osize);
builder.CreateStore(pool_allocd_total, pool_alloc_tls);

auto v_raw = builder.CreateNSWAdd(result, ConstantInt::get(Type::getInt64Ty(target->getContext()), sizeof(jl_taggedvalue_t)));
auto v_as_ptr = builder.CreateIntToPtr(v_raw, smallAllocFunc->getReturnType());
builder.CreateBr(next_instr->getParent());

phiNode->addIncoming(new_call, slowpath);
phiNode->addIncoming(v_as_ptr, fastpath);
phiNode->takeName(target);
return phiNode;
}
}
}
return target;
Expand All @@ -2620,6 +2616,7 @@ static void replaceInstruction(
++it;
}
}
#endif

bool LateLowerGCFrame::runOnFunction(Function &F, bool *CFGModified) {
initAll(*F.getParent());
Expand All @@ -2639,6 +2636,7 @@ bool LateLowerGCFrame::runOnFunction(Function &F, bool *CFGModified) {
PlaceRootsAndUpdateCalls(Colors, S, CallFrames);
CleanupIR(F, &S, CFGModified);

#ifdef MMTK_GC
// We lower the julia.gc_alloc_bytes intrinsic in this pass to insert slowpath/fastpath blocks for MMTk
for (BasicBlock &BB : F) {
for (auto it = BB.begin(); it != BB.end();) {
Expand All @@ -2660,6 +2658,7 @@ bool LateLowerGCFrame::runOnFunction(Function &F, bool *CFGModified) {
++it;
}
}
#endif

return true;
}
Expand Down

0 comments on commit b484fe7

Please sign in to comment.