Make jl_cumulative_compile_time_ns global (and reentrant).
Now, multiple tasks (on the same or different threads) can start and stop compilation time measurement without interrupting each other.

* Makes jl_cumulative_compile_time_ns into a global, atomic variable.

Instead of keeping per-thread compilation-time counters, this change keeps a single global counter of compilation time, updated with atomic operations.
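
To make the pattern concrete, here is a minimal sketch in plain C11 `stdatomic` rather than the runtime's `jl_atomic_*` wrappers; the names (`compile_time_ns`, `record_compile_time`) are illustrative only, not the actual symbols added by this commit:

```c
#include <stdatomic.h>
#include <stdint.h>

/* One process-wide counter that any thread may bump; conceptually this
 * replaces the old per-thread uint64_t array indexed by jl_threadid(). */
static _Atomic uint64_t compile_time_ns;

void record_compile_time(uint64_t start_ns, uint64_t end_ns)
{
    /* Relaxed ordering suffices: only the accumulated total is needed. */
    atomic_fetch_add_explicit(&compile_time_ns, end_ns - start_ns,
                              memory_order_relaxed);
}
```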

Fixes #41739

```julia
julia> include("./compilation-task-migration-17-example.jl")
start thread: 2
end thread: 2
  5.185706 seconds (3.53 M allocations: 2.570 GiB, 7.34% gc time, 15.57% compilation time)

julia> include("./compilation-task-migration-17-example.jl")
start thread: 3
WARNING: replacing module M.
end thread: 1
  4.110316 seconds (18.23 k allocations: 2.391 GiB, 5.67% gc time, 0.24% compilation time)
```

Compilation time measurement was originally added in #38885.

Problems addressed:
- Fixes #41739: compilation time reporting on 1.7 broke once task migration was enabled.
- Also fixes the race condition that existed previously, even on 1.6, where multiple Tasks running on the thread measuring `@time` could corrupt the measurement, as identified in #41271 (comment).
  - Reentrant `@time` now works because the `enable` flag is a _counter_ instead of a boolean.
  - `@time` called from multiple threads now works because that flag is updated atomically, making it thread-safe; see the sketch below.
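
To illustrate the counter-based flag, here is a hedged sketch, again in plain C11 `stdatomic` with illustrative names (`measure_enabled`, `measurement_begin`/`measurement_end`), not the runtime's actual `jl_cumulative_compile_time_ns_before`/`_after` implementations:

```c
#include <stdatomic.h>
#include <stdint.h>

static _Atomic uint8_t  measure_enabled;   /* 0 = off; >0 = N callers measuring */
static _Atomic uint64_t total_compile_ns;  /* the global cumulative counter */

uint64_t measurement_begin(void)
{
    /* Increment rather than set to 1, so nested or concurrent @time calls
     * cannot switch measurement off underneath each other. */
    atomic_fetch_add(&measure_enabled, 1);
    return atomic_load_explicit(&total_compile_ns, memory_order_relaxed);
}

uint64_t measurement_end(void)
{
    /* Decrement; measurement only stops once the last caller finishes. */
    atomic_fetch_sub(&measure_enabled, 1);
    return atomic_load_explicit(&total_compile_ns, memory_order_relaxed);
}
```

Each `@time` invocation brackets its region with a begin/end pair and reports the difference between the two returned totals.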
NHDaly committed Aug 23, 2021
1 parent 9cad1e0 commit b4ca196
Showing 8 changed files with 64 additions and 42 deletions.
base/timing.jl: 2 changes (1 addition, 1 deletion)
@@ -55,7 +55,7 @@ function gc_alloc_count(diff::GC_Diff)
diff.malloc + diff.realloc + diff.poolalloc + diff.bigalloc
end

- # cumulative total time spent on compilation
+ # cumulative total time spent on compilation, in nanoseconds
cumulative_compile_time_ns_before() = ccall(:jl_cumulative_compile_time_ns_before, UInt64, ())
cumulative_compile_time_ns_after() = ccall(:jl_cumulative_compile_time_ns_after, UInt64, ())

src/aotcompile.cpp: 16 changes (8 additions, 8 deletions)
@@ -278,8 +278,8 @@ void *jl_create_native(jl_array_t *methods, const jl_cgparams_t cgparams, int _p
JL_GC_PUSH1(&src);
JL_LOCK(&codegen_lock);
uint64_t compiler_start_time = 0;
- int tid = jl_threadid();
- if (jl_measure_compile_time[tid])
+ uint8_t measure_compile_time_enabled = jl_atomic_load_relaxed(&jl_measure_compile_time_enabled);
+ if (measure_compile_time_enabled)
compiler_start_time = jl_hrtime();

CompilationPolicy policy = (CompilationPolicy) _policy;
@@ -407,8 +407,8 @@ void *jl_create_native(jl_array_t *methods, const jl_cgparams_t cgparams, int _p
}

data->M = std::move(clone);
- if (jl_measure_compile_time[tid])
-     jl_cumulative_compile_time[tid] += (jl_hrtime() - compiler_start_time);
+ if (measure_compile_time_enabled)
+     jl_atomic_fetch_add_relaxed(&jl_cumulative_compile_time, (jl_hrtime() - compiler_start_time));
if (policy == CompilationPolicy::ImagingMode)
imaging_mode = 0;
JL_UNLOCK(&codegen_lock); // Might GC
@@ -908,8 +908,8 @@ void *jl_get_llvmf_defn(jl_method_instance_t *mi, size_t world, char getwrapper,
jl_llvm_functions_t decls;
JL_LOCK(&codegen_lock);
uint64_t compiler_start_time = 0;
- int tid = jl_threadid();
- if (jl_measure_compile_time[tid])
+ uint8_t measure_compile_time_enabled = jl_atomic_load_relaxed(&jl_measure_compile_time_enabled);
+ if (measure_compile_time_enabled)
compiler_start_time = jl_hrtime();
std::tie(m, decls) = jl_emit_code(mi, src, jlrettype, output);

@@ -934,8 +934,8 @@ void *jl_get_llvmf_defn(jl_method_instance_t *mi, size_t world, char getwrapper,
m.release(); // the return object `llvmf` will be the owning pointer
}
JL_GC_POP();
- if (jl_measure_compile_time[tid])
-     jl_cumulative_compile_time[tid] += (jl_hrtime() - compiler_start_time);
+ if (measure_compile_time_enabled)
+     jl_atomic_fetch_add_relaxed(&jl_cumulative_compile_time, (jl_hrtime() - compiler_start_time));
JL_UNLOCK(&codegen_lock); // Might GC
if (F)
return F;
src/gf.c: 12 changes (8 additions, 4 deletions)
@@ -3160,19 +3160,23 @@ int jl_has_concrete_subtype(jl_value_t *typ)
#define typeinf_lock codegen_lock

static uint64_t inference_start_time = 0;
+ static uint8_t inference_is_measuring_compile_time = 0;

JL_DLLEXPORT void jl_typeinf_begin(void)
{
JL_LOCK(&typeinf_lock);
- if (jl_measure_compile_time[jl_threadid()])
+ if (jl_atomic_load_relaxed(&jl_measure_compile_time_enabled)) {
      inference_start_time = jl_hrtime();
+     inference_is_measuring_compile_time = 1;
+ }
}

JL_DLLEXPORT void jl_typeinf_end(void)
{
- int tid = jl_threadid();
- if (typeinf_lock.count == 1 && jl_measure_compile_time[tid])
-     jl_cumulative_compile_time[tid] += (jl_hrtime() - inference_start_time);
+ if (typeinf_lock.count == 1 && inference_is_measuring_compile_time) {
+     jl_atomic_fetch_add_relaxed(&jl_cumulative_compile_time, (jl_hrtime() - inference_start_time));
+     inference_is_measuring_compile_time = 0;
+ }
JL_UNLOCK(&typeinf_lock);
}

src/jitlayers.cpp: 44 changes (22 additions, 22 deletions)
@@ -76,16 +76,16 @@ void jl_jit_globals(std::map<void *, GlobalVariable*> &globals)
extern "C" JL_DLLEXPORT
uint64_t jl_cumulative_compile_time_ns_before()
{
- int tid = jl_threadid();
- jl_measure_compile_time[tid] = 1;
- return jl_cumulative_compile_time[tid];
+ // Increment the flag to allow reentrant callers to `@time`.
+ jl_atomic_fetch_add(&jl_measure_compile_time_enabled, 1);
+ return jl_atomic_load_relaxed(&jl_cumulative_compile_time);
}
extern "C" JL_DLLEXPORT
uint64_t jl_cumulative_compile_time_ns_after()
{
- int tid = jl_threadid();
- jl_measure_compile_time[tid] = 0;
- return jl_cumulative_compile_time[tid];
+ // Decrement the flag when done measuring, allowing other callers to continue measuring.
+ jl_atomic_fetch_add(&jl_measure_compile_time_enabled, -1);
+ return jl_atomic_load_relaxed(&jl_cumulative_compile_time);
}

// this generates llvm code for the lambda info
@@ -231,8 +231,8 @@ int jl_compile_extern_c(void *llvmmod, void *p, void *sysimg, jl_value_t *declrt
{
JL_LOCK(&codegen_lock);
uint64_t compiler_start_time = 0;
- int tid = jl_threadid();
- if (jl_measure_compile_time[tid])
+ uint8_t measure_compile_time_enabled = jl_atomic_load_relaxed(&jl_measure_compile_time_enabled);
+ if (measure_compile_time_enabled)
compiler_start_time = jl_hrtime();
jl_codegen_params_t params;
jl_codegen_params_t *pparams = (jl_codegen_params_t*)p;
@@ -256,8 +256,8 @@ int jl_compile_extern_c(void *llvmmod, void *p, void *sysimg, jl_value_t *declrt
if (success && llvmmod == NULL)
jl_add_to_ee(std::unique_ptr<Module>(into));
}
- if (codegen_lock.count == 1 && jl_measure_compile_time[tid])
-     jl_cumulative_compile_time[tid] += (jl_hrtime() - compiler_start_time);
+ if (codegen_lock.count == 1 && measure_compile_time_enabled)
+     jl_atomic_fetch_add_relaxed(&jl_cumulative_compile_time, (jl_hrtime() - compiler_start_time));
JL_UNLOCK(&codegen_lock);
return success;
}
@@ -313,8 +313,8 @@ jl_code_instance_t *jl_generate_fptr(jl_method_instance_t *mi JL_PROPAGATES_ROOT
{
JL_LOCK(&codegen_lock); // also disables finalizers, to prevent any unexpected recursion
uint64_t compiler_start_time = 0;
- int tid = jl_threadid();
- if (jl_measure_compile_time[tid])
+ uint8_t measure_compile_time_enabled = jl_atomic_load_relaxed(&jl_measure_compile_time_enabled);
+ if (measure_compile_time_enabled)
compiler_start_time = jl_hrtime();
// if we don't have any decls already, try to generate it now
jl_code_info_t *src = NULL;
@@ -352,8 +352,8 @@ jl_code_instance_t *jl_generate_fptr(jl_method_instance_t *mi JL_PROPAGATES_ROOT
else {
codeinst = NULL;
}
- if (codegen_lock.count == 1 && jl_measure_compile_time[tid])
-     jl_cumulative_compile_time[tid] += (jl_hrtime() - compiler_start_time);
+ if (codegen_lock.count == 1 && measure_compile_time_enabled)
+     jl_atomic_fetch_add_relaxed(&jl_cumulative_compile_time, (jl_hrtime() - compiler_start_time));
JL_UNLOCK(&codegen_lock);
JL_GC_POP();
return codeinst;
@@ -367,8 +367,8 @@ void jl_generate_fptr_for_unspecialized(jl_code_instance_t *unspec)
}
JL_LOCK(&codegen_lock);
uint64_t compiler_start_time = 0;
- int tid = jl_threadid();
- if (jl_measure_compile_time[tid])
+ uint8_t measure_compile_time_enabled = jl_atomic_load_relaxed(&jl_measure_compile_time_enabled);
+ if (measure_compile_time_enabled)
compiler_start_time = jl_hrtime();
if (unspec->invoke == NULL) {
jl_code_info_t *src = NULL;
@@ -396,8 +396,8 @@ void jl_generate_fptr_for_unspecialized(jl_code_instance_t *unspec)
}
JL_GC_POP();
}
- if (codegen_lock.count == 1 && jl_measure_compile_time[tid])
-     jl_cumulative_compile_time[tid] += (jl_hrtime() - compiler_start_time);
+ if (codegen_lock.count == 1 && measure_compile_time_enabled)
+     jl_atomic_fetch_add_relaxed(&jl_cumulative_compile_time, (jl_hrtime() - compiler_start_time));
JL_UNLOCK(&codegen_lock); // Might GC
}

@@ -420,8 +420,8 @@ jl_value_t *jl_dump_method_asm(jl_method_instance_t *mi, size_t world,
// so create an exception here so we can print pretty our lies
JL_LOCK(&codegen_lock); // also disables finalizers, to prevent any unexpected recursion
uint64_t compiler_start_time = 0;
- int tid = jl_threadid();
- if (jl_measure_compile_time[tid])
+ uint8_t measure_compile_time_enabled = jl_atomic_load_relaxed(&jl_measure_compile_time_enabled);
+ if (measure_compile_time_enabled)
compiler_start_time = jl_hrtime();
specfptr = (uintptr_t)codeinst->specptr.fptr;
if (specfptr == 0) {
@@ -446,8 +446,8 @@ jl_value_t *jl_dump_method_asm(jl_method_instance_t *mi, size_t world,
}
JL_GC_POP();
}
- if (jl_measure_compile_time[tid])
-     jl_cumulative_compile_time[tid] += (jl_hrtime() - compiler_start_time);
+ if (measure_compile_time_enabled)
+     jl_atomic_fetch_add_relaxed(&jl_cumulative_compile_time, (jl_hrtime() - compiler_start_time));
JL_UNLOCK(&codegen_lock);
}
if (specfptr != 0)
src/julia_internal.h: 5 changes (3 additions, 2 deletions)
@@ -154,8 +154,9 @@ static inline uint64_t cycleclock(void)

#include "timing.h"

- extern uint8_t *jl_measure_compile_time;
- extern uint64_t *jl_cumulative_compile_time;
+ // Global *atomic* integers controlling *process-wide* measurement of compilation time.
+ extern uint8_t jl_measure_compile_time_enabled;
+ extern uint64_t jl_cumulative_compile_time;

#ifdef _COMPILER_MICROSOFT_
# define jl_return_address() ((uintptr_t)_ReturnAddress())
src/task.c: 5 changes (4 additions, 1 deletion)
@@ -562,7 +562,10 @@ static void JL_NORETURN throw_internal(jl_task_t *ct, jl_value_t *exception JL_M
ptls->io_wait = 0;
// @time needs its compile timer disabled on error,
// and cannot use a try-finally as it would break scope for assignments
- jl_measure_compile_time[ptls->tid] = 0;
+ // We blindly disable compilation time tracking here, for all running Tasks, even though
+ // it may cause some incorrect measurements. This is a known bug, and is being tracked
+ // here: https://github.com/JuliaLang/julia/pull/39138
+ jl_atomic_store_relaxed(&jl_measure_compile_time_enabled, 0);
JL_GC_PUSH1(&exception);
jl_gc_unsafe_enter(ptls);
if (exception) {
src/threading.c: 6 changes (2 additions, 4 deletions)
@@ -287,8 +287,8 @@ void jl_pgcstack_getkey(jl_get_pgcstack_func **f, jl_pgcstack_key_t *k)
#endif

jl_ptls_t *jl_all_tls_states JL_GLOBALLY_ROOTED;
- uint8_t *jl_measure_compile_time = NULL;
- uint64_t *jl_cumulative_compile_time = NULL;
+ uint8_t jl_measure_compile_time_enabled = 0;
+ uint64_t jl_cumulative_compile_time = 0;

// return calling thread's ID
// Also update the suspended_threads list in signals-mach when changing the
@@ -467,8 +467,6 @@ void jl_init_threading(void)
}
if (jl_n_threads <= 0)
jl_n_threads = 1;
- jl_measure_compile_time = (uint8_t*)calloc(jl_n_threads, sizeof(*jl_measure_compile_time));
- jl_cumulative_compile_time = (uint64_t*)calloc(jl_n_threads, sizeof(*jl_cumulative_compile_time));
#ifndef __clang_analyzer__
jl_all_tls_states = (jl_ptls_t*)calloc(jl_n_threads, sizeof(void*));
#endif
test/misc.jl: 16 changes (16 additions, 0 deletions)
@@ -263,6 +263,22 @@ function timev_macro_scope()
end
@test timev_macro_scope() == 1

+ before = Base.cumulative_compile_time_ns_before();
+
+ # exercise concurrent calls to `@time` for reentrant compilation time measurement.
+ t1 = @async @time begin
+     sleep(2)
+     @eval module M ; f(x,y) = x+y ; end
+     @eval M.f(2,3)
+ end
+ t2 = @async begin
+     sleep(1)
+     @time 2 + 2
+ end
+
+ after = Base.cumulative_compile_time_ns_after();
+ @test after >= before;

# interactive utilities

struct ambigconvert; end # inject a problematic `convert` method to ensure it still works
