Make jl_cumulative_compile_time_ns global (and reentrant).
Now, multiple tasks (on the same or different threads) can start and stop compilation-time
measurement without interrupting each other.

* Makes jl_cumulative_compile_time_ns into a global, atomic variable.

Instead of keeping per-thread compilation-time counters, this change keeps a single
global counter of compilation time, updated with atomic operations.

Fixes JuliaLang#41739

```julia
julia> include("./compilation-task-migration-17-example.jl")
start thread: 2
end thread: 2
  5.185706 seconds (3.53 M allocations: 2.570 GiB, 7.34% gc time, 15.57% compilation time)

julia> include("./compilation-task-migration-17-example.jl")
start thread: 3
WARNING: replacing module M.
end thread: 1
  4.110316 seconds (18.23 k allocations: 2.391 GiB, 5.67% gc time, 0.24% compilation time)
```

Compilation time measurement was originally added in JuliaLang#38885.

Problems addressed:
- This fixes JuliaLang#41739: compilation time reporting in 1.7, which broke once task migration was enabled.
- It also fixes a race condition that existed even on 1.6, where other Tasks running on the same thread as the `@time` measurement could corrupt it, as identified in (JuliaLang#41271 (comment)).
  - It fixes reentrant `@time` by making the `enable` flag a _counter_ instead of a boolean.
  - It fixes `@time` called from multiple threads by making that flag thread-safe (via atomics). A minimal sketch of the resulting pattern follows below.
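
For illustration, here is a minimal, self-contained sketch of that pattern (a process-wide enable counter plus a global atomic accumulator), written against C11 `<stdatomic.h>` and a POSIX clock rather than the runtime's own `jl_atomic_*` wrappers; the names `measure_enabled`, `cumulative_ns`, and `compile_something` are illustrative, not Julia's:

```c
#include <stdatomic.h>
#include <stdint.h>
#include <stdio.h>
#include <time.h>

static _Atomic uint8_t  measure_enabled = 0;  /* number of currently active measurers */
static _Atomic uint64_t cumulative_ns = 0;    /* total compile time, in nanoseconds */

static uint64_t hrtime_ns(void) {
    struct timespec ts;
    clock_gettime(CLOCK_MONOTONIC, &ts);
    return (uint64_t)ts.tv_sec * 1000000000ull + (uint64_t)ts.tv_nsec;
}

/* Measurer side (what `@time` does): bump the counter, read the running total. */
static uint64_t measure_begin(void) {
    atomic_fetch_add(&measure_enabled, 1);
    return atomic_load(&cumulative_ns);
}

static uint64_t measure_end(void) {
    atomic_fetch_sub(&measure_enabled, 1);
    return atomic_load(&cumulative_ns);
}

/* Compiler side: bracket each compilation and add the elapsed time exactly once.
 * The flag is sampled once so the begin/end decisions stay consistent. */
static void compile_something(void) {
    uint64_t t0 = 0;
    uint8_t enabled = atomic_load(&measure_enabled);
    if (enabled)
        t0 = hrtime_ns();
    /* ... the actual compilation work would happen here ... */
    if (enabled)
        atomic_fetch_add(&cumulative_ns, hrtime_ns() - t0);
}

int main(void) {
    uint64_t before = measure_begin();
    compile_something();
    printf("measured compile time: %llu ns\n",
           (unsigned long long)(measure_end() - before));
    return 0;
}
```

Because both the counter and the accumulator are process-wide atomics, any number of measurers can have windows open at once, and compile sites never need to know who is measuring.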
NHDaly committed Aug 18, 2021
1 parent bdacfa2 commit 619dec8
Showing 8 changed files with 65 additions and 42 deletions.
2 changes: 1 addition & 1 deletion base/timing.jl
@@ -55,7 +55,7 @@ function gc_alloc_count(diff::GC_Diff)
     diff.malloc + diff.realloc + diff.poolalloc + diff.bigalloc
 end
 
-# cumulative total time spent on compilation
+# cumulative total time spent on compilation, in nanoseconds
 cumulative_compile_time_ns_before() = ccall(:jl_cumulative_compile_time_ns_before, UInt64, ())
 cumulative_compile_time_ns_after() = ccall(:jl_cumulative_compile_time_ns_after, UInt64, ())
 
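These two `ccall` entry points are the whole interface `@time` needs. As a hedged sketch of how a C caller (say, an embedder with libjulia already initialized) could use the same exported entry points, with the prototypes written out by hand rather than taken from a public header:

```c
#include <stdint.h>

/* Exported by the runtime (see src/jitlayers.cpp below); declared by hand here
 * because they are not part of the public embedding header. */
extern uint64_t jl_cumulative_compile_time_ns_before(void);
extern uint64_t jl_cumulative_compile_time_ns_after(void);

/* Returns how much compile time accumulated (process-wide) while `work` ran. */
static uint64_t compile_time_around(void (*work)(void)) {
    uint64_t before = jl_cumulative_compile_time_ns_before(); /* increments the enable counter */
    work();
    return jl_cumulative_compile_time_ns_after() - before;    /* decrements it and reads the total */
}
```

Note that the counter is global, so the window also includes compilation triggered by other tasks and threads during that time.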
16 changes: 8 additions & 8 deletions src/aotcompile.cpp
@@ -286,8 +286,8 @@ void *jl_create_native(jl_array_t *methods, const jl_cgparams_t cgparams, int _p
     JL_GC_PUSH1(&src);
     JL_LOCK(&codegen_lock);
     uint64_t compiler_start_time = 0;
-    int tid = jl_threadid();
-    if (jl_measure_compile_time[tid])
+    uint8_t measure_compile_time_enabled = jl_atomic_load(&jl_measure_compile_time_enabled);
+    if (measure_compile_time_enabled)
         compiler_start_time = jl_hrtime();
 
     CompilationPolicy policy = (CompilationPolicy) _policy;
@@ -415,8 +415,8 @@ void *jl_create_native(jl_array_t *methods, const jl_cgparams_t cgparams, int _p
     }
 
     data->M = std::move(clone);
-    if (jl_measure_compile_time[tid])
-        jl_cumulative_compile_time[tid] += (jl_hrtime() - compiler_start_time);
+    if (measure_compile_time_enabled)
+        jl_atomic_fetch_add(&jl_cumulative_compile_time, (jl_hrtime() - compiler_start_time));
     if (policy == CompilationPolicy::ImagingMode)
         imaging_mode = 0;
     JL_UNLOCK(&codegen_lock); // Might GC
@@ -916,8 +916,8 @@ void *jl_get_llvmf_defn(jl_method_instance_t *mi, size_t world, char getwrapper,
     jl_llvm_functions_t decls;
     JL_LOCK(&codegen_lock);
     uint64_t compiler_start_time = 0;
-    int tid = jl_threadid();
-    if (jl_measure_compile_time[tid])
+    uint8_t measure_compile_time_enabled = jl_atomic_load(&jl_measure_compile_time_enabled);
+    if (measure_compile_time_enabled)
         compiler_start_time = jl_hrtime();
     std::tie(m, decls) = jl_emit_code(mi, src, jlrettype, output);
 
@@ -942,8 +942,8 @@ void *jl_get_llvmf_defn(jl_method_instance_t *mi, size_t world, char getwrapper,
         m.release(); // the return object `llvmf` will be the owning pointer
     }
     JL_GC_POP();
-    if (jl_measure_compile_time[tid])
-        jl_cumulative_compile_time[tid] += (jl_hrtime() - compiler_start_time);
+    if (measure_compile_time_enabled)
+        jl_atomic_fetch_add(&jl_cumulative_compile_time, (jl_hrtime() - compiler_start_time));
     JL_UNLOCK(&codegen_lock); // Might GC
     if (F)
         return F;
12 changes: 8 additions & 4 deletions src/gf.c
@@ -3160,19 +3160,23 @@ int jl_has_concrete_subtype(jl_value_t *typ)
 #define typeinf_lock codegen_lock
 
 static uint64_t inference_start_time = 0;
+static uint8_t inference_is_measuring_compile_time = 0;
 
 JL_DLLEXPORT void jl_typeinf_begin(void)
 {
     JL_LOCK(&typeinf_lock);
-    if (jl_measure_compile_time[jl_threadid()])
+    if (jl_atomic_load(&jl_measure_compile_time_enabled)) {
         inference_start_time = jl_hrtime();
+        inference_is_measuring_compile_time = 1;
+    }
 }
 
 JL_DLLEXPORT void jl_typeinf_end(void)
 {
-    int tid = jl_threadid();
-    if (typeinf_lock.count == 1 && jl_measure_compile_time[tid])
-        jl_cumulative_compile_time[tid] += (jl_hrtime() - inference_start_time);
+    if (typeinf_lock.count == 1 && inference_is_measuring_compile_time) {
+        jl_atomic_fetch_add(&jl_cumulative_compile_time, (jl_hrtime() - inference_start_time));
+        inference_is_measuring_compile_time = 0;
+    }
     JL_UNLOCK(&typeinf_lock);
 }

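The detail worth noting above is that `jl_typeinf_begin` latches its decision in `inference_is_measuring_compile_time`, so `jl_typeinf_end` only adds elapsed time when a start timestamp was actually recorded, even if the global enable counter changes in between. A hedged sketch of that latch pattern, reusing the illustrative names from the sketch in the commit message (this is not the runtime code itself):

```c
#include <stdatomic.h>
#include <stdint.h>

extern _Atomic uint8_t  measure_enabled;  /* illustrative global enable counter */
extern _Atomic uint64_t cumulative_ns;    /* illustrative global total */
extern uint64_t hrtime_ns(void);

static uint64_t inference_start_ns = 0;
static uint8_t  inference_is_measuring = 0;

static void typeinf_begin_sketch(void) {
    if (atomic_load(&measure_enabled)) {
        inference_start_ns = hrtime_ns();
        inference_is_measuring = 1;   /* latch: a valid start timestamp was taken */
    }
}

static void typeinf_end_sketch(void) {
    if (inference_is_measuring) {     /* add only if begin latched a start */
        atomic_fetch_add(&cumulative_ns, hrtime_ns() - inference_start_ns);
        inference_is_measuring = 0;
    }
}
```

(In the runtime the begin/end pair is additionally serialized by `typeinf_lock`, which is why a plain non-atomic latch is enough.)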
44 changes: 22 additions & 22 deletions src/jitlayers.cpp
@@ -78,16 +78,16 @@ void jl_jit_globals(std::map<void *, GlobalVariable*> &globals)
 extern "C" JL_DLLEXPORT
 uint64_t jl_cumulative_compile_time_ns_before()
 {
-    int tid = jl_threadid();
-    jl_measure_compile_time[tid] = 1;
-    return jl_cumulative_compile_time[tid];
+    // Increment the flag to allow reentrant callers to `@time`.
+    jl_atomic_fetch_add(&jl_measure_compile_time_enabled, 1);
+    return jl_atomic_load(&jl_cumulative_compile_time);
 }
 extern "C" JL_DLLEXPORT
 uint64_t jl_cumulative_compile_time_ns_after()
 {
-    int tid = jl_threadid();
-    jl_measure_compile_time[tid] = 0;
-    return jl_cumulative_compile_time[tid];
+    // Decrement the flag when done measuring, allowing other callers to continue measuring.
+    jl_atomic_fetch_add(&jl_measure_compile_time_enabled, -1);
+    return jl_atomic_load(&jl_cumulative_compile_time);
 }
 
 // this generates llvm code for the lambda info
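Because the enable flag is now a counter, nested or overlapping before/after pairs compose instead of clobbering each other. A hedged, hypothetical sketch of that (hand-declared prototypes; the counter values in the comments assume no other concurrent measurers):

```c
#include <stdint.h>

extern uint64_t jl_cumulative_compile_time_ns_before(void);
extern uint64_t jl_cumulative_compile_time_ns_after(void);

static void nested_measurement_sketch(void) {
    uint64_t outer0 = jl_cumulative_compile_time_ns_before(); /* counter: 0 -> 1 */
    uint64_t inner0 = jl_cumulative_compile_time_ns_before(); /* counter: 1 -> 2 */
    /* ... anything that triggers compilation ... */
    uint64_t inner_ns = jl_cumulative_compile_time_ns_after() - inner0; /* counter: 2 -> 1 */
    uint64_t outer_ns = jl_cumulative_compile_time_ns_after() - outer0; /* counter: 1 -> 0 */
    (void)inner_ns;
    (void)outer_ns;
}
```

With the old boolean-per-thread flag, the inner `_after` call would have switched measurement off while the outer window was still open.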
@@ -233,8 +233,8 @@ int jl_compile_extern_c(void *llvmmod, void *p, void *sysimg, jl_value_t *declrt
 {
     JL_LOCK(&codegen_lock);
     uint64_t compiler_start_time = 0;
-    int tid = jl_threadid();
-    if (jl_measure_compile_time[tid])
+    uint8_t measure_compile_time_enabled = jl_atomic_load(&jl_measure_compile_time_enabled);
+    if (measure_compile_time_enabled)
         compiler_start_time = jl_hrtime();
     jl_codegen_params_t params;
     jl_codegen_params_t *pparams = (jl_codegen_params_t*)p;
@@ -258,8 +258,8 @@ int jl_compile_extern_c(void *llvmmod, void *p, void *sysimg, jl_value_t *declrt
         if (success && llvmmod == NULL)
             jl_add_to_ee(std::unique_ptr<Module>(into));
     }
-    if (codegen_lock.count == 1 && jl_measure_compile_time[tid])
-        jl_cumulative_compile_time[tid] += (jl_hrtime() - compiler_start_time);
+    if (codegen_lock.count == 1 && measure_compile_time_enabled)
+        jl_atomic_fetch_add(&jl_cumulative_compile_time, (jl_hrtime() - compiler_start_time));
     JL_UNLOCK(&codegen_lock);
     return success;
 }
@@ -315,8 +315,8 @@ jl_code_instance_t *jl_generate_fptr(jl_method_instance_t *mi JL_PROPAGATES_ROOT
 {
     JL_LOCK(&codegen_lock); // also disables finalizers, to prevent any unexpected recursion
     uint64_t compiler_start_time = 0;
-    int tid = jl_threadid();
-    if (jl_measure_compile_time[tid])
+    uint8_t measure_compile_time_enabled = jl_atomic_load(&jl_measure_compile_time_enabled);
+    if (measure_compile_time_enabled)
         compiler_start_time = jl_hrtime();
     // if we don't have any decls already, try to generate it now
     jl_code_info_t *src = NULL;
@@ -354,8 +354,8 @@ jl_code_instance_t *jl_generate_fptr(jl_method_instance_t *mi JL_PROPAGATES_ROOT
     else {
         codeinst = NULL;
     }
-    if (codegen_lock.count == 1 && jl_measure_compile_time[tid])
-        jl_cumulative_compile_time[tid] += (jl_hrtime() - compiler_start_time);
+    if (codegen_lock.count == 1 && measure_compile_time_enabled)
+        jl_atomic_fetch_add(&jl_cumulative_compile_time, (jl_hrtime() - compiler_start_time));
     JL_UNLOCK(&codegen_lock);
     JL_GC_POP();
     return codeinst;
@@ -369,8 +369,8 @@ void jl_generate_fptr_for_unspecialized(jl_code_instance_t *unspec)
     }
     JL_LOCK(&codegen_lock);
     uint64_t compiler_start_time = 0;
-    int tid = jl_threadid();
-    if (jl_measure_compile_time[tid])
+    uint8_t measure_compile_time_enabled = jl_atomic_load(&jl_measure_compile_time_enabled);
+    if (measure_compile_time_enabled)
         compiler_start_time = jl_hrtime();
     if (unspec->invoke == NULL) {
         jl_code_info_t *src = NULL;
@@ -398,8 +398,8 @@ void jl_generate_fptr_for_unspecialized(jl_code_instance_t *unspec)
         }
         JL_GC_POP();
     }
-    if (codegen_lock.count == 1 && jl_measure_compile_time[tid])
-        jl_cumulative_compile_time[tid] += (jl_hrtime() - compiler_start_time);
+    if (codegen_lock.count == 1 && measure_compile_time_enabled)
+        jl_atomic_fetch_add(&jl_cumulative_compile_time, (jl_hrtime() - compiler_start_time));
     JL_UNLOCK(&codegen_lock); // Might GC
 }
 
@@ -422,8 +422,8 @@ jl_value_t *jl_dump_method_asm(jl_method_instance_t *mi, size_t world,
     // so create an exception here so we can print pretty our lies
     JL_LOCK(&codegen_lock); // also disables finalizers, to prevent any unexpected recursion
     uint64_t compiler_start_time = 0;
-    int tid = jl_threadid();
-    if (jl_measure_compile_time[tid])
+    uint8_t measure_compile_time_enabled = jl_atomic_load(&jl_measure_compile_time_enabled);
+    if (measure_compile_time_enabled)
         compiler_start_time = jl_hrtime();
     specfptr = (uintptr_t)codeinst->specptr.fptr;
     if (specfptr == 0) {
@@ -448,8 +448,8 @@ jl_value_t *jl_dump_method_asm(jl_method_instance_t *mi, size_t world,
         }
         JL_GC_POP();
     }
-    if (jl_measure_compile_time[tid])
-        jl_cumulative_compile_time[tid] += (jl_hrtime() - compiler_start_time);
+    if (measure_compile_time_enabled)
+        jl_atomic_fetch_add(&jl_cumulative_compile_time, (jl_hrtime() - compiler_start_time));
     JL_UNLOCK(&codegen_lock);
 }
 if (specfptr != 0)
5 changes: 3 additions & 2 deletions src/julia_internal.h
@@ -154,8 +154,9 @@ static inline uint64_t cycleclock(void)
 
 #include "timing.h"
 
-extern uint8_t *jl_measure_compile_time;
-extern uint64_t *jl_cumulative_compile_time;
+// Global *atomic* integers controlling *process-wide* measurement of compilation time.
+extern uint8_t jl_measure_compile_time_enabled;
+extern uint64_t jl_cumulative_compile_time;
 
 #ifdef _COMPILER_MICROSOFT_
 # define jl_return_address() ((uintptr_t)_ReturnAddress())
5 changes: 4 additions & 1 deletion src/task.c
@@ -561,7 +561,10 @@ static void JL_NORETURN throw_internal(jl_task_t *ct, jl_value_t *exception JL_M
     ptls->io_wait = 0;
     // @time needs its compile timer disabled on error,
     // and cannot use a try-finally as it would break scope for assignments
-    jl_measure_compile_time[ptls->tid] = 0;
+    // We blindly disable compilation time tracking here, for all running Tasks, even though
+    // it may cause some incorrect measurements. This is a known bug, and is being tracked
+    // here: https://github.com/JuliaLang/julia/pull/39138
+    jl_atomic_store_release(&jl_measure_compile_time_enabled, 0);
     JL_GC_PUSH1(&exception);
     jl_gc_unsafe_enter(ptls);
     if (exception) {
6 changes: 2 additions & 4 deletions src/threading.c
@@ -287,8 +287,8 @@ void jl_pgcstack_getkey(jl_get_pgcstack_func **f, jl_pgcstack_key_t *k)
 #endif
 
 jl_ptls_t *jl_all_tls_states JL_GLOBALLY_ROOTED;
-uint8_t *jl_measure_compile_time = NULL;
-uint64_t *jl_cumulative_compile_time = NULL;
+uint8_t jl_measure_compile_time_enabled = 0;
+uint64_t jl_cumulative_compile_time = 0;
 
 // return calling thread's ID
 // Also update the suspended_threads list in signals-mach when changing the
@@ -467,8 +467,6 @@ void jl_init_threading(void)
     }
     if (jl_n_threads <= 0)
         jl_n_threads = 1;
-    jl_measure_compile_time = (uint8_t*)calloc(jl_n_threads, sizeof(*jl_measure_compile_time));
-    jl_cumulative_compile_time = (uint64_t*)calloc(jl_n_threads, sizeof(*jl_cumulative_compile_time));
 #ifndef __clang_analyzer__
     jl_all_tls_states = (jl_ptls_t*)calloc(jl_n_threads, sizeof(void*));
 #endif
17 changes: 17 additions & 0 deletions test/misc.jl
@@ -263,6 +263,23 @@ function timev_macro_scope()
 end
 @test timev_macro_scope() == 1
 
+before = Base.cumulative_compile_time_ns_before();
+
+# exercise concurrent calls to `@time` for reentrant compilation time measurement.
+t1 = @async @time begin
+    sleep(2)
+    @eval module M ; f(x,y) = x+y ; end
+    @eval M.f(2,3)
+end
+t2 = @async begin
+    sleep(1)
+    @time 2 + 2
+end
+
+after = Base.cumulative_compile_time_ns_after();
+@test after >= before;
+@test Base.cumulative_compile_time_ns() >= after;
+
 # interactive utilities
 
 struct ambigconvert; end # inject a problematic `convert` method to ensure it still works
