diff --git a/base/options.jl b/base/options.jl index f535c27d99122b..7fb7f8322e1545 100644 --- a/base/options.jl +++ b/base/options.jl @@ -60,6 +60,7 @@ struct JLOptions heap_size_hint::UInt64 trace_compile_timing::Int8 trim::Int8 + task_timing::Int8 end # This runs early in the sysimage != is not defined yet diff --git a/base/task.jl b/base/task.jl index f5abec70cb9d0f..77631b19393887 100644 --- a/base/task.jl +++ b/base/task.jl @@ -985,7 +985,7 @@ end function schedule(t::Task) # user_time -task-(re)scheduled-> wait_time - if t.first_enqueued_at == 0 + if t.is_timing_enabled && t.first_enqueued_at == 0 t.first_enqueued_at = time_ns() end enq_work(t) @@ -1036,7 +1036,7 @@ function schedule(t::Task, @nospecialize(arg); error=false) # schedule a task to be (re)started with the given value or exception t._state === task_state_runnable || Base.error("schedule: Task not runnable") # user_time -task-(re)scheduled-> wait_time - if t.first_enqueued_at == 0 + if t.is_timing_enabled && t.first_enqueued_at == 0 t.first_enqueued_at = time_ns() end if error @@ -1107,7 +1107,7 @@ function yieldto(t::Task, @nospecialize(x=nothing)) elseif t._state === task_state_failed throw(t.result) end - if t.first_enqueued_at == 0 + if t.is_timing_enabled && t.first_enqueued_at == 0 t.first_enqueued_at = time_ns() end t.result = x @@ -1125,8 +1125,10 @@ function try_yieldto(undo) ct = current_task() # scheduler -task-started-> user # scheduler -task-resumed-> user - # @assert ct.last_dequeued_at == 0 - ct.last_dequeued_at = time_ns() + if ct.is_timing_enabled + # @assert ct.last_dequeued_at == 0 + ct.last_dequeued_at = time_ns() + end if ct._isexception exc = ct.result ct.result = nothing @@ -1211,14 +1213,18 @@ else end function record_cpu_time!(t::Task, stopped_at::UInt64=time_ns()) - @assert t.last_dequeued_at != 0 - t.cpu_time_ns += stopped_at - t.last_dequeued_at - t.last_dequeued_at = 0 + if t.is_timing_enabled + @assert t.last_dequeued_at != 0 + t.cpu_time_ns += stopped_at - t.last_dequeued_at + t.last_dequeued_at = 0 + end return t end function record_wall_time!(t::Task, done_at::UInt64=time_ns()) - @assert t.first_enqueued_at != 0 - t.wall_time_ns = done_at - t.first_enqueued_at + if t.is_timing_enabled + @assert t.first_enqueued_at != 0 + t.wall_time_ns = done_at - t.first_enqueued_at + end return t end diff --git a/base/timing.jl b/base/timing.jl index d8679a12ffe6da..2a6b139989d275 100644 --- a/base/timing.jl +++ b/base/timing.jl @@ -91,6 +91,9 @@ Return the total nanoseconds that the task `t` has spent running. This metric is only updated when the task yields or completes. See also [`task_wall_time_ns`](@ref). +Will be `UInt64(0)` if task timings are not enabled. +See [`Base.task_timing`](@ref). + !!! note "This metric is from the Julia scheduler" A task may be running on an OS thread that is descheduled by the OS scheduler, this time still counts towards the metric. @@ -110,14 +113,35 @@ This is the time since the task entered the run queue until the time at which it or until the current time if the task has not yet completed. See also [`task_cpu_time_ns`](@ref). +Will be `UInt64(0)` if task timings are not enabled. +See [`Base.task_timing`](@ref). + !!! compat "Julia 1.12" This method was added in Julia 1.12. """ function task_wall_time_ns(t::Task) + # TODO: report up til current time if not done? too racy? # return istaskdone(t) ? t.wall_time_ns : time_ns() - t.first_enqueued_at return t.wall_time_ns end +""" + Base.task_timing(::Bool) + +Enable or disable the collection of per-task timing information. +Task created when Base.task_timing(true) is in effect will have [`task_cpu_time_ns`](@ref) +and [`task_wall_time_ns`](@ref) timing information available. +""" +function task_timing(b::Bool) + if b + ccall(:jl_task_timing_enable, Cvoid, ()) + else + # TODO: prevent decrementing the counter below zero + ccall(:jl_task_timing_disable, Cvoid, ()) + end + return nothing +end + """ Base.gc_live_bytes() diff --git a/doc/man/julia.1 b/doc/man/julia.1 index 56cb690d66eebb..c9c95bd80c9d40 100644 --- a/doc/man/julia.1 +++ b/doc/man/julia.1 @@ -294,6 +294,10 @@ If --trace-compile is enabled show how long each took to compile in ms --trace-dispatch={stderr|name} Print precompile statements for methods dispatched during execution or save to stderr or a path. +.TP +--task-timing +Enabled the collection of per-task timing metrics. + .TP -image-codegen Force generate code in imaging mode diff --git a/doc/src/manual/command-line-interface.md b/doc/src/manual/command-line-interface.md index 734d7031db5e84..60089d1eeed9c4 100644 --- a/doc/src/manual/command-line-interface.md +++ b/doc/src/manual/command-line-interface.md @@ -217,6 +217,7 @@ The following is a complete list of command-line switches available when launchi |`--trace-compile={stderr\|name}` |Print precompile statements for methods compiled during execution or save to stderr or a path. Methods that were recompiled are printed in yellow or with a trailing comment if color is not supported| |`--trace-compile-timing` |If --trace-compile is enabled show how long each took to compile in ms| |`--trace-dispatch={stderr\|name}` |Print precompile statements for methods dispatched during execution or save to stderr or a path.| +|`--task-timing` |Enable tasks the collection of per-task timing metrics| |`--image-codegen` |Force generate code in imaging mode| |`--permalloc-pkgimg={yes\|no*}` |Copy the data section of package images into memory| |`--trim={no*\|safe\|unsafe\|unsafe-warn}` |Build a sysimage including only code provably reachable from methods marked by calling `entrypoint`. The three non-default options differ in how they handle dynamic call sites. In safe mode, such sites result in compile-time errors. In unsafe mode, such sites are allowed but the resulting binary might be missing needed code and can throw runtime errors. With unsafe-warn, such sites will trigger warnings at compile-time and might error at runtime.| diff --git a/src/init.c b/src/init.c index 413d4e8055e54b..b5ec4f3bba23a9 100644 --- a/src/init.c +++ b/src/init.c @@ -874,6 +874,9 @@ static NOINLINE void _finish_julia_init(JL_IMAGE_SEARCH rel, jl_ptls_t ptls, jl_ post_boot_hooks(); } + if (jl_options.task_timing) { + jl_atomic_fetch_add(&jl_task_timing_enabled, 1); + } if (jl_base_module == NULL) { // nthreads > 1 requires code in Base jl_atomic_store_relaxed(&jl_n_threads, 1); diff --git a/src/jlapi.c b/src/jlapi.c index a3621385a437e9..d13ff0a7f763ef 100644 --- a/src/jlapi.c +++ b/src/jlapi.c @@ -809,6 +809,33 @@ JL_DLLEXPORT uint64_t jl_cumulative_recompile_time_ns(void) return jl_atomic_load_relaxed(&jl_cumulative_recompile_time); } +/** + * @brief Enable per-task timing. + */ +JL_DLLEXPORT void jl_task_timing_enable(void) +{ + // Increment the flag to allow reentrant callers. + jl_atomic_fetch_add(&jl_task_timing_enabled, 1); +} + +/** + * @brief Disable per-task timing. + */ +JL_DLLEXPORT void jl_task_timing_disable(void) +{ + jl_atomic_fetch_add(&jl_task_timing_enabled, -1); +} + +// TODO: remove this +JL_DLLEXPORT void jl_is_task_timing_enabled(void) +{ + jl_atomic_load_relaxed(&jl_task_timing_enabled); +} +JL_DLLEXPORT void jl_is_compile_timing_enabled(void) +{ + jl_atomic_load_relaxed(&jl_measure_compile_time_enabled); +} + /** * @brief Retrieve floating-point environment constants. * diff --git a/src/jloptions.c b/src/jloptions.c index 35f0a76e3f6e70..c0ae8b35d06392 100644 --- a/src/jloptions.c +++ b/src/jloptions.c @@ -103,6 +103,7 @@ JL_DLLEXPORT void jl_init_options(void) 0, // heap-size-hint 0, // trace_compile_timing 0, // trim + 0, // task_timing }; jl_options_initialized = 1; } @@ -258,13 +259,13 @@ static const char opts_hidden[] = " --output-bc Generate LLVM bitcode (.bc)\n" " --output-asm Generate an assembly file (.s)\n" " --output-incremental={yes|no*} Generate an incremental output file (rather than\n" - " complete)\n" " --trace-compile={stderr|name} Print precompile statements for methods compiled\n" " during execution or save to stderr or a path. Methods that\n" " were recompiled are printed in yellow or with a trailing\n" " comment if color is not supported\n" " --trace-compile-timing If --trace-compile is enabled show how long each took to\n" " compile in ms\n" + " --task-timing Enable collection of per-task timing data.\n" " --image-codegen Force generate code in imaging mode\n" " --permalloc-pkgimg={yes|no*} Copy the data section of package images into memory\n" " --trim={no*|safe|unsafe|unsafe-warn}\n" @@ -296,6 +297,7 @@ JL_DLLEXPORT void jl_parse_opts(int *argcp, char ***argvp) opt_trace_compile, opt_trace_compile_timing, opt_trace_dispatch, + opt_task_timing, opt_math_mode, opt_worker, opt_bind_to, @@ -375,6 +377,7 @@ JL_DLLEXPORT void jl_parse_opts(int *argcp, char ***argvp) { "trace-compile", required_argument, 0, opt_trace_compile }, { "trace-compile-timing", no_argument, 0, opt_trace_compile_timing }, { "trace-dispatch", required_argument, 0, opt_trace_dispatch }, + { "task-timing", no_argument, 0, opt_task_timing }, { "math-mode", required_argument, 0, opt_math_mode }, { "handle-signals", required_argument, 0, opt_handle_signals }, // hidden command line options @@ -964,6 +967,9 @@ JL_DLLEXPORT void jl_parse_opts(int *argcp, char ***argvp) else jl_errorf("julia: invalid argument to --trim={safe|no|unsafe|unsafe-warn} (%s)", optarg); break; + case opt_task_timing: + jl_options.task_timing = 1; + break; default: jl_errorf("julia: unhandled option -- %c\n" "This is a bug, please report it.", c); diff --git a/src/jloptions.h b/src/jloptions.h index e58797caace3cf..1eabc195b8d3e2 100644 --- a/src/jloptions.h +++ b/src/jloptions.h @@ -64,6 +64,7 @@ typedef struct { uint64_t heap_size_hint; int8_t trace_compile_timing; int8_t trim; + int8_t task_timing; } jl_options_t; #endif diff --git a/src/jltypes.c b/src/jltypes.c index a77b0add5672dd..9053689960a946 100644 --- a/src/jltypes.c +++ b/src/jltypes.c @@ -3744,7 +3744,7 @@ void jl_init_types(void) JL_GC_DISABLED NULL, jl_any_type, jl_emptysvec, - jl_perm_symsvec(20, + jl_perm_symsvec(21, "next", "queue", "storage", @@ -3761,11 +3761,12 @@ void jl_init_types(void) JL_GC_DISABLED "sticky", "_isexception", "priority", + "is_timing_enabled", "first_enqueued_at", "last_dequeued_at", "cpu_time_ns", "wall_time_ns"), - jl_svec(20, + jl_svec(21, jl_any_type, jl_any_type, jl_any_type, @@ -3782,6 +3783,7 @@ void jl_init_types(void) JL_GC_DISABLED jl_bool_type, jl_bool_type, jl_uint16_type, + jl_bool_type, jl_uint64_type, jl_uint64_type, jl_uint64_type, diff --git a/src/julia.h b/src/julia.h index 175ca4517e8217..940fce662848e0 100644 --- a/src/julia.h +++ b/src/julia.h @@ -2274,6 +2274,8 @@ typedef struct _jl_task_t { // uint8_t padding1; // multiqueue priority uint16_t priority; + // bool indicating whether or not this to measure the wall and cpu time of this task + uint8_t is_timing_enabled; // timestamp this task first entered the run queue (TODO: int32 of ms instead?) uint64_t first_enqueued_at; // timestamp this task was most recently scheduled to run diff --git a/src/julia_internal.h b/src/julia_internal.h index 20d90fede3d5e5..b34a44d20aaf3d 100644 --- a/src/julia_internal.h +++ b/src/julia_internal.h @@ -281,6 +281,8 @@ extern JL_DLLEXPORT void jl_typeinf_timing_end(uint64_t start, int is_recompile) extern JL_DLLEXPORT _Atomic(uint8_t) jl_measure_compile_time_enabled; extern JL_DLLEXPORT _Atomic(uint64_t) jl_cumulative_compile_time; extern JL_DLLEXPORT _Atomic(uint64_t) jl_cumulative_recompile_time; +// Global *atomic* integer controlling *process-wide* task timing. +extern JL_DLLEXPORT _Atomic(uint8_t) jl_task_timing_enabled; #define jl_return_address() ((uintptr_t)__builtin_return_address(0)) @@ -295,7 +297,6 @@ STATIC_INLINE uint32_t jl_int32hash_fast(uint32_t a) return a; // identity hashing seems to work well enough here } - // this is a version of memcpy that preserves atomic memory ordering // which makes it safe to use for objects that can contain memory references // without risk of creating pointers out of thin air @@ -376,7 +377,6 @@ jl_value_t *jl_gc_big_alloc_noinline(jl_ptls_t ptls, size_t allocsz); JL_DLLEXPORT int jl_gc_classify_pools(size_t sz, int *osize) JL_NOTSAFEPOINT; void gc_sweep_sysimg(void); - // pools are 16376 bytes large (GC_POOL_SZ - GC_PAGE_OFFSET) static const int jl_gc_sizeclasses[] = { #ifdef _P64 @@ -513,7 +513,6 @@ STATIC_INLINE uint8_t JL_CONST_FUNC jl_gc_szclass_align8(unsigned sz) JL_NOTSAFE #define GC_MAX_SZCLASS (2032-sizeof(void*)) static_assert(ARRAY_CACHE_ALIGN_THRESHOLD > GC_MAX_SZCLASS, ""); - // Size does NOT include the type tag!! STATIC_INLINE jl_value_t *jl_gc_alloc_(jl_ptls_t ptls, size_t sz, void *ty) { @@ -834,7 +833,6 @@ JL_DLLEXPORT jl_value_t *jl_matching_methods(jl_tupletype_t *types, jl_value_t * size_t world, size_t *min_valid, size_t *max_valid, int *ambig); JL_DLLEXPORT jl_value_t *jl_gf_invoke_lookup_worlds(jl_value_t *types, jl_value_t *mt, size_t world, size_t *min_world, size_t *max_world); - jl_datatype_t *jl_nth_argument_datatype(jl_value_t *argtypes JL_PROPAGATES_ROOT, int n) JL_NOTSAFEPOINT; JL_DLLEXPORT jl_value_t *jl_argument_datatype(jl_value_t *argt JL_PROPAGATES_ROOT) JL_NOTSAFEPOINT; JL_DLLEXPORT jl_methtable_t *jl_method_table_for( @@ -1479,7 +1477,6 @@ JL_DLLEXPORT jl_value_t *jl_get_cfunction_trampoline( void *(*init_trampoline)(void *tramp, void **nval), jl_unionall_t *env, jl_value_t **vals); - // Special filenames used to refer to internal julia libraries #define JL_EXE_LIBNAME ((const char*)1) #define JL_LIBJULIA_DL_LIBNAME ((const char*)2) @@ -1924,7 +1921,6 @@ JL_DLLIMPORT uint64_t jl_getUnwindInfo(uint64_t dwBase); #pragma GCC visibility pop - #ifdef USE_DTRACE // Generated file, needs to be searched in include paths so that the builddir // retains priority diff --git a/src/task.c b/src/task.c index e0554627f56021..87019c0db36ab8 100644 --- a/src/task.c +++ b/src/task.c @@ -1146,6 +1146,7 @@ JL_DLLEXPORT jl_task_t *jl_new_task(jl_function_t *start, jl_value_t *completion t->ptls = NULL; t->world_age = ct->world_age; t->reentrant_timing = 0; + t->is_timing_enabled = jl_atomic_load_relaxed(&jl_task_timing_enabled) != 0; t->first_enqueued_at = 0; t->last_dequeued_at = 0; t->cpu_time_ns = 0; @@ -1249,9 +1250,11 @@ CFI_NORETURN ct->ctx.started = 1; // wait_time -task-started-> user_time - assert(ct->first_enqueued_at != 0); - assert(ct->last_dequeued_at == 0); - ct->last_dequeued_at = jl_hrtime(); + if (ct->is_timing_enabled) { + assert(ct->first_enqueued_at != 0); + assert(ct->last_dequeued_at == 0); + ct->last_dequeued_at = jl_hrtime(); + } JL_PROBE_RT_START_TASK(ct); jl_timing_block_task_enter(ct, ptls, NULL); if (jl_atomic_load_relaxed(&ct->_isexception)) { @@ -1597,9 +1600,15 @@ jl_task_t *jl_init_root_task(jl_ptls_t ptls, void *stack_lo, void *stack_hi) ct->reentrant_timing = 0; ct->cpu_time_ns = 0; ct->wall_time_ns = 0; - uint64_t now = jl_hrtime(); - ct->first_enqueued_at = now; - ct->last_dequeued_at = now; + if (ct->is_timing_enabled) { + uint64_t now = jl_hrtime(); + ct->first_enqueued_at = now; + ct->last_dequeued_at = now; + } + else { + ct->first_enqueued_at = 0; + ct->last_dequeued_at = 0; + } ptls->root_task = ct; jl_atomic_store_relaxed(&ptls->current_task, ct); JL_GC_PROMISE_ROOTED(ct); diff --git a/src/threading.c b/src/threading.c index c26028d2f3da24..365c5e6fd38541 100644 --- a/src/threading.c +++ b/src/threading.c @@ -49,6 +49,8 @@ JL_DLLEXPORT _Atomic(uint8_t) jl_measure_compile_time_enabled = 0; JL_DLLEXPORT _Atomic(uint64_t) jl_cumulative_compile_time = 0; JL_DLLEXPORT _Atomic(uint64_t) jl_cumulative_recompile_time = 0; +JL_DLLEXPORT _Atomic(uint8_t) jl_task_timing_enabled = 0; + JL_DLLEXPORT void *jl_get_ptls_states(void) { // mostly deprecated: use current_task instead diff --git a/test/cmdlineargs.jl b/test/cmdlineargs.jl index cc3f8950f0dc02..81e00dea1d5611 100644 --- a/test/cmdlineargs.jl +++ b/test/cmdlineargs.jl @@ -783,6 +783,17 @@ let exename = `$(Base.julia_cmd()) --startup-file=no --color=no` "Int(Base.JLOptions().fast_math)"`)) == JL_OPTIONS_FAST_MATH_DEFAULT end + let JL_OPTIONS_TASK_TIMING_OFF = 0, JL_OPTIONS_TASK_TIMING_ON = 1 + @test parse(Int,readchomp(`$exename -E + "Int(Base.JLOptions().task_timing)"`)) == JL_OPTIONS_TASK_TIMING_OFF + @test parse(Int, readchomp(`$exename --task-timing -E + "Int(Base.JLOptions().task_timing)"`)) == JL_OPTIONS_TASK_TIMING_ON + @test !parse(Bool, readchomp(`$exename -E + "fetch(Threads.@spawn current_task().is_timing_enabled)"`)) + @test parse(Bool, readchomp(`$exename --task-timing -E + "fetch(Threads.@spawn current_task().is_timing_enabled)"`)) + end + # --worker takes default / custom as argument (default/custom arguments # tested in test/parallel.jl) @test errors_not_signals(`$exename --worker=true`) diff --git a/test/threads_exec.jl b/test/threads_exec.jl index b517937c1317bb..c3edd01a262cc2 100644 --- a/test/threads_exec.jl +++ b/test/threads_exec.jl @@ -1315,15 +1315,30 @@ end end @testset "task time counters" begin - start_time = time_ns() - t = Threads.@spawn peakflops() - wait(t) - end_time = time_ns() - wall_time_delta = end_time - start_time - @test Base.task_cpu_time_ns(t) > 0 - @test Base.task_wall_time_ns(t) > 0 - @test Base.task_wall_time_ns(t) >= Base.task_cpu_time_ns(t) - @test wall_time_delta > Base.task_wall_time_ns(t) + @testset "enabled" begin + try + Base.task_timing(true) + start_time = time_ns() + t = Threads.@spawn peakflops() + wait(t) + end_time = time_ns() + wall_time_delta = end_time - start_time + @test t.is_timing_enabled + @test Base.task_cpu_time_ns(t) > 0 + @test Base.task_wall_time_ns(t) > 0 + @test Base.task_wall_time_ns(t) >= Base.task_cpu_time_ns(t) + @test wall_time_delta > Base.task_wall_time_ns(t) + finally + Base.task_timing(false) + end + end + @testset "disabled" begin + t = Threads.@spawn peakflops() + wait(t) + @test !t.is_timing_enabled + @test Base.task_cpu_time_ns(t) == 0 + @test Base.task_wall_time_ns(t) == 0 + end end @testset "task time counters: lots of spawns" begin