From 2e02579a76cf4ea3acdc0e076f53ff9cb15fa38f Mon Sep 17 00:00:00 2001
From: Terry Wilmarth
Date: Tue, 14 Dec 2021 15:04:55 -0600
Subject: [PATCH] [OpenMP] Add use of TPAUSE

Add use of TPAUSE (from WAITPKG) to the runtime for Intel hardware,
with an environment variable (KMP_TPAUSE) to enable it and select the
C-state to pause in. TPAUSE is always used when it is selected and the
hardware supports WAITPKG; otherwise, the runtime falls back to the old
path of checking __kmp_use_yield, etc. (Two illustrative sketches of
the WAITPKG detection and the TPAUSE backoff pattern follow the patch.)

Differential Revision: https://reviews.llvm.org/D115758
---
 openmp/runtime/src/kmp.h                 | 202 ++++++++++++++---------
 openmp/runtime/src/kmp_dispatch.cpp      |   8 +-
 openmp/runtime/src/kmp_dispatch.h        |   4 +-
 openmp/runtime/src/kmp_global.cpp        |   8 +
 openmp/runtime/src/kmp_lock.cpp          |  27 ++-
 openmp/runtime/src/kmp_lock.h            |  13 +-
 openmp/runtime/src/kmp_runtime.cpp       |   4 +-
 openmp/runtime/src/kmp_settings.cpp      |  25 +++
 openmp/runtime/src/kmp_tasking.cpp       |   4 +-
 openmp/runtime/src/kmp_wait_release.h    |   4 +-
 openmp/runtime/src/z_Windows_NT_util.cpp |   4 +-
 11 files changed, 207 insertions(+), 96 deletions(-)

diff --git a/openmp/runtime/src/kmp.h b/openmp/runtime/src/kmp.h
index ede6aa992d53..a41265a4763a 100644
--- a/openmp/runtime/src/kmp.h
+++ b/openmp/runtime/src/kmp.h
@@ -1315,86 +1315,6 @@ static inline void __kmp_store_mxcsr(kmp_uint32 *p) { *p = _mm_getcsr(); }
 
 #define KMP_X86_MXCSR_MASK 0xffffffc0 /* ignore status flags (6 lsb) */
 
-#if KMP_ARCH_X86
-extern void __kmp_x86_pause(void);
-#elif KMP_MIC
-// Performance testing on KNC (C0QS-7120 P/A/X/D, 61-core, 16 GB Memory) showed
-// regression after removal of extra PAUSE from spin loops. Changing
-// the delay from 100 to 300 showed even better performance than double PAUSE
-// on Spec OMP2001 and LCPC tasking tests, no regressions on EPCC.
-static inline void __kmp_x86_pause(void) { _mm_delay_32(300); }
-#else
-static inline void __kmp_x86_pause(void) { _mm_pause(); }
-#endif
-#define KMP_CPU_PAUSE() __kmp_x86_pause()
-#elif KMP_ARCH_PPC64
-#define KMP_PPC64_PRI_LOW() __asm__ volatile("or 1, 1, 1")
-#define KMP_PPC64_PRI_MED() __asm__ volatile("or 2, 2, 2")
-#define KMP_PPC64_PRI_LOC_MB() __asm__ volatile("" : : : "memory")
-#define KMP_CPU_PAUSE() \
-  do { \
-    KMP_PPC64_PRI_LOW(); \
-    KMP_PPC64_PRI_MED(); \
-    KMP_PPC64_PRI_LOC_MB(); \
-  } while (0)
-#else
-#define KMP_CPU_PAUSE() /* nothing to do */
-#endif
-
-#define KMP_INIT_YIELD(count) \
-  { (count) = __kmp_yield_init; }
-
-#define KMP_OVERSUBSCRIBED \
-  (TCR_4(__kmp_nth) > (__kmp_avail_proc ? __kmp_avail_proc : __kmp_xproc))
-
-#define KMP_TRY_YIELD \
-  ((__kmp_use_yield == 1) || (__kmp_use_yield == 2 && (KMP_OVERSUBSCRIBED)))
-
-#define KMP_TRY_YIELD_OVERSUB \
-  ((__kmp_use_yield == 1 || __kmp_use_yield == 2) && (KMP_OVERSUBSCRIBED))
-
-#define KMP_YIELD(cond) \
-  { \
-    KMP_CPU_PAUSE(); \
-    if ((cond) && (KMP_TRY_YIELD)) \
-      __kmp_yield(); \
-  }
-
-#define KMP_YIELD_OVERSUB() \
-  { \
-    KMP_CPU_PAUSE(); \
-    if ((KMP_TRY_YIELD_OVERSUB)) \
-      __kmp_yield(); \
-  }
-
-// Note the decrement of 2 in the following Macros. With KMP_LIBRARY=turnaround,
-// there should be no yielding since initial value from KMP_INIT_YIELD() is odd.
-#define KMP_YIELD_SPIN(count) \
-  { \
-    KMP_CPU_PAUSE(); \
-    if (KMP_TRY_YIELD) { \
-      (count) -= 2; \
-      if (!(count)) { \
-        __kmp_yield(); \
-        (count) = __kmp_yield_next; \
-      } \
-    } \
-  }
-
-#define KMP_YIELD_OVERSUB_ELSE_SPIN(count) \
-  { \
-    KMP_CPU_PAUSE(); \
-    if ((KMP_TRY_YIELD_OVERSUB)) \
-      __kmp_yield(); \
-    else if (__kmp_use_yield == 1) { \
-      (count) -= 2; \
-      if (!(count)) { \
-        __kmp_yield(); \
-        (count) = __kmp_yield_next; \
-      } \
-    } \
-  }
-
 // User-level Monitor/Mwait
 #if KMP_HAVE_UMWAIT
 // We always try for UMWAIT first
@@ -1405,6 +1325,7 @@ static inline void __kmp_x86_pause(void) { _mm_pause(); }
 #include <intrin.h>
 #endif
 #endif // KMP_HAVE_WAITPKG_INTRINSICS
+
 KMP_ATTRIBUTE_TARGET_WAITPKG
 static inline int __kmp_tpause(uint32_t hint, uint64_t counter) {
 #if !KMP_HAVE_WAITPKG_INTRINSICS
@@ -1470,6 +1391,119 @@ __kmp_mm_mwait(unsigned extensions, unsigned hints) {
 }
 #endif // KMP_HAVE_UMWAIT
 
+#if KMP_ARCH_X86
+extern void __kmp_x86_pause(void);
+#elif KMP_MIC
+// Performance testing on KNC (C0QS-7120 P/A/X/D, 61-core, 16 GB Memory) showed
+// regression after removal of extra PAUSE from spin loops. Changing
+// the delay from 100 to 300 showed even better performance than double PAUSE
+// on Spec OMP2001 and LCPC tasking tests, no regressions on EPCC.
+static inline void __kmp_x86_pause(void) { _mm_delay_32(300); }
+#else
+static inline void __kmp_x86_pause(void) { _mm_pause(); }
+#endif
+#define KMP_CPU_PAUSE() __kmp_x86_pause()
+#elif KMP_ARCH_PPC64
+#define KMP_PPC64_PRI_LOW() __asm__ volatile("or 1, 1, 1")
+#define KMP_PPC64_PRI_MED() __asm__ volatile("or 2, 2, 2")
+#define KMP_PPC64_PRI_LOC_MB() __asm__ volatile("" : : : "memory")
+#define KMP_CPU_PAUSE() \
+  do { \
+    KMP_PPC64_PRI_LOW(); \
+    KMP_PPC64_PRI_MED(); \
+    KMP_PPC64_PRI_LOC_MB(); \
+  } while (0)
+#else
+#define KMP_CPU_PAUSE() /* nothing to do */
+#endif
+
+#define KMP_INIT_YIELD(count) \
+  { (count) = __kmp_yield_init; }
+
+#define KMP_INIT_BACKOFF(time) \
+  { (time) = __kmp_pause_init; }
+
+#define KMP_OVERSUBSCRIBED \
+  (TCR_4(__kmp_nth) > (__kmp_avail_proc ? __kmp_avail_proc : __kmp_xproc))
+
+#define KMP_TRY_YIELD \
+  ((__kmp_use_yield == 1) || (__kmp_use_yield == 2 && (KMP_OVERSUBSCRIBED)))
+
+#define KMP_TRY_YIELD_OVERSUB \
+  ((__kmp_use_yield == 1 || __kmp_use_yield == 2) && (KMP_OVERSUBSCRIBED))
+
+#define KMP_YIELD(cond) \
+  { \
+    KMP_CPU_PAUSE(); \
+    if ((cond) && (KMP_TRY_YIELD)) \
+      __kmp_yield(); \
+  }
+
+#define KMP_YIELD_OVERSUB() \
+  { \
+    KMP_CPU_PAUSE(); \
+    if ((KMP_TRY_YIELD_OVERSUB)) \
+      __kmp_yield(); \
+  }
+
+// Note the decrement of 2 in the following Macros. With KMP_LIBRARY=turnaround,
+// there should be no yielding since initial value from KMP_INIT_YIELD() is odd.
+#define KMP_YIELD_SPIN(count) \
+  { \
+    KMP_CPU_PAUSE(); \
+    if (KMP_TRY_YIELD) { \
+      (count) -= 2; \
+      if (!(count)) { \
+        __kmp_yield(); \
+        (count) = __kmp_yield_next; \
+      } \
+    } \
+  }
+
+// If TPAUSE is available & enabled, use it. If oversubscribed, use the slower
+// (C0.2) state, which improves performance of other SMT threads on the same
+// core; otherwise, use the fast (C0.1) default state, or whatever the user has
+// requested. Uses a timed TPAUSE, and exponential backoff. If TPAUSE isn't
+// available, fall back to the regular CPU pause and yield combination.
+#if KMP_HAVE_UMWAIT
+#define KMP_YIELD_OVERSUB_ELSE_SPIN(count, time) \
+  { \
+    if (__kmp_tpause_enabled) { \
+      if (KMP_OVERSUBSCRIBED) { \
+        __kmp_tpause(0, (time)); \
+      } else { \
+        __kmp_tpause(__kmp_tpause_hint, (time)); \
+      } \
+      (time) *= 2; \
+    } else { \
+      KMP_CPU_PAUSE(); \
+      if ((KMP_TRY_YIELD_OVERSUB)) { \
+        __kmp_yield(); \
+      } else if (__kmp_use_yield == 1) { \
+        (count) -= 2; \
+        if (!(count)) { \
+          __kmp_yield(); \
+          (count) = __kmp_yield_next; \
+        } \
+      } \
+    } \
+  }
+#else
+#define KMP_YIELD_OVERSUB_ELSE_SPIN(count, time) \
+  { \
+    KMP_CPU_PAUSE(); \
+    if ((KMP_TRY_YIELD_OVERSUB)) \
+      __kmp_yield(); \
+    else if (__kmp_use_yield == 1) { \
+      (count) -= 2; \
+      if (!(count)) { \
+        __kmp_yield(); \
+        (count) = __kmp_yield_next; \
+      } \
+    } \
+  }
+#endif // KMP_HAVE_UMWAIT
+
 /* ------------------------------------------------------------------------ */
 /* Support datatypes for the orphaned construct nesting checks. */
 /* ------------------------------------------------------------------------ */
@@ -3088,6 +3122,7 @@ extern kmp_int32 __kmp_use_yield;
 extern kmp_int32 __kmp_use_yield_exp_set;
 extern kmp_uint32 __kmp_yield_init;
 extern kmp_uint32 __kmp_yield_next;
+extern kmp_uint64 __kmp_pause_init;
 
 /* ------------------------------------------------------------------------- */
 extern int __kmp_allThreadsSpecified;
@@ -3290,6 +3325,13 @@ extern int __kmp_mwait_enabled; // Runtime check if ring3 mwait is enabled
 extern int __kmp_mwait_hints; // Hints to pass in to mwait
 #endif
 
+#if KMP_HAVE_UMWAIT
+extern int __kmp_waitpkg_enabled; // Runtime check if waitpkg exists
+extern int __kmp_tpause_state; // 0 (default), 1=C0.1, 2=C0.2; from KMP_TPAUSE
+extern int __kmp_tpause_hint; // 1=C0.1 (default), 0=C0.2; from KMP_TPAUSE
+extern int __kmp_tpause_enabled; // 0 (default), 1 (KMP_TPAUSE is non-zero)
+#endif
+
 /* ------------------------------------------------------------------------- */
 
 extern kmp_global_t __kmp_global; /* global status */
diff --git a/openmp/runtime/src/kmp_dispatch.cpp b/openmp/runtime/src/kmp_dispatch.cpp
index 1aaffc76909a..648332109dbb 100644
--- a/openmp/runtime/src/kmp_dispatch.cpp
+++ b/openmp/runtime/src/kmp_dispatch.cpp
@@ -2655,9 +2655,11 @@ __kmp_wait_4(volatile kmp_uint32 *spinner, kmp_uint32 checker,
   kmp_uint32 spins;
   kmp_uint32 (*f)(kmp_uint32, kmp_uint32) = pred;
   kmp_uint32 r;
+  kmp_uint64 time;
 
   KMP_FSYNC_SPIN_INIT(obj, CCAST(kmp_uint32 *, spin));
   KMP_INIT_YIELD(spins);
+  KMP_INIT_BACKOFF(time);
   // main wait spin loop
   while (!f(r = TCR_4(*spin), check)) {
     KMP_FSYNC_SPIN_PREPARE(obj);
@@ -2665,7 +2667,7 @@
        split. It causes problems with infinite recursion because of exit lock */
     /* if ( TCR_4(__kmp_global.g.g_done) && __kmp_global.g.g_abort)
         __kmp_abort_thread(); */
-    KMP_YIELD_OVERSUB_ELSE_SPIN(spins);
+    KMP_YIELD_OVERSUB_ELSE_SPIN(spins, time);
   }
   KMP_FSYNC_SPIN_ACQUIRED(obj);
   return r;
 }
@@ -2680,15 +2682,17 @@ void __kmp_wait_4_ptr(void *spinner, kmp_uint32 checker,
   kmp_uint32 check = checker;
   kmp_uint32 spins;
   kmp_uint32 (*f)(void *, kmp_uint32) = pred;
+  kmp_uint64 time;
 
   KMP_FSYNC_SPIN_INIT(obj, spin);
   KMP_INIT_YIELD(spins);
+  KMP_INIT_BACKOFF(time);
   // main wait spin loop
   while (!f(spin, check)) {
     KMP_FSYNC_SPIN_PREPARE(obj);
     /* if we have waited a bit, or are noversubscribed, yield */
     /* pause is in the following code */
-    KMP_YIELD_OVERSUB_ELSE_SPIN(spins);
+    KMP_YIELD_OVERSUB_ELSE_SPIN(spins, time);
   }
   KMP_FSYNC_SPIN_ACQUIRED(obj);
 }
diff --git a/openmp/runtime/src/kmp_dispatch.h b/openmp/runtime/src/kmp_dispatch.h
index ae11361ca512..154db174613d 100644
--- a/openmp/runtime/src/kmp_dispatch.h
+++ b/openmp/runtime/src/kmp_dispatch.h
@@ -292,10 +292,12 @@ static UT __kmp_wait(volatile UT *spinner, UT checker,
   UT check = checker;
   kmp_uint32 spins;
   kmp_uint32 (*f)(UT, UT) = pred;
+  kmp_uint64 time;
   UT r;
 
   KMP_FSYNC_SPIN_INIT(obj, CCAST(UT *, spin));
   KMP_INIT_YIELD(spins);
+  KMP_INIT_BACKOFF(time);
   // main wait spin loop
   while (!f(r = *spin, check)) {
     KMP_FSYNC_SPIN_PREPARE(obj);
@@ -305,7 +307,7 @@
     /* if ( TCR_4(__kmp_global.g.g_done) && __kmp_global.g.g_abort)
         __kmp_abort_thread(); */
     // If oversubscribed, or have waited a bit then yield.
-    KMP_YIELD_OVERSUB_ELSE_SPIN(spins);
+    KMP_YIELD_OVERSUB_ELSE_SPIN(spins, time);
   }
   KMP_FSYNC_SPIN_ACQUIRED(obj);
   return r;
diff --git a/openmp/runtime/src/kmp_global.cpp b/openmp/runtime/src/kmp_global.cpp
index 4aea5a2d8663..fdabaad21f7b 100644
--- a/openmp/runtime/src/kmp_global.cpp
+++ b/openmp/runtime/src/kmp_global.cpp
@@ -219,6 +219,13 @@ int __kmp_mwait_enabled = FALSE;
 int __kmp_mwait_hints = 0;
 #endif
 
+#if KMP_HAVE_UMWAIT
+int __kmp_waitpkg_enabled = 0;
+int __kmp_tpause_state = 0;
+int __kmp_tpause_hint = 1;
+int __kmp_tpause_enabled = 0;
+#endif
+
 /* map OMP 3.0 schedule types with our internal schedule types */
 enum sched_type __kmp_sch_map[kmp_sched_upper - kmp_sched_lower_ext +
                               kmp_sched_upper_std - kmp_sched_lower - 2] = {
@@ -425,6 +432,7 @@ kmp_int32 __kmp_use_yield_exp_set = 0;
 
 kmp_uint32 __kmp_yield_init = KMP_INIT_WAIT;
 kmp_uint32 __kmp_yield_next = KMP_NEXT_WAIT;
+kmp_uint64 __kmp_pause_init = 1; // for tpause
 
 /* ------------------------------------------------------ */
 /* STATE mostly syncronized with global lock */
diff --git a/openmp/runtime/src/kmp_lock.cpp b/openmp/runtime/src/kmp_lock.cpp
index f3bdb03663a6..fff7305b57f5 100644
--- a/openmp/runtime/src/kmp_lock.cpp
+++ b/openmp/runtime/src/kmp_lock.cpp
@@ -96,12 +96,19 @@ __kmp_acquire_tas_lock_timed_template(kmp_tas_lock_t *lck, kmp_int32 gtid) {
   }
 
   kmp_uint32 spins;
+  kmp_uint64 time;
   KMP_FSYNC_PREPARE(lck);
   KMP_INIT_YIELD(spins);
+  KMP_INIT_BACKOFF(time);
   kmp_backoff_t backoff = __kmp_spin_backoff_params;
   do {
+#if !KMP_HAVE_UMWAIT
     __kmp_spin_backoff(&backoff);
-    KMP_YIELD_OVERSUB_ELSE_SPIN(spins);
+#else
+    if (!__kmp_tpause_enabled)
+      __kmp_spin_backoff(&backoff);
+#endif
+    KMP_YIELD_OVERSUB_ELSE_SPIN(spins, time);
   } while (KMP_ATOMIC_LD_RLX(&lck->lk.poll) != tas_free ||
            !__kmp_atomic_compare_store_acq(&lck->lk.poll, tas_free, tas_busy));
   KMP_FSYNC_ACQUIRED(lck);
@@ -2227,10 +2234,12 @@ __kmp_acquire_drdpa_lock_timed_template(kmp_drdpa_lock_t *lck, kmp_int32 gtid) {
   // The current implementation of KMP_WAIT doesn't allow for mask
   // and poll to be re-read every spin iteration.
   kmp_uint32 spins;
+  kmp_uint64 time;
   KMP_FSYNC_PREPARE(lck);
   KMP_INIT_YIELD(spins);
+  KMP_INIT_BACKOFF(time);
   while (polls[ticket & mask] < ticket) { // atomic load
-    KMP_YIELD_OVERSUB_ELSE_SPIN(spins);
+    KMP_YIELD_OVERSUB_ELSE_SPIN(spins, time);
     // Re-read the mask and the poll pointer from the lock structure.
     //
     // Make certain that "mask" is read before "polls" !!!
@@ -2659,9 +2668,17 @@ void __kmp_spin_backoff(kmp_backoff_t *boff) {
   kmp_uint32 i;
   for (i = boff->step; i > 0; i--) {
     kmp_uint64 goal = __kmp_tsc() + boff->min_tick;
-    do {
-      KMP_CPU_PAUSE();
-    } while (before(__kmp_tsc(), goal));
+#if KMP_HAVE_UMWAIT
+    if (__kmp_umwait_enabled) {
+      __kmp_tpause(0, boff->min_tick);
+    } else {
+#endif
+      do {
+        KMP_CPU_PAUSE();
+      } while (before(__kmp_tsc(), goal));
+#if KMP_HAVE_UMWAIT
+    }
+#endif
   }
   boff->step = (boff->step << 1 | 1) & (boff->max_backoff - 1);
 }
diff --git a/openmp/runtime/src/kmp_lock.h b/openmp/runtime/src/kmp_lock.h
index 90afd8fd7eb3..a19f4ca323b8 100644
--- a/openmp/runtime/src/kmp_lock.h
+++ b/openmp/runtime/src/kmp_lock.h
@@ -651,12 +651,15 @@ extern int (*__kmp_acquire_user_lock_with_checks_)(kmp_user_lock_p lck,
     if (lck->tas.lk.poll != 0 || \
         !__kmp_atomic_compare_store_acq(&lck->tas.lk.poll, 0, gtid + 1)) { \
       kmp_uint32 spins; \
+      kmp_uint64 time; \
       KMP_FSYNC_PREPARE(lck); \
       KMP_INIT_YIELD(spins); \
+      KMP_INIT_BACKOFF(time); \
       do { \
-        KMP_YIELD_OVERSUB_ELSE_SPIN(spins); \
-      } while (lck->tas.lk.poll != 0 || !__kmp_atomic_compare_store_acq( \
-                                            &lck->tas.lk.poll, 0, gtid + 1)); \
+        KMP_YIELD_OVERSUB_ELSE_SPIN(spins, time); \
+      } while ( \
+          lck->tas.lk.poll != 0 || \
+          !__kmp_atomic_compare_store_acq(&lck->tas.lk.poll, 0, gtid + 1)); \
     } \
     KMP_FSYNC_ACQUIRED(lck); \
   } else { \
@@ -758,10 +761,12 @@ extern int (*__kmp_acquire_nested_user_lock_with_checks_)(kmp_user_lock_p lck,
     if ((lck->tas.lk.poll != 0) || \
         !__kmp_atomic_compare_store_acq(&lck->tas.lk.poll, 0, gtid + 1)) { \
       kmp_uint32 spins; \
+      kmp_uint64 time; \
       KMP_FSYNC_PREPARE(lck); \
       KMP_INIT_YIELD(spins); \
+      KMP_INIT_BACKOFF(time); \
       do { \
-        KMP_YIELD_OVERSUB_ELSE_SPIN(spins); \
+        KMP_YIELD_OVERSUB_ELSE_SPIN(spins, time); \
       } while ( \
           (lck->tas.lk.poll != 0) || \
           !__kmp_atomic_compare_store_acq(&lck->tas.lk.poll, 0, gtid + 1)); \
diff --git a/openmp/runtime/src/kmp_runtime.cpp b/openmp/runtime/src/kmp_runtime.cpp
index 7af970803a30..e1af2f43dae7 100644
--- a/openmp/runtime/src/kmp_runtime.cpp
+++ b/openmp/runtime/src/kmp_runtime.cpp
@@ -6895,7 +6895,9 @@ static void __kmp_check_mic_type() {
 static void __kmp_user_level_mwait_init() {
   struct kmp_cpuid buf;
   __kmp_x86_cpuid(7, 0, &buf);
-  __kmp_umwait_enabled = ((buf.ecx >> 5) & 1) && __kmp_user_level_mwait;
+  __kmp_waitpkg_enabled = ((buf.ecx >> 5) & 1);
+  __kmp_umwait_enabled = __kmp_waitpkg_enabled && __kmp_user_level_mwait;
+  __kmp_tpause_enabled = __kmp_waitpkg_enabled && (__kmp_tpause_state > 0);
   KF_TRACE(30, ("__kmp_user_level_mwait_init: __kmp_umwait_enabled = %d\n",
                 __kmp_umwait_enabled));
 }
diff --git a/openmp/runtime/src/kmp_settings.cpp b/openmp/runtime/src/kmp_settings.cpp
index 302abb4042d8..27ab51dbf7ed 100644
--- a/openmp/runtime/src/kmp_settings.cpp
+++ b/openmp/runtime/src/kmp_settings.cpp
@@ -5171,6 +5171,27 @@ static void __kmp_stg_print_mwait_hints(kmp_str_buf_t *buffer, char const *name,
 
 #endif // KMP_HAVE_MWAIT || KMP_HAVE_UMWAIT
 
+#if KMP_HAVE_UMWAIT
+// -----------------------------------------------------------------------------
+// KMP_TPAUSE
+// 0 = don't use TPAUSE, 1 = use C0.1 state, 2 = use C0.2 state
+
+static void __kmp_stg_parse_tpause(char const *name, char const *value,
+                                   void *data) {
+  __kmp_stg_parse_int(name, value, 0, INT_MAX, &__kmp_tpause_state);
+  if (__kmp_tpause_state != 0) {
+    // The actual hint passed to tpause is: 0 for C0.2 and 1 for C0.1
+    if (__kmp_tpause_state == 2) // use C0.2
+      __kmp_tpause_hint = 0; // default was set to 1 for C0.1
+  }
+} // __kmp_stg_parse_tpause
+
+static void __kmp_stg_print_tpause(kmp_str_buf_t *buffer, char const *name,
+                                   void *data) {
+  __kmp_stg_print_int(buffer, name, __kmp_tpause_state);
+} // __kmp_stg_print_tpause
+#endif // KMP_HAVE_UMWAIT
+
 // -----------------------------------------------------------------------------
 // OMP_DISPLAY_ENV
 
@@ -5536,6 +5557,10 @@ static kmp_setting_t __kmp_stg_table[] = {
     {"KMP_MWAIT_HINTS", __kmp_stg_parse_mwait_hints,
      __kmp_stg_print_mwait_hints, NULL, 0, 0},
 #endif
+
+#if KMP_HAVE_UMWAIT
+    {"KMP_TPAUSE", __kmp_stg_parse_tpause, __kmp_stg_print_tpause, NULL, 0, 0},
+#endif
     {"", NULL, NULL, NULL, 0, 0}}; // settings
 
 static int const __kmp_stg_count =
diff --git a/openmp/runtime/src/kmp_tasking.cpp b/openmp/runtime/src/kmp_tasking.cpp
index d6665a7ccfb4..e445438524c8 100644
--- a/openmp/runtime/src/kmp_tasking.cpp
+++ b/openmp/runtime/src/kmp_tasking.cpp
@@ -3552,9 +3552,11 @@ void __kmp_reap_task_teams(void) {
 void __kmp_wait_to_unref_task_teams(void) {
   kmp_info_t *thread;
   kmp_uint32 spins;
+  kmp_uint64 time;
   int done;
 
   KMP_INIT_YIELD(spins);
+  KMP_INIT_BACKOFF(time);
 
   for (;;) {
     done = TRUE;
@@ -3604,7 +3606,7 @@
     }
 
     // If oversubscribed or have waited a bit, yield.
-    KMP_YIELD_OVERSUB_ELSE_SPIN(spins);
+    KMP_YIELD_OVERSUB_ELSE_SPIN(spins, time);
   }
 }
 
diff --git a/openmp/runtime/src/kmp_wait_release.h b/openmp/runtime/src/kmp_wait_release.h
index 226150dfb781..b32cb15de1b2 100644
--- a/openmp/runtime/src/kmp_wait_release.h
+++ b/openmp/runtime/src/kmp_wait_release.h
@@ -377,6 +377,7 @@ __kmp_wait_template(kmp_info_t *this_thr,
 #else
   kmp_uint32 hibernate;
 #endif
+  kmp_uint64 time;
 
   KMP_FSYNC_SPIN_INIT(spin, NULL);
   if (flag->done_check()) {
@@ -476,6 +477,7 @@ final_spin=FALSE)
 #endif
 
   KMP_INIT_YIELD(spins); // Setup for waiting
+  KMP_INIT_BACKOFF(time);
 
   if (__kmp_dflt_blocktime != KMP_MAX_BLOCKTIME ||
       __kmp_pause_status == kmp_soft_paused) {
@@ -563,7 +565,7 @@ final_spin=FALSE)
 
     // If we are oversubscribed, or have waited a bit (and
    // KMP_LIBRARY=throughput), then yield
-    KMP_YIELD_OVERSUB_ELSE_SPIN(spins);
+    KMP_YIELD_OVERSUB_ELSE_SPIN(spins, time);
 
 #if KMP_STATS_ENABLED
     // Check if thread has been signalled to idle state
diff --git a/openmp/runtime/src/z_Windows_NT_util.cpp b/openmp/runtime/src/z_Windows_NT_util.cpp
index 0a0801c7ece2..8fa198563a79 100644
--- a/openmp/runtime/src/z_Windows_NT_util.cpp
+++ b/openmp/runtime/src/z_Windows_NT_util.cpp
@@ -1327,16 +1327,18 @@ static void __kmp_reap_common(kmp_info_t *th) {
     // KMP_WAIT to cover this usage also.
     void *obj = NULL;
     kmp_uint32 spins;
+    kmp_uint64 time;
 #if USE_ITT_BUILD
     KMP_FSYNC_SPIN_INIT(obj, (void *)&th->th.th_info.ds.ds_alive);
 #endif /* USE_ITT_BUILD */
     KMP_INIT_YIELD(spins);
+    KMP_INIT_BACKOFF(time);
     do {
 #if USE_ITT_BUILD
       KMP_FSYNC_SPIN_PREPARE(obj);
 #endif /* USE_ITT_BUILD */
       __kmp_is_thread_alive(th, &exit_val);
-      KMP_YIELD_OVERSUB_ELSE_SPIN(spins);
+      KMP_YIELD_OVERSUB_ELSE_SPIN(spins, time);
     } while (exit_val == STILL_ACTIVE && TCR_4(th->th.th_info.ds.ds_alive));
 #if USE_ITT_BUILD
     if (exit_val == STILL_ACTIVE) {
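
For context, here is a minimal sketch (not part of the patch) of the WAITPKG
feature test that the __kmp_user_level_mwait_init() change performs: CPUID
leaf 7, subleaf 0, ECX bit 5. The __get_cpuid_count helper from GCC/Clang's
<cpuid.h> is an assumption of this sketch; the runtime itself uses its own
__kmp_x86_cpuid wrapper.

// Sketch: detect WAITPKG (TPAUSE/UMWAIT) the same way the patched
// __kmp_user_level_mwait_init() does. Assumes GCC/Clang and <cpuid.h>.
#include <cpuid.h>
#include <cstdio>

int main() {
  unsigned eax = 0, ebx = 0, ecx = 0, edx = 0;
  // CPUID leaf 7, subleaf 0: structured extended feature flags.
  if (__get_cpuid_count(7, 0, &eax, &ebx, &ecx, &edx)) {
    int waitpkg = (ecx >> 5) & 1; // the same bit test as __kmp_waitpkg_enabled
    std::printf("WAITPKG supported: %d\n", waitpkg);
  }
  return 0;
}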
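
And a sketch of the backoff pattern that the new KMP_YIELD_OVERSUB_ELSE_SPIN
takes when __kmp_tpause_enabled is set. This is an illustration under stated
assumptions, not the runtime's code: it calls the _tpause intrinsic directly
with an explicit TSC deadline (compile with -mwaitpkg on GCC/Clang), whereas
the runtime goes through its __kmp_tpause wrapper and the KMP_TPAUSE-derived
hint. The function and parameter names below are invented for the example.

// Sketch: spin-wait with TPAUSE and exponential backoff, mirroring the
// structure of KMP_YIELD_OVERSUB_ELSE_SPIN on a WAITPKG-capable CPU.
#include <immintrin.h> // _tpause (requires -mwaitpkg)
#include <x86intrin.h> // __rdtsc
#include <atomic>
#include <cstdint>

void wait_until_set(std::atomic<bool> &flag, bool oversubscribed) {
  uint64_t time = 1; // KMP_INIT_BACKOFF starts from __kmp_pause_init (1)
  while (!flag.load(std::memory_order_acquire)) {
    // Control bit 0 selects the power state: 0 = C0.2 (deeper sleep, kinder
    // to SMT siblings when oversubscribed), 1 = C0.1 (faster wakeup).
    unsigned hint = oversubscribed ? 0u : 1u;
    _tpause(hint, __rdtsc() + time); // wait until the TSC deadline passes
    time *= 2; // exponential backoff, as in the macro
  }
}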