diff --git a/kernel/sched/core.c b/kernel/sched/core.c
index 82db990fd24f..1b325aee62a3 100644
--- a/kernel/sched/core.c
+++ b/kernel/sched/core.c
@@ -99,10 +99,17 @@ static void group_balancer_enable(void)
 {
 	sched_init_group_balancer_sched_domains();
 	static_branch_enable(&__group_balancer_enabled);
+	/*
+	 * Ensure all previous instances of raw_spin_rq_*lock() have finished
+	 * and future ones will observe group_balancer_enabled().
+	 */
+	synchronize_rcu();
+
+	util_est_reenqueue_all();
 }
 
 static void group_balancer_disable(void)
 {
+	util_est_clear_all();
 	static_branch_disable(&__group_balancer_enabled);
 	sched_clear_group_balancer_sched_domains();
 }
@@ -112,6 +119,11 @@ bool group_balancer_enabled(void)
 	return static_branch_unlikely(&__group_balancer_enabled);
 }
 
+bool group_balancer_rq_enabled(struct rq *rq)
+{
+	return static_branch_unlikely(&__group_balancer_enabled) && rq->group_balancer_enabled;
+}
+
 int sched_group_balancer_enable_handler(struct ctl_table *table, int write,
 		void __user *buffer, size_t *lenp, loff_t *ppos)
 {
diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c
index 229612b0f091..1fc6bc53c881 100644
--- a/kernel/sched/fair.c
+++ b/kernel/sched/fair.c
@@ -6038,6 +6038,64 @@ static inline void util_est_dequeue(struct cfs_rq *cfs_rq,
 	trace_sched_util_est_cfs_tp(cfs_rq);
 }
 
+#ifdef CONFIG_GROUP_BALANCER
+void util_est_reenqueue_all(void)
+{
+	int cpu;
+	struct rq *rq;
+	struct rq_flags rf;
+	struct cfs_rq *cfs_rq;
+	struct sched_entity *se;
+	struct task_struct *p;
+
+	cpus_read_lock();
+	for_each_online_cpu(cpu) {
+		rq = cpu_rq(cpu);
+		rq_lock_irqsave(rq, &rf);
+		list_for_each_entry(p, &rq->cfs_tasks, se.group_node) {
+			se = &p->se;
+			for_each_sched_entity(se) {
+				cfs_rq = cfs_rq_of(se);
+				if (cfs_rq != &rq->cfs)
+					util_est_enqueue(cfs_rq, p);
+			}
+		}
+		rq->group_balancer_enabled = true;
+		rq_unlock_irqrestore(rq, &rf);
+	}
+	cpus_read_unlock();
+}
+
+static int tg_util_est_clear_down(struct task_group *tg, void *data)
+{
+	int cpu;
+	struct rq *rq;
+	struct rq_flags rf;
+	struct cfs_rq *cfs_rq;
+
+	if (tg == &root_task_group)
+		return 0;
+
+	cpus_read_lock();
+	for_each_online_cpu(cpu) {
+		rq = cpu_rq(cpu);
+		cfs_rq = tg->cfs_rq[cpu];
+		rq_lock_irqsave(rq, &rf);
+		WRITE_ONCE(cfs_rq->avg.util_est.enqueued, 0);
+		rq->group_balancer_enabled = false;
+		rq_unlock_irqrestore(rq, &rf);
+	}
+	cpus_read_unlock();
+
+	return 0;
+}
+
+void util_est_clear_all(void)
+{
+	walk_tg_tree_from(&root_task_group, tg_util_est_clear_down, tg_nop, NULL);
+}
+#endif
+
 #define UTIL_EST_MARGIN (SCHED_CAPACITY_SCALE / 100)
 
 /*
@@ -8221,13 +8279,16 @@ enqueue_task_fair(struct rq *rq, struct task_struct *p, int flags)
 	 * Let's add the task's estimated utilization to the cfs_rq's
 	 * estimated utilization, before we update schedutil.
 	 */
-	for_each_sched_entity(se) {
-		cfs_rq = cfs_rq_of(se);
-		util_est_enqueue(cfs_rq, p);
+	if (group_balancer_rq_enabled(rq)) {
+		for_each_sched_entity(se) {
+			cfs_rq = cfs_rq_of(se);
+			util_est_enqueue(cfs_rq, p);
+		}
+		se = &p->se;
+	} else {
+		util_est_enqueue(&rq->cfs, p);
 	}
-	se = &p->se;
-
 	/*
 	 * If in_iowait is set, the code below may not trigger any cpufreq
 	 * utilization updates, so do it here explicitly with the IOWAIT flag
@@ -8351,13 +8412,16 @@ static void dequeue_task_fair(struct rq *rq, struct task_struct *p, int flags)
 	int idle_h_nr_running = task_has_idle_policy(p);
 	bool was_sched_idle = sched_idle_rq(rq);
 
-	for_each_sched_entity(se) {
-		cfs_rq = cfs_rq_of(se);
-		util_est_dequeue(cfs_rq, p);
+	if (group_balancer_rq_enabled(rq)) {
+		for_each_sched_entity(se) {
+			cfs_rq = cfs_rq_of(se);
+			util_est_dequeue(cfs_rq, p);
+		}
+		se = &p->se;
+	} else {
+		util_est_dequeue(&rq->cfs, p);
 	}
-	se = &p->se;
-
 	for_each_sched_entity(se) {
 		cfs_rq = cfs_rq_of(se);
 		dequeue_entity(cfs_rq, se, flags);
diff --git a/kernel/sched/sched.h b/kernel/sched/sched.h
index 66068270fafe..088213f8ba09 100644
--- a/kernel/sched/sched.h
+++ b/kernel/sched/sched.h
@@ -1542,6 +1542,7 @@ struct rq {
 
 #ifdef CONFIG_GROUP_BALANCER
 	struct group_balancer_sched_domain *gb_sd;
+	bool group_balancer_enabled;
 #endif
 
 	CK_KABI_RESERVE(1)
@@ -3705,6 +3706,7 @@ extern void sched_dynamic_update(int mode);
 
 #ifdef CONFIG_GROUP_BALANCER
 extern bool group_balancer_enabled(void);
+extern bool group_balancer_rq_enabled(struct rq *rq);
 static inline const struct cpumask *task_allowed_cpu(struct task_struct *p)
 {
 	if (group_balancer_enabled()) {
@@ -3746,7 +3748,10 @@ extern unsigned long cfs_h_load(struct cfs_rq *cfs_rq);
 extern bool gb_cpu_overutilized(int cpu);
 extern void gb_load_balance(struct lb_env *env);
 extern void task_tick_gb(struct task_struct *p);
+extern void util_est_reenqueue_all(void);
+extern void util_est_clear_all(void);
 #else
+static inline bool group_balancer_rq_enabled(struct rq *rq) { return false; }
 static inline const struct cpumask *task_allowed_cpu(struct task_struct *p)
 {
 	return p->cpus_ptr;
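For reference, the transition scheme in this patch relies on rq->group_balancer_enabled only being flipped while the rq lock is held, in the same critical section that re-enqueues (on enable) or clears (on disable) the per-group util_est values, so that enqueue_task_fair()/dequeue_task_fair(), which also run under the rq lock, see a flag value consistent with the accounting already in place. The standalone userspace sketch below only models that pattern with a pthread mutex standing in for the rq lock; every name in it (struct toy_rq, toy_enable(), toy_disable(), ...) is hypothetical and not part of the patch.

/* Userspace model (not kernel code) of the per-queue gating pattern above. */
#include <pthread.h>
#include <stdbool.h>
#include <stdio.h>

#define NR_TASKS 4

struct toy_rq {
	pthread_mutex_t lock;		/* stands in for the rq lock */
	bool per_rq_enabled;		/* stands in for rq->group_balancer_enabled */
	unsigned int root_est;		/* stands in for the root cfs_rq util_est */
	unsigned int group_est;		/* stands in for a child cfs_rq util_est */
	unsigned int task_est[NR_TASKS];
	bool queued[NR_TASKS];
};

/* Enqueue: account on the root, and on the group only while enabled. */
static void toy_enqueue(struct toy_rq *rq, int id, unsigned int est)
{
	pthread_mutex_lock(&rq->lock);
	rq->task_est[id] = est;
	rq->queued[id] = true;
	rq->root_est += est;
	if (rq->per_rq_enabled)
		rq->group_est += est;
	pthread_mutex_unlock(&rq->lock);
}

/* Dequeue: undo the accounting according to the current flag value;
 * the transition fixups below keep this consistent with what was added. */
static void toy_dequeue(struct toy_rq *rq, int id)
{
	pthread_mutex_lock(&rq->lock);
	rq->queued[id] = false;
	rq->root_est -= rq->task_est[id];
	if (rq->per_rq_enabled)
		rq->group_est -= rq->task_est[id];
	pthread_mutex_unlock(&rq->lock);
}

/* Enable: re-add every queued task to the group sum, then flip the flag,
 * all under the queue lock -- the role util_est_reenqueue_all() plays. */
static void toy_enable(struct toy_rq *rq)
{
	pthread_mutex_lock(&rq->lock);
	for (int id = 0; id < NR_TASKS; id++)
		if (rq->queued[id])
			rq->group_est += rq->task_est[id];
	rq->per_rq_enabled = true;
	pthread_mutex_unlock(&rq->lock);
}

/* Disable: zero the group sum and clear the flag under the same lock --
 * the role util_est_clear_all() plays. */
static void toy_disable(struct toy_rq *rq)
{
	pthread_mutex_lock(&rq->lock);
	rq->group_est = 0;
	rq->per_rq_enabled = false;
	pthread_mutex_unlock(&rq->lock);
}

int main(void)
{
	struct toy_rq rq = { .lock = PTHREAD_MUTEX_INITIALIZER };

	toy_enqueue(&rq, 0, 100);	/* enqueued while disabled: root only */
	toy_enable(&rq);		/* fixup adds task 0 to the group sum */
	toy_enqueue(&rq, 1, 50);	/* enqueued while enabled: root + group */
	toy_dequeue(&rq, 0);		/* both sums stay consistent */
	printf("root=%u group=%u\n", rq.root_est, rq.group_est);
	toy_disable(&rq);
	return 0;
}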