anolis: mm: unevictable: add memcg granularity core implementation

ANBZ: #2674

This patch introduces the core implementation of code-section (text)
unevictable support at memcg granularity. With it in place, per-memcg
text unevictability is fully functional.

It adds the global switch "/sys/kernel/mm/unevictable/enabled" to turn
the feature on or off, along with the per-memcg interfaces
"memory.allow_text_unevictable" and "memory.text_unevictable_percent",
which enable text unevictability and limit how much of a memcg's code
section may be made unevictable.

The current unevictable text size can be checked through the
memory.exstat interface.
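
A minimal usage sketch (the cgroup v1 mount point and group name
"/sys/fs/cgroup/memory/test" below are assumptions for illustration,
not part of this patch):

/*
 * Enable the global switch, opt one memcg in, and cap its unevictable
 * text at 50% of the memcg memory limit. The locked size then shows up
 * in that memcg's memory.exstat as "unevictable_text_size_kb <n>".
 */
#include <fcntl.h>
#include <string.h>
#include <unistd.h>

static int write_str(const char *path, const char *val)
{
	int fd = open(path, O_WRONLY);

	if (fd < 0)
		return -1;
	if (write(fd, val, strlen(val)) < 0) {
		close(fd);
		return -1;
	}
	return close(fd);
}

int main(void)
{
	write_str("/sys/kernel/mm/unevictable/enabled", "1");
	write_str("/sys/fs/cgroup/memory/test/memory.allow_text_unevictable", "1");
	write_str("/sys/fs/cgroup/memory/test/memory.text_unevictable_percent", "50");
	return 0;
}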

Signed-off-by: Xin Hao <xhao@linux.alibaba.com>
Reviewed-by: Xu Yu <xuyu@linux.alibaba.com>
Link: https://gitee.com/anolis/cloud-kernel/pulls/953
Xin Hao 2022-12-03 01:28:21 +08:00 committed by 小龙
parent 358a9fe5e4
commit 179050e6e6
5 changed files with 282 additions and 2 deletions

View File

@ -457,6 +457,12 @@ struct mem_cgroup {
#ifdef CONFIG_TEXT_UNEVICTABLE
bool allow_unevictable;
unsigned int unevictable_percent;
/*
* unevictable_size may be larger than the real amount of unevictable
* memory, because multiple tasks can share the same mappings (e.g. a
* shared binary or dynamic library text) and each of them is accounted.
*/
atomic_long_t unevictable_size;
#endif
#if IS_ENABLED(CONFIG_RECLAIM_COLDPGS)

View File

@ -3,6 +3,8 @@
#ifndef _TEXT_UNEVICTABLE_H
#define _TEXT_UNEVICTABLE_H
struct mem_cgroup;
#ifdef CONFIG_TEXT_UNEVICTABLE
DECLARE_STATIC_KEY_FALSE(unevictable_enabled_key);
@ -10,10 +12,56 @@ static inline bool unevictable_enabled(void)
{
return static_branch_unlikely(&unevictable_enabled_key);
}
bool is_memcg_unevictable_enabled(struct mem_cgroup *memcg);
void memcg_increase_unevict_size(struct mem_cgroup *memcg, unsigned long size);
void memcg_decrease_unevict_size(struct mem_cgroup *memcg, unsigned long size);
bool is_unevictable_size_overflow(struct mem_cgroup *memcg);
unsigned long memcg_exstat_text_unevict_gather(struct mem_cgroup *memcg);
void mem_cgroup_can_unevictable(struct task_struct *tsk, struct mem_cgroup *to);
void mem_cgroup_cancel_unevictable(struct cgroup_taskset *tset);
void memcg_all_processes_unevict(struct mem_cgroup *memcg, bool enable);
void del_unevict_task(struct task_struct *tsk);
void clean_task_unevict_size(struct task_struct *tsk);
#else
static inline bool unevictable_enabled(void)
{
return false;
}
static inline bool is_memcg_unevictable_enabled(struct mem_cgroup *memcg)
{
return false;
}
static inline void memcg_increase_unevict_size(struct mem_cgroup *memcg,
unsigned long size)
{
}
static inline void memcg_decrease_unevict_size(struct mem_cgroup *memcg,
unsigned long size)
{
}
static inline bool is_unevictable_size_overflow(struct mem_cgroup *memcg)
{
return false;
}
static inline unsigned long memcg_exstat_text_unevict_gather(struct mem_cgroup *memcg)
{
return 0;
}
static inline void mem_cgroup_can_unevictable(struct task_struct *tsk,
struct mem_cgroup *to)
{
}
static inline void mem_cgroup_cancel_unevictable(struct cgroup_taskset *tset)
{
}
static inline void memcg_all_processes_unevict(struct mem_cgroup *memcg, bool enable)
{
}
static inline void del_unevict_task(struct task_struct *tsk)
{
}
static inline void clean_task_unevict_size(struct task_struct *tsk)
{
}
#endif
#endif

View File

@ -64,6 +64,9 @@
#include <linux/rcuwait.h>
#include <linux/compat.h>
#include <linux/io_uring.h>
#ifdef CONFIG_TEXT_UNEVICTABLE
#include <linux/unevictable.h>
#endif
#include <linux/uaccess.h>
#include <asm/unistd.h>
@ -796,6 +799,9 @@ void __noreturn do_exit(long code)
tsk->exit_code = code;
taskstats_exit(tsk, group_dead);
#ifdef CONFIG_TEXT_UNEVICTABLE
clean_task_unevict_size(tsk);
#endif
exit_mm();
if (group_dead)

View File

@ -67,6 +67,9 @@
#include <linux/proc_fs.h>
#include <linux/uaccess.h>
#ifdef CONFIG_TEXT_UNEVICTABLE
#include <linux/unevictable.h>
#endif
#include <trace/events/vmscan.h>
@ -4853,6 +4856,10 @@ static int memcg_exstat_show(struct seq_file *m, void *v)
memcg_exstat_gather(memcg, MEMCG_WMARK_MIN));
seq_printf(m, "wmark_reclaim_work_ms %llu\n",
memcg_exstat_gather(memcg, MEMCG_WMARK_RECLAIM) >> 20);
#ifdef CONFIG_TEXT_UNEVICTABLE
seq_printf(m, "unevictable_text_size_kb %lu\n",
memcg_exstat_text_unevict_gather(memcg) >> 10);
#endif
return 0;
}
@ -6302,6 +6309,10 @@ static int mem_cgroup_allow_unevictable_write(struct cgroup_subsys_state *css,
return 0;
memcg->allow_unevictable = val;
if (val)
memcg_all_processes_unevict(memcg, true);
else
memcg_all_processes_unevict(memcg, false);
return 0;
}
@ -7552,6 +7563,10 @@ static int mem_cgroup_can_attach(struct cgroup_taskset *tset)
if (!p)
return 0;
#ifdef CONFIG_TEXT_UNEVICTABLE
mem_cgroup_can_unevictable(p, memcg);
#endif
/*
* We are now committed to this value whatever it is. Changes in this
* tunable will only affect upcoming migrations, not the current one.
@ -7595,6 +7610,9 @@ static int mem_cgroup_can_attach(struct cgroup_taskset *tset)
static void mem_cgroup_cancel_attach(struct cgroup_taskset *tset)
{
#ifdef CONFIG_TEXT_UNEVICTABLE
mem_cgroup_cancel_unevictable(tset);
#endif
if (mc.to)
mem_cgroup_clear_mc();
}

View File

@ -41,6 +41,11 @@
#ifdef CONFIG_TEXT_UNEVICTABLE
DEFINE_STATIC_KEY_FALSE(unevictable_enabled_key);
#define for_each_mem_cgroup(iter) \
for (iter = mem_cgroup_iter(NULL, NULL, NULL); \
iter != NULL; \
iter = mem_cgroup_iter(NULL, iter, NULL))
#endif
struct evict_pids_t {
@ -52,6 +57,9 @@ struct evict_pid_entry {
struct list_head list;
pid_t rootpid;
u64 start_time;
#ifdef CONFIG_TEXT_UNEVICTABLE
u64 unevict_size;
#endif
struct task_struct *tsk;
bool done;
};
@ -103,6 +111,10 @@ static void __evict_pid(struct evict_pid_entry *pid)
if (!(mm->def_flags & VM_LOCKED)) {
struct vm_area_struct *vma, *next, *prev = NULL;
vm_flags_t flag;
#ifdef CONFIG_TEXT_UNEVICTABLE
unsigned long size = 0;
struct mem_cgroup *memcg = get_mem_cgroup_from_mm(mm);
#endif
mmap_write_lock(mm);
for (vma = mm->mmap; vma; prev = vma, vma = next) {
@ -115,9 +127,17 @@ static void __evict_pid(struct evict_pid_entry *pid)
vma->vm_start, vma->vm_end, flag);
vma = prev;
next = prev->vm_next;
#ifdef CONFIG_TEXT_UNEVICTABLE
size += vma->vm_end - vma->vm_start;
#endif
}
}
mmap_write_unlock(mm);
#ifdef CONFIG_TEXT_UNEVICTABLE
memcg_decrease_unevict_size(memcg, size);
css_put(&memcg->css);
pid->unevict_size -= size;
#endif
}
mmput(mm);
}
@ -251,6 +271,9 @@ static void add_unevict_task(struct task_struct *tsk)
if (!result) {
result = new_entry;
result->rootpid = rootpid;
#ifdef CONFIG_TEXT_UNEVICTABLE
result->unevict_size = 0;
#endif
rb_link_node(&result->node, parent, link);
rb_insert_color(&result->node, &base_tree->root);
list_add_tail(&result->list, &pid_list);
@ -295,6 +318,12 @@ static void unevict_pid(pid_t pid)
get_task_struct(tsk);
rcu_read_unlock();
#ifdef CONFIG_TEXT_UNEVICTABLE
if (is_memcg_unevictable_enabled(mem_cgroup_from_task(tsk))) {
put_task_struct(tsk);
return;
}
#endif
add_unevict_task(tsk);
put_task_struct(tsk);
}
@ -434,12 +463,19 @@ static void execute_vm_lock(struct work_struct *unused)
mm = get_task_mm(tsk);
if (mm && !(mm->def_flags & VM_LOCKED)) {
#ifdef CONFIG_TEXT_UNEVICTABLE
struct mem_cgroup *memcg = get_mem_cgroup_from_mm(mm);
#endif
if (mmap_write_trylock(mm)) {
struct vm_area_struct *vma, *next, *prev = NULL;
vm_flags_t flag;
for (vma = mm->mmap; vma; prev = vma, vma = next) {
next = vma->vm_next;
#ifdef CONFIG_TEXT_UNEVICTABLE
if (is_unevictable_size_overflow(memcg))
break;
#endif
if (vma->vm_file &&
(vma->vm_flags & VM_EXEC) &&
(vma->vm_flags & VM_READ)) {
@ -449,6 +485,9 @@ static void execute_vm_lock(struct work_struct *unused)
vma->vm_start, vma->vm_end, flag);
vma = prev;
next = prev->vm_next;
#ifdef CONFIG_TEXT_UNEVICTABLE
result->unevict_size += vma->vm_end - vma->vm_start;
#endif
}
}
@ -456,6 +495,11 @@ static void execute_vm_lock(struct work_struct *unused)
result->start_time = tsk->start_boottime;
result->done = true;
mmap_write_unlock(mm);
#ifdef CONFIG_TEXT_UNEVICTABLE
memcg_increase_unevict_size(memcg,
result->unevict_size);
css_put(&memcg->css);
#endif
} else {
need_again = true;
}
@ -554,6 +598,163 @@ const static struct proc_ops del_proc_fops = {
};
#ifdef CONFIG_TEXT_UNEVICTABLE
void clean_task_unevict_size(struct task_struct *tsk)
{
struct evict_pid_entry *result;
struct mem_cgroup *memcg;
/*
* Nothing to do unless the unevictable machinery
* has been set up.
*/
if (!tsk || !base_tree)
return;
mutex_lock(&pid_mutex);
result = lookup_unevict_entry(tsk);
if (result) {
if (result->unevict_size) {
rcu_read_lock();
memcg = mem_cgroup_from_task(tsk);
memcg_decrease_unevict_size(memcg, result->unevict_size);
rcu_read_unlock();
}
list_del(&result->list);
__remove_entry(result);
mutex_unlock(&pid_mutex);
kfree(result);
} else
mutex_unlock(&pid_mutex);
}
bool is_memcg_unevictable_enabled(struct mem_cgroup *memcg)
{
if (!unevictable_enabled())
return false;
if (!memcg)
return false;
if (memcg->allow_unevictable)
return true;
return false;
}
void memcg_increase_unevict_size(struct mem_cgroup *memcg, unsigned long size)
{
atomic_long_add(size, &memcg->unevictable_size);
}
void memcg_decrease_unevict_size(struct mem_cgroup *memcg, unsigned long size)
{
atomic_long_sub(size, &memcg->unevictable_size);
}
bool is_unevictable_size_overflow(struct mem_cgroup *memcg)
{
struct page_counter *counter;
u64 res_limit;
u64 size;
counter = &memcg->memory;
res_limit = (u64)counter->max * PAGE_SIZE;
size = atomic_long_read(&memcg->unevictable_size);
size = size * 100 / res_limit;
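/*
* size now holds the unevictable text as a percentage of the memcg memory
* limit. Illustrative numbers (not from this patch): a 1 GiB limit with
* 500 MiB of locked text gives 500 * 100 / 1024 = 48, which exceeds a
* text_unevictable_percent of 40, so execute_vm_lock() stops locking
* further text VMAs in this memcg.
*/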
if (size >= memcg->unevictable_percent)
return true;
return false;
}
unsigned long memcg_exstat_text_unevict_gather(struct mem_cgroup *memcg)
{
return atomic_long_read(&memcg->unevictable_size);
}
void mem_cgroup_can_unevictable(struct task_struct *tsk, struct mem_cgroup *to)
{
struct mem_cgroup *from;
if (!unevictable_enabled())
return;
from = mem_cgroup_from_task(tsk);
VM_BUG_ON(from == to);
if (to->allow_unevictable && !from->allow_unevictable) {
add_unevict_task(tsk);
schedule_delayed_work(&evict_work, HZ);
}
if (!to->allow_unevictable && from->allow_unevictable)
del_unevict_task(tsk);
}
void mem_cgroup_cancel_unevictable(struct cgroup_taskset *tset)
{
struct task_struct *tsk;
struct cgroup_subsys_state *dst_css;
struct mem_cgroup *memcg;
if (!unevictable_enabled())
return;
cgroup_taskset_for_each(tsk, dst_css, tset) {
memcg = mem_cgroup_from_task(tsk);
if (memcg->allow_unevictable)
del_unevict_task(tsk);
}
}
static inline int schedule_unevict_task(struct task_struct *tsk, void *arg)
{
add_unevict_task(tsk);
schedule_delayed_work(&evict_work, HZ);
return 0;
}
static inline int schedule_evict_task(struct task_struct *tsk, void *arg)
{
del_unevict_task(tsk);
return 0;
}
static inline void make_all_memcg_evictable(void)
{
struct mem_cgroup *memcg;
for_each_mem_cgroup(memcg) {
if (!memcg->allow_unevictable)
continue;
mem_cgroup_scan_tasks(memcg, schedule_unevict_task, NULL);
memcg->allow_unevictable = 0;
memcg->unevictable_percent = 100;
atomic_long_set(&memcg->unevictable_size, 0);
}
}
void memcg_all_processes_unevict(struct mem_cgroup *memcg, bool enable)
{
struct mem_cgroup *tmp_memcg;
if (!unevictable_enabled())
return;
if (!memcg)
tmp_memcg = root_mem_cgroup;
else
tmp_memcg = memcg;
if (enable)
mem_cgroup_scan_tasks(tmp_memcg, schedule_unevict_task, NULL);
else
mem_cgroup_scan_tasks(tmp_memcg, schedule_evict_task, NULL);
}
static int __init setup_unevictable(char *s)
{
if (!strcmp(s, "1"))
@ -581,9 +782,10 @@ static ssize_t unevictable_enabled_store(struct kobject *kobj,
if (!strncmp(buf, "1", 1))
static_branch_enable(&unevictable_enabled_key);
else if (!strncmp(buf, "0", 1)) {
static_branch_disable(&unevictable_enabled_key);
make_all_memcg_evictable();
} else
ret = -EINVAL;
mutex_unlock(&mutex);