anolis: mm: support pre oom
ANBZ: #9079

Provide a universal and reliable mechanism for resolving OOM quickly, to
help businesses raise memory deployment density and keep online services
stable while running at a high memory watermark.

Signed-off-by: Kaihao Bai <carlo.bai@linux.alibaba.com>
Reviewed-by: Xu Yu <xuyu@linux.alibaba.com>
Link: https://gitee.com/anolis/cloud-kernel/pulls/3223
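How it works, in brief: a global sysfs switch (/sys/kernel/mm/pre_oom/enabled) gates the feature behind a static key. When it is on, every reclaim path (memcg watermark, high and limit reclaim, global direct reclaim, and kswapd's balance_pgdat()) is bracketed by pre_oom_enter()/pre_oom_leave(), which take a slot from a counting semaphore sized num_online_cpus() / (level + 1); /sys/kernel/mm/pre_oom/level (0-3) therefore caps how many tasks may reclaim concurrently. Each memory cgroup also gains a memory.oom_offline knob: tasks in cgroups left at 0 get __GFP_DIRECT_RECLAIM cleared in the allocation slow path, so rather than stalling in direct reclaim they fail over to the OOM killer immediately.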
--- a/include/linux/memcontrol.h
+++ b/include/linux/memcontrol.h
@@ -525,6 +525,10 @@ struct mem_cgroup {
 	bool allow_pgtable_bind;
 #endif
 
+#ifdef CONFIG_PRE_OOM
+	bool oom_offline;
+#endif
+
 #ifdef CONFIG_LRU_GEN
 	CK_KABI_USE(1, unsigned long mglru_batch_size)
 	CK_KABI_USE(2, unsigned long mglru_reclaim_pages)
--- /dev/null
+++ b/include/linux/pre_oom.h
@@ -0,0 +1,36 @@
+#ifndef _LINUX_PRE_OOM_H
+#define _LINUX_PRE_OOM_H
+
+#include <linux/sched.h>
+
+#ifdef CONFIG_PRE_OOM
+
+#include <linux/types.h>
+#include <linux/jump_label.h>
+
+DECLARE_STATIC_KEY_FALSE(pre_oom_enabled_key);
+static inline bool pre_oom_enabled(void)
+{
+	return static_branch_unlikely(&pre_oom_enabled_key);
+}
+
+
+int pre_oom_enter(void);
+void pre_oom_leave(void);
+
+#else
+
+static inline bool pre_oom_enabled(void)
+{
+	return false;
+}
+
+static inline int pre_oom_enter(void)
+{
+	return 0;
+}
+
+static inline void pre_oom_leave(void) {}
+
+#endif /* CONFIG_PRE_OOM */
+#endif /* _LINUX_PRE_OOM_H */
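The stubs in the #else branch double as the API contract: pre_oom_enter() may sleep and returns nonzero only when the caller is killed while waiting, and each successful enter must be paired with one pre_oom_leave(). Below is a minimal sketch of the expected call pattern; it copies the !CONFIG_PRE_OOM stubs so it builds and runs as plain userspace C, and the kernel callers in the hunks that follow have the same shape.

/* Sketch of the pre_oom_enter()/pre_oom_leave() call pattern. The three
 * stubs are copied from the !CONFIG_PRE_OOM branch above so this compiles
 * outside the kernel. */
#include <stdbool.h>
#include <stdio.h>

static inline bool pre_oom_enabled(void) { return false; }
static inline int pre_oom_enter(void) { return 0; }
static inline void pre_oom_leave(void) { }

static int reclaim_section(void)
{
	int ret = pre_oom_enter();	/* may block on the reclaim semaphore */

	if (ret)
		return ret;		/* killed while throttled: skip reclaim */

	/* ... synchronous reclaim work would run here ... */

	pre_oom_leave();		/* release the slot taken by enter */
	return 0;
}

int main(void)
{
	printf("reclaim_section() -> %d (enabled=%d)\n",
	       reclaim_section(), pre_oom_enabled());
	return 0;
}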
--- a/include/linux/sched.h
+++ b/include/linux/sched.h
@@ -976,7 +976,9 @@ struct task_struct {
 #ifdef CONFIG_IOMMU_SVA
 	unsigned			pasid_activated:1;
 #endif
-
+#ifdef CONFIG_PRE_OOM
+	unsigned			reclaim_stall:1;
+#endif
 	unsigned long			atomic_flags; /* Flags requiring atomic access. */
 
 	struct restart_block		restart_block;
--- a/kernel/exit.c
+++ b/kernel/exit.c
@@ -64,6 +64,7 @@
 #include <linux/rcuwait.h>
 #include <linux/compat.h>
 #include <linux/io_uring.h>
+#include <linux/pre_oom.h>
 #ifdef CONFIG_TEXT_UNEVICTABLE
 #include <linux/unevictable.h>
 #endif
@@ -859,6 +860,15 @@ void __noreturn do_exit(long code)
 
 	validate_creds_for_do_exit(tsk);
 
+#ifdef CONFIG_PRE_OOM
+	/*
+	 * A killed task may still be stalled in the reclaim path; release
+	 * its semaphore slot here.
+	 */
+	if (unlikely(tsk->reclaim_stall))
+		pre_oom_leave();
+#endif
+
 	check_stack_usage();
 	preempt_disable();
 	if (tsk->nr_dirtied)
--- a/mm/Kconfig
+++ b/mm/Kconfig
@@ -1032,4 +1032,10 @@ config PAGETABLE_SHARE
 
 	  If unsure, say N.
 
+config PRE_OOM
+	bool "Enable pre-OOM control"
+	depends on MEMCG
+	help
+	  Ensure that higher-priority tasks do not enter the direct reclaim
+	  path when they allocate memory; they trigger OOM handling instead.
 endmenu
--- a/mm/Makefile
+++ b/mm/Makefile
@@ -138,3 +138,4 @@ obj-$(CONFIG_PAGECACHE_LIMIT) += pagecache_limit.o
 obj-$(CONFIG_HAVE_BOOTMEM_INFO_NODE) += bootmem_info.o
 obj-$(CONFIG_PGTABLE_BIND) += pgtable_bind.o
 obj-$(CONFIG_PAGETABLE_SHARE) += pgtable_share.o
+obj-$(CONFIG_PRE_OOM) += pre_oom.o
--- a/mm/memcontrol.c
+++ b/mm/memcontrol.c
@@ -68,6 +68,7 @@
 #include <net/ip.h>
 #include "slab.h"
 #include <linux/proc_fs.h>
+#include <linux/pre_oom.h>
 
 #include <linux/uaccess.h>
 #ifdef CONFIG_TEXT_UNEVICTABLE
@@ -2751,11 +2752,13 @@ static void reclaim_wmark(struct mem_cgroup *memcg)
 	 * simply record the whole duration of reclaim_wmark work for the
 	 * overhead-accuracy trade-off.
 	 */
+	pre_oom_enter();
 	start = ktime_get_ns();
 	psi_memstall_enter(&pflags);
 	try_to_free_mem_cgroup_pages(memcg, nr_pages, GFP_KERNEL, true);
 	psi_memstall_leave(&pflags);
 	duration = ktime_get_ns() - start;
+	pre_oom_leave();
 
 	if (!css_tryget_online(&memcg->css))
 		return;
@@ -2791,10 +2794,12 @@ static unsigned long reclaim_high(struct mem_cgroup *memcg,
 
 		memcg_memory_event(memcg, MEMCG_HIGH);
 
+		pre_oom_enter();
 		psi_memstall_enter(&pflags);
 		nr_reclaimed += try_to_free_mem_cgroup_pages(memcg, nr_pages,
 							     gfp_mask, true);
 		psi_memstall_leave(&pflags);
+		pre_oom_leave();
 	} while ((memcg = parent_mem_cgroup(memcg)) &&
 		 !mem_cgroup_is_root(memcg));
 
@@ -3121,12 +3126,14 @@ retry:
 
 	memcg_memory_event(mem_over_limit, MEMCG_MAX);
 
+	pre_oom_enter();
 	memcg_lat_stat_start(&start);
 	psi_memstall_enter(&pflags);
 	nr_reclaimed = try_to_free_mem_cgroup_pages(mem_over_limit, nr_pages,
 						    gfp_mask, may_swap);
 	psi_memstall_leave(&pflags);
 	memcg_lat_stat_end(MEM_LAT_MEMCG_DIRECT_RECLAIM, start);
+	pre_oom_leave();
 
 	if (mem_cgroup_margin(mem_over_limit) >= nr_pages)
 		goto retry;
@@ -6724,6 +6731,29 @@ static int memcg_pgtable_misplaced_write(struct cgroup_subsys_state *css,
 }
 #endif /* CONFIG_PGTABLE_BIND */
 
+#ifdef CONFIG_PRE_OOM
+static u64 memcg_oom_offline_read(struct cgroup_subsys_state *css,
+				  struct cftype *cft)
+{
+	struct mem_cgroup *memcg = mem_cgroup_from_css(css);
+
+	return READ_ONCE(memcg->oom_offline);
+}
+
+static int memcg_oom_offline_write(struct cgroup_subsys_state *css,
+				   struct cftype *cft, u64 val)
+{
+	struct mem_cgroup *memcg = mem_cgroup_from_css(css);
+
+	if (val)
+		memcg->oom_offline = true;
+	else
+		memcg->oom_offline = false;
+
+	return 0;
+}
+#endif /* CONFIG_PRE_OOM */
+
 #ifdef CONFIG_LRU_GEN
 static bool mglru_size_valid_check(struct mem_cgroup *memcg)
 {
@@ -7399,6 +7429,13 @@ static struct cftype mem_cgroup_legacy_files[] = {
 		.write_u64 = memcg_pgtable_misplaced_write,
 		.read_u64 = memcg_pgtable_misplaced_read,
 	},
 #endif
+#ifdef CONFIG_PRE_OOM
+	{
+		.name = "oom_offline",
+		.write_u64 = memcg_oom_offline_write,
+		.read_u64 = memcg_oom_offline_read,
+	},
+#endif
 	{ },	/* terminate */
 };
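In cgroup v1 this surfaces as memory.oom_offline. The naming appears to follow the online/offline job split: writing 1 marks the cgroup as an offline (batch) workload that keeps using direct reclaim, while cgroups left at 0 are treated as QoS-sensitive and, as the page_alloc.c hunks below show, skip direct reclaim entirely once pre-OOM is enabled.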
--- a/mm/page_alloc.c
+++ b/mm/page_alloc.c
@@ -76,6 +76,7 @@
 #include <linux/buffer_head.h>
 #include <linux/vmalloc.h>
 #include <linux/prezero.h>
+#include <linux/pre_oom.h>
 
 #include <asm/sections.h>
 #include <asm/tlbflush.h>
@@ -4741,6 +4742,7 @@ __perform_reclaim(gfp_t gfp_mask, unsigned int order,
 	cond_resched();
 
 	/* We now go into synchronous reclaim */
+	pre_oom_enter();
 	cpuset_memory_pressure_bump();
 	memcg_lat_stat_start(&start);
 	psi_memstall_enter(&pflags);
@@ -4754,6 +4756,7 @@ __perform_reclaim(gfp_t gfp_mask, unsigned int order,
 	fs_reclaim_release(gfp_mask);
 	psi_memstall_leave(&pflags);
 	memcg_lat_stat_end(MEM_LAT_GLOBAL_DIRECT_RECLAIM, start);
+	pre_oom_leave();
 
 	cond_resched();
 
@@ -5029,7 +5032,6 @@ static inline struct page *
 __alloc_pages_slowpath(gfp_t gfp_mask, unsigned int order,
 						struct alloc_context *ac)
 {
-	bool can_direct_reclaim = gfp_mask & __GFP_DIRECT_RECLAIM;
 	const bool costly_order = order > PAGE_ALLOC_COSTLY_ORDER;
 	struct page *page = NULL;
 	unsigned int alloc_flags;
@@ -5041,6 +5043,7 @@ __alloc_pages_slowpath(gfp_t gfp_mask, unsigned int order,
 	unsigned int cpuset_mems_cookie;
 	unsigned int zonelist_iter_cookie;
 	int reserve_flags;
+	bool can_direct_reclaim;
 
 	/*
 	 * We also sanity check to catch abuse of atomic reserves being used by
@@ -5050,6 +5053,24 @@ __alloc_pages_slowpath(gfp_t gfp_mask, unsigned int order,
 			    (__GFP_ATOMIC|__GFP_DIRECT_RECLAIM)))
 		gfp_mask &= ~__GFP_ATOMIC;
 
+#ifdef CONFIG_PRE_OOM
+	/*
+	 * If pre-OOM is enabled, QoS-sensitive cgroups should avoid direct
+	 * reclaim and trigger OOM as soon as possible, so clear
+	 * __GFP_DIRECT_RECLAIM from the allocation mask here.
+	 */
+	if (pre_oom_enabled()) {
+		struct mem_cgroup *memcg;
+
+		memcg = get_mem_cgroup_from_mm(current->mm);
+		if (memcg && !READ_ONCE(memcg->oom_offline))
+			gfp_mask &= ~__GFP_DIRECT_RECLAIM;
+		if (memcg)	/* always drop the css reference */
+			css_put(&memcg->css);
+	}
+#endif
+
+	can_direct_reclaim = gfp_mask & __GFP_DIRECT_RECLAIM;
 restart:
 	compaction_retries = 0;
 	no_progress_loops = 0;
@@ -5168,7 +5189,7 @@ retry:
 
 	/* Caller is not willing to reclaim, we can't balance anything */
 	if (!can_direct_reclaim)
-		goto nopage;
+		goto oom;
 
 	/* Avoid recursion of direct reclaim */
 	if (current->flags & PF_MEMALLOC)
@@ -5222,6 +5243,7 @@ retry:
 	    check_retry_zonelist(zonelist_iter_cookie))
 		goto restart;
 
+oom:
 	/* Reclaim has failed us, start killing things */
 	page = __alloc_pages_may_oom(gfp_mask, order, ac, &did_some_progress);
 	if (page)
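Read together, the page_alloc.c hunks implement the fast-fail path: can_direct_reclaim is now computed only after the pre-OOM gate has had a chance to clear __GFP_DIRECT_RECLAIM, and the former `goto nopage` for callers that cannot reclaim becomes `goto oom`, landing on the new oom: label just before __alloc_pages_may_oom(). A throttled, QoS-sensitive task therefore invokes the OOM killer promptly instead of stalling in direct reclaim. Note that the redirect applies to every allocation with direct reclaim disallowed, not only those gated by pre-OOM.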
--- /dev/null
+++ b/mm/pre_oom.c
@@ -0,0 +1,169 @@
+// SPDX-License-Identifier: GPL-2.0
+#include <linux/init.h>
+#include <linux/fs.h>
+#include <linux/sysfs.h>
+#include <linux/kobject.h>
+#include <linux/mm.h>
+#include <linux/mm_types.h>
+#include <linux/slab.h>
+#include <linux/semaphore.h>
+#include <linux/pre_oom.h>
+
+DEFINE_STATIC_KEY_FALSE(pre_oom_enabled_key);
+
+/*
+ * oom_level ranges from 0 to 3: the kernel allows up to
+ * num_online_cpus() / (oom_level + 1) tasks to reclaim memory concurrently.
+ */
+static int oom_level;
+static struct semaphore *sem;
+
+int pre_oom_enter(void)
+{
+	int result;
+
+	if (!pre_oom_enabled())
+		return 0;
+
+	result = down_killable(sem);
+	if (!result)
+		current->reclaim_stall = 1;
+
+	return result;
+}
+
+void pre_oom_leave(void)
+{
+	/*
+	 * reclaim_stall is only set after a successful down_killable(),
+	 * so release the slot even if the feature was disabled meanwhile.
+	 */
+	if (current->reclaim_stall) {
+		current->reclaim_stall = 0;
+		up(sem);
+	}
+}
+
+static int adjust_oom_level(int level)
+{
+	unsigned long flags;
+	int count = num_online_cpus() / (level + 1);
+	int result = 0;
+
+	raw_spin_lock_irqsave(&sem->lock, flags);
+
+	/* Only adjust the level while no other tasks are reclaiming memory */
+	if (sem->count == (num_online_cpus() / (oom_level + 1))) {
+		sem->count = count;
+		oom_level = level;
+	} else
+		result = -EPERM;
+
+	raw_spin_unlock_irqrestore(&sem->lock, flags);
+
+	return result;
+}
+
+#ifdef CONFIG_SYSFS
+static ssize_t pre_oom_enabled_show(struct kobject *kobj,
+				    struct kobj_attribute *attr, char *buf)
+{
+	return sprintf(buf, "%d\n", !!static_branch_unlikely(&pre_oom_enabled_key));
+}
+
+static ssize_t pre_oom_enabled_store(struct kobject *kobj,
+				     struct kobj_attribute *attr,
+				     const char *buf, size_t count)
+{
+	static DEFINE_MUTEX(mutex);
+	ssize_t ret = count;
+
+	mutex_lock(&mutex);
+
+	if (!strncmp(buf, "1", 1))
+		static_branch_enable(&pre_oom_enabled_key);
+	else if (!strncmp(buf, "0", 1))
+		static_branch_disable(&pre_oom_enabled_key);
+	else
+		ret = -EINVAL;
+
+	mutex_unlock(&mutex);
+	return ret;
+}
+
+static ssize_t pre_oom_level_show(struct kobject *kobj,
+				  struct kobj_attribute *attr, char *buf)
+{
+	return sprintf(buf, "%d\n", oom_level);
+}
+
+static ssize_t pre_oom_level_store(struct kobject *kobj,
+				   struct kobj_attribute *attr,
+				   const char *buf, size_t count)
+{
+	static DEFINE_MUTEX(mutex);
+	unsigned long level;
+	ssize_t ret;
+
+	ret = kstrtoul(buf, 10, &level);
+	if (ret)
+		return ret;
+
+	/* level is unsigned, so only the upper bound needs checking */
+	if (level > 3)
+		return -EINVAL;
+
+	mutex_lock(&mutex);
+	ret = adjust_oom_level(level);
+	mutex_unlock(&mutex);
+
+	return ret ? ret : count;
+}
+
+static struct kobj_attribute pre_oom_enabled_attr =
+	__ATTR(enabled, 0644, pre_oom_enabled_show,
+	       pre_oom_enabled_store);
+
+static struct kobj_attribute pre_oom_level_attr =
+	__ATTR(level, 0644, pre_oom_level_show,
+	       pre_oom_level_store);
+
+static struct attribute *pre_oom_attrs[] = {
+	&pre_oom_enabled_attr.attr,
+	&pre_oom_level_attr.attr,
+	NULL,
+};
+
+static const struct attribute_group pre_oom_attr_group = {
+	.attrs = pre_oom_attrs,
+	.name = "pre_oom",
+};
+#endif /* CONFIG_SYSFS */
+
+static int __init pre_oom_init(void)
+{
+	/*
+	 * Initialize the semaphore before exposing the sysfs knobs, so the
+	 * feature can never be switched on while sem is still NULL.
+	 */
+	sem = kmalloc(sizeof(*sem), GFP_KERNEL);
+	if (!sem)
+		return -ENOMEM;
+
+	sema_init(sem, num_online_cpus());
+
+#ifdef CONFIG_SYSFS
+	{
+		int err = sysfs_create_group(mm_kobj, &pre_oom_attr_group);
+
+		if (err) {
+			pr_err("pre_oom: register sysfs failed\n");
+			kfree(sem);
+			sem = NULL;
+			return err;
+		}
+	}
+#endif
+	return 0;
+}
+subsys_initcall(pre_oom_init);
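The throttling model is easiest to see in isolation. Below is a userspace analogue, not kernel code: POSIX semaphores and threads stand in for the kernel's struct semaphore and reclaiming tasks, and the 16-CPU figure is an assumption for illustration. Workers must take one of num_online_cpus() / (level + 1) slots before "reclaiming", the same budget adjust_oom_level() computes for levels 0-3 (16 CPUs give 16, 8, 5, and 4 slots).

/*
 * Userspace analogue of the pre-OOM reclaim throttle; illustration only.
 * NCPUS stands in for num_online_cpus() and is an assumed value.
 * Build with: cc -pthread pre_oom_demo.c
 */
#include <pthread.h>
#include <semaphore.h>
#include <stdio.h>
#include <unistd.h>

#define NCPUS	16	/* assumption for illustration */
#define LEVEL	3	/* strictest throttle level */
#define NTASKS	8

static sem_t reclaim_sem;

static void *reclaimer(void *arg)
{
	sem_wait(&reclaim_sem);		/* pre_oom_enter() */
	printf("task %ld: reclaiming\n", (long)arg);
	usleep(100 * 1000);		/* simulated reclaim work */
	sem_post(&reclaim_sem);		/* pre_oom_leave() */
	return NULL;
}

int main(void)
{
	pthread_t tasks[NTASKS];
	long i;

	/* The budget adjust_oom_level() computes for each level */
	for (i = 0; i <= 3; i++)
		printf("level %ld -> %ld concurrent reclaimers\n",
		       i, (long)NCPUS / (i + 1));

	sem_init(&reclaim_sem, 0, NCPUS / (LEVEL + 1));	/* 4 slots */
	for (i = 0; i < NTASKS; i++)
		pthread_create(&tasks[i], NULL, reclaimer, (void *)i);
	for (i = 0; i < NTASKS; i++)
		pthread_join(tasks[i], NULL);
	sem_destroy(&reclaim_sem);
	return 0;
}

At level 3 no more than four workers are ever inside the critical section at once. The kernel version uses down_killable() rather than an uninterruptible wait, which is why do_exit() above releases the slot of a task that is killed while stalled in reclaim.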
--- a/mm/vmscan.c
+++ b/mm/vmscan.c
@@ -61,6 +61,7 @@
 #include <linux/shmem_fs.h>
 #include <linux/ctype.h>
 #include <linux/debugfs.h>
+#include <linux/pre_oom.h>
 
 #include <asm/tlbflush.h>
 #include <asm/div64.h>
@@ -6964,6 +6965,7 @@ static int balance_pgdat(pg_data_t *pgdat, int order, int highest_zoneidx)
 	};
 
 	set_task_reclaim_state(current, &sc.reclaim_state);
+	pre_oom_enter();
 	psi_memstall_enter(&pflags);
 	__fs_reclaim_acquire();
 
@@ -7156,6 +7158,7 @@ out:
 	snapshot_refaults(NULL, pgdat);
 	__fs_reclaim_release();
 	psi_memstall_leave(&pflags);
+	pre_oom_leave();
 	set_task_reclaim_state(current, NULL);
 
 	/*