anolis: mm: support pre oom

ANBZ: #9079

Provide a general and reliable mechanism for resolving OOM quickly,
helping workloads increase memory deployment density while keeping the
performance of online (latency-sensitive) services stable when running
at high memory watermarks.

Signed-off-by: Kaihao Bai <carlo.bai@linux.alibaba.com>
Reviewed-by: Xu Yu <xuyu@linux.alibaba.com>
Link: https://gitee.com/anolis/cloud-kernel/pulls/3223
Kaihao Bai, 2023-07-12 15:44:00 +08:00 (committed by 小龙)
parent a173be2021, commit 8bd02f6cf2
10 changed files with 282 additions and 3 deletions

include/linux/memcontrol.h

@@ -525,6 +525,10 @@ struct mem_cgroup {
bool allow_pgtable_bind;
#endif
#ifdef CONFIG_PRE_OOM
bool oom_offline;
#endif
#ifdef CONFIG_LRU_GEN
CK_KABI_USE(1, unsigned long mglru_batch_size)
CK_KABI_USE(2, unsigned long mglru_reclaim_pages)

include/linux/pre_oom.h (new file, 36 lines)

@@ -0,0 +1,36 @@
#ifndef _LINUX_PRE_OOM_H
#define _LINUX_PRE_OOM_H

#include <linux/sched.h>

#ifdef CONFIG_PRE_OOM
#include <linux/types.h>
#include <linux/jump_label.h>

DECLARE_STATIC_KEY_FALSE(pre_oom_enabled_key);

static inline bool pre_oom_enabled(void)
{
	return static_branch_unlikely(&pre_oom_enabled_key);
}

int pre_oom_enter(void);
void pre_oom_leave(void);
#else
static inline bool pre_oom_enabled(void)
{
	return false;
}

static inline int pre_oom_enter(void)
{
	return 0;
}

static inline void pre_oom_leave(void) {}
#endif /* CONFIG_PRE_OOM */
#endif /* _LINUX_PRE_OOM_H */
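
The header boils down to a global throttle that reclaim paths bracket around
their slow work. A minimal sketch of the intended call pattern follows;
reclaim_throttled() is a hypothetical wrapper, not part of this patch, while
try_to_free_mem_cgroup_pages() is the real reclaim entry point the patch
itself brackets in the hunks below:

#include <linux/swap.h>
#include <linux/memcontrol.h>
#include <linux/pre_oom.h>

/* Hypothetical wrapper showing the enter/leave pairing. */
static unsigned long reclaim_throttled(struct mem_cgroup *memcg,
				       unsigned long nr_pages)
{
	unsigned long reclaimed;

	pre_oom_enter();	/* may sleep on the pre-OOM semaphore */
	reclaimed = try_to_free_mem_cgroup_pages(memcg, nr_pages,
						 GFP_KERNEL, true);
	pre_oom_leave();	/* no-op unless reclaim_stall was set */

	return reclaimed;
}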

include/linux/sched.h

@@ -976,7 +976,9 @@ struct task_struct {
#ifdef CONFIG_IOMMU_SVA
unsigned pasid_activated:1;
#endif
#ifdef CONFIG_PRE_OOM
unsigned reclaim_stall:1;
#endif
unsigned long atomic_flags; /* Flags requiring atomic access. */
struct restart_block restart_block;

kernel/exit.c

@@ -64,6 +64,7 @@
#include <linux/rcuwait.h>
#include <linux/compat.h>
#include <linux/io_uring.h>
#include <linux/pre_oom.h>
#ifdef CONFIG_TEXT_UNEVICTABLE
#include <linux/unevictable.h>
#endif
@@ -859,6 +860,15 @@ void __noreturn do_exit(long code)
validate_creds_for_do_exit(tsk);
#ifdef CONFIG_PRE_OOM
/*
 * A task killed while stalled in the reclaim path still holds the
 * pre-OOM semaphore; release it here.
 */
if (unlikely(tsk->reclaim_stall))
pre_oom_leave();
#endif
check_stack_usage();
preempt_disable();
if (tsk->nr_dirtied)

mm/Kconfig

@@ -1032,4 +1032,10 @@ config PAGETABLE_SHARE
If unsure, say N.
config PRE_OOM
	bool "Enable pre-OOM control"
	depends on MEMCG
	help
	  Ensure that tasks in higher-priority (QoS-sensitive) cgroups do
	  not enter the direct reclaim path when allocating memory; with
	  this enabled they trigger the OOM killer directly instead.
endmenu

mm/Makefile

@@ -138,3 +138,4 @@ obj-$(CONFIG_PAGECACHE_LIMIT) += pagecache_limit.o
obj-$(CONFIG_HAVE_BOOTMEM_INFO_NODE) += bootmem_info.o
obj-$(CONFIG_PGTABLE_BIND) += pgtable_bind.o
obj-$(CONFIG_PAGETABLE_SHARE) += pgtable_share.o
obj-$(CONFIG_PRE_OOM) += pre_oom.o

mm/memcontrol.c

@@ -68,6 +68,7 @@
#include <net/ip.h>
#include "slab.h"
#include <linux/proc_fs.h>
#include <linux/pre_oom.h>
#include <linux/uaccess.h>
#ifdef CONFIG_TEXT_UNEVICTABLE
@@ -2751,11 +2752,13 @@ static void reclaim_wmark(struct mem_cgroup *memcg)
* simply record the whole duration of reclaim_wmark work for the
* overhead-accuracy trade-off.
*/
pre_oom_enter();
start = ktime_get_ns();
psi_memstall_enter(&pflags);
try_to_free_mem_cgroup_pages(memcg, nr_pages, GFP_KERNEL, true);
psi_memstall_leave(&pflags);
duration = ktime_get_ns() - start;
pre_oom_leave();
if (!css_tryget_online(&memcg->css))
return;
@@ -2791,10 +2794,12 @@ static unsigned long reclaim_high(struct mem_cgroup *memcg,
memcg_memory_event(memcg, MEMCG_HIGH);
pre_oom_enter();
psi_memstall_enter(&pflags);
nr_reclaimed += try_to_free_mem_cgroup_pages(memcg, nr_pages,
gfp_mask, true);
psi_memstall_leave(&pflags);
pre_oom_leave();
} while ((memcg = parent_mem_cgroup(memcg)) &&
!mem_cgroup_is_root(memcg));
@@ -3121,12 +3126,14 @@ retry:
memcg_memory_event(mem_over_limit, MEMCG_MAX);
pre_oom_enter();
memcg_lat_stat_start(&start);
psi_memstall_enter(&pflags);
nr_reclaimed = try_to_free_mem_cgroup_pages(mem_over_limit, nr_pages,
gfp_mask, may_swap);
psi_memstall_leave(&pflags);
memcg_lat_stat_end(MEM_LAT_MEMCG_DIRECT_RECLAIM, start);
pre_oom_leave();
if (mem_cgroup_margin(mem_over_limit) >= nr_pages)
goto retry;
@@ -6724,6 +6731,29 @@ static int memcg_pgtable_misplaced_write(struct cgroup_subsys_state *css,
}
#endif /* CONFIG_PGTABLE_BIND */
#ifdef CONFIG_PRE_OOM
static u64 memcg_oom_offline_read(struct cgroup_subsys_state *css,
				  struct cftype *cft)
{
	struct mem_cgroup *memcg = mem_cgroup_from_css(css);

	return READ_ONCE(memcg->oom_offline);
}

static int memcg_oom_offline_write(struct cgroup_subsys_state *css,
				   struct cftype *cft, u64 val)
{
	struct mem_cgroup *memcg = mem_cgroup_from_css(css);

	memcg->oom_offline = !!val;
	return 0;
}
#endif /* CONFIG_PRE_OOM */
#ifdef CONFIG_LRU_GEN
static bool mglru_size_valid_check(struct mem_cgroup *memcg)
{
@@ -7399,6 +7429,13 @@ static struct cftype mem_cgroup_legacy_files[] = {
.write_u64 = memcg_pgtable_misplaced_write,
.read_u64 = memcg_pgtable_misplaced_read,
},
#endif
#ifdef CONFIG_PRE_OOM
{
.name = "oom_offline",
.write_u64 = memcg_oom_offline_write,
.read_u64 = memcg_oom_offline_read,
},
#endif
{ }, /* terminate */
};
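
Because the entry is added to mem_cgroup_legacy_files, the knob surfaces as
memory.oom_offline in each cgroup v1 memory controller directory. A small
userspace sketch of marking a best-effort group; the mount point and the
"batch" group name are assumptions for illustration:

#include <stdio.h>

int main(void)
{
	/* Assumed v1 mount point and cgroup name, illustration only. */
	FILE *f = fopen("/sys/fs/cgroup/memory/batch/memory.oom_offline", "w");

	if (!f)
		return 1;
	/* 1: offline/best-effort group keeps doing direct reclaim. */
	fputs("1", f);
	return fclose(f) ? 1 : 0;
}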

mm/page_alloc.c

@@ -76,6 +76,7 @@
#include <linux/buffer_head.h>
#include <linux/vmalloc.h>
#include <linux/prezero.h>
#include <linux/pre_oom.h>
#include <asm/sections.h>
#include <asm/tlbflush.h>
@@ -4741,6 +4742,7 @@ __perform_reclaim(gfp_t gfp_mask, unsigned int order,
cond_resched();
/* We now go into synchronous reclaim */
pre_oom_enter();
cpuset_memory_pressure_bump();
memcg_lat_stat_start(&start);
psi_memstall_enter(&pflags);
@@ -4754,6 +4756,7 @@ __perform_reclaim(gfp_t gfp_mask, unsigned int order,
fs_reclaim_release(gfp_mask);
psi_memstall_leave(&pflags);
memcg_lat_stat_end(MEM_LAT_GLOBAL_DIRECT_RECLAIM, start);
pre_oom_leave();
cond_resched();
@@ -5029,7 +5032,6 @@ static inline struct page *
__alloc_pages_slowpath(gfp_t gfp_mask, unsigned int order,
struct alloc_context *ac)
{
- bool can_direct_reclaim = gfp_mask & __GFP_DIRECT_RECLAIM;
const bool costly_order = order > PAGE_ALLOC_COSTLY_ORDER;
struct page *page = NULL;
unsigned int alloc_flags;
@@ -5041,6 +5043,7 @@ __alloc_pages_slowpath(gfp_t gfp_mask, unsigned int order,
unsigned int cpuset_mems_cookie;
unsigned int zonelist_iter_cookie;
int reserve_flags;
+ bool can_direct_reclaim;
/*
* We also sanity check to catch abuse of atomic reserves being used by
@@ -5050,6 +5053,24 @@ __alloc_pages_slowpath(gfp_t gfp_mask, unsigned int order,
(__GFP_ATOMIC|__GFP_DIRECT_RECLAIM)))
gfp_mask &= ~__GFP_ATOMIC;
#ifdef CONFIG_PRE_OOM
	/*
	 * If pre-OOM is enabled, a QoS-sensitive cgroup should avoid
	 * direct reclaim and trigger OOM as soon as possible, so clear
	 * __GFP_DIRECT_RECLAIM from gfp_mask here.
	 */
	if (pre_oom_enabled()) {
		struct mem_cgroup *memcg;

		memcg = get_mem_cgroup_from_mm(current->mm);
		if (memcg) {
			if (!memcg->oom_offline)
				gfp_mask &= ~__GFP_DIRECT_RECLAIM;
			css_put(&memcg->css);
		}
	}
#endif
can_direct_reclaim = gfp_mask & __GFP_DIRECT_RECLAIM;
restart:
compaction_retries = 0;
no_progress_loops = 0;
@@ -5168,7 +5189,7 @@ retry:
/* Caller is not willing to reclaim, we can't balance anything */
if (!can_direct_reclaim)
- goto nopage;
+ goto oom;
/* Avoid recursion of direct reclaim */
if (current->flags & PF_MEMALLOC)
@@ -5222,6 +5243,7 @@ retry:
check_retry_zonelist(zonelist_iter_cookie))
goto restart;
oom:
/* Reclaim has failed us, start killing things */
page = __alloc_pages_may_oom(gfp_mask, order, ac, &did_some_progress);
if (page)

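Putting the page_alloc.c changes together: whether a task may still enter
direct reclaim now depends on both the global static key and its cgroup's
oom_offline bit, and a task that may not reclaim jumps to the new oom: label
instead of bailing out via nopage:. A standalone toy model of that decision;
MODEL_DIRECT_RECLAIM is a made-up stand-in for __GFP_DIRECT_RECLAIM, and none
of this is kernel code:

#include <stdbool.h>
#include <stdio.h>

#define MODEL_DIRECT_RECLAIM 0x400u	/* stand-in for __GFP_DIRECT_RECLAIM */

static bool model_can_direct_reclaim(unsigned int gfp, bool pre_oom_on,
				     bool oom_offline)
{
	/* QoS-sensitive (online) cgroups lose the direct-reclaim bit. */
	if (pre_oom_on && !oom_offline)
		gfp &= ~MODEL_DIRECT_RECLAIM;
	return gfp & MODEL_DIRECT_RECLAIM;
}

int main(void)
{
	/* Prints "1 1 0": only pre-OOM + online cgroup skips reclaim. */
	printf("%d %d %d\n",
	       model_can_direct_reclaim(MODEL_DIRECT_RECLAIM, false, false),
	       model_can_direct_reclaim(MODEL_DIRECT_RECLAIM, true, true),
	       model_can_direct_reclaim(MODEL_DIRECT_RECLAIM, true, false));
	return 0;
}
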
mm/pre_oom.c (new file, 158 lines)

@@ -0,0 +1,158 @@
// SPDX-License-Identifier: GPL-2.0
#include <linux/init.h>
#include <linux/fs.h>
#include <linux/sysfs.h>
#include <linux/kobject.h>
#include <linux/mm.h>
#include <linux/mm_types.h>
#include <linux/slab.h>
#include <linux/pre_oom.h>
DEFINE_STATIC_KEY_FALSE(pre_oom_enabled_key);
/*
 * Ranges from 0 to 3; the kernel allows up to
 * num_online_cpus() / (oom_level + 1) tasks to reclaim memory
 * concurrently.
 */
static int oom_level;
static struct semaphore *sem;
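/*
 * pre_oom_enter() returns 0 once the caller holds the throttling
 * semaphore, or -EINTR if a fatal signal arrives while waiting.  The
 * callers added by this patch ignore the error and proceed to reclaim
 * unthrottled; reclaim_stall then stays clear, so the paired
 * pre_oom_leave() is a no-op.
 */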
int pre_oom_enter(void)
{
	int result;

	if (!pre_oom_enabled())
		return 0;

	result = down_killable(sem);
	if (!result)
		current->reclaim_stall = 1;

	return result;
}
void pre_oom_leave(void)
{
	if (pre_oom_enabled() && current->reclaim_stall) {
		current->reclaim_stall = 0;
		up(sem);
	}
}
static int adjust_oom_level(int level)
{
	unsigned long flags;
	int count = num_online_cpus() / (level + 1);
	int result = 0;

	raw_spin_lock_irqsave(&sem->lock, flags);
	/* There are no other tasks reclaiming memory */
	if (sem->count == (num_online_cpus() / (oom_level + 1))) {
		sem->count = count;
		oom_level = level;
	} else {
		result = -EPERM;
	}
	raw_spin_unlock_irqrestore(&sem->lock, flags);

	return result;
}
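
/*
 * Worked example: with num_online_cpus() == 64, adjust_oom_level()
 * caps the number of concurrent direct reclaimers at
 *   level 0 -> 64 / 1 = 64 (the boot-time default)
 *   level 1 -> 64 / 2 = 32
 *   level 2 -> 64 / 3 = 21
 *   level 3 -> 64 / 4 = 16
 */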
#ifdef CONFIG_SYSFS
static ssize_t pre_oom_enabled_show(struct kobject *kobj,
				    struct kobj_attribute *attr, char *buf)
{
	return sprintf(buf, "%d\n", !!static_branch_unlikely(&pre_oom_enabled_key));
}

static ssize_t pre_oom_enabled_store(struct kobject *kobj,
				     struct kobj_attribute *attr,
				     const char *buf, size_t count)
{
	static DEFINE_MUTEX(mutex);
	ssize_t ret = count;

	mutex_lock(&mutex);
	if (!strncmp(buf, "1", 1))
		static_branch_enable(&pre_oom_enabled_key);
	else if (!strncmp(buf, "0", 1))
		static_branch_disable(&pre_oom_enabled_key);
	else
		ret = -EINVAL;
	mutex_unlock(&mutex);

	return ret;
}
static ssize_t pre_oom_level_show(struct kobject *kobj,
				  struct kobj_attribute *attr, char *buf)
{
	return sprintf(buf, "%d\n", oom_level);
}
static ssize_t pre_oom_level_store(struct kobject *kobj,
				   struct kobj_attribute *attr,
				   const char *buf, size_t count)
{
	static DEFINE_MUTEX(mutex);
	unsigned long level;
	int ret;

	ret = kstrtoul(buf, 10, &level);
	if (ret)
		return ret;
	/* Only levels 0..3 are meaningful; reject anything else. */
	if (level > 3)
		return -EINVAL;

	mutex_lock(&mutex);
	ret = adjust_oom_level(level);
	mutex_unlock(&mutex);

	return ret ? ret : count;
}
static struct kobj_attribute pre_oom_enabled_attr =
	__ATTR(enabled, 0644, pre_oom_enabled_show, pre_oom_enabled_store);
static struct kobj_attribute pre_oom_level_attr =
	__ATTR(level, 0644, pre_oom_level_show, pre_oom_level_store);

static struct attribute *pre_oom_attrs[] = {
	&pre_oom_enabled_attr.attr,
	&pre_oom_level_attr.attr,
	NULL,
};

static const struct attribute_group pre_oom_attr_group = {
	.attrs = pre_oom_attrs,
	.name = "pre_oom",
};
#endif /* CONFIG_SYSFS */
static int __init pre_oom_init(void)
{
	/* Allocate the semaphore before the sysfs knobs can be used. */
	sem = kmalloc(sizeof(*sem), GFP_KERNEL);
	if (!sem)
		return -ENOMEM;
	sema_init(sem, num_online_cpus());

#ifdef CONFIG_SYSFS
	{
		int err = sysfs_create_group(mm_kobj, &pre_oom_attr_group);

		if (err) {
			pr_err("pre_oom: register sysfs failed\n");
			kfree(sem);
			sem = NULL;
			return err;
		}
	}
#endif
	return 0;
}
subsys_initcall(pre_oom_init);
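
The attribute group is registered under mm_kobj with the group name
"pre_oom", so the knobs appear as /sys/kernel/mm/pre_oom/enabled and
/sys/kernel/mm/pre_oom/level. A userspace sketch of flipping them
(illustrative only; error handling trimmed to the essentials):

#include <stdio.h>

static int write_knob(const char *path, const char *val)
{
	FILE *f = fopen(path, "w");

	if (!f)
		return -1;
	fputs(val, f);
	return fclose(f);
}

int main(void)
{
	/* Enable the static key, then allow num_online_cpus()/4 reclaimers. */
	if (write_knob("/sys/kernel/mm/pre_oom/enabled", "1"))
		return 1;
	return write_knob("/sys/kernel/mm/pre_oom/level", "3") ? 1 : 0;
}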

mm/vmscan.c

@@ -61,6 +61,7 @@
#include <linux/shmem_fs.h>
#include <linux/ctype.h>
#include <linux/debugfs.h>
#include <linux/pre_oom.h>
#include <asm/tlbflush.h>
#include <asm/div64.h>
@@ -6964,6 +6965,7 @@ static int balance_pgdat(pg_data_t *pgdat, int order, int highest_zoneidx)
};
set_task_reclaim_state(current, &sc.reclaim_state);
pre_oom_enter();
psi_memstall_enter(&pflags);
__fs_reclaim_acquire();
@@ -7156,6 +7158,7 @@ out:
snapshot_refaults(NULL, pgdat);
__fs_reclaim_release();
psi_memstall_leave(&pflags);
pre_oom_leave();
set_task_reclaim_state(current, NULL);
/*