anolis: mm: support allocating page table pages bound to local NUMA node

ANBZ: #6618

Currently page table pages cannot be migrated through NUMA balancing. If
most of a process's page table pages are located on a remote NUMA node,
performance can degrade in some scenarios. This patch therefore provides a
way to allocate page table pages on the local NUMA node, using the reserved
memory range (__GFP_HIGH | __GFP_THISNODE) when the bind mode is enabled.

To switch on the pgtable bind feature globally:
echo 2 > /sys/kernel/mm/pgtable_bind/enabled

To enable only the misplaced page table page statistics:
echo 1 > /sys/kernel/mm/pgtable_bind/enabled

To enable the pgtable bind feature at the cgroup level:
echo 1 > /sys/fs/cgroup/memory/<online_memcg>/memory.pgtable_bind

To get the number of misplaced page table pages:
cat /sys/fs/cgroup/memory/<online_memcg>/memory.pgtable_misplaced
To reset the value of pgtable_misplaced:
echo 0 > /sys/fs/cgroup/memory/<online_memcg>/memory.pgtable_misplaced
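
Putting the knobs above together, a hypothetical user-space sketch (the
sysfs and memcg paths come from this patch; <online_memcg> remains a
placeholder for the target cgroup directory, and write_knob() is purely
illustrative):

	#include <fcntl.h>
	#include <stdio.h>
	#include <string.h>
	#include <unistd.h>

	/* Hypothetical helper: write a short string to a sysfs/cgroup file. */
	static int write_knob(const char *path, const char *val)
	{
		int fd = open(path, O_WRONLY);

		if (fd < 0)
			return -1;
		if (write(fd, val, strlen(val)) < 0) {
			close(fd);
			return -1;
		}
		return close(fd);
	}

	int main(void)
	{
		char buf[64] = { 0 };
		int fd;

		/* 2 = bind + statistics, 1 = statistics only, 0 = off. */
		write_knob("/sys/kernel/mm/pgtable_bind/enabled", "2");
		/* Opt the target memcg in; <online_memcg> is a placeholder. */
		write_knob("/sys/fs/cgroup/memory/<online_memcg>/memory.pgtable_bind", "1");

		/* Read back the misplaced page table page counter. */
		fd = open("/sys/fs/cgroup/memory/<online_memcg>/memory.pgtable_misplaced",
			  O_RDONLY);
		if (fd >= 0) {
			if (read(fd, buf, sizeof(buf) - 1) > 0)
				printf("pgtable_misplaced: %s", buf);
			close(fd);
		}
		return 0;
	}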

Signed-off-by: Kaihao Bai <carlo.bai@linux.alibaba.com>
Reviewed-by: Xu Yu <xuyu@linux.alibaba.com>
Reviewed-by: Guixin Liu <kanie@linux.alibaba.com>
Link: https://gitee.com/anolis/cloud-kernel/pulls/2254
Kaihao Bai 2023-10-07 13:54:27 +08:00 committed by 小龙
parent e17630f874
commit deaf2108f0
12 changed files with 301 additions and 2 deletions


@ -10,6 +10,8 @@
#include <linux/gfp.h>
#include <linux/highmem.h>
#include <linux/slab.h>
#include <linux/memcontrol.h>
#include <linux/pgtable_bind.h>
#include <asm/pgalloc.h>
#include <asm/page.h>
@ -21,6 +23,27 @@ pgd_t *pgd_alloc(struct mm_struct *mm)
{
	gfp_t gfp = GFP_PGTABLE_USER;

#ifdef CONFIG_PGTABLE_BIND
	if (pgtable_stat_enabled()) {
		struct mem_cgroup *memcg;
		bool pgtable_alloc = false;

		memcg = get_mem_cgroup_from_mm(mm);
		if (memcg) {
			pgtable_alloc = memcg->allow_pgtable_bind;
			css_put(&memcg->css);
		}

		/* Only target user processes. */
		if (pgtable_alloc) {
			gfp |= __GFP_PGTABLE;
			if (pgtable_bind_enabled())
				gfp |= __GFP_HIGH | __GFP_THISNODE;
		}
	}
#endif

	if (PGD_SIZE == PAGE_SIZE)
		return (pgd_t *)__get_free_page(gfp);
	else


@ -2,6 +2,9 @@
#ifndef __ASM_GENERIC_PGALLOC_H
#define __ASM_GENERIC_PGALLOC_H
#include <linux/memcontrol.h>
#include <linux/pgtable_bind.h>
#ifdef CONFIG_MMU
#define GFP_PGTABLE_KERNEL (GFP_KERNEL | __GFP_ZERO | __GFP_NOKFENCE)
@ -60,6 +63,27 @@ static inline pgtable_t __pte_alloc_one(struct mm_struct *mm, gfp_t gfp)
{
	struct page *pte;

#ifdef CONFIG_PGTABLE_BIND
	if (pgtable_stat_enabled()) {
		struct mem_cgroup *memcg;
		bool pgtable_alloc = false;

		memcg = get_mem_cgroup_from_mm(mm);
		if (memcg) {
			pgtable_alloc = memcg->allow_pgtable_bind;
			css_put(&memcg->css);
		}

		/* Only target user processes. */
		if (pgtable_alloc) {
			gfp |= __GFP_PGTABLE;
			if (pgtable_bind_enabled())
				gfp |= __GFP_HIGH | __GFP_THISNODE;
		}
	}
#endif

	pte = alloc_page(gfp);
	if (!pte)
		return NULL;
@ -121,6 +145,27 @@ static inline pmd_t *pmd_alloc_one(struct mm_struct *mm, unsigned long addr)
	struct page *page;
	gfp_t gfp = GFP_PGTABLE_USER;

#ifdef CONFIG_PGTABLE_BIND
	if (pgtable_stat_enabled()) {
		struct mem_cgroup *memcg;
		bool pgtable_alloc = false;

		memcg = get_mem_cgroup_from_mm(mm);
		if (memcg) {
			pgtable_alloc = memcg->allow_pgtable_bind;
			css_put(&memcg->css);
		}

		/* Only target user processes. */
		if (pgtable_alloc) {
			gfp |= __GFP_PGTABLE;
			if (pgtable_bind_enabled())
				gfp |= __GFP_HIGH | __GFP_THISNODE;
		}
	}
#endif

	if (mm == &init_mm)
		gfp = GFP_PGTABLE_KERNEL;
	page = alloc_pages(gfp, 0);
@ -161,6 +206,27 @@ static inline pud_t *pud_alloc_one(struct mm_struct *mm, unsigned long addr)
{
	gfp_t gfp = GFP_PGTABLE_USER;

#ifdef CONFIG_PGTABLE_BIND
	if (pgtable_stat_enabled()) {
		struct mem_cgroup *memcg;
		bool pgtable_alloc = false;

		memcg = get_mem_cgroup_from_mm(mm);
		if (memcg) {
			pgtable_alloc = memcg->allow_pgtable_bind;
			css_put(&memcg->css);
		}

		/* Only target user processes. */
		if (pgtable_alloc) {
			gfp |= __GFP_PGTABLE;
			if (pgtable_bind_enabled())
				gfp |= __GFP_HIGH | __GFP_THISNODE;
		}
	}
#endif

	if (mm == &init_mm)
		gfp = GFP_PGTABLE_KERNEL;
	return (pud_t *)get_zeroed_page(gfp);


@ -44,6 +44,7 @@ struct vm_area_struct;
#else
#define ___GFP_NOLOCKDEP 0
#endif
#define ___GFP_PGTABLE 0x4000000u
#define ___GFP_NOKFENCE 0x8000000u
/* If the above are modified, __GFP_BITS_SHIFT may need updating */
@ -87,6 +88,8 @@ struct vm_area_struct;
*
* %__GFP_ACCOUNT causes the allocation to be accounted to kmemcg.
*
* %__GFP_PGTABLE indicates the allocation of page table pages.
*
* %__GFP_NOKFENCE means do not try to allocate the page from the kfence pool.
*/
#define __GFP_RECLAIMABLE ((__force gfp_t)___GFP_RECLAIMABLE)
@ -94,6 +97,7 @@ struct vm_area_struct;
#define __GFP_HARDWALL ((__force gfp_t)___GFP_HARDWALL)
#define __GFP_THISNODE ((__force gfp_t)___GFP_THISNODE)
#define __GFP_ACCOUNT ((__force gfp_t)___GFP_ACCOUNT)
#define __GFP_PGTABLE ((__force gfp_t)___GFP_PGTABLE)
#define __GFP_NOKFENCE ((__force gfp_t)___GFP_NOKFENCE)
/**


@ -520,6 +520,11 @@ struct mem_cgroup {
	struct lru_gen_mm_list mm_list;
#endif

#ifdef CONFIG_PGTABLE_BIND
	unsigned long pgtable_misplaced;
	bool allow_pgtable_bind;
#endif

	CK_KABI_RESERVE(1)
	CK_KABI_RESERVE(2)
	CK_KABI_RESERVE(3)


@ -0,0 +1,32 @@
/* SPDX-License-Identifier: GPL-2.0 */
#ifndef _LINUX_PGTABLE_BIND_H_
#define _LINUX_PGTABLE_BIND_H_

#include <linux/types.h>
#include <linux/jump_label.h>

#ifdef CONFIG_PGTABLE_BIND
DECLARE_STATIC_KEY_FALSE(pgtable_bind_enabled_key);
DECLARE_STATIC_KEY_FALSE(pgtable_stat_enabled_key);

static inline bool pgtable_bind_enabled(void)
{
	return static_key_enabled(&pgtable_bind_enabled_key);
}

static inline bool pgtable_stat_enabled(void)
{
	return static_key_enabled(&pgtable_stat_enabled_key);
}
#else
static inline bool pgtable_bind_enabled(void)
{
	return false;
}

static inline bool pgtable_stat_enabled(void)
{
	return false;
}
#endif

#endif /* _LINUX_PGTABLE_BIND_H_ */


@ -48,7 +48,8 @@
{(unsigned long)__GFP_WRITE, "__GFP_WRITE"}, \
{(unsigned long)__GFP_RECLAIM, "__GFP_RECLAIM"}, \
{(unsigned long)__GFP_DIRECT_RECLAIM, "__GFP_DIRECT_RECLAIM"},\
{(unsigned long)__GFP_KSWAPD_RECLAIM, "__GFP_KSWAPD_RECLAIM"}\
{(unsigned long)__GFP_KSWAPD_RECLAIM, "__GFP_KSWAPD_RECLAIM"},\
{(unsigned long)__GFP_PGTABLE, "__GFP_PGTABLE"} \
#define show_gfp_flags(flags) \
(flags) ? __print_flags(flags, "|", \


@ -982,6 +982,18 @@ config PAGECACHE_LIMIT
	  If unsure, say N.

config PGTABLE_BIND
	bool "Enable page table allocation bound to CPUs and misplaced statistics"
	depends on MEMCG
	default n
	help
	  This feature addresses the problem that page table pages cannot be
	  migrated through NUMA balancing. If those pages sit on a remote NUMA
	  node, performance can degrade in some scenarios. This option provides
	  a way to allocate page table pages on the local NUMA node, using the
	  reserved memory range.

	  If unsure, say N.

# multi-gen LRU {
config LRU_GEN
	bool "Multi-Gen LRU"


@ -136,3 +136,4 @@ obj-$(CONFIG_ASYNC_FORK) += async_fork.o
obj-$(CONFIG_PAGE_PREZERO) += prezero.o
obj-$(CONFIG_PAGECACHE_LIMIT) += pagecache_limit.o
obj-$(CONFIG_HAVE_BOOTMEM_INFO_NODE) += bootmem_info.o
obj-$(CONFIG_PGTABLE_BIND) += pgtable_bind.o


@ -6615,6 +6615,51 @@ static int mem_cgroup_allow_pgcache_sync_write(struct cgroup_subsys_state *css,
}
#endif /* CONFIG_PAGECACHE_LIMIT */
#ifdef CONFIG_PGTABLE_BIND
static u64 memcg_pgtable_bind_read(struct cgroup_subsys_state *css,
				   struct cftype *cft)
{
	struct mem_cgroup *memcg = mem_cgroup_from_css(css);

	return READ_ONCE(memcg->allow_pgtable_bind);
}

static int memcg_pgtable_bind_write(struct cgroup_subsys_state *css,
				    struct cftype *cft, u64 val)
{
	struct mem_cgroup *memcg = mem_cgroup_from_css(css);

	if (val)
		memcg->allow_pgtable_bind = true;
	else
		memcg->allow_pgtable_bind = false;

	return 0;
}

static u64 memcg_pgtable_misplaced_read(struct cgroup_subsys_state *css,
					struct cftype *cft)
{
	struct mem_cgroup *memcg = mem_cgroup_from_css(css);

	return READ_ONCE(memcg->pgtable_misplaced);
}

static int memcg_pgtable_misplaced_write(struct cgroup_subsys_state *css,
					 struct cftype *cft, u64 val)
{
	struct mem_cgroup *memcg = mem_cgroup_from_css(css);

	if (val)
		return -EINVAL;

	/* reset the stat of current memcg */
	memcg->pgtable_misplaced = 0;

	return 0;
}
#endif /* CONFIG_PGTABLE_BIND */
#ifdef CONFIG_TRANSPARENT_HUGEPAGE
static int memcg_thp_reclaim_show(struct seq_file *m, void *v)
{
@ -7208,7 +7253,18 @@ static struct cftype mem_cgroup_legacy_files[] = {
		.write = memcg_thp_control_write,
	},
#endif
#ifdef CONFIG_PGTABLE_BIND
	{
		.name = "pgtable_bind",
		.write_u64 = memcg_pgtable_bind_write,
		.read_u64 = memcg_pgtable_bind_read,
	},
	{
		.name = "pgtable_misplaced",
		.write_u64 = memcg_pgtable_misplaced_write,
		.read_u64 = memcg_pgtable_misplaced_read,
	},
#endif
	{ },	/* terminate */
};


@ -4196,6 +4196,24 @@ try_this_zone:
		if (page) {
			prep_new_page(page, order, gfp_mask, alloc_flags);

#ifdef CONFIG_PGTABLE_BIND
			/*
			 * If the allocated page belongs to a remote NUMA
			 * node, bump memcg->pgtable_misplaced to record how
			 * many page table pages came from a remote node.
			 */
			if ((gfp_mask & __GFP_PGTABLE) &&
			    (zone_to_nid(ac->preferred_zoneref->zone) != zone_to_nid(zone))) {
				struct mem_cgroup *memcg;

				memcg = get_mem_cgroup_from_mm(current->mm);
				if (memcg) {
					memcg->pgtable_misplaced++;
					css_put(&memcg->css);
				}
			}
#endif

			/*
			 * If this is a high-order atomic allocation then check
			 * if the pageblock should be reserved for the future
@ -5280,6 +5298,16 @@ __alloc_pages_nodemask(gfp_t gfp_mask, unsigned int order, int preferred_nid,
	 */
	ac.nodemask = nodemask;

	/*
	 * Clear the __GFP_THISNODE restriction if the current allocation is
	 * for page table pages, so the slowpath may fall back to other nodes.
	 */
	if (gfp_mask & __GFP_PGTABLE) {
		gfp_mask &= ~__GFP_THISNODE;
		alloc_mask &= ~__GFP_THISNODE;
		ac.zonelist = node_zonelist(preferred_nid, gfp_mask);
	}

	page = __alloc_pages_slowpath(alloc_mask, order, &ac);

out:

mm/pgtable_bind.c (new file, 70 lines)

@ -0,0 +1,70 @@
// SPDX-License-Identifier: GPL-2.0
#include <linux/init.h>
#include <linux/fs.h>
#include <linux/sysfs.h>
#include <linux/kobject.h>
#include <linux/mm.h>
#include <linux/mm_types.h>
#include <linux/pgtable_bind.h>

#ifdef CONFIG_PGTABLE_BIND
#ifdef CONFIG_SYSFS
DEFINE_STATIC_KEY_FALSE(pgtable_bind_enabled_key);
DEFINE_STATIC_KEY_FALSE(pgtable_stat_enabled_key);

static ssize_t pgtable_bind_enabled_show(struct kobject *kobj,
					 struct kobj_attribute *attr, char *buf)
{
	return sprintf(buf, "%d\n", !!static_key_enabled(&pgtable_bind_enabled_key) +
		       !!static_key_enabled(&pgtable_stat_enabled_key));
}

static ssize_t pgtable_bind_enabled_store(struct kobject *kobj,
					  struct kobj_attribute *attr,
					  const char *buf, size_t count)
{
	static DEFINE_MUTEX(mutex);
	ssize_t ret = count;

	mutex_lock(&mutex);
	if (!strncmp(buf, "2", 1)) {
		static_branch_enable(&pgtable_bind_enabled_key);
		static_branch_enable(&pgtable_stat_enabled_key);
	} else if (!strncmp(buf, "1", 1)) {
		static_branch_disable(&pgtable_bind_enabled_key);
		static_branch_enable(&pgtable_stat_enabled_key);
	} else if (!strncmp(buf, "0", 1)) {
		static_branch_disable(&pgtable_bind_enabled_key);
		static_branch_disable(&pgtable_stat_enabled_key);
	}
	mutex_unlock(&mutex);

	return ret;
}

static struct kobj_attribute pgtable_bind_enabled_attr =
	__ATTR(enabled, 0644, pgtable_bind_enabled_show,
	       pgtable_bind_enabled_store);

static struct attribute *pgtable_bind_attrs[] = {
	&pgtable_bind_enabled_attr.attr,
	NULL,
};

static const struct attribute_group pgtable_bind_attr_group = {
	.attrs = pgtable_bind_attrs,
	.name = "pgtable_bind",
};

static int __init pgtable_bind_init(void)
{
	int ret;

	ret = sysfs_create_group(mm_kobj, &pgtable_bind_attr_group);
	if (ret)
		pr_err("pgtable_bind: register sysfs failed\n");

	return ret;
}
subsys_initcall(pgtable_bind_init);
#endif /* CONFIG_SYSFS */
#endif /* CONFIG_PGTABLE_BIND */


@ -660,6 +660,7 @@ static const struct {
{ "__GFP_RECLAIM", "R" },
{ "__GFP_DIRECT_RECLAIM", "DR" },
{ "__GFP_KSWAPD_RECLAIM", "KR" },
{ "__GFP_PGTABLE", "PT" },
};
static size_t max_gfp_len;