mm: pgtable: reclaim empty PTE page in madvise(MADV_DONTNEED)
ANBZ: #20722
commit 6375e95f38 upstream.
Nowadays, in pursuit of high performance, applications mostly use
high-performance user-mode memory allocators such as jemalloc or tcmalloc.
These allocators use madvise(MADV_DONTNEED or MADV_FREE) to release physical
memory, but neither MADV_DONTNEED nor MADV_FREE releases page table memory,
which may cause huge page table memory usage.
The following is a memory usage snapshot of one process, which actually
happened on one of our servers:

	VIRT:  55t
	RES:  590g
	VmPTE: 110g
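(For scale: with 4K pages each PTE page holds 512 entries and covers 2M of
virtual address space, so ~110g of VmPTE is consistent with page tables being
populated across roughly 55t of sparse virtual memory: 110G / 4K * 2M = 55T.)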
In this case, most of the page table entries are empty. For such a PTE
page where all entries are empty, we can actually free it back to the
system for others to use.
As a first step, this commit aims to synchronously free empty PTE pages in
the madvise(MADV_DONTNEED) case. Empty PTE pages are detected and freed in
zap_pte_range(), and zap_details.reclaim_pt is added to exclude cases other
than madvise(MADV_DONTNEED).
Once an empty PTE page is detected, we first try to take the pmd lock while
still holding the pte lock. If successful, we clear the pmd entry directly
(fast path). Otherwise, we wait until the pte lock is released, then re-take
the pmd and pte locks and loop PTRS_PER_PTE times checking pte_none() to
re-verify that the PTE page is still empty before freeing it (slow path).
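To make the lock ordering concrete, here is a small user-space sketch of the
same trylock fast path / retake-both-locks slow path, written with pthreads
purely for illustration (the mutexes and function names are stand-ins, not the
kernel implementation; the real code is in the zap_pte_range() and
pt_reclaim.c changes further down):

/* Illustrative user-space analogy only -- not part of the patch. */
#include <pthread.h>
#include <stdbool.h>
#include <stdio.h>

static pthread_mutex_t pmd_lock = PTHREAD_MUTEX_INITIALIZER;
static pthread_mutex_t pte_lock = PTHREAD_MUTEX_INITIALIZER;

/* Called with pte_lock held, after every entry was found empty. */
static bool fast_path_clear_pmd(void)
{
	/* Do not block on pmd_lock while pte_lock is held: trylock only. */
	if (pthread_mutex_trylock(&pmd_lock) != 0)
		return false;		/* contended: fall back to slow path */
	/* ... clear the pmd entry here ... */
	pthread_mutex_unlock(&pmd_lock);
	return true;
}

/* Called after pte_lock has been dropped. */
static void slow_path_free_pte(void)
{
	/* Re-take both locks in the normal order: pmd first, then pte. */
	pthread_mutex_lock(&pmd_lock);
	pthread_mutex_lock(&pte_lock);
	/* ... re-check that every PTE is still empty, then clear the pmd ... */
	pthread_mutex_unlock(&pte_lock);
	pthread_mutex_unlock(&pmd_lock);
}

int main(void)
{
	bool direct_reclaim;

	pthread_mutex_lock(&pte_lock);
	direct_reclaim = fast_path_clear_pmd();
	pthread_mutex_unlock(&pte_lock);

	if (!direct_reclaim)
		slow_path_free_pte();
	printf("reclaimed via %s path\n", direct_reclaim ? "fast" : "slow");
	return 0;
}

The trylock matters because the established nesting order is pmd lock outside,
pte lock inside (see the spin_lock_nested() in try_to_free_pte() below), so
taking the pmd lock while already holding the pte lock must never block.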
For other cases such as madvise(MADV_FREE), we may consider scanning and
freeing empty PTE pages asynchronously in the future.
The following test snippet shows the effect of the optimization:

	mmap 50G
	while (1) {
		for (; i < 1024 * 25; i++) {
			touch 2M memory
			madvise MADV_DONTNEED 2M
		}
	}
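A self-contained user-space version of that loop might look like the
following (illustrative only, not part of the patch; "touch 2M memory" is
taken to mean writing to each 2M chunk, and the constant names are
assumptions):

/* Illustrative stand-alone reproducer, not part of the patch. */
#define _GNU_SOURCE
#include <stdio.h>
#include <string.h>
#include <sys/mman.h>
#include <unistd.h>

#define CHUNK	(2UL << 20)		/* 2M: one PTE page worth of 4K pages */
#define NCHUNK	(1024UL * 25)		/* 25 * 1024 * 2M = 50G */

int main(void)
{
	char *buf = mmap(NULL, CHUNK * NCHUNK, PROT_READ | PROT_WRITE,
			 MAP_PRIVATE | MAP_ANONYMOUS | MAP_NORESERVE, -1, 0);
	if (buf == MAP_FAILED) {
		perror("mmap");
		return 1;
	}

	for (;;) {			/* loop forever, like the snippet above */
		for (unsigned long i = 0; i < NCHUNK; i++) {
			char *chunk = buf + i * CHUNK;

			memset(chunk, 1, CHUNK);		/* touch 2M memory */
			madvise(chunk, CHUNK, MADV_DONTNEED);	/* drop it again */
		}
		sleep(1);	/* inspect VmPTE in /proc/<pid>/status meanwhile */
	}
	return 0;
}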
As we can see, the memory usage of VmPTE is reduced:
	            before       after
	VIRT     50.0 GB     50.0 GB
	RES       3.1 MB      3.1 MB
	VmPTE  102640 KB      240 KB
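The "before" figure matches the expectation of one 4K PTE page per 2M chunk:
25 * 1024 chunks * 4 KB = 102400 KB, close to the 102640 KB observed; after
the patch only a handful of PTE pages remain.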
[Zelin Deng: change and export zap_page_range_single() in this patch so that
it can be used by external callers. This change was originally introduced by
commit 21b85b09527c ("madvise: use zap_page_range_single for madvise dontneed")]
[zhengqi.arch@bytedance.com: fix uninitialized symbol 'ptl']
Link: https://lkml.kernel.org/r/20241206112348.51570-1-zhengqi.arch@bytedance.com
Link: https://lore.kernel.org/linux-mm/224e6a4e-43b5-4080-bdd8-b0a6fb2f0853@stanley.mountain/
Link: https://lkml.kernel.org/r/92aba2b319a734913f18ba41e7d86a265f0b84e2.1733305182.git.zhengqi.arch@bytedance.com
Signed-off-by: Qi Zheng <zhengqi.arch@bytedance.com>
Cc: Andy Lutomirski <luto@kernel.org>
Cc: Catalin Marinas <catalin.marinas@arm.com>
Cc: Dave Hansen <dave.hansen@linux.intel.com>
Cc: David Hildenbrand <david@redhat.com>
Cc: David Rientjes <rientjes@google.com>
Cc: Hugh Dickins <hughd@google.com>
Cc: Jann Horn <jannh@google.com>
Cc: Lorenzo Stoakes <lorenzo.stoakes@oracle.com>
Cc: Matthew Wilcox <willy@infradead.org>
Cc: Mel Gorman <mgorman@suse.de>
Cc: Muchun Song <muchun.song@linux.dev>
Cc: Peter Xu <peterx@redhat.com>
Cc: Peter Zijlstra <peterz@infradead.org>
Cc: Will Deacon <will@kernel.org>
Cc: Zach O'Keefe <zokeefe@google.com>
Cc: Dan Carpenter <dan.carpenter@linaro.org>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Zelin Deng <zelin.deng@linux.alibaba.com>
Reviewed-by: Baolin Wang <baolin.wang@linux.alibaba.com>
Link: https://gitee.com/anolis/cloud-kernel/pulls/5202
parent 626a51aaea
commit bc307a6c2d
@@ -1857,6 +1857,7 @@ struct zap_details {
 	pgoff_t first_index;			/* Lowest page->index to unmap */
 	pgoff_t last_index;			/* Highest page->index to unmap */
 	struct page *single_page;		/* Locked page to be unmapped */
+	bool reclaim_pt;			/* Need reclaim page tables? */
 	unsigned int flags;			/* Flags to indicate pages to unmap */
 };
 
@@ -1869,6 +1870,8 @@ void zap_vma_ptes(struct vm_area_struct *vma, unsigned long address,
 		    unsigned long size);
 void zap_page_range(struct vm_area_struct *vma, unsigned long address,
 		    unsigned long size);
+void zap_page_range_single(struct vm_area_struct *vma, unsigned long address,
+		    unsigned long size, struct zap_details *details);
 void unmap_vmas(struct mmu_gather *tlb, struct vm_area_struct *start_vma,
 		unsigned long start, unsigned long end);
mm/Kconfig:
@@ -1042,4 +1042,19 @@ config PRE_OOM
 	help
 	  This feature is used to ensure that higher priority tasks would not enter the direct
 	  reclaim path when applying for memory allocation.
+
+config ARCH_SUPPORTS_PT_RECLAIM
+	def_bool n
+
+config PT_RECLAIM
+	bool "reclaim empty user page table pages"
+	default y
+	depends on ARCH_SUPPORTS_PT_RECLAIM && MMU && SMP
+	select MMU_GATHER_RCU_TABLE_FREE
+	help
+	  Try to reclaim empty user page table pages in paths other than munmap
+	  and exit_mmap path.
+
+	  Note: now only empty user PTE page table pages will be reclaimed.
+
 endmenu
@@ -139,3 +139,4 @@ obj-$(CONFIG_HAVE_BOOTMEM_INFO_NODE) += bootmem_info.o
 obj-$(CONFIG_PGTABLE_BIND) += pgtable_bind.o
 obj-$(CONFIG_PAGETABLE_SHARE) += pgtable_share.o
 obj-$(CONFIG_PRE_OOM) += pre_oom.o
+obj-$(CONFIG_PT_RECLAIM) += pt_reclaim.o
@@ -722,4 +722,22 @@ void vunmap_range_noflush(unsigned long start, unsigned long end);
 
 DECLARE_PER_CPU(struct per_cpu_nodestat, boot_nodestats);
 
+/* pt_reclaim.c */
+bool try_get_and_clear_pmd(struct mm_struct *mm, pmd_t *pmd, pmd_t *pmdval);
+void free_pte(struct mm_struct *mm, unsigned long addr, struct mmu_gather *tlb,
+	      pmd_t pmdval);
+void try_to_free_pte(struct mm_struct *mm, pmd_t *pmd, unsigned long addr,
+		     struct mmu_gather *tlb);
+
+#ifdef CONFIG_PT_RECLAIM
+bool reclaim_pt_is_enabled(unsigned long start, unsigned long end,
+			   struct zap_details *details);
+#else
+static inline bool reclaim_pt_is_enabled(unsigned long start, unsigned long end,
+					 struct zap_details *details)
+{
+	return false;
+}
+#endif /* CONFIG_PT_RECLAIM */
+
 #endif /* __MM_INTERNAL_H */
@@ -763,9 +763,13 @@ static int madvise_free_single_vma(struct vm_area_struct *vma,
 static long madvise_dontneed_single_vma(struct vm_area_struct *vma,
 					unsigned long start, unsigned long end)
 {
+	struct zap_details details = {
+		.reclaim_pt = true,
+	};
+
 	if (unlikely(vma_is_pgtable_shared(vma)))
 		return pgtable_share_dontneed_single_vma(vma, start, end);
-	zap_page_range(vma, start, end - start);
+	zap_page_range_single(vma, start, end - start, &details);
 	return 0;
 }

mm/memory.c:
@@ -1228,7 +1228,7 @@ copy_page_range(struct vm_area_struct *dst_vma, struct vm_area_struct *src_vma)
 static inline bool should_zap_cows(struct zap_details *details)
 {
 	/* By default, zap all pages */
-	if (!details)
+	if (!details || details->reclaim_pt)
 		return true;
 
 	/* Or, we zap COWed pages only if the caller wants to */

@@ -1371,6 +1371,10 @@ static unsigned long zap_pte_range(struct mmu_gather *tlb,
 	spinlock_t *ptl;
 	pte_t *start_pte;
 	pte_t *pte;
+	pmd_t pmdval;
+	unsigned long start = addr;
+	bool can_reclaim_pt = reclaim_pt_is_enabled(start, end, details);
+	bool direct_reclaim = false;
 
 	tlb_change_page_size(tlb, PAGE_SIZE);
 again:

@@ -1391,6 +1395,9 @@
 		}
 	} while (pte++, addr += PAGE_SIZE, addr != end);
 
+	if (can_reclaim_pt && addr == end)
+		direct_reclaim = try_get_and_clear_pmd(mm, pmd, &pmdval);
+
 	add_mm_rss_vec(mm, rss);
 	arch_leave_lazy_mmu_mode();

@@ -1417,6 +1424,13 @@
 		goto again;
 	}
 
+	if (can_reclaim_pt) {
+		if (direct_reclaim)
+			free_pte(mm, start, tlb, pmdval);
+		else
+			try_to_free_pte(mm, pmd, start, tlb);
+	}
+
 	return addr;
 }

@@ -1665,19 +1679,27 @@ EXPORT_SYMBOL_GPL(zap_page_range);
  *
  * The range must fit into one VMA.
  */
-static void zap_page_range_single(struct vm_area_struct *vma, unsigned long address,
-		unsigned long size, struct zap_details *details)
+void zap_page_range_single(struct vm_area_struct *vma, unsigned long address,
+		unsigned long size, struct zap_details *details)
 {
+	const unsigned long end = address + size;
 	struct mmu_notifier_range range;
 	struct mmu_gather tlb;
 
 	lru_add_drain();
 	mmu_notifier_range_init(&range, MMU_NOTIFY_CLEAR, 0, vma, vma->vm_mm,
-				address, address + size);
+				address, end);
 	if (is_vm_hugetlb_page(vma))
 		adjust_range_if_pmd_sharing_possible(vma, &range.start,
 						     &range.end);
 	tlb_gather_mmu(&tlb, vma->vm_mm, address, range.end);
 	update_hiwater_rss(vma->vm_mm);
 	mmu_notifier_invalidate_range_start(&range);
-	unmap_single_vma(&tlb, vma, address, range.end, details);
+	/*
+	 * unmap 'address-end' not 'range.start-range.end' as range
+	 * could have been expanded for hugetlb pmd sharing.
+	 */
+	unmap_single_vma(&tlb, vma, address, end, details);
 	mmu_notifier_invalidate_range_end(&range);
 	tlb_finish_mmu(&tlb, address, range.end);
 }
mm/pt_reclaim.c (new file):
@@ -0,0 +1,77 @@
+// SPDX-License-Identifier: GPL-2.0
+#include <linux/hugetlb.h>
+#include <asm-generic/tlb.h>
+#include <asm/pgalloc.h>
+#include <linux/pgtable.h>
+
+#include "internal.h"
+
+bool reclaim_pt_is_enabled(unsigned long start, unsigned long end,
+			   struct zap_details *details)
+{
+	return details && details->reclaim_pt && (end - start >= PMD_SIZE);
+}
+
+bool try_get_and_clear_pmd(struct mm_struct *mm, pmd_t *pmd, pmd_t *pmdval)
+{
+	spinlock_t *pml = pmd_lockptr(mm, pmd);
+
+	if (!spin_trylock(pml))
+		return false;
+
+	*pmdval = pmd_read_atomic(pmd);
+	pmd_clear(pmd);
+	spin_unlock(pml);
+
+	return true;
+}
+
+void free_pte(struct mm_struct *mm, unsigned long addr, struct mmu_gather *tlb,
+	      pmd_t pmdval)
+{
+	pte_free_tlb(tlb, pmd_pgtable(pmdval), addr);
+	mm_dec_nr_ptes(mm);
+}
+
+void try_to_free_pte(struct mm_struct *mm, pmd_t *pmd, unsigned long addr,
+		     struct mmu_gather *tlb)
+{
+	pmd_t pmdval;
+	spinlock_t *pml, *ptl = NULL;
+	pte_t *start_pte, *pte;
+	int i;
+
+	pml = pmd_lock(mm, pmd);
+	start_pte = pte_offset_map(pmd, addr);
+	if (!start_pte)
+		goto out_ptl;
+
+	ptl = pte_lockptr(mm, pmd);
+
+	if (ptl != pml)
+		spin_lock_nested(ptl, SINGLE_DEPTH_NESTING);
+
+	pmdval = pmd_read_atomic(pmd);
+
+	/* Check if it is empty PTE page */
+	for (i = 0, pte = start_pte; i < PTRS_PER_PTE; i++, pte++) {
+		if (!pte_none(ptep_get(pte)))
+			goto out_ptl;
+	}
+	pte_unmap(start_pte);
+
+	pmd_clear(pmd);
+
+	if (ptl != pml)
+		spin_unlock(ptl);
+	spin_unlock(pml);
+
+	free_pte(mm, addr, tlb, pmdval);
+
+	return;
+out_ptl:
+	if (start_pte)
+		pte_unmap_unlock(start_pte, ptl);
+	if (ptl != pml)
+		spin_unlock(pml);
+}