anolis: mm/memory_hotplug: support parallel deferred memory online
ANBZ: #18841

Memory hotplug is a serial process that adds memory to Linux at the
granularity of memory blocks. We identified two memory initialization
functions that consume significant time when onlining memory blocks:

- `__init_single_page`: initialize the struct page
- `__free_pages_core`: add the page to the buddy allocator

We first tried executing these two functions in parallel while
hotplugging a single memory block. With a 1GB memory block size, this
sped up hotplug by approximately 200%. With the more commonly used
128MB block size, however, it was actually slower than serial
execution, so improving hotplug speed for 128MB blocks remained a
challenge.

The idea here is:

- Defer these two functions and their associated processing to the
  final phase of the entire hotplug operation, so that hotplug speed
  is no longer limited by the memory block size.
- Execute that final phase in parallel, which the earlier experiments
  showed accelerates the hotplug process.

We introduce a new online function, `deferred_online_memory`, which
defers the actual onlining of memory blocks. We also add a
command-line argument, parallel_hotplug_ratio, which sets the ratio
of parallel workers to the number of CPUs on the node. When
parallel_hotplug_ratio is 0, the memory online process is not
deferred.

Signed-off-by: Yang Rong <youngrong@linux.alibaba.com>
Reviewed-by: Baolin Wang <baolin.wang@linux.alibaba.com>
Link: https://gitee.com/anolis/cloud-kernel/pulls/4622
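Usage note (hedged): parallel_hotplug_ratio is declared with module_param() in the built-in mm/memory_hotplug.c, so it should be reachable under the memory_hotplug namespace; the exact paths below follow that convention and are an assumption, not taken from this patch.

    # Boot time: allow up to half of the node's CPUs as parallel workers
    memory_hotplug.parallel_hotplug_ratio=50

    # Runtime equivalent
    echo 50 > /sys/module/memory_hotplug/parameters/parallel_hotplug_ratio

With ratio 50, a 64-CPU node gets up to 64 * 50 / 100 = 32 workers; ratio 0 disables deferral and keeps the fully serial path.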
parent 05a35a41ac, commit 8dc3571250
drivers/base/memory.c
@@ -198,8 +198,22 @@ static int memory_block_online(struct memory_block *mem)
 		if (ret)
 			return ret;
 	}
 
-	ret = online_pages(start_pfn + nr_vmemmap_pages,
-			   nr_pages - nr_vmemmap_pages, zone, mem->group);
+	/*
+	 * Defer struct pages initialization and defer freeing pages to buddy
+	 * allocator starting from at least the second memory block of the zone,
+	 * as rebuilding the zone is not required from that point onwards.
+	 */
+	if (parallel_hotplug_ratio &&
+	    start_pfn + nr_vmemmap_pages >= zone->zone_start_pfn +
+		    (memory_block_size_bytes() >> PAGE_SHIFT)) {
+		ret = __online_pages(start_pfn + nr_vmemmap_pages,
+				     nr_pages - nr_vmemmap_pages, zone,
+				     mem->group, MHP_PHASE_PREPARE);
+		atomic_set(&mem->deferred_state, MEM_NEED_DEFER);
+		mem->deferred_zone = zone;
+	} else
+		ret = online_pages(start_pfn + nr_vmemmap_pages,
+				   nr_pages - nr_vmemmap_pages, zone, mem->group);
 	if (ret) {
 		if (nr_vmemmap_pages)
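To make the gating condition in this hunk concrete: only a block that is not the first block of its zone may be deferred. A minimal sketch, assuming 128MB blocks and 4KB pages (the numbers are illustrative, not from the patch):

	/* One 128MB block spans 32768 PFNs with 4KB pages. */
	unsigned long block_pfns = memory_block_size_bytes() >> PAGE_SHIFT;

	/*
	 * The zone's first block (start_pfn == zone->zone_start_pfn) fails
	 * this test and is onlined synchronously, because onlining it may
	 * still have to (re)build the zone; every later block passes and
	 * can be deferred.
	 */
	bool defer = parallel_hotplug_ratio &&
		     start_pfn + nr_vmemmap_pages >=
			     zone->zone_start_pfn + block_pfns;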
@@ -286,7 +300,9 @@ static int memory_block_change_state(struct memory_block *mem,
 	mem->state = MEM_GOING_OFFLINE;
 
 	ret = memory_block_action(mem, to_state);
-	mem->state = ret ? from_state_req : to_state;
+	mem->state =
+		(ret || atomic_read(&mem->deferred_state) == MEM_NEED_DEFER) ?
+			from_state_req : to_state;
 
 	return ret;
 }
@@ -675,6 +691,8 @@ static int init_memory_block(unsigned long block_id, unsigned long state,
 	mem->state = state;
 	mem->nid = NUMA_NO_NODE;
 	mem->nr_vmemmap_pages = nr_vmemmap_pages;
+	atomic_set(&mem->deferred_state, MEM_SKIP_DEFER);
+	mem->deferred_zone = NULL;
 	INIT_LIST_HEAD(&mem->group_next);
 
 	if (group) {
include/linux/memory.h
@@ -65,6 +65,10 @@ struct memory_group {
 	};
 };
 
+/* Memory block defer state flags */
+#define MEM_SKIP_DEFER	0
+#define MEM_NEED_DEFER	1
+
 struct memory_block {
 	unsigned long start_section_nr;
 	unsigned long state;		/* serialized by the dev->lock */
@@ -76,6 +80,12 @@ struct memory_block {
 	 * lay at the beginning of the memory block.
 	 */
 	unsigned long nr_vmemmap_pages;
+	/*
+	 * Whether struct page initialization and freeing pages to the
+	 * buddy allocator need to be deferred or not.
+	 */
+	atomic_t deferred_state;
+	struct zone *deferred_zone;	/* zone for this deferred block */
 	struct memory_group *group;	/* group (if any) for this block */
 	struct list_head group_next;	/* next block inside memory group */
 };
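These two flags form a one-shot latch per block: a worker claims a block by flipping MEM_NEED_DEFER back to MEM_SKIP_DEFER, so even when several workers walk adjacent ranges, each block is finalized at most once. The claim looks like this (the same pattern deferred_memory_block_online_pages() uses later in this diff):

	/* Only the atomic_cmpxchg() winner onlines this block. */
	if (atomic_cmpxchg(&mem->deferred_state,
			   MEM_NEED_DEFER, MEM_SKIP_DEFER) != MEM_NEED_DEFER)
		return 0;	/* never deferred, or already claimed */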
include/linux/memory_hotplug.h
@@ -183,6 +183,9 @@ extern int mhp_init_memmap_on_memory(unsigned long pfn, unsigned long nr_pages,
 extern void mhp_deinit_memmap_on_memory(unsigned long pfn, unsigned long nr_pages);
 extern int online_pages(unsigned long pfn, unsigned long nr_pages,
 			struct zone *zone, struct memory_group *group);
+extern int __online_pages(unsigned long pfn, unsigned long nr_pages,
+			  struct zone *zone, struct memory_group *group,
+			  int phase);
 extern struct zone *test_pages_in_a_zone(unsigned long start_pfn,
 					 unsigned long end_pfn);
 extern void __offline_isolated_pages(unsigned long start_pfn,
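The MHP_PHASE_* constants taken by __online_pages() are not defined in this excerpt. Inferred from their use in the diff, they behave like a three-stage flag; the reconstruction below is an assumption (names from the patch, values guessed):

	/*
	 * MHP_PHASE_DEFAULT  - fully synchronous online (legacy behavior)
	 * MHP_PHASE_PREPARE  - associate the range with its zone and account
	 *                      present pages, but skip struct page init and
	 *                      freeing pages to the buddy allocator
	 * MHP_PHASE_DEFERRED - finish the work PREPARE skipped, in parallel
	 */
	#define MHP_PHASE_DEFAULT	0
	#define MHP_PHASE_PREPARE	1
	#define MHP_PHASE_DEFERRED	2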
@@ -203,6 +206,7 @@ extern u64 max_mem_size;
 extern int memhp_online_type_from_str(const char *str);
 
 extern bool skip_set_contiguous;
+extern unsigned int parallel_hotplug_ratio;
 /* Default online_type (MMOP_*) when new memory blocks are added. */
 extern int memhp_default_online_type;
 /* If movable_node boot option specified */
@@ -359,6 +363,7 @@ extern struct zone *zone_for_pfn_range(int online_type, int nid,
 				       struct memory_group *group, unsigned long start_pfn,
 				       unsigned long nr_pages);
 extern bool mhp_supports_memmap_on_memory(unsigned long size);
+extern int deferred_online_memory(int nid, u64 start, u64 size);
 #endif /* CONFIG_MEMORY_HOTPLUG */
 
 #endif /* __LINUX_MEMORY_HOTPLUG_H */
include/linux/mmzone.h
@@ -1052,6 +1052,13 @@ typedef struct pglist_data {
 	 * Nests above zone->lock and zone->span_seqlock
 	 */
 	spinlock_t node_size_lock;
 #endif
+#ifdef CONFIG_MEMORY_HOTPLUG
+	/*
+	 * This workqueue is used to handle deferred pages
+	 * initialization of hotplugged memory.
+	 */
+	struct workqueue_struct *deferred_hotplug_wq;
+#endif
 	unsigned long node_start_pfn;
 	unsigned long node_present_pages; /* total number of physical pages */
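This hunk only adds the field; the allocation of deferred_hotplug_wq happens elsewhere in the series. A minimal sketch of a plausible per-node initialization, assuming an unbound workqueue so queue_work_node() can place workers on the target node (the name and flags are assumptions, not from the patch):

	/* Hypothetical node-init code, not part of this diff. */
	pgdat->deferred_hotplug_wq =
		alloc_workqueue("deferred_hotplug", WQ_UNBOUND, 0);
	if (!pgdat->deferred_hotplug_wq)
		pr_warn("node %d: no deferred hotplug workqueue, online stays serial\n",
			pgdat->node_id);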
mm/memory_hotplug.c
@@ -122,6 +122,13 @@ bool skip_set_contiguous __read_mostly;
 module_param(skip_set_contiguous, bool, 0644);
 MODULE_PARM_DESC(skip_set_contiguous, "Do not set zone contiguous when online/offline pages");
 
+unsigned int parallel_hotplug_ratio __read_mostly;
+EXPORT_SYMBOL_GPL(parallel_hotplug_ratio);
+module_param(parallel_hotplug_ratio, uint, 0644);
+MODULE_PARM_DESC(parallel_hotplug_ratio,
+		 "Set the ratio of parallel hotplug workers to the number of CPUs on "
+		 "the node, with values constrained between 0 and 100. Default: 0");
+
 /*
  * memory_hotplug.auto_movable_numa_aware: consider numa node stats
  */
@@ -1107,8 +1114,13 @@ int __ref __online_pages(unsigned long pfn, unsigned long nr_pages,
 	/* associate pfn range with the zone */
 	__move_pfn_range_to_zone(zone, pfn, nr_pages, NULL, MIGRATE_ISOLATE, phase);
 
-	if (phase == MHP_PHASE_PREPARE)
-		goto adjust_count;
+	if (phase == MHP_PHASE_PREPARE) {
+		__adjust_present_page_count(pfn_to_page(pfn), group, nr_pages,
+					    zone, phase);
+		atomic_long_add(nr_pages, &zone->deferred_pages);
+		mem_hotplug_done();
+		return 0;
+	}
 
 	arg.start_pfn = pfn;
 	arg.nr_pages = nr_pages;
@@ -1139,12 +1151,8 @@ int __ref __online_pages(unsigned long pfn, unsigned long nr_pages,
 
 	online_pages_range(pfn, nr_pages);
 
-adjust_count:
 	__adjust_present_page_count(pfn_to_page(pfn), group, nr_pages, zone, phase);
-	if (phase == MHP_PHASE_PREPARE) {
-		atomic_long_add(nr_pages, &zone->deferred_pages);
-		goto out;
-	} else if (phase == MHP_PHASE_DEFERRED)
+	if (phase == MHP_PHASE_DEFERRED)
 		atomic_long_sub(nr_pages, &zone->deferred_pages);
 
 	node_states_set_node(nid, &arg);
@@ -1173,7 +1181,6 @@ adjust_count:
 
 	memory_notify(MEM_ONLINE, &arg);
 
-out:
 	if (need_lock)
 		mem_hotplug_done();
 	return 0;
@@ -1194,6 +1201,132 @@ int __ref online_pages(unsigned long pfn, unsigned long nr_pages,
 {
 	return __online_pages(pfn, nr_pages, zone, group, MHP_PHASE_DEFAULT);
 }
 
+static int deferred_memory_block_online_pages(struct memory_block *mem,
+					      void *arg)
+{
+	unsigned long start_pfn, nr_pages;
+	unsigned long nr_vmemmap_pages;
+	struct zone *zone;
+	int ret;
+
+	/* Only continue if struct page initialization needs to be deferred */
+	if (memhp_default_online_type == MMOP_OFFLINE ||
+	    mem->state == MEM_ONLINE || !mem->deferred_zone ||
+	    atomic_cmpxchg(&mem->deferred_state, MEM_NEED_DEFER,
+			   MEM_SKIP_DEFER) != MEM_NEED_DEFER)
+		return 0;
+
+	zone = mem->deferred_zone;
+	mem->deferred_zone = NULL;
+
+	start_pfn = section_nr_to_pfn(mem->start_section_nr);
+	nr_pages = memory_block_size_bytes() >> PAGE_SHIFT;
+	nr_vmemmap_pages = mem->nr_vmemmap_pages;
+
+	ret = __online_pages(start_pfn + nr_vmemmap_pages,
+			     nr_pages - nr_vmemmap_pages, zone, mem->group,
+			     MHP_PHASE_DEFERRED);
+	if (ret) {
+		if (nr_vmemmap_pages)
+			mhp_deinit_memmap_on_memory(start_pfn,
+						    nr_vmemmap_pages);
+		return ret;
+	}
+
+	mem->state = MEM_ONLINE;
+	return 0;
+}
+
+struct deferred_walk_memory_blocks_work {
+	struct work_struct work;
+	u64 start;
+	u64 size;
+	int ret;
+};
+
+static void deferred_walk_memory_blocks_worker(struct work_struct *work)
+{
+	struct deferred_walk_memory_blocks_work *w = container_of(
+		work, struct deferred_walk_memory_blocks_work, work);
+
+	w->ret = walk_memory_blocks(w->start, w->size, NULL,
+				    deferred_memory_block_online_pages);
+}
+
+int __ref deferred_online_memory(int nid, u64 start, u64 size)
+{
+	struct pglist_data *pgdat = NODE_DATA(nid);
+	int i, ret = 0;
+	struct workqueue_struct *wq;
+	struct deferred_walk_memory_blocks_work *ws, *w;
+	const struct cpumask *cpumask;
+	u64 chunk_start = start;
+	u64 chunk_size, chunk_num, chunk_remain;
+
+	if (!parallel_hotplug_ratio)
+		return -EINVAL;
+
+	wq = pgdat->deferred_hotplug_wq;
+	if (!wq) {
+		pr_warn("Deferred hotplug work queue is not initialized for node %d\n",
+			nid);
+		goto sequential;
+	}
+
+	cpumask = cpumask_of_node(nid);
+	/*
+	 * The number of parallel workers (chunk_num) should be less than
+	 * or equal to the maximum number of CPUs on the node.
+	 * And the memory size handled by each worker needs to be aligned
+	 * with the memory block size.
+	 */
+	chunk_num =
+		max_t(uint, 1,
+		      max_t(uint, cpumask_weight(cpumask), 1) *
			      min_t(uint, parallel_hotplug_ratio, 100) / 100);
+	chunk_size = ALIGN(size / chunk_num, memory_block_size_bytes());
+	chunk_num = size / chunk_size;
+	chunk_remain = size % chunk_size;
+
+	if (chunk_num == 1)
+		goto sequential;
+
+	ws = kmalloc_array_node(chunk_num, sizeof(*ws), GFP_KERNEL, nid);
+	if (!ws)
+		goto sequential;
+
+	for (i = 0; i < chunk_num; i++) {
+		w = ws + i;
+		INIT_WORK(&w->work, deferred_walk_memory_blocks_worker);
+		w->start = chunk_start;
+		if (i == chunk_num - 1)
+			w->size = chunk_size + chunk_remain;
+		else
+			w->size = chunk_size;
+		chunk_start += w->size;
+		queue_work_node(nid, wq, &w->work);
+	}
+
+	flush_workqueue(wq);
+
+	for (i = 0; i < chunk_num; i++) {
+		w = ws + i;
+		if (w->ret) {
+			ret = w->ret;
+			pr_err("Deferred online memory failed for node %d, start: %#llx, size: %#llx, ret: %d\n",
+			       nid, w->start, w->size, ret);
+			break;
+		}
+	}
+	kfree(ws);
+	return ret;
+
+sequential:
+	return walk_memory_blocks(start, size, NULL,
+				  deferred_memory_block_online_pages);
+}
+EXPORT_SYMBOL_GPL(deferred_online_memory);
 #endif /* CONFIG_MEMORY_HOTPLUG_SPARSE */
 
 static void reset_node_present_pages(pg_data_t *pgdat)