anolis: virtio-mem: apply parallel deferred memory online

ANBZ: #18841

We applied parallel deferred memory online to virtio-mem and
observed a significant speedup in the hot-plug process.

We conducted an experiment to hotplug 400G of memory, and
the results were as follows:
- Before applying the patch:
  - Total Time = Original Hotplug Time = 5537ms (72.24 GB/s)
- After applying the patch (with `parallel_hotplug_ratio=80`):
  - Original Hotplug Time = 178ms
  - Deferred Parallel Hotplug Time = 1200ms
  - Total Time = 1378ms (76% reduction, 290.28 GB/s)
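
For reference, the throughput figures are simply size over total
time: 400 GB / 5.537 s ≈ 72 GB/s before the change and
400 GB / 1.378 s ≈ 290 GB/s after it.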

Lastly, there is an issue regarding the guest's plug requests
to the VMM: the VMM relies on the plug requests sent by the
guest to determine the size of the hot-plugged memory.
Therefore, we defer sending the plug requests until after the
memory has actually been onlined.
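
A rough sketch of the resulting per-range flow follows.
deferred_online_memory() and virtio_mem_send_plug_request() are
the helpers this patch actually uses; the wrapper function and
the reduced error handling are purely illustrative:

  /* Illustrative only: online the deferred range first, and only
   * then report it to the VMM via a plug request. */
  static int plug_deferred_range(struct virtio_mem *vm,
                                 uint64_t addr, uint64_t size)
  {
          int rc;

          /* Deferred struct page init and freeing of pages to buddy. */
          rc = deferred_online_memory(vm->nid, addr, size);
          if (rc)
                  return rc;

          /* The VMM only learns about memory that is already online. */
          return virtio_mem_send_plug_request(vm, addr, size);
  }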

Signed-off-by: Yang Rong <youngrong@linux.alibaba.com>
Reviewed-by: Baolin Wang <baolin.wang@linux.alibaba.com>
Link: https://gitee.com/anolis/cloud-kernel/pulls/4622

@@ -598,6 +598,15 @@ static bool virtio_mem_could_add_memory(struct virtio_mem *vm, uint64_t size)
if (WARN_ON_ONCE(size > vm->offline_threshold))
return false;
/*
* TODO: If memory online is deferred, offline_size will exceed offline_threshold
* immediately. However, even if we hotplug 400G memory on a machine with only
* 256M boot memory, OOM is still not triggered. So in most cases, adding memory
* is okay. We may have a better way to deal with it in the future.
*/
if (parallel_hotplug_ratio)
return true;
return atomic64_read(&vm->offline_size) + size <= vm->offline_threshold;
}
@@ -1456,14 +1465,16 @@ static int virtio_mem_send_unplug_all_request(struct virtio_mem *vm)
* of the memory block.
*/
static int virtio_mem_sbm_plug_sb(struct virtio_mem *vm, unsigned long mb_id,
int sb_id, int count)
int sb_id, int count, bool skip_send_req)
{
const uint64_t addr = virtio_mem_mb_id_to_phys(mb_id) +
sb_id * vm->sbm.sb_size;
const uint64_t size = count * vm->sbm.sb_size;
int rc;
int rc = 0;
rc = virtio_mem_send_plug_request(vm, addr, size);
/* memory not onlined yet, so we also need to defer the request. */
if (!skip_send_req)
rc = virtio_mem_send_plug_request(vm, addr, size);
if (!rc)
virtio_mem_sbm_set_sb_plugged(vm, mb_id, sb_id, count);
return rc;
@@ -1613,7 +1624,7 @@ static int virtio_mem_sbm_plug_and_add_mb(struct virtio_mem *vm,
* Plug the requested number of subblocks before adding it to linux,
* so that onlining will directly online all plugged subblocks.
*/
rc = virtio_mem_sbm_plug_sb(vm, mb_id, 0, count);
rc = virtio_mem_sbm_plug_sb(vm, mb_id, 0, count, parallel_hotplug_ratio);
if (rc)
return rc;
@@ -1672,7 +1683,7 @@ static int virtio_mem_sbm_plug_any_sb(struct virtio_mem *vm,
!virtio_mem_sbm_test_sb_plugged(vm, mb_id, sb_id + count, 1))
count++;
rc = virtio_mem_sbm_plug_sb(vm, mb_id, sb_id, count);
rc = virtio_mem_sbm_plug_sb(vm, mb_id, sb_id, count, false);
if (rc)
return rc;
*nb_sb -= count;
@@ -1692,6 +1703,57 @@ static int virtio_mem_sbm_plug_any_sb(struct virtio_mem *vm,
return 0;
}
struct deferred_mb_range {
unsigned long start_id;
unsigned long end_id;
};
struct deferred_mb_range_list {
struct deferred_mb_range *ranges;
unsigned long size;
unsigned long capacity;
int nid;
};
#define deferred_mb_range_list_for_each(_i, _ranges, _start, _end) \
for (_i = 0; \
_i < _ranges.size && (_start = _ranges.ranges[_i].start_id, \
_end = _ranges.ranges[_i].end_id, true); \
_i++)
static int deferred_mb_range_list_add(struct deferred_mb_range_list *rs,
unsigned long mb_id)
{
struct deferred_mb_range *new_ranges;
if (!rs)
return -EINVAL;
if (rs->size && rs->ranges &&
rs->ranges[rs->size - 1].end_id + 1 == mb_id) {
rs->ranges[rs->size - 1].end_id = mb_id;
} else {
if (rs->size == rs->capacity) {
rs->capacity++;
new_ranges = kmalloc_array_node(rs->capacity,
sizeof(*rs->ranges), GFP_KERNEL, rs->nid);
if (!new_ranges)
return -ENOMEM;
if (rs->ranges) {
memcpy(new_ranges, rs->ranges,
rs->size * sizeof(*rs->ranges));
kfree(rs->ranges);
}
rs->ranges = new_ranges;
}
rs->ranges[rs->size++] = (struct deferred_mb_range){
.start_id = mb_id,
.end_id = mb_id,
};
}
return 0;
}
static int virtio_mem_sbm_plug_request(struct virtio_mem *vm, uint64_t diff)
{
const int mb_states[] = {
@@ -1701,6 +1763,17 @@ static int virtio_mem_sbm_plug_request(struct virtio_mem *vm, uint64_t diff)
};
uint64_t nb_sb = diff / vm->sbm.sb_size;
unsigned long mb_id;
struct deferred_mb_range_list rs = {
.ranges = NULL,
.size = 0,
.capacity = 0,
.nid = vm->nid,
};
unsigned long sid, eid;
uint64_t addr, size;
/* The last deferred memory block may not have all subblocks plugged */
uint64_t part_nb_sb = 0;
unsigned long timestamp;
int rc, i;
if (!nb_sb)
@@ -1726,32 +1799,87 @@ static int virtio_mem_sbm_plug_request(struct virtio_mem *vm, uint64_t diff)
/* Try to plug and add unused blocks */
virtio_mem_sbm_for_each_mb(vm, mb_id, VIRTIO_MEM_SBM_MB_UNUSED) {
if (!virtio_mem_could_add_memory(vm, memory_block_size_bytes()))
return -ENOSPC;
if (!virtio_mem_could_add_memory(vm, memory_block_size_bytes())) {
rc = -ENOSPC;
goto out_free;
}
if (!nb_sb)
break;
if (parallel_hotplug_ratio) {
if (nb_sb < vm->sbm.sbs_per_mb)
part_nb_sb = nb_sb;
rc = deferred_mb_range_list_add(&rs, mb_id);
if (rc)
goto out_free;
}
rc = virtio_mem_sbm_plug_and_add_mb(vm, mb_id, &nb_sb);
if (rc || !nb_sb)
return rc;
if (rc)
goto out_free;
cond_resched();
}
/* Try to prepare, plug and add new blocks */
while (nb_sb) {
if (!virtio_mem_could_add_memory(vm, memory_block_size_bytes()))
return -ENOSPC;
if (!virtio_mem_could_add_memory(vm, memory_block_size_bytes())) {
rc = -ENOSPC;
goto out_free;
}
rc = virtio_mem_sbm_prepare_next_mb(vm, &mb_id);
if (rc)
return rc;
goto out_free;
if (parallel_hotplug_ratio) {
if (nb_sb < vm->sbm.sbs_per_mb)
part_nb_sb = nb_sb;
rc = deferred_mb_range_list_add(&rs, mb_id);
if (rc)
goto out_free;
}
rc = virtio_mem_sbm_plug_and_add_mb(vm, mb_id, &nb_sb);
if (rc)
return rc;
goto out_free;
cond_resched();
}
return 0;
if (parallel_hotplug_ratio) {
timestamp = jiffies;
deferred_mb_range_list_for_each(i, rs, sid, eid) {
addr = virtio_mem_mb_id_to_phys(sid);
/* Always add complete memory block to Linux */
size = (eid - sid + 1) * memory_block_size_bytes();
/*
* Deferred struct page initialization and
* deferred freeing of pages to the buddy allocator.
*/
rc = deferred_online_memory(vm->nid, addr, size);
if (rc)
goto out_free;
/* Send the deferred plug requests */
for (mb_id = sid; mb_id <= eid; mb_id++) {
addr = virtio_mem_mb_id_to_phys(mb_id);
if (part_nb_sb && i == rs.size - 1 &&
mb_id == eid)
size = part_nb_sb * vm->sbm.sb_size;
else
size = memory_block_size_bytes();
rc = virtio_mem_send_plug_request(vm, addr, size);
if (rc)
goto out_free;
}
}
dev_info(&vm->vdev->dev, "deferred time: %ums",
jiffies_to_msecs(jiffies - timestamp));
}
goto out_free;
out_unlock:
mutex_unlock(&vm->hotplug_mutex);
out_free:
if (parallel_hotplug_ratio)
kfree(rs.ranges);
return rc;
}
@@ -2496,6 +2624,8 @@ static int virtio_mem_init(struct virtio_mem *vm)
const uint64_t phys_limit = 1UL << MAX_PHYSMEM_BITS;
uint64_t sb_size, addr;
uint16_t node_id;
struct pglist_data *pgdat;
char deferred_wq_name[24];
if (!vm->vdev->config->get) {
dev_err(&vm->vdev->dev, "config access disabled\n");
@@ -2527,6 +2657,22 @@ static int virtio_mem_init(struct virtio_mem *vm)
if (vm->nid == NUMA_NO_NODE)
vm->nid = memory_add_physaddr_to_nid(vm->addr);
if (parallel_hotplug_ratio) {
pgdat = NODE_DATA(vm->nid);
if (!pgdat->deferred_hotplug_wq) {
snprintf(deferred_wq_name, sizeof(deferred_wq_name),
"deferred_hotplug_wq_%d", vm->nid);
pgdat->deferred_hotplug_wq =
alloc_workqueue(deferred_wq_name,
WQ_UNBOUND | WQ_HIGHPRI, 0);
if (!pgdat->deferred_hotplug_wq)
return -ENOMEM;
dev_info(&vm->vdev->dev,
"deferred workqueue created on node: %d\n",
vm->nid);
}
}
/* bad device setup - warn only */
if (!IS_ALIGNED(vm->addr, memory_block_size_bytes()))
dev_warn(&vm->vdev->dev,