anolis: virtio-mem: apply parallel deferred memory online

ANBZ: #18841

We applied parallel deferred memory online to virtio-mem and
observed a significant speedup in the hot-plug process.

We conducted an experiment to hotplug 400G of memory, and
the results were as follows:
- Before applying the patch:
  - Total Time = Original Hotplug Time = 5537ms (72.24 GB/s)
- After applying the patch (with `parallel_hotplug_ratio=80`):
  - Original Hotplug Time = 178ms
  - Deferred Parallel Hotplug Time = 1200ms
  - Total Time = 1378ms (76% reduction, 290.28 GB/s)
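
For reference, the throughput figures are simply size over total
time: 400 GB / 5.537 s ≈ 72 GB/s before the change and
400 GB / 1.378 s ≈ 290 GB/s after it.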

Lastly, there is an issue regarding the guest's plug requests
to the VMM: the VMM relies on the plug requests sent by the
guest to determine the size of the hot-plugged memory.
Therefore, we defer sending the plug requests until after the
memory has actually been onlined.
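
A rough sketch of the resulting per-range flow follows.
deferred_online_memory() and virtio_mem_send_plug_request() are
the helpers this patch actually uses; the wrapper function and
the reduced error handling are purely illustrative:

  /* Illustrative only: online the deferred range first, and only
   * then report it to the VMM via a plug request. */
  static int plug_deferred_range(struct virtio_mem *vm,
                                 uint64_t addr, uint64_t size)
  {
          int rc;

          /* Deferred struct page init and freeing of pages to buddy. */
          rc = deferred_online_memory(vm->nid, addr, size);
          if (rc)
                  return rc;

          /* The VMM only learns about memory that is already online. */
          return virtio_mem_send_plug_request(vm, addr, size);
  }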

Signed-off-by: Yang Rong <youngrong@linux.alibaba.com>
Reviewed-by: Baolin Wang <baolin.wang@linux.alibaba.com>
Link: https://gitee.com/anolis/cloud-kernel/pulls/4622

@@ -598,6 +598,15 @@ static bool virtio_mem_could_add_memory(struct virtio_mem *vm, uint64_t size)
if (WARN_ON_ONCE(size > vm->offline_threshold))
return false;
/*
* TODO: If memory online is deferred, offline_size will exceed offline_threshold
* immediately. However, even if we hotplug 400G memory on a machine with only
* 256M boot memory, OOM is still not triggered. So in most cases, adding memory
* is okay. We may have a better way to deal with it in the future.
*/
if (parallel_hotplug_ratio)
return true;
return atomic64_read(&vm->offline_size) + size <= vm->offline_threshold;
}
@@ -1456,14 +1465,16 @@ static int virtio_mem_send_unplug_all_request(struct virtio_mem *vm)
* of the memory block.
*/
static int virtio_mem_sbm_plug_sb(struct virtio_mem *vm, unsigned long mb_id,
int sb_id, int count)
int sb_id, int count, bool skip_send_req)
{
const uint64_t addr = virtio_mem_mb_id_to_phys(mb_id) +
sb_id * vm->sbm.sb_size;
const uint64_t size = count * vm->sbm.sb_size;
int rc;
int rc = 0;
rc = virtio_mem_send_plug_request(vm, addr, size);
/* memory not onlined yet, so we also need to defer the request. */
if (!skip_send_req)
rc = virtio_mem_send_plug_request(vm, addr, size);
if (!rc)
virtio_mem_sbm_set_sb_plugged(vm, mb_id, sb_id, count);
return rc;
@@ -1613,7 +1624,7 @@ static int virtio_mem_sbm_plug_and_add_mb(struct virtio_mem *vm,
* Plug the requested number of subblocks before adding it to linux,
* so that onlining will directly online all plugged subblocks.
*/
rc = virtio_mem_sbm_plug_sb(vm, mb_id, 0, count);
rc = virtio_mem_sbm_plug_sb(vm, mb_id, 0, count, parallel_hotplug_ratio);
if (rc)
return rc;
@@ -1672,7 +1683,7 @@ static int virtio_mem_sbm_plug_any_sb(struct virtio_mem *vm,
!virtio_mem_sbm_test_sb_plugged(vm, mb_id, sb_id + count, 1))
count++;
rc = virtio_mem_sbm_plug_sb(vm, mb_id, sb_id, count);
rc = virtio_mem_sbm_plug_sb(vm, mb_id, sb_id, count, false);
if (rc)
return rc;
*nb_sb -= count;
@@ -1692,6 +1703,57 @@ static int virtio_mem_sbm_plug_any_sb(struct virtio_mem *vm,
return 0;
}
struct deferred_mb_range {
unsigned long start_id;
unsigned long end_id;
};
struct deferred_mb_range_list {
struct deferred_mb_range *ranges;
unsigned long size;
unsigned long capacity;
int nid;
};
#define deferred_mb_range_list_for_each(_i, _ranges, _start, _end) \
for (_i = 0; \
_i < _ranges.size && (_start = _ranges.ranges[_i].start_id, \
_end = _ranges.ranges[_i].end_id, true); \
_i++)
static int deferred_mb_range_list_add(struct deferred_mb_range_list *rs,
unsigned long mb_id)
{
struct deferred_mb_range *new_ranges;
if (!rs)
return -EINVAL;
if (rs->size && rs->ranges &&
rs->ranges[rs->size - 1].end_id + 1 == mb_id) {
rs->ranges[rs->size - 1].end_id = mb_id;
} else {
if (rs->size == rs->capacity) {
rs->capacity++;
new_ranges = kmalloc_array_node(rs->capacity,
sizeof(*rs->ranges), GFP_KERNEL, rs->nid);
if (!new_ranges)
return -ENOMEM;
if (rs->ranges) {
memcpy(new_ranges, rs->ranges,
rs->size * sizeof(*rs->ranges));
kfree(rs->ranges);
}
rs->ranges = new_ranges;
}
rs->ranges[rs->size++] = (struct deferred_mb_range){
.start_id = mb_id,
.end_id = mb_id,
};
}
return 0;
}
static int virtio_mem_sbm_plug_request(struct virtio_mem *vm, uint64_t diff)
{
const int mb_states[] = {
@@ -1701,6 +1763,17 @@ static int virtio_mem_sbm_plug_request(struct virtio_mem *vm, uint64_t diff)
};
uint64_t nb_sb = diff / vm->sbm.sb_size;
unsigned long mb_id;
struct deferred_mb_range_list rs = {
.ranges = NULL,
.size = 0,
.capacity = 0,
.nid = vm->nid,
};
unsigned long sid, eid;
uint64_t addr, size;
/* The last deferred memory block may not have all subblocks plugged */
uint64_t part_nb_sb = 0;
unsigned long timestamp;
int rc, i;
if (!nb_sb)
@@ -1726,32 +1799,87 @@ static int virtio_mem_sbm_plug_request(struct virtio_mem *vm, uint64_t diff)
/* Try to plug and add unused blocks */
virtio_mem_sbm_for_each_mb(vm, mb_id, VIRTIO_MEM_SBM_MB_UNUSED) {
if (!virtio_mem_could_add_memory(vm, memory_block_size_bytes()))
return -ENOSPC;
if (!virtio_mem_could_add_memory(vm, memory_block_size_bytes())) {
rc = -ENOSPC;
goto out_free;
}
if (!nb_sb)
break;
if (parallel_hotplug_ratio) {
if (nb_sb < vm->sbm.sbs_per_mb)
part_nb_sb = nb_sb;
rc = deferred_mb_range_list_add(&rs, mb_id);
if (rc)
goto out_free;
}
rc = virtio_mem_sbm_plug_and_add_mb(vm, mb_id, &nb_sb);
if (rc || !nb_sb)
return rc;
if (rc)
goto out_free;
cond_resched();
}
/* Try to prepare, plug and add new blocks */
while (nb_sb) {
if (!virtio_mem_could_add_memory(vm, memory_block_size_bytes()))
return -ENOSPC;
if (!virtio_mem_could_add_memory(vm, memory_block_size_bytes())) {
rc = -ENOSPC;
goto out_free;
}
rc = virtio_mem_sbm_prepare_next_mb(vm, &mb_id);
if (rc)
return rc;
goto out_free;
if (parallel_hotplug_ratio) {
if (nb_sb < vm->sbm.sbs_per_mb)
part_nb_sb = nb_sb;
rc = deferred_mb_range_list_add(&rs, mb_id);
if (rc)
goto out_free;
}
rc = virtio_mem_sbm_plug_and_add_mb(vm, mb_id, &nb_sb);
if (rc)
return rc;
goto out_free;
cond_resched();
}
return 0;
if (parallel_hotplug_ratio) {
timestamp = jiffies;
deferred_mb_range_list_for_each(i, rs, sid, eid) {
addr = virtio_mem_mb_id_to_phys(sid);
/* Always add complete memory block to Linux */
size = (eid - sid + 1) * memory_block_size_bytes();
/*
* Deferred struct page initialization and
* deferred freeing of pages to the buddy allocator.
*/
rc = deferred_online_memory(vm->nid, addr, size);
if (rc)
goto out_free;
/* Send the deferred plug requests */
for (mb_id = sid; mb_id <= eid; mb_id++) {
addr = virtio_mem_mb_id_to_phys(mb_id);
if (part_nb_sb && i == rs.size - 1 &&
mb_id == eid)
size = part_nb_sb * vm->sbm.sb_size;
else
size = memory_block_size_bytes();
rc = virtio_mem_send_plug_request(vm, addr, size);
if (rc)
goto out_free;
}
}
dev_info(&vm->vdev->dev, "deferred time: %ums",
jiffies_to_msecs(jiffies - timestamp));
}
goto out_free;
out_unlock:
mutex_unlock(&vm->hotplug_mutex);
out_free:
if (parallel_hotplug_ratio)
kfree(rs.ranges);
return rc;
}
@@ -2496,6 +2624,8 @@ static int virtio_mem_init(struct virtio_mem *vm)
const uint64_t phys_limit = 1UL << MAX_PHYSMEM_BITS;
uint64_t sb_size, addr;
uint16_t node_id;
struct pglist_data *pgdat;
char deferred_wq_name[24];
if (!vm->vdev->config->get) {
dev_err(&vm->vdev->dev, "config access disabled\n");
@@ -2527,6 +2657,22 @@ static int virtio_mem_init(struct virtio_mem *vm)
if (vm->nid == NUMA_NO_NODE)
vm->nid = memory_add_physaddr_to_nid(vm->addr);
if (parallel_hotplug_ratio) {
pgdat = NODE_DATA(vm->nid);
if (!pgdat->deferred_hotplug_wq) {
snprintf(deferred_wq_name, sizeof(deferred_wq_name),
"deferred_hotplug_wq_%d", vm->nid);
pgdat->deferred_hotplug_wq =
alloc_workqueue(deferred_wq_name,
WQ_UNBOUND | WQ_HIGHPRI, 0);
if (!pgdat->deferred_hotplug_wq)
return -ENOMEM;
dev_info(&vm->vdev->dev,
"deferred workqueue created on node: %d\n",
vm->nid);
}
}
/* bad device setup - warn only */
if (!IS_ALIGNED(vm->addr, memory_block_size_bytes()))
dev_warn(&vm->vdev->dev,