anolis: virtio-blk: add bidirectional request support

ANBZ: #12584

Add support for virtio-blk bidirectional requests, which contain both read
and write bios. For now we assume that a bidirectional request contains
exactly two bios: the first is a write and the second is a read.

The bios therefore have to be split into two sglists, so extend struct
virtblk_req with an extra sg_table to hold the second one.

To avoid disturbing the normal 'request' handling logic,
virtblk_map_user_bidirectional() uses a trick when mapping iovecs from
user space: first set the write direction in req->cmd_flags and map the
write iovecs, then clear it (leaving the read direction) and map the
read iovecs the same way.

On the block layer side, add a bidirectional flag (REQ_BIDIR) for
passthrough commands. If a request is bidirectional, its bios cover both
the write and the read user iovecs.

Add two extra fields, 'flag' and 'write_iov_count', to virtblk_uring_cmd;
they are used by bidirectional requests.

'write_iov_count' is the number of write iovecs in a bidirectional
request, and 'flag' indicates whether the request is bidirectional.

We assume that a bidirectional request follows the "first write, then
read" model, and that the write and read iovecs are each contiguous in
the iovec array. In other words, they should be laid out like this:

	write - write - write - read - read

In this example, the virtblk_uring_cmd fields are:

data: base address of the iovec array
data_len: 0x5 (total number of iovecs)
flag: 0x1 (VIRTBLK_URING_F_BIDIR)
write_iov_count: 0x3
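
Below is a minimal userspace sketch of how the example above could be
prepared. It only illustrates the iovec layout and the new
virtblk_uring_cmd fields; the device fd, the buffer contents, the
remaining virtblk_uring_cmd fields (e.g. type and sector) and the
io_uring setup (a ring created with IORING_SETUP_SQE128) are assumptions
or elided, not part of this patch.

	/* Sketch only: "3 writes then 2 reads" bidirectional passthrough. */
	#include <liburing.h>
	#include <linux/virtio_blk.h>	/* UAPI with virtblk_uring_cmd additions */
	#include <stdint.h>
	#include <sys/uio.h>

	static void prep_bidir_sqe(struct io_uring_sqe *sqe, int fd,
				   struct iovec iov[5]) /* 3 write + 2 read */
	{
		struct virtblk_uring_cmd *cmd =
			(struct virtblk_uring_cmd *)sqe->cmd;

		io_uring_prep_rw(IORING_OP_URING_CMD, sqe, fd, NULL, 0, 0);
		sqe->cmd_op = VIRTBLK_URING_CMD_IO_VEC;	/* vectored passthrough */

		cmd->data = (uint64_t)(uintptr_t)iov;	/* iovec array base */
		cmd->data_len = 5;			/* total number of iovecs */
		cmd->flag = VIRTBLK_URING_F_BIDIR;	/* mark as bidirectional */
		cmd->write_iov_count = 3;		/* first 3 iovecs are writes */
		/* other cmd fields (type, sector, ...) set as usual, elided */
	}

The driver then maps the first 'write_iov_count' iovecs as the write part
of the request and the remaining iovecs as the read part (see
virtblk_map_user_bidirectional() below).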

Signed-off-by: Ferry Meng <mengferry@linux.alibaba.com>
Reviewed-by: Joseph Qi <joseph.qi@linux.alibaba.com>
Reviewed-by: Jingbo Xu <jefflexu@linux.alibaba.com>
Link: https://gitee.com/anolis/cloud-kernel/pulls/4272


@@ -110,6 +110,7 @@ struct virtblk_req {
struct virtio_blk_outhdr out_hdr;
u8 status;
struct sg_table sg_table;
struct sg_table sg_table_extra;
struct scatterlist sg[];
};
@@ -142,6 +143,38 @@ static inline struct virtio_blk_vq *get_virtio_blk_vq(struct blk_mq_hw_ctx *hctx
return vq;
}
static inline bool vbr_is_bidirectional(struct virtblk_req *vbr)
{
struct request *req = blk_mq_rq_from_pdu(vbr);
return op_is_bidirectional(req->cmd_flags);
}
static int virtblk_add_req_bidirectional(struct virtqueue *vq,
struct virtblk_req *vbr, struct scatterlist *data_sg,
struct scatterlist *data_sg_extra)
{
struct scatterlist hdr, status, *sgs[4];
unsigned int num_out = 0, num_in = 0;
/*
* virtblk_add_req() uses a 'bool' have_data, while here we check that
* both the OUT and IN directions have data. For a bidirectional
* request, __blk_bios_map_sg_bidir() should map at least 2 segments.
*/
if ((sg_nents(data_sg) == 0) || (sg_nents(data_sg_extra) == 0))
return -EINVAL;
sg_init_one(&hdr, &vbr->out_hdr, sizeof(vbr->out_hdr));
sg_init_one(&status, &vbr->status, sizeof(vbr->status));
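/*
* Descriptor layout: the header and the write data are device-readable
* (OUT); the read data and the status byte are device-writable (IN).
*/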
sgs[num_out++] = &hdr;
sgs[num_out++] = data_sg;
sgs[num_out + num_in++] = data_sg_extra;
sgs[num_out + num_in++] = &status;
return virtqueue_add_sgs(vq, sgs, num_out, num_in, vbr, GFP_ATOMIC);
}
static int virtblk_add_req(struct virtqueue *vq, struct virtblk_req *vbr,
struct scatterlist *data_sg, bool have_data)
{
@@ -219,6 +252,46 @@ static void virtblk_unmap_data(struct request *req, struct virtblk_req *vbr)
VIRTIO_BLK_INLINE_SG_CNT);
}
static void virtblk_unmap_data_bidirectional(struct request *req,
struct virtblk_req *vbr)
{
if (blk_rq_nr_phys_segments(req)) {
sg_free_table_chained(&vbr->sg_table,
VIRTIO_BLK_INLINE_SG_CNT);
sg_free_table_chained(&vbr->sg_table_extra,
VIRTIO_BLK_INLINE_SG_CNT);
}
}
static int virtblk_map_data_bidirectional(struct blk_mq_hw_ctx *hctx,
struct request *req, struct virtblk_req *vbr)
{
int err;
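/*
* The inline scatterlist array is split in half: the first
* VIRTIO_BLK_INLINE_SG_CNT entries back the write sglist, the second
* half backs the read sglist (hence the doubled cmd_size in virtblk_probe).
*/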
vbr->sg_table.sgl = vbr->sg;
err = sg_alloc_table_chained(&vbr->sg_table,
blk_rq_nr_phys_segments(req),
vbr->sg_table.sgl,
VIRTIO_BLK_INLINE_SG_CNT);
if (unlikely(err))
return -ENOMEM;
vbr->sg_table_extra.sgl = &vbr->sg[VIRTIO_BLK_INLINE_SG_CNT];
err = sg_alloc_table_chained(&vbr->sg_table_extra,
blk_rq_nr_phys_segments(req),
vbr->sg_table_extra.sgl,
VIRTIO_BLK_INLINE_SG_CNT);
if (unlikely(err)) {
sg_free_table_chained(&vbr->sg_table,
VIRTIO_BLK_INLINE_SG_CNT);
return -ENOMEM;
}
return blk_rq_map_sg_bidir(hctx->queue, req,
vbr->sg_table.sgl, vbr->sg_table_extra.sgl);
}
static int virtblk_map_data(struct blk_mq_hw_ctx *hctx, struct request *req,
struct virtblk_req *vbr)
{
@@ -301,7 +374,10 @@ static inline void virtblk_request_done(struct request *req)
{
struct virtblk_req *vbr = blk_mq_rq_to_pdu(req);
virtblk_unmap_data(req, vbr);
if (vbr_is_bidirectional(vbr))
virtblk_unmap_data_bidirectional(req, vbr);
else
virtblk_unmap_data(req, vbr);
virtblk_cleanup_cmd(req);
blk_mq_end_request(req, virtblk_result(vbr));
}
@@ -368,14 +444,25 @@ static blk_status_t virtio_queue_rq(struct blk_mq_hw_ctx *hctx,
blk_mq_start_request(req);
num = virtblk_map_data(hctx, req, vbr);
if (vbr_is_bidirectional(vbr))
num = virtblk_map_data_bidirectional(hctx, req, vbr);
else
num = virtblk_map_data(hctx, req, vbr);
if (unlikely(num < 0)) {
virtblk_cleanup_cmd(req);
return BLK_STS_RESOURCE;
}
spin_lock_irqsave(&vblk->vqs[qid].lock, flags);
err = virtblk_add_req(vblk->vqs[qid].vq, vbr, vbr->sg_table.sgl, num);
if (vbr_is_bidirectional(vbr))
err = virtblk_add_req_bidirectional(vblk->vqs[qid].vq,
vbr, vbr->sg_table.sgl,
vbr->sg_table_extra.sgl);
else
err = virtblk_add_req(vblk->vqs[qid].vq, vbr,
vbr->sg_table.sgl, num);
if (err) {
virtqueue_kick(vblk->vqs[qid].vq);
/* Don't stop the queue if -ENOMEM: we may have failed to
@@ -384,7 +471,10 @@ static blk_status_t virtio_queue_rq(struct blk_mq_hw_ctx *hctx,
if (err == -ENOSPC)
blk_mq_stop_hw_queue(hctx);
spin_unlock_irqrestore(&vblk->vqs[qid].lock, flags);
virtblk_unmap_data(req, vbr);
if (vbr_is_bidirectional(vbr))
virtblk_unmap_data_bidirectional(req, vbr);
else
virtblk_unmap_data(req, vbr);
virtblk_cleanup_cmd(req);
switch (err) {
case -ENOSPC:
@@ -842,7 +932,11 @@ static void virtblk_complete_batch(struct io_comp_batch *iob)
struct request *req;
rq_list_for_each(&iob->req_list, req) {
virtblk_unmap_data(req, blk_mq_rq_to_pdu(req));
if (op_is_bidirectional(req->cmd_flags))
virtblk_unmap_data_bidirectional(req,
blk_mq_rq_to_pdu(req));
else
virtblk_unmap_data(req, blk_mq_rq_to_pdu(req));
virtblk_cleanup_cmd(req);
}
blk_mq_end_request_batch(iob);
@@ -925,10 +1019,48 @@ static void virtblk_uring_cmd_end_io(struct request *req, blk_status_t err)
io_uring_cmd_complete_in_task(ioucmd, virtblk_uring_task_cb);
}
static int virtblk_map_user_bidirectional(struct request *req, uintptr_t ubuffer,
unsigned int iov_count, unsigned int write_iov_count)
{
int ret;
/*
* Userspace must ensure write_iov_count < iov_count
*/
if (write_iov_count >= iov_count)
return -EINVAL;
/*
* Bidirectional currently only supports the READ-after-WRITE model,
* so set WRITE first and clear it later.
*/
req->cmd_flags |= WRITE;
ret = blk_rq_map_user_io(req, NULL, (void __user *)ubuffer,
write_iov_count, GFP_KERNEL, true,
0, false, rq_data_dir(req));
if (ret)
return ret;
ubuffer += write_iov_count * sizeof(struct iovec);
req->cmd_flags &= ~WRITE;
ret = blk_rq_map_user_io(req, NULL, (void __user *)ubuffer,
(iov_count - write_iov_count), GFP_KERNEL,
true, 0, false, rq_data_dir(req));
if (ret)
blk_rq_unmap_user(req->bio);
return ret;
}
static int virtblk_map_user_request(struct request *req, uintptr_t ubuffer,
unsigned int bufflen, bool vec)
unsigned int bufflen, bool vec, unsigned int num)
{
struct request_queue *q = req->q;
struct virtblk_req *vbr = blk_mq_rq_to_pdu(req);
if (vbr_is_bidirectional(vbr))
return virtblk_map_user_bidirectional(req, ubuffer,
bufflen, num);
if (!vec)
return blk_rq_map_user(q, req, NULL, (void __user *)ubuffer,
@@ -946,17 +1078,19 @@ static int virtblk_uring_cmd_io(struct virtio_blk *vblk,
struct request_queue *q = vblk->disk->queue;
struct virtblk_req *vbr;
struct request *req;
struct bio *bio;
unsigned int rq_flags = 0;
blk_mq_req_flags_t blk_flags = 0;
u32 type;
uintptr_t data;
unsigned long data_len, flag;
unsigned long data_len, flag, write_iov_count;
int ret;
type = READ_ONCE(cmd->type);
flag = READ_ONCE(cmd->flag);
data = READ_ONCE(cmd->data);
data_len = READ_ONCE(cmd->data_len);
write_iov_count = READ_ONCE(cmd->write_iov_count);
/* Only support OUT and IN for uring_cmd currently */
if ((type != VIRTIO_BLK_T_OUT) && (type != VIRTIO_BLK_T_IN))
@@ -968,7 +1102,8 @@ static int virtblk_uring_cmd_io(struct virtio_blk *vblk,
}
if (issue_flags & IO_URING_F_IOPOLL)
rq_flags |= REQ_POLLED;
if (flag & VIRTBLK_URING_F_BIDIR)
rq_flags |= REQ_BIDIR;
rq_flags |= (type & VIRTIO_BLK_T_OUT) ? REQ_OP_DRV_OUT : REQ_OP_DRV_IN;
retry:
req = blk_mq_alloc_request(q, rq_flags, blk_flags);
@@ -982,7 +1117,7 @@ retry:
vbr->out_hdr.type = cpu_to_virtio32(vblk->vdev, type);
if (data && data_len) {
ret = virtblk_map_user_request(req, data, data_len, vec);
ret = virtblk_map_user_request(req, data, data_len, vec, write_iov_count);
if (ret) {
blk_mq_free_request(req);
return ret;
@@ -1001,14 +1136,18 @@ retry:
goto retry;
} else {
WRITE_ONCE(ioucmd->cookie, req);
req->bio->bi_opf |= REQ_POLLED;
/* In fact, only the first bio in the req will use REQ_POLLED */
for (bio = req->bio; bio; bio = bio->bi_next)
bio->bi_opf |= REQ_POLLED;
}
}
/* to free bio on completion, as req->bio will be null at that time */
pdu->bio = req->bio;
req->end_io_data = ioucmd;
virtblk_bio_set_disk(req->bio, vblk->disk);
/* a bidirectional req has more than one bio, so associate them all */
for (bio = req->bio; bio; bio = bio->bi_next)
virtblk_bio_set_disk(bio, vblk->disk);
blk_execute_rq_nowait(NULL, req, 0, virtblk_uring_cmd_end_io);
return -EIOCBQUEUED;
@@ -1202,9 +1341,15 @@ static int virtblk_probe(struct virtio_device *vdev)
vblk->tag_set.queue_depth = queue_depth;
vblk->tag_set.numa_node = NUMA_NO_NODE;
vblk->tag_set.flags = BLK_MQ_F_SHOULD_MERGE;
/* For a bidirectional passthrough vblk request, both the WRITE and READ
* directions need preallocated inline SGs, so preallocate twice the
* original size. Since we cannot predict whether a request will be
* bidirectional, some memory may be wasted, but the overhead is not
* significant.
*/
vblk->tag_set.cmd_size =
sizeof(struct virtblk_req) +
sizeof(struct scatterlist) * VIRTIO_BLK_INLINE_SG_CNT;
sizeof(struct scatterlist) * 2 * VIRTIO_BLK_INLINE_SG_CNT;
vblk->tag_set.driver_data = vblk;
vblk->tag_set.nr_hw_queues = vblk->num_vqs;
vblk->tag_set.nr_maps = 1;


@@ -459,6 +459,9 @@ enum req_flag_bits {
/* for driver use */
__REQ_DRV,
__REQ_SWAP, /* swapping request. */
__REQ_BIDIR,
__REQ_NR_BITS, /* stops here */
};
@@ -483,6 +486,7 @@ enum req_flag_bits {
#define REQ_DRV (1ULL << __REQ_DRV)
#define REQ_SWAP (1ULL << __REQ_SWAP)
#define REQ_BIDIR (1ULL << __REQ_BIDIR)
#define REQ_FAILFAST_MASK \
(REQ_FAILFAST_DEV | REQ_FAILFAST_TRANSPORT | REQ_FAILFAST_DRIVER)
@@ -516,6 +520,11 @@ static inline bool op_is_write(unsigned int op)
return (op & 1);
}
static inline bool op_is_bidirectional(unsigned int op)
{
return op & REQ_BIDIR;
}
/*
* Check if the bio or request is one that needs special treatment in the
* flush state machine.


@@ -207,11 +207,14 @@ struct virtblk_uring_cmd {
__u64 data;
__u32 data_len;
__u32 flag;
__u32 write_iov_count;
};
#define VIRTBLK_URING_CMD_IO 1
#define VIRTBLK_URING_CMD_IO_VEC 2
#define VIRTBLK_URING_F_BIDIR (1 << 0)
/* And this is the final byte of the write scatter-gather list. */
#define VIRTIO_BLK_S_OK 0
#define VIRTIO_BLK_S_IOERR 1