virtio-pci
QEMU Patch: dfb8e184db758bff275f94f7aa634300886cfe21 virtio-pci: initial virtio 1.0 support
Since this is VirtIO over PCI, the VirtIO device has to be presented the PCI way. The PCI configuration space carries a Vendor ID and a Device ID: the Vendor ID is 0x1AF4, and the Device ID encodes the virtio device type (1 is a network device, 2 is a block device, and so on; a modern device uses 0x1040 + type as its PCI Device ID).
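As a quick illustration (a sketch based on the virtio 1.x spec, not a verbatim quote from QEMU; the constant names below are mine), the modern PCI Device ID can be derived from the virtio device type like this:
#include <stdint.h>
/* Illustrative constants; the values come from the virtio spec. */
#define VIRTIO_PCI_VENDOR_ID      0x1AF4
#define VIRTIO_PCI_MODERN_ID_BASE 0x1040
static inline uint16_t virtio_modern_pci_device_id(uint16_t virtio_device_type)
{
    /* virtio_device_type: 1 = net, 2 = block, ...; e.g. block -> 0x1042 */
    return VIRTIO_PCI_MODERN_ID_BASE + virtio_device_type;
}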
For one PCI virtio device, QEMU supports up to 1024 virtqueues. Judging from the logs, virtio-blk appears to use only one virtqueue (set up in init_vq()). This can be checked by grepping the guest dmesg for the string printed by:
dev_info(&vdev->dev, "%d/%d/%d default/read/poll queues\n",
vblk->io_queues[HCTX_TYPE_DEFAULT],
vblk->io_queues[HCTX_TYPE_READ],
vblk->io_queues[HCTX_TYPE_POLL]);
// A message like the one below: although virtio-blk enables 4 queues by default, only one is actually used
virtio_blk virtio2: 4/0/0 default/read/poll queues
virtio_pci_modern_regions_init()
QEMU
To understand this code, you first need to understand Virtio Structure PCI Capabilities^. In other words, a virtio PCI device exposes several configurable structures: Common configuration • Notifications • ISR Status • Device-specific configuration (optional) • PCI configuration access.
Note that VirtIO MMIO devices have none of these.
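For reference, each of these structures is identified by a cfg_type value carried in its PCI capability; the values below are from the virtio spec (mirrored in Linux's uapi header linux/virtio_pci.h):
/* cfg_type values identifying each structure */
#define VIRTIO_PCI_CAP_COMMON_CFG 1 /* Common configuration */
#define VIRTIO_PCI_CAP_NOTIFY_CFG 2 /* Notifications */
#define VIRTIO_PCI_CAP_ISR_CFG    3 /* ISR Status */
#define VIRTIO_PCI_CAP_DEVICE_CFG 4 /* Device-specific configuration */
#define VIRTIO_PCI_CAP_PCI_CFG    5 /* PCI configuration access */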
static void virtio_pci_modern_regions_init(VirtIOPCIProxy *proxy,
const char *vdev_name)
{
static const MemoryRegionOps common_ops = {
.read = virtio_pci_common_read,
.write = virtio_pci_common_write,
.impl = {
.min_access_size = 1,
.max_access_size = 4,
},
.endianness = DEVICE_LITTLE_ENDIAN,
};
static const MemoryRegionOps isr_ops = {
.read = virtio_pci_isr_read,
.write = virtio_pci_isr_write,
.impl = {
.min_access_size = 1,
.max_access_size = 4,
},
.endianness = DEVICE_LITTLE_ENDIAN,
};
static const MemoryRegionOps device_ops = {
.read = virtio_pci_device_read,
.write = virtio_pci_device_write,
.impl = {
.min_access_size = 1,
.max_access_size = 4,
},
.endianness = DEVICE_LITTLE_ENDIAN,
};
static const MemoryRegionOps notify_ops = {
.read = virtio_pci_notify_read,
.write = virtio_pci_notify_write,
.impl = {
.min_access_size = 1,
.max_access_size = 4,
},
.endianness = DEVICE_LITTLE_ENDIAN,
};
static const MemoryRegionOps notify_pio_ops = {
.read = virtio_pci_notify_read,
.write = virtio_pci_notify_write_pio,
.impl = {
.min_access_size = 1,
.max_access_size = 4,
},
.endianness = DEVICE_LITTLE_ENDIAN,
};
g_autoptr(GString) name = g_string_new(NULL);
g_string_printf(name, "virtio-pci-common-%s", vdev_name);
memory_region_init_io(&proxy->common.mr, OBJECT(proxy),
&common_ops,
proxy,
name->str,
proxy->common.size);
g_string_printf(name, "virtio-pci-isr-%s", vdev_name);
memory_region_init_io(&proxy->isr.mr, OBJECT(proxy),
&isr_ops,
proxy,
name->str,
proxy->isr.size);
g_string_printf(name, "virtio-pci-device-%s", vdev_name);
memory_region_init_io(&proxy->device.mr, OBJECT(proxy),
&device_ops,
proxy,
name->str,
proxy->device.size);
g_string_printf(name, "virtio-pci-notify-%s", vdev_name);
memory_region_init_io(&proxy->notify.mr, OBJECT(proxy),
&notify_ops,
proxy,
name->str,
proxy->notify.size);
g_string_printf(name, "virtio-pci-notify-pio-%s", vdev_name);
memory_region_init_io(&proxy->notify_pio.mr, OBJECT(proxy),
&notify_pio_ops,
proxy,
name->str,
proxy->notify_pio.size);
}
struct VirtIOPCIRegion
QEMU
Corresponds to one structure in the VirtIO spec. See Virtio Structure PCI Capabilities^.
typedef struct VirtIOPCIRegion {
MemoryRegion mr;
// Offset relative to the base address of the BAR
uint32_t offset;
// Size
uint32_t size;
// Common configuration • Notifications • ISR Status • Device-specific configuration (optional) • PCI configuration access
uint32_t type;
} VirtIOPCIRegion;
struct VirtIOPCIQueue
QEMU
A structure shared by all virtio PCI devices: virtio-scsi-pci, virtio-pmem-pci, virtio-mem-pci, virtio-input-pci, vhost-user-vsock-pci, and so on.
From QEMU's point of view, this structure represents one virtqueue.
We already have struct VirtQueue, so why define this new structure?
typedef struct VirtIOPCIQueue {
uint16_t num;
bool enabled;
// No need to migrate the reset status, because it is always 0
// when the migration starts.
bool reset;
// Arrays of two because the address may be 64-bit, i.e. a low and a high half
uint32_t desc[2];
uint32_t avail[2];
uint32_t used[2];
} VirtIOPCIQueue;
struct VirtIOPCIProxy
QEMU
One PCI device corresponds to one instance of this structure.
It is probably called a proxy because its fields are directly tied to externally visible interfaces; see the variable virtio_pci_properties.
struct VirtIOPCIProxy {
// The corresponding PCI device
PCIDevice pci_dev;
MemoryRegion bar;
union {
// Corresponds to the five capability structures;
// see the virtio spec
struct {
VirtIOPCIRegion common;
VirtIOPCIRegion isr;
VirtIOPCIRegion device;
VirtIOPCIRegion notify;
VirtIOPCIRegion notify_pio;
};
VirtIOPCIRegion regs[5];
};
// Each structure above is reached through an address described by a BAR (a PCI device has 6 BARs in total).
// This memory region is the container whose subregions hold those structures.
MemoryRegion modern_bar;
MemoryRegion io_bar;
uint32_t legacy_io_bar_idx;
uint32_t msix_bar_idx;
uint32_t modern_io_bar_idx;
uint32_t modern_mem_bar_idx;
int config_cap;
uint32_t flags;
bool disable_modern;
bool ignore_backend_features;
OnOffAuto disable_legacy;
/* Transitional device id */
uint16_t trans_devid;
uint32_t class_code;
uint32_t nvectors;
uint32_t dfselect;
uint32_t gfselect;
// Features the driver asks the device to set, i.e. only the driver's wish list.
// Not the features that end up set on the device: those still have to be
// ANDed with the features the device itself supports.
uint32_t guest_features[2];
// All virtqueues of this device. VIRTIO_QUEUE_MAX is 1024,
// so effectively 1024 queue slots are pre-allocated
// for this device to use.
VirtIOPCIQueue vqs[VIRTIO_QUEUE_MAX];
VirtIOIRQFD *vector_irqfd;
int nvqs_with_notifiers;
VirtioBusState bus;
};
virtio_pci_modern_region_map()
QEMU
// virtio_pci_modern_region_map(proxy, region, cap, &proxy->modern_bar, proxy->modern_mem_bar_idx);
static void virtio_pci_modern_region_map(VirtIOPCIProxy *proxy,
VirtIOPCIRegion *region,
struct virtio_pci_cap *cap,
MemoryRegion *mr,
uint8_t bar)
{
// A BAR (type) can hold more than one structure, and the modern-bar MR likewise
// relates to more than one BAR (there are 6 BARs), so each structure's MR is added as a subregion.
memory_region_add_subregion(mr, region->offset, &region->mr);
// Fill in the capability fields describing where this structure lives
cap->cfg_type = region->type;
cap->bar = bar;
cap->offset = cpu_to_le32(region->offset);
cap->length = cpu_to_le32(region->size);
virtio_pci_add_mem_cap(proxy, cap);
}
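For orientation, the offset, size and type of each region are filled in earlier during realize; the resulting layout of the modern BAR looks roughly like the sketch below (offsets from my reading of virtio_pci_realize(); treat them as illustrative and verify against the source):
/*
 * Modern BAR layout (illustrative, not authoritative):
 *   0x0000, size 0x1000   common config   VIRTIO_PCI_CAP_COMMON_CFG
 *   0x1000, size 0x1000   ISR status      VIRTIO_PCI_CAP_ISR_CFG
 *   0x2000, size 0x1000   device config   VIRTIO_PCI_CAP_DEVICE_CFG
 *   0x3000, size ...      notify area     VIRTIO_PCI_CAP_NOTIFY_CFG
 *                         (one notify slot per possible virtqueue,
 *                          so it scales with VIRTIO_QUEUE_MAX)
 */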
virtio_pci_add_mem_cap()
QEMU
static int virtio_pci_add_mem_cap(VirtIOPCIProxy *proxy, struct virtio_pci_cap *cap)
{
PCIDevice *dev = &proxy->pci_dev;
int offset;
offset = pci_add_capability(dev, PCI_CAP_ID_VNDR, 0, cap->cap_len, &error_abort);
memcpy(dev->config + offset + PCI_CAP_FLAGS, &cap->cap_len, cap->cap_len - PCI_CAP_FLAGS);
return offset;
}
virtio_pci_common_write()
QEMU
pci_host_config_write_common
pci_dev->config_write()
virtio_write_config
virtio_address_space_write
memory_region_dispatch_write
access_with_adjusted_size
memory_region_write_accessor
mr->ops->write(mr->opaque, addr, tmp, size);
virtio_pci_common_write
static void virtio_pci_common_write(void *opaque, hwaddr addr, uint64_t val, unsigned size)
{
VirtIOPCIProxy *proxy = opaque;
VirtIODevice *vdev = virtio_bus_get_device(&proxy->bus);
uint16_t vector;
//...
switch (addr) {
case VIRTIO_PCI_COMMON_DFSELECT:
proxy->dfselect = val;
break;
case VIRTIO_PCI_COMMON_GFSELECT:
proxy->gfselect = val;
break;
case VIRTIO_PCI_COMMON_GF:
if (proxy->gfselect < ARRAY_SIZE(proxy->guest_features)) {
proxy->guest_features[proxy->gfselect] = val;
virtio_set_features(vdev, (((uint64_t)proxy->guest_features[1]) << 32) |
proxy->guest_features[0]);
}
break;
case VIRTIO_PCI_COMMON_MSIX:
if (vdev->config_vector != VIRTIO_NO_VECTOR) {
msix_vector_unuse(&proxy->pci_dev, vdev->config_vector);
}
/* Make it possible for guest to discover an error took place. */
if (val < proxy->nvectors) {
msix_vector_use(&proxy->pci_dev, val);
} else {
val = VIRTIO_NO_VECTOR;
}
vdev->config_vector = val;
break;
case VIRTIO_PCI_COMMON_STATUS:
if (!(val & VIRTIO_CONFIG_S_DRIVER_OK)) {
virtio_pci_stop_ioeventfd(proxy);
}
virtio_set_status(vdev, val & 0xFF);
if (val & VIRTIO_CONFIG_S_DRIVER_OK) {
virtio_pci_start_ioeventfd(proxy);
}
if (vdev->status == 0) {
virtio_pci_reset(DEVICE(proxy));
}
break;
// Select this virtqueue
case VIRTIO_PCI_COMMON_Q_SELECT:
if (val < VIRTIO_QUEUE_MAX)
vdev->queue_sel = val;
break;
case VIRTIO_PCI_COMMON_Q_SIZE:
proxy->vqs[vdev->queue_sel].num = val;
virtio_queue_set_num(vdev, vdev->queue_sel, proxy->vqs[vdev->queue_sel].num);
virtio_init_region_cache(vdev, vdev->queue_sel);
break;
case VIRTIO_PCI_COMMON_Q_MSIX:
vector = virtio_queue_vector(vdev, vdev->queue_sel);
if (vector != VIRTIO_NO_VECTOR) {
msix_vector_unuse(&proxy->pci_dev, vector);
}
/* Make it possible for guest to discover an error took place. */
if (val < proxy->nvectors) {
msix_vector_use(&proxy->pci_dev, val);
} else {
val = VIRTIO_NO_VECTOR;
}
virtio_queue_set_vector(vdev, vdev->queue_sel, val);
break;
// Enable the currently selected virtqueue
case VIRTIO_PCI_COMMON_Q_ENABLE:
if (val == 1) {
virtio_queue_set_num(vdev, vdev->queue_sel,
proxy->vqs[vdev->queue_sel].num);
virtio_queue_set_rings(vdev, vdev->queue_sel,
((uint64_t)proxy->vqs[vdev->queue_sel].desc[1]) << 32 |
proxy->vqs[vdev->queue_sel].desc[0],
((uint64_t)proxy->vqs[vdev->queue_sel].avail[1]) << 32 |
proxy->vqs[vdev->queue_sel].avail[0],
((uint64_t)proxy->vqs[vdev->queue_sel].used[1]) << 32 |
proxy->vqs[vdev->queue_sel].used[0]);
proxy->vqs[vdev->queue_sel].enabled = 1;
proxy->vqs[vdev->queue_sel].reset = 0;
virtio_queue_enable(vdev, vdev->queue_sel);
} else {
virtio_error(vdev, "wrong value for queue_enable %"PRIx64, val);
}
break;
// Low 32 bits of the desc ring address (GPA)
case VIRTIO_PCI_COMMON_Q_DESCLO:
proxy->vqs[vdev->queue_sel].desc[0] = val;
break;
// High 32 bits of the desc ring address (GPA)
case VIRTIO_PCI_COMMON_Q_DESCHI:
proxy->vqs[vdev->queue_sel].desc[1] = val;
break;
// Low 32 bits of the avail ring address (GPA)
case VIRTIO_PCI_COMMON_Q_AVAILLO:
proxy->vqs[vdev->queue_sel].avail[0] = val;
break;
// High 32 bits of the avail ring address (GPA)
case VIRTIO_PCI_COMMON_Q_AVAILHI:
proxy->vqs[vdev->queue_sel].avail[1] = val;
break;
// Low 32 bits of the used ring address (GPA)
case VIRTIO_PCI_COMMON_Q_USEDLO:
proxy->vqs[vdev->queue_sel].used[0] = val;
break;
// High 32 bits of the used ring address (GPA)
case VIRTIO_PCI_COMMON_Q_USEDHI:
proxy->vqs[vdev->queue_sel].used[1] = val;
break;
case VIRTIO_PCI_COMMON_Q_RESET:
if (val == 1) {
proxy->vqs[vdev->queue_sel].reset = 1;
virtio_queue_reset(vdev, vdev->queue_sel);
proxy->vqs[vdev->queue_sel].reset = 0;
proxy->vqs[vdev->queue_sel].enabled = 0;
}
break;
default:
break;
}
}
VirtIO PCI Modern
setup_vq()
Guest kernel modern
Note that this function is defined in both of the following files:
drivers/virtio/virtio_pci_legacy.c
drivers/virtio/virtio_pci_modern.c
so it is not used only by modern devices; legacy has its own version too. The code below is the modern one.
static struct virtqueue *setup_vq(struct virtio_pci_device *vp_dev,
struct virtio_pci_vq_info *info,
unsigned int index,
void (*callback)(struct virtqueue *vq),
const char *name,
bool ctx,
u16 msix_vec)
{
struct virtio_pci_modern_device *mdev = &vp_dev->mdev;
bool (*notify)(struct virtqueue *vq);
struct virtqueue *vq;
u16 num;
int err;
if (__virtio_test_bit(&vp_dev->vdev, VIRTIO_F_NOTIFICATION_DATA))
notify = vp_notify_with_data;
else
notify = vp_notify;
//...
/* Check if queue is either not available or already active. */
num = vp_modern_get_queue_size(mdev, index);
// !num means the queue is unavailable, so we should report an error.
// Likewise, report an error if the queue has already been enabled.
// In other words, the queue must be available and not yet enabled.
if (!num || vp_modern_get_queue_enable(mdev, index))
return ERR_PTR(-ENOENT);
info->msix_vector = msix_vec;
// Or rather, the virtqueue itself already exists at this point;
// here we only create the structure that describes it.
vq = vring_create_virtqueue(index, num,
SMP_CACHE_BYTES, &vp_dev->vdev,
true, true, ctx,
notify, callback, name);
//...
vq->num_max = num;
err = vp_active_vq(vq, msix_vec);
//...
vq->priv = (void __force *)vp_modern_map_vq_notify(mdev, index, NULL);
//...
return vq;
//...
}
vp_active_vq()
Guest kernel modern
The difference between activate and enable: the queue is activated first, then enabled.
vp_modern_find_vqs
vp_find_vqs
vp_find_vqs_msix
vp_setup_vq
setup_vq
// This is where the virtqueue is activated
vp_active_vq
list_for_each_entry(vq, &vdev->vqs, list)
// enable each queue
vp_modern_set_queue_enable(&vp_dev->mdev, vq->index, true);
static int vp_active_vq(struct virtqueue *vq, u16 msix_vec)
{
struct virtio_pci_device *vp_dev = to_vp_device(vq->vdev);
struct virtio_pci_modern_device *mdev = &vp_dev->mdev;
unsigned long index;
index = vq->index;
// Set the queue size
vp_modern_set_queue_size(mdev, index, virtqueue_get_vring_size(vq));
// Program the addresses of the desc/avail/used rings
vp_modern_queue_address(mdev, index, virtqueue_get_desc_addr(vq),
virtqueue_get_avail_addr(vq),
virtqueue_get_used_addr(vq));
if (msix_vec != VIRTIO_MSI_NO_VECTOR) {
msix_vec = vp_modern_queue_vector(mdev, index, msix_vec);
//...
}
//...
}
vp_modern_queue_address()
Guest kernel
Sets the addresses of the three vrings, Desc/Available/Used (how does this relate to queue_pfn?).
vp_modern_queue_address(mdev, index, virtqueue_get_desc_addr(vq),
virtqueue_get_avail_addr(vq),
virtqueue_get_used_addr(vq));
void vp_modern_queue_address(struct virtio_pci_modern_device *mdev,
u16 index, u64 desc_addr, u64 driver_addr,
u64 device_addr)
{
struct virtio_pci_common_cfg __iomem *cfg = mdev->common;
vp_iowrite16(index, &cfg->queue_select);
vp_iowrite64_twopart(desc_addr, &cfg->queue_desc_lo, &cfg->queue_desc_hi);
vp_iowrite64_twopart(driver_addr, &cfg->queue_avail_lo, &cfg->queue_avail_hi);
vp_iowrite64_twopart(device_addr, &cfg->queue_used_lo, &cfg->queue_used_hi);
}
vp_modern_get_queue_size()
/ vp_modern_get_queue_enable()
Guest kernel
Reads the size of the currently selected queue.
/*
* vp_modern_get_queue_size - get size for a virtqueue
* @mdev: the modern virtio-pci device
* @index: the queue index
*
* Returns the size of the virtqueue
*/
u16 vp_modern_get_queue_size(struct virtio_pci_modern_device *mdev, u16 index)
{
// Write index to queue_select, i.e. select the queue at this index.
// #define VIRTIO_PCI_COMMON_Q_SELECT 22
vp_iowrite16(index, &mdev->common->queue_select);
// Read the size of the currently selected queue
return vp_ioread16(&mdev->common->queue_size);
}
// (in QEMU) for the queue_select write
virtio_pci_common_write
case VIRTIO_PCI_COMMON_Q_SELECT:
vdev->queue_sel = val;
// (in QEMU) for the queue_size read
virtio_pci_common_read
case VIRTIO_PCI_COMMON_Q_SIZE:
val = virtio_queue_get_num(vdev, vdev->queue_sel);
bool vp_modern_get_queue_enable(struct virtio_pci_modern_device *mdev, u16 index)
{
// Select the queue by writing queue_select
vp_iowrite16(index, &mdev->common->queue_select);
// Read this to check whether the queue has been enabled
// (VIRTIO_PCI_COMMON_Q_ENABLE)
return vp_ioread16(&mdev->common->queue_enable);
}
// in QEMU
virtio_pci_common_read
case VIRTIO_PCI_COMMON_Q_ENABLE:
val = proxy->vqs[vdev->queue_sel].enabled;
// in QEMU, enabling this queue
virtio_pci_common_write
case VIRTIO_PCI_COMMON_Q_ENABLE:
if (val == 1) {
virtio_queue_set_num
virtio_queue_set_rings
virtio_queue_enable
else
virtio_error(vdev, "wrong value for queue_enable %"PRIx64, val);
VIRTIO_PCI_COMMON_*
/ struct virtio_pci_common_cfg
/ Kernel header
// The values of these macros are byte offsets; e.g. SELECT is 22,
// meaning the field sits 22 bytes into the structure.
#define VIRTIO_PCI_COMMON_DFSELECT 0
#define VIRTIO_PCI_COMMON_DF 4
#define VIRTIO_PCI_COMMON_GFSELECT 8
#define VIRTIO_PCI_COMMON_GF 12
#define VIRTIO_PCI_COMMON_MSIX 16
#define VIRTIO_PCI_COMMON_NUMQ 18
#define VIRTIO_PCI_COMMON_STATUS 20
#define VIRTIO_PCI_COMMON_CFGGENERATION 21
#define VIRTIO_PCI_COMMON_Q_SELECT 22
#define VIRTIO_PCI_COMMON_Q_SIZE 24
#define VIRTIO_PCI_COMMON_Q_MSIX 26
#define VIRTIO_PCI_COMMON_Q_ENABLE 28
#define VIRTIO_PCI_COMMON_Q_NOFF 30
#define VIRTIO_PCI_COMMON_Q_DESCLO 32
#define VIRTIO_PCI_COMMON_Q_DESCHI 36
#define VIRTIO_PCI_COMMON_Q_AVAILLO 40
#define VIRTIO_PCI_COMMON_Q_AVAILHI 44
#define VIRTIO_PCI_COMMON_Q_USEDLO 48
#define VIRTIO_PCI_COMMON_Q_USEDHI 52
#define VIRTIO_PCI_COMMON_Q_NDATA 56
#define VIRTIO_PCI_COMMON_Q_RESET 58
/* Fields in VIRTIO_PCI_CAP_COMMON_CFG: */
// The layout of this structure matches the macro values above one-to-one,
// which is enforced with checks like
// BUILD_BUG_ON(VIRTIO_PCI_COMMON_Q_SELECT != offsetof(struct virtio_pci_common_cfg, queue_select));
// to guarantee the correspondence.
struct virtio_pci_common_cfg {
/* About the whole device. */
__le32 device_feature_select; /* read-write */
__le32 device_feature; /* read-only */
__le32 guest_feature_select; /* read-write */
__le32 guest_feature; /* read-write */
__le16 msix_config; /* read-write */
__le16 num_queues; /* read-only */
__u8 device_status; /* read-write */
__u8 config_generation; /* read-only */
/* About a specific virtqueue. */
__le16 queue_select; /* read-write */
__le16 queue_size; /* read-write, power of 2. */
__le16 queue_msix_vector; /* read-write */
__le16 queue_enable; /* read-write */
__le16 queue_notify_off; /* read-only */
// Addresses of the desc/avail/used vrings.
__le32 queue_desc_lo; /* read-write */
__le32 queue_desc_hi; /* read-write */
__le32 queue_avail_lo; /* read-write */
__le32 queue_avail_hi; /* read-write */
__le32 queue_used_lo; /* read-write */
__le32 queue_used_hi; /* read-write */
};
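The correspondence can be checked at compile time, as the BUILD_BUG_ON above suggests. Here is a minimal standalone sketch of such checks, using C11 _Static_assert against a plain-C mirror of the structure (the offsets hold because every field is naturally aligned):
#include <stddef.h>
#include <stdint.h>
/* Plain-C mirror of struct virtio_pci_common_cfg, for illustration only. */
struct common_cfg_mirror {
    uint32_t device_feature_select, device_feature;
    uint32_t guest_feature_select, guest_feature;
    uint16_t msix_config, num_queues;
    uint8_t device_status, config_generation;
    uint16_t queue_select, queue_size, queue_msix_vector;
    uint16_t queue_enable, queue_notify_off;
    uint32_t queue_desc_lo, queue_desc_hi;
    uint32_t queue_avail_lo, queue_avail_hi;
    uint32_t queue_used_lo, queue_used_hi;
};
_Static_assert(offsetof(struct common_cfg_mirror, queue_select)  == 22, "Q_SELECT");
_Static_assert(offsetof(struct common_cfg_mirror, queue_enable)  == 28, "Q_ENABLE");
_Static_assert(offsetof(struct common_cfg_mirror, queue_desc_lo) == 32, "Q_DESCLO");
_Static_assert(offsetof(struct common_cfg_mirror, queue_used_hi) == 52, "Q_USEDHI");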
VIRTIO_PCI_COMMON_Q_SELECT
Kernel
Indicates which queue is currently being operated on.
On the device (backend) side:
virtio_pci_common_write
case VIRTIO_PCI_COMMON_Q_SELECT:
if (val < VIRTIO_QUEUE_MAX)
// the currently selected queue
vdev->queue_sel = val;
virtio_pci_common_read
case VIRTIO_PCI_COMMON_Q_SELECT:
val = vdev->queue_sel;
VirtIO PCI Initialization Process
#define TYPE_VIRTIO_DEVICE "virtio-device"
static const TypeInfo virtio_device_info = {
.name = TYPE_VIRTIO_DEVICE,
// inherits from TYPE_DEVICE
.parent = TYPE_DEVICE,
.instance_size = sizeof(VirtIODevice),
.class_init = virtio_device_class_init,
.instance_finalize = virtio_device_instance_finalize,
.abstract = true,
.class_size = sizeof(VirtioDeviceClass),
};
virtio_device_class_init()
QEMU
static void virtio_device_class_init(ObjectClass *klass, void *data)
{
/* Set the default value here. */
VirtioDeviceClass *vdc = VIRTIO_DEVICE_CLASS(klass);
DeviceClass *dc = DEVICE_CLASS(klass);
dc->realize = virtio_device_realize;
dc->unrealize = virtio_device_unrealize;
dc->bus_type = TYPE_VIRTIO_BUS;
device_class_set_props(dc, virtio_properties);
vdc->start_ioeventfd = virtio_device_start_ioeventfd_impl;
vdc->stop_ioeventfd = virtio_device_stop_ioeventfd_impl;
vdc->legacy_features |= VIRTIO_LEGACY_FEATURES;
}
virtio_device_realize()
QEMU
virtio_device_realize
virtio_bus_device_plugged
virtio_pci_device_plugged
// Sets up the VirtIO-PCI-related memory regions and their ops
virtio_pci_modern_regions_init
struct VirtioDeviceClass
struct VirtioDeviceClass {
/*< private >*/
DeviceClass parent;
/*< public >*/
/* This is what a VirtioDevice must implement */
DeviceRealize realize;
DeviceUnrealize unrealize;
uint64_t (*get_features)(VirtIODevice *vdev,
uint64_t requested_features,
Error **errp);
uint64_t (*bad_features)(VirtIODevice *vdev);
void (*set_features)(VirtIODevice *vdev, uint64_t val);
int (*validate_features)(VirtIODevice *vdev);
void (*get_config)(VirtIODevice *vdev, uint8_t *config);
void (*set_config)(VirtIODevice *vdev, const uint8_t *config);
void (*reset)(VirtIODevice *vdev);
void (*set_status)(VirtIODevice *vdev, uint8_t val);
/* Device must validate queue_index. */
void (*queue_reset)(VirtIODevice *vdev, uint32_t queue_index);
/* Device must validate queue_index. */
void (*queue_enable)(VirtIODevice *vdev, uint32_t queue_index);
/* For transitional devices, this is a bitmap of features
* that are only exposed on the legacy interface but not
* the modern one.
*/
uint64_t legacy_features;
/* Test and clear event pending status.
* Should be called after unmask to avoid losing events.
* If backend does not support masking,
* must check in frontend instead.
*/
bool (*guest_notifier_pending)(VirtIODevice *vdev, int n);
/* Mask/unmask events from this vq. Any events reported
* while masked will become pending.
* If backend does not support masking,
* must mask in frontend instead.
*/
void (*guest_notifier_mask)(VirtIODevice *vdev, int n, bool mask);
int (*start_ioeventfd)(VirtIODevice *vdev);
void (*stop_ioeventfd)(VirtIODevice *vdev);
/* Saving and loading of a virtio device; trying to deprecate save/load
* use vmsd for new devices.
*/
void (*save)(VirtIODevice *vdev, QEMUFile *f);
int (*load)(VirtIODevice *vdev, QEMUFile *f, int version_id);
/* Post load hook in vmsd is called early while device is processed, and
* when VirtIODevice isn't fully initialized. Devices should use this instead,
* unless they specifically want to verify the migration stream as it's
* processed, e.g. for bounds checking.
*/
int (*post_load)(VirtIODevice *vdev);
const VMStateDescription *vmsd;
bool (*primary_unplug_pending)(void *opaque);
struct vhost_dev *(*get_vhost)(VirtIODevice *vdev);
void (*toggle_device_iotlb)(VirtIODevice *vdev);
};
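To see how these hooks are used, here is a sketch of a concrete device's class_init. The device name virtio-foo and all foo_* symbols are hypothetical; real devices such as virtio-blk follow the same pattern in their own class_init.
/* Hypothetical example: a virtio device wiring up its VirtioDeviceClass hooks. */
static void virtio_foo_class_init(ObjectClass *klass, void *data)
{
    DeviceClass *dc = DEVICE_CLASS(klass);
    VirtioDeviceClass *vdc = VIRTIO_DEVICE_CLASS(klass);

    device_class_set_props(dc, virtio_foo_properties);   /* hypothetical */
    dc->vmsd = &vmstate_virtio_foo;                       /* hypothetical */

    vdc->realize      = virtio_foo_device_realize;   /* create the virtqueues     */
    vdc->unrealize    = virtio_foo_device_unrealize;
    vdc->get_config   = virtio_foo_get_config;       /* device config space reads */
    vdc->set_config   = virtio_foo_set_config;
    vdc->get_features = virtio_foo_get_features;     /* feature negotiation       */
    vdc->set_status   = virtio_foo_set_status;
    vdc->reset        = virtio_foo_reset;
}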
VirtIO Structure PCI Capabilities
The virtio device configuration layout includes several structures:
- Common configuration
- Notifications
- ISR Status
- Device-specific configuration (optional)
- PCI configuration access
Each structure can be
- mapped by a BAR belonging to the function, or
- accessed via the special VIRTIO_PCI_CAP_PCI_CFG field in the PCI configuration space.
Because part of the PCI configuration space layout is device specific, a VirtIO device has the VIRTIO_PCI_CAP_PCI_CFG field in its own configuration. Likewise, which BARs a VirtIO device has, and what they mean, is defined by the device itself; that is why these structures are also designed to be reachable through the PCI BARs.
So where exactly do these structures live? The location of each structure is specified using a vendor-specific PCI capability located on the capability list^ in PCI configuration space.
As shown below, each such structure is 16 bytes (four dwords) and occupies one entry in the capability list.
struct virtio_pci_cap {
u8 cap_vndr; /* Generic PCI field: PCI_CAP_ID_VNDR */
u8 cap_next; /* Generic PCI field: next ptr. */
// Don't confuse this with the length field below; this is only the length of this capability entry.
u8 cap_len; /* Generic PCI field: capability length, Length of this capability structure */
// One of:
// - Common configuration
// - Notifications
// - ISR Status
// - Device-specific configuration
// - PCI configuration access
u8 cfg_type; /* Identifies the structure. */
// values 0x0 to 0x5 (there are 6 BARs in total) specify a BAR located in PCI Configuration Space
// and used to map the structure into Memory or I/O Space.
// This BAR tells us where the structure for the corresponding cfg_type is mapped.
u8 bar; /* Where to find it. */
// One type, e.g. VIRTIO_PCI_CAP_COMMON_CFG, may have multiple structures;
// this is the id of the corresponding structure.
u8 id; /* Multiple capabilities of the same type */
u8 padding[2]; /* Pad to full dword. */
// indicates where the structure begins relative to the base address associated with the BAR.
// The BAR gives the base address, so offset is where this particular structure
// starts relative to that base (as noted above, one type such as VIRTIO_PCI_CAP_COMMON_CFG
// may have multiple structures, while one virtio_pci_cap describes exactly one structure).
le32 offset; /* Offset within bar. */
le32 length; /* Length of the structure, in bytes. */
}
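To make this concrete, here is a sketch of how a guest driver locates the capability for a given cfg_type by walking the vendor-specific capabilities in PCI configuration space. It closely follows what Linux's virtio_pci_find_capability() does, simplified (the real code also checks the BAR's resource type):
#include <linux/pci.h>
#include <linux/stddef.h>
#include <linux/virtio_pci.h>
/* Return the config-space offset of the first virtio capability whose
 * cfg_type matches, or 0 if none is found. */
static u8 find_virtio_capability(struct pci_dev *dev, u8 cfg_type)
{
    int pos;

    for (pos = pci_find_capability(dev, PCI_CAP_ID_VNDR);
         pos > 0;
         pos = pci_find_next_capability(dev, pos, PCI_CAP_ID_VNDR)) {
        u8 type, bar;

        pci_read_config_byte(dev, pos + offsetof(struct virtio_pci_cap, cfg_type), &type);
        pci_read_config_byte(dev, pos + offsetof(struct virtio_pci_cap, bar), &bar);

        if (type == cfg_type && bar < PCI_STD_NUM_BARS)
            return pos;
    }
    return 0;
}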
cfg_type
: The device MAY offer more than one structure of any type; this makes it possible for the device to expose multiple interfaces to drivers.
Let's now look at each cfg_type in turn:
VIRTIO_PCI_CAP_COMMON_CFG
Kernel
The common configuration structure is found at the bar and offset within the VIRTIO_PCI_CAP_COMMON_CFG
capability.
struct virtio_pci_common_cfg {
/* About the whole device. */
__le32 device_feature_select; /* read-write */
__le32 device_feature; /* read-only */
__le32 guest_feature_select; /* read-write */
__le32 guest_feature; /* read-write */
__le16 msix_config; /* read-write */
__le16 num_queues; /* read-only */
__u8 device_status; /* read-write */
__u8 config_generation; /* read-only */
/* About a specific virtqueue. */
__le16 queue_select; /* read-write */
__le16 queue_size; /* read-write, power of 2. */
__le16 queue_msix_vector; /* read-write */
__le16 queue_enable; /* read-write */
__le16 queue_notify_off; /* read-only */
__le32 queue_desc_lo; /* read-write */
__le32 queue_desc_hi; /* read-write */
__le32 queue_avail_lo; /* read-write */
__le32 queue_avail_hi; /* read-write */
__le32 queue_used_lo; /* read-write */
__le32 queue_used_hi; /* read-write */
};
device_feature_select
: The driver uses this to select which feature bits device_feature
shows.
- 0x0 selects Feature Bits 0 to 31,
- 0x1 selects Feature Bits 32 to 63.
device_feature
: The device uses this to report which feature bits it is offering to the driver: the driver writes to device_feature_select
to select which feature bits are presented.
driver_feature_select
: The driver uses this to select which feature bits driver_feature
shows. Value 0x0 selects Feature Bits 0 to 31, 0x1 selects Feature Bits 32 to 63, etc.
driver_feature
: The driver writes this to accept feature bits offered by the device. Driver Feature Bits selected by driver_feature_select.
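Since each of these registers is only 32 bits wide, reading the full 64-bit device feature set takes two select/read pairs. A guest-kernel sketch of that sequence (it mirrors what the kernel's vp_modern_get_features() does, using the same vp_ioread32/vp_iowrite32 helpers as the other snippets here):
/* Sketch: read the 64-bit device feature bitmap through the 32-bit
 * select/value register pair. */
static u64 read_device_features(struct virtio_pci_common_cfg __iomem *cfg)
{
    u64 features;

    vp_iowrite32(0, &cfg->device_feature_select);           /* bits 0..31  */
    features = vp_ioread32(&cfg->device_feature);

    vp_iowrite32(1, &cfg->device_feature_select);           /* bits 32..63 */
    features |= (u64)vp_ioread32(&cfg->device_feature) << 32;

    return features;
}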
msix_config
: VIRTIO_PCI_COMMON_MSIX
: Set by the driver to the MSI-X vector used for configuration change notifications.
virtio_pci_common_read
switch (addr) {
case VIRTIO_PCI_COMMON_MSIX:
val = vdev->config_vector;
virtio_pci_common_write
switch (addr) {
case VIRTIO_PCI_COMMON_MSIX:
// First invalidate (unuse) the previously configured vector
if (vdev->config_vector != VIRTIO_NO_VECTOR)
msix_vector_unuse(&proxy->pci_dev, vdev->config_vector);
// then enable the new vector with msix_vector_use()
if (val < proxy->nvectors)
msix_vector_use(&proxy->pci_dev, val);
else
val = VIRTIO_NO_VECTOR;
vdev->config_vector = val;
queue_msix_vector
: Set by the driver to the MSI-X vector for virtqueue notifications:
virtio_pci_common_read
switch (addr) {
case VIRTIO_PCI_COMMON_Q_MSIX:
val = virtio_queue_vector(vdev, vdev->queue_sel);
virtio_pci_common_write
switch (addr) {
case VIRTIO_PCI_COMMON_Q_MSIX:
// Same logic as the config MSI-X vector above:
// unuse the previous vector first, then use the new one
vector = virtio_queue_vector(vdev, vdev->queue_sel);
if (vector != VIRTIO_NO_VECTOR)
msix_vector_unuse(&proxy->pci_dev, vector);
if (val < proxy->nvectors)
msix_vector_use(&proxy->pci_dev, val);
else
val = VIRTIO_NO_VECTOR;
virtio_queue_set_vector(vdev, vdev->queue_sel, val);
VIRTIO_PCI_CAP_DEVICE_CFG
VirtIO Device Configuration Space
Used for rarely-changing or initialization-time parameters. Where configuration fields are optional, their existence is indicated by feature bits.
This configuration space is not the same thing as the PCI configuration space. The VirtIO spec does not describe its contents in detail; it only says that it is device specific.
Note that this space is not unique to virtio-pci; virtio-mmio has it too, only the access method differs.
In the QEMU code, VirtIODevice->config holds the VirtIO device configuration space.
virtio_pci_modern_regions_init
static const MemoryRegionOps device_ops = {
.read = virtio_pci_device_read,
.write = virtio_pci_device_write,
//...
};
virtio_pci_device_write / virtio_mmio_write
virtio_config_modern_writew
stw_le_p(vdev->config + addr, val);
g_string_printf(name, "virtio-pci-device-%s", vdev_name);
memory_region_init_io(&proxy->device.mr, OBJECT(proxy), &device_ops, proxy, name->str, proxy->device.size);
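On the guest side, device-specific config reads go through the standard virtio config helpers; as one example, virtio-blk reads its capacity from this space roughly as sketched below (virtio_cread() is the helper from linux/virtio_config.h; the snippet is simplified, not quoted from the real driver):
#include <linux/virtio.h>
#include <linux/virtio_config.h>
#include <linux/virtio_blk.h>
/* Sketch: read the 'capacity' field of virtio-blk's device configuration
 * space; on virtio-pci these reads end up in virtio_pci_device_read() in QEMU. */
static u64 read_disk_capacity(struct virtio_device *vdev)
{
    u64 capacity;

    virtio_cread(vdev, struct virtio_blk_config, capacity, &capacity);
    return capacity;
}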