QEMU Patch: dfb8e184db758bff275f94f7aa634300886cfe21 virtio-pci: initial virtio 1.0 support

Since this is VirtIO over PCI, the VirtIO device is presented to the guest as a PCI device. The PCI configuration space must therefore carry a Vendor ID and a Device ID: the Vendor ID is 0x1AF4, and the Device ID encodes the virtio device type (transitional devices use 0x1000 for network, 0x1001 for block, and so on; modern devices use 0x1040 plus the virtio device ID).
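
As a sanity check on the ID scheme, here is a small standalone sketch (my own, based on the virtio 1.0 spec rather than QEMU code) that recovers the virtio device type from a modern PCI vendor/device ID pair:

#include <stdint.h>
#include <stdio.h>

#define VIRTIO_PCI_VENDOR_ID      0x1af4
#define VIRTIO_PCI_MODERN_ID_BASE 0x1040  /* modern: 0x1040 + virtio device ID */

/* Returns the virtio device ID (1 = net, 2 = block, ...) or -1 if the pair
 * is not a modern virtio PCI device.  Per the spec, virtio PCI devices use
 * Device IDs 0x1000..0x107f; 0x1000..0x103f is the transitional range. */
static int virtio_device_id(uint16_t vendor, uint16_t device)
{
    if (vendor != VIRTIO_PCI_VENDOR_ID)
        return -1;
    if (device < VIRTIO_PCI_MODERN_ID_BASE || device > 0x107f)
        return -1;
    return device - VIRTIO_PCI_MODERN_ID_BASE;
}

int main(void)
{
    printf("%d\n", virtio_device_id(0x1af4, 0x1042)); /* 2 => virtio-blk */
    return 0;
}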

For a single PCI virtio device, QEMU supports up to 1024 virtqueues. Judging from the logs, virtio-blk appears to use only one virtqueue (set up in init_vq()). This can be confirmed from the guest dmesg by grepping for the string printed here:

dev_info(&vdev->dev, "%d/%d/%d default/read/poll queues\n",
            vblk->io_queues[HCTX_TYPE_DEFAULT],
            vblk->io_queues[HCTX_TYPE_READ],
            vblk->io_queues[HCTX_TYPE_POLL]);

// Output similar to the following; even though virtio-blk enables 4 queues by default, only one is actually used
virtio_blk virtio2: 4/0/0 default/read/poll queues

virtio_pci_modern_regions_init() QEMU

To understand this code, you first need to understand the Virtio Structure PCI Capabilities^. In short, a virtio PCI device exposes several configurable structures: Common configuration • Notifications • ISR Status • Device-specific configuration (optional) • PCI configuration access.

Note that VirtIO MMIO devices do not have any of these.

static void virtio_pci_modern_regions_init(VirtIOPCIProxy *proxy,
                                           const char *vdev_name)
{
    static const MemoryRegionOps common_ops = {
        .read = virtio_pci_common_read,
        .write = virtio_pci_common_write,
        .impl = {
            .min_access_size = 1,
            .max_access_size = 4,
        },
        .endianness = DEVICE_LITTLE_ENDIAN,
    };
    static const MemoryRegionOps isr_ops = {
        .read = virtio_pci_isr_read,
        .write = virtio_pci_isr_write,
        .impl = {
            .min_access_size = 1,
            .max_access_size = 4,
        },
        .endianness = DEVICE_LITTLE_ENDIAN,
    };
    static const MemoryRegionOps device_ops = {
        .read = virtio_pci_device_read,
        .write = virtio_pci_device_write,
        .impl = {
            .min_access_size = 1,
            .max_access_size = 4,
        },
        .endianness = DEVICE_LITTLE_ENDIAN,
    };
    static const MemoryRegionOps notify_ops = {
        .read = virtio_pci_notify_read,
        .write = virtio_pci_notify_write,
        .impl = {
            .min_access_size = 1,
            .max_access_size = 4,
        },
        .endianness = DEVICE_LITTLE_ENDIAN,
    };
    static const MemoryRegionOps notify_pio_ops = {
        .read = virtio_pci_notify_read,
        .write = virtio_pci_notify_write_pio,
        .impl = {
            .min_access_size = 1,
            .max_access_size = 4,
        },
        .endianness = DEVICE_LITTLE_ENDIAN,
    };
    g_autoptr(GString) name = g_string_new(NULL);

    g_string_printf(name, "virtio-pci-common-%s", vdev_name);
    memory_region_init_io(&proxy->common.mr, OBJECT(proxy),
                          &common_ops,
                          proxy,
                          name->str,
                          proxy->common.size);

    g_string_printf(name, "virtio-pci-isr-%s", vdev_name);
    memory_region_init_io(&proxy->isr.mr, OBJECT(proxy),
                          &isr_ops,
                          proxy,
                          name->str,
                          proxy->isr.size);

    g_string_printf(name, "virtio-pci-device-%s", vdev_name);
    memory_region_init_io(&proxy->device.mr, OBJECT(proxy),
                          &device_ops,
                          proxy,
                          name->str,
                          proxy->device.size);

    g_string_printf(name, "virtio-pci-notify-%s", vdev_name);
    memory_region_init_io(&proxy->notify.mr, OBJECT(proxy),
                          &notify_ops,
                          proxy,
                          name->str,
                          proxy->notify.size);

    g_string_printf(name, "virtio-pci-notify-pio-%s", vdev_name);
    memory_region_init_io(&proxy->notify_pio.mr, OBJECT(proxy),
                          &notify_pio_ops,
                          proxy,
                          name->str,
                          proxy->notify_pio.size);
}

struct VirtIOPCIRegion QEMU

Corresponds to one structure described in the VirtIO spec.

See Virtio Structure PCI Capabilities^.

typedef struct VirtIOPCIRegion {
    MemoryRegion mr;
    // Offset of this structure relative to the BAR's base address
    uint32_t offset;
    // Size of the structure
    uint32_t size;
    // Which structure this is: Common configuration • Notifications • ISR Status • Device-specific configuration (optional) • PCI configuration access
    uint32_t type;
} VirtIOPCIRegion;

struct VirtIOPCIQueue QEMU

A structure shared by all virtio PCI devices: virtio-scsi-pci, virtio-pmem-pci, virtio-mem-pci, virtio-input-pci, vhost-user-vsock-pci, and so on.

A structure representing one virtqueue from QEMU's (transport-level) point of view.

We already have struct VirtQueue, so why define this new structure? Judging from virtio_pci_common_write() below, it shadows the transport-level state the driver writes through the common configuration structure (queue size, ring addresses, enable/reset flags) until VIRTIO_PCI_COMMON_Q_ENABLE pushes that state into the generic VirtQueue.

typedef struct VirtIOPCIQueue {
  uint16_t num;
  bool enabled;
  // No need to migrate the reset status, because it is always 0
  // when the migration starts.
  bool reset;
  // Two entries because the address may be 64-bit: [0] is the low half, [1] the high half.
  uint32_t desc[2];
  uint32_t avail[2];
  uint32_t used[2];
} VirtIOPCIQueue;
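
To make the low/high split concrete, here is a tiny standalone sketch (mine, mirroring the VIRTIO_PCI_COMMON_Q_ENABLE case of virtio_pci_common_write() shown later) of how the two cached 32-bit halves are recombined into a 64-bit guest physical address:

#include <stdint.h>
#include <stdio.h>

/* Same idea as VirtIOPCIQueue: each ring address is cached as two
 * 32-bit registers written by the guest driver. */
struct vq_shadow {
    uint32_t desc[2];   /* [0] = low 32 bits, [1] = high 32 bits */
};

static uint64_t combine(const uint32_t half[2])
{
    return ((uint64_t)half[1] << 32) | half[0];
}

int main(void)
{
    struct vq_shadow q = { .desc = { 0x89abc000u, 0x00000001u } };
    printf("desc GPA = 0x%llx\n", (unsigned long long)combine(q.desc)); /* 0x189abc000 */
    return 0;
}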

struct VirtIOPCIProxy QEMU

There is one instance of this structure per virtio PCI device.

It is probably called a proxy because its fields are directly tied to externally visible interfaces; see the variable virtio_pci_properties.

struct VirtIOPCIProxy {
    // The corresponding PCI device
    PCIDevice pci_dev;
    MemoryRegion bar;
    union {
        // The five capability structures;
        // see the virtio spec.
        struct {
            VirtIOPCIRegion common;
            VirtIOPCIRegion isr;
            VirtIOPCIRegion device;
            VirtIOPCIRegion notify;
            VirtIOPCIRegion notify_pio;
        };
        VirtIOPCIRegion regs[5];
    };
    // Each structure above is located via a BAR (a PCI function has 6 BARs).
    // modern_bar is the container MemoryRegion backing the modern memory BAR;
    // each region's MR is added to it as a subregion.
    MemoryRegion modern_bar;
    MemoryRegion io_bar;
    uint32_t legacy_io_bar_idx;
    uint32_t msix_bar_idx;
    uint32_t modern_io_bar_idx;
    uint32_t modern_mem_bar_idx;
    int config_cap;
    uint32_t flags;
    bool disable_modern;
    bool ignore_backend_features;
    OnOffAuto disable_legacy;
    /* Transitional device id */
    uint16_t trans_devid;
    uint32_t class_code;
    uint32_t nvectors;
    uint32_t dfselect;
    uint32_t gfselect;
    // The feature bits the driver has written, i.e. what the driver wants,
    // not necessarily what the device ends up with: the device still ANDs
    // them with the features it actually supports.
    uint32_t guest_features[2];
    // All virtqueues of this device.
    // VIRTIO_QUEUE_MAX is 1024, i.e. 1024 queue slots are
    // preallocated for the device to use.
    VirtIOPCIQueue vqs[VIRTIO_QUEUE_MAX];
    VirtIOIRQFD *vector_irqfd;
    int nvqs_with_notifiers;
    VirtioBusState bus;
};
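
Because of the anonymous union, the five named regions and regs[5] alias the same storage, so code can either name a region (proxy->common) or iterate over regs[]. A minimal standalone illustration of that aliasing (not QEMU code):

#include <assert.h>
#include <stddef.h>

struct region { unsigned offset, size, type; };

struct proxy_like {
    union {
        struct {
            struct region common;
            struct region isr;
            struct region device;
            struct region notify;
            struct region notify_pio;
        };
        struct region regs[5];
    };
};

int main(void)
{
    struct proxy_like p = {0};
    p.regs[0].type = 1;                    /* writes through the array... */
    assert(p.common.type == 1);            /* ...are visible via the named field */
    assert((void *)&p.regs[3] == (void *)&p.notify);
    return 0;
}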

virtio_pci_modern_region_map() QEMU

// virtio_pci_modern_region_map(proxy, region, cap, &proxy->modern_bar, proxy->modern_mem_bar_idx);
static void virtio_pci_modern_region_map(VirtIOPCIProxy *proxy,
                                     VirtIOPCIRegion *region,
                                     struct virtio_pci_cap *cap,
                                     MemoryRegion *mr,
                                     uint8_t bar)
{
    // One BAR can host multiple structures (and the modern BAR container MR
    // covers all of them), so each region is added as a subregion at its offset.
    memory_region_add_subregion(mr, region->offset, &region->mr);

    // Fill in the vendor-specific capability that tells the driver
    // where this structure lives (type, BAR, offset, length).
    cap->cfg_type = region->type;
    cap->bar = bar;
    cap->offset = cpu_to_le32(region->offset);
    cap->length = cpu_to_le32(region->size);
    virtio_pci_add_mem_cap(proxy, cap);
}

virtio_pci_add_mem_cap() QEMU

static int virtio_pci_add_mem_cap(VirtIOPCIProxy *proxy, struct virtio_pci_cap *cap)
{
    PCIDevice *dev = &proxy->pci_dev;
    int offset;
    // Reserve a vendor-specific capability entry of cap_len bytes in config space;
    // the PCI core fills in the capability ID and the next pointer.
    offset = pci_add_capability(dev, PCI_CAP_ID_VNDR, 0, cap->cap_len, &error_abort);
    // Copy the rest of the capability body (everything after the 2-byte generic
    // header, i.e. starting at cap_len) into config space.
    memcpy(dev->config + offset + PCI_CAP_FLAGS, &cap->cap_len, cap->cap_len - PCI_CAP_FLAGS);
    return offset;
}
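
For reference, this is roughly what the capability body describing the common configuration structure looks like before virtio_pci_add_mem_cap() copies it into config space. The struct and cfg_type constants follow the virtio spec; the bar/offset/length values are purely illustrative, not QEMU's actual defaults:

#include <stdint.h>
#include <stdio.h>

/* cfg_type values from the virtio spec. */
#define VIRTIO_PCI_CAP_COMMON_CFG 1
#define VIRTIO_PCI_CAP_NOTIFY_CFG 2
#define VIRTIO_PCI_CAP_ISR_CFG    3
#define VIRTIO_PCI_CAP_DEVICE_CFG 4
#define VIRTIO_PCI_CAP_PCI_CFG    5

struct virtio_pci_cap {
    uint8_t  cap_vndr;    /* PCI_CAP_ID_VNDR, filled by the PCI core */
    uint8_t  cap_next;    /* next capability pointer, filled by the PCI core */
    uint8_t  cap_len;     /* length of this capability entry */
    uint8_t  cfg_type;    /* which structure this capability describes */
    uint8_t  bar;         /* which BAR the structure lives in */
    uint8_t  id;
    uint8_t  padding[2];
    uint32_t offset;      /* little-endian in the real layout: offset within the BAR */
    uint32_t length;      /* little-endian in the real layout: size of the structure */
};

int main(void)
{
    /* bar/offset/length here are illustrative only. */
    struct virtio_pci_cap cap = {
        .cap_len  = sizeof(cap),   /* 16 bytes, matching the spec */
        .cfg_type = VIRTIO_PCI_CAP_COMMON_CFG,
        .bar      = 4,
        .offset   = 0x0,
        .length   = 0x1000,
    };
    printf("cap_len=%u cfg_type=%u bar=%u\n",
           (unsigned)cap.cap_len, (unsigned)cap.cfg_type, (unsigned)cap.bar);
    return 0;
}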

virtio_pci_common_write() QEMU

One call path into this handler, via the VIRTIO_PCI_CAP_PCI_CFG window in PCI configuration space:

pci_host_config_write_common
pci_dev->config_write()
virtio_write_config
virtio_address_space_write
memory_region_dispatch_write
access_with_adjusted_size
memory_region_write_accessor
mr->ops->write(mr->opaque, addr, tmp, size);
virtio_pci_common_write

static void virtio_pci_common_write(void *opaque, hwaddr addr, uint64_t val, unsigned size)
{
    VirtIOPCIProxy *proxy = opaque;
    VirtIODevice *vdev = virtio_bus_get_device(&proxy->bus);
    uint16_t vector;

    //...
    switch (addr) {
    case VIRTIO_PCI_COMMON_DFSELECT:
        proxy->dfselect = val;
        break;
    case VIRTIO_PCI_COMMON_GFSELECT:
        proxy->gfselect = val;
        break;
    case VIRTIO_PCI_COMMON_GF:
        if (proxy->gfselect < ARRAY_SIZE(proxy->guest_features)) {
            proxy->guest_features[proxy->gfselect] = val;
            virtio_set_features(vdev, (((uint64_t)proxy->guest_features[1]) << 32) |
                                proxy->guest_features[0]);
        }
        break;
    case VIRTIO_PCI_COMMON_MSIX:
        if (vdev->config_vector != VIRTIO_NO_VECTOR) {
            msix_vector_unuse(&proxy->pci_dev, vdev->config_vector);
        }
        /* Make it possible for guest to discover an error took place. */
        if (val < proxy->nvectors) {
            msix_vector_use(&proxy->pci_dev, val);
        } else {
            val = VIRTIO_NO_VECTOR;
        }
        vdev->config_vector = val;
        break;
    case VIRTIO_PCI_COMMON_STATUS:
        if (!(val & VIRTIO_CONFIG_S_DRIVER_OK)) {
            virtio_pci_stop_ioeventfd(proxy);
        }

        virtio_set_status(vdev, val & 0xFF);

        if (val & VIRTIO_CONFIG_S_DRIVER_OK) {
            virtio_pci_start_ioeventfd(proxy);
        }

        if (vdev->status == 0) {
            virtio_pci_reset(DEVICE(proxy));
        }

        break;
    // Select this virtqueue
    case VIRTIO_PCI_COMMON_Q_SELECT:
        if (val < VIRTIO_QUEUE_MAX)
            vdev->queue_sel = val;
        break;
    case VIRTIO_PCI_COMMON_Q_SIZE:
        proxy->vqs[vdev->queue_sel].num = val;
        virtio_queue_set_num(vdev, vdev->queue_sel, proxy->vqs[vdev->queue_sel].num);
        virtio_init_region_cache(vdev, vdev->queue_sel);
        break;
    case VIRTIO_PCI_COMMON_Q_MSIX:
        vector = virtio_queue_vector(vdev, vdev->queue_sel);
        if (vector != VIRTIO_NO_VECTOR) {
            msix_vector_unuse(&proxy->pci_dev, vector);
        }
        /* Make it possible for guest to discover an error took place. */
        if (val < proxy->nvectors) {
            msix_vector_use(&proxy->pci_dev, val);
        } else {
            val = VIRTIO_NO_VECTOR;
        }
        virtio_queue_set_vector(vdev, vdev->queue_sel, val);
        break;
    // Enable the currently selected virtqueue
    case VIRTIO_PCI_COMMON_Q_ENABLE:
        if (val == 1) {
            virtio_queue_set_num(vdev, vdev->queue_sel,
                                 proxy->vqs[vdev->queue_sel].num);
            virtio_queue_set_rings(vdev, vdev->queue_sel,
                       ((uint64_t)proxy->vqs[vdev->queue_sel].desc[1]) << 32 |
                       proxy->vqs[vdev->queue_sel].desc[0],
                       ((uint64_t)proxy->vqs[vdev->queue_sel].avail[1]) << 32 |
                       proxy->vqs[vdev->queue_sel].avail[0],
                       ((uint64_t)proxy->vqs[vdev->queue_sel].used[1]) << 32 |
                       proxy->vqs[vdev->queue_sel].used[0]);
            proxy->vqs[vdev->queue_sel].enabled = 1;
            proxy->vqs[vdev->queue_sel].reset = 0;
            virtio_queue_enable(vdev, vdev->queue_sel);
        } else {
            virtio_error(vdev, "wrong value for queue_enable %"PRIx64, val);
        }
        break;
    // Low 32 bits of the desc ring address (GPA)
    case VIRTIO_PCI_COMMON_Q_DESCLO:
        proxy->vqs[vdev->queue_sel].desc[0] = val;
        break;
    // High 32 bits of the desc ring address (GPA)
    case VIRTIO_PCI_COMMON_Q_DESCHI:
        proxy->vqs[vdev->queue_sel].desc[1] = val;
        break;
    // Low 32 bits of the avail ring address (GPA)
    case VIRTIO_PCI_COMMON_Q_AVAILLO:
        proxy->vqs[vdev->queue_sel].avail[0] = val;
        break;
    // High 32 bits of the avail ring address (GPA)
    case VIRTIO_PCI_COMMON_Q_AVAILHI:
        proxy->vqs[vdev->queue_sel].avail[1] = val;
        break;
    // Low 32 bits of the used ring address (GPA)
    case VIRTIO_PCI_COMMON_Q_USEDLO:
        proxy->vqs[vdev->queue_sel].used[0] = val;
        break;
    // High 32 bits of the used ring address (GPA)
    case VIRTIO_PCI_COMMON_Q_USEDHI:
        proxy->vqs[vdev->queue_sel].used[1] = val;
        break;
    case VIRTIO_PCI_COMMON_Q_RESET:
        if (val == 1) {
            proxy->vqs[vdev->queue_sel].reset = 1;

            virtio_queue_reset(vdev, vdev->queue_sel);

            proxy->vqs[vdev->queue_sel].reset = 0;
            proxy->vqs[vdev->queue_sel].enabled = 0;
        }
        break;
    default:
        break;
    }
}
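
Putting the cases above together, the write sequence a modern driver performs against this handler to bring up one virtqueue looks roughly like the sketch below. mmio_write16()/mmio_write32() are hypothetical stand-ins for the kernel's vp_iowrite16()/vp_iowrite32(); the offsets are the VIRTIO_PCI_COMMON_* values listed further down.

#include <stdint.h>
#include <string.h>

#define VIRTIO_PCI_COMMON_Q_SELECT  22
#define VIRTIO_PCI_COMMON_Q_SIZE    24
#define VIRTIO_PCI_COMMON_Q_ENABLE  28
#define VIRTIO_PCI_COMMON_Q_DESCLO  32
#define VIRTIO_PCI_COMMON_Q_DESCHI  36
#define VIRTIO_PCI_COMMON_Q_AVAILLO 40
#define VIRTIO_PCI_COMMON_Q_AVAILHI 44
#define VIRTIO_PCI_COMMON_Q_USEDLO  48
#define VIRTIO_PCI_COMMON_Q_USEDHI  52

/* Stand-ins for real MMIO accessors; here they just record the values into a
 * buffer so the sketch compiles and runs on its own. */
static void mmio_write16(uint8_t *base, unsigned off, uint16_t v) { memcpy(base + off, &v, 2); }
static void mmio_write32(uint8_t *base, unsigned off, uint32_t v) { memcpy(base + off, &v, 4); }

static void bring_up_queue(uint8_t *cfg, uint16_t index, uint16_t size,
                           uint64_t desc, uint64_t avail, uint64_t used)
{
    mmio_write16(cfg, VIRTIO_PCI_COMMON_Q_SELECT, index);      /* select the queue */
    mmio_write16(cfg, VIRTIO_PCI_COMMON_Q_SIZE, size);         /* ring size */
    mmio_write32(cfg, VIRTIO_PCI_COMMON_Q_DESCLO,  (uint32_t)desc);
    mmio_write32(cfg, VIRTIO_PCI_COMMON_Q_DESCHI,  (uint32_t)(desc >> 32));
    mmio_write32(cfg, VIRTIO_PCI_COMMON_Q_AVAILLO, (uint32_t)avail);
    mmio_write32(cfg, VIRTIO_PCI_COMMON_Q_AVAILHI, (uint32_t)(avail >> 32));
    mmio_write32(cfg, VIRTIO_PCI_COMMON_Q_USEDLO,  (uint32_t)used);
    mmio_write32(cfg, VIRTIO_PCI_COMMON_Q_USEDHI,  (uint32_t)(used >> 32));
    /* Only after the addresses are in place does Q_ENABLE=1 make QEMU call
     * virtio_queue_set_rings() and mark the queue enabled. */
    mmio_write16(cfg, VIRTIO_PCI_COMMON_Q_ENABLE, 1);
}

int main(void)
{
    uint8_t cfg[64] = {0};
    bring_up_queue(cfg, 0, 256, 0x10000, 0x11000, 0x12000);
    return 0;
}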

VirtIO PCI Modern

setup_vq() Guest kernel modern

Note that this function is defined in both of these files:

  • drivers/virtio/virtio_pci_legacy.c
  • drivers/virtio/virtio_pci_modern.c

which means it is not used only by the modern transport; legacy has its own version. The code below is the modern one.

static struct virtqueue *setup_vq(struct virtio_pci_device *vp_dev,
				  struct virtio_pci_vq_info *info,
				  unsigned int index,
				  void (*callback)(struct virtqueue *vq),
				  const char *name,
				  bool ctx,
				  u16 msix_vec)
{

	struct virtio_pci_modern_device *mdev = &vp_dev->mdev;
	bool (*notify)(struct virtqueue *vq);
	struct virtqueue *vq;
	u16 num;
	int err;

	if (__virtio_test_bit(&vp_dev->vdev, VIRTIO_F_NOTIFICATION_DATA))
		notify = vp_notify_with_data;
	else
		notify = vp_notify;

    //...
	/* Check if queue is either not available or already active. */
	num = vp_modern_get_queue_size(mdev, index);
    // !num means this queue is unavailable, so we should fail.
    // Likewise, if the queue has already been enabled, fail as well.
    // In other words, the queue must be available and not yet enabled.
	if (!num || vp_modern_get_queue_enable(mdev, index))
		return ERR_PTR(-ENOENT);

	info->msix_vector = msix_vec;

    // The device already advertises this queue slot; here the driver allocates
    // the rings and the struct virtqueue that describes them.

	vq = vring_create_virtqueue(index, num,
				    SMP_CACHE_BYTES, &vp_dev->vdev,
				    true, true, ctx,
				    notify, callback, name);
    //...
	vq->num_max = num;

	err = vp_active_vq(vq, msix_vec);
    //...

	vq->priv = (void __force *)vp_modern_map_vq_notify(mdev, index, NULL);
    //...
	return vq;
    //...
}
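
vp_modern_map_vq_notify() derives the doorbell address for this queue from the notify capability: the base of the notify region plus queue_notify_off * notify_off_multiplier (both read from the device). A standalone sketch of just that arithmetic, with illustrative values:

#include <stdint.h>
#include <stdio.h>

/* Sketch of the notify-address computation: the virtio_pci_notify_cap adds a
 * notify_off_multiplier to the generic capability, and each queue's doorbell
 * lives at notify_region_base + queue_notify_off * notify_off_multiplier. */
static uint64_t vq_notify_addr(uint64_t notify_base, uint32_t notify_off_multiplier,
                               uint16_t queue_notify_off)
{
    return notify_base + (uint64_t)queue_notify_off * notify_off_multiplier;
}

int main(void)
{
    /* Illustrative values only. */
    printf("0x%llx\n", (unsigned long long)vq_notify_addr(0xfe003000, 4, 3));
    return 0;
}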

vp_active_vq() Guest kernel modern

The difference between activate and enable: a queue is activated first, then enabled.

vp_modern_find_vqs
    vp_find_vqs
        vp_find_vqs_msix
            vp_setup_vq
                setup_vq
                    // this is where the virtqueue gets activated
                    vp_active_vq
    list_for_each_entry(vq, &vdev->vqs, list)
        // enable each queue
		vp_modern_set_queue_enable(&vp_dev->mdev, vq->index, true);

static int vp_active_vq(struct virtqueue *vq, u16 msix_vec)
{
	struct virtio_pci_device *vp_dev = to_vp_device(vq->vdev);
	struct virtio_pci_modern_device *mdev = &vp_dev->mdev;
	unsigned long index;

	index = vq->index;
    // Set the queue size
	vp_modern_set_queue_size(mdev, index, virtqueue_get_vring_size(vq));
    // Program the guest physical addresses of the desc/avail/used rings
	vp_modern_queue_address(mdev, index, virtqueue_get_desc_addr(vq),
				virtqueue_get_avail_addr(vq),
				virtqueue_get_used_addr(vq));

	if (msix_vec != VIRTIO_MSI_NO_VECTOR) {
		msix_vec = vp_modern_queue_vector(mdev, index, msix_vec);
        //...
	}
    //...
}

vp_modern_queue_address() Guest kernel

Sets the addresses of the three vrings: Descriptor/Available/Used. (How does this relate to queue_pfn? queue_pfn belongs to the legacy interface, where a single page frame number covering the contiguous ring layout is written to VIRTIO_PCI_QUEUE_PFN; the modern interface instead programs each ring's full 64-bit address separately.)

vp_modern_queue_address(mdev, index, virtqueue_get_desc_addr(vq),
            virtqueue_get_avail_addr(vq),
            virtqueue_get_used_addr(vq));

void vp_modern_queue_address(struct virtio_pci_modern_device *mdev,
			     u16 index, u64 desc_addr, u64 driver_addr,
			     u64 device_addr)
{
	struct virtio_pci_common_cfg __iomem *cfg = mdev->common;

	vp_iowrite16(index, &cfg->queue_select);

	vp_iowrite64_twopart(desc_addr, &cfg->queue_desc_lo, &cfg->queue_desc_hi);
	vp_iowrite64_twopart(driver_addr, &cfg->queue_avail_lo, &cfg->queue_avail_hi);
	vp_iowrite64_twopart(device_addr, &cfg->queue_used_lo, &cfg->queue_used_hi);
}
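
vp_iowrite64_twopart() simply splits the 64-bit address into two 32-bit writes, low half first, matching the _LO/_HI register pairs that virtio_pci_common_write() decodes on the QEMU side. A self-contained sketch of the idea (not the kernel's exact code):

#include <stdint.h>

/* Stand-in for iowrite32(); here it just stores through a plain pointer so
 * the sketch is self-contained. */
static void write32(volatile uint32_t *reg, uint32_t val) { *reg = val; }

/* Split a 64-bit value across a lo/hi register pair, low half first. */
static void write64_twopart(uint64_t val, volatile uint32_t *lo, volatile uint32_t *hi)
{
    write32(lo, (uint32_t)val);
    write32(hi, (uint32_t)(val >> 32));
}

int main(void)
{
    uint32_t queue_desc_lo = 0, queue_desc_hi = 0;
    write64_twopart(0x123456789ULL, &queue_desc_lo, &queue_desc_hi);
    return (queue_desc_lo == 0x23456789 && queue_desc_hi == 0x1) ? 0 : 1;
}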

vp_modern_get_queue_size() / vp_modern_get_queue_enable() Guest kernel

Gets the size of the selected queue.

/*
 * vp_modern_get_queue_size - get size for a virtqueue
 * @mdev: the modern virtio-pci device
 * @index: the queue index
 *
 * Returns the size of the virtqueue
 */
u16 vp_modern_get_queue_size(struct virtio_pci_modern_device *mdev, u16 index)
{
    // Write index to queue_select, i.e. select this queue.
    // #define VIRTIO_PCI_COMMON_Q_SELECT	22
	vp_iowrite16(index, &mdev->common->queue_select);
    // Read back the size of the currently selected queue
	return vp_ioread16(&mdev->common->queue_size);
}

// (in QEMU) for the queue_select write
virtio_pci_common_write
    case VIRTIO_PCI_COMMON_Q_SELECT:
        vdev->queue_sel = val;

// (in QEMU) for the queue_size read
virtio_pci_common_read
    case VIRTIO_PCI_COMMON_Q_SIZE:
        val = virtio_queue_get_num(vdev, vdev->queue_sel);

bool vp_modern_get_queue_enable(struct virtio_pci_modern_device *mdev, u16 index)
{
    // Select the queue by writing its index to queue_select
	vp_iowrite16(index, &mdev->common->queue_select);
    // Read queue_enable to see whether this queue has been enabled
    // (VIRTIO_PCI_COMMON_Q_ENABLE on the QEMU side)
	return vp_ioread16(&mdev->common->queue_enable);
}

// in QEMU
virtio_pci_common_read
    case VIRTIO_PCI_COMMON_Q_ENABLE:
        val = proxy->vqs[vdev->queue_sel].enabled;

// in QEMU, enabling the selected queue
virtio_pci_common_write
    case VIRTIO_PCI_COMMON_Q_ENABLE:
        if (val == 1) {
            virtio_queue_set_num
            virtio_queue_set_rings
            virtio_queue_enable
        else
            virtio_error(vdev, "wrong value for queue_enable %"PRIx64, val);

VIRTIO_PCI_COMMON_* / struct virtio_pci_common_cfg / Kernel header

// The values of these macros are byte offsets into the common config structure;
// e.g. Q_SELECT is 22, meaning the field lives 22 bytes from the start.
#define VIRTIO_PCI_COMMON_DFSELECT	0
#define VIRTIO_PCI_COMMON_DF		4
#define VIRTIO_PCI_COMMON_GFSELECT	8
#define VIRTIO_PCI_COMMON_GF		12
#define VIRTIO_PCI_COMMON_MSIX		16
#define VIRTIO_PCI_COMMON_NUMQ		18
#define VIRTIO_PCI_COMMON_STATUS	20
#define VIRTIO_PCI_COMMON_CFGGENERATION	21
#define VIRTIO_PCI_COMMON_Q_SELECT	22
#define VIRTIO_PCI_COMMON_Q_SIZE	24
#define VIRTIO_PCI_COMMON_Q_MSIX	26
#define VIRTIO_PCI_COMMON_Q_ENABLE	28
#define VIRTIO_PCI_COMMON_Q_NOFF	30
#define VIRTIO_PCI_COMMON_Q_DESCLO	32
#define VIRTIO_PCI_COMMON_Q_DESCHI	36
#define VIRTIO_PCI_COMMON_Q_AVAILLO	40
#define VIRTIO_PCI_COMMON_Q_AVAILHI	44
#define VIRTIO_PCI_COMMON_Q_USEDLO	48
#define VIRTIO_PCI_COMMON_Q_USEDHI	52
#define VIRTIO_PCI_COMMON_Q_NDATA	56
#define VIRTIO_PCI_COMMON_Q_RESET	58

/* Fields in VIRTIO_PCI_CAP_COMMON_CFG: */
// The layout of this structure corresponds one-to-one with the macro
// values above, which is enforced by checks such as
//   BUILD_BUG_ON(VIRTIO_PCI_COMMON_Q_SELECT != offsetof(struct virtio_pci_common_cfg, queue_select));
struct virtio_pci_common_cfg {
	/* About the whole device. */
	__le32 device_feature_select;	/* read-write */
	__le32 device_feature;		/* read-only */
	__le32 guest_feature_select;	/* read-write */
	__le32 guest_feature;		/* read-write */
	__le16 msix_config;		/* read-write */
	__le16 num_queues;		/* read-only */
	__u8 device_status;		/* read-write */
	__u8 config_generation;		/* read-only */

	/* About a specific virtqueue. */
	__le16 queue_select;		/* read-write */
	__le16 queue_size;		/* read-write, power of 2. */
	__le16 queue_msix_vector;	/* read-write */
	__le16 queue_enable;		/* read-write */
	__le16 queue_notify_off;	/* read-only */

    // Guest physical addresses of the three vrings: desc/avail/used.
	__le32 queue_desc_lo;		/* read-write */
	__le32 queue_desc_hi;		/* read-write */
	__le32 queue_avail_lo;		/* read-write */
	__le32 queue_avail_hi;		/* read-write */
	__le32 queue_used_lo;		/* read-write */
	__le32 queue_used_hi;		/* read-write */
};
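
The BUILD_BUG_ON check mentioned above can be reproduced outside the kernel with C11 _Static_assert. A minimal standalone version for a few of the fields, with the kernel's __le16/__le32 types replaced by plain fixed-width integers:

#include <stdint.h>
#include <stddef.h>

#define VIRTIO_PCI_COMMON_STATUS   20
#define VIRTIO_PCI_COMMON_Q_SELECT 22
#define VIRTIO_PCI_COMMON_Q_DESCLO 32

/* Same layout as struct virtio_pci_common_cfg, truncated after queue_desc_lo
 * because the remaining fields do not affect the asserted offsets. */
struct common_cfg {
    uint32_t device_feature_select;
    uint32_t device_feature;
    uint32_t guest_feature_select;
    uint32_t guest_feature;
    uint16_t msix_config;
    uint16_t num_queues;
    uint8_t  device_status;
    uint8_t  config_generation;
    uint16_t queue_select;
    uint16_t queue_size;
    uint16_t queue_msix_vector;
    uint16_t queue_enable;
    uint16_t queue_notify_off;
    uint32_t queue_desc_lo;
    /* ... remaining lo/hi address fields ... */
};

_Static_assert(offsetof(struct common_cfg, device_status) == VIRTIO_PCI_COMMON_STATUS,
               "device_status offset");
_Static_assert(offsetof(struct common_cfg, queue_select) == VIRTIO_PCI_COMMON_Q_SELECT,
               "queue_select offset");
_Static_assert(offsetof(struct common_cfg, queue_desc_lo) == VIRTIO_PCI_COMMON_Q_DESCLO,
               "queue_desc_lo offset");

int main(void) { return 0; }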

VIRTIO_PCI_COMMON_Q_SELECT Kernel

Selects which queue subsequent per-queue accesses operate on:

On the device (backend) side:

virtio_pci_common_write
    case VIRTIO_PCI_COMMON_Q_SELECT:
        if (val < VIRTIO_QUEUE_MAX)
            // the currently selected queue
            vdev->queue_sel = val;

virtio_pci_common_read
    case VIRTIO_PCI_COMMON_Q_SELECT:
        val = vdev->queue_sel;

VirtIO PCI Initialization Process

#define TYPE_VIRTIO_DEVICE "virtio-device"
static const TypeInfo virtio_device_info = {
    .name = TYPE_VIRTIO_DEVICE,
    // inherits from TYPE_DEVICE
    .parent = TYPE_DEVICE,
    .instance_size = sizeof(VirtIODevice),
    .class_init = virtio_device_class_init,
    .instance_finalize = virtio_device_instance_finalize,
    .abstract = true,
    .class_size = sizeof(VirtioDeviceClass),
};

virtio_device_class_init() QEMU

static void virtio_device_class_init(ObjectClass *klass, void *data)
{
    /* Set the default value here. */
    VirtioDeviceClass *vdc = VIRTIO_DEVICE_CLASS(klass);
    DeviceClass *dc = DEVICE_CLASS(klass);

    dc->realize = virtio_device_realize;
    dc->unrealize = virtio_device_unrealize;
    dc->bus_type = TYPE_VIRTIO_BUS;
    device_class_set_props(dc, virtio_properties);
    vdc->start_ioeventfd = virtio_device_start_ioeventfd_impl;
    vdc->stop_ioeventfd = virtio_device_stop_ioeventfd_impl;

    vdc->legacy_features |= VIRTIO_LEGACY_FEATURES;
}

virtio_device_realize() QEMU

virtio_device_realize
    virtio_bus_device_plugged
        virtio_pci_device_plugged
            // sets up the VirtIO-PCI-related memory regions and their ops
            virtio_pci_modern_regions_init

struct VirtioDeviceClass

struct VirtioDeviceClass {
    /*< private >*/
    DeviceClass parent;
    /*< public >*/

    /* This is what a VirtioDevice must implement */
    DeviceRealize realize;
    DeviceUnrealize unrealize;
    uint64_t (*get_features)(VirtIODevice *vdev,
                             uint64_t requested_features,
                             Error **errp);
    uint64_t (*bad_features)(VirtIODevice *vdev);
    void (*set_features)(VirtIODevice *vdev, uint64_t val);
    int (*validate_features)(VirtIODevice *vdev);
    void (*get_config)(VirtIODevice *vdev, uint8_t *config);
    void (*set_config)(VirtIODevice *vdev, const uint8_t *config);
    void (*reset)(VirtIODevice *vdev);
    void (*set_status)(VirtIODevice *vdev, uint8_t val);
    /* Device must validate queue_index.  */
    void (*queue_reset)(VirtIODevice *vdev, uint32_t queue_index);
    /* Device must validate queue_index.  */
    void (*queue_enable)(VirtIODevice *vdev, uint32_t queue_index);
    /* For transitional devices, this is a bitmap of features
     * that are only exposed on the legacy interface but not
     * the modern one.
     */
    uint64_t legacy_features;
    /* Test and clear event pending status.
     * Should be called after unmask to avoid losing events.
     * If backend does not support masking,
     * must check in frontend instead.
     */
    bool (*guest_notifier_pending)(VirtIODevice *vdev, int n);
    /* Mask/unmask events from this vq. Any events reported
     * while masked will become pending.
     * If backend does not support masking,
     * must mask in frontend instead.
     */
    void (*guest_notifier_mask)(VirtIODevice *vdev, int n, bool mask);
    int (*start_ioeventfd)(VirtIODevice *vdev);
    void (*stop_ioeventfd)(VirtIODevice *vdev);
    /* Saving and loading of a virtio device; trying to deprecate save/load
     * use vmsd for new devices.
     */
    void (*save)(VirtIODevice *vdev, QEMUFile *f);
    int (*load)(VirtIODevice *vdev, QEMUFile *f, int version_id);
    /* Post load hook in vmsd is called early while device is processed, and
     * when VirtIODevice isn't fully initialized.  Devices should use this instead,
     * unless they specifically want to verify the migration stream as it's
     * processed, e.g. for bounds checking.
     */
    int (*post_load)(VirtIODevice *vdev);
    const VMStateDescription *vmsd;
    bool (*primary_unplug_pending)(void *opaque);
    struct vhost_dev *(*get_vhost)(VirtIODevice *vdev);
    void (*toggle_device_iotlb)(VirtIODevice *vdev);
};

VirtIO Structure PCI Capabilities

The virtio device configuration layout includes several structures:

  • Common configuration
  • Notifications
  • ISR Status
  • Device-specific configuration (optional)
  • PCI configuration access

Each structure can be

  • mapped by a BAR belonging to the function, or
  • accessed via the special VIRTIO_PCI_CAP_PCI_CFG field in the PCI configuration space.

Part of the PCI configuration space layout is device-specific, which is why a VirtIO device carries the VIRTIO_PCI_CAP_PCI_CFG capability in its own configuration. Likewise, which BARs a VirtIO device has and what they mean are defined by the device itself, so these structures are also designed to be reachable by accessing the PCI BARs.

So where exactly do these structures live? The location of each structure is specified using a vendor-specific PCI capability located on the capability list^ in PCI configuration space.

As shown below, the capability structure is 16 bytes (padded to a full dword) and occupies one entry on the capability list.

struct virtio_pci_cap {
    u8 cap_vndr; /* Generic PCI field: PCI_CAP_ID_VNDR */
    u8 cap_next; /* Generic PCI field: next ptr. */
    // Don't confuse this with the length field below: this is only the
    // length of this capability entry.
    u8 cap_len; /* Generic PCI field: capability length, Length of this capability structure */
    // One of:
    //  - Common configuration
    //  - Notifications
    //  - ISR Status
    //  - Device-specific configuration
    //  - PCI configuration access
    u8 cfg_type; /* Identifies the structure. */
    // Values 0x0 to 0x5 (there are 6 BARs in total) specify a BAR located in the
    // PCI configuration space header and used to map the structure into memory or I/O space.
    // The BAR gives the base address of the region in which the structure for this cfg_type lives.
    u8 bar; /* Where to find it. */
    // One type, e.g. VIRTIO_PCI_CAP_COMMON_CFG, can have multiple structures;
    // this is the id of the corresponding structure.
    u8 id; /* Multiple capabilities of the same type */
    u8 padding[2]; /* Pad to full dword. */
    // Indicates where the structure begins relative to the base address associated with the BAR.
    // The BAR gives the base address; offset is where this particular structure starts
    // relative to that base (as noted above, one type such as VIRTIO_PCI_CAP_COMMON_CFG
    // may have several structures, while one virtio_pci_cap corresponds to exactly one).
    le32 offset; /* Offset within bar. */
    le32 length; /* Length of the structure, in bytes. */
}
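
To find these capabilities in the first place, a driver walks the standard PCI capability list (the pointer lives at config offset 0x34) looking for vendor-specific entries (cap ID 0x09) and then dispatches on cfg_type. A self-contained sketch over an in-memory copy of config space; dump_virtio_caps() and the fake config contents are mine, not a real PCI API:

#include <stdint.h>
#include <stdio.h>

#define PCI_CAPABILITY_LIST 0x34   /* offset of the first capability pointer */
#define PCI_CAP_ID_VNDR     0x09   /* vendor-specific capability */

/* Walk the capability list in an in-memory copy of PCI config space and
 * print every vendor-specific (virtio) capability found.  cfg[] is assumed
 * to hold at least the first 256 bytes of config space. */
static void dump_virtio_caps(const uint8_t *cfg)
{
    uint8_t pos = cfg[PCI_CAPABILITY_LIST];

    while (pos != 0) {
        uint8_t cap_id   = cfg[pos];       /* cap_vndr in struct virtio_pci_cap */
        uint8_t cap_next = cfg[pos + 1];

        if (cap_id == PCI_CAP_ID_VNDR) {
            uint8_t cfg_type = cfg[pos + 3];
            uint8_t bar      = cfg[pos + 4];
            printf("virtio cap at 0x%02x: cfg_type=%u bar=%u\n",
                   (unsigned)pos, (unsigned)cfg_type, (unsigned)bar);
        }
        pos = cap_next;
    }
}

int main(void)
{
    uint8_t cfg[256] = {0};
    /* One fake vendor capability at offset 0x40: cfg_type=1 (common cfg), bar=4. */
    cfg[PCI_CAPABILITY_LIST] = 0x40;
    cfg[0x40] = PCI_CAP_ID_VNDR;  cfg[0x41] = 0x00;
    cfg[0x43] = 1;  cfg[0x44] = 4;
    dump_virtio_caps(cfg);
    return 0;
}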

cfg_type: The device MAY offer more than one structure of any type; this makes it possible for the device to expose multiple interfaces to drivers.

Let's now look at each cfg_type.

VIRTIO_PCI_CAP_COMMON_CFG Kernel

The common configuration structure is found at the bar and offset within the VIRTIO_PCI_CAP_COMMON_CFG capability.

struct virtio_pci_common_cfg {
	/* About the whole device. */
	__le32 device_feature_select;	/* read-write */
	__le32 device_feature;		/* read-only */
	__le32 guest_feature_select;	/* read-write */
	__le32 guest_feature;		/* read-write */
	__le16 msix_config;		/* read-write */
	__le16 num_queues;		/* read-only */
	__u8 device_status;		/* read-write */
	__u8 config_generation;		/* read-only */

	/* About a specific virtqueue. */
	__le16 queue_select;		/* read-write */
	__le16 queue_size;		/* read-write, power of 2. */
	__le16 queue_msix_vector;	/* read-write */
	__le16 queue_enable;		/* read-write */
	__le16 queue_notify_off;	/* read-only */
	__le32 queue_desc_lo;		/* read-write */
	__le32 queue_desc_hi;		/* read-write */
	__le32 queue_avail_lo;		/* read-write */
	__le32 queue_avail_hi;		/* read-write */
	__le32 queue_used_lo;		/* read-write */
	__le32 queue_used_hi;		/* read-write */
};

device_feature_select: The driver uses this to select which feature bits device_feature shows.

  • 0x0 selects Feature Bits 0 to 31,
  • 0x1 selects Feature Bits 32 to 63.

device_feature: The device uses this to report which feature bits it is offering to the driver: the driver writes to device_feature_select to select which feature bits are presented.

driver_feature_select: The driver uses this to select which feature bits driver_feature shows (the kernel header above calls this field guest_feature_select). Value 0x0 selects Feature Bits 0 to 31, 0x1 selects Feature Bits 32 to 63, etc.

driver_feature: The driver writes this to accept feature bits offered by the device (guest_feature in the kernel header above). Driver Feature Bits selected by driver_feature_select.
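
In other words, the 64-bit feature set is negotiated 32 bits at a time through the select/value register pairs. A self-contained sketch of the read-offered/write-accepted dance; read_device_feature()/write_driver_feature() are stand-ins backed by a fake device so the example runs on its own:

#include <stdint.h>
#include <stdio.h>

/* A tiny fake device backs these accessors; in a real driver they would be
 * MMIO accesses to the *_feature_select / *_feature fields of the common
 * configuration structure. */
static const uint64_t fake_device_features = (1ULL << 32) | 0x30; /* bit 32 = VIRTIO_F_VERSION_1 */
static uint32_t driver_features[2];

static uint32_t read_device_feature(uint32_t select)
{
    /* write device_feature_select = select, then read device_feature */
    return (uint32_t)(fake_device_features >> (32 * select));
}

static void write_driver_feature(uint32_t select, uint32_t val)
{
    /* write driver_feature_select = select, then write driver_feature */
    driver_features[select] = val;
}

int main(void)
{
    /* Read the 64-bit offered feature set 32 bits at a time... */
    uint64_t offered = read_device_feature(0) |
                       ((uint64_t)read_device_feature(1) << 32);

    /* ...decide what to accept (here: everything offered)... */
    uint64_t accepted = offered;

    /* ...and write the accepted set back, again in two halves. */
    write_driver_feature(0, (uint32_t)accepted);
    write_driver_feature(1, (uint32_t)(accepted >> 32));

    printf("accepted 0x%llx\n",
           ((unsigned long long)driver_features[1] << 32) | driver_features[0]);
    return 0;
}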

msix_config: VIRTIO_PCI_COMMON_MSIX: set by the driver to the MSI-X vector to use for configuration change notifications.

virtio_pci_common_read
    switch (addr) {
    case VIRTIO_PCI_COMMON_MSIX:
        val = vdev->config_vector;

virtio_pci_common_write
    switch (addr) {
    case VIRTIO_PCI_COMMON_MSIX:
        // First release the previously configured vector
        if (vdev->config_vector != VIRTIO_NO_VECTOR)
            msix_vector_unuse(&proxy->pci_dev, vdev->config_vector);
        // Claim the new vector with msix_vector_use()
        if (val < proxy->nvectors)
            msix_vector_use(&proxy->pci_dev, val);
        else
            val = VIRTIO_NO_VECTOR;
        vdev->config_vector = val;

queue_msix_vector: Set by the driver to the MSI-X vector for virtqueue notifications:

virtio_pci_common_read
    switch (addr) {
    case VIRTIO_PCI_COMMON_Q_MSIX:
        val = virtio_queue_vector(vdev, vdev->queue_sel);

virtio_pci_common_write
    switch (addr) {
    case VIRTIO_PCI_COMMON_Q_MSIX:
        // Same logic as msix_config above:
        // unuse the old vector first, then use the new one
        vector = virtio_queue_vector(vdev, vdev->queue_sel);
        if (vector != VIRTIO_NO_VECTOR)
            msix_vector_unuse(&proxy->pci_dev, vector);
        if (val < proxy->nvectors)
            msix_vector_use(&proxy->pci_dev, val);
        else
            val = VIRTIO_NO_VECTOR;
        virtio_queue_set_vector(vdev, vdev->queue_sel, val);

VIRTIO_PCI_CAP_DEVICE_CFG VirtIO Device Configuration Space

Used for rarely-changing or initialization-time parameters. Where configuration fields are optional, their existence is indicated by feature bits.

This configuration space is not the same thing as the PCI configuration space. The VirtIO spec does not describe its contents in detail; it only says that it is device-specific.

Note that this space is not specific to virtio-pci; virtio-mmio has it too, only the access method differs.

In the QEMU code, VirtIODevice->config holds the VirtIO device configuration space.

virtio_pci_modern_regions_init
    static const MemoryRegionOps device_ops = {
        .read = virtio_pci_device_read,
        .write = virtio_pci_device_write,
        //...
    };
        virtio_pci_device_write / virtio_mmio_write
            virtio_config_modern_writew
                stw_le_p(vdev->config + addr, val);
    g_string_printf(name, "virtio-pci-device-%s", vdev_name);
    memory_region_init_io(&proxy->device.mr, OBJECT(proxy), &device_ops, proxy, name->str, proxy->device.size);
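
As a concrete example of device-specific configuration, virtio-blk places its 64-bit capacity (in 512-byte sectors) at offset 0 of this region. A hedged standalone sketch of a driver-side read of that field; read_dev_cfg32() and the fake buffer are placeholders for the real accessors (the kernel goes through virtio_cread()):

#include <stdint.h>
#include <stdio.h>

/* Fake copy of the device-specific config region (VIRTIO_PCI_CAP_DEVICE_CFG)
 * so the sketch is self-contained; capacity = 0x200000 sectors = 1 GiB. */
static const uint8_t fake_dev_cfg[16] = {
    0x00, 0x00, 0x20, 0x00, 0x00, 0x00, 0x00, 0x00,
};

/* Placeholder for a 32-bit little-endian read from the device config region. */
static uint32_t read_dev_cfg32(unsigned off)
{
    return (uint32_t)fake_dev_cfg[off] |
           ((uint32_t)fake_dev_cfg[off + 1] << 8) |
           ((uint32_t)fake_dev_cfg[off + 2] << 16) |
           ((uint32_t)fake_dev_cfg[off + 3] << 24);
}

int main(void)
{
    /* struct virtio_blk_config begins with a little-endian 64-bit 'capacity'
     * field, counted in 512-byte sectors; read it as two 32-bit halves. */
    uint64_t capacity = read_dev_cfg32(0) | ((uint64_t)read_dev_cfg32(4) << 32);
    printf("capacity: %llu sectors (%llu MiB)\n",
           (unsigned long long)capacity, (unsigned long long)(capacity / 2048));
    return 0;
}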