QEMU 的 memory model | Deep Dark Fantasy

Why does the increase in `used` reported by free -m stay far below the memory size we specified on the QEMU command line?

memory_access_is_direct() QEMU

Decides whether a memory access is direct or not. Direct means the access can be performed simply through a host pointer; non-direct means the access to this memory region has to go through emulation or some other mechanism, e.g. MMIO.

If the access is a write:

If this is a RAM memory region (note that RAM and RAM device are different; see memory_region_is_ram() and memory_region_is_ram_device()), the RAM region is not read-only, the region is not a rom_device, and the region is not a ram_device, then it can be written directly.

If the access is a read:

ROM devices and RAM can be read directly; a RAM device cannot.

static inline bool memory_access_is_direct(MemoryRegion *mr, bool is_write)
{
    if (is_write) {
        return memory_region_is_ram(mr) && !mr->readonly &&
               !mr->rom_device && !memory_region_is_ram_device(mr);
    } else {
        return (memory_region_is_ram(mr) && !memory_region_is_ram_device(mr)) ||
               memory_region_is_romd(mr);
    }
}

struct MemoryRegionOps QEMU

My understanding: a MemoryRegion is not necessarily RAM; some MemoryRegions are used for MMIO, so each memory region needs its own callbacks describing how it should be read and written. The following call path confirms this:

case KVM_EXIT_MMIO: / case KVM_EXIT_IO:
    address_space_rw
        address_space_write
            flatview_write
                flatview_write_continue
                    for (;;) {
                        // 可以看到只有当这个 MR 不能 direct access 的时候,才 dispatch write。
                        if (!memory_access_is_direct(mr, true))
                            memory_region_dispatch_write
                                memory_region_write_accessor
                                    mr->ops->write(mr->opaque, addr, tmp, size);
// ops 被包含在 memoryregion 里
struct MemoryRegion {
    //...
    const MemoryRegionOps *ops;
    //...
}
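As a concrete illustration of how such callbacks are wired up, here is a hedged sketch of a hypothetical MMIO region using QEMU's MemoryRegionOps and memory_region_init_io(). The device name, the register layout and the DemoDevState type are invented for illustration, and the snippet only compiles inside the QEMU source tree; only the MemoryRegionOps fields and memory_region_init_io() come from the API discussed above.

/* Sketch only: a hypothetical MMIO region backed by callbacks instead of RAM. */
#include "qemu/osdep.h"
#include "exec/memory.h"

typedef struct DemoDevState {
    MemoryRegion mmio;   /* this MR has no RAM behind it, only ops */
    uint64_t reg0;
} DemoDevState;

static uint64_t demo_mmio_read(void *opaque, hwaddr addr, unsigned size)
{
    DemoDevState *s = opaque;
    /* Reads reach this callback via memory_region_dispatch_read(). */
    return addr == 0 ? s->reg0 : 0;
}

static void demo_mmio_write(void *opaque, hwaddr addr, uint64_t val, unsigned size)
{
    DemoDevState *s = opaque;
    /* Writes reach this callback via memory_region_dispatch_write(). */
    if (addr == 0) {
        s->reg0 = val;
    }
}

static const MemoryRegionOps demo_mmio_ops = {
    .read = demo_mmio_read,
    .write = demo_mmio_write,
    .endianness = DEVICE_LITTLE_ENDIAN,
};

static void demo_dev_realize(DemoDevState *s, Object *owner)
{
    /* memory_access_is_direct() is false for this region, so every access
     * goes through demo_mmio_ops rather than a host pointer. */
    memory_region_init_io(&s->mmio, owner, &demo_mmio_ops, s, "demo-mmio", 0x1000);
}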

kvm_arch_nr_memslot_as_ids() KVM

Each VM can have two address spaces, and each address space can contain many kvm_memory_slots.

If the VM has private memory (e.g. TDX), there is only one memslot address space ID; otherwise there are two.

# define kvm_arch_nr_memslot_as_ids(kvm) (kvm_arch_has_private_mem(kvm) ? 1 : 2)

tdp_mmu_zap_leafs() KVM

Zap leaf SPTEs for the range of gfns, [start, end)

// 这条 path 是在发生了 fallocate(fd) 之后的 call path
// 我们可以看到 leaf 被 zap 了。
kvm_gmem_punch_hole / kvm_gmem_release
    kvm_gmem_invalidate_begin
        kvm_mmu_unmap_gfn_range
            kvm_unmap_gfn_range
                kvm_tdp_mmu_unmap_gfn_range
                    tdp_mmu_zap_leafs

// 这条 path 是在发生了 SET_ATTRIBUTE 之后的 call path
// SET_ATTRIBUTE 和 fallocate(fd) 的区别可以参阅笔记
kvm_vm_set_mem_attributes
    kvm_mmu_unmap_gfn_range
        kvm_unmap_gfn_range
            kvm_tdp_mmu_unmap_gfn_range
                tdp_mmu_zap_leafs

// kernel 那边 notify 过来的,比如 numa balancing 触发了 page migration 等等
kvm_mmu_notifier_invalidate_range_start
    kvm_mmu_unmap_gfn_range
        kvm_unmap_gfn_range
            kvm_tdp_mmu_unmap_gfn_range
                tdp_mmu_zap_leafs

// 要 delete 一个 memslot 或者 modify 一个 memslot 的时候
kvm_set_memslot
    kvm_invalidate_memslot
        kvm_arch_flush_shadow_memslot
            kvm_mmu_zap_memslot
                kvm_tdp_mmu_unmap_gfn_range
                    tdp_mmu_zap_leafs

// apicv / mtrr / set cr0 等等一些 corner case
kvm_zap_gfn_range
    kvm_tdp_mmu_zap_leafs
        tdp_mmu_zap_leafs
static bool tdp_mmu_zap_leafs(struct kvm *kvm, struct kvm_mmu_page *root,
			      gfn_t start, gfn_t end, bool can_yield, bool flush)
{
	struct tdp_iter iter;

	end = min(end, tdp_mmu_max_gfn_exclusive());

	lockdep_assert_held_write(&kvm->mmu_lock);

	rcu_read_lock();

	for_each_tdp_pte_min_level(iter, root, PG_LEVEL_4K, start, end) {
		if (can_yield &&
		    tdp_mmu_iter_cond_resched(kvm, &iter, flush, false)) {
			flush = false;
			continue;
		}

		if (!is_shadow_present_pte(iter.old_spte) ||
		    !is_last_spte(iter.old_spte, iter.level))
			continue;

		tdp_mmu_iter_set_spte(kvm, &iter, 0);
		flush = true;
	}

	rcu_read_unlock();

	/*
	 * Because this flow zaps _only_ leaf SPTEs, the caller doesn't need
	 * to provide RCU protection as no 'struct kvm_mmu_page' will be freed.
	 */
	return flush;
}

kvm_align_section() QEMU

The right end of the section stays unchanged; the left end is rounded up to the page size.

The new (aligned) left end is returned through *start, and the new size, computed from the original right end and the new left end, is the return value.

static hwaddr kvm_align_section(MemoryRegionSection *section, hwaddr *start)
{
    hwaddr size = int128_get64(section->size);
    hwaddr delta, aligned;

    /* kvm works in page size chunks, but the function may be called
       with sub-page size and unaligned start address. Pad the start
       address to next and truncate size to previous page boundary. */
    // 在整个 AS 中的偏移量,向上 align
    aligned = ROUND_UP(section->offset_within_address_space, qemu_real_host_page_size());
    delta = aligned - section->offset_within_address_space;
    // 让 start 指向这里
    *start = aligned;
    // 新的 size 是 0
    if (delta > size) {
        return 0;
    }

    // 返回新的 size
    return (size - delta) & qemu_real_host_page_mask();
}
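A self-contained sketch of the same rounding arithmetic (a 4 KiB host page size and the example offsets are assumptions chosen for illustration) may make the start/size adjustment easier to follow:

/* Standalone illustration of kvm_align_section()'s arithmetic,
 * assuming a 4 KiB host page size; the example values are arbitrary. */
#include <stdint.h>
#include <stdio.h>

#define PAGE_SIZE 4096ULL
#define ROUND_UP(x, a) (((x) + (a) - 1) & ~((a) - 1))

int main(void)
{
    uint64_t offset_in_as = 0x1234;   /* unaligned section start within the AS */
    uint64_t size = 0x3000;           /* original section size */

    uint64_t aligned = ROUND_UP(offset_in_as, PAGE_SIZE);  /* 0x2000 */
    uint64_t delta = aligned - offset_in_as;               /* 0x0dcc */

    /* If delta > size the whole section falls below the aligned start: size 0. */
    uint64_t new_size = delta > size ? 0 : (size - delta) & ~(PAGE_SIZE - 1);

    /* start moves up to 0x2000, size shrinks from 0x3000 to 0x2000 */
    printf("start=0x%llx size=0x%llx\n",
           (unsigned long long)aligned, (unsigned long long)new_size);
    return 0;
}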

kvm_set_phys_mem() QEMU

This function calls kvm_set_user_memory_region().

// 这里是注册的地方
kvm_init
    kvm_memory_listener_register
        kml->listener.commit = kvm_region_commit;

// 这里是调用的地方
memory_listener_register
    listener_add_address_space
        listener->commit(listener);
            kvm_region_commit
                kvm_set_phys_mem

// 这里是调用的地方
memory_listener_unregister
    listener_del_address_space
        listener->commit(listener);
            kvm_region_commit
                kvm_set_phys_mem
static void kvm_set_phys_mem(KVMMemoryListener *kml, MemoryRegionSection *section, bool add)
{
    KVMSlot *mem;
    MemoryRegion *mr = section->mr;
    bool writable = !mr->readonly && !mr->rom_device;
    hwaddr start_addr, size, slot_size, mr_offset;
    ram_addr_t ram_start_offset;
    void *ram;

    // 这个 MR 不是 RAM 的情况
    // ...
    // alignment 之后
    // 此函数之后 start_addr 是 section 在整个 AS 中的偏移
    size = kvm_align_section(section, &start_addr);
    //...

    // 因为 align 之后 section 左端偏移了,但是 MR 并没有变,所以对应的,
    // 这个 mr section 在 mr 中的偏移 mr_offset 表示的是 alignment 修正之后的 offset_within_region
    mr_offset = section->offset_within_region + start_addr - section->offset_within_address_space;

    // 拿到 section start 的 HVA
    ram = memory_region_get_ram_ptr(mr) + mr_offset;
    ram_start_offset = memory_region_get_ram_addr(mr) + mr_offset;

    // 老 slot
    if (!add) {
        do {
            // 一般就是 size,不会超过的
            slot_size = MIN(kvm_max_slot_size, size);
            // 找到这个 section 对应的 KVMSlot
            mem = kvm_lookup_matching_slot(kml, start_addr, slot_size);
            //...
            if (mem->flags & KVM_MEM_LOG_DIRTY_PAGES) {
                // sync dirty bitmap(暂时忽略了 dirty ring 的情况)
                // kvm_vm_ioctl(s, KVM_GET_DIRTY_LOG, &d);
                kvm_slot_get_dirty_log(kvm_state, mem);
                kvm_slot_sync_dirty_pages(mem);
            }

            /* unregister the slot */
            g_free(mem->dirty_bmap);
            mem->dirty_bmap = NULL;
            mem->memory_size = 0;
            mem->flags = 0;
            // KVM_SET_USER_MEMORY_REGION2
            err = kvm_set_user_memory_region(kml, mem, false);
            // error handling...
            start_addr += slot_size;
            size -= slot_size;
        } while (size);
        return;
    }

    /* register the new slot */
    do {
        slot_size = MIN(kvm_max_slot_size, size);
        mem = kvm_alloc_slot(kml);
        mem->as_id = kml->as_id;
        mem->memory_size = slot_size;
        mem->start_addr = start_addr;
        mem->ram_start_offset = ram_start_offset;
        mem->ram = ram;
        mem->flags = kvm_mem_flags(mr);
        mem->gmem_fd = mr->ram_block->gmem_fd;
        // 拿到这个 section 的起始 HVA 在包含它的 memory region
        // 的起始 HVA 中的偏移,作为 restricted_offset
        // 为什么不用 GPA 的偏移?
        mem->ofs = (uint8_t*)ram - mr->ram_block->host;

        // initialize mem->dirty_bmap
        // 因为这是一个新的 MR,所以我们直接把 bitmap 初始化为 0 就行了
        kvm_slot_init_dirty_bitmap(mem);
        // ioctl KVM_SET_USER_MEMORY_REGION2
        err = kvm_set_user_memory_region(kml, mem, true);
        // error handling...

        // 如果这个 memoryregion default 应该 private 的
        if (memory_region_is_default_private(mr)) {
            // ioctl KVM_SET_MEMORY_ATTRIBUTES
            err = kvm_set_memory_attributes_private(start_addr, slot_size);
            // error handling...
        }

        start_addr += slot_size;
        ram_start_offset += slot_size;
        ram += slot_size;
        size -= slot_size;
    } while (size);
}
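For reference, what kvm_set_user_memory_region() ultimately boils down to is the plain KVM memslot ioctl below. This is a minimal, self-contained userspace sketch (error handling omitted, sizes arbitrary) that registers one anonymous mapping as slot 0 at GPA 0; it uses the classic KVM_SET_USER_MEMORY_REGION rather than the KVM_SET_USER_MEMORY_REGION2 variant mentioned in the comments above.

/* Minimal sketch: register a host mapping as guest RAM via KVM. */
#include <fcntl.h>
#include <stdint.h>
#include <sys/ioctl.h>
#include <sys/mman.h>
#include <linux/kvm.h>

int main(void)
{
    int kvm = open("/dev/kvm", O_RDWR);
    int vm = ioctl(kvm, KVM_CREATE_VM, 0);

    size_t size = 2 * 1024 * 1024;
    void *hva = mmap(NULL, size, PROT_READ | PROT_WRITE,
                     MAP_PRIVATE | MAP_ANONYMOUS, -1, 0);

    struct kvm_userspace_memory_region region = {
        .slot = 0,                        /* cf. KVMSlot::slot (| as_id << 16) */
        .flags = 0,                       /* e.g. KVM_MEM_LOG_DIRTY_PAGES */
        .guest_phys_addr = 0x0,           /* cf. KVMSlot::start_addr (GPA) */
        .memory_size = size,              /* cf. KVMSlot::memory_size */
        .userspace_addr = (uintptr_t)hva, /* cf. KVMSlot::ram (HVA) */
    };
    /* Deleting a slot is the same call with memory_size = 0,
     * mirroring the !add path of kvm_set_phys_mem() above. */
    return ioctl(vm, KVM_SET_USER_MEMORY_REGION, &region);
}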

address_space_translate_internal() QEMU

// addr 是在 AddressSpace 中的偏移,
// xlat 是在所在的 MemoryRegion 中偏移
static MemoryRegionSection *
address_space_translate_internal(AddressSpaceDispatch *d, hwaddr addr, hwaddr *xlat,
                                 hwaddr *plen, bool resolve_subpage)
{
    MemoryRegionSection *section;
    MemoryRegion *mr;
    Int128 diff;

    section = address_space_lookup_region(d, addr, resolve_subpage);
    /* Compute offset within MemoryRegionSection */
    addr -= section->offset_within_address_space;

    /* Compute offset within MemoryRegion */
    *xlat = addr + section->offset_within_region;

    mr = section->mr;

    /* MMIO registers can be expected to perform full-width accesses based only
     * on their address, without considering adjacent registers that could
     * decode to completely different MemoryRegions.  When such registers
     * exist (e.g. I/O ports 0xcf8 and 0xcf9 on most PC chipsets), MMIO
     * regions overlap wildly.  For this reason we cannot clamp the accesses
     * here.
     *
     * If the length is small (as is the case for address_space_ldl/stl),
     * everything works fine.  If the incoming length is large, however,
     * the caller really has to do the clamping through memory_access_size.
     */
    if (memory_region_is_ram(mr)) {
        diff = int128_sub(section->size, int128_make64(addr));
        *plen = int128_get64(int128_min(diff, int128_make64(*plen)));
    }
    return section;
}

make_spte() KVM

Generate leaf page table entries.

bool make_spte(struct kvm_vcpu *vcpu, struct kvm_mmu_page *sp,
	       const struct kvm_memory_slot *slot,
	       unsigned int pte_access, gfn_t gfn, kvm_pfn_t pfn,
	       u64 old_spte, bool prefetch, bool can_unsync,
	       bool host_writable, u64 *new_spte)
{
	int level = sp->role.level;
    // 刚开始就置上 SPTE_MMU_PRESENT_MASK,可以理解
    // 表示这不是一个 MMIO SPTE
	u64 spte = SPTE_MMU_PRESENT_MASK;
	bool wrprot = false;

    //...
    // accessed/dirty bits disabled
	if (sp->role.ad_disabled)
		spte |= SPTE_TDP_AD_DISABLED;
	else if (kvm_mmu_page_ad_need_write_protect(sp))
		spte |= SPTE_TDP_AD_WRPROT_ONLY;

	/*
	 * For the EPT case, shadow_present_mask is 0 if hardware
	 * supports exec-only page table entries.  In that case,
	 * ACC_USER_MASK and shadow_user_mask are used to represent
	 * read access. See FNAME(gpte_access) in paging_tmpl.h.
	 */
	spte |= shadow_present_mask;
	if (!prefetch)
		spte |= spte_shadow_accessed_mask(spte);

	/*
	 * For simplicity, enforce the NX huge page mitigation even if not
	 * strictly necessary.  KVM could ignore the mitigation if paging is
	 * disabled in the guest, as the guest doesn't have any page tables to
	 * abuse.  But to safely ignore the mitigation, KVM would have to
	 * ensure a new MMU is loaded (or all shadow pages zapped) when CR0.PG
	 * is toggled on, and that's a net negative for performance when TDP is
	 * enabled.  When TDP is disabled, KVM will always switch to a new MMU
	 * when CR0.PG is toggled, but leveraging that to ignore the mitigation
	 * would tie make_spte() further to vCPU/MMU state, and add complexity
	 * just to optimize a mode that is anything but performance critical.
	 */
	if (level > PG_LEVEL_4K && (pte_access & ACC_EXEC_MASK) &&
	    is_nx_huge_page_enabled(vcpu->kvm)) {
		pte_access &= ~ACC_EXEC_MASK;
	}

	if (pte_access & ACC_EXEC_MASK)
		spte |= shadow_x_mask;
	else
		spte |= shadow_nx_mask;

	if (pte_access & ACC_USER_MASK)
		spte |= shadow_user_mask;

	if (level > PG_LEVEL_4K)
		spte |= PT_PAGE_SIZE_MASK;

	if (shadow_memtype_mask)
		spte |= static_call(kvm_x86_get_mt_mask)(vcpu, gfn,
							 kvm_is_mmio_pfn(pfn));
	if (host_writable)
		spte |= shadow_host_writable_mask;
	else
		pte_access &= ~ACC_WRITE_MASK;

	if (shadow_me_value && !kvm_is_mmio_pfn(pfn))
		spte |= shadow_me_value;

	spte |= (u64)pfn << PAGE_SHIFT;

	if (pte_access & ACC_WRITE_MASK) {
		spte |= PT_WRITABLE_MASK | shadow_mmu_writable_mask;

		/*
		 * Optimization: for pte sync, if spte was writable the hash
		 * lookup is unnecessary (and expensive). Write protection
		 * is responsibility of kvm_mmu_get_page / kvm_mmu_sync_roots.
		 * Same reasoning can be applied to dirty page accounting.
		 */
		if (is_writable_pte(old_spte))
			goto out;

		/*
		 * Unsync shadow pages that are reachable by the new, writable
		 * SPTE.  Write-protect the SPTE if the page can't be unsync'd,
		 * e.g. it's write-tracked (upper-level SPs) or has one or more
		 * shadow pages and unsync'ing pages is not allowed.
		 */
		if (mmu_try_to_unsync_pages(vcpu->kvm, slot, gfn, can_unsync, prefetch)) {
			wrprot = true;
			pte_access &= ~ACC_WRITE_MASK;
			spte &= ~(PT_WRITABLE_MASK | shadow_mmu_writable_mask);
		}
	}

	if (pte_access & ACC_WRITE_MASK)
		spte |= spte_shadow_dirty_mask(spte);

out:
	if (prefetch)
		spte = mark_spte_for_access_track(spte);

	WARN_ONCE(is_rsvd_spte(&vcpu->arch.mmu->shadow_zero_check, spte, level),
		  "spte = 0x%llx, level = %d, rsvd bits = 0x%llx", spte, level,
		  get_rsvd_bits(&vcpu->arch.mmu->shadow_zero_check, spte, level));

	if ((spte & PT_WRITABLE_MASK) && kvm_slot_dirty_track_enabled(slot)) {
		/* Enforced by kvm_mmu_hugepage_adjust. */
		WARN_ON_ONCE(level > PG_LEVEL_4K);
		mark_page_dirty_in_slot(vcpu->kvm, slot, gfn);
	}

	*new_spte = spte;
	return wrprot;
}

Collapsible SPTEs

Collapsible: many small SPTEs can be collapsed into one large SPTE. Before collapsing, the small SPTEs must first be zapped.

kvm_set_memslot
    kvm_commit_memory_region
        kvm_arch_commit_memory_region
            kvm_mmu_slot_apply_flags
                // 因为 dirty page logging 需要是 4k 粒度的,所以大页需要 split
                // LM 成功的话还好,大页会在 destination 这里重新组装起来。然而,如果
                // 失败了,小页就会存在,影响性能。所以如果发现我们关闭了 dirty page logging
                // drop 可以组装成大页的小页,等到后面 page fault 发生时,大页的 SPTE 会被创建。
            	if (!log_dirty_pages) {
                    kvm_mmu_zap_collapsible_sptes
                        kvm_tdp_mmu_zap_collapsible_sptes

kvm_mmu_zap_collapsible_sptes() / kvm_tdp_mmu_zap_collapsible_sptes() KVM

The principle is: do not zap leaf SPTEs; only the intermediate (non-leaf) SPTEs are zapped.

void kvm_mmu_zap_collapsible_sptes(struct kvm *kvm, const struct kvm_memory_slot *slot)
{
	if (kvm_memslots_have_rmaps(kvm))
        // locking...
		kvm_rmap_zap_collapsible_sptes(kvm, slot);

	if (tdp_mmu_enabled)
        // locking...
		kvm_tdp_mmu_zap_collapsible_sptes(kvm, slot);
}

void kvm_tdp_mmu_zap_collapsible_sptes(struct kvm *kvm, const struct kvm_memory_slot *slot)
{
	struct kvm_mmu_page *root;
    //...
	for_each_valid_tdp_mmu_root_yield_safe(kvm, root, slot->as_id, true)
		zap_collapsible_spte_range(kvm, root, slot);
}

zap_collapsible_spte_range() KVM

A slot can be covered by multiple shadow page tables (roots). For each root page table of the slot, this function zaps all SPTEs that could potentially be collapsed.

static void zap_collapsible_spte_range(struct kvm *kvm, struct kvm_mmu_page *root, const struct kvm_memory_slot *slot)
{
	gfn_t start = slot->base_gfn;
	gfn_t end = start + slot->npages;
	struct tdp_iter iter;
	int max_mapping_level;

    //...
    // 为什么搜索的最小 level 是 PG_LEVEL_2M 呢?
	for_each_tdp_pte_min_level(iter, root, PG_LEVEL_2M, start, end) {
		if (tdp_mmu_iter_cond_resched(kvm, &iter, false, true))
			continue;

        // > 1G 或者 non-shadow present
        // 这种情况没有必要去检查
		if (iter.level > KVM_MAX_HUGEPAGE_LEVEL || !is_shadow_present_pte(iter.old_spte))
			continue;

		/*
		 * Don't zap leaf SPTEs, if a leaf SPTE could be replaced with
		 * a large page size, then its parent would have been zapped
		 * instead of stepping down.
		 */
		if (is_last_spte(iter.old_spte, iter.level))
			continue;

		/*
		 * If iter.gfn resides outside of the slot, i.e. the page for
		 * the current level overlaps but is not contained by the slot,
		 * then the SPTE can't be made huge.  More importantly, trying
		 * to query that info from slot->arch.lpage_info will cause an
		 * out-of-bounds access.
		 */
		if (iter.gfn < start || iter.gfn >= end)
			continue;

		max_mapping_level = kvm_mmu_max_mapping_level(kvm, slot, iter.gfn, PG_LEVEL_NUM, false);
        // 最大支持的 mapping level 比当前 iter 的 level 要小
        // 说明我们没有办法 collapse iter。
		if (max_mapping_level < iter.level)
			continue;

        // 可以 collapse,所以先 zap 掉
        // 此时 iter 的 level 一定大于 4K(因为 min 是 2M),
        // 同时不可能是 leaf,那么就是 >= 2M 的 PSE
		tdp_mmu_zap_spte_atomic(kvm, &iter)
	}
    //...
}

lpage_info_slot() / struct kvm_lpage_info KVM

Decides, based on the array stored in the kvm_memory_slot, whether a GFN is allowed to be mapped as a large page.

struct kvm_lpage_info {
	int disallow_lpage;
};

struct kvm_arch_memory_slot {
    //...
    // 可以看到一个 kvm memory slot 要保存每一个 GFN 的 large page 信息
    // 是一个二维列表,每一个级别的 level 对应一个 GFN 的列表。
    // 注意 0 对应的是 2M level,因为 4k page 本身就不需要看是不是 large 的
	struct kvm_lpage_info *lpage_info[KVM_NR_PAGE_SIZES - 1];
};

static struct kvm_lpage_info *lpage_info_slot(gfn_t gfn,
		const struct kvm_memory_slot *slot, int level)
{
	unsigned long idx;

    // 找到这个 gfn 应该在 lpage_info 中的 index
	idx = gfn_to_index(gfn, slot->base_gfn, level);

    // 根据这个 index 得出这个 gfn 允不允许是一个 lpage
	return &slot->arch.lpage_info[level - 2][idx];
}
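A standalone sketch of the index math: my reading of gfn_to_index() is that on x86 each page-table level covers 512 entries, so KVM_HPAGE_GFN_SHIFT(level) = (level - 1) * 9; treat that constant and the example slot values as assumptions. The index is simply the GFN's distance from base_gfn measured in huge pages of the given level.

/* Standalone sketch of the lpage_info indexing math (shift constants assumed). */
#include <stdio.h>
#include <stdint.h>

#define PG_LEVEL_2M 2
#define PG_LEVEL_1G 3
#define HPAGE_GFN_SHIFT(level) (((level) - 1) * 9)

static uint64_t gfn_to_index(uint64_t gfn, uint64_t base_gfn, int level)
{
    /* Which huge page (of this level) inside the slot does gfn fall in? */
    return (gfn >> HPAGE_GFN_SHIFT(level)) - (base_gfn >> HPAGE_GFN_SHIFT(level));
}

int main(void)
{
    uint64_t base_gfn = 0x100000;      /* made-up slot starting at GPA 4 GiB */
    uint64_t gfn = base_gfn + 1300;    /* some GFN inside the slot */

    /* 1300 / 512 = 2: third 2 MiB region of the slot, so for PG_LEVEL_2M
     * lpage_info[2 - 2][2] would be consulted. */
    printf("2M index = %llu, 1G index = %llu\n",
           (unsigned long long)gfn_to_index(gfn, base_gfn, PG_LEVEL_2M),
           (unsigned long long)gfn_to_index(gfn, base_gfn, PG_LEVEL_1G));
    return 0;
}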

host_pfn_mapping_level() KVM

Translate the GFN to its HVA, then walk each level of the kernel (host) page table for that HVA, starting from the root, to find out at which level the host maps the page backing this GFN.

static int host_pfn_mapping_level(struct kvm *kvm, gfn_t gfn,
				  const struct kvm_memory_slot *slot)
{
	int level = PG_LEVEL_4K;
	unsigned long hva;
	unsigned long flags;
	pgd_t pgd;
	p4d_t p4d;
	pud_t pud;
	pmd_t pmd;

	/*
	 * Note, using the already-retrieved memslot and __gfn_to_hva_memslot()
	 * is not solely for performance, it's also necessary to avoid the
	 * "writable" check in __gfn_to_hva_many(), which will always fail on
	 * read-only memslots due to gfn_to_hva() assuming writes.  Earlier
	 * page fault steps have already verified the guest isn't writing a
	 * read-only memslot.
	 */
	hva = __gfn_to_hva_memslot(slot, gfn);
    //...

	/*
	 * Read each entry once.  As above, a non-leaf entry can be promoted to
	 * a huge page _during_ this walk.  Re-reading the entry could send the
	 * walk into the weeds, e.g. p*d_large() returns false (sees the old
	 * value) and then p*d_offset() walks into the target huge page instead
	 * of the old page table (sees the new value).
	 */
	pgd = READ_ONCE(*pgd_offset(kvm->mm, hva));
	if (pgd_none(pgd))
		goto out;

	p4d = READ_ONCE(*p4d_offset(&pgd, hva));
	if (p4d_none(p4d) || !p4d_present(p4d))
		goto out;

	pud = READ_ONCE(*pud_offset(&p4d, hva));
	if (pud_none(pud) || !pud_present(pud))
		goto out;

	if (pud_large(pud)) {
		level = PG_LEVEL_1G;
		goto out;
	}

	pmd = READ_ONCE(*pmd_offset(&pud, hva));
	if (pmd_none(pmd) || !pmd_present(pmd))
		goto out;

	if (pmd_large(pmd))
		level = PG_LEVEL_2M;

out:
    //...
	return level;
}

max_huge_page_level KVM

/*
 * max_huge_page_level reflects KVM's MMU capabilities irrespective
 * of kernel support, e.g. KVM may be capable of using 1GB pages when
 * the kernel is not.  But, KVM never creates a page size greater than
 * what is used by the kernel for any given HVA, i.e. the kernel's
 * capabilities are ultimately consulted by kvm_mmu_hugepage_adjust(), i.e.
 * that function takes the kernel's capability into account.
 */

QEMU memory management

To get a first feel for what the memory model looks like, run info mtree in the QEMU monitor console.

The memory-related data structures are defined in include/exec/memory.h.

Should the sub MemoryRegions fully cover the parent MemoryRegion?

No, we can see that none of the following are fully-covered:

00000000febd0000-00000000febd0fff (prio 1, i/o): vga.mmio
    00000000febd0000-00000000febd017f (prio 0, i/o): edid
    00000000febd0400-00000000febd041f (prio 0, i/o): vga ioports remapped
    00000000febd0500-00000000febd0515 (prio 0, i/o): bochs dispi interface
    00000000febd0600-00000000febd0607 (prio 0, i/o): qemu extended regs
00000000febd1000-00000000febd1fff (prio 1, i/o): virtio-net-pci-msix
    00000000febd1000-00000000febd103f (prio 0, i/o): msix-table
    00000000febd1800-00000000febd1807 (prio 0, i/o): msix-pba
00000000febd2000-00000000febd2fff (prio 1, i/o): virtio-serial-pci-msix
    00000000febd2000-00000000febd201f (prio 0, i/o): msix-table
    00000000febd2800-00000000febd2807 (prio 0, i/o): msix-pba
...

Why does QEMU allocate the memory for the VM lazily?

If we set the memory to a large value, e.g. 16G, and then launch the VM, free -h shows the used memory increasing by only about 1G. Why?

When QEMU invokes this ioctl, has the memory already been allocated?

How does KVM associate non-contiguous userspace HVAs with contiguous GPAs?

__kvm_set_memory_region // arch-agnostic, because each ISA need to use memory
    new->as_id = as_id;
	new->id = id;
    // ...
    kvm_set_memslot
        kvm_create_memslot // As an example, has others: kvm_delete_memslot, kvm_move_memslot, kvm_update_flags_memslot
        	kvm_replace_memslot(kvm, NULL, new);
        	kvm_activate_memslot(kvm, NULL, new);

find_ram_offset() QEMU

Finds a suitable offset for a new RAMBlock. From this we can also see what offset actually means.

Although the function involves a fair amount of arithmetic, you can simply think of it as fitting something into a gap: walk through the gaps one after another, record every gap that is large enough to hold size, and finally pick the smallest of the recorded gaps.

A gap here is the space between one RAMBlock's offset + max_length and the offset of the RAMBlock after it.

This offset only represents a position within ram_list; it cannot be interpreted as a GPA. That is why, to get a GPA from an HVA, we call kvm_physical_memory_addr_from_host(), which obtains the GPA through the KVMSlot rather than the RAMBlock, even though the RAMBlock also carries HVA information.

int kvm_physical_memory_addr_from_host(KVMState *s, void *ram,
                                       hwaddr *phys_addr)
{
    KVMMemoryListener *kml = &s->memory_listener;
    int i, ret = 0;

    kvm_slots_lock();
    for (i = 0; i < s->nr_slots; i++) {
        KVMSlot *mem = &kml->slots[i];

        if (ram >= mem->ram && ram < mem->ram + mem->memory_size) {
            *phys_addr = mem->start_addr + (ram - mem->ram);
            ret = 1;
            break;
        }
    }
    kvm_slots_unlock();

    return ret;
}

This shows that:

  • The ranges [offset, offset + max_length) of different RAMBlocks never overlap.
// size is new_block->max_length
static ram_addr_t find_ram_offset(ram_addr_t size)
{
    RAMBlock *block, *next_block;
    ram_addr_t offset = RAM_ADDR_MAX, mingap = RAM_ADDR_MAX;

    // ...
    // 如果是 ram_list 里第一个 block,那么就用 0 吧
    if (QLIST_EMPTY_RCU(&ram_list.blocks)) {
        return 0;
    }

    RAMBLOCK_FOREACH(block) {
        ram_addr_t candidate, next = RAM_ADDR_MAX;

        //...
        // 对于 ram_list 里的每一个 RAMBlock,找到这个 block 的右界
        candidate = block->offset + block->max_length;
        //...

        // next 是一个逐渐缩小的值,对当前 candidate 值,
        // 找到比其大同时又是最接近的 block。
        RAMBLOCK_FOREACH(next_block) {
            if (next_block->offset >= candidate) {
                next = MIN(next, next_block->offset);
            }
        }

        if (next - candidate >= size && next - candidate < mingap) {
            offset = candidate;
            mingap = next - candidate;
        }
    }

    //...
    return offset;
}

KVMSlot (QEMU)

typedef struct KVMSlot
{
    // 这个 Slot 的 GPA
    hwaddr start_addr;
    // 这个 Slot 的大小
    ram_addr_t memory_size;
    // 这个 slot 的起始 HVA
    void *ram;
    // 这个 slot 的 id,也代表这个 slot 在 kml->slots 里的 index
    int slot;
    // 这两个主要是用来在 kvm_slot_update_flags 里进行判断
    // 每一次 ioctl KVM_SET_USER_MEMORY_REGION 的时候,如果这个 KVMSlot 上次调用
    // 这个 ioctl 的时候的 flag 和这次在是否 readonly 上不同,那么会先调用 KVM_SET_USER_MEMORY_REGION
    // 把以前的整个删掉,然后 set 新的,这两个 flag 的用途之一就是用来 track 这个的,具体看
    // kvm_set_user_memory_region
    // 表示上次 ioctl KVM_SET_USER_MEMORY_REGION 和这次的 flags
    int flags;
    int old_flags;
    /* Dirty bitmap cache for the slot */
    // 这个 dirty_bmap 相比于 RAMBlock 里的 bmap,优势在哪里?
    // RAMBlock 里的主要是为了 migration,因为 migration 是基于 RAMBlock
    // 而非 KVMSlot 的,所以在里面有一个 bitmap 很有必要。可以看到 bmap 更加细粒度
    // 当一个 page 变脏时
    unsigned long *dirty_bmap;
    unsigned long dirty_bmap_size;
    // 这个 KVMSlot 属于哪一个 address space
    int as_id;
    // Cache of the offset in ram address space
    ram_addr_t ram_start_offset;
    // 这两个就是为了 gmem 加进去的
    // fd 就是 gmem_fd
    int fd;
    // ofs 表示 offset,使用 RAM - HVA
    hwaddr ofs;
} KVMSlot;

This struct is constructed later than MemoryRegion.

Represents a memory slot of a KVM virtual machine, analogous to a kvm memory slot in the kernel.

KVMSlot structs exist only as an array inside KVMMemoryListener, each entry representing one memory slot.

We can see that when RAM is created, kvm_set_phys_mem() is reached:

kvm_set_phys_mem
    //...
    slot_size = MIN(kvm_max_slot_size, size);
    mem = kvm_alloc_slot(kml);
    mem->as_id = kml->as_id;
    mem->memory_size = slot_size;
    mem->start_addr = start_addr;
    mem->ram_start_offset = ram_start_offset;
    mem->ram = ram;
    mem->flags = kvm_mem_flags(mr);
    mem->fd = mr->ram_block->restricted_fd;
    mem->ofs = (uint8_t*)ram - mr->ram_block->host;

What is the difference between RAMBlock and KVMSlot?

A RAMBlock holds an HVA, and a KVMSlot also holds an HVA.

A KVMSlot holds a GPA, whereas a RAMBlock's offset is only a position in the ram_addr space, not a GPA.

dirty_bmap QEMU

    /*
     * Layout of the KVMSlot.dirty_bmap:
     *
     *                   |<-------- bmap_npages -----------..>|
     *                                                     [1]
     *                     start_delta         size
     *  |----------------|-------------|------------------|------------|
     *  ^                ^             ^                               ^
     *  |                |             |                               |
     * start          bmap_start     (start)                         end
     * of memslot                                             of memslot
     */

kvm_alloc_slot() QEMU

The function itself is unremarkable, but it is interesting that a KVMMemoryListener is passed in: it shows that this structure and its slots array are created first, and "allocating" a slot just means taking a free entry out of that array.

static KVMSlot *kvm_alloc_slot(KVMMemoryListener *kml)
{
    return kvm_get_free_slot(kml);
    //...
}

MemoryListener / KVMMemoryListener QEMU

Note: do not confuse this with KVM's MMU notifier. The MMU notifier propagates host MMU changes into KVM, whereas the KVMMemoryListener propagates QEMU's changes to the guest memory map into KVM.

Callbacks structure for updates to the physical memory map.

One KVMMemoryListener corresponds to (listens on) one AddressSpace.

When a MemoryRegion in an AddressSpace changes, the registered listeners are triggered to handle the region-change events.

Why is the listener design needed at all? Why not just trigger the handlers directly?

/**
 * struct MemoryListener: callbacks structure for updates to the physical memory map
 *
 * Allows a component to adjust to changes in the guest-visible memory map.
 * Use with memory_listener_register() and memory_listener_unregister().
 */
struct MemoryListener {
    // Called at the beginning of an **address space update transaction**.
    // Called in: memory_region_transaction_commit() and listener_add_address_space()
    void (*begin)(MemoryListener *listener);

    // Called at the end of an **address space update transaction**
    // Called in: memory_region_transaction_commit() and listener_del_address_space()
    void (*commit)(MemoryListener *listener);

    /**
     * Called during an **address space update transaction**,
     * for a section of the address space that is new in this address space
     * since the last transaction.
     */
    void (*region_add)(MemoryListener *listener, MemoryRegionSection *section);

    /**
     * Called during an **address space update transaction**,
     * for a section of the address space that has disappeared in the address
     * space since the last transaction.
     */
    void (*region_del)(MemoryListener *listener, MemoryRegionSection *section);

    /**
     * Called during an **address space update transaction**,
     * for a section of the address space that is in the same place in the address
     * space as in the last transaction.
     */
    void (*region_nop)(MemoryListener *listener, MemoryRegionSection *section);

    // log-related
    //...

    /**
     * Called during an address space update transaction,
     * for a section of the address space that has had a new ioeventfd
     * registration since the last transaction.
     */
    void (*eventfd_add)(MemoryListener *listener, MemoryRegionSection *section,
                        bool match_data, uint64_t data, EventNotifier *e);

    /**
     * Called during an address space update transaction,
     * for a section of the address space that has dropped an ioeventfd
     * registration since the last transaction.
     */
    void (*eventfd_del)(MemoryListener *listener, MemoryRegionSection *section,
                        bool match_data, uint64_t data, EventNotifier *e);

    /**
     * Called during an address space update transaction,
     * for a section of the address space that has had a new coalesced
     * MMIO range registration since the last transaction.
     */
    void (*coalesced_io_add)(MemoryListener *listener, MemoryRegionSection *section,
                               hwaddr addr, hwaddr len);

    /**
     * Called during an address space update transaction,
     * for a section of the address space that has dropped a coalesced
     * MMIO range since the last transaction.
     */
    void (*coalesced_io_del)(MemoryListener *listener, MemoryRegionSection *section,
                               hwaddr addr, hwaddr len);

    /**
     * Called during the memory attribute conversion.
     */
    void (*convert_mem_attr)(MemoryListener *listener, MemoryRegionSection *section,
                            bool shared);

    // 决定 memory listeners 们被调用的顺序,对不同函数调用顺序的影响不一样,比如
    // Lower priorities are invoked earlier for "add" or "start" callbacks, and later for "delete"
    // or "stop" callbacks.
    unsigned priority;

    /**
     * Name of the listener.  It can be used in contexts where we'd like to
     * identify one memory listener with the rest.
     */
    const char *name;

    // The address space this listener is relate to
    // will be set in memory_listener_register()
    AddressSpace *address_space;
    QTAILQ_ENTRY(MemoryListener) link;
    QTAILQ_ENTRY(MemoryListener) link_as;
};

// wrapper function for listener_add_address_space
// kvm_init()
//    kvm_memory_listener_register
void memory_listener_register(MemoryListener *listener, AddressSpace *as)
{
    // add listener to the memory_listeners list
    // add listener to the as->listeners list
    // listener_add_address_space(listener, as)
}

static void listener_add_address_space(MemoryListener *listener,
                                       AddressSpace *as)
{
    listener->begin(listener);
    listener->log_global_start(listener);
    listener->region_add(listener, &section);
    listener->log_start(listener, &section, 0, fr->dirty_log_mask);
    listener->commit(listener);
}
typedef struct KVMMemoryListener {
    MemoryListener listener;
    KVMSlot *slots;
    int as_id;
} KVMMemoryListener;
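To make the callback flow concrete, here is a hedged sketch of registering one's own listener on address_space_memory. The listener name, priority value and the callback body are made up; only the MemoryListener fields and memory_listener_register() come from the API above, and the snippet compiles only inside the QEMU tree.

/* Sketch: a custom listener that logs every section added to the
 * system memory address space. */
#include "qemu/osdep.h"
#include "qemu/log.h"
#include "exec/memory.h"
#include "exec/address-spaces.h"

static void demo_region_add(MemoryListener *listener, MemoryRegionSection *section)
{
    qemu_log("region_add: as-offset=0x%" PRIx64 " size=0x%" PRIx64 " mr=%s\n",
             (uint64_t)section->offset_within_address_space,
             int128_get64(section->size),
             memory_region_name(section->mr));
}

static MemoryListener demo_listener = {
    .name = "demo-listener",
    .region_add = demo_region_add,
    .priority = 10,          /* higher priority: called later for "add" callbacks */
};

static void demo_listener_init(void)
{
    /* Triggers begin/region_add/commit immediately for the current topology,
     * exactly as listener_add_address_space() above shows. */
    memory_listener_register(&demo_listener, &address_space_memory);
}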

memory_listener_register

void memory_listener_register(MemoryListener *listener, AddressSpace *as)
{
    // ...
    // Register the listener to address space
    listener->address_space = as;

    // Add this memory listener to the global `memory_listeners` list
    //...
    // Add this memory listener to as->listeners, because the address
    // space also has a property to track all the listeners registered on it
    //...

    // Call listener->begin(), listener->region_add(), listener->commit()...
    listener_add_address_space(listener, as);
}

listener_add_address_space

This function is called only when a listener is registered on the address space.

It calls:

  • begin
  • region_add
  • commit

listener_del_address_space

This function is called only when a listener is unregistered from the address space.

It calls:

  • begin
  • region_del
  • commit
KVMMemoryListener* kml->listener.commit = kvm_region_commit; // registered in kvm_init():kvm_memory_listener_register() function 
    kvm_region_commit
        kvm_set_phys_mem
            kvm_set_user_memory_region
                KVM_SET_USER_MEMORY_REGION

void (*commit)(MemoryListener *listener); // called in listener_del_address_space / listener_add_address_space

A MemoryRegion initialized by memory_region_init_ram() has its own physical memory (allocated from the QEMU process address space). The allocation returns an HVA, which is saved in the host field of the RAMBlock.

Why do we need something to describe a fragment of a MemoryRegion?

// In QEMU
struct RAMBlock {
    struct rcu_head rcu;
    struct MemoryRegion *mr; // MemoryRegion
    uint8_t *host; // HVA
    ram_addr_t offset; // offset in ram_list
    ram_addr_t used_length;
    ram_addr_t max_length;
    void (*resized)(const char*, uint64_t length, void *host);
    uint32_t flags;
    char idstr[]; // id
    QLIST_ENTRY(RAMBlock) next; // next block in ram_list.blocks
    int fd;
    size_t page_size;
};

tdp_page_fault first checks whether the exit was caused by MMIO and, if so, calls handle_mmio_page_fault() to handle the MMIO page fault. It then obtains the PFN corresponding to the GFN: it first finds the memslot for the GFN and converts the GFN into an HVA, then goes through the normal host address-translation process. If the HVA is not currently backed by memory, the page fault has to be handled by the host itself. Finally, __direct_map() is called to build the page table.

__direct_map executes the for_each_shadow_entry loop, which walks the EPT page-table entries selected by the GFN. In the loop the level of the entry is checked; if the level does not yet match the requested level, it checks whether the next-level page table pointed to by the entry exists.

If it does not exist, it is created with kvm_mmu_get_page(); if it does exist, the walk simply steps down into it, indexing each EPT level with the corresponding bits of the GPA. When the target level is reached, the index of the last level locates a PTE, which should point to the PFN corresponding to the GFN, and that SPTE is written.
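The "index of each level" mentioned above is just a 9-bit slice of the GPA per level (for 4-level EPT with 4 KiB pages). A small self-contained sketch of that index extraction, with an arbitrary example GPA:

/* Standalone sketch: which index is used at each EPT/TDP level for a GPA.
 * Assumes 4 KiB pages and 512-entry (9-bit) page-table levels. */
#include <stdio.h>
#include <stdint.h>

static unsigned pt_index(uint64_t gpa, int level)
{
    /* level 1 = PTE, 2 = PDE (2 MiB), 3 = PDPTE (1 GiB), 4 = PML4E */
    return (gpa >> (12 + 9 * (level - 1))) & 0x1ff;
}

int main(void)
{
    uint64_t gpa = 0x12345678;   /* arbitrary example GPA */

    for (int level = 4; level >= 1; level--) {
        /* __direct_map / for_each_shadow_entry walk these indices top-down */
        printf("level %d index = %u\n", level, pt_index(gpa, level));
    }
    return 0;
}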

generate_memory_topology() QEMU

Generates a FlatView from a top-level (root) MR.

/* Render a memory topology into a list of disjoint absolute ranges. */
static FlatView *generate_memory_topology(MemoryRegion *mr)
{
    //...
    render_memory_region(view, mr, int128_zero(),
                         addrrange_make(int128_zero(), int128_2_64()),
                         false, false);
    // 简化一下?
    flatview_simplify(view);

    view->dispatch = address_space_dispatch_new(view);
    for (i = 0; i < view->nr; i++) {
        MemoryRegionSection mrs = section_from_flat_range(&view->ranges[i], view);
        flatview_add_to_dispatch(view, &mrs);
    }
    address_space_dispatch_compact(view->dispatch);
    g_hash_table_replace(flat_views, mr, view);

    return view;
}

render_memory_region() QEMU

This builds up the passed-in view from a MemoryRegion (not only the top-level one, since the function calls itself recursively).

A closer look at this function shows that:

  • The intervals represented by the FlatRanges never intersect.
  • Several FlatRanges may together carve up the area of a single leaf MemoryRegion.
  • FlatView->ranges is sorted by start address in ascending order (this follows from how insertion is done). Since the FlatRanges never intersect, the right end of range $i$ must be $\le$ the left end of range $i+1$.

From the code it looks like a FlatRange is only created for the part of a leaf MR that overlaps with all of its ancestor MRs. What about the part of a leaf MR that does not overlap? Is it simply impossible for a leaf MR's range to extend beyond its parent MR's range?

static void render_memory_region(FlatView *view,
                                 MemoryRegion *mr,
                                 Int128 base,
                                 AddrRange clip,
                                 bool readonly,
                                 bool nonvolatile)
{
    //...
    FlatRange fr;
    AddrRange tmp;

    //...
    // addr 是在父 MR 中的偏移量,base 传进来的时候想必也已经是父 MR 的其实 GPA 了
    // 所以这行代码是把 base 更新为自己的 GPA
    int128_addto(&base, int128_make64(mr->addr));
    // 在 DFS 的搜索过程中,但凡有一个祖先节点是 readonly 的,那这个祖先的所有孩子 MR 都是 readonly 的
    // 可以理解
    readonly |= mr->readonly;
    //...

    // 把当前 MR 的起始 GPA 以及 size 放到 AddrRange 里面备用
    tmp = addrrange_make(base, mr->size);

    // 传进来的 clip 表示所有祖先 MR 所**重合**的 GPA 区间
    // 我们需要保证当前 MR 和祖先 MR 有交叉的地方才能继续
    if (!addrrange_intersects(tmp, clip)) {
        return;
    }

    // 更新 clip,方便下一次传递
    clip = addrrange_intersection(tmp, clip);

    // 这是一个 alias MR
    if (mr->alias) {
        // 因为到 alias 里的时候会加,所以我们先减回去
        int128_subfrom(&base, int128_make64(mr->alias->addr));
        // 还不够,因为 alias_offset 表示的是此 MR 在指向 MR 中的偏移
        // 所以指向 MR 的真实起始地址要更小,需要减去这个 offset
        int128_subfrom(&base, int128_make64(mr->alias_offset));
        render_memory_region(view, mr->alias, base, clip, readonly, nonvolatile);
        return;
    }

    // 我们是以 DFS 递归的形式进行 render 的。
    // 找到所有子 MR,递归调用 render
    QTAILQ_FOREACH(subregion, &mr->subregions, subregions_link) {
        render_memory_region(view, subregion, base, clip, readonly, nonvolatile);
    }

    // 如果不是叶子 MR,那么没必要继续了。因为非叶子 MR 不需要对应的 FlatRange
    if (!mr->terminates) {
        return;
    }

    // 父 MR 表示的空间不一定完全包含子 MR,有可能父 MR 的 start 要比
    // 子 MR 的 start 要大。这就造成了 clip.start 可以大于 base
    // 而一个 FR 的 start 是由 clip.start 决定的,因此 clip.start - base 就表示这个
    // 这个 FlatRange 的 start 在对应 MR 的偏移。
    offset_in_region = int128_get64(int128_sub(clip.start, base));

    // FlatRange 的区间起始不一定是现在剩余的 clip 的区间
    // 因为这个 clip 区间可能会根据和其他 FlatRange 的交叉情况被分成多个 FlatRange
    // 此时,我们可以用长条来形容 clip,这个长条需要被切成多块,每一块表示一个 FlatRange
    base = clip.start;
    remain = clip.size;

    fr.mr = mr;
    fr.dirty_log_mask = memory_region_get_dirty_log_mask(mr);
    fr.romd_mode = mr->romd_mode;
    fr.readonly = readonly;
    //...

    // 迭代目前所有已有的 FlatRange
    for (i = 0; i < view->nr && int128_nz(remain); ++i) {
        // base >= 这个 FlatRange 的结束的地址(右端)
        // 那说明我们在更右侧,这个 FlatRange 不值得继续
        if (int128_ge(base, addrrange_end(view->ranges[i].addr))) {
            continue;
        }
        // ----------------------------------------
        // 此时,长条 和 当前 FlatRange 有以下几种可能的关系:
        //  1. 长条完全在 FlatRange 左侧(长条右 小于 FR 左)
        //     那就把整个长条打包成一个 FlatRange 插入进去完事,结束退出循环
        //  2. 长条包含了 FlatRange
        //     长条左侧未覆盖部分打包成 FlatRange 插入,中间覆盖部分忽略,右侧继续进行下一个 FlatRange 的对比判断
        //  3. FlatRange 包含了长条
        //     忽略长条,什么都不插入,结束退出循环
        //  4. 两者重合了,长条在左
        //     长条左侧未覆盖部分打包成 FlatRange 插入,结束退出循环
        //  5. 两者重合了,长条在右
        //     长条中间覆盖部分忽略,右侧继续进行下一个 FlatRange 的对比判断
        // 可以看出,只有 2 和 5 这两种情况需要继续,其他的情况直接退出即可
        // ----------------------------------------
        // 当然,base 也有可能小于这个 FlatRange 的起始地址(左端)
        // 但这不一定就会超出范围,因为右端不确定
        // 经过这个分支之后,有可能长条已经被切没了
        if (int128_lt(base, view->ranges[i].addr.start)) {
            // 找到我们 base 到当前 range 的 start 之间的空隙,和我们的 size 作比较
            // 选小的那个
            now = int128_min(remain, int128_sub(view->ranges[i].addr.start, base));
            fr.offset_in_region = offset_in_region;
            // 把小的这个作为我们 FlatRange 的 size
            fr.addr = addrrange_make(base, now);
            // 我们直接 copy 了 fr 得数据,所以虽然过程中我们要 insert 多个 FlatRange
            // 我们仍然可以复用 fr 这个变量
            flatview_insert(view, i, &fr);
            ++i;
            // 我们的长条(base, ramain)从左边切掉一块了
            // 所以更新 base 的值,来反映这一状况
            int128_addto(&base, now);
            // 当然,接下来的 FlatRange 起始的偏移也要加上切掉的长度
            offset_in_region += int128_get64(now);
            // remain 也要更新,来反应我们的长条因为被切掉一块而长度变短的事实
            int128_subfrom(&remain, now);
        }
        // --- 这一部分表示我们要跳过当前 FlatRange 覆盖的部分
        // 首先取 剩余长条的右端 和 这个 FlatRange 的右端 之间的较小值,也就是靠左的那个
        // 将这个值减去 剩余长条的左端,因此 now 表示的是长度
        now = int128_sub(int128_min(int128_add(base, remain), addrrange_end(view->ranges[i].addr)), base);
        // 将 base 加上这个长度,相当于将 base 置为 剩余长条的右端 和 这个 FlatRange 的右端 靠左的那个
        int128_addto(&base, now);
        // 接下来要创造的 FlatRange 在 MR 里的 offset 需要加上这个长度,表示我们跳过了交叉的这部分
        offset_in_region += int128_get64(now);
        // remain 也要减去这段被当前 FlatRange 覆盖的长度
        int128_subfrom(&remain, now);
    }

    // 如果在遍历了所有得 FlatRange 之后这个长条还有剩余,那么就直接把剩余得部分变成一个 FlatRange
    if (int128_nz(remain)) {
        fr.offset_in_region = offset_in_region;
        fr.addr = addrrange_make(base, remain);
        flatview_insert(view, i, &fr);
    }
}

struct RAMList / ram_list / QEMU

typedef struct RAMList {
    QemuMutex mutex;
    RAMBlock *mru_block;
    /* RCU-enabled, writes protected by the ramlist lock. */
    QLIST_HEAD(, RAMBlock) blocks;
    // 这是 array 了,每一个 entry 是一个指向 DirtyMemoryBlocks 的指针,目前这个 array 的长度是 3:
    //     - DIRTY_MEMORY_VGA
    //     - DIRTY_MEMORY_CODE
    //     - DIRTY_MEMORY_MIGRATION
    // DirtyMemoryBlocks 这个结构体也是一个 2 维的 long 类型 array,里面每一个 long 类型一维列表表示一个 block。
    // 因为一个 block 由一列 long 表示,所以一个 block 包含了许多个 page,所以这就成了一个 2 维的结构。
    // 一个 DirtyMemoryBlocks 表示的是整个 ram_list 内存地址空间的(包含许多个 RAMBlock)的
    // dirty bitmap,所以注意和 RAMBlock 里的 dirty bitmap 做区分,那个只是针对于这个 RAMBlock 的。
    // 别看这是一个 list,其实主要还是 DIRTY_MEMORY_MIGRATION
    // 
    DirtyMemoryBlocks *dirty_memory[DIRTY_MEMORY_NUM];
    uint32_t version;
    QLIST_HEAD(, RAMBlockNotifier) ramblock_notifiers;
} RAMList;
extern RAMList ram_list;

The RAMBlocks in ram_list are sorted by size (max_length) from largest to smallest, as the comment in ram_block_add() notes. Why is it done this way?

struct DirtyMemoryBlocks QEMU

#define DIRTY_MEMORY_VGA       0
#define DIRTY_MEMORY_CODE      1
#define DIRTY_MEMORY_MIGRATION 2
#define DIRTY_MEMORY_NUM       3        /* num of dirty bits */

typedef struct {
    struct rcu_head rcu;
    unsigned long *blocks[];
} DirtyMemoryBlocks;
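A hedged sketch of how a page's dirty bit would be located in the two-level layout described in the ram_list comment above. The block size constant here is an assumption for illustration, not QEMU's actual DIRTY_MEMORY_BLOCK_SIZE, and the lookup helper is invented:

/* Standalone sketch of two-level dirty-bitmap indexing (constants assumed). */
#include <stdio.h>
#include <stdint.h>
#include <stdbool.h>

#define TARGET_PAGE_BITS 12
#define BITS_PER_BLOCK   (1u << 15)          /* assumption: pages tracked per block */
#define BITS_PER_LONG    (8 * sizeof(unsigned long))

/* blocks[i] points to an array of BITS_PER_BLOCK / BITS_PER_LONG longs */
static bool page_is_dirty(unsigned long **blocks, uint64_t ram_addr)
{
    uint64_t page  = ram_addr >> TARGET_PAGE_BITS;  /* page index in ram_addr space */
    uint64_t block = page / BITS_PER_BLOCK;         /* which sub-bitmap */
    uint64_t bit   = page % BITS_PER_BLOCK;         /* bit inside that sub-bitmap */

    return blocks[block][bit / BITS_PER_LONG] & (1ul << (bit % BITS_PER_LONG));
}

int main(void)
{
    static unsigned long block0[BITS_PER_BLOCK / (8 * sizeof(unsigned long))];
    unsigned long *blocks[1] = { block0 };

    uint64_t ram_addr = 0x7000;                              /* page 7 */
    block0[7 / BITS_PER_LONG] |= 1ul << (7 % BITS_PER_LONG); /* mark page 7 dirty */

    printf("dirty=%d\n", page_is_dirty(blocks, ram_addr));
    return 0;
}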

Data Structures

HostMemoryBackend (QEMU)

struct HostMemoryBackend {
    //...
    // Representing host memory belonging to backend
    // 和 MachineState->ram 是同一个东西
    // 虽然不是顶层 MR,只是一个叶子实体 MR,但是仍然代表了所有的 RAM,包含在 system_memory 之下
    // 这也说明了 MR 树的深度本身就很小,没有特别复杂的嵌套关系。
    MemoryRegion mr;
};

kvm_memslots (KVM)

This struct roughly corresponds to one address space.

struct kvm_memslots {
	/* Generations must be different for each address space. */
	u64 generation;
	atomic_long_t last_used_slot;
	struct rb_root_cached hva_tree;
	struct rb_root gfn_tree;
	/*
	 * The mapping table from slot id to memslot.
	 *
	 * 7-bit bucket count matches the size of the old id to index array for
	 * 512 slots, while giving good performance with this slot count.
	 * Higher bucket counts bring only small performance improvements but
	 * always result in higher memory usage (even for lower memslot counts).
	 */
	DECLARE_HASHTABLE(id_hash, 7);
	int node_idx;
};

kvm_memory_slot (KVM)

An address space, say system memory, may consist of several DIMMs; a kvm_memory_slot can be thought of as one such DIMM, so this struct roughly corresponds to one memory region registered with KVM?

struct kvm_memory_slot {
    // Two memslot sets (one active and one inactive) are necessary so the VM
    // continues to run on one memslot set while the other is being modified.
    // These two memslot sets normally point to the same set of memslots.
    // They can, however, be desynchronized when performing a memslot management
    // operation by replacing the memslot to be modified by its copy.
    // After the operation is complete, both memslot sets once again point to
    // the same, common set of memslot data.
	struct hlist_node id_node[2];
	struct interval_tree_node hva_node[2];
	struct rb_node gfn_node[2];
	gfn_t base_gfn; // 虚拟机的开始 gfn
	unsigned long npages; // 总 page 数量
	unsigned long *dirty_bitmap;
	struct kvm_arch_memory_slot arch;
	unsigned long userspace_addr; // HVA
	u32 flags;
	short id; // id
	u16 as_id; // 所属 address space 的 id
	struct file *restricted_file;
	loff_t restricted_offset;
	struct restrictedmem_notifier notifier;
	struct kvm *kvm;
};

struct kvm_arch_memory_slot {
	struct kvm_rmap_head *rmap[KVM_NR_PAGE_SIZES];
	struct kvm_lpage_info *lpage_info[KVM_NR_PAGE_SIZES - 1];
    // gfn track
	unsigned short *gfn_track[KVM_PAGE_TRACK_MAX];
};

__gfn_to_pfn_memslot() KVM

To translate a GFN into a PFN, the GFN is first translated into an HVA, and the HVA is then translated into a PFN.

Don't confuse this with shadow paging: even with TDP both steps are needed, because only once the mapping is known can it be written into the EPT, so that the next time the EPT does the translation no page fault occurs.

// write_fault, writable 有什么区别?
// write fault 表示这个 access 是一个 write,对应 fault->write
// 对于一个 access 是 read 的 fault,也就是说 !fault->write,writable
// 为 NULL 时就映射一个不可写的 gfn,如果不为 NULL,再映射可写的
// 这是为了支持 readonly memslot,因为之前统一都给映射成了可写的
kvm_pfn_t __gfn_to_pfn_memslot(const struct kvm_memory_slot *slot, gfn_t gfn,
			       bool atomic, bool interruptible, bool *async,
			       bool write_fault, bool *writable, hva_t *hva)
{
    // 先计算出 gfn 相对于 base gfn 的 offset:offset = gfn - slot->base_gfn
    // 表示这个 gfn 是这个 slot 里的第几个 page
    // slot->userspace_addr + offset * PAGE_SIZE 就能计算出 HVA
	unsigned long addr = __gfn_to_hva_many(slot, gfn, NULL, write_fault);

    // 放到传进来的 fault->hva 里
	if (hva)
		*hva = addr;

    // memslot 是 readonly 的
    // 那说明这是一个 MMIO 的 address
    // 有一个问题,为什么 MMIO 可以 read 呢?不应该 read 也需要 emulate 吗?
	if (addr == KVM_HVA_ERR_RO_BAD) {
		if (writable)
			*writable = false;
		return KVM_PFN_ERR_RO_FAULT;
	}

    // error handling
    // ...

	// For readonly memslot, disable its pfn's host-writable bit
    // If following is true, this assumes !write_fault, or we have returned previously
    // 因为 !write_fault, 那么这是一个 access 引起的 fault,那么就对应到了四种情况(详见 is_writable_pte 的注释)
    // 第四种情况,模拟 A/D bit。
	if (writable && memslot_is_readonly(slot)) {
		*writable = false;
		writable = NULL;
	}

	return hva_to_pfn(addr, atomic, interruptible, async, write_fault,
			  writable);
}
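The first of the two steps (GFN to HVA) is pure arithmetic on the memslot, exactly as the comment at the top of the function describes. Below is a standalone sketch of it; the slot values are made up, and the HVA-to-PFN step is only indicated in a comment because it requires pinning the page on the host side.

/* Standalone sketch of the GFN -> HVA computation done via the memslot. */
#include <stdio.h>
#include <stdint.h>

#define PAGE_SHIFT 12

struct memslot {
    uint64_t base_gfn;        /* first guest frame covered by the slot */
    uint64_t npages;          /* number of pages in the slot */
    uint64_t userspace_addr;  /* HVA of the first page */
};

static uint64_t gfn_to_hva(const struct memslot *slot, uint64_t gfn)
{
    uint64_t offset = gfn - slot->base_gfn;     /* which page inside the slot */
    return slot->userspace_addr + (offset << PAGE_SHIFT);
}

int main(void)
{
    struct memslot slot = {
        .base_gfn = 0x100000,                /* slot starts at GPA 4 GiB */
        .npages = 0x40000,                   /* 1 GiB worth of pages */
        .userspace_addr = 0x7f0000000000ULL, /* made-up HVA */
    };

    uint64_t gfn = 0x100123;
    printf("hva = 0x%llx\n", (unsigned long long)gfn_to_hva(&slot, gfn));
    /* The second step, HVA -> PFN, is done by hva_to_pfn() in KVM,
     * which pins the host page and reads its physical frame number. */
    return 0;
}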

Private/Internal kvm_memory_slot / KVM

These slots used to be called private; they were renamed to internal to avoid clashing with the private (memory) concept that might land later. See bdd1c37a315bc50ab14066c4852bc8dcf070451e.

When memslot->id >= KVM_USER_MEM_SLOTS, the slot is an internal memory slot. KVM_MEM_SLOTS_NUM is defined as the largest number representable by a short.

// memory slots that are not exposed to userspace, they are for internal use
#define KVM_INTERNAL_MEM_SLOTS 3
// 目前定义了以下三种 internal memslot
#define TSS_PRIVATE_MEMSLOT			(KVM_USER_MEM_SLOTS + 0)
#define APIC_ACCESS_PAGE_PRIVATE_MEMSLOT	(KVM_USER_MEM_SLOTS + 1)
#define IDENTITY_PAGETABLE_PRIVATE_MEMSLOT	(KVM_USER_MEM_SLOTS + 2)

#define KVM_USER_MEM_SLOTS (KVM_MEM_SLOTS_NUM - KVM_INTERNAL_MEM_SLOTS)

__x86_set_memory_region() KVM

Sets up or tears down a KVM internal memory slot:

  • Specify size > 0 to install a new slot,
  • size == 0 to uninstall a slot.
void __user * __x86_set_memory_region(struct kvm *kvm, int id, gpa_t gpa, u32 size)
{
	int i, r;
	unsigned long hva, old_npages;
	struct kvm_memslots *slots = kvm_memslots(kvm);
	struct kvm_memory_slot *slot;

    //...
	slot = id_to_memslot(slots, id);
    // 添加一个新的 internal memory slot
	if (size) {
        //...
        // 申请 size 大小的内存
		hva = vm_mmap(NULL, 0, size, PROT_READ | PROT_WRITE, MAP_SHARED | MAP_ANONYMOUS, 0);
        //...
    // uninstall an internal memory slot
	} else {
        //...
		old_npages = slot->npages;
		hva = slot->userspace_addr;
	}

	for (i = 0; i < KVM_ADDRESS_SPACE_NUM; i++) {
		struct kvm_userspace_memory_region m;

		m.slot = id | (i << 16);
		m.flags = 0;
		m.guest_phys_addr = gpa;
		m.userspace_addr = hva;
		m.memory_size = size;
        // 有可能 install,也有可能 uninstall
		r = __kvm_set_memory_region(kvm, &m);
        //...
	}

    // 把旧的 HVA 取消掉
	if (!size)
		vm_munmap(hva, old_npages * PAGE_SIZE);

	return (void __user *)hva;
}

AddressSpace (KVM)

KVM doesn't have a struct AddressSpace, but it has as_id, which index a kvm_memslots:

# define KVM_ADDRESS_SPACE_NUM 2
static inline struct kvm_memslots *__kvm_memslots(struct kvm *kvm, int as_id)
{
	as_id = array_index_nospec(as_id, KVM_ADDRESS_SPACE_NUM);
	return srcu_dereference_check(kvm->memslots[as_id], &kvm->srcu,
			lockdep_is_held(&kvm->slots_lock) ||
			!refcount_read(&kvm->users_count));
}

A kvm_memslots contains many kvm_memory_slot.

AddressSpace (QEMU)

Its main job is mapping: it describes the mapping relationships of an entire address space. Given an AddressSpace and the address to access, you can eventually locate the final handler, e.g. kbd_read_data. (Which function does this lookup exactly?)

AddressSpaces are independent of each other and basically all cover 0000000000000000-ffffffffffffffff; a guest access to the same address has different effects in the I/O address space and in the memory address space.

There is a problem, though: MemoryRegions are nested layer by layer. Without any simplification, deciding which MemoryRegion an address finally falls into would require walking from the top-level MemoryRegion through its children every time, handling aliases and priorities along the way, and the hit test itself needs range comparisons. Therefore, whenever a MemoryRegion's properties are modified, QEMU does two things:

  • flatten the MemoryRegions into FlatRanges, so that MemoryRegions do not have to be looked up level by level;
  • turn the FlatRange lookup into a tree lookup, reducing the query from O(n) to O(log(N)).


/**
 * struct AddressSpace: describes a mapping of addresses to #MemoryRegion objects
 */
struct AddressSpace {
    char *name;
    MemoryRegion *root; // 一个 AddressSpace 和一个根 MemoryRegion 所表示的树对应(树的 range 需要完全 cover 这个 AS 吗)
    struct FlatView *current_map; // 一个 AddressSpace 也缓存了一个 FlatView
    int ioeventfd_nb;
    struct MemoryRegionIoeventfd *ioeventfds;

    // The memory listeners attached/registered on this address space
    QTAILQ_HEAD(, MemoryListener) listeners;
    QTAILQ_ENTRY(AddressSpace) address_spaces_link;
};

address_space_map() QEMU

This function returns an HVA.

You pass in a GPA (or rather an offset within the address space; if the address space is RAM, this corresponds to a GPA), and it is mapped to the corresponding HVA (i.e. a pointer into the host memory backing that region).

// Map a physical memory region into a HVA.
// May map a subset of the requested range, given by and returned in *plen.
// addr: address within that address space
// plen: pointer to length of buffer; updated on return
void *address_space_map(AddressSpace *as,
                        hwaddr addr,
                        hwaddr *plen,
                        bool is_write,
                        MemTxAttrs attrs)
{
    hwaddr len = *plen;
    hwaddr l, xlat;
    MemoryRegion *mr;
    FlatView *fv;

    //...
    l = len;
    //...
    fv = address_space_to_flatview(as);
    mr = flatview_translate(fv, addr, &xlat, &l, is_write, attrs);

    // Indirect memory access
    // 是不是可以暂时先不看?
    if (!memory_access_is_direct(mr, is_write)) {
        if (qatomic_xchg(&bounce.in_use, true)) {
            *plen = 0;
            return NULL;
        }
        /* Avoid unbounded allocations */
        l = MIN(l, TARGET_PAGE_SIZE);
        bounce.buffer = qemu_memalign(TARGET_PAGE_SIZE, l);
        bounce.addr = addr;
        bounce.len = l;

        memory_region_ref(mr);
        bounce.mr = mr;
        if (!is_write) {
            flatview_read(fv, addr, MEMTXATTRS_UNSPECIFIED,
                               bounce.buffer, l);
        }

        *plen = l;
        return bounce.buffer;
    }

    memory_region_ref(mr);
    *plen = flatview_extend_translation(fv, addr, len, mr, xlat, l, is_write, attrs);
    fuzz_dma_read_cb(addr, *plen, mr);
    // 返回的是 HVA
    // 输入是 offset (xlat)。
    return qemu_ram_ptr_length(mr->ram_block, xlat, plen, true);
}
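A hedged usage sketch (QEMU-internal code; the GPA and length are arbitrary): callers typically bracket the returned pointer with address_space_unmap(), which also handles the bounce-buffer case shown above. The helper name is invented; only address_space_map()/address_space_unmap() and address_space_memory come from the API.

/* Sketch: map a guest-physical range, touch it through the HVA, unmap it.
 * Compiles only inside the QEMU tree. */
#include "qemu/osdep.h"
#include "exec/memory.h"
#include "exec/address-spaces.h"

static void demo_touch_guest_ram(void)
{
    hwaddr gpa = 0x100000;          /* arbitrary guest-physical address */
    hwaddr len = 4096;              /* may be shrunk by address_space_map() */

    void *hva = address_space_map(&address_space_memory, gpa, &len,
                                  true, MEMTXATTRS_UNSPECIFIED);
    if (!hva) {
        return;                     /* e.g. bounce buffer already in use */
    }

    memset(hva, 0, len);            /* direct access through the HVA */

    /* access_len tells QEMU how much was actually written and must be dirtied */
    address_space_unmap(&address_space_memory, hva, len, true, len);
}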

RAMBlock QEMU

Does it exist mainly to make cleanup more convenient, and to keep better track of the memory that has already been allocated?

This is where physical memory is actually allocated. We can think of a RAMBlock as a MemoryRegion plus some metadata, such as the backing physical memory.

pc_init1 / pc_q35_init
    pc_memory_init
        memory_region_init_ram
            memory_region_init_ram_nomigrate
                memory_region_init_ram_flags_nomigrate
                    qemu_ram_alloc
                        qemu_ram_alloc_internal
                            ram_block_add
struct RAMBlock {
    // 如果用了 memory-backend-memfd(不管是 private=on 还是 off) (-object memory-backend-memfd),就会有这个
    // memfd_backend_info->class_init = memfd_backend_class_init
    //     bc->alloc = memfd_backend_memory_alloc;
    //         memfd_backend_memory_alloc
    //             // QEMU 的 memfd backend 依赖于 kernel 里 Sealed files / Memfd 的实现
    //             fd = qemu_memfd_create(TYPE_MEMORY_BACKEND_MEMFD, backend->size, m->hugetlb, m->hugetlbsize,...)
    //                 memfd_create
    //             memory_region_init_ram_from_fd
    //                 qemu_ram_alloc_from_fd
    //                     new_block->gmem_fd = -1;
    //                     file_ram_alloc
    //                         block->fd = fd
    // 综上,fd 表示的就是当使用 memfd 作为 backend 时的 fd。
    // 所以其是一个和 host 等价的存在,只不过 host 在使用 ptr 直接作为 backend 的时候的 HVA
    int fd;
    // 只有在创建时就希望是 resizeable 的,也就是说用 qemu_ram_alloc_resizeable 创建的,这个才和下面的 max_length 不相等
    // 其他情况都是相等的,都等于创建时传进来的 size。
    ram_addr_t used_length;
    // 同上
    ram_addr_t max_length;
    // dirty bitmap used during migration
    // 什么时候会进行置位?
    // 初始化时全部置为 1 表示所有 pages 都要发送,
    unsigned long *bmap;
    // 每个 RAMBlock 都有一个唯一的 MemoryRegion,但不是每个 MemoryRegion 都有 RAMBlock 对应,比如最顶级的 MemoryRegion。
    // 它还有很多子 MemoryRegion,比如在这个 ramblock 地址范围内的 MMIO 等。
    struct MemoryRegion *mr;
    // HVA
    uint8_t *host;
    QLIST_ENTRY(RAMBlock) next; // 下一个 RAMBlock, ram_list
    // Bitmap for a confidential guest to record if a page is private (1) or shared (0).
    unsigned long *cgs_bmap;
    // 在 ram_list 地址空间中的偏移 (要把前面 block 的 size 都加起来)
    // 不能够直接理解成起始 GPA,原因请看 find_ram_offset。
    // ram_block_add() 函数中 new_block->offset = find_ram_offset(new_block->max_length);
    ram_addr_t offset;

    // bitmap to track already cleared dirty bitmap. When the bit is
    // set, it means the corresponding memory chunk needs a log-clear.
    // Set this up to non-NULL to enable the capability to postpone
    // and split clearing of dirty bitmap on the remote node (e.g., KVM)
    unsigned long *clear_bmap;
    //...
};

All RAMBlocks are maintained on a linked list through the global variable ram_list:

typedef struct RAMList {
    // ...
    QLIST_HEAD(, RAMBlock) blocks;
    // ...
} RAMList;
RAMList ram_list = { .blocks = QLIST_HEAD_INITIALIZER(ram_list.blocks) };

Can the GPA ranges represented by different RAMBlocks overlap?

Do the address ranges of RAMBlocks need to be contiguous? If not, what is the point of chaining them together in ram_list?

MemoryRegion

A MemoryRegion describes the mapping rules within a range, and at the same time classifies memory by attributes, guest physical address, and so on.

MemoryRegions may overlap; if two MemoryRegions overlap over a range, the one with higher priority overrides the one with lower priority.

A leaf MemoryRegion represents physical memory actually allocated to the VM, or MMIO (i.e. a concrete MemoryRegion).

A MemoryRegion does not hold memory itself; it holds memory-related information, such as the HVA, through its RAMBlock field.

From the root MR all alias MRs can be found, and an alias MR finds the MemoryRegion it points to through its alias field.

It is valid to add subregions to a region which is not a pure container (that is, to an MMIO, RAM or ROM region). This means that the region will act like a container, except that any addresses within the container’s region which are not claimed by any subregion are handled by the container itself (ie by its MMIO callbacks or RAM backing). However it is generally possible to achieve the same effect with a pure container one of whose subregions is a low priority “background” region covering the whole address range; this is often clearer and is preferred.

MemoryRegions fall into the following categories:

  • Root-level MemoryRegion
    • Has no memory of its own; it only manages subregions.
    • Alias MRs are all subregions of a root-level MR.
  • Container MR
    • A container simply includes other MRs, each at a different offset. A container’s subregions are usually non-overlapping. In some cases it is useful to have overlapping regions;
  • Concrete (leaf) MemoryRegion
    • A leaf node with actual memory, allocated from the QEMU process address space and saved in the host field of its RAMBlock.
  • Alias MemoryRegion
    • A subsection of another region. Aliases allow a region to be split apart into discontiguous regions.
    • Through its alias member it can point to any other kind of MemoryRegion (including another alias MR); alias_offset is the offset of this alias MR within the target.
    • An alias MR cannot have subregions.

Aliases and containers only help QEMU build the guest’s memory map; they are never accessed directly.

struct MemoryRegion {
    // ...
    bool enabled; // 是否已经通知 KVM 使用这段内存,This allows users to disable a memory region without removing it from the hierarchy
    bool terminates; // 是否是叶子节点。原来叫 has_ram_addr,后来因为 IO 的存在,改名了
    MemoryRegion *container; // 指向父 MR
    RAMBlock *ram_block; // 指向对应的 RAMBlock
    hwaddr addr; // 在父 MR(container)中的偏移量,若父是 address_spaces_memory 可以理解为 GPA
    
    // 如果不为 NULL 说明这是一个**别名 MR**,其指向**实体 MR**
    MemoryRegion *alias;
    // 起始地址(GPA)在指向 MR 中 GPA space 的偏移量, HVA = 指向 MR HVA + alias_offset
    hwaddr alias_offset;
    // 用不到,只在 nvdimm 等非易失性内存才有意义
    bool nonvolatile;

    // See details below
    uint8_t dirty_log_mask;
    // ...
};

mr->dirty_log_mask QEMU

Usually 0; in practice only the VGA code sets it:

memory_region_set_log
    assert(client == DIRTY_MEMORY_VGA);
    mr->dirty_log_mask = (mr->dirty_log_mask & ~mask) | (log * mask);

Why can an alias MR's size be greater than that of its target MR?

From the code we can see that an alias MR's size can sometimes even be greater than that of the target MR:

static MemoryRegion *memory_region_get_flatview_root(MemoryRegion *mr)
{
    //...
            if (!mr->alias_offset && int128_ge(mr->size, mr->alias->size))
    //...
}

MemoryRegion search logic

The following logic is implemented in the function ``.

The memory core uses the following rules to select a memory region when the guest accesses an address:

  • all direct subregions of the root region are matched against the address, in descending priority order
    • if the address lies outside the region offset/size, the subregion is discarded
    • if the subregion is a leaf (RAM or MMIO), the search terminates, returning this leaf region
    • if the subregion is a container, the same algorithm is used within the subregion (after the address is adjusted by the subregion offset)
    • if the subregion is an alias, the search is continued at the alias target (after the address is adjusted by the subregion offset and alias offset)
    • if a recursive search within a container or alias subregion does not find a match (because of a “hole” in the container’s coverage of its address range), then if this is a container with its own MMIO or RAM backing the search terminates, returning the container itself. Otherwise we continue with the next subregion in priority order
  • if none of the subregions match the address then the search terminates with no match found

MR lifecycle

A MR is created by one of the memory_region_init*() functions.

// owner is the owner object
void memory_region_init(MemoryRegion *mr, Object *owner, const char *name, uint64_t size)

The owner is mainly used to manage the lifecycle of the MR.

Destruction of a memory region happens automatically when the owner object dies.

Why MemoryRegion can overlap?

Usually, regions do not overlap each other; only MRs inside the same container may overlap, and accesses are then dispatched according to priority.

For example, a memory controller that can overlay a subregion of RAM with MMIO or ROM, or a PCI controller that does not prevent cards from claiming overlapping BARs.
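
一个简化的示意(container、ram_mr、rom_mr 都是假设的名字),演示用 priority 在同一个 container 里把一段 ROM 叠在 RAM 上面:

// ram 覆盖整个 container,priority 默认是 0
memory_region_add_subregion(container, 0x0, ram_mr);

// rom 只覆盖 [0xc0000, 0xc0000 + rom_size),priority 为 1,
// 所以这段地址的访问会命中 rom 而不是下面的 ram
memory_region_add_subregion_overlap(container, 0xc0000, rom_mr, 1);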

The memory API — QEMU 8.0.0 documentation

Why MemoryRegion can be aliased?

Aliases allow a region to be split apart into dis-contiguous regions. Examples of uses are:

  • memory banks used when the guest address space is smaller than the amount of RAM addressed, or
  • a memory controller that splits main memory to expose a “PCI hole”.
system_memory: container@0-2^48-1(Far more than 4GB)
 |
 +---- lomem: alias@0(0GB) - 0xdfffffff(3.5GB) ---> #ram (0(0GB) - 0xdfffffff(3.5GB))
 |
 +---- himem: alias@0x100000000(4GB) - 0x11fffffff(4.5GB) ---> #ram (0xe0000000(3.5GB) - 0xffffffff(4GB))
 |
 +---- vga-window: alias@0xa0000-0xbffff ---> #pci (0xa0000-0xbffff)
 |      (prio 1)
 |
 +---- pci-hole: alias@0xe0000000-0xffffffff ---> #pci (0xe0000000-0xffffffff)

pci (0-2^32-1)
 |
 +--- vga-area: container@0xa0000-0xbffff
 |      |
 |      +--- alias@0x00000-0x7fff  ---> #vram (0x010000-0x017fff)
 |      |
 |      +--- alias@0x08000-0xffff  ---> #vram (0x020000-0x027fff)
 |
 +---- vram: ram@0xe1000000-0xe1ffffff
 |
 +---- vga-mmio: mmio@0xe2000000-0xe200ffff

ram: ram@0x00000000-0xffffffff

The 4GB RAM block is mapped into the system address space via two aliases: “lomem” is a 1:1 mapping of the first 3.5GB; “himem” maps the last 0.5GB at address 4GB. This leaves 0.5GB for the so-called PCI hole, that allows a 32-bit PCI bus to exist in a system with 4GB of memory.

The memory API — QEMU 8.0.0 documentation

FlatView (QEMU)

MemoryRegion 是 QEMU 管理内存的树状结构,便于按照功能、属性分类,但这只是管理结构。虚拟机的内存需要通过 KVM_SET_USER_MEMORY_REGION,把 HVA 和 GPA 的对应关系注册到 KVM 模块的 memslot,才能生效成为 EPT。如果 QEMU 直接拿 MemoryRegion 去注册,过程会很麻烦,还要反复做重叠判断等。所以在通过 KVM_SET_USER_MEMORY_REGION 注册之前,加了一层转换机制:先把树状的 MemoryRegion 展开成按 GPA 排布的一维区间结构,再把展开后的结果注册到 KVM 内核模块中,就方便了许多。这个转换机制就是 FlatView 模型。

因为只有顶层的 MemoryRegion 才需要持有 FlatView,所以可以理解为 FlatView 和 AddressSpace 是对应的。因此没有必要在 MemoryRegion 结构体中为此添加一个成员,QEMU 把各个顶层 MemoryRegion 持有的 FlatView 统一保存在 static GHashTable *flat_views 里。

每次增删改 MemoryRegion 之后,都会调用 memory_region_transaction_commit,进而调用 address_space_set_flatview,保证 AddressSpace::current_map(也就是 FlatView)和对 MemoryRegion 的更改总是同步的。
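
如果要连续做多次增删改,可以用 transaction 把多次修改合并成一次 commit(也就只做一次 FlatView 重建和 listener 通知)。一个简化的示意(mr_a、mr_b 为假设的名字):

memory_region_transaction_begin();
memory_region_set_enabled(mr_a, false);                          // 多次修改在 commit 前都不会生效
memory_region_add_subregion(system_memory, 0x100000000ULL, mr_b);
memory_region_transaction_commit();                              // 这里才重新生成 FlatView 并通知 listener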

一个 GPA translate 的 process:

// ...
flatview_translate
flatview_do_translate

MemoryRegion模型原理,以及同FlatView模型的关系_leoufung的博客-CSDN博客

flatview_extend_translation() QEMU

多次调用 flatview_translate(),尝试把一段长度为 target_len 的地址范围连续地翻译完:只要翻译结果仍落在同一个 MemoryRegion 里并且偏移是连续的,就继续扩展,最后返回实际成功翻译的长度。

static hwaddr flatview_extend_translation(FlatView *fv, hwaddr addr,
                            hwaddr target_len,
                            MemoryRegion *mr, hwaddr base, hwaddr len,
                            bool is_write, MemTxAttrs attrs)
{
    hwaddr done = 0;
    hwaddr xlat;
    MemoryRegion *this_mr;

    for (;;) {
        target_len -= len;
        addr += len;
        done += len;
        if (target_len == 0) {
            return done;
        }

        len = target_len;
        this_mr = flatview_translate(fv, addr, &xlat, &len, is_write, attrs);
        if (this_mr != mr || xlat != base + done)
            return done;
    }
}

flatview_translate() QEMU

根据传进来的 FlatView 以及地址 addr,返回这个地址 addr 所落在的 MemoryRegion,并返回 addr 翻译到 MemoryRegion 中对应的 offset xlat 以及长度 plen。

xlatplen 都是输出的参数,传入的值没有意义。

MemoryRegion *flatview_translate(FlatView *fv, hwaddr addr, hwaddr *xlat,
                                 hwaddr *plen, bool is_write,
                                 MemTxAttrs attrs)
{
    //...
    section = flatview_do_translate(fv, addr, xlat, plen, NULL, is_write, true, &as, attrs);
    mr = section.mr;
    //...
    return mr;
}
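
QEMU 内部(比如 flatview_read/write_continue)就是按这个思路使用它的。下面是一个把 GPA 解析出来再访问的简化示意(gpa 假设已给出,省略了 RCU 读锁、MemTxResult 检查等错误处理):

// 假设 gpa 是要访问的 GPA
AddressSpace *as = &address_space_memory;
hwaddr xlat, len = 4;
uint8_t buf[4];
uint64_t val;

FlatView *fv = address_space_to_flatview(as);
MemoryRegion *mr = flatview_translate(fv, gpa, &xlat, &len, false /* is_write */, MEMTXATTRS_UNSPECIFIED);

if (memory_access_is_direct(mr, false)) {
    // RAM:xlat 是 MR 内的偏移,加上 RAMBlock 的 host 基址就是 HVA
    uint8_t *hva = memory_region_get_ram_ptr(mr);
    memcpy(buf, hva + xlat, len);
} else {
    // MMIO:走 dispatch,最终进入 mr->ops->read()
    memory_region_dispatch_read(mr, xlat, &val, size_memop(len), MEMTXATTRS_UNSPECIFIED);
}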

When a MemoryRegion changes, how is the FlatView updated?

一个 MemoryRegion 对应一个 FlatView 还是 FlatRange?

应该是 FlatRange,一个 AddressSpace 对应一个 FlatView.

struct FlatView {
    FlatRange *ranges; // 扁平化后的 FlatRange 们
    unsigned nr; // FlatRange 个数
    unsigned nr_allocated;
    struct AddressSpaceDispatch *dispatch; // 为了加快查找而构建的辅助结构(类似多级页表,见下文 AddressSpaceDispatch)
    MemoryRegion *root; // 这个 FlatView 是基于哪一个 MemoryRegion 来生成的
};

转换由 address_space_update_topology 触发,核心逻辑在 render_memory_region 这个函数里面:

static void render_memory_region(...)
{
    //...
    QTAILQ_FOREACH(subregion, &mr->subregions, subregions_link) {
        render_memory_region(view, subregion, base, clip, readonly, nonvolatile);
    }
    //...
    flatview_insert(view, i, &fr);
}

FlatRange (QEMU)

以列表的形式存在 FlatView 中,表示各个不重叠的区间。对应一个叶子 MemoryRegion

各个 FlatRange 不会重叠(毕竟已经扁平化之后了)。

FlatRange 和 RAMBlock 是怎么对应的?

struct FlatRange {
    // 表示这个 FlatRange 是哪一个叶子 mr,只有叶子 MR 才有对应的 FlatRange。
    // 详见 render_memory_region()^
    MemoryRegion *mr;
    hwaddr offset_in_region; // 这个 FlatRange 的起点在其所属 MR 内部的偏移
    AddrRange addr; // start and size
    // 对应 log_start 等非 global 的函数
    uint8_t dirty_log_mask;
    //...
};

fr->dirty_log_mask / memory_region_get_dirty_log_mask() QEMU

是从 mr->dirty_log_mask 加上一个 DIRTY_MEMORY_MIGRATION bit(或不加)拿过来的。

mr->dirty_log_mask 大概率是 0,这个不一定是 0,因为会加 bit。

render_memory_region
    fr.dirty_log_mask = memory_region_get_dirty_log_mask(mr);
uint8_t memory_region_get_dirty_log_mask(MemoryRegion *mr)
{
    uint8_t mask = mr->dirty_log_mask;
    RAMBlock *rb = mr->ram_block;

    // 打开了 global_dirty_tracking 并且 ram 是 migratable 的,一般可以满足
    if (global_dirty_tracking && ((rb && qemu_ram_is_migratable(rb)) || memory_region_is_iommu(mr))) {
        mask |= (1 << DIRTY_MEMORY_MIGRATION);
    }
    //...
    return mask;
}

AddressSpaceDispatch (QEMU)

FlatView 里的 FlatRange 是以有序数组的形式存放的,直接线性查找比较慢。为了加快查找,需要一个能以较低代价(大致 $O(\log n)$ 甚至常数级的步数)定位的辅助结构,QEMU 的实现就是 AddressSpaceDispatch(内部是一个类似多级页表的查找表)。

它是对每个 FlatRange 逐个调用 flatview_add_to_dispatch() 构建出来的。
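
对应的构建代码大致如下(摘自 generate_memory_topology(),有删减):

static FlatView *generate_memory_topology(MemoryRegion *mr)
{
    //...
    // 先 render 出所有 FlatRange,再基于它们构建 dispatch
    view->dispatch = address_space_dispatch_new(view);
    for (i = 0; i < view->nr; i++) {
        MemoryRegionSection mrs = section_from_flat_range(&view->ranges[i], view);
        flatview_add_to_dispatch(view, &mrs);
    }
    address_space_dispatch_compact(view->dispatch);
    //...
    return view;
}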

MemoryRegionSection (QEMU)

MemoryRegionSection describes a fragment of a MemoryRegion.

MemoryRegion 平坦化后,由于可能重叠,本来完整的 MR 可能就被分成了数片 MemoryRegionSection

struct MemoryRegionSection {
    Int128 size;
    MemoryRegion *mr; // 这个 Section 所属的 MR
    FlatView *fv; // AddressSpace 的 FlatView
    hwaddr offset_within_region;
    hwaddr offset_within_address_space;
    // ...
};

MemoryRegionSection 和 FlatRange 是对应的:

static inline MemoryRegionSection
section_from_flat_range(FlatRange *fr, FlatView *fv)
{
    return (MemoryRegionSection) {
        .mr = fr->mr,
        .fv = fv,
        .offset_within_region = fr->offset_in_region,
        .size = fr->addr.size,
        .offset_within_address_space = int128_get64(fr->addr.start),
        .readonly = fr->readonly,
        .nonvolatile = fr->nonvolatile,
    };
}

同时结合 kvm_section_update_flags() 可以看出,一个 MemoryRegionSection 和一个 KVMSlot 是大致对应的;进一步,也可以说它和通过 ioctl 传进去的 struct kvm_userspace_memory_region_ext 大致是对应的。
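
结合 kvm_set_phys_mem() 的逻辑,可以把“一个 MemoryRegionSection 变成一个 memslot”示意成下面这样(slot_id、log_dirty、vm_fd 都是示意用的名字,省略了对齐裁剪、只读/gmem 等处理):

// HVA = RAMBlock host 起点 + section 在 MR 内的偏移
uint8_t *ram = memory_region_get_ram_ptr(section->mr);
ram += section->offset_within_region;

struct kvm_userspace_memory_region mem = {
    .slot            = slot_id,
    .guest_phys_addr = section->offset_within_address_space,   // GPA 起点
    .memory_size     = int128_get64(section->size),
    .userspace_addr  = (uint64_t)(uintptr_t)ram,                // HVA 起点
    .flags           = log_dirty ? KVM_MEM_LOG_DIRTY_PAGES : 0,
};
ioctl(vm_fd, KVM_SET_USER_MEMORY_REGION, &mem);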

为什么设计 MemoryRegionSection 结构体?

Functions

qemu_ram_ptr_length() / ramblock_ptr() QEMU

Return a host pointer (HVA) to guest's ram. Similar to qemu_map_ram_ptr() but takes a size argument.

static void *qemu_ram_ptr_length(RAMBlock *ram_block, ram_addr_t addr,
                                 hwaddr *size, bool lock)
{
    RAMBlock *block = ram_block;
    //...
    *size = MIN(*size, block->max_length - addr);

    // Xen 相关的 code
    //...
    return ramblock_ptr(block, addr);
}

static inline void *ramblock_ptr(RAMBlock *block, ram_addr_t offset)
{
    //...
    // HVA
    return (char *)block->host + offset;
}

memory_region_dispatch_write() / memory_region_dispatch_read() QEMU

pcie_mmcfg_data_write
pci_host_config_write_common


virtio_write_config
virtio_address_space_write
memory_region_dispatch_write

qemu_ram_mmap() QEMU

这个函数其实就是用 mmap 与传进来的 flags 进行申请内存,不同的是 mmap 了两次。第一次是 reserve,第二次是 activate。reserve 的区间要比 activate 略大(不超过一个 page),这么做的目的是保证起始地址的 alignment,保证后面有一个 PROT_NONE 的 guard page,防止访问越界。

这个函数可以是普通的申请内存,也可以从一个 fd 中来申请内存。这个 fd 背后可以是 file backed 的,也可以是 memfd。

// total = 15, size = 11
|----|----|----|----|----|----|   // 这是内存空间,alignment 是 4
   --|----|----|----|-            // 假如我们要申请 11 个大小,那么我们传给 mmap_reserve 的是 11 + 4 = 15
     |----|----|---               // 而 mmap_activate 只会申请 11 个,同时还保持了起始地址的 alignment。
void *qemu_ram_mmap(int fd, size_t size, size_t align, uint32_t qemu_map_flags, off_t map_offset)
{
    //...
    size_t offset, total;
    void *ptr, *guardptr;

    //...
    total = size + align;

    // 申请 total 大小的内存空间,因为 mmap_reserve 不保证
    // 申请到的起始地址的 alignment,所以我们的 total 要大于 size 一个 align
    // fd 在 mmap_reserve 里一般用不到(只有个别平台,比如 ppc64 上的 hugetlbfs,才需要传 fd)
    guardptr = mmap_reserve(total, fd);
    //...
    offset = QEMU_ALIGN_UP((uintptr_t)guardptr, align) - (uintptr_t)guardptr;
    ptr = mmap_activate(guardptr + offset, size, fd, qemu_map_flags, map_offset);
    // error handling...
    // page guard...
    return ptr;
}
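
一个简化的调用示意(假设要基于一个普通 memfd 映射 size 字节,align 直接用 host page size,错误处理从略):

int fd = qemu_memfd_create("demo", size, false, 0, 0, &error_fatal); // 普通 memfd,不用 hugetlb/seal
void *hva = qemu_ram_mmap(fd, size, qemu_real_host_page_size(), QEMU_MAP_SHARED, 0);
//...
qemu_ram_munmap(fd, hva, size);   // 释放时对应的接口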

ram_block_add() QEMU

名字是 add,说明最主要做的是加入到 ram_list 中去,分配内存只是当 !new_block->host 的时候才做。

  • 分配一份普通的匿名内存(backend 就是普通内存,而不是 fd 或者其他东西),存到 new_block->host 中去
  • 分配一份 gmem 内存(如果需要的话),fd 存到 new_block->guest_memfd 中去
  • 插入到 ram_list 这个全局数组当中去
  • 把所有 page 的 dirty bitmap 置为 dirty 的
qemu_ram_alloc_*
    qemu_ram_alloc_from_fd / qemu_ram_alloc_internal
        ram_block_add
static void ram_block_add(RAMBlock *new_block, Error **errp)
{
    const bool noreserve = qemu_ram_is_noreserve(new_block);
    const bool shared = qemu_ram_is_shared(new_block);
    RAMBlock *block;
    RAMBlock *last_block = NULL;
    ram_addr_t old_ram_size, new_ram_size;
    Error *err = NULL;

    old_ram_size = last_ram_page();

    //...
    new_block->offset = find_ram_offset(new_block->max_length);

    // ramblock 的 host 表示 HVA,为空的话表示我们还没有 alloc HVA
    if (!new_block->host) {
        // Xen related...
        //...
        // 通过 mmap 来分配内存
        // 从名字可以看出来,backend 就是普通的 memory,而不是其他 fd 或者 file 什么的
        new_block->host = qemu_anon_ram_alloc(new_block->max_length,
                                              &new_block->mr->align,
                                              shared, noreserve);
        // error handling...
        // merging...
    }

    // gmem-related
    // 是不是分配了两次内存?
    // 一个 RAMBlock 只需要 max_length 的内存,可是 normal 和 gmem 都分配了一遍
    if (kvm_enabled() && new_block->flags & RAM_GUEST_MEMFD && new_block->guest_memfd < 0) {
        uint64_t flags = QEMU_IS_ALIGNED(new_block->max_length, get_thp_size()) ?
                         KVM_GUEST_MEMFD_ALLOW_HUGEPAGE : 0;
        new_block->guest_memfd = kvm_create_guest_memfd(new_block->max_length, flags, errp);
        new_block->cgs_bmap = bitmap_new(new_block->max_length >> TARGET_PAGE_BITS);
        if (!ram_default_shared)
            new_block->flags |= RAM_DEFAULT_PRIVATE;
        //...
    }

    new_ram_size = MAX(old_ram_size, (new_block->offset + new_block->max_length) >> TARGET_PAGE_BITS);
    if (new_ram_size > old_ram_size) {
        dirty_memory_extend(old_ram_size, new_ram_size);
    }
    // 在 ram_list 里寻找 new block 插入的地方, 因为 block 是从大到小排列的。
    RAMBLOCK_FOREACH(block) {
        last_block = block;
        if (block->max_length < new_block->max_length) {
            break;
        }
    }
    // 插入进去
    if (block) {
        QLIST_INSERT_BEFORE_RCU(block, new_block, next);
    } else if (last_block) {
        QLIST_INSERT_AFTER_RCU(last_block, new_block, next);
    } else { /* list is empty */
        QLIST_INSERT_HEAD_RCU(&ram_list.blocks, new_block, next);
    }
    ram_list.mru_block = NULL;

    //...
    // 把所有 page 在 dirty bitmap 置为 dirty 的
    cpu_physical_memory_set_dirty_range(new_block->offset, new_block->used_length, DIRTY_CLIENTS_ALL);

    // shared memory, not gmem
    if (new_block->host) {
        //...
        // 对于 shared memory,尽量用大页
        qemu_madvise(new_block->host, new_block->max_length, QEMU_MADV_HUGEPAGE);
        // 还有一些 madvise...
        // sev-related...
    }
}

tdp_mmu_zap_spte_atomic() KVM

static inline int tdp_mmu_zap_spte_atomic(struct kvm *kvm, struct tdp_iter *iter)
{
	int ret;

    // See REMOVED_SPTE^
	ret = tdp_mmu_set_spte_atomic(kvm, iter, REMOVED_SPTE);
    //...

	kvm_flush_remote_tlbs_gfn(kvm, iter->gfn, iter->level);

	/*
	 * No other thread can overwrite the removed SPTE as they must either
	 * wait on the MMU lock or use tdp_mmu_set_spte_atomic() which will not
	 * overwrite the special removed SPTE value. No bookkeeping is needed
	 * here since the SPTE is going from non-present to non-present.  Use
	 * the raw write helper to avoid an unnecessary check on volatile bits.
	 */
	__kvm_tdp_mmu_write_spte(iter->sptep, 0);

	return 0;
}

KVM shadow SPTE State

在没有引入 TDX patch 的情况下,zap 就是置 0。引入了 TDX 之后,zap 就变成了置成:

  • private_zapped_spte():对于 private 的;
  • SHADOW_NONPRESENT_VALUE:shared 的。

kvm_mmu_unmap_gfn_range() / kvm_unmap_gfn_range() / kvm_tdp_mmu_unmap_gfn_range() KVM

What is Unmap?

首先可以排除的是 Guest 自己的页表:在使用 TDP 的情况下,Guest 的页表由 Guest 自己管理,KVM 不会去动它。

那 Unmap 指的可能是 Unmap EPT 的页表。不仅要将 entry 从 EPT 中移除,也要从 TLB 中移除。

bool kvm_mmu_unmap_gfn_range(struct kvm *kvm, struct kvm_gfn_range *range)
{
	kvm_mmu_invalidate_range_add(kvm, range->start, range->end);
	return kvm_unmap_gfn_range(kvm, range);
}

bool kvm_unmap_gfn_range(struct kvm *kvm, struct kvm_gfn_range *range)
{
	bool flush = false;

	if (kvm_memslots_have_rmaps(kvm))
		flush = kvm_handle_gfn_range(kvm, range, kvm_zap_rmap);

	if (tdp_mmu_enabled)
		flush = kvm_tdp_mmu_unmap_gfn_range(kvm, range, flush);

	if (kvm_x86_ops.set_apic_access_page_addr &&
	    range->slot->id == APIC_ACCESS_PAGE_PRIVATE_MEMSLOT)
		kvm_make_all_cpus_request(kvm, KVM_REQ_APIC_PAGE_RELOAD);

	return flush;
}
// 对于 gmem 的情况,直接调用的
// 比如在 post-copy 的 DISCARD 阶段,会通过 PUNCH_HOLE 一路调用到这里
kvm_gmem_invalidate_begin
    kvm_mmu_unmap_gfn_range
        kvm_unmap_gfn_range
            kvm_tdp_mmu_unmap_gfn_range

// 对于普通的情况
static const struct mmu_notifier_ops kvm_mmu_notifier_ops = {
	.invalidate_range_start	= kvm_mmu_notifier_invalidate_range_start,
     //...
};
kvm_mmu_notifier_invalidate_range_start
    .handler	= kvm_mmu_unmap_gfn_range,
        kvm_mmu_unmap_gfn_range
            kvm_unmap_gfn_range
bool kvm_tdp_mmu_unmap_gfn_range(struct kvm *kvm, struct kvm_gfn_range *range, bool flush, bool zap_private)
{
	return kvm_tdp_mmu_zap_leafs(kvm, range->slot->as_id, range->start,
				     range->end, range->may_block, flush,
				     zap_private);
}

kvm_mmu_child_role() KVM

这个函数就是返回一下 sptep 指向的 PT 的 role,一般来说只是 level 会少一级。

static union kvm_mmu_page_role kvm_mmu_child_role(u64 *sptep, bool direct, unsigned int access)
{
    // 找到这个 SPTE 所在的 PT,这两者的 role level 应该是一样的
	struct kvm_mmu_page *parent_sp = sptep_to_sp(sptep);
	union kvm_mmu_page_role role;

	role = parent_sp->role;
    // 既然是 child,那么就 --
	role.level--;
	role.access = access;
	role.direct = direct;
	role.passthrough = 0;

    // 32-bits paging...
    // ...
	return role;
}

link_shadow_page() / __link_shadow_page() KVM

让传进来的 sptep 指向传进来的 sp(也就是让这个 SPTE 指向 sp->spt 这个页表页)。

static void link_shadow_page(struct kvm_vcpu *vcpu, u64 *sptep, struct kvm_mmu_page *sp)
{
	__link_shadow_page(vcpu->kvm, &vcpu->arch.mmu_pte_list_desc_cache, sptep, sp, true);
}

static void __link_shadow_page(struct kvm *kvm,
			       struct kvm_mmu_memory_cache *cache, u64 *sptep,
			       struct kvm_mmu_page *sp, bool flush)
{
	u64 spte;

	 // If an SPTE is present already, it must be a leaf and therefore
	 // a large one.  Drop it, and flush the TLB if needed, before installing sp.
	if (is_shadow_present_pte(*sptep))
		drop_large_spte(kvm, sptep, flush);

    // make a PSE,指向这个 sp
	spte = make_nonleaf_spte(sp->spt, sp_ad_disabled(sp));

    // *sptep = spte,也就是说让 sptep 指向了 sp
	mmu_spte_set(sptep, spte);

	mmu_page_add_parent_pte(cache, sp, sptep);

	/*
	 * The non-direct sub-pagetable must be updated before linking.  For
	 * L1 sp, the pagetable is updated via kvm_sync_page() in
	 * kvm_mmu_find_shadow_page() without write-protecting the gfn,
	 * so sp->unsync can be true or false.  For higher level non-direct
	 * sp, the pagetable is updated/synced via mmu_sync_children() in
	 * FNAME(fetch)(), so sp->unsync_children can only be false.
	 * WARN_ON_ONCE() if anything happens unexpectedly.
	 */
	if (WARN_ON_ONCE(sp->unsync_children) || sp->unsync)
		mark_unsync(sptep);
}


kvm_mmu_get_child_sp() KVM

尽管传进来的叫做 sptep,但是其指向的也可以是 PSE。并不一定就是一个 leaf PTE。

一个 shadow page 是通过 kvm_mmu_page 来描述的,这个函数就是根据 gfn 找到或创建其所在的 PT,进而返回描述此 PT 的 kvm_mmu_page

// sptep 所指向的 pte 指向了我们所要的 kvm_mmu_page
static struct kvm_mmu_page *kvm_mmu_get_child_sp(struct kvm_vcpu *vcpu,
						 u64 *sptep, gfn_t gfn,
						 bool direct, unsigned int access)
{
	union kvm_mmu_page_role role;

    // 存在并且不是 large pte。
	if (is_shadow_present_pte(*sptep) && !is_large_pte(*sptep))
		return ERR_PTR(-EEXIST);

    // 根据 sptep 所在的 PT 生成 sptep 指向的 PT 的 page role
    // 一般来说就是 level--。
	role = kvm_mmu_child_role(sptep, direct, access);
	return kvm_mmu_get_shadow_page(vcpu, gfn, role);
}

kvm_mmu_get_shadow_page() / __kvm_mmu_get_shadow_page() / KVM

找到或创建一个页表(PT),这个页表所映射的起始 GFN 就是传进去的 GFN。返回这个创建的页表。

static struct kvm_mmu_page *__kvm_mmu_get_shadow_page(struct kvm *kvm,
						      struct kvm_vcpu *vcpu,
						      struct shadow_page_caches *caches,
						      gfn_t gfn,
						      union kvm_mmu_page_role role)
{
	struct hlist_head *sp_list;
	struct kvm_mmu_page *sp;

	sp_list = &kvm->arch.mmu_page_hash[kvm_page_table_hashfn(gfn)];

    // 先找,找不到就创建。
	sp = kvm_mmu_find_shadow_page(kvm, vcpu, gfn, sp_list, role);
	if (!sp)
		sp = kvm_mmu_alloc_shadow_page(kvm, caches, gfn, sp_list, role);
    //...
	return sp;
}

kvm_mmu_find_shadow_page() / KVM

在所给的 shadow page 列表(sp_list)里找到映射 gfn 的 shadow page。

/*
 * The vCPU is required when finding indirect shadow pages; the shadow
 * page may already exist and syncing it needs the vCPU pointer in
 * order to read guest page tables.  Direct shadow pages are never
 * unsync, thus @vcpu can be NULL if @role.direct is true.
 */
static struct kvm_mmu_page *kvm_mmu_find_shadow_page(struct kvm *kvm,
						     struct kvm_vcpu *vcpu,
						     gfn_t gfn,
						     struct hlist_head *sp_list,
						     union kvm_mmu_page_role role)
{
	struct kvm_mmu_page *sp;
	int ret;
	LIST_HEAD(invalid_list);

    // 迭代 sp_list
	for_each_valid_sp(kvm, sp, sp_list) {
        // 这个 shadow page (sp) 并不是我们要找的,换下一个
		if (sp->gfn != gfn)
			continue;

		if (sp->role.word != role.word) {
			/*
			 * If the guest is creating an upper-level page, zap
			 * unsync pages for the same gfn.  While it's possible
			 * the guest is using recursive page tables, in all
			 * likelihood the guest has stopped using the unsync
			 * page and is installing a completely unrelated page.
			 * Unsync pages must not be left as is, because the new
			 * upper-level page will be write-protected.
			 */
			if (role.level > PG_LEVEL_4K && sp->unsync)
				kvm_mmu_prepare_zap_page(kvm, sp, &invalid_list);
			continue;
		}

		/* unsync and write-flooding only apply to indirect SPs. */
		if (sp->role.direct)
			goto out;

		if (sp->unsync) {
			if (KVM_BUG_ON(!vcpu, kvm))
				break;

			/*
			 * The page is good, but is stale.  kvm_sync_page does
			 * get the latest guest state, but (unlike mmu_unsync_children)
			 * it doesn't write-protect the page or mark it synchronized!
			 * This way the validity of the mapping is ensured, but the
			 * overhead of write protection is not incurred until the
			 * guest invalidates the TLB mapping.  This allows multiple
			 * SPs for a single gfn to be unsync.
			 *
			 * If the sync fails, the page is zapped.  If so, break
			 * in order to rebuild it.
			 */
			ret = kvm_sync_page(vcpu, sp, &invalid_list);
			if (ret < 0)
				break;

			WARN_ON_ONCE(!list_empty(&invalid_list));
			if (ret > 0)
				kvm_flush_remote_tlbs(kvm);
		}

		__clear_sp_write_flooding_count(sp);

		goto out;
	}

	sp = NULL;
	++kvm->stat.mmu_cache_miss;

out:
	kvm_mmu_commit_zap_page(kvm, &invalid_list);

    // stats...
	return sp;
}

sptep_to_sp() / spte_to_child_sp() KVM

sptep_to_sp(): 从一个 sptep 来计算得到这个 spte 所在的 pt 的 page 对应的 kvm_mmu_page,中间会用到 vmemmap 来计算。

spte_to_child_sp(): 从一个 spte 来计算得到这个 spte 指向的 pt 的 page 对应的 kvm_mmu_page,中间会用到 vmemmap 来计算。
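
upstream KVM 里这两个 helper 大致长这样(简化版,具体以内核源码为准):

static inline struct kvm_mmu_page *to_shadow_page(hpa_t shadow_page)
{
	// PA -> struct page(经由 vmemmap)-> page_private 里存的 kvm_mmu_page
	struct page *page = pfn_to_page(shadow_page >> PAGE_SHIFT);

	return (struct kvm_mmu_page *)page_private(page);
}

static inline struct kvm_mmu_page *sptep_to_sp(u64 *sptep)
{
	// sptep 是指向 SPTE 的指针,__pa() 得到它所在 PT page 的物理地址
	return to_shadow_page(__pa(sptep));
}

static inline struct kvm_mmu_page *spte_to_child_sp(u64 spte)
{
	// SPTE 的值里编码的是它指向的下一级 PT 的物理地址
	return to_shadow_page(spte & SPTE_BASE_ADDR_MASK);
}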

rmap_remove() KVM

static void rmap_remove(struct kvm *kvm, u64 *spte)
{
	struct kvm_memslots *slots;
	struct kvm_memory_slot *slot;
	struct kvm_mmu_page *sp;
	gfn_t gfn;
	struct kvm_rmap_head *rmap_head;

	sp = sptep_to_sp(spte);
	gfn = kvm_mmu_page_get_gfn(sp, spte_index(spte));

	/*
	 * Unlike rmap_add, rmap_remove does not run in the context of a vCPU
	 * so we have to determine which memslots to use based on context
	 * information in sp->role.
	 */
	slots = kvm_memslots_for_spte_role(kvm, sp->role);

	slot = __gfn_to_memslot(slots, gfn);
	rmap_head = gfn_to_rmap(gfn, sp->role.level, slot);

	pte_list_remove(spte, rmap_head);
}

mmu_page_zap_pte() KVM

/* Returns the number of zapped non-leaf child shadow pages. */
// spte: 1 of 512 sptes in this pt described by sp
static int mmu_page_zap_pte(struct kvm *kvm, struct kvm_mmu_page *sp,
			    u64 *spte, struct list_head *invalid_list)
{
	u64 pte;
	struct kvm_mmu_page *child;

	pte = *spte;
	if (is_shadow_present_pte(pte)) {
        // last level pte
		if (is_last_spte(pte, sp->role.level)) {
			drop_spte(kvm, spte); // rmap_remove(),  
		} else {
			child = spte_to_child_sp(pte);
			drop_parent_pte(child, spte);
			/*
			 * Recursively zap nested TDP SPs, parentless SPs are
			 * unlikely to be used again in the near future.  This
			 * avoids retaining a large number of stale nested SPs.
			 */
			if (tdp_enabled && invalid_list &&
			    child->role.guest_mode && !child->parent_ptes.val)
				return kvm_mmu_prepare_zap_page(kvm, child,
								invalid_list);
		}
	} else if (is_mmio_spte(kvm, pte)) {
		mmu_spte_clear_no_track(spte);
	}
	return 0;
}

tdp_mmu_unlink_sp() KVM

Remove a shadow page (described by sp) from the list of used pages.

static void tdp_mmu_unlink_sp(struct kvm *kvm, struct kvm_mmu_page *sp, bool shared)
{
    //...
    // shared 参数主要用来决定用哪一个锁
    // nx_huge_page_disallowed 本来就是 false 的话直接返回
	if (!sp->nx_huge_page_disallowed)
		return;

    // 重新允许 NX-huge page
	sp->nx_huge_page_disallowed = false;
    // 从 list 中将 sp entry 移除
	untrack_possible_nx_huge_page(kvm, sp);
    //...
}

kvm_tdp_mmu_spte_need_atomic_write() KVM

重点在于 volatile bits:

  • The Writable bit can be set by KVM's fast page fault handler, and;
  • Accessed and Dirty bits can be set by the CPU.
/*
 * SPTEs must be modified atomically if they are shadow-present, leaf
 * SPTEs, and have volatile bits, i.e. has bits that can be set outside
 * of mmu_lock.  The Writable bit can be set by KVM's fast page fault
 * handler, and Accessed and Dirty bits can be set by the CPU.
 *
 * Note, non-leaf SPTEs do have Accessed bits and those bits are
 * technically volatile, but KVM doesn't consume the Accessed bit of
 * non-leaf SPTEs, i.e. KVM doesn't care if it clobbers the bit.  This
 * logic needs to be reassessed if KVM were to use non-leaf Accessed
 * bits, e.g. to skip stepping down into child SPTEs when aging SPTEs.
 */
static inline bool kvm_tdp_mmu_spte_need_atomic_write(u64 old_spte, int level)
{
	return is_shadow_present_pte(old_spte) &&
	       is_last_spte(old_spte, level) &&
	       spte_has_volatile_bits(old_spte);
}

kvm_tdp_mmu_write_spte() / kvm_tdp_mmu_write_spte_atomic() KVM

注意,这个函数会返回 old_spte。我们会使用返回的新的 old_spte 传入 handle_changed_spte() 来进行 handle。

  • 对于不需要 atomic write 的情况,我们直接返回传进来的 old_spte 就可以了。
  • 对于需要 atomic write 的情况,返回的是 sptep 指向的原值。可能是因为含有 volatile bits,old_spte 的值可能已经被更改了,所以需要获取最新的值?
static inline u64 kvm_tdp_mmu_write_spte(tdp_ptep_t sptep, u64 old_spte, u64 new_spte, int level)
{
    // 对于需要 atomic write 的情况,返回的是 sptep 指向的原值。
	if (kvm_tdp_mmu_spte_need_atomic_write(old_spte, level))
		return kvm_tdp_mmu_write_spte_atomic(sptep, new_spte);

    // 对于不需要 atomic write 的情况,我们直接返回传进来的 old_spte 就可以了。
	__kvm_tdp_mmu_write_spte(sptep, new_spte);
	return old_spte;
}

static inline u64 kvm_tdp_mmu_write_spte_atomic(tdp_ptep_t sptep, u64 new_spte)
{
	return xchg(rcu_dereference(sptep), new_spte);
}

handle_removed_pt() KVM

Given a page table that has been removed, iterates through the page table to clear SPTEs and free child page tables.

This function is only called by handle_changed_spte().

static void handle_removed_pt(struct kvm *kvm, tdp_ptep_t pt, bool shared)
{
	struct kvm_mmu_page *sp = sptep_to_sp(rcu_dereference(pt));
	int level = sp->role.level;
	gfn_t base_gfn = sp->gfn;
	int ret;
	int i;

    //...
    // remove this shadow page from list of used pages
	tdp_mmu_unlink_sp(kvm, sp, shared);

    // 一般来说是有 512 个 PTE,这里只是处理了所有的 SPTE
    // 但是没有处理这个 PT 本身。
	for (i = 0; i < SPTE_ENT_PER_PAGE; i++) {
        // 找到这个 page 里的第 i 个 entry
		tdp_ptep_t sptep = pt + i;
		gfn_t gfn = base_gfn + i * KVM_PAGES_PER_HPAGE(level);
		u64 old_spte;

        // shared 表示可能存在多线程同步的问题。
		if (shared) {
			for (;;) {
                // 先原子性地设置为 REMOVED_SPTE, REMOVED_SPTE 是 64bit 的,
                // 表示一个空的 SPTE。
				old_spte = kvm_tdp_mmu_write_spte_atomic(sptep, REMOVED_SPTE);
                // 如果原来的 spte 没有被 remove,那么就 break
				if (!is_removed_spte(old_spte))
					break;
                // 如果发现之前已经被设置成为 REMOVED_SPTE 了,说明有其它正在处理
                // page fault 的线程,可能会覆盖我们的结果,所以就再试一次,直到发现原来的 SPTE
                // 恢复成为正常的。
				cpu_relax();
			}
		} else {
            // 先读出来这个 spte 的值
			old_spte = kvm_tdp_mmu_read_spte(sptep);
            //...
            // 置空
			old_spte = kvm_tdp_mmu_write_spte(sptep, old_spte,
							  REMOVED_SPTE, level);
		}
        // 这里面也会调用 handle_removed_pt,从而实现递归 remove 的效果
        // 为什么不直接调 handle_removed_pt 呢,我觉得可能是因为需要遵循
        // remove pt -> remove pte -> remove pt 这样的循环,就是 pt 和 pte
        // 是两个不同的东西,对他们的 remove 需要分开来进行处理
		ret = handle_changed_spte(kvm, kvm_mmu_page_as_id(sp), gfn,
					  old_spte, REMOVED_SPTE, sp->role,
					  shared);
        //...
	}

    // 为什么 TDX 还要再 free 一次?
    // SEPT 是 TDX module 管的,需要 tdh.mem.sept.remove 来把这个 PT 
    // 所在的 page 移除
	if (is_private_sp(sp))
        static_call(kvm_x86_free_private_spt)(kvm, sp->gfn, sp->role.level, kvm_mmu_private_spt(sp));
    //...
}

enum tdp_zap_private KVM

enum tdp_zap_private {
    // 对于 private SPTE 不进行 zap
	ZAP_PRIVATE_SKIP = 0,
    // 从 normal SPTE -> zapped
	ZAP_PRIVATE_BLOCK,
    // 从 normal SPTE -> 0
	ZAP_PRIVATE_REMOVE,
};

handle_changed_spte() KVM

参数里的 old_spte 和 new_spte 都只是 SPTE 的值(也就是映射信息,包含 PFN),并不包括键(键也就是 GFN,本来就没有编码在 PTE 里面);这两者一个是 change 之前的值,一个是之后的值。如果要删除一个 SPTE,可以在 new_spte 传入 REMOVED_SPTE。参数 gfn 就是这个 SPTE 所映射的 GFN(SPTE 负责从 GFN map 到 PFN)。

注意,old_spte 也可以是中间一级的 PSE,不一定非要是 leaf。

shared: This operation may not be running under the exclusive use of the MMU lock and the operation must synchronize with other threads that might be modifying SPTEs.

role 是 PTE 所在的 PT 的 role。

直接改 leaf SPTE 的 PFN 是不被允许的。

static void handle_changed_spte(struct kvm *kvm, int as_id, gfn_t gfn,
				u64 old_spte, u64 new_spte,
				union kvm_mmu_page_role role, bool shared)
{
	bool is_private = kvm_mmu_page_role_is_private(role);
	int level = role.level;
    // 不要和页表项里的 present bit 混淆,那个指的是 PTE 所指向的页在不在内存
    // 这个指的是这个 SPTE 是不是 backed by memory(也就是普通 SPTE),还是说是一个 MMIO SPTE。
    // #define SPTE_MMU_PRESENT_MASK^
	bool was_present = is_shadow_present_pte(old_spte);
	bool is_present = is_shadow_present_pte(new_spte);

    // was_last 表示这是一个 leaf
	bool was_last = is_last_spte(old_spte, level);
    // was_leaf 表示这是一个存在的 leaf
	bool was_leaf = was_present && was_last;
    // is_leaf 表示这是一个存在的 leaf
	bool is_leaf = is_present && is_last_spte(new_spte, level);
	kvm_pfn_t old_pfn = spte_to_pfn(old_spte);
	kvm_pfn_t new_pfn = spte_to_pfn(new_spte);
	bool pfn_changed = old_pfn != new_pfn;
	bool was_private_zapped = is_private_zapped_spte(old_spte);

    // 检查一下 level 区间上界和下界
	WARN_ON_ONCE(level > PT64_ROOT_MAX_LEVEL);
	WARN_ON_ONCE(level < PG_LEVEL_4K);

    // 检查一下 gfn 是不是在 level 里面
	WARN_ON_ONCE(gfn & (KVM_PAGES_PER_HPAGE(level) - 1));

    // 以前是 private zapped 的状态,新的也必须要是 private
	KVM_BUG_ON(was_private_zapped && !is_private, kvm);
    // SPTE 和 GPA 的 private 属性不一致
	KVM_BUG_ON(kvm_is_private_gpa(kvm, gfn_to_gpa(gfn)) != is_private, kvm);

    // 映射到的 PFN value 不能直接改,而是 A notifier handler should be zapping the SPTE
    // before the main MM's page table is changed, or the SPTE should be zeroed,
    // and the TLBs flushed by the thread before replacement.
	if (was_leaf && is_leaf && pfn_changed)
        //...
		BUG();

    //...
	if (is_leaf)
		check_spte_writable_invariants(new_spte);

    // 之前就已经是 zapped 了,现在要进一步 remove(从 TDX module 里 remove 掉)
	if (was_private_zapped && !is_present) {
		handle_private_zapped_spte(kvm, gfn, old_spte, new_spte, level);
		return;
	}

    // some MMIO handling...
    // ...

    // 添加了新 leaf 表示多了一个 page 映射,所以要 +1,反之 -1
	if (is_leaf != was_leaf)
		kvm_update_page_stats(kvm, level, is_leaf ? 1 : -1);

    // 如果原来的页是 dirty 的,那么将其设置为 dirty
	if (was_leaf && is_dirty_spte(old_spte) && (!is_present || !is_dirty_spte(new_spte) || pfn_changed))
		kvm_set_pfn_dirty(old_pfn);

	// Recursively handle child PTs if the change removed a subtree from the paging structure.
    // old spte 存在且不是 leaf,那么就是一个 PSE,也就是一个子树
    // new spte 是 leaf(要把一个树变成一个大叶子),或者不是 present 的(也就是要删除这个树)
    // 不管怎么样,这棵树都不要了,所以需要递归地删除
	if (was_present && !was_last && (is_leaf || !is_present || WARN_ON_ONCE(pfn_changed)))
        //...
		handle_removed_pt(kvm, spte_to_child_pt(old_spte, level), shared);

	/*
	 * Secure-EPT requires to remove Secure-EPT tables after removing
	 * children.  hooks after handling lower page table by above
	 * handle_remove_pt().
	 */
    // 上面那里是之前是 zapped 的,要进一步 remove
    // 这里是之前是正常的 SPTE,现在要变成 zapped。
	if (is_private && !is_present) {
        // 如果 new spte 需要是 private 并且 not present 的,那么需要首先将其 zapped
        // 不能直接 was_present=1 -> zero EPT entry,不然就 warn 出来。
		KVM_BUG_ON(!shared && is_leaf && !is_private_zapped_spte(new_spte), kvm);
		handle_removed_private_spte(kvm, gfn, old_spte, new_spte, role.level);
	}

	if (was_leaf && is_accessed_spte(old_spte) && (!is_present || !is_accessed_spte(new_spte) || pfn_changed))
		kvm_set_pfn_accessed(spte_to_pfn(old_spte));
}

handle_private_zapped_spte() KVM

old_spte: private zapped 的状态。

new_spte: 完全 remove,既不是 present 也不是 private zapped 的状态。

static void handle_private_zapped_spte(struct kvm *kvm, gfn_t gfn, u64 old_spte, u64 new_spte, int level)
{
    //...
    // old 必须要是 zapped 的状态
	KVM_BUG_ON(!was_private_zapped, kvm);
	KVM_BUG_ON(!is_last_spte(old_spte, level), kvm);
    // 因为 old 要是 zapped 的,所以不能是 present 的
	KVM_BUG_ON(was_present, kvm);
    // new 不能还是 zapped 状态了,也就是不能从 zapped 到 zapped
	KVM_BUG_ON(is_private_zapped, kvm);
    // 同时也不能是 present 的状态,这意味着我们要完全 remove 掉。
	KVM_BUG_ON(is_present, kvm);

    //...
    // 既然要完全 remove 掉,那么就调用对应的函数
	ret = static_call(kvm_x86_remove_private_spte)(kvm, gfn, level, old_pfn);
    //...
}

handle_removed_private_spte() KVM

old_spte: Normal SPTE。

new_spte: Private zapped 的状态。

目前的实现中,传进来的 new_spte 是一个 zapped private 的 spte。

static void handle_removed_private_spte(struct kvm *kvm, gfn_t gfn,
					u64 old_spte, u64 new_spte,
					int level)
{
    //...
    // 不能 zap 两次
    // is_private_zapped 应该是 true,那么 was 应该就是 false 了
	KVM_BUG_ON(was_private_zapped && is_private_zapped, kvm);
	WARN_ON_ONCE(is_present);
	// Allow only leaf page to be zapped.  Reclaim Non-leaf page tables at destroying VM.
	if (!was_leaf)
		return;

    //...
    // tdh.mem.range.block
	ret = static_call(kvm_x86_zap_private_spte)(kvm, gfn, level);
    //...

	/* non-present -> non-present doesn't make sense. */
    // 结合 was_private_zapped 是 false,可以推断出 old_spte 就是一个正常的已经映射的 SPTE
	KVM_BUG_ON(!was_present, kvm);
	KVM_BUG_ON(new_pfn, kvm);
    // 用 SEAMCALL 将 page remove 掉,
    // 在 TDX Module 的 SEPT 里,已经 remove 了;在 KVM 里,会将其置为 zapped 的状态。
	static_call(kvm_x86_remove_private_spte)(kvm, gfn, level, old_pfn);
}

is_shadow_present_pte() KVM

因为一个 SPT 有 512 个项,不可能每一个项都有值,present 表示的就是这一个 SPTE 是有值的,而不是空的。
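
它的实现很简单,就是看软件定义的 SPTE_MMU_PRESENT_MASK 这个 bit(大致如下):

static inline bool is_shadow_present_pte(u64 pte)
{
	return !!(pte & SPTE_MMU_PRESENT_MASK);
}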

address_space_update_topology() / address_space_update_topology_pass() QEMU

当 AddressSpace 的 FlatView 有所更改时,需要把这些更改通知给挂在这个 AddressSpace 上的 memory listener。

address_space_set_flatview
    // 先 adding 为 false,表示我们先去除旧的
    address_space_update_topology_pass(as, old_view2, new_view, false);
    // 再 adding 为 true,表示我们要添加新的
    address_space_update_topology_pass(as, old_view2, new_view, true);

static void address_space_update_topology_pass(AddressSpace *as,
                                               const FlatView *old_view,
                                               const FlatView *new_view,
                                               bool adding)
{
    unsigned iold, inew;
    FlatRange *frold, *frnew;

    /* Generate a symmetric difference of the old and new memory maps.
     * Kill ranges in the old map, and instantiate ranges in the new map.
     */
    iold = inew = 0;
    // 迭代双边的每一个 FlatRange
    // old 和 new 的数量不一样,但是只要一方还有,就继续循环
    while (iold < old_view->nr || inew < new_view->nr) {
        // 找到 iold, inew 指向的 FlatRange,放到 frold 和 frnew 里
        if (iold < old_view->nr)
            frold = &old_view->ranges[iold];
        else
            frold = NULL;

        if (inew < new_view->nr)
            frnew = &new_view->ranges[inew];
        else
            frnew = NULL;

        if (frold
            && (!frnew
                || int128_lt(frold->addr.start, frnew->addr.start)
                || (int128_eq(frold->addr.start, frnew->addr.start)
                    && !flatrange_equal(frold, frnew)))) {
            // In old but not in new, or in both but attributes changed.
            if (!adding) {
                flat_range_coalesced_io_del(frold, as);
                // AddressSpace 有更新,所以我们要通知到这个 AddressSpace 的每一个 listener
                // 关于更新的这个 FlatRange(MemoryRegionSection)。
                MEMORY_LISTENER_UPDATE_REGION(frold, as, Reverse, region_del);
            }

            ++iold;
        } else if (frold && frnew && flatrange_equal(frold, frnew)) {
            // In both and unchanged (except logging may have changed)

            if (adding) {
                MEMORY_LISTENER_UPDATE_REGION(frnew, as, Forward, region_nop);
                // 相比于旧的 FlatRange,新的加了 dirty_log_mask,所以要调用 log_start
                if (frnew->dirty_log_mask & ~frold->dirty_log_mask) {
                    MEMORY_LISTENER_UPDATE_REGION(frnew, as, Forward, log_start,
                                                  frold->dirty_log_mask,
                                                  frnew->dirty_log_mask);
                }
                // 相比于旧的 FlatRange,新的减了 dirty_log_mask,所以要调用 log_stop
                if (frold->dirty_log_mask & ~frnew->dirty_log_mask) {
                    MEMORY_LISTENER_UPDATE_REGION(frnew, as, Reverse, log_stop,
                                                  frold->dirty_log_mask,
                                                  frnew->dirty_log_mask);
                }
            }

            ++iold;
            ++inew;
        } else {
            // In new,所以我们需要调用 region_add() 函数。
            if (adding) {
                MEMORY_LISTENER_UPDATE_REGION(frnew, as, Forward, region_add);
                flat_range_coalesced_io_add(frnew, as);
            }
            ++inew;
        }
    }
}
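
MEMORY_LISTENER_UPDATE_REGION 最终回调的就是各个 listener 注册进来的函数。一个最小的 listener 骨架大致如下(my_listener 等名字以及回调内容均为示意):

static void my_region_add(MemoryListener *listener, MemoryRegionSection *section)
{
    // 新增的一段 FlatRange(以 MemoryRegionSection 的形式)会被通知到这里
}

static void my_region_del(MemoryListener *listener, MemoryRegionSection *section)
{
    // 被移除的一段会被通知到这里
}

static MemoryListener my_listener = {
    .name       = "my-listener",
    .region_add = my_region_add,
    .region_del = my_region_del,
    .priority   = 10,
};

// 注册到 address_space_memory 上;注册时会对当前 FlatView 里的每个 range 回调一次 region_add
memory_listener_register(&my_listener, &address_space_memory);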

address_space_init (QEMU)

void address_space_init(AddressSpace *as, MemoryRegion *root, const char *name)
{
    as->root = root; // add the MemoryRegion as root attribute of AddressSpace
    // ...
    address_space_update_topology(as); // update the topology
}

memory_region_get_flatview_root

Input: MR, output: MR. "Get the root memory region for creating/updating the flatview".

实际上,生成 FlatView 时用的并不一定就是 AddressSpace::root 本身,而是通过 memory_region_get_flatview_root() 取到的“真正的” root。比如 e1000 的 AddressSpace,root 是 bus master container,但用来生成 FlatView 的其实是 system;这样多个 AddressSpace 虽然持有的 root 不同,却可以共用同一个 FlatView。

address-space: e1000
  0000000000000000-ffffffffffffffff (prio 0, i/o): bus master container
    0000000000000000-ffffffffffffffff (prio 0, i/o): alias bus master @system 0000000000000000-ffffffffffffffff
// input is a root MR, output is also a **MR**, this function is not used to get the flatview
// actually, this function is used to get the real root MR.
// 
static MemoryRegion *memory_region_get_flatview_root(MemoryRegion *mr)
{
    while (mr->enabled) {
        // 如果有 alias,且此 alias MR 的地址范围包含了指向的 MR,将当前搜索节点替换为 alias
        // alias_offset
        if (mr->alias) {
            if (!mr->alias_offset && int128_ge(mr->size, mr->alias->size)) {
                /* The alias is included in its entirety.  Use it as
                 * the "real" root, so that we can share more FlatViews.
                 */
                mr = mr->alias;
                continue;
            }
        // 非叶子节点
        } else if (!mr->terminates) {
            unsigned int found = 0;
            MemoryRegion *child, *next = NULL;
            // 在所有子节点中找到满足要求的子节点,找到后继续使用此算法
            QTAILQ_FOREACH(child, &mr->subregions, subregions_link) {
                if (child->enabled) {
                    if (++found > 1) {
                        next = NULL;
                        break;
                    }
                    if (!child->addr && int128_ge(mr->size, child->size)) {
                        /* A child is included in its entirety.  If it's the only
                         * enabled one, use it in the hope of finding an alias down the
                         * way. This will also let us share FlatViews.
                         */
                        next = child;
                    }
                }
            }
            if (found == 0) {
                return NULL;
            }
            if (next) {
                mr = next;
                continue;
            }
        }

        return mr;
    }

    return NULL;
}

memory_region_init_ram_*() / QEMU

memory_region_init_ram_from_file() / QEMU

void memory_region_init_ram_from_file(MemoryRegion *mr,
                                      Object *owner,
                                      const char *name,
                                      uint64_t size,
                                      uint64_t align,
                                      uint32_t ram_flags,
                                      const char *path,
                                      ram_addr_t offset,
                                      Error **errp)
{
    Error *err = NULL;
    memory_region_init(mr, owner, name, size);
    mr->ram = true;
    mr->readonly = !!(ram_flags & RAM_READONLY);
    // 表示这是一个叶子实体 MR
    mr->terminates = true;
    mr->destructor = memory_region_destructor_ram;
    mr->align = align;
    // 真正分配的地方,从给定的文件中进行分配,这个文件在 path
    mr->ram_block = qemu_ram_alloc_from_file(size, mr, ram_flags, path, offset, &err);
    // error handling
}

memory_region_init_ram_from_fd() / QEMU

void memory_region_init_ram_from_fd(MemoryRegion *mr,
                                    Object *owner,
                                    const char *name,
                                    uint64_t size,
                                    uint32_t ram_flags,
                                    int fd,
                                    ram_addr_t offset,
                                    Error **errp)
{
    Error *err = NULL;
    memory_region_init(mr, owner, name, size);
    mr->ram = true;
    mr->readonly = !!(ram_flags & RAM_READONLY);
    // 表示这是一个叶子实体 MR
    mr->terminates = true;
    mr->destructor = memory_region_destructor_ram;
    // 从给定的 fd 中进行分配
    mr->ram_block = qemu_ram_alloc_from_fd(size, mr, ram_flags, fd, offset, &err);
    //..
}

memory_region_init_ram_flags_nomigrate() / QEMU

void memory_region_init_ram_flags_nomigrate(MemoryRegion *mr,
                                            Object *owner,
                                            const char *name,
                                            uint64_t size,
                                            uint32_t ram_flags,
                                            Error **errp)
{
    Error *err = NULL;
    memory_region_init(mr, owner, name, size);
    mr->ram = true;
    mr->terminates = true;
    mr->destructor = memory_region_destructor_ram;
    mr->ram_block = qemu_ram_alloc(size, ram_flags, mr, &err);
    //...
}

*_backend_memory_alloc() / QEMU

注意,这里 alloc 的都是 backend->mr:cmdline 里指定的 RAM(-m 或者 memory backend 的 size)有多大,这个 MR 就有多大。RAM 本身的 MR 结构比较扁平,并不复杂,所以可以看作在这里一次性就把虚拟机所有的 RAM(以及对应的一个 RAMBlock)申请好了。

memfd_backend_memory_alloc() QEMU

创建一个 memfd,然后基于这个 memfd 作为 backend 来分配内存。

memfd_backend_class_init
    bc->alloc = memfd_backend_memory_alloc;
host_memory_backend_memory_complete
    bc->alloc(backend, &local_err);
static void memfd_backend_memory_alloc(HostMemoryBackend *backend, Error **errp)
{
    HostMemoryBackendMemfd *m = MEMORY_BACKEND_MEMFD(backend);
    uint32_t ram_flags;
    char *name;
    int fd;

    //...
    fd = qemu_memfd_create(TYPE_MEMORY_BACKEND_MEMFD, backend->size,
                           m->hugetlb, m->hugetlbsize, m->seal ?
                           F_SEAL_GROW | F_SEAL_SHRINK | F_SEAL_SEAL : 0,
                           errp);
    //...
    name = host_memory_backend_get_name(backend);
    // 设置一些 flags
    ram_flags = backend->share ? RAM_SHARED : 0;
    ram_flags |= backend->reserve ? 0 : RAM_NORESERVE;
    ram_flags |= backend->kvm_gmem ? RAM_KVM_GMEM : 0;
    // 给 backend->mr^ 进行内存的分配
    memory_region_init_ram_from_fd(&backend->mr, OBJECT(backend), name, backend->size, ram_flags, fd, 0, errp);
    //...
}

ram_backend_memory_alloc() QEMU

static void ram_backend_memory_alloc(HostMemoryBackend *backend, Error **errp)
{
    uint32_t ram_flags;
    char *name;

    //...
    name = host_memory_backend_get_name(backend);
    ram_flags = backend->share ? RAM_SHARED : 0;
    ram_flags |= backend->reserve ? 0 : RAM_NORESERVE;
    ram_flags |= backend->kvm_gmem ? RAM_KVM_GMEM : 0;
    memory_region_init_ram_flags_nomigrate(&backend->mr, OBJECT(backend), name,
                                           backend->size, ram_flags, errp);
    g_free(name);
}

qemu_ram_alloc_from_*() / QEMU

qemu_ram_alloc_from_fd() / QEMU

RAMBlock *qemu_ram_alloc_from_fd(ram_addr_t size, MemoryRegion *mr,
                                 uint32_t ram_flags, int fd, off_t offset,
                                 Error **errp)
{
    //...
    //error checking...
    size = HOST_PAGE_ALIGN(size);
    file_size = get_file_size(fd);
    //error checking...
    new_block = g_malloc0(sizeof(*new_block));
    new_block->mr = mr;
    new_block->used_length = size;
    new_block->max_length = size;
    new_block->flags = ram_flags;
    new_block->gmem_fd = -1;
    new_block->host = file_ram_alloc(new_block, size, fd, !file_size, offset, errp);
    // error checking...

    ram_block_add(new_block, &local_err);
    // error checking...
    return new_block;
}

qemu_ram_alloc_from_file() QEMU

这个函数虽然名字看起来和 qemu_ram_alloc_from_fd() 是并列的,但其实是对后者的一层包装,只是在调用之前多了打开文件、拿到 fd 的一步。

RAMBlock *qemu_ram_alloc_from_file(ram_addr_t size, MemoryRegion *mr,
                                   uint32_t ram_flags, const char *mem_path,
                                   off_t offset, Error **errp)
{
    //...
    fd = file_ram_open(mem_path, memory_region_name(mr), !!(ram_flags & RAM_READONLY_FD), &created);
    // error handling...
    block = qemu_ram_alloc_from_fd(size, mr, ram_flags, fd, offset, errp);
    // error handling...
    return block;
}

qemu_ram_alloc_from_ptr() / qemu_ram_alloc_internal() / qemu_ram_alloc() QEMU

qemu_ram_alloc_from_ptr() 这个函数并不会进行内存的分配,因为传进来的 host 已经表示了那一块内存空间了,直接用就可以。

qemu_ram_alloc() 会进行内存的分配,因为传进来的时候 host 是 null。

RAMBlock *qemu_ram_alloc_from_ptr(ram_addr_t size, void *host, MemoryRegion *mr, Error **errp)
{
    return qemu_ram_alloc_internal(size, size, NULL, host, RAM_PREALLOC, mr, errp);
}

RAMBlock *qemu_ram_alloc(ram_addr_t size, uint32_t ram_flags, MemoryRegion *mr, Error **errp)
{
    //...
    return qemu_ram_alloc_internal(size, size, NULL, NULL, ram_flags, mr, errp);
}

RAMBlock *qemu_ram_alloc_internal(ram_addr_t size, ram_addr_t max_size,
                                  void (*resized)(const char*, uint64_t length, void *host),
                                  void *host, uint32_t ram_flags,
                                  MemoryRegion *mr, Error **errp)
{
    RAMBlock *new_block;
    //...
    size = HOST_PAGE_ALIGN(size);
    max_size = HOST_PAGE_ALIGN(max_size);
    new_block = g_malloc0(sizeof(*new_block));
    new_block->mr = mr;
    new_block->resized = resized;
    new_block->used_length = size;
    new_block->max_length = max_size;
    assert(max_size >= size);
    new_block->fd = -1;
    new_block->gmem_fd = -1;
    new_block->page_size = qemu_real_host_page_size();
    new_block->host = host;
    new_block->flags = ram_flags;

    // 这才是核心,这里并没有进行内存的分配,因为我们已经有内存了
    // 因为 new_block->host = host,除非传进来的 host 是 NULL
    ram_block_add(new_block, &local_err);
    //...
    return new_block;
}

file_ram_alloc() / QEMU

file_ram_alloc() 这个函数的作用就是给 RAMblock 分配内存。可以是从传统内存进行分配,也可以是从 fd 进行分配。一般来说,传进来的 fd 都是 > 0 也就是有值的。如果没有值,会以传统的方式来申请内存,否则就从 fd 来 mmap

static void *file_ram_alloc(RAMBlock *block, ram_addr_t memory, int fd, bool truncate, off_t offset, Error **errp)
{
    uint32_t qemu_map_flags;
    void *area;

    // 拿到 fd 的 page size,
    block->page_size = qemu_fd_getpagesize(fd);
    // a bunch of error checkings...
    block->mr->align = MAX(block->page_size, block->mr->align);
    // error checking...

    memory = ROUND_UP(memory, block->page_size);
    // truncate related...
    //...
    // 根据 block 的 flag 来设置 mmap 的 flag
    qemu_map_flags = (block->flags & RAM_READONLY) ? QEMU_MAP_READONLY : 0;
    qemu_map_flags |= (block->flags & RAM_SHARED) ? QEMU_MAP_SHARED : 0;
    qemu_map_flags |= (block->flags & RAM_PMEM) ? QEMU_MAP_SYNC : 0;
    qemu_map_flags |= (block->flags & RAM_NORESERVE) ? QEMU_MAP_NORESERVE : 0;
    // 从 fd 中分配内存
    area = qemu_ram_mmap(fd, memory, block->mr->align, qemu_map_flags, offset);

    // error checking...
    block->fd = fd;
    block->fd_offset = offset;
    // 指针
    return area;
}

Globals

AddressSpace Globals

主要是两个 AddressSpace,一个是 address_space_memory 另一个是 address_space_io

MemoryRegion Globals

主要是两个 MemoryRegion,system_memorysystem_io,这两个都通过 address_space_init() 函数分别和 address_space_memory 以及 address_space_io 关联了起来,作为 root MemoryRegion

Static QDict *machine_opts_dict

这个存了在初始化时的所有 parse 后的参数,是一个很关键的变量。

它是通过 parse QemuOpts 来构建的。

Case 1: From -m 2G to KVM_SET_USER_MEMORY_REGION

Examples:

  • qemu -M pc-1.7 -m 4G (old default) -> 3584M low, 512M high;
  • qemu -M pc -m 4G (new default) -> 3072M low, 1024M high;
  • qemu -M pc,max-ram-below-4g=2G -m 4G -> 2048M low, 2048M high;
  • qemu -M pc,max-ram-below-4g=4G -m 3968M -> 3968M low (=4G-128M).

以上逻辑体现在 pc_init1():

//...
        if (machine->ram_size >= lowmem) {
            x86ms->above_4g_mem_size = machine->ram_size - lowmem;
            x86ms->below_4g_mem_size = lowmem;
        } else {
            x86ms->above_4g_mem_size = 0;
            x86ms->below_4g_mem_size = machine->ram_size;
        }
//...

In qemu-options.hx, a command line option m is defined to work with QEMU_OPTION_m:

// 2 globals
static MemoryRegion *system_io;
AddressSpace address_space_io;
static MemoryRegion *system_memory;
AddressSpace address_space_memory;

qemu_init()
    // add "qemu_mem_opts" to "vm_config_groups"
    qemu_add_opts(&qemu_mem_opts);
        case QEMU_OPTION_m:
            opts = qemu_opts_parse_noisily(qemu_find_opts("memory"), optarg, true);
                opts_parse
                    // 把 QemuOpts opts 添加到 QemuOptsList qemu_mem_opts 里
                    qemu_opts_create
                    // parse the option string
                    opts_do_parse
    // parse from QemuOpts to "machine_opts_dict"
    parse_memory_options();
    // create "current_machine"
    // initialize global variable system_memory, system_io, address_space_memory, address_space_io
    qemu_create_machine(machine_opts_dict);
        cpu_exec_init_all
            io_mem_init
                memory_region_init_io
            memory_map_init
                memory_region_init(system_memory, NULL, "system", UINT64_MAX);
                // address_space_memory uses system_memory as the root MR
                address_space_init(&address_space_memory, system_memory, "memory");
                // address_space_io uses system_io as the root MR
                memory_region_init_io(system_io, NULL, &unassigned_io_ops, NULL, "io", 65536);
                address_space_init(&address_space_io, system_io, "I/O");

    // set the options in "machine_opts_dict" to "current_machine", such as "current_machine->mem_size"
    // default ram size is 128M, you can search default_ram_size for more details
    qemu_apply_machine_options
        object_set_properties_from_keyval

    // configure kvm accelerator
    configure_accelerators(argv[0]);
        do_configure_accelerator
            accel_init_machine
                kvm_init() // acc->init_machine(ms);
                    // register listener to address_space_memory and address_space_io
                    kvm_memory_listener_register(s, &s->memory_listener, &address_space_memory, 0, "kvm-memory");

    // The place calling KVM_SET_USER_MEMORY_REGION ioctl
    qmp_x_exit_preconfig
        qemu_init_board
            machine_run_board_init // pass current_machine in
                // associate with "current_machine->memdev", the memory backend
                create_default_memdev
                    // init the memdev->mr and memdev->mr->ram_block
                    user_creatable_complete
                        host_memory_backend_memory_complete // ucc->complete(uc, &err);
                            ram_backend_memory_alloc // bc->alloc()
                                memory_region_init_ram_flags_nomigrate
                                    memory_region_init
                                    // cmdline 指定的 mem 是多大,就申请多大,不管什么 above_4g, below_4g 
                                    qemu_ram_alloc // <- memdev->mr->ram_block
                                        ram_block_add
                                            qemu_anon_ram_alloc
                                                mmap() // 
                    object_property_set_link
                // current_machine->ram = some_modification(current_machine->memdev->mr)
                machine_consume_memdev(machine, machine->memdev);
                machine_class->init(machine);
                    pc_init1
                        pc_memory_init
                            // also will call this function on "ram_above_4g" if the
                            // memory we specified in cmdline is higher than 4g
                            memory_region_add_subregion(system_memory, 0, ram_below_4g);
                                memory_region_add_subregion_common
                                    memory_region_update_container_subregions
                                        memory_region_transaction_commit
                                            memory_region_transaction_commit_locked
                                                address_space_set_flatview
                                                    address_space_update_topology_pass
                                                        kvm_region_add
                                                            kvm_set_phys_mem
                                                                kvm_set_user_memory_region

MachineState::ram 和 global variable system_memory 的区别?

首先,这两个都是 MR。

The 2 MR ram_below_4g/ram_above_4g are alias MR both pointing to machine->ram. They are also both the subregion of MR system_memory.

未完待续。

machine_consume_memdev

pc_init1

// 简化了 Xen 相关的 code
static void pc_init1(MachineState *machine,
                     const char *host_type, const char *pci_type)
{
    PCMachineState *pcms = PC_MACHINE(machine);
    PCMachineClass *pcmc = PC_MACHINE_GET_CLASS(pcms);
    X86MachineState *x86ms = X86_MACHINE(machine);

    // 可以看到这里定义了五个新的 MR
    MemoryRegion *system_memory = get_system_memory();
    MemoryRegion *system_io = get_system_io();
    MemoryRegion *ram_memory;
    MemoryRegion *pci_memory;
    MemoryRegion *rom_memory;

    PCIBus *pci_bus;
    ISABus *isa_bus;
    int piix3_devfn = -1;
    qemu_irq smi_irq;
    GSIState *gsi_state;
    BusState *idebus[MAX_IDE_BUS];
    ISADevice *rtc_state;
    ram_addr_t lowmem;
    uint64_t hole64_size;
    DeviceState *i440fx_host;

    // 一通计算,主要是为了算 lowmem
    // ...
    // 根据 lowmem 确定 above_4g_mem_size 以及 below_4g_mem_size
    if (machine->ram_size >= lowmem) {
        x86ms->above_4g_mem_size = machine->ram_size - lowmem;
        x86ms->below_4g_mem_size = lowmem;
    } else {
        x86ms->above_4g_mem_size = 0;
        x86ms->below_4g_mem_size = machine->ram_size;
    }

    // pci memory 相关
    //...


    /* allocate ram and load rom/bios */
    pc_memory_init(pcms, system_memory, rom_memory, &ram_memory, hole64_size);
    //...
}

pc_memory_init

void pc_memory_init(PCMachineState *pcms,
                    MemoryRegion *system_memory,
                    MemoryRegion *rom_memory,
                    MemoryRegion **ram_memory,
                    uint64_t pci_hole64_size)
{
    int linux_boot, i;
    MemoryRegion *option_rom_mr;
    MemoryRegion *ram_below_4g, *ram_above_4g;
    FWCfgState *fw_cfg;
    MachineState *machine = MACHINE(pcms);
    MachineClass *mc = MACHINE_GET_CLASS(machine);
    PCMachineClass *pcmc = PC_MACHINE_GET_CLASS(pcms);
    X86MachineState *x86ms = X86_MACHINE(pcms);
    hwaddr maxphysaddr, maxusedaddr;
    hwaddr cxl_base, cxl_resv_end = 0;
    X86CPU *cpu = X86_CPU(first_cpu);

    // 保证我们的内存大小是没有错误的
    assert(machine->ram_size == x86ms->below_4g_mem_size + x86ms->above_4g_mem_size);

    //...

    /*
     * Split single memory region and use aliases to address portions of it,
     * done for backwards compatibility with older qemus.
     */
    *ram_memory = machine->ram;
    // 把 ram_below_4g 的类型设置为 alias MR,指向 machine->ram,
    // offset 就是 0,size 就是 below_4g_mem_size,这很合理。同时
    // machine->ram 也是一块连续的 host memory,所以 ram_blow 和
    // above 应该设置为其 alias,很合理。
    memory_region_init_alias(ram_below_4g, NULL, "ram-below-4g", machine->ram,
                             0, x86ms->below_4g_mem_size);
    memory_region_add_subregion(system_memory, 0, ram_below_4g);
    e820_add_entry(0, x86ms->below_4g_mem_size, E820_RAM);
    if (x86ms->above_4g_mem_size > 0) {
        ram_above_4g = g_malloc(sizeof(*ram_above_4g));
        // 把 ram_above_4g 的类型设置为 alias MR,指向 machine->ram,
        // offset 就是 below_4g_mem_size,size 就是 above_4g_mem_size,这也很合理。
        memory_region_init_alias(ram_above_4g, NULL, "ram-above-4g",
                                 machine->ram,
                                 x86ms->below_4g_mem_size,
                                 x86ms->above_4g_mem_size);
        memory_region_add_subregion(system_memory, x86ms->above_4g_mem_start,
                                    ram_above_4g);
        e820_add_entry(x86ms->above_4g_mem_start, x86ms->above_4g_mem_size,
                       E820_RAM);
    }

    if (pcms->sgx_epc.size != 0) {
        e820_add_entry(pcms->sgx_epc.base, pcms->sgx_epc.size, E820_RESERVED);
    }

    if (!pcmc->has_reserved_memory &&
        (machine->ram_slots ||
         (machine->maxram_size > machine->ram_size))) {

        error_report("\"-memory 'slots|maxmem'\" is not supported by: %s",
                     mc->name);
        exit(EXIT_FAILURE);
    }

    /* always allocate the device memory information */
    machine->device_memory = g_malloc0(sizeof(*machine->device_memory));

    /* initialize device memory address space */
    if (pcmc->has_reserved_memory &&
        (machine->ram_size < machine->maxram_size)) {
        ram_addr_t device_mem_size;

        if (machine->ram_slots > ACPI_MAX_RAM_SLOTS) {
            error_report("unsupported amount of memory slots: %"PRIu64,
                         machine->ram_slots);
            exit(EXIT_FAILURE);
        }

        if (QEMU_ALIGN_UP(machine->maxram_size,
                          TARGET_PAGE_SIZE) != machine->maxram_size) {
            error_report("maximum memory size must by aligned to multiple of "
                         "%d bytes", TARGET_PAGE_SIZE);
            exit(EXIT_FAILURE);
        }

        pc_get_device_memory_range(pcms, &machine->device_memory->base, &device_mem_size);

        if ((machine->device_memory->base + device_mem_size) <
            device_mem_size) {
            error_report("unsupported amount of maximum memory: " RAM_ADDR_FMT,
                         machine->maxram_size);
            exit(EXIT_FAILURE);
        }

        memory_region_init(&machine->device_memory->mr, OBJECT(pcms),
                           "device-memory", device_mem_size);
        memory_region_add_subregion(system_memory, machine->device_memory->base,
                                    &machine->device_memory->mr);
    }

    if (pcms->cxl_devices_state.is_enabled) {
        MemoryRegion *mr = &pcms->cxl_devices_state.host_mr;
        hwaddr cxl_size = MiB;

        cxl_base = pc_get_cxl_range_start(pcms);
        memory_region_init(mr, OBJECT(machine), "cxl_host_reg", cxl_size);
        memory_region_add_subregion(system_memory, cxl_base, mr);
        cxl_resv_end = cxl_base + cxl_size;
        if (pcms->cxl_devices_state.fixed_windows) {
            hwaddr cxl_fmw_base;
            GList *it;

            cxl_fmw_base = ROUND_UP(cxl_base + cxl_size, 256 * MiB);
            for (it = pcms->cxl_devices_state.fixed_windows; it; it = it->next) {
                CXLFixedWindow *fw = it->data;

                fw->base = cxl_fmw_base;
                memory_region_init_io(&fw->mr, OBJECT(machine), &cfmws_ops, fw,
                                      "cxl-fixed-memory-region", fw->size);
                memory_region_add_subregion(system_memory, fw->base, &fw->mr);
                cxl_fmw_base += fw->size;
                cxl_resv_end = cxl_fmw_base;
            }
        }
    }

    /* Initialize PC system firmware */
    pc_system_firmware_init(pcms, rom_memory);

    if (!is_tdx_vm()) {
        option_rom_mr = g_malloc(sizeof(*option_rom_mr));
        memory_region_init_ram(option_rom_mr, NULL, "pc.rom", PC_ROM_SIZE,
                            &error_fatal);
        if (pcmc->pci_enabled) {
            memory_region_set_readonly(option_rom_mr, true);
        }
        memory_region_add_subregion_overlap(rom_memory,
                                            PC_ROM_MIN_VGA,
                                            option_rom_mr,
                                            1);
    }

    fw_cfg = fw_cfg_arch_create(machine,
                                x86ms->boot_cpus, x86ms->apic_id_limit);

    rom_set_fw(fw_cfg);

    if (pcmc->has_reserved_memory && machine->device_memory->base) {
        uint64_t *val = g_malloc(sizeof(*val));
        PCMachineClass *pcmc = PC_MACHINE_GET_CLASS(pcms);
        uint64_t res_mem_end = machine->device_memory->base;

        if (!pcmc->broken_reserved_end) {
            res_mem_end += memory_region_size(&machine->device_memory->mr);
        }

        if (pcms->cxl_devices_state.is_enabled) {
            res_mem_end = cxl_resv_end;
        }
        *val = cpu_to_le64(ROUND_UP(res_mem_end, 1 * GiB));
        fw_cfg_add_file(fw_cfg, "etc/reserved-memory-end", val, sizeof(*val));
    }

    if (linux_boot) {
        x86_load_linux(x86ms, fw_cfg, pcmc->acpi_data_size,
                       pcmc->pvh_enabled, pcmc->legacy_no_rng_seed);
    }

    for (i = 0; i < nb_option_roms; i++) {
        rom_add_option(option_rom[i].name, option_rom[i].bootindex);
    }
    x86ms->fw_cfg = fw_cfg;

    /* Init default IOAPIC address space */
    x86ms->ioapic_as = &address_space_memory;

    /* Init ACPI memory hotplug IO base address */
    pcms->memhp_io_base = ACPI_MEMORY_HOTPLUG_BASE;
}
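
作为补充,下面是一个最小的示意(不是 QEMU 源码,ram / system_memory 这些变量只是假设已经初始化好了),单独演示 alias MR 的用法:把一块 6GiB 的 RAM MR 按 4GiB 切成两段,分别挂到 system_memory 的不同 GPA 上,这和上面 pc_memory_init() 做的事情本质上是一样的:

// 示意代码:假设 ram 是一块已经初始化好的 6GiB RAM MR,system_memory 是根 MR
MemoryRegion *lo = g_new(MemoryRegion, 1);
MemoryRegion *hi = g_new(MemoryRegion, 1);

// [0, 4G):alias 指向 ram 的 offset 0,挂在 GPA 0
memory_region_init_alias(lo, NULL, "demo-ram-below-4g", ram, 0, 4 * GiB);
memory_region_add_subregion(system_memory, 0, lo);

// 剩下的 2G:alias 指向 ram 的 offset 4G;真实的 pc 机型会把它挂在 above_4g_mem_start
memory_region_init_alias(hi, NULL, "demo-ram-above-4g", ram, 4 * GiB, 2 * GiB);
memory_region_add_subregion(system_memory, 4 * GiB, hi);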

Case 2: From -m 2G to KVM_SET_USER_MEMORY_REGION (UPM)

Because we use it via -object memory-backend-memfd-private,id=ram1,size=16G, in qapi/qom.json we can see:

// Describes the options of a user creatable QOM object.
{ 'union': 'ObjectOptions',
  'base': { 'qom-type': 'ObjectType',
            'id': 'str' },
  'discriminator': 'qom-type',
  'data': {
      //...
      'memory-backend-memfd-private':       { 'type': 'MemoryBackendPrivateMemfdProperties',
                                      'if': 'CONFIG_LINUX' },
      //...
  } }

##
# @MemoryBackendPrivateMemfdProperties:
#
# Properties for memory-backend-memfd-private objects.
##
  { 'struct': 'MemoryBackendPrivateMemfdProperties',
  'base': 'MemoryBackendProperties',
  'data': { '*hugetlb': 'bool',
            '*hugetlbsize': 'size',
            '*seal': 'bool' } }

##
# @MemoryBackendProperties:
#
# Properties for objects of classes derived from memory-backend.
# ...
# @size: size of the memory region in bytes
# ...
##
{ 'struct': 'MemoryBackendProperties',
  'data': { '*dump': 'bool',
             //...
            'size': 'size',
            '*x-use-canonical-path-for-ramblock-id': 'bool' } }
static const TypeInfo priv_memfd_backend_info = {
    .name = TYPE_MEMORY_BACKEND_MEMFD_PRIVATE, // memory-backend-memfd-private
    .parent = TYPE_MEMORY_BACKEND,
    .instance_init = priv_memfd_backend_instance_init,
    .class_init = priv_memfd_backend_class_init,
    .instance_size = sizeof(HostMemoryBackendPrivateMemfd),
};
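
顺带一提,这样的 TypeInfo 一般是通过 type_init() 注册进 QOM type system 的,下面是惯用写法的示意(register 函数名是假设的,不一定和 patch 原文一致):

static void priv_memfd_backend_register_types(void)
{
    // 把上面的 TypeInfo 注册到 QOM type system
    type_register_static(&priv_memfd_backend_info);
}

type_init(priv_memfd_backend_register_types)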

// QEMU
priv_memfd_backend_class_init
    bc->alloc = priv_memfd_backend_memory_alloc;
    host_memory_backend_memory_complete
        bc->alloc(backend, &local_err);
            qemu_memfd_restricted
            // KVM

host_memory_backend_class_init
    host_memory_backend_set_size

HostMemoryBackendClass / host_memory_backend_info / TYPE_MEMORY_BACKEND

// TYPE_MEMORY_BACKEND_MEMFD_PRIVATE's parent
static const TypeInfo host_memory_backend_info = {
    .name = TYPE_MEMORY_BACKEND,
    .parent = TYPE_OBJECT,
    .abstract = true,
    .class_size = sizeof(HostMemoryBackendClass),
    .class_init = host_memory_backend_class_init,
    .instance_size = sizeof(HostMemoryBackend),
    .instance_init = host_memory_backend_init,
    .instance_post_init = host_memory_backend_post_init,
    .interfaces = (InterfaceInfo[]) {
        { TYPE_USER_CREATABLE },
        { }
    }
};
static const struct file_operations restrictedmem_fops = {
	.release = restrictedmem_release,
	.fallocate = restrictedmem_fallocate,
};

Memory Management in KVM

Initialization:

kvm_arch_init_vm
    kvm_mmu_init_vm
        kvm_mmu_init_tdp_mmu

Allocation:

kvm_mmu_get_shadow_page
    __kvm_mmu_get_shadow_page
        kvm_mmu_alloc_shadow_page
            kvm_mmu_memory_cache_alloc

// KVM_SET_USER_MEMORY_REGION
kvm_vm_ioctl_set_memory_region // wrapper: check memslot num
    kvm_set_memory_region // wrapper: lock
        __kvm_set_memory_region
            // sanity check on the arguments
            kvm_set_memslot
                kvm_prepare_memory_region

三种不同的 MMU

  • TDP (EPT) MMU
  • Shadow MMU
  • KVM SoftMMU

Shadow PTE (sPTE)

不要和 Shadow paging 搞混。

KVM 在还没有 EPT 的时候,采用的是影子页表(shadow page table)机制,为了和之前的代码兼容,EPT 机制是在影子页表机制代码的基础上实现的,所以 EPT 里面的 PTE (注意不是 guest 里的 PTE)和之前一样被叫做 shadow PTE.

GFN track

enum kvm_page_track_mode {
	KVM_PAGE_TRACK_WRITE,
	KVM_PAGE_TRACK_MAX,
};

// gfn_track 是一个二维数组:gfn_track[mode][index]
// index 表示的是 gfn。
// remove page 的时候 page 对应的数组项减 1
// add page 的时候 page 对应的数组项加 1
struct kvm_arch_memory_slot {
    //...
	unsigned short *gfn_track[KVM_PAGE_TRACK_MAX];
};
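
计数更新的逻辑大致如下(参考 kvm_page_track.c 里的 update_gfn_track(),细节随内核版本可能有出入,这里只是示意):

static void update_gfn_track(struct kvm_memory_slot *slot, gfn_t gfn,
			     enum kvm_page_track_mode mode, short count)
{
	int index, val;

	// 把 gfn 换算成 slot 内的下标(以 4K 页为粒度)
	index = gfn_to_index(gfn, slot->base_gfn, PG_LEVEL_4K);

	val = slot->arch.gfn_track[mode][index];

	// add page 时 count 是 1,remove page 时 count 是 -1
	if (WARN_ON(val + count < 0 || val + count > USHRT_MAX))
		return;

	slot->arch.gfn_track[mode][index] += count;
}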

Replace the given spte with an spte pointing to the provided page table.

顾名思义,link 就是把一个新的 page table link 到主要的 page table 上。link 的方式就是让主 page table 的一个 SPTE 指向这个新的 page table。

// sp 表示要 link 的 page table
static int tdp_mmu_link_sp(struct kvm *kvm, struct tdp_iter *iter,
			   struct kvm_mmu_page *sp, bool shared)
{
    // make_nonleaf_spte 表示这个 SPTE map 的不是一个 page,而是下一级的 page table
	u64 spte = make_nonleaf_spte(sp->spt, !kvm_ad_enabled());
    //...

	if (shared)
		tdp_mmu_set_spte_atomic(kvm, iter, spte);
	else
		tdp_mmu_set_spte(kvm, iter, spte);

	tdp_account_mmu_page(kvm, sp);

	return 0;
}

kvm_tdp_mmu_get_vcpu_root_hpa() / mmu_alloc_root() KVM

这两个函数作用是相同的,只不过一个是在 TDP 的语境下,另一个是在 shadow page table 的语境下。

每一个 VCPU 都有一个 root kvm_mmu_page,这两个函数就是为传入的 VCPU 生成这个 kvm_mmu_page。

hpa_t kvm_tdp_mmu_get_vcpu_root_hpa(struct kvm_vcpu *vcpu)
{
	union kvm_mmu_page_role role = vcpu->arch.mmu->root_role;
	struct kvm *kvm = vcpu->kvm;
	struct kvm_mmu_page *root;

    //...
    // 以前已经有了,直接返回即可,同时给 tdp_mmu_root_count 加 1 表示
    // 又多了一个人在用。
	for_each_tdp_mmu_root(kvm, root, kvm_mmu_role_as_id(role)) {
		if (root->role.word == role.word && kvm_tdp_mmu_get_root(root))
			goto out;
	}

	root = tdp_mmu_alloc_sp(vcpu);
	tdp_mmu_init_sp(root, NULL, 0, role);

	/*
	 * TDP MMU roots are kept until they are explicitly invalidated, either
	 * by a memslot update or by the destruction of the VM.  Initialize the
	 * refcount to two; one reference for the vCPU, and one reference for
	 * the TDP MMU itself, which is held until the root is invalidated and
	 * is ultimately put by tdp_mmu_zap_root_work().
	 */
	refcount_set(&root->tdp_mmu_root_count, 2);

    //...
    // 添加到 tdp_mmu_roots 中去,这样下次再调用这个函数的时候,如果
    // 在里面,就可以直接把其 tdp_mmu_root_count 加一并返回了。
	list_add_rcu(&root->link, &kvm->arch.tdp_mmu_roots);
    //...

out:
	return __pa(root->spt);
}

static hpa_t mmu_alloc_root(struct kvm_vcpu *vcpu, gfn_t gfn, int quadrant,
			    u8 level)
{
	union kvm_mmu_page_role role = vcpu->arch.mmu->root_role;
	struct kvm_mmu_page *sp;

	role.level = level;
	role.quadrant = quadrant;

    //...
    // 因为 shadow page table 机制并没有类似 tdp_mmu_roots 这种保存了
    // 所有 root 的结构,所以可以看到它是直接 get_shadow_page 的。
	sp = kvm_mmu_get_shadow_page(vcpu, gfn, role);
	++sp->root_count;
	return __pa(sp->spt);
}

to_shadow_page() KVM

把一个 HPA 转换成为 struct kvm_mmu_page

HPA 并不是 struct kvm_mmu_page 这个结构的地址,而是这个 struct kvm_mmu_page 所描述的 page 的地址。所以这可以理解为是一个反向的映射。
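
实现上,KVM 在 alloc shadow page 的时候会通过 set_page_private() 把 kvm_mmu_page 的指针存进 spt 所在 struct page 的 private 字段里,to_shadow_page() 就是反过来查一次(以下是示意,细节随版本可能有出入):

static inline struct kvm_mmu_page *to_shadow_page(hpa_t shadow_page)
{
	// HPA -> struct page,这个 page 就是 sp->spt 指向的页表页
	struct page *page = pfn_to_page(shadow_page >> PAGE_SHIFT);

	// page->private 里存的就是对应的 kvm_mmu_page 指针
	return (struct kvm_mmu_page *)page_private(page);
}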

mmu_set_spte() KVM(还没有全看懂)

作用就是把传进来的 sptep 指向传进来的(gfn, pfn)。

static int mmu_set_spte(struct kvm_vcpu *vcpu, struct kvm_memory_slot *slot,
			u64 *sptep, unsigned int pte_access, gfn_t gfn,
			kvm_pfn_t pfn, struct kvm_page_fault *fault)
{
    // sp 表示 spte 所在的这个页表
	struct kvm_mmu_page *sp = sptep_to_sp(sptep);
	int level = sp->role.level;
	int was_rmapped = 0;
	int ret = RET_PF_FIXED;
	bool flush = false;
	bool wrprot;
	u64 spte;

	/* Prefetching always gets a writable pfn.  */
	bool host_writable = !fault || fault->map_writable;
	bool prefetch = !fault || fault->prefetch;
	bool write_fault = fault && fault->write;

    // 一些 corner case...
    //...
	if (is_shadow_present_pte(*sptep)) {
		// If we overwrite a PTE page pointer with a 2MB PMD, unlink
		// the parent of the now unreachable PTE.
        // huge page case
		if (level > PG_LEVEL_4K && !is_large_pte(*sptep)) {
			struct kvm_mmu_page *child;
			u64 pte = *sptep;

			child = spte_to_child_sp(pte);
			drop_parent_pte(child, sptep);
			flush = true;
		} else if (pfn != spte_to_pfn(*sptep)) {
            //...
            // 如果我们要指向的 pfn 不是目前 sptep 指的
            // 那扔掉就行,然后 TLB Flush 一下
			drop_spte(vcpu->kvm, sptep);
			flush = true;
		} else
            // 如果目前已经指的和要映射的一样
            // 设置为 1
			was_rmapped = 1;
	}

    // 这一行是重点,根据 gfn, pfn 创建一个新的 SPTE。
	wrprot = make_spte(vcpu, sp, slot, pte_access, gfn, pfn, *sptep, prefetch,
			   true, host_writable, &spte);

	if (*sptep == spte) {
		ret = RET_PF_SPURIOUS;
	} else {
		flush |= mmu_spte_update(sptep, spte);
	}

	if (wrprot) {
		if (write_fault)
			ret = RET_PF_EMULATE;
	}

    // flush 一下 tlb
	if (flush)
		kvm_flush_remote_tlbs_gfn(vcpu->kvm, gfn, level);
    //...

	if (!was_rmapped) {
		WARN_ON_ONCE(ret == RET_PF_SPURIOUS);
		rmap_add(vcpu, slot, sptep, gfn, pte_access);
	} else {
		/* Already rmapped but the pte_access bits may have changed. */
		kvm_mmu_page_set_access(sp, spte_index(sptep), pte_access);
	}

	return ret;
}

struct kvm_mmu_page KVM

用来描述一个页表页(注意,是 KVM 里维护的 shadow page table,而不是 guest 的页表),也就是说,这个页是用来存放页表(通常有 512 个 entry)的。

可以看作是一个表,因为一个 entry (such as PTE) 是 8 个字节,所以一个 4KB 的页正好可以存放 512 个 entry。
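
对应到 KVM 代码里就是 SPTE_ENT_PER_PAGE 这个常量(示意,宏名和定义以实际内核版本为准):

// 一个 entry 8 字节,一个 4KB 页正好放 4096 / 8 = 2^9 = 512 个
#define SPTE_LEVEL_BITS		9
#define SPTE_ENT_PER_PAGE	BIT_ULL(SPTE_LEVEL_BITS)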

struct kvm_mmu_page {
	/*
	 * Note, "link" through "spt" fit in a single 64 byte cache line on
	 * 64-bit kernels, keep it that way unless there's a reason not to.
	 */
    // 用来把这个 page 挂到 active_mmu_pages 等 list 里。
	struct list_head link;
	struct hlist_node hash_link;

    // 表示这个页表页属于 TDP MMU,而不是 shadow MMU
	bool tdp_mmu_page;
    // 只在 shadow paging 下使用:表示这个页对应的 guest 页表可能已经被
    // guest 改过(没有 write-protect),SPTE 还没有和 guest PTE 同步
	bool unsync;


    // 下面两个用来表示这个 mmu page 所映射的 GFN 范围
    // gfn 表示的是**起始 gfn**,也就是第一个 entry 的 gfn
    // role 表示这个 page 在页表的第几级,这决定了这个页表能表示的 gfn 的多少
	union kvm_mmu_page_role role;
	gfn_t gfn;

    // Currently serving as active root
    // union 里的 field 是通过有没有使用 TDP,也就是 tdp_mmu_page 来区分的
	union {
        // 没有使用 TDP 的时候,是这个
		int root_count;

        // 使用了 TDP 的时候,是这个
        // its increased by kvm_tdp_mmu_get_root()
        // and decreased by kvm_tdp_mmu_put_root()
        // 初始化为 2,one reference for the vCPU, and one reference for the TDP MMU itself
		refcount_t tdp_mmu_root_count;
	};

    // number of unsync subentries, up to 512
    unsigned int unsync_children;

    // 其所描述的页表页的地址 HVA,这个结构体只是一个描述结构体,spt 指向的才是真正的页。
    // 为什么指向 u64 而不是 void 一样指向一个 page 呢,因为一个 entry 是 64bit,这个 spt
    // 指向的是第一个 entry,可以通过 spt[i] 来访问第 i 个 entry,这样很方便。
	u64 *spt;
    // 其所描述的 private 页表页的地址 HVA
    // 因为我们其实是两个 EPT,一个是在 KVM(VMM)里所维护的 EPT,另一个
    // 是 TDX Module 在其 private memory 里自己维护的。所以这个 attribute
    // 表示的其实就是 TDX Module 里维护的信息,和我们 KVM 里维护的对应起来。
    // KVM 里的 PT 内容和 TDX Module 自己维护的不一样,所以我们需要另外一个内存。
    void *private_spt;

	union {
        // Each entry can have up to 512 sub-entries.
		DECLARE_BITMAP(unsync_child_bitmap, 512);
        //...
	};

    // 这个 shadow page 如果被 zap 了,会不会用来重建一个 NX huge page.
    // 暂时还没有搞清楚
	struct list_head possible_nx_huge_page_link;

    /*
	 * KVM shadows two types of guest translations:
	 *  - nGPA -> GPA (shadow EPT/NPT) and,
	 *  - GVA ->  GPA (traditional shadow paging).
	 * In both cases the result of the translation is a GPA.
     */ 
    // 里面存了 GFN,也就是映射的结果。
    // 只有在 !role.direct 的时候才会有用。
    // 长度和一个 page 里能放的 spte entry 数量相等
	u64 *shadowed_translation;
    //...
};

struct kvm_mmu KVM

kvm_mmu 表示的是 shadow MMU,而不是 guest 的 MMU。因此 kvm_mmu 里的几级页表什么的信息表示的是 shadow PT 的信息,而不是 guest PT 的。

/*
 * x86 supports 4 paging modes (5-level 64-bit, 4-level 64-bit, 3-level 32-bit,
 * and 2-level 32-bit).  The kvm_mmu structure abstracts the details of the
 * current mmu mode.
 */
struct kvm_mmu {
	unsigned long (*get_guest_pgd)(struct kvm_vcpu *vcpu);
	u64 (*get_pdptr)(struct kvm_vcpu *vcpu, int index);
    // 处理的 page fault 的函数。
    // 如果是普通的基于软件的 MMU,根据 paging mode 的不同,可能为:
    //  - nonpaging_page_fault
    //  - paging64_page_fault // 大多数应该是这个 mode
    //  - paging32_page_fault
    // 如果是 EPT MMU,
    //  - kvm_tdp_page_fault
    // 如果是 nested EPT 的情况
    //  - ept_page_fault
	int (*page_fault)(struct kvm_vcpu *vcpu, struct kvm_page_fault *fault);
	void (*inject_page_fault)(struct kvm_vcpu *vcpu,
				  struct x86_exception *fault);
	gpa_t (*gva_to_gpa)(struct kvm_vcpu *vcpu, struct kvm_mmu *mmu,
			    gpa_t gva_or_gpa, u64 access,
			    struct x86_exception *exception);
	int (*sync_page)(struct kvm_vcpu *vcpu,
			 struct kvm_mmu_page *sp);
	void (*invlpg)(struct kvm_vcpu *vcpu, gva_t gva, hpa_t root_hpa);
	struct kvm_mmu_root_info root;

    // TDX 下 private (Secure-EPT) root 页表页的物理地址;
    // 普通 (shared) 的 root 在上面的 root.hpa 里,EPT 情况下
    // root.hpa 会被用来构造 VMCS 里的 EPT_pointer(见 construct_eptp())
	hpa_t private_root_hpa;
	union kvm_cpu_role cpu_role;

    // root page 的 role
	union kvm_mmu_page_role root_role;

    //...
	struct kvm_mmu_root_info prev_roots[KVM_MMU_NUM_PREV_ROOTS];

	/*
	 * Bitmap; bit set = permission fault
	 * Byte index: page fault error code PFEC [4:1]
	 * Bit index: pte permissions in ACC_* format
	 */
	u8 permissions[16];

	u64 *pae_root;
	u64 *pml4_root;
	u64 *pml5_root;

	/*
	 * check zero bits on shadow page table entries, these
	 * bits include not only hardware reserved bits but also
	 * the bits spte never used.
	 */
	struct rsvd_bits_validate shadow_zero_check;

	struct rsvd_bits_validate guest_rsvd_check;

	bool no_prefetch;

    // pdptr, used in PAE
	u64 pdptrs[4]; /* pae */
};

mmu, root_mmu, guest_mmu, nested_mmu, walk_mmu / KVM

struct kvm_vcpu_arch {
	// This context is always used to handle faults.
	struct kvm_mmu *mmu;

	/* L1 MMU when not nested */
	struct kvm_mmu root_mmu;

	/* L1 MMU when running nested */
	struct kvm_mmu guest_mmu;

	/*
	 * Paging state of an L2 guest (used for nested npt)
	 *
	 * This context will save all necessary information to walk page tables
	 * of an L2 guest. This context is only initialized for page table
	 * walking and not for faulting since we never handle l2 page faults on
	 * the host.
	 */
	struct kvm_mmu nested_mmu;

	/*
	 * Pointer to the mmu context currently used for
	 * gva_to_gpa translations.
	 */
	struct kvm_mmu *walk_mmu;
}

在刚开始 create 的时候,mmu 和 walk_mmu 都指向 root_mmu:

int kvm_mmu_create(struct kvm_vcpu *vcpu)
{
    //...
	vcpu->arch.mmu = &vcpu->arch.root_mmu;
	vcpu->arch.walk_mmu = &vcpu->arch.root_mmu;
    // guest_mmu 为什么在 create vCPU 的时候就创建呢?为什么不是在 L2 guest 起的时候。
	__kvm_mmu_create(vcpu, &vcpu->arch.guest_mmu);
	__kvm_mmu_create(vcpu, &vcpu->arch.root_mmu);
    //...
}

mmu 会在 root_mmu 和 guest_mmu 之间切换,这是为了 handle nested EPT 的情况,比如:

  • 会在 nested_ept_init_mmu_context() 中切到 guest_mmu
  • 并在 nested_ept_uninit_mmu_context() 中切回 root_mmu

可以看出来大多数情况下 mmu 和 root_mmu 是相等的。

每一个 vCPU 的 root_mmu 的页的物理地址都是相等的吗?

是的(role 相同的 vCPU 会共享同一个 root 页表页),这也是我们需要 MMU lock 的原因之一:防止多个 vCPU 并发修改同一套页表时出现同步问题。我们可以看下面的 call trace:

mmu_alloc_direct_roots(struct kvm_vcpu *vcpu)
    if (tdp_mmu_enabled)
        root = kvm_tdp_mmu_get_vcpu_root_hpa(vcpu, true);
            kvm_tdp_mmu_get_vcpu_root
                kvm_tdp_mmu_get_vcpu_root_no_alloc(vcpu, role);
                    // 这里会遍历所有的 mmu roots,如果有的话就直接返回
                    // 就不需要重新 alloc 了,相当于一个 cache。
                    for_each_tdp_mmu_root(kvm, root, kvm_mmu_role_as_id(role)) {
                        // 会增加 root->tdp_mmu_root_count,表示又多了一个 reference 的人
                		if (root->role.word == role.word && kvm_tdp_mmu_get_root(root))
                			return root;
                	}
                // 如果 cache 里没有,分配一个并加入到 cache(tdp_mmu_roots) 当中。
                root = tdp_mmu_alloc_sp(vcpu, role);
                list_add_rcu(&root->link, &kvm->arch.tdp_mmu_roots);

MMU in struct kvm_arch

struct kvm_arch {
    // Whether the TDP MMU is enabled for this VM
    // set in kvm_mmu_init_tdp_mmu()
    bool tdp_mmu_enabled;

    // kvm_mmu_alloc_shadow_page
	struct list_head active_mmu_pages;
	struct list_head private_mmu_pages;
}

kvm_mmu_create()

kvm_mmu_load_pending_pgd() KVM

kvm_mmu_reload() KVM

vcpu_enter_guest
    kvm_mmu_reload

kvm_mmu_load() KVM

只被 kvm_mmu_reload 调用。
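
kvm_mmu_reload() 本身很短:只有在当前 root 无效(INVALID_PAGE,比如被 free/invalidate 过)时才会真正调用 kvm_mmu_load() 重新分配并加载 root(示意,摘自 mmu.h 的大致逻辑,细节随版本可能有出入):

static inline int kvm_mmu_reload(struct kvm_vcpu *vcpu)
{
	// root 还有效,什么都不用做
	if (likely(vcpu->arch.mmu->root.hpa != INVALID_PAGE))
		return 0;

	// root 无效了,重新 load:分配 root 页表页、写 EPTP/CR3 等
	return kvm_mmu_load(vcpu);
}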

struct kvm_mmu_page_role KVM

kvm_mmu_page_role tracks the properties of a shadow page (where shadow page also includes TDP pages) to determine whether or not a page can be used in the given MMU context.

union kvm_mmu_page_role {
    // 这个是一个 identifier,每一个 kvm_mmu_page 的这个都不一样,
    // 因为这是一个 union,下面 struct 的值如果在 KVM 中处理时正确
    // 的话,本来就不会是一样的。
	u32 word;
	struct {
        //...
        // 此页表页所在这个多级页表里的第几级
        // 当是一个非 root page 时,取值是枚举类型 pg_level
        // 比如说 PG_LEVEL_4K 表示这个页的 512 entry
        // 里面的每一项 PTE 都是 leaf PTE 了。
        // enum pg_level {
    	//   PG_LEVEL_NONE,
    	//   PG_LEVEL_4K,
    	//   PG_LEVEL_2M,
    	//   PG_LEVEL_1G,
    	//   PG_LEVEL_512G,
    	//   PG_LEVEL_NUM
    	// }
        // 如果这个 page 表示一个 root,那么这个 root 的 level 可能是:
        // #define PT64_ROOT_5LEVEL 5
        // #define PT64_ROOT_4LEVEL 4
        // #define PT32_ROOT_LEVEL 2
        // #define PT32E_ROOT_LEVEL 3
		unsigned level:4;
        // 这个 page role 所描述的 SPTE 是 secure PTE (TDX) 的吗?
		unsigned is_private:1;
        // 只有在两种情况下,direct 才会是 1:TDP 和 Guest 在实模式
        // Guest 在实模式时,GVA 和 GPA 的值都是相等的,所以我们应该直接
        // 把 Guest 来的值当成 GPA 来处理,而不是 GVA。当我们已经在保护模式,同时我们用的是
        // shadow page table 而不是 TDP 的情况时,page fault 发生时给的是 CR2 也就是 GVA。
		unsigned direct:1;
        // ...
	};
};

vmx_load_mmu_pgd() / kvm_mmu_load_pgd() KVM

当我们没有用 EPT 时,函数简化为两行。不难看出,这个函数的作用就是把 Guest 的页表基址给换了。

void vmx_load_mmu_pgd(struct kvm_vcpu *vcpu, hpa_t root_hpa, int root_level)
{
	guest_cr3 = root_hpa | kvm_get_active_pcid(vcpu);
	vmcs_writel(GUEST_CR3, guest_cr3);
}

当有 EPT 时,CR3 和 EPTP 都要考虑,一个负责 GVA -> GPA, 一个负责 GPA->HPA:

void vmx_load_mmu_pgd(struct kvm_vcpu *vcpu, hpa_t root_hpa, int root_level)
{
	struct kvm *kvm = vcpu->kvm;
	bool update_guest_cr3 = true;
	unsigned long guest_cr3;
	u64 eptp;

    // root_hpa 只是地址信息,缺少一些元信息,不能直接作为 EPTP,所以这里要 construct
    eptp = construct_eptp(vcpu, root_hpa, root_level);
    vmcs_write64(EPT_POINTER, eptp);

    if (!enable_unrestricted_guest && !is_paging(vcpu))
        guest_cr3 = to_kvm_vmx(kvm)->ept_identity_map_addr;
    else if (kvm_register_is_dirty(vcpu, VCPU_EXREG_CR3))
        guest_cr3 = vcpu->arch.cr3;
    else /* vmcs.GUEST_CR3 is already up-to-date. */
        update_guest_cr3 = false;

    // write to guest's PDPTR
    vmx_ept_load_pdptrs(vcpu);

	if (update_guest_cr3)
		vmcs_writel(GUEST_CR3, guest_cr3);
}
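
上面用到的 construct_eptp() 做的事情就是把 root_hpa 和 memory type、page-walk level、A/D bit 这些元信息拼成 EPTP(示意,细节随版本可能有出入):

u64 construct_eptp(struct kvm_vcpu *vcpu, hpa_t root_hpa, int root_level)
{
	u64 eptp = VMX_EPTP_MT_WB;   // memory type: write-back

	// EPT page walk 的级数:4 级或者 5 级
	eptp |= (root_level == 5) ? VMX_EPTP_PWL_5 : VMX_EPTP_PWL_4;

	// 是否启用 EPT 的 Accessed/Dirty bit
	if (enable_ept_ad_bits &&
	    (!is_guest_mode(vcpu) || nested_ept_ad_enabled(vcpu)))
		eptp |= VMX_EPTP_AD_ENABLE_BIT;

	// 最后放上 root 页表页的 HPA
	eptp |= root_hpa;

	return eptp;
}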

vmx_ept_load_pdptrs()

void vmx_ept_load_pdptrs(struct kvm_vcpu *vcpu)
{
	struct kvm_mmu *mmu = vcpu->arch.walk_mmu;

	if (!kvm_register_is_dirty(vcpu, VCPU_EXREG_PDPTR))
		return;

	if (is_pae_paging(vcpu)) {
		vmcs_write64(GUEST_PDPTR0, mmu->pdptrs[0]);
		vmcs_write64(GUEST_PDPTR1, mmu->pdptrs[1]);
		vmcs_write64(GUEST_PDPTR2, mmu->pdptrs[2]);
		vmcs_write64(GUEST_PDPTR3, mmu->pdptrs[3]);
	}
}

.merge_private_spt / tdp_mmu_merge_private_spt()

Merge 512 consecutive 4KB or 2MB private TD page mappings into one 2MB or 1GB page mapping respectively.

应该是为了支持大页吧,每一个 mapping 的粒度变大了,带来的好处是占用的空间变少了。

static int tdp_mmu_merge_private_spt(struct kvm_vcpu *vcpu,
				     struct kvm_page_fault *fault,
				     struct tdp_iter *iter, u64 new_spte)
{
	u64 *sptep = rcu_dereference(iter->sptep);
	struct kvm_mmu_page *child_sp;
	struct kvm *kvm = vcpu->kvm;
	struct tdp_iter child_iter;
	bool ret_pf_retry = false;
	int level = iter->level;
	gfn_t gfn = iter->gfn;
	u64 old_spte = *sptep;
	tdp_ptep_t child_pt;
	u64 child_spte;
	int ret = 0;
	int i;

	/*
	 * TDX KVM supports only 2MB large page.  It's not supported to merge
	 * 2MB pages into 1GB page at the moment.
	 */
	WARN_ON_ONCE(fault->goal_level != PG_LEVEL_2M);
	WARN_ON_ONCE(iter->level != PG_LEVEL_2M);
	WARN_ON_ONCE(!is_large_pte(new_spte));

	/* Freeze the spte to prevent other threads from working spte. */
	if (!try_cmpxchg64(sptep, &iter->old_spte, REMOVED_SPTE))
		return -EBUSY;

	/*
	 * Step down to the child spte.  Because tdp_iter_next() assumes the
	 * parent spte isn't freezed, do it manually.
	 */
	child_pt = spte_to_child_pt(iter->old_spte, iter->level);
	child_sp = sptep_to_sp(child_pt);
	WARN_ON_ONCE(child_sp->role.level != PG_LEVEL_4K);
	WARN_ON_ONCE(!kvm_mmu_page_role_is_private(child_sp->role));

	/* Don't modify iter as the caller will use iter after this function. */
	child_iter = *iter;
	/* Adjust the target gfn to the head gfn of the large page. */
	child_iter.next_last_level_gfn &= -KVM_PAGES_PER_HPAGE(level);
	tdp_iter_step_down(&child_iter, child_pt);

	/*
	 * All child pages are required to be populated for merging them into a
	 * large page.  Populate all child spte.
	 */
	for (i = 0; i < SPTE_ENT_PER_PAGE; i++, tdp_iter_step_side(&child_iter)) {
		WARN_ON_ONCE(child_iter.level != PG_LEVEL_4K);
		if (is_shadow_present_pte(child_iter.old_spte)) {
			/* TODO: relocate page for huge page. */
			WARN_ON_ONCE(spte_to_pfn(child_iter.old_spte) != spte_to_pfn(new_spte) + i);
			continue;
		}

		WARN_ON_ONCE(is_private_zapped_spte(old_spte) &&
			     spte_to_pfn(child_iter.old_spte) != spte_to_pfn(new_spte) + i);
		child_spte = make_huge_page_split_spte(kvm, new_spte, child_sp->role, i);
		/*
		 * Because other thread may have started to operate on this spte
		 * before freezing the parent spte,  Use atomic version to
		 * prevent race.
		 */
		ret = tdp_mmu_set_spte_atomic(vcpu->kvm, &child_iter, child_spte);
		if (ret == -EBUSY || ret == -EAGAIN)
			/*
			 * There was a race condition.  Populate remaining 4K
			 * spte to resolve fault->gfn to guarantee the forward
			 * progress.
			 */
			ret_pf_retry = true;
		else if (ret)
			goto out;
	}
	if (ret_pf_retry) {
		ret = RET_PF_RETRY;
		goto out;
	}

	/* Prevent the Secure-EPT entry from being used. */
	ret = static_call(kvm_x86_zap_private_spte)(kvm, gfn, level);
	if (ret)
		goto out;
	kvm_flush_remote_tlbs_with_address(kvm, gfn, KVM_PAGES_PER_HPAGE(level));

	/* Merge pages into a large page. */
	ret = static_call(kvm_x86_merge_private_spt)(kvm, gfn, level,
						     kvm_mmu_private_spt(child_sp));
	/*
	 * Failed to merge pages because some pages are accepted and some are
	 * pending.  Since the child page was mapped above, let vcpu run.
	 */
	if (ret == -EAGAIN)
		ret = RET_PF_RETRY;
	if (ret)
		goto unzap;

	/* Unfreeze spte. */
	__kvm_tdp_mmu_write_spte(sptep, new_spte);

	/*
	 * Free unused child sp.  Secure-EPT page was already freed at TDX level
	 * by kvm_x86_merge_private_spt().
	 */
	tdp_unaccount_mmu_page(kvm, child_sp);
	tdp_mmu_free_sp(child_sp);
	return RET_PF_RETRY;

unzap:
	if (static_call(kvm_x86_unzap_private_spte)(kvm, gfn, level))
		old_spte = __private_zapped_spte(old_spte);
out:
	__kvm_tdp_mmu_write_spte(sptep, old_spte);
	return ret;
}

SPT remove / destroy process

// hkid have been freed before
kvm_arch_destroy_vm
    kvm_unload_vcpu_mmus
        kvm_unload_vcpu_mmu
            kvm_mmu_unload
                __kvm_mmu_unload
                    kvm_mmu_free_roots(&vcpu->arch.root_mmu);
                        // private spt
                        mmu_free_root_page(kvm, &mmu->private_root_hpa, &invalid_list);
                                kvm_tdp_mmu_put_root
                                    tdp_mmu_schedule_zap_root
                                        tdp_mmu_zap_root_work

drop_spte // ...

kvm_mmu_free_roots() KVM

void kvm_mmu_free_roots(struct kvm *kvm, struct kvm_mmu *mmu,
			ulong roots_to_free)
{
	int i;
	LIST_HEAD(invalid_list);
	bool free_active_root;

	free_active_root = (roots_to_free & KVM_MMU_ROOT_CURRENT) && VALID_PAGE(mmu->root.hpa);

    //...
	if (free_active_root) {
		if (to_shadow_page(mmu->root.hpa)) {
			mmu_free_root_page(kvm, &mmu->root.hpa, &invalid_list);
		} else if (mmu->pae_root) {
			for (i = 0; i < 4; ++i) {
				if (!IS_VALID_PAE_ROOT(mmu->pae_root[i]))
					continue;

				mmu_free_root_page(kvm, &mmu->pae_root[i],
						   &invalid_list);
				mmu->pae_root[i] = INVALID_PAE_ROOT;
			}
		}
		mmu->root.hpa = INVALID_PAGE;
		mmu->root.pgd = 0;
	}

	if (roots_to_free & KVM_MMU_ROOT_PRIVATE)
		mmu_free_root_page(kvm, &mmu->private_root_hpa, &invalid_list);

	kvm_mmu_commit_zap_page(kvm, &invalid_list);
	write_unlock(&kvm->mmu_lock);
}

KVM_SET_USER_MEMORY_REGION

Create, modify or delete a guest physical memory slot. Bits 0-15 of "slot" specify the slot id and this value should be less than the maximum number of user memory slots supported per VM. Slots may not overlap in guest physical address space.

Deleting a slot is done by passing zero for memory_size.

When changing an existing slot, it may be moved in the guest physical memory space, or its flags may be modified, but it may not be resized.

It is recommended to use this API instead of the KVM_SET_MEMORY_REGION ioctl. The KVM_SET_MEMORY_REGION does not allow fine grained control over memory allocation and is deprecated.

这个 ioctl 的参数是一个 struct kvm_userspace_memory_region_ext
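
在没有 private memory 的普通场景下,userspace 调用这个 ioctl 大概长这样(示意代码,省略了错误处理,vm_fd 等变量是假设已经拿到的):

#include <sys/ioctl.h>
#include <sys/mman.h>
#include <linux/kvm.h>

struct kvm_userspace_memory_region region = {
    .slot            = 0,
    .flags           = 0,
    .guest_phys_addr = 0x0,                  // GPA
    .memory_size     = 2 * 1024 * 1024,      // 2 MiB
    // HVA:mmap 一块匿名内存当作这段 guest RAM 的 backing
    .userspace_addr  = (__u64)(unsigned long)mmap(NULL, 2 * 1024 * 1024,
                                                  PROT_READ | PROT_WRITE,
                                                  MAP_PRIVATE | MAP_ANONYMOUS, -1, 0),
};

ioctl(vm_fd, KVM_SET_USER_MEMORY_REGION, &region);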

A memory change can be:

enum kvm_mr_change {
	KVM_MR_CREATE, // create a new memory region
	KVM_MR_DELETE, // delete an existing memory region
	KVM_MR_MOVE, // move an existing memory region (change its GPA)
	KVM_MR_FLAGS_ONLY, // only change its flags
};

Struct kvm_userspace_memory_region / kvm_userspace_memory_region_ext QEMU

可以说一个 struct kvm_userspace_memory_region 和 kvm_memory_slot 对应,只不过前者的目的主要是用来让 userspace 配置 kvm_memory_slot。

/* for KVM_SET_USER_MEMORY_REGION */
struct kvm_userspace_memory_region {
	__u32 slot; // slot id
    // KVMSlot 的 flag
    // 比如 KVM_MEM_PRIVATE 表示这是 TDX
    // KVM_MEM_LOG_DIRTY_PAGES 表示要让 KVM 来 track 对于这个 slot memory 的 write
    // KVM_MEM_READONLY 表示这个 slot 是只读的
	__u32 flags;
	__u64 guest_phys_addr; // GPA
	__u64 memory_size;
	__u64 userspace_addr; // HVA not HPA
};

// This is added by the UPM patchset for **private** memory
// if (flags & KVM_MEM_PRIVATE)
//     size = sizeof(struct kvm_userspace_memory_region_ext);
// else
//     size = sizeof(struct kvm_userspace_memory_region);
struct kvm_userspace_memory_region_ext {
	struct kvm_userspace_memory_region region;
	__u64 restricted_offset;
	__u32 restricted_fd;
	__u32 pad1;
	__u64 pad2[14];
};

Struct kvm_user_mem_region KVM

KVM 里的这个结构体就是 QEMU 里 kvm_userspace_memory_region_ext 展开的样子。

这个结构体也是在 UPM patch 中被引入的。

/*
 * kvm_user_mem_region is a kernel-only **alias** of kvm_userspace_memory_region_ext
 * that "unpacks" kvm_userspace_memory_region so that KVM can directly access
 * all fields from the top-level "extended" region.
 */
struct kvm_user_mem_region {
	__u32 slot;
	__u32 flags;
	__u64 guest_phys_addr;
	__u64 memory_size;
	__u64 userspace_addr;
	__u64 restricted_offset;
	__u32 restricted_fd;
	__u32 pad1;
	__u64 pad2[14];
};

Note: this struct is also in linux-headers/linux/kvm.h, which means QEMU and KVM both have this struct.

__kvm_set_memory_region() KVM

宗旨只有一个:对已经存在的 slot,只能删除、移动(改变 GPA)或者改变 flag,其它的更改比如改变大小都是不被允许的。

kvm_vm_ioctl
    kvm_vm_ioctl_set_memory_region
        kvm_set_memory_region
            __kvm_set_memory_region
int __kvm_set_memory_region(struct kvm *kvm, const struct kvm_user_mem_region *mem)
{
	struct kvm_memory_slot *old, *new;
	struct kvm_memslots *slots;
	enum kvm_mr_change change;
	unsigned long npages;
	gfn_t base_gfn;
	int as_id, id;
	int r;

    // check the flags
	/* General sanity checks */

	as_id = mem->slot >> 16;
	id = (u16)mem->slot;

    // find all slots in this address space
    // 可以理解为找到内存空间中的所有内存插槽
	slots = __kvm_memslots(kvm, as_id);

    // 进一步找到对应的 slot
	old = id_to_memslot(slots, id);

    // memory_size == 0 被视作删除这个 memslot(KVM_MR_DELETE)
	if (!mem->memory_size)
		return kvm_set_memslot(kvm, old, NULL, KVM_MR_DELETE);

    // 看看这个 memslot 要从 guest 物理内存空间中的第几个页开始,
    // 大小是几个页。
	base_gfn = (mem->guest_phys_addr >> PAGE_SHIFT);
	npages = (mem->memory_size >> PAGE_SHIFT);

	if (!old || !old->npages) {
        // 如果老的为空,那自然就是 KVM_MR_CREATE
		change = KVM_MR_CREATE;
	} else { /* Modify an existing slot. */
        // 既然是对已有 memslot 的更改,那么需要满足以下几个条件:
        //  - HVA 要相等(不允许改变一个已有 slot 的 userspace_addr)
        //  - 大小也要相等
        //  - 改之前的和之后的都不能是 READONLY 的
		if ((mem->userspace_addr != old->userspace_addr) ||
		    (npages != old->npages) ||
		    ((mem->flags ^ old->flags) & KVM_MEM_READONLY))
			return -EINVAL;

        // gfn 变了,说明是要把这个 slot 换一个地方
		if (base_gfn != old->base_gfn)
			change = KVM_MR_MOVE;
        // flag 变了,那就是只需要换一个 flag
		else if (mem->flags != old->flags)
			change = KVM_MR_FLAGS_ONLY;
		else /* Nothing to change. */
			return 0;
	}

    // 如果是添加一个 slot,或者是更改一个 slot,那么不要跟原来的重合。
	if ((change == KVM_MR_CREATE || change == KVM_MR_MOVE) &&
	    kvm_check_memslot_overlap(slots, id, base_gfn, base_gfn + npages))
		return -EEXIST;

    // 创建一个新的 slot
	new = kzalloc(sizeof(*new), GFP_KERNEL_ACCOUNT);
	new->as_id = as_id;
	new->id = id;
	new->base_gfn = base_gfn;
	new->npages = npages;
	new->flags = mem->flags;
	new->userspace_addr = mem->userspace_addr;

    // TDX 相关的
	if (mem->flags & KVM_MEM_PRIVATE) {
		new->restricted_file = fget(mem->restricted_fd);
		new->restricted_offset = mem->restricted_offset;
	}

	new->kvm = kvm;
    // old 可能是空的(CREATE),否则就是 MOVE / FLAGS_ONLY 这类对已有 slot 的更改
	r = kvm_set_memslot(kvm, old, new, change);
    //...
	return r;
}

kvm_set_memslot() KVM

会分情况调用四个函数:

  • kvm_create_memslot
  • kvm_delete_memslot
  • kvm_move_memslot
  • kvm_update_flags_memslot
static int kvm_set_memslot(struct kvm *kvm,
			   struct kvm_memory_slot *old,
			   struct kvm_memory_slot *new,
			   enum kvm_mr_change change)
{
	struct kvm_memory_slot *invalid_slot;
	int r;

    //...
    // DELETE 和 MOVE 这两种情况都是需要 invalidate 原来的 slot 的
    // 注意,invalidate 之后和 delete 是不一样的
	if (change == KVM_MR_DELETE || change == KVM_MR_MOVE) {
		invalid_slot = kzalloc(sizeof(*invalid_slot), GFP_KERNEL_ACCOUNT);
		kvm_invalidate_memslot(kvm, old, invalid_slot);
	}

	r = kvm_prepare_memory_region(kvm, old, new, change);
    // failure handling

	/*
	 * For DELETE and MOVE, the working slot is now active as the INVALID
	 * version of the old slot.  MOVE is particularly special as it reuses
	 * the old slot and returns a copy of the old slot (in working_slot).
	 * For CREATE, there is no old slot.  For DELETE and FLAGS_ONLY, the
	 * old slot is detached but otherwise preserved.
	 */
	if (change == KVM_MR_CREATE)
		kvm_create_memslot(kvm, new);
	else if (change == KVM_MR_DELETE)
		kvm_delete_memslot(kvm, old, invalid_slot);
	else if (change == KVM_MR_MOVE)
		kvm_move_memslot(kvm, old, new, invalid_slot);
	else if (change == KVM_MR_FLAGS_ONLY)
		kvm_update_flags_memslot(kvm, old, new);

	/* Free the temporary INVALID slot used for DELETE and MOVE. */
	if (change == KVM_MR_DELETE || change == KVM_MR_MOVE)
		kfree(invalid_slot);

	kvm_commit_memory_region(kvm, old, new, change);
	return 0;
}

kvm_mmu_page_get_gfn() KVM

sp 里保存了这个 shadow MMU page 所要 map 的 base GFN(sp->gfn),据此计算即可。

static gfn_t kvm_mmu_page_get_gfn(struct kvm_mmu_page *sp, int index)
{
	if (sp->role.passthrough)
		return sp->gfn;

	if (!sp->role.direct)
		return sp->shadowed_translation[index] >> PAGE_SHIFT;

	return sp->gfn + (index << ((sp->role.level - 1) * SPTE_LEVEL_BITS));
}

drop_spte() KVM

static void drop_spte(struct kvm *kvm, u64 *sptep)
{
	u64 old_spte = mmu_spte_clear_track_bits(kvm, sptep);

	if (is_shadow_present_pte(old_spte) || is_private_zapped_spte(old_spte))
		rmap_remove(kvm, sptep, old_spte);
}

kvm_create_memslot() KVM

Not finished….

static void kvm_create_memslot(struct kvm *kvm,
			       struct kvm_memory_slot *new)
{
	/* Add the new memslot to the inactive set and activate. */
	kvm_replace_memslot(kvm, NULL, new);
	kvm_activate_memslot(kvm, NULL, new);
}

kvm_prepare_memory_region() KVM

这个 memory region 指的可不是 QEMU 里的 MemoryRegion, 这个指的其实是一个 kvm_memory_slot,只不过不知道为啥名字一直没有改。

static int kvm_prepare_memory_region(struct kvm *kvm,
				     const struct kvm_memory_slot *old,
				     struct kvm_memory_slot *new,
				     enum kvm_mr_change change)
{
    // TDX-specific:创建一个新的 private memory region
	if (change == KVM_MR_CREATE && new->flags & KVM_MEM_PRIVATE)
		kvm_restrictedmem_register(new);

    // dirty bitmap 相关的
	if (change != KVM_MR_DELETE) {
		if (!(new->flags & KVM_MEM_LOG_DIRTY_PAGES))
			new->dirty_bitmap = NULL;
        // old 存在并且已经有 dirty bitmap(比如 MOVE,或者只改 flags 且
        // dirty logging 一直开着)时,直接继承 old slot 的 dirty bitmap
		else if (old && old->dirty_bitmap)
			new->dirty_bitmap = old->dirty_bitmap;
        // 创建新的 dirty bitmap
		else if (kvm_use_dirty_bitmap(kvm)) {
			r = kvm_alloc_dirty_bitmap(new);
			if (kvm_dirty_log_manual_protect_and_init_set(kvm))
				bitmap_set(new->dirty_bitmap, 0, new->npages);
		}
	}

    // TDX-specific:
    // 只改了 flags,并且新的 flags 要用 private,这种情况看起来
    // 是要对一个 private 的 slot 进行 flag 上的更改。所以我们需要
    // 调用 UPM 的 API,注销旧的注册新的。
	if (change == KVM_MR_FLAGS_ONLY && (new->flags & KVM_MEM_PRIVATE)) {
		memcpy(&new->notifier, &old->notifier, sizeof(struct restrictedmem_notifier));
		kvm_restrictedmem_unregister((struct kvm_memory_slot *)old);
		fput(old->restricted_file);
		kvm_restrictedmem_register(new);
	}

	r = kvm_arch_prepare_memory_region(kvm, old, new, change);
    // failure handling
}

kvm_arch_prepare_memory_region() KVM

因为 kvm_memory_slot 结构体有 arch specific 的部分,所以我们也需要对应的 arch 函数来处理。

int kvm_arch_prepare_memory_region(struct kvm *kvm,
				   const struct kvm_memory_slot *old,
				   struct kvm_memory_slot *new,
				   enum kvm_mr_change change)
{
	if (change == KVM_MR_CREATE || change == KVM_MR_MOVE) {
        // ...
		return kvm_alloc_memslot_metadata(kvm, new);
	}

	if (change == KVM_MR_FLAGS_ONLY)
		memcpy(&new->arch, &old->arch, sizeof(old->arch));
    //...
}

EPT

EPTP

EPT is used when the “enable EPT” VM-execution control is 1.

CR3 is still used for translating from GVA to GPA, then EPT is used to translate GPA to HPA.

Address translations between (GPA, HVA)

qemu_ram_block_from_hwaddr() QEMU

根据 GPA 拿到 RAMBlock 以及在其中的 offset。

此函数调用了 qemu_ram_block_from_host()

qemu_ram_block_from_host() QEMU

根据 HVA 拿到 RAMBlock 以及在其中的 offset。

kvm_physical_memory_addr_from_host() QEMU

根据 HVA 拿到 GPA。

// ram: HVA
// phys_addr: 返回的 GPA
int kvm_physical_memory_addr_from_host(KVMState *s, void *ram, hwaddr *phys_addr)
{
    KVMMemoryListener *kml = &s->memory_listener;
    for (i = 0; i < s->nr_slots; i++) {
        KVMSlot *mem = &kml->slots[i];
        // 如果 HVA 在区间内
        if (ram >= mem->ram && ram < mem->ram + mem->memory_size) {
            // 计算 GPA
            *phys_addr = mem->start_addr + (ram - mem->ram);
            break;
        }
    }
    //...
}

根据 GPA 拿到 HVA(需要借助 RAMBlock):

// pss->page 就是 GFN
ram_addr_t offset = ((ram_addr_t)pss->page) << TARGET_PAGE_BITS;
p = block->host + offset;

根据 GPA 拿到 HVA(不需要借助 RAMBlock):

hwaddr len = 4096;
void *ptr = cpu_physical_memory_map(pde_addr, &len, 0);
if (len == 4096)
    //...

GFN <-> GPA

ram_addr_t offset = ((ram_addr_t)pss->page) << TARGET_PAGE_BITS;
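
换句话说,GFN 和 GPA 之间只差一个页内偏移的移位(TARGET_PAGE_BITS,x86 上是 12):

hwaddr gpa    = (hwaddr)gfn << TARGET_PAGE_BITS;   // GFN -> GPA(页对齐的部分)
uint64_t gfn2 = gpa >> TARGET_PAGE_BITS;           // GPA -> GFN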

Reference

QEMU的内存模拟 - 66Ring's Blog

QEMU下的内存结构MemoryRegion和AddressSpace | OenHan
