Linux MMU Notifier

Original patch: kernel/git/torvalds/linux.git - Linux kernel source tree

Codes:

  • include/linux/mmu_notifier.h

struct mmu_notifier_range Kernel

invalidate_range_start()/invalidate_range_end() 函数中会用到,表示要 invalidate 的区间。

// #ifdef CONFIG_MMU_NOTIFIER
// Describes the range of a process's address space being invalidated; passed
// to the invalidate_range_start()/invalidate_range_end() callbacks.
struct mmu_notifier_range {
	struct mm_struct *mm;
	unsigned long start; // HVA: start of the range to invalidate (host virtual address)
	unsigned long end; // HVA: end of the range
	unsigned flags;
	enum mmu_notifier_event event; // reason for the invalidation (see enum mmu_notifier_event)
	void *owner;
};

enum mmu_notifier_event Kernel

多个 invalidate 的 reason。

// The possible reasons for an invalidation; carried in
// mmu_notifier_range.event.
enum mmu_notifier_event {
	MMU_NOTIFY_UNMAP = 0,
	MMU_NOTIFY_CLEAR,
	MMU_NOTIFY_PROTECTION_VMA,
	MMU_NOTIFY_PROTECTION_PAGE,
	MMU_NOTIFY_SOFT_DIRTY,
	MMU_NOTIFY_RELEASE,
	MMU_NOTIFY_MIGRATE,
	MMU_NOTIFY_EXCLUSIVE,
};

struct mmu_notifier_ops Kernel

这个结构体每一个成员在源代码中都有很详尽的注释。

// Callback table a subscriber (e.g. KVM) registers on an mm to be notified of
// changes to that process's primary page tables. Every member has a very
// detailed comment in the kernel source.
struct mmu_notifier_ops {
	void (*release)(struct mmu_notifier *subscription, struct mm_struct *mm);
	int (*clear_flush_young)(struct mmu_notifier *subscription, struct mm_struct *mm, unsigned long start, unsigned long end);
	int (*clear_young)(struct mmu_notifier *subscription, struct mm_struct *mm, unsigned long start, unsigned long end);
	int (*test_young)(struct mmu_notifier *subscription, struct mm_struct *mm, unsigned long address);
	// * change_pte is called in cases that pte mapping to page is changed:
	// for example, when ksm remaps pte to point to a new shared page.
	// "mapping" here refers to the kernel (primary) page table: the same HVA
	// now maps to a different HPA (page migration), so the subscriber — e.g.
	// KVM — is told the page moved and should update its own state, such as
	// the GPA->HPA mapping in the EPT.
	void (*change_pte)(struct mmu_notifier *subscription, struct mm_struct *mm, unsigned long address, pte_t pte);
	// invalidate_range_start() is called when all pages in the
	// range are still mapped and have at least a refcount of one.
	// invalidate_range_end() is called when all pages in the
	// range have been unmapped and the pages have been freed by
	// the VM. ("VM" here means the virtual memory subsystem, not a
	// virtual machine.)
	// NOTE(review): numa_balancing presumably should not end up here, since
	// the page is moved rather than invalidated — it should go through
	// change_pte instead; confirm against the kernel source.
	int (*invalidate_range_start)(struct mmu_notifier *subscription, const struct mmu_notifier_range *range);
	void (*invalidate_range_end)(struct mmu_notifier *subscription, const struct mmu_notifier_range *range);
	void (*arch_invalidate_secondary_tlbs)( struct mmu_notifier *subscription, struct mm_struct *mm, unsigned long start, unsigned long end);
	struct mmu_notifier *(*alloc_notifier)(struct mm_struct *mm);
	void (*free_notifier)(struct mmu_notifier *subscription);
};

ops->invalidate_range_start()/ops->invalidate_range_end() Kernel

从这两个函数的签名可以看到,参数是一个 mmu_notifier(表示要通知哪个注册的 notifier),以及一个 mmu_notifier_range,表示通知的内容(具体是哪些 range 要被 invalidate)。

invalidate_range_start() and invalidate_range_end() must be paired.

invalidate_range_start() is called when all pages in the range are still mapped and have at least a refcount of one.

invalidate_range_end() is called when all pages in the range have been unmapped and the pages have been freed by the VM.

The VM will remove the PTEs and the page between invalidate_range_start() and invalidate_range_end().

How does KVM subsystem use MMU notifier

There is a kernel config: CONFIG_MMU_NOTIFIER. Only when both this kernel config and KVM_ARCH_WANT_MMU_NOTIFIER are defined is the MMU Notifier in KVM enabled:

#if defined(CONFIG_MMU_NOTIFIER) && defined(KVM_ARCH_WANT_MMU_NOTIFIER)
KVM_CREATE_VM
    kvm_dev_ioctl_create_vm
        kvm_create_vm
            kvm_init_mmu_notifier
                // current->mm means KVM will get notified if the current process's memory changes
                mmu_notifier_register(&kvm->mmu_notifier, current->mm);

KVM_ARCH_WANT_MMU_NOTIFIER KVM

It is defined in each arch (x86, arm, riscv…):

#define KVM_ARCH_WANT_MMU_NOTIFIER

struct mmu_notifier In KVM

// KVM's mmu_notifier callback table; registered on current->mm during
// kvm_create_vm (see the call trace below).
static const struct mmu_notifier_ops kvm_mmu_notifier_ops = {
	.invalidate_range	= kvm_mmu_notifier_invalidate_range,
	.invalidate_range_start	= kvm_mmu_notifier_invalidate_range_start,
	.invalidate_range_end	= kvm_mmu_notifier_invalidate_range_end,
	.clear_flush_young	= kvm_mmu_notifier_clear_flush_young,
	.clear_young		= kvm_mmu_notifier_clear_young,
	.test_young		= kvm_mmu_notifier_test_young,
	.change_pte		= kvm_mmu_notifier_change_pte,
	.release		= kvm_mmu_notifier_release,
};


kvm_create_vm
    kvm_init_mmu_notifier
        mmu_notifier_register
            kvm->mmu_notifier.ops = &kvm_mmu_notifier_ops;
            __mmu_notifier_register

kvm_destroy_vm
    mmu_notifier_unregister
        subscription->ops->release
            kvm_mmu_notifier_release

struct kvm_hva_range KVM

可以看到和 kvm_gfn_range 很像。因为 kvm_mmu_notifier_ops 里的成员函数比如 invalidate_start, invalidate_end 都是基于 HVA 的,所以需要把 mmu_notifier_range 里的成员(mmu_notifier_range 本身就是基于 HVA 的,Kernel 并不 aware GPA 这个概念)赋值到这个里面。

struct kvm_hva_range {
	unsigned long start;
	unsigned long end;
    // pte (below) is a key difference from mmu_notifier_range, which has no
    // such member.
	pte_t pte;
    // The next 3 members are callbacks; see __kvm_handle_hva_range for the
    // call order — essentially on_lock -> handler -> handler -> ... -> on_unlock.
	hva_handler_t handler;
	on_lock_fn_t on_lock;
	on_unlock_fn_t on_unlock;

	bool flush_on_ret; // flush remote TLBs when a handler returns true?
	bool may_block;
};

struct kvm_gfn_range KVM

表示一段 GFN 的区间。主要是为 KVM MMU Notifier 服务的。HVA 对于 KVM 处理用处并不大,很多时候 KVM 需要拿到 GFN 的信息。所以数据的流向是:mmu_notifier_range(HVA) -> kvm_hva_range(HVA) -> kvm_gfn_range(GFN)。

这个结构体是 Arch 无关的,所有的 Arch 的此结构体都一样。

// A GFN interval, arch-independent; identical for every architecture.
struct kvm_gfn_range {
	struct kvm_memory_slot *slot; // the memslot this GFN range lives in
	gfn_t start;
	gfn_t end;
	pte_t pte;
	bool may_block;
};

kvm_handle_hva_range / kvm_handle_hva_range_no_flush / __kvm_handle_hva_range KVM

前两个函数都调用了 __kvm_handle_hva_range,最主要的逻辑在这里面。

概括:找到对应这个 HVA 区域的所有 GFN 区域,并且执行 kvm_hva_range->on_lock(), kvm_hva_range->handler() 以及 kvm_hva_range->on_unlock()。handler 的参数是 kvm_gfn_range

handler 做了什么以及在哪里置上的,可以参考:kvm_mmu_notifier_invalidate_range_start() 以及 kvm_unmap_gfn_range()

// Walk every memslot (in every address space) that intersects the HVA range
// [range->start, range->end), convert each intersection into a kvm_gfn_range,
// and run range->on_lock once, range->handler per intersection, and
// range->on_unlock at the end. Returns nonzero if any handler asked for a
// TLB flush.
static __always_inline int __kvm_handle_hva_range(struct kvm *kvm, const struct kvm_hva_range *range)
{
	bool ret = false, locked = false;
	struct kvm_gfn_range gfn_range;
	struct kvm_memory_slot *slot;
	struct kvm_memslots *slots;
	int i, idx;

    //...
	// A null handler is allowed if and only if on_lock() is provided.
	if (WARN_ON_ONCE(IS_KVM_NULL_FN(range->on_lock) && IS_KVM_NULL_FN(range->handler)))
		return 0;

    //...
    // KVM_ADDRESS_SPACE_NUM is 2 in x86
	for (i = 0; i < KVM_ADDRESS_SPACE_NUM; i++) {
		struct interval_tree_node *node;

        // each address space has a bunch of slots
		slots = __kvm_memslots(kvm, i);
        // As the name suggests: for every kvm_memory_slot that falls inside
        // this range — any intersection counts.
		kvm_for_each_memslot_in_hva_range(node, slots, range->start, range->end - 1) {
			unsigned long hva_start, hva_end;
			slot = container_of(node, struct kvm_memory_slot, hva_node[slots->node_idx]);
            // slot->userspace_addr is the slot's base HVA.
            // The max/min below compute the overlap between the slot
            // and the range.
			hva_start = max(range->start, slot->userspace_addr);
			hva_end = min(range->end, slot->userspace_addr + (slot->npages << PAGE_SHIFT));

            // Convert the hva_range into a gfn_range.
            // NOTE(review): this pte apparently refers to the
            // HVA -> HPA PTE — confirm.
			gfn_range.pte = range->pte;
			gfn_range.may_block = range->may_block;

			/*
			 * {gfn(page) | page intersects with [hva_start, hva_end)} =
			 * {gfn_start, gfn_start+1, ..., gfn_end-1}.
			 */
            // start of the gfn of current **intersection** range
			gfn_range.start = hva_to_gfn_memslot(hva_start, slot);
            // end of the gfn of current **intersection** range
			gfn_range.end = hva_to_gfn_memslot(hva_end + PAGE_SIZE - 1, slot);
			gfn_range.slot = slot;

			if (!locked) {
				locked = true;
                //...
				range->on_lock(kvm, range->start, range->end);
                //...
			}
			ret |= range->handler(kvm, &gfn_range);
		}
	}

    // flush the TLBs of the other vCPUs (remote TLB flush)
	if (range->flush_on_ret && ret)
		kvm_flush_remote_tlbs(kvm);

	if (locked) {
        //...
		range->on_unlock(kvm);
	}

	/* The notifiers are averse to booleans. :-( */
	return (int)ret;
}

kvm_mmu_notifier_invalidate_range_start() KVM

这个函数其实就是对 __kvm_handle_hva_range() 进行了一下包装。把 kvm_unmap_gfn_range 作为 handler 传进去。

// Thin wrapper: pack the mmu_notifier_range (HVA-based) into a kvm_hva_range
// with kvm_unmap_gfn_range as the handler, then hand it to
// __kvm_handle_hva_range.
static int kvm_mmu_notifier_invalidate_range_start(struct mmu_notifier *mn,
					const struct mmu_notifier_range *range)
{
	struct kvm *kvm = mmu_notifier_to_kvm(mn);
	const struct kvm_hva_range hva_range = {
		.start		= range->start,
		.end		= range->end,
		.pte		= __pte(0),
		.handler	= kvm_unmap_gfn_range,
		.on_lock	= kvm_mmu_invalidate_begin,
		.on_unlock	= kvm_arch_guest_memory_reclaimed,
		.flush_on_ret	= true,
		.may_block	= mmu_notifier_range_blockable(range),
	};

    //...
	__kvm_handle_hva_range(kvm, &hva_range);
    // (the function's return statement is elided in this excerpt)
}

kvm_mmu_invalidate_begin / on_lock() In kvm_hva_range KVM

void kvm_mmu_invalidate_begin(struct kvm *kvm, unsigned long start, unsigned long end)
{
    // Mark that an invalidation is now in progress.
    // Note: the counter is decremented in invalidate_end's on_lock phase,
    // not in invalidate_start's unlock.
	kvm->mmu_invalidate_in_progress++;
    //...
    // Record the start and end HVA of the range being invalidated.
	kvm->mmu_invalidate_range_start = start;
	kvm->mmu_invalidate_range_end = end;
    //...
}

kvm_unmap_gfn_range() / handler() In kvm_hva_range KVM

做的事:把所有要 invalidate 的 GFN range 所对应的映射(rmap 项、TDP MMU 的 leaf)都 zap 掉,必要时请求 reload APIC access page。

// Drop every mapping KVM holds for the GFN range being invalidated: zap the
// rmap-based entries and the TDP MMU leaves, and request an APIC-access-page
// reload when that special memslot is hit. Returns true if a TLB flush is
// needed.
bool kvm_unmap_gfn_range(struct kvm *kvm, struct kvm_gfn_range *range)
{
	bool flush = false;

	if (kvm_memslots_have_rmaps(kvm))
		flush = kvm_handle_gfn_range(kvm, range, kvm_zap_rmap);

    // Only valid when tdp is enabled
	if (tdp_mmu_enabled)
        // Zap the leaves: remove all affected TDP leaf entries.
        // Intuition: the corresponding guest memory is being reclaimed, so
        // the TDP mappings must be updated accordingly.
		flush = kvm_tdp_mmu_unmap_gfn_range(kvm, range, flush);

    // If this slot is the APIC access page, request a reload.
	if (kvm_x86_ops.set_apic_access_page_addr &&
	    range->slot->id == APIC_ACCESS_PAGE_PRIVATE_MEMSLOT)
		kvm_make_all_cpus_request(kvm, KVM_REQ_APIC_PAGE_RELOAD);

	return flush;
}
