APF (async page fault)

Introduced in: [PATCH v7 02/12] Halt vcpu if page it tries to access is swapped out. - y

KVM 对于 async PF,提供了 3 个 PV features:

  • KVM_FEATURE_ASYNC_PF
  • KVM_FEATURE_ASYNC_PF_VMEXIT: paravirtualized async PF VM EXIT
  • KVM_FEATURE_ASYNC_PF_INT: Guest checks this feature bit before using the second async pf control msr 0x4b564d06 and async pf acknowledgment msr 0x4b564d07.

相关的 MSR 有以下这些(当然这都是虚拟的 MSR,为了给 PV 用的):

// "EN" is short for "enable". As the MSR indices show, these MSRs were not
// all introduced at the same time.
#define MSR_KVM_ASYNC_PF_EN 0x4b564d02
// Second async PF control MSR; the guest checks KVM_FEATURE_ASYNC_PF_INT
// before using it.
#define MSR_KVM_ASYNC_PF_INT	0x4b564d06
// Async PF acknowledgment MSR; also guarded by KVM_FEATURE_ASYNC_PF_INT.
#define MSR_KVM_ASYNC_PF_ACK	0x4b564d07

Async PF / Async PF Int

这是一个 PV feature,名字叫做 kvm-asyncpf

有两种 page fault:stage 1 和 stage 2,前者是 guest 页表缺页引起的,后者是 host 页表(EPT)缺页引起的。

对于后者,当 host 把页换入内存的时候,我们其实可以让 guest 先去执行其他进程。等页准备好之后,host 再通知 guest 这次缺页已经处理完成了。

Asynchronous page fault is a way to try and use guest vcpu more efficiently by allowing it to execute other tasks while page is brought back into memory.

Async PF Int: 表示当 page swap back,要通知 guest vCPU 已经 handle 完成的时候,把通知的形式改成了 interrupt。

Concerns were expressed around APF delivery via synthetic #PF exception as in some cases such delivery may collide with real page fault. For 'page ready' notifications we can easily switch to using an interrupt instead.

struct kvm_vcpu_pv_apf_data Guest Kernel

实现和 KVM Steal Time/KVMClock 很像,都是一个 PV feature。

定义了一个新的 MSR:MSR_KVM_ASYNC_PF_EN。这个 MSR 是一个 GPA,指向 Guest Kernel 里下面这个结构体:

/*
 * Async page fault data area that lives in the guest kernel. Its guest
 * physical address is the value the guest writes to MSR_KVM_ASYNC_PF_EN.
 */
struct kvm_vcpu_pv_apf_data {
	/* Used for 'page not present' events delivered via #PF */
	__u32 flags;

	/* Used for 'page ready' events delivered via interrupt notification */
	__u32 token;

	__u8 pad[56];
    // Used by the guest to tell KVM whether this feature should be enabled.
    // NOTE(review): flags + token + pad already span 64 bytes (4 + 4 + 56),
    // so this field lies past that region - confirm whether KVM ever reads it
    // or whether it is only guest-side bookkeeping.
	__u32 enabled;
};

当然,要写到 MSR 里的 GPA 也是有一些 flags bits 可以 enable 的:

/*
 * Flag bits carried in the value written to MSR_KVM_ASYNC_PF_EN, alongside
 * the GPA of the data area.
 */
/* NOTE(review): presumably the master on/off bit for async PF - confirm. */
#define KVM_ASYNC_PF_ENABLED			(1 << 0)
/* NOTE(review): semantics not shown in this note - confirm against KVM. */
#define KVM_ASYNC_PF_SEND_ALWAYS		(1 << 1)
/* NOTE(review): presumably pairs with KVM_FEATURE_ASYNC_PF_VMEXIT - confirm. */
#define KVM_ASYNC_PF_DELIVERY_AS_PF_VMEXIT	(1 << 2)
/* Selects interrupt-based delivery; pairs with KVM_FEATURE_ASYNC_PF_INT. */
#define KVM_ASYNC_PF_DELIVERY_AS_INT		(1 << 3)

是不是感觉似曾相识?完全是和上面两个 feature 一样的套路。

When/how does guest enable async pf feature?

kvm_guest_init
    kvm_guest_cpu_init
        if (kvm_para_has_feature(KVM_FEATURE_ASYNC_PF_INT) && kvmapf)
    		wrmsrl(MSR_KVM_ASYNC_PF_EN, pa);
            __this_cpu_write(apf_reason.enabled, 1);

When/how does guest disable async pf feature?

// 下面都是 guest kernel 里的 code,可以看到主要是在 guest kernel 想要 offline CPU 的时候。
kvm_guest_cpu_offline
    kvm_pv_disable_apf
        wrmsrl(MSR_KVM_ASYNC_PF_EN, 0);
    	__this_cpu_write(apf_reason.enabled, 0);

KVM_REQ_APF_HALT / KVM_REQ_APF_READY / KVM

There is a request defined: KVM_REQ_APF_HALT

vcpu_enter_guest
    kvm_check_request(KVM_REQ_APF_HALT, vcpu)
        vcpu->arch.apf.halted = true;
        // give up vm entry

This request is made in the following two places:

  • kvm_faultin_pfn
  • kvm_arch_async_page_not_present

kvm_faultin_pfn() KVM

注意这个函数是在 KVM 里,而不是在 guest kernel 里。 也就是说那些关于 async PF 的 request 都是从 KVM 里发送,最后也在 KVM 里 handle。

/*
 * Excerpt of KVM's kvm_faultin_pfn(); "//..." marks code elided from this
 * note.
 *
 * Records the VM's current mmu_invalidate_seq in fault->mmu_seq, then
 * delegates to __kvm_faultin_pfn(). Any result other than RET_PF_CONTINUE is
 * returned to the caller unchanged; otherwise the function ends by returning
 * RET_PF_CONTINUE.
 *
 * @access is not used by the lines shown here.
 */
static int kvm_faultin_pfn(struct kvm_vcpu *vcpu, struct kvm_page_fault *fault,
			   unsigned int access)
{
	int ret;
	/*
	 * Snapshot the invalidation sequence before faulting in the pfn.
	 * NOTE(review): presumably compared later to detect a concurrent
	 * invalidation - that check is not part of this excerpt.
	 */
	fault->mmu_seq = vcpu->kvm->mmu_invalidate_seq;
    //...

	ret = __kvm_faultin_pfn(vcpu, fault);
	/* Propagate anything other than "continue" straight to the caller. */
	if (ret != RET_PF_CONTINUE)
		return ret;

    //...
	return RET_PF_CONTINUE;
}

Schedule a job to handle page fault asynchronously

kvm_arch_setup_async_pf
    kvm_setup_async_pf
/*
 * Try to schedule a job to handle page fault asynchronously. Returns 'true' on
 * success, 'false' on failure (page fault has to be handled synchronously).
 *
 * @vcpu:       vCPU the work item is queued on (vcpu->async_pf.queue).
 * @cr2_or_gpa: stored verbatim in the work item.
 * @hva:        address stored as work->addr; an error hva makes the function
 *              return false.
 * @arch:       arch-specific data, copied by value into the work item.
 *
 * Returns false when the vCPU already has ASYNC_PF_PER_VCPU items queued,
 * when @hva is an error address, or when the non-sleeping allocation fails.
 */
bool kvm_setup_async_pf(struct kvm_vcpu *vcpu, gpa_t cr2_or_gpa,
			unsigned long hva, struct kvm_arch_async_pf *arch)
{
	struct kvm_async_pf *work;

	/* Cap the number of outstanding work items per vCPU. */
	if (vcpu->async_pf.queued >= ASYNC_PF_PER_VCPU)
		return false;

	/* Arch specific code should not do async PF in this case */
	if (unlikely(kvm_is_error_hva(hva)))
		return false;

	/*
	 * do alloc nowait since if we are going to sleep anyway we
	 * may as well sleep faulting in page
	 */
	work = kmem_cache_zalloc(async_pf_cache, GFP_NOWAIT | __GFP_NOWARN);
	if (!work)
		return false;

	/* Fill in the zero-allocated work item. */
	work->wakeup_all = false;
	work->vcpu = vcpu;
	work->cr2_or_gpa = cr2_or_gpa;
	work->addr = hva;
	work->arch = *arch;
	work->mm = current->mm;
	/*
	 * Take references on the current mm and on the VM for the work item.
	 * NOTE(review): the matching releases are not visible in this excerpt -
	 * presumably done when the work completes; confirm.
	 */
	mmget(work->mm);
	kvm_get_kvm(work->vcpu->kvm);

	/* async_pf_execute is the handler registered for this work item. */
	INIT_WORK(&work->work, async_pf_execute);

	/* Queue on the vCPU and account for it before notifying arch code. */
	list_add_tail(&work->queue, &vcpu->async_pf.queue);
	vcpu->async_pf.queued++;
	/* Remember what the arch 'page not present' hook returned. */
	work->notpresent_injected = kvm_arch_async_page_not_present(vcpu, work);

	/* Hand the fully initialised item to the workqueue. */
	schedule_work(&work->work);

	return true;
}

struct apf / apf_reason KVM

/*
 * Excerpt of struct kvm_vcpu_arch showing only the async page fault state
 * (`apf`); "//..." marks members elided from this note.
 */
struct kvm_vcpu_arch {
	//...
	struct {
		/* Set to true when KVM_REQ_APF_HALT is handled in vcpu_enter_guest */
		bool halted;
		/* Sized by ASYNC_PF_PER_VCPU, the same limit kvm_setup_async_pf() checks */
		gfn_t gfns[ASYNC_PF_PER_VCPU];
		struct gfn_to_hva_cache data;
		u64 msr_en_val; /* MSR_KVM_ASYNC_PF_EN */
		u64 msr_int_val; /* MSR_KVM_ASYNC_PF_INT */
		u16 vec;
		u32 id;
		bool send_user_only;
		/* Renamed from `host_apf_reason` */
		u32 host_apf_flags;
		unsigned long nested_apf_token;
		bool delivery_as_pf_vmexit;
		bool pageready_pending;
	} apf;
	//...
};

Guest kernel 为其每一个 vCPU 都定义了一个 struct kvm_vcpu_pv_apf_data 名字叫 apf_reason

/* Per-CPU instance of struct kvm_vcpu_pv_apf_data named apf_reason, aligned to 64 bytes. */
static DEFINE_PER_CPU_DECRYPTED(struct kvm_vcpu_pv_apf_data, apf_reason) __aligned(64);

It can be read by:


struct kvm_vcpu_pv_apf_data Guest Kernel

// Async page fault data area kept by the guest kernel (one `apf_reason`
// instance per CPU).
struct kvm_vcpu_pv_apf_data {
	// Used for 'page not present' events delivered via #PF
	__u32 flags;

	// Used for 'page ready' events delivered via interrupt notification
	__u32 token;

	// flags + token + pad together span 64 bytes (4 + 4 + 56).
	__u8 pad[56];
	// Written by the guest right after it writes MSR_KVM_ASYNC_PF_EN
	// (1 on enable, 0 on disable).
	__u32 enabled;
};