Here is an alternative: RFC: A KVM-specific alternative to UserfaultFD

RFC: A KVM-specific alternative to UserfaultFD - David Matlack

Userfaultfd — The Linux Kernel documentation

How is userfaultfd used in post-copy live migration?

// Destination QEMU:
// When the destination side receives the MIG_CMD_POSTCOPY_ADVISE command,
// it calls the fallocate() syscall to discard each RAMBlock's pages, so
// page faults can occur on them later.
case MIG_CMD_POSTCOPY_ADVISE:
    loadvm_postcopy_handle_advise
        ram_postcopy_incoming_init
            postcopy_ram_incoming_init
                foreach_not_ignored_block(init_range, NULL)
                    // We need the whole of RAM to be truly empty for postcopy.
                    // Precopy will just overwrite this data, so doesn't need the discard.
                    ram_discard_range(block_name, 0, length)
                        bitmap_clear(rb->receivedmap) //...
                            ram_block_discard_range
                                ret = fallocate(rb->fd, FALLOC_FL_PUNCH_HOLE, start, length);

// Destination QEMU:
// create the userfault fd and register the region for handling
case MIG_CMD_POSTCOPY_LISTEN:
    loadvm_postcopy_handle_listen
        postcopy_ram_incoming_setup
            mis->userfault_fd = uffd_open(O_CLOEXEC | O_NONBLOCK);
            foreach_not_ignored_block(ram_block_enable_notify, mis)
                ram_block_enable_notify
                    // addr and len of this RAMBlock
                    // addr is the HVA of the RAMBlock
                    reg_struct.range.start = (uintptr_t)qemu_ram_get_host_addr(rb);
                    reg_struct.range.len = rb->postcopy_length;
                    // Tell userfaultfd that it's responsible for this area
                    ioctl(mis->userfault_fd, UFFDIO_REGISTER, &reg_struct)
                        // Kernel
                        case UFFDIO_REGISTER:
                            userfaultfd_register

// Destination:
// An EPT violation happened in the guest while trying to map the GPA.
// Since the range was discarded in the ADVISE stage, a page fault
// occurs at that HVA, and that page fault is reported through
// userfaultfd so userspace learns about it.
EXIT_REASON_EPT_VIOLATION:
    handle_ept_violation
        kvm_mmu_page_fault
            kvm_mmu_do_page_fault
                kvm_tdp_page_fault
                    kvm_tdp_mmu_page_fault
                        kvm_faultin_pfn
                            __kvm_faultin_pfn
                                __gfn_to_pfn_memslot
                                    // Pin guest page in memory and return its pfn.
                                    hva_to_pfn
                                        hva_to_pfn_slow
                                            // may swap the page in
                                            get_user_pages_unlocked

// Since we want to map the HVA and it has been discarded, that HVA is
// in the range handled by userfaultfd, so userfaultfd gets notified,
// and the request for the page will be redirected to the source.

userfaultfd_register() Kernel

static int userfaultfd_register(struct userfaultfd_ctx *ctx,
				unsigned long arg)
{
	struct mm_struct *mm = ctx->mm;
	struct vm_area_struct *vma, *prev, *cur;
	int ret;
	struct uffdio_register uffdio_register;
	struct uffdio_register __user *user_uffdio_register;
	unsigned long vm_flags, new_flags;
	bool found;
	bool basic_ioctls;
	unsigned long start, end, vma_end;
	struct vma_iterator vmi;
	bool wp_async = userfaultfd_wp_async_ctx(ctx);

	user_uffdio_register = (struct uffdio_register __user *) arg;

	ret = -EFAULT;
	if (copy_from_user(&uffdio_register, user_uffdio_register,
			   sizeof(uffdio_register)-sizeof(__u64)))
		goto out;

	ret = -EINVAL;
	if (!uffdio_register.mode)
		goto out;
	if (uffdio_register.mode & ~UFFD_API_REGISTER_MODES)
		goto out;
	vm_flags = 0;
	if (uffdio_register.mode & UFFDIO_REGISTER_MODE_MISSING)
		vm_flags |= VM_UFFD_MISSING;
	if (uffdio_register.mode & UFFDIO_REGISTER_MODE_WP) {
#ifndef CONFIG_HAVE_ARCH_USERFAULTFD_WP
		goto out;
#endif
		vm_flags |= VM_UFFD_WP;
	}
	if (uffdio_register.mode & UFFDIO_REGISTER_MODE_MINOR) {
#ifndef CONFIG_HAVE_ARCH_USERFAULTFD_MINOR
		goto out;
#endif
		vm_flags |= VM_UFFD_MINOR;
	}

	ret = validate_range(mm, uffdio_register.range.start,
			     uffdio_register.range.len);
	if (ret)
		goto out;

	start = uffdio_register.range.start;
	end = start + uffdio_register.range.len;

	ret = -ENOMEM;
	if (!mmget_not_zero(mm))
		goto out;

	ret = -EINVAL;
	mmap_write_lock(mm);
	vma_iter_init(&vmi, mm, start);
	vma = vma_find(&vmi, end);
	if (!vma)
		goto out_unlock;

	/*
	 * If the first vma contains huge pages, make sure start address
	 * is aligned to huge page size.
	 */
	if (is_vm_hugetlb_page(vma)) {
		unsigned long vma_hpagesize = vma_kernel_pagesize(vma);

		if (start & (vma_hpagesize - 1))
			goto out_unlock;
	}

	/*
	 * Search for not compatible vmas.
	 */
	found = false;
	basic_ioctls = false;
	cur = vma;
	do {
		cond_resched();

		BUG_ON(!!cur->vm_userfaultfd_ctx.ctx ^
		       !!(cur->vm_flags & __VM_UFFD_FLAGS));

		/* check not compatible vmas */
		ret = -EINVAL;
		if (!vma_can_userfault(cur, vm_flags, wp_async))
			goto out_unlock;

		/*
		 * UFFDIO_COPY will fill file holes even without
		 * PROT_WRITE. This check enforces that if this is a
		 * MAP_SHARED, the process has write permission to the backing
		 * file. If VM_MAYWRITE is set it also enforces that on a
		 * MAP_SHARED vma: there is no F_WRITE_SEAL and no further
		 * F_WRITE_SEAL can be taken until the vma is destroyed.
		 */
		ret = -EPERM;
		if (unlikely(!(cur->vm_flags & VM_MAYWRITE)))
			goto out_unlock;

		/*
		 * If this vma contains ending address, and huge pages
		 * check alignment.
		 */
		if (is_vm_hugetlb_page(cur) && end <= cur->vm_end &&
		    end > cur->vm_start) {
			unsigned long vma_hpagesize = vma_kernel_pagesize(cur);

			ret = -EINVAL;

			if (end & (vma_hpagesize - 1))
				goto out_unlock;
		}
		if ((vm_flags & VM_UFFD_WP) && !(cur->vm_flags & VM_MAYWRITE))
			goto out_unlock;

		/*
		 * Check that this vma isn't already owned by a
		 * different userfaultfd. We can't allow more than one
		 * userfaultfd to own a single vma simultaneously or we
		 * wouldn't know which one to deliver the userfaults to.
		 */
		ret = -EBUSY;
		if (cur->vm_userfaultfd_ctx.ctx &&
		    cur->vm_userfaultfd_ctx.ctx != ctx)
			goto out_unlock;

		/*
		 * Note vmas containing huge pages
		 */
		if (is_vm_hugetlb_page(cur))
			basic_ioctls = true;

		found = true;
	} for_each_vma_range(vmi, cur, end);
	BUG_ON(!found);

	vma_iter_set(&vmi, start);
	prev = vma_prev(&vmi);
	if (vma->vm_start < start)
		prev = vma;

	ret = 0;
	for_each_vma_range(vmi, vma, end) {
		cond_resched();

		BUG_ON(!vma_can_userfault(vma, vm_flags, wp_async));
		BUG_ON(vma->vm_userfaultfd_ctx.ctx &&
		       vma->vm_userfaultfd_ctx.ctx != ctx);
		WARN_ON(!(vma->vm_flags & VM_MAYWRITE));

		/*
		 * Nothing to do: this vma is already registered into this
		 * userfaultfd and with the right tracking mode too.
		 */
		if (vma->vm_userfaultfd_ctx.ctx == ctx &&
		    (vma->vm_flags & vm_flags) == vm_flags)
			goto skip;

		if (vma->vm_start > start)
			start = vma->vm_start;
		vma_end = min(end, vma->vm_end);

		new_flags = (vma->vm_flags & ~__VM_UFFD_FLAGS) | vm_flags;
		vma = vma_modify_flags_uffd(&vmi, prev, vma, start, vma_end,
					    new_flags,
					    (struct vm_userfaultfd_ctx){ctx});
		if (IS_ERR(vma)) {
			ret = PTR_ERR(vma);
			break;
		}

		/*
		 * In the vma_merge() successful mprotect-like case 8:
		 * the next vma was merged into the current one and
		 * the current one has not been updated yet.
		 */
		vma_start_write(vma);
		userfaultfd_set_vm_flags(vma, new_flags);
		vma->vm_userfaultfd_ctx.ctx = ctx;

		if (is_vm_hugetlb_page(vma) && uffd_disable_huge_pmd_share(vma))
			hugetlb_unshare_all_pmds(vma);

	skip:
		prev = vma;
		start = vma->vm_end;
	}

out_unlock:
	mmap_write_unlock(mm);
	mmput(mm);
	if (!ret) {
		__u64 ioctls_out;

		ioctls_out = basic_ioctls ? UFFD_API_RANGE_IOCTLS_BASIC :
		    UFFD_API_RANGE_IOCTLS;

		/*
		 * Declare the WP ioctl only if the WP mode is
		 * specified and all checks passed with the range
		 */
		if (!(uffdio_register.mode & UFFDIO_REGISTER_MODE_WP))
			ioctls_out &= ~((__u64)1 << _UFFDIO_WRITEPROTECT);

		/* CONTINUE ioctl is only supported for MINOR ranges. */
		if (!(uffdio_register.mode & UFFDIO_REGISTER_MODE_MINOR))
			ioctls_out &= ~((__u64)1 << _UFFDIO_CONTINUE);

		/*
		 * Now that we scanned all vmas we can already tell
		 * userland which ioctls methods are guaranteed to
		 * succeed on this range.
		 */
		if (put_user(ioctls_out, &user_uffdio_register->ioctls))
			ret = -EFAULT;
	}
out:
	return ret;
}

struct uffd_msg Kernel

/* read() structure */
struct uffd_msg {
	/* Event type; selects which arm of the union below is valid. */
	__u8	event;

	__u8	reserved1;
	__u16	reserved2;
	__u32	reserved3;

	union {
		/* A fault occurred in a registered range. */
		struct {
			__u64	flags;
			/*
			 * NOTE(review): this presumably holds the faulting
			 * HVA (host virtual address, in QEMU's address
			 * space) — confirm against handle_userfault().
			 */
			__u64	address;
			union {
				__u32 ptid;
			} feat;
		} pagefault;

		/* The monitored process forked; ufd tracks the child. */
		struct {
			__u32	ufd;
		} fork;

		/* A registered range was moved (mremap). */
		struct {
			__u64	from;
			__u64	to;
			__u64	len;
		} remap;

		/* A registered range was removed (munmap/madvise). */
		struct {
			__u64	start;
			__u64	end;
		} remove;

		struct {
			/* unused reserved fields */
			__u64	reserved1;
			__u64	reserved2;
			__u64	reserved3;
		} reserved;
	} arg;
} __attribute__((packed));