Note that this is not the same as kvm_shadow_walk_iterator: the latter is used by the Direct Page Fault framework (Search KVM Shadow Page Walking^), while tdp_iter was introduced by Google's TDP patch set to support the TDP MMU.

tdp_iter has a goal GFN, so the whole traversal can be split into two steps:

  • a DFS walk down to the leaf SPTE that covers the goal GFN;
  • from there, continue the pre-order traversal over the remaining entries, until every entry after that point has been visited.

Taking advantage of this, we can add an extra termination condition to the for loop so that the walk stops at the end GFN, which lets us visit only the PSEs, PTEs and other entries inside a given GFN range, as sketched below.
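
Here is a minimal sketch of such a bounded walk (walk_range_example() is made up for illustration; it is essentially what the for_each_tdp_pte_min_level() macro shown later expands to):

// Walk every PSE/PTE mapping a GFN in [start, end): the walk begins at
// `start`, and the extra `iter.gfn < end` check terminates it once the
// GFN range of interest has been covered.
static void walk_range_example(struct kvm_mmu_page *root, gfn_t start, gfn_t end)
{
	struct tdp_iter iter;

	for (tdp_iter_start(&iter, root, PG_LEVEL_4K, start);
	     iter.valid && iter.gfn < end;
	     tdp_iter_next(&iter)) {
		// iter.sptep / iter.old_spte describe the PSE or PTE being visited
	}
}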

arch/x86/kvm/mmu/tdp_iter.h Kernel

A file of only a bit over one hundred lines.

The functions below look simple; the main reason they are wrapped this way is that SPTEs are protected by RCU rather than one big lock, which improves parallelism. This is probably also one of the reasons the Google TDP MMU performs better than the Direct approach.

/*
 * TDP MMU SPTEs are RCU protected to allow paging structures (non-leaf SPTEs)
 * to be zapped while holding mmu_lock for read, and to allow TLB flushes to be
 * batched without having to collect the list of zapped SPs.  Flows that can
 * remove SPs must service pending TLB flushes prior to dropping RCU protection.
 */
// Read the value that sptep points to
static inline u64 kvm_tdp_mmu_read_spte(tdp_ptep_t sptep)
{
	return READ_ONCE(*rcu_dereference(sptep));
}

// Atomically set the value at sptep and return the old value
static inline u64 kvm_tdp_mmu_write_spte_atomic(tdp_ptep_t sptep, u64 new_spte)
{
	return xchg(rcu_dereference(sptep), new_spte);
}

// Set the value at sptep (non-atomically)
static inline void __kvm_tdp_mmu_write_spte(tdp_ptep_t sptep, u64 new_spte)
{
	WRITE_ONCE(*rcu_dereference(sptep), new_spte);
}

kvm_tdp_mmu_spte_need_atomic_write() KVM

This is a predicate that decides whether an SPTE must be written atomically, in a single operation.

/*
 * SPTEs must be modified atomically if they are shadow-present, leaf
 * SPTEs, and have volatile bits, i.e. has bits that can be set outside
 * of mmu_lock.  The Writable bit can be set by KVM's fast page fault
 * handler, and Accessed and Dirty bits can be set by the CPU.
 *
 * Note, non-leaf SPTEs do have Accessed bits and those bits are
 * technically volatile, but KVM doesn't consume the Accessed bit of
 * non-leaf SPTEs, i.e. KVM doesn't care if it clobbers the bit.  This
 * logic needs to be reassessed if KVM were to use non-leaf Accessed
 * bits, e.g. to skip stepping down into child SPTEs when aging SPTEs.
 */
// For a non-leaf SPTE, KVM does not consume its Accessed bit, so there is no need to guarantee atomicity.
// For a shadow-present^ (i.e. non-MMIO) leaf SPTE that has bits which can be set outside of mmu_lock,
// e.g. A/D bits set by the CPU, there is a critical section, so it must be modified atomically.
static inline bool kvm_tdp_mmu_spte_need_atomic_write(u64 old_spte, int level)
{
	return is_shadow_present_pte(old_spte) &&
	       is_last_spte(old_spte, level) &&
	       spte_has_volatile_bits(old_spte);
}

kvm_tdp_mmu_write_spte() / tdp_mmu_clear_spte_bits() KVM

Wrappers around the two write functions mentioned above: depending on whether an atomic write is needed, a different underlying helper is called.

static inline u64 kvm_tdp_mmu_write_spte(tdp_ptep_t sptep, u64 old_spte, u64 new_spte, int level)
{
	if (kvm_tdp_mmu_spte_need_atomic_write(old_spte, level))
		return kvm_tdp_mmu_write_spte_atomic(sptep, new_spte);

	__kvm_tdp_mmu_write_spte(sptep, new_spte);
	return old_spte;
}

static inline u64 tdp_mmu_clear_spte_bits(tdp_ptep_t sptep, u64 old_spte, u64 mask, int level)
{
	atomic64_t *sptep_atomic;

	if (kvm_tdp_mmu_spte_need_atomic_write(old_spte, level)) {
		sptep_atomic = (atomic64_t *)rcu_dereference(sptep);
		return (u64)atomic64_fetch_and(~mask, sptep_atomic);
	}

	__kvm_tdp_mmu_write_spte(sptep, old_spte & ~mask);
	return old_spte;
}
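
As a hypothetical usage sketch (clear_bits_example() is made up for illustration; `mask` is whatever bit the caller wants to drop, and real callers live in tdp_mmu.c):

// A rough sketch, not a quote from the kernel: clear `mask` from the SPTE the
// iterator currently points at. The helper transparently falls back to a plain
// WRITE_ONCE() when the old SPTE has no volatile bits that could race with the
// CPU or the fast page fault handler.
static u64 clear_bits_example(struct tdp_iter *iter, u64 mask)
{
	return tdp_mmu_clear_spte_bits(iter->sptep, iter->old_spte,
				       mask, iter->level);
}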

struct tdp_iter

The iterator over a TDP page table, used to walk the paging structure.

It does not only point at leaf SPTEs; it may also point at a PSE in any intermediate level of the page table, which is what the level field indicates.

#define PT64_ROOT_MAX_LEVEL 5
// A TDP iterator performs a pre-order walk over a TDP paging structure.
struct tdp_iter {
	// A pointer to the current SPTE
	tdp_ptep_t sptep;
	/* A snapshot of the value at sptep */
    // Essentially the value that sptep points to.
	u64 old_spte;
    // Internal state maintained by the iterator
    // The iterator's current level within the paging structure
	int level;
	// The lowest level the iterator should traverse to
	int min_level;
	// The level of the root page given to the iterator
	int root_level;
    // The lowest GFN mapped by the current SPTE
    // It is "the lowest" because sptep does not necessarily point at a leaf;
    // it may be an intermediate PSE, which covers a whole GFN range, so
    // this gfn is the smallest GFN in that range.
	gfn_t gfn;
    // tdp_ptep_t is an RCU-annotated pointer to a u64 (u64 __rcu *),
    // so this is an array of 5 such pointers, one per level;
    // each one points at a page table already traversed on the way to the current SPTE.
	tdp_ptep_t pt_path[PT64_ROOT_MAX_LEVEL];
    // Whether the iterator is still valid, i.e. the walk has not run past the end yet
	bool valid;
    //...
};
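
To make the fields concrete, here is an illustrative snapshot (all values made up) for a 4-level paging structure with the iterator sitting on the level-2 (2M) entry that covers GFN 0x12345:

/*
 * root_level = 4
 * level      = 2
 * gfn        = 0x12200      // 0x12345 rounded down to a 512-GFN (2M) boundary
 * pt_path[3] = root page table (level 4)
 * pt_path[2] = level-3 page table linked from the root entry
 * pt_path[1] = level-2 page table that sptep points into
 * sptep      = pt_path[1] + index of GFN 0x12345 within that 512-entry table
 * old_spte   = snapshot of *sptep
 */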

arch/x86/kvm/mmu/tdp_iter.c Kernel

tdp_iter_start() / tdp_iter_restart() / tdp_iter_refresh_sptep() KVM

/*
 * Sets a TDP iterator to walk a pre-order traversal of the paging structure
 * rooted at root_pt, starting with the walk to translate next_last_level_gfn.
 */
void tdp_iter_start(struct tdp_iter *iter, struct kvm_mmu_page *root,
		    int min_level, gfn_t next_last_level_gfn)
{
    // sanity checks
    // ...
    // The fields set here are static; they do not change during the walk
	iter->next_last_level_gfn = next_last_level_gfn;
    // Record the root level
	iter->root_level = root->role.level;
    // Record the lowest level the walk should descend to
	iter->min_level = min_level;
    // The root spt is the first page table the walk goes through
	iter->pt_path[iter->root_level - 1] = (tdp_ptep_t)root->spt;
    // Not important for now
	iter->as_id = kvm_mmu_page_as_id(root);

    // The state set here is dynamic; it changes as the walk proceeds
	tdp_iter_restart(iter);
}

// Reset the iterator back to the root PT
void tdp_iter_restart(struct tdp_iter *iter)
{
	iter->yielded = false;
	iter->yielded_gfn = iter->next_last_level_gfn;
    // reset current level to root level
	iter->level = iter->root_level;
    // The start GFN of the GFN range covered by the current sptep
	iter->gfn = gfn_round_for_level(iter->next_last_level_gfn, iter->level);
	tdp_iter_refresh_sptep(iter);

	iter->valid = true;
}

// Recalculates the pointer to the SPTE for the current GFN and level and
// reread the SPTE.
static void tdp_iter_refresh_sptep(struct tdp_iter *iter)
{
    // Compute sptep from gfn and level:
    // take the VA of the page table recorded for this level in pt_path, plus the
    // index of our gfn among that table's 512 entries, to get the current sptep.
	iter->sptep = iter->pt_path[iter->level - 1] + SPTE_INDEX(iter->gfn << PAGE_SHIFT, iter->level);
    // Refresh the cached value that sptep points to
	iter->old_spte = kvm_tdp_mmu_read_spte(iter->sptep);
}
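
The index math behind SPTE_INDEX() is roughly the following (a sketch assuming 4K base pages and 9 GPA bits per level; the hex values are just a worked example, see the real definition in spte.h):

/*
 * index = ((gfn << PAGE_SHIFT) >> (12 + 9 * (level - 1))) & 0x1ff;
 *
 * e.g. for gfn 0x12345:
 *   level 1: (0x12345000 >> 12) & 0x1ff = 0x145
 *   level 2: (0x12345000 >> 21) & 0x1ff = 0x091
 */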

spte_to_child_pt() KVM

Given an SPTE and its level, returns a pointer containing the HVA of the child page table referenced by the SPTE. Returns null if there is no such entry.

tdp_ptep_t spte_to_child_pt(u64 spte, int level)
{
	if (!is_shadow_present_pte(spte) || is_last_spte(spte, level))
		return NULL;

	return (tdp_ptep_t)__va(spte_to_pfn(spte) << PAGE_SHIFT);
}

try_step_down() KVM

We have a goal GFN, namely next_last_level_gfn; each step down narrows the range and moves one level closer to that goal.

Steps down one level in the paging structure toward the goal GFN.

Returns true if the iterator was able to step down a level, false otherwise.

static bool try_step_down(struct tdp_iter *iter)
{
	tdp_ptep_t child_pt;

    // Not allowed to go any lower, return false
	if (iter->level == iter->min_level)
		return false;

	/*
	 * Reread the SPTE before stepping down to avoid traversing into page
	 * tables that are no longer linked from this entry.
	 */
	iter->old_spte = kvm_tdp_mmu_read_spte(iter->sptep);

    // Get the child page table that this SPTE points to
	child_pt = spte_to_child_pt(iter->old_spte, iter->level);
	if (!child_pt)
		return false;

	iter->level--;
    // Record this page table as one we have traversed
	iter->pt_path[iter->level - 1] = child_pt;
    // The base GFN covered at this level
	iter->gfn = gfn_round_for_level(iter->next_last_level_gfn, iter->level);
    // Update sptep and old_spte
	tdp_iter_refresh_sptep(iter);
	return true;
}

try_step_side() KVM

Steps to the next entry in the current page table, at the current page table level. The next entry could

  • point to a page backing guest memory
  • or another page table
  • or it could be non-present.

Returns true if the iterator was able to step to the next entry in the page table, false if the iterator was already at the end of the current page table.

static bool try_step_side(struct tdp_iter *iter)
{
	// Check if is already at the end of the current PT.
	if (SPTE_INDEX(iter->gfn << PAGE_SHIFT, iter->level) ==
	    (SPTE_ENT_PER_PAGE - 1))
		return false;

	iter->gfn += KVM_PAGES_PER_HPAGE(iter->level);
	iter->next_last_level_gfn = iter->gfn;
    // This increment is the key step: move sptep to the next entry
	iter->sptep++;
	iter->old_spte = kvm_tdp_mmu_read_spte(iter->sptep);

	return true;
}
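
The stride of a sideways step therefore depends on the level; as a quick sanity check (assuming 4K base pages and 512 entries per table):

/*
 * level 1 (4K PTE): KVM_PAGES_PER_HPAGE(1) = 1       -> gfn advances by 1
 * level 2 (2M PSE): KVM_PAGES_PER_HPAGE(2) = 512     -> gfn advances by 512
 * level 3 (1G PSE): KVM_PAGES_PER_HPAGE(3) = 262144  -> gfn advances by 512^2
 */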

try_step_up() KVM

Tries to traverse back up a level in the paging structure so that the walk can continue from the next entry in the parent page table. Returns true on a successful step up, false if already in the root page.

static bool try_step_up(struct tdp_iter *iter)
{
    // Already at the root level
	if (iter->level == iter->root_level)
		return false;

	iter->level++;
	iter->gfn = gfn_round_for_level(iter->gfn, iter->level);
	tdp_iter_refresh_sptep(iter);
	return true;
}

tdp_iter_next() KVM

Implements the DFS algorithm by combining try_step_down(), try_step_side() and try_step_up().

/*
 * Step to the next SPTE in a pre-order traversal of the paging structure.
 * To get to the next SPTE, the iterator either steps down towards the goal
 * GFN, if at a present, non-last-level SPTE, or over to a SPTE mapping a
 * higher GFN.
 *
 * The basic algorithm is as follows:
 * 1. If the current SPTE is a non-last-level SPTE, step down into the page
 *    table it points to.
 * 2. If the iterator cannot step down, it will try to step to the next SPTE
 *    in the current page of the paging structure.
 * 3. If the iterator cannot step to the next entry in the current page, it will
 *    try to step up to the parent paging structure page. In this case, that
 *    SPTE will have already been visited, and so the iterator must also step
 *    to the side again.
 */
void tdp_iter_next(struct tdp_iter *iter)
{
    // If yielded was set (the walk yielded the mmu_lock), restart from the root
	if (iter->yielded) {
		tdp_iter_restart(iter);
		return;
	}

    // DFS: try to step down towards the goal first
	if (try_step_down(iter))
		return;

    // Getting here means we could not step down any further
	do {
        // Try the next of the 512 entries in the current page table
		if (try_step_side(iter))
			return;
    // If we were at the last of the 512 entries, keep stepping up
	} while (try_step_up(iter));

    // We have walked past the last entry; the walk is finished
	iter->valid = false;
}
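
As an illustration of the resulting visit order (values made up: root_level = 2, min_level = PG_LEVEL_4K, every PDE present, walking from GFN 0):

/*
 * PDE[0]   (level 2)        <- try_step_down() into its page table
 *   PTE[0..511] (level 1)   <- try_step_side() across the leaf entries
 * PDE[1]   (level 2)        <- try_step_up() back to level 2, then try_step_side()
 *   PTE[0..511]
 * ...
 * PDE[511] and its PTEs     <- after the last PTE, the walk can neither step
 *                              side nor up, so iter->valid becomes false
 */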

for_each_tdp_pte() / for_each_tdp_pte_min_level() KVM

// A thin wrapper with min_level fixed to 4K
#define for_each_tdp_pte(iter, root, start, end) \
	for_each_tdp_pte_min_level(iter, root, PG_LEVEL_4K, start, end)

// Pre-order traversal of all PSEs and PTEs between start and end
#define for_each_tdp_pte_min_level(iter, root, min_level, start, end) \
	for (tdp_iter_start(&iter, root, min_level, start); \
	     iter.valid && iter.gfn < end;		     \
	     tdp_iter_next(&iter))

Helper macros defined in other files.

arch/x86/kvm/mmu/tdp_mmu.c:

// Exactly the same as for_each_tdp_pte, so why define it?
#define tdp_root_for_each_pte(_iter, _root, _start, _end) \
	for_each_tdp_pte(_iter, _root, _start, _end)

// Skip the PSEs encountered during the walk and only stop at each leaf PTE
#define tdp_root_for_each_leaf_pte(_iter, _root, _start, _end)	\
	tdp_root_for_each_pte(_iter, _root, _start, _end)		\
		if (!is_shadow_present_pte(_iter.old_spte) ||		\
		    !is_last_spte(_iter.old_spte, _iter.level))		\
			continue;					\
		else

// Basically the same as for_each_tdp_pte, except the root is derived from the mmu
#define tdp_mmu_for_each_pte(_iter, _mmu, _start, _end)		\
	for_each_tdp_pte(_iter, root_to_sp(_mmu->root.hpa), _start, _end)
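
To close, a hypothetical usage sketch (count_leaf_sptes() is made up for illustration; real callers in tdp_mmu.c additionally take mmu_lock and enter an RCU read-side section as required):

// Count the shadow-present leaf SPTEs mapping GFNs in [start, end) under the
// given TDP MMU root.
static int count_leaf_sptes(struct kvm_mmu_page *root, gfn_t start, gfn_t end)
{
	struct tdp_iter iter;
	int count = 0;

	tdp_root_for_each_leaf_pte(iter, root, start, end)
		count++;

	return count;
}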