"Steal time" originally refers to time stolen from a VM by the hypervisor in a virtualized environment. Strictly speaking, it is the time during which a vCPU is ready to run but is not actually running on a physical CPU.

The point of the steal time (st) statistic is to show the guest what share of the CPU it actually gets, so the guest can adjust its behavior accordingly and avoid hurting its own workload. A high st value means the vCPU is getting only a small share of the pCPU, i.e. the hypervisor as a whole is heavily loaded; compute-heavy tasks can then throttle themselves, for example based on the value sketched below.
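As a minimal illustration (user-space code, not from the kernel sources), a guest process can observe the aggregate steal time from the 8th field of the "cpu" line in /proc/stat and decide whether to throttle itself:

// Field layout of the "cpu" line in /proc/stat:
// user nice system idle iowait irq softirq steal guest guest_nice
#include <stdio.h>

static long long read_steal_jiffies(void)
{
	long long v[8] = {0};
	FILE *f = fopen("/proc/stat", "r");

	if (!f)
		return -1;
	if (fscanf(f, "cpu %lld %lld %lld %lld %lld %lld %lld %lld",
		   &v[0], &v[1], &v[2], &v[3], &v[4], &v[5], &v[6], &v[7]) != 8)
		v[7] = -1;
	fclose(f);
	return v[7];	/* steal, in USER_HZ ticks */
}

int main(void)
{
	printf("steal ticks: %lld\n", read_steal_jiffies());
	return 0;
}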

kvm_steal_clock()/pv_steal_clock

A static call, pv_steal_clock, is defined with native_steal_clock() as its default implementation:

static u64 native_steal_clock(int cpu)
{
	return 0;
}

DEFINE_STATIC_CALL(pv_steal_clock, native_steal_clock);
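Callers do not invoke the static call directly; on x86 it is reached through a thin wrapper, roughly as follows (a sketch based on paravirt_steal_clock() in arch/x86/include/asm/paravirt.h; details vary by kernel version):

static inline u64 paravirt_steal_clock(int cpu)
{
	return static_call(pv_steal_clock)(cpu);
}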

The static call target is updated during guest kernel initialization: the guest checks whether the KVM_FEATURE_STEAL_TIME feature is available, and if so, repoints the static call to kvm_steal_clock():

 kvm_guest_init
    static_call_update(pv_steal_clock, kvm_steal_clock);

static u64 kvm_steal_clock(int cpu)
{
	u64 steal;
	struct kvm_steal_time *src;
	int version;

	// the per-cpu steal_time area shared with the hypervisor
	src = &per_cpu(steal_time, cpu);
	do {
		version = src->version;
		virt_rmb();
		steal = src->steal;
		virt_rmb();
		// retry if the hypervisor was updating the structure (odd
		// version) or updated it while we were reading
	} while ((version & 1) || (version != src->version));

	return steal;
}

MSR_KVM_STEAL_TIME

The MSR takes the 64-byte aligned physical address of a memory area which must be in guest RAM, plus an enable bit in bit 0. This memory is expected to hold a copy of struct kvm_steal_time, whose data will be filled in by the hypervisor periodically. Only one write, or registration, is needed for each vCPU. The interval between updates of this structure is arbitrary and implementation-dependent. The hypervisor may update this structure at any time it sees fit until anything with bit 0 == 0 is written to it. The guest is required to make sure this structure is initialized to zero.
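In other words, the value written to the MSR is the 64-byte-aligned GPA of the structure OR'ed with the enable bit. A minimal sketch of composing and decomposing it (KVM_MSR_ENABLED is the real kernel define for bit 0; STEAL_TIME_ALIGN and the helper names are illustrative):

#include <stdint.h>

#define KVM_MSR_ENABLED		1ULL	/* bit 0: enable */
#define STEAL_TIME_ALIGN	64ULL	/* illustrative name for the 64-byte alignment */

/* Guest side: compose the MSR value from the aligned GPA of the structure. */
static inline uint64_t steal_time_msr_val(uint64_t gpa)
{
	return (gpa & ~(STEAL_TIME_ALIGN - 1)) | KVM_MSR_ENABLED;
}

/* Host side: recover the GPA by masking off the low (enable/reserved) bits. */
static inline uint64_t steal_time_gpa(uint64_t msr_val)
{
	return msr_val & ~(STEAL_TIME_ALIGN - 1);
}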

MSR_KVM_STEAL_TIME is set by the guest kernel (only once, at registration time).

First, the global flag has_steal_clock is set:

kvm_guest_init
    if (kvm_para_has_feature(KVM_FEATURE_STEAL_TIME)) {
		has_steal_clock = 1;
	}

Then the GPA of the per-vCPU steal_time variable is written to MSR_KVM_STEAL_TIME:

kvm_guest_cpu_init
    if (has_steal_clock)
        kvm_register_steal_time
            wrmsrl(MSR_KVM_STEAL_TIME, (slow_virt_to_phys(st) | KVM_MSR_ENABLED));
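A condensed sketch of this registration path (modeled on kvm_register_steal_time() in arch/x86/kernel/kvm.c; the per-cpu definition and log message may differ slightly across kernel versions):

static DEFINE_PER_CPU_DECRYPTED(struct kvm_steal_time, steal_time) __aligned(64);

static void kvm_register_steal_time(void)
{
	int cpu = smp_processor_id();
	struct kvm_steal_time *st = &per_cpu(steal_time, cpu);

	if (!has_steal_clock)
		return;

	// tell KVM where this vCPU's kvm_steal_time lives and set the enable bit
	wrmsrl(MSR_KVM_STEAL_TIME, (slow_virt_to_phys(st) | KVM_MSR_ENABLED));
	pr_info("stealtime: cpu %d, msr %llx\n", cpu,
		(unsigned long long) slow_virt_to_phys(st));
}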

KVM handling:

int kvm_set_msr_common(struct kvm_vcpu *vcpu, struct msr_data *msr_info)
{
    //...
    case MSR_KVM_STEAL_TIME:
        //...
		vcpu->arch.st.msr_val = data;
        //...
		kvm_make_request(KVM_REQ_STEAL_UPDATE, vcpu);
		break;
}

KVM handling when entering guest:

vcpu_enter_guest
    if (kvm_check_request(KVM_REQ_STEAL_UPDATE, vcpu))
		record_steal_time(vcpu);

record_steal_time()

// called from vcpu_enter_guest:
//     if (kvm_check_request(KVM_REQ_STEAL_UPDATE, vcpu))
static void record_steal_time(struct kvm_vcpu *vcpu)
{
	struct gfn_to_hva_cache *ghc = &vcpu->arch.st.cache;
	struct kvm_steal_time __user *st;
	struct kvm_memslots *slots;
	gpa_t gpa = vcpu->arch.st.msr_val & KVM_STEAL_VALID_BITS;
	u64 steal;
	u32 version;

    //...
    // the host virtual address mapping the GPA of the guest's steal_time variable
	st = (struct kvm_steal_time __user *)ghc->hva;
	/*
	 * Doing a TLB flush here, on the guest's behalf, can avoid
	 * expensive IPIs.
	 */
	if (guest_pv_has(vcpu, KVM_FEATURE_PV_TLB_FLUSH)) {
        //...
        // atomically read the shared st->preempted field into st_preempted
		asm volatile("1: xchgb %0, %2\n"
			     "xor %1, %1\n"
			     "2:\n"
			     _ASM_EXTABLE_UA(1b, 2b)
			     : "+q" (st_preempted),
			       "+&r" (err),
			       "+m" (st->preempted));

		vcpu->arch.st.preempted = 0;

        // flush TLB if it is requested
		if (st_preempted & KVM_VCPU_FLUSH_TLB)
			kvm_vcpu_flush_tlb_guest(vcpu);
        //...
	}

	unsafe_get_user(version, &st->version, out);

    // make sure version is even here
	if (version & 1)
		version += 1;  /* first time write, random junk */

    // make the version odd to indicate an in-progress update
	version += 1;
    // write this odd version to the corresponding place in guest memory
	unsafe_put_user(version, &st->version, out);

    // get steal from GPA
	unsafe_get_user(steal, &st->steal, out);
    // run_delay is an accumulated value: it records how much time this task
    // has spent waiting on a run queue so far.
    // Increase steal by the delta between two run_delay samples
    // (run_delay_1 - run_delay_0). Accumulated over multiple updates, the
    // result is run_delay_n - run_delay_0, i.e. how long the vCPU spent
    // waiting to be scheduled during that period.
	steal += current->sched_info.run_delay - vcpu->arch.st.last_steal;
    // record last_steal for next update
	vcpu->arch.st.last_steal = current->sched_info.run_delay;
    // write the steal time back
	unsafe_put_user(steal, &st->steal, out);

    // write the version back (even again) to indicate that the in-progress update has ended
	version += 1;
	unsafe_put_user(version, &st->version, out);
    //...
}
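On the guest side, the accumulated steal value is consumed as deltas between reads. A simplified sketch in the spirit of the scheduler's steal accounting (the real logic lives in kernel/sched/cputime.c, steal_account_process_time(); prev_steal and account_steal_delta are illustrative names, not kernel symbols):

static DEFINE_PER_CPU(u64, prev_steal);	/* illustrative, not the real field name */

static u64 account_steal_delta(int cpu)
{
	u64 now = paravirt_steal_clock(cpu);	/* resolves to kvm_steal_clock() */
	u64 delta = now - per_cpu(prev_steal, cpu);

	per_cpu(prev_steal, cpu) = now;
	/* delta (in ns) is what shows up as "steal" in guest CPU accounting */
	return delta;
}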

struct kvm_steal_time

// per vcpu
struct kvm_steal_time {
    // the amount of time in which this vCPU did not run, in
	// nanoseconds. Time during which the vCPU is idle will not be
	// reported as steal time.
	__u64 steal;

    // a sequence counter. The guest has to check this field before and
    // after grabbing the time information and make sure the two reads
	// are both equal and even. An odd version indicates an in-progress
	// update.
	__u32 version;
	__u32 flags;

    // indicates whether the vCPU that owns this struct has been preempted.
    // Non-zero values mean the vCPU has been preempted. Zero means the
	// vCPU is not preempted. NOTE: it is always zero if the hypervisor
	// doesn't support this field.
	__u8  preempted;
	__u8  u8_pad[3];
	__u32 pad[11];
};
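For reference, the bits used in the preempted field (as defined in arch/x86/include/uapi/asm/kvm_para.h in recent kernels):

#define KVM_VCPU_PREEMPTED          (1 << 0)
#define KVM_VCPU_FLUSH_TLB          (1 << 1)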

Update the shared steal_time variable with the new steal time each time the vCPU is scheduled in

__fire_sched_in_preempt_notifiers
    kvm_sched_in  // registered via kvm_preempt_ops.sched_in = kvm_sched_in;
        kvm_arch_vcpu_load
            kvm_make_request(KVM_REQ_STEAL_UPDATE, vcpu);
                // handled later in vcpu_enter_guest:
                record_steal_time
                    // flush TLB if requested
                    // update "steal" to the latest value of steal time

Update preempted each time the vCPU is scheduled out

Each time the vCPU is scheduled out, KVM sets the KVM_VCPU_PREEMPTED bit in the shared st->preempted field to indicate that the vCPU has been preempted. The guest kernel can use this to do a PV TLB flush instead of sending an IPI, which reduces overhead, as sketched below.
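On the guest side this looks roughly like the following (a simplified sketch modeled on kvm_flush_tlb_multi() in arch/x86/kernel/kvm.c; the cpumask variable name and surrounding checks, e.g. skipping offline vCPUs and the current CPU, differ across kernel versions). For each preempted target vCPU, it sets KVM_VCPU_FLUSH_TLB in that vCPU's steal_time area instead of IPI'ing it, and KVM then flushes the TLB at the next VM-entry (see the KVM_VCPU_FLUSH_TLB check in record_steal_time() above):

static void kvm_flush_tlb_multi(const struct cpumask *cpumask,
				const struct flush_tlb_info *info)
{
	u8 state;
	int cpu;
	struct kvm_steal_time *src;
	struct cpumask *flushmask = this_cpu_cpumask_var_ptr(__pv_cpu_mask);

	cpumask_copy(flushmask, cpumask);
	/*
	 * Only IPI vCPUs that are actually running; for preempted vCPUs,
	 * queue the flush in their steal_time area so KVM performs it on
	 * the next VM-entry instead.
	 */
	for_each_cpu(cpu, flushmask) {
		src = &per_cpu(steal_time, cpu);
		state = READ_ONCE(src->preempted);
		if ((state & KVM_VCPU_PREEMPTED)) {
			if (try_cmpxchg(&src->preempted, &state,
					state | KVM_VCPU_FLUSH_TLB))
				__cpumask_clear_cpu(cpu, flushmask);
		}
	}

	native_flush_tlb_multi(flushmask, info);
}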

kvm_sched_out
    kvm_arch_vcpu_put
        kvm_steal_time_set_preempted
            if (!copy_to_user_nofault(&st->preempted, &preempted, sizeof(preempted)))
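For completeness, a condensed sketch of kvm_steal_time_set_preempted() (following the same ghc->hva pattern as record_steal_time(); the elided parts validate and refresh the gfn-to-hva cache):

static void kvm_steal_time_set_preempted(struct kvm_vcpu *vcpu)
{
	struct gfn_to_hva_cache *ghc = &vcpu->arch.st.cache;
	struct kvm_steal_time __user *st;
	static const u8 preempted = KVM_VCPU_PREEMPTED;

	//...
	// the host virtual address of the guest's steal_time structure
	st = (struct kvm_steal_time __user *)ghc->hva;

	// mark the vCPU as preempted in the guest-visible structure
	if (!copy_to_user_nofault(&st->preempted, &preempted, sizeof(preempted)))
		vcpu->arch.st.preempted = KVM_VCPU_PREEMPTED;
	//...
}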