Misc notes on kvmclock

The VMCS has a bit in the Primary Processor-Based VM-Execution Controls called "Use TSC offsetting", which is documented as:

This control determines whether executions of RDTSC, executions of RDTSCP, and executions of RDMSR that read from the IA32_TIME_STAMP_COUNTER MSR return a value modified by the TSC offset field.

TSC offsetting still matters even when kvmclock is not the current clocksource for system time: in that case the guest uses the TSC as its clocksource, and its RDTSC reads are modified by the offset.

When the TSC frequency changes, KVM_REQ_CLOCK_UPDATE must be sent to every vCPU on that pCPU so that their TSC-to-nanosecond conversion parameters are updated.

If vcpu->arch.tsc_always_catchup == 1, KVM_REQ_CLOCK_UPDATE is sent to the vCPU on every VM exit, so that the guest TSC constantly catches up to a vTSCfreq that is higher than pTSCfreq.

As a clocksource, kvm_clock runs at 1 GHz; the value returned by its read callback is actually the System Time (i.e. CLOCK_BOOTTIME) reading in nanoseconds.

Note that when both the host and the guest are stable, the guest actually uses the TSC directly as its clocksource, and kvmclock is only used to provide the wall clock.

The vTSC frequency used by kvmclock is the frequency at which the guest TSC actually runs; it is not necessarily equal to the vTSCfreq set via KVM_SET_TSC_KHZ (for example, they differ when the host TSC frequency is variable). Each time a pCPU's frequency changes, cpu_tsc_khz is updated and KVM_REQ_CLOCK_UPDATE is sent to all vCPUs on that pCPU, but the scaling ratio never changes.

kvmclock_cpufreq_notifier
static void __kvmclock_cpufreq_notifier(struct cpufreq_freqs *freq, int cpu)
{
	list_for_each_entry(kvm, &vm_list, vm_list) {
		kvm_for_each_vcpu(i, vcpu, kvm) {
			if (vcpu->cpu != cpu)
				continue;
			kvm_make_request(KVM_REQ_CLOCK_UPDATE, vcpu);
            //...
		}
	}
}

When QEMU gets the supported CPUID from KVM:

kvm_dev_ioctl
    kvm_arch_dev_ioctl
        kvm_dev_ioctl_get_cpuid
            get_cpuid_func
                do_cpuid_func
                    __do_cpuid_func
                        case KVM_CPUID_FEATURES:
                            entry->eax = (1 << KVM_FEATURE_CLOCKSOURCE) |
                                          //...
                                         (1 << KVM_FEATURE_CLOCKSOURCE2) |

When the host's TSC supports TSC scaling, the TSC catchup mechanism is not used.

When the host's TSC is unstable, masterclock is not used.

When the host's TSC is stable, kvmclock is not used to read the system time; it is only used to read the wall time.

vgettsc() KVM

It updates tsc_timestamp to the current TSC value, and returns (current tsc - clock->cycle_last) * clock->mult.

static inline u64 vgettsc(struct pvclock_clock *clock, u64 *tsc_timestamp, int *mode)
{
	long v;
	u64 tsc_pg_val;

	switch (clock->vclock_mode) {
    // Hyper-v case...
    // ...
	case VDSO_CLOCKMODE_TSC:
		*mode = VDSO_CLOCKMODE_TSC;
		*tsc_timestamp = read_tsc();
		v = (*tsc_timestamp - clock->cycle_last) &
			clock->mask;
		break;
	default:
		*mode = VDSO_CLOCKMODE_NONE;
	}

	if (*mode == VDSO_CLOCKMODE_NONE)
		*tsc_timestamp = v = 0;

	return v * clock->mult;
}

do_monotonic_raw() KVM

static int do_monotonic_raw(s64 *t, u64 *tsc_timestamp)
{
	struct pvclock_gtod_data *gtod = &pvclock_gtod_data;
	unsigned long seq;
	int mode;
	u64 ns;

    //...
    // get the shifted ns part of the raw monotonic clock (xtime_nsec style, so it is shifted)
    ns = gtod->raw_clock.base_cycles;

    // Plus delta and shift back, we get the real ns value
    ns += vgettsc(&gtod->raw_clock, tsc_timestamp, &mode);
    ns >>= gtod->raw_clock.shift;


    ns += ktime_to_ns(ktime_add(gtod->raw_clock.offset, gtod->offs_boot));
    //...

	*t = ns;

	return mode;
}

kvm_get_time_and_clockread() KVM

/* returns true if host is using TSC based clocksource */
static bool kvm_get_time_and_clockread(s64 *kernel_ns, u64 *tsc_timestamp)
{
    // Only VDSO_CLOCKMODE_TSC and VDSO_CLOCKMODE_HVCLOCK count as the host
    // using a TSC-based clocksource.
	if (!gtod_is_based_on_tsc(pvclock_gtod_data.clock.vclock_mode))
		return false;

	return gtod_is_based_on_tsc(do_monotonic_raw(kernel_ns,
						      tsc_timestamp));
}

kvm_vcpu_write_tsc_multiplier() KVM

Mainly used for writing the TSC scaling ratio (multiplier).

This function is called from two places:

set_tsc_khz
    // if the value being set is 0, write the default ratio
    kvm_vcpu_write_tsc_multiplier
    // otherwise, compute the ratio as vTSCfreq / pTSCfreq and write it
    kvm_vcpu_write_tsc_multiplier
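
In the second case the ratio is a fixed-point value. A minimal sketch of the computation, assuming the fractional-bit width exposed by kvm_caps.tsc_scaling_ratio_frac_bits (48 on VMX, 32 on SVM) and ignoring the overflow check the real set_tsc_khz() performs:

// Hypothetical helper, not the kernel's code: ratio = vTSCfreq / pTSCfreq
// scaled by 2^frac_bits (the kernel uses mul_u64_u32_div() for this).
static u64 compute_tsc_scaling_ratio(u32 user_tsc_khz, u32 host_tsc_khz,
				     unsigned int frac_bits)
{
	return (u64)(((unsigned __int128)user_tsc_khz << frac_bits) / host_tsc_khz);
}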

kvm_read_l1_tsc() KVM

Although its name says "read", it recomputes the value each time rather than reading it from a variable.

Calculate l1's tsc using the formula: l1's tsc = l1's ratio * host_tsc + l1's offset.

l1's ratio = vTSCfreq / pTSCfreq.

u64 kvm_read_l1_tsc(struct kvm_vcpu *vcpu, u64 host_tsc)
{
	return vcpu->arch.l1_tsc_offset +
		kvm_scale_tsc(host_tsc, vcpu->arch.l1_tsc_scaling_ratio);
}
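
Applying the ratio is a widening multiply followed by a shift that drops the fractional bits; a simplified sketch of what kvm_scale_tsc() does (the kernel uses mul_u64_u64_shr(); frac_bits as in the note above):

// Hypothetical sketch: guest_ticks = host_ticks * ratio, where ratio carries
// frac_bits fractional bits.
static u64 scale_tsc_sketch(u64 host_tsc, u64 ratio, unsigned int frac_bits)
{
	return (u64)(((unsigned __int128)host_tsc * ratio) >> frac_bits);
}

With this, kvm_read_l1_tsc() is simply l1_tsc_offset + scale_tsc_sketch(host_tsc, l1_tsc_scaling_ratio, frac_bits).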

compute_guest_tsc() KVM

Using the real time elapsed between two host CLOCK_BOOTTIME readings and the guest TSC value written last time, compute the theoretical guest TSC; it is used to correct the TSC value computed from vcpu->arch.l1_tsc_offset and vcpu->arch.l1_tsc_scaling_ratio.

Why does the TSC value computed from those variables need to be corrected?

// kernel_ns is the CLOCK_BOOTTIME
static u64 compute_guest_tsc(struct kvm_vcpu *vcpu, s64 kernel_ns)
{
    // kernel_ns - vcpu->arch.this_tsc_nsec is the time elapsed between this kernel_ns and the kernel_ns recorded at the last TSC write
	u64 tsc = pvclock_scale_delta(kernel_ns-vcpu->arch.this_tsc_nsec,
				      vcpu->arch.virtual_tsc_mult,
				      vcpu->arch.virtual_tsc_shift);

    // arch.this_tsc_write is the TSC value written last time; add the ticks
    // corresponding to the real time elapsed to get the up-to-date TSC value, and return it.
	tsc += vcpu->arch.this_tsc_write;
	return tsc;
}
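
A concrete example with made-up numbers: if vcpu->arch.virtual_tsc_khz is 2,000,000 (a 2 GHz vTSC) and kernel_ns - vcpu->arch.this_tsc_nsec is 1,000,000 ns (1 ms of real time since the last TSC write), pvclock_scale_delta() yields 2,000,000 ticks, so the theoretical guest TSC is this_tsc_write + 2,000,000.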

TSC Matching / kvm_synchronize_tsc / MSR_IA32_TSC

The TSC matching mechanism tries its best to keep the vTSCs synchronized: if the vTSCs of different vCPUs differ only slightly (say, the delta is within 1 second), it makes them identical and enters master clock mode.

MSR_IA32_TSC is passed through to the guest for reads. Userspace can set the vTSC value:

static u32 vmx_possible_passthrough_msrs[MAX_POSSIBLE_PASSTHROUGH_MSRS] = {
	MSR_IA32_TSC,
    //...
};

// vmx_vcpu_create(), arch/x86/kvm/vmx/vmx.c
// enable passthrough for read operation on MSR_IA32_TSC
vmx_disable_intercept_for_msr(vcpu, MSR_IA32_TSC, MSR_TYPE_R);


// QEMU only writes 0, when each vCPU's registers are put for the first time; since env->tsc is initialized to 0, the value written in is 0 as well
kvm_start_vcpu_thread
    kvm_vcpu_thread_fn
        kvm_cpu_exec
            kvm_arch_put_registers
                kvm_put_msrs
                    kvm_put_msrs_vm
                        kvm_msr_entry_add(cpu, MSR_IA32_TSC, env->tsc);

KVM wants to keep the vTSCs synchronized; KVM calls this state masterclock mode. Each time a vTSC is set by userspace this mode may be broken, so the TSC matching mechanism is used to re-synchronize them. Specifically, when a vTSC update request comes in:

  • If this vTSC does not match the VM-global one, we regard it as an update request for the global vTSC, so we assign the value to the global one as the new TSC and move on to the next generation;
  • If this vTSC roughly matches the global one but they are not identical, or the value is 0, userspace wants to calibrate this vCPU against the global value, so we assign the vCPU's vTSC from the global value and increase the number of matched vCPUs by 1.

When all the vCPUs are matched, we send a master clock update request to indicate we are now in master clock mode.

  • If the host TSC is unstable, set the TSC offset to $(vTSC + elapsed) - (pTSC * scale)$, i.e. assume the value written this time is the same as the one written last time.
  • If it is stable, use the global offset for this vCPU, because a stable host TSC means these MSR values are the same on all pCPUs.

In either case, after this is done, the current vCPU's generation, tsc_write, and CLOCK_BOOTTIME must be synchronized with the global (VM-wide) values.

How do we decide that a write is a synchronization request?

  • If the value is 0, it is a synchronization request;
  • If the value is within 1 second of the value extrapolated from the last written TSC (no matter which vCPU wrote it), it is treated as a synchronization request.

kvm_synchronize_tsc() is called each time QEMU writes to MSR_IA32_TSC:

case MSR_IA32_TSC:
    if (msr_info->host_initiated) {
        kvm_synchronize_tsc(vcpu, data);

kvm_vm_ioctl_create_vcpu
    kvm_arch_vcpu_postcreate
        kvm_synchronize_tsc(vcpu, 0);

static void kvm_synchronize_tsc(struct kvm_vcpu *vcpu, u64 data)
{
	struct kvm *kvm = vcpu->kvm;
	u64 offset, ns, elapsed;
	unsigned long flags;
	bool matched = false;
	bool synchronizing = false;

    //...
    // because when tsc offsetting and tsc scaling are both enabled
    // the tsc perceived by the guest should be calculated:
    //      vTSC = pTSC * scale + offset
    // so:
    //      offset = (vTSC - pTSC * scale)
	offset = kvm_compute_l1_tsc_offset(vcpu, data);
    // get host CLOCK_BOOTTIME
	ns = get_kvmclock_base_ns();
    // kvm->arch.last_tsc_nsec is the last host CLOCK_BOOTTIME
    // so elapsed denote the real time elapsed.
	elapsed = ns - kvm->arch.last_tsc_nsec;

    // if we haven't set the vTSCfreq, we shouldn't synchronize
	if (vcpu->arch.virtual_tsc_khz) {
		if (data == 0) {
            /*
             * QEMU: we are hot-plugging a new CPU.
			 * detection of vcpu initialization -- need to sync
			 * with other vCPUs. This particularly helps to keep
			 * kvm_clock stable after CPU hotplug
			 */
			synchronizing = true;
		} else {
            // calculate the expected vTSC based on last tsc
            // and time elapsed
			u64 tsc_exp = kvm->arch.last_tsc_write + elapsed;
            // 1 second
			u64 tsc_hz = vcpu->arch.virtual_tsc_khz * 1000LL;
			/*
			 * TSC write within a small delta (1 second)
			 * interpreted as an attempt to synchronize the CPU.
             *       | tsc_exp - data | < 1 second
			 */ 
			synchronizing = data < tsc_exp + tsc_hz &&
					data + tsc_hz > tsc_exp;
		}
	}

    // For the first call, this condition is not met even if "synchronizing" is true,
    // because last_tsc_khz has not yet been set to virtual_tsc_khz.
	if (synchronizing && vcpu->arch.virtual_tsc_khz == kvm->arch.last_tsc_khz) {
        // host TSCs are synchronized, we can let all the vcpus
        // use the same tsc_offset
		if (host tsc is stable) {
			offset = kvm->arch.cur_tsc_offset;
        // Once the TSC is marked unstable, it stays unstable for the rest of the boot...
        // An unstable TSC indicates the host TSCs are not synchronized,
        // so we cannot use the global shared offset, because it is meaningless.
        // We assume userspace writes the **same value** to all the vcpus within a short
        // period of time to try to synchronize them, so we compensate manually here for
        // the small time difference of each write.
        // Also, when the host tsc is unstable, masterclock WON'T be used.
		} else {
			data += elapsed;
            // "offset" will be written to VMCS later
            // I think this is related to TSC catchup
            // From log, offset is getting a bit smaller each time
            // That's because host's tsc is growing, however the guest's
            // tsc we want to set is always the same
            // Ratio is default to 1.
            // Because host's tsc is greater than guest's
            // So offset is a large value for overflowing add.
			offset = kvm_compute_l1_tsc_offset(vcpu, data);
		}
		matched = true;
	}

	__kvm_synchronize_tsc(vcpu, offset, data, ns, matched);
}

static void __kvm_synchronize_tsc(struct kvm_vcpu *vcpu, u64 offset, u64 tsc,
				  u64 ns, bool matched)
{
    //...
	// track to allow the matching interval to be extended at each write.
    // CLOCK_BOOTTIME at that time
    // Used to calculate elapsed time between 2 write
	kvm->arch.last_tsc_nsec = ns;

    // Used to calculate expected vTSC
	kvm->arch.last_tsc_write = tsc; // vTSC value to write

    // Used to judge if vTSCfreq is updated
	kvm->arch.last_tsc_khz = vcpu->arch.virtual_tsc_khz; // vTSCfreq

    // ...
    // Write the offset to VMCS
	kvm_vcpu_write_tsc_offset(vcpu, offset);

	if (!matched) {
		/*
		 * We split periods of matched TSC writes into generations.
		 * For each generation, we track the original measured
		 * nanosecond time, offset, and write, so if TSCs are in
		 * sync, we can match exact offset, and if not, we can match
		 * exact software computation in compute_guest_tsc()
		 *
		 * These values are tracked in kvm->arch.cur_xxx variables.
		 */
		kvm->arch.cur_tsc_generation++;

        // Global's data should be updated, because we are assigning
        // the data we provided to the global one
		kvm->arch.cur_tsc_nsec = ns;
		kvm->arch.cur_tsc_write = tsc;
		kvm->arch.cur_tsc_offset = offset;
		kvm->arch.nr_vcpus_matched_tsc = 0;
	} else if (vcpu->arch.this_tsc_generation != kvm->arch.cur_tsc_generation) {
		kvm->arch.nr_vcpus_matched_tsc++;
	}

	/* Keep track of which generation this VCPU has synchronized to */
	vcpu->arch.this_tsc_generation = kvm->arch.cur_tsc_generation;
	vcpu->arch.this_tsc_nsec = kvm->arch.cur_tsc_nsec;
	vcpu->arch.this_tsc_write = kvm->arch.cur_tsc_write;

    // update masterclock status by sending request `KVM_REQ_MASTERCLOCK_UPDATE`
	kvm_track_tsc_matching(vcpu);
}

get_kvmclock_base_ns

// Get the host's CLOCK_BOOTTIME-style base used by kvmclock
// (CLOCK_MONOTONIC_RAW + offs_boot)
static s64 get_kvmclock_base_ns(void)
{
	return ktime_to_ns(ktime_add(ktime_get_raw(), pvclock_gtod_data.offs_boot));
}

masterclock?

/*
 *
 * Assuming a stable TSC across physical CPUS, and a stable TSC
 * across virtual CPUs, the following condition is possible.
 * Each numbered line represents an event visible to both
 * CPUs at the next numbered event.
 *
 * "timespecX" represents host monotonic time. "tscX" represents
 * RDTSC value.
 *
 * 		VCPU0 on CPU0		|	VCPU1 on CPU1
 *
 * 1.  read timespec0,tsc0
 * 2.					| timespec1 = timespec0 + N
 * 					| tsc1 = tsc0 + M
 * 3. transition to guest		| transition to guest
 * 4. ret0 = timespec0 + (rdtsc - tsc0) |
 * 5.				        | ret1 = timespec1 + (rdtsc - tsc1)
 * 				        | ret1 = timespec0 + N + (rdtsc - (tsc0 + M))
 *
 * Since ret0 update is visible to VCPU1 at time 5, to obey monotonicity:
 *
 * 	- ret0 < ret1
 *	- timespec0 + (rdtsc - tsc0) < timespec0 + N + (rdtsc - (tsc0 + M))
 *		...
 *	- 0 < N - M => M < N
 *
 * That is, when timespec0 != timespec1, M < N. Unfortunately that is not
 * always the case (the difference between two distinct xtime instances
 * might be smaller then the difference between corresponding TSC reads,
 * when updating guest vcpus pvclock areas).
 *
 * To avoid that problem, do not allow visibility of distinct
 * system_timestamp/tsc_timestamp values simultaneously: use a master
 * copy of host monotonic time values. Update that master copy
 * in lockstep.
 *
 * Rely on synchronization of host TSCs and guest TSCs for monotonicity.
 *
 */

Masterclock is only used when the host clocksource is TSC-based.

When the TSC is unstable the kernel switches the clocksource to something like HPET, so masterclock is also disabled in that case.

ka->use_master_clock = host_tsc_clocksource && vcpus_matched
            && !ka->backwards_tsc_observed
            && !ka->boot_vcpu_runs_old_kvmclock;

Because kvmclock depends on two quantities, the host boot time and the host TSC, even if the host TSCs are synchronized and the guest TSCs are synchronized, sampling both on pCPU0 and on pCPU1 can yield unequal deltas (and either delta may be the larger one), which can violate kvmclock's monotonicity. We therefore solve this problem by using a single master copy, the master clock.

kvmclock代码学习 - EwanHai - 博客园

Global kvm_guest_has_master_clock

static atomic_t kvm_guest_has_master_clock = ATOMIC_INIT(0);

What's the difference with ka->use_master_clock?

struct kvm_arch {
    //...
	bool use_master_clock;
    // Host's CLOCK_BOOTTIME
	u64 master_kernel_ns;
    // host TSC value at the last masterclock update (when not using the
    // master clock, the corresponding value is just rdtsc())
	u64 master_cycle_now;
    //...
};

ka->use_master_clock

The only place that assigns it is pvclock_update_vm_gtod_copy(), which is called from:

kvm_arch_init_vm
kvm_hyperv_tsc_notifier
kvm_update_masterclock
kvm_vm_ioctl_set_clock
static void pvclock_update_vm_gtod_copy(struct kvm *kvm)
{
    //...
    // host_tsc_clocksource: true if host is using TSC based clocksource
    // 

	ka->use_master_clock = host_tsc_clocksource && vcpus_matched
				&& !ka->backwards_tsc_observed
				&& !ka->boot_vcpu_runs_old_kvmclock;
    //...
}

TSC Catchup / KVM_SET_TSC_KHZ

TSC catchup is a corner-case mechanism; it is only used when

  • TSC scaling is not supported, and
  • vTSCfreq > pTSCfreq.

The TSC is passed through to the guest, which means that even though the guest has asked for a higher frequency than the host, the TSC it reads still advances at the host's frequency, so we need to constantly adjust the TSC offset to catch up.

Where catchup is enabled:

case KVM_SET_TSC_KHZ
    kvm_set_tsc_khz
        set_tsc_khz
            if vTSCfreq == pTSCfreq:
                // do nothing
                return 0
            if TSC scaling VMCS feature is not supported:
                if vTSCfreq > pTSCfreq:
                    vcpu->arch.tsc_catchup = 1;
                    // If this is 1, at each vm-exit send request KVM_REQ_CLOCK_UPDATE
                    // i.e., execute kvm_guest_time_update() before each vm-entry
            		vcpu->arch.tsc_always_catchup = 1;
                    // Do not forget this return clause, this means if we set the tsc_catchup to 1
                    // **we won't set vTSCfreq/pTSCfreq to vcpu->arch.l1_tsc_scaling_ratio**
                    return 0;
                else:
                    // report an error, because the TSC must increase monotonically;
                    // if vTSCfreq were lower than pTSCfreq, the guest TSC would have to
                    // be pulled backwards each time, which would be weird.
            else:
                calculate the scale and write it to VMCS

Place to use catchup:

kvm_guest_time_update() {
    //...
    // Calculate the guest's TSC; this is the value the guest would
    // read right now (it may lag behind the theoretical value below).
	tsc_timestamp = kvm_read_l1_tsc(v, host_tsc);

	/*
	 * We may have to catch up the TSC to match elapsed wall clock
	 * time for two reasons, even if kvmclock is used.
	 *   1) CPU could have been running below the maximum TSC rate
	 *   2) Broken TSC compensation resets the base at each VCPU
	 *      entry to avoid unknown leaps of TSC even when running
	 *      again on the same CPU.  This may cause apparent elapsed
	 *      time to disappear, and the guest to stand still or run
	 *	very slowly.
	 */
	if (vcpu->tsc_catchup) {
        // compute_guest_tsc() calculates what the value should be, i.e. our
        // target, because it is computed from the vTSC value we set plus the
        // real time elapsed.
        // Why not use compute_guest_tsc()'s result directly? Because we must
        // make sure the TSC increases monotonically, so we still compare it
        // with the real value the guest sees, i.e. kvm_read_l1_tsc().
		u64 tsc = compute_guest_tsc(v, kernel_ns);
		if (tsc > tsc_timestamp) {
			adjust_tsc_offset_guest(v, tsc - tsc_timestamp);
			tsc_timestamp = tsc;
		}
	}
    //...
}

The reason why kvm_read_l1_tsc() and compute_guest_tsc() may differ:

  • The TSC may be unstable, i.e. its frequency is not constant, so kvm_read_l1_tsc() may return an inaccurate result. compute_guest_tsc() computes the TSC from the elapsed time and the theoretical frequency, which is a constant (vTSCfreq), so its result should be accurate.

Will TSC catchup make the TSC jump rather than advance smoothly?

Kvmclock-related requests

KVM_REQ_CLOCK_UPDATE

Update this vCPU's clock. This is the fundamental request used by the other requests' handlers.

// This normal request only triggers this function
// it indicates a request to update the corresponding clock
kvm_guest_time_update()

KVM_REQ_GLOBAL_CLOCK_UPDATE

Update all vCPUs' clocks (it will kick the vCPUs out of guest mode).

This eventually sends a KVM_REQ_CLOCK_UPDATE request to, and runs kvm_guest_time_update on, every vCPU.

vcpu_enter_guest
	if (kvm_check_request(KVM_REQ_GLOBAL_CLOCK_UPDATE, vcpu))
        kvm_gen_kvmclock_update
            // make request to vBSP
            kvm_make_request(KVM_REQ_CLOCK_UPDATE, v);
                kvm_guest_time_update()
            // make request to other vcpus
            kvmclock_update_fn() { // INIT_DELAYED_WORK(&kvm->arch.kvmclock_update_work, kvmclock_update_fn);
            	kvm_for_each_vcpu(i, vcpu, kvm) {
            		kvm_make_request(KVM_REQ_CLOCK_UPDATE, vcpu);
                    // kick the vcpu
            		kvm_vcpu_kick(vcpu);
            	}
            }

KVM_REQ_MASTERCLOCK_UPDATE

Update the masterclock status, and update all vCPUs' clocks.

// handler
static void kvm_update_masterclock(struct kvm *kvm)
{
	kvm_hv_request_tsc_page_update(kvm);
    // block all vCPUs: kvm_make_all_cpus_request(kvm, KVM_REQ_BLOCK_VMENTRY);
    // and kick them out of the guest.
	kvm_start_pvclock_update(kvm);

    // 1. judge whether we use masterclock mode or not
    // 2. set the corresponding data in kvm_arch using pvclock_gtod_data
	pvclock_update_vm_gtod_copy(kvm);

    // Update **each** vcpu clock, send "KVM_REQ_CLOCK_UPDATE" to all vcpu.
    // clear block bit, let all vCPUs re-enter to the guest
	kvm_end_pvclock_update(kvm);
}

Kvmclock related structures

Kvmclock guest kernel side

Guest kernel initializes kvmclock

When the guest kernel initializes, it calls kvmclock_init(); this function runs on the BSP:

  • First, register the callback kvmclock_setup_percpu for all vCPUs; it is called when an AP is brought up.
  • Write the physical address of the vBSP's pvclock_vsyscall_time_info (pvti), the structure kvmclock actually uses, to MSR_KVM_SYSTEM_TIME/MSR_KVM_SYSTEM_TIME_NEW with a wrmsr.
  • Register kvmclock into the clocksource list: clocksource_register_hz(&kvm_clock, NSEC_PER_SEC);.

Note: WALL_CLOCK MSR is not written here.

kvmclock 代码学习 - EwanHai - 博客园

start_kernel
    setup_arch
        init_hypervisor_platform
            detect_hypervisor_vendor
                kvm_detect //(*p)->detect();
                    kvm_cpuid_base
                        __kvm_cpuid_base
                            boot_cpu_has(X86_FEATURE_HYPERVISOR)
            kvm_init_platform  // x86_init.hyper.init_platform()
                kvmclock_init

void __init kvmclock_init(void)
{
    // Set up a callback for every CPU: when an AP starts, kvmclock_setup_percpu
    // is executed and the per-cpu pvti struct is initialized.
    // While an AP initializes, start_secondary() calls kvm_setup_secondary_clock()
    // to write the pvti's address to the SYSTEM_TIME MSR and register kvmclock.
	if (cpuhp_setup_state(CPUHP_BP_PREPARE_DYN, "kvmclock:setup_percpu",
			      kvmclock_setup_percpu, NULL) < 0) {
		return;
	}

    // ...
    // wrmsrl(msr_kvm_system_time, pa); pa is GPA of pvti
	kvm_register_clock("primary cpu clock");

    // When we have Invariant TSC and TSC is stable
    // We will lower kvmclock's priority and use tsc directly
	if (boot_cpu_has(X86_FEATURE_CONSTANT_TSC) &&
	    boot_cpu_has(X86_FEATURE_NONSTOP_TSC) &&
	    !check_tsc_unstable())
		kvm_clock.rating = 299;

    // ...
    // register kvmclock into the clocksource list
	clocksource_register_hz(&kvm_clock, NSEC_PER_SEC);
    // ...
}

After the WRMSR VM exit, KVM calls kvm_write_system_time():

  • If the write targets MSR_KVM_SYSTEM_TIME, the guest is using the old kvmclock, which does not support master clock mode. In this case set kvm->arch.boot_vcpu_runs_old_kvmclock = 1 and send a KVM_REQ_MASTERCLOCK_UPDATE to the current vCPU (i.e. vCPU0), which eventually leads to kvm->arch.use_master_clock = 0.
  • Set vcpu->arch.time to the written MSR value (the pvti GPA, with the enable bit); this variable is used to emulate reads of this MSR.
  • Send KVM_REQ_GLOBAL_CLOCK_UPDATE to the current vCPU (i.e. vCPU0), which eventually causes kvm_guest_time_update to run on all vCPUs.
  • Finally, set vcpu->arch.pv_time to the GPA and mark it enabled (vcpu->arch.pv_time_enabled = true).

kvm_write_system_time

static void kvm_write_system_time(struct kvm_vcpu *vcpu, gpa_t system_time,
				  bool old_msr, bool host_initiated)
{
	struct kvm_arch *ka = &vcpu->kvm->arch;

    // This condition holds if the write is from guest's BSP
	if (vcpu->vcpu_id == 0 && !host_initiated) {
        // if guest changes its mind and want to use a newer or older
        // kvm_clock, then issue a `KVM_REQ_MASTERCLOCK_UPDATE` request
        // to update the masterclock globally
		if (ka->boot_vcpu_runs_old_kvmclock != old_msr)
			kvm_make_request(KVM_REQ_MASTERCLOCK_UPDATE, vcpu);

        // Set the kvmclock type
		ka->boot_vcpu_runs_old_kvmclock = old_msr;
	}

    // It represents the GPA of pvti, not a time value
	vcpu->arch.time = system_time;
	kvm_make_request(KVM_REQ_GLOBAL_CLOCK_UPDATE, vcpu);

	/* we verify if the enable bit is set... */
	if (system_time & 1)
		kvm_gpc_activate(&vcpu->arch.pv_time, system_time & ~1ULL,
				 sizeof(struct pvclock_vcpu_time_info));
	else
		kvm_gpc_deactivate(&vcpu->arch.pv_time);

	return;
}

CPUID[0x40000001].EAX is the PV features leaf (KVM_CPUID_FEATURES); bit 0 and bit 3 are both named "kvmclock". These two CPUID bits are passed from userspace to KVM. The difference between them is that bit 3 indicates that the new set of kvmclock MSRs is available:

#define KVM_CPUID_FEATURES	0x40000001
#define KVM_FEATURE_CLOCKSOURCE		0
/* This indicates that the new set of kvmclock msrs
 * are available. The use of 0x11 and 0x12 is deprecated
 */
#define KVM_FEATURE_CLOCKSOURCE2        3

So there are 2 sets of MSRs. If KVM_FEATURE_CLOCKSOURCE:

  • MSR_KVM_WALL_CLOCK: 0x11
  • MSR_KVM_SYSTEM_TIME: 0x12

If KVM_FEATURE_CLOCKSOURCE2:

  • MSR_KVM_WALL_CLOCK_NEW: 0x4b564d00
  • MSR_KVM_SYSTEM_TIME_NEW: 0x4b564d01
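
For reference, these constants live in the shared UAPI header (arch/x86/include/uapi/asm/kvm_para.h) and look roughly like:

#define MSR_KVM_WALL_CLOCK	0x11
#define MSR_KVM_SYSTEM_TIME	0x12
#define MSR_KVM_WALL_CLOCK_NEW	0x4b564d00
#define MSR_KVM_SYSTEM_TIME_NEW	0x4b564d01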

KVM_FEATURE_CLOCKSOURCE_STABLE_BIT: Tell the guest that guest visible TSC value can be fully trusted for kvmclock computations and no warps are expected.

CPUID when starting from the QEMU cmdline:

// arch/x86/kernel/kvmclock.c
struct clocksource kvm_clock = {
	.name	= "kvm-clock",
	.read	= kvm_clock_get_cycles,
	.rating	= 400,
	.mask	= CLOCKSOURCE_MASK(64),
	.flags	= CLOCK_SOURCE_IS_CONTINUOUS,
	.enable	= kvm_cs_enable,
};
EXPORT_SYMBOL_GPL(kvm_clock);

In QEMU:

[FEAT_KVM] = {
	.type = CPUID_FEATURE_WORD,
	.feat_names = {
		"kvmclock", "kvm-nopiodelay", "kvm-mmu", "kvmclock",
		"kvm-asyncpf", "kvm-steal-time", "kvm-pv-eoi", "kvm-pv-unhalt",
		NULL, "kvm-pv-tlb-flush", NULL, "kvm-pv-ipi",
		"kvm-poll-control", "kvm-pv-sched-yield", "kvm-asyncpf-int", "kvm-msi-ext-dest-id",
		NULL, NULL, NULL, NULL,
		NULL, NULL, NULL, NULL,
		"kvmclock-stable-bit", NULL, NULL, NULL,
		NULL, NULL, NULL, NULL,
	},
	.cpuid = { .eax = KVM_CPUID_FEATURES, .reg = R_EAX, },
	.tcg_features = TCG_KVM_FEATURES,
},

"kvmclock" at bit 0 corresponds to KVM_FEATURE_CLOCKSOURCE, and "kvmclock" at bit 3 to KVM_FEATURE_CLOCKSOURCE2.

PVclock is for Linux guests only.

Guest kernel side for handling PV

// represents the hypervisor current guest is running on from the guest kernel's point of view
struct hypervisor_x86 {
	/* Hypervisor name, e.g, "KVM" */
	const char	*name;
	/* Detection routine */
	uint32_t	(*detect)(void);
	/* Hypervisor type */
	enum x86_hypervisor_type type;
	/* init time callbacks */
	struct x86_hyper_init init;
	/* runtime callbacks */
	struct x86_hyper_runtime runtime;
	/* ignore nopv parameter */
	bool ignore_nopv;
};

extern const struct hypervisor_x86 x86_hyper_vmware; // vmware
extern const struct hypervisor_x86 x86_hyper_ms_hyperv; // hyperv
extern const struct hypervisor_x86 x86_hyper_xen_pv; // xen
extern const struct hypervisor_x86 x86_hyper_kvm; // kvm
extern const struct hypervisor_x86 x86_hyper_jailhouse; // jailhouse
extern const struct hypervisor_x86 x86_hyper_acrn; // acrn

const __initconst struct hypervisor_x86 x86_hyper_kvm = {
	.name				= "KVM",
	.detect				= kvm_detect,
	.type				= X86_HYPER_KVM,
	.init.guest_late_init		= kvm_guest_init,
	.init.x2apic_available		= kvm_para_available,
	.init.msi_ext_dest_id		= kvm_msi_ext_dest_id,
	.init.init_platform		= kvm_init_platform,
};

How to disable kvmclock?

Add no-kvmclock to the guest kernel parameters; this sets the global variable kvmclock to 0 to indicate that kvmclock should not be used in the guest.

static int __init parse_no_kvmclock(char *arg)
{
	kvmclock = 0;
	return 0;
}
early_param("no-kvmclock", parse_no_kvmclock);

TSC Virtualization

If the host uses a TSC-based clocksource, the TSC is passed through to the guest as stable.

cpu_tsc_khz

It is a per-CPU global variable:

// arch/x86/kvm/x86.c
static DEFINE_PER_CPU(unsigned long, cpu_tsc_khz);

The host TSC frequency from KVM's point of view:

/*
 * Use the kernel's tsc_khz directly if the TSC is constant, otherwise use KVM's
 * per-CPU value (which may be zero if a CPU is going offline).  Note, tsc_khz
 * can change during boot even if the TSC is constant, as it's possible for KVM
 * to be loaded before TSC calibration completes.  Ideally, KVM would get a
 * notification when calibration completes, but practically speaking calibration
 * will complete before userspace is alive enough to create VMs.
 */
static unsigned long get_cpu_tsc_khz(void)
{
	if (static_cpu_has(X86_FEATURE_CONSTANT_TSC))
		return tsc_khz;
	else
		return __this_cpu_read(cpu_tsc_khz);
}

It is updated each time a pCPU's frequency changes:

notifier_call
    kvmclock_cpufreq_notifier
static void __kvmclock_cpufreq_notifier(struct cpufreq_freqs *freq, int cpu)
{
    //...
    // __this_cpu_write(cpu_tsc_khz, khz);
   	smp_call_function_single(cpu, tsc_khz_changed, freq, 1);
    //...
}

The TSC-related fields in struct kvm_vcpu_arch and struct kvm_arch:

struct kvm_vcpu_arch {
    // value written to MSR_KVM_SYSTEM_TIME/MSR_KVM_SYSTEM_TIME_NEW, i.e. the GPA
    // of the guest pvti with the enable bit: vcpu->arch.time = system_time;
	gpa_t time;
	struct pvclock_vcpu_time_info hv_clock;
	unsigned int hw_tsc_khz;
    // cache mapping the guest pvti page (same GPA as "time", without the enable bit)
	struct gfn_to_pfn_cache pv_time;
	/* set guest stopped flag in pvclock flags field */
	bool pvclock_set_guest_stopped_request;

	u64 l1_tsc_offset;
	u64 tsc_offset; // the final tsc_offset, which is written to the VMCS

	u64 last_guest_tsc;
	u64 last_host_tsc;
	u64 tsc_offset_adjustment;
	u64 this_tsc_nsec; // CLOCK_BOOTTIME
	u64 this_tsc_write; // TSC value written by userspace
	u64 this_tsc_generation;
	bool tsc_catchup; // need to catchup due to some reason, e.g., tsc scaling is not supported by hardware.
	bool tsc_always_catchup; // tsc scaling is not supported by hardware, and vtsc is larger than ptsc

    // these 2 are used to convert nsec to tsc value
	s8 virtual_tsc_shift;
	u32 virtual_tsc_mult;

	u32 virtual_tsc_khz;
	s64 ia32_tsc_adjust_msr;
	u64 l1_tsc_scaling_ratio;
	u64 tsc_scaling_ratio; /* current scaling ratio */
};

// it is possible that the last_* values keep changing while the cur_* values stay the same
struct kvm_arch {
    //...
    /*
	 * This also protects nr_vcpus_matched_tsc which is read from a
	 * preemption-disabled region, so it must be a raw spinlock.
	 */
	raw_spinlock_t tsc_write_lock;
    // last host CLOCK_BOOTTIME
	u64 last_tsc_nsec;
    // last vTSC
	u64 last_tsc_write;
	u32 last_tsc_khz;
	u64 last_tsc_offset;

    // cur
	u64 cur_tsc_nsec;
	u64 cur_tsc_write;
	u64 cur_tsc_offset;
	u64 cur_tsc_generation;
	int nr_vcpus_matched_tsc;

	u32 default_tsc_khz;
    //...
}

cur_tsc_nsec/last_tsc_nsec, cur_tsc_write/last_tsc_write, cur_tsc_offset/last_tsc_offset

Each time QEMU writes the TSC MSR (MSR_IA32_TSC), kvm->arch.last_tsc_nsec, kvm->arch.last_tsc_write and kvm->arch.last_tsc_khz are recorded for use by the next call:

  • nsec is the host boot time (CLOCK_BOOTTIME) at the moment of the write
  • write is the value that was written
  • khz is the vCPU's vTSCfreq

kvm->cur_tsc_generation / vcpu->this_tsc_generation

Each time QEMU sets the TSC MSR value (vTSC), KVM wants to make sure all the vTSCs stay synchronized (masterclock mode), so there are two cases:

kvm_synchronize_tsc
    __kvm_synchronize_tsc
        // if the vTSC is changed, and the masterclock mode is broken so we need to resync
        if (!matched) {
    		/*
    		 * We split periods of matched TSC writes into generations.
    		 * For each generation, we track the original measured
    		 * nanosecond time, offset, and write, so if TSCs are in
    		 * sync, we can match exact offset, and if not, we can match
    		 * exact software computation in compute_guest_tsc()
    		 *
    		 * These values are tracked in kvm->arch.cur_xxx variables.
    		 */
    		kvm->arch.cur_tsc_generation++;
      
    		kvm->arch.cur_tsc_nsec = ns;
    		kvm->arch.cur_tsc_write = tsc;
    		kvm->arch.cur_tsc_offset = offset;
      
    		kvm->arch.nr_vcpus_matched_tsc = 0;
    	} else if (vcpu->arch.this_tsc_generation != kvm->arch.cur_tsc_generation) {
    		kvm->arch.nr_vcpus_matched_tsc++;
    	}
    
    	/* Keep track of which generation this VCPU has synchronized to */
    	vcpu->arch.this_tsc_generation = kvm->arch.cur_tsc_generation;

matched will be true when:

/*
 * For a reliable TSC, we can match TSC offsets, and for an unstable
 * TSC, we add elapsed time in this computation.  We could let the
 * compensation code attempt to catch up if we fall behind, but
 * it's better to try to match offsets from the beginning.
 */
if (synchronizing &&
    vcpu->arch.virtual_tsc_khz == kvm->arch.last_tsc_khz) {
    //...
    matched = true;
}

vcpu->tsc_catchup

Its purpose is to catch up to a vTSCfreq that is higher than pTSCfreq.

Besides the always-catchup mode, catchup can also be triggered elsewhere. When a vCPU is loaded (kvm_arch_vcpu_load), if the host TSC is unstable, the following happens:

  • From the current host TSC value pTSC and the vCPU's last recorded TSC value vTSC = vcpu->arch.last_guest_tsc, compute offset = vTSC - pTSC * scale and write it to the L1 TSC offset.
  • Then set vcpu->arch.tsc_catchup.

This is a conservative strategy: we first roll the vTSC back to the last recorded vTSC value, which is necessarily smaller than the theoretically correct vTSC value, then set vcpu->arch.tsc_catchup so that the subsequent kvm_gen_kvmclock_update corrects the vTSC to the theoretical value.

After that, while the vCPU runs, every KVM_REQ_CLOCK_UPDATE request causes a catch up, so there is a catch up at least every 300 seconds.

The only place it is used is the following:

static int kvm_guest_time_update(struct kvm_vcpu *v)
{
    //...
	/*
	 * We may have to catch up the TSC to match elapsed wall clock
	 * time for two reasons, even if kvmclock is used.
	 *   1) CPU could have been running below the maximum TSC rate
	 *   2) Broken TSC compensation resets the base at each VCPU
	 *      entry to avoid unknown leaps of TSC even when running
	 *      again on the same CPU.  This may cause apparent elapsed
	 *      time to disappear, and the guest to stand still or run
	 *	very slowly.
	 */
	if (vcpu->tsc_catchup) {
		u64 tsc = compute_guest_tsc(v, kernel_ns);
		if (tsc > tsc_timestamp) {
			adjust_tsc_offset_guest(v, tsc - tsc_timestamp);
			tsc_timestamp = tsc;
		}
	}
}

arch->nr_vcpus_matched_tsc

/*
 * Infers attempts to synchronize the guest's tsc from host writes. Sets the
 * offset for the vcpu and tracks the TSC matching generation that the vcpu
 * participates in.
 */
static void __kvm_synchronize_tsc(struct kvm_vcpu *vcpu, u64 offset, u64 tsc, u64 ns, bool matched)
{
    //...
	if (!matched) {
		/*
		 * We split periods of matched TSC writes into generations.
		 * For each generation, we track the original measured
		 * nanosecond time, offset, and write, so if TSCs are in
		 * sync, we can match exact offset, and if not, we can match
		 * exact software computation in compute_guest_tsc()
		 *
		 * These values are tracked in kvm->arch.cur_xxx variables.
		 */
		kvm->arch.nr_vcpus_matched_tsc = 0;
	} else if (vcpu->arch.this_tsc_generation != kvm->arch.cur_tsc_generation) {
		kvm->arch.nr_vcpus_matched_tsc++;
	}
    //...
}

Kvmclock processes

Kvmclock periodic update

Every 300 seconds (KVMCLOCK_SYNC_PERIOD), kvmclock_sync_fn is called; it calls kvmclock_update_fn, which sends KVM_REQ_CLOCK_UPDATE to every vCPU.

kvmclock_update_fn
    // Send this request to **each** vcpu
	kvm_make_request(KVM_REQ_CLOCK_UPDATE, vcpu);
        // At **each** dst vcpu
        vcpu_enter_guest()
        	if (kvm_check_request(KVM_REQ_CLOCK_UPDATE, vcpu))
            	kvm_guest_time_update(vcpu);
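
The 300-second period comes from the sync work re-arming itself; roughly (a sketch that matches the upstream shape, minus the kvmclock_periodic_sync check):

#define KVMCLOCK_SYNC_PERIOD	(300 * HZ)

static void kvmclock_sync_fn(struct work_struct *work)
{
	struct delayed_work *dwork = to_delayed_work(work);
	struct kvm_arch *ka = container_of(dwork, struct kvm_arch,
					   kvmclock_sync_work);
	struct kvm *kvm = container_of(ka, struct kvm, arch);

	// update every vCPU's clock now...
	schedule_delayed_work(&kvm->arch.kvmclock_update_work, 0);
	// ...and re-arm ourselves for another 300 seconds
	schedule_delayed_work(&kvm->arch.kvmclock_sync_work,
			      KVMCLOCK_SYNC_PERIOD);
}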

KVM updates pvti

There are 2 MSRs holding the GPA of pvti: MSR_KVM_SYSTEM_TIME/MSR_KVM_SYSTEM_TIME_NEW

Each vCPU's pvti only needs to be registered via WRMSR once by the guest (as discussed, in kvmclock_init()); the host then updates the pvti periodically.
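
When the host copies vcpu->hv_clock into the guest's pvti (kvm_setup_guest_pvclock(), called from kvm_guest_time_update() below), it follows the pvclock version protocol: version is made odd before the update and even again afterwards, so the guest can detect a torn read. A simplified sketch of that ordering (the real code also handles the gfn_to_pfn_cache locking and the PVCLOCK_GUEST_STOPPED flag):

// Hypothetical sketch of the host-side pvti update ordering.
static void pvti_update_sketch(struct pvclock_vcpu_time_info *guest_pvti,
			       struct pvclock_vcpu_time_info *hv_clock)
{
	// make the version odd: an update is in progress
	u32 version = (guest_pvti->version + 1) | 1;

	guest_pvti->version = version;
	smp_wmb();

	// copy the new timing parameters, keeping the odd version
	hv_clock->version = version;
	*guest_pvti = *hv_clock;
	smp_wmb();

	// make the version even again: update complete
	guest_pvti->version = version + 1;
}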

struct pvclock_gtod_data / pvclock_clock

pvclock_gtod_data is only consumed when use_master_clock is set, because it is global data.

This variable is shared among all VMs. It is updated on every host timekeeping update.

Also, because it is shared between VMs, it only records host-related clock data, not any guest's.

This struct is NOT shared with the guest; KVM uses it to track host clock-related data:

static struct pvclock_gtod_data pvclock_gtod_data;

struct pvclock_gtod_data {
    // Why there are 2 pvclock_clock? See 53fafdbb
    //...
    // CLOCK_MONOTONIC corresponding to tkr_mono in struct timekeeper
	struct pvclock_clock clock;
    // CLOCK_MONOTONIC_RAW corresponding to tkr_raw in struct timekeeper
    // KVMCLOCK's base is monotonic raw clock
    // See 53fafdbb
	struct pvclock_clock raw_clock;

    // Host's CLOCK_BOOTTIME
    // CLOCK_BOOTTIME = CLOCK_MONOTONIC + offs_boot
	ktime_t		offs_boot;
    // Host's CLOCK_REALTIME (wall time), the ns part is in
    // pvclock_clock->base_cycles
	u64		wall_time_sec;
};

struct pvclock_clock {
    // host using which clocksource: VDSO_CLOCKMODE_TSC, VDSO_CLOCKMODE_PVCLOCK, VDSO_CLOCKMODE_HVCLOCK
    // v denotes vDSO, not vCPU, so this is totally host's mode, not guest's
	int vclock_mode; 

    // clocksource's counter number timekeeper.cycle_last
	u64 cycle_last;

    // Bitmask for two's complement subtraction of non 64bit clocks
	u64 mask;
	u32 mult; // See mult in timekeeper
	u32 shift; // See shift in timekeeper
	u64 base_cycles; // CLOCK_REALTIME (Wall time) ns part

    // corresponding to the timekeeper's base
    // i.e., CLOCK_REALTIME (Wall time)
	u64 offset;
};

Struct pvclock_vsyscall_time_info / pvclock_vcpu_time_info / pvti

// each **vcpu->hv_clock** corresponds to a slot in this array (hv_clock_boot)
// e.g., this_cpu_write(hv_clock_per_cpu, &hv_clock_boot[0]);
// the size of this array must be 1 page. (_aligned(PAGE_SIZE);)
static struct pvclock_vsyscall_time_info hv_clock_boot[HVC_BOOT_ARRAY_SIZE]; //...
// A map from its cpu index to pvti
DEFINE_PER_CPU(struct pvclock_vsyscall_time_info *, hv_clock_per_cpu);

struct pvclock_vsyscall_time_info {
	struct pvclock_vcpu_time_info pvti;
}; //...

// per vcpu
// This struct is shared between host and guest
struct pvclock_vcpu_time_info {
    u32   version; /* odd means the host is updating it, even means the update is done */

    // Guest's vTSC
	u64   tsc_timestamp;

    // Guest's CLOCK_BOOTTIME
	u64   system_time;

    // The following 2 are used to convert a tsc delta to nsec
    // (together they encode the vTSCfreq),
    // see function __pvclock_read_cycles()
	u32   tsc_to_system_mul;
	s8    tsc_shift;

	u8    flags;
    //...
} __attribute__((__packed__)); /* 32 bytes */

Host's update to pvti

timekeeper/KVM_SET_CLOCK -> pvclock_gtod_data -> kvm_arch -> vcpu->hv_clock -> pvti -> guest.

Host updates to pvclock_gtod_data

KVM uses pvclock_gtod_register_notifier() to register the callback pvclock_gtod_notify with the timekeeping layer; each time the host kernel updates its time, i.e. whenever timekeeping_update() is invoked, pvclock_gtod_notify() is called.
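
The registration side is small; roughly (a sketch, assuming the notifier is registered during KVM x86 init on CONFIG_X86_64 builds):

static struct notifier_block pvclock_gtod_notifier = {
	.notifier_call = pvclock_gtod_notify,
};

// during KVM x86 module init
pvclock_gtod_register_notifier(&pvclock_gtod_notifier);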

pvclock_gtod_notify:

static struct pvclock_gtod_data pvclock_gtod_data;
static int pvclock_gtod_notify(struct notifier_block *nb, unsigned long unused,
			       void *priv)
{
	struct pvclock_gtod_data *gtod = &pvclock_gtod_data;
	struct timekeeper *tk = priv;

    // 1. update the pvclock_gtod_data variable using tk's values
	update_pvclock_gtod(tk);

    // 2. Corner case:
    // if the clocksource is not TSC but the global kvm_guest_has_master_clock is
    // non-zero, the clocksource changed from TSC to non-TSC, so send
    // KVM_REQ_MASTERCLOCK_UPDATE to every vCPU of every VM, then set
    // kvm_guest_has_master_clock = 0 to stop using the masterclock.
	if (!gtod_is_based_on_tsc(gtod->clock.vclock_mode) &&
	    atomic_read(&kvm_guest_has_master_clock) != 0)
        // this eventually calls pvclock_gtod_update_fn() (see below)
		irq_work_queue(&pvclock_irq_work);
	return 0;
}

If the clocksource is not TSC but the global variable kvm_guest_has_master_clock is non-zero, the clocksource has changed from TSC to non-TSC; in that case KVM_REQ_MASTERCLOCK_UPDATE is sent to all vCPUs and then kvm_guest_has_master_clock is set to 0.

pvclock_gtod_update_fn:

pvclock_gtod_notify
	irq_work_queue(&pvclock_irq_work);
        //...
        pvclock_gtod_update_fn

static void pvclock_gtod_update_fn(struct work_struct *work)
{
	struct kvm *kvm;
	struct kvm_vcpu *vcpu;
	unsigned long i;

	list_for_each_entry(kvm, &vm_list, vm_list)
		kvm_for_each_vcpu(i, vcpu, kvm)
			kvm_make_request(KVM_REQ_MASTERCLOCK_UPDATE, vcpu);
	atomic_set(&kvm_guest_has_master_clock, 0);
}

Host's update to kvm_arch directly (KVM_SET_CLOCK):

E.g., QEMU calls the ioctl kvm_vm_ioctl(KVM_SET_CLOCK):

// Register a callback function that is invoked when the vm starts or stops running.
qemu_add_vm_change_state_handler(kvmclock_vm_state_change)
    ret = kvm_vm_ioctl(kvm_state, KVM_SET_CLOCK, &data);

vm_start
    // vm_prepare_start: Prepare for starting/resuming the VM
    vm_prepare_start
        vm_state_notify
            e->cb // kvmclock_vm_state_change
                if (running) {
                    ret = kvm_vm_ioctl(kvm_state, KVM_SET_CLOCK, &data);
                }
            
// pause a VM from running
vm_stop
    do_vm_stop
        vm_state_notify
            e->cb // kvmclock_vm_state_change

In KVM:

static int kvm_vm_ioctl_set_clock(struct kvm *kvm, void __user *argp)
{
    //...
    // To confirm whether we use master clock mode or not
	pvclock_update_vm_gtod_copy(kvm);

    // If we are using master clock, we use the global
    // value directly, else, we will use host's
    // CLOCK_BOOTTIME calculated by the value from timekeeping layer
	if (ka->use_master_clock)
		now_raw_ns = ka->master_kernel_ns;
	else
		now_raw_ns = get_kvmclock_base_ns();

    // data.clock is the system time we want to set
    // because system time = host's CLOCK_BOOTTIME + kvmclock_offset
    // so we can calculate kvmclock_offset in a reverse way.
	ka->kvmclock_offset = data.clock - now_raw_ns;
	return 0;
}

Host updates kvm_arch's use_master_clock based on pvclock_gtod_data / pvclock_update_vm_gtod_copy

This function mainly:

  1. judges whether master clock mode should be used, and sets the related status;
  2. copies status from pvclock_gtod_data into kvm_arch.

KVM_REQ_MASTERCLOCK_UPDATE
    kvm_update_masterclock
        pvclock_update_vm_gtod_copy

static void pvclock_update_vm_gtod_copy(struct kvm *kvm)
{
	struct kvm_arch *ka = &kvm->arch;
	int vclock_mode;
	bool host_tsc_clocksource, vcpus_matched;

	vcpus_matched = (ka->nr_vcpus_matched_tsc + 1 == atomic_read(&kvm->online_vcpus));

	/*
	 * If the host uses TSC as the clocksource, then passthrough TSC as stable
	 * to the guest.
	 */
	host_tsc_clocksource = kvm_get_time_and_clockread(
					&ka->master_kernel_ns,
					&ka->master_cycle_now);

    // judge if masterclock mode is used, it should meet following conditions
    //  - host use tsc as clocksource
    //  - vcpus are matched
    //  - boot vcpu is not running with an old kvmclock
	ka->use_master_clock = host_tsc_clocksource && vcpus_matched
				&& !ka->backwards_tsc_observed
				&& !ka->boot_vcpu_runs_old_kvmclock;

	if (ka->use_master_clock)
		atomic_set(&kvm_guest_has_master_clock, 1);
}

From kvm_arch to vcpu->hv_clock then pvti / kvm_guest_time_update

This function is executed when receiving the request KVM_REQ_CLOCK_UPDATE.

In kvm_guest_time_update(), we first obtain two values:

  • kernel_ns: denotes host's CLOCK_BOOTTIME
  • host_tsc: denotes host's TSC value

If using the masterclock:

  • kernel_ns: use masterclock per-VM value in kvm_arch ka->master_kernel_ns.
  • host_tsc: use masterclock per-VM value: ka->master_cycle_now.

else:

  • kernel_ns: Calculate host's CLOCK_BOOTTIME using tk's value.
  • host_tsc: rdtsc()

This function mainly:

  1. assigns kvm_arch's values to vcpu->hv_clock;
  2. writes vcpu->hv_clock's contents to the guest's pvti.

static int kvm_guest_time_update(struct kvm_vcpu *v)
{
	unsigned long flags, tgt_tsc_khz;
	unsigned seq;
	struct kvm_vcpu_arch *vcpu = &v->arch;
	struct kvm_arch *ka = &v->kvm->arch;
	s64 kernel_ns;
	u64 tsc_timestamp, host_tsc;
	u8 pvclock_flags;
	bool use_master_clock;

    // **here is the place pvclock_gtod_data work**
    // master_cycle_now and master_kernel_ns are 
    // both calculated using pvclock_gtod_data
    // If use_master_clock is not set, then
    // pvclock_gtod_data won't be used.
    if (use_master_clock) {
        host_tsc = ka->master_cycle_now;
        kernel_ns = ka->master_kernel_ns;
    } else {
		host_tsc = rdtsc();
		kernel_ns = get_kvmclock_base_ns();
	}
    //...

    // Calculate guest's tsc, this tsc is the real tsc value
    // that guest will read each time
	tsc_timestamp = kvm_read_l1_tsc(v, host_tsc);

	/*
	 * Corner case: TSC scaling has been supported in hardware since Skylake,
	 * so this catchup path is rarely taken.
	 */
	if (vcpu->tsc_catchup) {
		u64 tsc = compute_guest_tsc(v, kernel_ns);
		if (tsc > tsc_timestamp) {
			adjust_tsc_offset_guest(v, tsc - tsc_timestamp);
			tsc_timestamp = tsc;
		}
	}

	/* With all the info we got, fill in the values */
    // calculate vTSCfreq
	if (kvm_caps.has_tsc_control)
		tgt_tsc_khz = kvm_scale_tsc(tgt_tsc_khz,
					    v->arch.l1_tsc_scaling_ratio);

    // recalculate tsc_shift and tsc_to_system_mul
	if (unlikely(vcpu->hw_tsc_khz != tgt_tsc_khz)) {
		kvm_get_time_scale(NSEC_PER_SEC, tgt_tsc_khz * 1000LL,
				   &vcpu->hv_clock.tsc_shift,
				   &vcpu->hv_clock.tsc_to_system_mul);
		vcpu->hw_tsc_khz = tgt_tsc_khz;
	}

    // Assign value to hv_clock.tsc_timestamp
	vcpu->hv_clock.tsc_timestamp = tsc_timestamp;
    //...
    // recalculate the system time (guest's CLOCK_BOOTTIME)
	vcpu->hv_clock.system_time = kernel_ns + v->kvm->arch.kvmclock_offset;

    //...

    // copy from hv_clock to pvti
	if (vcpu->pv_time.active)
		kvm_setup_guest_pvclock(v, &vcpu->pv_time, 0);
    // Some Xen code...
	kvm_hv_setup_tsc_page(v->kvm, &vcpu->hv_clock);
	return 0;
}

System time (CLOCK_BOOTTIME) using tsc

Just rdtsc is enough; according to the code, the guest kernel does not write to the TSC.

System time (CLOCK_BOOTTIME) using kvmclock

System time is the time since guest kernel boot (Guest's CLOCK_BOOTTIME).

If the host exposes an invariant TSC, kvmclock is not necessary: the guest will use the TSC as its clocksource, because the TSC can also provide CLOCK_BOOTTIME. In that case kvmclock is only used to provide the wall clock. This is determined by the following process:

kvmclock_init
    if (boot_cpu_has(X86_FEATURE_CONSTANT_TSC) &&
	    boot_cpu_has(X86_FEATURE_NONSTOP_TSC) &&
	    !check_tsc_unstable())
        // tsc's rating is 300, so kvm-clock will have a lower priority
        // than tsc
		kvm_clock.rating = 299;

clocksource_register_hz
    __clocksource_register_scale
        // Add current clocksource in to the global clocksource_list
        clocksource_enqueue
        	list_for_each_entry(tmp, &clocksource_list, list) {
        		/* Keep track of the place, where to insert */
        		if (tmp->rating < cs->rating)
        			break;
        		entry = &tmp->list;
        	}
        	list_add(&cs->list, entry);
        // switch to the **best** clocksource
        clocksource_select();

Difference between system time and CLOCK_BOOTTIME

They are the same, system time is guest's CLOCK_BOOTTIME.

It appears to be computed as host's CLOCK_BOOTTIME + kvmclock offset:

system_time = kernel_ns + kvm->arch.kvmclock_offset

Guest reads system time

The variable kvm_clock is a clocksource in the guest kernel.

struct clocksource kvm_clock = {
	.name	= "kvm-clock",
    //system time = Host Boot Time + Kvmclock Offset (in ns)
	.read	= kvm_clock_get_cycles,
	.rating	= 400,
	.mask	= CLOCKSOURCE_MASK(64),
	.flags	= CLOCK_SOURCE_IS_CONTINUOUS,
	.enable	= kvm_cs_enable,
};

The guest reads the system time using kvm_clock_get_cycles().

kvm_clock_get_cycles / pvclock_clocksource_read

This function reads CLOCK_BOOTTIME.

It reads from the shared variable pvti.

kvm_clock_get_cycles
    kvm_clock_read
    	pvclock_clocksource_read(this_cpu_pvti())

static __always_inline
u64 __pvclock_clocksource_read(struct pvclock_vcpu_time_info *src, bool dowd)
{
	unsigned version;
	u64 ret;
	u64 last;
	u8 flags;

    // Calculate the guest's current CLOCK_BOOTTIME from the CLOCK_BOOTTIME
    // recorded at the last update plus the time elapsed since then.
    ret = __pvclock_read_cycles(src, rdtsc_ordered());
    //...
	return ret;
}
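
The elided part of __pvclock_clocksource_read() is the guest side of the version protocol: keep re-reading pvti until the same even version is observed before and after the read. A simplified sketch (the real code uses pvclock_read_begin()/pvclock_read_retry()):

// Hypothetical sketch of the pvclock read retry loop (guest side).
static u64 pvclock_read_sketch(struct pvclock_vcpu_time_info *src)
{
	u32 version;
	u64 ret;

	do {
		// an odd version means the host is mid-update; masking it off
		// guarantees the final comparison forces a retry
		version = READ_ONCE(src->version) & ~1;
		virt_rmb();
		ret = __pvclock_read_cycles(src, rdtsc_ordered());
		virt_rmb();
	} while (version != READ_ONCE(src->version));

	return ret;
}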

kvmclock 代码学习 - EwanHai - 博客园

__pvclock_read_cycles()

// Because pvti->system_time is the CLOCK_BOOTTIME when KVM update the pvti,
// When guest read, some time have elapsed, so we need to add it with the 
// elapsed time calculated by guest's current tsc - guest's tsc_timestamp when update
u64 __pvclock_read_cycles(const struct pvclock_vcpu_time_info *src, u64 tsc)
{
    // tsc is guest's tsc.
	u64 delta = tsc - src->tsc_timestamp;
	u64 offset = pvclock_scale_delta(delta, src->tsc_to_system_mul, src->tsc_shift);
	return src->system_time + offset;
}
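
pvclock_scale_delta() turns the (shifted) TSC delta into nanoseconds using the 32.32 fixed-point tsc_to_system_mul; a portable sketch of the math (the kernel uses hand-written 128-bit multiply assembly instead):

// Sketch: ns = ((delta << shift) * mul_frac) >> 32, where a negative shift
// means a right shift. Not the kernel's implementation.
static u64 scale_delta_sketch(u64 delta, u32 mul_frac, s8 shift)
{
	if (shift < 0)
		delta >>= -shift;
	else
		delta <<= shift;

	return (u64)(((unsigned __int128)delta * mul_frac) >> 32);
}

tsc_shift and tsc_to_system_mul therefore encode the vTSC frequency: kvm_get_time_scale() chooses them so that this computation converts ticks at tgt_tsc_khz into nanoseconds.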


Wall clock using kvmclock x86_platform.get_wallclock

This function is used to get the CLOCK_REALTIME (wall clock).

kvmclock doesn't allow the wall clock to be written:

// x86_platform.set_wallclock = kvm_set_wallclock;
static int kvm_set_wallclock(const struct timespec64 *now)
{
	return -ENODEV;
}

How does the guest get the wall clock when it is not using kvmclock as the clocksource (because TSC is better)?

When the guest detects that it is running on KVM, it has the following call trace:

start_kernel
    setup_arch
        init_hypervisor_platform
            x86_init.hyper.init_platform(); // kvm_init_platform
            	kvmclock_init
                    // 1. add the callback function for retrieving wallclock
            		x86_platform.get_wallclock = kvm_get_wallclock;
                    // 2. write the pvti address to MSR_KVM_SYSTEM_TIME_NEW
                    kvm_register_clock("primary cpu clock");

So even when the kvmclock clocksource is not selected in the guest kernel, the pvti struct is still

  • registered by the function kvm_register_clock and,
  • updated constantly by the function kvm_guest_time_update,

so kvm_get_wallclock() can still get the needed information without the kvmclock clocksource.

Guest issues a wall clock read request

The guest kernel has a global variable wall_clock which holds the real (wall clock) time at the moment the guest kernel booted (verified with trace_printk: it is indeed the guest's boot time); the guest calculates the current wall clock by adding the system time to it.

static struct pvclock_wall_clock wall_clock __bss_decrypted;

/*
 * references the true (wall clock) time when the guest booted; the guest calculates
 * the current wall time by adding the system time (time elapsed since boot) to it.
 *
 * It is not updated frequently, because the guest's boot time is fixed and the guest
 * won't issue many writes to MSR_KVM_WALL_CLOCK/MSR_KVM_WALL_CLOCK_NEW
 */
struct pvclock_wall_clock {
	u32   version;
	u32   sec;
	u32   nsec;
} __attribute__((__packed__));

The guest reads the wall clock by first writing the GPA of the wall_clock global variable to MSR_KVM_WALL_CLOCK/MSR_KVM_WALL_CLOCK_NEW; this triggers a WRMSR VM exit so that KVM can write the real data to the address provided.

// Wall clock is read after booting the kernel and
// kvmclock is initialized.
start_kernel
    setup_arch
        init_hypervisor_platform
            kvm_init_platform  // x86_init.hyper.init_platform()
                kvmclock_init
    timekeeping_init
        read_persistent_wall_and_boot_offset
            read_persistent_clock64
                kvm_get_wallclock // x86_platform.get_wallclock(ts)
                
// Wall clock is also read when guest suspend/resume
// https://www.cnblogs.com/haiyonghao/p/14440035.html
timekeeping_resume // timekeeping_syscore_ops.resume()
    read_persistent_clock64
        kvm_get_wallclock // x86_platform.get_wallclock(ts)
        
static void kvm_get_wallclock(struct timespec64 *now)
{ 
    // write the physical address of the wall_clock global variable to MSR_KVM_WALL_CLOCK,
    // triggering a WRMSR VM exit so that KVM can write the real data to the GPA in the MSR
	wrmsrl(msr_kvm_wall_clock, slow_virt_to_phys(&wall_clock));
    // The host side (KVM) has prepared the needed information and returned; now the guest can read it.
	pvclock_read_wallclock(&wall_clock, this_cpu_pvti(), now);
}

pvclock_read_wallclock / pvclock_clocksource_read / Guest get wall clock from the shared memory

This function will use the data from 2 shared variables:

  • pvti
  • wall_clock

pvclock_read_wallclock
    pvclock_clocksource_read
        __pvclock_read_cycles

void pvclock_read_wallclock(struct pvclock_wall_clock *wall_clock,
			    struct pvclock_vcpu_time_info *vcpu_time,
			    struct timespec64 *ts)
{
	u32 version;
	u64 delta;
	struct timespec64 now;

    // The value stored in the shared variable wall_clock
    // is the real time when guest boot.
    //...
    now.tv_sec  = wall_clock->sec;
    now.tv_nsec = wall_clock->nsec;
    //...

    // As we have talked before, this function
    // is to get guest's system time, i.e., CLOCK_BOOTTIME
	delta = pvclock_clocksource_read(vcpu_time);
    // add the boot wall time and the system time (CLOCK_BOOTTIME)
	delta += now.tv_sec * NSEC_PER_SEC + now.tv_nsec; 

    // Do some math calculations
	now.tv_nsec = do_div(delta, NSEC_PER_SEC);
	now.tv_sec = delta;

    // set the value
	set_normalized_timespec64(ts, now.tv_sec, now.tv_nsec);
}

KVM prepares the wall clock data

kvm_emulate_wrmsr
    kvm_set_msr_with_filter
        kvm_set_msr_ignored_check
            __kvm_set_msr
                vmx_set_msr
                    kvm_set_msr_common
                        case MSR_KVM_WALL_CLOCK/MSR_KVM_WALL_CLOCK_NEW:
                            kvm_write_wall_clock

static void kvm_write_wall_clock(struct kvm *kvm, gpa_t wall_clock, int sec_hi_ofs)
{
    //...
	/*
	 * The **guest** calculates true wall clock time by adding
	 * system time (updated by kvm_guest_time_update below) to the
	 * value we provide. So we calculate the value in the reverse 
	 * way: wall_time - time_since_boot
     *  - ktime_get_real_ns(): get current wall time
     *  - get_kvmclock_ns(): read kvmclock, i.e., CLOCK_BOOTTIME
	 */
	wall_nsec = ktime_get_real_ns() - get_kvmclock_ns(kvm);

    //...
    // write to the host-guest shared variable "wall_clock"
	kvm_write_guest(kvm, wall_clock, &wc, sizeof(wc));

    // maybe used in future when 32bit is not enough
	if (sec_hi_ofs) {
		wc_sec_hi = wall_nsec >> 32;
		kvm_write_guest(kvm, wall_clock + sec_hi_ofs,
				&wc_sec_hi, sizeof(wc_sec_hi));
	}
    //...
}

ktime_get_real_ns() / Get the real wall time

static inline u64 ktime_get_real_ns(void)
{
	return ktime_to_ns(ktime_get_real());
}

/**
 * ktime_get_real - get the real (wall-) time in ktime_t format
 */
static inline ktime_t ktime_get_real(void)
{
	return ktime_get_with_offset(TK_OFFS_REAL);
}

get_kvmclock_ns / get_kvmclock

This function gets guest's system time (CLOCK_BOOTTIME).

get_kvmclock_ns
    get_kvmclock
        __get_kvmclock
        
static void __get_kvmclock(struct kvm *kvm, struct kvm_clock_data *data)
{
	struct kvm_arch *ka = &kvm->arch;
	struct pvclock_vcpu_time_info hv_clock;

	if (ka->use_master_clock) { //...
		struct timespec64 ts;
        //...
        // host's current TSC value
    	data->host_tsc = rdtsc();
        // host's TSC value at the last masterclock update
		hv_clock.tsc_timestamp = ka->master_cycle_now;
        // guest's CLOCK_BOOTTIME at the last masterclock update
		hv_clock.system_time = ka->master_kernel_ns + ka->kvmclock_offset;
        // Some time has elapsed between that update and now, so compensate: use the
        // host TSC to compute the elapsed time and add it to the guest's
        // CLOCK_BOOTTIME
		data->clock = __pvclock_read_cycles(&hv_clock, data->host_tsc);
	} else {
        // host's CLOCK_BOOTTIME + kvmclock_offset
		data->clock = get_kvmclock_base_ns() + ka->kvmclock_offset;
	}
}

Reference

kvmclock 代码学习 - EwanHai - 博客园

KVM Time Virtualization | tcbbd的博客