KVMClock
Misc of kvmclock
VMCS has a Primary Processor-Based VM-Execution Controls bit: Use TSC offsetting
, which denotes:
This control determines whether executions of RDTSC, executions of RDTSCP, and executions of RDMSR that read from the IA32_TIME_STAMP_COUNTER MSR return a value modified by the TSC offset field.
This takes effect when kvmclock is not the guest's current clocksource for reading system time: in that case the guest uses tsc as its clocksource and reads it with rdtsc, so the value it gets is the one modified by the TSC offset.
当 TSC 频率发生变化时,要向该 pCPU 上的所有 vCPU 发送 KVM_REQ_CLOCK_UPDATE
,更新它们的 tsc to nsec 转换参数。
若 vcpu->arch.tsc_always_catchup = 1,则每次 VMExit 都要给 vCPU 发送 KVM_REQ_CLOCK_UPDATE
,以实现不断 catchup 比 pTSCfreq 更高的 vTSCfreq。
kvm_clock 作为一个 clocksource,其频率为 1GHz,实际上其 read 回调返回的值就是 System Time(即 CLOCK_BOOTTIME)的读数,单位为 ns。
注意当 Host 和 Guest 都是 stable 时,我们实际上直接使用 TSC 作为 Clocksource,kvmclock 只用于提供 wall clock。
kvmclock 中的 vTSC 频率就是 Guest TSC 的实际运行频率,和 KVM_SET_TSC_KHZ
设置的 vTSCfreq 不一定相等(例如在 Host TSC 频率可变时会不相等)。每次 pCPU 频率改变,就会更新 cpu_tsc_khz
并对该 pCPU 上的所有 vCPU 发送 KVM_REQ_CLOCK_UPDATE
,但 scale 始终不变。
kvmclock_cpufreq_notifier
static void __kvmclock_cpufreq_notifier(struct cpufreq_freqs *freq, int cpu)
{
list_for_each_entry(kvm, &vm_list, vm_list) {
kvm_for_each_vcpu(i, vcpu, kvm) {
if (vcpu->cpu != cpu)
continue;
kvm_make_request(KVM_REQ_CLOCK_UPDATE, vcpu);
//...
}
}
}
When QEMU gets the supported CPUID from KVM:
kvm_dev_ioctl
kvm_arch_dev_ioctl
kvm_dev_ioctl_get_cpuid
get_cpuid_func
do_cpuid_func
__do_cpuid_func
case KVM_CPUID_FEATURES:
entry->eax = (1 << KVM_FEATURE_CLOCKSOURCE) |
//...
(1 << KVM_FEATURE_CLOCKSOURCE2) |
When the host's TSC supports TSC scaling, the TSC catchup mechanism is not used.
When host's tsc is unstable, masterclock is not used.
When host's tsc is stable, kvmclock is not used to read the SYSTEM_TIME, it is only used to read the Wall time.
vgettsc()
KVM
Update tsc_timestamp
to current tsc
.
Return (current tsc
- last tsc
) * mult
.
/*
 * Read the host clocksource described by @clock (abridged excerpt).
 *
 * @clock:         clock parameters (vclock_mode, cycle_last, mask, mult).
 * @tsc_timestamp: out; set to the TSC value just read, or to 0 when the
 *                 clock mode is not handled.
 * @mode:          out; the clock mode that was actually used.
 *
 * Returns the cycle delta since clock->cycle_last multiplied by clock->mult
 * (not yet shifted down), or 0 when the mode is VDSO_CLOCKMODE_NONE.
 */
static inline u64 vgettsc(struct pvclock_clock *clock, u64 *tsc_timestamp, int *mode)
{
long v;
// presumably only used by the Hyper-V case elided below -- confirm
u64 tsc_pg_val;
switch (clock->vclock_mode) {
// Hyper-V case elided from this excerpt...
// ...
case VDSO_CLOCKMODE_TSC:
*mode = VDSO_CLOCKMODE_TSC;
// Record the current TSC for the caller.
*tsc_timestamp = read_tsc();
// Cycles elapsed since cycle_last, masked to the counter width.
v = (*tsc_timestamp - clock->cycle_last) &
clock->mask;
break;
default:
// Any other mode is reported as "no usable clock".
*mode = VDSO_CLOCKMODE_NONE;
}
// With no usable clock, hand back a zero timestamp and a zero delta.
if (*mode == VDSO_CLOCKMODE_NONE)
*tsc_timestamp = v = 0;
return v * clock->mult;
}
do_monotonic_raw()
KVM
/*
 * Compute the host time base from the raw clock copy in pvclock_gtod_data
 * (abridged excerpt; the elided parts are marked with "//...").
 *
 * @t:             out; resulting time in nanoseconds.
 * @tsc_timestamp: out; TSC value sampled by vgettsc() for this reading.
 *
 * Returns the clock mode reported by vgettsc().
 */
static int do_monotonic_raw(s64 *t, u64 *tsc_timestamp)
{
	struct pvclock_gtod_data *gtod = &pvclock_gtod_data;
	unsigned long seq;	/* presumably used by the elided code -- confirm */
	int mode;
	u64 ns;

	//...
	/* Nanosecond base of the raw clock, still left-shifted by `shift`. */
	ns = gtod->raw_clock.base_cycles;
	/* Add the (equally shifted) delta since cycle_last ... */
	ns += vgettsc(&gtod->raw_clock, tsc_timestamp, &mode);
	/* ... and shift back to obtain plain nanoseconds. */
	ns >>= gtod->raw_clock.shift;
	/* Add the raw clock's offset plus offs_boot. */
	ns += ktime_to_ns(ktime_add(gtod->raw_clock.offset, gtod->offs_boot));
	//...
	*t = ns;

	return mode;
}
kvm_get_time_and_clockread()
KVM
/*
 * Sample the host time (*kernel_ns) together with the TSC value it was
 * derived from (*tsc_timestamp).
 *
 * Returns true if the host is using a TSC based clocksource. Per the
 * original note, only VDSO_CLOCKMODE_TSC and VDSO_CLOCKMODE_HVCLOCK count
 * as TSC based.
 */
static bool kvm_get_time_and_clockread(s64 *kernel_ns, u64 *tsc_timestamp)
{
	int host_mode = pvclock_gtod_data.clock.vclock_mode;
	int read_mode;

	/* Nothing is read at all when the host clocksource is not TSC based. */
	if (!gtod_is_based_on_tsc(host_mode))
		return false;

	/* The mode actually used for the read must be TSC based as well. */
	read_mode = do_monotonic_raw(kernel_ns, tsc_timestamp);
	return gtod_is_based_on_tsc(read_mode);
}
kvm_vcpu_write_tsc_multiplier()
KVM
Mainly used for calculating the scaling ratio.
这个函数在两个地方被调用:
set_tsc_khz
// 如果设置的值是 0,那么就用 default 的来赋值
kvm_vcpu_write_tsc_multiplier
// 如果不是,那么用 vTSCfreq / pTSCfreq 计算出 ratio,来赋值
kvm_vcpu_write_tsc_multiplier
kvm_read_l1_tsc()
KVM
Although its name says "read", it recalculates the value on every call rather than reading it from a variable.
Calculate l1's tsc using the formula: l1's tsc = l1's ratio * host_tsc + l1's offset
.
l1's ratio = vTSCfreq / pTSCfreq
.
/*
 * Compute the TSC seen by L1 for a given host TSC value:
 * scale host_tsc by the L1 scaling ratio, then add the L1 TSC offset.
 */
u64 kvm_read_l1_tsc(struct kvm_vcpu *vcpu, u64 host_tsc)
{
	u64 scaled_tsc = kvm_scale_tsc(host_tsc, vcpu->arch.l1_tsc_scaling_ratio);

	return scaled_tsc + vcpu->arch.l1_tsc_offset;
}
compute_guest_tsc()
KVM
通过两次 host 的 CLOCK_BOOTTIME
之间的真实的时间差,并根据上次 guest TSC 写入的值,计算理论 guest TSC,用来修正我们用 vcpu->arch.l1_tsc_offset
和 vcpu->arch.l1_tsc_scaling_ratio
计算出来的 TSC 的值。
为什么我们用上面变量计算出来的 TSC 的值需要被修正呢?
/*
 * Compute the guest TSC expected at host time kernel_ns.
 *
 * kernel_ns is the host time (CLOCK_BOOTTIME per the original note). The
 * time elapsed since the last TSC write seen by this vCPU is converted to
 * TSC cycles and added to the value that was written back then.
 */
static u64 compute_guest_tsc(struct kvm_vcpu *vcpu, s64 kernel_ns)
{
	/* Time between now and the host time recorded at the last TSC write. */
	u64 elapsed_ns = kernel_ns - vcpu->arch.this_tsc_nsec;
	/* The same interval expressed in virtual TSC cycles. */
	u64 elapsed_cycles = pvclock_scale_delta(elapsed_ns,
						 vcpu->arch.virtual_tsc_mult,
						 vcpu->arch.virtual_tsc_shift);

	/* Last written TSC value plus the cycles that have passed since. */
	return vcpu->arch.this_tsc_write + elapsed_cycles;
}
TSC Matching / kvm_synchronize_tsc
/ MSR_IA32_TSC
The TSC matching mechanism tries its best to keep the vTSCs synchronized: if the vTSCs of different vCPUs differ only slightly (say, the delta is within 1 second), KVM makes them identical and enters master clock mode.
MSR_IA32_TSC
is pass-thru to the guest for the read operation. Userspace can set the vTSC value:
static u32 vmx_possible_passthrough_msrs[MAX_POSSIBLE_PASSTHROUGH_MSRS] = {
MSR_IA32_TSC,
//...
};
// vmx_vcpu_create(), arch/x86/kvm/vmx/vmx.c
// enable passthrough for read operation on MSR_IA32_TSC
vmx_disable_intercept_for_msr(vcpu, MSR_IA32_TSC, MSR_TYPE_R);
// QEMU 只在每个 vcpu 第一次进入的时候写 0,因为 env->tsc 初始化是 0,所以设置进去的也都是 0
kvm_start_vcpu_thread
kvm_vcpu_thread_fn
kvm_cpu_exec
kvm_arch_put_registers
kvm_put_msrs
kvm_put_msrs_vm
kvm_msr_entry_add(cpu, MSR_IA32_TSC, env->tsc);
KVM wants to keep the vTSCs synchronized; it calls this state masterclock mode. Each time a vTSC is set by userspace this mode is broken, so the TSC matching mechanism is used to re-synchronize them. Specifically, when a vTSC update request arrives:
- If this vTSC does not match the VM-global one, the write is treated as an update of the global vTSC: the value is assigned to the global one as the new TSC, and a new generation starts;
- If this vTSC roughly matches the global one without being identical, or the value is 0, userspace wants to calibrate this value against the global one: the vCPU's vTSC is set from the global value, and the number of matched vCPUs is increased by 1.
When all the vCPUs are matched, a masterclock update request is sent to indicate that we are now in master clock mode.
- 如果 host tsc unstable,set tsc offset to $(vTSC + elapsed) - (pTSC * scale)$,即假设本次和上次写入的 TSC 值相同。
- 如果是 stable 的,那就用 global 的 offset 来设置自己。因为 stable 意味着这几个 MSR 的值是一样的。
不管哪种情况,执行完后,都需要将当前 vcpu 的 generation, tsc_write, 以及 CLOCK_BOOTTIME
和全局(整个 VM)同步。
如何判断是一个同步请求?
- 如果值是 0,那么就是一个同步请求;
- 如果值和根据上次写的 tsc 的值(不管是哪一个 vcpu 写的)推断出来的值相差在 1s 以内,那么判断这是一个同步的请求。
It is called each time QEMU writes to the MSR_IA32_TSC
.
case MSR_IA32_TSC:
if (msr_info->host_initiated) {
kvm_synchronize_tsc(vcpu, data);
kvm_vm_ioctl_create_vcpu
kvm_arch_vcpu_postcreate
kvm_synchronize_tsc(vcpu, 0);
/*
 * Handle a write of `data` to a vCPU's TSC: compute the TSC offset to use
 * and decide whether the write is an attempt to synchronize this vCPU with
 * the rest of the VM. Abridged excerpt: "//..." marks elided code and
 * "host tsc is stable" is pseudo-code for the real condition.
 */
static void kvm_synchronize_tsc(struct kvm_vcpu *vcpu, u64 data)
{
struct kvm *kvm = vcpu->kvm;
u64 offset, ns, elapsed;
unsigned long flags;
bool matched = false;
bool synchronizing = false;
//...
// When TSC offsetting and TSC scaling are both enabled, the TSC perceived
// by the guest is:
//   vTSC = pTSC * scale + offset
// hence:
//   offset = vTSC - pTSC * scale
offset = kvm_compute_l1_tsc_offset(vcpu, data);
// Host time (CLOCK_BOOTTIME per the original note) at this write.
ns = get_kvmclock_base_ns();
// kvm->arch.last_tsc_nsec is the host time recorded at the previous TSC
// write, so `elapsed` is the real time that passed between the two writes.
elapsed = ns - kvm->arch.last_tsc_nsec;
// If no vTSCfreq has been set, the write is never treated as a
// synchronization attempt.
if (vcpu->arch.virtual_tsc_khz) {
if (data == 0) {
/*
 * QEMU: we are hot-plugging a new CPU.
 * detection of vcpu initialization -- need to sync
 * with other vCPUs. This particularly helps to keep
 * kvm_clock stable after CPU hotplug
 */
synchronizing = true;
} else {
// Expected vTSC, derived from the last written TSC and the time
// elapsed since then.
// NOTE(review): `elapsed` is in nanoseconds while last_tsc_write is a
// TSC value; upstream appears to convert with nsec_to_cycles() here --
// confirm against the kernel source.
u64 tsc_exp = kvm->arch.last_tsc_write + elapsed;
// Number of vTSC cycles in 1 second.
u64 tsc_hz = vcpu->arch.virtual_tsc_khz * 1000LL;
/*
 * TSC write within a small delta (1 second)
 * interpreted as an attempt to synchronize the CPU.
 * | tsc_exp - data | < 1 second
 */
synchronizing = data < tsc_exp + tsc_hz &&
data + tsc_hz > tsc_exp;
}
}
// On the first call this condition does not hold even if "synchronizing"
// is true, because last_tsc_khz has not been set to virtual_tsc_khz yet.
if (synchronizing && vcpu->arch.virtual_tsc_khz == kvm->arch.last_tsc_khz) {
// Host TSCs are synchronized, so all vCPUs can share the same tsc_offset.
if (host tsc is stable) {
offset = kvm->arch.cur_tsc_offset;
} else {
// Once the TSC is marked unstable it stays unstable until reboot.
// An unstable TSC means the host TSCs are not synchronized, so the
// shared global offset is meaningless here. We assume userspace writes
// the **same value** to all vCPUs within a short period in order to
// synchronize them, so the small time difference between the writes is
// compensated manually.
// Also, when the host TSC is unstable, masterclock WON'T be used.
// NOTE(review): same ns-vs-cycles concern as for tsc_exp above.
data += elapsed;
// "offset" is written to the VMCS later.
// Author's notes (unverified): this is probably related to TSC catchup.
// In the logs the offset gets slightly smaller on each write, because
// the host TSC keeps growing while the guest TSC being set stays the
// same. The ratio defaults to 1. Since the host TSC is greater than the
// guest's, the offset is a large value that relies on the addition
// wrapping around.
offset = kvm_compute_l1_tsc_offset(vcpu, data);
}
matched = true;
}
__kvm_synchronize_tsc(vcpu, offset, data, ns, matched);
}
static void __kvm_synchronize_tsc(struct kvm_vcpu *vcpu, u64 offset, u64 tsc,
u64 ns, bool matched)
{
//...
// track to allow the matching interval to be extended at each write.
// CLOCK_BOOTTIME at that time
// Used to calculate elapsed time between 2 write
kvm->arch.last_tsc_nsec = ns;
// Used to calculate expected vTSC
kvm->arch.last_tsc_write = tsc; // vTSC value to write
// Used to judge if vTSCfreq is updated
kvm->arch.last_tsc_khz = vcpu->arch.virtual_tsc_khz; // vTSCfreq
// ...
// Write the offset to VMCS
kvm_vcpu_write_tsc_offset(vcpu, offset);
if (!matched) {
/*
* We split periods of matched TSC writes into generations.
* For each generation, we track the original measured
* nanosecond time, offset, and write, so if TSCs are in
* sync, we can match exact offset, and if not, we can match
* exact software computation in compute_guest_tsc()
*
* These values are tracked in kvm->arch.cur_xxx variables.
*/
kvm->arch.cur_tsc_generation++;
// Global's data should be updated, because we are assigning
// the data we provided to the global one
kvm->arch.cur_tsc_nsec = ns;
kvm->arch.cur_tsc_write = tsc;
kvm->arch.cur_tsc_offset = offset;
kvm->arch.nr_vcpus_matched_tsc = 0;
} else if (vcpu->arch.this_tsc_generation != kvm->arch.cur_tsc_generation) {
kvm->arch.nr_vcpus_matched_tsc++;
}
/* Keep track of which generation this VCPU has synchronized to */
vcpu->arch.this_tsc_generation = kvm->arch.cur_tsc_generation;
vcpu->arch.this_tsc_nsec = kvm->arch.cur_tsc_nsec;
vcpu->arch.this_tsc_write = kvm->arch.cur_tsc_write;
// update masterclock status by sending request `KVM_REQ_MASTERCLOCK_UPDATE`
kvm_track_tsc_matching(vcpu);
}
get_kvmclock_base_ns
// Get Host's CLOCK_BOOTTIME
// CLOCK_BOOTTIME = CLOCK_MONOTONIC + offs_boot
static s64 get_kvmclock_base_ns(void)
{
return ktime_to_ns(ktime_add(ktime_get_raw(), pvclock_gtod_data.offs_boot));
}
masterclock
?
/*
*
* Assuming a stable TSC across physical CPUS, and a stable TSC
* across virtual CPUs, the following condition is possible.
* Each numbered line represents an event visible to both
* CPUs at the next numbered event.
*
* "timespecX" represents host monotonic time. "tscX" represents
* RDTSC value.
*
* VCPU0 on CPU0 | VCPU1 on CPU1
*
* 1. read timespec0,tsc0
* 2. | timespec1 = timespec0 + N
* | tsc1 = tsc0 + M
* 3. transition to guest | transition to guest
* 4. ret0 = timespec0 + (rdtsc - tsc0) |
* 5. | ret1 = timespec1 + (rdtsc - tsc1)
* | ret1 = timespec0 + N + (rdtsc - (tsc0 + M))
*
* Since ret0 update is visible to VCPU1 at time 5, to obey monotonicity:
*
* - ret0 < ret1
* - timespec0 + (rdtsc - tsc0) < timespec0 + N + (rdtsc - (tsc0 + M))
* ...
* - 0 < N - M => M < N
*
* That is, when timespec0 != timespec1, M < N. Unfortunately that is not
* always the case (the difference between two distinct xtime instances
* might be smaller then the difference between corresponding TSC reads,
* when updating guest vcpus pvclock areas).
*
* To avoid that problem, do not allow visibility of distinct
* system_timestamp/tsc_timestamp values simultaneously: use a master
* copy of host monotonic time values. Update that master copy
* in lockstep.
*
* Rely on synchronization of host TSCs and guest TSCs for monotonicity.
*
*/
Masterclock is only used when host clocksource is tsc.
Because when tsc is unstable, kernel will change the clocksource to hpet, so masterclock is also disabled in this case.
ka->use_master_clock = host_tsc_clocksource && vcpus_matched
&& !ka->backwards_tsc_observed
&& !ka->boot_vcpu_runs_old_kvmclock;
由于我们的 kvmclock 依赖于 Host Boot Time 和 Host TSC 两个量,即使 Host TSC 同步且 Guest TSC 同步,在 pCPU0 和 pCPU1 分别取两者,前者的差值和后者的差值也可能不相等,并且谁大谁小都有可能,从而可能违反 kvmclock 的单调性。因此,我们通过只使用一份 Master Copy,即 Master Clock, 来解决这个问题。
Global kvm_guest_has_master_clock
static atomic_t kvm_guest_has_master_clock = ATOMIC_INIT(0);
What's the difference with ka->use_master_clock
?
masterclock
Related properties
struct kvm_arch {
//...
bool use_master_clock;
// Host's CLOCK_BOOTTIME
u64 master_kernel_ns;
// current host tsc (if not use master clock, the corresponding should be rdtsc())
u64 master_cycle_now;
//...
};
ka->use_master_clock
The only place to assign the value:
kvm_arch_init_vm //
kvm_hyperv_tsc_notifier
kvm_update_masterclock
kvm_vm_ioctl_set_clock
static void pvclock_update_vm_gtod_copy(struct kvm *kvm)
{
//...
// host_tsc_clocksource: true if host is using TSC based clocksource
//
ka->use_master_clock = host_tsc_clocksource && vcpus_matched
&& !ka->backwards_tsc_observed
&& !ka->boot_vcpu_runs_old_kvmclock;
//...
}
TSC Catchup / KVM_SET_TSC_KHZ
TSC catchup is a corner-case mechanism; it is only used when:
- TSC scaling is not supported.
- vTSCfreq > pTSCfreq.
The TSC is passed through to the guest, which means that although the guest is configured with a higher frequency than the host, the TSC it reads still advances at the host's frequency, so KVM has to keep adjusting the TSC offset to catch up.
Place to set catchup on:
case KVM_SET_TSC_KHZ
kvm_set_tsc_khz
set_tsc_khz
if vTSCfreq == pTSCfreq:
// do nothing
return 0
if TSC scaling VMCS feature is not supported:
if vTSCfreq > pTSCfreq:
vcpu->arch.tsc_catchup = 1;
// If this is 1, at each vm-exit send request KVM_REQ_CLOCK_UPDATE
// i.e., execute kvm_guest_time_update() before each vm-entry
vcpu->arch.tsc_always_catchup = 1;
// Do not forget this return clause, this means if we set the tsc_catchup to 1
// **we won't set vTSCfreq/pTSCfreq to vcpu->arch.l1_tsc_scaling_ratio**
return 0;
else:
// report an error: the TSC must increase monotonically, so if
// vTSCfreq is lower than pTSCfreq the guest TSC would have to be
// pulled backwards on every adjustment, which makes no sense.
else:
calculate the scale and write it to VMCS
Place to use catchup:
kvm_guest_time_update() {
//...
// Calculate guest's tsc, this value maybe higher than the value
// guest read, because it
tsc_timestamp = kvm_read_l1_tsc(v, host_tsc);
/*
* We may have to catch up the TSC to match elapsed wall clock
* time for two reasons, even if kvmclock is used.
* 1) CPU could have been running below the maximum TSC rate
* 2) Broken TSC compensation resets the base at each VCPU
* entry to avoid unknown leaps of TSC even when running
* again on the same CPU. This may cause apparent elapsed
* time to disappear, and the guest to stand still or run
* very slowly.
*/
if (vcpu->tsc_catchup) {
// compute_guest_tsc used to calculate the value should be, i.e.,
// our target. because it is computed based on the vTSC value we
// set also the real time elasped.
// Why not using compute_guest_tsc's result directly? because we
// should make sure the tsc is increasing monotomicly, so we still
// should compare it with the real value guest see, i.e., kvm_read_l1_tsc
u64 tsc = compute_guest_tsc(v, kernel_ns);
if (tsc > tsc_timestamp) {
adjust_tsc_offset_guest(v, tsc - tsc_timestamp);
tsc_timestamp = tsc;
}
}
//...
}
The results of kvm_read_l1_tsc
and compute_guest_tsc
may differ because:
- The TSC may be unstable, i.e., its frequency is not constant, so kvm_read_l1_tsc may return an inaccurate result. compute_guest_tsc computes the TSC from the elapsed time and the theoretical frequency, which is a constant (vTSCfreq), so its result should be accurate.
Will TSC catchup make the TSC not jump smoothly?
Kvmclock-related requests
KVM_REQ_CLOCK_UPDATE
Update this vCPU's clock. This request is the fundamental request which will be used by other requests handler.
// This normal request only triggers this function
// it indicates a request to update the corresponding clock
kvm_guest_time_update()
KVM_REQ_GLOBAL_CLOCK_UPDATE
Update all vCPU's clock. (It will kick off the vCPU).
This will eventually send KVM_REQ_CLOCK_UPDATE
request and run kvm_guest_time_update
on all vCPU.
vcpu_enter_guest
if (kvm_check_request(KVM_REQ_GLOBAL_CLOCK_UPDATE, vcpu))
kvm_gen_kvmclock_update
// make request to vBSP
kvm_make_request(KVM_REQ_CLOCK_UPDATE, v);
kvm_guest_time_update()
// make request to other vcpus
kvmclock_update_fn() { // INIT_DELAYED_WORK(&kvm->arch.kvmclock_update_work, kvmclock_update_fn);
kvm_for_each_vcpu(i, vcpu, kvm) {
kvm_make_request(KVM_REQ_CLOCK_UPDATE, vcpu);
// kick down the vcpu
kvm_vcpu_kick(vcpu);
}
}
KVM_REQ_MASTERCLOCK_UPDATE
Update masterclock
status, and update all vCPU's clock.
// handler
static void kvm_update_masterclock(struct kvm *kvm)
{
kvm_hv_request_tsc_page_update(kvm);
// block all cpu: kvm_make_all_cpus_request(kvm, KVM_REQ_BLOCK_VMENTRY);
// and kick them down.
kvm_start_pvclock_update(kvm);
// 1. judge if we use masterclock mode of not
// 2. set the corresponding data in kvm_arch using pvclock_gtod_data
pvclock_update_vm_gtod_copy(kvm);
// Update **each** vcpu clock, send "KVM_REQ_CLOCK_UPDATE" to all vcpu.
// clear block bit, let all vCPUs re-enter to the guest
kvm_end_pvclock_update(kvm);
}
Kvmclock related structures
Kvmclock guest kernel side
Guest kernel initialize kvmclock
When the guest kernel initializes, it calls kvmclock_init, which runs on the BSP:
- 首先,我们为所有 vCPU 设置了回调 kvmclock_setup_percpu;当 AP 被启动(bring up)时,该回调会被调用。
- 将 vBSP 在 kvmclock 中实际使用的数据结构 pvclock_vsyscall_time_info (pvti) 的物理地址利用 write_msr 写到 MSR_KVM_SYSTEM_TIME/MSR_KVM_SYSTEM_TIME_NEW。
- Register kvmclock into the clocksource list: clocksource_register_hz(&kvm_clock, NSEC_PER_SEC);
Note: WALL_CLOCK
MSR is not written here.
start_kernel
setup_arch
init_hypervisor_platform
detect_hypervisor_vendor
kvm_detect //(*p)->detect();
kvm_cpuid_base
__kvm_cpuid_base
boot_cpu_has(X86_FEATURE_HYPERVISOR)
kvm_init_platform // x86_init.hyper.init_platform()
kvmclock_init
void __init kvmclock_init(void)
{
// Setup callback function for all CPU, when AP is start, kvmclock_setup_percpu will be executed
// and the percpu pvti struct will be initialized.
// When AP starts to initializing, in start_sencondary() kvm_setup_secondary_clock() will be called
// to write the pvti's address to the SYSTEM_TIME MSR and register kvmclock.
if (cpuhp_setup_state(CPUHP_BP_PREPARE_DYN, "kvmclock:setup_percpu",
kvmclock_setup_percpu, NULL) < 0) {
return;
}
// ...
// wrmsrl(msr_kvm_system_time, pa); pa is GPA of pvti
kvm_register_clock("primary cpu clock");
// When we have Invariant TSC and TSC is stable
// We will lower kvmclock's priority and use tsc directly
if (boot_cpu_has(X86_FEATURE_CONSTANT_TSC) &&
boot_cpu_has(X86_FEATURE_NONSTOP_TSC) &&
!check_tsc_unstable())
kvm_clock.rating = 299;
// ...
// register kvmclock into the clocksource list
clocksource_register_hz(&kvm_clock, NSEC_PER_SEC);
// ...
}
After the wrmsr VM exit, KVM calls the function kvm_write_system_time():
- 如果写入的是 MSR_KVM_SYSTEM_TIME,表明 Guest 使用的是旧版 kvmclock,不支持 Master Clock 模式,此时要设置 kvm->arch.boot_vcpu_runs_old_kvmclock = 1,并对当前 vCPU(即 vCPU0)发送一个 KVM_REQ_MASTERCLOCK_UPDATE,这最终会导致 kvm->arch.use_master_clock = 0。
- 令 vcpu->arch.time = GPA | 1,该变量用于模拟对该 MSR 的读取。
- 向当前 vCPU(即 vCPU0)发送 KVM_REQ_GLOBAL_CLOCK_UPDATE,这最终会导致在所有 vCPU 上运行 kvm_guest_time_update。
- 最后,设置 vcpu->arch.pv_time 为 GPA,并令 vcpu->arch.pv_time_enabled = true。
kvm_write_system_time
/*
 * Handle a write to the kvmclock system-time MSR.
 *
 * @vcpu:           vCPU whose MSR is written.
 * @system_time:    value written: GPA of the guest's pvti, with bit 0 used
 *                  as the enable bit.
 * @old_msr:        true when the write targets the old (deprecated) MSR.
 * @host_initiated: true when the write comes from the host, not the guest.
 */
static void kvm_write_system_time(struct kvm_vcpu *vcpu, gpa_t system_time,
				  bool old_msr, bool host_initiated)
{
	struct kvm_arch *ka = &vcpu->kvm->arch;
	/* A guest-initiated write on vCPU 0, i.e. from the guest's BSP. */
	bool guest_bsp_write = !host_initiated && vcpu->vcpu_id == 0;

	if (guest_bsp_write) {
		/*
		 * The guest switched between the old and the new kvmclock:
		 * request a masterclock update so the change is applied
		 * globally.
		 */
		if (old_msr != ka->boot_vcpu_runs_old_kvmclock)
			kvm_make_request(KVM_REQ_MASTERCLOCK_UPDATE, vcpu);

		/* Remember which kvmclock flavour the boot vCPU runs. */
		ka->boot_vcpu_runs_old_kvmclock = old_msr;
	}

	/* Despite the name this holds the raw MSR value (pvti GPA), not a time. */
	vcpu->arch.time = system_time;
	kvm_make_request(KVM_REQ_GLOBAL_CLOCK_UPDATE, vcpu);

	/* Enable bit clear: stop using the pv_time cache. */
	if (!(system_time & 1)) {
		kvm_gpc_deactivate(&vcpu->arch.pv_time);
		return;
	}

	/* Enable bit set: activate the cache on the GPA with bit 0 masked off. */
	kvm_gpc_activate(&vcpu->arch.pv_time, system_time & ~1ULL,
			 sizeof(struct pvclock_vcpu_time_info));
}
Kvmclock related features
CPUID[4000_0001].EAX
is the CPUID PV Features leaf (KVM_CPUID_FEATURES), of which bit 0 and bit 3 are both named "kvmclock". These 2 CPUID bits are passed from userspace to KVM. The difference between them is that bit 3 indicates that the new set of kvmclock MSRs is available:
#define KVM_CPUID_FEATURES 0x40000001
#define KVM_FEATURE_CLOCKSOURCE 0
/* This indicates that the new set of kvmclock msrs
* are available. The use of 0x11 and 0x12 is deprecated
*/
#define KVM_FEATURE_CLOCKSOURCE2 3
So there are 2 sets of MSRs.
If KVM_FEATURE_CLOCKSOURCE:
- MSR_KVM_WALL_CLOCK: 0x11
- MSR_KVM_SYSTEM_TIME: 0x12
If KVM_FEATURE_CLOCKSOURCE2:
- MSR_KVM_WALL_CLOCK_NEW: 0x4b564d00
- MSR_KVM_SYSTEM_TIME_NEW: 0x4b564d01
KVM_FEATURE_CLOCKSOURCE_STABLE_BIT
: Tell the guest that guest visible TSC value can be fully trusted for kvmclock computations and no warps are expected.
When starting from QEMU cmdline, CPUID
// arch/x86/kernel/kvmclock.c
struct clocksource kvm_clock = {
.name = "kvm-clock",
.read = kvm_clock_get_cycles,
.rating = 400,
.mask = CLOCKSOURCE_MASK(64),
.flags = CLOCK_SOURCE_IS_CONTINUOUS,
.enable = kvm_cs_enable,
};
EXPORT_SYMBOL_GPL(kvm_clock);
In QEMU:
[FEAT_KVM] = {
.type = CPUID_FEATURE_WORD,
.feat_names = {
"kvmclock", "kvm-nopiodelay", "kvm-mmu", "kvmclock",
"kvm-asyncpf", "kvm-steal-time", "kvm-pv-eoi", "kvm-pv-unhalt",
NULL, "kvm-pv-tlb-flush", NULL, "kvm-pv-ipi",
"kvm-poll-control", "kvm-pv-sched-yield", "kvm-asyncpf-int", "kvm-msi-ext-dest-id",
NULL, NULL, NULL, NULL,
NULL, NULL, NULL, NULL,
"kvmclock-stable-bit", NULL, NULL, NULL,
NULL, NULL, NULL, NULL,
},
.cpuid = { .eax = KVM_CPUID_FEATURES, .reg = R_EAX, },
.tcg_features = TCG_KVM_FEATURES,
},
kvmclock corresponding to KVM_FEATURE_CLOCKSOURCE.
PVclock, Linux guests only.
Guest kernel side for handling PV
// represents the hypervisor current guest is running on from the guest kernel's point of view
struct hypervisor_x86 {
/* Hypervisor name, e.g, "KVM" */
const char *name;
/* Detection routine */
uint32_t (*detect)(void);
/* Hypervisor type */
enum x86_hypervisor_type type;
/* init time callbacks */
struct x86_hyper_init init;
/* runtime callbacks */
struct x86_hyper_runtime runtime;
/* ignore nopv parameter */
bool ignore_nopv;
};
extern const struct hypervisor_x86 x86_hyper_vmware; // vmware
extern const struct hypervisor_x86 x86_hyper_ms_hyperv; // hyperv
extern const struct hypervisor_x86 x86_hyper_xen_pv; // xen
extern const struct hypervisor_x86 x86_hyper_kvm; // kvm
extern const struct hypervisor_x86 x86_hyper_jailhouse; // jailhouse
extern const struct hypervisor_x86 x86_hyper_acrn; // acrn
const __initconst struct hypervisor_x86 x86_hyper_kvm = {
.name = "KVM",
.detect = kvm_detect,
.type = X86_HYPER_KVM,
.init.guest_late_init = kvm_guest_init,
.init.x2apic_available = kvm_para_available,
.init.msi_ext_dest_id = kvm_msi_ext_dest_id,
.init.init_platform = kvm_init_platform,
};
How to disable kvmclock?
Add no-kvmclock
to guest kernel parameter, it will set the global variable kvmclock
to 0 to indicate kvmclock shouldn't be used in the guest.
/*
 * Handler for the "no-kvmclock" kernel parameter: clears the global
 * `kvmclock` flag so that kvmclock is not used in the guest.
 * @arg is not used. Always returns 0.
 */
static int __init parse_no_kvmclock(char *arg)
{
kvmclock = 0;
return 0;
}
/* Hook the handler up to "no-kvmclock" on the kernel command line. */
early_param("no-kvmclock", parse_no_kvmclock);
Tsc Virtualization
If the host uses TSC clock, then passthrough TSC as stable to the guest.
cpu_tsc_khz
It is a per cpu global variable:
// arch/x86/kvm/x86.c
static DEFINE_PER_CPU(unsigned long, cpu_tsc_khz);
Host tsc from KVM point of view:
/*
 * Return the host TSC frequency in kHz as KVM sees it.
 *
 * With a constant TSC the kernel's tsc_khz is used directly; otherwise KVM's
 * own per-CPU value is read (which may be zero if a CPU is going offline).
 * Note that tsc_khz can still change during boot even if the TSC is
 * constant, because KVM may be loaded before TSC calibration completes.
 * Ideally KVM would be notified when calibration completes, but practically
 * speaking calibration finishes before userspace is alive enough to create
 * VMs.
 */
static unsigned long get_cpu_tsc_khz(void)
{
	/* Non-constant TSC: fall back to the per-CPU frequency KVM tracks. */
	if (!static_cpu_has(X86_FEATURE_CONSTANT_TSC))
		return __this_cpu_read(cpu_tsc_khz);

	return tsc_khz;
}
It is modified each time the pCPU frequency changes:
notifier_call
kvmclock_cpufreq_notifier
static void __kvmclock_cpufreq_notifier(struct cpufreq_freqs *freq, int cpu)
{
//...
// __this_cpu_write(cpu_tsc_khz, khz);
smp_call_function_single(cpu, tsc_khz_changed, freq, 1);
//...
}
Tsc-related properties in kvm_vcpu_arch
struct kvm_vcpu_arch {
// same as pv_time, but it | 1.
gpa_t time;
struct pvclock_vcpu_time_info hv_clock;
unsigned int hw_tsc_khz;
// MSR of the "MSR_KVM_SYSTEM_TIME/MSR_KVM_SYSTEM_TIME_NEW", i.e., GPA of guest ptvi: vcpu->arch.time = system_time;
struct gfn_to_pfn_cache pv_time;
/* set guest stopped flag in pvclock flags field */
bool pvclock_set_guest_stopped_request;
u64 l1_tsc_offset;
u64 tsc_offset; // The final tsc_offset should write to the VMCS
u64 last_guest_tsc;
u64 last_host_tsc;
u64 tsc_offset_adjustment;
u64 this_tsc_nsec; // CLOCK_BOOTTIME
u64 this_tsc_write; // TSC value written by userspace
u64 this_tsc_generation;
bool tsc_catchup; // need to catchup due to some reason, e.g., tsc scaling is not supported by hardware.
bool tsc_always_catchup; // tsc scaling is not supported by hardware, and vtsc is larger than ptsc
// these 2 are used to convert nsec to tsc value
s8 virtual_tsc_shift;
u32 virtual_tsc_mult;
u32 virtual_tsc_khz;
s64 ia32_tsc_adjust_msr;
u64 l1_tsc_scaling_ratio;
u64 tsc_scaling_ratio; /* current scaling ratio */
};
// 有可能 last 的一直在变,但是 current 的没有变
struct kvm_arch {
//...
/*
* This also protects nr_vcpus_matched_tsc which is read from a
* preemption-disabled region, so it must be a raw spinlock.
*/
raw_spinlock_t tsc_write_lock;
// last host CLOCK_BOOTTIME
u64 last_tsc_nsec;
// last vTSC
u64 last_tsc_write;
u32 last_tsc_khz;
u64 last_tsc_offset;
// cur
u64 cur_tsc_nsec;
u64 cur_tsc_write;
u64 cur_tsc_offset;
u64 cur_tsc_generation;
int nr_vcpus_matched_tsc;
u32 default_tsc_khz;
//...
}
cur_tsc_nsec/last_tsc_nsec
, cur_tsc_write/last_tsc_write
, cur_tsc_offset/last_tsc_offset
每次 QEMU 写入 tsc MSR_IA32_TSC
,会记录 kvm->arch.last_tsc_nsec、kvm->arch.last_tsc_write、kvm->arch.last_tsc_khz,以供下次调用时使用:
- nsec 表示写入时刻的 Host Boot Time (CLOCK_BOOTTIME)
- write 表示写入的值
- khz 表示 vCPU 的 vTSCfreq
kvm->cur_tsc_generation
/ vcpu->this_tsc_generation
Each time QEMU sets the TSC MSR value (vTSC), KVM wants to make sure all the vTSCs stay synced (masterclock mode), so there are 2 cases:
kvm_synchronize_tsc
__kvm_synchronize_tsc
// if the vTSC is changed, and the masterclock mode is broken so we need to resync
if (!matched) {
/*
* We split periods of matched TSC writes into generations.
* For each generation, we track the original measured
* nanosecond time, offset, and write, so if TSCs are in
* sync, we can match exact offset, and if not, we can match
* exact software computation in compute_guest_tsc()
*
* These values are tracked in kvm->arch.cur_xxx variables.
*/
kvm->arch.cur_tsc_generation++;
kvm->arch.cur_tsc_nsec = ns;
kvm->arch.cur_tsc_write = tsc;
kvm->arch.cur_tsc_offset = offset;
kvm->arch.nr_vcpus_matched_tsc = 0;
} else if (vcpu->arch.this_tsc_generation != kvm->arch.cur_tsc_generation) {
kvm->arch.nr_vcpus_matched_tsc++;
}
/* Keep track of which generation this VCPU has synchronized to */
vcpu->arch.this_tsc_generation = kvm->arch.cur_tsc_generation;
The matched
will be true when:
/*
* For a reliable TSC, we can match TSC offsets, and for an unstable
* TSC, we add elapsed time in this computation. We could let the
* compensation code attempt to catch up if we fall behind, but
* it's better to try to match offsets from the beginning.
*/
if (synchronizing &&
vcpu->arch.virtual_tsc_khz == kvm->arch.last_tsc_khz) {
//...
matched = true;
}
vcpu->tsc_catchup
作用是 catchup 比 pTSCfreq 更高的 vTSCfreq。
除了 Always Catchup 模式,还有可能触发 catchup 行为。在 vCPU 加载时(kvm_arch_vcpu_load),如果 Host TSC Unstable,则会进行以下操作:
- 根据当前时刻的 Host TSC 值 pTSC、vCPU 上次记录的 TSC 值 vTSC = vcpu->arch.last_guest_tsc,求得 offset = vTSC - pTSC * scale,并将其写入 L1 TSC Offset。
- 然后设置 vcpu->arch.tsc_catchup。
这是一种保守的策略,我们先将 vTSC 调整到上次记录的 vTSC 值,这一定是比理论上的正确 vTSC 值小的,然后设置 vcpu->arch.tsc_catchup
,在接下来的 kvm_gen_kvmclock_update
中将 vTSC 的值修正为理论的正确值。
此后 vCPU 运行过程中,每次 KVM_REQ_CLOCK_UPDATE
请求,都会导致一次 catch up,这样至少每隔 300 秒都会有一次 catch up。
The only use case is following.
static int kvm_guest_time_update(struct kvm_vcpu *v)
{
//...
/*
* We may have to catch up the TSC to match elapsed wall clock
* time for two reasons, even if kvmclock is used.
* 1) CPU could have been running below the maximum TSC rate
* 2) Broken TSC compensation resets the base at each VCPU
* entry to avoid unknown leaps of TSC even when running
* again on the same CPU. This may cause apparent elapsed
* time to disappear, and the guest to stand still or run
* very slowly.
*/
if (vcpu->tsc_catchup) {
u64 tsc = compute_guest_tsc(v, kernel_ns);
if (tsc > tsc_timestamp) {
adjust_tsc_offset_guest(v, tsc - tsc_timestamp);
tsc_timestamp = tsc;
}
}
}
arch->nr_vcpus_matched_tsc
/*
* Infers attempts to synchronize the guest's tsc from host writes. Sets the
* offset for the vcpu and tracks the TSC matching generation that the vcpu
* participates in.
*/
static void __kvm_synchronize_tsc(struct kvm_vcpu *vcpu, u64 offset, u64 tsc, u64 ns, bool matched)
{
//...
if (!matched) {
/*
* We split periods of matched TSC writes into generations.
* For each generation, we track the original measured
* nanosecond time, offset, and write, so if TSCs are in
* sync, we can match exact offset, and if not, we can match
* exact software computation in compute_guest_tsc()
*
* These values are tracked in kvm->arch.cur_xxx variables.
*/
kvm->arch.nr_vcpus_matched_tsc = 0;
} else if (vcpu->arch.this_tsc_generation != kvm->arch.cur_tsc_generation) {
kvm->arch.nr_vcpus_matched_tsc++;
}
//...
}
Kvmclock processes
Kvmclock periodically update
每 300 秒(KVMCLOCK_SYNC_PERIOD
)调用一次 kvmclock_sync_fn
,它会调用 kvmclock_update_fn
,其会对每个 vCPU 发送 KVM_REQ_CLOCK_UPDATE
。
kvmclock_update_fn
// Send this request to **each** vcpu
kvm_make_request(KVM_REQ_CLOCK_UPDATE, vcpu);
// At **each** dst vcpu
vcpu_enter_guest()
if (kvm_check_request(KVM_REQ_CLOCK_UPDATE, vcpu))
kvm_guest_time_update(vcpu);
KVM update pvti
There are 2 MSRs holding the GPA of pvti
: MSR_KVM_SYSTEM_TIME/MSR_KVM_SYSTEM_TIME_NEW
struct pvti
of each vCPU only needs to be registered once by the guest with wrmsr (as discussed above, in kvmclock_init()); after that the host updates pvti
periodically.
struct pvclock_gtod_data
/ pvclock_clock
Only when use_master_clock
, pvclock_gtod_data
's data will be used, because it is a global data.
This variable is shared between all the VMs. It will be updated each host tick.
Also because it is shared between VMs, so it only record host-related clock data, not any guest's.
This struct is NOT shared to guest, it is used for KVM to track clock-related data:
static struct pvclock_gtod_data pvclock_gtod_data;
// Host-side snapshot of timekeeping data kept by KVM (see the single
// global instance `pvclock_gtod_data`); it is not shared with the guest.
struct pvclock_gtod_data {
// Why are there two pvclock_clock members? See 53fafdbb
//...
// CLOCK_MONOTONIC, corresponding to tkr_mono in struct timekeeper
struct pvclock_clock clock;
// CLOCK_MONOTONIC_RAW, corresponding to tkr_raw in struct timekeeper.
// kvmclock's base is the monotonic raw clock.
// See 53fafdbb
struct pvclock_clock raw_clock;
// Offset used to derive the host's CLOCK_BOOTTIME:
// CLOCK_BOOTTIME = CLOCK_MONOTONIC + offs_boot
ktime_t offs_boot;
// Seconds part of the host's CLOCK_REALTIME (wall time); the ns part
// is in pvclock_clock->base_cycles
u64 wall_time_sec;
};
// Per-clock copy of the host timekeeper's read-out parameters.
struct pvclock_clock {
// Which clocksource the host is using: VDSO_CLOCKMODE_TSC,
// VDSO_CLOCKMODE_PVCLOCK, VDSO_CLOCKMODE_HVCLOCK.
// The "v" denotes vDSO, not vCPU, so this is purely the host's mode,
// not the guest's.
int vclock_mode;
// Clocksource counter value; mirrors timekeeper.cycle_last
u64 cycle_last;
// Bitmask for two's complement subtraction of non 64bit clocks
u64 mask;
u32 mult; // See mult in timekeeper
u32 shift; // See shift in timekeeper
u64 base_cycles; // CLOCK_REALTIME (wall time) ns part
// Corresponds to the timekeeper's base,
// i.e., CLOCK_REALTIME (wall time).
// NOTE(review): confirm against update_pvclock_gtod() whether
// base_cycles/offset track CLOCK_REALTIME or the monotonic base.
u64 offset;
};
Struct pvclock_vsyscall_time_info
/ pvclock_vcpu_time_info
/ pvti
// each **vcpu->hv_clock** corresponding to a slot in this array (hv_clock_boot)
// e.g., this_cpu_write(hv_clock_per_cpu, &hv_clock_boot[0]);
// the size of this array must be 1 page. (_aligned(PAGE_SIZE);)
static struct pvclock_vsyscall_time_info hv_clock_boot[HVC_BOOT_ARRAY_SIZE]; //...
// A map from its cpu index to pvti
DEFINE_PER_CPU(struct pvclock_vsyscall_time_info *, hv_clock_per_cpu);
// Wrapper that holds one per-vCPU pvti; this is the element type of
// hv_clock_boot[] and of the hv_clock_per_cpu pointer.
struct pvclock_vsyscall_time_info {
struct pvclock_vcpu_time_info pvti;
}; //... (remainder elided in this excerpt)
// per vcpu
// This struct is shared between host and guest
// Per-vCPU time info ("pvti"): written by the host, read by the guest.
struct pvclock_vcpu_time_info {
u32 version; /* odd: the host is updating it; even: the update is done */
// Guest's vTSC value recorded when the host last updated this struct
u64 tsc_timestamp;
// Guest's CLOCK_BOOTTIME (in ns) recorded at that same update
u64 system_time;
// The following two fields convert a TSC delta to nanoseconds
// (they encode the vTSC frequency);
// see function __pvclock_read_cycles()
u32 tsc_to_system_mul;
s8 tsc_shift;
u8 flags;
//...
} __attribute__((__packed__)); /* 32 bytes */
Host's update to pvti
timekeeper/KVM_SET_CLOCK -> pvclock_gtod_data -> kvm_arch -> vcpu->hv_clock -> pvti -> guest
.
Host update to pvclock_gtod_data
KVM uses the pvclock_gtod_register_notifier
function to register a callback, pvclock_gtod_notify
, with the timekeeping layer; each time the host kernel updates its time, i.e., timekeeping_update
is invoked, pvclock_gtod_notify
will be called.
pvclock_gtod_notify
:
static struct pvclock_gtod_data pvclock_gtod_data;
// Timekeeper notifier callback: refreshes pvclock_gtod_data and, if the
// host has left a TSC-based clocksource, schedules the masterclock update.
// `priv` is the struct timekeeper being published; always returns 0.
static int pvclock_gtod_notify(struct notifier_block *nb, unsigned long unused,
void *priv)
{
struct pvclock_gtod_data *gtod = &pvclock_gtod_data;
struct timekeeper *tk = priv;
// 1. Update the pvclock_gtod_data variable using tk's values
update_pvclock_gtod(tk);
// 2. Corner case:
// if the clocksource is not TSC-based but the global variable
// kvm_guest_has_master_clock is non-zero, the clocksource has changed
// from TSC to non-TSC. Every vCPU of every VM is then sent the request
// KVM_REQ_MASTERCLOCK_UPDATE, and kvm_guest_has_master_clock is set to 0
// to stop using the masterclock (both done in pvclock_gtod_update_fn).
if (!gtod_is_based_on_tsc(gtod->clock.vclock_mode) &&
atomic_read(&kvm_guest_has_master_clock) != 0)
// Queue the irq work; it finally ends up calling
// pvclock_gtod_update_fn, which sends the requests described above.
irq_work_queue(&pvclock_irq_work);
return 0;
}
若 clocksource 不是 TSC,但全局变量 kvm_guest_has_master_clock
非零,说明 clocksource 从 TSC 变为了非 TSC,此时向所有 vCPU 发送 KVM_REQ_MASTERCLOCK_UPDATE
,然后令 kvm_guest_has_master_clock
= 0。
pvclock_gtod_update_fn
:
pvclock_gtod_notify
irq_work_queue(&pvclock_irq_work);
//...
pvclock_gtod_update_fn
// Work handler reached from pvclock_gtod_notify via pvclock_irq_work:
// asks every vCPU of every VM to re-evaluate the masterclock, then clears
// the global "some guest has a masterclock" flag.
static void pvclock_gtod_update_fn(struct work_struct *work)
{
struct kvm *kvm;
struct kvm_vcpu *vcpu;
unsigned long i;
// Walk every VM on vm_list and every vCPU inside it.
list_for_each_entry(kvm, &vm_list, vm_list)
kvm_for_each_vcpu(i, vcpu, kvm)
kvm_make_request(KVM_REQ_MASTERCLOCK_UPDATE, vcpu);
// Reset the flag after all requests have been posted.
atomic_set(&kvm_guest_has_master_clock, 0);
}
Host's update to kvm_arch
directly (KVM_SET_CLOCK
):
E.g., QEMU will call this ioctl kvm_vm_ioctl(KVM_SET_CLOCK)
:
// Register a callback function that is invoked when the vm starts or stops running.
qemu_add_vm_change_state_handler(kvmclock_vm_state_change)
ret = kvm_vm_ioctl(kvm_state, KVM_SET_CLOCK, &data);
vm_start
// vm_prepare_start: Prepare for starting/resuming the VM
vm_prepare_start
vm_state_notify
e->cb // kvmclock_vm_state_change
if (running) {
ret = kvm_vm_ioctl(kvm_state, KVM_SET_CLOCK, &data);
}
// pause a VM from running
vm_stop
do_vm_stop
vm_state_notify
e->cb // kvmclock_vm_state_change
In KVM:
// KVM_SET_CLOCK handler (abridged): derives ka->kvmclock_offset so that the
// guest's system time equals the requested data.clock. Returns 0.
// The declarations of ka, data and now_raw_ns are elided in this excerpt.
static int kvm_vm_ioctl_set_clock(struct kvm *kvm, void __user *argp)
{
//...
// Confirm whether master clock mode is in use or not
pvclock_update_vm_gtod_copy(kvm);
// If we are using the master clock, use the per-VM snapshot
// (ka->master_kernel_ns) directly; otherwise use the host's
// CLOCK_BOOTTIME computed from the timekeeping layer.
if (ka->use_master_clock)
now_raw_ns = ka->master_kernel_ns;
else
now_raw_ns = get_kvmclock_base_ns();
// data.clock is the system time we want to set.
// Because system time = host's CLOCK_BOOTTIME + kvmclock_offset,
// kvmclock_offset is obtained by working backwards.
ka->kvmclock_offset = data.clock - now_raw_ns;
return 0;
}
Host update kvm_arch
's use_master_clock
based on pvclock_gtod_data
/ pvclock_update_vm_gtod_copy
This function mainly:
- judge if master clock mode should be used, and set the related status.
- set status from
pvclock_gtod_data
to kvm_arch
.
KVM_REQ_MASTERCLOCK_UPDATE
kvm_update_masterclock
pvclock_update_vm_gtod_copy
// Decides whether this VM may use masterclock mode and records the result
// in kvm->arch; also snapshots master_kernel_ns / master_cycle_now.
static void pvclock_update_vm_gtod_copy(struct kvm *kvm)
{
struct kvm_arch *ka = &kvm->arch;
int vclock_mode;
bool host_tsc_clocksource, vcpus_matched;
// All online vCPUs are matched when the matched count plus one
// equals the number of online vCPUs.
vcpus_matched = (ka->nr_vcpus_matched_tsc + 1 == atomic_read(&kvm->online_vcpus));
/*
* If the host uses TSC as the clocksource, then passthrough TSC as stable
* to the guest.
*/
// Fills ka->master_kernel_ns and ka->master_cycle_now; the return
// value is used as "host clocksource is TSC".
host_tsc_clocksource = kvm_get_time_and_clockread(
&ka->master_kernel_ns,
&ka->master_cycle_now);
// Masterclock mode is used only if all of the following hold:
// - the host uses TSC as its clocksource
// - the vCPUs' TSCs are matched
// - no backwards TSC has been observed
// - the boot vCPU is not running with an old kvmclock
ka->use_master_clock = host_tsc_clocksource && vcpus_matched
&& !ka->backwards_tsc_observed
&& !ka->boot_vcpu_runs_old_kvmclock;
// Record globally that at least one guest uses the masterclock.
if (ka->use_master_clock)
atomic_set(&kvm_guest_has_master_clock, 1);
}
From kvm_arch
to vcpu->hv_clock
then pvti
/ kvm_guest_time_update
This function is executed when receiving the request KVM_REQ_CLOCK_UPDATE
.
In kvm_guest_time_update
, we will first get 2 variable's value:
-
kernel_ns
: denotes host's CLOCK_BOOTTIME -
host_tsc
: denotes host's TSC value
If use masterclock:
-
kernel_ns
: use masterclock per-VM value in kvm_arch: ka->master_kernel_ns
. -
host_tsc
: use masterclock per-VM value: ka->master_cycle_now
.
else:
-
kernel_ns
: Calculate host's CLOCK_BOOTTIME
using tk's value. -
host_tsc
:rdtsc()
这个函数主要:
- Assign
kvm_arch
's value to vcpu->hv_clock
; - Write
vcpu->hv_clock
's content to guest's pvti.
// Handler for KVM_REQ_CLOCK_UPDATE (abridged): recomputes this vCPU's
// hv_clock (tsc_timestamp, system_time, TSC->ns scale) and publishes it to
// the guest's pvti. Returns 0.
static int kvm_guest_time_update(struct kvm_vcpu *v)
{
unsigned long flags, tgt_tsc_khz;
unsigned seq;
struct kvm_vcpu_arch *vcpu = &v->arch;
struct kvm_arch *ka = &v->kvm->arch;
s64 kernel_ns;
u64 tsc_timestamp, host_tsc;
u8 pvclock_flags;
bool use_master_clock;
// NOTE(review): this excerpt does not show where use_master_clock is
// assigned; presumably it is copied from ka->use_master_clock in the
// elided code -- confirm against the full source.
// **This is where pvclock_gtod_data comes into play**:
// master_cycle_now and master_kernel_ns are
// both calculated using pvclock_gtod_data.
// If use_master_clock is not set, then
// pvclock_gtod_data won't be used.
if (use_master_clock) {
host_tsc = ka->master_cycle_now;
kernel_ns = ka->master_kernel_ns;
} else {
host_tsc = rdtsc();
kernel_ns = get_kvmclock_base_ns();
}
//...
// Convert the host TSC sample into the guest's TSC; this is the TSC
// value the guest itself would read.
tsc_timestamp = kvm_read_l1_tsc(v, host_tsc);
/*
* Catch-up corner case: if the software-computed guest TSC for
* kernel_ns is ahead of the value just derived, advance the guest's
* TSC offset by the difference and use the computed value instead.
* (Per the notes above, catch-up is not used when the host supports
* TSC scaling.)
*/
if (vcpu->tsc_catchup) {
u64 tsc = compute_guest_tsc(v, kernel_ns);
if (tsc > tsc_timestamp) {
adjust_tsc_offset_guest(v, tsc - tsc_timestamp);
tsc_timestamp = tsc;
}
}
/* With all the info we got, fill in the values */
// Calculate vTSCfreq.
// NOTE(review): the initial value of tgt_tsc_khz is set in code elided
// from this excerpt.
if (kvm_caps.has_tsc_control)
tgt_tsc_khz = kvm_scale_tsc(tgt_tsc_khz,
v->arch.l1_tsc_scaling_ratio);
// Recalculate tsc_shift and tsc_to_system_mul only when the target
// frequency differs from the cached hw_tsc_khz.
if (unlikely(vcpu->hw_tsc_khz != tgt_tsc_khz)) {
kvm_get_time_scale(NSEC_PER_SEC, tgt_tsc_khz * 1000LL,
&vcpu->hv_clock.tsc_shift,
&vcpu->hv_clock.tsc_to_system_mul);
vcpu->hw_tsc_khz = tgt_tsc_khz;
}
// Assign value to hv_clock.tsc_timestamp
vcpu->hv_clock.tsc_timestamp = tsc_timestamp;
//...
// Recalculate system time (guest's CLOCK_BOOTTIME):
// host-side ns plus the per-VM kvmclock_offset.
vcpu->hv_clock.system_time = kernel_ns + v->kvm->arch.kvmclock_offset;
//...
// Copy from hv_clock to the guest's pvti, if the guest registered one
if (vcpu->pv_time.active)
kvm_setup_guest_pvclock(v, &vcpu->pv_time, 0);
// Some Xen code elided here...
// NOTE(review): by its name, the call below sets up the Hyper-V TSC
// page from the same hv_clock (it is not part of the Xen code).
kvm_hv_setup_tsc_page(v->kvm, &vcpu->hv_clock);
return 0;
}
System time (CLOCK_BOOTTIME) using tsc
Just rdtsc
is ok, and guest kernel won't write to tsc according to the code.
System time (CLOCK_BOOTTIME) using kvmclock
System time is the time since guest kernel boot (Guest's CLOCK_BOOTTIME
).
Invariant TSC exposed by host means kvmclock is not necessary: Will use tsc as clocksource. Because TSC can also provide CLOCK_BOOTTIME
. This way kvmclock is only used to provide wall clock. This is determined by the following process:
kvmclock_init
if (boot_cpu_has(X86_FEATURE_CONSTANT_TSC) &&
boot_cpu_has(X86_FEATURE_NONSTOP_TSC) &&
!check_tsc_unstable())
// tsc's rating is 300, so this will have a lower priority
// than tsc
kvm_clock.rating = 299;
clocksource_register_hz
__clocksource_register_scale
// Add current clocksource in to the global clocksource_list
clocksource_enqueue
list_for_each_entry(tmp, &clocksource_list, list) {
/* Keep track of the place, where to insert */
if (tmp->rating < cs->rating)
break;
entry = &tmp->list;
}
list_add(&cs->list, entry);
// switch to the **best** clocksource
clocksource_select();
Difference between system time and CLOCK_BOOTTIME
They are the same, system time is guest's CLOCK_BOOTTIME
.
好像是通过 Host's CLOCK_BOOTTIME
+ Kvmclock Offset 来计算的。
system_time = kernel_ns + kvm->arch.kvmclock_offset
Guest read system time
variable kvm_clock
is a type of clocksource in guest kernel.
// Guest-side clocksource backed by kvmclock.
struct clocksource kvm_clock = {
.name = "kvm-clock",
// The read callback returns the system time in ns:
// system time = Host Boot Time + Kvmclock Offset
.read = kvm_clock_get_cycles,
// Default rating; kvmclock_init lowers it to 299 when the TSC is
// constant, nonstop and not marked unstable (see above).
.rating = 400,
.mask = CLOCKSOURCE_MASK(64),
.flags = CLOCK_SOURCE_IS_CONTINUOUS,
.enable = kvm_cs_enable,
};
Guest read system time using function kvm_clock_get_cycles
.
kvm_clock_get_cycles
/ pvclock_clocksource_read
This function read CLOCK_BOOTTIME.
Read from the shared variable: pvti
.
kvm_clock_get_cycles
kvm_clock_read
pvclock_clocksource_read(this_cpu_pvti())
// Guest-side read of the system time (CLOCK_BOOTTIME, in ns) from the
// shared pvti `src` (abridged excerpt; the code using version, last, flags
// and dowd is elided).
static __always_inline
u64 __pvclock_clocksource_read(struct pvclock_vcpu_time_info *src, bool dowd)
{
unsigned version;
u64 ret;
u64 last;
u8 flags;
// Calculate the guest's current CLOCK_BOOTTIME: start from the
// CLOCK_BOOTTIME recorded at the last pvti update and add the time
// elapsed since then, measured with the guest's TSC.
ret = __pvclock_read_cycles(src, rdtsc_ordered());
//...
return ret;
}
__pvclock_read_cycles()
// pvti->system_time is the CLOCK_BOOTTIME at the moment KVM updated the pvti.
// By the time the guest reads it, some time has elapsed, so we add the
// elapsed time, computed from (guest's current tsc - tsc_timestamp recorded
// at the update) and scaled to nanoseconds.
u64 __pvclock_read_cycles(const struct pvclock_vcpu_time_info *src, u64 tsc)
{
// tsc is the guest's tsc; delta is the number of cycles since the update.
u64 delta = tsc - src->tsc_timestamp;
// Scale the cycle delta using the mul/shift pair published in the pvti.
u64 offset = pvclock_scale_delta(delta, src->tsc_to_system_mul, src->tsc_shift);
return src->system_time + offset;
}
`
Wall clock using kvmclock x86_platform.get_wallclock
This function is used to get the CLOCK_REALTIME
(wall clock).
kvmclock doesn't allow the wall clock to be written:
// x86_platform.set_wallclock = kvm_set_wallclock;
// Setting the wall clock through kvmclock is not supported: this callback
// ignores `now` and always returns -ENODEV.
static int kvm_set_wallclock(const struct timespec64 *now)
{
return -ENODEV;
}
How does the guest get the wall clock when kvmclock is not used as the clocksource (because tsc is better)?
When the guest detects that it is running on KVM, it goes through the following call trace:
start_kernel
setup_arch
init_hypervisor_platform
x86_init.hyper.init_platform(); // kvm_init_platform
kvmclock_init
// 1. add the callback function for retrieving wallclock
x86_platform.get_wallclock = kvm_get_wallclock;
// 2. write the pvti address to MSR_KVM_SYSTEM_TIME_NEW
kvm_register_clock("primary cpu clock");
So, although clocksource kvmclock
in guest kernel is not enabled, the pvti
struct is also
- registered by function
kvm_register_clock
and, - updated constantly by function
kvm_guest_time_update
so kvm_get_wallclock
can also get the needed information without clocksource kvmclock
.
Guest issue wall clock read request
Guest kernel has a global variable wall_clock
which holds the real (wall-clock) time at which the guest kernel booted (已经用 trace_printk
验证过了,的确是 guest 的), thus the guest calculates the current wall clock by adding the system time to it.
static struct pvclock_wall_clock wall_clock __bss_decrypted;
/*
* Holds the true (wall-clock) time at which the guest booted, so the guest
* can calculate the current wall time by adding the system time (time
* elapsed since boot) to it.
*
* It is not updated frequently, because the guest's boot time is fixed and
* the guest does not issue many writes to
* MSR_KVM_WALL_CLOCK/MSR_KVM_WALL_CLOCK_NEW.
*/
struct pvclock_wall_clock {
u32 version;
u32 sec;
u32 nsec;
} __attribute__((__packed__));
The guest reads the wall clock by first writing to MSR_KVM_WALL_CLOCK
/MSR_KVM_WALL_CLOCK_NEW
with the GPA of the wall_clock
global variable; a WRMSR VM exit is then triggered and KVM writes the real data to the address provided.
// Wall clock is read after booting the kernel and
// kvmclock is initialized.
start_kernel
setup_arch
init_hypervisor_platform
kvm_init_platform // x86_init.hyper.init_platform()
kvmclock_init
timekeeping_init
read_persistent_wall_and_boot_offset
read_persistent_clock64
kvm_get_wallclock // x86_platform.get_wallclock(ts)
// Wall clock is also read when guest suspend/resume
// https://www.cnblogs.com/haiyonghao/p/14440035.html
timekeeping_resume // timekeeping_syscore_ops.resume()
read_persistent_clock64
kvm_get_wallclock // x86_platform.get_wallclock(ts)
// x86_platform.get_wallclock callback for KVM guests: stores the current
// wall-clock time in `now`.
static void kvm_get_wallclock(struct timespec64 *now)
{
// Write the physical address of the wall_clock global variable to the
// wall-clock MSR; this triggers a WRMSR VM exit, and KVM then writes
// the real data to the GPA given in the MSR.
wrmsrl(msr_kvm_wall_clock, slow_virt_to_phys(&wall_clock));
// The host side (KVM) has filled in the data and returned, so the guest
// can now combine wall_clock with this CPU's pvti.
pvclock_read_wallclock(&wall_clock, this_cpu_pvti(), now);
}
pvclock_read_wallclock
/ pvclock_clocksource_read
/ Guest get wall clock from the shared memory
This function will use the data from 2 shared variables:
pvti
wall_clock
pvclock_read_wallclock
pvclock_clocksource_read
__pvclock_read_cycles
// Computes the current wall-clock time into `ts` from two shared variables:
// `wall_clock` (wall time at guest boot) and `vcpu_time` (the pvti used to
// read the system time). Abridged excerpt.
void pvclock_read_wallclock(struct pvclock_wall_clock *wall_clock,
struct pvclock_vcpu_time_info *vcpu_time,
struct timespec64 *ts)
{
u32 version;
u64 delta;
struct timespec64 now;
// The value stored in the shared variable wall_clock
// is the real time at which the guest booted.
//...
now.tv_sec = wall_clock->sec;
now.tv_nsec = wall_clock->nsec;
//...
// As discussed before, this function returns the guest's
// system time, i.e., CLOCK_BOOTTIME, in ns.
delta = pvclock_clocksource_read(vcpu_time);
// Add the boot-time wall clock (converted to ns) to CLOCK_BOOTTIME
delta += now.tv_sec * NSEC_PER_SEC + now.tv_nsec;
// Split the ns total back into seconds and nanoseconds: do_div()
// leaves the quotient in delta and returns the remainder.
now.tv_nsec = do_div(delta, NSEC_PER_SEC);
now.tv_sec = delta;
// Store the result
set_normalized_timespec64(ts, now.tv_sec, now.tv_nsec);
}
KVM prepare wall clock data
kvm_emulate_wrmsr
kvm_set_msr_with_filter
kvm_set_msr_ignored_check
__kvm_set_msr
vmx_set_msr
kvm_set_msr_common
case MSR_KVM_WALL_CLOCK/MSR_KVM_WALL_CLOCK_NEW:
kvm_write_wall_clock
// Host side of the wall-clock MSR write (abridged): computes the guest's
// boot-time wall clock and writes it to guest memory at GPA `wall_clock`.
// When `sec_hi_ofs` is non-zero, an additional value (wc_sec_hi) is written
// at wall_clock + sec_hi_ofs.
static void kvm_write_wall_clock(struct kvm *kvm, gpa_t wall_clock, int sec_hi_ofs)
{
//...
/*
* The **guest** calculates true wall clock time by adding
* system time (updated by kvm_guest_time_update below) to the
* value we provide. So we calculate the value in the reverse
* way: wall_time - time_since_boot
* - ktime_get_real_ns(): get current wall time
* - get_kvmclock_ns(): read kvmclock, i.e., CLOCK_BOOTTIME
*/
wall_nsec = ktime_get_real_ns() - get_kvmclock_ns(kvm);
//...
// Write to the host-guest shared variable "wall_clock"
kvm_write_guest(kvm, wall_clock, &wc, sizeof(wc));
// Presumably for the future, when the 32-bit sec field is not enough:
// the bits above the low 32 are written separately.
if (sec_hi_ofs) {
wc_sec_hi = wall_nsec >> 32;
kvm_write_guest(kvm, wall_clock + sec_hi_ofs,
&wc_sec_hi, sizeof(wc_sec_hi));
}
//...
}
ktime_get_real_ns()
/ Get the real wall time
/* Returns the real (wall) time from ktime_get_real(), converted to ns. */
static inline u64 ktime_get_real_ns(void)
{
return ktime_to_ns(ktime_get_real());
}
/**
* ktime_get_real - get the real (wall-) time in ktime_t format
*
* Implemented as ktime_get_with_offset() with the TK_OFFS_REAL offset.
*/
static inline ktime_t ktime_get_real(void)
{
return ktime_get_with_offset(TK_OFFS_REAL);
}
get_kvmclock_ns
/ get_kvmclock
This function gets guest's system time (CLOCK_BOOTTIME
).
get_kvmclock_ns
get_kvmclock
__get_kvmclock
// Computes the guest's current system time (CLOCK_BOOTTIME) into
// data->clock (abridged excerpt). With the masterclock it also records the
// host TSC sample in data->host_tsc.
static void __get_kvmclock(struct kvm *kvm, struct kvm_clock_data *data)
{
struct kvm_arch *ka = &kvm->arch;
struct pvclock_vcpu_time_info hv_clock;
if (ka->use_master_clock) { //...
struct timespec64 ts;
//...
// Host's current TSC value
data->host_tsc = rdtsc();
// Host's TSC value recorded at the last masterclock update
hv_clock.tsc_timestamp = ka->master_cycle_now;
// Guest's CLOCK_BOOTTIME at the last masterclock update
hv_clock.system_time = ka->master_kernel_ns + ka->kvmclock_offset;
// Some time has elapsed since that update, so compensate: use the
// host TSC delta to compute the elapsed time and add it to the
// guest's CLOCK_BOOTTIME.
data->clock = __pvclock_read_cycles(&hv_clock, data->host_tsc);
} else {
// Host's CLOCK_BOOTTIME + kvmclock_offset
data->clock = get_kvmclock_base_ns() + ka->kvmclock_offset;
}
}