/*- * SPDX-License-Identifier: BSD-2-Clause * * Copyright (c) 2011 NetApp, Inc. * All rights reserved. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * * THIS SOFTWARE IS PROVIDED BY NETAPP, INC ``AS IS'' AND * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE * ARE DISCLAIMED. IN NO EVENT SHALL NETAPP, INC OR CONTRIBUTORS BE LIABLE * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF * SUCH DAMAGE. */ #include <sys/cdefs.h> #include <sys/param.h> #include <sys/systm.h> #include <sys/proc.h> #include <machine/clock.h> #include <machine/cpufunc.h> #include <machine/md_var.h> #include <machine/pcb.h> #include <machine/specialreg.h> #include <machine/vmm.h> #include "vmx.h" #include "vmx_msr.h" #include "x86.h" static bool vmx_ctl_allows_one_setting(uint64_t msr_val, int bitpos) { return ((msr_val & (1UL << (bitpos + 32))) != 0); } static bool vmx_ctl_allows_zero_setting(uint64_t msr_val, int bitpos) { return ((msr_val & (1UL << bitpos)) == 0); } uint32_t vmx_revision(void) { return (rdmsr(MSR_VMX_BASIC) & 0xffffffff); } /* * Generate a bitmask to be used for the VMCS execution control fields. * * The caller specifies what bits should be set to one in 'ones_mask' * and what bits should be set to zero in 'zeros_mask'. The don't-care * bits are set to the default value. The default values are obtained * based on "Algorithm 3" in Section 27.5.1 "Algorithms for Determining * VMX Capabilities". * * Returns zero on success and non-zero on error. */ int vmx_set_ctlreg(int ctl_reg, int true_ctl_reg, uint32_t ones_mask, uint32_t zeros_mask, uint32_t *retval) { int i; uint64_t val, trueval; bool true_ctls_avail, one_allowed, zero_allowed; /* We cannot ask the same bit to be set to both '1' and '0' */ if ((ones_mask ^ zeros_mask) != (ones_mask | zeros_mask)) return (EINVAL); true_ctls_avail = (rdmsr(MSR_VMX_BASIC) & (1UL << 55)) != 0; val = rdmsr(ctl_reg); if (true_ctls_avail) trueval = rdmsr(true_ctl_reg); /* step c */ else trueval = val; /* step a */ for (i = 0; i < 32; i++) { one_allowed = vmx_ctl_allows_one_setting(trueval, i); zero_allowed = vmx_ctl_allows_zero_setting(trueval, i); KASSERT(one_allowed || zero_allowed, ("invalid zero/one setting for bit %d of ctl 0x%0x, " "truectl 0x%0x\n", i, ctl_reg, true_ctl_reg)); if (zero_allowed && !one_allowed) { /* b(i),c(i) */ if (ones_mask & (1 << i)) return (EINVAL); *retval &= ~(1 << i); } else if (one_allowed && !zero_allowed) { /* b(i),c(i) */ if (zeros_mask & (1 << i)) return (EINVAL); *retval |= 1 << i; } else { if (zeros_mask & (1 << i)) /* b(ii),c(ii) */ *retval &= ~(1 << i); else if (ones_mask & (1 << i)) /* b(ii), c(ii) */ *retval |= 1 << i; else if (!true_ctls_avail) *retval &= ~(1 << i); /* b(iii) */ else if (vmx_ctl_allows_zero_setting(val, i))/* c(iii)*/ *retval &= ~(1 << i); else if (vmx_ctl_allows_one_setting(val, i)) /* c(iv) */ *retval |= 1 << i; else { panic("vmx_set_ctlreg: unable to determine " "correct value of ctl bit %d for msr " "0x%0x and true msr 0x%0x", i, ctl_reg, true_ctl_reg); } } } return (0); } void msr_bitmap_initialize(char *bitmap) { memset(bitmap, 0xff, PAGE_SIZE); } int msr_bitmap_change_access(char *bitmap, u_int msr, int access) { int byte, bit; if (msr <= 0x00001FFF) byte = msr / 8; else if (msr >= 0xC0000000 && msr <= 0xC0001FFF) byte = 1024 + (msr - 0xC0000000) / 8; else return (EINVAL); bit = msr & 0x7; if (access & MSR_BITMAP_ACCESS_READ) bitmap[byte] &= ~(1 << bit); else bitmap[byte] |= 1 << bit; byte += 2048; if (access & MSR_BITMAP_ACCESS_WRITE) bitmap[byte] &= ~(1 << bit); else bitmap[byte] |= 1 << bit; return (0); } static uint64_t misc_enable; static uint64_t platform_info; static uint64_t turbo_ratio_limit; static uint64_t host_msrs[GUEST_MSR_NUM]; static bool nehalem_cpu(void) { u_int family, model; /* * The family:model numbers belonging to the Nehalem microarchitecture * are documented in Section 35.5, Intel SDM dated Feb 2014. */ family = CPUID_TO_FAMILY(cpu_id); model = CPUID_TO_MODEL(cpu_id); if (family == 0x6) { switch (model) { case 0x1A: case 0x1E: case 0x1F: case 0x2E: return (true); default: break; } } return (false); } static bool westmere_cpu(void) { u_int family, model; /* * The family:model numbers belonging to the Westmere microarchitecture * are documented in Section 35.6, Intel SDM dated Feb 2014. */ family = CPUID_TO_FAMILY(cpu_id); model = CPUID_TO_MODEL(cpu_id); if (family == 0x6) { switch (model) { case 0x25: case 0x2C: return (true); default: break; } } return (false); } static bool pat_valid(uint64_t val) { int i, pa; /* * From Intel SDM: Table "Memory Types That Can Be Encoded With PAT" * * Extract PA0 through PA7 and validate that each one encodes a * valid memory type. */ for (i = 0; i < 8; i++) { pa = (val >> (i * 8)) & 0xff; if (pa == 2 || pa == 3 || pa >= 8) return (false); } return (true); } void vmx_msr_init(void) { uint64_t bus_freq, ratio; int i; /* * It is safe to cache the values of the following MSRs because * they don't change based on curcpu, curproc or curthread. */ host_msrs[IDX_MSR_LSTAR] = rdmsr(MSR_LSTAR); host_msrs[IDX_MSR_CSTAR] = rdmsr(MSR_CSTAR); host_msrs[IDX_MSR_STAR] = rdmsr(MSR_STAR); host_msrs[IDX_MSR_SF_MASK] = rdmsr(MSR_SF_MASK); /* * Initialize emulated MSRs */ misc_enable = rdmsr(MSR_IA32_MISC_ENABLE); /* * Set mandatory bits * 11: branch trace disabled * 12: PEBS unavailable * Clear unsupported features * 16: SpeedStep enable * 18: enable MONITOR FSM */ misc_enable |= (1 << 12) | (1 << 11); misc_enable &= ~((1 << 18) | (1 << 16)); if (nehalem_cpu() || westmere_cpu()) bus_freq = 133330000; /* 133Mhz */ else bus_freq = 100000000; /* 100Mhz */ /* * XXXtime * The ratio should really be based on the virtual TSC frequency as * opposed to the host TSC. */ ratio = (tsc_freq / bus_freq) & 0xff; /* * The register definition is based on the micro-architecture * but the following bits are always the same: * [15:8] Maximum Non-Turbo Ratio * [28] Programmable Ratio Limit for Turbo Mode * [29] Programmable TDC-TDP Limit for Turbo Mode * [47:40] Maximum Efficiency Ratio * * The other bits can be safely set to 0 on all * micro-architectures up to Haswell. */ platform_info = (ratio << 8) | (ratio << 40); /* * The number of valid bits in the MSR_TURBO_RATIO_LIMITx register is * dependent on the maximum cores per package supported by the micro- * architecture. For e.g., Westmere supports 6 cores per package and * uses the low 48 bits. Sandybridge support 8 cores per package and * uses up all 64 bits. * * However, the unused bits are reserved so we pretend that all bits * in this MSR are valid. */ for (i = 0; i < 8; i++) turbo_ratio_limit = (turbo_ratio_limit << 8) | ratio; } void vmx_msr_guest_init(struct vmx *vmx, struct vmx_vcpu *vcpu) { /* * The permissions bitmap is shared between all vcpus so initialize it * once when initializing the vBSP. */ if (vcpu->vcpuid == 0) { guest_msr_rw(vmx, MSR_LSTAR); guest_msr_rw(vmx, MSR_CSTAR); guest_msr_rw(vmx, MSR_STAR); guest_msr_rw(vmx, MSR_SF_MASK); guest_msr_rw(vmx, MSR_KGSBASE); } /* * Initialize guest IA32_PAT MSR with default value after reset. */ vcpu->guest_msrs[IDX_MSR_PAT] = PAT_VALUE(0, PAT_WRITE_BACK) | PAT_VALUE(1, PAT_WRITE_THROUGH) | PAT_VALUE(2, PAT_UNCACHED) | PAT_VALUE(3, PAT_UNCACHEABLE) | PAT_VALUE(4, PAT_WRITE_BACK) | PAT_VALUE(5, PAT_WRITE_THROUGH) | PAT_VALUE(6, PAT_UNCACHED) | PAT_VALUE(7, PAT_UNCACHEABLE); return; } void vmx_msr_guest_enter(struct vmx_vcpu *vcpu) { /* Save host MSRs (in particular, KGSBASE) and restore guest MSRs */ update_pcb_bases(curpcb); wrmsr(MSR_LSTAR, vcpu->guest_msrs[IDX_MSR_LSTAR]); wrmsr(MSR_CSTAR, vcpu->guest_msrs[IDX_MSR_CSTAR]); wrmsr(MSR_STAR, vcpu->guest_msrs[IDX_MSR_STAR]); wrmsr(MSR_SF_MASK, vcpu->guest_msrs[IDX_MSR_SF_MASK]); wrmsr(MSR_KGSBASE, vcpu->guest_msrs[IDX_MSR_KGSBASE]); } void vmx_msr_guest_enter_tsc_aux(struct vmx *vmx, struct vmx_vcpu *vcpu) { uint64_t guest_tsc_aux = vcpu->guest_msrs[IDX_MSR_TSC_AUX]; uint32_t host_aux = cpu_auxmsr(); if (vmx_have_msr_tsc_aux && guest_tsc_aux != host_aux) wrmsr(MSR_TSC_AUX, guest_tsc_aux); } void vmx_msr_guest_exit(struct vmx_vcpu *vcpu) { /* Save guest MSRs */ vcpu->guest_msrs[IDX_MSR_LSTAR] = rdmsr(MSR_LSTAR); vcpu->guest_msrs[IDX_MSR_CSTAR] = rdmsr(MSR_CSTAR); vcpu->guest_msrs[IDX_MSR_STAR] = rdmsr(MSR_STAR); vcpu->guest_msrs[IDX_MSR_SF_MASK] = rdmsr(MSR_SF_MASK); vcpu->guest_msrs[IDX_MSR_KGSBASE] = rdmsr(MSR_KGSBASE); /* Restore host MSRs */ wrmsr(MSR_LSTAR, host_msrs[IDX_MSR_LSTAR]); wrmsr(MSR_CSTAR, host_msrs[IDX_MSR_CSTAR]); wrmsr(MSR_STAR, host_msrs[IDX_MSR_STAR]); wrmsr(MSR_SF_MASK, host_msrs[IDX_MSR_SF_MASK]); /* MSR_KGSBASE will be restored on the way back to userspace */ } void vmx_msr_guest_exit_tsc_aux(struct vmx *vmx, struct vmx_vcpu *vcpu) { uint64_t guest_tsc_aux = vcpu->guest_msrs[IDX_MSR_TSC_AUX]; uint32_t host_aux = cpu_auxmsr(); if (vmx_have_msr_tsc_aux && guest_tsc_aux != host_aux) /* * Note that it is not necessary to save the guest value * here; vcpu->guest_msrs[IDX_MSR_TSC_AUX] always * contains the current value since it is updated whenever * the guest writes to it (which is expected to be very * rare). */ wrmsr(MSR_TSC_AUX, host_aux); } int vmx_rdmsr(struct vmx_vcpu *vcpu, u_int num, uint64_t *val, bool *retu) { int error; error = 0; switch (num) { case MSR_MCG_CAP: case MSR_MCG_STATUS: *val = 0; break; case MSR_MTRRcap: case MSR_MTRRdefType: case MSR_MTRR4kBase ... MSR_MTRR4kBase + 7: case MSR_MTRR16kBase ... MSR_MTRR16kBase + 1: case MSR_MTRR64kBase: case MSR_MTRRVarBase ... MSR_MTRRVarBase + (VMM_MTRR_VAR_MAX * 2) - 1: if (vm_rdmtrr(&vcpu->mtrr, num, val) != 0) { vm_inject_gp(vcpu->vcpu); } break; case MSR_IA32_MISC_ENABLE: *val = misc_enable; break; case MSR_PLATFORM_INFO: *val = platform_info; break; case MSR_TURBO_RATIO_LIMIT: case MSR_TURBO_RATIO_LIMIT1: *val = turbo_ratio_limit; break; case MSR_PAT: *val = vcpu->guest_msrs[IDX_MSR_PAT]; break; default: error = EINVAL; break; } return (error); } int vmx_wrmsr(struct vmx_vcpu *vcpu, u_int num, uint64_t val, bool *retu) { uint64_t changed; int error; error = 0; switch (num) { case MSR_MCG_CAP: case MSR_MCG_STATUS: break; /* ignore writes */ case MSR_MTRRcap: case MSR_MTRRdefType: case MSR_MTRR4kBase ... MSR_MTRR4kBase + 7: case MSR_MTRR16kBase ... MSR_MTRR16kBase + 1: case MSR_MTRR64kBase: case MSR_MTRRVarBase ... MSR_MTRRVarBase + (VMM_MTRR_VAR_MAX * 2) - 1: if (vm_wrmtrr(&vcpu->mtrr, num, val) != 0) { vm_inject_gp(vcpu->vcpu); } break; case MSR_IA32_MISC_ENABLE: changed = val ^ misc_enable; /* * If the host has disabled the NX feature then the guest * also cannot use it. However, a Linux guest will try to * enable the NX feature by writing to the MISC_ENABLE MSR. * * This can be safely ignored because the memory management * code looks at CPUID.80000001H:EDX.NX to check if the * functionality is actually enabled. */ changed &= ~(1UL << 34); /* * Punt to userspace if any other bits are being modified. */ if (changed) error = EINVAL; break; case MSR_PAT: if (pat_valid(val)) vcpu->guest_msrs[IDX_MSR_PAT] = val; else vm_inject_gp(vcpu->vcpu); break; case MSR_TSC: error = vmx_set_tsc_offset(vcpu, val - rdtsc()); break; case MSR_TSC_AUX: if (vmx_have_msr_tsc_aux) /* * vmx_msr_guest_enter_tsc_aux() will apply this * value when it is called immediately before guest * entry. */ vcpu->guest_msrs[IDX_MSR_TSC_AUX] = val; else vm_inject_gp(vcpu->vcpu); break; default: error = EINVAL; break; } return (error); }