Diffstat (limited to 'arch/x86')
-rw-r--r--  arch/x86/Kconfig | 13
-rw-r--r--  arch/x86/boot/compressed/sev.c | 6
-rw-r--r--  arch/x86/entry/entry_64.S | 35
-rw-r--r--  arch/x86/events/core.c | 10
-rw-r--r--  arch/x86/events/perf_event.h | 2
-rw-r--r--  arch/x86/include/asm/efi.h | 2
-rw-r--r--  arch/x86/include/asm/intel-family.h | 2
-rw-r--r--  arch/x86/include/asm/io.h | 20
-rw-r--r--  arch/x86/include/asm/irqflags.h | 7
-rw-r--r--  arch/x86/include/asm/kvm-x86-ops.h | 1
-rw-r--r--  arch/x86/include/asm/kvm_host.h | 4
-rw-r--r--  arch/x86/include/asm/paravirt.h | 5
-rw-r--r--  arch/x86/include/asm/pkru.h | 4
-rw-r--r--  arch/x86/include/asm/processor.h | 8
-rw-r--r--  arch/x86/include/asm/set_memory.h | 4
-rw-r--r--  arch/x86/include/asm/sev-common.h | 66
-rw-r--r--  arch/x86/kernel/Makefile | 2
-rw-r--r--  arch/x86/kernel/cc_platform.c | 8
-rw-r--r--  arch/x86/kernel/cpu/resctrl/monitor.c | 2
-rw-r--r--  arch/x86/kernel/cpu/sgx/main.c | 162
-rw-r--r--  arch/x86/kernel/cpu/sgx/sgx.h | 8
-rw-r--r--  arch/x86/kernel/fpu/core.c | 21
-rw-r--r--  arch/x86/kernel/fpu/signal.c | 2
-rw-r--r--  arch/x86/kernel/head64.c | 60
-rw-r--r--  arch/x86/kernel/setup.c | 72
-rw-r--r--  arch/x86/kernel/sev-shared.c | 2
-rw-r--r--  arch/x86/kernel/sev.c | 68
-rw-r--r--  arch/x86/kernel/smpboot.c | 14
-rw-r--r--  arch/x86/kernel/tsc.c | 28
-rw-r--r--  arch/x86/kernel/tsc_sync.c | 41
-rw-r--r--  arch/x86/kvm/debugfs.c | 3
-rw-r--r--  arch/x86/kvm/hyperv.c | 7
-rw-r--r--  arch/x86/kvm/ioapic.h | 1
-rw-r--r--  arch/x86/kvm/irq.h | 1
-rw-r--r--  arch/x86/kvm/lapic.c | 2
-rw-r--r--  arch/x86/kvm/mmu/mmu.c | 134
-rw-r--r--  arch/x86/kvm/mmu/paging_tmpl.h | 3
-rw-r--r--  arch/x86/kvm/mmu/tdp_iter.c | 6
-rw-r--r--  arch/x86/kvm/mmu/tdp_iter.h | 6
-rw-r--r--  arch/x86/kvm/mmu/tdp_mmu.c | 67
-rw-r--r--  arch/x86/kvm/mmu/tdp_mmu.h | 5
-rw-r--r--  arch/x86/kvm/svm/avic.c | 17
-rw-r--r--  arch/x86/kvm/svm/pmu.c | 2
-rw-r--r--  arch/x86/kvm/svm/sev.c | 263
-rw-r--r--  arch/x86/kvm/svm/svm.c | 22
-rw-r--r--  arch/x86/kvm/svm/svm.h | 1
-rw-r--r--  arch/x86/kvm/vmx/nested.c | 53
-rw-r--r--  arch/x86/kvm/vmx/posted_intr.c | 20
-rw-r--r--  arch/x86/kvm/vmx/vmx.c | 134
-rw-r--r--  arch/x86/kvm/x86.c | 100
-rw-r--r--  arch/x86/kvm/x86.h | 7
-rw-r--r--  arch/x86/mm/Makefile | 7
-rw-r--r--  arch/x86/mm/mem_encrypt.c | 441
-rw-r--r--  arch/x86/mm/mem_encrypt_amd.c | 438
-rw-r--r--  arch/x86/net/bpf_jit_comp.c | 51
-rw-r--r--  arch/x86/platform/ce4100/falconfalls.dts | 4
-rw-r--r--  arch/x86/platform/efi/quirks.c | 3
-rw-r--r--  arch/x86/realmode/init.c | 12
-rw-r--r--  arch/x86/tools/relocs.c | 2
-rw-r--r--  arch/x86/xen/xen-asm.S | 20
60 files changed, 1513 insertions, 998 deletions
diff --git a/arch/x86/Kconfig b/arch/x86/Kconfig
index 7399327d1eff..8910b09b5601 100644
--- a/arch/x86/Kconfig
+++ b/arch/x86/Kconfig
@@ -269,6 +269,7 @@ config X86
select HAVE_ARCH_KCSAN if X86_64
select X86_FEATURE_NAMES if PROC_FS
select PROC_PID_ARCH_STATUS if PROC_FS
+ select HAVE_ARCH_NODE_DEV_GROUP if X86_SGX
imply IMA_SECURE_AND_OR_TRUSTED_BOOT if EFI
config INSTRUCTION_DECODER
@@ -1523,16 +1524,20 @@ config X86_CPA_STATISTICS
helps to determine the effectiveness of preserving large and huge
page mappings when mapping protections are changed.
+config X86_MEM_ENCRYPT
+ select ARCH_HAS_FORCE_DMA_UNENCRYPTED
+ select DYNAMIC_PHYSICAL_MASK
+ select ARCH_HAS_RESTRICTED_VIRTIO_MEMORY_ACCESS
+ def_bool n
+
config AMD_MEM_ENCRYPT
bool "AMD Secure Memory Encryption (SME) support"
depends on X86_64 && CPU_SUP_AMD
select DMA_COHERENT_POOL
- select DYNAMIC_PHYSICAL_MASK
select ARCH_USE_MEMREMAP_PROT
- select ARCH_HAS_FORCE_DMA_UNENCRYPTED
select INSTRUCTION_DECODER
- select ARCH_HAS_RESTRICTED_VIRTIO_MEMORY_ACCESS
select ARCH_HAS_CC_PLATFORM
+ select X86_MEM_ENCRYPT
help
Say yes to enable support for the encryption of system memory.
This requires an AMD processor that supports Secure Memory
@@ -1917,6 +1922,7 @@ config X86_SGX
select SRCU
select MMU_NOTIFIER
select NUMA_KEEP_MEMINFO if NUMA
+ select XARRAY_MULTI
help
Intel(R) Software Guard eXtensions (SGX) is a set of CPU instructions
that can be used by applications to set aside private regions of code
@@ -1932,6 +1938,7 @@ config EFI
depends on ACPI
select UCS2_STRING
select EFI_RUNTIME_WRAPPERS
+ select ARCH_USE_MEMREMAP_PROT
help
This enables the kernel to use EFI runtime services that are
available (such as the EFI variable services).
diff --git a/arch/x86/boot/compressed/sev.c b/arch/x86/boot/compressed/sev.c
index 670e998fe930..28bcf04c022e 100644
--- a/arch/x86/boot/compressed/sev.c
+++ b/arch/x86/boot/compressed/sev.c
@@ -122,7 +122,7 @@ static enum es_result vc_read_mem(struct es_em_ctxt *ctxt,
static bool early_setup_sev_es(void)
{
if (!sev_es_negotiate_protocol())
- sev_es_terminate(GHCB_SEV_ES_REASON_PROTOCOL_UNSUPPORTED);
+ sev_es_terminate(GHCB_SEV_ES_PROT_UNSUPPORTED);
if (set_page_decrypted((unsigned long)&boot_ghcb_page))
return false;
@@ -175,7 +175,7 @@ void do_boot_stage2_vc(struct pt_regs *regs, unsigned long exit_code)
enum es_result result;
if (!boot_ghcb && !early_setup_sev_es())
- sev_es_terminate(GHCB_SEV_ES_REASON_GENERAL_REQUEST);
+ sev_es_terminate(GHCB_SEV_ES_GEN_REQ);
vc_ghcb_invalidate(boot_ghcb);
result = vc_init_em_ctxt(&ctxt, regs, exit_code);
@@ -202,5 +202,5 @@ finish:
if (result == ES_OK)
vc_finish_insn(&ctxt);
else if (result != ES_RETRY)
- sev_es_terminate(GHCB_SEV_ES_REASON_GENERAL_REQUEST);
+ sev_es_terminate(GHCB_SEV_ES_GEN_REQ);
}
diff --git a/arch/x86/entry/entry_64.S b/arch/x86/entry/entry_64.S
index e38a4cf795d9..97b1f84bb53f 100644
--- a/arch/x86/entry/entry_64.S
+++ b/arch/x86/entry/entry_64.S
@@ -574,6 +574,10 @@ SYM_INNER_LABEL(swapgs_restore_regs_and_return_to_usermode, SYM_L_GLOBAL)
ud2
1:
#endif
+#ifdef CONFIG_XEN_PV
+ ALTERNATIVE "", "jmp xenpv_restore_regs_and_return_to_usermode", X86_FEATURE_XENPV
+#endif
+
POP_REGS pop_rdi=0
/*
@@ -890,6 +894,7 @@ SYM_CODE_START_LOCAL(paranoid_entry)
.Lparanoid_entry_checkgs:
/* EBX = 1 -> kernel GSBASE active, no restore required */
movl $1, %ebx
+
/*
* The kernel-enforced convention is a negative GSBASE indicates
* a kernel value. No SWAPGS needed on entry and exit.
@@ -897,21 +902,14 @@ SYM_CODE_START_LOCAL(paranoid_entry)
movl $MSR_GS_BASE, %ecx
rdmsr
testl %edx, %edx
- jns .Lparanoid_entry_swapgs
- ret
+ js .Lparanoid_kernel_gsbase
-.Lparanoid_entry_swapgs:
+ /* EBX = 0 -> SWAPGS required on exit */
+ xorl %ebx, %ebx
swapgs
+.Lparanoid_kernel_gsbase:
- /*
- * The above SAVE_AND_SWITCH_TO_KERNEL_CR3 macro doesn't do an
- * unconditional CR3 write, even in the PTI case. So do an lfence
- * to prevent GS speculation, regardless of whether PTI is enabled.
- */
FENCE_SWAPGS_KERNEL_ENTRY
-
- /* EBX = 0 -> SWAPGS required on exit */
- xorl %ebx, %ebx
ret
SYM_CODE_END(paranoid_entry)
@@ -993,11 +991,6 @@ SYM_CODE_START_LOCAL(error_entry)
pushq %r12
ret
-.Lerror_entry_done_lfence:
- FENCE_SWAPGS_KERNEL_ENTRY
-.Lerror_entry_done:
- ret
-
/*
* There are two places in the kernel that can potentially fault with
* usergs. Handle them here. B stepping K8s sometimes report a
@@ -1020,8 +1013,14 @@ SYM_CODE_START_LOCAL(error_entry)
* .Lgs_change's error handler with kernel gsbase.
*/
SWAPGS
- FENCE_SWAPGS_USER_ENTRY
- jmp .Lerror_entry_done
+
+ /*
+ * Issue an LFENCE to prevent GS speculation, regardless of whether it is a
+ * kernel or user gsbase.
+ */
+.Lerror_entry_done_lfence:
+ FENCE_SWAPGS_KERNEL_ENTRY
+ ret
.Lbstep_iret:
/* Fix truncated RIP */
diff --git a/arch/x86/events/core.c b/arch/x86/events/core.c
index 38b2c779146f..68dea7ce6a22 100644
--- a/arch/x86/events/core.c
+++ b/arch/x86/events/core.c
@@ -2476,7 +2476,7 @@ static int x86_pmu_event_init(struct perf_event *event)
if (READ_ONCE(x86_pmu.attr_rdpmc) &&
!(event->hw.flags & PERF_X86_EVENT_LARGE_PEBS))
- event->hw.flags |= PERF_X86_EVENT_RDPMC_ALLOWED;
+ event->hw.flags |= PERF_EVENT_FLAG_USER_READ_CNT;
return err;
}
@@ -2510,7 +2510,7 @@ void perf_clear_dirty_counters(void)
static void x86_pmu_event_mapped(struct perf_event *event, struct mm_struct *mm)
{
- if (!(event->hw.flags & PERF_X86_EVENT_RDPMC_ALLOWED))
+ if (!(event->hw.flags & PERF_EVENT_FLAG_USER_READ_CNT))
return;
/*
@@ -2531,7 +2531,7 @@ static void x86_pmu_event_mapped(struct perf_event *event, struct mm_struct *mm)
static void x86_pmu_event_unmapped(struct perf_event *event, struct mm_struct *mm)
{
- if (!(event->hw.flags & PERF_X86_EVENT_RDPMC_ALLOWED))
+ if (!(event->hw.flags & PERF_EVENT_FLAG_USER_READ_CNT))
return;
if (atomic_dec_and_test(&mm->context.perf_rdpmc_allowed))
@@ -2542,7 +2542,7 @@ static int x86_pmu_event_idx(struct perf_event *event)
{
struct hw_perf_event *hwc = &event->hw;
- if (!(hwc->flags & PERF_X86_EVENT_RDPMC_ALLOWED))
+ if (!(hwc->flags & PERF_EVENT_FLAG_USER_READ_CNT))
return 0;
if (is_metric_idx(hwc->idx))
@@ -2725,7 +2725,7 @@ void arch_perf_update_userpage(struct perf_event *event,
userpg->cap_user_time = 0;
userpg->cap_user_time_zero = 0;
userpg->cap_user_rdpmc =
- !!(event->hw.flags & PERF_X86_EVENT_RDPMC_ALLOWED);
+ !!(event->hw.flags & PERF_EVENT_FLAG_USER_READ_CNT);
userpg->pmc_width = x86_pmu.cntval_bits;
if (!using_native_sched_clock() || !sched_clock_stable())
diff --git a/arch/x86/events/perf_event.h b/arch/x86/events/perf_event.h
index 5480db242083..9d376e528dfc 100644
--- a/arch/x86/events/perf_event.h
+++ b/arch/x86/events/perf_event.h
@@ -74,7 +74,7 @@ static inline bool constraint_match(struct event_constraint *c, u64 ecode)
#define PERF_X86_EVENT_PEBS_NA_HSW 0x0010 /* haswell style datala, unknown */
#define PERF_X86_EVENT_EXCL 0x0020 /* HT exclusivity on counter */
#define PERF_X86_EVENT_DYNAMIC 0x0040 /* dynamic alloc'd constraint */
-#define PERF_X86_EVENT_RDPMC_ALLOWED 0x0080 /* grant rdpmc permission */
+
#define PERF_X86_EVENT_EXCL_ACCT 0x0100 /* accounted EXCL event */
#define PERF_X86_EVENT_AUTO_RELOAD 0x0200 /* use PEBS auto-reload */
#define PERF_X86_EVENT_LARGE_PEBS 0x0400 /* use large PEBS */
diff --git a/arch/x86/include/asm/efi.h b/arch/x86/include/asm/efi.h
index 4d0b126835b8..63158fd55856 100644
--- a/arch/x86/include/asm/efi.h
+++ b/arch/x86/include/asm/efi.h
@@ -197,8 +197,6 @@ static inline bool efi_runtime_supported(void)
extern void parse_efi_setup(u64 phys_addr, u32 data_len);
-extern void efifb_setup_from_dmi(struct screen_info *si, const char *opt);
-
extern void efi_thunk_runtime_setup(void);
efi_status_t efi_set_virtual_address_map(unsigned long memory_map_size,
unsigned long descriptor_size,
diff --git a/arch/x86/include/asm/intel-family.h b/arch/x86/include/asm/intel-family.h
index 5a0bcf8b78d7..048b6d5aff50 100644
--- a/arch/x86/include/asm/intel-family.h
+++ b/arch/x86/include/asm/intel-family.h
@@ -108,7 +108,7 @@
#define INTEL_FAM6_ALDERLAKE 0x97 /* Golden Cove / Gracemont */
#define INTEL_FAM6_ALDERLAKE_L 0x9A /* Golden Cove / Gracemont */
-#define INTEL_FAM6_RAPTOR_LAKE 0xB7
+#define INTEL_FAM6_RAPTORLAKE 0xB7
/* "Small Core" Processors (Atom) */
diff --git a/arch/x86/include/asm/io.h b/arch/x86/include/asm/io.h
index 5c6a4af0b911..f6d91ecb8026 100644
--- a/arch/x86/include/asm/io.h
+++ b/arch/x86/include/asm/io.h
@@ -40,6 +40,7 @@
#include <linux/string.h>
#include <linux/compiler.h>
+#include <linux/cc_platform.h>
#include <asm/page.h>
#include <asm/early_ioremap.h>
#include <asm/pgtable_types.h>
@@ -256,21 +257,6 @@ static inline void slow_down_io(void)
#endif
-#ifdef CONFIG_AMD_MEM_ENCRYPT
-#include <linux/jump_label.h>
-
-extern struct static_key_false sev_enable_key;
-static inline bool sev_key_active(void)
-{
- return static_branch_unlikely(&sev_enable_key);
-}
-
-#else /* !CONFIG_AMD_MEM_ENCRYPT */
-
-static inline bool sev_key_active(void) { return false; }
-
-#endif /* CONFIG_AMD_MEM_ENCRYPT */
-
#define BUILDIO(bwl, bw, type) \
static inline void out##bwl(unsigned type value, int port) \
{ \
@@ -301,7 +287,7 @@ static inline unsigned type in##bwl##_p(int port) \
\
static inline void outs##bwl(int port, const void *addr, unsigned long count) \
{ \
- if (sev_key_active()) { \
+ if (cc_platform_has(CC_ATTR_GUEST_UNROLL_STRING_IO)) { \
unsigned type *value = (unsigned type *)addr; \
while (count) { \
out##bwl(*value, port); \
@@ -317,7 +303,7 @@ static inline void outs##bwl(int port, const void *addr, unsigned long count) \
\
static inline void ins##bwl(int port, void *addr, unsigned long count) \
{ \
- if (sev_key_active()) { \
+ if (cc_platform_has(CC_ATTR_GUEST_UNROLL_STRING_IO)) { \
unsigned type *value = (unsigned type *)addr; \
while (count) { \
*value = in##bwl(port); \
diff --git a/arch/x86/include/asm/irqflags.h b/arch/x86/include/asm/irqflags.h
index c5ce9845c999..87761396e8cc 100644
--- a/arch/x86/include/asm/irqflags.h
+++ b/arch/x86/include/asm/irqflags.h
@@ -114,8 +114,6 @@ static __always_inline unsigned long arch_local_irq_save(void)
#define SAVE_FLAGS pushfq; popq %rax
#endif
-#define INTERRUPT_RETURN jmp native_iret
-
#endif
#endif /* __ASSEMBLY__ */
@@ -143,8 +141,13 @@ static __always_inline void arch_local_irq_restore(unsigned long flags)
#ifdef CONFIG_X86_64
#ifdef CONFIG_XEN_PV
#define SWAPGS ALTERNATIVE "swapgs", "", X86_FEATURE_XENPV
+#define INTERRUPT_RETURN \
+ ANNOTATE_RETPOLINE_SAFE; \
+ ALTERNATIVE_TERNARY("jmp *paravirt_iret(%rip);", \
+ X86_FEATURE_XENPV, "jmp xen_iret;", "jmp native_iret;")
#else
#define SWAPGS swapgs
+#define INTERRUPT_RETURN jmp native_iret
#endif
#endif
#endif /* !__ASSEMBLY__ */
diff --git a/arch/x86/include/asm/kvm-x86-ops.h b/arch/x86/include/asm/kvm-x86-ops.h
index cefe1d81e2e8..9e50da3ed01a 100644
--- a/arch/x86/include/asm/kvm-x86-ops.h
+++ b/arch/x86/include/asm/kvm-x86-ops.h
@@ -47,6 +47,7 @@ KVM_X86_OP(set_dr7)
KVM_X86_OP(cache_reg)
KVM_X86_OP(get_rflags)
KVM_X86_OP(set_rflags)
+KVM_X86_OP(get_if_flag)
KVM_X86_OP(tlb_flush_all)
KVM_X86_OP(tlb_flush_current)
KVM_X86_OP_NULL(tlb_remote_flush)
diff --git a/arch/x86/include/asm/kvm_host.h b/arch/x86/include/asm/kvm_host.h
index 6ac61f85e07b..555f4de47ef2 100644
--- a/arch/x86/include/asm/kvm_host.h
+++ b/arch/x86/include/asm/kvm_host.h
@@ -97,7 +97,7 @@
KVM_ARCH_REQ_FLAGS(25, KVM_REQUEST_WAIT | KVM_REQUEST_NO_WAKEUP)
#define KVM_REQ_TLB_FLUSH_CURRENT KVM_ARCH_REQ(26)
#define KVM_REQ_TLB_FLUSH_GUEST \
- KVM_ARCH_REQ_FLAGS(27, KVM_REQUEST_NO_WAKEUP)
+ KVM_ARCH_REQ_FLAGS(27, KVM_REQUEST_WAIT | KVM_REQUEST_NO_WAKEUP)
#define KVM_REQ_APF_READY KVM_ARCH_REQ(28)
#define KVM_REQ_MSR_FILTER_CHANGED KVM_ARCH_REQ(29)
#define KVM_REQ_UPDATE_CPU_DIRTY_LOGGING \
@@ -1036,6 +1036,7 @@ struct kvm_x86_msr_filter {
#define APICV_INHIBIT_REASON_PIT_REINJ 4
#define APICV_INHIBIT_REASON_X2APIC 5
#define APICV_INHIBIT_REASON_BLOCKIRQ 6
+#define APICV_INHIBIT_REASON_ABSENT 7
struct kvm_arch {
unsigned long n_used_mmu_pages;
@@ -1348,6 +1349,7 @@ struct kvm_x86_ops {
void (*cache_reg)(struct kvm_vcpu *vcpu, enum kvm_reg reg);
unsigned long (*get_rflags)(struct kvm_vcpu *vcpu);
void (*set_rflags)(struct kvm_vcpu *vcpu, unsigned long rflags);
+ bool (*get_if_flag)(struct kvm_vcpu *vcpu);
void (*tlb_flush_all)(struct kvm_vcpu *vcpu);
void (*tlb_flush_current)(struct kvm_vcpu *vcpu);
diff --git a/arch/x86/include/asm/paravirt.h b/arch/x86/include/asm/paravirt.h
index 21c4a694ca11..27d276232c80 100644
--- a/arch/x86/include/asm/paravirt.h
+++ b/arch/x86/include/asm/paravirt.h
@@ -752,11 +752,6 @@ extern void default_banner(void);
#define PARA_SITE(ptype, ops) _PVSITE(ptype, ops, .quad, 8)
#define PARA_INDIRECT(addr) *addr(%rip)
-#define INTERRUPT_RETURN \
- ANNOTATE_RETPOLINE_SAFE; \
- ALTERNATIVE_TERNARY("jmp *paravirt_iret(%rip);", \
- X86_FEATURE_XENPV, "jmp xen_iret;", "jmp native_iret;")
-
#ifdef CONFIG_DEBUG_ENTRY
.macro PARA_IRQ_save_fl
PARA_SITE(PARA_PATCH(PV_IRQ_save_fl),
diff --git a/arch/x86/include/asm/pkru.h b/arch/x86/include/asm/pkru.h
index 4cd49afa0ca4..74f0a2d34ffd 100644
--- a/arch/x86/include/asm/pkru.h
+++ b/arch/x86/include/asm/pkru.h
@@ -4,8 +4,8 @@
#include <asm/cpufeature.h>
-#define PKRU_AD_BIT 0x1
-#define PKRU_WD_BIT 0x2
+#define PKRU_AD_BIT 0x1u
+#define PKRU_WD_BIT 0x2u
#define PKRU_BITS_PER_PKEY 2
#ifdef CONFIG_X86_INTEL_MEMORY_PROTECTION_KEYS
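The only functional point of the 'u' suffix added above is to keep the per-key shift in unsigned arithmetic: for pkey 15 the write-disable bit lands in bit 31, which a signed int shift cannot reach without overflow. A stand-alone userspace sketch (not part of the commit; the constants are copied here only for illustration):

#include <stdint.h>
#include <stdio.h>

#define PKRU_AD_BIT             0x1u
#define PKRU_WD_BIT             0x2u
#define PKRU_BITS_PER_PKEY      2

int main(void)
{
        int pkey = 15;
        int pkey_shift = pkey * PKRU_BITS_PER_PKEY;

        /*
         * With the 'u' suffix this is a well-defined unsigned shift into
         * bit 31; a plain signed 0x2 shifted by 30 would overflow an int.
         */
        uint32_t wd_mask = PKRU_WD_BIT << pkey_shift;

        printf("pkey %d write-disable mask: 0x%08x\n", pkey, wd_mask);
        return 0;
}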
diff --git a/arch/x86/include/asm/processor.h b/arch/x86/include/asm/processor.h
index 355d38c0cf60..2c5f12ae7d04 100644
--- a/arch/x86/include/asm/processor.h
+++ b/arch/x86/include/asm/processor.h
@@ -855,4 +855,12 @@ enum mds_mitigations {
MDS_MITIGATION_VMWERV,
};
+#ifdef CONFIG_X86_SGX
+int arch_memory_failure(unsigned long pfn, int flags);
+#define arch_memory_failure arch_memory_failure
+
+bool arch_is_platform_page(u64 paddr);
+#define arch_is_platform_page arch_is_platform_page
+#endif
+
#endif /* _ASM_X86_PROCESSOR_H */
diff --git a/arch/x86/include/asm/set_memory.h b/arch/x86/include/asm/set_memory.h
index 872617542bbc..ff0f2d90338a 100644
--- a/arch/x86/include/asm/set_memory.h
+++ b/arch/x86/include/asm/set_memory.h
@@ -2,6 +2,7 @@
#ifndef _ASM_X86_SET_MEMORY_H
#define _ASM_X86_SET_MEMORY_H
+#include <linux/mm.h>
#include <asm/page.h>
#include <asm-generic/set_memory.h>
@@ -99,6 +100,9 @@ static inline int set_mce_nospec(unsigned long pfn, bool unmap)
unsigned long decoy_addr;
int rc;
+ /* SGX pages are not in the 1:1 map */
+ if (arch_is_platform_page(pfn << PAGE_SHIFT))
+ return 0;
/*
* We would like to just call:
* set_memory_XX((unsigned long)pfn_to_kaddr(pfn), 1);
diff --git a/arch/x86/include/asm/sev-common.h b/arch/x86/include/asm/sev-common.h
index 2cef6c5a52c2..1b2fd32b42fe 100644
--- a/arch/x86/include/asm/sev-common.h
+++ b/arch/x86/include/asm/sev-common.h
@@ -18,20 +18,19 @@
/* SEV Information Request/Response */
#define GHCB_MSR_SEV_INFO_RESP 0x001
#define GHCB_MSR_SEV_INFO_REQ 0x002
-#define GHCB_MSR_VER_MAX_POS 48
-#define GHCB_MSR_VER_MAX_MASK 0xffff
-#define GHCB_MSR_VER_MIN_POS 32
-#define GHCB_MSR_VER_MIN_MASK 0xffff
-#define GHCB_MSR_CBIT_POS 24
-#define GHCB_MSR_CBIT_MASK 0xff
-#define GHCB_MSR_SEV_INFO(_max, _min, _cbit) \
- ((((_max) & GHCB_MSR_VER_MAX_MASK) << GHCB_MSR_VER_MAX_POS) | \
- (((_min) & GHCB_MSR_VER_MIN_MASK) << GHCB_MSR_VER_MIN_POS) | \
- (((_cbit) & GHCB_MSR_CBIT_MASK) << GHCB_MSR_CBIT_POS) | \
+
+#define GHCB_MSR_SEV_INFO(_max, _min, _cbit) \
+ /* GHCBData[63:48] */ \
+ ((((_max) & 0xffff) << 48) | \
+ /* GHCBData[47:32] */ \
+ (((_min) & 0xffff) << 32) | \
+ /* GHCBData[31:24] */ \
+ (((_cbit) & 0xff) << 24) | \
GHCB_MSR_SEV_INFO_RESP)
+
#define GHCB_MSR_INFO(v) ((v) & 0xfffUL)
-#define GHCB_MSR_PROTO_MAX(v) (((v) >> GHCB_MSR_VER_MAX_POS) & GHCB_MSR_VER_MAX_MASK)
-#define GHCB_MSR_PROTO_MIN(v) (((v) >> GHCB_MSR_VER_MIN_POS) & GHCB_MSR_VER_MIN_MASK)
+#define GHCB_MSR_PROTO_MAX(v) (((v) >> 48) & 0xffff)
+#define GHCB_MSR_PROTO_MIN(v) (((v) >> 32) & 0xffff)
/* CPUID Request/Response */
#define GHCB_MSR_CPUID_REQ 0x004
@@ -46,31 +45,48 @@
#define GHCB_CPUID_REQ_EBX 1
#define GHCB_CPUID_REQ_ECX 2
#define GHCB_CPUID_REQ_EDX 3
-#define GHCB_CPUID_REQ(fn, reg) \
- (GHCB_MSR_CPUID_REQ | \
- (((unsigned long)reg & GHCB_MSR_CPUID_REG_MASK) << GHCB_MSR_CPUID_REG_POS) | \
- (((unsigned long)fn) << GHCB_MSR_CPUID_FUNC_POS))
+#define GHCB_CPUID_REQ(fn, reg) \
+ /* GHCBData[11:0] */ \
+ (GHCB_MSR_CPUID_REQ | \
+ /* GHCBData[31:12] */ \
+ (((unsigned long)(reg) & 0x3) << 30) | \
+ /* GHCBData[63:32] */ \
+ (((unsigned long)fn) << 32))
/* AP Reset Hold */
-#define GHCB_MSR_AP_RESET_HOLD_REQ 0x006
-#define GHCB_MSR_AP_RESET_HOLD_RESP 0x007
+#define GHCB_MSR_AP_RESET_HOLD_REQ 0x006
+#define GHCB_MSR_AP_RESET_HOLD_RESP 0x007
/* GHCB Hypervisor Feature Request/Response */
-#define GHCB_MSR_HV_FT_REQ 0x080
-#define GHCB_MSR_HV_FT_RESP 0x081
+#define GHCB_MSR_HV_FT_REQ 0x080
+#define GHCB_MSR_HV_FT_RESP 0x081
#define GHCB_MSR_TERM_REQ 0x100
#define GHCB_MSR_TERM_REASON_SET_POS 12
#define GHCB_MSR_TERM_REASON_SET_MASK 0xf
#define GHCB_MSR_TERM_REASON_POS 16
#define GHCB_MSR_TERM_REASON_MASK 0xff
-#define GHCB_SEV_TERM_REASON(reason_set, reason_val) \
- (((((u64)reason_set) & GHCB_MSR_TERM_REASON_SET_MASK) << GHCB_MSR_TERM_REASON_SET_POS) | \
- ((((u64)reason_val) & GHCB_MSR_TERM_REASON_MASK) << GHCB_MSR_TERM_REASON_POS))
-#define GHCB_SEV_ES_REASON_GENERAL_REQUEST 0
-#define GHCB_SEV_ES_REASON_PROTOCOL_UNSUPPORTED 1
+#define GHCB_SEV_TERM_REASON(reason_set, reason_val) \
+ /* GHCBData[15:12] */ \
+ (((((u64)reason_set) & 0xf) << 12) | \
+ /* GHCBData[23:16] */ \
+ ((((u64)reason_val) & 0xff) << 16))
+
+#define GHCB_SEV_ES_GEN_REQ 0
+#define GHCB_SEV_ES_PROT_UNSUPPORTED 1
#define GHCB_RESP_CODE(v) ((v) & GHCB_MSR_INFO_MASK)
+/*
+ * Error codes related to GHCB input that can be communicated back to the guest
+ * by setting the lower 32-bits of the GHCB SW_EXITINFO1 field to 2.
+ */
+#define GHCB_ERR_NOT_REGISTERED 1
+#define GHCB_ERR_INVALID_USAGE 2
+#define GHCB_ERR_INVALID_SCRATCH_AREA 3
+#define GHCB_ERR_MISSING_INPUT 4
+#define GHCB_ERR_INVALID_INPUT 5
+#define GHCB_ERR_INVALID_EVENT 6
+
#endif
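The rewritten macros above document the GHCBData bit ranges inline instead of going through *_POS/*_MASK constants. As a sanity check of that layout, here is a stand-alone userspace sketch (not part of the commit) that packs an SEV information response the same way and unpacks it as GHCB_MSR_INFO()/GHCB_MSR_PROTO_MAX()/GHCB_MSR_PROTO_MIN() would:

#include <assert.h>
#include <stdint.h>
#include <stdio.h>

#define SEV_INFO_RESP 0x001ULL          /* GHCB_MSR_SEV_INFO_RESP */

static uint64_t sev_info(uint64_t max, uint64_t min, uint64_t cbit)
{
        return ((max & 0xffff) << 48) | ((min & 0xffff) << 32) |
               ((cbit & 0xff) << 24) | SEV_INFO_RESP;
}

int main(void)
{
        uint64_t msr = sev_info(2, 1, 0x33);    /* hypothetical response values */

        assert((msr & 0xfff) == SEV_INFO_RESP);         /* GHCB_MSR_INFO()      */
        assert(((msr >> 48) & 0xffff) == 2);            /* GHCB_MSR_PROTO_MAX() */
        assert(((msr >> 32) & 0xffff) == 1);            /* GHCB_MSR_PROTO_MIN() */
        printf("GHCBData = 0x%016llx\n", (unsigned long long)msr);
        return 0;
}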
diff --git a/arch/x86/kernel/Makefile b/arch/x86/kernel/Makefile
index 2ff3e600f426..6aef9ee28a39 100644
--- a/arch/x86/kernel/Makefile
+++ b/arch/x86/kernel/Makefile
@@ -84,7 +84,7 @@ obj-$(CONFIG_IA32_EMULATION) += tls.o
obj-y += step.o
obj-$(CONFIG_INTEL_TXT) += tboot.o
obj-$(CONFIG_ISA_DMA_API) += i8237.o
-obj-$(CONFIG_STACKTRACE) += stacktrace.o
+obj-y += stacktrace.o
obj-y += cpu/
obj-y += acpi/
obj-y += reboot.o
diff --git a/arch/x86/kernel/cc_platform.c b/arch/x86/kernel/cc_platform.c
index 03bb2f343ddb..8a25b1c0d480 100644
--- a/arch/x86/kernel/cc_platform.c
+++ b/arch/x86/kernel/cc_platform.c
@@ -50,6 +50,14 @@ static bool amd_cc_platform_has(enum cc_attr attr)
case CC_ATTR_GUEST_STATE_ENCRYPT:
return sev_status & MSR_AMD64_SEV_ES_ENABLED;
+ /*
+ * With SEV, the rep string I/O instructions need to be unrolled
+ * but SEV-ES supports them through the #VC handler.
+ */
+ case CC_ATTR_GUEST_UNROLL_STRING_IO:
+ return (sev_status & MSR_AMD64_SEV_ENABLED) &&
+ !(sev_status & MSR_AMD64_SEV_ES_ENABLED);
+
default:
return false;
}
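The new CC_ATTR_GUEST_UNROLL_STRING_IO case encodes "plain SEV yes, SEV-ES no", because SEV-ES emulates rep ins/outs through the #VC handler instead of requiring unrolling. A toy model of just that predicate (userspace sketch with made-up flag values, not the kernel's MSR_AMD64_SEV_* bits):

#include <stdbool.h>
#include <stdint.h>
#include <stdio.h>

#define SEV_ENABLED     (1u << 0)       /* made-up flag values for illustration */
#define SEV_ES_ENABLED  (1u << 1)

static bool guest_unroll_string_io(uint32_t sev_status)
{
        /* Unroll only for plain SEV; SEV-ES handles rep ins/outs via #VC. */
        return (sev_status & SEV_ENABLED) && !(sev_status & SEV_ES_ENABLED);
}

int main(void)
{
        printf("bare metal: %d\n", guest_unroll_string_io(0));
        printf("SEV:        %d\n", guest_unroll_string_io(SEV_ENABLED));
        printf("SEV-ES:     %d\n", guest_unroll_string_io(SEV_ENABLED | SEV_ES_ENABLED));
        return 0;
}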
diff --git a/arch/x86/kernel/cpu/resctrl/monitor.c b/arch/x86/kernel/cpu/resctrl/monitor.c
index c9f0f3d63f75..eaf25a234ff5 100644
--- a/arch/x86/kernel/cpu/resctrl/monitor.c
+++ b/arch/x86/kernel/cpu/resctrl/monitor.c
@@ -282,7 +282,7 @@ static u64 mbm_overflow_count(u64 prev_msr, u64 cur_msr, unsigned int width)
u64 shift = 64 - width, chunks;
chunks = (cur_msr << shift) - (prev_msr << shift);
- return chunks >>= shift;
+ return chunks >> shift;
}
static u64 __mon_event_count(u32 rmid, struct rmid_read *rr)
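The monitor.c change only drops a needless compound assignment in the return statement; the double-shift trick itself is unchanged. A minimal userspace sketch of that trick (assuming a 24-bit MBM counter) showing how it yields the delta even across a counter wrap:

#include <assert.h>
#include <stdint.h>

/* Same double-shift trick as mbm_overflow_count(); width < 64 assumed. */
static uint64_t overflow_count(uint64_t prev, uint64_t cur, unsigned int width)
{
        uint64_t shift = 64 - width;

        return ((cur << shift) - (prev << shift)) >> shift;
}

int main(void)
{
        /* A 24-bit counter that wrapped from 0xfffff0 to 0x10 moved 0x20 chunks. */
        assert(overflow_count(0xfffff0, 0x000010, 24) == 0x20);
        return 0;
}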
diff --git a/arch/x86/kernel/cpu/sgx/main.c b/arch/x86/kernel/cpu/sgx/main.c
index 8471a8b9b48e..4b41efc9e367 100644
--- a/arch/x86/kernel/cpu/sgx/main.c
+++ b/arch/x86/kernel/cpu/sgx/main.c
@@ -6,11 +6,13 @@
#include <linux/highmem.h>
#include <linux/kthread.h>
#include <linux/miscdevice.h>
+#include <linux/node.h>
#include <linux/pagemap.h>
#include <linux/ratelimit.h>
#include <linux/sched/mm.h>
#include <linux/sched/signal.h>
#include <linux/slab.h>
+#include <linux/sysfs.h>
#include <asm/sgx.h>
#include "driver.h"
#include "encl.h"
@@ -20,6 +22,7 @@ struct sgx_epc_section sgx_epc_sections[SGX_MAX_EPC_SECTIONS];
static int sgx_nr_epc_sections;
static struct task_struct *ksgxd_tsk;
static DECLARE_WAIT_QUEUE_HEAD(ksgxd_waitq);
+static DEFINE_XARRAY(sgx_epc_address_space);
/*
* These variables are part of the state of the reclaimer, and must be accessed
@@ -60,6 +63,24 @@ static void __sgx_sanitize_pages(struct list_head *dirty_page_list)
page = list_first_entry(dirty_page_list, struct sgx_epc_page, list);
+ /*
+ * Checking page->poison without holding the node->lock
+ * is racy, but losing the race (i.e. poison is set just
+ * after the check) just means __eremove() will be uselessly
+ * called for a page that sgx_free_epc_page() will put onto
+ * the node->sgx_poison_page_list later.
+ */
+ if (page->poison) {
+ struct sgx_epc_section *section = &sgx_epc_sections[page->section];
+ struct sgx_numa_node *node = section->node;
+
+ spin_lock(&node->lock);
+ list_move(&page->list, &node->sgx_poison_page_list);
+ spin_unlock(&node->lock);
+
+ continue;
+ }
+
ret = __eremove(sgx_get_epc_virt_addr(page));
if (!ret) {
/*
@@ -471,6 +492,7 @@ static struct sgx_epc_page *__sgx_alloc_epc_page_from_node(int nid)
page = list_first_entry(&node->free_page_list, struct sgx_epc_page, list);
list_del_init(&page->list);
+ page->flags = 0;
spin_unlock(&node->lock);
atomic_long_dec(&sgx_nr_free_pages);
@@ -624,7 +646,12 @@ void sgx_free_epc_page(struct sgx_epc_page *page)
spin_lock(&node->lock);
- list_add_tail(&page->list, &node->free_page_list);
+ page->owner = NULL;
+ if (page->poison)
+ list_add(&page->list, &node->sgx_poison_page_list);
+ else
+ list_add_tail(&page->list, &node->free_page_list);
+ page->flags = SGX_EPC_PAGE_IS_FREE;
spin_unlock(&node->lock);
atomic_long_inc(&sgx_nr_free_pages);
@@ -648,17 +675,102 @@ static bool __init sgx_setup_epc_section(u64 phys_addr, u64 size,
}
section->phys_addr = phys_addr;
+ xa_store_range(&sgx_epc_address_space, section->phys_addr,
+ phys_addr + size - 1, section, GFP_KERNEL);
for (i = 0; i < nr_pages; i++) {
section->pages[i].section = index;
section->pages[i].flags = 0;
section->pages[i].owner = NULL;
+ section->pages[i].poison = 0;
list_add_tail(&section->pages[i].list, &sgx_dirty_page_list);
}
return true;
}
+bool arch_is_platform_page(u64 paddr)
+{
+ return !!xa_load(&sgx_epc_address_space, paddr);
+}
+EXPORT_SYMBOL_GPL(arch_is_platform_page);
+
+static struct sgx_epc_page *sgx_paddr_to_page(u64 paddr)
+{
+ struct sgx_epc_section *section;
+
+ section = xa_load(&sgx_epc_address_space, paddr);
+ if (!section)
+ return NULL;
+
+ return &section->pages[PFN_DOWN(paddr - section->phys_addr)];
+}
+
+/*
+ * Called in process context to handle a hardware reported
+ * error in an SGX EPC page.
+ * If the MF_ACTION_REQUIRED bit is set in flags, then the
+ * context is the task that consumed the poison data. Otherwise
+ * this is called from a kernel thread unrelated to the page.
+ */
+int arch_memory_failure(unsigned long pfn, int flags)
+{
+ struct sgx_epc_page *page = sgx_paddr_to_page(pfn << PAGE_SHIFT);
+ struct sgx_epc_section *section;
+ struct sgx_numa_node *node;
+
+ /*
+ * mm/memory-failure.c calls this routine for all errors
+ * where there isn't a "struct page" for the address. But that
+ * includes other address ranges besides SGX.
+ */
+ if (!page)
+ return -ENXIO;
+
+ /*
+ * If poison was consumed synchronously, send a SIGBUS to
+ * the task. Hardware has already exited the SGX enclave and
+ * will not allow re-entry to an enclave that has a memory
+ * error. The signal may help the task understand why the
+ * enclave is broken.
+ */
+ if (flags & MF_ACTION_REQUIRED)
+ force_sig(SIGBUS);
+
+ section = &sgx_epc_sections[page->section];
+ node = section->node;
+
+ spin_lock(&node->lock);
+
+ /* Already poisoned? Nothing more to do */
+ if (page->poison)
+ goto out;
+
+ page->poison = 1;
+
+ /*
+ * If the page is on a free list, move it to the per-node
+ * poison page list.
+ */
+ if (page->flags & SGX_EPC_PAGE_IS_FREE) {
+ list_move(&page->list, &node->sgx_poison_page_list);
+ goto out;
+ }
+
+ /*
+ * TBD: Add additional plumbing to enable pre-emptive
+ * action for asynchronous poison notification. Until
+ * then just hope that the poison:
+ * a) is not accessed - sgx_free_epc_page() will deal with it
+ * when the user gives it back
+ * b) results in a recoverable machine check rather than
+ * a fatal one
+ */
+out:
+ spin_unlock(&node->lock);
+ return 0;
+}
+
/**
* A section metric is concatenated in a way that @low bits 12-31 define the
* bits 12-31 of the metric and @high bits 0-19 define the bits 32-51 of the
@@ -670,6 +782,48 @@ static inline u64 __init sgx_calc_section_metric(u64 low, u64 high)
((high & GENMASK_ULL(19, 0)) << 32);
}
+#ifdef CONFIG_NUMA
+static ssize_t sgx_total_bytes_show(struct device *dev, struct device_attribute *attr, char *buf)
+{
+ return sysfs_emit(buf, "%lu\n", sgx_numa_nodes[dev->id].size);
+}
+static DEVICE_ATTR_RO(sgx_total_bytes);
+
+static umode_t arch_node_attr_is_visible(struct kobject *kobj,
+ struct attribute *attr, int idx)
+{
+ /* Make all x86/ attributes invisible when SGX is not initialized: */
+ if (nodes_empty(sgx_numa_mask))
+ return 0;
+
+ return attr->mode;
+}
+
+static struct attribute *arch_node_dev_attrs[] = {
+ &dev_attr_sgx_total_bytes.attr,
+ NULL,
+};
+
+const struct attribute_group arch_node_dev_group = {
+ .name = "x86",
+ .attrs = arch_node_dev_attrs,
+ .is_visible = arch_node_attr_is_visible,
+};
+
+static void __init arch_update_sysfs_visibility(int nid)
+{
+ struct node *node = node_devices[nid];
+ int ret;
+
+ ret = sysfs_update_group(&node->dev.kobj, &arch_node_dev_group);
+
+ if (ret)
+ pr_err("sysfs update failed (%d), files may be invisible", ret);
+}
+#else /* !CONFIG_NUMA */
+static void __init arch_update_sysfs_visibility(int nid) {}
+#endif
+
static bool __init sgx_page_cache_init(void)
{
u32 eax, ebx, ecx, edx, type;
@@ -713,10 +867,16 @@ static bool __init sgx_page_cache_init(void)
if (!node_isset(nid, sgx_numa_mask)) {
spin_lock_init(&sgx_numa_nodes[nid].lock);
INIT_LIST_HEAD(&sgx_numa_nodes[nid].free_page_list);
+ INIT_LIST_HEAD(&sgx_numa_nodes[nid].sgx_poison_page_list);
node_set(nid, sgx_numa_mask);
+ sgx_numa_nodes[nid].size = 0;
+
+ /* Make SGX-specific node sysfs files visible: */
+ arch_update_sysfs_visibility(nid);
}
sgx_epc_sections[i].node = &sgx_numa_nodes[nid];
+ sgx_numa_nodes[nid].size += size;
sgx_nr_epc_sections++;
}
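sgx_paddr_to_page() relies on the xarray mapping every EPC physical address back to its owning section; once the section is known, the page index is plain pfn arithmetic. A small sketch of that arithmetic (userspace, hypothetical section base address):

#include <assert.h>
#include <stdint.h>

#define PAGE_SHIFT 12

int main(void)
{
        uint64_t section_base = 0x80000000ULL;  /* hypothetical EPC section start */
        uint64_t paddr        = 0x80003fffULL;  /* poisoned address inside page 3 */
        uint64_t index        = (paddr - section_base) >> PAGE_SHIFT;

        assert(index == 3);
        return 0;
}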
diff --git a/arch/x86/kernel/cpu/sgx/sgx.h b/arch/x86/kernel/cpu/sgx/sgx.h
index 4628acec0009..0f17def9fe6f 100644
--- a/arch/x86/kernel/cpu/sgx/sgx.h
+++ b/arch/x86/kernel/cpu/sgx/sgx.h
@@ -26,9 +26,13 @@
/* Pages, which are being tracked by the page reclaimer. */
#define SGX_EPC_PAGE_RECLAIMER_TRACKED BIT(0)
+/* Pages on free list */
+#define SGX_EPC_PAGE_IS_FREE BIT(1)
+
struct sgx_epc_page {
unsigned int section;
- unsigned int flags;
+ u16 flags;
+ u16 poison;
struct sgx_encl_page *owner;
struct list_head list;
};
@@ -39,6 +43,8 @@ struct sgx_epc_page {
*/
struct sgx_numa_node {
struct list_head free_page_list;
+ struct list_head sgx_poison_page_list;
+ unsigned long size;
spinlock_t lock;
};
diff --git a/arch/x86/kernel/fpu/core.c b/arch/x86/kernel/fpu/core.c
index 8ea306b1bf8e..dd3777ac0443 100644
--- a/arch/x86/kernel/fpu/core.c
+++ b/arch/x86/kernel/fpu/core.c
@@ -99,6 +99,19 @@ bool irq_fpu_usable(void)
EXPORT_SYMBOL(irq_fpu_usable);
/*
+ * Track AVX512 state use because it is known to slow the max clock
+ * speed of the core.
+ */
+static void update_avx_timestamp(struct fpu *fpu)
+{
+
+#define AVX512_TRACKING_MASK (XFEATURE_MASK_ZMM_Hi256 | XFEATURE_MASK_Hi16_ZMM)
+
+ if (fpu->fpstate->regs.xsave.header.xfeatures & AVX512_TRACKING_MASK)
+ fpu->avx512_timestamp = jiffies;
+}
+
+/*
* Save the FPU register state in fpu->fpstate->regs. The register state is
* preserved.
*
@@ -116,13 +129,7 @@ void save_fpregs_to_fpstate(struct fpu *fpu)
{
if (likely(use_xsave())) {
os_xsave(fpu->fpstate);
-
- /*
- * AVX512 state is tracked here because its use is
- * known to slow the max clock speed of the core.
- */
- if (fpu->fpstate->regs.xsave.header.xfeatures & XFEATURE_MASK_AVX512)
- fpu->avx512_timestamp = jiffies;
+ update_avx_timestamp(fpu);
return;
}
diff --git a/arch/x86/kernel/fpu/signal.c b/arch/x86/kernel/fpu/signal.c
index d5958278eba6..91d4b6de58ab 100644
--- a/arch/x86/kernel/fpu/signal.c
+++ b/arch/x86/kernel/fpu/signal.c
@@ -118,7 +118,7 @@ static inline bool save_xstate_epilog(void __user *buf, int ia32_frame,
struct fpstate *fpstate)
{
struct xregs_state __user *x = buf;
- struct _fpx_sw_bytes sw_bytes;
+ struct _fpx_sw_bytes sw_bytes = {};
u32 xfeatures;
int err;
diff --git a/arch/x86/kernel/head64.c b/arch/x86/kernel/head64.c
index f5e80a8377ad..de563db9cdcd 100644
--- a/arch/x86/kernel/head64.c
+++ b/arch/x86/kernel/head64.c
@@ -126,6 +126,36 @@ static bool __head check_la57_support(unsigned long physaddr)
}
#endif
+static unsigned long sme_postprocess_startup(struct boot_params *bp, pmdval_t *pmd)
+{
+ unsigned long vaddr, vaddr_end;
+ int i;
+
+ /* Encrypt the kernel and related (if SME is active) */
+ sme_encrypt_kernel(bp);
+
+ /*
+ * Clear the memory encryption mask from the .bss..decrypted section.
+ * The bss section will be memset to zero later in the initialization so
+ * there is no need to zero it after changing the memory encryption
+ * attribute.
+ */
+ if (sme_get_me_mask()) {
+ vaddr = (unsigned long)__start_bss_decrypted;
+ vaddr_end = (unsigned long)__end_bss_decrypted;
+ for (; vaddr < vaddr_end; vaddr += PMD_SIZE) {
+ i = pmd_index(vaddr);
+ pmd[i] -= sme_get_me_mask();
+ }
+ }
+
+ /*
+ * Return the SME encryption mask (if SME is active) to be used as a
+ * modifier for the initial pgdir entry programmed into CR3.
+ */
+ return sme_get_me_mask();
+}
+
/* Code in __startup_64() can be relocated during execution, but the compiler
* doesn't have to generate PC-relative relocations when accessing globals from
* that function. Clang actually does not generate them, which leads to
@@ -135,7 +165,6 @@ static bool __head check_la57_support(unsigned long physaddr)
unsigned long __head __startup_64(unsigned long physaddr,
struct boot_params *bp)
{
- unsigned long vaddr, vaddr_end;
unsigned long load_delta, *p;
unsigned long pgtable_flags;
pgdval_t *pgd;
@@ -276,34 +305,7 @@ unsigned long __head __startup_64(unsigned long physaddr,
*/
*fixup_long(&phys_base, physaddr) += load_delta - sme_get_me_mask();
- /* Encrypt the kernel and related (if SME is active) */
- sme_encrypt_kernel(bp);
-
- /*
- * Clear the memory encryption mask from the .bss..decrypted section.
- * The bss section will be memset to zero later in the initialization so
- * there is no need to zero it after changing the memory encryption
- * attribute.
- *
- * This is early code, use an open coded check for SME instead of
- * using cc_platform_has(). This eliminates worries about removing
- * instrumentation or checking boot_cpu_data in the cc_platform_has()
- * function.
- */
- if (sme_get_me_mask()) {
- vaddr = (unsigned long)__start_bss_decrypted;
- vaddr_end = (unsigned long)__end_bss_decrypted;
- for (; vaddr < vaddr_end; vaddr += PMD_SIZE) {
- i = pmd_index(vaddr);
- pmd[i] -= sme_get_me_mask();
- }
- }
-
- /*
- * Return the SME encryption mask (if SME is active) to be used as a
- * modifier for the initial pgdir entry programmed into CR3.
- */
- return sme_get_me_mask();
+ return sme_postprocess_startup(bp, pmd);
}
unsigned long __startup_secondary_64(void)
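sme_postprocess_startup() clears the C-bit from the .bss..decrypted PMD entries by subtracting sme_get_me_mask(); because every one of those entries has the mask set, the subtraction amounts to clearing that single bit. A userspace sketch (hypothetical C-bit position and PMD value, not taken from the commit):

#include <assert.h>
#include <stdint.h>

int main(void)
{
        uint64_t sme_me_mask = 1ULL << 47;                      /* hypothetical C-bit */
        uint64_t pmd = 0x123400000ULL | sme_me_mask | 0xe3;     /* PMD with C-bit set */

        pmd -= sme_me_mask;                     /* same fixup as the loop above */
        assert(pmd == (0x123400000ULL | 0xe3)); /* only the C-bit was cleared */
        return 0;
}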
diff --git a/arch/x86/kernel/setup.c b/arch/x86/kernel/setup.c
index 6a190c7f4d71..e04f5e6eb33f 100644
--- a/arch/x86/kernel/setup.c
+++ b/arch/x86/kernel/setup.c
@@ -713,9 +713,6 @@ static void __init early_reserve_memory(void)
early_reserve_initrd();
- if (efi_enabled(EFI_BOOT))
- efi_memblock_x86_reserve_range();
-
memblock_x86_reserve_range_setup_data();
reserve_ibft_region();
@@ -742,28 +739,6 @@ dump_kernel_offset(struct notifier_block *self, unsigned long v, void *p)
return 0;
}
-static char * __init prepare_command_line(void)
-{
-#ifdef CONFIG_CMDLINE_BOOL
-#ifdef CONFIG_CMDLINE_OVERRIDE
- strlcpy(boot_command_line, builtin_cmdline, COMMAND_LINE_SIZE);
-#else
- if (builtin_cmdline[0]) {
- /* append boot loader cmdline to builtin */
- strlcat(builtin_cmdline, " ", COMMAND_LINE_SIZE);
- strlcat(builtin_cmdline, boot_command_line, COMMAND_LINE_SIZE);
- strlcpy(boot_command_line, builtin_cmdline, COMMAND_LINE_SIZE);
- }
-#endif
-#endif
-
- strlcpy(command_line, boot_command_line, COMMAND_LINE_SIZE);
-
- parse_early_param();
-
- return command_line;
-}
-
/*
* Determine if we were loaded by an EFI loader. If so, then we have also been
* passed the efi memmap, systab, etc., so we should use these data structures
@@ -853,23 +828,6 @@ void __init setup_arch(char **cmdline_p)
x86_init.oem.arch_setup();
/*
- * x86_configure_nx() is called before parse_early_param() (called by
- * prepare_command_line()) to detect whether hardware doesn't support
- * NX (so that the early EHCI debug console setup can safely call
- * set_fixmap()). It may then be called again from within noexec_setup()
- * during parsing early parameters to honor the respective command line
- * option.
- */
- x86_configure_nx();
-
- /*
- * This parses early params and it needs to run before
- * early_reserve_memory() because latter relies on such settings
- * supplied as early params.
- */
- *cmdline_p = prepare_command_line();
-
- /*
* Do some memory reservations *before* memory is added to memblock, so
* memblock allocations won't overwrite it.
*
@@ -902,6 +860,36 @@ void __init setup_arch(char **cmdline_p)
bss_resource.start = __pa_symbol(__bss_start);
bss_resource.end = __pa_symbol(__bss_stop)-1;
+#ifdef CONFIG_CMDLINE_BOOL
+#ifdef CONFIG_CMDLINE_OVERRIDE
+ strlcpy(boot_command_line, builtin_cmdline, COMMAND_LINE_SIZE);
+#else
+ if (builtin_cmdline[0]) {
+ /* append boot loader cmdline to builtin */
+ strlcat(builtin_cmdline, " ", COMMAND_LINE_SIZE);
+ strlcat(builtin_cmdline, boot_command_line, COMMAND_LINE_SIZE);
+ strlcpy(boot_command_line, builtin_cmdline, COMMAND_LINE_SIZE);
+ }
+#endif
+#endif
+
+ strlcpy(command_line, boot_command_line, COMMAND_LINE_SIZE);
+ *cmdline_p = command_line;
+
+ /*
+ * x86_configure_nx() is called before parse_early_param() to detect
+ * whether hardware doesn't support NX (so that the early EHCI debug
+ * console setup can safely call set_fixmap()). It may then be called
+ * again from within noexec_setup() during parsing early parameters
+ * to honor the respective command line option.
+ */
+ x86_configure_nx();
+
+ parse_early_param();
+
+ if (efi_enabled(EFI_BOOT))
+ efi_memblock_x86_reserve_range();
+
#ifdef CONFIG_MEMORY_HOTPLUG
/*
* Memory used by the kernel cannot be hot-removed because Linux
diff --git a/arch/x86/kernel/sev-shared.c b/arch/x86/kernel/sev-shared.c
index 787dc5f568b5..ce987688bbc0 100644
--- a/arch/x86/kernel/sev-shared.c
+++ b/arch/x86/kernel/sev-shared.c
@@ -221,7 +221,7 @@ void __init do_vc_no_ghcb(struct pt_regs *regs, unsigned long exit_code)
fail:
/* Terminate the guest */
- sev_es_terminate(GHCB_SEV_ES_REASON_GENERAL_REQUEST);
+ sev_es_terminate(GHCB_SEV_ES_GEN_REQ);
}
static enum es_result vc_insn_string_read(struct es_em_ctxt *ctxt,
diff --git a/arch/x86/kernel/sev.c b/arch/x86/kernel/sev.c
index 74f0ec955384..c9482f472ab6 100644
--- a/arch/x86/kernel/sev.c
+++ b/arch/x86/kernel/sev.c
@@ -26,6 +26,7 @@
#include <asm/fpu/xcr.h>
#include <asm/processor.h>
#include <asm/realmode.h>
+#include <asm/setup.h>
#include <asm/traps.h>
#include <asm/svm.h>
#include <asm/smp.h>
@@ -86,9 +87,6 @@ struct ghcb_state {
static DEFINE_PER_CPU(struct sev_es_runtime_data*, runtime_data);
DEFINE_STATIC_KEY_FALSE(sev_es_enable_key);
-/* Needed in vc_early_forward_exception */
-void do_early_exception(struct pt_regs *regs, int trapnr);
-
static __always_inline bool on_vc_stack(struct pt_regs *regs)
{
unsigned long sp = regs->sp;
@@ -209,9 +207,6 @@ static noinstr struct ghcb *__sev_get_ghcb(struct ghcb_state *state)
return ghcb;
}
-/* Needed in vc_early_forward_exception */
-void do_early_exception(struct pt_regs *regs, int trapnr);
-
static inline u64 sev_es_rd_ghcb_msr(void)
{
return __rdmsr(MSR_AMD64_SEV_ES_GHCB);
@@ -294,11 +289,6 @@ static enum es_result vc_write_mem(struct es_em_ctxt *ctxt,
char *dst, char *buf, size_t size)
{
unsigned long error_code = X86_PF_PROT | X86_PF_WRITE;
- char __user *target = (char __user *)dst;
- u64 d8;
- u32 d4;
- u16 d2;
- u8 d1;
/*
* This function uses __put_user() independent of whether kernel or user
@@ -320,26 +310,42 @@ static enum es_result vc_write_mem(struct es_em_ctxt *ctxt,
* instructions here would cause infinite nesting.
*/
switch (size) {
- case 1:
+ case 1: {
+ u8 d1;
+ u8 __user *target = (u8 __user *)dst;
+
memcpy(&d1, buf, 1);
if (__put_user(d1, target))
goto fault;
break;
- case 2:
+ }
+ case 2: {
+ u16 d2;
+ u16 __user *target = (u16 __user *)dst;
+
memcpy(&d2, buf, 2);
if (__put_user(d2, target))
goto fault;
break;
- case 4:
+ }
+ case 4: {
+ u32 d4;
+ u32 __user *target = (u32 __user *)dst;
+
memcpy(&d4, buf, 4);
if (__put_user(d4, target))
goto fault;
break;
- case 8:
+ }
+ case 8: {
+ u64 d8;
+ u64 __user *target = (u64 __user *)dst;
+
memcpy(&d8, buf, 8);
if (__put_user(d8, target))
goto fault;
break;
+ }
default:
WARN_ONCE(1, "%s: Invalid size: %zu\n", __func__, size);
return ES_UNSUPPORTED;
@@ -362,11 +368,6 @@ static enum es_result vc_read_mem(struct es_em_ctxt *ctxt,
char *src, char *buf, size_t size)
{
unsigned long error_code = X86_PF_PROT;
- char __user *s = (char __user *)src;
- u64 d8;
- u32 d4;
- u16 d2;
- u8 d1;
/*
* This function uses __get_user() independent of whether kernel or user
@@ -388,26 +389,41 @@ static enum es_result vc_read_mem(struct es_em_ctxt *ctxt,
* instructions here would cause infinite nesting.
*/
switch (size) {
- case 1:
+ case 1: {
+ u8 d1;
+ u8 __user *s = (u8 __user *)src;
+
if (__get_user(d1, s))
goto fault;
memcpy(buf, &d1, 1);
break;
- case 2:
+ }
+ case 2: {
+ u16 d2;
+ u16 __user *s = (u16 __user *)src;
+
if (__get_user(d2, s))
goto fault;
memcpy(buf, &d2, 2);
break;
- case 4:
+ }
+ case 4: {
+ u32 d4;
+ u32 __user *s = (u32 __user *)src;
+
if (__get_user(d4, s))
goto fault;
memcpy(buf, &d4, 4);
break;
- case 8:
+ }
+ case 8: {
+ u64 d8;
+ u64 __user *s = (u64 __user *)src;
if (__get_user(d8, s))
goto fault;
memcpy(buf, &d8, 8);
break;
+ }
default:
WARN_ONCE(1, "%s: Invalid size: %zu\n", __func__, size);
return ES_UNSUPPORTED;
@@ -1411,7 +1427,7 @@ DEFINE_IDTENTRY_VC_KERNEL(exc_vmm_communication)
show_regs(regs);
/* Ask hypervisor to sev_es_terminate */
- sev_es_terminate(GHCB_SEV_ES_REASON_GENERAL_REQUEST);
+ sev_es_terminate(GHCB_SEV_ES_GEN_REQ);
/* If that fails and we get here - just panic */
panic("Returned from Terminate-Request to Hypervisor\n");
@@ -1459,7 +1475,7 @@ bool __init handle_vc_boot_ghcb(struct pt_regs *regs)
/* Do initial setup or terminate the guest */
if (unlikely(boot_ghcb == NULL && !sev_es_setup_ghcb()))
- sev_es_terminate(GHCB_SEV_ES_REASON_GENERAL_REQUEST);
+ sev_es_terminate(GHCB_SEV_ES_GEN_REQ);
vc_ghcb_invalidate(boot_ghcb);
diff --git a/arch/x86/kernel/smpboot.c b/arch/x86/kernel/smpboot.c
index ac2909f0cab3..617012f4619f 100644
--- a/arch/x86/kernel/smpboot.c
+++ b/arch/x86/kernel/smpboot.c
@@ -579,6 +579,17 @@ static struct sched_domain_topology_level x86_numa_in_package_topology[] = {
{ NULL, },
};
+static struct sched_domain_topology_level x86_hybrid_topology[] = {
+#ifdef CONFIG_SCHED_SMT
+ { cpu_smt_mask, x86_smt_flags, SD_INIT_NAME(SMT) },
+#endif
+#ifdef CONFIG_SCHED_MC
+ { cpu_coregroup_mask, x86_core_flags, SD_INIT_NAME(MC) },
+#endif
+ { cpu_cpu_mask, SD_INIT_NAME(DIE) },
+ { NULL, },
+};
+
static struct sched_domain_topology_level x86_topology[] = {
#ifdef CONFIG_SCHED_SMT
{ cpu_smt_mask, x86_smt_flags, SD_INIT_NAME(SMT) },
@@ -1469,8 +1480,11 @@ void __init native_smp_cpus_done(unsigned int max_cpus)
calculate_max_logical_packages();
+ /* XXX for now assume numa-in-package and hybrid don't overlap */
if (x86_has_numa_in_package)
set_sched_topology(x86_numa_in_package_topology);
+ if (cpu_feature_enabled(X86_FEATURE_HYBRID_CPU))
+ set_sched_topology(x86_hybrid_topology);
nmi_selftest();
impress_friends();
diff --git a/arch/x86/kernel/tsc.c b/arch/x86/kernel/tsc.c
index 2e076a459a0c..a698196377be 100644
--- a/arch/x86/kernel/tsc.c
+++ b/arch/x86/kernel/tsc.c
@@ -1180,6 +1180,12 @@ void mark_tsc_unstable(char *reason)
EXPORT_SYMBOL_GPL(mark_tsc_unstable);
+static void __init tsc_disable_clocksource_watchdog(void)
+{
+ clocksource_tsc_early.flags &= ~CLOCK_SOURCE_MUST_VERIFY;
+ clocksource_tsc.flags &= ~CLOCK_SOURCE_MUST_VERIFY;
+}
+
static void __init check_system_tsc_reliable(void)
{
#if defined(CONFIG_MGEODEGX1) || defined(CONFIG_MGEODE_LX) || defined(CONFIG_X86_GENERIC)
@@ -1196,6 +1202,23 @@ static void __init check_system_tsc_reliable(void)
#endif
if (boot_cpu_has(X86_FEATURE_TSC_RELIABLE))
tsc_clocksource_reliable = 1;
+
+ /*
+ * Disable the clocksource watchdog when the system has:
+ * - TSC running at constant frequency
+ * - TSC which does not stop in C-States
+ * - the TSC_ADJUST register which allows detecting even minimal
+ * modifications
+ * - not more than two sockets. As the number of sockets cannot be
+ * evaluated at the early boot stage where this has to be
+ * invoked, check the number of online memory nodes as a
+ * fallback solution which is a reasonable estimate.
+ */
+ if (boot_cpu_has(X86_FEATURE_CONSTANT_TSC) &&
+ boot_cpu_has(X86_FEATURE_NONSTOP_TSC) &&
+ boot_cpu_has(X86_FEATURE_TSC_ADJUST) &&
+ nr_online_nodes <= 2)
+ tsc_disable_clocksource_watchdog();
}
/*
@@ -1387,9 +1410,6 @@ static int __init init_tsc_clocksource(void)
if (tsc_unstable)
goto unreg;
- if (tsc_clocksource_reliable || no_tsc_watchdog)
- clocksource_tsc.flags &= ~CLOCK_SOURCE_MUST_VERIFY;
-
if (boot_cpu_has(X86_FEATURE_NONSTOP_TSC_S3))
clocksource_tsc.flags |= CLOCK_SOURCE_SUSPEND_NONSTOP;
@@ -1527,7 +1547,7 @@ void __init tsc_init(void)
}
if (tsc_clocksource_reliable || no_tsc_watchdog)
- clocksource_tsc_early.flags &= ~CLOCK_SOURCE_MUST_VERIFY;
+ tsc_disable_clocksource_watchdog();
clocksource_register_khz(&clocksource_tsc_early, tsc_khz);
detect_art();
diff --git a/arch/x86/kernel/tsc_sync.c b/arch/x86/kernel/tsc_sync.c
index 50a4515fe0ad..9452dc9664b5 100644
--- a/arch/x86/kernel/tsc_sync.c
+++ b/arch/x86/kernel/tsc_sync.c
@@ -30,6 +30,7 @@ struct tsc_adjust {
};
static DEFINE_PER_CPU(struct tsc_adjust, tsc_adjust);
+static struct timer_list tsc_sync_check_timer;
/*
* TSC's on different sockets may be reset asynchronously.
@@ -77,6 +78,46 @@ void tsc_verify_tsc_adjust(bool resume)
}
}
+/*
+ * Normally the tsc_sync will be checked every time the system enters idle
+ * state, but there is still a caveat that a system won't enter idle,
+ * either because it's too busy or configured purposely to not enter
+ * idle.
+ *
+ * So set up a periodic timer (every 10 minutes) to make sure the check
+ * is always on.
+ */
+
+#define SYNC_CHECK_INTERVAL (HZ * 600)
+
+static void tsc_sync_check_timer_fn(struct timer_list *unused)
+{
+ int next_cpu;
+
+ tsc_verify_tsc_adjust(false);
+
+ /* Run the check for all onlined CPUs in turn */
+ next_cpu = cpumask_next(raw_smp_processor_id(), cpu_online_mask);
+ if (next_cpu >= nr_cpu_ids)
+ next_cpu = cpumask_first(cpu_online_mask);
+
+ tsc_sync_check_timer.expires += SYNC_CHECK_INTERVAL;
+ add_timer_on(&tsc_sync_check_timer, next_cpu);
+}
+
+static int __init start_sync_check_timer(void)
+{
+ if (!cpu_feature_enabled(X86_FEATURE_TSC_ADJUST) || tsc_clocksource_reliable)
+ return 0;
+
+ timer_setup(&tsc_sync_check_timer, tsc_sync_check_timer_fn, 0);
+ tsc_sync_check_timer.expires = jiffies + SYNC_CHECK_INTERVAL;
+ add_timer(&tsc_sync_check_timer);
+
+ return 0;
+}
+late_initcall(start_sync_check_timer);
+
static void tsc_sanitize_first_cpu(struct tsc_adjust *cur, s64 bootval,
unsigned int cpu, bool bootcpu)
{
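tsc_sync_check_timer_fn() rotates the check across CPUs by taking cpumask_next() and wrapping to cpumask_first() when it runs off the end of the mask. The same round-robin selection, sketched with a plain array instead of a cpumask (illustration only):

#include <assert.h>

static int next_online(const int *online, int nr, int cur)
{
        int cpu;

        for (cpu = cur + 1; cpu < nr; cpu++)
                if (online[cpu])
                        return cpu;
        for (cpu = 0; cpu < nr; cpu++)          /* wrap around like cpumask_first() */
                if (online[cpu])
                        return cpu;
        return -1;
}

int main(void)
{
        int online[4] = { 1, 0, 1, 1 };         /* CPU1 offline */

        assert(next_online(online, 4, 0) == 2);
        assert(next_online(online, 4, 3) == 0);
        return 0;
}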
diff --git a/arch/x86/kvm/debugfs.c b/arch/x86/kvm/debugfs.c
index 54a83a744538..f33c804a922a 100644
--- a/arch/x86/kvm/debugfs.c
+++ b/arch/x86/kvm/debugfs.c
@@ -95,6 +95,9 @@ static int kvm_mmu_rmaps_stat_show(struct seq_file *m, void *v)
unsigned int *log[KVM_NR_PAGE_SIZES], *cur;
int i, j, k, l, ret;
+ if (!kvm_memslots_have_rmaps(kvm))
+ return 0;
+
ret = -ENOMEM;
memset(log, 0, sizeof(log));
for (i = 0; i < KVM_NR_PAGE_SIZES; i++) {
diff --git a/arch/x86/kvm/hyperv.c b/arch/x86/kvm/hyperv.c
index 5e19e6e4c2ce..8d8c1cc7cb53 100644
--- a/arch/x86/kvm/hyperv.c
+++ b/arch/x86/kvm/hyperv.c
@@ -1922,11 +1922,13 @@ static u64 kvm_hv_send_ipi(struct kvm_vcpu *vcpu, struct kvm_hv_hcall *hc, bool
all_cpus = send_ipi_ex.vp_set.format == HV_GENERIC_SET_ALL;
+ if (all_cpus)
+ goto check_and_send_ipi;
+
if (!sparse_banks_len)
goto ret_success;
- if (!all_cpus &&
- kvm_read_guest(kvm,
+ if (kvm_read_guest(kvm,
hc->ingpa + offsetof(struct hv_send_ipi_ex,
vp_set.bank_contents),
sparse_banks,
@@ -1934,6 +1936,7 @@ static u64 kvm_hv_send_ipi(struct kvm_vcpu *vcpu, struct kvm_hv_hcall *hc, bool
return HV_STATUS_INVALID_HYPERCALL_INPUT;
}
+check_and_send_ipi:
if ((vector < HV_IPI_LOW_VECTOR) || (vector > HV_IPI_HIGH_VECTOR))
return HV_STATUS_INVALID_HYPERCALL_INPUT;
diff --git a/arch/x86/kvm/ioapic.h b/arch/x86/kvm/ioapic.h
index e66e620c3bed..539333ac4b38 100644
--- a/arch/x86/kvm/ioapic.h
+++ b/arch/x86/kvm/ioapic.h
@@ -81,7 +81,6 @@ struct kvm_ioapic {
unsigned long irq_states[IOAPIC_NUM_PINS];
struct kvm_io_device dev;
struct kvm *kvm;
- void (*ack_notifier)(void *opaque, int irq);
spinlock_t lock;
struct rtc_status rtc_status;
struct delayed_work eoi_inject;
diff --git a/arch/x86/kvm/irq.h b/arch/x86/kvm/irq.h
index 650642b18d15..c2d7cfe82d00 100644
--- a/arch/x86/kvm/irq.h
+++ b/arch/x86/kvm/irq.h
@@ -56,7 +56,6 @@ struct kvm_pic {
struct kvm_io_device dev_master;
struct kvm_io_device dev_slave;
struct kvm_io_device dev_elcr;
- void (*ack_notifier)(void *opaque, int irq);
unsigned long irq_states[PIC_NUM_PINS];
};
diff --git a/arch/x86/kvm/lapic.c b/arch/x86/kvm/lapic.c
index 759952dd1222..f206fc35deff 100644
--- a/arch/x86/kvm/lapic.c
+++ b/arch/x86/kvm/lapic.c
@@ -707,7 +707,7 @@ static void pv_eoi_clr_pending(struct kvm_vcpu *vcpu)
static int apic_has_interrupt_for_ppr(struct kvm_lapic *apic, u32 ppr)
{
int highest_irr;
- if (apic->vcpu->arch.apicv_active)
+ if (kvm_x86_ops.sync_pir_to_irr)
highest_irr = static_call(kvm_x86_sync_pir_to_irr)(apic->vcpu);
else
highest_irr = apic_find_highest_irr(apic);
diff --git a/arch/x86/kvm/mmu/mmu.c b/arch/x86/kvm/mmu/mmu.c
index 3be9beea838d..fcdf3f8bb59a 100644
--- a/arch/x86/kvm/mmu/mmu.c
+++ b/arch/x86/kvm/mmu/mmu.c
@@ -1582,7 +1582,7 @@ bool kvm_unmap_gfn_range(struct kvm *kvm, struct kvm_gfn_range *range)
flush = kvm_handle_gfn_range(kvm, range, kvm_unmap_rmapp);
if (is_tdp_mmu_enabled(kvm))
- flush |= kvm_tdp_mmu_unmap_gfn_range(kvm, range, flush);
+ flush = kvm_tdp_mmu_unmap_gfn_range(kvm, range, flush);
return flush;
}
@@ -1936,7 +1936,11 @@ static void mmu_audit_disable(void) { }
static bool is_obsolete_sp(struct kvm *kvm, struct kvm_mmu_page *sp)
{
- return sp->role.invalid ||
+ if (sp->role.invalid)
+ return true;
+
+ /* TDP MMU pages do not use the MMU generation. */
+ return !sp->tdp_mmu_page &&
unlikely(sp->mmu_valid_gen != kvm->arch.mmu_valid_gen);
}
@@ -2173,10 +2177,10 @@ static void shadow_walk_init_using_root(struct kvm_shadow_walk_iterator *iterato
iterator->shadow_addr = root;
iterator->level = vcpu->arch.mmu->shadow_root_level;
- if (iterator->level == PT64_ROOT_4LEVEL &&
+ if (iterator->level >= PT64_ROOT_4LEVEL &&
vcpu->arch.mmu->root_level < PT64_ROOT_4LEVEL &&
!vcpu->arch.mmu->direct_map)
- --iterator->level;
+ iterator->level = PT32E_ROOT_LEVEL;
if (iterator->level == PT32E_ROOT_LEVEL) {
/*
@@ -3976,6 +3980,34 @@ out_retry:
return true;
}
+/*
+ * Returns true if the page fault is stale and needs to be retried, i.e. if the
+ * root was invalidated by a memslot update or a relevant mmu_notifier fired.
+ */
+static bool is_page_fault_stale(struct kvm_vcpu *vcpu,
+ struct kvm_page_fault *fault, int mmu_seq)
+{
+ struct kvm_mmu_page *sp = to_shadow_page(vcpu->arch.mmu->root_hpa);
+
+ /* Special roots, e.g. pae_root, are not backed by shadow pages. */
+ if (sp && is_obsolete_sp(vcpu->kvm, sp))
+ return true;
+
+ /*
+ * Roots without an associated shadow page are considered invalid if
+ * there is a pending request to free obsolete roots. The request is
+ * only a hint that the current root _may_ be obsolete and needs to be
+ * reloaded, e.g. if the guest frees a PGD that KVM is tracking as a
+ * previous root, then __kvm_mmu_prepare_zap_page() signals all vCPUs
+ * to reload even if no vCPU is actively using the root.
+ */
+ if (!sp && kvm_test_request(KVM_REQ_MMU_RELOAD, vcpu))
+ return true;
+
+ return fault->slot &&
+ mmu_notifier_retry_hva(vcpu->kvm, mmu_seq, fault->hva);
+}
+
static int direct_page_fault(struct kvm_vcpu *vcpu, struct kvm_page_fault *fault)
{
bool is_tdp_mmu_fault = is_tdp_mmu(vcpu->arch.mmu);
@@ -4013,8 +4045,9 @@ static int direct_page_fault(struct kvm_vcpu *vcpu, struct kvm_page_fault *fault
else
write_lock(&vcpu->kvm->mmu_lock);
- if (fault->slot && mmu_notifier_retry_hva(vcpu->kvm, mmu_seq, fault->hva))
+ if (is_page_fault_stale(vcpu, fault, mmu_seq))
goto out_unlock;
+
r = make_mmu_pages_available(vcpu);
if (r)
goto out_unlock;
@@ -4855,7 +4888,7 @@ void kvm_init_shadow_npt_mmu(struct kvm_vcpu *vcpu, unsigned long cr0,
struct kvm_mmu *context = &vcpu->arch.guest_mmu;
struct kvm_mmu_role_regs regs = {
.cr0 = cr0,
- .cr4 = cr4,
+ .cr4 = cr4 & ~X86_CR4_PKE,
.efer = efer,
};
union kvm_mmu_role new_role;
@@ -4919,7 +4952,7 @@ void kvm_init_shadow_ept_mmu(struct kvm_vcpu *vcpu, bool execonly,
context->direct_map = false;
update_permission_bitmask(context, true);
- update_pkru_bitmask(context);
+ context->pkru_mask = 0;
reset_rsvds_bits_mask_ept(vcpu, context, execonly);
reset_ept_shadow_zero_bits_mask(vcpu, context, execonly);
}
@@ -5025,6 +5058,14 @@ void kvm_mmu_after_set_cpuid(struct kvm_vcpu *vcpu)
/*
* Invalidate all MMU roles to force them to reinitialize as CPUID
* information is factored into reserved bit calculations.
+ *
+ * Correctly handling multiple vCPU models (with respect to paging and
+ * physical address properties) in a single VM would require tracking
+ * all relevant CPUID information in kvm_mmu_page_role. That is very
+ * undesirable as it would increase the memory requirements for
+ * gfn_track (see struct kvm_mmu_page_role comments). For now that
+ * problem is swept under the rug; KVM's CPUID API is horrific and
+ * it's all but impossible to solve it without introducing a new API.
*/
vcpu->arch.root_mmu.mmu_role.ext.valid = 0;
vcpu->arch.guest_mmu.mmu_role.ext.valid = 0;
@@ -5032,24 +5073,10 @@ void kvm_mmu_after_set_cpuid(struct kvm_vcpu *vcpu)
kvm_mmu_reset_context(vcpu);
/*
- * KVM does not correctly handle changing guest CPUID after KVM_RUN, as
- * MAXPHYADDR, GBPAGES support, AMD reserved bit behavior, etc.. aren't
- * tracked in kvm_mmu_page_role. As a result, KVM may miss guest page
- * faults due to reusing SPs/SPTEs. Alert userspace, but otherwise
- * sweep the problem under the rug.
- *
- * KVM's horrific CPUID ABI makes the problem all but impossible to
- * solve, as correctly handling multiple vCPU models (with respect to
- * paging and physical address properties) in a single VM would require
- * tracking all relevant CPUID information in kvm_mmu_page_role. That
- * is very undesirable as it would double the memory requirements for
- * gfn_track (see struct kvm_mmu_page_role comments), and in practice
- * no sane VMM mucks with the core vCPU model on the fly.
+ * Changing guest CPUID after KVM_RUN is forbidden, see the comment in
+ * kvm_arch_vcpu_ioctl().
*/
- if (vcpu->arch.last_vmentry_cpu != -1) {
- pr_warn_ratelimited("KVM: KVM_SET_CPUID{,2} after KVM_RUN may cause guest instability\n");
- pr_warn_ratelimited("KVM: KVM_SET_CPUID{,2} will fail after KVM_RUN starting with Linux 5.16\n");
- }
+ KVM_BUG_ON(vcpu->arch.last_vmentry_cpu != -1, vcpu->kvm);
}
void kvm_mmu_reset_context(struct kvm_vcpu *vcpu)
@@ -5369,7 +5396,7 @@ void kvm_mmu_invalidate_gva(struct kvm_vcpu *vcpu, struct kvm_mmu *mmu,
void kvm_mmu_invlpg(struct kvm_vcpu *vcpu, gva_t gva)
{
- kvm_mmu_invalidate_gva(vcpu, vcpu->arch.mmu, gva, INVALID_PAGE);
+ kvm_mmu_invalidate_gva(vcpu, vcpu->arch.walk_mmu, gva, INVALID_PAGE);
++vcpu->stat.invlpg;
}
EXPORT_SYMBOL_GPL(kvm_mmu_invlpg);
@@ -5854,8 +5881,6 @@ restart:
void kvm_mmu_zap_collapsible_sptes(struct kvm *kvm,
const struct kvm_memory_slot *slot)
{
- bool flush = false;
-
if (kvm_memslots_have_rmaps(kvm)) {
write_lock(&kvm->mmu_lock);
/*
@@ -5863,17 +5888,14 @@ void kvm_mmu_zap_collapsible_sptes(struct kvm *kvm,
* logging at a 4k granularity and never creates collapsible
* 2m SPTEs during dirty logging.
*/
- flush = slot_handle_level_4k(kvm, slot, kvm_mmu_zap_collapsible_spte, true);
- if (flush)
+ if (slot_handle_level_4k(kvm, slot, kvm_mmu_zap_collapsible_spte, true))
kvm_arch_flush_remote_tlbs_memslot(kvm, slot);
write_unlock(&kvm->mmu_lock);
}
if (is_tdp_mmu_enabled(kvm)) {
read_lock(&kvm->mmu_lock);
- flush = kvm_tdp_mmu_zap_collapsible_sptes(kvm, slot, flush);
- if (flush)
- kvm_arch_flush_remote_tlbs_memslot(kvm, slot);
+ kvm_tdp_mmu_zap_collapsible_sptes(kvm, slot);
read_unlock(&kvm->mmu_lock);
}
}
@@ -6182,23 +6204,46 @@ void kvm_mmu_module_exit(void)
mmu_audit_disable();
}
+/*
+ * Calculate the effective recovery period, accounting for '0' meaning "let KVM
+ * select a halving time of 1 hour". Returns true if recovery is enabled.
+ */
+static bool calc_nx_huge_pages_recovery_period(uint *period)
+{
+ /*
+ * Use READ_ONCE to get the params, this may be called outside of the
+ * param setters, e.g. by the kthread to compute its next timeout.
+ */
+ bool enabled = READ_ONCE(nx_huge_pages);
+ uint ratio = READ_ONCE(nx_huge_pages_recovery_ratio);
+
+ if (!enabled || !ratio)
+ return false;
+
+ *period = READ_ONCE(nx_huge_pages_recovery_period_ms);
+ if (!*period) {
+ /* Make sure the period is not less than one second. */
+ ratio = min(ratio, 3600u);
+ *period = 60 * 60 * 1000 / ratio;
+ }
+ return true;
+}
+
static int set_nx_huge_pages_recovery_param(const char *val, const struct kernel_param *kp)
{
bool was_recovery_enabled, is_recovery_enabled;
uint old_period, new_period;
int err;
- was_recovery_enabled = nx_huge_pages_recovery_ratio;
- old_period = nx_huge_pages_recovery_period_ms;
+ was_recovery_enabled = calc_nx_huge_pages_recovery_period(&old_period);
err = param_set_uint(val, kp);
if (err)
return err;
- is_recovery_enabled = nx_huge_pages_recovery_ratio;
- new_period = nx_huge_pages_recovery_period_ms;
+ is_recovery_enabled = calc_nx_huge_pages_recovery_period(&new_period);
- if (READ_ONCE(nx_huge_pages) && is_recovery_enabled &&
+ if (is_recovery_enabled &&
(!was_recovery_enabled || old_period > new_period)) {
struct kvm *kvm;
@@ -6262,18 +6307,13 @@ static void kvm_recover_nx_lpages(struct kvm *kvm)
static long get_nx_lpage_recovery_timeout(u64 start_time)
{
- uint ratio = READ_ONCE(nx_huge_pages_recovery_ratio);
- uint period = READ_ONCE(nx_huge_pages_recovery_period_ms);
+ bool enabled;
+ uint period;
- if (!period && ratio) {
- /* Make sure the period is not less than one second. */
- ratio = min(ratio, 3600u);
- period = 60 * 60 * 1000 / ratio;
- }
+ enabled = calc_nx_huge_pages_recovery_period(&period);
- return READ_ONCE(nx_huge_pages) && ratio
- ? start_time + msecs_to_jiffies(period) - get_jiffies_64()
- : MAX_SCHEDULE_TIMEOUT;
+ return enabled ? start_time + msecs_to_jiffies(period) - get_jiffies_64()
+ : MAX_SCHEDULE_TIMEOUT;
}
static int kvm_nx_lpage_recovery_worker(struct kvm *kvm, uintptr_t data)
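
The period calculation consolidated above is easy to sanity-check outside the kernel. Below is a minimal standalone sketch of the same arithmetic (the function and variable names are illustrative, not KVM symbols): with no explicit period, a ratio of N yields one recovery pass every 60*60*1000/N milliseconds, and the ratio is capped at 3600 so the period never drops below one second.

#include <stdbool.h>
#include <stdio.h>

/*
 * Sketch of the calc_nx_huge_pages_recovery_period() logic, simplified:
 * no READ_ONCE(), no module params, plain arguments instead.
 */
static bool calc_recovery_period(bool enabled, unsigned int ratio,
				 unsigned int period_ms, unsigned int *out)
{
	if (!enabled || !ratio)
		return false;

	*out = period_ms;
	if (!*out) {
		/* Cap the ratio so the period is at least one second. */
		if (ratio > 3600)
			ratio = 3600;
		*out = 60 * 60 * 1000 / ratio;
	}
	return true;
}

int main(void)
{
	unsigned int period;

	/* A ratio of 60 with no explicit period: 60000 ms per pass. */
	if (calc_recovery_period(true, 60, 0, &period))
		printf("effective period = %u ms\n", period);
	return 0;
}
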
diff --git a/arch/x86/kvm/mmu/paging_tmpl.h b/arch/x86/kvm/mmu/paging_tmpl.h
index f87d36898c44..708a5d297fe1 100644
--- a/arch/x86/kvm/mmu/paging_tmpl.h
+++ b/arch/x86/kvm/mmu/paging_tmpl.h
@@ -911,7 +911,8 @@ static int FNAME(page_fault)(struct kvm_vcpu *vcpu, struct kvm_page_fault *fault
r = RET_PF_RETRY;
write_lock(&vcpu->kvm->mmu_lock);
- if (fault->slot && mmu_notifier_retry_hva(vcpu->kvm, mmu_seq, fault->hva))
+
+ if (is_page_fault_stale(vcpu, fault, mmu_seq))
goto out_unlock;
kvm_mmu_audit(vcpu, AUDIT_PRE_PAGE_FAULT);
diff --git a/arch/x86/kvm/mmu/tdp_iter.c b/arch/x86/kvm/mmu/tdp_iter.c
index b3ed302c1a35..caa96c270b95 100644
--- a/arch/x86/kvm/mmu/tdp_iter.c
+++ b/arch/x86/kvm/mmu/tdp_iter.c
@@ -26,6 +26,7 @@ static gfn_t round_gfn_for_level(gfn_t gfn, int level)
*/
void tdp_iter_restart(struct tdp_iter *iter)
{
+ iter->yielded = false;
iter->yielded_gfn = iter->next_last_level_gfn;
iter->level = iter->root_level;
@@ -160,6 +161,11 @@ static bool try_step_up(struct tdp_iter *iter)
*/
void tdp_iter_next(struct tdp_iter *iter)
{
+ if (iter->yielded) {
+ tdp_iter_restart(iter);
+ return;
+ }
+
if (try_step_down(iter))
return;
diff --git a/arch/x86/kvm/mmu/tdp_iter.h b/arch/x86/kvm/mmu/tdp_iter.h
index b1748b988d3a..e19cabbcb65c 100644
--- a/arch/x86/kvm/mmu/tdp_iter.h
+++ b/arch/x86/kvm/mmu/tdp_iter.h
@@ -45,6 +45,12 @@ struct tdp_iter {
* iterator walks off the end of the paging structure.
*/
bool valid;
+ /*
+ * True if KVM dropped mmu_lock and yielded in the middle of a walk, in
+ * which case tdp_iter_next() needs to restart the walk at the root
+ * level instead of advancing to the next entry.
+ */
+ bool yielded;
};
/*
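
The new iter->yielded field changes the iterator contract documented above: yielding only records the fact, and the restart happens on the next advance. A small userspace sketch of that control flow (toy_iter and its helpers are illustrative names, not kernel code):

#include <stdbool.h>
#include <stdio.h>

struct toy_iter {
	int pos;	/* stand-in for the walk position */
	bool yielded;	/* set when the caller dropped its lock */
};

static void toy_iter_restart(struct toy_iter *it)
{
	it->yielded = false;
	it->pos = 0;	/* back to the "root" of the walk */
}

static void toy_iter_next(struct toy_iter *it)
{
	/* Mirror tdp_iter_next(): restart instead of advancing after a yield. */
	if (it->yielded) {
		toy_iter_restart(it);
		return;
	}
	it->pos++;
}

static bool toy_cond_resched(struct toy_iter *it, bool need_resched)
{
	if (need_resched)
		it->yielded = true;	/* record the yield, do NOT restart here */
	return it->yielded;
}

int main(void)
{
	struct toy_iter it = { .pos = 3, .yielded = false };

	if (toy_cond_resched(&it, true))
		toy_iter_next(&it);	/* the caller "continue"s; the walk restarts */
	printf("pos after yield = %d\n", it.pos);	/* prints 0 */
	return 0;
}
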
diff --git a/arch/x86/kvm/mmu/tdp_mmu.c b/arch/x86/kvm/mmu/tdp_mmu.c
index a54c3491af42..1beb4ca90560 100644
--- a/arch/x86/kvm/mmu/tdp_mmu.c
+++ b/arch/x86/kvm/mmu/tdp_mmu.c
@@ -317,9 +317,6 @@ static void handle_removed_tdp_mmu_page(struct kvm *kvm, tdp_ptep_t pt,
struct kvm_mmu_page *sp = sptep_to_sp(rcu_dereference(pt));
int level = sp->role.level;
gfn_t base_gfn = sp->gfn;
- u64 old_child_spte;
- u64 *sptep;
- gfn_t gfn;
int i;
trace_kvm_mmu_prepare_zap_page(sp);
@@ -327,8 +324,9 @@ static void handle_removed_tdp_mmu_page(struct kvm *kvm, tdp_ptep_t pt,
tdp_mmu_unlink_page(kvm, sp, shared);
for (i = 0; i < PT64_ENT_PER_PAGE; i++) {
- sptep = rcu_dereference(pt) + i;
- gfn = base_gfn + i * KVM_PAGES_PER_HPAGE(level);
+ u64 *sptep = rcu_dereference(pt) + i;
+ gfn_t gfn = base_gfn + i * KVM_PAGES_PER_HPAGE(level);
+ u64 old_child_spte;
if (shared) {
/*
@@ -374,7 +372,7 @@ static void handle_removed_tdp_mmu_page(struct kvm *kvm, tdp_ptep_t pt,
shared);
}
- kvm_flush_remote_tlbs_with_address(kvm, gfn,
+ kvm_flush_remote_tlbs_with_address(kvm, base_gfn,
KVM_PAGES_PER_HPAGE(level + 1));
call_rcu(&sp->rcu_head, tdp_mmu_free_sp_rcu_callback);
@@ -504,6 +502,8 @@ static inline bool tdp_mmu_set_spte_atomic(struct kvm *kvm,
struct tdp_iter *iter,
u64 new_spte)
{
+ WARN_ON_ONCE(iter->yielded);
+
lockdep_assert_held_read(&kvm->mmu_lock);
/*
@@ -577,6 +577,8 @@ static inline void __tdp_mmu_set_spte(struct kvm *kvm, struct tdp_iter *iter,
u64 new_spte, bool record_acc_track,
bool record_dirty_log)
{
+ WARN_ON_ONCE(iter->yielded);
+
lockdep_assert_held_write(&kvm->mmu_lock);
/*
@@ -642,18 +644,19 @@ static inline void tdp_mmu_set_spte_no_dirty_log(struct kvm *kvm,
* If this function should yield and flush is set, it will perform a remote
* TLB flush before yielding.
*
- * If this function yields, it will also reset the tdp_iter's walk over the
- * paging structure and the calling function should skip to the next
- * iteration to allow the iterator to continue its traversal from the
- * paging structure root.
+ * If this function yields, iter->yielded is set and the caller must skip to
+ * the next iteration, where tdp_iter_next() will reset the tdp_iter's walk
+ * over the paging structures to allow the iterator to continue its traversal
+ * from the paging structure root.
*
- * Return true if this function yielded and the iterator's traversal was reset.
- * Return false if a yield was not needed.
+ * Returns true if this function yielded.
*/
-static inline bool tdp_mmu_iter_cond_resched(struct kvm *kvm,
- struct tdp_iter *iter, bool flush,
- bool shared)
+static inline bool __must_check tdp_mmu_iter_cond_resched(struct kvm *kvm,
+ struct tdp_iter *iter,
+ bool flush, bool shared)
{
+ WARN_ON(iter->yielded);
+
/* Ensure forward progress has been made before yielding. */
if (iter->next_last_level_gfn == iter->yielded_gfn)
return false;
@@ -673,12 +676,10 @@ static inline bool tdp_mmu_iter_cond_resched(struct kvm *kvm,
WARN_ON(iter->gfn > iter->next_last_level_gfn);
- tdp_iter_restart(iter);
-
- return true;
+ iter->yielded = true;
}
- return false;
+ return iter->yielded;
}
/*
@@ -1033,9 +1034,9 @@ bool kvm_tdp_mmu_unmap_gfn_range(struct kvm *kvm, struct kvm_gfn_range *range,
{
struct kvm_mmu_page *root;
- for_each_tdp_mmu_root(kvm, root, range->slot->as_id)
- flush |= zap_gfn_range(kvm, root, range->start, range->end,
- range->may_block, flush, false);
+ for_each_tdp_mmu_root_yield_safe(kvm, root, range->slot->as_id, false)
+ flush = zap_gfn_range(kvm, root, range->start, range->end,
+ range->may_block, flush, false);
return flush;
}
@@ -1364,10 +1365,9 @@ void kvm_tdp_mmu_clear_dirty_pt_masked(struct kvm *kvm,
* Clear leaf entries which could be replaced by large mappings, for
* GFNs within the slot.
*/
-static bool zap_collapsible_spte_range(struct kvm *kvm,
+static void zap_collapsible_spte_range(struct kvm *kvm,
struct kvm_mmu_page *root,
- const struct kvm_memory_slot *slot,
- bool flush)
+ const struct kvm_memory_slot *slot)
{
gfn_t start = slot->base_gfn;
gfn_t end = start + slot->npages;
@@ -1378,10 +1378,8 @@ static bool zap_collapsible_spte_range(struct kvm *kvm,
tdp_root_for_each_pte(iter, root, start, end) {
retry:
- if (tdp_mmu_iter_cond_resched(kvm, &iter, flush, true)) {
- flush = false;
+ if (tdp_mmu_iter_cond_resched(kvm, &iter, false, true))
continue;
- }
if (!is_shadow_present_pte(iter.old_spte) ||
!is_last_spte(iter.old_spte, iter.level))
@@ -1393,6 +1391,7 @@ retry:
pfn, PG_LEVEL_NUM))
continue;
+ /* Note, a successful atomic zap also does a remote TLB flush. */
if (!tdp_mmu_zap_spte_atomic(kvm, &iter)) {
/*
* The iter must explicitly re-read the SPTE because
@@ -1401,30 +1400,24 @@ retry:
iter.old_spte = READ_ONCE(*rcu_dereference(iter.sptep));
goto retry;
}
- flush = true;
}
rcu_read_unlock();
-
- return flush;
}
/*
* Clear non-leaf entries (and free associated page tables) which could
* be replaced by large mappings, for GFNs within the slot.
*/
-bool kvm_tdp_mmu_zap_collapsible_sptes(struct kvm *kvm,
- const struct kvm_memory_slot *slot,
- bool flush)
+void kvm_tdp_mmu_zap_collapsible_sptes(struct kvm *kvm,
+ const struct kvm_memory_slot *slot)
{
struct kvm_mmu_page *root;
lockdep_assert_held_read(&kvm->mmu_lock);
for_each_tdp_mmu_root_yield_safe(kvm, root, slot->as_id, true)
- flush = zap_collapsible_spte_range(kvm, root, slot, flush);
-
- return flush;
+ zap_collapsible_spte_range(kvm, root, slot);
}
/*
diff --git a/arch/x86/kvm/mmu/tdp_mmu.h b/arch/x86/kvm/mmu/tdp_mmu.h
index 476b133544dd..3899004a5d91 100644
--- a/arch/x86/kvm/mmu/tdp_mmu.h
+++ b/arch/x86/kvm/mmu/tdp_mmu.h
@@ -64,9 +64,8 @@ void kvm_tdp_mmu_clear_dirty_pt_masked(struct kvm *kvm,
struct kvm_memory_slot *slot,
gfn_t gfn, unsigned long mask,
bool wrprot);
-bool kvm_tdp_mmu_zap_collapsible_sptes(struct kvm *kvm,
- const struct kvm_memory_slot *slot,
- bool flush);
+void kvm_tdp_mmu_zap_collapsible_sptes(struct kvm *kvm,
+ const struct kvm_memory_slot *slot);
bool kvm_tdp_mmu_write_protect_gfn(struct kvm *kvm,
struct kvm_memory_slot *slot, gfn_t gfn,
diff --git a/arch/x86/kvm/svm/avic.c b/arch/x86/kvm/svm/avic.c
index affc0ea98d30..8f9af7b7dbbe 100644
--- a/arch/x86/kvm/svm/avic.c
+++ b/arch/x86/kvm/svm/avic.c
@@ -900,6 +900,7 @@ out:
bool svm_check_apicv_inhibit_reasons(ulong bit)
{
ulong supported = BIT(APICV_INHIBIT_REASON_DISABLE) |
+ BIT(APICV_INHIBIT_REASON_ABSENT) |
BIT(APICV_INHIBIT_REASON_HYPERV) |
BIT(APICV_INHIBIT_REASON_NESTED) |
BIT(APICV_INHIBIT_REASON_IRQWIN) |
@@ -989,16 +990,18 @@ void avic_vcpu_put(struct kvm_vcpu *vcpu)
static void avic_set_running(struct kvm_vcpu *vcpu, bool is_run)
{
struct vcpu_svm *svm = to_svm(vcpu);
+ int cpu = get_cpu();
+ WARN_ON(cpu != vcpu->cpu);
svm->avic_is_running = is_run;
- if (!kvm_vcpu_apicv_active(vcpu))
- return;
-
- if (is_run)
- avic_vcpu_load(vcpu, vcpu->cpu);
- else
- avic_vcpu_put(vcpu);
+ if (kvm_vcpu_apicv_active(vcpu)) {
+ if (is_run)
+ avic_vcpu_load(vcpu, cpu);
+ else
+ avic_vcpu_put(vcpu);
+ }
+ put_cpu();
}
void svm_vcpu_blocking(struct kvm_vcpu *vcpu)
diff --git a/arch/x86/kvm/svm/pmu.c b/arch/x86/kvm/svm/pmu.c
index 871c426ec389..b4095dfeeee6 100644
--- a/arch/x86/kvm/svm/pmu.c
+++ b/arch/x86/kvm/svm/pmu.c
@@ -281,7 +281,7 @@ static void amd_pmu_refresh(struct kvm_vcpu *vcpu)
pmu->nr_arch_gp_counters = AMD64_NUM_COUNTERS;
pmu->counter_bitmask[KVM_PMC_GP] = ((u64)1 << 48) - 1;
- pmu->reserved_bits = 0xffffffff00200000ull;
+ pmu->reserved_bits = 0xfffffff000280000ull;
pmu->version = 1;
/* not applicable to AMD; but clean them to prevent any fall out */
pmu->counter_bitmask[KVM_PMC_FIXED] = 0;
diff --git a/arch/x86/kvm/svm/sev.c b/arch/x86/kvm/svm/sev.c
index 21ac0a5de4e0..be2883141220 100644
--- a/arch/x86/kvm/svm/sev.c
+++ b/arch/x86/kvm/svm/sev.c
@@ -1543,28 +1543,50 @@ static bool is_cmd_allowed_from_mirror(u32 cmd_id)
return false;
}
-static int sev_lock_for_migration(struct kvm *kvm)
+static int sev_lock_two_vms(struct kvm *dst_kvm, struct kvm *src_kvm)
{
- struct kvm_sev_info *sev = &to_kvm_svm(kvm)->sev_info;
+ struct kvm_sev_info *dst_sev = &to_kvm_svm(dst_kvm)->sev_info;
+ struct kvm_sev_info *src_sev = &to_kvm_svm(src_kvm)->sev_info;
+ int r = -EBUSY;
+
+ if (dst_kvm == src_kvm)
+ return -EINVAL;
/*
- * Bail if this VM is already involved in a migration to avoid deadlock
- * between two VMs trying to migrate to/from each other.
+ * Bail if these VMs are already involved in a migration to avoid
+ * deadlock between two VMs trying to migrate to/from each other.
*/
- if (atomic_cmpxchg_acquire(&sev->migration_in_progress, 0, 1))
+ if (atomic_cmpxchg_acquire(&dst_sev->migration_in_progress, 0, 1))
return -EBUSY;
- mutex_lock(&kvm->lock);
+ if (atomic_cmpxchg_acquire(&src_sev->migration_in_progress, 0, 1))
+ goto release_dst;
+ r = -EINTR;
+ if (mutex_lock_killable(&dst_kvm->lock))
+ goto release_src;
+ if (mutex_lock_killable_nested(&src_kvm->lock, SINGLE_DEPTH_NESTING))
+ goto unlock_dst;
return 0;
+
+unlock_dst:
+ mutex_unlock(&dst_kvm->lock);
+release_src:
+ atomic_set_release(&src_sev->migration_in_progress, 0);
+release_dst:
+ atomic_set_release(&dst_sev->migration_in_progress, 0);
+ return r;
}
-static void sev_unlock_after_migration(struct kvm *kvm)
+static void sev_unlock_two_vms(struct kvm *dst_kvm, struct kvm *src_kvm)
{
- struct kvm_sev_info *sev = &to_kvm_svm(kvm)->sev_info;
+ struct kvm_sev_info *dst_sev = &to_kvm_svm(dst_kvm)->sev_info;
+ struct kvm_sev_info *src_sev = &to_kvm_svm(src_kvm)->sev_info;
- mutex_unlock(&kvm->lock);
- atomic_set_release(&sev->migration_in_progress, 0);
+ mutex_unlock(&dst_kvm->lock);
+ mutex_unlock(&src_kvm->lock);
+ atomic_set_release(&dst_sev->migration_in_progress, 0);
+ atomic_set_release(&src_sev->migration_in_progress, 0);
}
@@ -1607,14 +1629,15 @@ static void sev_migrate_from(struct kvm_sev_info *dst,
dst->asid = src->asid;
dst->handle = src->handle;
dst->pages_locked = src->pages_locked;
+ dst->enc_context_owner = src->enc_context_owner;
src->asid = 0;
src->active = false;
src->handle = 0;
src->pages_locked = 0;
+ src->enc_context_owner = NULL;
- INIT_LIST_HEAD(&dst->regions_list);
- list_replace_init(&src->regions_list, &dst->regions_list);
+ list_cut_before(&dst->regions_list, &src->regions_list, &src->regions_list);
}
static int sev_es_migrate_from(struct kvm *dst, struct kvm *src)
@@ -1666,15 +1689,6 @@ int svm_vm_migrate_from(struct kvm *kvm, unsigned int source_fd)
bool charged = false;
int ret;
- ret = sev_lock_for_migration(kvm);
- if (ret)
- return ret;
-
- if (sev_guest(kvm)) {
- ret = -EINVAL;
- goto out_unlock;
- }
-
source_kvm_file = fget(source_fd);
if (!file_is_kvm(source_kvm_file)) {
ret = -EBADF;
@@ -1682,16 +1696,26 @@ int svm_vm_migrate_from(struct kvm *kvm, unsigned int source_fd)
}
source_kvm = source_kvm_file->private_data;
- ret = sev_lock_for_migration(source_kvm);
+ ret = sev_lock_two_vms(kvm, source_kvm);
if (ret)
goto out_fput;
- if (!sev_guest(source_kvm)) {
+ if (sev_guest(kvm) || !sev_guest(source_kvm)) {
ret = -EINVAL;
- goto out_source;
+ goto out_unlock;
}
src_sev = &to_kvm_svm(source_kvm)->sev_info;
+
+ /*
+ * VMs mirroring src's encryption context rely on it to keep the
+ * ASID allocated, but below we are clearing src_sev->asid.
+ */
+ if (src_sev->num_mirrored_vms) {
+ ret = -EBUSY;
+ goto out_unlock;
+ }
+
dst_sev->misc_cg = get_current_misc_cg();
cg_cleanup_sev = dst_sev;
if (dst_sev->misc_cg != src_sev->misc_cg) {
@@ -1728,13 +1752,11 @@ out_dst_cgroup:
sev_misc_cg_uncharge(cg_cleanup_sev);
put_misc_cg(cg_cleanup_sev->misc_cg);
cg_cleanup_sev->misc_cg = NULL;
-out_source:
- sev_unlock_after_migration(source_kvm);
+out_unlock:
+ sev_unlock_two_vms(kvm, source_kvm);
out_fput:
if (source_kvm_file)
fput(source_kvm_file);
-out_unlock:
- sev_unlock_after_migration(kvm);
return ret;
}
@@ -1953,76 +1975,60 @@ int svm_vm_copy_asid_from(struct kvm *kvm, unsigned int source_fd)
{
struct file *source_kvm_file;
struct kvm *source_kvm;
- struct kvm_sev_info source_sev, *mirror_sev;
+ struct kvm_sev_info *source_sev, *mirror_sev;
int ret;
source_kvm_file = fget(source_fd);
if (!file_is_kvm(source_kvm_file)) {
ret = -EBADF;
- goto e_source_put;
+ goto e_source_fput;
}
source_kvm = source_kvm_file->private_data;
- mutex_lock(&source_kvm->lock);
-
- if (!sev_guest(source_kvm)) {
- ret = -EINVAL;
- goto e_source_unlock;
- }
+ ret = sev_lock_two_vms(kvm, source_kvm);
+ if (ret)
+ goto e_source_fput;
- /* Mirrors of mirrors should work, but let's not get silly */
- if (is_mirroring_enc_context(source_kvm) || source_kvm == kvm) {
+ /*
+ * Mirrors of mirrors should work, but let's not get silly. Also
+ * disallow out-of-band SEV/SEV-ES init if the target is already an
+ * SEV guest, or if vCPUs have been created. KVM relies on vCPUs being
+ * created after SEV/SEV-ES initialization, e.g. to init intercepts.
+ */
+ if (sev_guest(kvm) || !sev_guest(source_kvm) ||
+ is_mirroring_enc_context(source_kvm) || kvm->created_vcpus) {
ret = -EINVAL;
- goto e_source_unlock;
+ goto e_unlock;
}
- memcpy(&source_sev, &to_kvm_svm(source_kvm)->sev_info,
- sizeof(source_sev));
-
/*
* The mirror kvm holds an enc_context_owner ref so its asid can't
* disappear until we're done with it
*/
+ source_sev = &to_kvm_svm(source_kvm)->sev_info;
kvm_get_kvm(source_kvm);
-
- fput(source_kvm_file);
- mutex_unlock(&source_kvm->lock);
- mutex_lock(&kvm->lock);
-
- /*
- * Disallow out-of-band SEV/SEV-ES init if the target is already an
- * SEV guest, or if vCPUs have been created. KVM relies on vCPUs being
- * created after SEV/SEV-ES initialization, e.g. to init intercepts.
- */
- if (sev_guest(kvm) || kvm->created_vcpus) {
- ret = -EINVAL;
- goto e_mirror_unlock;
- }
+ source_sev->num_mirrored_vms++;
/* Set enc_context_owner and copy its encryption context over */
mirror_sev = &to_kvm_svm(kvm)->sev_info;
mirror_sev->enc_context_owner = source_kvm;
mirror_sev->active = true;
- mirror_sev->asid = source_sev.asid;
- mirror_sev->fd = source_sev.fd;
- mirror_sev->es_active = source_sev.es_active;
- mirror_sev->handle = source_sev.handle;
+ mirror_sev->asid = source_sev->asid;
+ mirror_sev->fd = source_sev->fd;
+ mirror_sev->es_active = source_sev->es_active;
+ mirror_sev->handle = source_sev->handle;
+ INIT_LIST_HEAD(&mirror_sev->regions_list);
+ ret = 0;
+
/*
	 * Do not copy ap_jump_table, since the mirror does not share the same
	 * KVM context as the original and the two may have different memory
	 * views.
*/
- mutex_unlock(&kvm->lock);
- return 0;
-
-e_mirror_unlock:
- mutex_unlock(&kvm->lock);
- kvm_put_kvm(source_kvm);
- return ret;
-e_source_unlock:
- mutex_unlock(&source_kvm->lock);
-e_source_put:
+e_unlock:
+ sev_unlock_two_vms(kvm, source_kvm);
+e_source_fput:
if (source_kvm_file)
fput(source_kvm_file);
return ret;
@@ -2034,17 +2040,24 @@ void sev_vm_destroy(struct kvm *kvm)
struct list_head *head = &sev->regions_list;
struct list_head *pos, *q;
+ WARN_ON(sev->num_mirrored_vms);
+
if (!sev_guest(kvm))
return;
/* If this is a mirror_kvm release the enc_context_owner and skip sev cleanup */
if (is_mirroring_enc_context(kvm)) {
- kvm_put_kvm(sev->enc_context_owner);
+ struct kvm *owner_kvm = sev->enc_context_owner;
+ struct kvm_sev_info *owner_sev = &to_kvm_svm(owner_kvm)->sev_info;
+
+ mutex_lock(&owner_kvm->lock);
+ if (!WARN_ON(!owner_sev->num_mirrored_vms))
+ owner_sev->num_mirrored_vms--;
+ mutex_unlock(&owner_kvm->lock);
+ kvm_put_kvm(owner_kvm);
return;
}
- mutex_lock(&kvm->lock);
-
/*
* Ensure that all guest tagged cache entries are flushed before
* releasing the pages back to the system for use. CLFLUSH will
@@ -2064,8 +2077,6 @@ void sev_vm_destroy(struct kvm *kvm)
}
}
- mutex_unlock(&kvm->lock);
-
sev_unbind_asid(kvm, sev->handle);
sev_asid_free(sev);
}
@@ -2249,7 +2260,7 @@ void sev_free_vcpu(struct kvm_vcpu *vcpu)
__free_page(virt_to_page(svm->sev_es.vmsa));
if (svm->sev_es.ghcb_sa_free)
- kfree(svm->sev_es.ghcb_sa);
+ kvfree(svm->sev_es.ghcb_sa);
}
static void dump_ghcb(struct vcpu_svm *svm)
@@ -2341,24 +2352,29 @@ static void sev_es_sync_from_ghcb(struct vcpu_svm *svm)
memset(ghcb->save.valid_bitmap, 0, sizeof(ghcb->save.valid_bitmap));
}
-static int sev_es_validate_vmgexit(struct vcpu_svm *svm)
+static bool sev_es_validate_vmgexit(struct vcpu_svm *svm)
{
struct kvm_vcpu *vcpu;
struct ghcb *ghcb;
- u64 exit_code = 0;
+ u64 exit_code;
+ u64 reason;
ghcb = svm->sev_es.ghcb;
- /* Only GHCB Usage code 0 is supported */
- if (ghcb->ghcb_usage)
- goto vmgexit_err;
-
/*
- * Retrieve the exit code now even though is may not be marked valid
+ * Retrieve the exit code now even though it may not be marked valid
* as it could help with debugging.
*/
exit_code = ghcb_get_sw_exit_code(ghcb);
+ /* Only GHCB Usage code 0 is supported */
+ if (ghcb->ghcb_usage) {
+ reason = GHCB_ERR_INVALID_USAGE;
+ goto vmgexit_err;
+ }
+
+ reason = GHCB_ERR_MISSING_INPUT;
+
if (!ghcb_sw_exit_code_is_valid(ghcb) ||
!ghcb_sw_exit_info_1_is_valid(ghcb) ||
!ghcb_sw_exit_info_2_is_valid(ghcb))
@@ -2437,30 +2453,34 @@ static int sev_es_validate_vmgexit(struct vcpu_svm *svm)
case SVM_VMGEXIT_UNSUPPORTED_EVENT:
break;
default:
+ reason = GHCB_ERR_INVALID_EVENT;
goto vmgexit_err;
}
- return 0;
+ return true;
vmgexit_err:
vcpu = &svm->vcpu;
- if (ghcb->ghcb_usage) {
+ if (reason == GHCB_ERR_INVALID_USAGE) {
vcpu_unimpl(vcpu, "vmgexit: ghcb usage %#x is not valid\n",
ghcb->ghcb_usage);
+ } else if (reason == GHCB_ERR_INVALID_EVENT) {
+ vcpu_unimpl(vcpu, "vmgexit: exit code %#llx is not valid\n",
+ exit_code);
} else {
- vcpu_unimpl(vcpu, "vmgexit: exit reason %#llx is not valid\n",
+ vcpu_unimpl(vcpu, "vmgexit: exit code %#llx input is not valid\n",
exit_code);
dump_ghcb(svm);
}
- vcpu->run->exit_reason = KVM_EXIT_INTERNAL_ERROR;
- vcpu->run->internal.suberror = KVM_INTERNAL_ERROR_UNEXPECTED_EXIT_REASON;
- vcpu->run->internal.ndata = 2;
- vcpu->run->internal.data[0] = exit_code;
- vcpu->run->internal.data[1] = vcpu->arch.last_vmentry_cpu;
+ /* Clear the valid entries fields */
+ memset(ghcb->save.valid_bitmap, 0, sizeof(ghcb->save.valid_bitmap));
+
+ ghcb_set_sw_exit_info_1(ghcb, 2);
+ ghcb_set_sw_exit_info_2(ghcb, reason);
- return -EINVAL;
+ return false;
}
void sev_es_unmap_ghcb(struct vcpu_svm *svm)
@@ -2482,7 +2502,7 @@ void sev_es_unmap_ghcb(struct vcpu_svm *svm)
svm->sev_es.ghcb_sa_sync = false;
}
- kfree(svm->sev_es.ghcb_sa);
+ kvfree(svm->sev_es.ghcb_sa);
svm->sev_es.ghcb_sa = NULL;
svm->sev_es.ghcb_sa_free = false;
}
@@ -2530,14 +2550,14 @@ static bool setup_vmgexit_scratch(struct vcpu_svm *svm, bool sync, u64 len)
scratch_gpa_beg = ghcb_get_sw_scratch(ghcb);
if (!scratch_gpa_beg) {
pr_err("vmgexit: scratch gpa not provided\n");
- return false;
+ goto e_scratch;
}
scratch_gpa_end = scratch_gpa_beg + len;
if (scratch_gpa_end < scratch_gpa_beg) {
pr_err("vmgexit: scratch length (%#llx) not valid for scratch address (%#llx)\n",
len, scratch_gpa_beg);
- return false;
+ goto e_scratch;
}
if ((scratch_gpa_beg & PAGE_MASK) == control->ghcb_gpa) {
@@ -2555,7 +2575,7 @@ static bool setup_vmgexit_scratch(struct vcpu_svm *svm, bool sync, u64 len)
scratch_gpa_end > ghcb_scratch_end) {
pr_err("vmgexit: scratch area is outside of GHCB shared buffer area (%#llx - %#llx)\n",
scratch_gpa_beg, scratch_gpa_end);
- return false;
+ goto e_scratch;
}
scratch_va = (void *)svm->sev_es.ghcb;
@@ -2568,18 +2588,18 @@ static bool setup_vmgexit_scratch(struct vcpu_svm *svm, bool sync, u64 len)
if (len > GHCB_SCRATCH_AREA_LIMIT) {
pr_err("vmgexit: scratch area exceeds KVM limits (%#llx requested, %#llx limit)\n",
len, GHCB_SCRATCH_AREA_LIMIT);
- return false;
+ goto e_scratch;
}
- scratch_va = kzalloc(len, GFP_KERNEL_ACCOUNT);
+ scratch_va = kvzalloc(len, GFP_KERNEL_ACCOUNT);
if (!scratch_va)
- return false;
+ goto e_scratch;
if (kvm_read_guest(svm->vcpu.kvm, scratch_gpa_beg, scratch_va, len)) {
/* Unable to copy scratch area from guest */
pr_err("vmgexit: kvm_read_guest for scratch area failed\n");
- kfree(scratch_va);
- return false;
+ kvfree(scratch_va);
+ goto e_scratch;
}
/*
@@ -2596,6 +2616,12 @@ static bool setup_vmgexit_scratch(struct vcpu_svm *svm, bool sync, u64 len)
svm->sev_es.ghcb_sa_len = len;
return true;
+
+e_scratch:
+ ghcb_set_sw_exit_info_1(ghcb, 2);
+ ghcb_set_sw_exit_info_2(ghcb, GHCB_ERR_INVALID_SCRATCH_AREA);
+
+ return false;
}
static void set_ghcb_msr_bits(struct vcpu_svm *svm, u64 value, u64 mask,
@@ -2646,7 +2672,7 @@ static int sev_handle_vmgexit_msr_protocol(struct vcpu_svm *svm)
ret = svm_invoke_exit_handler(vcpu, SVM_EXIT_CPUID);
if (!ret) {
- ret = -EINVAL;
+ /* Error, keep GHCB MSR value as-is */
break;
}
@@ -2682,10 +2708,13 @@ static int sev_handle_vmgexit_msr_protocol(struct vcpu_svm *svm)
GHCB_MSR_TERM_REASON_POS);
pr_info("SEV-ES guest requested termination: %#llx:%#llx\n",
reason_set, reason_code);
- fallthrough;
+
+ ret = -EINVAL;
+ break;
}
default:
- ret = -EINVAL;
+ /* Error, keep GHCB MSR value as-is */
+ break;
}
trace_kvm_vmgexit_msr_protocol_exit(svm->vcpu.vcpu_id,
@@ -2709,14 +2738,18 @@ int sev_handle_vmgexit(struct kvm_vcpu *vcpu)
if (!ghcb_gpa) {
vcpu_unimpl(vcpu, "vmgexit: GHCB gpa is not set\n");
- return -EINVAL;
+
+ /* Without a GHCB, just return right back to the guest */
+ return 1;
}
if (kvm_vcpu_map(vcpu, ghcb_gpa >> PAGE_SHIFT, &svm->sev_es.ghcb_map)) {
/* Unable to map GHCB from guest */
vcpu_unimpl(vcpu, "vmgexit: error mapping GHCB [%#llx] from guest\n",
ghcb_gpa);
- return -EINVAL;
+
+ /* Without a GHCB, just return right back to the guest */
+ return 1;
}
svm->sev_es.ghcb = svm->sev_es.ghcb_map.hva;
@@ -2726,15 +2759,14 @@ int sev_handle_vmgexit(struct kvm_vcpu *vcpu)
exit_code = ghcb_get_sw_exit_code(ghcb);
- ret = sev_es_validate_vmgexit(svm);
- if (ret)
- return ret;
+ if (!sev_es_validate_vmgexit(svm))
+ return 1;
sev_es_sync_from_ghcb(svm);
ghcb_set_sw_exit_info_1(ghcb, 0);
ghcb_set_sw_exit_info_2(ghcb, 0);
- ret = -EINVAL;
+ ret = 1;
switch (exit_code) {
case SVM_VMGEXIT_MMIO_READ:
if (!setup_vmgexit_scratch(svm, true, control->exit_info_2))
@@ -2775,20 +2807,17 @@ int sev_handle_vmgexit(struct kvm_vcpu *vcpu)
default:
pr_err("svm: vmgexit: unsupported AP jump table request - exit_info_1=%#llx\n",
control->exit_info_1);
- ghcb_set_sw_exit_info_1(ghcb, 1);
- ghcb_set_sw_exit_info_2(ghcb,
- X86_TRAP_UD |
- SVM_EVTINJ_TYPE_EXEPT |
- SVM_EVTINJ_VALID);
+ ghcb_set_sw_exit_info_1(ghcb, 2);
+ ghcb_set_sw_exit_info_2(ghcb, GHCB_ERR_INVALID_INPUT);
}
- ret = 1;
break;
}
case SVM_VMGEXIT_UNSUPPORTED_EVENT:
vcpu_unimpl(vcpu,
"vmgexit: unsupported event - exit_info_1=%#llx, exit_info_2=%#llx\n",
control->exit_info_1, control->exit_info_2);
+ ret = -EINVAL;
break;
default:
ret = svm_invoke_exit_handler(vcpu, exit_code);
@@ -2810,7 +2839,7 @@ int sev_es_string_io(struct vcpu_svm *svm, int size, unsigned int port, int in)
return -EINVAL;
if (!setup_vmgexit_scratch(svm, in, bytes))
- return -EINVAL;
+ return 1;
return kvm_sev_es_string_io(&svm->vcpu, size, port, svm->sev_es.ghcb_sa,
count, in);
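
A userspace sketch of the locking scheme introduced by sev_lock_two_vms() above, using pthreads and C11 atomics as stand-ins for kvm->lock and the migration_in_progress flag (toy_vm, lock_two_vms, etc. are illustrative names, not KVM code). Each failure path releases exactly what was already taken, in reverse acquisition order.

#include <pthread.h>
#include <stdatomic.h>
#include <stdio.h>

struct toy_vm {
	atomic_int migration_in_progress;	/* stand-in for sev->migration_in_progress */
	pthread_mutex_t lock;			/* stand-in for kvm->lock */
};

static int lock_two_vms(struct toy_vm *dst, struct toy_vm *src)
{
	int expected = 0;

	if (dst == src)
		return -1;

	/* Claim the destination first, then the source... */
	if (!atomic_compare_exchange_strong(&dst->migration_in_progress,
					    &expected, 1))
		return -1;
	expected = 0;
	if (!atomic_compare_exchange_strong(&src->migration_in_progress,
					    &expected, 1))
		goto release_dst;

	/* ...then take the mutexes in the same order. */
	if (pthread_mutex_lock(&dst->lock))
		goto release_src;
	if (pthread_mutex_lock(&src->lock))
		goto unlock_dst;
	return 0;

unlock_dst:
	pthread_mutex_unlock(&dst->lock);
release_src:
	atomic_store(&src->migration_in_progress, 0);
release_dst:
	atomic_store(&dst->migration_in_progress, 0);
	return -1;
}

static void unlock_two_vms(struct toy_vm *dst, struct toy_vm *src)
{
	pthread_mutex_unlock(&dst->lock);
	pthread_mutex_unlock(&src->lock);
	atomic_store(&src->migration_in_progress, 0);
	atomic_store(&dst->migration_in_progress, 0);
}

int main(void)
{
	static struct toy_vm a = { .lock = PTHREAD_MUTEX_INITIALIZER };
	static struct toy_vm b = { .lock = PTHREAD_MUTEX_INITIALIZER };

	if (!lock_two_vms(&a, &b)) {
		printf("both VMs locked\n");
		unlock_two_vms(&a, &b);
	}
	return 0;
}
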
diff --git a/arch/x86/kvm/svm/svm.c b/arch/x86/kvm/svm/svm.c
index 5630c241d5f6..5151efa424ac 100644
--- a/arch/x86/kvm/svm/svm.c
+++ b/arch/x86/kvm/svm/svm.c
@@ -1585,6 +1585,15 @@ static void svm_set_rflags(struct kvm_vcpu *vcpu, unsigned long rflags)
to_svm(vcpu)->vmcb->save.rflags = rflags;
}
+static bool svm_get_if_flag(struct kvm_vcpu *vcpu)
+{
+ struct vmcb *vmcb = to_svm(vcpu)->vmcb;
+
+ return sev_es_guest(vcpu->kvm)
+ ? vmcb->control.int_state & SVM_GUEST_INTERRUPT_MASK
+ : kvm_get_rflags(vcpu) & X86_EFLAGS_IF;
+}
+
static void svm_cache_reg(struct kvm_vcpu *vcpu, enum kvm_reg reg)
{
switch (reg) {
@@ -3568,14 +3577,7 @@ bool svm_interrupt_blocked(struct kvm_vcpu *vcpu)
if (!gif_set(svm))
return true;
- if (sev_es_guest(vcpu->kvm)) {
- /*
- * SEV-ES guests to not expose RFLAGS. Use the VMCB interrupt mask
- * bit to determine the state of the IF flag.
- */
- if (!(vmcb->control.int_state & SVM_GUEST_INTERRUPT_MASK))
- return true;
- } else if (is_guest_mode(vcpu)) {
+ if (is_guest_mode(vcpu)) {
/* As long as interrupts are being delivered... */
if ((svm->nested.ctl.int_ctl & V_INTR_MASKING_MASK)
? !(svm->vmcb01.ptr->save.rflags & X86_EFLAGS_IF)
@@ -3586,7 +3588,7 @@ bool svm_interrupt_blocked(struct kvm_vcpu *vcpu)
if (nested_exit_on_intr(svm))
return false;
} else {
- if (!(kvm_get_rflags(vcpu) & X86_EFLAGS_IF))
+ if (!svm_get_if_flag(vcpu))
return true;
}
@@ -4621,6 +4623,7 @@ static struct kvm_x86_ops svm_x86_ops __initdata = {
.cache_reg = svm_cache_reg,
.get_rflags = svm_get_rflags,
.set_rflags = svm_set_rflags,
+ .get_if_flag = svm_get_if_flag,
.tlb_flush_all = svm_flush_tlb,
.tlb_flush_current = svm_flush_tlb,
@@ -4651,7 +4654,6 @@ static struct kvm_x86_ops svm_x86_ops __initdata = {
.load_eoi_exitmap = svm_load_eoi_exitmap,
.hwapic_irr_update = svm_hwapic_irr_update,
.hwapic_isr_update = svm_hwapic_isr_update,
- .sync_pir_to_irr = kvm_lapic_find_highest_irr,
.apicv_post_state_restore = avic_post_state_restore,
.set_tss_addr = svm_set_tss_addr,
diff --git a/arch/x86/kvm/svm/svm.h b/arch/x86/kvm/svm/svm.h
index 5faad3dc10e2..1c7306c370fa 100644
--- a/arch/x86/kvm/svm/svm.h
+++ b/arch/x86/kvm/svm/svm.h
@@ -79,6 +79,7 @@ struct kvm_sev_info {
struct list_head regions_list; /* List of registered regions */
u64 ap_jump_table; /* SEV-ES AP Jump Table address */
struct kvm *enc_context_owner; /* Owner of copied encryption context */
+ unsigned long num_mirrored_vms; /* Number of VMs sharing this ASID */
struct misc_cg *misc_cg; /* For misc cgroup accounting */
atomic_t migration_in_progress;
};
diff --git a/arch/x86/kvm/vmx/nested.c b/arch/x86/kvm/vmx/nested.c
index 1e2f66951566..9c941535f78c 100644
--- a/arch/x86/kvm/vmx/nested.c
+++ b/arch/x86/kvm/vmx/nested.c
@@ -1162,29 +1162,26 @@ static void nested_vmx_transition_tlb_flush(struct kvm_vcpu *vcpu,
WARN_ON(!enable_vpid);
/*
- * If VPID is enabled and used by vmc12, but L2 does not have a unique
- * TLB tag (ASID), i.e. EPT is disabled and KVM was unable to allocate
- * a VPID for L2, flush the current context as the effective ASID is
- * common to both L1 and L2.
- *
- * Defer the flush so that it runs after vmcs02.EPTP has been set by
- * KVM_REQ_LOAD_MMU_PGD (if nested EPT is enabled) and to avoid
- * redundant flushes further down the nested pipeline.
- *
- * If a TLB flush isn't required due to any of the above, and vpid12 is
- * changing then the new "virtual" VPID (vpid12) will reuse the same
- * "real" VPID (vpid02), and so needs to be flushed. There's no direct
- * mapping between vpid02 and vpid12, vpid02 is per-vCPU and reused for
- * all nested vCPUs. Remember, a flush on VM-Enter does not invalidate
- * guest-physical mappings, so there is no need to sync the nEPT MMU.
+ * VPID is enabled and in use by vmcs12. If vpid12 is changing, then
+ * emulate a guest TLB flush as KVM does not track vpid12 history nor
+ * is the VPID incorporated into the MMU context. I.e. KVM must assume
+ * that the new vpid12 has never been used and thus represents a new
+ * guest ASID that cannot have entries in the TLB.
*/
- if (!nested_has_guest_tlb_tag(vcpu)) {
- kvm_make_request(KVM_REQ_TLB_FLUSH_CURRENT, vcpu);
- } else if (is_vmenter &&
- vmcs12->virtual_processor_id != vmx->nested.last_vpid) {
+ if (is_vmenter && vmcs12->virtual_processor_id != vmx->nested.last_vpid) {
vmx->nested.last_vpid = vmcs12->virtual_processor_id;
- vpid_sync_context(nested_get_vpid02(vcpu));
+ kvm_make_request(KVM_REQ_TLB_FLUSH_GUEST, vcpu);
+ return;
}
+
+ /*
+ * If VPID is enabled, used by vmcs12, and vpid12 is not changing but
+ * does not have a unique TLB tag (ASID), i.e. EPT is disabled and
+ * KVM was unable to allocate a VPID for L2, flush the current context
+ * as the effective ASID is common to both L1 and L2.
+ */
+ if (!nested_has_guest_tlb_tag(vcpu))
+ kvm_make_request(KVM_REQ_TLB_FLUSH_CURRENT, vcpu);
}
static bool is_bitwise_subset(u64 superset, u64 subset, u64 mask)
@@ -2594,8 +2591,10 @@ static int prepare_vmcs02(struct kvm_vcpu *vcpu, struct vmcs12 *vmcs12,
if ((vmcs12->vm_entry_controls & VM_ENTRY_LOAD_IA32_PERF_GLOBAL_CTRL) &&
WARN_ON_ONCE(kvm_set_msr(vcpu, MSR_CORE_PERF_GLOBAL_CTRL,
- vmcs12->guest_ia32_perf_global_ctrl)))
+ vmcs12->guest_ia32_perf_global_ctrl))) {
+ *entry_failure_code = ENTRY_FAIL_DEFAULT;
return -EINVAL;
+ }
kvm_rsp_write(vcpu, vmcs12->guest_rsp);
kvm_rip_write(vcpu, vmcs12->guest_rip);
@@ -3344,8 +3343,7 @@ enum nvmx_vmentry_status nested_vmx_enter_non_root_mode(struct kvm_vcpu *vcpu,
};
u32 failed_index;
- if (kvm_check_request(KVM_REQ_TLB_FLUSH_CURRENT, vcpu))
- kvm_vcpu_flush_tlb_current(vcpu);
+ kvm_service_local_tlb_flush_requests(vcpu);
evaluate_pending_interrupts = exec_controls_get(vmx) &
(CPU_BASED_INTR_WINDOW_EXITING | CPU_BASED_NMI_WINDOW_EXITING);
@@ -4502,9 +4500,8 @@ void nested_vmx_vmexit(struct kvm_vcpu *vcpu, u32 vm_exit_reason,
(void)nested_get_evmcs_page(vcpu);
}
- /* Service the TLB flush request for L2 before switching to L1. */
- if (kvm_check_request(KVM_REQ_TLB_FLUSH_CURRENT, vcpu))
- kvm_vcpu_flush_tlb_current(vcpu);
+ /* Service pending TLB flush requests for L2 before switching to L1. */
+ kvm_service_local_tlb_flush_requests(vcpu);
/*
* VCPU_EXREG_PDPTR will be clobbered in arch/x86/kvm/vmx/vmx.h between
@@ -4857,6 +4854,7 @@ static int enter_vmx_operation(struct kvm_vcpu *vcpu)
if (!vmx->nested.cached_vmcs12)
goto out_cached_vmcs12;
+ vmx->nested.shadow_vmcs12_cache.gpa = INVALID_GPA;
vmx->nested.cached_shadow_vmcs12 = kzalloc(VMCS12_SIZE, GFP_KERNEL_ACCOUNT);
if (!vmx->nested.cached_shadow_vmcs12)
goto out_cached_shadow_vmcs12;
@@ -5289,8 +5287,7 @@ static int handle_vmptrld(struct kvm_vcpu *vcpu)
struct gfn_to_hva_cache *ghc = &vmx->nested.vmcs12_cache;
struct vmcs_hdr hdr;
- if (ghc->gpa != vmptr &&
- kvm_gfn_to_hva_cache_init(vcpu->kvm, ghc, vmptr, VMCS12_SIZE)) {
+ if (kvm_gfn_to_hva_cache_init(vcpu->kvm, ghc, vmptr, VMCS12_SIZE)) {
/*
* Reads from an unbacked page return all 1s,
* which means that the 32 bits located at the
diff --git a/arch/x86/kvm/vmx/posted_intr.c b/arch/x86/kvm/vmx/posted_intr.c
index 5f81ef092bd4..1c94783b5a54 100644
--- a/arch/x86/kvm/vmx/posted_intr.c
+++ b/arch/x86/kvm/vmx/posted_intr.c
@@ -5,6 +5,7 @@
#include <asm/cpu.h>
#include "lapic.h"
+#include "irq.h"
#include "posted_intr.h"
#include "trace.h"
#include "vmx.h"
@@ -77,13 +78,18 @@ after_clear_sn:
pi_set_on(pi_desc);
}
+static bool vmx_can_use_vtd_pi(struct kvm *kvm)
+{
+ return irqchip_in_kernel(kvm) && enable_apicv &&
+ kvm_arch_has_assigned_device(kvm) &&
+ irq_remapping_cap(IRQ_POSTING_CAP);
+}
+
void vmx_vcpu_pi_put(struct kvm_vcpu *vcpu)
{
struct pi_desc *pi_desc = vcpu_to_pi_desc(vcpu);
- if (!kvm_arch_has_assigned_device(vcpu->kvm) ||
- !irq_remapping_cap(IRQ_POSTING_CAP) ||
- !kvm_vcpu_apicv_active(vcpu))
+ if (!vmx_can_use_vtd_pi(vcpu->kvm))
return;
/* Set SN when the vCPU is preempted */
@@ -141,9 +147,7 @@ int pi_pre_block(struct kvm_vcpu *vcpu)
struct pi_desc old, new;
struct pi_desc *pi_desc = vcpu_to_pi_desc(vcpu);
- if (!kvm_arch_has_assigned_device(vcpu->kvm) ||
- !irq_remapping_cap(IRQ_POSTING_CAP) ||
- !kvm_vcpu_apicv_active(vcpu))
+ if (!vmx_can_use_vtd_pi(vcpu->kvm))
return 0;
WARN_ON(irqs_disabled());
@@ -270,9 +274,7 @@ int pi_update_irte(struct kvm *kvm, unsigned int host_irq, uint32_t guest_irq,
struct vcpu_data vcpu_info;
int idx, ret = 0;
- if (!kvm_arch_has_assigned_device(kvm) ||
- !irq_remapping_cap(IRQ_POSTING_CAP) ||
- !kvm_vcpu_apicv_active(kvm->vcpus[0]))
+ if (!vmx_can_use_vtd_pi(kvm))
return 0;
idx = srcu_read_lock(&kvm->irq_srcu);
diff --git a/arch/x86/kvm/vmx/vmx.c b/arch/x86/kvm/vmx/vmx.c
index ba66c171d951..0dbf94eb954f 100644
--- a/arch/x86/kvm/vmx/vmx.c
+++ b/arch/x86/kvm/vmx/vmx.c
@@ -1363,6 +1363,11 @@ void vmx_set_rflags(struct kvm_vcpu *vcpu, unsigned long rflags)
vmx->emulation_required = vmx_emulation_required(vcpu);
}
+static bool vmx_get_if_flag(struct kvm_vcpu *vcpu)
+{
+ return vmx_get_rflags(vcpu) & X86_EFLAGS_IF;
+}
+
u32 vmx_get_interrupt_shadow(struct kvm_vcpu *vcpu)
{
u32 interruptibility = vmcs_read32(GUEST_INTERRUPTIBILITY_INFO);
@@ -2646,15 +2651,6 @@ int alloc_loaded_vmcs(struct loaded_vmcs *loaded_vmcs)
if (!loaded_vmcs->msr_bitmap)
goto out_vmcs;
memset(loaded_vmcs->msr_bitmap, 0xff, PAGE_SIZE);
-
- if (IS_ENABLED(CONFIG_HYPERV) &&
- static_branch_unlikely(&enable_evmcs) &&
- (ms_hyperv.nested_features & HV_X64_NESTED_MSR_BITMAP)) {
- struct hv_enlightened_vmcs *evmcs =
- (struct hv_enlightened_vmcs *)loaded_vmcs->vmcs;
-
- evmcs->hv_enlightenments_control.msr_bitmap = 1;
- }
}
memset(&loaded_vmcs->host_state, 0, sizeof(struct vmcs_host_state));
@@ -2918,6 +2914,13 @@ static void vmx_flush_tlb_all(struct kvm_vcpu *vcpu)
}
}
+static inline int vmx_get_current_vpid(struct kvm_vcpu *vcpu)
+{
+ if (is_guest_mode(vcpu))
+ return nested_get_vpid02(vcpu);
+ return to_vmx(vcpu)->vpid;
+}
+
static void vmx_flush_tlb_current(struct kvm_vcpu *vcpu)
{
struct kvm_mmu *mmu = vcpu->arch.mmu;
@@ -2930,31 +2933,29 @@ static void vmx_flush_tlb_current(struct kvm_vcpu *vcpu)
if (enable_ept)
ept_sync_context(construct_eptp(vcpu, root_hpa,
mmu->shadow_root_level));
- else if (!is_guest_mode(vcpu))
- vpid_sync_context(to_vmx(vcpu)->vpid);
else
- vpid_sync_context(nested_get_vpid02(vcpu));
+ vpid_sync_context(vmx_get_current_vpid(vcpu));
}
static void vmx_flush_tlb_gva(struct kvm_vcpu *vcpu, gva_t addr)
{
/*
- * vpid_sync_vcpu_addr() is a nop if vmx->vpid==0, see the comment in
+ * vpid_sync_vcpu_addr() is a nop if vpid==0, see the comment in
* vmx_flush_tlb_guest() for an explanation of why this is ok.
*/
- vpid_sync_vcpu_addr(to_vmx(vcpu)->vpid, addr);
+ vpid_sync_vcpu_addr(vmx_get_current_vpid(vcpu), addr);
}
static void vmx_flush_tlb_guest(struct kvm_vcpu *vcpu)
{
/*
- * vpid_sync_context() is a nop if vmx->vpid==0, e.g. if enable_vpid==0
- * or a vpid couldn't be allocated for this vCPU. VM-Enter and VM-Exit
- * are required to flush GVA->{G,H}PA mappings from the TLB if vpid is
+ * vpid_sync_context() is a nop if vpid==0, e.g. if enable_vpid==0 or a
+ * vpid couldn't be allocated for this vCPU. VM-Enter and VM-Exit are
+ * required to flush GVA->{G,H}PA mappings from the TLB if vpid is
* disabled (VM-Enter with vpid enabled and vpid==0 is disallowed),
* i.e. no explicit INVVPID is necessary.
*/
- vpid_sync_context(to_vmx(vcpu)->vpid);
+ vpid_sync_context(vmx_get_current_vpid(vcpu));
}
void vmx_ept_load_pdptrs(struct kvm_vcpu *vcpu)
@@ -3963,8 +3964,7 @@ static int vmx_deliver_posted_interrupt(struct kvm_vcpu *vcpu, int vector)
if (pi_test_and_set_on(&vmx->pi_desc))
return 0;
- if (vcpu != kvm_get_running_vcpu() &&
- !kvm_vcpu_trigger_posted_interrupt(vcpu, false))
+ if (!kvm_vcpu_trigger_posted_interrupt(vcpu, false))
kvm_vcpu_kick(vcpu);
return 0;
@@ -5881,18 +5881,14 @@ static int __vmx_handle_exit(struct kvm_vcpu *vcpu, fastpath_t exit_fastpath)
vmx_flush_pml_buffer(vcpu);
/*
- * We should never reach this point with a pending nested VM-Enter, and
- * more specifically emulation of L2 due to invalid guest state (see
- * below) should never happen as that means we incorrectly allowed a
- * nested VM-Enter with an invalid vmcs12.
+ * KVM should never reach this point with a pending nested VM-Enter.
+ * More specifically, short-circuiting VM-Entry to emulate L2 due to
+ * invalid guest state should never happen as that means KVM knowingly
+ * allowed a nested VM-Enter with an invalid vmcs12. More below.
*/
if (KVM_BUG_ON(vmx->nested.nested_run_pending, vcpu->kvm))
return -EIO;
- /* If guest state is invalid, start emulating */
- if (vmx->emulation_required)
- return handle_invalid_guest_state(vcpu);
-
if (is_guest_mode(vcpu)) {
/*
* PML is never enabled when running L2, bail immediately if a
@@ -5914,10 +5910,30 @@ static int __vmx_handle_exit(struct kvm_vcpu *vcpu, fastpath_t exit_fastpath)
*/
nested_mark_vmcs12_pages_dirty(vcpu);
+ /*
+ * Synthesize a triple fault if L2 state is invalid. In normal
+ * operation, nested VM-Enter rejects any attempt to enter L2
+ * with invalid state. However, those checks are skipped if
+ * state is being stuffed via RSM or KVM_SET_NESTED_STATE. If
+ * L2 state is invalid, it means either L1 modified SMRAM state
+ * or userspace provided bad state. Synthesize TRIPLE_FAULT as
+ * doing so is architecturally allowed in the RSM case, and is
+ * the least awful solution for the userspace case without
+ * risking false positives.
+ */
+ if (vmx->emulation_required) {
+ nested_vmx_vmexit(vcpu, EXIT_REASON_TRIPLE_FAULT, 0, 0);
+ return 1;
+ }
+
if (nested_vmx_reflect_vmexit(vcpu))
return 1;
}
+ /* If guest state is invalid, start emulating. L2 is handled above. */
+ if (vmx->emulation_required)
+ return handle_invalid_guest_state(vcpu);
+
if (exit_reason.failed_vmentry) {
dump_vmcs(vcpu);
vcpu->run->exit_reason = KVM_EXIT_FAIL_ENTRY;
@@ -6262,9 +6278,9 @@ static int vmx_sync_pir_to_irr(struct kvm_vcpu *vcpu)
{
struct vcpu_vmx *vmx = to_vmx(vcpu);
int max_irr;
- bool max_irr_updated;
+ bool got_posted_interrupt;
- if (KVM_BUG_ON(!vcpu->arch.apicv_active, vcpu->kvm))
+ if (KVM_BUG_ON(!enable_apicv, vcpu->kvm))
return -EIO;
if (pi_test_on(&vmx->pi_desc)) {
@@ -6274,22 +6290,33 @@ static int vmx_sync_pir_to_irr(struct kvm_vcpu *vcpu)
* But on x86 this is just a compiler barrier anyway.
*/
smp_mb__after_atomic();
- max_irr_updated =
+ got_posted_interrupt =
kvm_apic_update_irr(vcpu, vmx->pi_desc.pir, &max_irr);
-
- /*
- * If we are running L2 and L1 has a new pending interrupt
- * which can be injected, this may cause a vmexit or it may
- * be injected into L2. Either way, this interrupt will be
- * processed via KVM_REQ_EVENT, not RVI, because we do not use
- * virtual interrupt delivery to inject L1 interrupts into L2.
- */
- if (is_guest_mode(vcpu) && max_irr_updated)
- kvm_make_request(KVM_REQ_EVENT, vcpu);
} else {
max_irr = kvm_lapic_find_highest_irr(vcpu);
+ got_posted_interrupt = false;
}
- vmx_hwapic_irr_update(vcpu, max_irr);
+
+ /*
+ * Newly recognized interrupts are injected via either virtual interrupt
+ * delivery (RVI) or KVM_REQ_EVENT. Virtual interrupt delivery is
+ * disabled in two cases:
+ *
+ * 1) If L2 is running and the vCPU has a new pending interrupt. If L1
+ * wants to exit on interrupts, KVM_REQ_EVENT is needed to synthesize a
+ * VM-Exit to L1. If L1 doesn't want to exit, the interrupt is injected
+ * into L2, but KVM doesn't use virtual interrupt delivery to inject
+ * interrupts into L2, and so KVM_REQ_EVENT is again needed.
+ *
+ * 2) If APICv is disabled for this vCPU, assigned devices may still
+ * attempt to post interrupts. The posted interrupt vector will cause
+ * a VM-Exit and the subsequent entry will call sync_pir_to_irr.
+ */
+ if (!is_guest_mode(vcpu) && kvm_vcpu_apicv_active(vcpu))
+ vmx_set_rvi(max_irr);
+ else if (got_posted_interrupt)
+ kvm_make_request(KVM_REQ_EVENT, vcpu);
+
return max_irr;
}
@@ -6601,9 +6628,7 @@ static fastpath_t vmx_vcpu_run(struct kvm_vcpu *vcpu)
* consistency check VM-Exit due to invalid guest state and bail.
*/
if (unlikely(vmx->emulation_required)) {
-
- /* We don't emulate invalid state of a nested guest */
- vmx->fail = is_guest_mode(vcpu);
+ vmx->fail = 0;
vmx->exit_reason.full = EXIT_REASON_INVALID_STATE;
vmx->exit_reason.failed_vmentry = 1;
@@ -6826,6 +6851,19 @@ static int vmx_create_vcpu(struct kvm_vcpu *vcpu)
if (err < 0)
goto free_pml;
+ /*
+ * Use Hyper-V 'Enlightened MSR Bitmap' feature when KVM runs as a
+ * nested (L1) hypervisor and Hyper-V in L0 supports it. Enable the
+ * feature only for vmcs01; KVM currently isn't equipped to realize any
+ * performance benefits from enabling it for vmcs02.
+ */
+ if (IS_ENABLED(CONFIG_HYPERV) && static_branch_unlikely(&enable_evmcs) &&
+ (ms_hyperv.nested_features & HV_X64_NESTED_MSR_BITMAP)) {
+ struct hv_enlightened_vmcs *evmcs = (void *)vmx->vmcs01.vmcs;
+
+ evmcs->hv_enlightenments_control.msr_bitmap = 1;
+ }
+
/* The MSR bitmap starts with all ones */
bitmap_fill(vmx->shadow_msr_intercept.read, MAX_POSSIBLE_PASSTHROUGH_MSRS);
bitmap_fill(vmx->shadow_msr_intercept.write, MAX_POSSIBLE_PASSTHROUGH_MSRS);
@@ -7509,6 +7547,7 @@ static void hardware_unsetup(void)
static bool vmx_check_apicv_inhibit_reasons(ulong bit)
{
ulong supported = BIT(APICV_INHIBIT_REASON_DISABLE) |
+ BIT(APICV_INHIBIT_REASON_ABSENT) |
BIT(APICV_INHIBIT_REASON_HYPERV) |
BIT(APICV_INHIBIT_REASON_BLOCKIRQ);
@@ -7558,6 +7597,7 @@ static struct kvm_x86_ops vmx_x86_ops __initdata = {
.cache_reg = vmx_cache_reg,
.get_rflags = vmx_get_rflags,
.set_rflags = vmx_set_rflags,
+ .get_if_flag = vmx_get_if_flag,
.tlb_flush_all = vmx_flush_tlb_all,
.tlb_flush_current = vmx_flush_tlb_current,
@@ -7761,10 +7801,10 @@ static __init int hardware_setup(void)
ple_window_shrink = 0;
}
- if (!cpu_has_vmx_apicv()) {
+ if (!cpu_has_vmx_apicv())
enable_apicv = 0;
+ if (!enable_apicv)
vmx_x86_ops.sync_pir_to_irr = NULL;
- }
if (cpu_has_vmx_tsc_scaling()) {
kvm_has_tsc_control = true;
diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c
index 5a403d92833f..e50e97ac4408 100644
--- a/arch/x86/kvm/x86.c
+++ b/arch/x86/kvm/x86.c
@@ -890,7 +890,8 @@ int kvm_set_cr0(struct kvm_vcpu *vcpu, unsigned long cr0)
!load_pdptrs(vcpu, vcpu->arch.walk_mmu, kvm_read_cr3(vcpu)))
return 1;
- if (!(cr0 & X86_CR0_PG) && kvm_read_cr4_bits(vcpu, X86_CR4_PCIDE))
+ if (!(cr0 & X86_CR0_PG) &&
+ (is_64_bit_mode(vcpu) || kvm_read_cr4_bits(vcpu, X86_CR4_PCIDE)))
return 1;
static_call(kvm_x86_set_cr0)(vcpu, cr0);
@@ -1330,7 +1331,7 @@ static const u32 msrs_to_save_all[] = {
MSR_IA32_UMWAIT_CONTROL,
MSR_ARCH_PERFMON_FIXED_CTR0, MSR_ARCH_PERFMON_FIXED_CTR1,
- MSR_ARCH_PERFMON_FIXED_CTR0 + 2, MSR_ARCH_PERFMON_FIXED_CTR0 + 3,
+ MSR_ARCH_PERFMON_FIXED_CTR0 + 2,
MSR_CORE_PERF_FIXED_CTR_CTRL, MSR_CORE_PERF_GLOBAL_STATUS,
MSR_CORE_PERF_GLOBAL_CTRL, MSR_CORE_PERF_GLOBAL_OVF_CTRL,
MSR_ARCH_PERFMON_PERFCTR0, MSR_ARCH_PERFMON_PERFCTR1,
@@ -3258,6 +3259,29 @@ static void kvm_vcpu_flush_tlb_guest(struct kvm_vcpu *vcpu)
static_call(kvm_x86_tlb_flush_guest)(vcpu);
}
+
+static inline void kvm_vcpu_flush_tlb_current(struct kvm_vcpu *vcpu)
+{
+ ++vcpu->stat.tlb_flush;
+ static_call(kvm_x86_tlb_flush_current)(vcpu);
+}
+
+/*
+ * Service "local" TLB flush requests, which are specific to the current MMU
+ * context. In addition to the generic event handling in vcpu_enter_guest(),
+ * TLB flushes that are targeted at an MMU context also need to be serviced
+ * prior to nested VM-Enter/VM-Exit.
+ */
+void kvm_service_local_tlb_flush_requests(struct kvm_vcpu *vcpu)
+{
+ if (kvm_check_request(KVM_REQ_TLB_FLUSH_CURRENT, vcpu))
+ kvm_vcpu_flush_tlb_current(vcpu);
+
+ if (kvm_check_request(KVM_REQ_TLB_FLUSH_GUEST, vcpu))
+ kvm_vcpu_flush_tlb_guest(vcpu);
+}
+EXPORT_SYMBOL_GPL(kvm_service_local_tlb_flush_requests);
+
static void record_steal_time(struct kvm_vcpu *vcpu)
{
struct gfn_to_hva_cache *ghc = &vcpu->arch.st.cache;
@@ -3389,7 +3413,7 @@ int kvm_set_msr_common(struct kvm_vcpu *vcpu, struct msr_data *msr_info)
if (!msr_info->host_initiated)
return 1;
- if (guest_cpuid_has(vcpu, X86_FEATURE_PDCM) && kvm_get_msr_feature(&msr_ent))
+ if (kvm_get_msr_feature(&msr_ent))
return 1;
if (data & ~msr_ent.data)
return 1;
@@ -4133,6 +4157,7 @@ int kvm_vm_ioctl_check_extension(struct kvm *kvm, long ext)
case KVM_CAP_SGX_ATTRIBUTE:
#endif
case KVM_CAP_VM_COPY_ENC_CONTEXT_FROM:
+ case KVM_CAP_VM_MOVE_ENC_CONTEXT_FROM:
case KVM_CAP_SREGS2:
case KVM_CAP_EXIT_ON_EMULATION_FAILURE:
case KVM_CAP_VCPU_ATTRIBUTES:
@@ -4448,8 +4473,7 @@ void kvm_arch_vcpu_put(struct kvm_vcpu *vcpu)
static int kvm_vcpu_ioctl_get_lapic(struct kvm_vcpu *vcpu,
struct kvm_lapic_state *s)
{
- if (vcpu->arch.apicv_active)
- static_call(kvm_x86_sync_pir_to_irr)(vcpu);
+ static_call_cond(kvm_x86_sync_pir_to_irr)(vcpu);
return kvm_apic_get_state(vcpu, s);
}
@@ -5124,6 +5148,17 @@ long kvm_arch_vcpu_ioctl(struct file *filp,
struct kvm_cpuid __user *cpuid_arg = argp;
struct kvm_cpuid cpuid;
+ /*
+ * KVM does not correctly handle changing guest CPUID after KVM_RUN, as
+ * MAXPHYADDR, GBPAGES support, AMD reserved bit behavior, etc. aren't
+ * tracked in kvm_mmu_page_role. As a result, KVM may miss guest page
+ * faults due to reusing SPs/SPTEs. In practice, no sane VMM mucks with
+ * the core vCPU model on the fly, so fail the ioctl.
+ */
+ r = -EINVAL;
+ if (vcpu->arch.last_vmentry_cpu != -1)
+ goto out;
+
r = -EFAULT;
if (copy_from_user(&cpuid, cpuid_arg, sizeof(cpuid)))
goto out;
@@ -5134,6 +5169,14 @@ long kvm_arch_vcpu_ioctl(struct file *filp,
struct kvm_cpuid2 __user *cpuid_arg = argp;
struct kvm_cpuid2 cpuid;
+ /*
+ * KVM_SET_CPUID{,2} after KVM_RUN is forbidden; see the comment in
+ * KVM_SET_CPUID case above.
+ */
+ r = -EINVAL;
+ if (vcpu->arch.last_vmentry_cpu != -1)
+ goto out;
+
r = -EFAULT;
if (copy_from_user(&cpuid, cpuid_arg, sizeof(cpuid)))
goto out;
@@ -5698,6 +5741,7 @@ int kvm_vm_ioctl_enable_cap(struct kvm *kvm,
smp_wmb();
kvm->arch.irqchip_mode = KVM_IRQCHIP_SPLIT;
kvm->arch.nr_reserved_ioapic_pins = cap->args[0];
+ kvm_request_apicv_update(kvm, true, APICV_INHIBIT_REASON_ABSENT);
r = 0;
split_irqchip_unlock:
mutex_unlock(&kvm->lock);
@@ -6078,6 +6122,7 @@ set_identity_unlock:
/* Write kvm->irq_routing before enabling irqchip_in_kernel. */
smp_wmb();
kvm->arch.irqchip_mode = KVM_IRQCHIP_KERNEL;
+ kvm_request_apicv_update(kvm, true, APICV_INHIBIT_REASON_ABSENT);
create_irqchip_unlock:
mutex_unlock(&kvm->lock);
break;
@@ -7077,7 +7122,13 @@ static int emulator_pio_in(struct kvm_vcpu *vcpu, int size,
unsigned short port, void *val, unsigned int count)
{
if (vcpu->arch.pio.count) {
- /* Complete previous iteration. */
+ /*
+ * Complete a previous iteration that required userspace I/O.
+ * Note, @count isn't guaranteed to match pio.count as userspace
+ * can modify ECX before rerunning the vCPU. Ignore any such
+ * shenanigans as KVM doesn't support modifying the rep count,
+ * and the emulator ensures @count doesn't overflow the buffer.
+ */
} else {
int r = __emulator_pio_in(vcpu, size, port, count);
if (!r)
@@ -7086,7 +7137,6 @@ static int emulator_pio_in(struct kvm_vcpu *vcpu, int size,
/* Results already available, fall through. */
}
- WARN_ON(count != vcpu->arch.pio.count);
complete_emulator_pio_in(vcpu, val);
return 1;
}
@@ -8776,10 +8826,9 @@ static void kvm_apicv_init(struct kvm *kvm)
{
init_rwsem(&kvm->arch.apicv_update_lock);
- if (enable_apicv)
- clear_bit(APICV_INHIBIT_REASON_DISABLE,
- &kvm->arch.apicv_inhibit_reasons);
- else
+ set_bit(APICV_INHIBIT_REASON_ABSENT,
+ &kvm->arch.apicv_inhibit_reasons);
+ if (!enable_apicv)
set_bit(APICV_INHIBIT_REASON_DISABLE,
&kvm->arch.apicv_inhibit_reasons);
}
@@ -8952,14 +9001,7 @@ static void post_kvm_run_save(struct kvm_vcpu *vcpu)
{
struct kvm_run *kvm_run = vcpu->run;
- /*
- * if_flag is obsolete and useless, so do not bother
- * setting it for SEV-ES guests. Userspace can just
- * use kvm_run->ready_for_interrupt_injection.
- */
- kvm_run->if_flag = !vcpu->arch.guest_state_protected
- && (kvm_get_rflags(vcpu) & X86_EFLAGS_IF) != 0;
-
+ kvm_run->if_flag = static_call(kvm_x86_get_if_flag)(vcpu);
kvm_run->cr8 = kvm_get_cr8(vcpu);
kvm_run->apic_base = kvm_get_apic_base(vcpu);
@@ -9528,8 +9570,7 @@ static void vcpu_scan_ioapic(struct kvm_vcpu *vcpu)
if (irqchip_split(vcpu->kvm))
kvm_scan_ioapic_routes(vcpu, vcpu->arch.ioapic_handled_vectors);
else {
- if (vcpu->arch.apicv_active)
- static_call(kvm_x86_sync_pir_to_irr)(vcpu);
+ static_call_cond(kvm_x86_sync_pir_to_irr)(vcpu);
if (ioapic_in_kernel(vcpu->kvm))
kvm_ioapic_scan_entry(vcpu, vcpu->arch.ioapic_handled_vectors);
}
@@ -9648,10 +9689,7 @@ static int vcpu_enter_guest(struct kvm_vcpu *vcpu)
/* Flushing all ASIDs flushes the current ASID... */
kvm_clear_request(KVM_REQ_TLB_FLUSH_CURRENT, vcpu);
}
- if (kvm_check_request(KVM_REQ_TLB_FLUSH_CURRENT, vcpu))
- kvm_vcpu_flush_tlb_current(vcpu);
- if (kvm_check_request(KVM_REQ_TLB_FLUSH_GUEST, vcpu))
- kvm_vcpu_flush_tlb_guest(vcpu);
+ kvm_service_local_tlb_flush_requests(vcpu);
if (kvm_check_request(KVM_REQ_REPORT_TPR_ACCESS, vcpu)) {
vcpu->run->exit_reason = KVM_EXIT_TPR_ACCESS;
@@ -9802,10 +9840,12 @@ static int vcpu_enter_guest(struct kvm_vcpu *vcpu)
/*
* This handles the case where a posted interrupt was
- * notified with kvm_vcpu_kick.
+ * notified with kvm_vcpu_kick. Assigned devices can
+ * use the POSTED_INTR_VECTOR even if APICv is disabled,
+ * so do it even if APICv is disabled on this vCPU.
*/
- if (kvm_lapic_enabled(vcpu) && vcpu->arch.apicv_active)
- static_call(kvm_x86_sync_pir_to_irr)(vcpu);
+ if (kvm_lapic_enabled(vcpu))
+ static_call_cond(kvm_x86_sync_pir_to_irr)(vcpu);
if (kvm_vcpu_exit_request(vcpu)) {
vcpu->mode = OUTSIDE_GUEST_MODE;
@@ -9849,8 +9889,8 @@ static int vcpu_enter_guest(struct kvm_vcpu *vcpu)
if (likely(exit_fastpath != EXIT_FASTPATH_REENTER_GUEST))
break;
- if (vcpu->arch.apicv_active)
- static_call(kvm_x86_sync_pir_to_irr)(vcpu);
+ if (kvm_lapic_enabled(vcpu))
+ static_call_cond(kvm_x86_sync_pir_to_irr)(vcpu);
if (unlikely(kvm_vcpu_exit_request(vcpu))) {
exit_fastpath = EXIT_FASTPATH_EXIT_HANDLED;
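
kvm_service_local_tlb_flush_requests() above folds the two "local" flush checks into one helper that both vcpu_enter_guest() and the nested VMX transition paths can call. A standalone sketch of the underlying test-and-clear-a-request-bit pattern (the bit numbers and printf stand-ins are illustrative, not the kvm_check_request() implementation):

#include <stdatomic.h>
#include <stdbool.h>
#include <stdio.h>

#define REQ_TLB_FLUSH_CURRENT	0
#define REQ_TLB_FLUSH_GUEST	1

static atomic_ulong requests;

static bool check_request(int req)
{
	unsigned long mask = 1UL << req;

	/* Atomically clear the bit and report whether it was set. */
	return atomic_fetch_and(&requests, ~mask) & mask;
}

static void service_local_tlb_flush_requests(void)
{
	if (check_request(REQ_TLB_FLUSH_CURRENT))
		printf("flush current MMU context\n");
	if (check_request(REQ_TLB_FLUSH_GUEST))
		printf("flush guest mappings\n");
}

int main(void)
{
	atomic_fetch_or(&requests, 1UL << REQ_TLB_FLUSH_GUEST);
	service_local_tlb_flush_requests();	/* only the guest flush runs */
	return 0;
}
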
diff --git a/arch/x86/kvm/x86.h b/arch/x86/kvm/x86.h
index 997669ae9caa..4abcd8d9836d 100644
--- a/arch/x86/kvm/x86.h
+++ b/arch/x86/kvm/x86.h
@@ -103,6 +103,7 @@ static inline unsigned int __shrink_ple_window(unsigned int val,
#define MSR_IA32_CR_PAT_DEFAULT 0x0007040600070406ULL
+void kvm_service_local_tlb_flush_requests(struct kvm_vcpu *vcpu);
int kvm_check_nested_events(struct kvm_vcpu *vcpu);
static inline void kvm_clear_exception_queue(struct kvm_vcpu *vcpu)
@@ -185,12 +186,6 @@ static inline bool mmu_is_nested(struct kvm_vcpu *vcpu)
return vcpu->arch.walk_mmu == &vcpu->arch.nested_mmu;
}
-static inline void kvm_vcpu_flush_tlb_current(struct kvm_vcpu *vcpu)
-{
- ++vcpu->stat.tlb_flush;
- static_call(kvm_x86_tlb_flush_current)(vcpu);
-}
-
static inline int is_pae(struct kvm_vcpu *vcpu)
{
return kvm_read_cr4_bits(vcpu, X86_CR4_PAE);
diff --git a/arch/x86/mm/Makefile b/arch/x86/mm/Makefile
index 5864219221ca..fe3d3061fc11 100644
--- a/arch/x86/mm/Makefile
+++ b/arch/x86/mm/Makefile
@@ -2,9 +2,11 @@
# Kernel does not boot with instrumentation of tlb.c and mem_encrypt*.c
KCOV_INSTRUMENT_tlb.o := n
KCOV_INSTRUMENT_mem_encrypt.o := n
+KCOV_INSTRUMENT_mem_encrypt_amd.o := n
KCOV_INSTRUMENT_mem_encrypt_identity.o := n
KASAN_SANITIZE_mem_encrypt.o := n
+KASAN_SANITIZE_mem_encrypt_amd.o := n
KASAN_SANITIZE_mem_encrypt_identity.o := n
# Disable KCSAN entirely, because otherwise we get warnings that some functions
@@ -13,6 +15,7 @@ KCSAN_SANITIZE := n
ifdef CONFIG_FUNCTION_TRACER
CFLAGS_REMOVE_mem_encrypt.o = -pg
+CFLAGS_REMOVE_mem_encrypt_amd.o = -pg
CFLAGS_REMOVE_mem_encrypt_identity.o = -pg
endif
@@ -52,6 +55,8 @@ obj-$(CONFIG_X86_INTEL_MEMORY_PROTECTION_KEYS) += pkeys.o
obj-$(CONFIG_RANDOMIZE_MEMORY) += kaslr.o
obj-$(CONFIG_PAGE_TABLE_ISOLATION) += pti.o
-obj-$(CONFIG_AMD_MEM_ENCRYPT) += mem_encrypt.o
+obj-$(CONFIG_X86_MEM_ENCRYPT) += mem_encrypt.o
+obj-$(CONFIG_AMD_MEM_ENCRYPT) += mem_encrypt_amd.o
+
obj-$(CONFIG_AMD_MEM_ENCRYPT) += mem_encrypt_identity.o
obj-$(CONFIG_AMD_MEM_ENCRYPT) += mem_encrypt_boot.o
diff --git a/arch/x86/mm/mem_encrypt.c b/arch/x86/mm/mem_encrypt.c
index 35487305d8af..50d209939c66 100644
--- a/arch/x86/mm/mem_encrypt.c
+++ b/arch/x86/mm/mem_encrypt.c
@@ -1,419 +1,18 @@
// SPDX-License-Identifier: GPL-2.0-only
/*
- * AMD Memory Encryption Support
+ * Memory Encryption Support Common Code
*
* Copyright (C) 2016 Advanced Micro Devices, Inc.
*
* Author: Tom Lendacky <thomas.lendacky@amd.com>
*/
-#define DISABLE_BRANCH_PROFILING
-
-#include <linux/linkage.h>
-#include <linux/init.h>
-#include <linux/mm.h>
#include <linux/dma-direct.h>
+#include <linux/dma-mapping.h>
#include <linux/swiotlb.h>
+#include <linux/cc_platform.h>
#include <linux/mem_encrypt.h>
-#include <linux/device.h>
-#include <linux/kernel.h>
-#include <linux/bitops.h>
-#include <linux/dma-mapping.h>
#include <linux/virtio_config.h>
-#include <linux/cc_platform.h>
-
-#include <asm/tlbflush.h>
-#include <asm/fixmap.h>
-#include <asm/setup.h>
-#include <asm/bootparam.h>
-#include <asm/set_memory.h>
-#include <asm/cacheflush.h>
-#include <asm/processor-flags.h>
-#include <asm/msr.h>
-#include <asm/cmdline.h>
-
-#include "mm_internal.h"
-
-/*
- * Since SME related variables are set early in the boot process they must
- * reside in the .data section so as not to be zeroed out when the .bss
- * section is later cleared.
- */
-u64 sme_me_mask __section(".data") = 0;
-u64 sev_status __section(".data") = 0;
-u64 sev_check_data __section(".data") = 0;
-EXPORT_SYMBOL(sme_me_mask);
-DEFINE_STATIC_KEY_FALSE(sev_enable_key);
-EXPORT_SYMBOL_GPL(sev_enable_key);
-
-/* Buffer used for early in-place encryption by BSP, no locking needed */
-static char sme_early_buffer[PAGE_SIZE] __initdata __aligned(PAGE_SIZE);
-
-/*
- * This routine does not change the underlying encryption setting of the
- * page(s) that map this memory. It assumes that eventually the memory is
- * meant to be accessed as either encrypted or decrypted but the contents
- * are currently not in the desired state.
- *
- * This routine follows the steps outlined in the AMD64 Architecture
- * Programmer's Manual Volume 2, Section 7.10.8 Encrypt-in-Place.
- */
-static void __init __sme_early_enc_dec(resource_size_t paddr,
- unsigned long size, bool enc)
-{
- void *src, *dst;
- size_t len;
-
- if (!sme_me_mask)
- return;
-
- wbinvd();
-
- /*
- * There are limited number of early mapping slots, so map (at most)
- * one page at time.
- */
- while (size) {
- len = min_t(size_t, sizeof(sme_early_buffer), size);
-
- /*
- * Create mappings for the current and desired format of
- * the memory. Use a write-protected mapping for the source.
- */
- src = enc ? early_memremap_decrypted_wp(paddr, len) :
- early_memremap_encrypted_wp(paddr, len);
-
- dst = enc ? early_memremap_encrypted(paddr, len) :
- early_memremap_decrypted(paddr, len);
-
- /*
- * If a mapping can't be obtained to perform the operation,
- * then eventual access of that area in the desired mode
- * will cause a crash.
- */
- BUG_ON(!src || !dst);
-
- /*
- * Use a temporary buffer, of cache-line multiple size, to
- * avoid data corruption as documented in the APM.
- */
- memcpy(sme_early_buffer, src, len);
- memcpy(dst, sme_early_buffer, len);
-
- early_memunmap(dst, len);
- early_memunmap(src, len);
-
- paddr += len;
- size -= len;
- }
-}
-
-void __init sme_early_encrypt(resource_size_t paddr, unsigned long size)
-{
- __sme_early_enc_dec(paddr, size, true);
-}
-
-void __init sme_early_decrypt(resource_size_t paddr, unsigned long size)
-{
- __sme_early_enc_dec(paddr, size, false);
-}
-
-static void __init __sme_early_map_unmap_mem(void *vaddr, unsigned long size,
- bool map)
-{
- unsigned long paddr = (unsigned long)vaddr - __PAGE_OFFSET;
- pmdval_t pmd_flags, pmd;
-
- /* Use early_pmd_flags but remove the encryption mask */
- pmd_flags = __sme_clr(early_pmd_flags);
-
- do {
- pmd = map ? (paddr & PMD_MASK) + pmd_flags : 0;
- __early_make_pgtable((unsigned long)vaddr, pmd);
-
- vaddr += PMD_SIZE;
- paddr += PMD_SIZE;
- size = (size <= PMD_SIZE) ? 0 : size - PMD_SIZE;
- } while (size);
-
- flush_tlb_local();
-}
-
-void __init sme_unmap_bootdata(char *real_mode_data)
-{
- struct boot_params *boot_data;
- unsigned long cmdline_paddr;
-
- if (!cc_platform_has(CC_ATTR_HOST_MEM_ENCRYPT))
- return;
-
- /* Get the command line address before unmapping the real_mode_data */
- boot_data = (struct boot_params *)real_mode_data;
- cmdline_paddr = boot_data->hdr.cmd_line_ptr | ((u64)boot_data->ext_cmd_line_ptr << 32);
-
- __sme_early_map_unmap_mem(real_mode_data, sizeof(boot_params), false);
-
- if (!cmdline_paddr)
- return;
-
- __sme_early_map_unmap_mem(__va(cmdline_paddr), COMMAND_LINE_SIZE, false);
-}
-
-void __init sme_map_bootdata(char *real_mode_data)
-{
- struct boot_params *boot_data;
- unsigned long cmdline_paddr;
-
- if (!cc_platform_has(CC_ATTR_HOST_MEM_ENCRYPT))
- return;
-
- __sme_early_map_unmap_mem(real_mode_data, sizeof(boot_params), true);
-
- /* Get the command line address after mapping the real_mode_data */
- boot_data = (struct boot_params *)real_mode_data;
- cmdline_paddr = boot_data->hdr.cmd_line_ptr | ((u64)boot_data->ext_cmd_line_ptr << 32);
-
- if (!cmdline_paddr)
- return;
-
- __sme_early_map_unmap_mem(__va(cmdline_paddr), COMMAND_LINE_SIZE, true);
-}
-
-void __init sme_early_init(void)
-{
- unsigned int i;
-
- if (!sme_me_mask)
- return;
-
- early_pmd_flags = __sme_set(early_pmd_flags);
-
- __supported_pte_mask = __sme_set(__supported_pte_mask);
-
- /* Update the protection map with memory encryption mask */
- for (i = 0; i < ARRAY_SIZE(protection_map); i++)
- protection_map[i] = pgprot_encrypted(protection_map[i]);
-
- if (cc_platform_has(CC_ATTR_GUEST_MEM_ENCRYPT))
- swiotlb_force = SWIOTLB_FORCE;
-}
-
-void __init sev_setup_arch(void)
-{
- phys_addr_t total_mem = memblock_phys_mem_size();
- unsigned long size;
-
- if (!cc_platform_has(CC_ATTR_GUEST_MEM_ENCRYPT))
- return;
-
- /*
- * For SEV, all DMA has to occur via shared/unencrypted pages.
- * SEV uses SWIOTLB to make this happen without changing device
- * drivers. However, depending on the workload being run, the
- * default 64MB of SWIOTLB may not be enough and SWIOTLB may
- * run out of buffers for DMA, resulting in I/O errors and/or
- * performance degradation especially with high I/O workloads.
- *
- * Adjust the default size of SWIOTLB for SEV guests using
- * a percentage of guest memory for SWIOTLB buffers.
- * Also, as the SWIOTLB bounce buffer memory is allocated
- * from low memory, ensure that the adjusted size is within
- * the limits of low available memory.
- *
- * The percentage of guest memory used here for SWIOTLB buffers
- * is more of an approximation of the static adjustment which
- * 64MB for <1G, and ~128M to 256M for 1G-to-4G, i.e., the 6%
- */
- size = total_mem * 6 / 100;
- size = clamp_val(size, IO_TLB_DEFAULT_SIZE, SZ_1G);
- swiotlb_adjust_size(size);
-}
-
-static unsigned long pg_level_to_pfn(int level, pte_t *kpte, pgprot_t *ret_prot)
-{
- unsigned long pfn = 0;
- pgprot_t prot;
-
- switch (level) {
- case PG_LEVEL_4K:
- pfn = pte_pfn(*kpte);
- prot = pte_pgprot(*kpte);
- break;
- case PG_LEVEL_2M:
- pfn = pmd_pfn(*(pmd_t *)kpte);
- prot = pmd_pgprot(*(pmd_t *)kpte);
- break;
- case PG_LEVEL_1G:
- pfn = pud_pfn(*(pud_t *)kpte);
- prot = pud_pgprot(*(pud_t *)kpte);
- break;
- default:
- WARN_ONCE(1, "Invalid level for kpte\n");
- return 0;
- }
-
- if (ret_prot)
- *ret_prot = prot;
-
- return pfn;
-}
-
-void notify_range_enc_status_changed(unsigned long vaddr, int npages, bool enc)
-{
-#ifdef CONFIG_PARAVIRT
- unsigned long sz = npages << PAGE_SHIFT;
- unsigned long vaddr_end = vaddr + sz;
-
- while (vaddr < vaddr_end) {
- int psize, pmask, level;
- unsigned long pfn;
- pte_t *kpte;
-
- kpte = lookup_address(vaddr, &level);
- if (!kpte || pte_none(*kpte)) {
- WARN_ONCE(1, "kpte lookup for vaddr\n");
- return;
- }
-
- pfn = pg_level_to_pfn(level, kpte, NULL);
- if (!pfn)
- continue;
-
- psize = page_level_size(level);
- pmask = page_level_mask(level);
-
- notify_page_enc_status_changed(pfn, psize >> PAGE_SHIFT, enc);
-
- vaddr = (vaddr & pmask) + psize;
- }
-#endif
-}
-
-static void __init __set_clr_pte_enc(pte_t *kpte, int level, bool enc)
-{
- pgprot_t old_prot, new_prot;
- unsigned long pfn, pa, size;
- pte_t new_pte;
-
- pfn = pg_level_to_pfn(level, kpte, &old_prot);
- if (!pfn)
- return;
-
- new_prot = old_prot;
- if (enc)
- pgprot_val(new_prot) |= _PAGE_ENC;
- else
- pgprot_val(new_prot) &= ~_PAGE_ENC;
-
- /* If prot is same then do nothing. */
- if (pgprot_val(old_prot) == pgprot_val(new_prot))
- return;
-
- pa = pfn << PAGE_SHIFT;
- size = page_level_size(level);
-
- /*
- * We are going to perform in-place en-/decryption and change the
- * physical page attribute from C=1 to C=0 or vice versa. Flush the
- * caches to ensure that data gets accessed with the correct C-bit.
- */
- clflush_cache_range(__va(pa), size);
-
- /* Encrypt/decrypt the contents in-place */
- if (enc)
- sme_early_encrypt(pa, size);
- else
- sme_early_decrypt(pa, size);
-
- /* Change the page encryption mask. */
- new_pte = pfn_pte(pfn, new_prot);
- set_pte_atomic(kpte, new_pte);
-}
-
-static int __init early_set_memory_enc_dec(unsigned long vaddr,
- unsigned long size, bool enc)
-{
- unsigned long vaddr_end, vaddr_next, start;
- unsigned long psize, pmask;
- int split_page_size_mask;
- int level, ret;
- pte_t *kpte;
-
- start = vaddr;
- vaddr_next = vaddr;
- vaddr_end = vaddr + size;
-
- for (; vaddr < vaddr_end; vaddr = vaddr_next) {
- kpte = lookup_address(vaddr, &level);
- if (!kpte || pte_none(*kpte)) {
- ret = 1;
- goto out;
- }
-
- if (level == PG_LEVEL_4K) {
- __set_clr_pte_enc(kpte, level, enc);
- vaddr_next = (vaddr & PAGE_MASK) + PAGE_SIZE;
- continue;
- }
-
- psize = page_level_size(level);
- pmask = page_level_mask(level);
-
- /*
- * Check whether we can change the large page in one go.
- * We request a split when the address is not aligned and
- * the number of pages to set/clear encryption bit is smaller
- * than the number of pages in the large page.
- */
- if (vaddr == (vaddr & pmask) &&
- ((vaddr_end - vaddr) >= psize)) {
- __set_clr_pte_enc(kpte, level, enc);
- vaddr_next = (vaddr & pmask) + psize;
- continue;
- }
-
- /*
- * The virtual address is part of a larger page, create the next
- * level page table mapping (4K or 2M). If it is part of a 2M
- * page then we request a split of the large page into 4K
- * chunks. A 1GB large page is split into 2M pages, resp.
- */
- if (level == PG_LEVEL_2M)
- split_page_size_mask = 0;
- else
- split_page_size_mask = 1 << PG_LEVEL_2M;
-
- /*
- * kernel_physical_mapping_change() does not flush the TLBs, so
- * a TLB flush is required after we exit from the for loop.
- */
- kernel_physical_mapping_change(__pa(vaddr & pmask),
- __pa((vaddr_end & pmask) + psize),
- split_page_size_mask);
- }
-
- ret = 0;
-
- notify_range_enc_status_changed(start, PAGE_ALIGN(size) >> PAGE_SHIFT, enc);
-out:
- __flush_tlb_all();
- return ret;
-}
-
-int __init early_set_memory_decrypted(unsigned long vaddr, unsigned long size)
-{
- return early_set_memory_enc_dec(vaddr, size, false);
-}
-
-int __init early_set_memory_encrypted(unsigned long vaddr, unsigned long size)
-{
- return early_set_memory_enc_dec(vaddr, size, true);
-}
-
-void __init early_set_mem_enc_dec_hypercall(unsigned long vaddr, int npages, bool enc)
-{
- notify_range_enc_status_changed(vaddr, npages, enc);
-}
/* Override for DMA direct allocation check - ARCH_HAS_FORCE_DMA_UNENCRYPTED */
bool force_dma_unencrypted(struct device *dev)
@@ -441,30 +40,6 @@ bool force_dma_unencrypted(struct device *dev)
return false;
}
-void __init mem_encrypt_free_decrypted_mem(void)
-{
- unsigned long vaddr, vaddr_end, npages;
- int r;
-
- vaddr = (unsigned long)__start_bss_decrypted_unused;
- vaddr_end = (unsigned long)__end_bss_decrypted;
- npages = (vaddr_end - vaddr) >> PAGE_SHIFT;
-
- /*
- * The unused memory range was mapped decrypted, change the encryption
- * attribute from decrypted to encrypted before freeing it.
- */
- if (cc_platform_has(CC_ATTR_MEM_ENCRYPT)) {
- r = set_memory_encrypted(vaddr, npages);
- if (r) {
- pr_warn("failed to free unused decrypted pages\n");
- return;
- }
- }
-
- free_init_pages("unused decrypted", vaddr, vaddr_end);
-}
-
static void print_mem_encrypt_feature_info(void)
{
pr_info("AMD Memory Encryption Features active:");
@@ -493,20 +68,12 @@ static void print_mem_encrypt_feature_info(void)
/* Architecture __weak replacement functions */
void __init mem_encrypt_init(void)
{
- if (!sme_me_mask)
+ if (!cc_platform_has(CC_ATTR_MEM_ENCRYPT))
return;
/* Call into SWIOTLB to update the SWIOTLB DMA buffers */
swiotlb_update_mem_attributes();
- /*
- * With SEV, we need to unroll the rep string I/O instructions,
- * but SEV-ES supports them through the #VC handler.
- */
- if (cc_platform_has(CC_ATTR_GUEST_MEM_ENCRYPT) &&
- !cc_platform_has(CC_ATTR_GUEST_STATE_ENCRYPT))
- static_branch_enable(&sev_enable_key);
-
print_mem_encrypt_feature_info();
}
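
For context (not part of the patch): the common code left in mem_encrypt.c now keys off cc_platform_has() rather than the AMD-specific sme_me_mask, so the same file can serve other memory-encryption schemes later. A minimal sketch of how a vendor-neutral caller is expected to look under that model; example_swiotlb_needed() is a hypothetical name, not a kernel function:

#include <linux/types.h>
#include <linux/init.h>
#include <linux/cc_platform.h>

/* Hypothetical helper: vendor-neutral query, valid for SME, SEV and, later, TDX. */
static bool __init example_swiotlb_needed(void)
{
	/*
	 * CC_ATTR_GUEST_MEM_ENCRYPT is set for encrypted guests, where DMA
	 * has to go through shared/unencrypted SWIOTLB bounce buffers.
	 */
	return cc_platform_has(CC_ATTR_GUEST_MEM_ENCRYPT);
}
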
diff --git a/arch/x86/mm/mem_encrypt_amd.c b/arch/x86/mm/mem_encrypt_amd.c
new file mode 100644
index 000000000000..2b2d018ea345
--- /dev/null
+++ b/arch/x86/mm/mem_encrypt_amd.c
@@ -0,0 +1,438 @@
+// SPDX-License-Identifier: GPL-2.0-only
+/*
+ * AMD Memory Encryption Support
+ *
+ * Copyright (C) 2016 Advanced Micro Devices, Inc.
+ *
+ * Author: Tom Lendacky <thomas.lendacky@amd.com>
+ */
+
+#define DISABLE_BRANCH_PROFILING
+
+#include <linux/linkage.h>
+#include <linux/init.h>
+#include <linux/mm.h>
+#include <linux/dma-direct.h>
+#include <linux/swiotlb.h>
+#include <linux/mem_encrypt.h>
+#include <linux/device.h>
+#include <linux/kernel.h>
+#include <linux/bitops.h>
+#include <linux/dma-mapping.h>
+#include <linux/virtio_config.h>
+#include <linux/cc_platform.h>
+
+#include <asm/tlbflush.h>
+#include <asm/fixmap.h>
+#include <asm/setup.h>
+#include <asm/bootparam.h>
+#include <asm/set_memory.h>
+#include <asm/cacheflush.h>
+#include <asm/processor-flags.h>
+#include <asm/msr.h>
+#include <asm/cmdline.h>
+
+#include "mm_internal.h"
+
+/*
+ * Since SME-related variables are set early in the boot process, they must
+ * reside in the .data section so as not to be zeroed out when the .bss
+ * section is later cleared.
+ */
+u64 sme_me_mask __section(".data") = 0;
+u64 sev_status __section(".data") = 0;
+u64 sev_check_data __section(".data") = 0;
+EXPORT_SYMBOL(sme_me_mask);
+
+/* Buffer used for early in-place encryption by BSP, no locking needed */
+static char sme_early_buffer[PAGE_SIZE] __initdata __aligned(PAGE_SIZE);
+
+/*
+ * This routine does not change the underlying encryption setting of the
+ * page(s) that map this memory. It assumes that eventually the memory is
+ * meant to be accessed as either encrypted or decrypted but the contents
+ * are currently not in the desired state.
+ *
+ * This routine follows the steps outlined in the AMD64 Architecture
+ * Programmer's Manual Volume 2, Section 7.10.8 Encrypt-in-Place.
+ */
+static void __init __sme_early_enc_dec(resource_size_t paddr,
+ unsigned long size, bool enc)
+{
+ void *src, *dst;
+ size_t len;
+
+ if (!sme_me_mask)
+ return;
+
+ wbinvd();
+
+ /*
+	 * There are a limited number of early mapping slots, so map (at most)
+	 * one page at a time.
+ */
+ while (size) {
+ len = min_t(size_t, sizeof(sme_early_buffer), size);
+
+ /*
+ * Create mappings for the current and desired format of
+ * the memory. Use a write-protected mapping for the source.
+ */
+ src = enc ? early_memremap_decrypted_wp(paddr, len) :
+ early_memremap_encrypted_wp(paddr, len);
+
+ dst = enc ? early_memremap_encrypted(paddr, len) :
+ early_memremap_decrypted(paddr, len);
+
+ /*
+ * If a mapping can't be obtained to perform the operation,
+ * then eventual access of that area in the desired mode
+ * will cause a crash.
+ */
+ BUG_ON(!src || !dst);
+
+ /*
+ * Use a temporary buffer, of cache-line multiple size, to
+ * avoid data corruption as documented in the APM.
+ */
+ memcpy(sme_early_buffer, src, len);
+ memcpy(dst, sme_early_buffer, len);
+
+ early_memunmap(dst, len);
+ early_memunmap(src, len);
+
+ paddr += len;
+ size -= len;
+ }
+}
+
+void __init sme_early_encrypt(resource_size_t paddr, unsigned long size)
+{
+ __sme_early_enc_dec(paddr, size, true);
+}
+
+void __init sme_early_decrypt(resource_size_t paddr, unsigned long size)
+{
+ __sme_early_enc_dec(paddr, size, false);
+}
+
+static void __init __sme_early_map_unmap_mem(void *vaddr, unsigned long size,
+ bool map)
+{
+ unsigned long paddr = (unsigned long)vaddr - __PAGE_OFFSET;
+ pmdval_t pmd_flags, pmd;
+
+ /* Use early_pmd_flags but remove the encryption mask */
+ pmd_flags = __sme_clr(early_pmd_flags);
+
+ do {
+ pmd = map ? (paddr & PMD_MASK) + pmd_flags : 0;
+ __early_make_pgtable((unsigned long)vaddr, pmd);
+
+ vaddr += PMD_SIZE;
+ paddr += PMD_SIZE;
+ size = (size <= PMD_SIZE) ? 0 : size - PMD_SIZE;
+ } while (size);
+
+ flush_tlb_local();
+}
+
+void __init sme_unmap_bootdata(char *real_mode_data)
+{
+ struct boot_params *boot_data;
+ unsigned long cmdline_paddr;
+
+ if (!cc_platform_has(CC_ATTR_HOST_MEM_ENCRYPT))
+ return;
+
+ /* Get the command line address before unmapping the real_mode_data */
+ boot_data = (struct boot_params *)real_mode_data;
+ cmdline_paddr = boot_data->hdr.cmd_line_ptr | ((u64)boot_data->ext_cmd_line_ptr << 32);
+
+ __sme_early_map_unmap_mem(real_mode_data, sizeof(boot_params), false);
+
+ if (!cmdline_paddr)
+ return;
+
+ __sme_early_map_unmap_mem(__va(cmdline_paddr), COMMAND_LINE_SIZE, false);
+}
+
+void __init sme_map_bootdata(char *real_mode_data)
+{
+ struct boot_params *boot_data;
+ unsigned long cmdline_paddr;
+
+ if (!cc_platform_has(CC_ATTR_HOST_MEM_ENCRYPT))
+ return;
+
+ __sme_early_map_unmap_mem(real_mode_data, sizeof(boot_params), true);
+
+ /* Get the command line address after mapping the real_mode_data */
+ boot_data = (struct boot_params *)real_mode_data;
+ cmdline_paddr = boot_data->hdr.cmd_line_ptr | ((u64)boot_data->ext_cmd_line_ptr << 32);
+
+ if (!cmdline_paddr)
+ return;
+
+ __sme_early_map_unmap_mem(__va(cmdline_paddr), COMMAND_LINE_SIZE, true);
+}
+
+void __init sme_early_init(void)
+{
+ unsigned int i;
+
+ if (!sme_me_mask)
+ return;
+
+ early_pmd_flags = __sme_set(early_pmd_flags);
+
+ __supported_pte_mask = __sme_set(__supported_pte_mask);
+
+ /* Update the protection map with memory encryption mask */
+ for (i = 0; i < ARRAY_SIZE(protection_map); i++)
+ protection_map[i] = pgprot_encrypted(protection_map[i]);
+
+ if (cc_platform_has(CC_ATTR_GUEST_MEM_ENCRYPT))
+ swiotlb_force = SWIOTLB_FORCE;
+}
+
+void __init sev_setup_arch(void)
+{
+ phys_addr_t total_mem = memblock_phys_mem_size();
+ unsigned long size;
+
+ if (!cc_platform_has(CC_ATTR_GUEST_MEM_ENCRYPT))
+ return;
+
+ /*
+ * For SEV, all DMA has to occur via shared/unencrypted pages.
+ * SEV uses SWIOTLB to make this happen without changing device
+ * drivers. However, depending on the workload being run, the
+ * default 64MB of SWIOTLB may not be enough and SWIOTLB may
+ * run out of buffers for DMA, resulting in I/O errors and/or
+ * performance degradation especially with high I/O workloads.
+ *
+ * Adjust the default size of SWIOTLB for SEV guests using
+ * a percentage of guest memory for SWIOTLB buffers.
+ * Also, as the SWIOTLB bounce buffer memory is allocated
+ * from low memory, ensure that the adjusted size is within
+ * the limits of low available memory.
+ *
+	 * The percentage of guest memory used here for SWIOTLB buffers
+	 * roughly matches the earlier static adjustment: 64MB for <1G,
+	 * and ~128MB to 256MB for 1G-to-4G, i.e., about 6%.
+ */
+ size = total_mem * 6 / 100;
+ size = clamp_val(size, IO_TLB_DEFAULT_SIZE, SZ_1G);
+ swiotlb_adjust_size(size);
+}
+
+static unsigned long pg_level_to_pfn(int level, pte_t *kpte, pgprot_t *ret_prot)
+{
+ unsigned long pfn = 0;
+ pgprot_t prot;
+
+ switch (level) {
+ case PG_LEVEL_4K:
+ pfn = pte_pfn(*kpte);
+ prot = pte_pgprot(*kpte);
+ break;
+ case PG_LEVEL_2M:
+ pfn = pmd_pfn(*(pmd_t *)kpte);
+ prot = pmd_pgprot(*(pmd_t *)kpte);
+ break;
+ case PG_LEVEL_1G:
+ pfn = pud_pfn(*(pud_t *)kpte);
+ prot = pud_pgprot(*(pud_t *)kpte);
+ break;
+ default:
+ WARN_ONCE(1, "Invalid level for kpte\n");
+ return 0;
+ }
+
+ if (ret_prot)
+ *ret_prot = prot;
+
+ return pfn;
+}
+
+void notify_range_enc_status_changed(unsigned long vaddr, int npages, bool enc)
+{
+#ifdef CONFIG_PARAVIRT
+ unsigned long sz = npages << PAGE_SHIFT;
+ unsigned long vaddr_end = vaddr + sz;
+
+ while (vaddr < vaddr_end) {
+ int psize, pmask, level;
+ unsigned long pfn;
+ pte_t *kpte;
+
+ kpte = lookup_address(vaddr, &level);
+ if (!kpte || pte_none(*kpte)) {
+ WARN_ONCE(1, "kpte lookup for vaddr\n");
+ return;
+ }
+
+ pfn = pg_level_to_pfn(level, kpte, NULL);
+ if (!pfn)
+ continue;
+
+ psize = page_level_size(level);
+ pmask = page_level_mask(level);
+
+ notify_page_enc_status_changed(pfn, psize >> PAGE_SHIFT, enc);
+
+ vaddr = (vaddr & pmask) + psize;
+ }
+#endif
+}
+
+static void __init __set_clr_pte_enc(pte_t *kpte, int level, bool enc)
+{
+ pgprot_t old_prot, new_prot;
+ unsigned long pfn, pa, size;
+ pte_t new_pte;
+
+ pfn = pg_level_to_pfn(level, kpte, &old_prot);
+ if (!pfn)
+ return;
+
+ new_prot = old_prot;
+ if (enc)
+ pgprot_val(new_prot) |= _PAGE_ENC;
+ else
+ pgprot_val(new_prot) &= ~_PAGE_ENC;
+
+ /* If prot is same then do nothing. */
+ if (pgprot_val(old_prot) == pgprot_val(new_prot))
+ return;
+
+ pa = pfn << PAGE_SHIFT;
+ size = page_level_size(level);
+
+ /*
+ * We are going to perform in-place en-/decryption and change the
+ * physical page attribute from C=1 to C=0 or vice versa. Flush the
+ * caches to ensure that data gets accessed with the correct C-bit.
+ */
+ clflush_cache_range(__va(pa), size);
+
+ /* Encrypt/decrypt the contents in-place */
+ if (enc)
+ sme_early_encrypt(pa, size);
+ else
+ sme_early_decrypt(pa, size);
+
+ /* Change the page encryption mask. */
+ new_pte = pfn_pte(pfn, new_prot);
+ set_pte_atomic(kpte, new_pte);
+}
+
+static int __init early_set_memory_enc_dec(unsigned long vaddr,
+ unsigned long size, bool enc)
+{
+ unsigned long vaddr_end, vaddr_next, start;
+ unsigned long psize, pmask;
+ int split_page_size_mask;
+ int level, ret;
+ pte_t *kpte;
+
+ start = vaddr;
+ vaddr_next = vaddr;
+ vaddr_end = vaddr + size;
+
+ for (; vaddr < vaddr_end; vaddr = vaddr_next) {
+ kpte = lookup_address(vaddr, &level);
+ if (!kpte || pte_none(*kpte)) {
+ ret = 1;
+ goto out;
+ }
+
+ if (level == PG_LEVEL_4K) {
+ __set_clr_pte_enc(kpte, level, enc);
+ vaddr_next = (vaddr & PAGE_MASK) + PAGE_SIZE;
+ continue;
+ }
+
+ psize = page_level_size(level);
+ pmask = page_level_mask(level);
+
+ /*
+ * Check whether we can change the large page in one go.
+ * We request a split when the address is not aligned and
+		 * the number of pages to set/clear the encryption bit for is
+		 * smaller than the number of pages in the large page.
+ */
+ if (vaddr == (vaddr & pmask) &&
+ ((vaddr_end - vaddr) >= psize)) {
+ __set_clr_pte_enc(kpte, level, enc);
+ vaddr_next = (vaddr & pmask) + psize;
+ continue;
+ }
+
+ /*
+		 * The virtual address is part of a larger page, so create the
+		 * next level page table mapping (4K or 2M). If it is part of a
+		 * 2M page, request a split of the large page into 4K chunks;
+		 * a 1GB large page is split into 2M pages instead.
+ */
+ if (level == PG_LEVEL_2M)
+ split_page_size_mask = 0;
+ else
+ split_page_size_mask = 1 << PG_LEVEL_2M;
+
+ /*
+ * kernel_physical_mapping_change() does not flush the TLBs, so
+ * a TLB flush is required after we exit from the for loop.
+ */
+ kernel_physical_mapping_change(__pa(vaddr & pmask),
+ __pa((vaddr_end & pmask) + psize),
+ split_page_size_mask);
+ }
+
+ ret = 0;
+
+ notify_range_enc_status_changed(start, PAGE_ALIGN(size) >> PAGE_SHIFT, enc);
+out:
+ __flush_tlb_all();
+ return ret;
+}
+
+int __init early_set_memory_decrypted(unsigned long vaddr, unsigned long size)
+{
+ return early_set_memory_enc_dec(vaddr, size, false);
+}
+
+int __init early_set_memory_encrypted(unsigned long vaddr, unsigned long size)
+{
+ return early_set_memory_enc_dec(vaddr, size, true);
+}
+
+void __init early_set_mem_enc_dec_hypercall(unsigned long vaddr, int npages, bool enc)
+{
+ notify_range_enc_status_changed(vaddr, npages, enc);
+}
+
+void __init mem_encrypt_free_decrypted_mem(void)
+{
+ unsigned long vaddr, vaddr_end, npages;
+ int r;
+
+ vaddr = (unsigned long)__start_bss_decrypted_unused;
+ vaddr_end = (unsigned long)__end_bss_decrypted;
+ npages = (vaddr_end - vaddr) >> PAGE_SHIFT;
+
+ /*
+ * The unused memory range was mapped decrypted, change the encryption
+ * attribute from decrypted to encrypted before freeing it.
+ */
+ if (cc_platform_has(CC_ATTR_MEM_ENCRYPT)) {
+ r = set_memory_encrypted(vaddr, npages);
+ if (r) {
+ pr_warn("failed to free unused decrypted pages\n");
+ return;
+ }
+ }
+
+ free_init_pages("unused decrypted", vaddr, vaddr_end);
+}
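
A quick sanity check of the SWIOTLB sizing in sev_setup_arch() above (a standalone sketch, not kernel code; the 64MB and 1GB bounds stand in for IO_TLB_DEFAULT_SIZE and SZ_1G):

#include <stdio.h>

int main(void)
{
	const unsigned long long MB = 1024ULL * 1024;
	const unsigned long long lo = 64 * MB;    /* stands in for IO_TLB_DEFAULT_SIZE */
	const unsigned long long hi = 1024 * MB;  /* stands in for SZ_1G */
	unsigned long long mem[] = { 1024 * MB, 4096 * MB, 65536 * MB };

	for (int i = 0; i < 3; i++) {
		unsigned long long size = mem[i] * 6 / 100;  /* 6% of guest memory */

		if (size < lo)   /* models clamp_val(size, IO_TLB_DEFAULT_SIZE, SZ_1G) */
			size = lo;
		if (size > hi)
			size = hi;
		printf("%5lluM guest -> %4lluM SWIOTLB\n", mem[i] / MB, size / MB);
	}
	return 0;
}

For a 1GB guest, 6% is about 61MB, so the clamp keeps the 64MB default; a 4GB guest gets roughly 245MB, and anything very large is capped at 1GB.
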
diff --git a/arch/x86/net/bpf_jit_comp.c b/arch/x86/net/bpf_jit_comp.c
index 726700fabca6..bafe36e69227 100644
--- a/arch/x86/net/bpf_jit_comp.c
+++ b/arch/x86/net/bpf_jit_comp.c
@@ -1252,19 +1252,54 @@ st: if (is_imm8(insn->off))
case BPF_LDX | BPF_MEM | BPF_DW:
case BPF_LDX | BPF_PROBE_MEM | BPF_DW:
if (BPF_MODE(insn->code) == BPF_PROBE_MEM) {
- /* test src_reg, src_reg */
- maybe_emit_mod(&prog, src_reg, src_reg, true); /* always 1 byte */
- EMIT2(0x85, add_2reg(0xC0, src_reg, src_reg));
- /* jne start_of_ldx */
- EMIT2(X86_JNE, 0);
+				/* Though the verifier prevents negative insn->off in BPF_PROBE_MEM,
+				 * add abs(insn->off) to the limit to make sure that a negative
+				 * offset won't be an issue.
+ * insn->off is s16, so it won't affect valid pointers.
+ */
+ u64 limit = TASK_SIZE_MAX + PAGE_SIZE + abs(insn->off);
+ u8 *end_of_jmp1, *end_of_jmp2;
+
+ /* Conservatively check that src_reg + insn->off is a kernel address:
+ * 1. src_reg + insn->off >= limit
+ * 2. src_reg + insn->off doesn't become small positive.
+ * Cannot do src_reg + insn->off >= limit in one branch,
+ * since it needs two spare registers, but JIT has only one.
+ */
+
+ /* movabsq r11, limit */
+ EMIT2(add_1mod(0x48, AUX_REG), add_1reg(0xB8, AUX_REG));
+ EMIT((u32)limit, 4);
+ EMIT(limit >> 32, 4);
+ /* cmp src_reg, r11 */
+ maybe_emit_mod(&prog, src_reg, AUX_REG, true);
+ EMIT2(0x39, add_2reg(0xC0, src_reg, AUX_REG));
+ /* if unsigned '<' goto end_of_jmp2 */
+ EMIT2(X86_JB, 0);
+ end_of_jmp1 = prog;
+
+ /* mov r11, src_reg */
+ emit_mov_reg(&prog, true, AUX_REG, src_reg);
+ /* add r11, insn->off */
+ maybe_emit_1mod(&prog, AUX_REG, true);
+ EMIT2_off32(0x81, add_1reg(0xC0, AUX_REG), insn->off);
+ /* jmp if not carry to start_of_ldx
+ * Otherwise ERR_PTR(-EINVAL) + 128 will be the user addr
+ * that has to be rejected.
+ */
+ EMIT2(0x73 /* JNC */, 0);
+ end_of_jmp2 = prog;
+
/* xor dst_reg, dst_reg */
emit_mov_imm32(&prog, false, dst_reg, 0);
/* jmp byte_after_ldx */
EMIT2(0xEB, 0);
- /* populate jmp_offset for JNE above */
- temp[4] = prog - temp - 5 /* sizeof(test + jne) */;
+ /* populate jmp_offset for JB above to jump to xor dst_reg */
+ end_of_jmp1[-1] = end_of_jmp2 - end_of_jmp1;
+ /* populate jmp_offset for JNC above to jump to start_of_ldx */
start_of_ldx = prog;
+ end_of_jmp2[-1] = start_of_ldx - end_of_jmp2;
}
emit_ldx(&prog, BPF_SIZE(insn->code), dst_reg, src_reg, insn->off);
if (BPF_MODE(insn->code) == BPF_PROBE_MEM) {
@@ -1305,7 +1340,7 @@ st: if (is_imm8(insn->off))
* End result: x86 insn "mov rbx, qword ptr [rax+0x14]"
* of 4 bytes will be ignored and rbx will be zero inited.
*/
- ex->fixup = (prog - temp) | (reg2pt_regs[dst_reg] << 8);
+ ex->fixup = (prog - start_of_ldx) | (reg2pt_regs[dst_reg] << 8);
}
break;
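
The emitted instruction sequence above is easier to follow as plain C. A sketch of the check the JITed BPF_PROBE_MEM code now performs; the *_EXAMPLE constants stand in for the kernel's TASK_SIZE_MAX and PAGE_SIZE, and this models the logic rather than quoting kernel source:

#include <stdbool.h>
#include <stdint.h>
#include <stdio.h>
#include <stdlib.h>

#define TASK_SIZE_MAX_EXAMPLE	0x00007ffffffff000ULL	/* illustrative value */
#define PAGE_SIZE_EXAMPLE	4096ULL

/* Model of the guard emitted before a BPF_PROBE_MEM load. off is >= 0 here,
 * since the verifier rejects negative offsets for BPF_PROBE_MEM. */
static bool probe_mem_allows_load(uint64_t src_reg, int16_t off)
{
	uint64_t limit = TASK_SIZE_MAX_EXAMPLE + PAGE_SIZE_EXAMPLE + (uint64_t)abs(off);
	uint64_t sum = src_reg + (uint64_t)off;

	if (src_reg < limit)	/* user-space pointer: the JIT zeroes dst_reg instead */
		return false;
	if (sum < src_reg)	/* the add carried, e.g. ERR_PTR(-EINVAL) + off */
		return false;
	return true;		/* looks like a kernel address: perform the load */
}

int main(void)
{
	printf("%d\n", probe_mem_allows_load(0xffff888000000000ULL, 16));	/* 1 */
	printf("%d\n", probe_mem_allows_load(0x00007f0000000000ULL, 16));	/* 0 */
	printf("%d\n", probe_mem_allows_load((uint64_t)-22, 128));		/* 0 */
	return 0;
}
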
diff --git a/arch/x86/platform/ce4100/falconfalls.dts b/arch/x86/platform/ce4100/falconfalls.dts
index 0ac3d4357136..65fa3d866226 100644
--- a/arch/x86/platform/ce4100/falconfalls.dts
+++ b/arch/x86/platform/ce4100/falconfalls.dts
@@ -249,7 +249,7 @@
gpio@26 {
#gpio-cells = <2>;
- compatible = "ti,pcf8575";
+ compatible = "nxp,pcf8575";
reg = <0x26>;
gpio-controller;
};
@@ -263,7 +263,7 @@
gpio@26 {
#gpio-cells = <2>;
- compatible = "ti,pcf8575";
+ compatible = "nxp,pcf8575";
reg = <0x26>;
gpio-controller;
};
diff --git a/arch/x86/platform/efi/quirks.c b/arch/x86/platform/efi/quirks.c
index b15ebfe40a73..b0b848d6933a 100644
--- a/arch/x86/platform/efi/quirks.c
+++ b/arch/x86/platform/efi/quirks.c
@@ -277,7 +277,8 @@ void __init efi_arch_mem_reserve(phys_addr_t addr, u64 size)
return;
}
- new = early_memremap(data.phys_map, data.size);
+ new = early_memremap_prot(data.phys_map, data.size,
+ pgprot_val(pgprot_encrypted(FIXMAP_PAGE_NORMAL)));
if (!new) {
pr_err("Failed to map new boot services memmap\n");
return;
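
For orientation (a toy model under stated assumptions, not the kernel implementation): pgprot_encrypted() folds the SME C-bit mask into the page protection, which is why the early_memremap_prot() call above produces an encrypted mapping when SME is active. The flag values and bit position below are made up for illustration:

#include <stdint.h>
#include <stdio.h>

/* Toy pgprot_encrypted(): OR the C-bit mask into the protection bits. */
static uint64_t pgprot_encrypted_example(uint64_t prot, uint64_t c_bit_mask)
{
	return prot | c_bit_mask;
}

int main(void)
{
	uint64_t fixmap_page_normal = 0x163;	/* made-up PTE flag bits */
	uint64_t c_bit_mask = 1ULL << 47;	/* the real C-bit position is reported by CPUID */

	printf("0x%llx\n", (unsigned long long)
	       pgprot_encrypted_example(fixmap_page_normal, c_bit_mask));
	return 0;
}
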
diff --git a/arch/x86/realmode/init.c b/arch/x86/realmode/init.c
index 6d98609387ba..c5e29db02a46 100644
--- a/arch/x86/realmode/init.c
+++ b/arch/x86/realmode/init.c
@@ -98,6 +98,7 @@ static void __init setup_real_mode(void)
#ifdef CONFIG_X86_64
u64 *trampoline_pgd;
u64 efer;
+ int i;
#endif
base = (unsigned char *)real_mode_header;
@@ -154,8 +155,17 @@ static void __init setup_real_mode(void)
trampoline_header->flags = 0;
trampoline_pgd = (u64 *) __va(real_mode_header->trampoline_pgd);
+
+ /* Map the real mode stub as virtual == physical */
trampoline_pgd[0] = trampoline_pgd_entry.pgd;
- trampoline_pgd[511] = init_top_pgt[511].pgd;
+
+ /*
+ * Include the entirety of the kernel mapping into the trampoline
+ * PGD. This way, all mappings present in the normal kernel page
+ * tables are usable while running on trampoline_pgd.
+ */
+ for (i = pgd_index(__PAGE_OFFSET); i < PTRS_PER_PGD; i++)
+ trampoline_pgd[i] = init_top_pgt[i].pgd;
#endif
sme_sev_setup_real_mode(trampoline_header);
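
For reference (a standalone sketch, not kernel code): pgd_index() selects the top-level page-table slot for a virtual address, so the loop added above copies every kernel-half PGD entry into the trampoline page table instead of only the last one. The constants below assume 4-level paging; 5-level paging uses different shift and base values:

#include <stdint.h>
#include <stdio.h>

#define PGDIR_SHIFT_EXAMPLE	39	/* 4-level paging: each PGD entry covers 512 GiB */
#define PTRS_PER_PGD_EXAMPLE	512
#define PAGE_OFFSET_EXAMPLE	0xffff888000000000ULL	/* typical 4-level direct-map base */

static unsigned int pgd_index_example(uint64_t vaddr)
{
	return (vaddr >> PGDIR_SHIFT_EXAMPLE) & (PTRS_PER_PGD_EXAMPLE - 1);
}

int main(void)
{
	unsigned int first = pgd_index_example(PAGE_OFFSET_EXAMPLE);

	/* The trampoline setup loop walks PGD slots [first, 511]. */
	printf("copying PGD slots %u..%u\n", first,
	       (unsigned int)(PTRS_PER_PGD_EXAMPLE - 1));
	return 0;
}
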
diff --git a/arch/x86/tools/relocs.c b/arch/x86/tools/relocs.c
index c736cf2ac76b..e2c5b296120d 100644
--- a/arch/x86/tools/relocs.c
+++ b/arch/x86/tools/relocs.c
@@ -68,7 +68,7 @@ static const char * const sym_regex_kernel[S_NSYMTYPES] = {
"(__parainstructions|__alt_instructions)(_end)?|"
"(__iommu_table|__apicdrivers|__smp_locks)(_end)?|"
"__(start|end)_pci_.*|"
-#if CONFIG_FW_LOADER_BUILTIN
+#if CONFIG_FW_LOADER
"__(start|end)_builtin_fw|"
#endif
"__(start|stop)___ksymtab(_gpl)?|"
diff --git a/arch/x86/xen/xen-asm.S b/arch/x86/xen/xen-asm.S
index 220dd9678494..444d824775f6 100644
--- a/arch/x86/xen/xen-asm.S
+++ b/arch/x86/xen/xen-asm.S
@@ -20,6 +20,7 @@
#include <linux/init.h>
#include <linux/linkage.h>
+#include <../entry/calling.h>
.pushsection .noinstr.text, "ax"
/*
@@ -193,6 +194,25 @@ SYM_CODE_START(xen_iret)
SYM_CODE_END(xen_iret)
/*
+ * Xen PV doesn't use a trampoline stack: PER_CPU_VAR(cpu_tss_rw + TSS_sp0) is
+ * also the kernel stack. Reusing swapgs_restore_regs_and_return_to_usermode()
+ * under Xen PV would move %rsp up to the top of the kernel stack and leave the
+ * IRET frame below %rsp, where it could be corrupted if an #NMI interrupts.
+ * Having swapgs_restore_regs_and_return_to_usermode() push the IRET frame to
+ * the same address again would also be pointless.
+ */
+SYM_CODE_START(xenpv_restore_regs_and_return_to_usermode)
+ UNWIND_HINT_REGS
+ POP_REGS
+
+ /* stackleak_erase() can work safely on the kernel stack. */
+ STACKLEAK_ERASE_NOCLOBBER
+
+ addq $8, %rsp /* skip regs->orig_ax */
+ jmp xen_iret
+SYM_CODE_END(xenpv_restore_regs_and_return_to_usermode)
+
+/*
* Xen handles syscall callbacks much like ordinary exceptions, which
* means we have:
* - kernel gs