diff options
Diffstat (limited to 'arch/x86')
197 files changed, 7507 insertions, 2763 deletions
diff --git a/arch/x86/.gitignore b/arch/x86/.gitignore index 677111acbaa3..f2e1d6c347fb 100644 --- a/arch/x86/.gitignore +++ b/arch/x86/.gitignore @@ -3,6 +3,4 @@ boot/compressed/vmlinux tools/test_get_len tools/insn_sanity tools/insn_decoder_test -purgatory/kexec-purgatory.c purgatory/purgatory.ro - diff --git a/arch/x86/Kconfig b/arch/x86/Kconfig index be0b95e51df6..f9920f1341c8 100644 --- a/arch/x86/Kconfig +++ b/arch/x86/Kconfig @@ -94,7 +94,6 @@ config X86 select ARCH_HAS_SYNC_CORE_BEFORE_USERMODE select ARCH_HAS_SYSCALL_WRAPPER select ARCH_HAS_UBSAN_SANITIZE_ALL - select ARCH_HAS_VM_GET_PAGE_PROT select ARCH_HAS_DEBUG_WX select ARCH_HAS_ZONE_DMA_SET if EXPERT select ARCH_HAVE_NMI_SAFE_CMPXCHG @@ -186,8 +185,8 @@ config X86 select HAVE_ASM_MODVERSIONS select HAVE_CMPXCHG_DOUBLE select HAVE_CMPXCHG_LOCAL - select HAVE_CONTEXT_TRACKING if X86_64 - select HAVE_CONTEXT_TRACKING_OFFSTACK if HAVE_CONTEXT_TRACKING + select HAVE_CONTEXT_TRACKING_USER if X86_64 + select HAVE_CONTEXT_TRACKING_USER_OFFSTACK if HAVE_CONTEXT_TRACKING_USER select HAVE_C_RECORDMCOUNT select HAVE_OBJTOOL_MCOUNT if HAVE_OBJTOOL select HAVE_BUILDTIME_MCOUNT_SORT @@ -245,6 +244,7 @@ config X86 select HAVE_PERF_REGS select HAVE_PERF_USER_STACK_DUMP select MMU_GATHER_RCU_TABLE_FREE if PARAVIRT + select MMU_GATHER_MERGE_VMAS select HAVE_POSIX_CPU_TIMERS_TASK_WORK select HAVE_REGS_AND_STACK_ACCESS_API select HAVE_RELIABLE_STACKTRACE if UNWINDER_ORC || STACK_VALIDATION @@ -277,8 +277,8 @@ config X86 select SYSCTL_EXCEPTION_TRACE select THREAD_INFO_IN_TASK select TRACE_IRQFLAGS_SUPPORT + select TRACE_IRQFLAGS_NMI_SUPPORT select USER_STACKTRACE_SUPPORT - select VIRT_TO_BUS select HAVE_ARCH_KCSAN if X86_64 select X86_FEATURE_NAMES if PROC_FS select PROC_PID_ARCH_STATUS if PROC_FS @@ -391,8 +391,8 @@ config PGTABLE_LEVELS config CC_HAS_SANE_STACKPROTECTOR bool - default $(success,$(srctree)/scripts/gcc-x86_64-has-stack-protector.sh $(CC)) if 64BIT - default $(success,$(srctree)/scripts/gcc-x86_32-has-stack-protector.sh $(CC)) + default $(success,$(srctree)/scripts/gcc-x86_64-has-stack-protector.sh $(CC) $(CLANG_FLAGS)) if 64BIT + default $(success,$(srctree)/scripts/gcc-x86_32-has-stack-protector.sh $(CC) $(CLANG_FLAGS)) help We have to make sure stack protector is unconditionally disabled if the compiler produces broken code or if it does not let us control @@ -462,29 +462,6 @@ config GOLDFISH def_bool y depends on X86_GOLDFISH -config RETPOLINE - bool "Avoid speculative indirect branches in kernel" - select OBJTOOL if HAVE_OBJTOOL - default y - help - Compile kernel with the retpoline compiler options to guard against - kernel-to-user data leaks by avoiding speculative indirect - branches. Requires a compiler with -mindirect-branch=thunk-extern - support for full protection. The kernel may run slower. - -config CC_HAS_SLS - def_bool $(cc-option,-mharden-sls=all) - -config SLS - bool "Mitigate Straight-Line-Speculation" - depends on CC_HAS_SLS && X86_64 - select OBJTOOL if HAVE_OBJTOOL - default n - help - Compile the kernel with straight-line-speculation options to guard - against straight line speculation. The kernel image might be slightly - larger. - config X86_CPU_RESCTRL bool "x86 CPU resource control support" depends on X86 && (CPU_SUP_INTEL || CPU_SUP_AMD) @@ -1833,15 +1810,6 @@ config ARCH_USES_PG_UNCACHED def_bool y depends on X86_PAT -config ARCH_RANDOM - def_bool y - prompt "x86 architectural random number generator" if EXPERT - help - Enable the x86 architectural RDRAND instruction - (Intel Bull Mountain technology) to generate random numbers. - If supported, this is a high bandwidth, cryptographically - secure hardware random number generator. - config X86_UMIP def_bool y prompt "User Mode Instruction Prevention" if EXPERT @@ -2032,7 +2000,7 @@ config KEXEC config KEXEC_FILE bool "kexec file based system call" select KEXEC_CORE - select BUILD_BIN2C + select HAVE_IMA_KEXEC if IMA depends on X86_64 depends on CRYPTO=y depends on CRYPTO_SHA256=y @@ -2453,6 +2421,91 @@ source "kernel/livepatch/Kconfig" endmenu +config CC_HAS_SLS + def_bool $(cc-option,-mharden-sls=all) + +config CC_HAS_RETURN_THUNK + def_bool $(cc-option,-mfunction-return=thunk-extern) + +menuconfig SPECULATION_MITIGATIONS + bool "Mitigations for speculative execution vulnerabilities" + default y + help + Say Y here to enable options which enable mitigations for + speculative execution hardware vulnerabilities. + + If you say N, all mitigations will be disabled. You really + should know what you are doing to say so. + +if SPECULATION_MITIGATIONS + +config PAGE_TABLE_ISOLATION + bool "Remove the kernel mapping in user mode" + default y + depends on (X86_64 || X86_PAE) + help + This feature reduces the number of hardware side channels by + ensuring that the majority of kernel addresses are not mapped + into userspace. + + See Documentation/x86/pti.rst for more details. + +config RETPOLINE + bool "Avoid speculative indirect branches in kernel" + select OBJTOOL if HAVE_OBJTOOL + default y + help + Compile kernel with the retpoline compiler options to guard against + kernel-to-user data leaks by avoiding speculative indirect + branches. Requires a compiler with -mindirect-branch=thunk-extern + support for full protection. The kernel may run slower. + +config RETHUNK + bool "Enable return-thunks" + depends on RETPOLINE && CC_HAS_RETURN_THUNK + select OBJTOOL if HAVE_OBJTOOL + default y if X86_64 + help + Compile the kernel with the return-thunks compiler option to guard + against kernel-to-user data leaks by avoiding return speculation. + Requires a compiler with -mfunction-return=thunk-extern + support for full protection. The kernel may run slower. + +config CPU_UNRET_ENTRY + bool "Enable UNRET on kernel entry" + depends on CPU_SUP_AMD && RETHUNK && X86_64 + default y + help + Compile the kernel with support for the retbleed=unret mitigation. + +config CPU_IBPB_ENTRY + bool "Enable IBPB on kernel entry" + depends on CPU_SUP_AMD && X86_64 + default y + help + Compile the kernel with support for the retbleed=ibpb mitigation. + +config CPU_IBRS_ENTRY + bool "Enable IBRS on kernel entry" + depends on CPU_SUP_INTEL && X86_64 + default y + help + Compile the kernel with support for the spectre_v2=ibrs mitigation. + This mitigates both spectre_v2 and retbleed at great cost to + performance. + +config SLS + bool "Mitigate Straight-Line-Speculation" + depends on CC_HAS_SLS && X86_64 + select OBJTOOL if HAVE_OBJTOOL + default n + help + Compile the kernel with straight-line-speculation options to guard + against straight line speculation. The kernel image might be slightly + larger. + +endif + config ARCH_HAS_ADD_PAGES def_bool y depends on ARCH_ENABLE_MEMORY_HOTPLUG diff --git a/arch/x86/Kconfig.debug b/arch/x86/Kconfig.debug index 340399f69954..bdfe08f1a930 100644 --- a/arch/x86/Kconfig.debug +++ b/arch/x86/Kconfig.debug @@ -1,8 +1,5 @@ # SPDX-License-Identifier: GPL-2.0 -config TRACE_IRQFLAGS_NMI_SUPPORT - def_bool y - config EARLY_PRINTK_USB bool diff --git a/arch/x86/Makefile b/arch/x86/Makefile index a74886aed349..7854685c5f25 100644 --- a/arch/x86/Makefile +++ b/arch/x86/Makefile @@ -21,6 +21,13 @@ ifdef CONFIG_CC_IS_CLANG RETPOLINE_CFLAGS := -mretpoline-external-thunk RETPOLINE_VDSO_CFLAGS := -mretpoline endif + +ifdef CONFIG_RETHUNK +RETHUNK_CFLAGS := -mfunction-return=thunk-extern +RETPOLINE_CFLAGS += $(RETHUNK_CFLAGS) +endif + +export RETHUNK_CFLAGS export RETPOLINE_CFLAGS export RETPOLINE_VDSO_CFLAGS diff --git a/arch/x86/boot/compressed/ident_map_64.c b/arch/x86/boot/compressed/ident_map_64.c index 44c350d627c7..d4a314cc50d6 100644 --- a/arch/x86/boot/compressed/ident_map_64.c +++ b/arch/x86/boot/compressed/ident_map_64.c @@ -110,6 +110,7 @@ void kernel_add_identity_map(unsigned long start, unsigned long end) void initialize_identity_maps(void *rmode) { unsigned long cmdline; + struct setup_data *sd; /* Exclude the encryption mask from __PHYSICAL_MASK */ physical_mask &= ~sme_me_mask; @@ -163,6 +164,18 @@ void initialize_identity_maps(void *rmode) cmdline = get_cmd_line_ptr(); kernel_add_identity_map(cmdline, cmdline + COMMAND_LINE_SIZE); + /* + * Also map the setup_data entries passed via boot_params in case they + * need to be accessed by uncompressed kernel via the identity mapping. + */ + sd = (struct setup_data *)boot_params->hdr.setup_data; + while (sd) { + unsigned long sd_addr = (unsigned long)sd; + + kernel_add_identity_map(sd_addr, sd_addr + sizeof(*sd) + sd->len); + sd = (struct setup_data *)sd->next; + } + sev_prep_identity_maps(top_level_pgt); /* Load the new page-table. */ diff --git a/arch/x86/configs/i386_defconfig b/arch/x86/configs/i386_defconfig index 98a4852ed6a0..7207219509f6 100644 --- a/arch/x86/configs/i386_defconfig +++ b/arch/x86/configs/i386_defconfig @@ -135,7 +135,6 @@ CONFIG_DEVTMPFS=y CONFIG_DEVTMPFS_MOUNT=y CONFIG_DEBUG_DEVRES=y CONFIG_CONNECTOR=y -CONFIG_EFI_VARS=y CONFIG_EFI_CAPSULE_LOADER=y CONFIG_BLK_DEV_LOOP=y CONFIG_VIRTIO_BLK=y diff --git a/arch/x86/configs/x86_64_defconfig b/arch/x86/configs/x86_64_defconfig index 69784505a7a8..5ce67b73e218 100644 --- a/arch/x86/configs/x86_64_defconfig +++ b/arch/x86/configs/x86_64_defconfig @@ -134,7 +134,6 @@ CONFIG_DEVTMPFS=y CONFIG_DEVTMPFS_MOUNT=y CONFIG_DEBUG_DEVRES=y CONFIG_CONNECTOR=y -CONFIG_EFI_VARS=y CONFIG_BLK_DEV_LOOP=y CONFIG_VIRTIO_BLK=y CONFIG_BLK_DEV_SD=y diff --git a/arch/x86/crypto/Makefile b/arch/x86/crypto/Makefile index 2831685adf6f..04d07ab744b2 100644 --- a/arch/x86/crypto/Makefile +++ b/arch/x86/crypto/Makefile @@ -61,14 +61,15 @@ sha256-ssse3-$(CONFIG_AS_SHA256_NI) += sha256_ni_asm.o obj-$(CONFIG_CRYPTO_SHA512_SSSE3) += sha512-ssse3.o sha512-ssse3-y := sha512-ssse3-asm.o sha512-avx-asm.o sha512-avx2-asm.o sha512_ssse3_glue.o -obj-$(CONFIG_CRYPTO_BLAKE2S_X86) += blake2s-x86_64.o -blake2s-x86_64-y := blake2s-shash.o -obj-$(if $(CONFIG_CRYPTO_BLAKE2S_X86),y) += libblake2s-x86_64.o +obj-$(CONFIG_CRYPTO_BLAKE2S_X86) += libblake2s-x86_64.o libblake2s-x86_64-y := blake2s-core.o blake2s-glue.o obj-$(CONFIG_CRYPTO_GHASH_CLMUL_NI_INTEL) += ghash-clmulni-intel.o ghash-clmulni-intel-y := ghash-clmulni-intel_asm.o ghash-clmulni-intel_glue.o +obj-$(CONFIG_CRYPTO_POLYVAL_CLMUL_NI) += polyval-clmulni.o +polyval-clmulni-y := polyval-clmulni_asm.o polyval-clmulni_glue.o + obj-$(CONFIG_CRYPTO_CRC32C_INTEL) += crc32c-intel.o crc32c-intel-y := crc32c-intel_glue.o crc32c-intel-$(CONFIG_64BIT) += crc32c-pcl-intel-asm_64.o diff --git a/arch/x86/crypto/aes_ctrby8_avx-x86_64.S b/arch/x86/crypto/aes_ctrby8_avx-x86_64.S index 43852ba6e19c..2402b9418cd7 100644 --- a/arch/x86/crypto/aes_ctrby8_avx-x86_64.S +++ b/arch/x86/crypto/aes_ctrby8_avx-x86_64.S @@ -23,6 +23,11 @@ #define VMOVDQ vmovdqu +/* + * Note: the "x" prefix in these aliases means "this is an xmm register". The + * alias prefixes have no relation to XCTR where the "X" prefix means "XOR + * counter". + */ #define xdata0 %xmm0 #define xdata1 %xmm1 #define xdata2 %xmm2 @@ -31,8 +36,10 @@ #define xdata5 %xmm5 #define xdata6 %xmm6 #define xdata7 %xmm7 -#define xcounter %xmm8 -#define xbyteswap %xmm9 +#define xcounter %xmm8 // CTR mode only +#define xiv %xmm8 // XCTR mode only +#define xbyteswap %xmm9 // CTR mode only +#define xtmp %xmm9 // XCTR mode only #define xkey0 %xmm10 #define xkey4 %xmm11 #define xkey8 %xmm12 @@ -45,7 +52,7 @@ #define p_keys %rdx #define p_out %rcx #define num_bytes %r8 - +#define counter %r9 // XCTR mode only #define tmp %r10 #define DDQ_DATA 0 #define XDATA 1 @@ -102,7 +109,7 @@ ddq_add_8: * do_aes num_in_par load_keys key_len * This increments p_in, but not p_out */ -.macro do_aes b, k, key_len +.macro do_aes b, k, key_len, xctr .set by, \b .set load_keys, \k .set klen, \key_len @@ -111,29 +118,48 @@ ddq_add_8: vmovdqa 0*16(p_keys), xkey0 .endif - vpshufb xbyteswap, xcounter, xdata0 - - .set i, 1 - .rept (by - 1) - club XDATA, i - vpaddq (ddq_add_1 + 16 * (i - 1))(%rip), xcounter, var_xdata - vptest ddq_low_msk(%rip), var_xdata - jnz 1f - vpaddq ddq_high_add_1(%rip), var_xdata, var_xdata - vpaddq ddq_high_add_1(%rip), xcounter, xcounter - 1: - vpshufb xbyteswap, var_xdata, var_xdata - .set i, (i +1) - .endr + .if \xctr + movq counter, xtmp + .set i, 0 + .rept (by) + club XDATA, i + vpaddq (ddq_add_1 + 16 * i)(%rip), xtmp, var_xdata + .set i, (i +1) + .endr + .set i, 0 + .rept (by) + club XDATA, i + vpxor xiv, var_xdata, var_xdata + .set i, (i +1) + .endr + .else + vpshufb xbyteswap, xcounter, xdata0 + .set i, 1 + .rept (by - 1) + club XDATA, i + vpaddq (ddq_add_1 + 16 * (i - 1))(%rip), xcounter, var_xdata + vptest ddq_low_msk(%rip), var_xdata + jnz 1f + vpaddq ddq_high_add_1(%rip), var_xdata, var_xdata + vpaddq ddq_high_add_1(%rip), xcounter, xcounter + 1: + vpshufb xbyteswap, var_xdata, var_xdata + .set i, (i +1) + .endr + .endif vmovdqa 1*16(p_keys), xkeyA vpxor xkey0, xdata0, xdata0 - vpaddq (ddq_add_1 + 16 * (by - 1))(%rip), xcounter, xcounter - vptest ddq_low_msk(%rip), xcounter - jnz 1f - vpaddq ddq_high_add_1(%rip), xcounter, xcounter - 1: + .if \xctr + add $by, counter + .else + vpaddq (ddq_add_1 + 16 * (by - 1))(%rip), xcounter, xcounter + vptest ddq_low_msk(%rip), xcounter + jnz 1f + vpaddq ddq_high_add_1(%rip), xcounter, xcounter + 1: + .endif .set i, 1 .rept (by - 1) @@ -371,94 +397,99 @@ ddq_add_8: .endr .endm -.macro do_aes_load val, key_len - do_aes \val, 1, \key_len +.macro do_aes_load val, key_len, xctr + do_aes \val, 1, \key_len, \xctr .endm -.macro do_aes_noload val, key_len - do_aes \val, 0, \key_len +.macro do_aes_noload val, key_len, xctr + do_aes \val, 0, \key_len, \xctr .endm /* main body of aes ctr load */ -.macro do_aes_ctrmain key_len +.macro do_aes_ctrmain key_len, xctr cmp $16, num_bytes - jb .Ldo_return2\key_len + jb .Ldo_return2\xctr\key_len - vmovdqa byteswap_const(%rip), xbyteswap - vmovdqu (p_iv), xcounter - vpshufb xbyteswap, xcounter, xcounter + .if \xctr + shr $4, counter + vmovdqu (p_iv), xiv + .else + vmovdqa byteswap_const(%rip), xbyteswap + vmovdqu (p_iv), xcounter + vpshufb xbyteswap, xcounter, xcounter + .endif mov num_bytes, tmp and $(7*16), tmp - jz .Lmult_of_8_blks\key_len + jz .Lmult_of_8_blks\xctr\key_len /* 1 <= tmp <= 7 */ cmp $(4*16), tmp - jg .Lgt4\key_len - je .Leq4\key_len + jg .Lgt4\xctr\key_len + je .Leq4\xctr\key_len -.Llt4\key_len: +.Llt4\xctr\key_len: cmp $(2*16), tmp - jg .Leq3\key_len - je .Leq2\key_len + jg .Leq3\xctr\key_len + je .Leq2\xctr\key_len -.Leq1\key_len: - do_aes_load 1, \key_len +.Leq1\xctr\key_len: + do_aes_load 1, \key_len, \xctr add $(1*16), p_out and $(~7*16), num_bytes - jz .Ldo_return2\key_len - jmp .Lmain_loop2\key_len + jz .Ldo_return2\xctr\key_len + jmp .Lmain_loop2\xctr\key_len -.Leq2\key_len: - do_aes_load 2, \key_len +.Leq2\xctr\key_len: + do_aes_load 2, \key_len, \xctr add $(2*16), p_out and $(~7*16), num_bytes - jz .Ldo_return2\key_len - jmp .Lmain_loop2\key_len + jz .Ldo_return2\xctr\key_len + jmp .Lmain_loop2\xctr\key_len -.Leq3\key_len: - do_aes_load 3, \key_len +.Leq3\xctr\key_len: + do_aes_load 3, \key_len, \xctr add $(3*16), p_out and $(~7*16), num_bytes - jz .Ldo_return2\key_len - jmp .Lmain_loop2\key_len + jz .Ldo_return2\xctr\key_len + jmp .Lmain_loop2\xctr\key_len -.Leq4\key_len: - do_aes_load 4, \key_len +.Leq4\xctr\key_len: + do_aes_load 4, \key_len, \xctr add $(4*16), p_out and $(~7*16), num_bytes - jz .Ldo_return2\key_len - jmp .Lmain_loop2\key_len + jz .Ldo_return2\xctr\key_len + jmp .Lmain_loop2\xctr\key_len -.Lgt4\key_len: +.Lgt4\xctr\key_len: cmp $(6*16), tmp - jg .Leq7\key_len - je .Leq6\key_len + jg .Leq7\xctr\key_len + je .Leq6\xctr\key_len -.Leq5\key_len: - do_aes_load 5, \key_len +.Leq5\xctr\key_len: + do_aes_load 5, \key_len, \xctr add $(5*16), p_out and $(~7*16), num_bytes - jz .Ldo_return2\key_len - jmp .Lmain_loop2\key_len + jz .Ldo_return2\xctr\key_len + jmp .Lmain_loop2\xctr\key_len -.Leq6\key_len: - do_aes_load 6, \key_len +.Leq6\xctr\key_len: + do_aes_load 6, \key_len, \xctr add $(6*16), p_out and $(~7*16), num_bytes - jz .Ldo_return2\key_len - jmp .Lmain_loop2\key_len + jz .Ldo_return2\xctr\key_len + jmp .Lmain_loop2\xctr\key_len -.Leq7\key_len: - do_aes_load 7, \key_len +.Leq7\xctr\key_len: + do_aes_load 7, \key_len, \xctr add $(7*16), p_out and $(~7*16), num_bytes - jz .Ldo_return2\key_len - jmp .Lmain_loop2\key_len + jz .Ldo_return2\xctr\key_len + jmp .Lmain_loop2\xctr\key_len -.Lmult_of_8_blks\key_len: +.Lmult_of_8_blks\xctr\key_len: .if (\key_len != KEY_128) vmovdqa 0*16(p_keys), xkey0 vmovdqa 4*16(p_keys), xkey4 @@ -471,17 +502,19 @@ ddq_add_8: vmovdqa 9*16(p_keys), xkey12 .endif .align 16 -.Lmain_loop2\key_len: +.Lmain_loop2\xctr\key_len: /* num_bytes is a multiple of 8 and >0 */ - do_aes_noload 8, \key_len + do_aes_noload 8, \key_len, \xctr add $(8*16), p_out sub $(8*16), num_bytes - jne .Lmain_loop2\key_len + jne .Lmain_loop2\xctr\key_len -.Ldo_return2\key_len: - /* return updated IV */ - vpshufb xbyteswap, xcounter, xcounter - vmovdqu xcounter, (p_iv) +.Ldo_return2\xctr\key_len: + .if !\xctr + /* return updated IV */ + vpshufb xbyteswap, xcounter, xcounter + vmovdqu xcounter, (p_iv) + .endif RET .endm @@ -494,7 +527,7 @@ ddq_add_8: */ SYM_FUNC_START(aes_ctr_enc_128_avx_by8) /* call the aes main loop */ - do_aes_ctrmain KEY_128 + do_aes_ctrmain KEY_128 0 SYM_FUNC_END(aes_ctr_enc_128_avx_by8) @@ -507,7 +540,7 @@ SYM_FUNC_END(aes_ctr_enc_128_avx_by8) */ SYM_FUNC_START(aes_ctr_enc_192_avx_by8) /* call the aes main loop */ - do_aes_ctrmain KEY_192 + do_aes_ctrmain KEY_192 0 SYM_FUNC_END(aes_ctr_enc_192_avx_by8) @@ -520,6 +553,45 @@ SYM_FUNC_END(aes_ctr_enc_192_avx_by8) */ SYM_FUNC_START(aes_ctr_enc_256_avx_by8) /* call the aes main loop */ - do_aes_ctrmain KEY_256 + do_aes_ctrmain KEY_256 0 SYM_FUNC_END(aes_ctr_enc_256_avx_by8) + +/* + * routine to do AES128 XCTR enc/decrypt "by8" + * XMM registers are clobbered. + * Saving/restoring must be done at a higher level + * aes_xctr_enc_128_avx_by8(const u8 *in, const u8 *iv, const void *keys, + * u8* out, unsigned int num_bytes, unsigned int byte_ctr) + */ +SYM_FUNC_START(aes_xctr_enc_128_avx_by8) + /* call the aes main loop */ + do_aes_ctrmain KEY_128 1 + +SYM_FUNC_END(aes_xctr_enc_128_avx_by8) + +/* + * routine to do AES192 XCTR enc/decrypt "by8" + * XMM registers are clobbered. + * Saving/restoring must be done at a higher level + * aes_xctr_enc_192_avx_by8(const u8 *in, const u8 *iv, const void *keys, + * u8* out, unsigned int num_bytes, unsigned int byte_ctr) + */ +SYM_FUNC_START(aes_xctr_enc_192_avx_by8) + /* call the aes main loop */ + do_aes_ctrmain KEY_192 1 + +SYM_FUNC_END(aes_xctr_enc_192_avx_by8) + +/* + * routine to do AES256 XCTR enc/decrypt "by8" + * XMM registers are clobbered. + * Saving/restoring must be done at a higher level + * aes_xctr_enc_256_avx_by8(const u8 *in, const u8 *iv, const void *keys, + * u8* out, unsigned int num_bytes, unsigned int byte_ctr) + */ +SYM_FUNC_START(aes_xctr_enc_256_avx_by8) + /* call the aes main loop */ + do_aes_ctrmain KEY_256 1 + +SYM_FUNC_END(aes_xctr_enc_256_avx_by8) diff --git a/arch/x86/crypto/aesni-intel_glue.c b/arch/x86/crypto/aesni-intel_glue.c index 41901ba9d3a2..a5b0cb3efeba 100644 --- a/arch/x86/crypto/aesni-intel_glue.c +++ b/arch/x86/crypto/aesni-intel_glue.c @@ -135,6 +135,20 @@ asmlinkage void aes_ctr_enc_192_avx_by8(const u8 *in, u8 *iv, void *keys, u8 *out, unsigned int num_bytes); asmlinkage void aes_ctr_enc_256_avx_by8(const u8 *in, u8 *iv, void *keys, u8 *out, unsigned int num_bytes); + + +asmlinkage void aes_xctr_enc_128_avx_by8(const u8 *in, const u8 *iv, + const void *keys, u8 *out, unsigned int num_bytes, + unsigned int byte_ctr); + +asmlinkage void aes_xctr_enc_192_avx_by8(const u8 *in, const u8 *iv, + const void *keys, u8 *out, unsigned int num_bytes, + unsigned int byte_ctr); + +asmlinkage void aes_xctr_enc_256_avx_by8(const u8 *in, const u8 *iv, + const void *keys, u8 *out, unsigned int num_bytes, + unsigned int byte_ctr); + /* * asmlinkage void aesni_gcm_init_avx_gen2() * gcm_data *my_ctx_data, context data @@ -527,6 +541,59 @@ static int ctr_crypt(struct skcipher_request *req) return err; } +static void aesni_xctr_enc_avx_tfm(struct crypto_aes_ctx *ctx, u8 *out, + const u8 *in, unsigned int len, u8 *iv, + unsigned int byte_ctr) +{ + if (ctx->key_length == AES_KEYSIZE_128) + aes_xctr_enc_128_avx_by8(in, iv, (void *)ctx, out, len, + byte_ctr); + else if (ctx->key_length == AES_KEYSIZE_192) + aes_xctr_enc_192_avx_by8(in, iv, (void *)ctx, out, len, + byte_ctr); + else + aes_xctr_enc_256_avx_by8(in, iv, (void *)ctx, out, len, + byte_ctr); +} + +static int xctr_crypt(struct skcipher_request *req) +{ + struct crypto_skcipher *tfm = crypto_skcipher_reqtfm(req); + struct crypto_aes_ctx *ctx = aes_ctx(crypto_skcipher_ctx(tfm)); + u8 keystream[AES_BLOCK_SIZE]; + struct skcipher_walk walk; + unsigned int nbytes; + unsigned int byte_ctr = 0; + int err; + __le32 block[AES_BLOCK_SIZE / sizeof(__le32)]; + + err = skcipher_walk_virt(&walk, req, false); + + while ((nbytes = walk.nbytes) > 0) { + kernel_fpu_begin(); + if (nbytes & AES_BLOCK_MASK) + aesni_xctr_enc_avx_tfm(ctx, walk.dst.virt.addr, + walk.src.virt.addr, nbytes & AES_BLOCK_MASK, + walk.iv, byte_ctr); + nbytes &= ~AES_BLOCK_MASK; + byte_ctr += walk.nbytes - nbytes; + + if (walk.nbytes == walk.total && nbytes > 0) { + memcpy(block, walk.iv, AES_BLOCK_SIZE); + block[0] ^= cpu_to_le32(1 + byte_ctr / AES_BLOCK_SIZE); + aesni_enc(ctx, keystream, (u8 *)block); + crypto_xor_cpy(walk.dst.virt.addr + walk.nbytes - + nbytes, walk.src.virt.addr + walk.nbytes + - nbytes, keystream, nbytes); + byte_ctr += nbytes; + nbytes = 0; + } + kernel_fpu_end(); + err = skcipher_walk_done(&walk, nbytes); + } + return err; +} + static int rfc4106_set_hash_subkey(u8 *hash_subkey, const u8 *key, unsigned int key_len) { @@ -1051,6 +1118,33 @@ static struct simd_skcipher_alg *aesni_simd_skciphers[ARRAY_SIZE(aesni_skciphers)]; #ifdef CONFIG_X86_64 +/* + * XCTR does not have a non-AVX implementation, so it must be enabled + * conditionally. + */ +static struct skcipher_alg aesni_xctr = { + .base = { + .cra_name = "__xctr(aes)", + .cra_driver_name = "__xctr-aes-aesni", + .cra_priority = 400, + .cra_flags = CRYPTO_ALG_INTERNAL, + .cra_blocksize = 1, + .cra_ctxsize = CRYPTO_AES_CTX_SIZE, + .cra_module = THIS_MODULE, + }, + .min_keysize = AES_MIN_KEY_SIZE, + .max_keysize = AES_MAX_KEY_SIZE, + .ivsize = AES_BLOCK_SIZE, + .chunksize = AES_BLOCK_SIZE, + .setkey = aesni_skcipher_setkey, + .encrypt = xctr_crypt, + .decrypt = xctr_crypt, +}; + +static struct simd_skcipher_alg *aesni_simd_xctr; +#endif /* CONFIG_X86_64 */ + +#ifdef CONFIG_X86_64 static int generic_gcmaes_set_key(struct crypto_aead *aead, const u8 *key, unsigned int key_len) { @@ -1163,7 +1257,7 @@ static int __init aesni_init(void) static_call_update(aesni_ctr_enc_tfm, aesni_ctr_enc_avx_tfm); pr_info("AES CTR mode by8 optimization enabled\n"); } -#endif +#endif /* CONFIG_X86_64 */ err = crypto_register_alg(&aesni_cipher_alg); if (err) @@ -1180,8 +1274,22 @@ static int __init aesni_init(void) if (err) goto unregister_skciphers; +#ifdef CONFIG_X86_64 + if (boot_cpu_has(X86_FEATURE_AVX)) + err = simd_register_skciphers_compat(&aesni_xctr, 1, + &aesni_simd_xctr); + if (err) + goto unregister_aeads; +#endif /* CONFIG_X86_64 */ + return 0; +#ifdef CONFIG_X86_64 +unregister_aeads: + simd_unregister_aeads(aesni_aeads, ARRAY_SIZE(aesni_aeads), + aesni_simd_aeads); +#endif /* CONFIG_X86_64 */ + unregister_skciphers: simd_unregister_skciphers(aesni_skciphers, ARRAY_SIZE(aesni_skciphers), aesni_simd_skciphers); @@ -1197,6 +1305,10 @@ static void __exit aesni_exit(void) simd_unregister_skciphers(aesni_skciphers, ARRAY_SIZE(aesni_skciphers), aesni_simd_skciphers); crypto_unregister_alg(&aesni_cipher_alg); +#ifdef CONFIG_X86_64 + if (boot_cpu_has(X86_FEATURE_AVX)) + simd_unregister_skciphers(&aesni_xctr, 1, &aesni_simd_xctr); +#endif /* CONFIG_X86_64 */ } late_initcall(aesni_init); diff --git a/arch/x86/crypto/blake2s-glue.c b/arch/x86/crypto/blake2s-glue.c index 69853c13e8fb..aaba21230528 100644 --- a/arch/x86/crypto/blake2s-glue.c +++ b/arch/x86/crypto/blake2s-glue.c @@ -4,7 +4,6 @@ */ #include <crypto/internal/blake2s.h> -#include <crypto/internal/simd.h> #include <linux/types.h> #include <linux/jump_label.h> @@ -33,7 +32,7 @@ void blake2s_compress(struct blake2s_state *state, const u8 *block, /* SIMD disables preemption, so relax after processing each page. */ BUILD_BUG_ON(SZ_4K / BLAKE2S_BLOCK_SIZE < 8); - if (!static_branch_likely(&blake2s_use_ssse3) || !crypto_simd_usable()) { + if (!static_branch_likely(&blake2s_use_ssse3) || !may_use_simd()) { blake2s_compress_generic(state, block, nblocks, inc); return; } diff --git a/arch/x86/crypto/blake2s-shash.c b/arch/x86/crypto/blake2s-shash.c deleted file mode 100644 index 59ae28abe35c..000000000000 --- a/arch/x86/crypto/blake2s-shash.c +++ /dev/null @@ -1,77 +0,0 @@ -// SPDX-License-Identifier: GPL-2.0 OR MIT -/* - * Copyright (C) 2015-2019 Jason A. Donenfeld <Jason@zx2c4.com>. All Rights Reserved. - */ - -#include <crypto/internal/blake2s.h> -#include <crypto/internal/simd.h> -#include <crypto/internal/hash.h> - -#include <linux/types.h> -#include <linux/kernel.h> -#include <linux/module.h> -#include <linux/sizes.h> - -#include <asm/cpufeature.h> -#include <asm/processor.h> - -static int crypto_blake2s_update_x86(struct shash_desc *desc, - const u8 *in, unsigned int inlen) -{ - return crypto_blake2s_update(desc, in, inlen, false); -} - -static int crypto_blake2s_final_x86(struct shash_desc *desc, u8 *out) -{ - return crypto_blake2s_final(desc, out, false); -} - -#define BLAKE2S_ALG(name, driver_name, digest_size) \ - { \ - .base.cra_name = name, \ - .base.cra_driver_name = driver_name, \ - .base.cra_priority = 200, \ - .base.cra_flags = CRYPTO_ALG_OPTIONAL_KEY, \ - .base.cra_blocksize = BLAKE2S_BLOCK_SIZE, \ - .base.cra_ctxsize = sizeof(struct blake2s_tfm_ctx), \ - .base.cra_module = THIS_MODULE, \ - .digestsize = digest_size, \ - .setkey = crypto_blake2s_setkey, \ - .init = crypto_blake2s_init, \ - .update = crypto_blake2s_update_x86, \ - .final = crypto_blake2s_final_x86, \ - .descsize = sizeof(struct blake2s_state), \ - } - -static struct shash_alg blake2s_algs[] = { - BLAKE2S_ALG("blake2s-128", "blake2s-128-x86", BLAKE2S_128_HASH_SIZE), - BLAKE2S_ALG("blake2s-160", "blake2s-160-x86", BLAKE2S_160_HASH_SIZE), - BLAKE2S_ALG("blake2s-224", "blake2s-224-x86", BLAKE2S_224_HASH_SIZE), - BLAKE2S_ALG("blake2s-256", "blake2s-256-x86", BLAKE2S_256_HASH_SIZE), -}; - -static int __init blake2s_mod_init(void) -{ - if (IS_REACHABLE(CONFIG_CRYPTO_HASH) && boot_cpu_has(X86_FEATURE_SSSE3)) - return crypto_register_shashes(blake2s_algs, ARRAY_SIZE(blake2s_algs)); - return 0; -} - -static void __exit blake2s_mod_exit(void) -{ - if (IS_REACHABLE(CONFIG_CRYPTO_HASH) && boot_cpu_has(X86_FEATURE_SSSE3)) - crypto_unregister_shashes(blake2s_algs, ARRAY_SIZE(blake2s_algs)); -} - -module_init(blake2s_mod_init); -module_exit(blake2s_mod_exit); - -MODULE_ALIAS_CRYPTO("blake2s-128"); -MODULE_ALIAS_CRYPTO("blake2s-128-x86"); -MODULE_ALIAS_CRYPTO("blake2s-160"); -MODULE_ALIAS_CRYPTO("blake2s-160-x86"); -MODULE_ALIAS_CRYPTO("blake2s-224"); -MODULE_ALIAS_CRYPTO("blake2s-224-x86"); -MODULE_ALIAS_CRYPTO("blake2s-256"); -MODULE_ALIAS_CRYPTO("blake2s-256-x86"); -MODULE_LICENSE("GPL v2"); diff --git a/arch/x86/crypto/blowfish_glue.c b/arch/x86/crypto/blowfish_glue.c index ba06322c1e39..019c64c1340a 100644 --- a/arch/x86/crypto/blowfish_glue.c +++ b/arch/x86/crypto/blowfish_glue.c @@ -144,7 +144,7 @@ static int cbc_encrypt(struct skcipher_request *req) err = skcipher_walk_virt(&walk, req, false); - while ((nbytes = walk.nbytes)) { + while (walk.nbytes) { nbytes = __cbc_encrypt(ctx, &walk); err = skcipher_walk_done(&walk, nbytes); } @@ -225,7 +225,7 @@ static int cbc_decrypt(struct skcipher_request *req) err = skcipher_walk_virt(&walk, req, false); - while ((nbytes = walk.nbytes)) { + while (walk.nbytes) { nbytes = __cbc_decrypt(ctx, &walk); err = skcipher_walk_done(&walk, nbytes); } diff --git a/arch/x86/crypto/crc32-pclmul_asm.S b/arch/x86/crypto/crc32-pclmul_asm.S index c392a6edbfff..ca53e96996ac 100644 --- a/arch/x86/crypto/crc32-pclmul_asm.S +++ b/arch/x86/crypto/crc32-pclmul_asm.S @@ -1,26 +1,4 @@ -/* GPL HEADER START - * - * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER. - * - * This program is free software; you can redistribute it and/or modify - * it under the terms of the GNU General Public License version 2 only, - * as published by the Free Software Foundation. - * - * This program is distributed in the hope that it will be useful, but - * WITHOUT ANY WARRANTY; without even the implied warranty of - * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU - * General Public License version 2 for more details (a copy is included - * in the LICENSE file that accompanied this code). - * - * You should have received a copy of the GNU General Public License - * version 2 along with this program; If not, see http://www.gnu.org/licenses - * - * Please visit http://www.xyratex.com/contact if you need additional - * information or have any questions. - * - * GPL HEADER END - */ - +/* SPDX-License-Identifier: GPL-2.0-only */ /* * Copyright 2012 Xyratex Technology Limited * diff --git a/arch/x86/crypto/polyval-clmulni_asm.S b/arch/x86/crypto/polyval-clmulni_asm.S new file mode 100644 index 000000000000..a6ebe4e7dd2b --- /dev/null +++ b/arch/x86/crypto/polyval-clmulni_asm.S @@ -0,0 +1,321 @@ +/* SPDX-License-Identifier: GPL-2.0 */ +/* + * Copyright 2021 Google LLC + */ +/* + * This is an efficient implementation of POLYVAL using intel PCLMULQDQ-NI + * instructions. It works on 8 blocks at a time, by precomputing the first 8 + * keys powers h^8, ..., h^1 in the POLYVAL finite field. This precomputation + * allows us to split finite field multiplication into two steps. + * + * In the first step, we consider h^i, m_i as normal polynomials of degree less + * than 128. We then compute p(x) = h^8m_0 + ... + h^1m_7 where multiplication + * is simply polynomial multiplication. + * + * In the second step, we compute the reduction of p(x) modulo the finite field + * modulus g(x) = x^128 + x^127 + x^126 + x^121 + 1. + * + * This two step process is equivalent to computing h^8m_0 + ... + h^1m_7 where + * multiplication is finite field multiplication. The advantage is that the + * two-step process only requires 1 finite field reduction for every 8 + * polynomial multiplications. Further parallelism is gained by interleaving the + * multiplications and polynomial reductions. + */ + +#include <linux/linkage.h> +#include <asm/frame.h> + +#define STRIDE_BLOCKS 8 + +#define GSTAR %xmm7 +#define PL %xmm8 +#define PH %xmm9 +#define TMP_XMM %xmm11 +#define LO %xmm12 +#define HI %xmm13 +#define MI %xmm14 +#define SUM %xmm15 + +#define KEY_POWERS %rdi +#define MSG %rsi +#define BLOCKS_LEFT %rdx +#define ACCUMULATOR %rcx +#define TMP %rax + +.section .rodata.cst16.gstar, "aM", @progbits, 16 +.align 16 + +.Lgstar: + .quad 0xc200000000000000, 0xc200000000000000 + +.text + +/* + * Performs schoolbook1_iteration on two lists of 128-bit polynomials of length + * count pointed to by MSG and KEY_POWERS. + */ +.macro schoolbook1 count + .set i, 0 + .rept (\count) + schoolbook1_iteration i 0 + .set i, (i +1) + .endr +.endm + +/* + * Computes the product of two 128-bit polynomials at the memory locations + * specified by (MSG + 16*i) and (KEY_POWERS + 16*i) and XORs the components of + * the 256-bit product into LO, MI, HI. + * + * Given: + * X = [X_1 : X_0] + * Y = [Y_1 : Y_0] + * + * We compute: + * LO += X_0 * Y_0 + * MI += X_0 * Y_1 + X_1 * Y_0 + * HI += X_1 * Y_1 + * + * Later, the 256-bit result can be extracted as: + * [HI_1 : HI_0 + MI_1 : LO_1 + MI_0 : LO_0] + * This step is done when computing the polynomial reduction for efficiency + * reasons. + * + * If xor_sum == 1, then also XOR the value of SUM into m_0. This avoids an + * extra multiplication of SUM and h^8. + */ +.macro schoolbook1_iteration i xor_sum + movups (16*\i)(MSG), %xmm0 + .if (\i == 0 && \xor_sum == 1) + pxor SUM, %xmm0 + .endif + vpclmulqdq $0x01, (16*\i)(KEY_POWERS), %xmm0, %xmm2 + vpclmulqdq $0x00, (16*\i)(KEY_POWERS), %xmm0, %xmm1 + vpclmulqdq $0x10, (16*\i)(KEY_POWERS), %xmm0, %xmm3 + vpclmulqdq $0x11, (16*\i)(KEY_POWERS), %xmm0, %xmm4 + vpxor %xmm2, MI, MI + vpxor %xmm1, LO, LO + vpxor %xmm4, HI, HI + vpxor %xmm3, MI, MI +.endm + +/* + * Performs the same computation as schoolbook1_iteration, except we expect the + * arguments to already be loaded into xmm0 and xmm1 and we set the result + * registers LO, MI, and HI directly rather than XOR'ing into them. + */ +.macro schoolbook1_noload + vpclmulqdq $0x01, %xmm0, %xmm1, MI + vpclmulqdq $0x10, %xmm0, %xmm1, %xmm2 + vpclmulqdq $0x00, %xmm0, %xmm1, LO + vpclmulqdq $0x11, %xmm0, %xmm1, HI + vpxor %xmm2, MI, MI +.endm + +/* + * Computes the 256-bit polynomial represented by LO, HI, MI. Stores + * the result in PL, PH. + * [PH : PL] = [HI_1 : HI_0 + MI_1 : LO_1 + MI_0 : LO_0] + */ +.macro schoolbook2 + vpslldq $8, MI, PL + vpsrldq $8, MI, PH + pxor LO, PL + pxor HI, PH +.endm + +/* + * Computes the 128-bit reduction of PH : PL. Stores the result in dest. + * + * This macro computes p(x) mod g(x) where p(x) is in montgomery form and g(x) = + * x^128 + x^127 + x^126 + x^121 + 1. + * + * We have a 256-bit polynomial PH : PL = P_3 : P_2 : P_1 : P_0 that is the + * product of two 128-bit polynomials in Montgomery form. We need to reduce it + * mod g(x). Also, since polynomials in Montgomery form have an "extra" factor + * of x^128, this product has two extra factors of x^128. To get it back into + * Montgomery form, we need to remove one of these factors by dividing by x^128. + * + * To accomplish both of these goals, we add multiples of g(x) that cancel out + * the low 128 bits P_1 : P_0, leaving just the high 128 bits. Since the low + * bits are zero, the polynomial division by x^128 can be done by right shifting. + * + * Since the only nonzero term in the low 64 bits of g(x) is the constant term, + * the multiple of g(x) needed to cancel out P_0 is P_0 * g(x). The CPU can + * only do 64x64 bit multiplications, so split P_0 * g(x) into x^128 * P_0 + + * x^64 * g*(x) * P_0 + P_0, where g*(x) is bits 64-127 of g(x). Adding this to + * the original polynomial gives P_3 : P_2 + P_0 + T_1 : P_1 + T_0 : 0, where T + * = T_1 : T_0 = g*(x) * P_0. Thus, bits 0-63 got "folded" into bits 64-191. + * + * Repeating this same process on the next 64 bits "folds" bits 64-127 into bits + * 128-255, giving the answer in bits 128-255. This time, we need to cancel P_1 + * + T_0 in bits 64-127. The multiple of g(x) required is (P_1 + T_0) * g(x) * + * x^64. Adding this to our previous computation gives P_3 + P_1 + T_0 + V_1 : + * P_2 + P_0 + T_1 + V_0 : 0 : 0, where V = V_1 : V_0 = g*(x) * (P_1 + T_0). + * + * So our final computation is: + * T = T_1 : T_0 = g*(x) * P_0 + * V = V_1 : V_0 = g*(x) * (P_1 + T_0) + * p(x) / x^{128} mod g(x) = P_3 + P_1 + T_0 + V_1 : P_2 + P_0 + T_1 + V_0 + * + * The implementation below saves a XOR instruction by computing P_1 + T_0 : P_0 + * + T_1 and XORing into dest, rather than separately XORing P_1 : P_0 and T_0 : + * T_1 into dest. This allows us to reuse P_1 + T_0 when computing V. + */ +.macro montgomery_reduction dest + vpclmulqdq $0x00, PL, GSTAR, TMP_XMM # TMP_XMM = T_1 : T_0 = P_0 * g*(x) + pshufd $0b01001110, TMP_XMM, TMP_XMM # TMP_XMM = T_0 : T_1 + pxor PL, TMP_XMM # TMP_XMM = P_1 + T_0 : P_0 + T_1 + pxor TMP_XMM, PH # PH = P_3 + P_1 + T_0 : P_2 + P_0 + T_1 + pclmulqdq $0x11, GSTAR, TMP_XMM # TMP_XMM = V_1 : V_0 = V = [(P_1 + T_0) * g*(x)] + vpxor TMP_XMM, PH, \dest +.endm + +/* + * Compute schoolbook multiplication for 8 blocks + * m_0h^8 + ... + m_7h^1 + * + * If reduce is set, also computes the montgomery reduction of the + * previous full_stride call and XORs with the first message block. + * (m_0 + REDUCE(PL, PH))h^8 + ... + m_7h^1. + * I.e., the first multiplication uses m_0 + REDUCE(PL, PH) instead of m_0. + */ +.macro full_stride reduce + pxor LO, LO + pxor HI, HI + pxor MI, MI + + schoolbook1_iteration 7 0 + .if \reduce + vpclmulqdq $0x00, PL, GSTAR, TMP_XMM + .endif + + schoolbook1_iteration 6 0 + .if \reduce + pshufd $0b01001110, TMP_XMM, TMP_XMM + .endif + + schoolbook1_iteration 5 0 + .if \reduce + pxor PL, TMP_XMM + .endif + + schoolbook1_iteration 4 0 + .if \reduce + pxor TMP_XMM, PH + .endif + + schoolbook1_iteration 3 0 + .if \reduce + pclmulqdq $0x11, GSTAR, TMP_XMM + .endif + + schoolbook1_iteration 2 0 + .if \reduce + vpxor TMP_XMM, PH, SUM + .endif + + schoolbook1_iteration 1 0 + + schoolbook1_iteration 0 1 + + addq $(8*16), MSG + schoolbook2 +.endm + +/* + * Process BLOCKS_LEFT blocks, where 0 < BLOCKS_LEFT < STRIDE_BLOCKS + */ +.macro partial_stride + mov BLOCKS_LEFT, TMP + shlq $4, TMP + addq $(16*STRIDE_BLOCKS), KEY_POWERS + subq TMP, KEY_POWERS + + movups (MSG), %xmm0 + pxor SUM, %xmm0 + movaps (KEY_POWERS), %xmm1 + schoolbook1_noload + dec BLOCKS_LEFT + addq $16, MSG + addq $16, KEY_POWERS + + test $4, BLOCKS_LEFT + jz .Lpartial4BlocksDone + schoolbook1 4 + addq $(4*16), MSG + addq $(4*16), KEY_POWERS +.Lpartial4BlocksDone: + test $2, BLOCKS_LEFT + jz .Lpartial2BlocksDone + schoolbook1 2 + addq $(2*16), MSG + addq $(2*16), KEY_POWERS +.Lpartial2BlocksDone: + test $1, BLOCKS_LEFT + jz .LpartialDone + schoolbook1 1 +.LpartialDone: + schoolbook2 + montgomery_reduction SUM +.endm + +/* + * Perform montgomery multiplication in GF(2^128) and store result in op1. + * + * Computes op1*op2*x^{-128} mod x^128 + x^127 + x^126 + x^121 + 1 + * If op1, op2 are in montgomery form, this computes the montgomery + * form of op1*op2. + * + * void clmul_polyval_mul(u8 *op1, const u8 *op2); + */ +SYM_FUNC_START(clmul_polyval_mul) + FRAME_BEGIN + vmovdqa .Lgstar(%rip), GSTAR + movups (%rdi), %xmm0 + movups (%rsi), %xmm1 + schoolbook1_noload + schoolbook2 + montgomery_reduction SUM + movups SUM, (%rdi) + FRAME_END + RET +SYM_FUNC_END(clmul_polyval_mul) + +/* + * Perform polynomial evaluation as specified by POLYVAL. This computes: + * h^n * accumulator + h^n * m_0 + ... + h^1 * m_{n-1} + * where n=nblocks, h is the hash key, and m_i are the message blocks. + * + * rdi - pointer to precomputed key powers h^8 ... h^1 + * rsi - pointer to message blocks + * rdx - number of blocks to hash + * rcx - pointer to the accumulator + * + * void clmul_polyval_update(const struct polyval_tfm_ctx *keys, + * const u8 *in, size_t nblocks, u8 *accumulator); + */ +SYM_FUNC_START(clmul_polyval_update) + FRAME_BEGIN + vmovdqa .Lgstar(%rip), GSTAR + movups (ACCUMULATOR), SUM + subq $STRIDE_BLOCKS, BLOCKS_LEFT + js .LstrideLoopExit + full_stride 0 + subq $STRIDE_BLOCKS, BLOCKS_LEFT + js .LstrideLoopExitReduce +.LstrideLoop: + full_stride 1 + subq $STRIDE_BLOCKS, BLOCKS_LEFT + jns .LstrideLoop +.LstrideLoopExitReduce: + montgomery_reduction SUM +.LstrideLoopExit: + add $STRIDE_BLOCKS, BLOCKS_LEFT + jz .LskipPartial + partial_stride +.LskipPartial: + movups SUM, (ACCUMULATOR) + FRAME_END + RET +SYM_FUNC_END(clmul_polyval_update) diff --git a/arch/x86/crypto/polyval-clmulni_glue.c b/arch/x86/crypto/polyval-clmulni_glue.c new file mode 100644 index 000000000000..b7664d018851 --- /dev/null +++ b/arch/x86/crypto/polyval-clmulni_glue.c @@ -0,0 +1,203 @@ +// SPDX-License-Identifier: GPL-2.0-only +/* + * Glue code for POLYVAL using PCMULQDQ-NI + * + * Copyright (c) 2007 Nokia Siemens Networks - Mikko Herranen <mh1@iki.fi> + * Copyright (c) 2009 Intel Corp. + * Author: Huang Ying <ying.huang@intel.com> + * Copyright 2021 Google LLC + */ + +/* + * Glue code based on ghash-clmulni-intel_glue.c. + * + * This implementation of POLYVAL uses montgomery multiplication + * accelerated by PCLMULQDQ-NI to implement the finite field + * operations. + */ + +#include <crypto/algapi.h> +#include <crypto/internal/hash.h> +#include <crypto/internal/simd.h> +#include <crypto/polyval.h> +#include <linux/crypto.h> +#include <linux/init.h> +#include <linux/kernel.h> +#include <linux/module.h> +#include <asm/cpu_device_id.h> +#include <asm/simd.h> + +#define NUM_KEY_POWERS 8 + +struct polyval_tfm_ctx { + /* + * These powers must be in the order h^8, ..., h^1. + */ + u8 key_powers[NUM_KEY_POWERS][POLYVAL_BLOCK_SIZE]; +}; + +struct polyval_desc_ctx { + u8 buffer[POLYVAL_BLOCK_SIZE]; + u32 bytes; +}; + +asmlinkage void clmul_polyval_update(const struct polyval_tfm_ctx *keys, + const u8 *in, size_t nblocks, u8 *accumulator); +asmlinkage void clmul_polyval_mul(u8 *op1, const u8 *op2); + +static void internal_polyval_update(const struct polyval_tfm_ctx *keys, + const u8 *in, size_t nblocks, u8 *accumulator) +{ + if (likely(crypto_simd_usable())) { + kernel_fpu_begin(); + clmul_polyval_update(keys, in, nblocks, accumulator); + kernel_fpu_end(); + } else { + polyval_update_non4k(keys->key_powers[NUM_KEY_POWERS-1], in, + nblocks, accumulator); + } +} + +static void internal_polyval_mul(u8 *op1, const u8 *op2) +{ + if (likely(crypto_simd_usable())) { + kernel_fpu_begin(); + clmul_polyval_mul(op1, op2); + kernel_fpu_end(); + } else { + polyval_mul_non4k(op1, op2); + } +} + +static int polyval_x86_setkey(struct crypto_shash *tfm, + const u8 *key, unsigned int keylen) +{ + struct polyval_tfm_ctx *tctx = crypto_shash_ctx(tfm); + int i; + + if (keylen != POLYVAL_BLOCK_SIZE) + return -EINVAL; + + memcpy(tctx->key_powers[NUM_KEY_POWERS-1], key, POLYVAL_BLOCK_SIZE); + + for (i = NUM_KEY_POWERS-2; i >= 0; i--) { + memcpy(tctx->key_powers[i], key, POLYVAL_BLOCK_SIZE); + internal_polyval_mul(tctx->key_powers[i], + tctx->key_powers[i+1]); + } + + return 0; +} + +static int polyval_x86_init(struct shash_desc *desc) +{ + struct polyval_desc_ctx *dctx = shash_desc_ctx(desc); + + memset(dctx, 0, sizeof(*dctx)); + + return 0; +} + +static int polyval_x86_update(struct shash_desc *desc, + const u8 *src, unsigned int srclen) +{ + struct polyval_desc_ctx *dctx = shash_desc_ctx(desc); + const struct polyval_tfm_ctx *tctx = crypto_shash_ctx(desc->tfm); + u8 *pos; + unsigned int nblocks; + unsigned int n; + + if (dctx->bytes) { + n = min(srclen, dctx->bytes); + pos = dctx->buffer + POLYVAL_BLOCK_SIZE - dctx->bytes; + + dctx->bytes -= n; + srclen -= n; + + while (n--) + *pos++ ^= *src++; + + if (!dctx->bytes) + internal_polyval_mul(dctx->buffer, + tctx->key_powers[NUM_KEY_POWERS-1]); + } + + while (srclen >= POLYVAL_BLOCK_SIZE) { + /* Allow rescheduling every 4K bytes. */ + nblocks = min(srclen, 4096U) / POLYVAL_BLOCK_SIZE; + internal_polyval_update(tctx, src, nblocks, dctx->buffer); + srclen -= nblocks * POLYVAL_BLOCK_SIZE; + src += nblocks * POLYVAL_BLOCK_SIZE; + } + + if (srclen) { + dctx->bytes = POLYVAL_BLOCK_SIZE - srclen; + pos = dctx->buffer; + while (srclen--) + *pos++ ^= *src++; + } + + return 0; +} + +static int polyval_x86_final(struct shash_desc *desc, u8 *dst) +{ + struct polyval_desc_ctx *dctx = shash_desc_ctx(desc); + const struct polyval_tfm_ctx *tctx = crypto_shash_ctx(desc->tfm); + + if (dctx->bytes) { + internal_polyval_mul(dctx->buffer, + tctx->key_powers[NUM_KEY_POWERS-1]); + } + + memcpy(dst, dctx->buffer, POLYVAL_BLOCK_SIZE); + + return 0; +} + +static struct shash_alg polyval_alg = { + .digestsize = POLYVAL_DIGEST_SIZE, + .init = polyval_x86_init, + .update = polyval_x86_update, + .final = polyval_x86_final, + .setkey = polyval_x86_setkey, + .descsize = sizeof(struct polyval_desc_ctx), + .base = { + .cra_name = "polyval", + .cra_driver_name = "polyval-clmulni", + .cra_priority = 200, + .cra_blocksize = POLYVAL_BLOCK_SIZE, + .cra_ctxsize = sizeof(struct polyval_tfm_ctx), + .cra_module = THIS_MODULE, + }, +}; + +__maybe_unused static const struct x86_cpu_id pcmul_cpu_id[] = { + X86_MATCH_FEATURE(X86_FEATURE_PCLMULQDQ, NULL), + {} +}; +MODULE_DEVICE_TABLE(x86cpu, pcmul_cpu_id); + +static int __init polyval_clmulni_mod_init(void) +{ + if (!x86_match_cpu(pcmul_cpu_id)) + return -ENODEV; + + if (!boot_cpu_has(X86_FEATURE_AVX)) + return -ENODEV; + + return crypto_register_shash(&polyval_alg); +} + +static void __exit polyval_clmulni_mod_exit(void) +{ + crypto_unregister_shash(&polyval_alg); +} + +module_init(polyval_clmulni_mod_init); +module_exit(polyval_clmulni_mod_exit); + +MODULE_LICENSE("GPL"); +MODULE_DESCRIPTION("POLYVAL hash function accelerated by PCLMULQDQ-NI"); +MODULE_ALIAS_CRYPTO("polyval"); +MODULE_ALIAS_CRYPTO("polyval-clmulni"); diff --git a/arch/x86/entry/Makefile b/arch/x86/entry/Makefile index 7fec5dcf6438..ca2fe186994b 100644 --- a/arch/x86/entry/Makefile +++ b/arch/x86/entry/Makefile @@ -11,12 +11,13 @@ CFLAGS_REMOVE_common.o = $(CC_FLAGS_FTRACE) CFLAGS_common.o += -fno-stack-protector -obj-y := entry_$(BITS).o thunk_$(BITS).o syscall_$(BITS).o +obj-y := entry.o entry_$(BITS).o syscall_$(BITS).o obj-y += common.o obj-y += vdso/ obj-y += vsyscall/ +obj-$(CONFIG_PREEMPTION) += thunk_$(BITS).o obj-$(CONFIG_IA32_EMULATION) += entry_64_compat.o syscall_32.o obj-$(CONFIG_X86_X32_ABI) += syscall_x32.o diff --git a/arch/x86/entry/calling.h b/arch/x86/entry/calling.h index 29b36e9e4e74..f6907627172b 100644 --- a/arch/x86/entry/calling.h +++ b/arch/x86/entry/calling.h @@ -7,6 +7,8 @@ #include <asm/asm-offsets.h> #include <asm/processor-flags.h> #include <asm/ptrace-abi.h> +#include <asm/msr.h> +#include <asm/nospec-branch.h> /* @@ -283,6 +285,66 @@ For 32-bit we have the following conventions - kernel is built with #endif /* + * IBRS kernel mitigation for Spectre_v2. + * + * Assumes full context is established (PUSH_REGS, CR3 and GS) and it clobbers + * the regs it uses (AX, CX, DX). Must be called before the first RET + * instruction (NOTE! UNTRAIN_RET includes a RET instruction) + * + * The optional argument is used to save/restore the current value, + * which is used on the paranoid paths. + * + * Assumes x86_spec_ctrl_{base,current} to have SPEC_CTRL_IBRS set. + */ +.macro IBRS_ENTER save_reg +#ifdef CONFIG_CPU_IBRS_ENTRY + ALTERNATIVE "jmp .Lend_\@", "", X86_FEATURE_KERNEL_IBRS + movl $MSR_IA32_SPEC_CTRL, %ecx + +.ifnb \save_reg + rdmsr + shl $32, %rdx + or %rdx, %rax + mov %rax, \save_reg + test $SPEC_CTRL_IBRS, %eax + jz .Ldo_wrmsr_\@ + lfence + jmp .Lend_\@ +.Ldo_wrmsr_\@: +.endif + + movq PER_CPU_VAR(x86_spec_ctrl_current), %rdx + movl %edx, %eax + shr $32, %rdx + wrmsr +.Lend_\@: +#endif +.endm + +/* + * Similar to IBRS_ENTER, requires KERNEL GS,CR3 and clobbers (AX, CX, DX) + * regs. Must be called after the last RET. + */ +.macro IBRS_EXIT save_reg +#ifdef CONFIG_CPU_IBRS_ENTRY + ALTERNATIVE "jmp .Lend_\@", "", X86_FEATURE_KERNEL_IBRS + movl $MSR_IA32_SPEC_CTRL, %ecx + +.ifnb \save_reg + mov \save_reg, %rdx +.else + movq PER_CPU_VAR(x86_spec_ctrl_current), %rdx + andl $(~SPEC_CTRL_IBRS), %edx +.endif + + movl %edx, %eax + shr $32, %rdx + wrmsr +.Lend_\@: +#endif +.endm + +/* * Mitigate Spectre v1 for conditional swapgs code paths. * * FENCE_SWAPGS_USER_ENTRY is used in the user entry swapgs code path, to diff --git a/arch/x86/entry/entry.S b/arch/x86/entry/entry.S new file mode 100644 index 000000000000..bfb7bcb362bc --- /dev/null +++ b/arch/x86/entry/entry.S @@ -0,0 +1,22 @@ +/* SPDX-License-Identifier: GPL-2.0 */ +/* + * Common place for both 32- and 64-bit entry routines. + */ + +#include <linux/linkage.h> +#include <asm/export.h> +#include <asm/msr-index.h> + +.pushsection .noinstr.text, "ax" + +SYM_FUNC_START(entry_ibpb) + movl $MSR_IA32_PRED_CMD, %ecx + movl $PRED_CMD_IBPB, %eax + xorl %edx, %edx + wrmsr + RET +SYM_FUNC_END(entry_ibpb) +/* For KVM */ +EXPORT_SYMBOL_GPL(entry_ibpb); + +.popsection diff --git a/arch/x86/entry/entry_32.S b/arch/x86/entry/entry_32.S index 887420844066..e309e7156038 100644 --- a/arch/x86/entry/entry_32.S +++ b/arch/x86/entry/entry_32.S @@ -698,7 +698,6 @@ SYM_CODE_START(__switch_to_asm) movl %ebx, PER_CPU_VAR(__stack_chk_guard) #endif -#ifdef CONFIG_RETPOLINE /* * When switching from a shallower to a deeper call stack * the RSB may either underflow or use entries populated @@ -707,7 +706,6 @@ SYM_CODE_START(__switch_to_asm) * speculative execution to prevent attack. */ FILL_RETURN_BUFFER %ebx, RSB_CLEAR_LOOPS, X86_FEATURE_RSB_CTXSW -#endif /* Restore flags or the incoming task to restore AC state. */ popfl diff --git a/arch/x86/entry/entry_64.S b/arch/x86/entry/entry_64.S index 4300ba49b5ee..9953d966d124 100644 --- a/arch/x86/entry/entry_64.S +++ b/arch/x86/entry/entry_64.S @@ -85,7 +85,7 @@ */ SYM_CODE_START(entry_SYSCALL_64) - UNWIND_HINT_EMPTY + UNWIND_HINT_ENTRY ENDBR swapgs @@ -112,6 +112,11 @@ SYM_INNER_LABEL(entry_SYSCALL_64_after_hwframe, SYM_L_GLOBAL) movq %rsp, %rdi /* Sign extend the lower 32bit as syscall numbers are treated as int */ movslq %eax, %rsi + + /* clobbers %rax, make sure it is after saving the syscall nr */ + IBRS_ENTER + UNTRAIN_RET + call do_syscall_64 /* returns with IRQs disabled */ /* @@ -191,6 +196,7 @@ SYM_INNER_LABEL(entry_SYSCALL_64_after_hwframe, SYM_L_GLOBAL) * perf profiles. Nothing jumps here. */ syscall_return_via_sysret: + IBRS_EXIT POP_REGS pop_rdi=0 /* @@ -249,7 +255,6 @@ SYM_FUNC_START(__switch_to_asm) movq %rbx, PER_CPU_VAR(fixed_percpu_data) + stack_canary_offset #endif -#ifdef CONFIG_RETPOLINE /* * When switching from a shallower to a deeper call stack * the RSB may either underflow or use entries populated @@ -258,7 +263,6 @@ SYM_FUNC_START(__switch_to_asm) * speculative execution to prevent attack. */ FILL_RETURN_BUFFER %r12, RSB_CLEAR_LOOPS, X86_FEATURE_RSB_CTXSW -#endif /* restore callee-saved registers */ popq %r15 @@ -322,13 +326,13 @@ SYM_CODE_END(ret_from_fork) #endif .endm -/* Save all registers in pt_regs */ -SYM_CODE_START_LOCAL(push_and_clear_regs) +SYM_CODE_START_LOCAL(xen_error_entry) UNWIND_HINT_FUNC PUSH_AND_CLEAR_REGS save_ret=1 ENCODE_FRAME_POINTER 8 + UNTRAIN_RET RET -SYM_CODE_END(push_and_clear_regs) +SYM_CODE_END(xen_error_entry) /** * idtentry_body - Macro to emit code calling the C function @@ -337,9 +341,6 @@ SYM_CODE_END(push_and_clear_regs) */ .macro idtentry_body cfunc has_error_code:req - call push_and_clear_regs - UNWIND_HINT_REGS - /* * Call error_entry() and switch to the task stack if from userspace. * @@ -349,7 +350,7 @@ SYM_CODE_END(push_and_clear_regs) * switch the CR3. So it can skip invoking error_entry(). */ ALTERNATIVE "call error_entry; movq %rax, %rsp", \ - "", X86_FEATURE_XENPV + "call xen_error_entry", X86_FEATURE_XENPV ENCODE_FRAME_POINTER UNWIND_HINT_REGS @@ -612,6 +613,7 @@ __irqentry_text_end: SYM_CODE_START_LOCAL(common_interrupt_return) SYM_INNER_LABEL(swapgs_restore_regs_and_return_to_usermode, SYM_L_GLOBAL) + IBRS_EXIT #ifdef CONFIG_DEBUG_ENTRY /* Assert that pt_regs indicates user mode. */ testb $3, CS(%rsp) @@ -897,6 +899,9 @@ SYM_CODE_END(xen_failsafe_callback) * 1 -> no SWAPGS on exit * * Y GSBASE value at entry, must be restored in paranoid_exit + * + * R14 - old CR3 + * R15 - old SPEC_CTRL */ SYM_CODE_START_LOCAL(paranoid_entry) UNWIND_HINT_FUNC @@ -940,7 +945,7 @@ SYM_CODE_START_LOCAL(paranoid_entry) * is needed here. */ SAVE_AND_SET_GSBASE scratch_reg=%rax save_reg=%rbx - RET + jmp .Lparanoid_gsbase_done .Lparanoid_entry_checkgs: /* EBX = 1 -> kernel GSBASE active, no restore required */ @@ -959,8 +964,16 @@ SYM_CODE_START_LOCAL(paranoid_entry) xorl %ebx, %ebx swapgs .Lparanoid_kernel_gsbase: - FENCE_SWAPGS_KERNEL_ENTRY +.Lparanoid_gsbase_done: + + /* + * Once we have CR3 and %GS setup save and set SPEC_CTRL. Just like + * CR3 above, keep the old value in a callee saved register. + */ + IBRS_ENTER save_reg=%r15 + UNTRAIN_RET + RET SYM_CODE_END(paranoid_entry) @@ -982,9 +995,19 @@ SYM_CODE_END(paranoid_entry) * 1 -> no SWAPGS on exit * * Y User space GSBASE, must be restored unconditionally + * + * R14 - old CR3 + * R15 - old SPEC_CTRL */ SYM_CODE_START_LOCAL(paranoid_exit) UNWIND_HINT_REGS + + /* + * Must restore IBRS state before both CR3 and %GS since we need access + * to the per-CPU x86_spec_ctrl_shadow variable. + */ + IBRS_EXIT save_reg=%r15 + /* * The order of operations is important. RESTORE_CR3 requires * kernel GSBASE. @@ -1017,6 +1040,10 @@ SYM_CODE_END(paranoid_exit) */ SYM_CODE_START_LOCAL(error_entry) UNWIND_HINT_FUNC + + PUSH_AND_CLEAR_REGS save_ret=1 + ENCODE_FRAME_POINTER 8 + testb $3, CS+8(%rsp) jz .Lerror_kernelspace @@ -1028,9 +1055,12 @@ SYM_CODE_START_LOCAL(error_entry) FENCE_SWAPGS_USER_ENTRY /* We have user CR3. Change to kernel CR3. */ SWITCH_TO_KERNEL_CR3 scratch_reg=%rax + IBRS_ENTER + UNTRAIN_RET leaq 8(%rsp), %rdi /* arg0 = pt_regs pointer */ .Lerror_entry_from_usermode_after_swapgs: + /* Put us onto the real thread stack. */ call sync_regs RET @@ -1065,6 +1095,7 @@ SYM_CODE_START_LOCAL(error_entry) .Lerror_entry_done_lfence: FENCE_SWAPGS_KERNEL_ENTRY leaq 8(%rsp), %rax /* return pt_regs pointer */ + ANNOTATE_UNRET_END RET .Lbstep_iret: @@ -1080,6 +1111,8 @@ SYM_CODE_START_LOCAL(error_entry) swapgs FENCE_SWAPGS_USER_ENTRY SWITCH_TO_KERNEL_CR3 scratch_reg=%rax + IBRS_ENTER + UNTRAIN_RET /* * Pretend that the exception came from user mode: set up pt_regs @@ -1185,6 +1218,9 @@ SYM_CODE_START(asm_exc_nmi) PUSH_AND_CLEAR_REGS rdx=(%rdx) ENCODE_FRAME_POINTER + IBRS_ENTER + UNTRAIN_RET + /* * At this point we no longer need to worry about stack damage * due to nesting -- we're on the normal thread stack and we're @@ -1409,6 +1445,9 @@ end_repeat_nmi: movq $-1, %rsi call exc_nmi + /* Always restore stashed SPEC_CTRL value (see paranoid_entry) */ + IBRS_EXIT save_reg=%r15 + /* Always restore stashed CR3 value (see paranoid_entry) */ RESTORE_CR3 scratch_reg=%r15 save_reg=%r14 diff --git a/arch/x86/entry/entry_64_compat.S b/arch/x86/entry/entry_64_compat.S index d1052742ad0c..682338e7e2a3 100644 --- a/arch/x86/entry/entry_64_compat.S +++ b/arch/x86/entry/entry_64_compat.S @@ -4,7 +4,6 @@ * * Copyright 2000-2002 Andi Kleen, SuSE Labs. */ -#include "calling.h" #include <asm/asm-offsets.h> #include <asm/current.h> #include <asm/errno.h> @@ -14,9 +13,12 @@ #include <asm/irqflags.h> #include <asm/asm.h> #include <asm/smap.h> +#include <asm/nospec-branch.h> #include <linux/linkage.h> #include <linux/err.h> +#include "calling.h" + .section .entry.text, "ax" /* @@ -47,7 +49,7 @@ * 0(%ebp) arg6 */ SYM_CODE_START(entry_SYSENTER_compat) - UNWIND_HINT_EMPTY + UNWIND_HINT_ENTRY ENDBR /* Interrupts are off on entry. */ swapgs @@ -88,6 +90,9 @@ SYM_INNER_LABEL(entry_SYSENTER_compat_after_hwframe, SYM_L_GLOBAL) cld + IBRS_ENTER + UNTRAIN_RET + /* * SYSENTER doesn't filter flags, so we need to clear NT and AC * ourselves. To save a few cycles, we can check whether @@ -174,7 +179,7 @@ SYM_CODE_END(entry_SYSENTER_compat) * 0(%esp) arg6 */ SYM_CODE_START(entry_SYSCALL_compat) - UNWIND_HINT_EMPTY + UNWIND_HINT_ENTRY ENDBR /* Interrupts are off on entry. */ swapgs @@ -203,6 +208,9 @@ SYM_INNER_LABEL(entry_SYSCALL_compat_after_hwframe, SYM_L_GLOBAL) PUSH_AND_CLEAR_REGS rcx=%rbp rax=$-ENOSYS UNWIND_HINT_REGS + IBRS_ENTER + UNTRAIN_RET + movq %rsp, %rdi call do_fast_syscall_32 /* XEN PV guests always use IRET path */ @@ -217,6 +225,8 @@ sysret32_from_system_call: */ STACKLEAK_ERASE + IBRS_EXIT + movq RBX(%rsp), %rbx /* pt_regs->rbx */ movq RBP(%rsp), %rbp /* pt_regs->rbp */ movq EFLAGS(%rsp), %r11 /* pt_regs->flags (in r11) */ @@ -295,7 +305,7 @@ SYM_CODE_END(entry_SYSCALL_compat) * ebp arg6 */ SYM_CODE_START(entry_INT80_compat) - UNWIND_HINT_EMPTY + UNWIND_HINT_ENTRY ENDBR /* * Interrupts are off on entry. @@ -337,6 +347,9 @@ SYM_CODE_START(entry_INT80_compat) cld + IBRS_ENTER + UNTRAIN_RET + movq %rsp, %rdi call do_int80_syscall_32 jmp swapgs_restore_regs_and_return_to_usermode diff --git a/arch/x86/entry/thunk_32.S b/arch/x86/entry/thunk_32.S index 7591bab060f7..ff6e7003da97 100644 --- a/arch/x86/entry/thunk_32.S +++ b/arch/x86/entry/thunk_32.S @@ -29,10 +29,8 @@ SYM_CODE_START_NOALIGN(\name) SYM_CODE_END(\name) .endm -#ifdef CONFIG_PREEMPTION THUNK preempt_schedule_thunk, preempt_schedule THUNK preempt_schedule_notrace_thunk, preempt_schedule_notrace EXPORT_SYMBOL(preempt_schedule_thunk) EXPORT_SYMBOL(preempt_schedule_notrace_thunk) -#endif diff --git a/arch/x86/entry/thunk_64.S b/arch/x86/entry/thunk_64.S index 505b488fcc65..f38b07d2768b 100644 --- a/arch/x86/entry/thunk_64.S +++ b/arch/x86/entry/thunk_64.S @@ -31,14 +31,11 @@ SYM_FUNC_END(\name) _ASM_NOKPROBE(\name) .endm -#ifdef CONFIG_PREEMPTION THUNK preempt_schedule_thunk, preempt_schedule THUNK preempt_schedule_notrace_thunk, preempt_schedule_notrace EXPORT_SYMBOL(preempt_schedule_thunk) EXPORT_SYMBOL(preempt_schedule_notrace_thunk) -#endif -#ifdef CONFIG_PREEMPTION SYM_CODE_START_LOCAL_NOALIGN(__thunk_restore) popq %r11 popq %r10 @@ -53,4 +50,3 @@ SYM_CODE_START_LOCAL_NOALIGN(__thunk_restore) RET _ASM_NOKPROBE(__thunk_restore) SYM_CODE_END(__thunk_restore) -#endif diff --git a/arch/x86/entry/vdso/Makefile b/arch/x86/entry/vdso/Makefile index c2a8b76ae0bc..76cd790ed0bd 100644 --- a/arch/x86/entry/vdso/Makefile +++ b/arch/x86/entry/vdso/Makefile @@ -92,6 +92,7 @@ endif endif $(vobjs): KBUILD_CFLAGS := $(filter-out $(CC_FLAGS_LTO) $(RANDSTRUCT_CFLAGS) $(GCC_PLUGINS_CFLAGS) $(RETPOLINE_CFLAGS),$(KBUILD_CFLAGS)) $(CFL) +$(vobjs): KBUILD_AFLAGS += -DBUILD_VDSO # # vDSO code runs in userspace and -pg doesn't help with profiling anyway. diff --git a/arch/x86/entry/vdso/vdso2c.c b/arch/x86/entry/vdso/vdso2c.c index edfe9780f6d1..90d15f2a7205 100644 --- a/arch/x86/entry/vdso/vdso2c.c +++ b/arch/x86/entry/vdso/vdso2c.c @@ -1,7 +1,7 @@ +// SPDX-License-Identifier: GPL-2.0-only /* * vdso2c - A vdso image preparation tool * Copyright (c) 2014 Andy Lutomirski and others - * Licensed under the GPL v2 * * vdso2c requires stripped and unstripped input. It would be trivial * to fully strip the input in here, but, for reasons described below, diff --git a/arch/x86/entry/vsyscall/vsyscall_emu_64.S b/arch/x86/entry/vsyscall/vsyscall_emu_64.S index 15e35159ebb6..ef2dd1827243 100644 --- a/arch/x86/entry/vsyscall/vsyscall_emu_64.S +++ b/arch/x86/entry/vsyscall/vsyscall_emu_64.S @@ -19,17 +19,20 @@ __vsyscall_page: mov $__NR_gettimeofday, %rax syscall - RET + ret + int3 .balign 1024, 0xcc mov $__NR_time, %rax syscall - RET + ret + int3 .balign 1024, 0xcc mov $__NR_getcpu, %rax syscall - RET + ret + int3 .balign 4096, 0xcc diff --git a/arch/x86/events/amd/uncore.c b/arch/x86/events/amd/uncore.c index 0d04414b97d2..d568afc705d2 100644 --- a/arch/x86/events/amd/uncore.c +++ b/arch/x86/events/amd/uncore.c @@ -21,7 +21,6 @@ #define NUM_COUNTERS_NB 4 #define NUM_COUNTERS_L2 4 #define NUM_COUNTERS_L3 6 -#define MAX_COUNTERS 6 #define RDPMC_BASE_NB 6 #define RDPMC_BASE_LLC 10 @@ -31,6 +30,7 @@ #undef pr_fmt #define pr_fmt(fmt) "amd_uncore: " fmt +static int pmu_version; static int num_counters_llc; static int num_counters_nb; static bool l3_mask; @@ -46,7 +46,7 @@ struct amd_uncore { u32 msr_base; cpumask_t *active_mask; struct pmu *pmu; - struct perf_event *events[MAX_COUNTERS]; + struct perf_event **events; struct hlist_node node; }; @@ -158,6 +158,16 @@ out: hwc->event_base_rdpmc = uncore->rdpmc_base + hwc->idx; hwc->state = PERF_HES_UPTODATE | PERF_HES_STOPPED; + /* + * The first four DF counters are accessible via RDPMC index 6 to 9 + * followed by the L3 counters from index 10 to 15. For processors + * with more than four DF counters, the DF RDPMC assignments become + * discontiguous as the additional counters are accessible starting + * from index 16. + */ + if (is_nb_event(event) && hwc->idx >= NUM_COUNTERS_NB) + hwc->event_base_rdpmc += NUM_COUNTERS_L3; + if (flags & PERF_EF_START) amd_uncore_start(event, PERF_EF_RELOAD); @@ -209,10 +219,14 @@ static int amd_uncore_event_init(struct perf_event *event) { struct amd_uncore *uncore; struct hw_perf_event *hwc = &event->hw; + u64 event_mask = AMD64_RAW_EVENT_MASK_NB; if (event->attr.type != event->pmu->type) return -ENOENT; + if (pmu_version >= 2 && is_nb_event(event)) + event_mask = AMD64_PERFMON_V2_RAW_EVENT_MASK_NB; + /* * NB and Last level cache counters (MSRs) are shared across all cores * that share the same NB / Last level cache. On family 16h and below, @@ -221,7 +235,7 @@ static int amd_uncore_event_init(struct perf_event *event) * out. So we do not support sampling and per-thread events via * CAP_NO_INTERRUPT, and we do not enable counter overflow interrupts: */ - hwc->config = event->attr.config & AMD64_RAW_EVENT_MASK_NB; + hwc->config = event->attr.config & event_mask; hwc->idx = -1; if (event->cpu < 0) @@ -247,6 +261,19 @@ static int amd_uncore_event_init(struct perf_event *event) return 0; } +static umode_t +amd_f17h_uncore_is_visible(struct kobject *kobj, struct attribute *attr, int i) +{ + return boot_cpu_data.x86 >= 0x17 && boot_cpu_data.x86 < 0x19 ? + attr->mode : 0; +} + +static umode_t +amd_f19h_uncore_is_visible(struct kobject *kobj, struct attribute *attr, int i) +{ + return boot_cpu_data.x86 >= 0x19 ? attr->mode : 0; +} + static ssize_t amd_uncore_attr_show_cpumask(struct device *dev, struct device_attribute *attr, char *buf) @@ -287,8 +314,10 @@ static struct device_attribute format_attr_##_var = \ DEFINE_UNCORE_FORMAT_ATTR(event12, event, "config:0-7,32-35"); DEFINE_UNCORE_FORMAT_ATTR(event14, event, "config:0-7,32-35,59-60"); /* F17h+ DF */ +DEFINE_UNCORE_FORMAT_ATTR(event14v2, event, "config:0-7,32-37"); /* PerfMonV2 DF */ DEFINE_UNCORE_FORMAT_ATTR(event8, event, "config:0-7"); /* F17h+ L3 */ -DEFINE_UNCORE_FORMAT_ATTR(umask, umask, "config:8-15"); +DEFINE_UNCORE_FORMAT_ATTR(umask8, umask, "config:8-15"); +DEFINE_UNCORE_FORMAT_ATTR(umask12, umask, "config:8-15,24-27"); /* PerfMonV2 DF */ DEFINE_UNCORE_FORMAT_ATTR(coreid, coreid, "config:42-44"); /* F19h L3 */ DEFINE_UNCORE_FORMAT_ATTR(slicemask, slicemask, "config:48-51"); /* F17h L3 */ DEFINE_UNCORE_FORMAT_ATTR(threadmask8, threadmask, "config:56-63"); /* F17h L3 */ @@ -297,20 +326,33 @@ DEFINE_UNCORE_FORMAT_ATTR(enallslices, enallslices, "config:46"); /* F19h L3 DEFINE_UNCORE_FORMAT_ATTR(enallcores, enallcores, "config:47"); /* F19h L3 */ DEFINE_UNCORE_FORMAT_ATTR(sliceid, sliceid, "config:48-50"); /* F19h L3 */ +/* Common DF and NB attributes */ static struct attribute *amd_uncore_df_format_attr[] = { - &format_attr_event12.attr, /* event14 if F17h+ */ - &format_attr_umask.attr, + &format_attr_event12.attr, /* event */ + &format_attr_umask8.attr, /* umask */ NULL, }; +/* Common L2 and L3 attributes */ static struct attribute *amd_uncore_l3_format_attr[] = { - &format_attr_event12.attr, /* event8 if F17h+ */ - &format_attr_umask.attr, - NULL, /* slicemask if F17h, coreid if F19h */ - NULL, /* threadmask8 if F17h, enallslices if F19h */ - NULL, /* enallcores if F19h */ - NULL, /* sliceid if F19h */ - NULL, /* threadmask2 if F19h */ + &format_attr_event12.attr, /* event */ + &format_attr_umask8.attr, /* umask */ + NULL, /* threadmask */ + NULL, +}; + +/* F17h unique L3 attributes */ +static struct attribute *amd_f17h_uncore_l3_format_attr[] = { + &format_attr_slicemask.attr, /* slicemask */ + NULL, +}; + +/* F19h unique L3 attributes */ +static struct attribute *amd_f19h_uncore_l3_format_attr[] = { + &format_attr_coreid.attr, /* coreid */ + &format_attr_enallslices.attr, /* enallslices */ + &format_attr_enallcores.attr, /* enallcores */ + &format_attr_sliceid.attr, /* sliceid */ NULL, }; @@ -324,6 +366,18 @@ static struct attribute_group amd_uncore_l3_format_group = { .attrs = amd_uncore_l3_format_attr, }; +static struct attribute_group amd_f17h_uncore_l3_format_group = { + .name = "format", + .attrs = amd_f17h_uncore_l3_format_attr, + .is_visible = amd_f17h_uncore_is_visible, +}; + +static struct attribute_group amd_f19h_uncore_l3_format_group = { + .name = "format", + .attrs = amd_f19h_uncore_l3_format_attr, + .is_visible = amd_f19h_uncore_is_visible, +}; + static const struct attribute_group *amd_uncore_df_attr_groups[] = { &amd_uncore_attr_group, &amd_uncore_df_format_group, @@ -336,6 +390,12 @@ static const struct attribute_group *amd_uncore_l3_attr_groups[] = { NULL, }; +static const struct attribute_group *amd_uncore_l3_attr_update[] = { + &amd_f17h_uncore_l3_format_group, + &amd_f19h_uncore_l3_format_group, + NULL, +}; + static struct pmu amd_nb_pmu = { .task_ctx_nr = perf_invalid_context, .attr_groups = amd_uncore_df_attr_groups, @@ -353,6 +413,7 @@ static struct pmu amd_nb_pmu = { static struct pmu amd_llc_pmu = { .task_ctx_nr = perf_invalid_context, .attr_groups = amd_uncore_l3_attr_groups, + .attr_update = amd_uncore_l3_attr_update, .name = "amd_l2", .event_init = amd_uncore_event_init, .add = amd_uncore_add, @@ -370,11 +431,19 @@ static struct amd_uncore *amd_uncore_alloc(unsigned int cpu) cpu_to_node(cpu)); } +static inline struct perf_event ** +amd_uncore_events_alloc(unsigned int num, unsigned int cpu) +{ + return kzalloc_node(sizeof(struct perf_event *) * num, GFP_KERNEL, + cpu_to_node(cpu)); +} + static int amd_uncore_cpu_up_prepare(unsigned int cpu) { - struct amd_uncore *uncore_nb = NULL, *uncore_llc; + struct amd_uncore *uncore_nb = NULL, *uncore_llc = NULL; if (amd_uncore_nb) { + *per_cpu_ptr(amd_uncore_nb, cpu) = NULL; uncore_nb = amd_uncore_alloc(cpu); if (!uncore_nb) goto fail; @@ -384,11 +453,15 @@ static int amd_uncore_cpu_up_prepare(unsigned int cpu) uncore_nb->msr_base = MSR_F15H_NB_PERF_CTL; uncore_nb->active_mask = &amd_nb_active_mask; uncore_nb->pmu = &amd_nb_pmu; + uncore_nb->events = amd_uncore_events_alloc(num_counters_nb, cpu); + if (!uncore_nb->events) + goto fail; uncore_nb->id = -1; *per_cpu_ptr(amd_uncore_nb, cpu) = uncore_nb; } if (amd_uncore_llc) { + *per_cpu_ptr(amd_uncore_llc, cpu) = NULL; uncore_llc = amd_uncore_alloc(cpu); if (!uncore_llc) goto fail; @@ -398,6 +471,9 @@ static int amd_uncore_cpu_up_prepare(unsigned int cpu) uncore_llc->msr_base = MSR_F16H_L2I_PERF_CTL; uncore_llc->active_mask = &amd_llc_active_mask; uncore_llc->pmu = &amd_llc_pmu; + uncore_llc->events = amd_uncore_events_alloc(num_counters_llc, cpu); + if (!uncore_llc->events) + goto fail; uncore_llc->id = -1; *per_cpu_ptr(amd_uncore_llc, cpu) = uncore_llc; } @@ -405,9 +481,16 @@ static int amd_uncore_cpu_up_prepare(unsigned int cpu) return 0; fail: - if (amd_uncore_nb) - *per_cpu_ptr(amd_uncore_nb, cpu) = NULL; - kfree(uncore_nb); + if (uncore_nb) { + kfree(uncore_nb->events); + kfree(uncore_nb); + } + + if (uncore_llc) { + kfree(uncore_llc->events); + kfree(uncore_llc); + } + return -ENOMEM; } @@ -540,8 +623,11 @@ static void uncore_dead(unsigned int cpu, struct amd_uncore * __percpu *uncores) if (cpu == uncore->cpu) cpumask_clear_cpu(cpu, uncore->active_mask); - if (!--uncore->refcnt) + if (!--uncore->refcnt) { + kfree(uncore->events); kfree(uncore); + } + *per_cpu_ptr(uncores, cpu) = NULL; } @@ -560,6 +646,7 @@ static int __init amd_uncore_init(void) { struct attribute **df_attr = amd_uncore_df_format_attr; struct attribute **l3_attr = amd_uncore_l3_format_attr; + union cpuid_0x80000022_ebx ebx; int ret = -ENODEV; if (boot_cpu_data.x86_vendor != X86_VENDOR_AMD && @@ -569,6 +656,9 @@ static int __init amd_uncore_init(void) if (!boot_cpu_has(X86_FEATURE_TOPOEXT)) return -ENODEV; + if (boot_cpu_has(X86_FEATURE_PERFMON_V2)) + pmu_version = 2; + num_counters_nb = NUM_COUNTERS_NB; num_counters_llc = NUM_COUNTERS_L2; if (boot_cpu_data.x86 >= 0x17) { @@ -585,8 +675,12 @@ static int __init amd_uncore_init(void) } if (boot_cpu_has(X86_FEATURE_PERFCTR_NB)) { - if (boot_cpu_data.x86 >= 0x17) + if (pmu_version >= 2) { + *df_attr++ = &format_attr_event14v2.attr; + *df_attr++ = &format_attr_umask12.attr; + } else if (boot_cpu_data.x86 >= 0x17) { *df_attr = &format_attr_event14.attr; + } amd_uncore_nb = alloc_percpu(struct amd_uncore *); if (!amd_uncore_nb) { @@ -597,6 +691,11 @@ static int __init amd_uncore_init(void) if (ret) goto fail_nb; + if (pmu_version >= 2) { + ebx.full = cpuid_ebx(EXT_PERFMON_DEBUG_FEATURES); + num_counters_nb = ebx.split.num_df_pmc; + } + pr_info("%d %s %s counters detected\n", num_counters_nb, boot_cpu_data.x86_vendor == X86_VENDOR_HYGON ? "HYGON" : "", amd_nb_pmu.name); @@ -607,16 +706,11 @@ static int __init amd_uncore_init(void) if (boot_cpu_has(X86_FEATURE_PERFCTR_LLC)) { if (boot_cpu_data.x86 >= 0x19) { *l3_attr++ = &format_attr_event8.attr; - *l3_attr++ = &format_attr_umask.attr; - *l3_attr++ = &format_attr_coreid.attr; - *l3_attr++ = &format_attr_enallslices.attr; - *l3_attr++ = &format_attr_enallcores.attr; - *l3_attr++ = &format_attr_sliceid.attr; + *l3_attr++ = &format_attr_umask8.attr; *l3_attr++ = &format_attr_threadmask2.attr; } else if (boot_cpu_data.x86 >= 0x17) { *l3_attr++ = &format_attr_event8.attr; - *l3_attr++ = &format_attr_umask.attr; - *l3_attr++ = &format_attr_slicemask.attr; + *l3_attr++ = &format_attr_umask8.attr; *l3_attr++ = &format_attr_threadmask8.attr; } diff --git a/arch/x86/events/core.c b/arch/x86/events/core.c index 30788894124f..f969410d0c90 100644 --- a/arch/x86/events/core.c +++ b/arch/x86/events/core.c @@ -693,9 +693,9 @@ void x86_pmu_disable_all(void) } } -struct perf_guest_switch_msr *perf_guest_get_msrs(int *nr) +struct perf_guest_switch_msr *perf_guest_get_msrs(int *nr, void *data) { - return static_call(x86_pmu_guest_get_msrs)(nr); + return static_call(x86_pmu_guest_get_msrs)(nr, data); } EXPORT_SYMBOL_GPL(perf_guest_get_msrs); @@ -2103,14 +2103,15 @@ static int __init init_hw_perf_events(void) } if (err != 0) { pr_cont("no PMU driver, software events only.\n"); - return 0; + err = 0; + goto out_bad_pmu; } pmu_check_apic(); /* sanity check that the hardware exists or is emulated */ if (!check_hw_exists(&pmu, x86_pmu.num_counters, x86_pmu.num_counters_fixed)) - return 0; + goto out_bad_pmu; pr_cont("%s PMU driver.\n", x86_pmu.name); @@ -2219,6 +2220,8 @@ out1: cpuhp_remove_state(CPUHP_AP_PERF_X86_STARTING); out: cpuhp_remove_state(CPUHP_PERF_X86_PREPARE); +out_bad_pmu: + memset(&x86_pmu, 0, sizeof(x86_pmu)); return err; } early_initcall(init_hw_perf_events); @@ -2990,6 +2993,11 @@ unsigned long perf_misc_flags(struct pt_regs *regs) void perf_get_x86_pmu_capability(struct x86_pmu_capability *cap) { + if (!x86_pmu_initialized()) { + memset(cap, 0, sizeof(*cap)); + return; + } + cap->version = x86_pmu.version; /* * KVM doesn't support the hybrid PMU yet. @@ -3002,5 +3010,17 @@ void perf_get_x86_pmu_capability(struct x86_pmu_capability *cap) cap->bit_width_fixed = x86_pmu.cntval_bits; cap->events_mask = (unsigned int)x86_pmu.events_maskl; cap->events_mask_len = x86_pmu.events_mask_len; + cap->pebs_ept = x86_pmu.pebs_ept; } EXPORT_SYMBOL_GPL(perf_get_x86_pmu_capability); + +u64 perf_get_hw_event_config(int hw_event) +{ + int max = x86_pmu.max_events; + + if (hw_event < max) + return x86_pmu.event_map(array_index_nospec(hw_event, max)); + + return 0; +} +EXPORT_SYMBOL_GPL(perf_get_hw_event_config); diff --git a/arch/x86/events/intel/core.c b/arch/x86/events/intel/core.c index 45024abd929f..2db93498ff71 100644 --- a/arch/x86/events/intel/core.c +++ b/arch/x86/events/intel/core.c @@ -14,6 +14,7 @@ #include <linux/slab.h> #include <linux/export.h> #include <linux/nmi.h> +#include <linux/kvm_host.h> #include <asm/cpufeature.h> #include <asm/hardirq.h> @@ -2852,6 +2853,47 @@ static void intel_pmu_reset(void) local_irq_restore(flags); } +/* + * We may be running with guest PEBS events created by KVM, and the + * PEBS records are logged into the guest's DS and invisible to host. + * + * In the case of guest PEBS overflow, we only trigger a fake event + * to emulate the PEBS overflow PMI for guest PEBS counters in KVM. + * The guest will then vm-entry and check the guest DS area to read + * the guest PEBS records. + * + * The contents and other behavior of the guest event do not matter. + */ +static void x86_pmu_handle_guest_pebs(struct pt_regs *regs, + struct perf_sample_data *data) +{ + struct cpu_hw_events *cpuc = this_cpu_ptr(&cpu_hw_events); + u64 guest_pebs_idxs = cpuc->pebs_enabled & ~cpuc->intel_ctrl_host_mask; + struct perf_event *event = NULL; + int bit; + + if (!unlikely(perf_guest_state())) + return; + + if (!x86_pmu.pebs_ept || !x86_pmu.pebs_active || + !guest_pebs_idxs) + return; + + for_each_set_bit(bit, (unsigned long *)&guest_pebs_idxs, + INTEL_PMC_IDX_FIXED + x86_pmu.num_counters_fixed) { + event = cpuc->events[bit]; + if (!event->attr.precise_ip) + continue; + + perf_sample_data_init(data, 0, event->hw.last_period); + if (perf_event_overflow(event, data, regs)) + x86_pmu_stop(event, 0); + + /* Inject one fake event is enough. */ + break; + } +} + static int handle_pmi_common(struct pt_regs *regs, u64 status) { struct perf_sample_data data; @@ -2891,10 +2933,7 @@ static int handle_pmi_common(struct pt_regs *regs, u64 status) * counters from the GLOBAL_STATUS mask and we always process PEBS * events via drain_pebs(). */ - if (x86_pmu.flags & PMU_FL_PEBS_ALL) - status &= ~cpuc->pebs_enabled; - else - status &= ~(cpuc->pebs_enabled & PEBS_COUNTER_MASK); + status &= ~(cpuc->pebs_enabled & x86_pmu.pebs_capable); /* * PEBS overflow sets bit 62 in the global status register @@ -2903,6 +2942,7 @@ static int handle_pmi_common(struct pt_regs *regs, u64 status) u64 pebs_enabled = cpuc->pebs_enabled; handled++; + x86_pmu_handle_guest_pebs(regs, &data); x86_pmu.drain_pebs(regs, &data); status &= intel_ctrl | GLOBAL_STATUS_TRACE_TOPAPMI; @@ -3930,40 +3970,98 @@ static int intel_pmu_hw_config(struct perf_event *event) return 0; } -static struct perf_guest_switch_msr *intel_guest_get_msrs(int *nr) +/* + * Currently, the only caller of this function is the atomic_switch_perf_msrs(). + * The host perf conext helps to prepare the values of the real hardware for + * a set of msrs that need to be switched atomically in a vmx transaction. + * + * For example, the pseudocode needed to add a new msr should look like: + * + * arr[(*nr)++] = (struct perf_guest_switch_msr){ + * .msr = the hardware msr address, + * .host = the value the hardware has when it doesn't run a guest, + * .guest = the value the hardware has when it runs a guest, + * }; + * + * These values have nothing to do with the emulated values the guest sees + * when it uses {RD,WR}MSR, which should be handled by the KVM context, + * specifically in the intel_pmu_{get,set}_msr(). + */ +static struct perf_guest_switch_msr *intel_guest_get_msrs(int *nr, void *data) { struct cpu_hw_events *cpuc = this_cpu_ptr(&cpu_hw_events); struct perf_guest_switch_msr *arr = cpuc->guest_switch_msrs; + struct kvm_pmu *kvm_pmu = (struct kvm_pmu *)data; u64 intel_ctrl = hybrid(cpuc->pmu, intel_ctrl); + u64 pebs_mask = cpuc->pebs_enabled & x86_pmu.pebs_capable; + int global_ctrl, pebs_enable; + + *nr = 0; + global_ctrl = (*nr)++; + arr[global_ctrl] = (struct perf_guest_switch_msr){ + .msr = MSR_CORE_PERF_GLOBAL_CTRL, + .host = intel_ctrl & ~cpuc->intel_ctrl_guest_mask, + .guest = intel_ctrl & (~cpuc->intel_ctrl_host_mask | ~pebs_mask), + }; - arr[0].msr = MSR_CORE_PERF_GLOBAL_CTRL; - arr[0].host = intel_ctrl & ~cpuc->intel_ctrl_guest_mask; - arr[0].guest = intel_ctrl & ~cpuc->intel_ctrl_host_mask; - if (x86_pmu.flags & PMU_FL_PEBS_ALL) - arr[0].guest &= ~cpuc->pebs_enabled; - else - arr[0].guest &= ~(cpuc->pebs_enabled & PEBS_COUNTER_MASK); - *nr = 1; + if (!x86_pmu.pebs) + return arr; - if (x86_pmu.pebs && x86_pmu.pebs_no_isolation) { - /* - * If PMU counter has PEBS enabled it is not enough to - * disable counter on a guest entry since PEBS memory - * write can overshoot guest entry and corrupt guest - * memory. Disabling PEBS solves the problem. - * - * Don't do this if the CPU already enforces it. - */ - arr[1].msr = MSR_IA32_PEBS_ENABLE; - arr[1].host = cpuc->pebs_enabled; - arr[1].guest = 0; - *nr = 2; + /* + * If PMU counter has PEBS enabled it is not enough to + * disable counter on a guest entry since PEBS memory + * write can overshoot guest entry and corrupt guest + * memory. Disabling PEBS solves the problem. + * + * Don't do this if the CPU already enforces it. + */ + if (x86_pmu.pebs_no_isolation) { + arr[(*nr)++] = (struct perf_guest_switch_msr){ + .msr = MSR_IA32_PEBS_ENABLE, + .host = cpuc->pebs_enabled, + .guest = 0, + }; + return arr; + } + + if (!kvm_pmu || !x86_pmu.pebs_ept) + return arr; + + arr[(*nr)++] = (struct perf_guest_switch_msr){ + .msr = MSR_IA32_DS_AREA, + .host = (unsigned long)cpuc->ds, + .guest = kvm_pmu->ds_area, + }; + + if (x86_pmu.intel_cap.pebs_baseline) { + arr[(*nr)++] = (struct perf_guest_switch_msr){ + .msr = MSR_PEBS_DATA_CFG, + .host = cpuc->pebs_data_cfg, + .guest = kvm_pmu->pebs_data_cfg, + }; + } + + pebs_enable = (*nr)++; + arr[pebs_enable] = (struct perf_guest_switch_msr){ + .msr = MSR_IA32_PEBS_ENABLE, + .host = cpuc->pebs_enabled & ~cpuc->intel_ctrl_guest_mask, + .guest = pebs_mask & ~cpuc->intel_ctrl_host_mask, + }; + + if (arr[pebs_enable].host) { + /* Disable guest PEBS if host PEBS is enabled. */ + arr[pebs_enable].guest = 0; + } else { + /* Disable guest PEBS for cross-mapped PEBS counters. */ + arr[pebs_enable].guest &= ~kvm_pmu->host_cross_mapped_mask; + /* Set hw GLOBAL_CTRL bits for PEBS counter when it runs for guest */ + arr[global_ctrl].guest |= arr[pebs_enable].guest; } return arr; } -static struct perf_guest_switch_msr *core_guest_get_msrs(int *nr) +static struct perf_guest_switch_msr *core_guest_get_msrs(int *nr, void *data) { struct cpu_hw_events *cpuc = this_cpu_ptr(&cpu_hw_events); struct perf_guest_switch_msr *arr = cpuc->guest_switch_msrs; @@ -4141,6 +4239,8 @@ tnt_get_event_constraints(struct cpu_hw_events *cpuc, int idx, { struct event_constraint *c; + c = intel_get_event_constraints(cpuc, idx, event); + /* * :ppp means to do reduced skid PEBS, * which is available on PMC0 and fixed counter 0. @@ -4153,8 +4253,6 @@ tnt_get_event_constraints(struct cpu_hw_events *cpuc, int idx, return &counter0_constraint; } - c = intel_get_event_constraints(cpuc, idx, event); - return c; } @@ -5650,6 +5748,7 @@ __init int intel_pmu_init(void) x86_pmu.events_mask_len = eax.split.mask_length; x86_pmu.max_pebs_events = min_t(unsigned, MAX_PEBS_EVENTS, x86_pmu.num_counters); + x86_pmu.pebs_capable = PEBS_COUNTER_MASK; /* * Quirk: v2 perfmon does not report fixed-purpose events, so @@ -5834,6 +5933,7 @@ __init int intel_pmu_init(void) x86_pmu.pebs_aliases = NULL; x86_pmu.pebs_prec_dist = true; x86_pmu.lbr_pt_coexist = true; + x86_pmu.pebs_capable = ~0ULL; x86_pmu.flags |= PMU_FL_HAS_RSP_1; x86_pmu.flags |= PMU_FL_PEBS_ALL; x86_pmu.get_event_constraints = glp_get_event_constraints; @@ -6138,6 +6238,7 @@ __init int intel_pmu_init(void) case INTEL_FAM6_ICELAKE_X: case INTEL_FAM6_ICELAKE_D: + x86_pmu.pebs_ept = 1; pmem = true; fallthrough; case INTEL_FAM6_ICELAKE_L: @@ -6190,6 +6291,7 @@ __init int intel_pmu_init(void) x86_pmu.pebs_aliases = NULL; x86_pmu.pebs_prec_dist = true; x86_pmu.pebs_block = true; + x86_pmu.pebs_capable = ~0ULL; x86_pmu.flags |= PMU_FL_HAS_RSP_1; x86_pmu.flags |= PMU_FL_NO_HT_SHARING; x86_pmu.flags |= PMU_FL_PEBS_ALL; @@ -6235,13 +6337,15 @@ __init int intel_pmu_init(void) x86_pmu.pebs_aliases = NULL; x86_pmu.pebs_prec_dist = true; x86_pmu.pebs_block = true; + x86_pmu.pebs_capable = ~0ULL; x86_pmu.flags |= PMU_FL_HAS_RSP_1; x86_pmu.flags |= PMU_FL_NO_HT_SHARING; x86_pmu.flags |= PMU_FL_PEBS_ALL; x86_pmu.flags |= PMU_FL_INSTR_LATENCY; x86_pmu.flags |= PMU_FL_MEM_LOADS_AUX; x86_pmu.lbr_pt_coexist = true; - intel_pmu_pebs_data_source_skl(false); + intel_pmu_pebs_data_source_adl(); + x86_pmu.pebs_latency_data = adl_latency_data_small; x86_pmu.num_topdown_events = 8; x86_pmu.update_topdown_event = adl_update_topdown_event; x86_pmu.set_topdown_event_period = adl_set_topdown_event_period; @@ -6398,8 +6502,7 @@ __init int intel_pmu_init(void) x86_pmu.intel_ctrl); /* * Access LBR MSR may cause #GP under certain circumstances. - * E.g. KVM doesn't support LBR MSR - * Check all LBT MSR here. + * Check all LBR MSR here. * Disable LBR access if any LBR MSRs can not be accessed. */ if (x86_pmu.lbr_tos && !check_msr(x86_pmu.lbr_tos, 0x3UL)) diff --git a/arch/x86/events/intel/ds.c b/arch/x86/events/intel/ds.c index 376cc3d66094..ba60427caa6d 100644 --- a/arch/x86/events/intel/ds.c +++ b/arch/x86/events/intel/ds.c @@ -94,15 +94,40 @@ void __init intel_pmu_pebs_data_source_nhm(void) pebs_data_source[0x07] = OP_LH | P(LVL, L3) | LEVEL(L3) | P(SNOOP, HITM); } -void __init intel_pmu_pebs_data_source_skl(bool pmem) +static void __init __intel_pmu_pebs_data_source_skl(bool pmem, u64 *data_source) { u64 pmem_or_l4 = pmem ? LEVEL(PMEM) : LEVEL(L4); - pebs_data_source[0x08] = OP_LH | pmem_or_l4 | P(SNOOP, HIT); - pebs_data_source[0x09] = OP_LH | pmem_or_l4 | REM | P(SNOOP, HIT); - pebs_data_source[0x0b] = OP_LH | LEVEL(RAM) | REM | P(SNOOP, NONE); - pebs_data_source[0x0c] = OP_LH | LEVEL(ANY_CACHE) | REM | P(SNOOPX, FWD); - pebs_data_source[0x0d] = OP_LH | LEVEL(ANY_CACHE) | REM | P(SNOOP, HITM); + data_source[0x08] = OP_LH | pmem_or_l4 | P(SNOOP, HIT); + data_source[0x09] = OP_LH | pmem_or_l4 | REM | P(SNOOP, HIT); + data_source[0x0b] = OP_LH | LEVEL(RAM) | REM | P(SNOOP, NONE); + data_source[0x0c] = OP_LH | LEVEL(ANY_CACHE) | REM | P(SNOOPX, FWD); + data_source[0x0d] = OP_LH | LEVEL(ANY_CACHE) | REM | P(SNOOP, HITM); +} + +void __init intel_pmu_pebs_data_source_skl(bool pmem) +{ + __intel_pmu_pebs_data_source_skl(pmem, pebs_data_source); +} + +static void __init intel_pmu_pebs_data_source_grt(u64 *data_source) +{ + data_source[0x05] = OP_LH | P(LVL, L3) | LEVEL(L3) | P(SNOOP, HIT); + data_source[0x06] = OP_LH | P(LVL, L3) | LEVEL(L3) | P(SNOOP, HITM); + data_source[0x08] = OP_LH | P(LVL, L3) | LEVEL(L3) | P(SNOOPX, FWD); +} + +void __init intel_pmu_pebs_data_source_adl(void) +{ + u64 *data_source; + + data_source = x86_pmu.hybrid_pmu[X86_HYBRID_PMU_CORE_IDX].pebs_data_source; + memcpy(data_source, pebs_data_source, sizeof(pebs_data_source)); + __intel_pmu_pebs_data_source_skl(false, data_source); + + data_source = x86_pmu.hybrid_pmu[X86_HYBRID_PMU_ATOM_IDX].pebs_data_source; + memcpy(data_source, pebs_data_source, sizeof(pebs_data_source)); + intel_pmu_pebs_data_source_grt(data_source); } static u64 precise_store_data(u64 status) @@ -171,7 +196,50 @@ static u64 precise_datala_hsw(struct perf_event *event, u64 status) return dse.val; } -static u64 load_latency_data(u64 status) +static inline void pebs_set_tlb_lock(u64 *val, bool tlb, bool lock) +{ + /* + * TLB access + * 0 = did not miss 2nd level TLB + * 1 = missed 2nd level TLB + */ + if (tlb) + *val |= P(TLB, MISS) | P(TLB, L2); + else + *val |= P(TLB, HIT) | P(TLB, L1) | P(TLB, L2); + + /* locked prefix */ + if (lock) + *val |= P(LOCK, LOCKED); +} + +/* Retrieve the latency data for e-core of ADL */ +u64 adl_latency_data_small(struct perf_event *event, u64 status) +{ + union intel_x86_pebs_dse dse; + u64 val; + + WARN_ON_ONCE(hybrid_pmu(event->pmu)->cpu_type == hybrid_big); + + dse.val = status; + + val = hybrid_var(event->pmu, pebs_data_source)[dse.ld_dse]; + + /* + * For the atom core on ADL, + * bit 4: lock, bit 5: TLB access. + */ + pebs_set_tlb_lock(&val, dse.ld_locked, dse.ld_stlb_miss); + + if (dse.ld_data_blk) + val |= P(BLK, DATA); + else + val |= P(BLK, NA); + + return val; +} + +static u64 load_latency_data(struct perf_event *event, u64 status) { union intel_x86_pebs_dse dse; u64 val; @@ -181,7 +249,7 @@ static u64 load_latency_data(u64 status) /* * use the mapping table for bit 0-3 */ - val = pebs_data_source[dse.ld_dse]; + val = hybrid_var(event->pmu, pebs_data_source)[dse.ld_dse]; /* * Nehalem models do not support TLB, Lock infos @@ -190,21 +258,8 @@ static u64 load_latency_data(u64 status) val |= P(TLB, NA) | P(LOCK, NA); return val; } - /* - * bit 4: TLB access - * 0 = did not miss 2nd level TLB - * 1 = missed 2nd level TLB - */ - if (dse.ld_stlb_miss) - val |= P(TLB, MISS) | P(TLB, L2); - else - val |= P(TLB, HIT) | P(TLB, L1) | P(TLB, L2); - /* - * bit 5: locked prefix - */ - if (dse.ld_locked) - val |= P(LOCK, LOCKED); + pebs_set_tlb_lock(&val, dse.ld_stlb_miss, dse.ld_locked); /* * Ice Lake and earlier models do not support block infos. @@ -233,7 +288,7 @@ static u64 load_latency_data(u64 status) return val; } -static u64 store_latency_data(u64 status) +static u64 store_latency_data(struct perf_event *event, u64 status) { union intel_x86_pebs_dse dse; u64 val; @@ -243,23 +298,9 @@ static u64 store_latency_data(u64 status) /* * use the mapping table for bit 0-3 */ - val = pebs_data_source[dse.st_lat_dse]; + val = hybrid_var(event->pmu, pebs_data_source)[dse.st_lat_dse]; - /* - * bit 4: TLB access - * 0 = did not miss 2nd level TLB - * 1 = missed 2nd level TLB - */ - if (dse.st_lat_stlb_miss) - val |= P(TLB, MISS) | P(TLB, L2); - else - val |= P(TLB, HIT) | P(TLB, L1) | P(TLB, L2); - - /* - * bit 5: locked prefix - */ - if (dse.st_lat_locked) - val |= P(LOCK, LOCKED); + pebs_set_tlb_lock(&val, dse.st_lat_stlb_miss, dse.st_lat_locked); val |= P(BLK, NA); @@ -781,8 +822,8 @@ struct event_constraint intel_glm_pebs_event_constraints[] = { struct event_constraint intel_grt_pebs_event_constraints[] = { /* Allow all events as PEBS with no flags */ - INTEL_PLD_CONSTRAINT(0x5d0, 0xf), - INTEL_PSD_CONSTRAINT(0x6d0, 0xf), + INTEL_HYBRID_LAT_CONSTRAINT(0x5d0, 0xf), + INTEL_HYBRID_LAT_CONSTRAINT(0x6d0, 0xf), EVENT_CONSTRAINT_END }; @@ -1443,9 +1484,11 @@ static u64 get_data_src(struct perf_event *event, u64 aux) bool fst = fl & (PERF_X86_EVENT_PEBS_ST | PERF_X86_EVENT_PEBS_HSW_PREC); if (fl & PERF_X86_EVENT_PEBS_LDLAT) - val = load_latency_data(aux); + val = load_latency_data(event, aux); else if (fl & PERF_X86_EVENT_PEBS_STLAT) - val = store_latency_data(aux); + val = store_latency_data(event, aux); + else if (fl & PERF_X86_EVENT_PEBS_LAT_HYBRID) + val = x86_pmu.pebs_latency_data(event, aux); else if (fst && (fl & PERF_X86_EVENT_PEBS_HSW_PREC)) val = precise_datala_hsw(event, aux); else if (fst) diff --git a/arch/x86/events/intel/lbr.c b/arch/x86/events/intel/lbr.c index 13179f31fe10..4f70fb6c2c1e 100644 --- a/arch/x86/events/intel/lbr.c +++ b/arch/x86/events/intel/lbr.c @@ -278,9 +278,9 @@ enum { }; /* - * For formats with LBR_TSX flags (e.g. LBR_FORMAT_EIP_FLAGS2), bits 61:62 in - * MSR_LAST_BRANCH_FROM_x are the TSX flags when TSX is supported, but when - * TSX is not supported they have no consistent behavior: + * For format LBR_FORMAT_EIP_FLAGS2, bits 61:62 in MSR_LAST_BRANCH_FROM_x + * are the TSX flags when TSX is supported, but when TSX is not supported + * they have no consistent behavior: * * - For wrmsr(), bits 61:62 are considered part of the sign extension. * - For HW updates (branch captures) bits 61:62 are always OFF and are not @@ -288,7 +288,7 @@ enum { * * Therefore, if: * - * 1) LBR has TSX format + * 1) LBR format LBR_FORMAT_EIP_FLAGS2 * 2) CPU has no TSX support enabled * * ... then any value passed to wrmsr() must be sign extended to 63 bits and any @@ -300,7 +300,7 @@ static inline bool lbr_from_signext_quirk_needed(void) bool tsx_support = boot_cpu_has(X86_FEATURE_HLE) || boot_cpu_has(X86_FEATURE_RTM); - return !tsx_support && x86_pmu.lbr_has_tsx; + return !tsx_support; } static DEFINE_STATIC_KEY_FALSE(lbr_from_quirk_key); @@ -1609,9 +1609,6 @@ void intel_pmu_lbr_init_hsw(void) x86_pmu.lbr_sel_map = hsw_lbr_sel_map; x86_get_pmu(smp_processor_id())->task_ctx_cache = create_lbr_kmem_cache(size, 0); - - if (lbr_from_signext_quirk_needed()) - static_branch_enable(&lbr_from_quirk_key); } /* skylake */ @@ -1702,7 +1699,11 @@ void intel_pmu_lbr_init(void) switch (x86_pmu.intel_cap.lbr_format) { case LBR_FORMAT_EIP_FLAGS2: x86_pmu.lbr_has_tsx = 1; - fallthrough; + x86_pmu.lbr_from_flags = 1; + if (lbr_from_signext_quirk_needed()) + static_branch_enable(&lbr_from_quirk_key); + break; + case LBR_FORMAT_EIP_FLAGS: x86_pmu.lbr_from_flags = 1; break; diff --git a/arch/x86/events/perf_event.h b/arch/x86/events/perf_event.h index 21a5482bcf84..ba3d24a6a4ec 100644 --- a/arch/x86/events/perf_event.h +++ b/arch/x86/events/perf_event.h @@ -84,6 +84,7 @@ static inline bool constraint_match(struct event_constraint *c, u64 ecode) #define PERF_X86_EVENT_TOPDOWN 0x04000 /* Count Topdown slots/metrics events */ #define PERF_X86_EVENT_PEBS_STLAT 0x08000 /* st+stlat data address sampling */ #define PERF_X86_EVENT_AMD_BRS 0x10000 /* AMD Branch Sampling */ +#define PERF_X86_EVENT_PEBS_LAT_HYBRID 0x20000 /* ld and st lat for hybrid */ static inline bool is_topdown_count(struct perf_event *event) { @@ -136,7 +137,8 @@ struct amd_nb { PERF_SAMPLE_DATA_SRC | PERF_SAMPLE_IDENTIFIER | \ PERF_SAMPLE_TRANSACTION | PERF_SAMPLE_PHYS_ADDR | \ PERF_SAMPLE_REGS_INTR | PERF_SAMPLE_REGS_USER | \ - PERF_SAMPLE_PERIOD | PERF_SAMPLE_CODE_PAGE_SIZE) + PERF_SAMPLE_PERIOD | PERF_SAMPLE_CODE_PAGE_SIZE | \ + PERF_SAMPLE_WEIGHT_TYPE) #define PEBS_GP_REGS \ ((1ULL << PERF_REG_X86_AX) | \ @@ -460,6 +462,10 @@ struct cpu_hw_events { __EVENT_CONSTRAINT(c, n, INTEL_ARCH_EVENT_MASK|X86_ALL_EVENT_FLAGS, \ HWEIGHT(n), 0, PERF_X86_EVENT_PEBS_ST) +#define INTEL_HYBRID_LAT_CONSTRAINT(c, n) \ + __EVENT_CONSTRAINT(c, n, INTEL_ARCH_EVENT_MASK|X86_ALL_EVENT_FLAGS, \ + HWEIGHT(n), 0, PERF_X86_EVENT_PEBS_LAT_HYBRID) + /* Event constraint, but match on all event flags too. */ #define INTEL_FLAGS_EVENT_CONSTRAINT(c, n) \ EVENT_CONSTRAINT(c, n, ARCH_PERFMON_EVENTSEL_EVENT|X86_ALL_EVENT_FLAGS) @@ -638,6 +644,8 @@ enum { x86_lbr_exclusive_max, }; +#define PERF_PEBS_DATA_SOURCE_MAX 0x10 + struct x86_hybrid_pmu { struct pmu pmu; const char *name; @@ -665,6 +673,8 @@ struct x86_hybrid_pmu { unsigned int late_ack :1, mid_ack :1, enabled_ack :1; + + u64 pebs_data_source[PERF_PEBS_DATA_SOURCE_MAX]; }; static __always_inline struct x86_hybrid_pmu *hybrid_pmu(struct pmu *pmu) @@ -818,15 +828,18 @@ struct x86_pmu { pebs_prec_dist :1, pebs_no_tlb :1, pebs_no_isolation :1, - pebs_block :1; + pebs_block :1, + pebs_ept :1; int pebs_record_size; int pebs_buffer_size; int max_pebs_events; void (*drain_pebs)(struct pt_regs *regs, struct perf_sample_data *data); struct event_constraint *pebs_constraints; void (*pebs_aliases)(struct perf_event *event); + u64 (*pebs_latency_data)(struct perf_event *event, u64 status); unsigned long large_pebs_flags; u64 rtm_abort_event; + u64 pebs_capable; /* * Intel LBR @@ -902,7 +915,7 @@ struct x86_pmu { /* * Intel host/guest support (KVM) */ - struct perf_guest_switch_msr *(*guest_get_msrs)(int *nr); + struct perf_guest_switch_msr *(*guest_get_msrs)(int *nr, void *data); /* * Check period value for PERF_EVENT_IOC_PERIOD ioctl. @@ -1392,6 +1405,8 @@ void intel_pmu_disable_bts(void); int intel_pmu_drain_bts_buffer(void); +u64 adl_latency_data_small(struct perf_event *event, u64 status); + extern struct event_constraint intel_core2_pebs_event_constraints[]; extern struct event_constraint intel_atom_pebs_event_constraints[]; @@ -1499,6 +1514,8 @@ void intel_pmu_pebs_data_source_nhm(void); void intel_pmu_pebs_data_source_skl(bool pmem); +void intel_pmu_pebs_data_source_adl(void); + int intel_pmu_setup_lbr_filter(struct perf_event *event); void intel_pt_interrupt(void); diff --git a/arch/x86/hyperv/hv_apic.c b/arch/x86/hyperv/hv_apic.c index db2d92fb44da..fb8b2c088681 100644 --- a/arch/x86/hyperv/hv_apic.c +++ b/arch/x86/hyperv/hv_apic.c @@ -46,7 +46,7 @@ static void hv_apic_icr_write(u32 low, u32 id) { u64 reg_val; - reg_val = SET_APIC_DEST_FIELD(id); + reg_val = SET_XAPIC_DEST_FIELD(id); reg_val = reg_val << 32; reg_val |= low; diff --git a/arch/x86/hyperv/irqdomain.c b/arch/x86/hyperv/irqdomain.c index 7e0f6bedc248..42c70d28ef27 100644 --- a/arch/x86/hyperv/irqdomain.c +++ b/arch/x86/hyperv/irqdomain.c @@ -192,7 +192,7 @@ static void hv_irq_compose_msi_msg(struct irq_data *data, struct msi_msg *msg) struct pci_dev *dev; struct hv_interrupt_entry out_entry, *stored_entry; struct irq_cfg *cfg = irqd_cfg(data); - cpumask_t *affinity; + const cpumask_t *affinity; int cpu; u64 status; diff --git a/arch/x86/include/asm/alternative.h b/arch/x86/include/asm/alternative.h index 9b10c8c76087..9542c582d546 100644 --- a/arch/x86/include/asm/alternative.h +++ b/arch/x86/include/asm/alternative.h @@ -76,6 +76,7 @@ extern int alternatives_patched; extern void alternative_instructions(void); extern void apply_alternatives(struct alt_instr *start, struct alt_instr *end); extern void apply_retpolines(s32 *start, s32 *end); +extern void apply_returns(s32 *start, s32 *end); extern void apply_ibt_endbr(s32 *start, s32 *end); struct module; diff --git a/arch/x86/include/asm/amd-ibs.h b/arch/x86/include/asm/amd-ibs.h index aabdbb5ab920..f3eb098d63d4 100644 --- a/arch/x86/include/asm/amd-ibs.h +++ b/arch/x86/include/asm/amd-ibs.h @@ -29,7 +29,10 @@ union ibs_fetch_ctl { rand_en:1, /* 57: random tagging enable */ fetch_l2_miss:1,/* 58: L2 miss for sampled fetch * (needs IbsFetchComp) */ - reserved:5; /* 59-63: reserved */ + l3_miss_only:1, /* 59: Collect L3 miss samples only */ + fetch_oc_miss:1,/* 60: Op cache miss for the sampled fetch */ + fetch_l3_miss:1,/* 61: L3 cache miss for the sampled fetch */ + reserved:2; /* 62-63: reserved */ }; }; @@ -38,14 +41,14 @@ union ibs_op_ctl { __u64 val; struct { __u64 opmaxcnt:16, /* 0-15: periodic op max. count */ - reserved0:1, /* 16: reserved */ + l3_miss_only:1, /* 16: Collect L3 miss samples only */ op_en:1, /* 17: op sampling enable */ op_val:1, /* 18: op sample valid */ cnt_ctl:1, /* 19: periodic op counter control */ opmaxcnt_ext:7, /* 20-26: upper 7 bits of periodic op maximum count */ - reserved1:5, /* 27-31: reserved */ + reserved0:5, /* 27-31: reserved */ opcurcnt:27, /* 32-58: periodic op counter current count */ - reserved2:5; /* 59-63: reserved */ + reserved1:5; /* 59-63: reserved */ }; }; @@ -71,11 +74,12 @@ union ibs_op_data { union ibs_op_data2 { __u64 val; struct { - __u64 data_src:3, /* 0-2: data source */ + __u64 data_src_lo:3, /* 0-2: data source low */ reserved0:1, /* 3: reserved */ rmt_node:1, /* 4: destination node */ cache_hit_st:1, /* 5: cache hit state */ - reserved1:57; /* 5-63: reserved */ + data_src_hi:2, /* 6-7: data source high */ + reserved1:56; /* 8-63: reserved */ }; }; diff --git a/arch/x86/include/asm/apicdef.h b/arch/x86/include/asm/apicdef.h index 92035eb3afee..68d213e83fcc 100644 --- a/arch/x86/include/asm/apicdef.h +++ b/arch/x86/include/asm/apicdef.h @@ -89,8 +89,8 @@ #define APIC_DM_EXTINT 0x00700 #define APIC_VECTOR_MASK 0x000FF #define APIC_ICR2 0x310 -#define GET_APIC_DEST_FIELD(x) (((x) >> 24) & 0xFF) -#define SET_APIC_DEST_FIELD(x) ((x) << 24) +#define GET_XAPIC_DEST_FIELD(x) (((x) >> 24) & 0xFF) +#define SET_XAPIC_DEST_FIELD(x) ((x) << 24) #define APIC_LVTT 0x320 #define APIC_LVTTHMR 0x330 #define APIC_LVTPC 0x340 diff --git a/arch/x86/include/asm/archrandom.h b/arch/x86/include/asm/archrandom.h index ebc248e49549..02bae8e0758b 100644 --- a/arch/x86/include/asm/archrandom.h +++ b/arch/x86/include/asm/archrandom.h @@ -31,20 +31,6 @@ static inline bool __must_check rdrand_long(unsigned long *v) return false; } -static inline bool __must_check rdrand_int(unsigned int *v) -{ - bool ok; - unsigned int retry = RDRAND_RETRY_LOOPS; - do { - asm volatile("rdrand %[out]" - CC_SET(c) - : CC_OUT(c) (ok), [out] "=r" (*v)); - if (ok) - return true; - } while (--retry); - return false; -} - static inline bool __must_check rdseed_long(unsigned long *v) { bool ok; @@ -54,48 +40,23 @@ static inline bool __must_check rdseed_long(unsigned long *v) return ok; } -static inline bool __must_check rdseed_int(unsigned int *v) -{ - bool ok; - asm volatile("rdseed %[out]" - CC_SET(c) - : CC_OUT(c) (ok), [out] "=r" (*v)); - return ok; -} - /* * These are the generic interfaces; they must not be declared if the - * stubs in <linux/random.h> are to be invoked, - * i.e. CONFIG_ARCH_RANDOM is not defined. + * stubs in <linux/random.h> are to be invoked. */ -#ifdef CONFIG_ARCH_RANDOM -static inline bool __must_check arch_get_random_long(unsigned long *v) +static inline size_t __must_check arch_get_random_longs(unsigned long *v, size_t max_longs) { - return static_cpu_has(X86_FEATURE_RDRAND) ? rdrand_long(v) : false; + return max_longs && static_cpu_has(X86_FEATURE_RDRAND) && rdrand_long(v) ? 1 : 0; } -static inline bool __must_check arch_get_random_int(unsigned int *v) +static inline size_t __must_check arch_get_random_seed_longs(unsigned long *v, size_t max_longs) { - return static_cpu_has(X86_FEATURE_RDRAND) ? rdrand_int(v) : false; + return max_longs && static_cpu_has(X86_FEATURE_RDSEED) && rdseed_long(v) ? 1 : 0; } -static inline bool __must_check arch_get_random_seed_long(unsigned long *v) -{ - return static_cpu_has(X86_FEATURE_RDSEED) ? rdseed_long(v) : false; -} - -static inline bool __must_check arch_get_random_seed_int(unsigned int *v) -{ - return static_cpu_has(X86_FEATURE_RDSEED) ? rdseed_int(v) : false; -} - -extern void x86_init_rdrand(struct cpuinfo_x86 *c); - -#else /* !CONFIG_ARCH_RANDOM */ - -static inline void x86_init_rdrand(struct cpuinfo_x86 *c) { } - -#endif /* !CONFIG_ARCH_RANDOM */ +#ifndef CONFIG_UML +void x86_init_rdrand(struct cpuinfo_x86 *c); +#endif #endif /* ASM_X86_ARCHRANDOM_H */ diff --git a/arch/x86/include/asm/cpufeatures.h b/arch/x86/include/asm/cpufeatures.h index 03acc823838a..14ed039dff55 100644 --- a/arch/x86/include/asm/cpufeatures.h +++ b/arch/x86/include/asm/cpufeatures.h @@ -203,8 +203,8 @@ #define X86_FEATURE_PROC_FEEDBACK ( 7*32+ 9) /* AMD ProcFeedbackInterface */ #define X86_FEATURE_XCOMPACTED ( 7*32+10) /* "" Use compacted XSTATE (XSAVES or XSAVEC) */ #define X86_FEATURE_PTI ( 7*32+11) /* Kernel Page Table Isolation enabled */ -#define X86_FEATURE_RETPOLINE ( 7*32+12) /* "" Generic Retpoline mitigation for Spectre variant 2 */ -#define X86_FEATURE_RETPOLINE_LFENCE ( 7*32+13) /* "" Use LFENCE for Spectre variant 2 */ +#define X86_FEATURE_KERNEL_IBRS ( 7*32+12) /* "" Set/clear IBRS on kernel entry/exit */ +#define X86_FEATURE_RSB_VMEXIT ( 7*32+13) /* "" Fill RSB on VM-Exit */ #define X86_FEATURE_INTEL_PPIN ( 7*32+14) /* Intel Processor Inventory Number */ #define X86_FEATURE_CDP_L2 ( 7*32+15) /* Code and Data Prioritization L2 */ #define X86_FEATURE_MSR_SPEC_CTRL ( 7*32+16) /* "" MSR SPEC_CTRL is implemented */ @@ -219,7 +219,7 @@ #define X86_FEATURE_IBRS ( 7*32+25) /* Indirect Branch Restricted Speculation */ #define X86_FEATURE_IBPB ( 7*32+26) /* Indirect Branch Prediction Barrier */ #define X86_FEATURE_STIBP ( 7*32+27) /* Single Thread Indirect Branch Predictors */ -#define X86_FEATURE_ZEN ( 7*32+28) /* "" CPU is AMD family 0x17 or above (Zen) */ +#define X86_FEATURE_ZEN (7*32+28) /* "" CPU based on Zen microarchitecture */ #define X86_FEATURE_L1TF_PTEINV ( 7*32+29) /* "" L1TF workaround PTE inversion */ #define X86_FEATURE_IBRS_ENHANCED ( 7*32+30) /* Enhanced IBRS */ #define X86_FEATURE_MSR_IA32_FEAT_CTL ( 7*32+31) /* "" MSR IA32_FEAT_CTL configured */ @@ -296,6 +296,13 @@ #define X86_FEATURE_PER_THREAD_MBA (11*32+ 7) /* "" Per-thread Memory Bandwidth Allocation */ #define X86_FEATURE_SGX1 (11*32+ 8) /* "" Basic SGX */ #define X86_FEATURE_SGX2 (11*32+ 9) /* "" SGX Enclave Dynamic Memory Management (EDMM) */ +#define X86_FEATURE_ENTRY_IBPB (11*32+10) /* "" Issue an IBPB on kernel entry */ +#define X86_FEATURE_RRSBA_CTRL (11*32+11) /* "" RET prediction control */ +#define X86_FEATURE_RETPOLINE (11*32+12) /* "" Generic Retpoline mitigation for Spectre variant 2 */ +#define X86_FEATURE_RETPOLINE_LFENCE (11*32+13) /* "" Use LFENCE for Spectre variant 2 */ +#define X86_FEATURE_RETHUNK (11*32+14) /* "" Use REturn THUNK */ +#define X86_FEATURE_UNRET (11*32+15) /* "" AMD BTB untrain return */ +#define X86_FEATURE_USE_IBPB_FW (11*32+16) /* "" Use IBPB during runtime firmware calls */ /* Intel-defined CPU features, CPUID level 0x00000007:1 (EAX), word 12 */ #define X86_FEATURE_AVX_VNNI (12*32+ 4) /* AVX VNNI instructions */ @@ -316,6 +323,7 @@ #define X86_FEATURE_VIRT_SSBD (13*32+25) /* Virtualized Speculative Store Bypass Disable */ #define X86_FEATURE_AMD_SSB_NO (13*32+26) /* "" Speculative Store Bypass is fixed in hardware. */ #define X86_FEATURE_CPPC (13*32+27) /* Collaborative Processor Performance Control */ +#define X86_FEATURE_BTC_NO (13*32+29) /* "" Not vulnerable to Branch Type Confusion */ #define X86_FEATURE_BRS (13*32+31) /* Branch Sampling available */ /* Thermal and Power Management Leaf, CPUID level 0x00000006 (EAX), word 14 */ @@ -345,6 +353,7 @@ #define X86_FEATURE_AVIC (15*32+13) /* Virtual Interrupt Controller */ #define X86_FEATURE_V_VMSAVE_VMLOAD (15*32+15) /* Virtual VMSAVE VMLOAD */ #define X86_FEATURE_VGIF (15*32+16) /* Virtual GIF */ +#define X86_FEATURE_X2AVIC (15*32+18) /* Virtual x2apic */ #define X86_FEATURE_V_SPEC_CTRL (15*32+20) /* Virtual SPEC_CTRL */ #define X86_FEATURE_SVME_ADDR_CHK (15*32+28) /* "" SVME addr check */ @@ -447,5 +456,6 @@ #define X86_BUG_ITLB_MULTIHIT X86_BUG(23) /* CPU may incur MCE during certain page attribute changes */ #define X86_BUG_SRBDS X86_BUG(24) /* CPU may leak RNG bits if not mitigated */ #define X86_BUG_MMIO_STALE_DATA X86_BUG(25) /* CPU is affected by Processor MMIO Stale Data vulnerabilities */ +#define X86_BUG_RETBLEED X86_BUG(26) /* CPU is affected by RETBleed */ #endif /* _ASM_X86_CPUFEATURES_H */ diff --git a/arch/x86/include/asm/disabled-features.h b/arch/x86/include/asm/disabled-features.h index 36369e76cc63..33d2cd04d254 100644 --- a/arch/x86/include/asm/disabled-features.h +++ b/arch/x86/include/asm/disabled-features.h @@ -50,6 +50,25 @@ # define DISABLE_PTI (1 << (X86_FEATURE_PTI & 31)) #endif +#ifdef CONFIG_RETPOLINE +# define DISABLE_RETPOLINE 0 +#else +# define DISABLE_RETPOLINE ((1 << (X86_FEATURE_RETPOLINE & 31)) | \ + (1 << (X86_FEATURE_RETPOLINE_LFENCE & 31))) +#endif + +#ifdef CONFIG_RETHUNK +# define DISABLE_RETHUNK 0 +#else +# define DISABLE_RETHUNK (1 << (X86_FEATURE_RETHUNK & 31)) +#endif + +#ifdef CONFIG_CPU_UNRET_ENTRY +# define DISABLE_UNRET 0 +#else +# define DISABLE_UNRET (1 << (X86_FEATURE_UNRET & 31)) +#endif + #ifdef CONFIG_INTEL_IOMMU_SVM # define DISABLE_ENQCMD 0 #else @@ -82,7 +101,7 @@ #define DISABLED_MASK8 (DISABLE_TDX_GUEST) #define DISABLED_MASK9 (DISABLE_SGX) #define DISABLED_MASK10 0 -#define DISABLED_MASK11 0 +#define DISABLED_MASK11 (DISABLE_RETPOLINE|DISABLE_RETHUNK|DISABLE_UNRET) #define DISABLED_MASK12 0 #define DISABLED_MASK13 0 #define DISABLED_MASK14 0 diff --git a/arch/x86/include/asm/dma.h b/arch/x86/include/asm/dma.h index 8e95aa4b0d17..8ae6e0e11b8b 100644 --- a/arch/x86/include/asm/dma.h +++ b/arch/x86/include/asm/dma.h @@ -307,12 +307,4 @@ extern int request_dma(unsigned int dmanr, const char *device_id); extern void free_dma(unsigned int dmanr); #endif -/* From PCI */ - -#ifdef CONFIG_PCI -extern int isa_dma_bridge_buggy; -#else -#define isa_dma_bridge_buggy (0) -#endif - #endif /* _ASM_X86_DMA_H */ diff --git a/arch/x86/include/asm/efi.h b/arch/x86/include/asm/efi.h index 9636742a80f2..233ae6986d6f 100644 --- a/arch/x86/include/asm/efi.h +++ b/arch/x86/include/asm/efi.h @@ -100,8 +100,6 @@ static inline void efi_fpu_end(void) efi_fpu_end(); \ }) -#define arch_efi_call_virt(p, f, args...) p->f(args) - #else /* !CONFIG_X86_32 */ #define EFI_LOADER_SIGNATURE "EL64" @@ -121,6 +119,7 @@ extern asmlinkage u64 __efi_call(void *fp, ...); efi_enter_mm(); \ }) +#undef arch_efi_call_virt #define arch_efi_call_virt(p, f, args...) ({ \ u64 ret, ibt = ibt_save(); \ ret = efi_call((void *)p->f, args); \ @@ -383,7 +382,6 @@ static inline bool efi_is_64bit(void) extern bool efi_reboot_required(void); extern bool efi_is_table_address(unsigned long phys_addr); -extern void efi_find_mirror(void); extern void efi_reserve_boot_services(void); #else static inline void parse_efi_setup(u64 phys_addr, u32 data_len) {} @@ -395,9 +393,6 @@ static inline bool efi_is_table_address(unsigned long phys_addr) { return false; } -static inline void efi_find_mirror(void) -{ -} static inline void efi_reserve_boot_services(void) { } diff --git a/arch/x86/include/asm/fpu/api.h b/arch/x86/include/asm/fpu/api.h index 6b0f31fb53f7..503a577814b2 100644 --- a/arch/x86/include/asm/fpu/api.h +++ b/arch/x86/include/asm/fpu/api.h @@ -164,4 +164,6 @@ static inline bool fpstate_is_confidential(struct fpu_guest *gfpu) /* prctl */ extern long fpu_xstate_prctl(int option, unsigned long arg2); +extern void fpu_idle_fpregs(void); + #endif /* _ASM_X86_FPU_API_H */ diff --git a/arch/x86/include/asm/io.h b/arch/x86/include/asm/io.h index 1870b99c3356..e9025640f634 100644 --- a/arch/x86/include/asm/io.h +++ b/arch/x86/include/asm/io.h @@ -170,15 +170,6 @@ static inline unsigned int isa_virt_to_bus(volatile void *address) #define isa_bus_to_virt phys_to_virt /* - * However PCI ones are not necessarily 1:1 and therefore these interfaces - * are forbidden in portable PCI drivers. - * - * Allow them on x86 for legacy drivers, though. - */ -#define virt_to_bus virt_to_phys -#define bus_to_virt phys_to_virt - -/* * The default ioremap() behavior is non-cached; if you need something * else, you probably want one of the following. */ diff --git a/arch/x86/include/asm/kexec.h b/arch/x86/include/asm/kexec.h index 6ad8d946cd3e..a3760ca796aa 100644 --- a/arch/x86/include/asm/kexec.h +++ b/arch/x86/include/asm/kexec.h @@ -186,6 +186,12 @@ extern int arch_kexec_post_alloc_pages(void *vaddr, unsigned int pages, extern void arch_kexec_pre_free_pages(void *vaddr, unsigned int pages); #define arch_kexec_pre_free_pages arch_kexec_pre_free_pages +void arch_kexec_protect_crashkres(void); +#define arch_kexec_protect_crashkres arch_kexec_protect_crashkres + +void arch_kexec_unprotect_crashkres(void); +#define arch_kexec_unprotect_crashkres arch_kexec_unprotect_crashkres + #ifdef CONFIG_KEXEC_FILE struct purgatory_info; int arch_kexec_apply_relocations_add(struct purgatory_info *pi, @@ -193,6 +199,12 @@ int arch_kexec_apply_relocations_add(struct purgatory_info *pi, const Elf_Shdr *relsec, const Elf_Shdr *symtab); #define arch_kexec_apply_relocations_add arch_kexec_apply_relocations_add + +void *arch_kexec_kernel_image_load(struct kimage *image); +#define arch_kexec_kernel_image_load arch_kexec_kernel_image_load + +int arch_kimage_file_post_load_cleanup(struct kimage *image); +#define arch_kimage_file_post_load_cleanup arch_kimage_file_post_load_cleanup #endif #endif diff --git a/arch/x86/include/asm/kvm-x86-ops.h b/arch/x86/include/asm/kvm-x86-ops.h index da47f60a4650..51f777071584 100644 --- a/arch/x86/include/asm/kvm-x86-ops.h +++ b/arch/x86/include/asm/kvm-x86-ops.h @@ -21,6 +21,7 @@ KVM_X86_OP(has_emulated_msr) KVM_X86_OP(vcpu_after_set_cpuid) KVM_X86_OP(vm_init) KVM_X86_OP_OPTIONAL(vm_destroy) +KVM_X86_OP_OPTIONAL_RET0(vcpu_precreate) KVM_X86_OP(vcpu_create) KVM_X86_OP(vcpu_free) KVM_X86_OP(vcpu_reset) @@ -87,7 +88,7 @@ KVM_X86_OP(deliver_interrupt) KVM_X86_OP_OPTIONAL(sync_pir_to_irr) KVM_X86_OP_OPTIONAL_RET0(set_tss_addr) KVM_X86_OP_OPTIONAL_RET0(set_identity_map_addr) -KVM_X86_OP(get_mt_mask) +KVM_X86_OP_OPTIONAL_RET0(get_mt_mask) KVM_X86_OP(load_mmu_pgd) KVM_X86_OP(has_wbinvd_exit) KVM_X86_OP(get_l2_tsc_offset) diff --git a/arch/x86/include/asm/kvm-x86-pmu-ops.h b/arch/x86/include/asm/kvm-x86-pmu-ops.h index fdfd8e06fee6..c17e3e96fc1d 100644 --- a/arch/x86/include/asm/kvm-x86-pmu-ops.h +++ b/arch/x86/include/asm/kvm-x86-pmu-ops.h @@ -12,7 +12,7 @@ BUILD_BUG_ON(1) * a NULL definition, for example if "static_call_cond()" will be used * at the call sites. */ -KVM_X86_PMU_OP(pmc_perf_hw_id) +KVM_X86_PMU_OP(hw_event_available) KVM_X86_PMU_OP(pmc_is_enabled) KVM_X86_PMU_OP(pmc_idx_to_pmc) KVM_X86_PMU_OP(rdpmc_ecx_to_pmc) diff --git a/arch/x86/include/asm/kvm_host.h b/arch/x86/include/asm/kvm_host.h index 9217bd6cf0d1..e8281d64a431 100644 --- a/arch/x86/include/asm/kvm_host.h +++ b/arch/x86/include/asm/kvm_host.h @@ -65,6 +65,9 @@ #define KVM_BUS_LOCK_DETECTION_VALID_MODE (KVM_BUS_LOCK_DETECTION_OFF | \ KVM_BUS_LOCK_DETECTION_EXIT) +#define KVM_X86_NOTIFY_VMEXIT_VALID_BITS (KVM_X86_NOTIFY_VMEXIT_ENABLED | \ + KVM_X86_NOTIFY_VMEXIT_USER) + /* x86-specific vcpu->requests bit members */ #define KVM_REQ_MIGRATE_TIMER KVM_ARCH_REQ(0) #define KVM_REQ_REPORT_TPR_ACCESS KVM_ARCH_REQ(1) @@ -126,7 +129,6 @@ #define INVALID_PAGE (~(hpa_t)0) #define VALID_PAGE(x) ((x) != INVALID_PAGE) -#define UNMAPPED_GVA (~(gpa_t)0) #define INVALID_GPA (~(gpa_t)0) /* KVM Hugepage definitions for x86 */ @@ -505,6 +507,7 @@ struct kvm_pmu { unsigned nr_arch_fixed_counters; unsigned available_event_types; u64 fixed_ctr_ctrl; + u64 fixed_ctr_ctrl_mask; u64 global_ctrl; u64 global_status; u64 counter_bitmask[2]; @@ -520,6 +523,21 @@ struct kvm_pmu { DECLARE_BITMAP(all_valid_pmc_idx, X86_PMC_IDX_MAX); DECLARE_BITMAP(pmc_in_use, X86_PMC_IDX_MAX); + u64 ds_area; + u64 pebs_enable; + u64 pebs_enable_mask; + u64 pebs_data_cfg; + u64 pebs_data_cfg_mask; + + /* + * If a guest counter is cross-mapped to host counter with different + * index, its PEBS capability will be temporarily disabled. + * + * The user should make sure that this mask is updated + * after disabling interrupts and before perf_guest_get_msrs(); + */ + u64 host_cross_mapped_mask; + /* * The gate to release perf_events not marked in * pmc_in_use only once in a vcpu time slice. @@ -644,7 +662,6 @@ struct kvm_vcpu_arch { u64 efer; u64 apic_base; struct kvm_lapic *apic; /* kernel irqchip context */ - bool apicv_active; bool load_eoi_exitmap_pending; DECLARE_BITMAP(ioapic_handled_vectors, 256); unsigned long apic_attention; @@ -695,7 +712,7 @@ struct kvm_vcpu_arch { struct kvm_mmu_memory_cache mmu_pte_list_desc_cache; struct kvm_mmu_memory_cache mmu_shadow_page_cache; - struct kvm_mmu_memory_cache mmu_gfn_array_cache; + struct kvm_mmu_memory_cache mmu_shadowed_info_cache; struct kvm_mmu_memory_cache mmu_page_header_cache; /* @@ -808,6 +825,7 @@ struct kvm_vcpu_arch { u64 mcg_ctl; u64 mcg_ext_ctl; u64 *mce_banks; + u64 *mci_ctl2_banks; /* Cache MMIO info */ u64 mmio_gva; @@ -1111,11 +1129,6 @@ enum kvm_apicv_inhibit { APICV_INHIBIT_REASON_PIT_REINJ, /* - * AVIC is inhibited because the guest has x2apic in its CPUID. - */ - APICV_INHIBIT_REASON_X2APIC, - - /* * AVIC is disabled because SEV doesn't support it. */ APICV_INHIBIT_REASON_SEV, @@ -1222,8 +1235,13 @@ struct kvm_arch { bool guest_can_read_msr_platform_info; bool exception_payload_enabled; + bool triple_fault_event; + bool bus_lock_detection_enabled; bool enable_pmu; + + u32 notify_window; + u32 notify_vmexit_flags; /* * If exit_on_emulation_error is set, and the in-kernel instruction * emulator fails to emulate an instruction, allow userspace @@ -1307,6 +1325,36 @@ struct kvm_arch { hpa_t hv_root_tdp; spinlock_t hv_root_tdp_lock; #endif + /* + * VM-scope maximum vCPU ID. Used to determine the size of structures + * that increase along with the maximum vCPU ID, in which case, using + * the global KVM_MAX_VCPU_IDS may lead to significant memory waste. + */ + u32 max_vcpu_ids; + + bool disable_nx_huge_pages; + + /* + * Memory caches used to allocate shadow pages when performing eager + * page splitting. No need for a shadowed_info_cache since eager page + * splitting only allocates direct shadow pages. + * + * Protected by kvm->slots_lock. + */ + struct kvm_mmu_memory_cache split_shadow_page_cache; + struct kvm_mmu_memory_cache split_page_header_cache; + + /* + * Memory cache used to allocate pte_list_desc structs while splitting + * huge pages. In the worst case, to split one huge page, 512 + * pte_list_desc structs are needed to add each lower level leaf sptep + * to the rmap plus 1 to extend the parent_ptes rmap of the lower level + * page table. + * + * Protected by kvm->slots_lock. + */ +#define SPLIT_DESC_CACHE_MIN_NR_OBJECTS (SPTE_ENT_PER_PAGE + 1) + struct kvm_mmu_memory_cache split_desc_cache; }; struct kvm_vm_stat { @@ -1367,6 +1415,7 @@ struct kvm_vcpu_stat { u64 preemption_reported; u64 preemption_other; u64 guest_mode; + u64 notify_window_exits; }; struct x86_instruction_info; @@ -1407,6 +1456,7 @@ struct kvm_x86_ops { void (*vm_destroy)(struct kvm *kvm); /* Create, but do not attach this VCPU */ + int (*vcpu_precreate)(struct kvm *kvm); int (*vcpu_create)(struct kvm_vcpu *vcpu); void (*vcpu_free)(struct kvm_vcpu *vcpu); void (*vcpu_reset)(struct kvm_vcpu *vcpu, bool init_event); @@ -1471,7 +1521,7 @@ struct kvm_x86_ops { u32 (*get_interrupt_shadow)(struct kvm_vcpu *vcpu); void (*patch_hypercall)(struct kvm_vcpu *vcpu, unsigned char *hypercall_addr); - void (*inject_irq)(struct kvm_vcpu *vcpu); + void (*inject_irq)(struct kvm_vcpu *vcpu, bool reinjected); void (*inject_nmi)(struct kvm_vcpu *vcpu); void (*queue_exception)(struct kvm_vcpu *vcpu); void (*cancel_injection)(struct kvm_vcpu *vcpu); @@ -1485,7 +1535,7 @@ struct kvm_x86_ops { bool (*check_apicv_inhibit_reasons)(enum kvm_apicv_inhibit reason); void (*refresh_apicv_exec_ctrl)(struct kvm_vcpu *vcpu); void (*hwapic_irr_update)(struct kvm_vcpu *vcpu, int max_irr); - void (*hwapic_isr_update)(struct kvm_vcpu *vcpu, int isr); + void (*hwapic_isr_update)(int isr); bool (*guest_apic_has_interrupt)(struct kvm_vcpu *vcpu); void (*load_eoi_exitmap)(struct kvm_vcpu *vcpu, u64 *eoi_exit_bitmap); void (*set_virtual_apic_mode)(struct kvm_vcpu *vcpu); @@ -1495,7 +1545,7 @@ struct kvm_x86_ops { int (*sync_pir_to_irr)(struct kvm_vcpu *vcpu); int (*set_tss_addr)(struct kvm *kvm, unsigned int addr); int (*set_identity_map_addr)(struct kvm *kvm, u64 ident_addr); - u64 (*get_mt_mask)(struct kvm_vcpu *vcpu, gfn_t gfn, bool is_mmio); + u8 (*get_mt_mask)(struct kvm_vcpu *vcpu, gfn_t gfn, bool is_mmio); void (*load_mmu_pgd)(struct kvm_vcpu *vcpu, hpa_t root_hpa, int root_level); @@ -1705,21 +1755,6 @@ extern bool tdp_enabled; u64 vcpu_tsc_khz(struct kvm_vcpu *vcpu); -/* control of guest tsc rate supported? */ -extern bool kvm_has_tsc_control; -/* maximum supported tsc_khz for guests */ -extern u32 kvm_max_guest_tsc_khz; -/* number of bits of the fractional part of the TSC scaling ratio */ -extern u8 kvm_tsc_scaling_ratio_frac_bits; -/* maximum allowed value of TSC scaling ratio */ -extern u64 kvm_max_tsc_scaling_ratio; -/* 1ull << kvm_tsc_scaling_ratio_frac_bits */ -extern u64 kvm_default_tsc_scaling_ratio; -/* bus lock detection supported? */ -extern bool kvm_has_bus_lock_exit; - -extern u64 kvm_mce_cap_supported; - /* * EMULTYPE_NO_DECODE - Set when re-emulating an instruction (after completing * userspace I/O) to indicate that the emulation context @@ -2060,6 +2095,7 @@ int memslot_rmap_alloc(struct kvm_memory_slot *slot, unsigned long npages); KVM_X86_QUIRK_LAPIC_MMIO_HOLE | \ KVM_X86_QUIRK_OUT_7E_INC_RIP | \ KVM_X86_QUIRK_MISC_ENABLE_NO_MWAIT | \ - KVM_X86_QUIRK_FIX_HYPERCALL_INSN) + KVM_X86_QUIRK_FIX_HYPERCALL_INSN | \ + KVM_X86_QUIRK_MWAIT_NEVER_UD_FAULTS) #endif /* _ASM_X86_KVM_HOST_H */ diff --git a/arch/x86/include/asm/linkage.h b/arch/x86/include/asm/linkage.h index 85865f1645bd..73ca20049835 100644 --- a/arch/x86/include/asm/linkage.h +++ b/arch/x86/include/asm/linkage.h @@ -19,19 +19,27 @@ #define __ALIGN_STR __stringify(__ALIGN) #endif +#if defined(CONFIG_RETHUNK) && !defined(__DISABLE_EXPORTS) && !defined(BUILD_VDSO) +#define RET jmp __x86_return_thunk +#else /* CONFIG_RETPOLINE */ #ifdef CONFIG_SLS #define RET ret; int3 #else #define RET ret #endif +#endif /* CONFIG_RETPOLINE */ #else /* __ASSEMBLY__ */ +#if defined(CONFIG_RETHUNK) && !defined(__DISABLE_EXPORTS) && !defined(BUILD_VDSO) +#define ASM_RET "jmp __x86_return_thunk\n\t" +#else /* CONFIG_RETPOLINE */ #ifdef CONFIG_SLS #define ASM_RET "ret; int3\n\t" #else #define ASM_RET "ret\n\t" #endif +#endif /* CONFIG_RETPOLINE */ #endif /* __ASSEMBLY__ */ diff --git a/arch/x86/include/asm/mem_encrypt.h b/arch/x86/include/asm/mem_encrypt.h index 88ceaf3648b3..72ca90552b6a 100644 --- a/arch/x86/include/asm/mem_encrypt.h +++ b/arch/x86/include/asm/mem_encrypt.h @@ -89,6 +89,8 @@ static inline void mem_encrypt_free_decrypted_mem(void) { } /* Architecture __weak replacement functions */ void __init mem_encrypt_init(void); +void add_encrypt_protection_map(void); + /* * The __sme_pa() and __sme_pa_nodebug() macros are meant for use when * writing to or comparing values from the cr3 register. Having the diff --git a/arch/x86/include/asm/msr-index.h b/arch/x86/include/asm/msr-index.h index d27e0581b777..182b2a1f71fe 100644 --- a/arch/x86/include/asm/msr-index.h +++ b/arch/x86/include/asm/msr-index.h @@ -51,6 +51,8 @@ #define SPEC_CTRL_STIBP BIT(SPEC_CTRL_STIBP_SHIFT) /* STIBP mask */ #define SPEC_CTRL_SSBD_SHIFT 2 /* Speculative Store Bypass Disable bit */ #define SPEC_CTRL_SSBD BIT(SPEC_CTRL_SSBD_SHIFT) /* Speculative Store Bypass Disable */ +#define SPEC_CTRL_RRSBA_DIS_S_SHIFT 6 /* Disable RRSBA behavior */ +#define SPEC_CTRL_RRSBA_DIS_S BIT(SPEC_CTRL_RRSBA_DIS_S_SHIFT) #define MSR_IA32_PRED_CMD 0x00000049 /* Prediction Command */ #define PRED_CMD_IBPB BIT(0) /* Indirect Branch Prediction Barrier */ @@ -93,6 +95,7 @@ #define MSR_IA32_ARCH_CAPABILITIES 0x0000010a #define ARCH_CAP_RDCL_NO BIT(0) /* Not susceptible to Meltdown */ #define ARCH_CAP_IBRS_ALL BIT(1) /* Enhanced IBRS support */ +#define ARCH_CAP_RSBA BIT(2) /* RET may use alternative branch predictors */ #define ARCH_CAP_SKIP_VMENTRY_L1DFLUSH BIT(3) /* Skip L1D flush on vmentry */ #define ARCH_CAP_SSB_NO BIT(4) /* * Not susceptible to Speculative Store Bypass @@ -140,6 +143,13 @@ * bit available to control VERW * behavior. */ +#define ARCH_CAP_RRSBA BIT(19) /* + * Indicates RET may use predictors + * other than the RSB. With eIBRS + * enabled predictions in kernel mode + * are restricted to targets in + * kernel. + */ #define MSR_IA32_FLUSH_CMD 0x0000010b #define L1D_FLUSH BIT(0) /* @@ -221,6 +231,12 @@ #define PERF_CAP_PT_IDX 16 #define MSR_PEBS_LD_LAT_THRESHOLD 0x000003f6 +#define PERF_CAP_PEBS_TRAP BIT_ULL(6) +#define PERF_CAP_ARCH_REG BIT_ULL(7) +#define PERF_CAP_PEBS_FORMAT 0xf00 +#define PERF_CAP_PEBS_BASELINE BIT_ULL(14) +#define PERF_CAP_PEBS_MASK (PERF_CAP_PEBS_TRAP | PERF_CAP_ARCH_REG | \ + PERF_CAP_PEBS_FORMAT | PERF_CAP_PEBS_BASELINE) #define MSR_IA32_RTIT_CTL 0x00000570 #define RTIT_CTL_TRACEEN BIT(0) @@ -378,6 +394,7 @@ #define MSR_TURBO_ACTIVATION_RATIO 0x0000064C #define MSR_PLATFORM_ENERGY_STATUS 0x0000064D +#define MSR_SECONDARY_TURBO_RATIO_LIMIT 0x00000650 #define MSR_PKG_WEIGHTED_CORE_C0_RES 0x00000658 #define MSR_PKG_ANY_CORE_C0_RES 0x00000659 @@ -567,6 +584,9 @@ /* Fam 17h MSRs */ #define MSR_F17H_IRPERF 0xc00000e9 +#define MSR_ZEN2_SPECTRAL_CHICKEN 0xc00110e3 +#define MSR_ZEN2_SPECTRAL_CHICKEN_BIT BIT_ULL(1) + /* Fam 16h MSRs */ #define MSR_F16H_L2I_PERF_CTL 0xc0010230 #define MSR_F16H_L2I_PERF_CTR 0xc0010231 @@ -1005,6 +1025,7 @@ #define MSR_IA32_VMX_TRUE_EXIT_CTLS 0x0000048f #define MSR_IA32_VMX_TRUE_ENTRY_CTLS 0x00000490 #define MSR_IA32_VMX_VMFUNC 0x00000491 +#define MSR_IA32_VMX_PROCBASED_CTLS3 0x00000492 /* VMX_BASIC bits and bitmasks */ #define VMX_BASIC_VMCS_SIZE_SHIFT 32 diff --git a/arch/x86/include/asm/mwait.h b/arch/x86/include/asm/mwait.h index 29dd27b5a339..3a8fdf881313 100644 --- a/arch/x86/include/asm/mwait.h +++ b/arch/x86/include/asm/mwait.h @@ -13,6 +13,7 @@ #define MWAIT_SUBSTATE_SIZE 4 #define MWAIT_HINT2CSTATE(hint) (((hint) >> MWAIT_SUBSTATE_SIZE) & MWAIT_CSTATE_MASK) #define MWAIT_HINT2SUBSTATE(hint) ((hint) & MWAIT_CSTATE_MASK) +#define MWAIT_C1_SUBSTATE_MASK 0xf0 #define CPUID_MWAIT_LEAF 5 #define CPUID5_ECX_EXTENSIONS_SUPPORTED 0x1 diff --git a/arch/x86/include/asm/nospec-branch.h b/arch/x86/include/asm/nospec-branch.h index da251a5645b0..cba942006ffe 100644 --- a/arch/x86/include/asm/nospec-branch.h +++ b/arch/x86/include/asm/nospec-branch.h @@ -11,6 +11,7 @@ #include <asm/cpufeatures.h> #include <asm/msr-index.h> #include <asm/unwind_hints.h> +#include <asm/percpu.h> #define RETPOLINE_THUNK_SIZE 32 @@ -76,25 +77,54 @@ .endm /* + * (ab)use RETPOLINE_SAFE on RET to annotate away 'bare' RET instructions + * vs RETBleed validation. + */ +#define ANNOTATE_UNRET_SAFE ANNOTATE_RETPOLINE_SAFE + +/* + * Abuse ANNOTATE_RETPOLINE_SAFE on a NOP to indicate UNRET_END, should + * eventually turn into it's own annotation. + */ +.macro ANNOTATE_UNRET_END +#ifdef CONFIG_DEBUG_ENTRY + ANNOTATE_RETPOLINE_SAFE + nop +#endif +.endm + +/* + * Equivalent to -mindirect-branch-cs-prefix; emit the 5 byte jmp/call + * to the retpoline thunk with a CS prefix when the register requires + * a RAX prefix byte to encode. Also see apply_retpolines(). + */ +.macro __CS_PREFIX reg:req + .irp rs,r8,r9,r10,r11,r12,r13,r14,r15 + .ifc \reg,\rs + .byte 0x2e + .endif + .endr +.endm + +/* * JMP_NOSPEC and CALL_NOSPEC macros can be used instead of a simple * indirect jmp/call which may be susceptible to the Spectre variant 2 * attack. */ .macro JMP_NOSPEC reg:req #ifdef CONFIG_RETPOLINE - ALTERNATIVE_2 __stringify(ANNOTATE_RETPOLINE_SAFE; jmp *%\reg), \ - __stringify(jmp __x86_indirect_thunk_\reg), X86_FEATURE_RETPOLINE, \ - __stringify(lfence; ANNOTATE_RETPOLINE_SAFE; jmp *%\reg), X86_FEATURE_RETPOLINE_LFENCE + __CS_PREFIX \reg + jmp __x86_indirect_thunk_\reg #else jmp *%\reg + int3 #endif .endm .macro CALL_NOSPEC reg:req #ifdef CONFIG_RETPOLINE - ALTERNATIVE_2 __stringify(ANNOTATE_RETPOLINE_SAFE; call *%\reg), \ - __stringify(call __x86_indirect_thunk_\reg), X86_FEATURE_RETPOLINE, \ - __stringify(lfence; ANNOTATE_RETPOLINE_SAFE; call *%\reg), X86_FEATURE_RETPOLINE_LFENCE + __CS_PREFIX \reg + call __x86_indirect_thunk_\reg #else call *%\reg #endif @@ -105,10 +135,34 @@ * monstrosity above, manually. */ .macro FILL_RETURN_BUFFER reg:req nr:req ftr:req -#ifdef CONFIG_RETPOLINE ALTERNATIVE "jmp .Lskip_rsb_\@", "", \ftr __FILL_RETURN_BUFFER(\reg,\nr,%_ASM_SP) .Lskip_rsb_\@: +.endm + +#ifdef CONFIG_CPU_UNRET_ENTRY +#define CALL_ZEN_UNTRAIN_RET "call zen_untrain_ret" +#else +#define CALL_ZEN_UNTRAIN_RET "" +#endif + +/* + * Mitigate RETBleed for AMD/Hygon Zen uarch. Requires KERNEL CR3 because the + * return thunk isn't mapped into the userspace tables (then again, AMD + * typically has NO_MELTDOWN). + * + * While zen_untrain_ret() doesn't clobber anything but requires stack, + * entry_ibpb() will clobber AX, CX, DX. + * + * As such, this must be placed after every *SWITCH_TO_KERNEL_CR3 at a point + * where we have a stack but before any RET instruction. + */ +.macro UNTRAIN_RET +#if defined(CONFIG_CPU_UNRET_ENTRY) || defined(CONFIG_CPU_IBPB_ENTRY) + ANNOTATE_UNRET_END + ALTERNATIVE_2 "", \ + CALL_ZEN_UNTRAIN_RET, X86_FEATURE_UNRET, \ + "call entry_ibpb", X86_FEATURE_ENTRY_IBPB #endif .endm @@ -120,17 +174,20 @@ _ASM_PTR " 999b\n\t" \ ".popsection\n\t" -#ifdef CONFIG_RETPOLINE - typedef u8 retpoline_thunk_t[RETPOLINE_THUNK_SIZE]; +extern retpoline_thunk_t __x86_indirect_thunk_array[]; + +extern void __x86_return_thunk(void); +extern void zen_untrain_ret(void); +extern void entry_ibpb(void); + +#ifdef CONFIG_RETPOLINE #define GEN(reg) \ extern retpoline_thunk_t __x86_indirect_thunk_ ## reg; #include <asm/GEN-for-each-reg.h> #undef GEN -extern retpoline_thunk_t __x86_indirect_thunk_array[]; - #ifdef CONFIG_X86_64 /* @@ -193,6 +250,7 @@ enum spectre_v2_mitigation { SPECTRE_V2_EIBRS, SPECTRE_V2_EIBRS_RETPOLINE, SPECTRE_V2_EIBRS_LFENCE, + SPECTRE_V2_IBRS, }; /* The indirect branch speculation control variants */ @@ -235,6 +293,9 @@ static inline void indirect_branch_prediction_barrier(void) /* The Intel SPEC CTRL MSR base value cache */ extern u64 x86_spec_ctrl_base; +DECLARE_PER_CPU(u64, x86_spec_ctrl_current); +extern void write_spec_ctrl_current(u64 val, bool force); +extern u64 spec_ctrl_current(void); /* * With retpoline, we must use IBRS to restrict branch prediction @@ -244,18 +305,18 @@ extern u64 x86_spec_ctrl_base; */ #define firmware_restrict_branch_speculation_start() \ do { \ - u64 val = x86_spec_ctrl_base | SPEC_CTRL_IBRS; \ - \ preempt_disable(); \ - alternative_msr_write(MSR_IA32_SPEC_CTRL, val, \ + alternative_msr_write(MSR_IA32_SPEC_CTRL, \ + spec_ctrl_current() | SPEC_CTRL_IBRS, \ X86_FEATURE_USE_IBRS_FW); \ + alternative_msr_write(MSR_IA32_PRED_CMD, PRED_CMD_IBPB, \ + X86_FEATURE_USE_IBPB_FW); \ } while (0) #define firmware_restrict_branch_speculation_end() \ do { \ - u64 val = x86_spec_ctrl_base; \ - \ - alternative_msr_write(MSR_IA32_SPEC_CTRL, val, \ + alternative_msr_write(MSR_IA32_SPEC_CTRL, \ + spec_ctrl_current(), \ X86_FEATURE_USE_IBRS_FW); \ preempt_enable(); \ } while (0) diff --git a/arch/x86/include/asm/pci.h b/arch/x86/include/asm/pci.h index f3fd5928bcbb..736793d65bcb 100644 --- a/arch/x86/include/asm/pci.h +++ b/arch/x86/include/asm/pci.h @@ -105,9 +105,6 @@ static inline void early_quirks(void) { } extern void pci_iommu_alloc(void); -/* generic pci stuff */ -#include <asm-generic/pci.h> - #ifdef CONFIG_NUMA /* Returns the node based on pci bus */ static inline int __pcibus_to_node(const struct pci_bus *bus) diff --git a/arch/x86/include/asm/perf_event.h b/arch/x86/include/asm/perf_event.h index 409725e86f42..f6fc8dd51ef4 100644 --- a/arch/x86/include/asm/perf_event.h +++ b/arch/x86/include/asm/perf_event.h @@ -89,6 +89,19 @@ #define AMD64_RAW_EVENT_MASK_NB \ (AMD64_EVENTSEL_EVENT | \ ARCH_PERFMON_EVENTSEL_UMASK) + +#define AMD64_PERFMON_V2_EVENTSEL_EVENT_NB \ + (AMD64_EVENTSEL_EVENT | \ + GENMASK_ULL(37, 36)) + +#define AMD64_PERFMON_V2_EVENTSEL_UMASK_NB \ + (ARCH_PERFMON_EVENTSEL_UMASK | \ + GENMASK_ULL(27, 24)) + +#define AMD64_PERFMON_V2_RAW_EVENT_MASK_NB \ + (AMD64_PERFMON_V2_EVENTSEL_EVENT_NB | \ + AMD64_PERFMON_V2_EVENTSEL_UMASK_NB) + #define AMD64_NUM_COUNTERS 4 #define AMD64_NUM_COUNTERS_CORE 6 #define AMD64_NUM_COUNTERS_NB 4 @@ -194,6 +207,9 @@ union cpuid_0x80000022_ebx { struct { /* Number of Core Performance Counters */ unsigned int num_core_pmc:4; + unsigned int reserved:6; + /* Number of Data Fabric Counters */ + unsigned int num_df_pmc:6; } split; unsigned int full; }; @@ -206,6 +222,7 @@ struct x86_pmu_capability { int bit_width_fixed; unsigned int events_mask; int events_mask_len; + unsigned int pebs_ept :1; }; /* @@ -504,6 +521,7 @@ struct x86_pmu_lbr { }; extern void perf_get_x86_pmu_capability(struct x86_pmu_capability *cap); +extern u64 perf_get_hw_event_config(int hw_event); extern void perf_check_microcode(void); extern void perf_clear_dirty_counters(void); extern int x86_perf_rdpmc_index(struct perf_event *event); @@ -513,15 +531,20 @@ static inline void perf_get_x86_pmu_capability(struct x86_pmu_capability *cap) memset(cap, 0, sizeof(*cap)); } +static inline u64 perf_get_hw_event_config(int hw_event) +{ + return 0; +} + static inline void perf_events_lapic_init(void) { } static inline void perf_check_microcode(void) { } #endif #if defined(CONFIG_PERF_EVENTS) && defined(CONFIG_CPU_SUP_INTEL) -extern struct perf_guest_switch_msr *perf_guest_get_msrs(int *nr); +extern struct perf_guest_switch_msr *perf_guest_get_msrs(int *nr, void *data); extern int x86_perf_get_lbr(struct x86_pmu_lbr *lbr); #else -struct perf_guest_switch_msr *perf_guest_get_msrs(int *nr); +struct perf_guest_switch_msr *perf_guest_get_msrs(int *nr, void *data); static inline int x86_perf_get_lbr(struct x86_pmu_lbr *lbr) { return -1; diff --git a/arch/x86/include/asm/pgtable_types.h b/arch/x86/include/asm/pgtable_types.h index bdaf8391e2e0..aa174fed3a71 100644 --- a/arch/x86/include/asm/pgtable_types.h +++ b/arch/x86/include/asm/pgtable_types.h @@ -230,25 +230,6 @@ enum page_cache_mode { #endif /* __ASSEMBLY__ */ -/* xwr */ -#define __P000 PAGE_NONE -#define __P001 PAGE_READONLY -#define __P010 PAGE_COPY -#define __P011 PAGE_COPY -#define __P100 PAGE_READONLY_EXEC -#define __P101 PAGE_READONLY_EXEC -#define __P110 PAGE_COPY_EXEC -#define __P111 PAGE_COPY_EXEC - -#define __S000 PAGE_NONE -#define __S001 PAGE_READONLY -#define __S010 PAGE_SHARED -#define __S011 PAGE_SHARED -#define __S100 PAGE_READONLY_EXEC -#define __S101 PAGE_READONLY_EXEC -#define __S110 PAGE_SHARED_EXEC -#define __S111 PAGE_SHARED_EXEC - /* * early identity mapping pte attrib macros. */ diff --git a/arch/x86/include/asm/setup.h b/arch/x86/include/asm/setup.h index f8b9ee97a891..f37cbff7354c 100644 --- a/arch/x86/include/asm/setup.h +++ b/arch/x86/include/asm/setup.h @@ -120,6 +120,9 @@ void *extend_brk(size_t size, size_t align); static char __brk_##name[size] extern void probe_roms(void); + +void clear_bss(void); + #ifdef __i386__ asmlinkage void __init i386_start_kernel(void); diff --git a/arch/x86/include/asm/sev.h b/arch/x86/include/asm/sev.h index 19514524f0f8..4a23e52fe0ee 100644 --- a/arch/x86/include/asm/sev.h +++ b/arch/x86/include/asm/sev.h @@ -72,7 +72,6 @@ static inline u64 lower_bits(u64 val, unsigned int bits) struct real_mode_header; enum stack_type; -struct ghcb; /* Early IDT entry points for #VC handler */ extern void vc_no_ghcb(void); @@ -156,11 +155,7 @@ static __always_inline void sev_es_nmi_complete(void) __sev_es_nmi_complete(); } extern int __init sev_es_efi_map_ghcbs(pgd_t *pgd); -extern enum es_result sev_es_ghcb_hv_call(struct ghcb *ghcb, - bool set_ghcb_msr, - struct es_em_ctxt *ctxt, - u64 exit_code, u64 exit_info_1, - u64 exit_info_2); + static inline int rmpadjust(unsigned long vaddr, bool rmp_psize, unsigned long attrs) { int rc; diff --git a/arch/x86/include/asm/sgx.h b/arch/x86/include/asm/sgx.h index 3f9334ef67cd..eae20fa52b93 100644 --- a/arch/x86/include/asm/sgx.h +++ b/arch/x86/include/asm/sgx.h @@ -65,17 +65,22 @@ enum sgx_encls_function { /** * enum sgx_return_code - The return code type for ENCLS, ENCLU and ENCLV + * %SGX_EPC_PAGE_CONFLICT: Page is being written by other ENCLS function. * %SGX_NOT_TRACKED: Previous ETRACK's shootdown sequence has not * been completed yet. * %SGX_CHILD_PRESENT SECS has child pages present in the EPC. * %SGX_INVALID_EINITTOKEN: EINITTOKEN is invalid and enclave signer's * public key does not match IA32_SGXLEPUBKEYHASH. + * %SGX_PAGE_NOT_MODIFIABLE: The EPC page cannot be modified because it + * is in the PENDING or MODIFIED state. * %SGX_UNMASKED_EVENT: An unmasked event, e.g. INTR, was received */ enum sgx_return_code { + SGX_EPC_PAGE_CONFLICT = 7, SGX_NOT_TRACKED = 11, SGX_CHILD_PRESENT = 13, SGX_INVALID_EINITTOKEN = 16, + SGX_PAGE_NOT_MODIFIABLE = 20, SGX_UNMASKED_EVENT = 128, }; @@ -234,6 +239,9 @@ struct sgx_pageinfo { * %SGX_PAGE_TYPE_REG: a regular page * %SGX_PAGE_TYPE_VA: a VA page * %SGX_PAGE_TYPE_TRIM: a page in trimmed state + * + * Make sure when making changes to this enum that its values can still fit + * in the bitfield within &struct sgx_encl_page */ enum sgx_page_type { SGX_PAGE_TYPE_SECS, diff --git a/arch/x86/include/asm/special_insns.h b/arch/x86/include/asm/special_insns.h index 45b18eb94fa1..35f709f619fb 100644 --- a/arch/x86/include/asm/special_insns.h +++ b/arch/x86/include/asm/special_insns.h @@ -295,6 +295,15 @@ static inline int enqcmds(void __iomem *dst, const void *src) return 0; } +static inline void tile_release(void) +{ + /* + * Instruction opcode for TILERELEASE; supported in binutils + * version >= 2.36. + */ + asm volatile(".byte 0xc4, 0xe2, 0x78, 0x49, 0xc0"); +} + #endif /* __KERNEL__ */ #endif /* _ASM_X86_SPECIAL_INSNS_H */ diff --git a/arch/x86/include/asm/static_call.h b/arch/x86/include/asm/static_call.h index 2d8dacd02643..343b722ccaf2 100644 --- a/arch/x86/include/asm/static_call.h +++ b/arch/x86/include/asm/static_call.h @@ -21,6 +21,16 @@ * relative displacement across sections. */ +/* + * The trampoline is 8 bytes and of the general form: + * + * jmp.d32 \func + * ud1 %esp, %ecx + * + * That trailing #UD provides both a speculation stop and serves as a unique + * 3 byte signature identifying static call trampolines. Also see tramp_ud[] + * and __static_call_fixup(). + */ #define __ARCH_DEFINE_STATIC_CALL_TRAMP(name, insns) \ asm(".pushsection .static_call.text, \"ax\" \n" \ ".align 4 \n" \ @@ -28,7 +38,7 @@ STATIC_CALL_TRAMP_STR(name) ": \n" \ ANNOTATE_NOENDBR \ insns " \n" \ - ".byte 0x53, 0x43, 0x54 \n" \ + ".byte 0x0f, 0xb9, 0xcc \n" \ ".type " STATIC_CALL_TRAMP_STR(name) ", @function \n" \ ".size " STATIC_CALL_TRAMP_STR(name) ", . - " STATIC_CALL_TRAMP_STR(name) " \n" \ ".popsection \n") @@ -36,8 +46,13 @@ #define ARCH_DEFINE_STATIC_CALL_TRAMP(name, func) \ __ARCH_DEFINE_STATIC_CALL_TRAMP(name, ".byte 0xe9; .long " #func " - (. + 4)") +#ifdef CONFIG_RETHUNK +#define ARCH_DEFINE_STATIC_CALL_NULL_TRAMP(name) \ + __ARCH_DEFINE_STATIC_CALL_TRAMP(name, "jmp __x86_return_thunk") +#else #define ARCH_DEFINE_STATIC_CALL_NULL_TRAMP(name) \ __ARCH_DEFINE_STATIC_CALL_TRAMP(name, "ret; int3; nop; nop; nop") +#endif #define ARCH_DEFINE_STATIC_CALL_RET0_TRAMP(name) \ ARCH_DEFINE_STATIC_CALL_TRAMP(name, __static_call_return0) @@ -48,4 +63,6 @@ ".long " STATIC_CALL_KEY_STR(name) " - . \n" \ ".popsection \n") +extern bool __static_call_fixup(void *tramp, u8 op, void *dest); + #endif /* _ASM_STATIC_CALL_H */ diff --git a/arch/x86/include/asm/svm.h b/arch/x86/include/asm/svm.h index 1b07fba11704..0361626841bc 100644 --- a/arch/x86/include/asm/svm.h +++ b/arch/x86/include/asm/svm.h @@ -195,6 +195,9 @@ struct __attribute__ ((__packed__)) vmcb_control_area { #define AVIC_ENABLE_SHIFT 31 #define AVIC_ENABLE_MASK (1 << AVIC_ENABLE_SHIFT) +#define X2APIC_MODE_SHIFT 30 +#define X2APIC_MODE_MASK (1 << X2APIC_MODE_SHIFT) + #define LBR_CTL_ENABLE_MASK BIT_ULL(0) #define VIRTUAL_VMLOAD_VMSAVE_ENABLE_MASK BIT_ULL(1) @@ -253,12 +256,19 @@ enum avic_ipi_failure_cause { AVIC_IPI_FAILURE_INVALID_BACKING_PAGE, }; +#define AVIC_PHYSICAL_MAX_INDEX_MASK GENMASK_ULL(9, 0) + +/* + * For AVIC, the max index allowed for physical APIC ID + * table is 0xff (255). + */ +#define AVIC_MAX_PHYSICAL_ID 0XFEULL /* - * 0xff is broadcast, so the max index allowed for physical APIC ID - * table is 0xfe. APIC IDs above 0xff are reserved. + * For x2AVIC, the max index allowed for physical APIC ID + * table is 0x1ff (511). */ -#define AVIC_MAX_PHYSICAL_ID_COUNT 0xff +#define X2AVIC_MAX_PHYSICAL_ID 0x1FFUL #define AVIC_HPA_MASK ~((0xFFFULL << 52) | 0xFFF) #define VMCB_AVIC_APIC_BAR_MASK 0xFFFFFFFFFF000ULL diff --git a/arch/x86/include/asm/tlb.h b/arch/x86/include/asm/tlb.h index 1bfe979bb9bc..580636cdc257 100644 --- a/arch/x86/include/asm/tlb.h +++ b/arch/x86/include/asm/tlb.h @@ -2,9 +2,6 @@ #ifndef _ASM_X86_TLB_H #define _ASM_X86_TLB_H -#define tlb_start_vma(tlb, vma) do { } while (0) -#define tlb_end_vma(tlb, vma) do { } while (0) - #define tlb_flush tlb_flush static inline void tlb_flush(struct mmu_gather *tlb); diff --git a/arch/x86/include/asm/tlbflush.h b/arch/x86/include/asm/tlbflush.h index 4af5579c7ef7..cda3118f3b27 100644 --- a/arch/x86/include/asm/tlbflush.h +++ b/arch/x86/include/asm/tlbflush.h @@ -16,6 +16,7 @@ void __flush_tlb_all(void); #define TLB_FLUSH_ALL -1UL +#define TLB_GENERATION_INVALID 0 void cr4_update_irqsoff(unsigned long set, unsigned long clear); unsigned long cr4_read_shadow(void); diff --git a/arch/x86/include/asm/unwind_hints.h b/arch/x86/include/asm/unwind_hints.h index 8b33674288ea..f66fbe6537dd 100644 --- a/arch/x86/include/asm/unwind_hints.h +++ b/arch/x86/include/asm/unwind_hints.h @@ -8,7 +8,11 @@ #ifdef __ASSEMBLY__ .macro UNWIND_HINT_EMPTY - UNWIND_HINT sp_reg=ORC_REG_UNDEFINED type=UNWIND_HINT_TYPE_CALL end=1 + UNWIND_HINT type=UNWIND_HINT_TYPE_CALL end=1 +.endm + +.macro UNWIND_HINT_ENTRY + UNWIND_HINT type=UNWIND_HINT_TYPE_ENTRY end=1 .endm .macro UNWIND_HINT_REGS base=%rsp offset=0 indirect=0 extra=1 partial=0 @@ -52,6 +56,14 @@ UNWIND_HINT sp_reg=ORC_REG_SP sp_offset=8 type=UNWIND_HINT_TYPE_FUNC .endm +.macro UNWIND_HINT_SAVE + UNWIND_HINT type=UNWIND_HINT_TYPE_SAVE +.endm + +.macro UNWIND_HINT_RESTORE + UNWIND_HINT type=UNWIND_HINT_TYPE_RESTORE +.endm + #else #define UNWIND_HINT_FUNC \ diff --git a/arch/x86/include/asm/vmx.h b/arch/x86/include/asm/vmx.h index 6c343c6a1855..c371ef695fcc 100644 --- a/arch/x86/include/asm/vmx.h +++ b/arch/x86/include/asm/vmx.h @@ -31,6 +31,7 @@ #define CPU_BASED_RDTSC_EXITING VMCS_CONTROL_BIT(RDTSC_EXITING) #define CPU_BASED_CR3_LOAD_EXITING VMCS_CONTROL_BIT(CR3_LOAD_EXITING) #define CPU_BASED_CR3_STORE_EXITING VMCS_CONTROL_BIT(CR3_STORE_EXITING) +#define CPU_BASED_ACTIVATE_TERTIARY_CONTROLS VMCS_CONTROL_BIT(TERTIARY_CONTROLS) #define CPU_BASED_CR8_LOAD_EXITING VMCS_CONTROL_BIT(CR8_LOAD_EXITING) #define CPU_BASED_CR8_STORE_EXITING VMCS_CONTROL_BIT(CR8_STORE_EXITING) #define CPU_BASED_TPR_SHADOW VMCS_CONTROL_BIT(VIRTUAL_TPR) @@ -74,6 +75,12 @@ #define SECONDARY_EXEC_TSC_SCALING VMCS_CONTROL_BIT(TSC_SCALING) #define SECONDARY_EXEC_ENABLE_USR_WAIT_PAUSE VMCS_CONTROL_BIT(USR_WAIT_PAUSE) #define SECONDARY_EXEC_BUS_LOCK_DETECTION VMCS_CONTROL_BIT(BUS_LOCK_DETECTION) +#define SECONDARY_EXEC_NOTIFY_VM_EXITING VMCS_CONTROL_BIT(NOTIFY_VM_EXITING) + +/* + * Definitions of Tertiary Processor-Based VM-Execution Controls. + */ +#define TERTIARY_EXEC_IPI_VIRT VMCS_CONTROL_BIT(IPI_VIRT) #define PIN_BASED_EXT_INTR_MASK VMCS_CONTROL_BIT(INTR_EXITING) #define PIN_BASED_NMI_EXITING VMCS_CONTROL_BIT(NMI_EXITING) @@ -158,6 +165,7 @@ static inline int vmx_misc_mseg_revid(u64 vmx_misc) enum vmcs_field { VIRTUAL_PROCESSOR_ID = 0x00000000, POSTED_INTR_NV = 0x00000002, + LAST_PID_POINTER_INDEX = 0x00000008, GUEST_ES_SELECTOR = 0x00000800, GUEST_CS_SELECTOR = 0x00000802, GUEST_SS_SELECTOR = 0x00000804, @@ -221,6 +229,10 @@ enum vmcs_field { ENCLS_EXITING_BITMAP_HIGH = 0x0000202F, TSC_MULTIPLIER = 0x00002032, TSC_MULTIPLIER_HIGH = 0x00002033, + TERTIARY_VM_EXEC_CONTROL = 0x00002034, + TERTIARY_VM_EXEC_CONTROL_HIGH = 0x00002035, + PID_POINTER_TABLE = 0x00002042, + PID_POINTER_TABLE_HIGH = 0x00002043, GUEST_PHYSICAL_ADDRESS = 0x00002400, GUEST_PHYSICAL_ADDRESS_HIGH = 0x00002401, VMCS_LINK_POINTER = 0x00002800, @@ -269,6 +281,7 @@ enum vmcs_field { SECONDARY_VM_EXEC_CONTROL = 0x0000401e, PLE_GAP = 0x00004020, PLE_WINDOW = 0x00004022, + NOTIFY_WINDOW = 0x00004024, VM_INSTRUCTION_ERROR = 0x00004400, VM_EXIT_REASON = 0x00004402, VM_EXIT_INTR_INFO = 0x00004404, @@ -554,6 +567,11 @@ enum vm_entry_failure_code { #define EPT_VIOLATION_GVA_TRANSLATED (1 << EPT_VIOLATION_GVA_TRANSLATED_BIT) /* + * Exit Qualifications for NOTIFY VM EXIT + */ +#define NOTIFY_VM_CONTEXT_INVALID BIT(0) + +/* * VM-instruction error numbers */ enum vm_instruction_error_number { diff --git a/arch/x86/include/asm/vmxfeatures.h b/arch/x86/include/asm/vmxfeatures.h index d9a74681a77d..c6a7eed03914 100644 --- a/arch/x86/include/asm/vmxfeatures.h +++ b/arch/x86/include/asm/vmxfeatures.h @@ -5,7 +5,7 @@ /* * Defines VMX CPU feature bits */ -#define NVMXINTS 3 /* N 32-bit words worth of info */ +#define NVMXINTS 5 /* N 32-bit words worth of info */ /* * Note: If the comment begins with a quoted string, that string is used @@ -43,6 +43,7 @@ #define VMX_FEATURE_RDTSC_EXITING ( 1*32+ 12) /* "" VM-Exit on RDTSC */ #define VMX_FEATURE_CR3_LOAD_EXITING ( 1*32+ 15) /* "" VM-Exit on writes to CR3 */ #define VMX_FEATURE_CR3_STORE_EXITING ( 1*32+ 16) /* "" VM-Exit on reads from CR3 */ +#define VMX_FEATURE_TERTIARY_CONTROLS ( 1*32+ 17) /* "" Enable Tertiary VM-Execution Controls */ #define VMX_FEATURE_CR8_LOAD_EXITING ( 1*32+ 19) /* "" VM-Exit on writes to CR8 */ #define VMX_FEATURE_CR8_STORE_EXITING ( 1*32+ 20) /* "" VM-Exit on reads from CR8 */ #define VMX_FEATURE_VIRTUAL_TPR ( 1*32+ 21) /* "vtpr" TPR virtualization, a.k.a. TPR shadow */ @@ -84,5 +85,8 @@ #define VMX_FEATURE_USR_WAIT_PAUSE ( 2*32+ 26) /* Enable TPAUSE, UMONITOR, UMWAIT in guest */ #define VMX_FEATURE_ENCLV_EXITING ( 2*32+ 28) /* "" VM-Exit on ENCLV (leaf dependent) */ #define VMX_FEATURE_BUS_LOCK_DETECTION ( 2*32+ 30) /* "" VM-Exit when bus lock caused */ +#define VMX_FEATURE_NOTIFY_VM_EXITING ( 2*32+ 31) /* VM-Exit when no event windows after notify window */ +/* Tertiary Processor-Based VM-Execution Controls, word 3 */ +#define VMX_FEATURE_IPI_VIRT ( 3*32+ 4) /* Enable IPI virtualization */ #endif /* _ASM_X86_VMXFEATURES_H */ diff --git a/arch/x86/include/uapi/asm/bootparam.h b/arch/x86/include/uapi/asm/bootparam.h index bea5cdcdf532..01d19fc22346 100644 --- a/arch/x86/include/uapi/asm/bootparam.h +++ b/arch/x86/include/uapi/asm/bootparam.h @@ -11,11 +11,12 @@ #define SETUP_APPLE_PROPERTIES 5 #define SETUP_JAILHOUSE 6 #define SETUP_CC_BLOB 7 +#define SETUP_IMA 8 +#define SETUP_RNG_SEED 9 +#define SETUP_ENUM_MAX SETUP_RNG_SEED #define SETUP_INDIRECT (1<<31) - -/* SETUP_INDIRECT | max(SETUP_*) */ -#define SETUP_TYPE_MAX (SETUP_INDIRECT | SETUP_JAILHOUSE) +#define SETUP_TYPE_MAX (SETUP_ENUM_MAX | SETUP_INDIRECT) /* ram_size flags */ #define RAMDISK_IMAGE_START_MASK 0x07FF @@ -52,7 +53,7 @@ struct setup_data { __u64 next; __u32 type; __u32 len; - __u8 data[0]; + __u8 data[]; }; /* extensible setup indirect data node */ @@ -172,6 +173,14 @@ struct jailhouse_setup_data { } __attribute__((packed)) v2; } __attribute__((packed)); +/* + * IMA buffer setup data information from the previous kernel during kexec + */ +struct ima_setup_data { + __u64 addr; + __u64 size; +} __attribute__((packed)); + /* The so-called "zeropage" */ struct boot_params { struct screen_info screen_info; /* 0x000 */ diff --git a/arch/x86/include/uapi/asm/kvm.h b/arch/x86/include/uapi/asm/kvm.h index 21614807a2cb..46de10a809ec 100644 --- a/arch/x86/include/uapi/asm/kvm.h +++ b/arch/x86/include/uapi/asm/kvm.h @@ -198,13 +198,13 @@ struct kvm_msrs { __u32 nmsrs; /* number of msrs in entries */ __u32 pad; - struct kvm_msr_entry entries[0]; + struct kvm_msr_entry entries[]; }; /* for KVM_GET_MSR_INDEX_LIST */ struct kvm_msr_list { __u32 nmsrs; /* number of msrs in entries */ - __u32 indices[0]; + __u32 indices[]; }; /* Maximum size of any access bitmap in bytes */ @@ -241,7 +241,7 @@ struct kvm_cpuid_entry { struct kvm_cpuid { __u32 nent; __u32 padding; - struct kvm_cpuid_entry entries[0]; + struct kvm_cpuid_entry entries[]; }; struct kvm_cpuid_entry2 { @@ -263,7 +263,7 @@ struct kvm_cpuid_entry2 { struct kvm_cpuid2 { __u32 nent; __u32 padding; - struct kvm_cpuid_entry2 entries[0]; + struct kvm_cpuid_entry2 entries[]; }; /* for KVM_GET_PIT and KVM_SET_PIT */ @@ -306,7 +306,8 @@ struct kvm_pit_state { struct kvm_pit_channel_state channels[3]; }; -#define KVM_PIT_FLAGS_HPET_LEGACY 0x00000001 +#define KVM_PIT_FLAGS_HPET_LEGACY 0x00000001 +#define KVM_PIT_FLAGS_SPEAKER_DATA_ON 0x00000002 struct kvm_pit_state2 { struct kvm_pit_channel_state channels[3]; @@ -325,6 +326,7 @@ struct kvm_reinject_control { #define KVM_VCPUEVENT_VALID_SHADOW 0x00000004 #define KVM_VCPUEVENT_VALID_SMM 0x00000008 #define KVM_VCPUEVENT_VALID_PAYLOAD 0x00000010 +#define KVM_VCPUEVENT_VALID_TRIPLE_FAULT 0x00000020 /* Interrupt shadow states */ #define KVM_X86_SHADOW_INT_MOV_SS 0x01 @@ -359,7 +361,10 @@ struct kvm_vcpu_events { __u8 smm_inside_nmi; __u8 latched_init; } smi; - __u8 reserved[27]; + struct { + __u8 pending; + } triple_fault; + __u8 reserved[26]; __u8 exception_has_payload; __u64 exception_payload; }; @@ -389,7 +394,7 @@ struct kvm_xsave { * the contents of CPUID leaf 0xD on the host. */ __u32 region[1024]; - __u32 extra[0]; + __u32 extra[]; }; #define KVM_MAX_XCRS 16 @@ -434,6 +439,7 @@ struct kvm_sync_regs { #define KVM_X86_QUIRK_OUT_7E_INC_RIP (1 << 3) #define KVM_X86_QUIRK_MISC_ENABLE_NO_MWAIT (1 << 4) #define KVM_X86_QUIRK_FIX_HYPERCALL_INSN (1 << 5) +#define KVM_X86_QUIRK_MWAIT_NEVER_UD_FAULTS (1 << 6) #define KVM_STATE_NESTED_FORMAT_VMX 0 #define KVM_STATE_NESTED_FORMAT_SVM 1 @@ -516,7 +522,7 @@ struct kvm_pmu_event_filter { __u32 fixed_counter_bitmap; __u32 flags; __u32 pad[4]; - __u64 events[0]; + __u64 events[]; }; #define KVM_PMU_EVENT_ALLOW 0 diff --git a/arch/x86/include/uapi/asm/sgx.h b/arch/x86/include/uapi/asm/sgx.h index f4b81587e90b..2dd35bbdc822 100644 --- a/arch/x86/include/uapi/asm/sgx.h +++ b/arch/x86/include/uapi/asm/sgx.h @@ -29,6 +29,12 @@ enum sgx_page_flags { _IOW(SGX_MAGIC, 0x03, struct sgx_enclave_provision) #define SGX_IOC_VEPC_REMOVE_ALL \ _IO(SGX_MAGIC, 0x04) +#define SGX_IOC_ENCLAVE_RESTRICT_PERMISSIONS \ + _IOWR(SGX_MAGIC, 0x05, struct sgx_enclave_restrict_permissions) +#define SGX_IOC_ENCLAVE_MODIFY_TYPES \ + _IOWR(SGX_MAGIC, 0x06, struct sgx_enclave_modify_types) +#define SGX_IOC_ENCLAVE_REMOVE_PAGES \ + _IOWR(SGX_MAGIC, 0x07, struct sgx_enclave_remove_pages) /** * struct sgx_enclave_create - parameter structure for the @@ -76,6 +82,62 @@ struct sgx_enclave_provision { __u64 fd; }; +/** + * struct sgx_enclave_restrict_permissions - parameters for ioctl + * %SGX_IOC_ENCLAVE_RESTRICT_PERMISSIONS + * @offset: starting page offset (page aligned relative to enclave base + * address defined in SECS) + * @length: length of memory (multiple of the page size) + * @permissions:new permission bits for pages in range described by @offset + * and @length + * @result: (output) SGX result code of ENCLS[EMODPR] function + * @count: (output) bytes successfully changed (multiple of page size) + */ +struct sgx_enclave_restrict_permissions { + __u64 offset; + __u64 length; + __u64 permissions; + __u64 result; + __u64 count; +}; + +/** + * struct sgx_enclave_modify_types - parameters for ioctl + * %SGX_IOC_ENCLAVE_MODIFY_TYPES + * @offset: starting page offset (page aligned relative to enclave base + * address defined in SECS) + * @length: length of memory (multiple of the page size) + * @page_type: new type for pages in range described by @offset and @length + * @result: (output) SGX result code of ENCLS[EMODT] function + * @count: (output) bytes successfully changed (multiple of page size) + */ +struct sgx_enclave_modify_types { + __u64 offset; + __u64 length; + __u64 page_type; + __u64 result; + __u64 count; +}; + +/** + * struct sgx_enclave_remove_pages - %SGX_IOC_ENCLAVE_REMOVE_PAGES parameters + * @offset: starting page offset (page aligned relative to enclave base + * address defined in SECS) + * @length: length of memory (multiple of the page size) + * @count: (output) bytes successfully changed (multiple of page size) + * + * Regular (PT_REG) or TCS (PT_TCS) can be removed from an initialized + * enclave if the system supports SGX2. First, the %SGX_IOC_ENCLAVE_MODIFY_TYPES + * ioctl() should be used to change the page type to PT_TRIM. After that + * succeeds ENCLU[EACCEPT] should be run from within the enclave and then + * %SGX_IOC_ENCLAVE_REMOVE_PAGES can be used to complete the page removal. + */ +struct sgx_enclave_remove_pages { + __u64 offset; + __u64 length; + __u64 count; +}; + struct sgx_enclave_run; /** diff --git a/arch/x86/include/uapi/asm/vmx.h b/arch/x86/include/uapi/asm/vmx.h index 946d761adbd3..a5faf6d88f1b 100644 --- a/arch/x86/include/uapi/asm/vmx.h +++ b/arch/x86/include/uapi/asm/vmx.h @@ -91,6 +91,7 @@ #define EXIT_REASON_UMWAIT 67 #define EXIT_REASON_TPAUSE 68 #define EXIT_REASON_BUS_LOCK 74 +#define EXIT_REASON_NOTIFY 75 #define VMX_EXIT_REASONS \ { EXIT_REASON_EXCEPTION_NMI, "EXCEPTION_NMI" }, \ @@ -153,7 +154,8 @@ { EXIT_REASON_XRSTORS, "XRSTORS" }, \ { EXIT_REASON_UMWAIT, "UMWAIT" }, \ { EXIT_REASON_TPAUSE, "TPAUSE" }, \ - { EXIT_REASON_BUS_LOCK, "BUS_LOCK" } + { EXIT_REASON_BUS_LOCK, "BUS_LOCK" }, \ + { EXIT_REASON_NOTIFY, "NOTIFY" } #define VMX_EXIT_REASON_FLAGS \ { VMX_EXIT_REASONS_FAILED_VMENTRY, "FAILED_VMENTRY" } diff --git a/arch/x86/kernel/Makefile b/arch/x86/kernel/Makefile index 4c8b6ae802ac..a20a5ebfacd7 100644 --- a/arch/x86/kernel/Makefile +++ b/arch/x86/kernel/Makefile @@ -34,8 +34,6 @@ KASAN_SANITIZE_sev.o := n # by several compilation units. To be safe, disable all instrumentation. KCSAN_SANITIZE := n -OBJECT_FILES_NON_STANDARD_test_nx.o := y - # If instrumentation of this dir is enabled, boot hangs during first second. # Probably could be more selective here, but note that files related to irqs, # boot, dumpstack/stacktrace, etc are either non-interesting or can lead to diff --git a/arch/x86/kernel/acpi/cppc.c b/arch/x86/kernel/acpi/cppc.c index 8b8cbf22461a..8d8752b44f11 100644 --- a/arch/x86/kernel/acpi/cppc.c +++ b/arch/x86/kernel/acpi/cppc.c @@ -11,6 +11,22 @@ /* Refer to drivers/acpi/cppc_acpi.c for the description of functions */ +bool cpc_supported_by_cpu(void) +{ + switch (boot_cpu_data.x86_vendor) { + case X86_VENDOR_AMD: + case X86_VENDOR_HYGON: + if (boot_cpu_data.x86 == 0x19 && ((boot_cpu_data.x86_model <= 0x0f) || + (boot_cpu_data.x86_model >= 0x20 && boot_cpu_data.x86_model <= 0x2f))) + return true; + else if (boot_cpu_data.x86 == 0x17 && + boot_cpu_data.x86_model >= 0x70 && boot_cpu_data.x86_model <= 0x7f) + return true; + return boot_cpu_has(X86_FEATURE_CPPC); + } + return false; +} + bool cpc_ffh_supported(void) { return true; diff --git a/arch/x86/kernel/alternative.c b/arch/x86/kernel/alternative.c index e257f6c80372..62f6b8b7c4a5 100644 --- a/arch/x86/kernel/alternative.c +++ b/arch/x86/kernel/alternative.c @@ -115,6 +115,7 @@ static void __init_or_module add_nops(void *insns, unsigned int len) } extern s32 __retpoline_sites[], __retpoline_sites_end[]; +extern s32 __return_sites[], __return_sites_end[]; extern s32 __ibt_endbr_seal[], __ibt_endbr_seal_end[]; extern struct alt_instr __alt_instructions[], __alt_instructions_end[]; extern s32 __smp_locks[], __smp_locks_end[]; @@ -507,9 +508,78 @@ void __init_or_module noinline apply_retpolines(s32 *start, s32 *end) } } +#ifdef CONFIG_RETHUNK +/* + * Rewrite the compiler generated return thunk tail-calls. + * + * For example, convert: + * + * JMP __x86_return_thunk + * + * into: + * + * RET + */ +static int patch_return(void *addr, struct insn *insn, u8 *bytes) +{ + int i = 0; + + if (cpu_feature_enabled(X86_FEATURE_RETHUNK)) + return -1; + + bytes[i++] = RET_INSN_OPCODE; + + for (; i < insn->length;) + bytes[i++] = INT3_INSN_OPCODE; + + return i; +} + +void __init_or_module noinline apply_returns(s32 *start, s32 *end) +{ + s32 *s; + + for (s = start; s < end; s++) { + void *dest = NULL, *addr = (void *)s + *s; + struct insn insn; + int len, ret; + u8 bytes[16]; + u8 op; + + ret = insn_decode_kernel(&insn, addr); + if (WARN_ON_ONCE(ret < 0)) + continue; + + op = insn.opcode.bytes[0]; + if (op == JMP32_INSN_OPCODE) + dest = addr + insn.length + insn.immediate.value; + + if (__static_call_fixup(addr, op, dest) || + WARN_ONCE(dest != &__x86_return_thunk, + "missing return thunk: %pS-%pS: %*ph", + addr, dest, 5, addr)) + continue; + + DPRINTK("return thunk at: %pS (%px) len: %d to: %pS", + addr, addr, insn.length, + addr + insn.length + insn.immediate.value); + + len = patch_return(addr, &insn, bytes); + if (len == insn.length) { + DUMP_BYTES(((u8*)addr), len, "%px: orig: ", addr); + DUMP_BYTES(((u8*)bytes), len, "%px: repl: ", addr); + text_poke_early(addr, bytes, len); + } + } +} +#else +void __init_or_module noinline apply_returns(s32 *start, s32 *end) { } +#endif /* CONFIG_RETHUNK */ + #else /* !CONFIG_RETPOLINE || !CONFIG_OBJTOOL */ void __init_or_module noinline apply_retpolines(s32 *start, s32 *end) { } +void __init_or_module noinline apply_returns(s32 *start, s32 *end) { } #endif /* CONFIG_RETPOLINE && CONFIG_OBJTOOL */ @@ -860,6 +930,7 @@ void __init alternative_instructions(void) * those can rewrite the retpoline thunks. */ apply_retpolines(__retpoline_sites, __retpoline_sites_end); + apply_returns(__return_sites, __return_sites_end); /* * Then patch alternatives, such that those paravirt calls that are in diff --git a/arch/x86/kernel/amd_nb.c b/arch/x86/kernel/amd_nb.c index 190e0f763375..4266b64631a4 100644 --- a/arch/x86/kernel/amd_nb.c +++ b/arch/x86/kernel/amd_nb.c @@ -19,17 +19,23 @@ #define PCI_DEVICE_ID_AMD_17H_M10H_ROOT 0x15d0 #define PCI_DEVICE_ID_AMD_17H_M30H_ROOT 0x1480 #define PCI_DEVICE_ID_AMD_17H_M60H_ROOT 0x1630 +#define PCI_DEVICE_ID_AMD_17H_MA0H_ROOT 0x14b5 #define PCI_DEVICE_ID_AMD_19H_M10H_ROOT 0x14a4 +#define PCI_DEVICE_ID_AMD_19H_M60H_ROOT 0x14d8 +#define PCI_DEVICE_ID_AMD_19H_M70H_ROOT 0x14e8 #define PCI_DEVICE_ID_AMD_17H_DF_F4 0x1464 #define PCI_DEVICE_ID_AMD_17H_M10H_DF_F4 0x15ec #define PCI_DEVICE_ID_AMD_17H_M30H_DF_F4 0x1494 #define PCI_DEVICE_ID_AMD_17H_M60H_DF_F4 0x144c #define PCI_DEVICE_ID_AMD_17H_M70H_DF_F4 0x1444 +#define PCI_DEVICE_ID_AMD_17H_MA0H_DF_F4 0x1728 #define PCI_DEVICE_ID_AMD_19H_DF_F4 0x1654 #define PCI_DEVICE_ID_AMD_19H_M10H_DF_F4 0x14b1 #define PCI_DEVICE_ID_AMD_19H_M40H_ROOT 0x14b5 #define PCI_DEVICE_ID_AMD_19H_M40H_DF_F4 0x167d #define PCI_DEVICE_ID_AMD_19H_M50H_DF_F4 0x166e +#define PCI_DEVICE_ID_AMD_19H_M60H_DF_F4 0x14e4 +#define PCI_DEVICE_ID_AMD_19H_M70H_DF_F4 0x14f4 /* Protect the PCI config register pairs used for SMN. */ static DEFINE_MUTEX(smn_mutex); @@ -41,8 +47,11 @@ static const struct pci_device_id amd_root_ids[] = { { PCI_DEVICE(PCI_VENDOR_ID_AMD, PCI_DEVICE_ID_AMD_17H_M10H_ROOT) }, { PCI_DEVICE(PCI_VENDOR_ID_AMD, PCI_DEVICE_ID_AMD_17H_M30H_ROOT) }, { PCI_DEVICE(PCI_VENDOR_ID_AMD, PCI_DEVICE_ID_AMD_17H_M60H_ROOT) }, + { PCI_DEVICE(PCI_VENDOR_ID_AMD, PCI_DEVICE_ID_AMD_17H_MA0H_ROOT) }, { PCI_DEVICE(PCI_VENDOR_ID_AMD, PCI_DEVICE_ID_AMD_19H_M10H_ROOT) }, { PCI_DEVICE(PCI_VENDOR_ID_AMD, PCI_DEVICE_ID_AMD_19H_M40H_ROOT) }, + { PCI_DEVICE(PCI_VENDOR_ID_AMD, PCI_DEVICE_ID_AMD_19H_M60H_ROOT) }, + { PCI_DEVICE(PCI_VENDOR_ID_AMD, PCI_DEVICE_ID_AMD_19H_M70H_ROOT) }, {} }; @@ -61,12 +70,15 @@ static const struct pci_device_id amd_nb_misc_ids[] = { { PCI_DEVICE(PCI_VENDOR_ID_AMD, PCI_DEVICE_ID_AMD_17H_M10H_DF_F3) }, { PCI_DEVICE(PCI_VENDOR_ID_AMD, PCI_DEVICE_ID_AMD_17H_M30H_DF_F3) }, { PCI_DEVICE(PCI_VENDOR_ID_AMD, PCI_DEVICE_ID_AMD_17H_M60H_DF_F3) }, + { PCI_DEVICE(PCI_VENDOR_ID_AMD, PCI_DEVICE_ID_AMD_17H_MA0H_DF_F3) }, { PCI_DEVICE(PCI_VENDOR_ID_AMD, PCI_DEVICE_ID_AMD_CNB17H_F3) }, { PCI_DEVICE(PCI_VENDOR_ID_AMD, PCI_DEVICE_ID_AMD_17H_M70H_DF_F3) }, { PCI_DEVICE(PCI_VENDOR_ID_AMD, PCI_DEVICE_ID_AMD_19H_DF_F3) }, { PCI_DEVICE(PCI_VENDOR_ID_AMD, PCI_DEVICE_ID_AMD_19H_M10H_DF_F3) }, { PCI_DEVICE(PCI_VENDOR_ID_AMD, PCI_DEVICE_ID_AMD_19H_M40H_DF_F3) }, { PCI_DEVICE(PCI_VENDOR_ID_AMD, PCI_DEVICE_ID_AMD_19H_M50H_DF_F3) }, + { PCI_DEVICE(PCI_VENDOR_ID_AMD, PCI_DEVICE_ID_AMD_19H_M60H_DF_F3) }, + { PCI_DEVICE(PCI_VENDOR_ID_AMD, PCI_DEVICE_ID_AMD_19H_M70H_DF_F3) }, {} }; @@ -81,6 +93,7 @@ static const struct pci_device_id amd_nb_link_ids[] = { { PCI_DEVICE(PCI_VENDOR_ID_AMD, PCI_DEVICE_ID_AMD_17H_M30H_DF_F4) }, { PCI_DEVICE(PCI_VENDOR_ID_AMD, PCI_DEVICE_ID_AMD_17H_M60H_DF_F4) }, { PCI_DEVICE(PCI_VENDOR_ID_AMD, PCI_DEVICE_ID_AMD_17H_M70H_DF_F4) }, + { PCI_DEVICE(PCI_VENDOR_ID_AMD, PCI_DEVICE_ID_AMD_17H_MA0H_DF_F4) }, { PCI_DEVICE(PCI_VENDOR_ID_AMD, PCI_DEVICE_ID_AMD_19H_DF_F4) }, { PCI_DEVICE(PCI_VENDOR_ID_AMD, PCI_DEVICE_ID_AMD_19H_M10H_DF_F4) }, { PCI_DEVICE(PCI_VENDOR_ID_AMD, PCI_DEVICE_ID_AMD_19H_M40H_DF_F4) }, diff --git a/arch/x86/kernel/apic/apic.c b/arch/x86/kernel/apic/apic.c index df764ceac2c8..6d303d1d276c 100644 --- a/arch/x86/kernel/apic/apic.c +++ b/arch/x86/kernel/apic/apic.c @@ -275,7 +275,7 @@ void native_apic_icr_write(u32 low, u32 id) unsigned long flags; local_irq_save(flags); - apic_write(APIC_ICR2, SET_APIC_DEST_FIELD(id)); + apic_write(APIC_ICR2, SET_XAPIC_DEST_FIELD(id)); apic_write(APIC_ICR, low); local_irq_restore(flags); } diff --git a/arch/x86/kernel/apic/ipi.c b/arch/x86/kernel/apic/ipi.c index d1fb874fbe64..2a6509e8c840 100644 --- a/arch/x86/kernel/apic/ipi.c +++ b/arch/x86/kernel/apic/ipi.c @@ -99,7 +99,7 @@ sendmask: static inline int __prepare_ICR2(unsigned int mask) { - return SET_APIC_DEST_FIELD(mask); + return SET_XAPIC_DEST_FIELD(mask); } static inline void __xapic_wait_icr_idle(void) diff --git a/arch/x86/kernel/asm-offsets.c b/arch/x86/kernel/asm-offsets.c index 437308004ef2..cb50589a7102 100644 --- a/arch/x86/kernel/asm-offsets.c +++ b/arch/x86/kernel/asm-offsets.c @@ -19,6 +19,7 @@ #include <asm/suspend.h> #include <asm/tlbflush.h> #include <asm/tdx.h> +#include "../kvm/vmx/vmx.h" #ifdef CONFIG_XEN #include <xen/interface/xen.h> @@ -107,4 +108,9 @@ static void __used common(void) OFFSET(TSS_sp0, tss_struct, x86_tss.sp0); OFFSET(TSS_sp1, tss_struct, x86_tss.sp1); OFFSET(TSS_sp2, tss_struct, x86_tss.sp2); + + if (IS_ENABLED(CONFIG_KVM_INTEL)) { + BLANK(); + OFFSET(VMX_spec_ctrl, vcpu_vmx, spec_ctrl); + } } diff --git a/arch/x86/kernel/cpu/amd.c b/arch/x86/kernel/cpu/amd.c index 0c0b09796ced..48276c0e479d 100644 --- a/arch/x86/kernel/cpu/amd.c +++ b/arch/x86/kernel/cpu/amd.c @@ -808,7 +808,7 @@ static void clear_rdrand_cpuid_bit(struct cpuinfo_x86 *c) return; /* - * The nordrand option can clear X86_FEATURE_RDRAND, so check for + * The self-test can clear X86_FEATURE_RDRAND, so check for * RDRAND support using the CPUID function directly. */ if (!(cpuid_ecx(1) & BIT(30)) || rdrand_force) @@ -862,6 +862,28 @@ static void init_amd_bd(struct cpuinfo_x86 *c) clear_rdrand_cpuid_bit(c); } +void init_spectral_chicken(struct cpuinfo_x86 *c) +{ +#ifdef CONFIG_CPU_UNRET_ENTRY + u64 value; + + /* + * On Zen2 we offer this chicken (bit) on the altar of Speculation. + * + * This suppresses speculation from the middle of a basic block, i.e. it + * suppresses non-branch predictions. + * + * We use STIBP as a heuristic to filter out Zen2 from the rest of F17H + */ + if (!cpu_has(c, X86_FEATURE_HYPERVISOR) && cpu_has(c, X86_FEATURE_AMD_STIBP)) { + if (!rdmsrl_safe(MSR_ZEN2_SPECTRAL_CHICKEN, &value)) { + value |= MSR_ZEN2_SPECTRAL_CHICKEN_BIT; + wrmsrl_safe(MSR_ZEN2_SPECTRAL_CHICKEN, value); + } + } +#endif +} + static void init_amd_zn(struct cpuinfo_x86 *c) { set_cpu_cap(c, X86_FEATURE_ZEN); @@ -870,12 +892,21 @@ static void init_amd_zn(struct cpuinfo_x86 *c) node_reclaim_distance = 32; #endif - /* - * Fix erratum 1076: CPB feature bit not being set in CPUID. - * Always set it, except when running under a hypervisor. - */ - if (!cpu_has(c, X86_FEATURE_HYPERVISOR) && !cpu_has(c, X86_FEATURE_CPB)) - set_cpu_cap(c, X86_FEATURE_CPB); + /* Fix up CPUID bits, but only if not virtualised. */ + if (!cpu_has(c, X86_FEATURE_HYPERVISOR)) { + + /* Erratum 1076: CPB feature bit not being set in CPUID. */ + if (!cpu_has(c, X86_FEATURE_CPB)) + set_cpu_cap(c, X86_FEATURE_CPB); + + /* + * Zen3 (Fam19 model < 0x10) parts are not susceptible to + * Branch Type Confusion, but predate the allocation of the + * BTC_NO bit. + */ + if (c->x86 == 0x19 && !cpu_has(c, X86_FEATURE_BTC_NO)) + set_cpu_cap(c, X86_FEATURE_BTC_NO); + } } static void init_amd(struct cpuinfo_x86 *c) @@ -907,7 +938,8 @@ static void init_amd(struct cpuinfo_x86 *c) case 0x12: init_amd_ln(c); break; case 0x15: init_amd_bd(c); break; case 0x16: init_amd_jg(c); break; - case 0x17: fallthrough; + case 0x17: init_spectral_chicken(c); + fallthrough; case 0x19: init_amd_zn(c); break; } diff --git a/arch/x86/kernel/cpu/bugs.c b/arch/x86/kernel/cpu/bugs.c index 74c62cc47a5f..6761668100b9 100644 --- a/arch/x86/kernel/cpu/bugs.c +++ b/arch/x86/kernel/cpu/bugs.c @@ -38,6 +38,8 @@ static void __init spectre_v1_select_mitigation(void); static void __init spectre_v2_select_mitigation(void); +static void __init retbleed_select_mitigation(void); +static void __init spectre_v2_user_select_mitigation(void); static void __init ssb_select_mitigation(void); static void __init l1tf_select_mitigation(void); static void __init mds_select_mitigation(void); @@ -48,16 +50,40 @@ static void __init mmio_select_mitigation(void); static void __init srbds_select_mitigation(void); static void __init l1d_flush_select_mitigation(void); -/* The base value of the SPEC_CTRL MSR that always has to be preserved. */ +/* The base value of the SPEC_CTRL MSR without task-specific bits set */ u64 x86_spec_ctrl_base; EXPORT_SYMBOL_GPL(x86_spec_ctrl_base); + +/* The current value of the SPEC_CTRL MSR with task-specific bits set */ +DEFINE_PER_CPU(u64, x86_spec_ctrl_current); +EXPORT_SYMBOL_GPL(x86_spec_ctrl_current); + static DEFINE_MUTEX(spec_ctrl_mutex); /* - * The vendor and possibly platform specific bits which can be modified in - * x86_spec_ctrl_base. + * Keep track of the SPEC_CTRL MSR value for the current task, which may differ + * from x86_spec_ctrl_base due to STIBP/SSB in __speculation_ctrl_update(). */ -static u64 __ro_after_init x86_spec_ctrl_mask = SPEC_CTRL_IBRS; +void write_spec_ctrl_current(u64 val, bool force) +{ + if (this_cpu_read(x86_spec_ctrl_current) == val) + return; + + this_cpu_write(x86_spec_ctrl_current, val); + + /* + * When KERNEL_IBRS this MSR is written on return-to-user, unless + * forced the update can be delayed until that time. + */ + if (force || !cpu_feature_enabled(X86_FEATURE_KERNEL_IBRS)) + wrmsrl(MSR_IA32_SPEC_CTRL, val); +} + +u64 spec_ctrl_current(void) +{ + return this_cpu_read(x86_spec_ctrl_current); +} +EXPORT_SYMBOL_GPL(spec_ctrl_current); /* * AMD specific MSR info for Speculative Store Bypass control. @@ -114,13 +140,21 @@ void __init check_bugs(void) if (boot_cpu_has(X86_FEATURE_MSR_SPEC_CTRL)) rdmsrl(MSR_IA32_SPEC_CTRL, x86_spec_ctrl_base); - /* Allow STIBP in MSR_SPEC_CTRL if supported */ - if (boot_cpu_has(X86_FEATURE_STIBP)) - x86_spec_ctrl_mask |= SPEC_CTRL_STIBP; - /* Select the proper CPU mitigations before patching alternatives: */ spectre_v1_select_mitigation(); spectre_v2_select_mitigation(); + /* + * retbleed_select_mitigation() relies on the state set by + * spectre_v2_select_mitigation(); specifically it wants to know about + * spectre_v2=ibrs. + */ + retbleed_select_mitigation(); + /* + * spectre_v2_user_select_mitigation() relies on the state set by + * retbleed_select_mitigation(); specifically the STIBP selection is + * forced for UNRET. + */ + spectre_v2_user_select_mitigation(); ssb_select_mitigation(); l1tf_select_mitigation(); md_clear_select_mitigation(); @@ -161,31 +195,17 @@ void __init check_bugs(void) #endif } +/* + * NOTE: This function is *only* called for SVM. VMX spec_ctrl handling is + * done in vmenter.S. + */ void x86_virt_spec_ctrl(u64 guest_spec_ctrl, u64 guest_virt_spec_ctrl, bool setguest) { - u64 msrval, guestval, hostval = x86_spec_ctrl_base; + u64 msrval, guestval = guest_spec_ctrl, hostval = spec_ctrl_current(); struct thread_info *ti = current_thread_info(); - /* Is MSR_SPEC_CTRL implemented ? */ if (static_cpu_has(X86_FEATURE_MSR_SPEC_CTRL)) { - /* - * Restrict guest_spec_ctrl to supported values. Clear the - * modifiable bits in the host base value and or the - * modifiable bits from the guest value. - */ - guestval = hostval & ~x86_spec_ctrl_mask; - guestval |= guest_spec_ctrl & x86_spec_ctrl_mask; - - /* SSBD controlled in MSR_SPEC_CTRL */ - if (static_cpu_has(X86_FEATURE_SPEC_CTRL_SSBD) || - static_cpu_has(X86_FEATURE_AMD_SSBD)) - hostval |= ssbd_tif_to_spec_ctrl(ti->flags); - - /* Conditional STIBP enabled? */ - if (static_branch_unlikely(&switch_to_cond_stibp)) - hostval |= stibp_tif_to_spec_ctrl(ti->flags); - if (hostval != guestval) { msrval = setguest ? guestval : hostval; wrmsrl(MSR_IA32_SPEC_CTRL, msrval); @@ -752,12 +772,180 @@ static int __init nospectre_v1_cmdline(char *str) } early_param("nospectre_v1", nospectre_v1_cmdline); -#undef pr_fmt -#define pr_fmt(fmt) "Spectre V2 : " fmt - static enum spectre_v2_mitigation spectre_v2_enabled __ro_after_init = SPECTRE_V2_NONE; +#undef pr_fmt +#define pr_fmt(fmt) "RETBleed: " fmt + +enum retbleed_mitigation { + RETBLEED_MITIGATION_NONE, + RETBLEED_MITIGATION_UNRET, + RETBLEED_MITIGATION_IBPB, + RETBLEED_MITIGATION_IBRS, + RETBLEED_MITIGATION_EIBRS, +}; + +enum retbleed_mitigation_cmd { + RETBLEED_CMD_OFF, + RETBLEED_CMD_AUTO, + RETBLEED_CMD_UNRET, + RETBLEED_CMD_IBPB, +}; + +static const char * const retbleed_strings[] = { + [RETBLEED_MITIGATION_NONE] = "Vulnerable", + [RETBLEED_MITIGATION_UNRET] = "Mitigation: untrained return thunk", + [RETBLEED_MITIGATION_IBPB] = "Mitigation: IBPB", + [RETBLEED_MITIGATION_IBRS] = "Mitigation: IBRS", + [RETBLEED_MITIGATION_EIBRS] = "Mitigation: Enhanced IBRS", +}; + +static enum retbleed_mitigation retbleed_mitigation __ro_after_init = + RETBLEED_MITIGATION_NONE; +static enum retbleed_mitigation_cmd retbleed_cmd __ro_after_init = + RETBLEED_CMD_AUTO; + +static int __ro_after_init retbleed_nosmt = false; + +static int __init retbleed_parse_cmdline(char *str) +{ + if (!str) + return -EINVAL; + + while (str) { + char *next = strchr(str, ','); + if (next) { + *next = 0; + next++; + } + + if (!strcmp(str, "off")) { + retbleed_cmd = RETBLEED_CMD_OFF; + } else if (!strcmp(str, "auto")) { + retbleed_cmd = RETBLEED_CMD_AUTO; + } else if (!strcmp(str, "unret")) { + retbleed_cmd = RETBLEED_CMD_UNRET; + } else if (!strcmp(str, "ibpb")) { + retbleed_cmd = RETBLEED_CMD_IBPB; + } else if (!strcmp(str, "nosmt")) { + retbleed_nosmt = true; + } else { + pr_err("Ignoring unknown retbleed option (%s).", str); + } + + str = next; + } + + return 0; +} +early_param("retbleed", retbleed_parse_cmdline); + +#define RETBLEED_UNTRAIN_MSG "WARNING: BTB untrained return thunk mitigation is only effective on AMD/Hygon!\n" +#define RETBLEED_INTEL_MSG "WARNING: Spectre v2 mitigation leaves CPU vulnerable to RETBleed attacks, data leaks possible!\n" + +static void __init retbleed_select_mitigation(void) +{ + bool mitigate_smt = false; + + if (!boot_cpu_has_bug(X86_BUG_RETBLEED) || cpu_mitigations_off()) + return; + + switch (retbleed_cmd) { + case RETBLEED_CMD_OFF: + return; + + case RETBLEED_CMD_UNRET: + if (IS_ENABLED(CONFIG_CPU_UNRET_ENTRY)) { + retbleed_mitigation = RETBLEED_MITIGATION_UNRET; + } else { + pr_err("WARNING: kernel not compiled with CPU_UNRET_ENTRY.\n"); + goto do_cmd_auto; + } + break; + + case RETBLEED_CMD_IBPB: + if (!boot_cpu_has(X86_FEATURE_IBPB)) { + pr_err("WARNING: CPU does not support IBPB.\n"); + goto do_cmd_auto; + } else if (IS_ENABLED(CONFIG_CPU_IBPB_ENTRY)) { + retbleed_mitigation = RETBLEED_MITIGATION_IBPB; + } else { + pr_err("WARNING: kernel not compiled with CPU_IBPB_ENTRY.\n"); + goto do_cmd_auto; + } + break; + +do_cmd_auto: + case RETBLEED_CMD_AUTO: + default: + if (boot_cpu_data.x86_vendor == X86_VENDOR_AMD || + boot_cpu_data.x86_vendor == X86_VENDOR_HYGON) { + if (IS_ENABLED(CONFIG_CPU_UNRET_ENTRY)) + retbleed_mitigation = RETBLEED_MITIGATION_UNRET; + else if (IS_ENABLED(CONFIG_CPU_IBPB_ENTRY) && boot_cpu_has(X86_FEATURE_IBPB)) + retbleed_mitigation = RETBLEED_MITIGATION_IBPB; + } + + /* + * The Intel mitigation (IBRS or eIBRS) was already selected in + * spectre_v2_select_mitigation(). 'retbleed_mitigation' will + * be set accordingly below. + */ + + break; + } + + switch (retbleed_mitigation) { + case RETBLEED_MITIGATION_UNRET: + setup_force_cpu_cap(X86_FEATURE_RETHUNK); + setup_force_cpu_cap(X86_FEATURE_UNRET); + + if (boot_cpu_data.x86_vendor != X86_VENDOR_AMD && + boot_cpu_data.x86_vendor != X86_VENDOR_HYGON) + pr_err(RETBLEED_UNTRAIN_MSG); + + mitigate_smt = true; + break; + + case RETBLEED_MITIGATION_IBPB: + setup_force_cpu_cap(X86_FEATURE_ENTRY_IBPB); + mitigate_smt = true; + break; + + default: + break; + } + + if (mitigate_smt && !boot_cpu_has(X86_FEATURE_STIBP) && + (retbleed_nosmt || cpu_mitigations_auto_nosmt())) + cpu_smt_disable(false); + + /* + * Let IBRS trump all on Intel without affecting the effects of the + * retbleed= cmdline option. + */ + if (boot_cpu_data.x86_vendor == X86_VENDOR_INTEL) { + switch (spectre_v2_enabled) { + case SPECTRE_V2_IBRS: + retbleed_mitigation = RETBLEED_MITIGATION_IBRS; + break; + case SPECTRE_V2_EIBRS: + case SPECTRE_V2_EIBRS_RETPOLINE: + case SPECTRE_V2_EIBRS_LFENCE: + retbleed_mitigation = RETBLEED_MITIGATION_EIBRS; + break; + default: + pr_err(RETBLEED_INTEL_MSG); + } + } + + pr_info("%s\n", retbleed_strings[retbleed_mitigation]); +} + +#undef pr_fmt +#define pr_fmt(fmt) "Spectre V2 : " fmt + static enum spectre_v2_user_mitigation spectre_v2_user_stibp __ro_after_init = SPECTRE_V2_USER_NONE; static enum spectre_v2_user_mitigation spectre_v2_user_ibpb __ro_after_init = @@ -787,6 +975,7 @@ static inline const char *spectre_v2_module_string(void) { return ""; } #define SPECTRE_V2_LFENCE_MSG "WARNING: LFENCE mitigation is not recommended for this CPU, data leaks possible!\n" #define SPECTRE_V2_EIBRS_EBPF_MSG "WARNING: Unprivileged eBPF is enabled with eIBRS on, data leaks possible via Spectre v2 BHB attacks!\n" #define SPECTRE_V2_EIBRS_LFENCE_EBPF_SMT_MSG "WARNING: Unprivileged eBPF is enabled with eIBRS+LFENCE mitigation and SMT, data leaks possible via Spectre v2 BHB attacks!\n" +#define SPECTRE_V2_IBRS_PERF_MSG "WARNING: IBRS mitigation selected on Enhanced IBRS CPU, this may cause unnecessary performance loss\n" #ifdef CONFIG_BPF_SYSCALL void unpriv_ebpf_notify(int new_state) @@ -828,6 +1017,7 @@ enum spectre_v2_mitigation_cmd { SPECTRE_V2_CMD_EIBRS, SPECTRE_V2_CMD_EIBRS_RETPOLINE, SPECTRE_V2_CMD_EIBRS_LFENCE, + SPECTRE_V2_CMD_IBRS, }; enum spectre_v2_user_cmd { @@ -868,13 +1058,15 @@ static void __init spec_v2_user_print_cond(const char *reason, bool secure) pr_info("spectre_v2_user=%s forced on command line.\n", reason); } +static __ro_after_init enum spectre_v2_mitigation_cmd spectre_v2_cmd; + static enum spectre_v2_user_cmd __init -spectre_v2_parse_user_cmdline(enum spectre_v2_mitigation_cmd v2_cmd) +spectre_v2_parse_user_cmdline(void) { char arg[20]; int ret, i; - switch (v2_cmd) { + switch (spectre_v2_cmd) { case SPECTRE_V2_CMD_NONE: return SPECTRE_V2_USER_CMD_NONE; case SPECTRE_V2_CMD_FORCE: @@ -900,15 +1092,16 @@ spectre_v2_parse_user_cmdline(enum spectre_v2_mitigation_cmd v2_cmd) return SPECTRE_V2_USER_CMD_AUTO; } -static inline bool spectre_v2_in_eibrs_mode(enum spectre_v2_mitigation mode) +static inline bool spectre_v2_in_ibrs_mode(enum spectre_v2_mitigation mode) { - return (mode == SPECTRE_V2_EIBRS || - mode == SPECTRE_V2_EIBRS_RETPOLINE || - mode == SPECTRE_V2_EIBRS_LFENCE); + return mode == SPECTRE_V2_IBRS || + mode == SPECTRE_V2_EIBRS || + mode == SPECTRE_V2_EIBRS_RETPOLINE || + mode == SPECTRE_V2_EIBRS_LFENCE; } static void __init -spectre_v2_user_select_mitigation(enum spectre_v2_mitigation_cmd v2_cmd) +spectre_v2_user_select_mitigation(void) { enum spectre_v2_user_mitigation mode = SPECTRE_V2_USER_NONE; bool smt_possible = IS_ENABLED(CONFIG_SMP); @@ -921,7 +1114,7 @@ spectre_v2_user_select_mitigation(enum spectre_v2_mitigation_cmd v2_cmd) cpu_smt_control == CPU_SMT_NOT_SUPPORTED) smt_possible = false; - cmd = spectre_v2_parse_user_cmdline(v2_cmd); + cmd = spectre_v2_parse_user_cmdline(); switch (cmd) { case SPECTRE_V2_USER_CMD_NONE: goto set_mode; @@ -969,12 +1162,12 @@ spectre_v2_user_select_mitigation(enum spectre_v2_mitigation_cmd v2_cmd) } /* - * If no STIBP, enhanced IBRS is enabled or SMT impossible, STIBP is not - * required. + * If no STIBP, IBRS or enhanced IBRS is enabled, or SMT impossible, + * STIBP is not required. */ if (!boot_cpu_has(X86_FEATURE_STIBP) || !smt_possible || - spectre_v2_in_eibrs_mode(spectre_v2_enabled)) + spectre_v2_in_ibrs_mode(spectre_v2_enabled)) return; /* @@ -986,6 +1179,13 @@ spectre_v2_user_select_mitigation(enum spectre_v2_mitigation_cmd v2_cmd) boot_cpu_has(X86_FEATURE_AMD_STIBP_ALWAYS_ON)) mode = SPECTRE_V2_USER_STRICT_PREFERRED; + if (retbleed_mitigation == RETBLEED_MITIGATION_UNRET) { + if (mode != SPECTRE_V2_USER_STRICT && + mode != SPECTRE_V2_USER_STRICT_PREFERRED) + pr_info("Selecting STIBP always-on mode to complement retbleed mitigation\n"); + mode = SPECTRE_V2_USER_STRICT_PREFERRED; + } + spectre_v2_user_stibp = mode; set_mode: @@ -999,6 +1199,7 @@ static const char * const spectre_v2_strings[] = { [SPECTRE_V2_EIBRS] = "Mitigation: Enhanced IBRS", [SPECTRE_V2_EIBRS_LFENCE] = "Mitigation: Enhanced IBRS + LFENCE", [SPECTRE_V2_EIBRS_RETPOLINE] = "Mitigation: Enhanced IBRS + Retpolines", + [SPECTRE_V2_IBRS] = "Mitigation: IBRS", }; static const struct { @@ -1016,6 +1217,7 @@ static const struct { { "eibrs,lfence", SPECTRE_V2_CMD_EIBRS_LFENCE, false }, { "eibrs,retpoline", SPECTRE_V2_CMD_EIBRS_RETPOLINE, false }, { "auto", SPECTRE_V2_CMD_AUTO, false }, + { "ibrs", SPECTRE_V2_CMD_IBRS, false }, }; static void __init spec_v2_print_cond(const char *reason, bool secure) @@ -1078,6 +1280,30 @@ static enum spectre_v2_mitigation_cmd __init spectre_v2_parse_cmdline(void) return SPECTRE_V2_CMD_AUTO; } + if (cmd == SPECTRE_V2_CMD_IBRS && !IS_ENABLED(CONFIG_CPU_IBRS_ENTRY)) { + pr_err("%s selected but not compiled in. Switching to AUTO select\n", + mitigation_options[i].option); + return SPECTRE_V2_CMD_AUTO; + } + + if (cmd == SPECTRE_V2_CMD_IBRS && boot_cpu_data.x86_vendor != X86_VENDOR_INTEL) { + pr_err("%s selected but not Intel CPU. Switching to AUTO select\n", + mitigation_options[i].option); + return SPECTRE_V2_CMD_AUTO; + } + + if (cmd == SPECTRE_V2_CMD_IBRS && !boot_cpu_has(X86_FEATURE_IBRS)) { + pr_err("%s selected but CPU doesn't have IBRS. Switching to AUTO select\n", + mitigation_options[i].option); + return SPECTRE_V2_CMD_AUTO; + } + + if (cmd == SPECTRE_V2_CMD_IBRS && boot_cpu_has(X86_FEATURE_XENPV)) { + pr_err("%s selected but running as XenPV guest. Switching to AUTO select\n", + mitigation_options[i].option); + return SPECTRE_V2_CMD_AUTO; + } + spec_v2_print_cond(mitigation_options[i].option, mitigation_options[i].secure); return cmd; @@ -1093,6 +1319,22 @@ static enum spectre_v2_mitigation __init spectre_v2_select_retpoline(void) return SPECTRE_V2_RETPOLINE; } +/* Disable in-kernel use of non-RSB RET predictors */ +static void __init spec_ctrl_disable_kernel_rrsba(void) +{ + u64 ia32_cap; + + if (!boot_cpu_has(X86_FEATURE_RRSBA_CTRL)) + return; + + ia32_cap = x86_read_arch_cap_msr(); + + if (ia32_cap & ARCH_CAP_RRSBA) { + x86_spec_ctrl_base |= SPEC_CTRL_RRSBA_DIS_S; + write_spec_ctrl_current(x86_spec_ctrl_base, true); + } +} + static void __init spectre_v2_select_mitigation(void) { enum spectre_v2_mitigation_cmd cmd = spectre_v2_parse_cmdline(); @@ -1117,6 +1359,15 @@ static void __init spectre_v2_select_mitigation(void) break; } + if (IS_ENABLED(CONFIG_CPU_IBRS_ENTRY) && + boot_cpu_has_bug(X86_BUG_RETBLEED) && + retbleed_cmd != RETBLEED_CMD_OFF && + boot_cpu_has(X86_FEATURE_IBRS) && + boot_cpu_data.x86_vendor == X86_VENDOR_INTEL) { + mode = SPECTRE_V2_IBRS; + break; + } + mode = spectre_v2_select_retpoline(); break; @@ -1133,6 +1384,10 @@ static void __init spectre_v2_select_mitigation(void) mode = spectre_v2_select_retpoline(); break; + case SPECTRE_V2_CMD_IBRS: + mode = SPECTRE_V2_IBRS; + break; + case SPECTRE_V2_CMD_EIBRS: mode = SPECTRE_V2_EIBRS; break; @@ -1149,10 +1404,9 @@ static void __init spectre_v2_select_mitigation(void) if (mode == SPECTRE_V2_EIBRS && unprivileged_ebpf_enabled()) pr_err(SPECTRE_V2_EIBRS_EBPF_MSG); - if (spectre_v2_in_eibrs_mode(mode)) { - /* Force it so VMEXIT will restore correctly */ + if (spectre_v2_in_ibrs_mode(mode)) { x86_spec_ctrl_base |= SPEC_CTRL_IBRS; - wrmsrl(MSR_IA32_SPEC_CTRL, x86_spec_ctrl_base); + write_spec_ctrl_current(x86_spec_ctrl_base, true); } switch (mode) { @@ -1160,6 +1414,12 @@ static void __init spectre_v2_select_mitigation(void) case SPECTRE_V2_EIBRS: break; + case SPECTRE_V2_IBRS: + setup_force_cpu_cap(X86_FEATURE_KERNEL_IBRS); + if (boot_cpu_has(X86_FEATURE_IBRS_ENHANCED)) + pr_warn(SPECTRE_V2_IBRS_PERF_MSG); + break; + case SPECTRE_V2_LFENCE: case SPECTRE_V2_EIBRS_LFENCE: setup_force_cpu_cap(X86_FEATURE_RETPOLINE_LFENCE); @@ -1171,43 +1431,117 @@ static void __init spectre_v2_select_mitigation(void) break; } + /* + * Disable alternate RSB predictions in kernel when indirect CALLs and + * JMPs gets protection against BHI and Intramode-BTI, but RET + * prediction from a non-RSB predictor is still a risk. + */ + if (mode == SPECTRE_V2_EIBRS_LFENCE || + mode == SPECTRE_V2_EIBRS_RETPOLINE || + mode == SPECTRE_V2_RETPOLINE) + spec_ctrl_disable_kernel_rrsba(); + spectre_v2_enabled = mode; pr_info("%s\n", spectre_v2_strings[mode]); /* - * If spectre v2 protection has been enabled, unconditionally fill - * RSB during a context switch; this protects against two independent - * issues: + * If Spectre v2 protection has been enabled, fill the RSB during a + * context switch. In general there are two types of RSB attacks + * across context switches, for which the CALLs/RETs may be unbalanced. + * + * 1) RSB underflow + * + * Some Intel parts have "bottomless RSB". When the RSB is empty, + * speculated return targets may come from the branch predictor, + * which could have a user-poisoned BTB or BHB entry. * - * - RSB underflow (and switch to BTB) on Skylake+ - * - SpectreRSB variant of spectre v2 on X86_BUG_SPECTRE_V2 CPUs + * AMD has it even worse: *all* returns are speculated from the BTB, + * regardless of the state of the RSB. + * + * When IBRS or eIBRS is enabled, the "user -> kernel" attack + * scenario is mitigated by the IBRS branch prediction isolation + * properties, so the RSB buffer filling wouldn't be necessary to + * protect against this type of attack. + * + * The "user -> user" attack scenario is mitigated by RSB filling. + * + * 2) Poisoned RSB entry + * + * If the 'next' in-kernel return stack is shorter than 'prev', + * 'next' could be tricked into speculating with a user-poisoned RSB + * entry. + * + * The "user -> kernel" attack scenario is mitigated by SMEP and + * eIBRS. + * + * The "user -> user" scenario, also known as SpectreBHB, requires + * RSB clearing. + * + * So to mitigate all cases, unconditionally fill RSB on context + * switches. + * + * FIXME: Is this pointless for retbleed-affected AMD? */ setup_force_cpu_cap(X86_FEATURE_RSB_CTXSW); pr_info("Spectre v2 / SpectreRSB mitigation: Filling RSB on context switch\n"); /* - * Retpoline means the kernel is safe because it has no indirect - * branches. Enhanced IBRS protects firmware too, so, enable restricted - * speculation around firmware calls only when Enhanced IBRS isn't - * supported. + * Similar to context switches, there are two types of RSB attacks + * after vmexit: + * + * 1) RSB underflow + * + * 2) Poisoned RSB entry + * + * When retpoline is enabled, both are mitigated by filling/clearing + * the RSB. + * + * When IBRS is enabled, while #1 would be mitigated by the IBRS branch + * prediction isolation protections, RSB still needs to be cleared + * because of #2. Note that SMEP provides no protection here, unlike + * user-space-poisoned RSB entries. + * + * eIBRS, on the other hand, has RSB-poisoning protections, so it + * doesn't need RSB clearing after vmexit. + */ + if (boot_cpu_has(X86_FEATURE_RETPOLINE) || + boot_cpu_has(X86_FEATURE_KERNEL_IBRS)) + setup_force_cpu_cap(X86_FEATURE_RSB_VMEXIT); + + /* + * Retpoline protects the kernel, but doesn't protect firmware. IBRS + * and Enhanced IBRS protect firmware too, so enable IBRS around + * firmware calls only when IBRS / Enhanced IBRS aren't otherwise + * enabled. * * Use "mode" to check Enhanced IBRS instead of boot_cpu_has(), because * the user might select retpoline on the kernel command line and if * the CPU supports Enhanced IBRS, kernel might un-intentionally not * enable IBRS around firmware calls. */ - if (boot_cpu_has(X86_FEATURE_IBRS) && !spectre_v2_in_eibrs_mode(mode)) { + if (boot_cpu_has_bug(X86_BUG_RETBLEED) && + boot_cpu_has(X86_FEATURE_IBPB) && + (boot_cpu_data.x86_vendor == X86_VENDOR_AMD || + boot_cpu_data.x86_vendor == X86_VENDOR_HYGON)) { + + if (retbleed_cmd != RETBLEED_CMD_IBPB) { + setup_force_cpu_cap(X86_FEATURE_USE_IBPB_FW); + pr_info("Enabling Speculation Barrier for firmware calls\n"); + } + + } else if (boot_cpu_has(X86_FEATURE_IBRS) && !spectre_v2_in_ibrs_mode(mode)) { setup_force_cpu_cap(X86_FEATURE_USE_IBRS_FW); pr_info("Enabling Restricted Speculation for firmware calls\n"); } /* Set up IBPB and STIBP depending on the general spectre V2 command */ - spectre_v2_user_select_mitigation(cmd); + spectre_v2_cmd = cmd; } static void update_stibp_msr(void * __unused) { - wrmsrl(MSR_IA32_SPEC_CTRL, x86_spec_ctrl_base); + u64 val = spec_ctrl_current() | (x86_spec_ctrl_base & SPEC_CTRL_STIBP); + write_spec_ctrl_current(val, true); } /* Update x86_spec_ctrl_base in case SMT state changed. */ @@ -1424,16 +1758,6 @@ static enum ssb_mitigation __init __ssb_select_mitigation(void) } /* - * If SSBD is controlled by the SPEC_CTRL MSR, then set the proper - * bit in the mask to allow guests to use the mitigation even in the - * case where the host does not enable it. - */ - if (static_cpu_has(X86_FEATURE_SPEC_CTRL_SSBD) || - static_cpu_has(X86_FEATURE_AMD_SSBD)) { - x86_spec_ctrl_mask |= SPEC_CTRL_SSBD; - } - - /* * We have three CPU feature flags that are in play here: * - X86_BUG_SPEC_STORE_BYPASS - CPU is susceptible. * - X86_FEATURE_SSBD - CPU is able to turn off speculative store bypass @@ -1450,7 +1774,7 @@ static enum ssb_mitigation __init __ssb_select_mitigation(void) x86_amd_ssb_disable(); } else { x86_spec_ctrl_base |= SPEC_CTRL_SSBD; - wrmsrl(MSR_IA32_SPEC_CTRL, x86_spec_ctrl_base); + write_spec_ctrl_current(x86_spec_ctrl_base, true); } } @@ -1701,7 +2025,7 @@ int arch_prctl_spec_ctrl_get(struct task_struct *task, unsigned long which) void x86_spec_ctrl_setup_ap(void) { if (boot_cpu_has(X86_FEATURE_MSR_SPEC_CTRL)) - wrmsrl(MSR_IA32_SPEC_CTRL, x86_spec_ctrl_base); + write_spec_ctrl_current(x86_spec_ctrl_base, true); if (ssb_mode == SPEC_STORE_BYPASS_DISABLE) x86_amd_ssb_disable(); @@ -1938,7 +2262,7 @@ static ssize_t mmio_stale_data_show_state(char *buf) static char *stibp_state(void) { - if (spectre_v2_in_eibrs_mode(spectre_v2_enabled)) + if (spectre_v2_in_ibrs_mode(spectre_v2_enabled)) return ""; switch (spectre_v2_user_stibp) { @@ -1994,6 +2318,24 @@ static ssize_t srbds_show_state(char *buf) return sprintf(buf, "%s\n", srbds_strings[srbds_mitigation]); } +static ssize_t retbleed_show_state(char *buf) +{ + if (retbleed_mitigation == RETBLEED_MITIGATION_UNRET) { + if (boot_cpu_data.x86_vendor != X86_VENDOR_AMD && + boot_cpu_data.x86_vendor != X86_VENDOR_HYGON) + return sprintf(buf, "Vulnerable: untrained return thunk on non-Zen uarch\n"); + + return sprintf(buf, "%s; SMT %s\n", + retbleed_strings[retbleed_mitigation], + !sched_smt_active() ? "disabled" : + spectre_v2_user_stibp == SPECTRE_V2_USER_STRICT || + spectre_v2_user_stibp == SPECTRE_V2_USER_STRICT_PREFERRED ? + "enabled with STIBP protection" : "vulnerable"); + } + + return sprintf(buf, "%s\n", retbleed_strings[retbleed_mitigation]); +} + static ssize_t cpu_show_common(struct device *dev, struct device_attribute *attr, char *buf, unsigned int bug) { @@ -2039,6 +2381,9 @@ static ssize_t cpu_show_common(struct device *dev, struct device_attribute *attr case X86_BUG_MMIO_STALE_DATA: return mmio_stale_data_show_state(buf); + case X86_BUG_RETBLEED: + return retbleed_show_state(buf); + default: break; } @@ -2095,4 +2440,9 @@ ssize_t cpu_show_mmio_stale_data(struct device *dev, struct device_attribute *at { return cpu_show_common(dev, attr, buf, X86_BUG_MMIO_STALE_DATA); } + +ssize_t cpu_show_retbleed(struct device *dev, struct device_attribute *attr, char *buf) +{ + return cpu_show_common(dev, attr, buf, X86_BUG_RETBLEED); +} #endif diff --git a/arch/x86/kernel/cpu/common.c b/arch/x86/kernel/cpu/common.c index 4730b0a58f24..736262a76a12 100644 --- a/arch/x86/kernel/cpu/common.c +++ b/arch/x86/kernel/cpu/common.c @@ -1205,48 +1205,60 @@ static const __initconst struct x86_cpu_id cpu_vuln_whitelist[] = { {} }; +#define VULNBL(vendor, family, model, blacklist) \ + X86_MATCH_VENDOR_FAM_MODEL(vendor, family, model, blacklist) + #define VULNBL_INTEL_STEPPINGS(model, steppings, issues) \ X86_MATCH_VENDOR_FAM_MODEL_STEPPINGS_FEATURE(INTEL, 6, \ INTEL_FAM6_##model, steppings, \ X86_FEATURE_ANY, issues) +#define VULNBL_AMD(family, blacklist) \ + VULNBL(AMD, family, X86_MODEL_ANY, blacklist) + +#define VULNBL_HYGON(family, blacklist) \ + VULNBL(HYGON, family, X86_MODEL_ANY, blacklist) + #define SRBDS BIT(0) /* CPU is affected by X86_BUG_MMIO_STALE_DATA */ #define MMIO BIT(1) /* CPU is affected by Shared Buffers Data Sampling (SBDS), a variant of X86_BUG_MMIO_STALE_DATA */ #define MMIO_SBDS BIT(2) +/* CPU is affected by RETbleed, speculating where you would not expect it */ +#define RETBLEED BIT(3) static const struct x86_cpu_id cpu_vuln_blacklist[] __initconst = { VULNBL_INTEL_STEPPINGS(IVYBRIDGE, X86_STEPPING_ANY, SRBDS), VULNBL_INTEL_STEPPINGS(HASWELL, X86_STEPPING_ANY, SRBDS), VULNBL_INTEL_STEPPINGS(HASWELL_L, X86_STEPPING_ANY, SRBDS), VULNBL_INTEL_STEPPINGS(HASWELL_G, X86_STEPPING_ANY, SRBDS), - VULNBL_INTEL_STEPPINGS(HASWELL_X, BIT(2) | BIT(4), MMIO), - VULNBL_INTEL_STEPPINGS(BROADWELL_D, X86_STEPPINGS(0x3, 0x5), MMIO), + VULNBL_INTEL_STEPPINGS(HASWELL_X, X86_STEPPING_ANY, MMIO), + VULNBL_INTEL_STEPPINGS(BROADWELL_D, X86_STEPPING_ANY, MMIO), VULNBL_INTEL_STEPPINGS(BROADWELL_G, X86_STEPPING_ANY, SRBDS), VULNBL_INTEL_STEPPINGS(BROADWELL_X, X86_STEPPING_ANY, MMIO), VULNBL_INTEL_STEPPINGS(BROADWELL, X86_STEPPING_ANY, SRBDS), - VULNBL_INTEL_STEPPINGS(SKYLAKE_L, X86_STEPPINGS(0x3, 0x3), SRBDS | MMIO), - VULNBL_INTEL_STEPPINGS(SKYLAKE_L, X86_STEPPING_ANY, SRBDS), - VULNBL_INTEL_STEPPINGS(SKYLAKE_X, BIT(3) | BIT(4) | BIT(6) | - BIT(7) | BIT(0xB), MMIO), - VULNBL_INTEL_STEPPINGS(SKYLAKE, X86_STEPPINGS(0x3, 0x3), SRBDS | MMIO), - VULNBL_INTEL_STEPPINGS(SKYLAKE, X86_STEPPING_ANY, SRBDS), - VULNBL_INTEL_STEPPINGS(KABYLAKE_L, X86_STEPPINGS(0x9, 0xC), SRBDS | MMIO), - VULNBL_INTEL_STEPPINGS(KABYLAKE_L, X86_STEPPINGS(0x0, 0x8), SRBDS), - VULNBL_INTEL_STEPPINGS(KABYLAKE, X86_STEPPINGS(0x9, 0xD), SRBDS | MMIO), - VULNBL_INTEL_STEPPINGS(KABYLAKE, X86_STEPPINGS(0x0, 0x8), SRBDS), - VULNBL_INTEL_STEPPINGS(ICELAKE_L, X86_STEPPINGS(0x5, 0x5), MMIO | MMIO_SBDS), - VULNBL_INTEL_STEPPINGS(ICELAKE_D, X86_STEPPINGS(0x1, 0x1), MMIO), - VULNBL_INTEL_STEPPINGS(ICELAKE_X, X86_STEPPINGS(0x4, 0x6), MMIO), - VULNBL_INTEL_STEPPINGS(COMETLAKE, BIT(2) | BIT(3) | BIT(5), MMIO | MMIO_SBDS), - VULNBL_INTEL_STEPPINGS(COMETLAKE_L, X86_STEPPINGS(0x1, 0x1), MMIO | MMIO_SBDS), - VULNBL_INTEL_STEPPINGS(COMETLAKE_L, X86_STEPPINGS(0x0, 0x0), MMIO), - VULNBL_INTEL_STEPPINGS(LAKEFIELD, X86_STEPPINGS(0x1, 0x1), MMIO | MMIO_SBDS), - VULNBL_INTEL_STEPPINGS(ROCKETLAKE, X86_STEPPINGS(0x1, 0x1), MMIO), - VULNBL_INTEL_STEPPINGS(ATOM_TREMONT, X86_STEPPINGS(0x1, 0x1), MMIO | MMIO_SBDS), + VULNBL_INTEL_STEPPINGS(SKYLAKE_L, X86_STEPPING_ANY, SRBDS | MMIO | RETBLEED), + VULNBL_INTEL_STEPPINGS(SKYLAKE_X, X86_STEPPING_ANY, MMIO | RETBLEED), + VULNBL_INTEL_STEPPINGS(SKYLAKE, X86_STEPPING_ANY, SRBDS | MMIO | RETBLEED), + VULNBL_INTEL_STEPPINGS(KABYLAKE_L, X86_STEPPING_ANY, SRBDS | MMIO | RETBLEED), + VULNBL_INTEL_STEPPINGS(KABYLAKE, X86_STEPPING_ANY, SRBDS | MMIO | RETBLEED), + VULNBL_INTEL_STEPPINGS(CANNONLAKE_L, X86_STEPPING_ANY, RETBLEED), + VULNBL_INTEL_STEPPINGS(ICELAKE_L, X86_STEPPING_ANY, MMIO | MMIO_SBDS | RETBLEED), + VULNBL_INTEL_STEPPINGS(ICELAKE_D, X86_STEPPING_ANY, MMIO), + VULNBL_INTEL_STEPPINGS(ICELAKE_X, X86_STEPPING_ANY, MMIO), + VULNBL_INTEL_STEPPINGS(COMETLAKE, X86_STEPPING_ANY, MMIO | MMIO_SBDS | RETBLEED), + VULNBL_INTEL_STEPPINGS(COMETLAKE_L, X86_STEPPINGS(0x0, 0x0), MMIO | RETBLEED), + VULNBL_INTEL_STEPPINGS(COMETLAKE_L, X86_STEPPING_ANY, MMIO | MMIO_SBDS | RETBLEED), + VULNBL_INTEL_STEPPINGS(LAKEFIELD, X86_STEPPING_ANY, MMIO | MMIO_SBDS | RETBLEED), + VULNBL_INTEL_STEPPINGS(ROCKETLAKE, X86_STEPPING_ANY, MMIO | RETBLEED), + VULNBL_INTEL_STEPPINGS(ATOM_TREMONT, X86_STEPPING_ANY, MMIO | MMIO_SBDS), VULNBL_INTEL_STEPPINGS(ATOM_TREMONT_D, X86_STEPPING_ANY, MMIO), - VULNBL_INTEL_STEPPINGS(ATOM_TREMONT_L, X86_STEPPINGS(0x0, 0x0), MMIO | MMIO_SBDS), + VULNBL_INTEL_STEPPINGS(ATOM_TREMONT_L, X86_STEPPING_ANY, MMIO | MMIO_SBDS), + + VULNBL_AMD(0x15, RETBLEED), + VULNBL_AMD(0x16, RETBLEED), + VULNBL_AMD(0x17, RETBLEED), + VULNBL_HYGON(0x18, RETBLEED), {} }; @@ -1348,6 +1360,11 @@ static void __init cpu_set_bug_bits(struct cpuinfo_x86 *c) !arch_cap_mmio_immune(ia32_cap)) setup_force_cpu_bug(X86_BUG_MMIO_STALE_DATA); + if (!cpu_has(c, X86_FEATURE_BTC_NO)) { + if (cpu_matches(cpu_vuln_blacklist, RETBLEED) || (ia32_cap & ARCH_CAP_RSBA)) + setup_force_cpu_bug(X86_BUG_RETBLEED); + } + if (cpu_matches(cpu_vuln_whitelist, NO_MELTDOWN)) return; diff --git a/arch/x86/kernel/cpu/cpu.h b/arch/x86/kernel/cpu/cpu.h index 2a8e584fc991..7c9b5893c30a 100644 --- a/arch/x86/kernel/cpu/cpu.h +++ b/arch/x86/kernel/cpu/cpu.h @@ -61,6 +61,8 @@ static inline void tsx_init(void) { } static inline void tsx_ap_init(void) { } #endif /* CONFIG_CPU_SUP_INTEL */ +extern void init_spectral_chicken(struct cpuinfo_x86 *c); + extern void get_cpu_cap(struct cpuinfo_x86 *c); extern void get_cpu_address_sizes(struct cpuinfo_x86 *c); extern void cpu_detect_cache_sizes(struct cpuinfo_x86 *c); diff --git a/arch/x86/kernel/cpu/cyrix.c b/arch/x86/kernel/cpu/cyrix.c index 7227c15299d0..9651275aecd1 100644 --- a/arch/x86/kernel/cpu/cyrix.c +++ b/arch/x86/kernel/cpu/cyrix.c @@ -1,6 +1,7 @@ // SPDX-License-Identifier: GPL-2.0 #include <linux/bitops.h> #include <linux/delay.h> +#include <linux/isa-dma.h> #include <linux/pci.h> #include <asm/dma.h> #include <linux/io.h> diff --git a/arch/x86/kernel/cpu/feat_ctl.c b/arch/x86/kernel/cpu/feat_ctl.c index da696eb4821a..993697e71854 100644 --- a/arch/x86/kernel/cpu/feat_ctl.c +++ b/arch/x86/kernel/cpu/feat_ctl.c @@ -15,6 +15,8 @@ enum vmx_feature_leafs { MISC_FEATURES = 0, PRIMARY_CTLS, SECONDARY_CTLS, + TERTIARY_CTLS_LOW, + TERTIARY_CTLS_HIGH, NR_VMX_FEATURE_WORDS, }; @@ -22,7 +24,7 @@ enum vmx_feature_leafs { static void init_vmx_capabilities(struct cpuinfo_x86 *c) { - u32 supported, funcs, ept, vpid, ign; + u32 supported, funcs, ept, vpid, ign, low, high; BUILD_BUG_ON(NVMXINTS != NR_VMX_FEATURE_WORDS); @@ -42,6 +44,11 @@ static void init_vmx_capabilities(struct cpuinfo_x86 *c) rdmsr_safe(MSR_IA32_VMX_PROCBASED_CTLS2, &ign, &supported); c->vmx_capability[SECONDARY_CTLS] = supported; + /* All 64 bits of tertiary controls MSR are allowed-1 settings. */ + rdmsr_safe(MSR_IA32_VMX_PROCBASED_CTLS3, &low, &high); + c->vmx_capability[TERTIARY_CTLS_LOW] = low; + c->vmx_capability[TERTIARY_CTLS_HIGH] = high; + rdmsr(MSR_IA32_VMX_PINBASED_CTLS, ign, supported); rdmsr_safe(MSR_IA32_VMX_VMFUNC, &ign, &funcs); diff --git a/arch/x86/kernel/cpu/hygon.c b/arch/x86/kernel/cpu/hygon.c index 3fcdda4c1e11..21fd425088fe 100644 --- a/arch/x86/kernel/cpu/hygon.c +++ b/arch/x86/kernel/cpu/hygon.c @@ -302,6 +302,12 @@ static void init_hygon(struct cpuinfo_x86 *c) /* get apicid instead of initial apic id from cpuid */ c->apicid = hard_smp_processor_id(); + /* + * XXX someone from Hygon needs to confirm this DTRT + * + init_spectral_chicken(c); + */ + set_cpu_cap(c, X86_FEATURE_ZEN); set_cpu_cap(c, X86_FEATURE_CPB); diff --git a/arch/x86/kernel/cpu/intel.c b/arch/x86/kernel/cpu/intel.c index fd5dead8371c..2d7ea5480ec3 100644 --- a/arch/x86/kernel/cpu/intel.c +++ b/arch/x86/kernel/cpu/intel.c @@ -682,9 +682,9 @@ static void init_intel(struct cpuinfo_x86 *c) unsigned int l1, l2; rdmsr(MSR_IA32_MISC_ENABLE, l1, l2); - if (!(l1 & (1<<11))) + if (!(l1 & MSR_IA32_MISC_ENABLE_BTS_UNAVAIL)) set_cpu_cap(c, X86_FEATURE_BTS); - if (!(l1 & (1<<12))) + if (!(l1 & MSR_IA32_MISC_ENABLE_PEBS_UNAVAIL)) set_cpu_cap(c, X86_FEATURE_PEBS); } @@ -1216,22 +1216,23 @@ static void bus_lock_init(void) { u64 val; - /* - * Warn and fatal are handled by #AC for split lock if #AC for - * split lock is supported. - */ - if (!boot_cpu_has(X86_FEATURE_BUS_LOCK_DETECT) || - (boot_cpu_has(X86_FEATURE_SPLIT_LOCK_DETECT) && - (sld_state == sld_warn || sld_state == sld_fatal)) || - sld_state == sld_off) + if (!boot_cpu_has(X86_FEATURE_BUS_LOCK_DETECT)) return; - /* - * Enable #DB for bus lock. All bus locks are handled in #DB except - * split locks are handled in #AC in the fatal case. - */ rdmsrl(MSR_IA32_DEBUGCTLMSR, val); - val |= DEBUGCTLMSR_BUS_LOCK_DETECT; + + if ((boot_cpu_has(X86_FEATURE_SPLIT_LOCK_DETECT) && + (sld_state == sld_warn || sld_state == sld_fatal)) || + sld_state == sld_off) { + /* + * Warn and fatal are handled by #AC for split lock if #AC for + * split lock is supported. + */ + val &= ~DEBUGCTLMSR_BUS_LOCK_DETECT; + } else { + val |= DEBUGCTLMSR_BUS_LOCK_DETECT; + } + wrmsrl(MSR_IA32_DEBUGCTLMSR, val); } diff --git a/arch/x86/kernel/cpu/mce/inject.c b/arch/x86/kernel/cpu/mce/inject.c index 5fbd7ffb3233..12cf2e7ca33c 100644 --- a/arch/x86/kernel/cpu/mce/inject.c +++ b/arch/x86/kernel/cpu/mce/inject.c @@ -33,6 +33,8 @@ #include "internal.h" +static bool hw_injection_possible = true; + /* * Collect all the MCi_XXX settings */ @@ -339,6 +341,8 @@ static int __set_inj(const char *buf) for (i = 0; i < N_INJ_TYPES; i++) { if (!strncmp(flags_options[i], buf, strlen(flags_options[i]))) { + if (i > SW_INJ && !hw_injection_possible) + continue; inj_type = i; return 0; } @@ -717,11 +721,54 @@ static void __init debugfs_init(void) &i_mce, dfs_fls[i].fops); } +static void check_hw_inj_possible(void) +{ + int cpu; + u8 bank; + + /* + * This behavior exists only on SMCA systems though its not directly + * related to SMCA. + */ + if (!cpu_feature_enabled(X86_FEATURE_SMCA)) + return; + + cpu = get_cpu(); + + for (bank = 0; bank < MAX_NR_BANKS; ++bank) { + u64 status = MCI_STATUS_VAL, ipid; + + /* Check whether bank is populated */ + rdmsrl(MSR_AMD64_SMCA_MCx_IPID(bank), ipid); + if (!ipid) + continue; + + toggle_hw_mce_inject(cpu, true); + + wrmsrl_safe(mca_msr_reg(bank, MCA_STATUS), status); + rdmsrl_safe(mca_msr_reg(bank, MCA_STATUS), &status); + + if (!status) { + hw_injection_possible = false; + pr_warn("Platform does not allow *hardware* error injection." + "Try using APEI EINJ instead.\n"); + } + + toggle_hw_mce_inject(cpu, false); + + break; + } + + put_cpu(); +} + static int __init inject_init(void) { if (!alloc_cpumask_var(&mce_inject_cpumask, GFP_KERNEL)) return -ENOMEM; + check_hw_inj_possible(); + debugfs_init(); register_nmi_handler(NMI_LOCAL, mce_raise_notify, 0, "mce_notify"); diff --git a/arch/x86/kernel/cpu/mce/internal.h b/arch/x86/kernel/cpu/mce/internal.h index 4ae0e603f7fa..7e03f5b7f6bd 100644 --- a/arch/x86/kernel/cpu/mce/internal.h +++ b/arch/x86/kernel/cpu/mce/internal.h @@ -211,7 +211,7 @@ noinstr u64 mce_rdmsrl(u32 msr); static __always_inline u32 mca_msr_reg(int bank, enum mca_msr reg) { - if (mce_flags.smca) { + if (cpu_feature_enabled(X86_FEATURE_SMCA)) { switch (reg) { case MCA_CTL: return MSR_AMD64_SMCA_MCx_CTL(bank); case MCA_ADDR: return MSR_AMD64_SMCA_MCx_ADDR(bank); diff --git a/arch/x86/kernel/cpu/rdrand.c b/arch/x86/kernel/cpu/rdrand.c index c4be62058dd9..26a427fa84ea 100644 --- a/arch/x86/kernel/cpu/rdrand.c +++ b/arch/x86/kernel/cpu/rdrand.c @@ -11,56 +11,39 @@ #include <asm/archrandom.h> #include <asm/sections.h> -static int __init x86_rdrand_setup(char *s) -{ - setup_clear_cpu_cap(X86_FEATURE_RDRAND); - setup_clear_cpu_cap(X86_FEATURE_RDSEED); - return 1; -} -__setup("nordrand", x86_rdrand_setup); - /* * RDRAND has Built-In-Self-Test (BIST) that runs on every invocation. - * Run the instruction a few times as a sanity check. - * If it fails, it is simple to disable RDRAND here. + * Run the instruction a few times as a sanity check. Also make sure + * it's not outputting the same value over and over, which has happened + * as a result of past CPU bugs. + * + * If it fails, it is simple to disable RDRAND and RDSEED here. */ -#define SANITY_CHECK_LOOPS 8 -#ifdef CONFIG_ARCH_RANDOM void x86_init_rdrand(struct cpuinfo_x86 *c) { - unsigned int changed = 0; - unsigned long tmp, prev; - int i; + enum { SAMPLES = 8, MIN_CHANGE = 5 }; + unsigned long sample, prev; + bool failure = false; + size_t i, changed; if (!cpu_has(c, X86_FEATURE_RDRAND)) return; - for (i = 0; i < SANITY_CHECK_LOOPS; i++) { - if (!rdrand_long(&tmp)) { - clear_cpu_cap(c, X86_FEATURE_RDRAND); - pr_warn_once("rdrand: disabled\n"); - return; + for (changed = 0, i = 0; i < SAMPLES; ++i) { + if (!rdrand_long(&sample)) { + failure = true; + break; } + changed += i && sample != prev; + prev = sample; } + if (changed < MIN_CHANGE) + failure = true; - /* - * Stupid sanity-check whether RDRAND does *actually* generate - * some at least random-looking data. - */ - prev = tmp; - for (i = 0; i < SANITY_CHECK_LOOPS; i++) { - if (rdrand_long(&tmp)) { - if (prev != tmp) - changed++; - - prev = tmp; - } + if (failure) { + clear_cpu_cap(c, X86_FEATURE_RDRAND); + clear_cpu_cap(c, X86_FEATURE_RDSEED); + pr_emerg("RDRAND is not reliable on this platform; disabling.\n"); } - - if (WARN_ON_ONCE(!changed)) - pr_emerg( -"RDRAND gives funky smelling output, might consider not using it by booting with \"nordrand\""); - } -#endif diff --git a/arch/x86/kernel/cpu/scattered.c b/arch/x86/kernel/cpu/scattered.c index dbaa8326d6f2..fd44b54c90d5 100644 --- a/arch/x86/kernel/cpu/scattered.c +++ b/arch/x86/kernel/cpu/scattered.c @@ -27,6 +27,7 @@ static const struct cpuid_bit cpuid_bits[] = { { X86_FEATURE_APERFMPERF, CPUID_ECX, 0, 0x00000006, 0 }, { X86_FEATURE_EPB, CPUID_ECX, 3, 0x00000006, 0 }, { X86_FEATURE_INTEL_PPIN, CPUID_EBX, 0, 0x00000007, 1 }, + { X86_FEATURE_RRSBA_CTRL, CPUID_EDX, 2, 0x00000007, 2 }, { X86_FEATURE_CQM_LLC, CPUID_EDX, 1, 0x0000000f, 0 }, { X86_FEATURE_CQM_OCCUP_LLC, CPUID_EDX, 0, 0x0000000f, 1 }, { X86_FEATURE_CQM_MBM_TOTAL, CPUID_EDX, 1, 0x0000000f, 1 }, diff --git a/arch/x86/kernel/cpu/sgx/encl.c b/arch/x86/kernel/cpu/sgx/encl.c index 19876ebfb504..24c1bb8eb196 100644 --- a/arch/x86/kernel/cpu/sgx/encl.c +++ b/arch/x86/kernel/cpu/sgx/encl.c @@ -232,25 +232,10 @@ static struct sgx_epc_page *sgx_encl_eldu(struct sgx_encl_page *encl_page, return epc_page; } -static struct sgx_encl_page *sgx_encl_load_page(struct sgx_encl *encl, - unsigned long addr, - unsigned long vm_flags) +static struct sgx_encl_page *__sgx_encl_load_page(struct sgx_encl *encl, + struct sgx_encl_page *entry) { - unsigned long vm_prot_bits = vm_flags & (VM_READ | VM_WRITE | VM_EXEC); struct sgx_epc_page *epc_page; - struct sgx_encl_page *entry; - - entry = xa_load(&encl->page_array, PFN_DOWN(addr)); - if (!entry) - return ERR_PTR(-EFAULT); - - /* - * Verify that the faulted page has equal or higher build time - * permissions than the VMA permissions (i.e. the subset of {VM_READ, - * VM_WRITE, VM_EXECUTE} in vma->vm_flags). - */ - if ((entry->vm_max_prot_bits & vm_prot_bits) != vm_prot_bits) - return ERR_PTR(-EFAULT); /* Entry successfully located. */ if (entry->epc_page) { @@ -276,6 +261,146 @@ static struct sgx_encl_page *sgx_encl_load_page(struct sgx_encl *encl, return entry; } +static struct sgx_encl_page *sgx_encl_load_page_in_vma(struct sgx_encl *encl, + unsigned long addr, + unsigned long vm_flags) +{ + unsigned long vm_prot_bits = vm_flags & (VM_READ | VM_WRITE | VM_EXEC); + struct sgx_encl_page *entry; + + entry = xa_load(&encl->page_array, PFN_DOWN(addr)); + if (!entry) + return ERR_PTR(-EFAULT); + + /* + * Verify that the page has equal or higher build time + * permissions than the VMA permissions (i.e. the subset of {VM_READ, + * VM_WRITE, VM_EXECUTE} in vma->vm_flags). + */ + if ((entry->vm_max_prot_bits & vm_prot_bits) != vm_prot_bits) + return ERR_PTR(-EFAULT); + + return __sgx_encl_load_page(encl, entry); +} + +struct sgx_encl_page *sgx_encl_load_page(struct sgx_encl *encl, + unsigned long addr) +{ + struct sgx_encl_page *entry; + + entry = xa_load(&encl->page_array, PFN_DOWN(addr)); + if (!entry) + return ERR_PTR(-EFAULT); + + return __sgx_encl_load_page(encl, entry); +} + +/** + * sgx_encl_eaug_page() - Dynamically add page to initialized enclave + * @vma: VMA obtained from fault info from where page is accessed + * @encl: enclave accessing the page + * @addr: address that triggered the page fault + * + * When an initialized enclave accesses a page with no backing EPC page + * on a SGX2 system then the EPC can be added dynamically via the SGX2 + * ENCLS[EAUG] instruction. + * + * Returns: Appropriate vm_fault_t: VM_FAULT_NOPAGE when PTE was installed + * successfully, VM_FAULT_SIGBUS or VM_FAULT_OOM as error otherwise. + */ +static vm_fault_t sgx_encl_eaug_page(struct vm_area_struct *vma, + struct sgx_encl *encl, unsigned long addr) +{ + vm_fault_t vmret = VM_FAULT_SIGBUS; + struct sgx_pageinfo pginfo = {0}; + struct sgx_encl_page *encl_page; + struct sgx_epc_page *epc_page; + struct sgx_va_page *va_page; + unsigned long phys_addr; + u64 secinfo_flags; + int ret; + + if (!test_bit(SGX_ENCL_INITIALIZED, &encl->flags)) + return VM_FAULT_SIGBUS; + + /* + * Ignore internal permission checking for dynamically added pages. + * They matter only for data added during the pre-initialization + * phase. The enclave decides the permissions by the means of + * EACCEPT, EACCEPTCOPY and EMODPE. + */ + secinfo_flags = SGX_SECINFO_R | SGX_SECINFO_W | SGX_SECINFO_X; + encl_page = sgx_encl_page_alloc(encl, addr - encl->base, secinfo_flags); + if (IS_ERR(encl_page)) + return VM_FAULT_OOM; + + mutex_lock(&encl->lock); + + epc_page = sgx_alloc_epc_page(encl_page, false); + if (IS_ERR(epc_page)) { + if (PTR_ERR(epc_page) == -EBUSY) + vmret = VM_FAULT_NOPAGE; + goto err_out_unlock; + } + + va_page = sgx_encl_grow(encl, false); + if (IS_ERR(va_page)) + goto err_out_epc; + + if (va_page) + list_add(&va_page->list, &encl->va_pages); + + ret = xa_insert(&encl->page_array, PFN_DOWN(encl_page->desc), + encl_page, GFP_KERNEL); + /* + * If ret == -EBUSY then page was created in another flow while + * running without encl->lock + */ + if (ret) + goto err_out_shrink; + + pginfo.secs = (unsigned long)sgx_get_epc_virt_addr(encl->secs.epc_page); + pginfo.addr = encl_page->desc & PAGE_MASK; + pginfo.metadata = 0; + + ret = __eaug(&pginfo, sgx_get_epc_virt_addr(epc_page)); + if (ret) + goto err_out; + + encl_page->encl = encl; + encl_page->epc_page = epc_page; + encl_page->type = SGX_PAGE_TYPE_REG; + encl->secs_child_cnt++; + + sgx_mark_page_reclaimable(encl_page->epc_page); + + phys_addr = sgx_get_epc_phys_addr(epc_page); + /* + * Do not undo everything when creating PTE entry fails - next #PF + * would find page ready for a PTE. + */ + vmret = vmf_insert_pfn(vma, addr, PFN_DOWN(phys_addr)); + if (vmret != VM_FAULT_NOPAGE) { + mutex_unlock(&encl->lock); + return VM_FAULT_SIGBUS; + } + mutex_unlock(&encl->lock); + return VM_FAULT_NOPAGE; + +err_out: + xa_erase(&encl->page_array, PFN_DOWN(encl_page->desc)); + +err_out_shrink: + sgx_encl_shrink(encl, va_page); +err_out_epc: + sgx_encl_free_epc_page(epc_page); +err_out_unlock: + mutex_unlock(&encl->lock); + kfree(encl_page); + + return vmret; +} + static vm_fault_t sgx_vma_fault(struct vm_fault *vmf) { unsigned long addr = (unsigned long)vmf->address; @@ -295,9 +420,20 @@ static vm_fault_t sgx_vma_fault(struct vm_fault *vmf) if (unlikely(!encl)) return VM_FAULT_SIGBUS; + /* + * The page_array keeps track of all enclave pages, whether they + * are swapped out or not. If there is no entry for this page and + * the system supports SGX2 then it is possible to dynamically add + * a new enclave page. This is only possible for an initialized + * enclave that will be checked for right away. + */ + if (cpu_feature_enabled(X86_FEATURE_SGX2) && + (!xa_load(&encl->page_array, PFN_DOWN(addr)))) + return sgx_encl_eaug_page(vma, encl, addr); + mutex_lock(&encl->lock); - entry = sgx_encl_load_page(encl, addr, vma->vm_flags); + entry = sgx_encl_load_page_in_vma(encl, addr, vma->vm_flags); if (IS_ERR(entry)) { mutex_unlock(&encl->lock); @@ -367,6 +503,11 @@ int sgx_encl_may_map(struct sgx_encl *encl, unsigned long start, XA_STATE(xas, &encl->page_array, PFN_DOWN(start)); + /* Disallow mapping outside enclave's address range. */ + if (test_bit(SGX_ENCL_INITIALIZED, &encl->flags) && + (start < encl->base || end > encl->base + encl->size)) + return -EACCES; + /* * Disallow READ_IMPLIES_EXEC tasks as their VMA permissions might * conflict with the enclave page permissions. @@ -445,7 +586,7 @@ static struct sgx_encl_page *sgx_encl_reserve_page(struct sgx_encl *encl, for ( ; ; ) { mutex_lock(&encl->lock); - entry = sgx_encl_load_page(encl, addr, vm_flags); + entry = sgx_encl_load_page_in_vma(encl, addr, vm_flags); if (PTR_ERR(entry) != -EBUSY) break; @@ -687,7 +828,7 @@ int sgx_encl_mm_add(struct sgx_encl *encl, struct mm_struct *mm) spin_lock(&encl->mm_lock); list_add_rcu(&encl_mm->list, &encl->mm_list); - /* Pairs with smp_rmb() in sgx_reclaimer_block(). */ + /* Pairs with smp_rmb() in sgx_zap_enclave_ptes(). */ smp_wmb(); encl->mm_list_version++; spin_unlock(&encl->mm_lock); @@ -695,6 +836,73 @@ int sgx_encl_mm_add(struct sgx_encl *encl, struct mm_struct *mm) return 0; } +/** + * sgx_encl_cpumask() - Query which CPUs might be accessing the enclave + * @encl: the enclave + * + * Some SGX functions require that no cached linear-to-physical address + * mappings are present before they can succeed. For example, ENCLS[EWB] + * copies a page from the enclave page cache to regular main memory but + * it fails if it cannot ensure that there are no cached + * linear-to-physical address mappings referring to the page. + * + * SGX hardware flushes all cached linear-to-physical mappings on a CPU + * when an enclave is exited via ENCLU[EEXIT] or an Asynchronous Enclave + * Exit (AEX). Exiting an enclave will thus ensure cached linear-to-physical + * address mappings are cleared but coordination with the tracking done within + * the SGX hardware is needed to support the SGX functions that depend on this + * cache clearing. + * + * When the ENCLS[ETRACK] function is issued on an enclave the hardware + * tracks threads operating inside the enclave at that time. The SGX + * hardware tracking require that all the identified threads must have + * exited the enclave in order to flush the mappings before a function such + * as ENCLS[EWB] will be permitted + * + * The following flow is used to support SGX functions that require that + * no cached linear-to-physical address mappings are present: + * 1) Execute ENCLS[ETRACK] to initiate hardware tracking. + * 2) Use this function (sgx_encl_cpumask()) to query which CPUs might be + * accessing the enclave. + * 3) Send IPI to identified CPUs, kicking them out of the enclave and + * thus flushing all locally cached linear-to-physical address mappings. + * 4) Execute SGX function. + * + * Context: It is required to call this function after ENCLS[ETRACK]. + * This will ensure that if any new mm appears (racing with + * sgx_encl_mm_add()) then the new mm will enter into the + * enclave with fresh linear-to-physical address mappings. + * + * It is required that all IPIs are completed before a new + * ENCLS[ETRACK] is issued so be sure to protect steps 1 to 3 + * of the above flow with the enclave's mutex. + * + * Return: cpumask of CPUs that might be accessing @encl + */ +const cpumask_t *sgx_encl_cpumask(struct sgx_encl *encl) +{ + cpumask_t *cpumask = &encl->cpumask; + struct sgx_encl_mm *encl_mm; + int idx; + + cpumask_clear(cpumask); + + idx = srcu_read_lock(&encl->srcu); + + list_for_each_entry_rcu(encl_mm, &encl->mm_list, list) { + if (!mmget_not_zero(encl_mm->mm)) + continue; + + cpumask_or(cpumask, cpumask, mm_cpumask(encl_mm->mm)); + + mmput_async(encl_mm->mm); + } + + srcu_read_unlock(&encl->srcu, idx); + + return cpumask; +} + static struct page *sgx_encl_get_backing_page(struct sgx_encl *encl, pgoff_t index) { @@ -735,7 +943,6 @@ static int sgx_encl_get_backing(struct sgx_encl *encl, unsigned long page_index, return PTR_ERR(pcmd); } - backing->page_index = page_index; backing->contents = contents; backing->pcmd = pcmd; backing->pcmd_offset = page_pcmd_off & (PAGE_SIZE - 1); @@ -902,8 +1109,85 @@ int sgx_encl_test_and_clear_young(struct mm_struct *mm, return ret; } +struct sgx_encl_page *sgx_encl_page_alloc(struct sgx_encl *encl, + unsigned long offset, + u64 secinfo_flags) +{ + struct sgx_encl_page *encl_page; + unsigned long prot; + + encl_page = kzalloc(sizeof(*encl_page), GFP_KERNEL); + if (!encl_page) + return ERR_PTR(-ENOMEM); + + encl_page->desc = encl->base + offset; + encl_page->encl = encl; + + prot = _calc_vm_trans(secinfo_flags, SGX_SECINFO_R, PROT_READ) | + _calc_vm_trans(secinfo_flags, SGX_SECINFO_W, PROT_WRITE) | + _calc_vm_trans(secinfo_flags, SGX_SECINFO_X, PROT_EXEC); + + /* + * TCS pages must always RW set for CPU access while the SECINFO + * permissions are *always* zero - the CPU ignores the user provided + * values and silently overwrites them with zero permissions. + */ + if ((secinfo_flags & SGX_SECINFO_PAGE_TYPE_MASK) == SGX_SECINFO_TCS) + prot |= PROT_READ | PROT_WRITE; + + /* Calculate maximum of the VM flags for the page. */ + encl_page->vm_max_prot_bits = calc_vm_prot_bits(prot, 0); + + return encl_page; +} + +/** + * sgx_zap_enclave_ptes() - remove PTEs mapping the address from enclave + * @encl: the enclave + * @addr: page aligned pointer to single page for which PTEs will be removed + * + * Multiple VMAs may have an enclave page mapped. Remove the PTE mapping + * @addr from each VMA. Ensure that page fault handler is ready to handle + * new mappings of @addr before calling this function. + */ +void sgx_zap_enclave_ptes(struct sgx_encl *encl, unsigned long addr) +{ + unsigned long mm_list_version; + struct sgx_encl_mm *encl_mm; + struct vm_area_struct *vma; + int idx, ret; + + do { + mm_list_version = encl->mm_list_version; + + /* Pairs with smp_wmb() in sgx_encl_mm_add(). */ + smp_rmb(); + + idx = srcu_read_lock(&encl->srcu); + + list_for_each_entry_rcu(encl_mm, &encl->mm_list, list) { + if (!mmget_not_zero(encl_mm->mm)) + continue; + + mmap_read_lock(encl_mm->mm); + + ret = sgx_encl_find(encl_mm->mm, addr, &vma); + if (!ret && encl == vma->vm_private_data) + zap_vma_ptes(vma, addr, PAGE_SIZE); + + mmap_read_unlock(encl_mm->mm); + + mmput_async(encl_mm->mm); + } + + srcu_read_unlock(&encl->srcu, idx); + } while (unlikely(encl->mm_list_version != mm_list_version)); +} + /** * sgx_alloc_va_page() - Allocate a Version Array (VA) page + * @reclaim: Reclaim EPC pages directly if none available. Enclave + * mutex should not be held if this is set. * * Allocate a free EPC page and convert it to a Version Array (VA) page. * @@ -911,12 +1195,12 @@ int sgx_encl_test_and_clear_young(struct mm_struct *mm, * a VA page, * -errno otherwise */ -struct sgx_epc_page *sgx_alloc_va_page(void) +struct sgx_epc_page *sgx_alloc_va_page(bool reclaim) { struct sgx_epc_page *epc_page; int ret; - epc_page = sgx_alloc_epc_page(NULL, true); + epc_page = sgx_alloc_epc_page(NULL, reclaim); if (IS_ERR(epc_page)) return ERR_CAST(epc_page); diff --git a/arch/x86/kernel/cpu/sgx/encl.h b/arch/x86/kernel/cpu/sgx/encl.h index 332ef3568267..a65a952116fd 100644 --- a/arch/x86/kernel/cpu/sgx/encl.h +++ b/arch/x86/kernel/cpu/sgx/encl.h @@ -27,7 +27,8 @@ struct sgx_encl_page { unsigned long desc; - unsigned long vm_max_prot_bits; + unsigned long vm_max_prot_bits:8; + enum sgx_page_type type:16; struct sgx_epc_page *epc_page; struct sgx_encl *encl; struct sgx_va_page *va_page; @@ -78,7 +79,6 @@ struct sgx_va_page { }; struct sgx_backing { - pgoff_t page_index; struct page *contents; struct page *pcmd; unsigned long pcmd_offset; @@ -106,6 +106,7 @@ int sgx_encl_may_map(struct sgx_encl *encl, unsigned long start, bool current_is_ksgxd(void); void sgx_encl_release(struct kref *ref); int sgx_encl_mm_add(struct sgx_encl *encl, struct mm_struct *mm); +const cpumask_t *sgx_encl_cpumask(struct sgx_encl *encl); int sgx_encl_lookup_backing(struct sgx_encl *encl, unsigned long page_index, struct sgx_backing *backing); int sgx_encl_alloc_backing(struct sgx_encl *encl, unsigned long page_index, @@ -113,11 +114,18 @@ int sgx_encl_alloc_backing(struct sgx_encl *encl, unsigned long page_index, void sgx_encl_put_backing(struct sgx_backing *backing); int sgx_encl_test_and_clear_young(struct mm_struct *mm, struct sgx_encl_page *page); - -struct sgx_epc_page *sgx_alloc_va_page(void); +struct sgx_encl_page *sgx_encl_page_alloc(struct sgx_encl *encl, + unsigned long offset, + u64 secinfo_flags); +void sgx_zap_enclave_ptes(struct sgx_encl *encl, unsigned long addr); +struct sgx_epc_page *sgx_alloc_va_page(bool reclaim); unsigned int sgx_alloc_va_slot(struct sgx_va_page *va_page); void sgx_free_va_slot(struct sgx_va_page *va_page, unsigned int offset); bool sgx_va_page_full(struct sgx_va_page *va_page); void sgx_encl_free_epc_page(struct sgx_epc_page *page); +struct sgx_encl_page *sgx_encl_load_page(struct sgx_encl *encl, + unsigned long addr); +struct sgx_va_page *sgx_encl_grow(struct sgx_encl *encl, bool reclaim); +void sgx_encl_shrink(struct sgx_encl *encl, struct sgx_va_page *va_page); #endif /* _X86_ENCL_H */ diff --git a/arch/x86/kernel/cpu/sgx/encls.h b/arch/x86/kernel/cpu/sgx/encls.h index fa04a73daf9c..99004b02e2ed 100644 --- a/arch/x86/kernel/cpu/sgx/encls.h +++ b/arch/x86/kernel/cpu/sgx/encls.h @@ -136,57 +136,71 @@ static inline bool encls_failed(int ret) ret; \ }) +/* Initialize an EPC page into an SGX Enclave Control Structure (SECS) page. */ static inline int __ecreate(struct sgx_pageinfo *pginfo, void *secs) { return __encls_2(ECREATE, pginfo, secs); } +/* Hash a 256 byte region of an enclave page to SECS:MRENCLAVE. */ static inline int __eextend(void *secs, void *addr) { return __encls_2(EEXTEND, secs, addr); } +/* + * Associate an EPC page to an enclave either as a REG or TCS page + * populated with the provided data. + */ static inline int __eadd(struct sgx_pageinfo *pginfo, void *addr) { return __encls_2(EADD, pginfo, addr); } +/* Finalize enclave build, initialize enclave for user code execution. */ static inline int __einit(void *sigstruct, void *token, void *secs) { return __encls_ret_3(EINIT, sigstruct, secs, token); } +/* Disassociate EPC page from its enclave and mark it as unused. */ static inline int __eremove(void *addr) { return __encls_ret_1(EREMOVE, addr); } +/* Copy data to an EPC page belonging to a debug enclave. */ static inline int __edbgwr(void *addr, unsigned long *data) { return __encls_2(EDGBWR, *data, addr); } +/* Copy data from an EPC page belonging to a debug enclave. */ static inline int __edbgrd(void *addr, unsigned long *data) { return __encls_1_1(EDGBRD, *data, addr); } +/* Track that software has completed the required TLB address clears. */ static inline int __etrack(void *addr) { return __encls_ret_1(ETRACK, addr); } +/* Load, verify, and unblock an EPC page. */ static inline int __eldu(struct sgx_pageinfo *pginfo, void *addr, void *va) { return __encls_ret_3(ELDU, pginfo, addr, va); } +/* Make EPC page inaccessible to enclave, ready to be written to memory. */ static inline int __eblock(void *addr) { return __encls_ret_1(EBLOCK, addr); } +/* Initialize an EPC page into a Version Array (VA) page. */ static inline int __epa(void *addr) { unsigned long rbx = SGX_PAGE_TYPE_VA; @@ -194,10 +208,29 @@ static inline int __epa(void *addr) return __encls_2(EPA, rbx, addr); } +/* Invalidate an EPC page and write it out to main memory. */ static inline int __ewb(struct sgx_pageinfo *pginfo, void *addr, void *va) { return __encls_ret_3(EWB, pginfo, addr, va); } +/* Restrict the EPCM permissions of an EPC page. */ +static inline int __emodpr(struct sgx_secinfo *secinfo, void *addr) +{ + return __encls_ret_2(EMODPR, secinfo, addr); +} + +/* Change the type of an EPC page. */ +static inline int __emodt(struct sgx_secinfo *secinfo, void *addr) +{ + return __encls_ret_2(EMODT, secinfo, addr); +} + +/* Zero a page of EPC memory and add it to an initialized enclave. */ +static inline int __eaug(struct sgx_pageinfo *pginfo, void *addr) +{ + return __encls_2(EAUG, pginfo, addr); +} + #endif /* _X86_ENCLS_H */ diff --git a/arch/x86/kernel/cpu/sgx/ioctl.c b/arch/x86/kernel/cpu/sgx/ioctl.c index 83df20e3e633..ebe79d60619f 100644 --- a/arch/x86/kernel/cpu/sgx/ioctl.c +++ b/arch/x86/kernel/cpu/sgx/ioctl.c @@ -17,7 +17,7 @@ #include "encl.h" #include "encls.h" -static struct sgx_va_page *sgx_encl_grow(struct sgx_encl *encl) +struct sgx_va_page *sgx_encl_grow(struct sgx_encl *encl, bool reclaim) { struct sgx_va_page *va_page = NULL; void *err; @@ -30,7 +30,7 @@ static struct sgx_va_page *sgx_encl_grow(struct sgx_encl *encl) if (!va_page) return ERR_PTR(-ENOMEM); - va_page->epc_page = sgx_alloc_va_page(); + va_page->epc_page = sgx_alloc_va_page(reclaim); if (IS_ERR(va_page->epc_page)) { err = ERR_CAST(va_page->epc_page); kfree(va_page); @@ -43,7 +43,7 @@ static struct sgx_va_page *sgx_encl_grow(struct sgx_encl *encl) return va_page; } -static void sgx_encl_shrink(struct sgx_encl *encl, struct sgx_va_page *va_page) +void sgx_encl_shrink(struct sgx_encl *encl, struct sgx_va_page *va_page) { encl->page_cnt--; @@ -64,7 +64,7 @@ static int sgx_encl_create(struct sgx_encl *encl, struct sgx_secs *secs) struct file *backing; long ret; - va_page = sgx_encl_grow(encl); + va_page = sgx_encl_grow(encl, true); if (IS_ERR(va_page)) return PTR_ERR(va_page); else if (va_page) @@ -107,6 +107,7 @@ static int sgx_encl_create(struct sgx_encl *encl, struct sgx_secs *secs) set_bit(SGX_ENCL_DEBUG, &encl->flags); encl->secs.encl = encl; + encl->secs.type = SGX_PAGE_TYPE_SECS; encl->base = secs->base; encl->size = secs->size; encl->attributes = secs->attributes; @@ -168,38 +169,6 @@ static long sgx_ioc_enclave_create(struct sgx_encl *encl, void __user *arg) return ret; } -static struct sgx_encl_page *sgx_encl_page_alloc(struct sgx_encl *encl, - unsigned long offset, - u64 secinfo_flags) -{ - struct sgx_encl_page *encl_page; - unsigned long prot; - - encl_page = kzalloc(sizeof(*encl_page), GFP_KERNEL); - if (!encl_page) - return ERR_PTR(-ENOMEM); - - encl_page->desc = encl->base + offset; - encl_page->encl = encl; - - prot = _calc_vm_trans(secinfo_flags, SGX_SECINFO_R, PROT_READ) | - _calc_vm_trans(secinfo_flags, SGX_SECINFO_W, PROT_WRITE) | - _calc_vm_trans(secinfo_flags, SGX_SECINFO_X, PROT_EXEC); - - /* - * TCS pages must always RW set for CPU access while the SECINFO - * permissions are *always* zero - the CPU ignores the user provided - * values and silently overwrites them with zero permissions. - */ - if ((secinfo_flags & SGX_SECINFO_PAGE_TYPE_MASK) == SGX_SECINFO_TCS) - prot |= PROT_READ | PROT_WRITE; - - /* Calculate maximum of the VM flags for the page. */ - encl_page->vm_max_prot_bits = calc_vm_prot_bits(prot, 0); - - return encl_page; -} - static int sgx_validate_secinfo(struct sgx_secinfo *secinfo) { u64 perm = secinfo->flags & SGX_SECINFO_PERMISSION_MASK; @@ -306,7 +275,7 @@ static int sgx_encl_add_page(struct sgx_encl *encl, unsigned long src, return PTR_ERR(epc_page); } - va_page = sgx_encl_grow(encl); + va_page = sgx_encl_grow(encl, true); if (IS_ERR(va_page)) { ret = PTR_ERR(va_page); goto err_out_free; @@ -344,6 +313,7 @@ static int sgx_encl_add_page(struct sgx_encl *encl, unsigned long src, */ encl_page->encl = encl; encl_page->epc_page = epc_page; + encl_page->type = (secinfo->flags & SGX_SECINFO_PAGE_TYPE_MASK) >> 8; encl->secs_child_cnt++; if (flags & SGX_PAGE_MEASURE) { @@ -372,6 +342,26 @@ err_out_free: return ret; } +/* + * Ensure user provided offset and length values are valid for + * an enclave. + */ +static int sgx_validate_offset_length(struct sgx_encl *encl, + unsigned long offset, + unsigned long length) +{ + if (!IS_ALIGNED(offset, PAGE_SIZE)) + return -EINVAL; + + if (!length || !IS_ALIGNED(length, PAGE_SIZE)) + return -EINVAL; + + if (offset + length - PAGE_SIZE >= encl->size) + return -EINVAL; + + return 0; +} + /** * sgx_ioc_enclave_add_pages() - The handler for %SGX_IOC_ENCLAVE_ADD_PAGES * @encl: an enclave pointer @@ -425,14 +415,10 @@ static long sgx_ioc_enclave_add_pages(struct sgx_encl *encl, void __user *arg) if (copy_from_user(&add_arg, arg, sizeof(add_arg))) return -EFAULT; - if (!IS_ALIGNED(add_arg.offset, PAGE_SIZE) || - !IS_ALIGNED(add_arg.src, PAGE_SIZE)) - return -EINVAL; - - if (!add_arg.length || add_arg.length & (PAGE_SIZE - 1)) + if (!IS_ALIGNED(add_arg.src, PAGE_SIZE)) return -EINVAL; - if (add_arg.offset + add_arg.length - PAGE_SIZE >= encl->size) + if (sgx_validate_offset_length(encl, add_arg.offset, add_arg.length)) return -EINVAL; if (copy_from_user(&secinfo, (void __user *)add_arg.secinfo, @@ -674,6 +660,565 @@ static long sgx_ioc_enclave_provision(struct sgx_encl *encl, void __user *arg) return sgx_set_attribute(&encl->attributes_mask, params.fd); } +/* + * Ensure enclave is ready for SGX2 functions. Readiness is checked + * by ensuring the hardware supports SGX2 and the enclave is initialized + * and thus able to handle requests to modify pages within it. + */ +static int sgx_ioc_sgx2_ready(struct sgx_encl *encl) +{ + if (!(cpu_feature_enabled(X86_FEATURE_SGX2))) + return -ENODEV; + + if (!test_bit(SGX_ENCL_INITIALIZED, &encl->flags)) + return -EINVAL; + + return 0; +} + +/* + * Some SGX functions require that no cached linear-to-physical address + * mappings are present before they can succeed. Collaborate with + * hardware via ENCLS[ETRACK] to ensure that all cached + * linear-to-physical address mappings belonging to all threads of + * the enclave are cleared. See sgx_encl_cpumask() for details. + * + * Must be called with enclave's mutex held from the time the + * SGX function requiring that no cached linear-to-physical mappings + * are present is executed until this ETRACK flow is complete. + */ +static int sgx_enclave_etrack(struct sgx_encl *encl) +{ + void *epc_virt; + int ret; + + epc_virt = sgx_get_epc_virt_addr(encl->secs.epc_page); + ret = __etrack(epc_virt); + if (ret) { + /* + * ETRACK only fails when there is an OS issue. For + * example, two consecutive ETRACK was sent without + * completed IPI between. + */ + pr_err_once("ETRACK returned %d (0x%x)", ret, ret); + /* + * Send IPIs to kick CPUs out of the enclave and + * try ETRACK again. + */ + on_each_cpu_mask(sgx_encl_cpumask(encl), sgx_ipi_cb, NULL, 1); + ret = __etrack(epc_virt); + if (ret) { + pr_err_once("ETRACK repeat returned %d (0x%x)", + ret, ret); + return -EFAULT; + } + } + on_each_cpu_mask(sgx_encl_cpumask(encl), sgx_ipi_cb, NULL, 1); + + return 0; +} + +/** + * sgx_enclave_restrict_permissions() - Restrict EPCM permissions + * @encl: Enclave to which the pages belong. + * @modp: Checked parameters from user on which pages need modifying and + * their new permissions. + * + * Return: + * - 0: Success. + * - -errno: Otherwise. + */ +static long +sgx_enclave_restrict_permissions(struct sgx_encl *encl, + struct sgx_enclave_restrict_permissions *modp) +{ + struct sgx_encl_page *entry; + struct sgx_secinfo secinfo; + unsigned long addr; + unsigned long c; + void *epc_virt; + int ret; + + memset(&secinfo, 0, sizeof(secinfo)); + secinfo.flags = modp->permissions & SGX_SECINFO_PERMISSION_MASK; + + for (c = 0 ; c < modp->length; c += PAGE_SIZE) { + addr = encl->base + modp->offset + c; + + sgx_reclaim_direct(); + + mutex_lock(&encl->lock); + + entry = sgx_encl_load_page(encl, addr); + if (IS_ERR(entry)) { + ret = PTR_ERR(entry) == -EBUSY ? -EAGAIN : -EFAULT; + goto out_unlock; + } + + /* + * Changing EPCM permissions is only supported on regular + * SGX pages. Attempting this change on other pages will + * result in #PF. + */ + if (entry->type != SGX_PAGE_TYPE_REG) { + ret = -EINVAL; + goto out_unlock; + } + + /* + * Apart from ensuring that read-access remains, do not verify + * the permission bits requested. Kernel has no control over + * how EPCM permissions can be relaxed from within the enclave. + * ENCLS[EMODPR] can only remove existing EPCM permissions, + * attempting to set new permissions will be ignored by the + * hardware. + */ + + /* Change EPCM permissions. */ + epc_virt = sgx_get_epc_virt_addr(entry->epc_page); + ret = __emodpr(&secinfo, epc_virt); + if (encls_faulted(ret)) { + /* + * All possible faults should be avoidable: + * parameters have been checked, will only change + * permissions of a regular page, and no concurrent + * SGX1/SGX2 ENCLS instructions since these + * are protected with mutex. + */ + pr_err_once("EMODPR encountered exception %d\n", + ENCLS_TRAPNR(ret)); + ret = -EFAULT; + goto out_unlock; + } + if (encls_failed(ret)) { + modp->result = ret; + ret = -EFAULT; + goto out_unlock; + } + + ret = sgx_enclave_etrack(encl); + if (ret) { + ret = -EFAULT; + goto out_unlock; + } + + mutex_unlock(&encl->lock); + } + + ret = 0; + goto out; + +out_unlock: + mutex_unlock(&encl->lock); +out: + modp->count = c; + + return ret; +} + +/** + * sgx_ioc_enclave_restrict_permissions() - handler for + * %SGX_IOC_ENCLAVE_RESTRICT_PERMISSIONS + * @encl: an enclave pointer + * @arg: userspace pointer to a &struct sgx_enclave_restrict_permissions + * instance + * + * SGX2 distinguishes between relaxing and restricting the enclave page + * permissions maintained by the hardware (EPCM permissions) of pages + * belonging to an initialized enclave (after SGX_IOC_ENCLAVE_INIT). + * + * EPCM permissions cannot be restricted from within the enclave, the enclave + * requires the kernel to run the privileged level 0 instructions ENCLS[EMODPR] + * and ENCLS[ETRACK]. An attempt to relax EPCM permissions with this call + * will be ignored by the hardware. + * + * Return: + * - 0: Success + * - -errno: Otherwise + */ +static long sgx_ioc_enclave_restrict_permissions(struct sgx_encl *encl, + void __user *arg) +{ + struct sgx_enclave_restrict_permissions params; + long ret; + + ret = sgx_ioc_sgx2_ready(encl); + if (ret) + return ret; + + if (copy_from_user(¶ms, arg, sizeof(params))) + return -EFAULT; + + if (sgx_validate_offset_length(encl, params.offset, params.length)) + return -EINVAL; + + if (params.permissions & ~SGX_SECINFO_PERMISSION_MASK) + return -EINVAL; + + /* + * Fail early if invalid permissions requested to prevent ENCLS[EMODPR] + * from faulting later when the CPU does the same check. + */ + if ((params.permissions & SGX_SECINFO_W) && + !(params.permissions & SGX_SECINFO_R)) + return -EINVAL; + + if (params.result || params.count) + return -EINVAL; + + ret = sgx_enclave_restrict_permissions(encl, ¶ms); + + if (copy_to_user(arg, ¶ms, sizeof(params))) + return -EFAULT; + + return ret; +} + +/** + * sgx_enclave_modify_types() - Modify type of SGX enclave pages + * @encl: Enclave to which the pages belong. + * @modt: Checked parameters from user about which pages need modifying + * and their new page type. + * + * Return: + * - 0: Success + * - -errno: Otherwise + */ +static long sgx_enclave_modify_types(struct sgx_encl *encl, + struct sgx_enclave_modify_types *modt) +{ + unsigned long max_prot_restore; + enum sgx_page_type page_type; + struct sgx_encl_page *entry; + struct sgx_secinfo secinfo; + unsigned long prot; + unsigned long addr; + unsigned long c; + void *epc_virt; + int ret; + + page_type = modt->page_type & SGX_PAGE_TYPE_MASK; + + /* + * The only new page types allowed by hardware are PT_TCS and PT_TRIM. + */ + if (page_type != SGX_PAGE_TYPE_TCS && page_type != SGX_PAGE_TYPE_TRIM) + return -EINVAL; + + memset(&secinfo, 0, sizeof(secinfo)); + + secinfo.flags = page_type << 8; + + for (c = 0 ; c < modt->length; c += PAGE_SIZE) { + addr = encl->base + modt->offset + c; + + sgx_reclaim_direct(); + + mutex_lock(&encl->lock); + + entry = sgx_encl_load_page(encl, addr); + if (IS_ERR(entry)) { + ret = PTR_ERR(entry) == -EBUSY ? -EAGAIN : -EFAULT; + goto out_unlock; + } + + /* + * Borrow the logic from the Intel SDM. Regular pages + * (SGX_PAGE_TYPE_REG) can change type to SGX_PAGE_TYPE_TCS + * or SGX_PAGE_TYPE_TRIM but TCS pages can only be trimmed. + * CET pages not supported yet. + */ + if (!(entry->type == SGX_PAGE_TYPE_REG || + (entry->type == SGX_PAGE_TYPE_TCS && + page_type == SGX_PAGE_TYPE_TRIM))) { + ret = -EINVAL; + goto out_unlock; + } + + max_prot_restore = entry->vm_max_prot_bits; + + /* + * Once a regular page becomes a TCS page it cannot be + * changed back. So the maximum allowed protection reflects + * the TCS page that is always RW from kernel perspective but + * will be inaccessible from within enclave. Before doing + * so, do make sure that the new page type continues to + * respect the originally vetted page permissions. + */ + if (entry->type == SGX_PAGE_TYPE_REG && + page_type == SGX_PAGE_TYPE_TCS) { + if (~entry->vm_max_prot_bits & (VM_READ | VM_WRITE)) { + ret = -EPERM; + goto out_unlock; + } + prot = PROT_READ | PROT_WRITE; + entry->vm_max_prot_bits = calc_vm_prot_bits(prot, 0); + + /* + * Prevent page from being reclaimed while mutex + * is released. + */ + if (sgx_unmark_page_reclaimable(entry->epc_page)) { + ret = -EAGAIN; + goto out_entry_changed; + } + + /* + * Do not keep encl->lock because of dependency on + * mmap_lock acquired in sgx_zap_enclave_ptes(). + */ + mutex_unlock(&encl->lock); + + sgx_zap_enclave_ptes(encl, addr); + + mutex_lock(&encl->lock); + + sgx_mark_page_reclaimable(entry->epc_page); + } + + /* Change EPC type */ + epc_virt = sgx_get_epc_virt_addr(entry->epc_page); + ret = __emodt(&secinfo, epc_virt); + if (encls_faulted(ret)) { + /* + * All possible faults should be avoidable: + * parameters have been checked, will only change + * valid page types, and no concurrent + * SGX1/SGX2 ENCLS instructions since these are + * protected with mutex. + */ + pr_err_once("EMODT encountered exception %d\n", + ENCLS_TRAPNR(ret)); + ret = -EFAULT; + goto out_entry_changed; + } + if (encls_failed(ret)) { + modt->result = ret; + ret = -EFAULT; + goto out_entry_changed; + } + + ret = sgx_enclave_etrack(encl); + if (ret) { + ret = -EFAULT; + goto out_unlock; + } + + entry->type = page_type; + + mutex_unlock(&encl->lock); + } + + ret = 0; + goto out; + +out_entry_changed: + entry->vm_max_prot_bits = max_prot_restore; +out_unlock: + mutex_unlock(&encl->lock); +out: + modt->count = c; + + return ret; +} + +/** + * sgx_ioc_enclave_modify_types() - handler for %SGX_IOC_ENCLAVE_MODIFY_TYPES + * @encl: an enclave pointer + * @arg: userspace pointer to a &struct sgx_enclave_modify_types instance + * + * Ability to change the enclave page type supports the following use cases: + * + * * It is possible to add TCS pages to an enclave by changing the type of + * regular pages (%SGX_PAGE_TYPE_REG) to TCS (%SGX_PAGE_TYPE_TCS) pages. + * With this support the number of threads supported by an initialized + * enclave can be increased dynamically. + * + * * Regular or TCS pages can dynamically be removed from an initialized + * enclave by changing the page type to %SGX_PAGE_TYPE_TRIM. Changing the + * page type to %SGX_PAGE_TYPE_TRIM marks the page for removal with actual + * removal done by handler of %SGX_IOC_ENCLAVE_REMOVE_PAGES ioctl() called + * after ENCLU[EACCEPT] is run on %SGX_PAGE_TYPE_TRIM page from within the + * enclave. + * + * Return: + * - 0: Success + * - -errno: Otherwise + */ +static long sgx_ioc_enclave_modify_types(struct sgx_encl *encl, + void __user *arg) +{ + struct sgx_enclave_modify_types params; + long ret; + + ret = sgx_ioc_sgx2_ready(encl); + if (ret) + return ret; + + if (copy_from_user(¶ms, arg, sizeof(params))) + return -EFAULT; + + if (sgx_validate_offset_length(encl, params.offset, params.length)) + return -EINVAL; + + if (params.page_type & ~SGX_PAGE_TYPE_MASK) + return -EINVAL; + + if (params.result || params.count) + return -EINVAL; + + ret = sgx_enclave_modify_types(encl, ¶ms); + + if (copy_to_user(arg, ¶ms, sizeof(params))) + return -EFAULT; + + return ret; +} + +/** + * sgx_encl_remove_pages() - Remove trimmed pages from SGX enclave + * @encl: Enclave to which the pages belong + * @params: Checked parameters from user on which pages need to be removed + * + * Return: + * - 0: Success. + * - -errno: Otherwise. + */ +static long sgx_encl_remove_pages(struct sgx_encl *encl, + struct sgx_enclave_remove_pages *params) +{ + struct sgx_encl_page *entry; + struct sgx_secinfo secinfo; + unsigned long addr; + unsigned long c; + void *epc_virt; + int ret; + + memset(&secinfo, 0, sizeof(secinfo)); + secinfo.flags = SGX_SECINFO_R | SGX_SECINFO_W | SGX_SECINFO_X; + + for (c = 0 ; c < params->length; c += PAGE_SIZE) { + addr = encl->base + params->offset + c; + + sgx_reclaim_direct(); + + mutex_lock(&encl->lock); + + entry = sgx_encl_load_page(encl, addr); + if (IS_ERR(entry)) { + ret = PTR_ERR(entry) == -EBUSY ? -EAGAIN : -EFAULT; + goto out_unlock; + } + + if (entry->type != SGX_PAGE_TYPE_TRIM) { + ret = -EPERM; + goto out_unlock; + } + + /* + * ENCLS[EMODPR] is a no-op instruction used to inform if + * ENCLU[EACCEPT] was run from within the enclave. If + * ENCLS[EMODPR] is run with RWX on a trimmed page that is + * not yet accepted then it will return + * %SGX_PAGE_NOT_MODIFIABLE, after the trimmed page is + * accepted the instruction will encounter a page fault. + */ + epc_virt = sgx_get_epc_virt_addr(entry->epc_page); + ret = __emodpr(&secinfo, epc_virt); + if (!encls_faulted(ret) || ENCLS_TRAPNR(ret) != X86_TRAP_PF) { + ret = -EPERM; + goto out_unlock; + } + + if (sgx_unmark_page_reclaimable(entry->epc_page)) { + ret = -EBUSY; + goto out_unlock; + } + + /* + * Do not keep encl->lock because of dependency on + * mmap_lock acquired in sgx_zap_enclave_ptes(). + */ + mutex_unlock(&encl->lock); + + sgx_zap_enclave_ptes(encl, addr); + + mutex_lock(&encl->lock); + + sgx_encl_free_epc_page(entry->epc_page); + encl->secs_child_cnt--; + entry->epc_page = NULL; + xa_erase(&encl->page_array, PFN_DOWN(entry->desc)); + sgx_encl_shrink(encl, NULL); + kfree(entry); + + mutex_unlock(&encl->lock); + } + + ret = 0; + goto out; + +out_unlock: + mutex_unlock(&encl->lock); +out: + params->count = c; + + return ret; +} + +/** + * sgx_ioc_enclave_remove_pages() - handler for %SGX_IOC_ENCLAVE_REMOVE_PAGES + * @encl: an enclave pointer + * @arg: userspace pointer to &struct sgx_enclave_remove_pages instance + * + * Final step of the flow removing pages from an initialized enclave. The + * complete flow is: + * + * 1) User changes the type of the pages to be removed to %SGX_PAGE_TYPE_TRIM + * using the %SGX_IOC_ENCLAVE_MODIFY_TYPES ioctl(). + * 2) User approves the page removal by running ENCLU[EACCEPT] from within + * the enclave. + * 3) User initiates actual page removal using the + * %SGX_IOC_ENCLAVE_REMOVE_PAGES ioctl() that is handled here. + * + * First remove any page table entries pointing to the page and then proceed + * with the actual removal of the enclave page and data in support of it. + * + * VA pages are not affected by this removal. It is thus possible that the + * enclave may end up with more VA pages than needed to support all its + * pages. + * + * Return: + * - 0: Success + * - -errno: Otherwise + */ +static long sgx_ioc_enclave_remove_pages(struct sgx_encl *encl, + void __user *arg) +{ + struct sgx_enclave_remove_pages params; + long ret; + + ret = sgx_ioc_sgx2_ready(encl); + if (ret) + return ret; + + if (copy_from_user(¶ms, arg, sizeof(params))) + return -EFAULT; + + if (sgx_validate_offset_length(encl, params.offset, params.length)) + return -EINVAL; + + if (params.count) + return -EINVAL; + + ret = sgx_encl_remove_pages(encl, ¶ms); + + if (copy_to_user(arg, ¶ms, sizeof(params))) + return -EFAULT; + + return ret; +} + long sgx_ioctl(struct file *filep, unsigned int cmd, unsigned long arg) { struct sgx_encl *encl = filep->private_data; @@ -695,6 +1240,16 @@ long sgx_ioctl(struct file *filep, unsigned int cmd, unsigned long arg) case SGX_IOC_ENCLAVE_PROVISION: ret = sgx_ioc_enclave_provision(encl, (void __user *)arg); break; + case SGX_IOC_ENCLAVE_RESTRICT_PERMISSIONS: + ret = sgx_ioc_enclave_restrict_permissions(encl, + (void __user *)arg); + break; + case SGX_IOC_ENCLAVE_MODIFY_TYPES: + ret = sgx_ioc_enclave_modify_types(encl, (void __user *)arg); + break; + case SGX_IOC_ENCLAVE_REMOVE_PAGES: + ret = sgx_ioc_enclave_remove_pages(encl, (void __user *)arg); + break; default: ret = -ENOIOCTLCMD; break; diff --git a/arch/x86/kernel/cpu/sgx/main.c b/arch/x86/kernel/cpu/sgx/main.c index a78652d43e61..515e2a5f25bb 100644 --- a/arch/x86/kernel/cpu/sgx/main.c +++ b/arch/x86/kernel/cpu/sgx/main.c @@ -137,36 +137,9 @@ static void sgx_reclaimer_block(struct sgx_epc_page *epc_page) struct sgx_encl_page *page = epc_page->owner; unsigned long addr = page->desc & PAGE_MASK; struct sgx_encl *encl = page->encl; - unsigned long mm_list_version; - struct sgx_encl_mm *encl_mm; - struct vm_area_struct *vma; - int idx, ret; - - do { - mm_list_version = encl->mm_list_version; - - /* Pairs with smp_rmb() in sgx_encl_mm_add(). */ - smp_rmb(); - - idx = srcu_read_lock(&encl->srcu); - - list_for_each_entry_rcu(encl_mm, &encl->mm_list, list) { - if (!mmget_not_zero(encl_mm->mm)) - continue; - - mmap_read_lock(encl_mm->mm); - - ret = sgx_encl_find(encl_mm->mm, addr, &vma); - if (!ret && encl == vma->vm_private_data) - zap_vma_ptes(vma, addr, PAGE_SIZE); - - mmap_read_unlock(encl_mm->mm); - - mmput_async(encl_mm->mm); - } + int ret; - srcu_read_unlock(&encl->srcu, idx); - } while (unlikely(encl->mm_list_version != mm_list_version)); + sgx_zap_enclave_ptes(encl, addr); mutex_lock(&encl->lock); @@ -201,39 +174,10 @@ static int __sgx_encl_ewb(struct sgx_epc_page *epc_page, void *va_slot, return ret; } -static void sgx_ipi_cb(void *info) +void sgx_ipi_cb(void *info) { } -static const cpumask_t *sgx_encl_ewb_cpumask(struct sgx_encl *encl) -{ - cpumask_t *cpumask = &encl->cpumask; - struct sgx_encl_mm *encl_mm; - int idx; - - /* - * Can race with sgx_encl_mm_add(), but ETRACK has already been - * executed, which means that the CPUs running in the new mm will enter - * into the enclave with a fresh epoch. - */ - cpumask_clear(cpumask); - - idx = srcu_read_lock(&encl->srcu); - - list_for_each_entry_rcu(encl_mm, &encl->mm_list, list) { - if (!mmget_not_zero(encl_mm->mm)) - continue; - - cpumask_or(cpumask, cpumask, mm_cpumask(encl_mm->mm)); - - mmput_async(encl_mm->mm); - } - - srcu_read_unlock(&encl->srcu, idx); - - return cpumask; -} - /* * Swap page to the regular memory transformed to the blocked state by using * EBLOCK, which means that it can no longer be referenced (no new TLB entries). @@ -280,7 +224,7 @@ static void sgx_encl_ewb(struct sgx_epc_page *epc_page, * miss cpus that entered the enclave between * generating the mask and incrementing epoch. */ - on_each_cpu_mask(sgx_encl_ewb_cpumask(encl), + on_each_cpu_mask(sgx_encl_cpumask(encl), sgx_ipi_cb, NULL, 1); ret = __sgx_encl_ewb(epc_page, va_slot, backing); } @@ -431,6 +375,17 @@ static bool sgx_should_reclaim(unsigned long watermark) !list_empty(&sgx_active_page_list); } +/* + * sgx_reclaim_direct() should be called (without enclave's mutex held) + * in locations where SGX memory resources might be low and might be + * needed in order to make forward progress. + */ +void sgx_reclaim_direct(void) +{ + if (sgx_should_reclaim(SGX_NR_LOW_PAGES)) + sgx_reclaim_pages(); +} + static int ksgxd(void *p) { set_freezable(); diff --git a/arch/x86/kernel/cpu/sgx/sgx.h b/arch/x86/kernel/cpu/sgx/sgx.h index 0f17def9fe6f..0f2020653fba 100644 --- a/arch/x86/kernel/cpu/sgx/sgx.h +++ b/arch/x86/kernel/cpu/sgx/sgx.h @@ -86,10 +86,13 @@ static inline void *sgx_get_epc_virt_addr(struct sgx_epc_page *page) struct sgx_epc_page *__sgx_alloc_epc_page(void); void sgx_free_epc_page(struct sgx_epc_page *page); +void sgx_reclaim_direct(void); void sgx_mark_page_reclaimable(struct sgx_epc_page *page); int sgx_unmark_page_reclaimable(struct sgx_epc_page *page); struct sgx_epc_page *sgx_alloc_epc_page(void *owner, bool reclaim); +void sgx_ipi_cb(void *info); + #ifdef CONFIG_X86_SGX_KVM int __init sgx_vepc_init(void); #else diff --git a/arch/x86/kernel/cpu/vmware.c b/arch/x86/kernel/cpu/vmware.c index c04b933f48d3..02039ec3597d 100644 --- a/arch/x86/kernel/cpu/vmware.c +++ b/arch/x86/kernel/cpu/vmware.c @@ -476,8 +476,8 @@ static bool __init vmware_legacy_x2apic_available(void) { uint32_t eax, ebx, ecx, edx; VMWARE_CMD(GETVCPU_INFO, eax, ebx, ecx, edx); - return (eax & (1 << VMWARE_CMD_VCPU_RESERVED)) == 0 && - (eax & (1 << VMWARE_CMD_LEGACY_X2APIC)) != 0; + return !(eax & BIT(VMWARE_CMD_VCPU_RESERVED)) && + (eax & BIT(VMWARE_CMD_LEGACY_X2APIC)); } #ifdef CONFIG_AMD_MEM_ENCRYPT diff --git a/arch/x86/kernel/e820.c b/arch/x86/kernel/e820.c index f267205f2d5a..9dac24680ff8 100644 --- a/arch/x86/kernel/e820.c +++ b/arch/x86/kernel/e820.c @@ -1017,10 +1017,10 @@ void __init e820__reserve_setup_data(void) e820__range_update(pa_data, sizeof(*data)+data->len, E820_TYPE_RAM, E820_TYPE_RESERVED_KERN); /* - * SETUP_EFI is supplied by kexec and does not need to be - * reserved. + * SETUP_EFI and SETUP_IMA are supplied by kexec and do not need + * to be reserved. */ - if (data->type != SETUP_EFI) + if (data->type != SETUP_EFI && data->type != SETUP_IMA) e820__range_update_kexec(pa_data, sizeof(*data) + data->len, E820_TYPE_RAM, E820_TYPE_RESERVED_KERN); diff --git a/arch/x86/kernel/espfix_64.c b/arch/x86/kernel/espfix_64.c index 4fe7af58cfe1..9417d5aa7305 100644 --- a/arch/x86/kernel/espfix_64.c +++ b/arch/x86/kernel/espfix_64.c @@ -100,7 +100,7 @@ static void init_espfix_random(void) * This is run before the entropy pools are initialized, * but this is hopefully better than nothing. */ - if (!arch_get_random_long(&rand)) { + if (!arch_get_random_longs(&rand, 1)) { /* The constant is an arbitrary large prime */ rand = rdtsc(); rand *= 0xc345c6b72fd16123UL; diff --git a/arch/x86/kernel/fpu/core.c b/arch/x86/kernel/fpu/core.c index 0531d6a06df5..3b28c5b25e12 100644 --- a/arch/x86/kernel/fpu/core.c +++ b/arch/x86/kernel/fpu/core.c @@ -851,3 +851,17 @@ int fpu__exception_code(struct fpu *fpu, int trap_nr) */ return 0; } + +/* + * Initialize register state that may prevent from entering low-power idle. + * This function will be invoked from the cpuidle driver only when needed. + */ +void fpu_idle_fpregs(void) +{ + /* Note: AMX_TILE being enabled implies XGETBV1 support */ + if (cpu_feature_enabled(X86_FEATURE_AMX_TILE) && + (xfeatures_in_use() & XFEATURE_MASK_XTILE)) { + tile_release(); + fpregs_deactivate(¤t->thread.fpu); + } +} diff --git a/arch/x86/kernel/ftrace.c b/arch/x86/kernel/ftrace.c index 5b4efc927d80..bd165004776d 100644 --- a/arch/x86/kernel/ftrace.c +++ b/arch/x86/kernel/ftrace.c @@ -91,6 +91,7 @@ static int ftrace_verify_code(unsigned long ip, const char *old_code) /* Make sure it is what we expect it to be */ if (memcmp(cur_code, old_code, MCOUNT_INSN_SIZE) != 0) { + ftrace_expected = old_code; WARN_ON(1); return -EINVAL; } @@ -301,7 +302,7 @@ union ftrace_op_code_union { } __attribute__((packed)); }; -#define RET_SIZE 1 + IS_ENABLED(CONFIG_SLS) +#define RET_SIZE (IS_ENABLED(CONFIG_RETPOLINE) ? 5 : 1 + IS_ENABLED(CONFIG_SLS)) static unsigned long create_trampoline(struct ftrace_ops *ops, unsigned int *tramp_size) @@ -357,7 +358,10 @@ create_trampoline(struct ftrace_ops *ops, unsigned int *tramp_size) goto fail; ip = trampoline + size; - memcpy(ip, retq, RET_SIZE); + if (cpu_feature_enabled(X86_FEATURE_RETHUNK)) + __text_gen_insn(ip, JMP32_INSN_OPCODE, ip, &__x86_return_thunk, JMP32_INSN_SIZE); + else + memcpy(ip, retq, sizeof(retq)); /* No need to test direct calls on created trampolines */ if (ops->flags & FTRACE_OPS_FL_SAVE_REGS) { diff --git a/arch/x86/kernel/head64.c b/arch/x86/kernel/head64.c index bd4a34100ed0..6a3cfaf6b72a 100644 --- a/arch/x86/kernel/head64.c +++ b/arch/x86/kernel/head64.c @@ -426,10 +426,12 @@ void __init do_early_exception(struct pt_regs *regs, int trapnr) /* Don't add a printk in there. printk relies on the PDA which is not initialized yet. */ -static void __init clear_bss(void) +void __init clear_bss(void) { memset(__bss_start, 0, (unsigned long) __bss_stop - (unsigned long) __bss_start); + memset(__brk_base, 0, + (unsigned long) __brk_limit - (unsigned long) __brk_base); } static unsigned long get_cmd_line_ptr(void) diff --git a/arch/x86/kernel/head_32.S b/arch/x86/kernel/head_32.S index eb8656bac99b..9b7acc9c7874 100644 --- a/arch/x86/kernel/head_32.S +++ b/arch/x86/kernel/head_32.S @@ -23,6 +23,7 @@ #include <asm/cpufeatures.h> #include <asm/percpu.h> #include <asm/nops.h> +#include <asm/nospec-branch.h> #include <asm/bootparam.h> #include <asm/export.h> #include <asm/pgtable_32.h> diff --git a/arch/x86/kernel/head_64.S b/arch/x86/kernel/head_64.S index 92c4afa2b729..d860d437631b 100644 --- a/arch/x86/kernel/head_64.S +++ b/arch/x86/kernel/head_64.S @@ -389,6 +389,8 @@ SYM_CODE_START_NOALIGN(vc_boot_ghcb) UNWIND_HINT_IRET_REGS offset=8 ENDBR + ANNOTATE_UNRET_END + /* Build pt_regs */ PUSH_AND_CLEAR_REGS @@ -448,6 +450,7 @@ SYM_CODE_END(early_idt_handler_array) SYM_CODE_START_LOCAL(early_idt_handler_common) UNWIND_HINT_IRET_REGS offset=16 + ANNOTATE_UNRET_END /* * The stack is the hardware frame, an error code or zero, and the * vector number. @@ -497,6 +500,8 @@ SYM_CODE_START_NOALIGN(vc_no_ghcb) UNWIND_HINT_IRET_REGS offset=8 ENDBR + ANNOTATE_UNRET_END + /* Build pt_regs */ PUSH_AND_CLEAR_REGS diff --git a/arch/x86/kernel/jump_label.c b/arch/x86/kernel/jump_label.c index 68f091ba8443..f5b8ef02d172 100644 --- a/arch/x86/kernel/jump_label.c +++ b/arch/x86/kernel/jump_label.c @@ -146,16 +146,3 @@ void arch_jump_label_transform_apply(void) text_poke_finish(); mutex_unlock(&text_mutex); } - -static enum { - JL_STATE_START, - JL_STATE_NO_UPDATE, - JL_STATE_UPDATE, -} jlstate __initdata_or_module = JL_STATE_START; - -__init_or_module void arch_jump_label_transform_static(struct jump_entry *entry, - enum jump_label_type type) -{ - if (jlstate == JL_STATE_UPDATE) - jump_label_transform(entry, type, 1); -} diff --git a/arch/x86/kernel/kexec-bzimage64.c b/arch/x86/kernel/kexec-bzimage64.c index 170d0fd68b1f..6b58610a1552 100644 --- a/arch/x86/kernel/kexec-bzimage64.c +++ b/arch/x86/kernel/kexec-bzimage64.c @@ -17,7 +17,7 @@ #include <linux/kernel.h> #include <linux/mm.h> #include <linux/efi.h> -#include <linux/verification.h> +#include <linux/random.h> #include <asm/bootparam.h> #include <asm/setup.h> @@ -110,6 +110,26 @@ static int setup_e820_entries(struct boot_params *params) return 0; } +enum { RNG_SEED_LENGTH = 32 }; + +static void +setup_rng_seed(struct boot_params *params, unsigned long params_load_addr, + unsigned int rng_seed_setup_data_offset) +{ + struct setup_data *sd = (void *)params + rng_seed_setup_data_offset; + unsigned long setup_data_phys; + + if (!rng_is_initialized()) + return; + + sd->type = SETUP_RNG_SEED; + sd->len = RNG_SEED_LENGTH; + get_random_bytes(sd->data, RNG_SEED_LENGTH); + setup_data_phys = params_load_addr + rng_seed_setup_data_offset; + sd->next = params->hdr.setup_data; + params->hdr.setup_data = setup_data_phys; +} + #ifdef CONFIG_EFI static int setup_efi_info_memmap(struct boot_params *params, unsigned long params_load_addr, @@ -186,11 +206,38 @@ setup_efi_state(struct boot_params *params, unsigned long params_load_addr, } #endif /* CONFIG_EFI */ +static void +setup_ima_state(const struct kimage *image, struct boot_params *params, + unsigned long params_load_addr, + unsigned int ima_setup_data_offset) +{ +#ifdef CONFIG_IMA_KEXEC + struct setup_data *sd = (void *)params + ima_setup_data_offset; + unsigned long setup_data_phys; + struct ima_setup_data *ima; + + if (!image->ima_buffer_size) + return; + + sd->type = SETUP_IMA; + sd->len = sizeof(*ima); + + ima = (void *)sd + sizeof(struct setup_data); + ima->addr = image->ima_buffer_addr; + ima->size = image->ima_buffer_size; + + /* Add setup data */ + setup_data_phys = params_load_addr + ima_setup_data_offset; + sd->next = params->hdr.setup_data; + params->hdr.setup_data = setup_data_phys; +#endif /* CONFIG_IMA_KEXEC */ +} + static int setup_boot_parameters(struct kimage *image, struct boot_params *params, unsigned long params_load_addr, unsigned int efi_map_offset, unsigned int efi_map_sz, - unsigned int efi_setup_data_offset) + unsigned int setup_data_offset) { unsigned int nr_e820_entries; unsigned long long mem_k, start, end; @@ -245,8 +292,22 @@ setup_boot_parameters(struct kimage *image, struct boot_params *params, #ifdef CONFIG_EFI /* Setup EFI state */ setup_efi_state(params, params_load_addr, efi_map_offset, efi_map_sz, - efi_setup_data_offset); + setup_data_offset); + setup_data_offset += sizeof(struct setup_data) + + sizeof(struct efi_setup_data); #endif + + if (IS_ENABLED(CONFIG_IMA_KEXEC)) { + /* Setup IMA log buffer state */ + setup_ima_state(image, params, params_load_addr, + setup_data_offset); + setup_data_offset += sizeof(struct setup_data) + + sizeof(struct ima_setup_data); + } + + /* Setup RNG seed */ + setup_rng_seed(params, params_load_addr, setup_data_offset); + /* Setup EDD info */ memcpy(params->eddbuf, boot_params.eddbuf, EDDMAXNR * sizeof(struct edd_info)); @@ -401,7 +462,13 @@ static void *bzImage64_load(struct kimage *image, char *kernel, params_cmdline_sz = ALIGN(params_cmdline_sz, 16); kbuf.bufsz = params_cmdline_sz + ALIGN(efi_map_sz, 16) + sizeof(struct setup_data) + - sizeof(struct efi_setup_data); + sizeof(struct efi_setup_data) + + sizeof(struct setup_data) + + RNG_SEED_LENGTH; + + if (IS_ENABLED(CONFIG_IMA_KEXEC)) + kbuf.bufsz += sizeof(struct setup_data) + + sizeof(struct ima_setup_data); params = kzalloc(kbuf.bufsz, GFP_KERNEL); if (!params) @@ -528,28 +595,11 @@ static int bzImage64_cleanup(void *loader_data) return 0; } -#ifdef CONFIG_KEXEC_BZIMAGE_VERIFY_SIG -static int bzImage64_verify_sig(const char *kernel, unsigned long kernel_len) -{ - int ret; - - ret = verify_pefile_signature(kernel, kernel_len, - VERIFY_USE_SECONDARY_KEYRING, - VERIFYING_KEXEC_PE_SIGNATURE); - if (ret == -ENOKEY && IS_ENABLED(CONFIG_INTEGRITY_PLATFORM_KEYRING)) { - ret = verify_pefile_signature(kernel, kernel_len, - VERIFY_USE_PLATFORM_KEYRING, - VERIFYING_KEXEC_PE_SIGNATURE); - } - return ret; -} -#endif - const struct kexec_file_ops kexec_bzImage64_ops = { .probe = bzImage64_probe, .load = bzImage64_load, .cleanup = bzImage64_cleanup, #ifdef CONFIG_KEXEC_BZIMAGE_VERIFY_SIG - .verify_sig = bzImage64_verify_sig, + .verify_sig = kexec_kernel_verify_pe_sig, #endif }; diff --git a/arch/x86/kernel/kprobes/core.c b/arch/x86/kernel/kprobes/core.c index 7c4ab8870da4..74167dc5f55e 100644 --- a/arch/x86/kernel/kprobes/core.c +++ b/arch/x86/kernel/kprobes/core.c @@ -814,16 +814,20 @@ set_current_kprobe(struct kprobe *p, struct pt_regs *regs, static void kprobe_post_process(struct kprobe *cur, struct pt_regs *regs, struct kprobe_ctlblk *kcb) { - if ((kcb->kprobe_status != KPROBE_REENTER) && cur->post_handler) { - kcb->kprobe_status = KPROBE_HIT_SSDONE; - cur->post_handler(cur, regs, 0); - } - /* Restore back the original saved kprobes variables and continue. */ - if (kcb->kprobe_status == KPROBE_REENTER) + if (kcb->kprobe_status == KPROBE_REENTER) { + /* This will restore both kcb and current_kprobe */ restore_previous_kprobe(kcb); - else + } else { + /* + * Always update the kcb status because + * reset_curent_kprobe() doesn't update kcb. + */ + kcb->kprobe_status = KPROBE_HIT_SSDONE; + if (cur->post_handler) + cur->post_handler(cur, regs, 0); reset_current_kprobe(); + } } NOKPROBE_SYMBOL(kprobe_post_process); diff --git a/arch/x86/kernel/kvm.c b/arch/x86/kernel/kvm.c index 1a3658f7e6d9..d4e48b4a438b 100644 --- a/arch/x86/kernel/kvm.c +++ b/arch/x86/kernel/kvm.c @@ -236,8 +236,7 @@ again: raw_spin_unlock(&b->lock); /* A dummy token might be allocated and ultimately not used. */ - if (dummy) - kfree(dummy); + kfree(dummy); } EXPORT_SYMBOL_GPL(kvm_async_pf_task_wake); diff --git a/arch/x86/kernel/module.c b/arch/x86/kernel/module.c index b98ffcf4d250..b1abf663417c 100644 --- a/arch/x86/kernel/module.c +++ b/arch/x86/kernel/module.c @@ -253,7 +253,7 @@ int module_finalize(const Elf_Ehdr *hdr, { const Elf_Shdr *s, *text = NULL, *alt = NULL, *locks = NULL, *para = NULL, *orc = NULL, *orc_ip = NULL, - *retpolines = NULL, *ibt_endbr = NULL; + *retpolines = NULL, *returns = NULL, *ibt_endbr = NULL; char *secstrings = (void *)hdr + sechdrs[hdr->e_shstrndx].sh_offset; for (s = sechdrs; s < sechdrs + hdr->e_shnum; s++) { @@ -271,6 +271,8 @@ int module_finalize(const Elf_Ehdr *hdr, orc_ip = s; if (!strcmp(".retpoline_sites", secstrings + s->sh_name)) retpolines = s; + if (!strcmp(".return_sites", secstrings + s->sh_name)) + returns = s; if (!strcmp(".ibt_endbr_seal", secstrings + s->sh_name)) ibt_endbr = s; } @@ -287,6 +289,10 @@ int module_finalize(const Elf_Ehdr *hdr, void *rseg = (void *)retpolines->sh_addr; apply_retpolines(rseg, rseg + retpolines->sh_size); } + if (returns) { + void *rseg = (void *)returns->sh_addr; + apply_returns(rseg, rseg + returns->sh_size); + } if (alt) { /* patch .altinstructions */ void *aseg = (void *)alt->sh_addr; @@ -304,9 +310,6 @@ int module_finalize(const Elf_Ehdr *hdr, tseg, tseg + text->sh_size); } - /* make jump label nops */ - jump_label_apply_nops(me); - if (orc && orc_ip) unwind_module_init(me, (void *)orc_ip->sh_addr, orc_ip->sh_size, (void *)orc->sh_addr, orc->sh_size); diff --git a/arch/x86/kernel/pmem.c b/arch/x86/kernel/pmem.c index 6b07faaa1579..23154d24b117 100644 --- a/arch/x86/kernel/pmem.c +++ b/arch/x86/kernel/pmem.c @@ -27,6 +27,11 @@ static __init int register_e820_pmem(void) * simply here to trigger the module to load on demand. */ pdev = platform_device_alloc("e820_pmem", -1); - return platform_device_add(pdev); + + rc = platform_device_add(pdev); + if (rc) + platform_device_put(pdev); + + return rc; } device_initcall(register_e820_pmem); diff --git a/arch/x86/kernel/process.c b/arch/x86/kernel/process.c index 9b2772b7e1f3..58a6ea472db9 100644 --- a/arch/x86/kernel/process.c +++ b/arch/x86/kernel/process.c @@ -600,7 +600,7 @@ static __always_inline void __speculation_ctrl_update(unsigned long tifp, } if (updmsr) - wrmsrl(MSR_IA32_SPEC_CTRL, msr); + write_spec_ctrl_current(msr, false); } static unsigned long speculation_ctrl_update_tif(struct task_struct *tsk) @@ -810,24 +810,43 @@ static void amd_e400_idle(void) } /* - * Intel Core2 and older machines prefer MWAIT over HALT for C1. - * We can't rely on cpuidle installing MWAIT, because it will not load - * on systems that support only C1 -- so the boot default must be MWAIT. + * Prefer MWAIT over HALT if MWAIT is supported, MWAIT_CPUID leaf + * exists and whenever MONITOR/MWAIT extensions are present there is at + * least one C1 substate. * - * Some AMD machines are the opposite, they depend on using HALT. - * - * So for default C1, which is used during boot until cpuidle loads, - * use MWAIT-C1 on Intel HW that has it, else use HALT. + * Do not prefer MWAIT if MONITOR instruction has a bug or idle=nomwait + * is passed to kernel commandline parameter. */ static int prefer_mwait_c1_over_halt(const struct cpuinfo_x86 *c) { - if (c->x86_vendor != X86_VENDOR_INTEL) + u32 eax, ebx, ecx, edx; + + /* User has disallowed the use of MWAIT. Fallback to HALT */ + if (boot_option_idle_override == IDLE_NOMWAIT) return 0; - if (!cpu_has(c, X86_FEATURE_MWAIT) || boot_cpu_has_bug(X86_BUG_MONITOR)) + /* MWAIT is not supported on this platform. Fallback to HALT */ + if (!cpu_has(c, X86_FEATURE_MWAIT)) return 0; - return 1; + /* Monitor has a bug. Fallback to HALT */ + if (boot_cpu_has_bug(X86_BUG_MONITOR)) + return 0; + + cpuid(CPUID_MWAIT_LEAF, &eax, &ebx, &ecx, &edx); + + /* + * If MWAIT extensions are not available, it is safe to use MWAIT + * with EAX=0, ECX=0. + */ + if (!(ecx & CPUID5_ECX_EXTENSIONS_SUPPORTED)) + return 1; + + /* + * If MWAIT extensions are available, there should be at least one + * MWAIT C1 substate present. + */ + return (edx & MWAIT_C1_SUBSTATE_MASK); } /* @@ -932,9 +951,8 @@ static int __init idle_setup(char *str) } else if (!strcmp(str, "nomwait")) { /* * If the boot option of "idle=nomwait" is added, - * it means that mwait will be disabled for CPU C2/C3 - * states. In such case it won't touch the variable - * of boot_option_idle_override. + * it means that mwait will be disabled for CPU C1/C2/C3 + * states. */ boot_option_idle_override = IDLE_NOMWAIT; } else diff --git a/arch/x86/kernel/relocate_kernel_32.S b/arch/x86/kernel/relocate_kernel_32.S index fcc8a7699103..c7c4b1917336 100644 --- a/arch/x86/kernel/relocate_kernel_32.S +++ b/arch/x86/kernel/relocate_kernel_32.S @@ -7,10 +7,12 @@ #include <linux/linkage.h> #include <asm/page_types.h> #include <asm/kexec.h> +#include <asm/nospec-branch.h> #include <asm/processor-flags.h> /* - * Must be relocatable PIC code callable as a C function + * Must be relocatable PIC code callable as a C function, in particular + * there must be a plain RET and not jump to return thunk. */ #define PTR(x) (x << 2) @@ -91,7 +93,9 @@ SYM_CODE_START_NOALIGN(relocate_kernel) movl %edi, %eax addl $(identity_mapped - relocate_kernel), %eax pushl %eax - RET + ANNOTATE_UNRET_SAFE + ret + int3 SYM_CODE_END(relocate_kernel) SYM_CODE_START_LOCAL_NOALIGN(identity_mapped) @@ -159,12 +163,15 @@ SYM_CODE_START_LOCAL_NOALIGN(identity_mapped) xorl %edx, %edx xorl %esi, %esi xorl %ebp, %ebp - RET + ANNOTATE_UNRET_SAFE + ret + int3 1: popl %edx movl CP_PA_SWAP_PAGE(%edi), %esp addl $PAGE_SIZE, %esp 2: + ANNOTATE_RETPOLINE_SAFE call *%edx /* get the re-entry point of the peer system */ @@ -190,7 +197,9 @@ SYM_CODE_START_LOCAL_NOALIGN(identity_mapped) movl %edi, %eax addl $(virtual_mapped - relocate_kernel), %eax pushl %eax - RET + ANNOTATE_UNRET_SAFE + ret + int3 SYM_CODE_END(identity_mapped) SYM_CODE_START_LOCAL_NOALIGN(virtual_mapped) @@ -208,7 +217,9 @@ SYM_CODE_START_LOCAL_NOALIGN(virtual_mapped) popl %edi popl %esi popl %ebx - RET + ANNOTATE_UNRET_SAFE + ret + int3 SYM_CODE_END(virtual_mapped) /* Do the copies */ @@ -271,7 +282,9 @@ SYM_CODE_START_LOCAL_NOALIGN(swap_pages) popl %edi popl %ebx popl %ebp - RET + ANNOTATE_UNRET_SAFE + ret + int3 SYM_CODE_END(swap_pages) .globl kexec_control_code_size diff --git a/arch/x86/kernel/relocate_kernel_64.S b/arch/x86/kernel/relocate_kernel_64.S index c1d8626c53b6..4809c0dc4eb0 100644 --- a/arch/x86/kernel/relocate_kernel_64.S +++ b/arch/x86/kernel/relocate_kernel_64.S @@ -13,7 +13,8 @@ #include <asm/unwind_hints.h> /* - * Must be relocatable PIC code callable as a C function + * Must be relocatable PIC code callable as a C function, in particular + * there must be a plain RET and not jump to return thunk. */ #define PTR(x) (x << 3) @@ -105,7 +106,9 @@ SYM_CODE_START_NOALIGN(relocate_kernel) /* jump to identity mapped page */ addq $(identity_mapped - relocate_kernel), %r8 pushq %r8 - RET + ANNOTATE_UNRET_SAFE + ret + int3 SYM_CODE_END(relocate_kernel) SYM_CODE_START_LOCAL_NOALIGN(identity_mapped) @@ -200,7 +203,9 @@ SYM_CODE_START_LOCAL_NOALIGN(identity_mapped) xorl %r14d, %r14d xorl %r15d, %r15d - RET + ANNOTATE_UNRET_SAFE + ret + int3 1: popq %rdx @@ -219,7 +224,9 @@ SYM_CODE_START_LOCAL_NOALIGN(identity_mapped) call swap_pages movq $virtual_mapped, %rax pushq %rax - RET + ANNOTATE_UNRET_SAFE + ret + int3 SYM_CODE_END(identity_mapped) SYM_CODE_START_LOCAL_NOALIGN(virtual_mapped) @@ -241,7 +248,9 @@ SYM_CODE_START_LOCAL_NOALIGN(virtual_mapped) popq %r12 popq %rbp popq %rbx - RET + ANNOTATE_UNRET_SAFE + ret + int3 SYM_CODE_END(virtual_mapped) /* Do the copies */ @@ -298,7 +307,9 @@ SYM_CODE_START_LOCAL_NOALIGN(swap_pages) lea PAGE_SIZE(%rax), %rsi jmp 0b 3: - RET + ANNOTATE_UNRET_SAFE + ret + int3 SYM_CODE_END(swap_pages) .globl kexec_control_code_size diff --git a/arch/x86/kernel/setup.c b/arch/x86/kernel/setup.c index bd6c6fd373ae..216fee7144ee 100644 --- a/arch/x86/kernel/setup.c +++ b/arch/x86/kernel/setup.c @@ -11,6 +11,7 @@ #include <linux/dma-map-ops.h> #include <linux/dmi.h> #include <linux/efi.h> +#include <linux/ima.h> #include <linux/init_ohci1394_dma.h> #include <linux/initrd.h> #include <linux/iscsi_ibft.h> @@ -23,6 +24,7 @@ #include <linux/usb/xhci-dbgp.h> #include <linux/static_call.h> #include <linux/swiotlb.h> +#include <linux/random.h> #include <uapi/linux/mount.h> @@ -140,6 +142,11 @@ __visible unsigned long mmu_cr4_features __ro_after_init; __visible unsigned long mmu_cr4_features __ro_after_init = X86_CR4_PAE; #endif +#ifdef CONFIG_IMA +static phys_addr_t ima_kexec_buffer_phys; +static size_t ima_kexec_buffer_size; +#endif + /* Boot loader ID and version as integers, for the benefit of proc_dointvec */ int bootloader_type, bootloader_version; @@ -330,6 +337,60 @@ static void __init reserve_initrd(void) } #endif /* CONFIG_BLK_DEV_INITRD */ +static void __init add_early_ima_buffer(u64 phys_addr) +{ +#ifdef CONFIG_IMA + struct ima_setup_data *data; + + data = early_memremap(phys_addr + sizeof(struct setup_data), sizeof(*data)); + if (!data) { + pr_warn("setup: failed to memremap ima_setup_data entry\n"); + return; + } + + if (data->size) { + memblock_reserve(data->addr, data->size); + ima_kexec_buffer_phys = data->addr; + ima_kexec_buffer_size = data->size; + } + + early_memunmap(data, sizeof(*data)); +#else + pr_warn("Passed IMA kexec data, but CONFIG_IMA not set. Ignoring.\n"); +#endif +} + +#if defined(CONFIG_HAVE_IMA_KEXEC) && !defined(CONFIG_OF_FLATTREE) +int __init ima_free_kexec_buffer(void) +{ + int rc; + + if (!ima_kexec_buffer_size) + return -ENOENT; + + rc = memblock_phys_free(ima_kexec_buffer_phys, + ima_kexec_buffer_size); + if (rc) + return rc; + + ima_kexec_buffer_phys = 0; + ima_kexec_buffer_size = 0; + + return 0; +} + +int __init ima_get_kexec_buffer(void **addr, size_t *size) +{ + if (!ima_kexec_buffer_size) + return -ENOENT; + + *addr = __va(ima_kexec_buffer_phys); + *size = ima_kexec_buffer_size; + + return 0; +} +#endif + static void __init parse_setup_data(void) { struct setup_data *data; @@ -355,6 +416,18 @@ static void __init parse_setup_data(void) case SETUP_EFI: parse_efi_setup(pa_data, data_len); break; + case SETUP_IMA: + add_early_ima_buffer(pa_data); + break; + case SETUP_RNG_SEED: + data = early_memremap(pa_data, data_len); + add_bootloader_randomness(data->data, data->len); + /* Zero seed for forward secrecy. */ + memzero_explicit(data->data, data->len); + /* Zero length in case we find ourselves back here by accident. */ + memzero_explicit(&data->len, sizeof(data->len)); + early_memunmap(data, data_len); + break; default: break; } diff --git a/arch/x86/kernel/sev-shared.c b/arch/x86/kernel/sev-shared.c index b478edf43bec..3a5b0c9c4fcc 100644 --- a/arch/x86/kernel/sev-shared.c +++ b/arch/x86/kernel/sev-shared.c @@ -219,9 +219,10 @@ static enum es_result verify_exception_info(struct ghcb *ghcb, struct es_em_ctxt return ES_VMM_ERROR; } -enum es_result sev_es_ghcb_hv_call(struct ghcb *ghcb, bool set_ghcb_msr, - struct es_em_ctxt *ctxt, u64 exit_code, - u64 exit_info_1, u64 exit_info_2) +static enum es_result sev_es_ghcb_hv_call(struct ghcb *ghcb, + struct es_em_ctxt *ctxt, + u64 exit_code, u64 exit_info_1, + u64 exit_info_2) { /* Fill in protocol and format specifiers */ ghcb->protocol_version = ghcb_version; @@ -231,14 +232,7 @@ enum es_result sev_es_ghcb_hv_call(struct ghcb *ghcb, bool set_ghcb_msr, ghcb_set_sw_exit_info_1(ghcb, exit_info_1); ghcb_set_sw_exit_info_2(ghcb, exit_info_2); - /* - * Hyper-V unenlightened guests use a paravisor for communicating and - * GHCB pages are being allocated and set up by that paravisor. Linux - * should not change the GHCB page's physical address. - */ - if (set_ghcb_msr) - sev_es_wr_ghcb_msr(__pa(ghcb)); - + sev_es_wr_ghcb_msr(__pa(ghcb)); VMGEXIT(); return verify_exception_info(ghcb, ctxt); @@ -795,7 +789,7 @@ static enum es_result vc_handle_ioio(struct ghcb *ghcb, struct es_em_ctxt *ctxt) */ sw_scratch = __pa(ghcb) + offsetof(struct ghcb, shared_buffer); ghcb_set_sw_scratch(ghcb, sw_scratch); - ret = sev_es_ghcb_hv_call(ghcb, true, ctxt, SVM_EXIT_IOIO, + ret = sev_es_ghcb_hv_call(ghcb, ctxt, SVM_EXIT_IOIO, exit_info_1, exit_info_2); if (ret != ES_OK) return ret; @@ -837,8 +831,7 @@ static enum es_result vc_handle_ioio(struct ghcb *ghcb, struct es_em_ctxt *ctxt) ghcb_set_rax(ghcb, rax); - ret = sev_es_ghcb_hv_call(ghcb, true, ctxt, - SVM_EXIT_IOIO, exit_info_1, 0); + ret = sev_es_ghcb_hv_call(ghcb, ctxt, SVM_EXIT_IOIO, exit_info_1, 0); if (ret != ES_OK) return ret; @@ -894,7 +887,7 @@ static enum es_result vc_handle_cpuid(struct ghcb *ghcb, /* xgetbv will cause #GP - use reset value for xcr0 */ ghcb_set_xcr0(ghcb, 1); - ret = sev_es_ghcb_hv_call(ghcb, true, ctxt, SVM_EXIT_CPUID, 0, 0); + ret = sev_es_ghcb_hv_call(ghcb, ctxt, SVM_EXIT_CPUID, 0, 0); if (ret != ES_OK) return ret; @@ -919,7 +912,7 @@ static enum es_result vc_handle_rdtsc(struct ghcb *ghcb, bool rdtscp = (exit_code == SVM_EXIT_RDTSCP); enum es_result ret; - ret = sev_es_ghcb_hv_call(ghcb, true, ctxt, exit_code, 0, 0); + ret = sev_es_ghcb_hv_call(ghcb, ctxt, exit_code, 0, 0); if (ret != ES_OK) return ret; diff --git a/arch/x86/kernel/sev.c b/arch/x86/kernel/sev.c index c05f0124c410..63dc626627a0 100644 --- a/arch/x86/kernel/sev.c +++ b/arch/x86/kernel/sev.c @@ -786,7 +786,7 @@ static int vmgexit_psc(struct snp_psc_desc *desc) ghcb_set_sw_scratch(ghcb, (u64)__pa(data)); /* This will advance the shared buffer data points to. */ - ret = sev_es_ghcb_hv_call(ghcb, true, &ctxt, SVM_VMGEXIT_PSC, 0, 0); + ret = sev_es_ghcb_hv_call(ghcb, &ctxt, SVM_VMGEXIT_PSC, 0, 0); /* * Page State Change VMGEXIT can pass error code through @@ -1212,8 +1212,7 @@ static enum es_result vc_handle_msr(struct ghcb *ghcb, struct es_em_ctxt *ctxt) ghcb_set_rdx(ghcb, regs->dx); } - ret = sev_es_ghcb_hv_call(ghcb, true, ctxt, SVM_EXIT_MSR, - exit_info_1, 0); + ret = sev_es_ghcb_hv_call(ghcb, ctxt, SVM_EXIT_MSR, exit_info_1, 0); if ((ret == ES_OK) && (!exit_info_1)) { regs->ax = ghcb->save.rax; @@ -1452,7 +1451,7 @@ static enum es_result vc_do_mmio(struct ghcb *ghcb, struct es_em_ctxt *ctxt, ghcb_set_sw_scratch(ghcb, ghcb_pa + offsetof(struct ghcb, shared_buffer)); - return sev_es_ghcb_hv_call(ghcb, true, ctxt, exit_code, exit_info_1, exit_info_2); + return sev_es_ghcb_hv_call(ghcb, ctxt, exit_code, exit_info_1, exit_info_2); } /* @@ -1628,7 +1627,7 @@ static enum es_result vc_handle_dr7_write(struct ghcb *ghcb, /* Using a value of 0 for ExitInfo1 means RAX holds the value */ ghcb_set_rax(ghcb, val); - ret = sev_es_ghcb_hv_call(ghcb, true, ctxt, SVM_EXIT_WRITE_DR7, 0, 0); + ret = sev_es_ghcb_hv_call(ghcb, ctxt, SVM_EXIT_WRITE_DR7, 0, 0); if (ret != ES_OK) return ret; @@ -1658,7 +1657,7 @@ static enum es_result vc_handle_dr7_read(struct ghcb *ghcb, static enum es_result vc_handle_wbinvd(struct ghcb *ghcb, struct es_em_ctxt *ctxt) { - return sev_es_ghcb_hv_call(ghcb, true, ctxt, SVM_EXIT_WBINVD, 0, 0); + return sev_es_ghcb_hv_call(ghcb, ctxt, SVM_EXIT_WBINVD, 0, 0); } static enum es_result vc_handle_rdpmc(struct ghcb *ghcb, struct es_em_ctxt *ctxt) @@ -1667,7 +1666,7 @@ static enum es_result vc_handle_rdpmc(struct ghcb *ghcb, struct es_em_ctxt *ctxt ghcb_set_rcx(ghcb, ctxt->regs->cx); - ret = sev_es_ghcb_hv_call(ghcb, true, ctxt, SVM_EXIT_RDPMC, 0, 0); + ret = sev_es_ghcb_hv_call(ghcb, ctxt, SVM_EXIT_RDPMC, 0, 0); if (ret != ES_OK) return ret; @@ -1708,7 +1707,7 @@ static enum es_result vc_handle_vmmcall(struct ghcb *ghcb, if (x86_platform.hyper.sev_es_hcall_prepare) x86_platform.hyper.sev_es_hcall_prepare(ghcb, ctxt->regs); - ret = sev_es_ghcb_hv_call(ghcb, true, ctxt, SVM_EXIT_VMMCALL, 0, 0); + ret = sev_es_ghcb_hv_call(ghcb, ctxt, SVM_EXIT_VMMCALL, 0, 0); if (ret != ES_OK) return ret; @@ -2197,7 +2196,7 @@ int snp_issue_guest_request(u64 exit_code, struct snp_req_data *input, unsigned ghcb_set_rbx(ghcb, input->data_npages); } - ret = sev_es_ghcb_hv_call(ghcb, true, &ctxt, exit_code, input->req_gpa, input->resp_gpa); + ret = sev_es_ghcb_hv_call(ghcb, &ctxt, exit_code, input->req_gpa, input->resp_gpa); if (ret) goto e_put; diff --git a/arch/x86/kernel/static_call.c b/arch/x86/kernel/static_call.c index aa72cefdd5be..aaaba85d6d7f 100644 --- a/arch/x86/kernel/static_call.c +++ b/arch/x86/kernel/static_call.c @@ -12,13 +12,21 @@ enum insn_type { }; /* + * ud1 %esp, %ecx - a 3 byte #UD that is unique to trampolines, chosen such + * that there is no false-positive trampoline identification while also being a + * speculation stop. + */ +static const u8 tramp_ud[] = { 0x0f, 0xb9, 0xcc }; + +/* * cs cs cs xorl %eax, %eax - a single 5 byte instruction that clears %[er]ax */ static const u8 xor5rax[] = { 0x2e, 0x2e, 0x2e, 0x31, 0xc0 }; static const u8 retinsn[] = { RET_INSN_OPCODE, 0xcc, 0xcc, 0xcc, 0xcc }; -static void __ref __static_call_transform(void *insn, enum insn_type type, void *func) +static void __ref __static_call_transform(void *insn, enum insn_type type, + void *func, bool modinit) { const void *emulate = NULL; int size = CALL_INSN_SIZE; @@ -43,14 +51,17 @@ static void __ref __static_call_transform(void *insn, enum insn_type type, void break; case RET: - code = &retinsn; + if (cpu_feature_enabled(X86_FEATURE_RETHUNK)) + code = text_gen_insn(JMP32_INSN_OPCODE, insn, &__x86_return_thunk); + else + code = &retinsn; break; } if (memcmp(insn, code, size) == 0) return; - if (unlikely(system_state == SYSTEM_BOOTING)) + if (system_state == SYSTEM_BOOTING || modinit) return text_poke_early(insn, code, size); text_poke_bp(insn, code, size, emulate); @@ -60,7 +71,7 @@ static void __static_call_validate(void *insn, bool tail, bool tramp) { u8 opcode = *(u8 *)insn; - if (tramp && memcmp(insn+5, "SCT", 3)) { + if (tramp && memcmp(insn+5, tramp_ud, 3)) { pr_err("trampoline signature fail"); BUG(); } @@ -104,14 +115,42 @@ void arch_static_call_transform(void *site, void *tramp, void *func, bool tail) if (tramp) { __static_call_validate(tramp, true, true); - __static_call_transform(tramp, __sc_insn(!func, true), func); + __static_call_transform(tramp, __sc_insn(!func, true), func, false); } if (IS_ENABLED(CONFIG_HAVE_STATIC_CALL_INLINE) && site) { __static_call_validate(site, tail, false); - __static_call_transform(site, __sc_insn(!func, tail), func); + __static_call_transform(site, __sc_insn(!func, tail), func, false); } mutex_unlock(&text_mutex); } EXPORT_SYMBOL_GPL(arch_static_call_transform); + +#ifdef CONFIG_RETHUNK +/* + * This is called by apply_returns() to fix up static call trampolines, + * specifically ARCH_DEFINE_STATIC_CALL_NULL_TRAMP which is recorded as + * having a return trampoline. + * + * The problem is that static_call() is available before determining + * X86_FEATURE_RETHUNK and, by implication, running alternatives. + * + * This means that __static_call_transform() above can have overwritten the + * return trampoline and we now need to fix things up to be consistent. + */ +bool __static_call_fixup(void *tramp, u8 op, void *dest) +{ + if (memcmp(tramp+5, tramp_ud, 3)) { + /* Not a trampoline site, not our problem. */ + return false; + } + + mutex_lock(&text_mutex); + if (op == RET_INSN_OPCODE || dest == &__x86_return_thunk) + __static_call_transform(tramp, RET, NULL, true); + mutex_unlock(&text_mutex); + + return true; +} +#endif diff --git a/arch/x86/kernel/tboot.c b/arch/x86/kernel/tboot.c index 0c1154a1c403..3bacd935f840 100644 --- a/arch/x86/kernel/tboot.c +++ b/arch/x86/kernel/tboot.c @@ -6,7 +6,6 @@ * Copyright (c) 2006-2009, Intel Corporation */ -#include <linux/intel-iommu.h> #include <linux/init_task.h> #include <linux/spinlock.h> #include <linux/export.h> @@ -516,17 +515,3 @@ struct acpi_table_header *tboot_get_dmar_table(struct acpi_table_header *dmar_tb return dmar_tbl; } - -int tboot_force_iommu(void) -{ - if (!tboot_enabled()) - return 0; - - if (no_iommu || dmar_disabled) - pr_warn("Forcing Intel-IOMMU to enabled\n"); - - dmar_disabled = 0; - no_iommu = 0; - - return 1; -} diff --git a/arch/x86/kernel/vmlinux.lds.S b/arch/x86/kernel/vmlinux.lds.S index 81aba718ecd5..15f29053cec4 100644 --- a/arch/x86/kernel/vmlinux.lds.S +++ b/arch/x86/kernel/vmlinux.lds.S @@ -141,7 +141,7 @@ SECTIONS #ifdef CONFIG_RETPOLINE __indirect_thunk_start = .; - *(.text.__x86.indirect_thunk) + *(.text.__x86.*) __indirect_thunk_end = .; #endif } :text =0xcccc @@ -283,6 +283,13 @@ SECTIONS *(.retpoline_sites) __retpoline_sites_end = .; } + + . = ALIGN(8); + .return_sites : AT(ADDR(.return_sites) - LOAD_OFFSET) { + __return_sites = .; + *(.return_sites) + __return_sites_end = .; + } #endif #ifdef CONFIG_X86_KERNEL_IBT @@ -385,7 +392,7 @@ SECTIONS __end_of_kernel_reserve = .; . = ALIGN(PAGE_SIZE); - .brk (NOLOAD) : AT(ADDR(.brk) - LOAD_OFFSET) { + .brk : AT(ADDR(.brk) - LOAD_OFFSET) { __brk_base = .; . += 64 * 1024; /* 64k alignment slop space */ *(.bss..brk) /* areas brk users have reserved */ diff --git a/arch/x86/kvm/cpuid.c b/arch/x86/kvm/cpuid.c index de6d44e07e34..75dcf7a72605 100644 --- a/arch/x86/kvm/cpuid.c +++ b/arch/x86/kvm/cpuid.c @@ -67,9 +67,17 @@ u32 xstate_required_size(u64 xstate_bv, bool compacted) #define F feature_bit #define SF(name) (boot_cpu_has(X86_FEATURE_##name) ? F(name) : 0) +/* + * Magic value used by KVM when querying userspace-provided CPUID entries and + * doesn't care about the CPIUD index because the index of the function in + * question is not significant. Note, this magic value must have at least one + * bit set in bits[63:32] and must be consumed as a u64 by cpuid_entry2_find() + * to avoid false positives when processing guest CPUID input. + */ +#define KVM_CPUID_INDEX_NOT_SIGNIFICANT -1ull static inline struct kvm_cpuid_entry2 *cpuid_entry2_find( - struct kvm_cpuid_entry2 *entries, int nent, u32 function, u32 index) + struct kvm_cpuid_entry2 *entries, int nent, u32 function, u64 index) { struct kvm_cpuid_entry2 *e; int i; @@ -77,9 +85,31 @@ static inline struct kvm_cpuid_entry2 *cpuid_entry2_find( for (i = 0; i < nent; i++) { e = &entries[i]; - if (e->function == function && - (!(e->flags & KVM_CPUID_FLAG_SIGNIFCANT_INDEX) || e->index == index)) + if (e->function != function) + continue; + + /* + * If the index isn't significant, use the first entry with a + * matching function. It's userspace's responsibilty to not + * provide "duplicate" entries in all cases. + */ + if (!(e->flags & KVM_CPUID_FLAG_SIGNIFCANT_INDEX) || e->index == index) + return e; + + + /* + * Similarly, use the first matching entry if KVM is doing a + * lookup (as opposed to emulating CPUID) for a function that's + * architecturally defined as not having a significant index. + */ + if (index == KVM_CPUID_INDEX_NOT_SIGNIFICANT) { + /* + * Direct lookups from KVM should not diverge from what + * KVM defines internally (the architectural behavior). + */ + WARN_ON_ONCE(cpuid_function_is_indexed(function)); return e; + } } return NULL; @@ -96,7 +126,8 @@ static int kvm_check_cpuid(struct kvm_vcpu *vcpu, * The existing code assumes virtual address is 48-bit or 57-bit in the * canonical address checks; exit if it is ever changed. */ - best = cpuid_entry2_find(entries, nent, 0x80000008, 0); + best = cpuid_entry2_find(entries, nent, 0x80000008, + KVM_CPUID_INDEX_NOT_SIGNIFICANT); if (best) { int vaddr_bits = (best->eax & 0xff00) >> 8; @@ -151,7 +182,7 @@ static void kvm_update_kvm_cpuid_base(struct kvm_vcpu *vcpu) vcpu->arch.kvm_cpuid_base = 0; for_each_possible_hypervisor_cpuid_base(function) { - entry = kvm_find_cpuid_entry(vcpu, function, 0); + entry = kvm_find_cpuid_entry(vcpu, function); if (entry) { u32 signature[3]; @@ -177,7 +208,8 @@ static struct kvm_cpuid_entry2 *__kvm_find_kvm_cpuid_features(struct kvm_vcpu *v if (!base) return NULL; - return cpuid_entry2_find(entries, nent, base | KVM_CPUID_FEATURES, 0); + return cpuid_entry2_find(entries, nent, base | KVM_CPUID_FEATURES, + KVM_CPUID_INDEX_NOT_SIGNIFICANT); } static struct kvm_cpuid_entry2 *kvm_find_kvm_cpuid_features(struct kvm_vcpu *vcpu) @@ -200,7 +232,7 @@ void kvm_update_pv_runtime(struct kvm_vcpu *vcpu) /* * Calculate guest's supported XCR0 taking into account guest CPUID data and - * supported_xcr0 (comprised of host configuration and KVM_SUPPORTED_XCR0). + * KVM's supported XCR0 (comprised of host's XCR0 and KVM_SUPPORTED_XCR0). */ static u64 cpuid_get_supported_xcr0(struct kvm_cpuid_entry2 *entries, int nent) { @@ -210,7 +242,7 @@ static u64 cpuid_get_supported_xcr0(struct kvm_cpuid_entry2 *entries, int nent) if (!best) return 0; - return (best->eax | ((u64)best->edx << 32)) & supported_xcr0; + return (best->eax | ((u64)best->edx << 32)) & kvm_caps.supported_xcr0; } static void __kvm_update_cpuid_runtime(struct kvm_vcpu *vcpu, struct kvm_cpuid_entry2 *entries, @@ -219,7 +251,7 @@ static void __kvm_update_cpuid_runtime(struct kvm_vcpu *vcpu, struct kvm_cpuid_e struct kvm_cpuid_entry2 *best; u64 guest_supported_xcr0 = cpuid_get_supported_xcr0(entries, nent); - best = cpuid_entry2_find(entries, nent, 1, 0); + best = cpuid_entry2_find(entries, nent, 1, KVM_CPUID_INDEX_NOT_SIGNIFICANT); if (best) { /* Update OSXSAVE bit */ if (boot_cpu_has(X86_FEATURE_XSAVE)) @@ -250,7 +282,7 @@ static void __kvm_update_cpuid_runtime(struct kvm_vcpu *vcpu, struct kvm_cpuid_e best->eax &= ~(1 << KVM_FEATURE_PV_UNHALT); if (!kvm_check_has_quirk(vcpu->kvm, KVM_X86_QUIRK_MISC_ENABLE_NO_MWAIT)) { - best = cpuid_entry2_find(entries, nent, 0x1, 0); + best = cpuid_entry2_find(entries, nent, 0x1, KVM_CPUID_INDEX_NOT_SIGNIFICANT); if (best) cpuid_entry_change(best, X86_FEATURE_MWAIT, vcpu->arch.ia32_misc_enable_msr & @@ -285,7 +317,7 @@ static void kvm_vcpu_after_set_cpuid(struct kvm_vcpu *vcpu) struct kvm_cpuid_entry2 *best; u64 guest_supported_xcr0; - best = kvm_find_cpuid_entry(vcpu, 1, 0); + best = kvm_find_cpuid_entry(vcpu, 1); if (best && apic) { if (cpuid_entry_has(best, X86_FEATURE_TSC_DEADLINE_TIMER)) apic->lapic_timer.timer_mode_mask = 3 << 17; @@ -325,10 +357,10 @@ int cpuid_query_maxphyaddr(struct kvm_vcpu *vcpu) { struct kvm_cpuid_entry2 *best; - best = kvm_find_cpuid_entry(vcpu, 0x80000000, 0); + best = kvm_find_cpuid_entry(vcpu, 0x80000000); if (!best || best->eax < 0x80000008) goto not_found; - best = kvm_find_cpuid_entry(vcpu, 0x80000008, 0); + best = kvm_find_cpuid_entry(vcpu, 0x80000008); if (best) return best->eax & 0xff; not_found: @@ -868,7 +900,6 @@ static inline int __do_cpuid_func(struct kvm_cpuid_array *array, u32 function) case 9: break; case 0xa: { /* Architectural Performance Monitoring */ - struct x86_pmu_capability cap; union cpuid10_eax eax; union cpuid10_edx edx; @@ -877,30 +908,20 @@ static inline int __do_cpuid_func(struct kvm_cpuid_array *array, u32 function) break; } - perf_get_x86_pmu_capability(&cap); + eax.split.version_id = kvm_pmu_cap.version; + eax.split.num_counters = kvm_pmu_cap.num_counters_gp; + eax.split.bit_width = kvm_pmu_cap.bit_width_gp; + eax.split.mask_length = kvm_pmu_cap.events_mask_len; + edx.split.num_counters_fixed = kvm_pmu_cap.num_counters_fixed; + edx.split.bit_width_fixed = kvm_pmu_cap.bit_width_fixed; - /* - * The guest architecture pmu is only supported if the architecture - * pmu exists on the host and the module parameters allow it. - */ - if (!cap.version || !enable_pmu) - memset(&cap, 0, sizeof(cap)); - - eax.split.version_id = min(cap.version, 2); - eax.split.num_counters = cap.num_counters_gp; - eax.split.bit_width = cap.bit_width_gp; - eax.split.mask_length = cap.events_mask_len; - - edx.split.num_counters_fixed = - min(cap.num_counters_fixed, KVM_PMC_MAX_FIXED); - edx.split.bit_width_fixed = cap.bit_width_fixed; - if (cap.version) + if (kvm_pmu_cap.version) edx.split.anythread_deprecated = 1; edx.split.reserved1 = 0; edx.split.reserved2 = 0; entry->eax = eax.full; - entry->ebx = cap.events_mask; + entry->ebx = kvm_pmu_cap.events_mask; entry->ecx = 0; entry->edx = edx.full; break; @@ -923,8 +944,8 @@ static inline int __do_cpuid_func(struct kvm_cpuid_array *array, u32 function) } break; case 0xd: { - u64 permitted_xcr0 = supported_xcr0 & xstate_get_guest_group_perm(); - u64 permitted_xss = supported_xss; + u64 permitted_xcr0 = kvm_caps.supported_xcr0 & xstate_get_guest_group_perm(); + u64 permitted_xss = kvm_caps.supported_xss; entry->eax &= permitted_xcr0; entry->ebx = xstate_required_size(permitted_xcr0, false); @@ -1313,12 +1334,20 @@ out_free: return r; } -struct kvm_cpuid_entry2 *kvm_find_cpuid_entry(struct kvm_vcpu *vcpu, - u32 function, u32 index) +struct kvm_cpuid_entry2 *kvm_find_cpuid_entry_index(struct kvm_vcpu *vcpu, + u32 function, u32 index) { return cpuid_entry2_find(vcpu->arch.cpuid_entries, vcpu->arch.cpuid_nent, function, index); } +EXPORT_SYMBOL_GPL(kvm_find_cpuid_entry_index); + +struct kvm_cpuid_entry2 *kvm_find_cpuid_entry(struct kvm_vcpu *vcpu, + u32 function) +{ + return cpuid_entry2_find(vcpu->arch.cpuid_entries, vcpu->arch.cpuid_nent, + function, KVM_CPUID_INDEX_NOT_SIGNIFICANT); +} EXPORT_SYMBOL_GPL(kvm_find_cpuid_entry); /* @@ -1355,7 +1384,7 @@ get_out_of_range_cpuid_entry(struct kvm_vcpu *vcpu, u32 *fn_ptr, u32 index) struct kvm_cpuid_entry2 *basic, *class; u32 function = *fn_ptr; - basic = kvm_find_cpuid_entry(vcpu, 0, 0); + basic = kvm_find_cpuid_entry(vcpu, 0); if (!basic) return NULL; @@ -1364,11 +1393,11 @@ get_out_of_range_cpuid_entry(struct kvm_vcpu *vcpu, u32 *fn_ptr, u32 index) return NULL; if (function >= 0x40000000 && function <= 0x4fffffff) - class = kvm_find_cpuid_entry(vcpu, function & 0xffffff00, 0); + class = kvm_find_cpuid_entry(vcpu, function & 0xffffff00); else if (function >= 0xc0000000) - class = kvm_find_cpuid_entry(vcpu, 0xc0000000, 0); + class = kvm_find_cpuid_entry(vcpu, 0xc0000000); else - class = kvm_find_cpuid_entry(vcpu, function & 0x80000000, 0); + class = kvm_find_cpuid_entry(vcpu, function & 0x80000000); if (class && function <= class->eax) return NULL; @@ -1386,7 +1415,7 @@ get_out_of_range_cpuid_entry(struct kvm_vcpu *vcpu, u32 *fn_ptr, u32 index) * the effective CPUID entry is the max basic leaf. Note, the index of * the original requested leaf is observed! */ - return kvm_find_cpuid_entry(vcpu, basic->eax, index); + return kvm_find_cpuid_entry_index(vcpu, basic->eax, index); } bool kvm_cpuid(struct kvm_vcpu *vcpu, u32 *eax, u32 *ebx, @@ -1396,7 +1425,7 @@ bool kvm_cpuid(struct kvm_vcpu *vcpu, u32 *eax, u32 *ebx, struct kvm_cpuid_entry2 *entry; bool exact, used_max_basic = false; - entry = kvm_find_cpuid_entry(vcpu, function, index); + entry = kvm_find_cpuid_entry_index(vcpu, function, index); exact = !!entry; if (!entry && !exact_only) { @@ -1425,7 +1454,7 @@ bool kvm_cpuid(struct kvm_vcpu *vcpu, u32 *eax, u32 *ebx, * exists. EDX can be copied from any existing index. */ if (function == 0xb || function == 0x1f) { - entry = kvm_find_cpuid_entry(vcpu, function, 1); + entry = kvm_find_cpuid_entry_index(vcpu, function, 1); if (entry) { *ecx = index & 0xff; *edx = entry->edx; diff --git a/arch/x86/kvm/cpuid.h b/arch/x86/kvm/cpuid.h index 8a770b481d9d..b1658c0de847 100644 --- a/arch/x86/kvm/cpuid.h +++ b/arch/x86/kvm/cpuid.h @@ -13,8 +13,10 @@ void kvm_set_cpu_caps(void); void kvm_update_cpuid_runtime(struct kvm_vcpu *vcpu); void kvm_update_pv_runtime(struct kvm_vcpu *vcpu); +struct kvm_cpuid_entry2 *kvm_find_cpuid_entry_index(struct kvm_vcpu *vcpu, + u32 function, u32 index); struct kvm_cpuid_entry2 *kvm_find_cpuid_entry(struct kvm_vcpu *vcpu, - u32 function, u32 index); + u32 function); int kvm_dev_ioctl_get_cpuid(struct kvm_cpuid2 *cpuid, struct kvm_cpuid_entry2 __user *entries, unsigned int type); @@ -76,7 +78,7 @@ static __always_inline u32 *guest_cpuid_get_register(struct kvm_vcpu *vcpu, const struct cpuid_reg cpuid = x86_feature_cpuid(x86_feature); struct kvm_cpuid_entry2 *entry; - entry = kvm_find_cpuid_entry(vcpu, cpuid.function, cpuid.index); + entry = kvm_find_cpuid_entry_index(vcpu, cpuid.function, cpuid.index); if (!entry) return NULL; @@ -109,7 +111,7 @@ static inline bool guest_cpuid_is_amd_or_hygon(struct kvm_vcpu *vcpu) { struct kvm_cpuid_entry2 *best; - best = kvm_find_cpuid_entry(vcpu, 0, 0); + best = kvm_find_cpuid_entry(vcpu, 0); return best && (is_guest_vendor_amd(best->ebx, best->ecx, best->edx) || is_guest_vendor_hygon(best->ebx, best->ecx, best->edx)); @@ -119,7 +121,7 @@ static inline bool guest_cpuid_is_intel(struct kvm_vcpu *vcpu) { struct kvm_cpuid_entry2 *best; - best = kvm_find_cpuid_entry(vcpu, 0, 0); + best = kvm_find_cpuid_entry(vcpu, 0); return best && is_guest_vendor_intel(best->ebx, best->ecx, best->edx); } @@ -127,7 +129,7 @@ static inline int guest_cpuid_family(struct kvm_vcpu *vcpu) { struct kvm_cpuid_entry2 *best; - best = kvm_find_cpuid_entry(vcpu, 0x1, 0); + best = kvm_find_cpuid_entry(vcpu, 0x1); if (!best) return -1; @@ -138,18 +140,23 @@ static inline int guest_cpuid_model(struct kvm_vcpu *vcpu) { struct kvm_cpuid_entry2 *best; - best = kvm_find_cpuid_entry(vcpu, 0x1, 0); + best = kvm_find_cpuid_entry(vcpu, 0x1); if (!best) return -1; return x86_model(best->eax); } +static inline bool cpuid_model_is_consistent(struct kvm_vcpu *vcpu) +{ + return boot_cpu_data.x86_model == guest_cpuid_model(vcpu); +} + static inline int guest_cpuid_stepping(struct kvm_vcpu *vcpu) { struct kvm_cpuid_entry2 *best; - best = kvm_find_cpuid_entry(vcpu, 0x1, 0); + best = kvm_find_cpuid_entry(vcpu, 0x1); if (!best) return -1; diff --git a/arch/x86/kvm/debugfs.c b/arch/x86/kvm/debugfs.c index 9240b3b7f8dd..cfed36aba2f7 100644 --- a/arch/x86/kvm/debugfs.c +++ b/arch/x86/kvm/debugfs.c @@ -48,7 +48,7 @@ DEFINE_SIMPLE_ATTRIBUTE(vcpu_tsc_scaling_fops, vcpu_get_tsc_scaling_ratio, NULL, static int vcpu_get_tsc_scaling_frac_bits(void *data, u64 *val) { - *val = kvm_tsc_scaling_ratio_frac_bits; + *val = kvm_caps.tsc_scaling_ratio_frac_bits; return 0; } @@ -66,7 +66,7 @@ void kvm_arch_create_vcpu_debugfs(struct kvm_vcpu *vcpu, struct dentry *debugfs_ debugfs_dentry, vcpu, &vcpu_timer_advance_ns_fops); - if (kvm_has_tsc_control) { + if (kvm_caps.has_tsc_control) { debugfs_create_file("tsc-scaling-ratio", 0444, debugfs_dentry, vcpu, &vcpu_tsc_scaling_fops); diff --git a/arch/x86/kvm/emulate.c b/arch/x86/kvm/emulate.c index 89b11e7dca8a..047c583596bb 100644 --- a/arch/x86/kvm/emulate.c +++ b/arch/x86/kvm/emulate.c @@ -189,9 +189,6 @@ #define X8(x...) X4(x), X4(x) #define X16(x...) X8(x), X8(x) -#define NR_FASTOP (ilog2(sizeof(ulong)) + 1) -#define FASTOP_SIZE (8 * (1 + HAS_KERNEL_IBT)) - struct opcode { u64 flags; u8 intercept; @@ -247,6 +244,9 @@ enum x86_transfer_type { static ulong reg_read(struct x86_emulate_ctxt *ctxt, unsigned nr) { + if (KVM_EMULATOR_BUG_ON(nr >= NR_EMULATOR_GPRS, ctxt)) + nr &= NR_EMULATOR_GPRS - 1; + if (!(ctxt->regs_valid & (1 << nr))) { ctxt->regs_valid |= 1 << nr; ctxt->_regs[nr] = ctxt->ops->read_gpr(ctxt, nr); @@ -256,6 +256,12 @@ static ulong reg_read(struct x86_emulate_ctxt *ctxt, unsigned nr) static ulong *reg_write(struct x86_emulate_ctxt *ctxt, unsigned nr) { + if (KVM_EMULATOR_BUG_ON(nr >= NR_EMULATOR_GPRS, ctxt)) + nr &= NR_EMULATOR_GPRS - 1; + + BUILD_BUG_ON(sizeof(ctxt->regs_dirty) * BITS_PER_BYTE < NR_EMULATOR_GPRS); + BUILD_BUG_ON(sizeof(ctxt->regs_valid) * BITS_PER_BYTE < NR_EMULATOR_GPRS); + ctxt->regs_valid |= 1 << nr; ctxt->regs_dirty |= 1 << nr; return &ctxt->_regs[nr]; @@ -269,9 +275,10 @@ static ulong *reg_rmw(struct x86_emulate_ctxt *ctxt, unsigned nr) static void writeback_registers(struct x86_emulate_ctxt *ctxt) { + unsigned long dirty = ctxt->regs_dirty; unsigned reg; - for_each_set_bit(reg, (ulong *)&ctxt->regs_dirty, 16) + for_each_set_bit(reg, &dirty, NR_EMULATOR_GPRS) ctxt->ops->write_gpr(ctxt, reg, ctxt->_regs[reg]); } @@ -306,9 +313,15 @@ static void invalidate_registers(struct x86_emulate_ctxt *ctxt) * Moreover, they are all exactly FASTOP_SIZE bytes long, so functions for * different operand sizes can be reached by calculation, rather than a jump * table (which would be bigger than the code). + * + * The 16 byte alignment, considering 5 bytes for the RET thunk, 3 for ENDBR + * and 1 for the straight line speculation INT3, leaves 7 bytes for the + * body of the function. Currently none is larger than 4. */ static int fastop(struct x86_emulate_ctxt *ctxt, fastop_t fop); +#define FASTOP_SIZE 16 + #define __FOP_FUNC(name) \ ".align " __stringify(FASTOP_SIZE) " \n\t" \ ".type " name ", @function \n\t" \ @@ -325,13 +338,15 @@ static int fastop(struct x86_emulate_ctxt *ctxt, fastop_t fop); #define FOP_RET(name) \ __FOP_RET(#name) -#define FOP_START(op) \ +#define __FOP_START(op, align) \ extern void em_##op(struct fastop *fake); \ asm(".pushsection .text, \"ax\" \n\t" \ ".global em_" #op " \n\t" \ - ".align " __stringify(FASTOP_SIZE) " \n\t" \ + ".align " __stringify(align) " \n\t" \ "em_" #op ":\n\t" +#define FOP_START(op) __FOP_START(op, FASTOP_SIZE) + #define FOP_END \ ".popsection") @@ -435,17 +450,12 @@ static int fastop(struct x86_emulate_ctxt *ctxt, fastop_t fop); /* * Depending on .config the SETcc functions look like: * - * ENDBR [4 bytes; CONFIG_X86_KERNEL_IBT] - * SETcc %al [3 bytes] - * RET [1 byte] - * INT3 [1 byte; CONFIG_SLS] - * - * Which gives possible sizes 4, 5, 8 or 9. When rounded up to the - * next power-of-two alignment they become 4, 8 or 16 resp. + * ENDBR [4 bytes; CONFIG_X86_KERNEL_IBT] + * SETcc %al [3 bytes] + * RET | JMP __x86_return_thunk [1,5 bytes; CONFIG_RETHUNK] + * INT3 [1 byte; CONFIG_SLS] */ -#define SETCC_LENGTH (ENDBR_INSN_SIZE + 4 + IS_ENABLED(CONFIG_SLS)) -#define SETCC_ALIGN (4 << IS_ENABLED(CONFIG_SLS) << HAS_KERNEL_IBT) -static_assert(SETCC_LENGTH <= SETCC_ALIGN); +#define SETCC_ALIGN 16 #define FOP_SETCC(op) \ ".align " __stringify(SETCC_ALIGN) " \n\t" \ @@ -453,9 +463,10 @@ static_assert(SETCC_LENGTH <= SETCC_ALIGN); #op ": \n\t" \ ASM_ENDBR \ #op " %al \n\t" \ - __FOP_RET(#op) + __FOP_RET(#op) \ + ".skip " __stringify(SETCC_ALIGN) " - (.-" #op "), 0xcc \n\t" -FOP_START(setcc) +__FOP_START(setcc, SETCC_ALIGN) FOP_SETCC(seto) FOP_SETCC(setno) FOP_SETCC(setc) @@ -614,7 +625,9 @@ static unsigned long seg_base(struct x86_emulate_ctxt *ctxt, int seg) static int emulate_exception(struct x86_emulate_ctxt *ctxt, int vec, u32 error, bool valid) { - WARN_ON(vec > 0x1f); + if (KVM_EMULATOR_BUG_ON(vec > 0x1f, ctxt)) + return X86EMUL_UNHANDLEABLE; + ctxt->exception.vector = vec; ctxt->exception.error_code = error; ctxt->exception.error_code_valid = valid; @@ -1361,7 +1374,8 @@ static int read_emulated(struct x86_emulate_ctxt *ctxt, if (mc->pos < mc->end) goto read_cached; - WARN_ON((mc->end + size) >= sizeof(mc->data)); + if (KVM_EMULATOR_BUG_ON((mc->end + size) >= sizeof(mc->data), ctxt)) + return X86EMUL_UNHANDLEABLE; rc = ctxt->ops->read_emulated(ctxt, addr, mc->data + mc->end, size, &ctxt->exception); @@ -1686,16 +1700,6 @@ static int __load_segment_descriptor(struct x86_emulate_ctxt *ctxt, case VCPU_SREG_TR: if (seg_desc.s || (seg_desc.type != 1 && seg_desc.type != 9)) goto exception; - if (!seg_desc.p) { - err_vec = NP_VECTOR; - goto exception; - } - old_desc = seg_desc; - seg_desc.type |= 2; /* busy */ - ret = ctxt->ops->cmpxchg_emulated(ctxt, desc_addr, &old_desc, &seg_desc, - sizeof(seg_desc), &ctxt->exception); - if (ret != X86EMUL_CONTINUE) - return ret; break; case VCPU_SREG_LDTR: if (seg_desc.s || seg_desc.type != 2) @@ -1733,8 +1737,17 @@ static int __load_segment_descriptor(struct x86_emulate_ctxt *ctxt, if (ret != X86EMUL_CONTINUE) return ret; if (emul_is_noncanonical_address(get_desc_base(&seg_desc) | - ((u64)base3 << 32), ctxt)) - return emulate_gp(ctxt, 0); + ((u64)base3 << 32), ctxt)) + return emulate_gp(ctxt, err_code); + } + + if (seg == VCPU_SREG_TR) { + old_desc = seg_desc; + seg_desc.type |= 2; /* busy */ + ret = ctxt->ops->cmpxchg_emulated(ctxt, desc_addr, &old_desc, &seg_desc, + sizeof(seg_desc), &ctxt->exception); + if (ret != X86EMUL_CONTINUE) + return ret; } load: ctxt->ops->set_segment(ctxt, selector, &seg_desc, base3, seg); @@ -2431,7 +2444,7 @@ static int rsm_load_state_32(struct x86_emulate_ctxt *ctxt, ctxt->eflags = GET_SMSTATE(u32, smstate, 0x7ff4) | X86_EFLAGS_FIXED; ctxt->_eip = GET_SMSTATE(u32, smstate, 0x7ff0); - for (i = 0; i < 8; i++) + for (i = 0; i < NR_EMULATOR_GPRS; i++) *reg_write(ctxt, i) = GET_SMSTATE(u32, smstate, 0x7fd0 + i * 4); val = GET_SMSTATE(u32, smstate, 0x7fcc); @@ -2488,7 +2501,7 @@ static int rsm_load_state_64(struct x86_emulate_ctxt *ctxt, u16 selector; int i, r; - for (i = 0; i < 16; i++) + for (i = 0; i < NR_EMULATOR_GPRS; i++) *reg_write(ctxt, i) = GET_SMSTATE(u64, smstate, 0x7ff8 - i * 8); ctxt->_eip = GET_SMSTATE(u64, smstate, 0x7f78); @@ -5718,7 +5731,8 @@ writeback: done: if (rc == X86EMUL_PROPAGATE_FAULT) { - WARN_ON(ctxt->exception.vector > 0x1f); + if (KVM_EMULATOR_BUG_ON(ctxt->exception.vector > 0x1f, ctxt)) + return EMULATION_FAILED; ctxt->have_exception = true; } if (rc == X86EMUL_INTERCEPTED) diff --git a/arch/x86/kvm/hyperv.c b/arch/x86/kvm/hyperv.c index e2e95a6fccfd..ed804447589c 100644 --- a/arch/x86/kvm/hyperv.c +++ b/arch/x86/kvm/hyperv.c @@ -1992,7 +1992,7 @@ void kvm_hv_set_cpuid(struct kvm_vcpu *vcpu) struct kvm_cpuid_entry2 *entry; struct kvm_vcpu_hv *hv_vcpu; - entry = kvm_find_cpuid_entry(vcpu, HYPERV_CPUID_INTERFACE, 0); + entry = kvm_find_cpuid_entry(vcpu, HYPERV_CPUID_INTERFACE); if (entry && entry->eax == HYPERV_CPUID_SIGNATURE_EAX) { vcpu->arch.hyperv_enabled = true; } else { @@ -2005,7 +2005,7 @@ void kvm_hv_set_cpuid(struct kvm_vcpu *vcpu) hv_vcpu = to_hv_vcpu(vcpu); - entry = kvm_find_cpuid_entry(vcpu, HYPERV_CPUID_FEATURES, 0); + entry = kvm_find_cpuid_entry(vcpu, HYPERV_CPUID_FEATURES); if (entry) { hv_vcpu->cpuid_cache.features_eax = entry->eax; hv_vcpu->cpuid_cache.features_ebx = entry->ebx; @@ -2016,7 +2016,7 @@ void kvm_hv_set_cpuid(struct kvm_vcpu *vcpu) hv_vcpu->cpuid_cache.features_edx = 0; } - entry = kvm_find_cpuid_entry(vcpu, HYPERV_CPUID_ENLIGHTMENT_INFO, 0); + entry = kvm_find_cpuid_entry(vcpu, HYPERV_CPUID_ENLIGHTMENT_INFO); if (entry) { hv_vcpu->cpuid_cache.enlightenments_eax = entry->eax; hv_vcpu->cpuid_cache.enlightenments_ebx = entry->ebx; @@ -2025,7 +2025,7 @@ void kvm_hv_set_cpuid(struct kvm_vcpu *vcpu) hv_vcpu->cpuid_cache.enlightenments_ebx = 0; } - entry = kvm_find_cpuid_entry(vcpu, HYPERV_CPUID_SYNDBG_PLATFORM_CAPABILITIES, 0); + entry = kvm_find_cpuid_entry(vcpu, HYPERV_CPUID_SYNDBG_PLATFORM_CAPABILITIES); if (entry) hv_vcpu->cpuid_cache.syndbg_cap_eax = entry->eax; else diff --git a/arch/x86/kvm/i8254.c b/arch/x86/kvm/i8254.c index 1c83076091af..e0a7a0e7a73c 100644 --- a/arch/x86/kvm/i8254.c +++ b/arch/x86/kvm/i8254.c @@ -591,7 +591,10 @@ static int speaker_ioport_write(struct kvm_vcpu *vcpu, return -EOPNOTSUPP; mutex_lock(&pit_state->lock); - pit_state->speaker_data_on = (val >> 1) & 1; + if (val & (1 << 1)) + pit_state->flags |= KVM_PIT_FLAGS_SPEAKER_DATA_ON; + else + pit_state->flags &= ~KVM_PIT_FLAGS_SPEAKER_DATA_ON; pit_set_gate(pit, 2, val & 1); mutex_unlock(&pit_state->lock); return 0; @@ -612,8 +615,9 @@ static int speaker_ioport_read(struct kvm_vcpu *vcpu, refresh_clock = ((unsigned int)ktime_to_ns(ktime_get()) >> 14) & 1; mutex_lock(&pit_state->lock); - ret = ((pit_state->speaker_data_on << 1) | pit_get_gate(pit, 2) | - (pit_get_out(pit, 2) << 5) | (refresh_clock << 4)); + ret = (!!(pit_state->flags & KVM_PIT_FLAGS_SPEAKER_DATA_ON) << 1) | + pit_get_gate(pit, 2) | (pit_get_out(pit, 2) << 5) | + (refresh_clock << 4); if (len > sizeof(ret)) len = sizeof(ret); memcpy(data, (char *)&ret, len); diff --git a/arch/x86/kvm/i8254.h b/arch/x86/kvm/i8254.h index 394d9527da7e..a768212ba821 100644 --- a/arch/x86/kvm/i8254.h +++ b/arch/x86/kvm/i8254.h @@ -29,7 +29,6 @@ struct kvm_kpit_state { bool is_periodic; s64 period; /* unit: ns */ struct hrtimer timer; - u32 speaker_data_on; struct mutex lock; atomic_t reinject; diff --git a/arch/x86/kvm/kvm_emulate.h b/arch/x86/kvm/kvm_emulate.h index 8dff25d267b7..89246446d6aa 100644 --- a/arch/x86/kvm/kvm_emulate.h +++ b/arch/x86/kvm/kvm_emulate.h @@ -89,6 +89,7 @@ struct x86_instruction_info { #define X86EMUL_INTERCEPTED 6 /* Intercepted by nested VMCB/VMCS */ struct x86_emulate_ops { + void (*vm_bugged)(struct x86_emulate_ctxt *ctxt); /* * read_gpr: read a general purpose register (rax - r15) * @@ -301,6 +302,18 @@ struct fastop; typedef void (*fastop_t)(struct fastop *); +/* + * The emulator's _regs array tracks only the GPRs, i.e. excludes RIP. RIP is + * tracked/accessed via _eip, and except for RIP relative addressing, which + * also uses _eip, RIP cannot be a register operand nor can it be an operand in + * a ModRM or SIB byte. + */ +#ifdef CONFIG_X86_64 +#define NR_EMULATOR_GPRS 16 +#else +#define NR_EMULATOR_GPRS 8 +#endif + struct x86_emulate_ctxt { void *vcpu; const struct x86_emulate_ops *ops; @@ -345,9 +358,9 @@ struct x86_emulate_ctxt { u8 lock_prefix; u8 rep_prefix; /* bitmaps of registers in _regs[] that can be read */ - u32 regs_valid; + u16 regs_valid; /* bitmaps of registers in _regs[] that have been written */ - u32 regs_dirty; + u16 regs_dirty; /* modrm */ u8 modrm; u8 modrm_mod; @@ -363,7 +376,7 @@ struct x86_emulate_ctxt { struct operand src2; struct operand dst; struct operand memop; - unsigned long _regs[NR_VCPU_REGS]; + unsigned long _regs[NR_EMULATOR_GPRS]; struct operand *memopp; struct fetch_cache fetch; struct read_cache io_read; @@ -371,6 +384,15 @@ struct x86_emulate_ctxt { bool is_branch; }; +#define KVM_EMULATOR_BUG_ON(cond, ctxt) \ +({ \ + int __ret = (cond); \ + \ + if (WARN_ON_ONCE(__ret)) \ + ctxt->ops->vm_bugged(ctxt); \ + unlikely(__ret); \ +}) + /* Repeat String Operation Prefix */ #define REPE_PREFIX 0xf3 #define REPNE_PREFIX 0xf2 diff --git a/arch/x86/kvm/lapic.c b/arch/x86/kvm/lapic.c index 0e68b4c937fc..e2ce3556915e 100644 --- a/arch/x86/kvm/lapic.c +++ b/arch/x86/kvm/lapic.c @@ -27,6 +27,7 @@ #include <linux/math64.h> #include <linux/slab.h> #include <asm/processor.h> +#include <asm/mce.h> #include <asm/msr.h> #include <asm/page.h> #include <asm/current.h> @@ -54,7 +55,7 @@ #define PRIo64 "o" /* 14 is the version for Xeon and Pentium 8.4.8*/ -#define APIC_VERSION (0x14UL | ((KVM_APIC_LVT_NUM - 1) << 16)) +#define APIC_VERSION 0x14UL #define LAPIC_MMIO_LENGTH (1 << 12) /* followed define is not in apicdef.h */ #define MAX_APIC_VECTOR 256 @@ -67,6 +68,8 @@ static bool lapic_timer_advance_dynamic __read_mostly; #define LAPIC_TIMER_ADVANCE_NS_MAX 5000 /* step-by-step approximation to mitigate fluctuation */ #define LAPIC_TIMER_ADVANCE_ADJUST_STEP 8 +static int kvm_lapic_msr_read(struct kvm_lapic *apic, u32 reg, u64 *data); +static int kvm_lapic_msr_write(struct kvm_lapic *apic, u32 reg, u64 data); static inline void __kvm_lapic_set_reg(char *regs, int reg_off, u32 val) { @@ -398,14 +401,26 @@ static inline int apic_lvt_nmi_mode(u32 lvt_val) return (lvt_val & (APIC_MODE_MASK | APIC_LVT_MASKED)) == APIC_DM_NMI; } +static inline bool kvm_lapic_lvt_supported(struct kvm_lapic *apic, int lvt_index) +{ + return apic->nr_lvt_entries > lvt_index; +} + +static inline int kvm_apic_calc_nr_lvt_entries(struct kvm_vcpu *vcpu) +{ + return KVM_APIC_MAX_NR_LVT_ENTRIES - !(vcpu->arch.mcg_cap & MCG_CMCI_P); +} + void kvm_apic_set_version(struct kvm_vcpu *vcpu) { struct kvm_lapic *apic = vcpu->arch.apic; - u32 v = APIC_VERSION; + u32 v = 0; if (!lapic_in_kernel(vcpu)) return; + v = APIC_VERSION | ((apic->nr_lvt_entries - 1) << 16); + /* * KVM emulates 82093AA datasheet (with in-kernel IOAPIC implementation) * which doesn't have EOI register; Some buggy OSes (e.g. Windows with @@ -419,12 +434,33 @@ void kvm_apic_set_version(struct kvm_vcpu *vcpu) kvm_lapic_set_reg(apic, APIC_LVR, v); } -static const unsigned int apic_lvt_mask[KVM_APIC_LVT_NUM] = { - LVT_MASK , /* part LVTT mask, timer mode mask added at runtime */ - LVT_MASK | APIC_MODE_MASK, /* LVTTHMR */ - LVT_MASK | APIC_MODE_MASK, /* LVTPC */ - LINT_MASK, LINT_MASK, /* LVT0-1 */ - LVT_MASK /* LVTERR */ +void kvm_apic_after_set_mcg_cap(struct kvm_vcpu *vcpu) +{ + int nr_lvt_entries = kvm_apic_calc_nr_lvt_entries(vcpu); + struct kvm_lapic *apic = vcpu->arch.apic; + int i; + + if (!lapic_in_kernel(vcpu) || nr_lvt_entries == apic->nr_lvt_entries) + return; + + /* Initialize/mask any "new" LVT entries. */ + for (i = apic->nr_lvt_entries; i < nr_lvt_entries; i++) + kvm_lapic_set_reg(apic, APIC_LVTx(i), APIC_LVT_MASKED); + + apic->nr_lvt_entries = nr_lvt_entries; + + /* The number of LVT entries is reflected in the version register. */ + kvm_apic_set_version(vcpu); +} + +static const unsigned int apic_lvt_mask[KVM_APIC_MAX_NR_LVT_ENTRIES] = { + [LVT_TIMER] = LVT_MASK, /* timer mode mask added at runtime */ + [LVT_THERMAL_MONITOR] = LVT_MASK | APIC_MODE_MASK, + [LVT_PERFORMANCE_COUNTER] = LVT_MASK | APIC_MODE_MASK, + [LVT_LINT0] = LINT_MASK, + [LVT_LINT1] = LINT_MASK, + [LVT_ERROR] = LVT_MASK, + [LVT_CMCI] = LVT_MASK | APIC_MODE_MASK }; static int find_highest_vector(void *bitmap) @@ -518,14 +554,11 @@ static inline int apic_find_highest_irr(struct kvm_lapic *apic) static inline void apic_clear_irr(int vec, struct kvm_lapic *apic) { - struct kvm_vcpu *vcpu; - - vcpu = apic->vcpu; - - if (unlikely(vcpu->arch.apicv_active)) { + if (unlikely(apic->apicv_active)) { /* need to update RVI */ kvm_lapic_clear_vector(vec, apic->regs + APIC_IRR); - static_call_cond(kvm_x86_hwapic_irr_update)(vcpu, apic_find_highest_irr(apic)); + static_call_cond(kvm_x86_hwapic_irr_update)(apic->vcpu, + apic_find_highest_irr(apic)); } else { apic->irr_pending = false; kvm_lapic_clear_vector(vec, apic->regs + APIC_IRR); @@ -542,20 +575,16 @@ EXPORT_SYMBOL_GPL(kvm_apic_clear_irr); static inline void apic_set_isr(int vec, struct kvm_lapic *apic) { - struct kvm_vcpu *vcpu; - if (__apic_test_and_set_vector(vec, apic->regs + APIC_ISR)) return; - vcpu = apic->vcpu; - /* * With APIC virtualization enabled, all caching is disabled * because the processor can modify ISR under the hood. Instead * just set SVI. */ - if (unlikely(vcpu->arch.apicv_active)) - static_call_cond(kvm_x86_hwapic_isr_update)(vcpu, vec); + if (unlikely(apic->apicv_active)) + static_call_cond(kvm_x86_hwapic_isr_update)(vec); else { ++apic->isr_count; BUG_ON(apic->isr_count > MAX_APIC_VECTOR); @@ -589,12 +618,9 @@ static inline int apic_find_highest_isr(struct kvm_lapic *apic) static inline void apic_clear_isr(int vec, struct kvm_lapic *apic) { - struct kvm_vcpu *vcpu; if (!__apic_test_and_clear_vector(vec, apic->regs + APIC_ISR)) return; - vcpu = apic->vcpu; - /* * We do get here for APIC virtualization enabled if the guest * uses the Hyper-V APIC enlightenment. In this case we may need @@ -602,8 +628,8 @@ static inline void apic_clear_isr(int vec, struct kvm_lapic *apic) * on the other hand isr_count and highest_isr_cache are unused * and must be left alone. */ - if (unlikely(vcpu->arch.apicv_active)) - static_call_cond(kvm_x86_hwapic_isr_update)(vcpu, apic_find_highest_isr(apic)); + if (unlikely(apic->apicv_active)) + static_call_cond(kvm_x86_hwapic_isr_update)(apic_find_highest_isr(apic)); else { --apic->isr_count; BUG_ON(apic->isr_count < 0); @@ -801,17 +827,17 @@ static bool kvm_apic_match_physical_addr(struct kvm_lapic *apic, u32 mda) if (kvm_apic_broadcast(apic, mda)) return true; - if (apic_x2apic_mode(apic)) - return mda == kvm_x2apic_id(apic); - /* - * Hotplug hack: Make LAPIC in xAPIC mode also accept interrupts as if - * it were in x2APIC mode. Hotplugged VCPUs start in xAPIC mode and - * this allows unique addressing of VCPUs with APIC ID over 0xff. - * The 0xff condition is needed because writeable xAPIC ID. + * Hotplug hack: Accept interrupts for vCPUs in xAPIC mode as if they + * were in x2APIC mode if the target APIC ID can't be encoded as an + * xAPIC ID. This allows unique addressing of hotplugged vCPUs (which + * start in xAPIC mode) with an APIC ID that is unaddressable in xAPIC + * mode. Match the x2APIC ID if and only if the target APIC ID can't + * be encoded in xAPIC to avoid spurious matches against a vCPU that + * changed its (addressable) xAPIC ID (which is writable). */ - if (kvm_x2apic_id(apic) > 0xff && mda == kvm_x2apic_id(apic)) - return true; + if (apic_x2apic_mode(apic) || mda > 0xff) + return mda == kvm_x2apic_id(apic); return mda == kvm_xapic_id(apic); } @@ -1325,7 +1351,7 @@ void kvm_apic_send_ipi(struct kvm_lapic *apic, u32 icr_low, u32 icr_high) if (apic_x2apic_mode(apic)) irq.dest_id = icr_high; else - irq.dest_id = GET_APIC_DEST_FIELD(icr_high); + irq.dest_id = GET_XAPIC_DEST_FIELD(icr_high); trace_kvm_apic_ipi(icr_low, irq.dest_id); @@ -1444,6 +1470,9 @@ static int kvm_lapic_reg_read(struct kvm_lapic *apic, u32 offset, int len, APIC_REG_MASK(APIC_TMCCT) | APIC_REG_MASK(APIC_TDCR); + if (kvm_lapic_lvt_supported(apic, LVT_CMCI)) + valid_reg_mask |= APIC_REG_MASK(APIC_LVTCMCI); + /* * ARBPRI and ICR2 are not valid in x2APIC mode. WARN if KVM reads ICR * in x2APIC mode as it's an 8-byte register in x2APIC and needs to be @@ -1583,7 +1612,7 @@ static bool lapic_timer_int_injected(struct kvm_vcpu *vcpu) int vec = reg & APIC_VECTOR_MASK; void *bitmap = apic->regs + APIC_ISR; - if (vcpu->arch.apicv_active) + if (apic->apicv_active) bitmap = apic->regs + APIC_IRR; if (apic_test_vector(vec, bitmap)) @@ -1602,7 +1631,7 @@ static inline void __wait_lapic_expire(struct kvm_vcpu *vcpu, u64 guest_cycles) * that __delay() uses delay_tsc whenever the hardware has TSC, thus * always for VMX enabled hardware. */ - if (vcpu->arch.tsc_scaling_ratio == kvm_default_tsc_scaling_ratio) { + if (vcpu->arch.tsc_scaling_ratio == kvm_caps.default_tsc_scaling_ratio) { __delay(min(guest_cycles, nsec_to_cycles(vcpu, timer_advance_ns))); } else { @@ -1700,7 +1729,7 @@ static void apic_timer_expired(struct kvm_lapic *apic, bool from_timer_fn) if (apic_lvtt_tscdeadline(apic) || ktimer->hv_timer_in_use) ktimer->expired_tscdeadline = ktimer->tscdeadline; - if (!from_timer_fn && vcpu->arch.apicv_active) { + if (!from_timer_fn && apic->apicv_active) { WARN_ON(kvm_get_running_vcpu() != vcpu); kvm_apic_inject_pending_timer_irqs(apic); return; @@ -2052,6 +2081,16 @@ static void kvm_lapic_xapic_id_updated(struct kvm_lapic *apic) kvm_set_apicv_inhibit(apic->vcpu->kvm, APICV_INHIBIT_REASON_APIC_ID_MODIFIED); } +static int get_lvt_index(u32 reg) +{ + if (reg == APIC_LVTCMCI) + return LVT_CMCI; + if (reg < APIC_LVTT || reg > APIC_LVTERR) + return -1; + return array_index_nospec( + (reg - APIC_LVTT) >> 4, KVM_APIC_MAX_NR_LVT_ENTRIES); +} + static int kvm_lapic_reg_write(struct kvm_lapic *apic, u32 reg, u32 val) { int ret = 0; @@ -2098,13 +2137,10 @@ static int kvm_lapic_reg_write(struct kvm_lapic *apic, u32 reg, u32 val) apic_set_spiv(apic, val & mask); if (!(val & APIC_SPIV_APIC_ENABLED)) { int i; - u32 lvt_val; - for (i = 0; i < KVM_APIC_LVT_NUM; i++) { - lvt_val = kvm_lapic_get_reg(apic, - APIC_LVTT + 0x10 * i); - kvm_lapic_set_reg(apic, APIC_LVTT + 0x10 * i, - lvt_val | APIC_LVT_MASKED); + for (i = 0; i < apic->nr_lvt_entries; i++) { + kvm_lapic_set_reg(apic, APIC_LVTx(i), + kvm_lapic_get_reg(apic, APIC_LVTx(i)) | APIC_LVT_MASKED); } apic_update_lvtt(apic); atomic_set(&apic->lapic_timer.pending, 0); @@ -2133,16 +2169,15 @@ static int kvm_lapic_reg_write(struct kvm_lapic *apic, u32 reg, u32 val) case APIC_LVTTHMR: case APIC_LVTPC: case APIC_LVT1: - case APIC_LVTERR: { - /* TODO: Check vector */ - size_t size; - u32 index; - + case APIC_LVTERR: + case APIC_LVTCMCI: { + u32 index = get_lvt_index(reg); + if (!kvm_lapic_lvt_supported(apic, index)) { + ret = 1; + break; + } if (!kvm_apic_sw_enabled(apic)) val |= APIC_LVT_MASKED; - size = ARRAY_SIZE(apic_lvt_mask); - index = array_index_nospec( - (reg - APIC_LVTT) >> 4, size); val &= apic_lvt_mask[index]; kvm_lapic_set_reg(apic, reg, val); break; @@ -2246,10 +2281,26 @@ EXPORT_SYMBOL_GPL(kvm_lapic_set_eoi); /* emulate APIC access in a trap manner */ void kvm_apic_write_nodecode(struct kvm_vcpu *vcpu, u32 offset) { - u32 val = kvm_lapic_get_reg(vcpu->arch.apic, offset); + struct kvm_lapic *apic = vcpu->arch.apic; + u64 val; + + if (apic_x2apic_mode(apic)) + kvm_lapic_msr_read(apic, offset, &val); + else + val = kvm_lapic_get_reg(apic, offset); - /* TODO: optimize to just emulate side effect w/o one more write */ - kvm_lapic_reg_write(vcpu->arch.apic, offset, val); + /* + * ICR is a single 64-bit register when x2APIC is enabled. For legacy + * xAPIC, ICR writes need to go down the common (slightly slower) path + * to get the upper half from ICR2. + */ + if (apic_x2apic_mode(apic) && offset == APIC_ICR) { + kvm_apic_send_ipi(apic, (u32)val, (u32)(val >> 32)); + trace_kvm_apic_write(APIC_ICR, val); + } else { + /* TODO: optimize to just emulate side effect w/o one more write */ + kvm_lapic_reg_write(apic, offset, (u32)val); + } } EXPORT_SYMBOL_GPL(kvm_apic_write_nodecode); @@ -2344,8 +2395,10 @@ void kvm_lapic_set_base(struct kvm_vcpu *vcpu, u64 value) if (((old_value ^ value) & X2APIC_ENABLE) && (value & X2APIC_ENABLE)) kvm_apic_set_x2apic_id(apic, vcpu->vcpu_id); - if ((old_value ^ value) & (MSR_IA32_APICBASE_ENABLE | X2APIC_ENABLE)) + if ((old_value ^ value) & (MSR_IA32_APICBASE_ENABLE | X2APIC_ENABLE)) { + kvm_vcpu_update_apicv(vcpu); static_call_cond(kvm_x86_set_virtual_apic_mode)(vcpu); + } apic->base_address = apic->vcpu->arch.apic_base & MSR_IA32_APICBASE_BASE; @@ -2361,7 +2414,7 @@ void kvm_apic_update_apicv(struct kvm_vcpu *vcpu) { struct kvm_lapic *apic = vcpu->arch.apic; - if (vcpu->arch.apicv_active) { + if (apic->apicv_active) { /* irr_pending is always true when apicv is activated. */ apic->irr_pending = true; apic->isr_count = 1; @@ -2401,8 +2454,8 @@ void kvm_lapic_reset(struct kvm_vcpu *vcpu, bool init_event) kvm_apic_set_xapic_id(apic, vcpu->vcpu_id); kvm_apic_set_version(apic->vcpu); - for (i = 0; i < KVM_APIC_LVT_NUM; i++) - kvm_lapic_set_reg(apic, APIC_LVTT + 0x10 * i, APIC_LVT_MASKED); + for (i = 0; i < apic->nr_lvt_entries; i++) + kvm_lapic_set_reg(apic, APIC_LVTx(i), APIC_LVT_MASKED); apic_update_lvtt(apic); if (kvm_vcpu_is_reset_bsp(vcpu) && kvm_check_has_quirk(vcpu->kvm, KVM_X86_QUIRK_LINT0_REENABLED)) @@ -2436,10 +2489,10 @@ void kvm_lapic_reset(struct kvm_vcpu *vcpu, bool init_event) vcpu->arch.pv_eoi.msr_val = 0; apic_update_ppr(apic); - if (vcpu->arch.apicv_active) { + if (apic->apicv_active) { static_call_cond(kvm_x86_apicv_post_state_restore)(vcpu); static_call_cond(kvm_x86_hwapic_irr_update)(vcpu, -1); - static_call_cond(kvm_x86_hwapic_isr_update)(vcpu, -1); + static_call_cond(kvm_x86_hwapic_isr_update)(-1); } vcpu->arch.apic_arb_prio = 0; @@ -2532,6 +2585,8 @@ int kvm_create_lapic(struct kvm_vcpu *vcpu, int timer_advance_ns) } apic->vcpu = vcpu; + apic->nr_lvt_entries = kvm_apic_calc_nr_lvt_entries(vcpu); + hrtimer_init(&apic->lapic_timer.timer, CLOCK_MONOTONIC, HRTIMER_MODE_ABS_HARD); apic->lapic_timer.timer.function = apic_timer_fn; @@ -2716,10 +2771,10 @@ int kvm_apic_set_state(struct kvm_vcpu *vcpu, struct kvm_lapic_state *s) kvm_lapic_set_reg(apic, APIC_TMCCT, 0); kvm_apic_update_apicv(vcpu); apic->highest_isr_cache = -1; - if (vcpu->arch.apicv_active) { + if (apic->apicv_active) { static_call_cond(kvm_x86_apicv_post_state_restore)(vcpu); static_call_cond(kvm_x86_hwapic_irr_update)(vcpu, apic_find_highest_irr(apic)); - static_call_cond(kvm_x86_hwapic_isr_update)(vcpu, apic_find_highest_isr(apic)); + static_call_cond(kvm_x86_hwapic_isr_update)(apic_find_highest_isr(apic)); } kvm_make_request(KVM_REQ_EVENT, vcpu); if (ioapic_in_kernel(vcpu->kvm)) diff --git a/arch/x86/kvm/lapic.h b/arch/x86/kvm/lapic.h index 65bb2a8cf145..117a46df5cc1 100644 --- a/arch/x86/kvm/lapic.h +++ b/arch/x86/kvm/lapic.h @@ -10,7 +10,6 @@ #define KVM_APIC_INIT 0 #define KVM_APIC_SIPI 1 -#define KVM_APIC_LVT_NUM 6 #define APIC_SHORT_MASK 0xc0000 #define APIC_DEST_NOSHORT 0x0 @@ -29,6 +28,20 @@ enum lapic_mode { LAPIC_MODE_X2APIC = MSR_IA32_APICBASE_ENABLE | X2APIC_ENABLE, }; +enum lapic_lvt_entry { + LVT_TIMER, + LVT_THERMAL_MONITOR, + LVT_PERFORMANCE_COUNTER, + LVT_LINT0, + LVT_LINT1, + LVT_ERROR, + LVT_CMCI, + + KVM_APIC_MAX_NR_LVT_ENTRIES, +}; + +#define APIC_LVTx(x) ((x) == LVT_CMCI ? APIC_LVTCMCI : APIC_LVTT + 0x10 * (x)) + struct kvm_timer { struct hrtimer timer; s64 period; /* unit: ns */ @@ -48,6 +61,7 @@ struct kvm_lapic { struct kvm_timer lapic_timer; u32 divide_count; struct kvm_vcpu *vcpu; + bool apicv_active; bool sw_enabled; bool irr_pending; bool lvt0_in_nmi_mode; @@ -65,6 +79,7 @@ struct kvm_lapic { struct gfn_to_hva_cache vapic_cache; unsigned long pending_events; unsigned int sipi_vector; + int nr_lvt_entries; }; struct dest_map; @@ -84,6 +99,7 @@ void kvm_lapic_set_base(struct kvm_vcpu *vcpu, u64 value); u64 kvm_lapic_get_base(struct kvm_vcpu *vcpu); void kvm_recalculate_apic_map(struct kvm *kvm); void kvm_apic_set_version(struct kvm_vcpu *vcpu); +void kvm_apic_after_set_mcg_cap(struct kvm_vcpu *vcpu); bool kvm_apic_match_dest(struct kvm_vcpu *vcpu, struct kvm_lapic *source, int shorthand, unsigned int dest, int dest_mode); int kvm_apic_compare_prio(struct kvm_vcpu *vcpu1, struct kvm_vcpu *vcpu2); @@ -204,7 +220,7 @@ static inline int apic_x2apic_mode(struct kvm_lapic *apic) static inline bool kvm_vcpu_apicv_active(struct kvm_vcpu *vcpu) { - return vcpu->arch.apic && vcpu->arch.apicv_active; + return lapic_in_kernel(vcpu) && vcpu->arch.apic->apicv_active; } static inline bool kvm_apic_has_events(struct kvm_vcpu *vcpu) diff --git a/arch/x86/kvm/mmu.h b/arch/x86/kvm/mmu.h index f8192864b496..a99acec925eb 100644 --- a/arch/x86/kvm/mmu.h +++ b/arch/x86/kvm/mmu.h @@ -6,11 +6,6 @@ #include "kvm_cache_regs.h" #include "cpuid.h" -#define PT64_PT_BITS 9 -#define PT64_ENT_PER_PAGE (1 << PT64_PT_BITS) -#define PT32_PT_BITS 10 -#define PT32_ENT_PER_PAGE (1 << PT32_PT_BITS) - #define PT_WRITABLE_SHIFT 1 #define PT_USER_SHIFT 2 @@ -34,11 +29,6 @@ #define PT_DIR_PAT_SHIFT 12 #define PT_DIR_PAT_MASK (1ULL << PT_DIR_PAT_SHIFT) -#define PT32_DIR_PSE36_SIZE 4 -#define PT32_DIR_PSE36_SHIFT 13 -#define PT32_DIR_PSE36_MASK \ - (((1ULL << PT32_DIR_PSE36_SIZE) - 1) << PT32_DIR_PSE36_SHIFT) - #define PT64_ROOT_5LEVEL 5 #define PT64_ROOT_4LEVEL 4 #define PT32_ROOT_LEVEL 2 diff --git a/arch/x86/kvm/mmu/mmu.c b/arch/x86/kvm/mmu/mmu.c index 17252f39bd7c..06ac8c7cef67 100644 --- a/arch/x86/kvm/mmu/mmu.c +++ b/arch/x86/kvm/mmu/mmu.c @@ -53,8 +53,6 @@ #include <asm/kvm_page_track.h> #include "trace.h" -#include "paging.h" - extern bool itlb_multihit_kvm_mitigation; int __read_mostly nx_huge_pages = -1; @@ -111,26 +109,6 @@ module_param(dbg, bool, 0644); #define PTE_PREFETCH_NUM 8 -#define PT32_LEVEL_BITS 10 - -#define PT32_LEVEL_SHIFT(level) \ - (PAGE_SHIFT + (level - 1) * PT32_LEVEL_BITS) - -#define PT32_LVL_OFFSET_MASK(level) \ - (PT32_BASE_ADDR_MASK & ((1ULL << (PAGE_SHIFT + (((level) - 1) \ - * PT32_LEVEL_BITS))) - 1)) - -#define PT32_INDEX(address, level)\ - (((address) >> PT32_LEVEL_SHIFT(level)) & ((1 << PT32_LEVEL_BITS) - 1)) - - -#define PT32_BASE_ADDR_MASK PAGE_MASK -#define PT32_DIR_BASE_ADDR_MASK \ - (PAGE_MASK & ~((1ULL << (PAGE_SHIFT + PT32_LEVEL_BITS)) - 1)) -#define PT32_LVL_ADDR_MASK(level) \ - (PAGE_MASK & ~((1ULL << (PAGE_SHIFT + (((level) - 1) \ - * PT32_LEVEL_BITS))) - 1)) - #include <trace/events/kvm.h> /* make pte_list_desc fit well in cache lines */ @@ -326,13 +304,6 @@ static int is_cpuid_PSE36(void) return 1; } -static gfn_t pse36_gfn_delta(u32 gpte) -{ - int shift = 32 - PT32_DIR_PSE36_SHIFT - PAGE_SHIFT; - - return (gpte & PT32_DIR_PSE36_MASK) << shift; -} - #ifdef CONFIG_X86_64 static void __set_spte(u64 *sptep, u64 spte) { @@ -432,7 +403,7 @@ static u64 __update_clear_spte_slow(u64 *sptep, u64 spte) * The idea using the light way get the spte on x86_32 guest is from * gup_get_pte (mm/gup.c). * - * An spte tlb flush may be pending, because kvm_set_pte_rmapp + * An spte tlb flush may be pending, because kvm_set_pte_rmap * coalesces them and we are running out of the MMU lock. Therefore * we need to protect against in-progress updates of the spte. * @@ -558,11 +529,12 @@ static bool mmu_spte_update(u64 *sptep, u64 new_spte) * state bits, it is used to clear the last level sptep. * Returns the old PTE. */ -static int mmu_spte_clear_track_bits(struct kvm *kvm, u64 *sptep) +static u64 mmu_spte_clear_track_bits(struct kvm *kvm, u64 *sptep) { kvm_pfn_t pfn; u64 old_spte = *sptep; int level = sptep_to_sp(sptep)->role.level; + struct page *page; if (!is_shadow_present_pte(old_spte) || !spte_has_volatile_bits(old_spte)) @@ -578,11 +550,13 @@ static int mmu_spte_clear_track_bits(struct kvm *kvm, u64 *sptep) pfn = spte_to_pfn(old_spte); /* - * KVM does not hold the refcount of the page used by - * kvm mmu, before reclaiming the page, we should - * unmap it from mmu first. + * KVM doesn't hold a reference to any pages mapped into the guest, and + * instead uses the mmu_notifier to ensure that KVM unmaps any pages + * before they are reclaimed. Sanity check that, if the pfn is backed + * by a refcounted page, the refcount is elevated. */ - WARN_ON(!kvm_is_reserved_pfn(pfn) && !page_count(pfn_to_page(pfn))); + page = kvm_pfn_to_refcounted_page(pfn); + WARN_ON(page && !page_count(page)); if (is_accessed_spte(old_spte)) kvm_set_pfn_accessed(pfn); @@ -682,7 +656,7 @@ static int mmu_topup_memory_caches(struct kvm_vcpu *vcpu, bool maybe_indirect) if (r) return r; if (maybe_indirect) { - r = kvm_mmu_topup_memory_cache(&vcpu->arch.mmu_gfn_array_cache, + r = kvm_mmu_topup_memory_cache(&vcpu->arch.mmu_shadowed_info_cache, PT64_ROOT_MAX_LEVEL); if (r) return r; @@ -695,48 +669,79 @@ static void mmu_free_memory_caches(struct kvm_vcpu *vcpu) { kvm_mmu_free_memory_cache(&vcpu->arch.mmu_pte_list_desc_cache); kvm_mmu_free_memory_cache(&vcpu->arch.mmu_shadow_page_cache); - kvm_mmu_free_memory_cache(&vcpu->arch.mmu_gfn_array_cache); + kvm_mmu_free_memory_cache(&vcpu->arch.mmu_shadowed_info_cache); kvm_mmu_free_memory_cache(&vcpu->arch.mmu_page_header_cache); } -static struct pte_list_desc *mmu_alloc_pte_list_desc(struct kvm_vcpu *vcpu) -{ - return kvm_mmu_memory_cache_alloc(&vcpu->arch.mmu_pte_list_desc_cache); -} - static void mmu_free_pte_list_desc(struct pte_list_desc *pte_list_desc) { kmem_cache_free(pte_list_desc_cache, pte_list_desc); } +static bool sp_has_gptes(struct kvm_mmu_page *sp); + static gfn_t kvm_mmu_page_get_gfn(struct kvm_mmu_page *sp, int index) { if (sp->role.passthrough) return sp->gfn; if (!sp->role.direct) - return sp->gfns[index]; + return sp->shadowed_translation[index] >> PAGE_SHIFT; - return sp->gfn + (index << ((sp->role.level - 1) * PT64_LEVEL_BITS)); + return sp->gfn + (index << ((sp->role.level - 1) * SPTE_LEVEL_BITS)); } -static void kvm_mmu_page_set_gfn(struct kvm_mmu_page *sp, int index, gfn_t gfn) +/* + * For leaf SPTEs, fetch the *guest* access permissions being shadowed. Note + * that the SPTE itself may have a more constrained access permissions that + * what the guest enforces. For example, a guest may create an executable + * huge PTE but KVM may disallow execution to mitigate iTLB multihit. + */ +static u32 kvm_mmu_page_get_access(struct kvm_mmu_page *sp, int index) { - if (sp->role.passthrough) { - WARN_ON_ONCE(gfn != sp->gfn); - return; - } + if (sp_has_gptes(sp)) + return sp->shadowed_translation[index] & ACC_ALL; - if (!sp->role.direct) { - sp->gfns[index] = gfn; + /* + * For direct MMUs (e.g. TDP or non-paging guests) or passthrough SPs, + * KVM is not shadowing any guest page tables, so the "guest access + * permissions" are just ACC_ALL. + * + * For direct SPs in indirect MMUs (shadow paging), i.e. when KVM + * is shadowing a guest huge page with small pages, the guest access + * permissions being shadowed are the access permissions of the huge + * page. + * + * In both cases, sp->role.access contains the correct access bits. + */ + return sp->role.access; +} + +static void kvm_mmu_page_set_translation(struct kvm_mmu_page *sp, int index, + gfn_t gfn, unsigned int access) +{ + if (sp_has_gptes(sp)) { + sp->shadowed_translation[index] = (gfn << PAGE_SHIFT) | access; return; } - if (WARN_ON(gfn != kvm_mmu_page_get_gfn(sp, index))) - pr_err_ratelimited("gfn mismatch under direct page %llx " - "(expected %llx, got %llx)\n", - sp->gfn, - kvm_mmu_page_get_gfn(sp, index), gfn); + WARN_ONCE(access != kvm_mmu_page_get_access(sp, index), + "access mismatch under %s page %llx (expected %u, got %u)\n", + sp->role.passthrough ? "passthrough" : "direct", + sp->gfn, kvm_mmu_page_get_access(sp, index), access); + + WARN_ONCE(gfn != kvm_mmu_page_get_gfn(sp, index), + "gfn mismatch under %s page %llx (expected %llx, got %llx)\n", + sp->role.passthrough ? "passthrough" : "direct", + sp->gfn, kvm_mmu_page_get_gfn(sp, index), gfn); +} + +static void kvm_mmu_page_set_access(struct kvm_mmu_page *sp, int index, + unsigned int access) +{ + gfn_t gfn = kvm_mmu_page_get_gfn(sp, index); + + kvm_mmu_page_set_translation(sp, index, gfn, access); } /* @@ -792,6 +797,9 @@ static void account_shadowed(struct kvm *kvm, struct kvm_mmu_page *sp) KVM_PAGE_TRACK_WRITE); kvm_mmu_gfn_disallow_lpage(slot, gfn); + + if (kvm_mmu_slot_gfn_write_protect(kvm, slot, gfn, PG_LEVEL_4K)) + kvm_flush_remote_tlbs_with_address(kvm, gfn, 1); } void account_huge_nx_page(struct kvm *kvm, struct kvm_mmu_page *sp) @@ -855,7 +863,7 @@ gfn_to_memslot_dirty_bitmap(struct kvm_vcpu *vcpu, gfn_t gfn, /* * Returns the number of pointers in the rmap chain, not counting the new one. */ -static int pte_list_add(struct kvm_vcpu *vcpu, u64 *spte, +static int pte_list_add(struct kvm_mmu_memory_cache *cache, u64 *spte, struct kvm_rmap_head *rmap_head) { struct pte_list_desc *desc; @@ -866,7 +874,7 @@ static int pte_list_add(struct kvm_vcpu *vcpu, u64 *spte, rmap_head->val = (unsigned long)spte; } else if (!(rmap_head->val & 1)) { rmap_printk("%p %llx 1->many\n", spte, *spte); - desc = mmu_alloc_pte_list_desc(vcpu); + desc = kvm_mmu_memory_cache_alloc(cache); desc->sptes[0] = (u64 *)rmap_head->val; desc->sptes[1] = spte; desc->spte_count = 2; @@ -878,7 +886,7 @@ static int pte_list_add(struct kvm_vcpu *vcpu, u64 *spte, while (desc->spte_count == PTE_LIST_EXT) { count += PTE_LIST_EXT; if (!desc->more) { - desc->more = mmu_alloc_pte_list_desc(vcpu); + desc->more = kvm_mmu_memory_cache_alloc(cache); desc = desc->more; desc->spte_count = 0; break; @@ -913,7 +921,7 @@ pte_list_desc_remove_entry(struct kvm_rmap_head *rmap_head, mmu_free_pte_list_desc(desc); } -static void __pte_list_remove(u64 *spte, struct kvm_rmap_head *rmap_head) +static void pte_list_remove(u64 *spte, struct kvm_rmap_head *rmap_head) { struct pte_list_desc *desc; struct pte_list_desc *prev_desc; @@ -949,15 +957,16 @@ static void __pte_list_remove(u64 *spte, struct kvm_rmap_head *rmap_head) } } -static void pte_list_remove(struct kvm *kvm, struct kvm_rmap_head *rmap_head, - u64 *sptep) +static void kvm_zap_one_rmap_spte(struct kvm *kvm, + struct kvm_rmap_head *rmap_head, u64 *sptep) { mmu_spte_clear_track_bits(kvm, sptep); - __pte_list_remove(sptep, rmap_head); + pte_list_remove(sptep, rmap_head); } -/* Return true if rmap existed, false otherwise */ -static bool pte_list_destroy(struct kvm *kvm, struct kvm_rmap_head *rmap_head) +/* Return true if at least one SPTE was zapped, false otherwise */ +static bool kvm_zap_all_rmap_sptes(struct kvm *kvm, + struct kvm_rmap_head *rmap_head) { struct pte_list_desc *desc, *next; int i; @@ -1030,7 +1039,7 @@ static void rmap_remove(struct kvm *kvm, u64 *spte) struct kvm_rmap_head *rmap_head; sp = sptep_to_sp(spte); - gfn = kvm_mmu_page_get_gfn(sp, spte - sp->spt); + gfn = kvm_mmu_page_get_gfn(sp, spte_index(spte)); /* * Unlike rmap_add, rmap_remove does not run in the context of a vCPU @@ -1042,7 +1051,7 @@ static void rmap_remove(struct kvm *kvm, u64 *spte) slot = __gfn_to_memslot(slots, gfn); rmap_head = gfn_to_rmap(gfn, sp->role.level, slot); - __pte_list_remove(spte, rmap_head); + pte_list_remove(spte, rmap_head); } /* @@ -1129,26 +1138,18 @@ static void drop_spte(struct kvm *kvm, u64 *sptep) rmap_remove(kvm, sptep); } - -static bool __drop_large_spte(struct kvm *kvm, u64 *sptep) +static void drop_large_spte(struct kvm *kvm, u64 *sptep, bool flush) { - if (is_large_pte(*sptep)) { - WARN_ON(sptep_to_sp(sptep)->role.level == PG_LEVEL_4K); - drop_spte(kvm, sptep); - return true; - } + struct kvm_mmu_page *sp; - return false; -} + sp = sptep_to_sp(sptep); + WARN_ON(sp->role.level == PG_LEVEL_4K); -static void drop_large_spte(struct kvm_vcpu *vcpu, u64 *sptep) -{ - if (__drop_large_spte(vcpu->kvm, sptep)) { - struct kvm_mmu_page *sp = sptep_to_sp(sptep); + drop_spte(kvm, sptep); - kvm_flush_remote_tlbs_with_address(vcpu->kvm, sp->gfn, + if (flush) + kvm_flush_remote_tlbs_with_address(kvm, sp->gfn, KVM_PAGES_PER_HPAGE(sp->role.level)); - } } /* @@ -1383,22 +1384,22 @@ static bool kvm_vcpu_write_protect_gfn(struct kvm_vcpu *vcpu, u64 gfn) return kvm_mmu_slot_gfn_write_protect(vcpu->kvm, slot, gfn, PG_LEVEL_4K); } -static bool kvm_zap_rmapp(struct kvm *kvm, struct kvm_rmap_head *rmap_head, - const struct kvm_memory_slot *slot) +static bool __kvm_zap_rmap(struct kvm *kvm, struct kvm_rmap_head *rmap_head, + const struct kvm_memory_slot *slot) { - return pte_list_destroy(kvm, rmap_head); + return kvm_zap_all_rmap_sptes(kvm, rmap_head); } -static bool kvm_unmap_rmapp(struct kvm *kvm, struct kvm_rmap_head *rmap_head, - struct kvm_memory_slot *slot, gfn_t gfn, int level, - pte_t unused) +static bool kvm_zap_rmap(struct kvm *kvm, struct kvm_rmap_head *rmap_head, + struct kvm_memory_slot *slot, gfn_t gfn, int level, + pte_t unused) { - return kvm_zap_rmapp(kvm, rmap_head, slot); + return __kvm_zap_rmap(kvm, rmap_head, slot); } -static bool kvm_set_pte_rmapp(struct kvm *kvm, struct kvm_rmap_head *rmap_head, - struct kvm_memory_slot *slot, gfn_t gfn, int level, - pte_t pte) +static bool kvm_set_pte_rmap(struct kvm *kvm, struct kvm_rmap_head *rmap_head, + struct kvm_memory_slot *slot, gfn_t gfn, int level, + pte_t pte) { u64 *sptep; struct rmap_iterator iter; @@ -1417,7 +1418,7 @@ restart: need_flush = true; if (pte_write(pte)) { - pte_list_remove(kvm, rmap_head, sptep); + kvm_zap_one_rmap_spte(kvm, rmap_head, sptep); goto restart; } else { new_spte = kvm_mmu_changed_pte_notifier_make_spte( @@ -1529,7 +1530,7 @@ bool kvm_unmap_gfn_range(struct kvm *kvm, struct kvm_gfn_range *range) bool flush = false; if (kvm_memslots_have_rmaps(kvm)) - flush = kvm_handle_gfn_range(kvm, range, kvm_unmap_rmapp); + flush = kvm_handle_gfn_range(kvm, range, kvm_zap_rmap); if (is_tdp_mmu_enabled(kvm)) flush = kvm_tdp_mmu_unmap_gfn_range(kvm, range, flush); @@ -1542,7 +1543,7 @@ bool kvm_set_spte_gfn(struct kvm *kvm, struct kvm_gfn_range *range) bool flush = false; if (kvm_memslots_have_rmaps(kvm)) - flush = kvm_handle_gfn_range(kvm, range, kvm_set_pte_rmapp); + flush = kvm_handle_gfn_range(kvm, range, kvm_set_pte_rmap); if (is_tdp_mmu_enabled(kvm)) flush |= kvm_tdp_mmu_set_spte_gfn(kvm, range); @@ -1550,9 +1551,9 @@ bool kvm_set_spte_gfn(struct kvm *kvm, struct kvm_gfn_range *range) return flush; } -static bool kvm_age_rmapp(struct kvm *kvm, struct kvm_rmap_head *rmap_head, - struct kvm_memory_slot *slot, gfn_t gfn, int level, - pte_t unused) +static bool kvm_age_rmap(struct kvm *kvm, struct kvm_rmap_head *rmap_head, + struct kvm_memory_slot *slot, gfn_t gfn, int level, + pte_t unused) { u64 *sptep; struct rmap_iterator iter; @@ -1564,9 +1565,9 @@ static bool kvm_age_rmapp(struct kvm *kvm, struct kvm_rmap_head *rmap_head, return young; } -static bool kvm_test_age_rmapp(struct kvm *kvm, struct kvm_rmap_head *rmap_head, - struct kvm_memory_slot *slot, gfn_t gfn, - int level, pte_t unused) +static bool kvm_test_age_rmap(struct kvm *kvm, struct kvm_rmap_head *rmap_head, + struct kvm_memory_slot *slot, gfn_t gfn, + int level, pte_t unused) { u64 *sptep; struct rmap_iterator iter; @@ -1579,31 +1580,43 @@ static bool kvm_test_age_rmapp(struct kvm *kvm, struct kvm_rmap_head *rmap_head, #define RMAP_RECYCLE_THRESHOLD 1000 -static void rmap_add(struct kvm_vcpu *vcpu, struct kvm_memory_slot *slot, - u64 *spte, gfn_t gfn) +static void __rmap_add(struct kvm *kvm, + struct kvm_mmu_memory_cache *cache, + const struct kvm_memory_slot *slot, + u64 *spte, gfn_t gfn, unsigned int access) { struct kvm_mmu_page *sp; struct kvm_rmap_head *rmap_head; int rmap_count; sp = sptep_to_sp(spte); - kvm_mmu_page_set_gfn(sp, spte - sp->spt, gfn); + kvm_mmu_page_set_translation(sp, spte_index(spte), gfn, access); + kvm_update_page_stats(kvm, sp->role.level, 1); + rmap_head = gfn_to_rmap(gfn, sp->role.level, slot); - rmap_count = pte_list_add(vcpu, spte, rmap_head); + rmap_count = pte_list_add(cache, spte, rmap_head); if (rmap_count > RMAP_RECYCLE_THRESHOLD) { - kvm_unmap_rmapp(vcpu->kvm, rmap_head, NULL, gfn, sp->role.level, __pte(0)); + kvm_zap_all_rmap_sptes(kvm, rmap_head); kvm_flush_remote_tlbs_with_address( - vcpu->kvm, sp->gfn, KVM_PAGES_PER_HPAGE(sp->role.level)); + kvm, sp->gfn, KVM_PAGES_PER_HPAGE(sp->role.level)); } } +static void rmap_add(struct kvm_vcpu *vcpu, const struct kvm_memory_slot *slot, + u64 *spte, gfn_t gfn, unsigned int access) +{ + struct kvm_mmu_memory_cache *cache = &vcpu->arch.mmu_pte_list_desc_cache; + + __rmap_add(vcpu->kvm, cache, slot, spte, gfn, access); +} + bool kvm_age_gfn(struct kvm *kvm, struct kvm_gfn_range *range) { bool young = false; if (kvm_memslots_have_rmaps(kvm)) - young = kvm_handle_gfn_range(kvm, range, kvm_age_rmapp); + young = kvm_handle_gfn_range(kvm, range, kvm_age_rmap); if (is_tdp_mmu_enabled(kvm)) young |= kvm_tdp_mmu_age_gfn_range(kvm, range); @@ -1616,7 +1629,7 @@ bool kvm_test_age_gfn(struct kvm *kvm, struct kvm_gfn_range *range) bool young = false; if (kvm_memslots_have_rmaps(kvm)) - young = kvm_handle_gfn_range(kvm, range, kvm_test_age_rmapp); + young = kvm_handle_gfn_range(kvm, range, kvm_test_age_rmap); if (is_tdp_mmu_enabled(kvm)) young |= kvm_tdp_mmu_test_age_gfn(kvm, range); @@ -1652,14 +1665,14 @@ static inline void kvm_mod_used_mmu_pages(struct kvm *kvm, long nr) percpu_counter_add(&kvm_total_used_mmu_pages, nr); } -static void kvm_mmu_free_page(struct kvm_mmu_page *sp) +static void kvm_mmu_free_shadow_page(struct kvm_mmu_page *sp) { MMU_WARN_ON(!is_empty_shadow_page(sp->spt)); hlist_del(&sp->hash_link); list_del(&sp->link); free_page((unsigned long)sp->spt); if (!sp->role.direct) - free_page((unsigned long)sp->gfns); + free_page((unsigned long)sp->shadowed_translation); kmem_cache_free(mmu_page_header_cache, sp); } @@ -1668,19 +1681,19 @@ static unsigned kvm_page_table_hashfn(gfn_t gfn) return hash_64(gfn, KVM_MMU_HASH_SHIFT); } -static void mmu_page_add_parent_pte(struct kvm_vcpu *vcpu, +static void mmu_page_add_parent_pte(struct kvm_mmu_memory_cache *cache, struct kvm_mmu_page *sp, u64 *parent_pte) { if (!parent_pte) return; - pte_list_add(vcpu, parent_pte, &sp->parent_ptes); + pte_list_add(cache, parent_pte, &sp->parent_ptes); } static void mmu_page_remove_parent_pte(struct kvm_mmu_page *sp, u64 *parent_pte) { - __pte_list_remove(parent_pte, &sp->parent_ptes); + pte_list_remove(parent_pte, &sp->parent_ptes); } static void drop_parent_pte(struct kvm_mmu_page *sp, @@ -1690,27 +1703,6 @@ static void drop_parent_pte(struct kvm_mmu_page *sp, mmu_spte_clear_no_track(parent_pte); } -static struct kvm_mmu_page *kvm_mmu_alloc_page(struct kvm_vcpu *vcpu, int direct) -{ - struct kvm_mmu_page *sp; - - sp = kvm_mmu_memory_cache_alloc(&vcpu->arch.mmu_page_header_cache); - sp->spt = kvm_mmu_memory_cache_alloc(&vcpu->arch.mmu_shadow_page_cache); - if (!direct) - sp->gfns = kvm_mmu_memory_cache_alloc(&vcpu->arch.mmu_gfn_array_cache); - set_page_private(virt_to_page(sp->spt), (unsigned long)sp); - - /* - * active_mmu_pages must be a FIFO list, as kvm_zap_obsolete_pages() - * depends on valid pages being added to the head of the list. See - * comments in kvm_zap_obsolete_pages(). - */ - sp->mmu_valid_gen = vcpu->kvm->arch.mmu_valid_gen; - list_add(&sp->link, &vcpu->kvm->arch.active_mmu_pages); - kvm_mod_used_mmu_pages(vcpu->kvm, +1); - return sp; -} - static void mark_unsync(u64 *spte); static void kvm_mmu_mark_parents_unsync(struct kvm_mmu_page *sp) { @@ -1725,11 +1717,9 @@ static void kvm_mmu_mark_parents_unsync(struct kvm_mmu_page *sp) static void mark_unsync(u64 *spte) { struct kvm_mmu_page *sp; - unsigned int index; sp = sptep_to_sp(spte); - index = spte - sp->spt; - if (__test_and_set_bit(index, sp->unsync_child_bitmap)) + if (__test_and_set_bit(spte_index(spte), sp->unsync_child_bitmap)) return; if (sp->unsync_children++) return; @@ -1789,7 +1779,7 @@ static int __mmu_unsync_walk(struct kvm_mmu_page *sp, continue; } - child = to_shadow_page(ent & PT64_BASE_ADDR_MASK); + child = to_shadow_page(ent & SPTE_BASE_ADDR_MASK); if (child->unsync_children) { if (mmu_pages_add(pvec, child, i)) @@ -2019,36 +2009,24 @@ static void clear_sp_write_flooding_count(u64 *spte) __clear_sp_write_flooding_count(sptep_to_sp(spte)); } -static struct kvm_mmu_page *kvm_mmu_get_page(struct kvm_vcpu *vcpu, - gfn_t gfn, - gva_t gaddr, - unsigned level, - int direct, - unsigned int access) +/* + * The vCPU is required when finding indirect shadow pages; the shadow + * page may already exist and syncing it needs the vCPU pointer in + * order to read guest page tables. Direct shadow pages are never + * unsync, thus @vcpu can be NULL if @role.direct is true. + */ +static struct kvm_mmu_page *kvm_mmu_find_shadow_page(struct kvm *kvm, + struct kvm_vcpu *vcpu, + gfn_t gfn, + struct hlist_head *sp_list, + union kvm_mmu_page_role role) { - bool direct_mmu = vcpu->arch.mmu->root_role.direct; - union kvm_mmu_page_role role; - struct hlist_head *sp_list; - unsigned quadrant; struct kvm_mmu_page *sp; int ret; int collisions = 0; LIST_HEAD(invalid_list); - role = vcpu->arch.mmu->root_role; - role.level = level; - role.direct = direct; - role.access = access; - if (role.has_4_byte_gpte) { - quadrant = gaddr >> (PAGE_SHIFT + (PT64_PT_BITS * level)); - quadrant &= (1 << ((PT32_PT_BITS - PT64_PT_BITS) * level)) - 1; - role.quadrant = quadrant; - } - if (level <= vcpu->arch.mmu->cpu_role.base.level) - role.passthrough = 0; - - sp_list = &vcpu->kvm->arch.mmu_page_hash[kvm_page_table_hashfn(gfn)]; - for_each_valid_sp(vcpu->kvm, sp, sp_list) { + for_each_valid_sp(kvm, sp, sp_list) { if (sp->gfn != gfn) { collisions++; continue; @@ -2064,16 +2042,20 @@ static struct kvm_mmu_page *kvm_mmu_get_page(struct kvm_vcpu *vcpu, * Unsync pages must not be left as is, because the new * upper-level page will be write-protected. */ - if (level > PG_LEVEL_4K && sp->unsync) - kvm_mmu_prepare_zap_page(vcpu->kvm, sp, + if (role.level > PG_LEVEL_4K && sp->unsync) + kvm_mmu_prepare_zap_page(kvm, sp, &invalid_list); continue; } - if (direct_mmu) - goto trace_get_page; + /* unsync and write-flooding only apply to indirect SPs. */ + if (sp->role.direct) + goto out; if (sp->unsync) { + if (KVM_BUG_ON(!vcpu, kvm)) + break; + /* * The page is good, but is stale. kvm_sync_page does * get the latest guest state, but (unlike mmu_unsync_children) @@ -2092,37 +2074,160 @@ static struct kvm_mmu_page *kvm_mmu_get_page(struct kvm_vcpu *vcpu, WARN_ON(!list_empty(&invalid_list)); if (ret > 0) - kvm_flush_remote_tlbs(vcpu->kvm); + kvm_flush_remote_tlbs(kvm); } __clear_sp_write_flooding_count(sp); -trace_get_page: - trace_kvm_mmu_get_page(sp, false); goto out; } - ++vcpu->kvm->stat.mmu_cache_miss; + sp = NULL; + ++kvm->stat.mmu_cache_miss; + +out: + kvm_mmu_commit_zap_page(kvm, &invalid_list); + + if (collisions > kvm->stat.max_mmu_page_hash_collisions) + kvm->stat.max_mmu_page_hash_collisions = collisions; + return sp; +} + +/* Caches used when allocating a new shadow page. */ +struct shadow_page_caches { + struct kvm_mmu_memory_cache *page_header_cache; + struct kvm_mmu_memory_cache *shadow_page_cache; + struct kvm_mmu_memory_cache *shadowed_info_cache; +}; + +static struct kvm_mmu_page *kvm_mmu_alloc_shadow_page(struct kvm *kvm, + struct shadow_page_caches *caches, + gfn_t gfn, + struct hlist_head *sp_list, + union kvm_mmu_page_role role) +{ + struct kvm_mmu_page *sp; + + sp = kvm_mmu_memory_cache_alloc(caches->page_header_cache); + sp->spt = kvm_mmu_memory_cache_alloc(caches->shadow_page_cache); + if (!role.direct) + sp->shadowed_translation = kvm_mmu_memory_cache_alloc(caches->shadowed_info_cache); + + set_page_private(virt_to_page(sp->spt), (unsigned long)sp); - sp = kvm_mmu_alloc_page(vcpu, direct); + /* + * active_mmu_pages must be a FIFO list, as kvm_zap_obsolete_pages() + * depends on valid pages being added to the head of the list. See + * comments in kvm_zap_obsolete_pages(). + */ + sp->mmu_valid_gen = kvm->arch.mmu_valid_gen; + list_add(&sp->link, &kvm->arch.active_mmu_pages); + kvm_mod_used_mmu_pages(kvm, +1); sp->gfn = gfn; sp->role = role; hlist_add_head(&sp->hash_link, sp_list); - if (sp_has_gptes(sp)) { - account_shadowed(vcpu->kvm, sp); - if (level == PG_LEVEL_4K && kvm_vcpu_write_protect_gfn(vcpu, gfn)) - kvm_flush_remote_tlbs_with_address(vcpu->kvm, gfn, 1); + if (sp_has_gptes(sp)) + account_shadowed(kvm, sp); + + return sp; +} + +/* Note, @vcpu may be NULL if @role.direct is true; see kvm_mmu_find_shadow_page. */ +static struct kvm_mmu_page *__kvm_mmu_get_shadow_page(struct kvm *kvm, + struct kvm_vcpu *vcpu, + struct shadow_page_caches *caches, + gfn_t gfn, + union kvm_mmu_page_role role) +{ + struct hlist_head *sp_list; + struct kvm_mmu_page *sp; + bool created = false; + + sp_list = &kvm->arch.mmu_page_hash[kvm_page_table_hashfn(gfn)]; + + sp = kvm_mmu_find_shadow_page(kvm, vcpu, gfn, sp_list, role); + if (!sp) { + created = true; + sp = kvm_mmu_alloc_shadow_page(kvm, caches, gfn, sp_list, role); } - trace_kvm_mmu_get_page(sp, true); -out: - kvm_mmu_commit_zap_page(vcpu->kvm, &invalid_list); - if (collisions > vcpu->kvm->stat.max_mmu_page_hash_collisions) - vcpu->kvm->stat.max_mmu_page_hash_collisions = collisions; + trace_kvm_mmu_get_page(sp, created); return sp; } +static struct kvm_mmu_page *kvm_mmu_get_shadow_page(struct kvm_vcpu *vcpu, + gfn_t gfn, + union kvm_mmu_page_role role) +{ + struct shadow_page_caches caches = { + .page_header_cache = &vcpu->arch.mmu_page_header_cache, + .shadow_page_cache = &vcpu->arch.mmu_shadow_page_cache, + .shadowed_info_cache = &vcpu->arch.mmu_shadowed_info_cache, + }; + + return __kvm_mmu_get_shadow_page(vcpu->kvm, vcpu, &caches, gfn, role); +} + +static union kvm_mmu_page_role kvm_mmu_child_role(u64 *sptep, bool direct, + unsigned int access) +{ + struct kvm_mmu_page *parent_sp = sptep_to_sp(sptep); + union kvm_mmu_page_role role; + + role = parent_sp->role; + role.level--; + role.access = access; + role.direct = direct; + role.passthrough = 0; + + /* + * If the guest has 4-byte PTEs then that means it's using 32-bit, + * 2-level, non-PAE paging. KVM shadows such guests with PAE paging + * (i.e. 8-byte PTEs). The difference in PTE size means that KVM must + * shadow each guest page table with multiple shadow page tables, which + * requires extra bookkeeping in the role. + * + * Specifically, to shadow the guest's page directory (which covers a + * 4GiB address space), KVM uses 4 PAE page directories, each mapping + * 1GiB of the address space. @role.quadrant encodes which quarter of + * the address space each maps. + * + * To shadow the guest's page tables (which each map a 4MiB region), KVM + * uses 2 PAE page tables, each mapping a 2MiB region. For these, + * @role.quadrant encodes which half of the region they map. + * + * Concretely, a 4-byte PDE consumes bits 31:22, while an 8-byte PDE + * consumes bits 29:21. To consume bits 31:30, KVM's uses 4 shadow + * PDPTEs; those 4 PAE page directories are pre-allocated and their + * quadrant is assigned in mmu_alloc_root(). A 4-byte PTE consumes + * bits 21:12, while an 8-byte PTE consumes bits 20:12. To consume + * bit 21 in the PTE (the child here), KVM propagates that bit to the + * quadrant, i.e. sets quadrant to '0' or '1'. The parent 8-byte PDE + * covers bit 21 (see above), thus the quadrant is calculated from the + * _least_ significant bit of the PDE index. + */ + if (role.has_4_byte_gpte) { + WARN_ON_ONCE(role.level != PG_LEVEL_4K); + role.quadrant = spte_index(sptep) & 1; + } + + return role; +} + +static struct kvm_mmu_page *kvm_mmu_get_child_sp(struct kvm_vcpu *vcpu, + u64 *sptep, gfn_t gfn, + bool direct, unsigned int access) +{ + union kvm_mmu_page_role role; + + if (is_shadow_present_pte(*sptep) && !is_large_pte(*sptep)) + return ERR_PTR(-EEXIST); + + role = kvm_mmu_child_role(sptep, direct, access); + return kvm_mmu_get_shadow_page(vcpu, gfn, role); +} + static void shadow_walk_init_using_root(struct kvm_shadow_walk_iterator *iterator, struct kvm_vcpu *vcpu, hpa_t root, u64 addr) @@ -2145,7 +2250,7 @@ static void shadow_walk_init_using_root(struct kvm_shadow_walk_iterator *iterato iterator->shadow_addr = vcpu->arch.mmu->pae_root[(addr >> 30) & 3]; - iterator->shadow_addr &= PT64_BASE_ADDR_MASK; + iterator->shadow_addr &= SPTE_BASE_ADDR_MASK; --iterator->level; if (!iterator->shadow_addr) iterator->level = 0; @@ -2164,7 +2269,7 @@ static bool shadow_walk_okay(struct kvm_shadow_walk_iterator *iterator) if (iterator->level < PG_LEVEL_4K) return false; - iterator->index = SHADOW_PT_INDEX(iterator->addr, iterator->level); + iterator->index = SPTE_INDEX(iterator->addr, iterator->level); iterator->sptep = ((u64 *)__va(iterator->shadow_addr)) + iterator->index; return true; } @@ -2177,7 +2282,7 @@ static void __shadow_walk_next(struct kvm_shadow_walk_iterator *iterator, return; } - iterator->shadow_addr = spte & PT64_BASE_ADDR_MASK; + iterator->shadow_addr = spte & SPTE_BASE_ADDR_MASK; --iterator->level; } @@ -2186,23 +2291,38 @@ static void shadow_walk_next(struct kvm_shadow_walk_iterator *iterator) __shadow_walk_next(iterator, *iterator->sptep); } -static void link_shadow_page(struct kvm_vcpu *vcpu, u64 *sptep, - struct kvm_mmu_page *sp) +static void __link_shadow_page(struct kvm *kvm, + struct kvm_mmu_memory_cache *cache, u64 *sptep, + struct kvm_mmu_page *sp, bool flush) { u64 spte; BUILD_BUG_ON(VMX_EPT_WRITABLE_MASK != PT_WRITABLE_MASK); + /* + * If an SPTE is present already, it must be a leaf and therefore + * a large one. Drop it, and flush the TLB if needed, before + * installing sp. + */ + if (is_shadow_present_pte(*sptep)) + drop_large_spte(kvm, sptep, flush); + spte = make_nonleaf_spte(sp->spt, sp_ad_disabled(sp)); mmu_spte_set(sptep, spte); - mmu_page_add_parent_pte(vcpu, sp, sptep); + mmu_page_add_parent_pte(cache, sp, sptep); if (sp->unsync_children || sp->unsync) mark_unsync(sptep); } +static void link_shadow_page(struct kvm_vcpu *vcpu, u64 *sptep, + struct kvm_mmu_page *sp) +{ + __link_shadow_page(vcpu->kvm, &vcpu->arch.mmu_pte_list_desc_cache, sptep, sp, true); +} + static void validate_direct_spte(struct kvm_vcpu *vcpu, u64 *sptep, unsigned direct_access) { @@ -2216,7 +2336,7 @@ static void validate_direct_spte(struct kvm_vcpu *vcpu, u64 *sptep, * so we should update the spte at this point to get * a new sp with the correct access. */ - child = to_shadow_page(*sptep & PT64_BASE_ADDR_MASK); + child = to_shadow_page(*sptep & SPTE_BASE_ADDR_MASK); if (child->role.access == direct_access) return; @@ -2237,7 +2357,7 @@ static int mmu_page_zap_pte(struct kvm *kvm, struct kvm_mmu_page *sp, if (is_last_spte(pte, sp->role.level)) { drop_spte(kvm, spte); } else { - child = to_shadow_page(pte & PT64_BASE_ADDR_MASK); + child = to_shadow_page(pte & SPTE_BASE_ADDR_MASK); drop_parent_pte(child, spte); /* @@ -2263,7 +2383,7 @@ static int kvm_mmu_page_unlink_children(struct kvm *kvm, int zapped = 0; unsigned i; - for (i = 0; i < PT64_ENT_PER_PAGE; ++i) + for (i = 0; i < SPTE_ENT_PER_PAGE; ++i) zapped += mmu_page_zap_pte(kvm, sp, sp->spt + i, invalid_list); return zapped; @@ -2396,7 +2516,7 @@ static void kvm_mmu_commit_zap_page(struct kvm *kvm, list_for_each_entry_safe(sp, nsp, invalid_list, link) { WARN_ON(!sp->role.invalid || sp->root_count); - kvm_mmu_free_page(sp); + kvm_mmu_free_shadow_page(sp); } } @@ -2676,7 +2796,7 @@ static int mmu_set_spte(struct kvm_vcpu *vcpu, struct kvm_memory_slot *slot, struct kvm_mmu_page *child; u64 pte = *sptep; - child = to_shadow_page(pte & PT64_BASE_ADDR_MASK); + child = to_shadow_page(pte & SPTE_BASE_ADDR_MASK); drop_parent_pte(child, sptep); flush = true; } else if (pfn != spte_to_pfn(*sptep)) { @@ -2711,8 +2831,10 @@ static int mmu_set_spte(struct kvm_vcpu *vcpu, struct kvm_memory_slot *slot, if (!was_rmapped) { WARN_ON_ONCE(ret == RET_PF_SPURIOUS); - kvm_update_page_stats(vcpu->kvm, level, 1); - rmap_add(vcpu, slot, sptep, gfn); + rmap_add(vcpu, slot, sptep, gfn, pte_access); + } else { + /* Already rmapped but the pte_access bits may have changed. */ + kvm_mmu_page_set_access(sp, spte_index(sptep), pte_access); } return ret; @@ -2728,7 +2850,7 @@ static int direct_pte_prefetch_many(struct kvm_vcpu *vcpu, int i, ret; gfn_t gfn; - gfn = kvm_mmu_page_get_gfn(sp, start - sp->spt); + gfn = kvm_mmu_page_get_gfn(sp, spte_index(start)); slot = gfn_to_memslot_dirty_bitmap(vcpu, gfn, access & ACC_WRITE_MASK); if (!slot) return -1; @@ -2754,7 +2876,7 @@ static void __direct_pte_prefetch(struct kvm_vcpu *vcpu, WARN_ON(!sp->role.direct); - i = (sptep - sp->spt) & ~(PTE_PREFETCH_NUM - 1); + i = spte_index(sptep) & ~(PTE_PREFETCH_NUM - 1); spte = sp->spt + i; for (i = 0; i < PTE_PREFETCH_NUM; i++, spte++) { @@ -2798,20 +2920,42 @@ static void direct_pte_prefetch(struct kvm_vcpu *vcpu, u64 *sptep) __direct_pte_prefetch(vcpu, sp, sptep); } -static int host_pfn_mapping_level(struct kvm *kvm, gfn_t gfn, kvm_pfn_t pfn, +/* + * Lookup the mapping level for @gfn in the current mm. + * + * WARNING! Use of host_pfn_mapping_level() requires the caller and the end + * consumer to be tied into KVM's handlers for MMU notifier events! + * + * There are several ways to safely use this helper: + * + * - Check mmu_notifier_retry_hva() after grabbing the mapping level, before + * consuming it. In this case, mmu_lock doesn't need to be held during the + * lookup, but it does need to be held while checking the MMU notifier. + * + * - Hold mmu_lock AND ensure there is no in-progress MMU notifier invalidation + * event for the hva. This can be done by explicit checking the MMU notifier + * or by ensuring that KVM already has a valid mapping that covers the hva. + * + * - Do not use the result to install new mappings, e.g. use the host mapping + * level only to decide whether or not to zap an entry. In this case, it's + * not required to hold mmu_lock (though it's highly likely the caller will + * want to hold mmu_lock anyways, e.g. to modify SPTEs). + * + * Note! The lookup can still race with modifications to host page tables, but + * the above "rules" ensure KVM will not _consume_ the result of the walk if a + * race with the primary MMU occurs. + */ +static int host_pfn_mapping_level(struct kvm *kvm, gfn_t gfn, const struct kvm_memory_slot *slot) { + int level = PG_LEVEL_4K; unsigned long hva; unsigned long flags; - int level = PG_LEVEL_4K; pgd_t pgd; p4d_t p4d; pud_t pud; pmd_t pmd; - if (!PageCompound(pfn_to_page(pfn)) && !kvm_is_zone_device_pfn(pfn)) - return PG_LEVEL_4K; - /* * Note, using the already-retrieved memslot and __gfn_to_hva_memslot() * is not solely for performance, it's also necessary to avoid the @@ -2823,16 +2967,19 @@ static int host_pfn_mapping_level(struct kvm *kvm, gfn_t gfn, kvm_pfn_t pfn, hva = __gfn_to_hva_memslot(slot, gfn); /* - * Lookup the mapping level in the current mm. The information - * may become stale soon, but it is safe to use as long as - * 1) mmu_notifier_retry was checked after taking mmu_lock, and - * 2) mmu_lock is taken now. - * - * We still need to disable IRQs to prevent concurrent tear down - * of page tables. + * Disable IRQs to prevent concurrent tear down of host page tables, + * e.g. if the primary MMU promotes a P*D to a huge page and then frees + * the original page table. */ local_irq_save(flags); + /* + * Read each entry once. As above, a non-leaf entry can be promoted to + * a huge page _during_ this walk. Re-reading the entry could send the + * walk into the weeks, e.g. p*d_large() returns false (sees the old + * value) and then p*d_offset() walks into the target huge page instead + * of the old page table (sees the new value). + */ pgd = READ_ONCE(*pgd_offset(kvm->mm, hva)); if (pgd_none(pgd)) goto out; @@ -2864,7 +3011,7 @@ out: int kvm_mmu_max_mapping_level(struct kvm *kvm, const struct kvm_memory_slot *slot, gfn_t gfn, - kvm_pfn_t pfn, int max_level) + int max_level) { struct kvm_lpage_info *linfo; int host_level; @@ -2879,7 +3026,7 @@ int kvm_mmu_max_mapping_level(struct kvm *kvm, if (max_level == PG_LEVEL_4K) return PG_LEVEL_4K; - host_level = host_pfn_mapping_level(kvm, gfn, pfn, slot); + host_level = host_pfn_mapping_level(kvm, gfn, slot); return min(host_level, max_level); } @@ -2893,7 +3040,7 @@ void kvm_mmu_hugepage_adjust(struct kvm_vcpu *vcpu, struct kvm_page_fault *fault if (unlikely(fault->max_level == PG_LEVEL_4K)) return; - if (is_error_noslot_pfn(fault->pfn) || kvm_is_reserved_pfn(fault->pfn)) + if (is_error_noslot_pfn(fault->pfn)) return; if (kvm_slot_dirty_track_enabled(slot)) @@ -2904,8 +3051,7 @@ void kvm_mmu_hugepage_adjust(struct kvm_vcpu *vcpu, struct kvm_page_fault *fault * level, which will be used to do precise, accurate accounting. */ fault->req_level = kvm_mmu_max_mapping_level(vcpu->kvm, slot, - fault->gfn, fault->pfn, - fault->max_level); + fault->gfn, fault->max_level); if (fault->req_level == PG_LEVEL_4K || fault->huge_page_disallowed) return; @@ -2961,13 +3107,10 @@ static int __direct_map(struct kvm_vcpu *vcpu, struct kvm_page_fault *fault) if (it.level == fault->goal_level) break; - drop_large_spte(vcpu, it.sptep); - if (is_shadow_present_pte(*it.sptep)) + sp = kvm_mmu_get_child_sp(vcpu, it.sptep, base_gfn, true, ACC_ALL); + if (sp == ERR_PTR(-EEXIST)) continue; - sp = kvm_mmu_get_page(vcpu, base_gfn, it.addr, - it.level - 1, true, ACC_ALL); - link_shadow_page(vcpu, it.sptep, sp); if (fault->is_tdp && fault->huge_page_disallowed && fault->req_level >= it.level) @@ -3095,7 +3238,7 @@ fast_pf_fix_direct_spte(struct kvm_vcpu *vcpu, struct kvm_page_fault *fault, * * Compare with set_spte where instead shadow_dirty_mask is set. */ - if (cmpxchg64(sptep, old_spte, new_spte) != old_spte) + if (!try_cmpxchg64(sptep, &old_spte, new_spte)) return false; if (is_writable_pte(new_spte) && !is_writable_pte(old_spte)) @@ -3265,7 +3408,7 @@ static void mmu_free_root_page(struct kvm *kvm, hpa_t *root_hpa, if (!VALID_PAGE(*root_hpa)) return; - sp = to_shadow_page(*root_hpa & PT64_BASE_ADDR_MASK); + sp = to_shadow_page(*root_hpa & SPTE_BASE_ADDR_MASK); if (WARN_ON(!sp)) return; @@ -3369,12 +3512,19 @@ static int mmu_check_root(struct kvm_vcpu *vcpu, gfn_t root_gfn) return ret; } -static hpa_t mmu_alloc_root(struct kvm_vcpu *vcpu, gfn_t gfn, gva_t gva, - u8 level, bool direct) +static hpa_t mmu_alloc_root(struct kvm_vcpu *vcpu, gfn_t gfn, int quadrant, + u8 level) { + union kvm_mmu_page_role role = vcpu->arch.mmu->root_role; struct kvm_mmu_page *sp; - sp = kvm_mmu_get_page(vcpu, gfn, gva, level, direct, ACC_ALL); + role.level = level; + role.quadrant = quadrant; + + WARN_ON_ONCE(quadrant && !role.has_4_byte_gpte); + WARN_ON_ONCE(role.direct && role.has_4_byte_gpte); + + sp = kvm_mmu_get_shadow_page(vcpu, gfn, role); ++sp->root_count; return __pa(sp->spt); @@ -3397,7 +3547,7 @@ static int mmu_alloc_direct_roots(struct kvm_vcpu *vcpu) root = kvm_tdp_mmu_get_vcpu_root_hpa(vcpu); mmu->root.hpa = root; } else if (shadow_root_level >= PT64_ROOT_4LEVEL) { - root = mmu_alloc_root(vcpu, 0, 0, shadow_root_level, true); + root = mmu_alloc_root(vcpu, 0, 0, shadow_root_level); mmu->root.hpa = root; } else if (shadow_root_level == PT32E_ROOT_LEVEL) { if (WARN_ON_ONCE(!mmu->pae_root)) { @@ -3408,8 +3558,8 @@ static int mmu_alloc_direct_roots(struct kvm_vcpu *vcpu) for (i = 0; i < 4; ++i) { WARN_ON_ONCE(IS_VALID_PAE_ROOT(mmu->pae_root[i])); - root = mmu_alloc_root(vcpu, i << (30 - PAGE_SHIFT), - i << 30, PT32_ROOT_LEVEL, true); + root = mmu_alloc_root(vcpu, i << (30 - PAGE_SHIFT), 0, + PT32_ROOT_LEVEL); mmu->pae_root[i] = root | PT_PRESENT_MASK | shadow_me_value; } @@ -3493,9 +3643,8 @@ static int mmu_alloc_shadow_roots(struct kvm_vcpu *vcpu) struct kvm_mmu *mmu = vcpu->arch.mmu; u64 pdptrs[4], pm_mask; gfn_t root_gfn, root_pgd; + int quadrant, i, r; hpa_t root; - unsigned i; - int r; root_pgd = mmu->get_guest_pgd(vcpu); root_gfn = root_pgd >> PAGE_SHIFT; @@ -3533,7 +3682,7 @@ static int mmu_alloc_shadow_roots(struct kvm_vcpu *vcpu) */ if (mmu->cpu_role.base.level >= PT64_ROOT_4LEVEL) { root = mmu_alloc_root(vcpu, root_gfn, 0, - mmu->root_role.level, false); + mmu->root_role.level); mmu->root.hpa = root; goto set_root_pgd; } @@ -3578,8 +3727,15 @@ static int mmu_alloc_shadow_roots(struct kvm_vcpu *vcpu) root_gfn = pdptrs[i] >> PAGE_SHIFT; } - root = mmu_alloc_root(vcpu, root_gfn, i << 30, - PT32_ROOT_LEVEL, false); + /* + * If shadowing 32-bit non-PAE page tables, each PAE page + * directory maps one quarter of the guest's non-PAE page + * directory. Othwerise each PAE page direct shadows one guest + * PAE page directory so that quadrant should be 0. + */ + quadrant = (mmu->cpu_role.base.level == PT32_ROOT_LEVEL) ? i : 0; + + root = mmu_alloc_root(vcpu, root_gfn, quadrant, PT32_ROOT_LEVEL); mmu->pae_root[i] = root | pm_mask; } @@ -3737,7 +3893,7 @@ void kvm_mmu_sync_roots(struct kvm_vcpu *vcpu) hpa_t root = vcpu->arch.mmu->pae_root[i]; if (IS_VALID_PAE_ROOT(root)) { - root &= PT64_BASE_ADDR_MASK; + root &= SPTE_BASE_ADDR_MASK; sp = to_shadow_page(root); mmu_sync_children(vcpu, sp, true); } @@ -4155,14 +4311,26 @@ EXPORT_SYMBOL_GPL(kvm_handle_page_fault); int kvm_tdp_page_fault(struct kvm_vcpu *vcpu, struct kvm_page_fault *fault) { - while (fault->max_level > PG_LEVEL_4K) { - int page_num = KVM_PAGES_PER_HPAGE(fault->max_level); - gfn_t base = (fault->addr >> PAGE_SHIFT) & ~(page_num - 1); - - if (kvm_mtrr_check_gfn_range_consistency(vcpu, base, page_num)) - break; + /* + * If the guest's MTRRs may be used to compute the "real" memtype, + * restrict the mapping level to ensure KVM uses a consistent memtype + * across the entire mapping. If the host MTRRs are ignored by TDP + * (shadow_memtype_mask is non-zero), and the VM has non-coherent DMA + * (DMA doesn't snoop CPU caches), KVM's ABI is to honor the memtype + * from the guest's MTRRs so that guest accesses to memory that is + * DMA'd aren't cached against the guest's wishes. + * + * Note, KVM may still ultimately ignore guest MTRRs for certain PFNs, + * e.g. KVM will force UC memtype for host MMIO. + */ + if (shadow_memtype_mask && kvm_arch_has_noncoherent_dma(vcpu->kvm)) { + for ( ; fault->max_level > PG_LEVEL_4K; --fault->max_level) { + int page_num = KVM_PAGES_PER_HPAGE(fault->max_level); + gfn_t base = (fault->addr >> PAGE_SHIFT) & ~(page_num - 1); - --fault->max_level; + if (kvm_mtrr_check_gfn_range_consistency(vcpu, base, page_num)) + break; + } } return direct_page_fault(vcpu, fault); @@ -4567,7 +4735,7 @@ reset_tdp_shadow_zero_bits_mask(struct kvm_mmu *context) if (boot_cpu_is_amd()) __reset_rsvds_bits_mask(shadow_zero_check, reserved_hpa_bits(), - context->root_role.level, false, + context->root_role.level, true, boot_cpu_has(X86_FEATURE_GBPAGES), false, true); else @@ -5199,11 +5367,11 @@ static bool need_remote_flush(u64 old, u64 new) return false; if (!is_shadow_present_pte(new)) return true; - if ((old ^ new) & PT64_BASE_ADDR_MASK) + if ((old ^ new) & SPTE_BASE_ADDR_MASK) return true; old ^= shadow_nx_mask; new ^= shadow_nx_mask; - return (old & ~new & PT64_PERM_MASK) != 0; + return (old & ~new & SPTE_PERM_MASK) != 0; } static u64 mmu_pte_write_fetch_gpte(struct kvm_vcpu *vcpu, gpa_t *gpa, @@ -5328,13 +5496,6 @@ static void kvm_mmu_pte_write(struct kvm_vcpu *vcpu, gpa_t gpa, pgprintk("%s: gpa %llx bytes %d\n", __func__, gpa, bytes); - /* - * No need to care whether allocation memory is successful - * or not since pte prefetch is skipped if it does not have - * enough objects in the cache. - */ - mmu_topup_memory_caches(vcpu, true); - write_lock(&vcpu->kvm->mmu_lock); gentry = mmu_pte_write_fetch_gpte(vcpu, &gpa, &bytes); @@ -5819,9 +5980,25 @@ int kvm_mmu_init_vm(struct kvm *kvm) node->track_write = kvm_mmu_pte_write; node->track_flush_slot = kvm_mmu_invalidate_zap_pages_in_memslot; kvm_page_track_register_notifier(kvm, node); + + kvm->arch.split_page_header_cache.kmem_cache = mmu_page_header_cache; + kvm->arch.split_page_header_cache.gfp_zero = __GFP_ZERO; + + kvm->arch.split_shadow_page_cache.gfp_zero = __GFP_ZERO; + + kvm->arch.split_desc_cache.kmem_cache = pte_list_desc_cache; + kvm->arch.split_desc_cache.gfp_zero = __GFP_ZERO; + return 0; } +static void mmu_free_vm_memory_caches(struct kvm *kvm) +{ + kvm_mmu_free_memory_cache(&kvm->arch.split_desc_cache); + kvm_mmu_free_memory_cache(&kvm->arch.split_page_header_cache); + kvm_mmu_free_memory_cache(&kvm->arch.split_shadow_page_cache); +} + void kvm_mmu_uninit_vm(struct kvm *kvm) { struct kvm_page_track_notifier_node *node = &kvm->arch.mmu_sp_tracker; @@ -5829,9 +6006,11 @@ void kvm_mmu_uninit_vm(struct kvm *kvm) kvm_page_track_unregister_notifier(kvm, node); kvm_mmu_uninit_tdp_mmu(kvm); + + mmu_free_vm_memory_caches(kvm); } -static bool __kvm_zap_rmaps(struct kvm *kvm, gfn_t gfn_start, gfn_t gfn_end) +static bool kvm_rmap_zap_gfn_range(struct kvm *kvm, gfn_t gfn_start, gfn_t gfn_end) { const struct kvm_memory_slot *memslot; struct kvm_memslots *slots; @@ -5853,8 +6032,7 @@ static bool __kvm_zap_rmaps(struct kvm *kvm, gfn_t gfn_start, gfn_t gfn_end) if (WARN_ON_ONCE(start >= end)) continue; - flush = slot_handle_level_range(kvm, memslot, kvm_zap_rmapp, - + flush = slot_handle_level_range(kvm, memslot, __kvm_zap_rmap, PG_LEVEL_4K, KVM_MAX_HUGEPAGE_LEVEL, start, end - 1, true, flush); } @@ -5879,7 +6057,7 @@ void kvm_zap_gfn_range(struct kvm *kvm, gfn_t gfn_start, gfn_t gfn_end) kvm_inc_notifier_count(kvm, gfn_start, gfn_end); - flush = __kvm_zap_rmaps(kvm, gfn_start, gfn_end); + flush = kvm_rmap_zap_gfn_range(kvm, gfn_start, gfn_end); if (is_tdp_mmu_enabled(kvm)) { for (i = 0; i < KVM_ADDRESS_SPACE_NUM; i++) @@ -5950,15 +6128,249 @@ void kvm_mmu_slot_remove_write_access(struct kvm *kvm, kvm_arch_flush_remote_tlbs_memslot(kvm, memslot); } +static inline bool need_topup(struct kvm_mmu_memory_cache *cache, int min) +{ + return kvm_mmu_memory_cache_nr_free_objects(cache) < min; +} + +static bool need_topup_split_caches_or_resched(struct kvm *kvm) +{ + if (need_resched() || rwlock_needbreak(&kvm->mmu_lock)) + return true; + + /* + * In the worst case, SPLIT_DESC_CACHE_MIN_NR_OBJECTS descriptors are needed + * to split a single huge page. Calculating how many are actually needed + * is possible but not worth the complexity. + */ + return need_topup(&kvm->arch.split_desc_cache, SPLIT_DESC_CACHE_MIN_NR_OBJECTS) || + need_topup(&kvm->arch.split_page_header_cache, 1) || + need_topup(&kvm->arch.split_shadow_page_cache, 1); +} + +static int topup_split_caches(struct kvm *kvm) +{ + /* + * Allocating rmap list entries when splitting huge pages for nested + * MMUs is uncommon as KVM needs to use a list if and only if there is + * more than one rmap entry for a gfn, i.e. requires an L1 gfn to be + * aliased by multiple L2 gfns and/or from multiple nested roots with + * different roles. Aliasing gfns when using TDP is atypical for VMMs; + * a few gfns are often aliased during boot, e.g. when remapping BIOS, + * but aliasing rarely occurs post-boot or for many gfns. If there is + * only one rmap entry, rmap->val points directly at that one entry and + * doesn't need to allocate a list. Buffer the cache by the default + * capacity so that KVM doesn't have to drop mmu_lock to topup if KVM + * encounters an aliased gfn or two. + */ + const int capacity = SPLIT_DESC_CACHE_MIN_NR_OBJECTS + + KVM_ARCH_NR_OBJS_PER_MEMORY_CACHE; + int r; + + lockdep_assert_held(&kvm->slots_lock); + + r = __kvm_mmu_topup_memory_cache(&kvm->arch.split_desc_cache, capacity, + SPLIT_DESC_CACHE_MIN_NR_OBJECTS); + if (r) + return r; + + r = kvm_mmu_topup_memory_cache(&kvm->arch.split_page_header_cache, 1); + if (r) + return r; + + return kvm_mmu_topup_memory_cache(&kvm->arch.split_shadow_page_cache, 1); +} + +static struct kvm_mmu_page *shadow_mmu_get_sp_for_split(struct kvm *kvm, u64 *huge_sptep) +{ + struct kvm_mmu_page *huge_sp = sptep_to_sp(huge_sptep); + struct shadow_page_caches caches = {}; + union kvm_mmu_page_role role; + unsigned int access; + gfn_t gfn; + + gfn = kvm_mmu_page_get_gfn(huge_sp, spte_index(huge_sptep)); + access = kvm_mmu_page_get_access(huge_sp, spte_index(huge_sptep)); + + /* + * Note, huge page splitting always uses direct shadow pages, regardless + * of whether the huge page itself is mapped by a direct or indirect + * shadow page, since the huge page region itself is being directly + * mapped with smaller pages. + */ + role = kvm_mmu_child_role(huge_sptep, /*direct=*/true, access); + + /* Direct SPs do not require a shadowed_info_cache. */ + caches.page_header_cache = &kvm->arch.split_page_header_cache; + caches.shadow_page_cache = &kvm->arch.split_shadow_page_cache; + + /* Safe to pass NULL for vCPU since requesting a direct SP. */ + return __kvm_mmu_get_shadow_page(kvm, NULL, &caches, gfn, role); +} + +static void shadow_mmu_split_huge_page(struct kvm *kvm, + const struct kvm_memory_slot *slot, + u64 *huge_sptep) + +{ + struct kvm_mmu_memory_cache *cache = &kvm->arch.split_desc_cache; + u64 huge_spte = READ_ONCE(*huge_sptep); + struct kvm_mmu_page *sp; + bool flush = false; + u64 *sptep, spte; + gfn_t gfn; + int index; + + sp = shadow_mmu_get_sp_for_split(kvm, huge_sptep); + + for (index = 0; index < SPTE_ENT_PER_PAGE; index++) { + sptep = &sp->spt[index]; + gfn = kvm_mmu_page_get_gfn(sp, index); + + /* + * The SP may already have populated SPTEs, e.g. if this huge + * page is aliased by multiple sptes with the same access + * permissions. These entries are guaranteed to map the same + * gfn-to-pfn translation since the SP is direct, so no need to + * modify them. + * + * However, if a given SPTE points to a lower level page table, + * that lower level page table may only be partially populated. + * Installing such SPTEs would effectively unmap a potion of the + * huge page. Unmapping guest memory always requires a TLB flush + * since a subsequent operation on the unmapped regions would + * fail to detect the need to flush. + */ + if (is_shadow_present_pte(*sptep)) { + flush |= !is_last_spte(*sptep, sp->role.level); + continue; + } + + spte = make_huge_page_split_spte(kvm, huge_spte, sp->role, index); + mmu_spte_set(sptep, spte); + __rmap_add(kvm, cache, slot, sptep, gfn, sp->role.access); + } + + __link_shadow_page(kvm, cache, huge_sptep, sp, flush); +} + +static int shadow_mmu_try_split_huge_page(struct kvm *kvm, + const struct kvm_memory_slot *slot, + u64 *huge_sptep) +{ + struct kvm_mmu_page *huge_sp = sptep_to_sp(huge_sptep); + int level, r = 0; + gfn_t gfn; + u64 spte; + + /* Grab information for the tracepoint before dropping the MMU lock. */ + gfn = kvm_mmu_page_get_gfn(huge_sp, spte_index(huge_sptep)); + level = huge_sp->role.level; + spte = *huge_sptep; + + if (kvm_mmu_available_pages(kvm) <= KVM_MIN_FREE_MMU_PAGES) { + r = -ENOSPC; + goto out; + } + + if (need_topup_split_caches_or_resched(kvm)) { + write_unlock(&kvm->mmu_lock); + cond_resched(); + /* + * If the topup succeeds, return -EAGAIN to indicate that the + * rmap iterator should be restarted because the MMU lock was + * dropped. + */ + r = topup_split_caches(kvm) ?: -EAGAIN; + write_lock(&kvm->mmu_lock); + goto out; + } + + shadow_mmu_split_huge_page(kvm, slot, huge_sptep); + +out: + trace_kvm_mmu_split_huge_page(gfn, spte, level, r); + return r; +} + +static bool shadow_mmu_try_split_huge_pages(struct kvm *kvm, + struct kvm_rmap_head *rmap_head, + const struct kvm_memory_slot *slot) +{ + struct rmap_iterator iter; + struct kvm_mmu_page *sp; + u64 *huge_sptep; + int r; + +restart: + for_each_rmap_spte(rmap_head, &iter, huge_sptep) { + sp = sptep_to_sp(huge_sptep); + + /* TDP MMU is enabled, so rmap only contains nested MMU SPs. */ + if (WARN_ON_ONCE(!sp->role.guest_mode)) + continue; + + /* The rmaps should never contain non-leaf SPTEs. */ + if (WARN_ON_ONCE(!is_large_pte(*huge_sptep))) + continue; + + /* SPs with level >PG_LEVEL_4K should never by unsync. */ + if (WARN_ON_ONCE(sp->unsync)) + continue; + + /* Don't bother splitting huge pages on invalid SPs. */ + if (sp->role.invalid) + continue; + + r = shadow_mmu_try_split_huge_page(kvm, slot, huge_sptep); + + /* + * The split succeeded or needs to be retried because the MMU + * lock was dropped. Either way, restart the iterator to get it + * back into a consistent state. + */ + if (!r || r == -EAGAIN) + goto restart; + + /* The split failed and shouldn't be retried (e.g. -ENOMEM). */ + break; + } + + return false; +} + +static void kvm_shadow_mmu_try_split_huge_pages(struct kvm *kvm, + const struct kvm_memory_slot *slot, + gfn_t start, gfn_t end, + int target_level) +{ + int level; + + /* + * Split huge pages starting with KVM_MAX_HUGEPAGE_LEVEL and working + * down to the target level. This ensures pages are recursively split + * all the way to the target level. There's no need to split pages + * already at the target level. + */ + for (level = KVM_MAX_HUGEPAGE_LEVEL; level > target_level; level--) { + slot_handle_level_range(kvm, slot, shadow_mmu_try_split_huge_pages, + level, level, start, end - 1, true, false); + } +} + /* Must be called with the mmu_lock held in write-mode. */ void kvm_mmu_try_split_huge_pages(struct kvm *kvm, const struct kvm_memory_slot *memslot, u64 start, u64 end, int target_level) { - if (is_tdp_mmu_enabled(kvm)) - kvm_tdp_mmu_try_split_huge_pages(kvm, memslot, start, end, - target_level, false); + if (!is_tdp_mmu_enabled(kvm)) + return; + + if (kvm_memslots_have_rmaps(kvm)) + kvm_shadow_mmu_try_split_huge_pages(kvm, memslot, start, end, target_level); + + kvm_tdp_mmu_try_split_huge_pages(kvm, memslot, start, end, target_level, false); /* * A TLB flush is unnecessary at this point for the same resons as in @@ -5973,12 +6385,19 @@ void kvm_mmu_slot_try_split_huge_pages(struct kvm *kvm, u64 start = memslot->base_gfn; u64 end = start + memslot->npages; - if (is_tdp_mmu_enabled(kvm)) { - read_lock(&kvm->mmu_lock); - kvm_tdp_mmu_try_split_huge_pages(kvm, memslot, start, end, target_level, true); - read_unlock(&kvm->mmu_lock); + if (!is_tdp_mmu_enabled(kvm)) + return; + + if (kvm_memslots_have_rmaps(kvm)) { + write_lock(&kvm->mmu_lock); + kvm_shadow_mmu_try_split_huge_pages(kvm, memslot, start, end, target_level); + write_unlock(&kvm->mmu_lock); } + read_lock(&kvm->mmu_lock); + kvm_tdp_mmu_try_split_huge_pages(kvm, memslot, start, end, target_level, true); + read_unlock(&kvm->mmu_lock); + /* * No TLB flush is necessary here. KVM will flush TLBs after * write-protecting and/or clearing dirty on the newly split SPTEs to @@ -5997,13 +6416,11 @@ static bool kvm_mmu_zap_collapsible_spte(struct kvm *kvm, u64 *sptep; struct rmap_iterator iter; int need_tlb_flush = 0; - kvm_pfn_t pfn; struct kvm_mmu_page *sp; restart: for_each_rmap_spte(rmap_head, &iter, sptep) { sp = sptep_to_sp(sptep); - pfn = spte_to_pfn(*sptep); /* * We cannot do huge page mapping for indirect shadow pages, @@ -6012,10 +6429,10 @@ restart: * the guest, and the guest page table is using 4K page size * mapping if the indirect sp has level = 1. */ - if (sp->role.direct && !kvm_is_reserved_pfn(pfn) && + if (sp->role.direct && sp->role.level < kvm_mmu_max_mapping_level(kvm, slot, sp->gfn, - pfn, PG_LEVEL_NUM)) { - pte_list_remove(kvm, rmap_head, sptep); + PG_LEVEL_NUM)) { + kvm_zap_one_rmap_spte(kvm, rmap_head, sptep); if (kvm_available_flush_tlb_with_range()) kvm_flush_remote_tlbs_with_address(kvm, sp->gfn, @@ -6030,18 +6447,24 @@ restart: return need_tlb_flush; } +static void kvm_rmap_zap_collapsible_sptes(struct kvm *kvm, + const struct kvm_memory_slot *slot) +{ + /* + * Note, use KVM_MAX_HUGEPAGE_LEVEL - 1 since there's no need to zap + * pages that are already mapped at the maximum hugepage level. + */ + if (slot_handle_level(kvm, slot, kvm_mmu_zap_collapsible_spte, + PG_LEVEL_4K, KVM_MAX_HUGEPAGE_LEVEL - 1, true)) + kvm_arch_flush_remote_tlbs_memslot(kvm, slot); +} + void kvm_mmu_zap_collapsible_sptes(struct kvm *kvm, const struct kvm_memory_slot *slot) { if (kvm_memslots_have_rmaps(kvm)) { write_lock(&kvm->mmu_lock); - /* - * Zap only 4k SPTEs since the legacy MMU only supports dirty - * logging at a 4k granularity and never creates collapsible - * 2m SPTEs during dirty logging. - */ - if (slot_handle_level_4k(kvm, slot, kvm_mmu_zap_collapsible_spte, true)) - kvm_arch_flush_remote_tlbs_memslot(kvm, slot); + kvm_rmap_zap_collapsible_sptes(kvm, slot); write_unlock(&kvm->mmu_lock); } @@ -6317,7 +6740,7 @@ int kvm_mmu_vendor_module_init(void) if (percpu_counter_init(&kvm_total_used_mmu_pages, 0, GFP_KERNEL)) goto out; - ret = register_shrinker(&mmu_shrinker); + ret = register_shrinker(&mmu_shrinker, "x86-mmu"); if (ret) goto out; diff --git a/arch/x86/kvm/mmu/mmu_internal.h b/arch/x86/kvm/mmu/mmu_internal.h index bd2a26897b97..582def531d4d 100644 --- a/arch/x86/kvm/mmu/mmu_internal.h +++ b/arch/x86/kvm/mmu/mmu_internal.h @@ -20,6 +20,20 @@ extern bool dbg; #define MMU_WARN_ON(x) do { } while (0) #endif +/* Page table builder macros common to shadow (host) PTEs and guest PTEs. */ +#define __PT_LEVEL_SHIFT(level, bits_per_level) \ + (PAGE_SHIFT + ((level) - 1) * (bits_per_level)) +#define __PT_INDEX(address, level, bits_per_level) \ + (((address) >> __PT_LEVEL_SHIFT(level, bits_per_level)) & ((1 << (bits_per_level)) - 1)) + +#define __PT_LVL_ADDR_MASK(base_addr_mask, level, bits_per_level) \ + ((base_addr_mask) & ~((1ULL << (PAGE_SHIFT + (((level) - 1) * (bits_per_level)))) - 1)) + +#define __PT_LVL_OFFSET_MASK(base_addr_mask, level, bits_per_level) \ + ((base_addr_mask) & ((1ULL << (PAGE_SHIFT + (((level) - 1) * (bits_per_level)))) - 1)) + +#define __PT_ENT_PER_PAGE(bits_per_level) (1 << (bits_per_level)) + /* * Unlike regular MMU roots, PAE "roots", a.k.a. PDPTEs/PDPTRs, have a PRESENT * bit, and thus are guaranteed to be non-zero when valid. And, when a guest @@ -53,8 +67,21 @@ struct kvm_mmu_page { gfn_t gfn; u64 *spt; - /* hold the gfn of each spte inside spt */ - gfn_t *gfns; + + /* + * Stores the result of the guest translation being shadowed by each + * SPTE. KVM shadows two types of guest translations: nGPA -> GPA + * (shadow EPT/NPT) and GVA -> GPA (traditional shadow paging). In both + * cases the result of the translation is a GPA and a set of access + * constraints. + * + * The GFN is stored in the upper bits (PAGE_SHIFT) and the shadowed + * access permissions are stored in the lower bits. Note, for + * convenience and uniformity across guests, the access permissions are + * stored in KVM format (e.g. ACC_EXEC_MASK) not the raw guest format. + */ + u64 *shadowed_translation; + /* Currently serving as active root */ union { int root_count; @@ -141,9 +168,9 @@ void kvm_flush_remote_tlbs_with_address(struct kvm *kvm, unsigned int pte_list_count(struct kvm_rmap_head *rmap_head); extern int nx_huge_pages; -static inline bool is_nx_huge_page_enabled(void) +static inline bool is_nx_huge_page_enabled(struct kvm *kvm) { - return READ_ONCE(nx_huge_pages); + return READ_ONCE(nx_huge_pages) && !kvm->arch.disable_nx_huge_pages; } struct kvm_page_fault { @@ -242,7 +269,8 @@ static inline int kvm_mmu_do_page_fault(struct kvm_vcpu *vcpu, gpa_t cr2_or_gpa, .user = err & PFERR_USER_MASK, .prefetch = prefetch, .is_tdp = likely(vcpu->arch.mmu->page_fault == kvm_tdp_page_fault), - .nx_huge_page_workaround_enabled = is_nx_huge_page_enabled(), + .nx_huge_page_workaround_enabled = + is_nx_huge_page_enabled(vcpu->kvm), .max_level = KVM_MAX_HUGEPAGE_LEVEL, .req_level = PG_LEVEL_4K, @@ -281,7 +309,7 @@ static inline int kvm_mmu_do_page_fault(struct kvm_vcpu *vcpu, gpa_t cr2_or_gpa, int kvm_mmu_max_mapping_level(struct kvm *kvm, const struct kvm_memory_slot *slot, gfn_t gfn, - kvm_pfn_t pfn, int max_level); + int max_level); void kvm_mmu_hugepage_adjust(struct kvm_vcpu *vcpu, struct kvm_page_fault *fault); void disallowed_hugepage_adjust(struct kvm_page_fault *fault, u64 spte, int cur_level); diff --git a/arch/x86/kvm/mmu/paging.h b/arch/x86/kvm/mmu/paging.h deleted file mode 100644 index de8ab323bb70..000000000000 --- a/arch/x86/kvm/mmu/paging.h +++ /dev/null @@ -1,14 +0,0 @@ -/* SPDX-License-Identifier: GPL-2.0-only */ -/* Shadow paging constants/helpers that don't need to be #undef'd. */ -#ifndef __KVM_X86_PAGING_H -#define __KVM_X86_PAGING_H - -#define GUEST_PT64_BASE_ADDR_MASK (((1ULL << 52) - 1) & ~(u64)(PAGE_SIZE-1)) -#define PT64_LVL_ADDR_MASK(level) \ - (GUEST_PT64_BASE_ADDR_MASK & ~((1ULL << (PAGE_SHIFT + (((level) - 1) \ - * PT64_LEVEL_BITS))) - 1)) -#define PT64_LVL_OFFSET_MASK(level) \ - (GUEST_PT64_BASE_ADDR_MASK & ((1ULL << (PAGE_SHIFT + (((level) - 1) \ - * PT64_LEVEL_BITS))) - 1)) -#endif /* __KVM_X86_PAGING_H */ - diff --git a/arch/x86/kvm/mmu/paging_tmpl.h b/arch/x86/kvm/mmu/paging_tmpl.h index db80f7ccaa4e..f5958071220c 100644 --- a/arch/x86/kvm/mmu/paging_tmpl.h +++ b/arch/x86/kvm/mmu/paging_tmpl.h @@ -16,25 +16,21 @@ */ /* - * We need the mmu code to access both 32-bit and 64-bit guest ptes, - * so the code in this file is compiled twice, once per pte size. + * The MMU needs to be able to access/walk 32-bit and 64-bit guest page tables, + * as well as guest EPT tables, so the code in this file is compiled thrice, + * once per guest PTE type. The per-type defines are #undef'd at the end. */ #if PTTYPE == 64 #define pt_element_t u64 #define guest_walker guest_walker64 #define FNAME(name) paging##64_##name - #define PT_BASE_ADDR_MASK GUEST_PT64_BASE_ADDR_MASK - #define PT_LVL_ADDR_MASK(lvl) PT64_LVL_ADDR_MASK(lvl) - #define PT_LVL_OFFSET_MASK(lvl) PT64_LVL_OFFSET_MASK(lvl) - #define PT_INDEX(addr, level) PT64_INDEX(addr, level) - #define PT_LEVEL_BITS PT64_LEVEL_BITS + #define PT_LEVEL_BITS 9 #define PT_GUEST_DIRTY_SHIFT PT_DIRTY_SHIFT #define PT_GUEST_ACCESSED_SHIFT PT_ACCESSED_SHIFT #define PT_HAVE_ACCESSED_DIRTY(mmu) true #ifdef CONFIG_X86_64 #define PT_MAX_FULL_LEVELS PT64_ROOT_MAX_LEVEL - #define CMPXCHG "cmpxchgq" #else #define PT_MAX_FULL_LEVELS 2 #endif @@ -42,36 +38,35 @@ #define pt_element_t u32 #define guest_walker guest_walker32 #define FNAME(name) paging##32_##name - #define PT_BASE_ADDR_MASK PT32_BASE_ADDR_MASK - #define PT_LVL_ADDR_MASK(lvl) PT32_LVL_ADDR_MASK(lvl) - #define PT_LVL_OFFSET_MASK(lvl) PT32_LVL_OFFSET_MASK(lvl) - #define PT_INDEX(addr, level) PT32_INDEX(addr, level) - #define PT_LEVEL_BITS PT32_LEVEL_BITS + #define PT_LEVEL_BITS 10 #define PT_MAX_FULL_LEVELS 2 #define PT_GUEST_DIRTY_SHIFT PT_DIRTY_SHIFT #define PT_GUEST_ACCESSED_SHIFT PT_ACCESSED_SHIFT #define PT_HAVE_ACCESSED_DIRTY(mmu) true - #define CMPXCHG "cmpxchgl" + + #define PT32_DIR_PSE36_SIZE 4 + #define PT32_DIR_PSE36_SHIFT 13 + #define PT32_DIR_PSE36_MASK \ + (((1ULL << PT32_DIR_PSE36_SIZE) - 1) << PT32_DIR_PSE36_SHIFT) #elif PTTYPE == PTTYPE_EPT #define pt_element_t u64 #define guest_walker guest_walkerEPT #define FNAME(name) ept_##name - #define PT_BASE_ADDR_MASK GUEST_PT64_BASE_ADDR_MASK - #define PT_LVL_ADDR_MASK(lvl) PT64_LVL_ADDR_MASK(lvl) - #define PT_LVL_OFFSET_MASK(lvl) PT64_LVL_OFFSET_MASK(lvl) - #define PT_INDEX(addr, level) PT64_INDEX(addr, level) - #define PT_LEVEL_BITS PT64_LEVEL_BITS + #define PT_LEVEL_BITS 9 #define PT_GUEST_DIRTY_SHIFT 9 #define PT_GUEST_ACCESSED_SHIFT 8 #define PT_HAVE_ACCESSED_DIRTY(mmu) (!(mmu)->cpu_role.base.ad_disabled) - #ifdef CONFIG_X86_64 - #define CMPXCHG "cmpxchgq" - #endif #define PT_MAX_FULL_LEVELS PT64_ROOT_MAX_LEVEL #else #error Invalid PTTYPE value #endif +/* Common logic, but per-type values. These also need to be undefined. */ +#define PT_BASE_ADDR_MASK ((pt_element_t)(((1ULL << 52) - 1) & ~(u64)(PAGE_SIZE-1))) +#define PT_LVL_ADDR_MASK(lvl) __PT_LVL_ADDR_MASK(PT_BASE_ADDR_MASK, lvl, PT_LEVEL_BITS) +#define PT_LVL_OFFSET_MASK(lvl) __PT_LVL_OFFSET_MASK(PT_BASE_ADDR_MASK, lvl, PT_LEVEL_BITS) +#define PT_INDEX(addr, lvl) __PT_INDEX(addr, lvl, PT_LEVEL_BITS) + #define PT_GUEST_DIRTY_MASK (1 << PT_GUEST_DIRTY_SHIFT) #define PT_GUEST_ACCESSED_MASK (1 << PT_GUEST_ACCESSED_SHIFT) @@ -97,6 +92,15 @@ struct guest_walker { struct x86_exception fault; }; +#if PTTYPE == 32 +static inline gfn_t pse36_gfn_delta(u32 gpte) +{ + int shift = 32 - PT32_DIR_PSE36_SHIFT - PAGE_SHIFT; + + return (gpte & PT32_DIR_PSE36_MASK) << shift; +} +#endif + static gfn_t gpte_to_gfn_lvl(pt_element_t gpte, int lvl) { return (gpte & PT_LVL_ADDR_MASK(lvl)) >> PAGE_SHIFT; @@ -374,7 +378,7 @@ retry_walk: * information to fix the exit_qualification or exit_info_1 * fields. */ - if (unlikely(real_gpa == UNMAPPED_GVA)) + if (unlikely(real_gpa == INVALID_GPA)) return 0; host_addr = kvm_vcpu_gfn_to_hva_prot(vcpu, gpa_to_gfn(real_gpa), @@ -421,11 +425,13 @@ retry_walk: gfn = gpte_to_gfn_lvl(pte, walker->level); gfn += (addr & PT_LVL_OFFSET_MASK(walker->level)) >> PAGE_SHIFT; - if (PTTYPE == 32 && walker->level > PG_LEVEL_4K && is_cpuid_PSE36()) +#if PTTYPE == 32 + if (walker->level > PG_LEVEL_4K && is_cpuid_PSE36()) gfn += pse36_gfn_delta(pte); +#endif real_gpa = kvm_translate_gpa(vcpu, mmu, gfn_to_gpa(gfn), access, &walker->fault); - if (real_gpa == UNMAPPED_GVA) + if (real_gpa == INVALID_GPA) return 0; walker->gfn = real_gpa >> PAGE_SHIFT; @@ -589,7 +595,7 @@ static void FNAME(pte_prefetch)(struct kvm_vcpu *vcpu, struct guest_walker *gw, if (sp->role.direct) return __direct_pte_prefetch(vcpu, sp, sptep); - i = (sptep - sp->spt) & ~(PTE_PREFETCH_NUM - 1); + i = spte_index(sptep) & ~(PTE_PREFETCH_NUM - 1); spte = sp->spt + i; for (i = 0; i < PTE_PREFETCH_NUM; i++, spte++) { @@ -642,14 +648,13 @@ static int FNAME(fetch)(struct kvm_vcpu *vcpu, struct kvm_page_fault *fault, gfn_t table_gfn; clear_sp_write_flooding_count(it.sptep); - drop_large_spte(vcpu, it.sptep); - - sp = NULL; - if (!is_shadow_present_pte(*it.sptep)) { - table_gfn = gw->table_gfn[it.level - 2]; - access = gw->pt_access[it.level - 2]; - sp = kvm_mmu_get_page(vcpu, table_gfn, fault->addr, - it.level-1, false, access); + + table_gfn = gw->table_gfn[it.level - 2]; + access = gw->pt_access[it.level - 2]; + sp = kvm_mmu_get_child_sp(vcpu, it.sptep, table_gfn, + false, access); + + if (sp != ERR_PTR(-EEXIST)) { /* * We must synchronize the pagetable before linking it * because the guest doesn't need to flush tlb when @@ -678,7 +683,7 @@ static int FNAME(fetch)(struct kvm_vcpu *vcpu, struct kvm_page_fault *fault, if (FNAME(gpte_changed)(vcpu, gw, it.level - 1)) goto out_gpte_changed; - if (sp) + if (sp != ERR_PTR(-EEXIST)) link_shadow_page(vcpu, it.sptep, sp); } @@ -702,16 +707,15 @@ static int FNAME(fetch)(struct kvm_vcpu *vcpu, struct kvm_page_fault *fault, validate_direct_spte(vcpu, it.sptep, direct_access); - drop_large_spte(vcpu, it.sptep); + sp = kvm_mmu_get_child_sp(vcpu, it.sptep, base_gfn, + true, direct_access); + if (sp == ERR_PTR(-EEXIST)) + continue; - if (!is_shadow_present_pte(*it.sptep)) { - sp = kvm_mmu_get_page(vcpu, base_gfn, fault->addr, - it.level - 1, true, direct_access); - link_shadow_page(vcpu, it.sptep, sp); - if (fault->huge_page_disallowed && - fault->req_level >= it.level) - account_huge_nx_page(vcpu->kvm, sp); - } + link_shadow_page(vcpu, it.sptep, sp); + if (fault->huge_page_disallowed && + fault->req_level >= it.level) + account_huge_nx_page(vcpu->kvm, sp); } if (WARN_ON_ONCE(it.level != fault->goal_level)) @@ -888,7 +892,7 @@ static gpa_t FNAME(get_level1_sp_gpa)(struct kvm_mmu_page *sp) WARN_ON(sp->role.level != PG_LEVEL_4K); if (PTTYPE == 32) - offset = sp->role.quadrant << PT64_LEVEL_BITS; + offset = sp->role.quadrant << SPTE_LEVEL_BITS; return gfn_to_gpa(sp->gfn) + offset * sizeof(pt_element_t); } @@ -929,7 +933,7 @@ static void FNAME(invlpg)(struct kvm_vcpu *vcpu, gva_t gva, hpa_t root_hpa) break; pte_gpa = FNAME(get_level1_sp_gpa)(sp); - pte_gpa += (sptep - sp->spt) * sizeof(pt_element_t); + pte_gpa += spte_index(sptep) * sizeof(pt_element_t); mmu_page_zap_pte(vcpu->kvm, sp, sptep, NULL); if (is_shadow_present_pte(old_spte)) @@ -958,7 +962,7 @@ static gpa_t FNAME(gva_to_gpa)(struct kvm_vcpu *vcpu, struct kvm_mmu *mmu, struct x86_exception *exception) { struct guest_walker walker; - gpa_t gpa = UNMAPPED_GVA; + gpa_t gpa = INVALID_GPA; int r; #ifndef CONFIG_X86_64 @@ -978,7 +982,8 @@ static gpa_t FNAME(gva_to_gpa)(struct kvm_vcpu *vcpu, struct kvm_mmu *mmu, } /* - * Using the cached information from sp->gfns is safe because: + * Using the information in sp->shadowed_translation (kvm_mmu_page_get_gfn()) is + * safe because: * - The spte has a reference to the struct page, so the pfn for a given gfn * can't change unless all sptes pointing to it are nuked first. * @@ -1023,7 +1028,7 @@ static int FNAME(sync_page)(struct kvm_vcpu *vcpu, struct kvm_mmu_page *sp) first_pte_gpa = FNAME(get_level1_sp_gpa)(sp); - for (i = 0; i < PT64_ENT_PER_PAGE; i++) { + for (i = 0; i < SPTE_ENT_PER_PAGE; i++) { u64 *sptep, spte; struct kvm_memory_slot *slot; unsigned pte_access; @@ -1053,12 +1058,23 @@ static int FNAME(sync_page)(struct kvm_vcpu *vcpu, struct kvm_mmu_page *sp) if (sync_mmio_spte(vcpu, &sp->spt[i], gfn, pte_access)) continue; - if (gfn != sp->gfns[i]) { + /* + * Drop the SPTE if the new protections would result in a RWX=0 + * SPTE or if the gfn is changing. The RWX=0 case only affects + * EPT with execute-only support, i.e. EPT without an effective + * "present" bit, as all other paging modes will create a + * read-only SPTE if pte_access is zero. + */ + if ((!pte_access && !shadow_present_mask) || + gfn != kvm_mmu_page_get_gfn(sp, i)) { drop_spte(vcpu->kvm, &sp->spt[i]); flush = true; continue; } + /* Update the shadowed access bits in case they changed. */ + kvm_mmu_page_set_access(sp, i, pte_access); + sptep = &sp->spt[i]; spte = *sptep; host_writable = spte & shadow_host_writable_mask; @@ -1070,6 +1086,15 @@ static int FNAME(sync_page)(struct kvm_vcpu *vcpu, struct kvm_mmu_page *sp) flush |= mmu_spte_update(sptep, spte); } + /* + * Note, any flush is purely for KVM's correctness, e.g. when dropping + * an existing SPTE or clearing W/A/D bits to ensure an mmu_notifier + * unmap or dirty logging event doesn't fail to flush. The guest is + * responsible for flushing the TLB to ensure any changes in protection + * bits are recognized, i.e. until the guest flushes or page faults on + * a relevant address, KVM is architecturally allowed to let vCPUs use + * cached translations with the old protection bits. + */ return flush; } @@ -1084,7 +1109,6 @@ static int FNAME(sync_page)(struct kvm_vcpu *vcpu, struct kvm_mmu_page *sp) #undef PT_MAX_FULL_LEVELS #undef gpte_to_gfn #undef gpte_to_gfn_lvl -#undef CMPXCHG #undef PT_GUEST_ACCESSED_MASK #undef PT_GUEST_DIRTY_MASK #undef PT_GUEST_DIRTY_SHIFT diff --git a/arch/x86/kvm/mmu/spte.c b/arch/x86/kvm/mmu/spte.c index b5960bbde7f7..7314d27d57a4 100644 --- a/arch/x86/kvm/mmu/spte.c +++ b/arch/x86/kvm/mmu/spte.c @@ -33,6 +33,7 @@ u64 __read_mostly shadow_mmio_value; u64 __read_mostly shadow_mmio_mask; u64 __read_mostly shadow_mmio_access_mask; u64 __read_mostly shadow_present_mask; +u64 __read_mostly shadow_memtype_mask; u64 __read_mostly shadow_me_value; u64 __read_mostly shadow_me_mask; u64 __read_mostly shadow_acc_track_mask; @@ -129,6 +130,8 @@ bool make_spte(struct kvm_vcpu *vcpu, struct kvm_mmu_page *sp, u64 spte = SPTE_MMU_PRESENT_MASK; bool wrprot = false; + WARN_ON_ONCE(!pte_access && !shadow_present_mask); + if (sp->role.ad_disabled) spte |= SPTE_TDP_AD_DISABLED_MASK; else if (kvm_mmu_page_ad_need_write_protect(sp)) @@ -145,7 +148,7 @@ bool make_spte(struct kvm_vcpu *vcpu, struct kvm_mmu_page *sp, spte |= spte_shadow_accessed_mask(spte); if (level > PG_LEVEL_4K && (pte_access & ACC_EXEC_MASK) && - is_nx_huge_page_enabled()) { + is_nx_huge_page_enabled(vcpu->kvm)) { pte_access &= ~ACC_EXEC_MASK; } @@ -159,10 +162,10 @@ bool make_spte(struct kvm_vcpu *vcpu, struct kvm_mmu_page *sp, if (level > PG_LEVEL_4K) spte |= PT_PAGE_SIZE_MASK; - if (tdp_enabled) - spte |= static_call(kvm_x86_get_mt_mask)(vcpu, gfn, - kvm_is_mmio_pfn(pfn)); + if (shadow_memtype_mask) + spte |= static_call(kvm_x86_get_mt_mask)(vcpu, gfn, + kvm_is_mmio_pfn(pfn)); if (host_writable) spte |= shadow_host_writable_mask; else @@ -244,10 +247,10 @@ static u64 make_spte_executable(u64 spte) * This is used during huge page splitting to build the SPTEs that make up the * new page table. */ -u64 make_huge_page_split_spte(u64 huge_spte, int huge_level, int index) +u64 make_huge_page_split_spte(struct kvm *kvm, u64 huge_spte, union kvm_mmu_page_role role, + int index) { u64 child_spte; - int child_level; if (WARN_ON_ONCE(!is_shadow_present_pte(huge_spte))) return 0; @@ -256,23 +259,23 @@ u64 make_huge_page_split_spte(u64 huge_spte, int huge_level, int index) return 0; child_spte = huge_spte; - child_level = huge_level - 1; /* * The child_spte already has the base address of the huge page being * split. So we just have to OR in the offset to the page at the next * lower level for the given index. */ - child_spte |= (index * KVM_PAGES_PER_HPAGE(child_level)) << PAGE_SHIFT; + child_spte |= (index * KVM_PAGES_PER_HPAGE(role.level)) << PAGE_SHIFT; - if (child_level == PG_LEVEL_4K) { + if (role.level == PG_LEVEL_4K) { child_spte &= ~PT_PAGE_SIZE_MASK; /* - * When splitting to a 4K page, mark the page executable as the - * NX hugepage mitigation no longer applies. + * When splitting to a 4K page where execution is allowed, mark + * the page executable as the NX hugepage mitigation no longer + * applies. */ - if (is_nx_huge_page_enabled()) + if ((role.access & ACC_EXEC_MASK) && is_nx_huge_page_enabled(kvm)) child_spte = make_spte_executable(child_spte); } @@ -299,7 +302,7 @@ u64 kvm_mmu_changed_pte_notifier_make_spte(u64 old_spte, kvm_pfn_t new_pfn) { u64 new_spte; - new_spte = old_spte & ~PT64_BASE_ADDR_MASK; + new_spte = old_spte & ~SPTE_BASE_ADDR_MASK; new_spte |= (u64)new_pfn << PAGE_SHIFT; new_spte &= ~PT_WRITABLE_MASK; @@ -389,6 +392,13 @@ void kvm_mmu_set_ept_masks(bool has_ad_bits, bool has_exec_only) shadow_nx_mask = 0ull; shadow_x_mask = VMX_EPT_EXECUTABLE_MASK; shadow_present_mask = has_exec_only ? 0ull : VMX_EPT_READABLE_MASK; + /* + * EPT overrides the host MTRRs, and so KVM must program the desired + * memtype directly into the SPTEs. Note, this mask is just the mask + * of all bits that factor into the memtype, the actual memtype must be + * dynamically calculated, e.g. to ensure host MMIO is mapped UC. + */ + shadow_memtype_mask = VMX_EPT_MT_MASK | VMX_EPT_IPAT_BIT; shadow_acc_track_mask = VMX_EPT_RWX_MASK; shadow_host_writable_mask = EPT_SPTE_HOST_WRITABLE; shadow_mmu_writable_mask = EPT_SPTE_MMU_WRITABLE; @@ -439,6 +449,13 @@ void kvm_mmu_reset_all_pte_masks(void) shadow_nx_mask = PT64_NX_MASK; shadow_x_mask = 0; shadow_present_mask = PT_PRESENT_MASK; + + /* + * For shadow paging and NPT, KVM uses PAT entry '0' to encode WB + * memtype in the SPTEs, i.e. relies on host MTRRs to provide the + * correct memtype (WB is the "weakest" memtype). + */ + shadow_memtype_mask = 0; shadow_acc_track_mask = 0; shadow_me_mask = 0; shadow_me_value = 0; diff --git a/arch/x86/kvm/mmu/spte.h b/arch/x86/kvm/mmu/spte.h index 0127bb6e3c7d..cabe3fbb4f39 100644 --- a/arch/x86/kvm/mmu/spte.h +++ b/arch/x86/kvm/mmu/spte.h @@ -36,12 +36,12 @@ extern bool __read_mostly enable_mmio_caching; static_assert(SPTE_TDP_AD_ENABLED_MASK == 0); #ifdef CONFIG_DYNAMIC_PHYSICAL_MASK -#define PT64_BASE_ADDR_MASK (physical_mask & ~(u64)(PAGE_SIZE-1)) +#define SPTE_BASE_ADDR_MASK (physical_mask & ~(u64)(PAGE_SIZE-1)) #else -#define PT64_BASE_ADDR_MASK (((1ULL << 52) - 1) & ~(u64)(PAGE_SIZE-1)) +#define SPTE_BASE_ADDR_MASK (((1ULL << 52) - 1) & ~(u64)(PAGE_SIZE-1)) #endif -#define PT64_PERM_MASK (PT_PRESENT_MASK | PT_WRITABLE_MASK | shadow_user_mask \ +#define SPTE_PERM_MASK (PT_PRESENT_MASK | PT_WRITABLE_MASK | shadow_user_mask \ | shadow_x_mask | shadow_nx_mask | shadow_me_mask) #define ACC_EXEC_MASK 1 @@ -50,17 +50,13 @@ static_assert(SPTE_TDP_AD_ENABLED_MASK == 0); #define ACC_ALL (ACC_EXEC_MASK | ACC_WRITE_MASK | ACC_USER_MASK) /* The mask for the R/X bits in EPT PTEs */ -#define PT64_EPT_READABLE_MASK 0x1ull -#define PT64_EPT_EXECUTABLE_MASK 0x4ull +#define SPTE_EPT_READABLE_MASK 0x1ull +#define SPTE_EPT_EXECUTABLE_MASK 0x4ull -#define PT64_LEVEL_BITS 9 - -#define PT64_LEVEL_SHIFT(level) \ - (PAGE_SHIFT + (level - 1) * PT64_LEVEL_BITS) - -#define PT64_INDEX(address, level)\ - (((address) >> PT64_LEVEL_SHIFT(level)) & ((1 << PT64_LEVEL_BITS) - 1)) -#define SHADOW_PT_INDEX(addr, level) PT64_INDEX(addr, level) +#define SPTE_LEVEL_BITS 9 +#define SPTE_LEVEL_SHIFT(level) __PT_LEVEL_SHIFT(level, SPTE_LEVEL_BITS) +#define SPTE_INDEX(address, level) __PT_INDEX(address, level, SPTE_LEVEL_BITS) +#define SPTE_ENT_PER_PAGE __PT_ENT_PER_PAGE(SPTE_LEVEL_BITS) /* * The mask/shift to use for saving the original R/X bits when marking the PTE @@ -69,8 +65,8 @@ static_assert(SPTE_TDP_AD_ENABLED_MASK == 0); * restored only when a write is attempted to the page. This mask obviously * must not overlap the A/D type mask. */ -#define SHADOW_ACC_TRACK_SAVED_BITS_MASK (PT64_EPT_READABLE_MASK | \ - PT64_EPT_EXECUTABLE_MASK) +#define SHADOW_ACC_TRACK_SAVED_BITS_MASK (SPTE_EPT_READABLE_MASK | \ + SPTE_EPT_EXECUTABLE_MASK) #define SHADOW_ACC_TRACK_SAVED_BITS_SHIFT 54 #define SHADOW_ACC_TRACK_SAVED_MASK (SHADOW_ACC_TRACK_SAVED_BITS_MASK << \ SHADOW_ACC_TRACK_SAVED_BITS_SHIFT) @@ -151,6 +147,7 @@ extern u64 __read_mostly shadow_mmio_value; extern u64 __read_mostly shadow_mmio_mask; extern u64 __read_mostly shadow_mmio_access_mask; extern u64 __read_mostly shadow_present_mask; +extern u64 __read_mostly shadow_memtype_mask; extern u64 __read_mostly shadow_me_value; extern u64 __read_mostly shadow_me_mask; @@ -194,6 +191,12 @@ static inline bool is_removed_spte(u64 spte) return spte == REMOVED_SPTE; } +/* Get an SPTE's index into its parent's page table (and the spt array). */ +static inline int spte_index(u64 *sptep) +{ + return ((unsigned long)sptep / sizeof(*sptep)) & (SPTE_ENT_PER_PAGE - 1); +} + /* * In some cases, we need to preserve the GFN of a non-present or reserved * SPTE when we usurp the upper five bits of the physical address space to @@ -282,7 +285,7 @@ static inline bool is_executable_pte(u64 spte) static inline kvm_pfn_t spte_to_pfn(u64 pte) { - return (pte & PT64_BASE_ADDR_MASK) >> PAGE_SHIFT; + return (pte & SPTE_BASE_ADDR_MASK) >> PAGE_SHIFT; } static inline bool is_accessed_spte(u64 spte) @@ -425,7 +428,8 @@ bool make_spte(struct kvm_vcpu *vcpu, struct kvm_mmu_page *sp, unsigned int pte_access, gfn_t gfn, kvm_pfn_t pfn, u64 old_spte, bool prefetch, bool can_unsync, bool host_writable, u64 *new_spte); -u64 make_huge_page_split_spte(u64 huge_spte, int huge_level, int index); +u64 make_huge_page_split_spte(struct kvm *kvm, u64 huge_spte, + union kvm_mmu_page_role role, int index); u64 make_nonleaf_spte(u64 *child_pt, bool ad_disabled); u64 make_mmio_spte(struct kvm_vcpu *vcpu, u64 gfn, unsigned int access); u64 mark_spte_for_access_track(u64 spte); diff --git a/arch/x86/kvm/mmu/tdp_iter.c b/arch/x86/kvm/mmu/tdp_iter.c index ee4802d7b36c..39b48e7d7d1a 100644 --- a/arch/x86/kvm/mmu/tdp_iter.c +++ b/arch/x86/kvm/mmu/tdp_iter.c @@ -11,7 +11,7 @@ static void tdp_iter_refresh_sptep(struct tdp_iter *iter) { iter->sptep = iter->pt_path[iter->level - 1] + - SHADOW_PT_INDEX(iter->gfn << PAGE_SHIFT, iter->level); + SPTE_INDEX(iter->gfn << PAGE_SHIFT, iter->level); iter->old_spte = kvm_tdp_mmu_read_spte(iter->sptep); } @@ -116,8 +116,8 @@ static bool try_step_side(struct tdp_iter *iter) * Check if the iterator is already at the end of the current page * table. */ - if (SHADOW_PT_INDEX(iter->gfn << PAGE_SHIFT, iter->level) == - (PT64_ENT_PER_PAGE - 1)) + if (SPTE_INDEX(iter->gfn << PAGE_SHIFT, iter->level) == + (SPTE_ENT_PER_PAGE - 1)) return false; iter->gfn += KVM_PAGES_PER_HPAGE(iter->level); @@ -146,15 +146,6 @@ static bool try_step_up(struct tdp_iter *iter) } /* - * Step the iterator back up a level in the paging structure. Should only be - * used when the iterator is below the root level. - */ -void tdp_iter_step_up(struct tdp_iter *iter) -{ - WARN_ON(!try_step_up(iter)); -} - -/* * Step to the next SPTE in a pre-order traversal of the paging structure. * To get to the next SPTE, the iterator either steps down towards the goal * GFN, if at a present, non-last-level SPTE, or over to a SPTE mapping a diff --git a/arch/x86/kvm/mmu/tdp_iter.h b/arch/x86/kvm/mmu/tdp_iter.h index adfca0cf94d3..f0af385c56e0 100644 --- a/arch/x86/kvm/mmu/tdp_iter.h +++ b/arch/x86/kvm/mmu/tdp_iter.h @@ -114,6 +114,5 @@ void tdp_iter_start(struct tdp_iter *iter, struct kvm_mmu_page *root, int min_level, gfn_t next_last_level_gfn); void tdp_iter_next(struct tdp_iter *iter); void tdp_iter_restart(struct tdp_iter *iter); -void tdp_iter_step_up(struct tdp_iter *iter); #endif /* __KVM_X86_MMU_TDP_ITER_H */ diff --git a/arch/x86/kvm/mmu/tdp_mmu.c b/arch/x86/kvm/mmu/tdp_mmu.c index 7b9265d67131..bf2ccf9debca 100644 --- a/arch/x86/kvm/mmu/tdp_mmu.c +++ b/arch/x86/kvm/mmu/tdp_mmu.c @@ -425,7 +425,7 @@ static void handle_removed_pt(struct kvm *kvm, tdp_ptep_t pt, bool shared) tdp_mmu_unlink_sp(kvm, sp, shared); - for (i = 0; i < PT64_ENT_PER_PAGE; i++) { + for (i = 0; i < SPTE_ENT_PER_PAGE; i++) { tdp_ptep_t sptep = pt + i; gfn_t gfn = base_gfn + i * KVM_PAGES_PER_HPAGE(level); u64 old_spte; @@ -633,7 +633,6 @@ static inline int tdp_mmu_set_spte_atomic(struct kvm *kvm, u64 new_spte) { u64 *sptep = rcu_dereference(iter->sptep); - u64 old_spte; /* * The caller is responsible for ensuring the old SPTE is not a REMOVED @@ -649,17 +648,8 @@ static inline int tdp_mmu_set_spte_atomic(struct kvm *kvm, * Note, fast_pf_fix_direct_spte() can also modify TDP MMU SPTEs and * does not hold the mmu_lock. */ - old_spte = cmpxchg64(sptep, iter->old_spte, new_spte); - if (old_spte != iter->old_spte) { - /* - * The page table entry was modified by a different logical - * CPU. Refresh iter->old_spte with the current value so the - * caller operates on fresh data, e.g. if it retries - * tdp_mmu_set_spte_atomic(). - */ - iter->old_spte = old_spte; + if (!try_cmpxchg64(sptep, &iter->old_spte, new_spte)) return -EBUSY; - } __handle_changed_spte(kvm, iter->as_id, iter->gfn, iter->old_spte, new_spte, iter->level, true); @@ -934,9 +924,6 @@ bool kvm_tdp_mmu_zap_sp(struct kvm *kvm, struct kvm_mmu_page *sp) } /* - * Zap leafs SPTEs for the range of gfns, [start, end). Returns true if SPTEs - * have been cleared and a TLB flush is needed before releasing the MMU lock. - * * If can_yield is true, will release the MMU lock and reschedule if the * scheduler needs the CPU or there is contention on the MMU lock. If this * function cannot yield, it will not release the MMU lock or reschedule and @@ -979,10 +966,9 @@ static bool tdp_mmu_zap_leafs(struct kvm *kvm, struct kvm_mmu_page *root, } /* - * Tears down the mappings for the range of gfns, [start, end), and frees the - * non-root pages mapping GFNs strictly within that range. Returns true if - * SPTEs have been cleared and a TLB flush is needed before releasing the - * MMU lock. + * Zap leaf SPTEs for the range of gfns, [start, end), for all roots. Returns + * true if a TLB flush is needed before releasing the MMU lock, i.e. if one or + * more SPTEs were zapped since the MMU lock was last acquired. */ bool kvm_tdp_mmu_zap_leafs(struct kvm *kvm, int as_id, gfn_t start, gfn_t end, bool can_yield, bool flush) @@ -1487,8 +1473,8 @@ static int tdp_mmu_split_huge_page(struct kvm *kvm, struct tdp_iter *iter, * No need for atomics when writing to sp->spt since the page table has * not been linked in yet and thus is not reachable from any other CPU. */ - for (i = 0; i < PT64_ENT_PER_PAGE; i++) - sp->spt[i] = make_huge_page_split_spte(huge_spte, level, i); + for (i = 0; i < SPTE_ENT_PER_PAGE; i++) + sp->spt[i] = make_huge_page_split_spte(kvm, huge_spte, sp->role, i); /* * Replace the huge spte with a pointer to the populated lower level @@ -1507,7 +1493,7 @@ static int tdp_mmu_split_huge_page(struct kvm *kvm, struct tdp_iter *iter, * are overwriting from the page stats. But we have to manually update * the page stats with the new present child pages. */ - kvm_update_page_stats(kvm, level - 1, PT64_ENT_PER_PAGE); + kvm_update_page_stats(kvm, level - 1, SPTE_ENT_PER_PAGE); out: trace_kvm_mmu_split_huge_page(iter->gfn, huge_spte, level, ret); @@ -1731,10 +1717,6 @@ void kvm_tdp_mmu_clear_dirty_pt_masked(struct kvm *kvm, clear_dirty_pt_masked(kvm, root, gfn, mask, wrprot); } -/* - * Clear leaf entries which could be replaced by large mappings, for - * GFNs within the slot. - */ static void zap_collapsible_spte_range(struct kvm *kvm, struct kvm_mmu_page *root, const struct kvm_memory_slot *slot) @@ -1743,61 +1725,52 @@ static void zap_collapsible_spte_range(struct kvm *kvm, gfn_t end = start + slot->npages; struct tdp_iter iter; int max_mapping_level; - kvm_pfn_t pfn; rcu_read_lock(); - tdp_root_for_each_pte(iter, root, start, end) { + for_each_tdp_pte_min_level(iter, root, PG_LEVEL_2M, start, end) { +retry: if (tdp_mmu_iter_cond_resched(kvm, &iter, false, true)) continue; - if (!is_shadow_present_pte(iter.old_spte) || - !is_last_spte(iter.old_spte, iter.level)) + if (iter.level > KVM_MAX_HUGEPAGE_LEVEL || + !is_shadow_present_pte(iter.old_spte)) continue; /* - * This is a leaf SPTE. Check if the PFN it maps can - * be mapped at a higher level. + * Don't zap leaf SPTEs, if a leaf SPTE could be replaced with + * a large page size, then its parent would have been zapped + * instead of stepping down. */ - pfn = spte_to_pfn(iter.old_spte); - - if (kvm_is_reserved_pfn(pfn)) + if (is_last_spte(iter.old_spte, iter.level)) continue; - max_mapping_level = kvm_mmu_max_mapping_level(kvm, slot, - iter.gfn, pfn, PG_LEVEL_NUM); - - WARN_ON(max_mapping_level < iter.level); - /* - * If this page is already mapped at the highest - * viable level, there's nothing more to do. + * If iter.gfn resides outside of the slot, i.e. the page for + * the current level overlaps but is not contained by the slot, + * then the SPTE can't be made huge. More importantly, trying + * to query that info from slot->arch.lpage_info will cause an + * out-of-bounds access. */ - if (max_mapping_level == iter.level) + if (iter.gfn < start || iter.gfn >= end) continue; - /* - * The page can be remapped at a higher level, so step - * up to zap the parent SPTE. - */ - while (max_mapping_level > iter.level) - tdp_iter_step_up(&iter); + max_mapping_level = kvm_mmu_max_mapping_level(kvm, slot, + iter.gfn, PG_LEVEL_NUM); + if (max_mapping_level < iter.level) + continue; /* Note, a successful atomic zap also does a remote TLB flush. */ - tdp_mmu_zap_spte_atomic(kvm, &iter); - - /* - * If the atomic zap fails, the iter will recurse back into - * the same subtree to retry. - */ + if (tdp_mmu_zap_spte_atomic(kvm, &iter)) + goto retry; } rcu_read_unlock(); } /* - * Clear non-leaf entries (and free associated page tables) which could - * be replaced by large mappings, for GFNs within the slot. + * Zap non-leaf SPTEs (and free their associated page tables) which could + * be replaced by huge pages, for GFNs within the slot. */ void kvm_tdp_mmu_zap_collapsible_sptes(struct kvm *kvm, const struct kvm_memory_slot *slot) diff --git a/arch/x86/kvm/pmu.c b/arch/x86/kvm/pmu.c index 3f868fed9114..02f9e4f245bd 100644 --- a/arch/x86/kvm/pmu.c +++ b/arch/x86/kvm/pmu.c @@ -16,6 +16,7 @@ #include <linux/bsearch.h> #include <linux/sort.h> #include <asm/perf_event.h> +#include <asm/cpu_device_id.h> #include "x86.h" #include "cpuid.h" #include "lapic.h" @@ -24,6 +25,15 @@ /* This is enough to filter the vast majority of currently defined events. */ #define KVM_PMU_EVENT_FILTER_MAX_EVENTS 300 +struct x86_pmu_capability __read_mostly kvm_pmu_cap; +EXPORT_SYMBOL_GPL(kvm_pmu_cap); + +static const struct x86_cpu_id vmx_icl_pebs_cpu[] = { + X86_MATCH_INTEL_FAM6_MODEL(ICELAKE_D, NULL), + X86_MATCH_INTEL_FAM6_MODEL(ICELAKE_X, NULL), + {} +}; + /* NOTE: * - Each perf counter is defined as "struct kvm_pmc"; * - There are two types of perf counters: general purpose (gp) and fixed. @@ -34,7 +44,9 @@ * However AMD doesn't support fixed-counters; * - There are three types of index to access perf counters (PMC): * 1. MSR (named msr): For example Intel has MSR_IA32_PERFCTRn and AMD - * has MSR_K7_PERFCTRn. + * has MSR_K7_PERFCTRn and, for families 15H and later, + * MSR_F15H_PERF_CTRn, where MSR_F15H_PERF_CTR[0-3] are + * aliased to MSR_K7_PERFCTRn. * 2. MSR Index (named idx): This normally is used by RDPMC instruction. * For instance AMD RDPMC instruction uses 0000_0003h in ECX to access * C001_0007h (MSR_K7_PERCTR3). Intel has a similar mechanism, except @@ -46,7 +58,8 @@ * between pmc and perf counters is as the following: * * Intel: [0 .. INTEL_PMC_MAX_GENERIC-1] <=> gp counters * [INTEL_PMC_IDX_FIXED .. INTEL_PMC_IDX_FIXED + 2] <=> fixed - * * AMD: [0 .. AMD64_NUM_COUNTERS-1] <=> gp counters + * * AMD: [0 .. AMD64_NUM_COUNTERS-1] and, for families 15H + * and later, [0 .. AMD64_NUM_COUNTERS_CORE-1] <=> gp counters */ static struct kvm_pmu_ops kvm_pmu_ops __read_mostly; @@ -86,15 +99,22 @@ static void kvm_pmi_trigger_fn(struct irq_work *irq_work) static inline void __kvm_perf_overflow(struct kvm_pmc *pmc, bool in_pmi) { struct kvm_pmu *pmu = pmc_to_pmu(pmc); + bool skip_pmi = false; /* Ignore counters that have been reprogrammed already. */ if (test_and_set_bit(pmc->idx, pmu->reprogram_pmi)) return; - __set_bit(pmc->idx, (unsigned long *)&pmu->global_status); + if (pmc->perf_event && pmc->perf_event->attr.precise_ip) { + /* Indicate PEBS overflow PMI to guest. */ + skip_pmi = __test_and_set_bit(GLOBAL_STATUS_BUFFER_OVF_BIT, + (unsigned long *)&pmu->global_status); + } else { + __set_bit(pmc->idx, (unsigned long *)&pmu->global_status); + } kvm_make_request(KVM_REQ_PMU, pmc->vcpu); - if (!pmc->intr) + if (!pmc->intr || skip_pmi) return; /* @@ -124,6 +144,7 @@ static void pmc_reprogram_counter(struct kvm_pmc *pmc, u32 type, u64 config, bool exclude_user, bool exclude_kernel, bool intr) { + struct kvm_pmu *pmu = pmc_to_pmu(pmc); struct perf_event *event; struct perf_event_attr attr = { .type = type, @@ -135,9 +156,7 @@ static void pmc_reprogram_counter(struct kvm_pmc *pmc, u32 type, .exclude_kernel = exclude_kernel, .config = config, }; - - if (type == PERF_TYPE_HARDWARE && config >= PERF_COUNT_HW_MAX) - return; + bool pebs = test_bit(pmc->idx, (unsigned long *)&pmu->pebs_enable); attr.sample_period = get_sample_period(pmc, pmc->counter); @@ -150,6 +169,25 @@ static void pmc_reprogram_counter(struct kvm_pmc *pmc, u32 type, */ attr.sample_period = 0; } + if (pebs) { + /* + * The non-zero precision level of guest event makes the ordinary + * guest event becomes a guest PEBS event and triggers the host + * PEBS PMI handler to determine whether the PEBS overflow PMI + * comes from the host counters or the guest. + * + * For most PEBS hardware events, the difference in the software + * precision levels of guest and host PEBS events will not affect + * the accuracy of the PEBS profiling result, because the "event IP" + * in the PEBS record is calibrated on the guest side. + * + * On Icelake everything is fine. Other hardware (GLC+, TNT+) that + * could possibly care here is unsupported and needs changes. + */ + attr.precise_ip = 1; + if (x86_match_cpu(vmx_icl_pebs_cpu) && pmc->idx == 32) + attr.precise_ip = 3; + } event = perf_event_create_kernel_counter(&attr, -1, current, kvm_perf_overflow, pmc); @@ -163,7 +201,7 @@ static void pmc_reprogram_counter(struct kvm_pmc *pmc, u32 type, pmc_to_pmu(pmc)->event_count++; clear_bit(pmc->idx, pmc_to_pmu(pmc)->reprogram_pmi); pmc->is_paused = false; - pmc->intr = intr; + pmc->intr = intr || pebs; } static void pmc_pause_counter(struct kvm_pmc *pmc) @@ -189,6 +227,10 @@ static bool pmc_resume_counter(struct kvm_pmc *pmc) get_sample_period(pmc, pmc->counter))) return false; + if (!test_bit(pmc->idx, (unsigned long *)&pmc_to_pmu(pmc)->pebs_enable) && + pmc->perf_event->attr.precise_ip) + return false; + /* reuse perf_event to serve as pmc_reprogram_counter() does*/ perf_event_enable(pmc->perf_event); pmc->is_paused = false; @@ -205,115 +247,83 @@ static int cmp_u64(const void *pa, const void *pb) return (a > b) - (a < b); } -void reprogram_gp_counter(struct kvm_pmc *pmc, u64 eventsel) +static bool check_pmu_event_filter(struct kvm_pmc *pmc) { - u64 config; - u32 type = PERF_TYPE_RAW; - struct kvm *kvm = pmc->vcpu->kvm; struct kvm_pmu_event_filter *filter; - struct kvm_pmu *pmu = vcpu_to_pmu(pmc->vcpu); + struct kvm *kvm = pmc->vcpu->kvm; bool allow_event = true; + __u64 key; + int idx; - if (eventsel & ARCH_PERFMON_EVENTSEL_PIN_CONTROL) - printk_once("kvm pmu: pin control bit is ignored\n"); - - pmc->eventsel = eventsel; - - pmc_pause_counter(pmc); - - if (!(eventsel & ARCH_PERFMON_EVENTSEL_ENABLE) || !pmc_is_enabled(pmc)) - return; + if (!static_call(kvm_x86_pmu_hw_event_available)(pmc)) + return false; filter = srcu_dereference(kvm->arch.pmu_event_filter, &kvm->srcu); - if (filter) { - __u64 key = eventsel & AMD64_RAW_EVENT_MASK_NB; + if (!filter) + goto out; + if (pmc_is_gp(pmc)) { + key = pmc->eventsel & AMD64_RAW_EVENT_MASK_NB; if (bsearch(&key, filter->events, filter->nevents, sizeof(__u64), cmp_u64)) allow_event = filter->action == KVM_PMU_EVENT_ALLOW; else allow_event = filter->action == KVM_PMU_EVENT_DENY; + } else { + idx = pmc->idx - INTEL_PMC_IDX_FIXED; + if (filter->action == KVM_PMU_EVENT_DENY && + test_bit(idx, (ulong *)&filter->fixed_counter_bitmap)) + allow_event = false; + if (filter->action == KVM_PMU_EVENT_ALLOW && + !test_bit(idx, (ulong *)&filter->fixed_counter_bitmap)) + allow_event = false; } - if (!allow_event) - return; - - if (!(eventsel & (ARCH_PERFMON_EVENTSEL_EDGE | - ARCH_PERFMON_EVENTSEL_INV | - ARCH_PERFMON_EVENTSEL_CMASK | - HSW_IN_TX | - HSW_IN_TX_CHECKPOINTED))) { - config = static_call(kvm_x86_pmu_pmc_perf_hw_id)(pmc); - if (config != PERF_COUNT_HW_MAX) - type = PERF_TYPE_HARDWARE; - } - - if (type == PERF_TYPE_RAW) - config = eventsel & pmu->raw_event_mask; - - if (pmc->current_config == eventsel && pmc_resume_counter(pmc)) - return; - - pmc_release_perf_event(pmc); - pmc->current_config = eventsel; - pmc_reprogram_counter(pmc, type, config, - !(eventsel & ARCH_PERFMON_EVENTSEL_USR), - !(eventsel & ARCH_PERFMON_EVENTSEL_OS), - eventsel & ARCH_PERFMON_EVENTSEL_INT); +out: + return allow_event; } -EXPORT_SYMBOL_GPL(reprogram_gp_counter); -void reprogram_fixed_counter(struct kvm_pmc *pmc, u8 ctrl, int idx) +void reprogram_counter(struct kvm_pmc *pmc) { - unsigned en_field = ctrl & 0x3; - bool pmi = ctrl & 0x8; - struct kvm_pmu_event_filter *filter; - struct kvm *kvm = pmc->vcpu->kvm; + struct kvm_pmu *pmu = pmc_to_pmu(pmc); + u64 eventsel = pmc->eventsel; + u64 new_config = eventsel; + u8 fixed_ctr_ctrl; pmc_pause_counter(pmc); - if (!en_field || !pmc_is_enabled(pmc)) + if (!pmc_speculative_in_use(pmc) || !pmc_is_enabled(pmc)) return; - filter = srcu_dereference(kvm->arch.pmu_event_filter, &kvm->srcu); - if (filter) { - if (filter->action == KVM_PMU_EVENT_DENY && - test_bit(idx, (ulong *)&filter->fixed_counter_bitmap)) - return; - if (filter->action == KVM_PMU_EVENT_ALLOW && - !test_bit(idx, (ulong *)&filter->fixed_counter_bitmap)) - return; - } - - if (pmc->current_config == (u64)ctrl && pmc_resume_counter(pmc)) + if (!check_pmu_event_filter(pmc)) return; - pmc_release_perf_event(pmc); - - pmc->current_config = (u64)ctrl; - pmc_reprogram_counter(pmc, PERF_TYPE_HARDWARE, - static_call(kvm_x86_pmu_pmc_perf_hw_id)(pmc), - !(en_field & 0x2), /* exclude user */ - !(en_field & 0x1), /* exclude kernel */ - pmi); -} -EXPORT_SYMBOL_GPL(reprogram_fixed_counter); + if (eventsel & ARCH_PERFMON_EVENTSEL_PIN_CONTROL) + printk_once("kvm pmu: pin control bit is ignored\n"); -void reprogram_counter(struct kvm_pmu *pmu, int pmc_idx) -{ - struct kvm_pmc *pmc = static_call(kvm_x86_pmu_pmc_idx_to_pmc)(pmu, pmc_idx); + if (pmc_is_fixed(pmc)) { + fixed_ctr_ctrl = fixed_ctrl_field(pmu->fixed_ctr_ctrl, + pmc->idx - INTEL_PMC_IDX_FIXED); + if (fixed_ctr_ctrl & 0x1) + eventsel |= ARCH_PERFMON_EVENTSEL_OS; + if (fixed_ctr_ctrl & 0x2) + eventsel |= ARCH_PERFMON_EVENTSEL_USR; + if (fixed_ctr_ctrl & 0x8) + eventsel |= ARCH_PERFMON_EVENTSEL_INT; + new_config = (u64)fixed_ctr_ctrl; + } - if (!pmc) + if (pmc->current_config == new_config && pmc_resume_counter(pmc)) return; - if (pmc_is_gp(pmc)) - reprogram_gp_counter(pmc, pmc->eventsel); - else { - int idx = pmc_idx - INTEL_PMC_IDX_FIXED; - u8 ctrl = fixed_ctrl_field(pmu->fixed_ctr_ctrl, idx); + pmc_release_perf_event(pmc); - reprogram_fixed_counter(pmc, ctrl, idx); - } + pmc->current_config = new_config; + pmc_reprogram_counter(pmc, PERF_TYPE_RAW, + (eventsel & pmu->raw_event_mask), + !(eventsel & ARCH_PERFMON_EVENTSEL_USR), + !(eventsel & ARCH_PERFMON_EVENTSEL_OS), + eventsel & ARCH_PERFMON_EVENTSEL_INT); } EXPORT_SYMBOL_GPL(reprogram_counter); @@ -329,8 +339,7 @@ void kvm_pmu_handle_event(struct kvm_vcpu *vcpu) clear_bit(bit, pmu->reprogram_pmi); continue; } - - reprogram_counter(pmu, bit); + reprogram_counter(pmc); } /* @@ -471,17 +480,6 @@ void kvm_pmu_init(struct kvm_vcpu *vcpu) kvm_pmu_refresh(vcpu); } -static inline bool pmc_speculative_in_use(struct kvm_pmc *pmc) -{ - struct kvm_pmu *pmu = pmc_to_pmu(pmc); - - if (pmc_is_fixed(pmc)) - return fixed_ctrl_field(pmu->fixed_ctr_ctrl, - pmc->idx - INTEL_PMC_IDX_FIXED) & 0x3; - - return pmc->eventsel & ARCH_PERFMON_EVENTSEL_ENABLE; -} - /* Release perf_events for vPMCs that have been unused for a full time slice. */ void kvm_pmu_cleanup(struct kvm_vcpu *vcpu) { @@ -514,13 +512,12 @@ void kvm_pmu_destroy(struct kvm_vcpu *vcpu) static void kvm_pmu_incr_counter(struct kvm_pmc *pmc) { - struct kvm_pmu *pmu = pmc_to_pmu(pmc); u64 prev_count; prev_count = pmc->counter; pmc->counter = (pmc->counter + 1) & pmc_bitmask(pmc); - reprogram_counter(pmu, pmc->idx); + reprogram_counter(pmc); if (pmc->counter < prev_count) __kvm_perf_overflow(pmc, false); } @@ -528,13 +525,8 @@ static void kvm_pmu_incr_counter(struct kvm_pmc *pmc) static inline bool eventsel_match_perf_hw_id(struct kvm_pmc *pmc, unsigned int perf_hw_id) { - u64 old_eventsel = pmc->eventsel; - unsigned int config; - - pmc->eventsel &= (ARCH_PERFMON_EVENTSEL_EVENT | ARCH_PERFMON_EVENTSEL_UMASK); - config = static_call(kvm_x86_pmu_pmc_perf_hw_id)(pmc); - pmc->eventsel = old_eventsel; - return config == perf_hw_id; + return !((pmc->eventsel ^ perf_get_hw_event_config(perf_hw_id)) & + AMD64_RAW_EVENT_MASK_NB); } static inline bool cpl_is_matched(struct kvm_pmc *pmc) diff --git a/arch/x86/kvm/pmu.h b/arch/x86/kvm/pmu.h index e745f443b6a8..5cc5721f260b 100644 --- a/arch/x86/kvm/pmu.h +++ b/arch/x86/kvm/pmu.h @@ -8,6 +8,9 @@ #define pmu_to_vcpu(pmu) (container_of((pmu), struct kvm_vcpu, arch.pmu)) #define pmc_to_pmu(pmc) (&(pmc)->vcpu->arch.pmu) +#define MSR_IA32_MISC_ENABLE_PMU_RO_MASK (MSR_IA32_MISC_ENABLE_PEBS_UNAVAIL | \ + MSR_IA32_MISC_ENABLE_BTS_UNAVAIL) + /* retrieve the 4 bits for EN and PMI out of IA32_FIXED_CTR_CTRL */ #define fixed_ctrl_field(ctrl_reg, idx) (((ctrl_reg) >> ((idx)*4)) & 0xf) @@ -22,7 +25,7 @@ struct kvm_event_hw_type_mapping { }; struct kvm_pmu_ops { - unsigned int (*pmc_perf_hw_id)(struct kvm_pmc *pmc); + bool (*hw_event_available)(struct kvm_pmc *pmc); bool (*pmc_is_enabled)(struct kvm_pmc *pmc); struct kvm_pmc *(*pmc_idx_to_pmc)(struct kvm_pmu *pmu, int pmc_idx); struct kvm_pmc *(*rdpmc_ecx_to_pmc)(struct kvm_vcpu *vcpu, @@ -144,9 +147,43 @@ static inline void pmc_update_sample_period(struct kvm_pmc *pmc) get_sample_period(pmc, pmc->counter)); } -void reprogram_gp_counter(struct kvm_pmc *pmc, u64 eventsel); -void reprogram_fixed_counter(struct kvm_pmc *pmc, u8 ctrl, int fixed_idx); -void reprogram_counter(struct kvm_pmu *pmu, int pmc_idx); +static inline bool pmc_speculative_in_use(struct kvm_pmc *pmc) +{ + struct kvm_pmu *pmu = pmc_to_pmu(pmc); + + if (pmc_is_fixed(pmc)) + return fixed_ctrl_field(pmu->fixed_ctr_ctrl, + pmc->idx - INTEL_PMC_IDX_FIXED) & 0x3; + + return pmc->eventsel & ARCH_PERFMON_EVENTSEL_ENABLE; +} + +extern struct x86_pmu_capability kvm_pmu_cap; + +static inline void kvm_init_pmu_capability(void) +{ + bool is_intel = boot_cpu_data.x86_vendor == X86_VENDOR_INTEL; + + perf_get_x86_pmu_capability(&kvm_pmu_cap); + + /* + * For Intel, only support guest architectural pmu + * on a host with architectural pmu. + */ + if ((is_intel && !kvm_pmu_cap.version) || !kvm_pmu_cap.num_counters_gp) + enable_pmu = false; + + if (!enable_pmu) { + memset(&kvm_pmu_cap, 0, sizeof(kvm_pmu_cap)); + return; + } + + kvm_pmu_cap.version = min(kvm_pmu_cap.version, 2); + kvm_pmu_cap.num_counters_fixed = min(kvm_pmu_cap.num_counters_fixed, + KVM_PMC_MAX_FIXED); +} + +void reprogram_counter(struct kvm_pmc *pmc); void kvm_pmu_deliver_pmi(struct kvm_vcpu *vcpu); void kvm_pmu_handle_event(struct kvm_vcpu *vcpu); diff --git a/arch/x86/kvm/svm/avic.c b/arch/x86/kvm/svm/avic.c index d1bc5820ea46..6919dee69f18 100644 --- a/arch/x86/kvm/svm/avic.c +++ b/arch/x86/kvm/svm/avic.c @@ -40,6 +40,9 @@ #define AVIC_GATAG_TO_VMID(x) ((x >> AVIC_VCPU_ID_BITS) & AVIC_VM_ID_MASK) #define AVIC_GATAG_TO_VCPUID(x) (x & AVIC_VCPU_ID_MASK) +static bool force_avic; +module_param_unsafe(force_avic, bool, 0444); + /* Note: * This hash table is used to map VM_ID to a struct kvm_svm, * when handling AMD IOMMU GALOG notification to schedule in @@ -50,6 +53,7 @@ static DEFINE_HASHTABLE(svm_vm_data_hash, SVM_VM_DATA_HASH_BITS); static u32 next_vm_id = 0; static bool next_vm_id_wrapped = 0; static DEFINE_SPINLOCK(svm_vm_data_hash_lock); +enum avic_modes avic_mode; /* * This is a wrapper of struct amd_iommu_ir_data. @@ -59,6 +63,54 @@ struct amd_svm_iommu_ir { void *data; /* Storing pointer to struct amd_ir_data */ }; +static void avic_activate_vmcb(struct vcpu_svm *svm) +{ + struct vmcb *vmcb = svm->vmcb01.ptr; + + vmcb->control.int_ctl &= ~(AVIC_ENABLE_MASK | X2APIC_MODE_MASK); + vmcb->control.avic_physical_id &= ~AVIC_PHYSICAL_MAX_INDEX_MASK; + + vmcb->control.int_ctl |= AVIC_ENABLE_MASK; + + /* Note: + * KVM can support hybrid-AVIC mode, where KVM emulates x2APIC + * MSR accesses, while interrupt injection to a running vCPU + * can be achieved using AVIC doorbell. The AVIC hardware still + * accelerate MMIO accesses, but this does not cause any harm + * as the guest is not supposed to access xAPIC mmio when uses x2APIC. + */ + if (apic_x2apic_mode(svm->vcpu.arch.apic) && + avic_mode == AVIC_MODE_X2) { + vmcb->control.int_ctl |= X2APIC_MODE_MASK; + vmcb->control.avic_physical_id |= X2AVIC_MAX_PHYSICAL_ID; + /* Disabling MSR intercept for x2APIC registers */ + svm_set_x2apic_msr_interception(svm, false); + } else { + /* For xAVIC and hybrid-xAVIC modes */ + vmcb->control.avic_physical_id |= AVIC_MAX_PHYSICAL_ID; + /* Enabling MSR intercept for x2APIC registers */ + svm_set_x2apic_msr_interception(svm, true); + } +} + +static void avic_deactivate_vmcb(struct vcpu_svm *svm) +{ + struct vmcb *vmcb = svm->vmcb01.ptr; + + vmcb->control.int_ctl &= ~(AVIC_ENABLE_MASK | X2APIC_MODE_MASK); + vmcb->control.avic_physical_id &= ~AVIC_PHYSICAL_MAX_INDEX_MASK; + + /* + * If running nested and the guest uses its own MSR bitmap, there + * is no need to update L0's msr bitmap + */ + if (is_guest_mode(&svm->vcpu) && + vmcb12_is_intercept(&svm->nested.ctl, INTERCEPT_MSR_PROT)) + return; + + /* Enabling MSR intercept for x2APIC registers */ + svm_set_x2apic_msr_interception(svm, true); +} /* Note: * This function is called from IOMMU driver to notify @@ -175,13 +227,12 @@ void avic_init_vmcb(struct vcpu_svm *svm, struct vmcb *vmcb) vmcb->control.avic_backing_page = bpa & AVIC_HPA_MASK; vmcb->control.avic_logical_id = lpa & AVIC_HPA_MASK; vmcb->control.avic_physical_id = ppa & AVIC_HPA_MASK; - vmcb->control.avic_physical_id |= AVIC_MAX_PHYSICAL_ID_COUNT; vmcb->control.avic_vapic_bar = APIC_DEFAULT_PHYS_BASE & VMCB_AVIC_APIC_BAR_MASK; if (kvm_apicv_activated(svm->vcpu.kvm)) - vmcb->control.int_ctl |= AVIC_ENABLE_MASK; + avic_activate_vmcb(svm); else - vmcb->control.int_ctl &= ~AVIC_ENABLE_MASK; + avic_deactivate_vmcb(svm); } static u64 *avic_get_physical_id_entry(struct kvm_vcpu *vcpu, @@ -190,7 +241,8 @@ static u64 *avic_get_physical_id_entry(struct kvm_vcpu *vcpu, u64 *avic_physical_id_table; struct kvm_svm *kvm_svm = to_kvm_svm(vcpu->kvm); - if (index >= AVIC_MAX_PHYSICAL_ID_COUNT) + if ((avic_mode == AVIC_MODE_X1 && index > AVIC_MAX_PHYSICAL_ID) || + (avic_mode == AVIC_MODE_X2 && index > X2AVIC_MAX_PHYSICAL_ID)) return NULL; avic_physical_id_table = page_address(kvm_svm->avic_physical_id_table_page); @@ -237,7 +289,8 @@ static int avic_init_backing_page(struct kvm_vcpu *vcpu) int id = vcpu->vcpu_id; struct vcpu_svm *svm = to_svm(vcpu); - if (id >= AVIC_MAX_PHYSICAL_ID_COUNT) + if ((avic_mode == AVIC_MODE_X1 && id > AVIC_MAX_PHYSICAL_ID) || + (avic_mode == AVIC_MODE_X2 && id > X2AVIC_MAX_PHYSICAL_ID)) return -EINVAL; if (!vcpu->arch.apic->regs) @@ -279,8 +332,10 @@ void avic_ring_doorbell(struct kvm_vcpu *vcpu) */ int cpu = READ_ONCE(vcpu->cpu); - if (cpu != get_cpu()) + if (cpu != get_cpu()) { wrmsrl(MSR_AMD64_SVM_AVIC_DOORBELL, kvm_cpu_get_apicid(cpu)); + trace_kvm_avic_doorbell(vcpu->vcpu_id, kvm_cpu_get_apicid(cpu)); + } put_cpu(); } @@ -303,7 +358,7 @@ static int avic_kick_target_vcpus_fast(struct kvm *kvm, struct kvm_lapic *source if (apic_x2apic_mode(source)) dest = icrh; else - dest = GET_APIC_DEST_FIELD(icrh); + dest = GET_XAPIC_DEST_FIELD(icrh); if (dest_mode == APIC_DEST_PHYSICAL) { /* broadcast destination, use slow path */ @@ -345,9 +400,7 @@ static int avic_kick_target_vcpus_fast(struct kvm *kvm, struct kvm_lapic *source logid_index = cluster + __ffs(bitmap); - if (apic_x2apic_mode(source)) { - l1_physical_id = logid_index; - } else { + if (!apic_x2apic_mode(source)) { u32 *avic_logical_id_table = page_address(kvm_svm->avic_logical_id_table_page); @@ -362,6 +415,23 @@ static int avic_kick_target_vcpus_fast(struct kvm *kvm, struct kvm_lapic *source l1_physical_id = logid_entry & AVIC_LOGICAL_ID_ENTRY_GUEST_PHYSICAL_ID_MASK; + } else { + /* + * For x2APIC logical mode, cannot leverage the index. + * Instead, calculate physical ID from logical ID in ICRH. + */ + int cluster = (icrh & 0xffff0000) >> 16; + int apic = ffs(icrh & 0xffff) - 1; + + /* + * If the x2APIC logical ID sub-field (i.e. icrh[15:0]) + * contains anything but a single bit, we cannot use the + * fast path, because it is limited to a single vCPU. + */ + if (apic < 0 || icrh != (1 << apic)) + return -EINVAL; + + l1_physical_id = (cluster << 4) + apic; } } @@ -396,9 +466,15 @@ static void avic_kick_target_vcpus(struct kvm *kvm, struct kvm_lapic *source, * since entered the guest will have processed pending IRQs at VMRUN. */ kvm_for_each_vcpu(i, vcpu, kvm) { + u32 dest; + + if (apic_x2apic_mode(vcpu->arch.apic)) + dest = icrh; + else + dest = GET_XAPIC_DEST_FIELD(icrh); + if (kvm_apic_match_dest(vcpu, source, icrl & APIC_SHORT_MASK, - GET_APIC_DEST_FIELD(icrh), - icrl & APIC_DEST_MASK)) { + dest, icrl & APIC_DEST_MASK)) { vcpu->arch.apic->irr_pending = true; svm_complete_interrupt_delivery(vcpu, icrl & APIC_MODE_MASK, @@ -514,8 +590,13 @@ static void avic_invalidate_logical_id_entry(struct kvm_vcpu *vcpu) { struct vcpu_svm *svm = to_svm(vcpu); bool flat = svm->dfr_reg == APIC_DFR_FLAT; - u32 *entry = avic_get_logical_id_entry(vcpu, svm->ldr_reg, flat); + u32 *entry; + + /* Note: x2AVIC does not use logical APIC ID table */ + if (apic_x2apic_mode(vcpu->arch.apic)) + return; + entry = avic_get_logical_id_entry(vcpu, svm->ldr_reg, flat); if (entry) clear_bit(AVIC_LOGICAL_ID_ENTRY_VALID_BIT, (unsigned long *)entry); } @@ -527,6 +608,10 @@ static int avic_handle_ldr_update(struct kvm_vcpu *vcpu) u32 ldr = kvm_lapic_get_reg(vcpu->arch.apic, APIC_LDR); u32 id = kvm_xapic_id(vcpu->arch.apic); + /* AVIC does not support LDR update for x2APIC */ + if (apic_x2apic_mode(vcpu->arch.apic)) + return 0; + if (ldr == svm->ldr_reg) return 0; @@ -654,6 +739,18 @@ void avic_apicv_post_state_restore(struct kvm_vcpu *vcpu) avic_handle_ldr_update(vcpu); } +void avic_set_virtual_apic_mode(struct kvm_vcpu *vcpu) +{ + if (!lapic_in_kernel(vcpu) || avic_mode == AVIC_MODE_NONE) + return; + + if (kvm_get_apic_mode(vcpu) == LAPIC_MODE_INVALID) { + WARN_ONCE(true, "Invalid local APIC state (vcpu_id=%d)", vcpu->vcpu_id); + return; + } + avic_refresh_apicv_exec_ctrl(vcpu); +} + static int avic_set_pi_irte_mode(struct kvm_vcpu *vcpu, bool activate) { int ret = 0; @@ -906,7 +1003,6 @@ bool avic_check_apicv_inhibit_reasons(enum kvm_apicv_inhibit reason) BIT(APICV_INHIBIT_REASON_NESTED) | BIT(APICV_INHIBIT_REASON_IRQWIN) | BIT(APICV_INHIBIT_REASON_PIT_REINJ) | - BIT(APICV_INHIBIT_REASON_X2APIC) | BIT(APICV_INHIBIT_REASON_BLOCKIRQ) | BIT(APICV_INHIBIT_REASON_SEV) | BIT(APICV_INHIBIT_REASON_APIC_ID_MODIFIED) | @@ -968,7 +1064,6 @@ void avic_vcpu_load(struct kvm_vcpu *vcpu, int cpu) return; entry = READ_ONCE(*(svm->avic_physical_id_cache)); - WARN_ON(entry & AVIC_PHYSICAL_ID_ENTRY_IS_RUNNING_MASK); entry &= ~AVIC_PHYSICAL_ID_ENTRY_HOST_PHYSICAL_ID_MASK; entry |= (h_physical_id & AVIC_PHYSICAL_ID_ENTRY_HOST_PHYSICAL_ID_MASK); @@ -1016,9 +1111,9 @@ void avic_refresh_apicv_exec_ctrl(struct kvm_vcpu *vcpu) * accordingly before re-activating. */ avic_apicv_post_state_restore(vcpu); - vmcb->control.int_ctl |= AVIC_ENABLE_MASK; + avic_activate_vmcb(svm); } else { - vmcb->control.int_ctl &= ~AVIC_ENABLE_MASK; + avic_deactivate_vmcb(svm); } vmcb_mark_dirty(vmcb, VMCB_AVIC); @@ -1058,3 +1153,44 @@ void avic_vcpu_unblocking(struct kvm_vcpu *vcpu) avic_vcpu_load(vcpu, vcpu->cpu); } + +/* + * Note: + * - The module param avic enable both xAPIC and x2APIC mode. + * - Hypervisor can support both xAVIC and x2AVIC in the same guest. + * - The mode can be switched at run-time. + */ +bool avic_hardware_setup(struct kvm_x86_ops *x86_ops) +{ + if (!npt_enabled) + return false; + + if (boot_cpu_has(X86_FEATURE_AVIC)) { + avic_mode = AVIC_MODE_X1; + pr_info("AVIC enabled\n"); + } else if (force_avic) { + /* + * Some older systems does not advertise AVIC support. + * See Revision Guide for specific AMD processor for more detail. + */ + avic_mode = AVIC_MODE_X1; + pr_warn("AVIC is not supported in CPUID but force enabled"); + pr_warn("Your system might crash and burn"); + } + + /* AVIC is a prerequisite for x2AVIC. */ + if (boot_cpu_has(X86_FEATURE_X2AVIC)) { + if (avic_mode == AVIC_MODE_X1) { + avic_mode = AVIC_MODE_X2; + pr_info("x2AVIC enabled\n"); + } else { + pr_warn(FW_BUG "Cannot support x2AVIC due to AVIC is disabled"); + pr_warn(FW_BUG "Try enable AVIC using force_avic option"); + } + } + + if (avic_mode != AVIC_MODE_NONE) + amd_iommu_register_ga_log_notifier(&avic_ga_log_notifier); + + return !!avic_mode; +} diff --git a/arch/x86/kvm/svm/nested.c b/arch/x86/kvm/svm/nested.c index ba7cd26f438f..76dcc8a3e849 100644 --- a/arch/x86/kvm/svm/nested.c +++ b/arch/x86/kvm/svm/nested.c @@ -230,6 +230,11 @@ static bool nested_svm_vmrun_msrpm(struct vcpu_svm *svm) break; p = msrpm_offsets[i]; + + /* x2apic msrs are intercepted always for the nested guest */ + if (is_x2apic_msrpm_offset(p)) + continue; + offset = svm->nested.ctl.msrpm_base_pa + (p * 4); if (kvm_vcpu_read_guest(&svm->vcpu, offset, &value, 4)) @@ -320,7 +325,8 @@ static bool __nested_vmcb_check_save(struct kvm_vcpu *vcpu, return false; } - if (CC(!kvm_is_valid_cr4(vcpu, save->cr4))) + /* Note, SVM doesn't have any additional restrictions on CR4. */ + if (CC(!__kvm_is_valid_cr4(vcpu, save->cr4))) return false; if (CC(!kvm_valid_efer(vcpu, save->efer))) @@ -371,6 +377,7 @@ void __nested_copy_vmcb_control_to_cache(struct kvm_vcpu *vcpu, to->nested_ctl = from->nested_ctl; to->event_inj = from->event_inj; to->event_inj_err = from->event_inj_err; + to->next_rip = from->next_rip; to->nested_cr3 = from->nested_cr3; to->virt_ext = from->virt_ext; to->pause_filter_count = from->pause_filter_count; @@ -608,7 +615,33 @@ static void nested_vmcb02_prepare_save(struct vcpu_svm *svm, struct vmcb *vmcb12 } } -static void nested_vmcb02_prepare_control(struct vcpu_svm *svm) +static inline bool is_evtinj_soft(u32 evtinj) +{ + u32 type = evtinj & SVM_EVTINJ_TYPE_MASK; + u8 vector = evtinj & SVM_EVTINJ_VEC_MASK; + + if (!(evtinj & SVM_EVTINJ_VALID)) + return false; + + if (type == SVM_EVTINJ_TYPE_SOFT) + return true; + + return type == SVM_EVTINJ_TYPE_EXEPT && kvm_exception_is_soft(vector); +} + +static bool is_evtinj_nmi(u32 evtinj) +{ + u32 type = evtinj & SVM_EVTINJ_TYPE_MASK; + + if (!(evtinj & SVM_EVTINJ_VALID)) + return false; + + return type == SVM_EVTINJ_TYPE_NMI; +} + +static void nested_vmcb02_prepare_control(struct vcpu_svm *svm, + unsigned long vmcb12_rip, + unsigned long vmcb12_csbase) { u32 int_ctl_vmcb01_bits = V_INTR_MASKING_MASK; u32 int_ctl_vmcb12_bits = V_TPR_MASK | V_IRQ_INJECTION_BITS_MASK; @@ -650,7 +683,7 @@ static void nested_vmcb02_prepare_control(struct vcpu_svm *svm) vmcb02->control.tsc_offset = vcpu->arch.tsc_offset; - if (svm->tsc_ratio_msr != kvm_default_tsc_scaling_ratio) { + if (svm->tsc_ratio_msr != kvm_caps.default_tsc_scaling_ratio) { WARN_ON(!svm->tsc_scaling_enabled); nested_svm_update_tsc_ratio_msr(vcpu); } @@ -664,6 +697,30 @@ static void nested_vmcb02_prepare_control(struct vcpu_svm *svm) vmcb02->control.event_inj = svm->nested.ctl.event_inj; vmcb02->control.event_inj_err = svm->nested.ctl.event_inj_err; + /* + * next_rip is consumed on VMRUN as the return address pushed on the + * stack for injected soft exceptions/interrupts. If nrips is exposed + * to L1, take it verbatim from vmcb12. If nrips is supported in + * hardware but not exposed to L1, stuff the actual L2 RIP to emulate + * what a nrips=0 CPU would do (L1 is responsible for advancing RIP + * prior to injecting the event). + */ + if (svm->nrips_enabled) + vmcb02->control.next_rip = svm->nested.ctl.next_rip; + else if (boot_cpu_has(X86_FEATURE_NRIPS)) + vmcb02->control.next_rip = vmcb12_rip; + + svm->nmi_l1_to_l2 = is_evtinj_nmi(vmcb02->control.event_inj); + if (is_evtinj_soft(vmcb02->control.event_inj)) { + svm->soft_int_injected = true; + svm->soft_int_csbase = vmcb12_csbase; + svm->soft_int_old_rip = vmcb12_rip; + if (svm->nrips_enabled) + svm->soft_int_next_rip = svm->nested.ctl.next_rip; + else + svm->soft_int_next_rip = vmcb12_rip; + } + vmcb02->control.virt_ext = vmcb01->control.virt_ext & LBR_CTL_ENABLE_MASK; if (svm->lbrv_enabled) @@ -745,7 +802,7 @@ int enter_svm_guest_mode(struct kvm_vcpu *vcpu, u64 vmcb12_gpa, nested_svm_copy_common_state(svm->vmcb01.ptr, svm->nested.vmcb02.ptr); svm_switch_vmcb(svm, &svm->nested.vmcb02); - nested_vmcb02_prepare_control(svm); + nested_vmcb02_prepare_control(svm, vmcb12->save.rip, vmcb12->save.cs.base); nested_vmcb02_prepare_save(svm, vmcb12); ret = nested_svm_load_cr3(&svm->vcpu, svm->nested.save.cr3, @@ -834,6 +891,8 @@ int nested_svm_vmrun(struct kvm_vcpu *vcpu) out_exit_err: svm->nested.nested_run_pending = 0; + svm->nmi_l1_to_l2 = false; + svm->soft_int_injected = false; svm->vmcb->control.exit_code = SVM_EXIT_ERR; svm->vmcb->control.exit_code_hi = 0; @@ -982,7 +1041,7 @@ int nested_svm_vmexit(struct vcpu_svm *svm) vmcb_mark_dirty(vmcb01, VMCB_INTERCEPTS); } - if (svm->tsc_ratio_msr != kvm_default_tsc_scaling_ratio) { + if (svm->tsc_ratio_msr != kvm_caps.default_tsc_scaling_ratio) { WARN_ON(!svm->tsc_scaling_enabled); vcpu->arch.tsc_scaling_ratio = vcpu->arch.l1_tsc_scaling_ratio; __svm_write_tsc_multiplier(vcpu->arch.tsc_scaling_ratio); @@ -1421,6 +1480,7 @@ static void nested_copy_vmcb_cache_to_control(struct vmcb_control_area *dst, dst->nested_ctl = from->nested_ctl; dst->event_inj = from->event_inj; dst->event_inj_err = from->event_inj_err; + dst->next_rip = from->next_rip; dst->nested_cr3 = from->nested_cr3; dst->virt_ext = from->virt_ext; dst->pause_filter_count = from->pause_filter_count; @@ -1605,7 +1665,7 @@ static int svm_set_nested_state(struct kvm_vcpu *vcpu, nested_copy_vmcb_control_to_cache(svm, ctl); svm_switch_vmcb(svm, &svm->nested.vmcb02); - nested_vmcb02_prepare_control(svm); + nested_vmcb02_prepare_control(svm, svm->vmcb->save.rip, svm->vmcb->save.cs.base); /* * While the nested guest CR3 is already checked and set by diff --git a/arch/x86/kvm/svm/pmu.c b/arch/x86/kvm/svm/pmu.c index 136039fc6d01..f24613a108c5 100644 --- a/arch/x86/kvm/svm/pmu.c +++ b/arch/x86/kvm/svm/pmu.c @@ -33,34 +33,6 @@ enum index { INDEX_ERROR, }; -/* duplicated from amd_perfmon_event_map, K7 and above should work. */ -static struct kvm_event_hw_type_mapping amd_event_mapping[] = { - [0] = { 0x76, 0x00, PERF_COUNT_HW_CPU_CYCLES }, - [1] = { 0xc0, 0x00, PERF_COUNT_HW_INSTRUCTIONS }, - [2] = { 0x7d, 0x07, PERF_COUNT_HW_CACHE_REFERENCES }, - [3] = { 0x7e, 0x07, PERF_COUNT_HW_CACHE_MISSES }, - [4] = { 0xc2, 0x00, PERF_COUNT_HW_BRANCH_INSTRUCTIONS }, - [5] = { 0xc3, 0x00, PERF_COUNT_HW_BRANCH_MISSES }, - [6] = { 0xd0, 0x00, PERF_COUNT_HW_STALLED_CYCLES_FRONTEND }, - [7] = { 0xd1, 0x00, PERF_COUNT_HW_STALLED_CYCLES_BACKEND }, -}; - -/* duplicated from amd_f17h_perfmon_event_map. */ -static struct kvm_event_hw_type_mapping amd_f17h_event_mapping[] = { - [0] = { 0x76, 0x00, PERF_COUNT_HW_CPU_CYCLES }, - [1] = { 0xc0, 0x00, PERF_COUNT_HW_INSTRUCTIONS }, - [2] = { 0x60, 0xff, PERF_COUNT_HW_CACHE_REFERENCES }, - [3] = { 0x64, 0x09, PERF_COUNT_HW_CACHE_MISSES }, - [4] = { 0xc2, 0x00, PERF_COUNT_HW_BRANCH_INSTRUCTIONS }, - [5] = { 0xc3, 0x00, PERF_COUNT_HW_BRANCH_MISSES }, - [6] = { 0x87, 0x02, PERF_COUNT_HW_STALLED_CYCLES_FRONTEND }, - [7] = { 0x87, 0x01, PERF_COUNT_HW_STALLED_CYCLES_BACKEND }, -}; - -/* amd_pmc_perf_hw_id depends on these being the same size */ -static_assert(ARRAY_SIZE(amd_event_mapping) == - ARRAY_SIZE(amd_f17h_event_mapping)); - static unsigned int get_msr_base(struct kvm_pmu *pmu, enum pmu_type type) { struct kvm_vcpu *vcpu = pmu_to_vcpu(pmu); @@ -154,31 +126,9 @@ static inline struct kvm_pmc *get_gp_pmc_amd(struct kvm_pmu *pmu, u32 msr, return &pmu->gp_counters[msr_to_index(msr)]; } -static unsigned int amd_pmc_perf_hw_id(struct kvm_pmc *pmc) +static bool amd_hw_event_available(struct kvm_pmc *pmc) { - struct kvm_event_hw_type_mapping *event_mapping; - u8 event_select = pmc->eventsel & ARCH_PERFMON_EVENTSEL_EVENT; - u8 unit_mask = (pmc->eventsel & ARCH_PERFMON_EVENTSEL_UMASK) >> 8; - int i; - - /* return PERF_COUNT_HW_MAX as AMD doesn't have fixed events */ - if (WARN_ON(pmc_is_fixed(pmc))) - return PERF_COUNT_HW_MAX; - - if (guest_cpuid_family(pmc->vcpu) >= 0x17) - event_mapping = amd_f17h_event_mapping; - else - event_mapping = amd_event_mapping; - - for (i = 0; i < ARRAY_SIZE(amd_event_mapping); i++) - if (event_mapping[i].eventsel == event_select - && event_mapping[i].unit_mask == unit_mask) - break; - - if (i == ARRAY_SIZE(amd_event_mapping)) - return PERF_COUNT_HW_MAX; - - return event_mapping[i].event_type; + return true; } /* check if a PMC is enabled by comparing it against global_ctrl bits. Because @@ -286,8 +236,10 @@ static int amd_pmu_set_msr(struct kvm_vcpu *vcpu, struct msr_data *msr_info) pmc = get_gp_pmc_amd(pmu, msr, PMU_TYPE_EVNTSEL); if (pmc) { data &= ~pmu->reserved_bits; - if (data != pmc->eventsel) - reprogram_gp_counter(pmc, data); + if (data != pmc->eventsel) { + pmc->eventsel = data; + reprogram_counter(pmc); + } return 0; } @@ -343,7 +295,7 @@ static void amd_pmu_reset(struct kvm_vcpu *vcpu) } struct kvm_pmu_ops amd_pmu_ops __initdata = { - .pmc_perf_hw_id = amd_pmc_perf_hw_id, + .hw_event_available = amd_hw_event_available, .pmc_is_enabled = amd_pmc_is_enabled, .pmc_idx_to_pmc = amd_pmc_idx_to_pmc, .rdpmc_ecx_to_pmc = amd_rdpmc_ecx_to_pmc, diff --git a/arch/x86/kvm/svm/sev.c b/arch/x86/kvm/svm/sev.c index 0c240ed04f96..b0e793e7d85c 100644 --- a/arch/x86/kvm/svm/sev.c +++ b/arch/x86/kvm/svm/sev.c @@ -603,6 +603,9 @@ static int sev_es_sync_vmsa(struct vcpu_svm *svm) save->xss = svm->vcpu.arch.ia32_xss; save->dr6 = svm->vcpu.arch.dr6; + pr_debug("Virtual Machine Save Area (VMSA):\n"); + print_hex_dump(KERN_CONT, "", DUMP_PREFIX_NONE, 16, 1, save, sizeof(*save), false); + return 0; } @@ -1606,38 +1609,35 @@ static int sev_lock_vcpus_for_migration(struct kvm *kvm, { struct kvm_vcpu *vcpu; unsigned long i, j; - bool first = true; kvm_for_each_vcpu(i, vcpu, kvm) { if (mutex_lock_killable_nested(&vcpu->mutex, role)) goto out_unlock; - if (first) { +#ifdef CONFIG_PROVE_LOCKING + if (!i) /* * Reset the role to one that avoids colliding with * the role used for the first vcpu mutex. */ role = SEV_NR_MIGRATION_ROLES; - first = false; - } else { + else mutex_release(&vcpu->mutex.dep_map, _THIS_IP_); - } +#endif } return 0; out_unlock: - first = true; kvm_for_each_vcpu(j, vcpu, kvm) { if (i == j) break; - if (first) - first = false; - else +#ifdef CONFIG_PROVE_LOCKING + if (j) mutex_acquire(&vcpu->mutex.dep_map, role, 0, _THIS_IP_); - +#endif mutex_unlock(&vcpu->mutex); } diff --git a/arch/x86/kvm/svm/svm.c b/arch/x86/kvm/svm/svm.c index 44bbf25dfeb9..38f873cb6f2c 100644 --- a/arch/x86/kvm/svm/svm.c +++ b/arch/x86/kvm/svm/svm.c @@ -74,6 +74,8 @@ static uint64_t osvw_len = 4, osvw_status; static DEFINE_PER_CPU(u64, current_tsc_ratio); +#define X2APIC_MSR(x) (APIC_BASE_MSR + (x >> 4)) + static const struct svm_direct_access_msrs { u32 index; /* Index of the MSR */ bool always; /* True if intercept is initially cleared */ @@ -100,6 +102,38 @@ static const struct svm_direct_access_msrs { { .index = MSR_IA32_CR_PAT, .always = false }, { .index = MSR_AMD64_SEV_ES_GHCB, .always = true }, { .index = MSR_TSC_AUX, .always = false }, + { .index = X2APIC_MSR(APIC_ID), .always = false }, + { .index = X2APIC_MSR(APIC_LVR), .always = false }, + { .index = X2APIC_MSR(APIC_TASKPRI), .always = false }, + { .index = X2APIC_MSR(APIC_ARBPRI), .always = false }, + { .index = X2APIC_MSR(APIC_PROCPRI), .always = false }, + { .index = X2APIC_MSR(APIC_EOI), .always = false }, + { .index = X2APIC_MSR(APIC_RRR), .always = false }, + { .index = X2APIC_MSR(APIC_LDR), .always = false }, + { .index = X2APIC_MSR(APIC_DFR), .always = false }, + { .index = X2APIC_MSR(APIC_SPIV), .always = false }, + { .index = X2APIC_MSR(APIC_ISR), .always = false }, + { .index = X2APIC_MSR(APIC_TMR), .always = false }, + { .index = X2APIC_MSR(APIC_IRR), .always = false }, + { .index = X2APIC_MSR(APIC_ESR), .always = false }, + { .index = X2APIC_MSR(APIC_ICR), .always = false }, + { .index = X2APIC_MSR(APIC_ICR2), .always = false }, + + /* + * Note: + * AMD does not virtualize APIC TSC-deadline timer mode, but it is + * emulated by KVM. When setting APIC LVTT (0x832) register bit 18, + * the AVIC hardware would generate GP fault. Therefore, always + * intercept the MSR 0x832, and do not setup direct_access_msr. + */ + { .index = X2APIC_MSR(APIC_LVTTHMR), .always = false }, + { .index = X2APIC_MSR(APIC_LVTPC), .always = false }, + { .index = X2APIC_MSR(APIC_LVT0), .always = false }, + { .index = X2APIC_MSR(APIC_LVT1), .always = false }, + { .index = X2APIC_MSR(APIC_LVTERR), .always = false }, + { .index = X2APIC_MSR(APIC_TMICT), .always = false }, + { .index = X2APIC_MSR(APIC_TMCCT), .always = false }, + { .index = X2APIC_MSR(APIC_TDCR), .always = false }, { .index = MSR_INVALID, .always = false }, }; @@ -188,9 +222,6 @@ module_param(tsc_scaling, int, 0444); static bool avic; module_param(avic, bool, 0444); -static bool force_avic; -module_param_unsafe(force_avic, bool, 0444); - bool __read_mostly dump_invalid_vmcb; module_param(dump_invalid_vmcb, bool, 0644); @@ -342,9 +373,11 @@ static void svm_set_interrupt_shadow(struct kvm_vcpu *vcpu, int mask) } -static int svm_skip_emulated_instruction(struct kvm_vcpu *vcpu) +static int __svm_skip_emulated_instruction(struct kvm_vcpu *vcpu, + bool commit_side_effects) { struct vcpu_svm *svm = to_svm(vcpu); + unsigned long old_rflags; /* * SEV-ES does not expose the next RIP. The RIP update is controlled by @@ -359,18 +392,75 @@ static int svm_skip_emulated_instruction(struct kvm_vcpu *vcpu) } if (!svm->next_rip) { + if (unlikely(!commit_side_effects)) + old_rflags = svm->vmcb->save.rflags; + if (!kvm_emulate_instruction(vcpu, EMULTYPE_SKIP)) return 0; + + if (unlikely(!commit_side_effects)) + svm->vmcb->save.rflags = old_rflags; } else { kvm_rip_write(vcpu, svm->next_rip); } done: - svm_set_interrupt_shadow(vcpu, 0); + if (likely(commit_side_effects)) + svm_set_interrupt_shadow(vcpu, 0); return 1; } +static int svm_skip_emulated_instruction(struct kvm_vcpu *vcpu) +{ + return __svm_skip_emulated_instruction(vcpu, true); +} + +static int svm_update_soft_interrupt_rip(struct kvm_vcpu *vcpu) +{ + unsigned long rip, old_rip = kvm_rip_read(vcpu); + struct vcpu_svm *svm = to_svm(vcpu); + + /* + * Due to architectural shortcomings, the CPU doesn't always provide + * NextRIP, e.g. if KVM intercepted an exception that occurred while + * the CPU was vectoring an INTO/INT3 in the guest. Temporarily skip + * the instruction even if NextRIP is supported to acquire the next + * RIP so that it can be shoved into the NextRIP field, otherwise + * hardware will fail to advance guest RIP during event injection. + * Drop the exception/interrupt if emulation fails and effectively + * retry the instruction, it's the least awful option. If NRIPS is + * in use, the skip must not commit any side effects such as clearing + * the interrupt shadow or RFLAGS.RF. + */ + if (!__svm_skip_emulated_instruction(vcpu, !nrips)) + return -EIO; + + rip = kvm_rip_read(vcpu); + + /* + * Save the injection information, even when using next_rip, as the + * VMCB's next_rip will be lost (cleared on VM-Exit) if the injection + * doesn't complete due to a VM-Exit occurring while the CPU is + * vectoring the event. Decoding the instruction isn't guaranteed to + * work as there may be no backing instruction, e.g. if the event is + * being injected by L1 for L2, or if the guest is patching INT3 into + * a different instruction. + */ + svm->soft_int_injected = true; + svm->soft_int_csbase = svm->vmcb->save.cs.base; + svm->soft_int_old_rip = old_rip; + svm->soft_int_next_rip = rip; + + if (nrips) + kvm_rip_write(vcpu, old_rip); + + if (static_cpu_has(X86_FEATURE_NRIPS)) + svm->vmcb->control.next_rip = rip; + + return 0; +} + static void svm_queue_exception(struct kvm_vcpu *vcpu) { struct vcpu_svm *svm = to_svm(vcpu); @@ -380,21 +470,9 @@ static void svm_queue_exception(struct kvm_vcpu *vcpu) kvm_deliver_exception_payload(vcpu); - if (nr == BP_VECTOR && !nrips) { - unsigned long rip, old_rip = kvm_rip_read(vcpu); - - /* - * For guest debugging where we have to reinject #BP if some - * INT3 is guest-owned: - * Emulate nRIP by moving RIP forward. Will fail if injection - * raises a fault that is not intercepted. Still better than - * failing in all cases. - */ - (void)svm_skip_emulated_instruction(vcpu); - rip = kvm_rip_read(vcpu); - svm->int3_rip = rip + svm->vmcb->save.cs.base; - svm->int3_injected = rip - old_rip; - } + if (kvm_exception_is_soft(nr) && + svm_update_soft_interrupt_rip(vcpu)) + return; svm->vmcb->control.event_inj = nr | SVM_EVTINJ_VALID @@ -736,6 +814,29 @@ void svm_vcpu_init_msrpm(struct kvm_vcpu *vcpu, u32 *msrpm) } } +void svm_set_x2apic_msr_interception(struct vcpu_svm *svm, bool intercept) +{ + int i; + + if (intercept == svm->x2avic_msrs_intercepted) + return; + + if (avic_mode != AVIC_MODE_X2 || + !apic_x2apic_mode(svm->vcpu.arch.apic)) + return; + + for (i = 0; i < MAX_DIRECT_ACCESS_MSRS; i++) { + int index = direct_access_msrs[i].index; + + if ((index < APIC_BASE_MSR) || + (index > APIC_BASE_MSR + 0xff)) + continue; + set_msr_interception(&svm->vcpu, svm->msrpm, index, + !intercept, !intercept); + } + + svm->x2avic_msrs_intercepted = intercept; +} void svm_vcpu_free_msrpm(u32 *msrpm) { @@ -1231,7 +1332,7 @@ static void __svm_vcpu_reset(struct kvm_vcpu *vcpu) svm_init_osvw(vcpu); vcpu->arch.microcode_version = 0x01000065; - svm->tsc_ratio_msr = kvm_default_tsc_scaling_ratio; + svm->tsc_ratio_msr = kvm_caps.default_tsc_scaling_ratio; if (sev_es_guest(vcpu->kvm)) sev_es_vcpu_reset(svm); @@ -1299,6 +1400,8 @@ static int svm_vcpu_create(struct kvm_vcpu *vcpu) goto error_free_vmsa_page; } + svm->x2avic_msrs_intercepted = true; + svm->vmcb01.ptr = page_address(vmcb01_page); svm->vmcb01.pa = __sme_set(page_to_pfn(vmcb01_page) << PAGE_SHIFT); svm_switch_vmcb(svm, &svm->vmcb01); @@ -2345,6 +2448,7 @@ static int task_switch_interception(struct kvm_vcpu *vcpu) kvm_clear_exception_queue(vcpu); break; case SVM_EXITINTINFO_TYPE_INTR: + case SVM_EXITINTINFO_TYPE_SOFT: kvm_clear_interrupt_queue(vcpu); break; default: @@ -3375,35 +3479,49 @@ static void svm_inject_nmi(struct kvm_vcpu *vcpu) struct vcpu_svm *svm = to_svm(vcpu); svm->vmcb->control.event_inj = SVM_EVTINJ_VALID | SVM_EVTINJ_TYPE_NMI; + + if (svm->nmi_l1_to_l2) + return; + vcpu->arch.hflags |= HF_NMI_MASK; if (!sev_es_guest(vcpu->kvm)) svm_set_intercept(svm, INTERCEPT_IRET); ++vcpu->stat.nmi_injections; } -static void svm_inject_irq(struct kvm_vcpu *vcpu) +static void svm_inject_irq(struct kvm_vcpu *vcpu, bool reinjected) { struct vcpu_svm *svm = to_svm(vcpu); + u32 type; + + if (vcpu->arch.interrupt.soft) { + if (svm_update_soft_interrupt_rip(vcpu)) + return; - BUG_ON(!(gif_set(svm))); + type = SVM_EVTINJ_TYPE_SOFT; + } else { + type = SVM_EVTINJ_TYPE_INTR; + } - trace_kvm_inj_virq(vcpu->arch.interrupt.nr); + trace_kvm_inj_virq(vcpu->arch.interrupt.nr, + vcpu->arch.interrupt.soft, reinjected); ++vcpu->stat.irq_injections; svm->vmcb->control.event_inj = vcpu->arch.interrupt.nr | - SVM_EVTINJ_VALID | SVM_EVTINJ_TYPE_INTR; + SVM_EVTINJ_VALID | type; } void svm_complete_interrupt_delivery(struct kvm_vcpu *vcpu, int delivery_mode, int trig_mode, int vector) { /* - * vcpu->arch.apicv_active must be read after vcpu->mode. + * apic->apicv_active must be read after vcpu->mode. * Pairs with smp_store_release in vcpu_enter_guest. */ bool in_guest_mode = (smp_load_acquire(&vcpu->mode) == IN_GUEST_MODE); - if (!READ_ONCE(vcpu->arch.apicv_active)) { + /* Note, this is called iff the local APIC is in-kernel. */ + if (!READ_ONCE(vcpu->arch.apic->apicv_active)) { /* Process the interrupt via inject_pending_event */ kvm_make_request(KVM_REQ_EVENT, vcpu); kvm_vcpu_kick(vcpu); @@ -3668,15 +3786,49 @@ static inline void sync_lapic_to_cr8(struct kvm_vcpu *vcpu) svm->vmcb->control.int_ctl |= cr8 & V_TPR_MASK; } +static void svm_complete_soft_interrupt(struct kvm_vcpu *vcpu, u8 vector, + int type) +{ + bool is_exception = (type == SVM_EXITINTINFO_TYPE_EXEPT); + bool is_soft = (type == SVM_EXITINTINFO_TYPE_SOFT); + struct vcpu_svm *svm = to_svm(vcpu); + + /* + * If NRIPS is enabled, KVM must snapshot the pre-VMRUN next_rip that's + * associated with the original soft exception/interrupt. next_rip is + * cleared on all exits that can occur while vectoring an event, so KVM + * needs to manually set next_rip for re-injection. Unlike the !nrips + * case below, this needs to be done if and only if KVM is re-injecting + * the same event, i.e. if the event is a soft exception/interrupt, + * otherwise next_rip is unused on VMRUN. + */ + if (nrips && (is_soft || (is_exception && kvm_exception_is_soft(vector))) && + kvm_is_linear_rip(vcpu, svm->soft_int_old_rip + svm->soft_int_csbase)) + svm->vmcb->control.next_rip = svm->soft_int_next_rip; + /* + * If NRIPS isn't enabled, KVM must manually advance RIP prior to + * injecting the soft exception/interrupt. That advancement needs to + * be unwound if vectoring didn't complete. Note, the new event may + * not be the injected event, e.g. if KVM injected an INTn, the INTn + * hit a #NP in the guest, and the #NP encountered a #PF, the #NP will + * be the reported vectored event, but RIP still needs to be unwound. + */ + else if (!nrips && (is_soft || is_exception) && + kvm_is_linear_rip(vcpu, svm->soft_int_next_rip + svm->soft_int_csbase)) + kvm_rip_write(vcpu, svm->soft_int_old_rip); +} + static void svm_complete_interrupts(struct kvm_vcpu *vcpu) { struct vcpu_svm *svm = to_svm(vcpu); u8 vector; int type; u32 exitintinfo = svm->vmcb->control.exit_int_info; - unsigned int3_injected = svm->int3_injected; + bool nmi_l1_to_l2 = svm->nmi_l1_to_l2; + bool soft_int_injected = svm->soft_int_injected; - svm->int3_injected = 0; + svm->nmi_l1_to_l2 = false; + svm->soft_int_injected = false; /* * If we've made progress since setting HF_IRET_MASK, we've @@ -3701,9 +3853,13 @@ static void svm_complete_interrupts(struct kvm_vcpu *vcpu) vector = exitintinfo & SVM_EXITINTINFO_VEC_MASK; type = exitintinfo & SVM_EXITINTINFO_TYPE_MASK; + if (soft_int_injected) + svm_complete_soft_interrupt(vcpu, vector, type); + switch (type) { case SVM_EXITINTINFO_TYPE_NMI: vcpu->arch.nmi_injected = true; + svm->nmi_l1_to_l2 = nmi_l1_to_l2; break; case SVM_EXITINTINFO_TYPE_EXEPT: /* @@ -3712,18 +3868,6 @@ static void svm_complete_interrupts(struct kvm_vcpu *vcpu) if (vector == X86_TRAP_VC) break; - /* - * In case of software exceptions, do not reinject the vector, - * but re-execute the instruction instead. Rewind RIP first - * if we emulated INT3 before. - */ - if (kvm_exception_is_soft(vector)) { - if (vector == BP_VECTOR && int3_injected && - kvm_is_linear_rip(vcpu, svm->int3_rip)) - kvm_rip_write(vcpu, - kvm_rip_read(vcpu) - int3_injected); - break; - } if (exitintinfo & SVM_EXITINTINFO_VALID_ERR) { u32 err = svm->vmcb->control.exit_int_info_err; kvm_requeue_exception_e(vcpu, vector, err); @@ -3734,9 +3878,13 @@ static void svm_complete_interrupts(struct kvm_vcpu *vcpu) case SVM_EXITINTINFO_TYPE_INTR: kvm_queue_interrupt(vcpu, vector, false); break; + case SVM_EXITINTINFO_TYPE_SOFT: + kvm_queue_interrupt(vcpu, vector, true); + break; default: break; } + } static void svm_cancel_injection(struct kvm_vcpu *vcpu) @@ -3952,7 +4100,7 @@ static void svm_load_mmu_pgd(struct kvm_vcpu *vcpu, hpa_t root_hpa, hv_track_root_tdp(vcpu, root_hpa); cr3 = vcpu->arch.cr3; - } else if (vcpu->arch.mmu->root_role.level >= PT64_ROOT_4LEVEL) { + } else if (root_level >= PT64_ROOT_4LEVEL) { cr3 = __sme_set(root_hpa) | kvm_get_active_pcid(vcpu); } else { /* PCID in the guest should be impossible with a 32-bit MMU. */ @@ -4013,16 +4161,10 @@ static bool svm_has_emulated_msr(struct kvm *kvm, u32 index) return true; } -static u64 svm_get_mt_mask(struct kvm_vcpu *vcpu, gfn_t gfn, bool is_mmio) -{ - return 0; -} - static void svm_vcpu_after_set_cpuid(struct kvm_vcpu *vcpu) { struct vcpu_svm *svm = to_svm(vcpu); struct kvm_cpuid_entry2 *best; - struct kvm *kvm = vcpu->kvm; vcpu->arch.xsaves_enabled = guest_cpuid_has(vcpu, X86_FEATURE_XSAVE) && boot_cpu_has(X86_FEATURE_XSAVE) && @@ -4049,19 +4191,11 @@ static void svm_vcpu_after_set_cpuid(struct kvm_vcpu *vcpu) /* For sev guests, the memory encryption bit is not reserved in CR3. */ if (sev_guest(vcpu->kvm)) { - best = kvm_find_cpuid_entry(vcpu, 0x8000001F, 0); + best = kvm_find_cpuid_entry(vcpu, 0x8000001F); if (best) vcpu->arch.reserved_gpa_bits &= ~(1UL << (best->ebx & 0x3f)); } - if (kvm_vcpu_apicv_active(vcpu)) { - /* - * AVIC does not work with an x2APIC mode guest. If the X2APIC feature - * is exposed to the guest, disable AVIC. - */ - if (guest_cpuid_has(vcpu, X86_FEATURE_X2APIC)) - kvm_set_apicv_inhibit(kvm, APICV_INHIBIT_REASON_X2APIC); - } init_vmcb_after_set_cpuid(vcpu); } @@ -4673,11 +4807,11 @@ static struct kvm_x86_ops svm_x86_ops __initdata = { .enable_nmi_window = svm_enable_nmi_window, .enable_irq_window = svm_enable_irq_window, .update_cr8_intercept = svm_update_cr8_intercept, + .set_virtual_apic_mode = avic_set_virtual_apic_mode, .refresh_apicv_exec_ctrl = avic_refresh_apicv_exec_ctrl, .check_apicv_inhibit_reasons = avic_check_apicv_inhibit_reasons, .apicv_post_state_restore = avic_apicv_post_state_restore, - .get_mt_mask = svm_get_mt_mask, .get_exit_info = svm_get_exit_info, .vcpu_after_set_cpuid = svm_vcpu_after_set_cpuid, @@ -4773,7 +4907,7 @@ static __init void svm_set_cpu_caps(void) { kvm_set_cpu_caps(); - supported_xss = 0; + kvm_caps.supported_xss = 0; /* CPUID 0x80000001 and 0x8000000A (SVM features) */ if (nested) { @@ -4849,7 +4983,8 @@ static __init int svm_hardware_setup(void) init_msrpm_offsets(); - supported_xcr0 &= ~(XFEATURE_MASK_BNDREGS | XFEATURE_MASK_BNDCSR); + kvm_caps.supported_xcr0 &= ~(XFEATURE_MASK_BNDREGS | + XFEATURE_MASK_BNDCSR); if (boot_cpu_has(X86_FEATURE_FXSR_OPT)) kvm_enable_efer_bits(EFER_FFXSR); @@ -4859,11 +4994,11 @@ static __init int svm_hardware_setup(void) tsc_scaling = false; } else { pr_info("TSC scaling supported\n"); - kvm_has_tsc_control = true; + kvm_caps.has_tsc_control = true; } } - kvm_max_tsc_scaling_ratio = SVM_TSC_RATIO_MAX; - kvm_tsc_scaling_ratio_frac_bits = 32; + kvm_caps.max_tsc_scaling_ratio = SVM_TSC_RATIO_MAX; + kvm_caps.tsc_scaling_ratio_frac_bits = 32; tsc_aux_uret_slot = kvm_add_user_return_msr(MSR_TSC_AUX); @@ -4917,17 +5052,9 @@ static __init int svm_hardware_setup(void) nrips = false; } - enable_apicv = avic = avic && npt_enabled && (boot_cpu_has(X86_FEATURE_AVIC) || force_avic); + enable_apicv = avic = avic && avic_hardware_setup(&svm_x86_ops); - if (enable_apicv) { - if (!boot_cpu_has(X86_FEATURE_AVIC)) { - pr_warn("AVIC is not supported in CPUID but force enabled"); - pr_warn("Your system might crash and burn"); - } else - pr_info("AVIC enabled\n"); - - amd_iommu_register_ga_log_notifier(&avic_ga_log_notifier); - } else { + if (!enable_apicv) { svm_x86_ops.vcpu_blocking = NULL; svm_x86_ops.vcpu_unblocking = NULL; svm_x86_ops.vcpu_get_apicv_inhibit_reasons = NULL; diff --git a/arch/x86/kvm/svm/svm.h b/arch/x86/kvm/svm/svm.h index 9223ac100ef5..6a7686bf6900 100644 --- a/arch/x86/kvm/svm/svm.h +++ b/arch/x86/kvm/svm/svm.h @@ -29,13 +29,21 @@ #define IOPM_SIZE PAGE_SIZE * 3 #define MSRPM_SIZE PAGE_SIZE * 2 -#define MAX_DIRECT_ACCESS_MSRS 21 -#define MSRPM_OFFSETS 16 +#define MAX_DIRECT_ACCESS_MSRS 46 +#define MSRPM_OFFSETS 32 extern u32 msrpm_offsets[MSRPM_OFFSETS] __read_mostly; extern bool npt_enabled; extern int vgif; extern bool intercept_smi; +enum avic_modes { + AVIC_MODE_NONE = 0, + AVIC_MODE_X1, + AVIC_MODE_X2, +}; + +extern enum avic_modes avic_mode; + /* * Clean bits in VMCB. * VMCB_ALL_CLEAN_MASK might also need to @@ -139,6 +147,7 @@ struct vmcb_ctrl_area_cached { u64 nested_ctl; u32 event_inj; u32 event_inj_err; + u64 next_rip; u64 nested_cr3; u64 virt_ext; u32 clean; @@ -228,9 +237,12 @@ struct vcpu_svm { bool nmi_singlestep; u64 nmi_singlestep_guest_rflags; + bool nmi_l1_to_l2; - unsigned int3_injected; - unsigned long int3_rip; + unsigned long soft_int_csbase; + unsigned long soft_int_old_rip; + unsigned long soft_int_next_rip; + bool soft_int_injected; /* optional nested SVM features that are enabled for this guest */ bool nrips_enabled : 1; @@ -264,6 +276,8 @@ struct vcpu_svm { struct vcpu_sev_es_state sev_es; bool guest_state_loaded; + + bool x2avic_msrs_intercepted; }; struct svm_cpu_data { @@ -509,6 +523,15 @@ static inline bool nested_npt_enabled(struct vcpu_svm *svm) return svm->nested.ctl.nested_ctl & SVM_NESTED_CTL_NP_ENABLE; } +static inline bool is_x2apic_msrpm_offset(u32 offset) +{ + /* 4 msrs per u8, and 4 u8 in u32 */ + u32 msr = offset * 16; + + return (msr >= APIC_BASE_MSR) && + (msr < (APIC_BASE_MSR + 0x100)); +} + /* svm.c */ #define MSR_INVALID 0xffffffffU @@ -534,6 +557,7 @@ void svm_set_gif(struct vcpu_svm *svm, bool value); int svm_invoke_exit_handler(struct kvm_vcpu *vcpu, u64 exit_code); void set_msr_interception(struct kvm_vcpu *vcpu, u32 *msrpm, u32 msr, int read, int write); +void svm_set_x2apic_msr_interception(struct vcpu_svm *svm, bool disable); void svm_complete_interrupt_delivery(struct kvm_vcpu *vcpu, int delivery_mode, int trig_mode, int vec); @@ -603,6 +627,7 @@ extern struct kvm_x86_nested_ops svm_nested_ops; /* avic.c */ +bool avic_hardware_setup(struct kvm_x86_ops *ops); int avic_ga_log_notifier(u32 ga_tag); void avic_vm_destroy(struct kvm *kvm); int avic_vm_init(struct kvm *kvm); @@ -613,18 +638,16 @@ int avic_init_vcpu(struct vcpu_svm *svm); void avic_vcpu_load(struct kvm_vcpu *vcpu, int cpu); void avic_vcpu_put(struct kvm_vcpu *vcpu); void avic_apicv_post_state_restore(struct kvm_vcpu *vcpu); -void avic_set_virtual_apic_mode(struct kvm_vcpu *vcpu); void avic_refresh_apicv_exec_ctrl(struct kvm_vcpu *vcpu); bool avic_check_apicv_inhibit_reasons(enum kvm_apicv_inhibit reason); -void avic_hwapic_irr_update(struct kvm_vcpu *vcpu, int max_irr); -void avic_hwapic_isr_update(struct kvm_vcpu *vcpu, int max_isr); -bool avic_dy_apicv_has_pending_interrupt(struct kvm_vcpu *vcpu); int avic_pi_update_irte(struct kvm *kvm, unsigned int host_irq, uint32_t guest_irq, bool set); void avic_vcpu_blocking(struct kvm_vcpu *vcpu); void avic_vcpu_unblocking(struct kvm_vcpu *vcpu); void avic_ring_doorbell(struct kvm_vcpu *vcpu); unsigned long avic_vcpu_get_apicv_inhibit_reasons(struct kvm_vcpu *vcpu); +void avic_set_virtual_apic_mode(struct kvm_vcpu *vcpu); + /* sev.c */ diff --git a/arch/x86/kvm/svm/vmenter.S b/arch/x86/kvm/svm/vmenter.S index dfaeb47fcf2a..723f8534986c 100644 --- a/arch/x86/kvm/svm/vmenter.S +++ b/arch/x86/kvm/svm/vmenter.S @@ -111,6 +111,15 @@ SYM_FUNC_START(__svm_vcpu_run) #endif /* + * Mitigate RETBleed for AMD/Hygon Zen uarch. RET should be + * untrained as soon as we exit the VM and are back to the + * kernel. This should be done before re-enabling interrupts + * because interrupt handlers won't sanitize 'ret' if the return is + * from the kernel. + */ + UNTRAIN_RET + + /* * Clear all general purpose registers except RSP and RAX to prevent * speculative use of the guest's values, even those that are reloaded * via the stack. In theory, an L1 cache miss when restoring registers @@ -190,6 +199,15 @@ SYM_FUNC_START(__svm_sev_es_vcpu_run) FILL_RETURN_BUFFER %_ASM_AX, RSB_CLEAR_LOOPS, X86_FEATURE_RETPOLINE #endif + /* + * Mitigate RETBleed for AMD/Hygon Zen uarch. RET should be + * untrained as soon as we exit the VM and are back to the + * kernel. This should be done before re-enabling interrupts + * because interrupt handlers won't sanitize RET if the return is + * from the kernel. + */ + UNTRAIN_RET + pop %_ASM_BX #ifdef CONFIG_X86_64 diff --git a/arch/x86/kvm/trace.h b/arch/x86/kvm/trace.h index de4762517569..2120d7c060a9 100644 --- a/arch/x86/kvm/trace.h +++ b/arch/x86/kvm/trace.h @@ -154,7 +154,7 @@ TRACE_EVENT(kvm_xen_hypercall, TRACE_EVENT(kvm_pio, TP_PROTO(unsigned int rw, unsigned int port, unsigned int size, - unsigned int count, void *data), + unsigned int count, const void *data), TP_ARGS(rw, port, size, count, data), TP_STRUCT__entry( @@ -333,18 +333,24 @@ TRACE_EVENT_KVM_EXIT(kvm_exit); * Tracepoint for kvm interrupt injection: */ TRACE_EVENT(kvm_inj_virq, - TP_PROTO(unsigned int irq), - TP_ARGS(irq), + TP_PROTO(unsigned int vector, bool soft, bool reinjected), + TP_ARGS(vector, soft, reinjected), TP_STRUCT__entry( - __field( unsigned int, irq ) + __field( unsigned int, vector ) + __field( bool, soft ) + __field( bool, reinjected ) ), TP_fast_assign( - __entry->irq = irq; + __entry->vector = vector; + __entry->soft = soft; + __entry->reinjected = reinjected; ), - TP_printk("irq %u", __entry->irq) + TP_printk("%s 0x%x%s", + __entry->soft ? "Soft/INTn" : "IRQ", __entry->vector, + __entry->reinjected ? " [reinjected]" : "") ); #define EXS(x) { x##_VECTOR, "#" #x } @@ -358,25 +364,30 @@ TRACE_EVENT(kvm_inj_virq, * Tracepoint for kvm interrupt injection: */ TRACE_EVENT(kvm_inj_exception, - TP_PROTO(unsigned exception, bool has_error, unsigned error_code), - TP_ARGS(exception, has_error, error_code), + TP_PROTO(unsigned exception, bool has_error, unsigned error_code, + bool reinjected), + TP_ARGS(exception, has_error, error_code, reinjected), TP_STRUCT__entry( __field( u8, exception ) __field( u8, has_error ) __field( u32, error_code ) + __field( bool, reinjected ) ), TP_fast_assign( __entry->exception = exception; __entry->has_error = has_error; __entry->error_code = error_code; + __entry->reinjected = reinjected; ), - TP_printk("%s (0x%x)", + TP_printk("%s%s%s%s%s", __print_symbolic(__entry->exception, kvm_trace_sym_exc), - /* FIXME: don't print error_code if not present */ - __entry->has_error ? __entry->error_code : 0) + !__entry->has_error ? "" : " (", + !__entry->has_error ? "" : __print_symbolic(__entry->error_code, { }), + !__entry->has_error ? "" : ")", + __entry->reinjected ? " [reinjected]" : "") ); /* @@ -1479,6 +1490,24 @@ TRACE_EVENT(kvm_avic_kick_vcpu_slowpath, __entry->icrh, __entry->icrl, __entry->index) ); +TRACE_EVENT(kvm_avic_doorbell, + TP_PROTO(u32 vcpuid, u32 apicid), + TP_ARGS(vcpuid, apicid), + + TP_STRUCT__entry( + __field(u32, vcpuid) + __field(u32, apicid) + ), + + TP_fast_assign( + __entry->vcpuid = vcpuid; + __entry->apicid = apicid; + ), + + TP_printk("vcpuid=%u, apicid=%u", + __entry->vcpuid, __entry->apicid) +); + TRACE_EVENT(kvm_hv_timer_state, TP_PROTO(unsigned int vcpu_id, unsigned int hv_timer_in_use), TP_ARGS(vcpu_id, hv_timer_in_use), diff --git a/arch/x86/kvm/vmx/capabilities.h b/arch/x86/kvm/vmx/capabilities.h index 3f430e218375..c5e5dfef69c7 100644 --- a/arch/x86/kvm/vmx/capabilities.h +++ b/arch/x86/kvm/vmx/capabilities.h @@ -4,8 +4,10 @@ #include <asm/vmx.h> -#include "lapic.h" -#include "x86.h" +#include "../lapic.h" +#include "../x86.h" +#include "../pmu.h" +#include "../cpuid.h" extern bool __read_mostly enable_vpid; extern bool __read_mostly flexpriority_enabled; @@ -13,6 +15,7 @@ extern bool __read_mostly enable_ept; extern bool __read_mostly enable_unrestricted_guest; extern bool __read_mostly enable_ept_ad_bits; extern bool __read_mostly enable_pml; +extern bool __read_mostly enable_ipiv; extern int __read_mostly pt_mode; #define PT_MODE_SYSTEM 0 @@ -59,6 +62,7 @@ struct vmcs_config { u32 pin_based_exec_ctrl; u32 cpu_based_exec_ctrl; u32 cpu_based_2nd_exec_ctrl; + u64 cpu_based_3rd_exec_ctrl; u32 vmexit_ctrl; u32 vmentry_ctrl; struct nested_vmx_msrs nested; @@ -94,20 +98,17 @@ static inline bool cpu_has_vmx_posted_intr(void) static inline bool cpu_has_load_ia32_efer(void) { - return (vmcs_config.vmentry_ctrl & VM_ENTRY_LOAD_IA32_EFER) && - (vmcs_config.vmexit_ctrl & VM_EXIT_LOAD_IA32_EFER); + return vmcs_config.vmentry_ctrl & VM_ENTRY_LOAD_IA32_EFER; } static inline bool cpu_has_load_perf_global_ctrl(void) { - return (vmcs_config.vmentry_ctrl & VM_ENTRY_LOAD_IA32_PERF_GLOBAL_CTRL) && - (vmcs_config.vmexit_ctrl & VM_EXIT_LOAD_IA32_PERF_GLOBAL_CTRL); + return vmcs_config.vmentry_ctrl & VM_ENTRY_LOAD_IA32_PERF_GLOBAL_CTRL; } static inline bool cpu_has_vmx_mpx(void) { - return (vmcs_config.vmexit_ctrl & VM_EXIT_CLEAR_BNDCFGS) && - (vmcs_config.vmentry_ctrl & VM_ENTRY_LOAD_BNDCFGS); + return vmcs_config.vmentry_ctrl & VM_ENTRY_LOAD_BNDCFGS; } static inline bool cpu_has_vmx_tpr_shadow(void) @@ -131,6 +132,12 @@ static inline bool cpu_has_secondary_exec_ctrls(void) CPU_BASED_ACTIVATE_SECONDARY_CONTROLS; } +static inline bool cpu_has_tertiary_exec_ctrls(void) +{ + return vmcs_config.cpu_based_exec_ctrl & + CPU_BASED_ACTIVATE_TERTIARY_CONTROLS; +} + static inline bool cpu_has_vmx_virtualize_apic_accesses(void) { return vmcs_config.cpu_based_2nd_exec_ctrl & @@ -276,6 +283,11 @@ static inline bool cpu_has_vmx_apicv(void) cpu_has_vmx_posted_intr(); } +static inline bool cpu_has_vmx_ipiv(void) +{ + return vmcs_config.cpu_based_3rd_exec_ctrl & TERTIARY_EXEC_IPI_VIRT; +} + static inline bool cpu_has_vmx_flexpriority(void) { return cpu_has_vmx_tpr_shadow() && @@ -363,7 +375,6 @@ static inline bool cpu_has_vmx_intel_pt(void) rdmsrl(MSR_IA32_VMX_MISC, vmx_msr); return (vmx_msr & MSR_IA32_VMX_MISC_INTEL_PT) && (vmcs_config.cpu_based_2nd_exec_ctrl & SECONDARY_EXEC_PT_USE_GPA) && - (vmcs_config.vmexit_ctrl & VM_EXIT_CLEAR_IA32_RTIT_CTL) && (vmcs_config.vmentry_ctrl & VM_ENTRY_LOAD_IA32_RTIT_CTL); } @@ -385,23 +396,31 @@ static inline bool vmx_pt_mode_is_host_guest(void) return pt_mode == PT_MODE_HOST_GUEST; } +static inline bool vmx_pebs_supported(void) +{ + return boot_cpu_has(X86_FEATURE_PEBS) && kvm_pmu_cap.pebs_ept; +} + static inline u64 vmx_get_perf_capabilities(void) { - u64 perf_cap = 0; + u64 perf_cap = PMU_CAP_FW_WRITES; + u64 host_perf_cap = 0; if (!enable_pmu) - return perf_cap; + return 0; if (boot_cpu_has(X86_FEATURE_PDCM)) - rdmsrl(MSR_IA32_PERF_CAPABILITIES, perf_cap); + rdmsrl(MSR_IA32_PERF_CAPABILITIES, host_perf_cap); - perf_cap &= PMU_CAP_LBR_FMT; + perf_cap |= host_perf_cap & PMU_CAP_LBR_FMT; - /* - * Since counters are virtualized, KVM would support full - * width counting unconditionally, even if the host lacks it. - */ - return PMU_CAP_FW_WRITES | perf_cap; + if (vmx_pebs_supported()) { + perf_cap |= host_perf_cap & PERF_CAP_PEBS_MASK; + if ((perf_cap & PERF_CAP_PEBS_FORMAT) < 4) + perf_cap &= ~PERF_CAP_PEBS_BASELINE; + } + + return perf_cap; } static inline u64 vmx_supported_debugctl(void) @@ -417,4 +436,10 @@ static inline u64 vmx_supported_debugctl(void) return debugctl; } +static inline bool cpu_has_notify_vmexit(void) +{ + return vmcs_config.cpu_based_2nd_exec_ctrl & + SECONDARY_EXEC_NOTIFY_VM_EXITING; +} + #endif /* __KVM_X86_VMX_CAPS_H */ diff --git a/arch/x86/kvm/vmx/evmcs.c b/arch/x86/kvm/vmx/evmcs.c index 87e3dc10edf4..6a61b1ae7942 100644 --- a/arch/x86/kvm/vmx/evmcs.c +++ b/arch/x86/kvm/vmx/evmcs.c @@ -297,8 +297,10 @@ const unsigned int nr_evmcs_1_fields = ARRAY_SIZE(vmcs_field_to_evmcs_1); #if IS_ENABLED(CONFIG_HYPERV) __init void evmcs_sanitize_exec_ctrls(struct vmcs_config *vmcs_conf) { + vmcs_conf->cpu_based_exec_ctrl &= ~EVMCS1_UNSUPPORTED_EXEC_CTRL; vmcs_conf->pin_based_exec_ctrl &= ~EVMCS1_UNSUPPORTED_PINCTRL; vmcs_conf->cpu_based_2nd_exec_ctrl &= ~EVMCS1_UNSUPPORTED_2NDEXEC; + vmcs_conf->cpu_based_3rd_exec_ctrl = 0; vmcs_conf->vmexit_ctrl &= ~EVMCS1_UNSUPPORTED_VMEXIT_CTRL; vmcs_conf->vmentry_ctrl &= ~EVMCS1_UNSUPPORTED_VMENTRY_CTRL; diff --git a/arch/x86/kvm/vmx/evmcs.h b/arch/x86/kvm/vmx/evmcs.h index 8d70f9aea94b..f886a8ff0342 100644 --- a/arch/x86/kvm/vmx/evmcs.h +++ b/arch/x86/kvm/vmx/evmcs.h @@ -50,6 +50,7 @@ DECLARE_STATIC_KEY_FALSE(enable_evmcs); */ #define EVMCS1_UNSUPPORTED_PINCTRL (PIN_BASED_POSTED_INTR | \ PIN_BASED_VMX_PREEMPTION_TIMER) +#define EVMCS1_UNSUPPORTED_EXEC_CTRL (CPU_BASED_ACTIVATE_TERTIARY_CONTROLS) #define EVMCS1_UNSUPPORTED_2NDEXEC \ (SECONDARY_EXEC_VIRTUAL_INTR_DELIVERY | \ SECONDARY_EXEC_VIRTUALIZE_APIC_ACCESSES | \ diff --git a/arch/x86/kvm/vmx/nested.c b/arch/x86/kvm/vmx/nested.c index f5cb18e00e78..ddd4367d4826 100644 --- a/arch/x86/kvm/vmx/nested.c +++ b/arch/x86/kvm/vmx/nested.c @@ -311,11 +311,12 @@ static void free_nested(struct kvm_vcpu *vcpu) vmx->nested.cached_vmcs12 = NULL; kfree(vmx->nested.cached_shadow_vmcs12); vmx->nested.cached_shadow_vmcs12 = NULL; - /* Unpin physical memory we referred to in the vmcs02 */ - if (vmx->nested.apic_access_page) { - kvm_release_page_clean(vmx->nested.apic_access_page); - vmx->nested.apic_access_page = NULL; - } + /* + * Unpin physical memory we referred to in the vmcs02. The APIC access + * page's backing page (yeah, confusing) shouldn't actually be accessed, + * and if it is written, the contents are irrelevant. + */ + kvm_vcpu_unmap(vcpu, &vmx->nested.apic_access_page_map, false); kvm_vcpu_unmap(vcpu, &vmx->nested.virtual_apic_map, true); kvm_vcpu_unmap(vcpu, &vmx->nested.pi_desc_map, true); vmx->nested.pi_desc = NULL; @@ -1223,7 +1224,7 @@ static int vmx_restore_vmx_basic(struct vcpu_vmx *vmx, u64 data) BIT_ULL(49) | BIT_ULL(54) | BIT_ULL(55) | /* reserved */ BIT_ULL(31) | GENMASK_ULL(47, 45) | GENMASK_ULL(63, 56); - u64 vmx_basic = vmx->nested.msrs.basic; + u64 vmx_basic = vmcs_config.nested.basic; if (!is_bitwise_subset(vmx_basic, data, feature_and_reserved)) return -EINVAL; @@ -1246,36 +1247,42 @@ static int vmx_restore_vmx_basic(struct vcpu_vmx *vmx, u64 data) return 0; } -static int -vmx_restore_control_msr(struct vcpu_vmx *vmx, u32 msr_index, u64 data) +static void vmx_get_control_msr(struct nested_vmx_msrs *msrs, u32 msr_index, + u32 **low, u32 **high) { - u64 supported; - u32 *lowp, *highp; - switch (msr_index) { case MSR_IA32_VMX_TRUE_PINBASED_CTLS: - lowp = &vmx->nested.msrs.pinbased_ctls_low; - highp = &vmx->nested.msrs.pinbased_ctls_high; + *low = &msrs->pinbased_ctls_low; + *high = &msrs->pinbased_ctls_high; break; case MSR_IA32_VMX_TRUE_PROCBASED_CTLS: - lowp = &vmx->nested.msrs.procbased_ctls_low; - highp = &vmx->nested.msrs.procbased_ctls_high; + *low = &msrs->procbased_ctls_low; + *high = &msrs->procbased_ctls_high; break; case MSR_IA32_VMX_TRUE_EXIT_CTLS: - lowp = &vmx->nested.msrs.exit_ctls_low; - highp = &vmx->nested.msrs.exit_ctls_high; + *low = &msrs->exit_ctls_low; + *high = &msrs->exit_ctls_high; break; case MSR_IA32_VMX_TRUE_ENTRY_CTLS: - lowp = &vmx->nested.msrs.entry_ctls_low; - highp = &vmx->nested.msrs.entry_ctls_high; + *low = &msrs->entry_ctls_low; + *high = &msrs->entry_ctls_high; break; case MSR_IA32_VMX_PROCBASED_CTLS2: - lowp = &vmx->nested.msrs.secondary_ctls_low; - highp = &vmx->nested.msrs.secondary_ctls_high; + *low = &msrs->secondary_ctls_low; + *high = &msrs->secondary_ctls_high; break; default: BUG(); } +} + +static int +vmx_restore_control_msr(struct vcpu_vmx *vmx, u32 msr_index, u64 data) +{ + u32 *lowp, *highp; + u64 supported; + + vmx_get_control_msr(&vmcs_config.nested, msr_index, &lowp, &highp); supported = vmx_control_msr(*lowp, *highp); @@ -1287,6 +1294,7 @@ vmx_restore_control_msr(struct vcpu_vmx *vmx, u32 msr_index, u64 data) if (!is_bitwise_subset(supported, data, GENMASK_ULL(63, 32))) return -EINVAL; + vmx_get_control_msr(&vmx->nested.msrs, msr_index, &lowp, &highp); *lowp = data; *highp = data >> 32; return 0; @@ -1300,10 +1308,8 @@ static int vmx_restore_vmx_misc(struct vcpu_vmx *vmx, u64 data) BIT_ULL(28) | BIT_ULL(29) | BIT_ULL(30) | /* reserved */ GENMASK_ULL(13, 9) | BIT_ULL(31); - u64 vmx_misc; - - vmx_misc = vmx_control_msr(vmx->nested.msrs.misc_low, - vmx->nested.msrs.misc_high); + u64 vmx_misc = vmx_control_msr(vmcs_config.nested.misc_low, + vmcs_config.nested.misc_high); if (!is_bitwise_subset(vmx_misc, data, feature_and_reserved_bits)) return -EINVAL; @@ -1331,10 +1337,8 @@ static int vmx_restore_vmx_misc(struct vcpu_vmx *vmx, u64 data) static int vmx_restore_vmx_ept_vpid_cap(struct vcpu_vmx *vmx, u64 data) { - u64 vmx_ept_vpid_cap; - - vmx_ept_vpid_cap = vmx_control_msr(vmx->nested.msrs.ept_caps, - vmx->nested.msrs.vpid_caps); + u64 vmx_ept_vpid_cap = vmx_control_msr(vmcs_config.nested.ept_caps, + vmcs_config.nested.vpid_caps); /* Every bit is either reserved or a feature bit. */ if (!is_bitwise_subset(vmx_ept_vpid_cap, data, -1ULL)) @@ -1345,20 +1349,21 @@ static int vmx_restore_vmx_ept_vpid_cap(struct vcpu_vmx *vmx, u64 data) return 0; } -static int vmx_restore_fixed0_msr(struct vcpu_vmx *vmx, u32 msr_index, u64 data) +static u64 *vmx_get_fixed0_msr(struct nested_vmx_msrs *msrs, u32 msr_index) { - u64 *msr; - switch (msr_index) { case MSR_IA32_VMX_CR0_FIXED0: - msr = &vmx->nested.msrs.cr0_fixed0; - break; + return &msrs->cr0_fixed0; case MSR_IA32_VMX_CR4_FIXED0: - msr = &vmx->nested.msrs.cr4_fixed0; - break; + return &msrs->cr4_fixed0; default: BUG(); } +} + +static int vmx_restore_fixed0_msr(struct vcpu_vmx *vmx, u32 msr_index, u64 data) +{ + const u64 *msr = vmx_get_fixed0_msr(&vmcs_config.nested, msr_index); /* * 1 bits (which indicates bits which "must-be-1" during VMX operation) @@ -1367,7 +1372,7 @@ static int vmx_restore_fixed0_msr(struct vcpu_vmx *vmx, u32 msr_index, u64 data) if (!is_bitwise_subset(data, *msr, -1ULL)) return -EINVAL; - *msr = data; + *vmx_get_fixed0_msr(&vmx->nested.msrs, msr_index) = data; return 0; } @@ -1428,7 +1433,7 @@ int vmx_set_vmx_msr(struct kvm_vcpu *vcpu, u32 msr_index, u64 data) vmx->nested.msrs.vmcs_enum = data; return 0; case MSR_IA32_VMX_VMFUNC: - if (data & ~vmx->nested.msrs.vmfunc_controls) + if (data & ~vmcs_config.nested.vmfunc_controls) return -EINVAL; vmx->nested.msrs.vmfunc_controls = data; return 0; @@ -2133,6 +2138,8 @@ static u64 nested_vmx_calc_efer(struct vcpu_vmx *vmx, struct vmcs12 *vmcs12) static void prepare_vmcs02_constant_state(struct vcpu_vmx *vmx) { + struct kvm *kvm = vmx->vcpu.kvm; + /* * If vmcs02 hasn't been initialized, set the constant vmcs02 state * according to L0's settings (vmcs12 is irrelevant here). Host @@ -2175,6 +2182,9 @@ static void prepare_vmcs02_constant_state(struct vcpu_vmx *vmx) if (cpu_has_vmx_encls_vmexit()) vmcs_write64(ENCLS_EXITING_BITMAP, INVALID_GPA); + if (kvm_notify_vmexit_enabled(kvm)) + vmcs_write32(NOTIFY_WINDOW, kvm->arch.notify_window); + /* * Set the MSR load/store lists to match L0's settings. Only the * addresses are constant (for vmcs02), the counts can change based @@ -2278,7 +2288,6 @@ static void prepare_vmcs02_early(struct vcpu_vmx *vmx, struct loaded_vmcs *vmcs0 SECONDARY_EXEC_VIRTUAL_INTR_DELIVERY | SECONDARY_EXEC_APIC_REGISTER_VIRT | SECONDARY_EXEC_ENABLE_VMFUNC | - SECONDARY_EXEC_TSC_SCALING | SECONDARY_EXEC_DESC); if (nested_cpu_has(vmcs12, @@ -2515,11 +2524,11 @@ static int prepare_vmcs02(struct kvm_vcpu *vcpu, struct vmcs12 *vmcs12, vmcs_write64(GUEST_IA32_DEBUGCTL, vmcs12->guest_ia32_debugctl); } else { kvm_set_dr(vcpu, 7, vcpu->arch.dr7); - vmcs_write64(GUEST_IA32_DEBUGCTL, vmx->nested.vmcs01_debugctl); + vmcs_write64(GUEST_IA32_DEBUGCTL, vmx->nested.pre_vmenter_debugctl); } if (kvm_mpx_supported() && (!vmx->nested.nested_run_pending || !(vmcs12->vm_entry_controls & VM_ENTRY_LOAD_BNDCFGS))) - vmcs_write64(GUEST_BNDCFGS, vmx->nested.vmcs01_guest_bndcfgs); + vmcs_write64(GUEST_BNDCFGS, vmx->nested.pre_vmenter_bndcfgs); vmx_set_rflags(vcpu, vmcs12->guest_rflags); /* EXCEPTION_BITMAP and CR0_GUEST_HOST_MASK should basically be the @@ -2548,7 +2557,7 @@ static int prepare_vmcs02(struct kvm_vcpu *vcpu, struct vmcs12 *vmcs12, vmx_get_l2_tsc_multiplier(vcpu)); vmcs_write64(TSC_OFFSET, vcpu->arch.tsc_offset); - if (kvm_has_tsc_control) + if (kvm_caps.has_tsc_control) vmcs_write64(TSC_MULTIPLIER, vcpu->arch.tsc_scaling_ratio); nested_vmx_transition_tlb_flush(vcpu, vmcs12, true); @@ -2614,6 +2623,7 @@ static int prepare_vmcs02(struct kvm_vcpu *vcpu, struct vmcs12 *vmcs12, } if ((vmcs12->vm_entry_controls & VM_ENTRY_LOAD_IA32_PERF_GLOBAL_CTRL) && + intel_pmu_has_perf_global_ctrl(vcpu_to_pmu(vcpu)) && WARN_ON_ONCE(kvm_set_msr(vcpu, MSR_CORE_PERF_GLOBAL_CTRL, vmcs12->guest_ia32_perf_global_ctrl))) { *entry_failure_code = ENTRY_FAIL_DEFAULT; @@ -3087,7 +3097,7 @@ static int nested_vmx_check_vmentry_hw(struct kvm_vcpu *vcpu) } vm_fail = __vmx_vcpu_run(vmx, (unsigned long *)&vcpu->arch.regs, - vmx->loaded_vmcs->launched); + __vmx_vcpu_run_flags(vmx)); if (vmx->msr_autoload.host.nr) vmcs_write32(VM_EXIT_MSR_LOAD_COUNT, vmx->msr_autoload.host.nr); @@ -3159,8 +3169,6 @@ static bool nested_get_vmcs12_pages(struct kvm_vcpu *vcpu) struct vmcs12 *vmcs12 = get_vmcs12(vcpu); struct vcpu_vmx *vmx = to_vmx(vcpu); struct kvm_host_map *map; - struct page *page; - u64 hpa; if (!vcpu->arch.pdptrs_from_userspace && !nested_cpu_has_ept(vmcs12) && is_pae_paging(vcpu)) { @@ -3175,23 +3183,12 @@ static bool nested_get_vmcs12_pages(struct kvm_vcpu *vcpu) if (nested_cpu_has2(vmcs12, SECONDARY_EXEC_VIRTUALIZE_APIC_ACCESSES)) { - /* - * Translate L1 physical address to host physical - * address for vmcs02. Keep the page pinned, so this - * physical address remains valid. We keep a reference - * to it so we can release it later. - */ - if (vmx->nested.apic_access_page) { /* shouldn't happen */ - kvm_release_page_clean(vmx->nested.apic_access_page); - vmx->nested.apic_access_page = NULL; - } - page = kvm_vcpu_gpa_to_page(vcpu, vmcs12->apic_access_addr); - if (!is_error_page(page)) { - vmx->nested.apic_access_page = page; - hpa = page_to_phys(vmx->nested.apic_access_page); - vmcs_write64(APIC_ACCESS_ADDR, hpa); + map = &vmx->nested.apic_access_page_map; + + if (!kvm_vcpu_map(vcpu, gpa_to_gfn(vmcs12->apic_access_addr), map)) { + vmcs_write64(APIC_ACCESS_ADDR, pfn_to_hpa(map->pfn)); } else { - pr_debug_ratelimited("%s: no backing 'struct page' for APIC-access address in vmcs12\n", + pr_debug_ratelimited("%s: no backing for APIC-access address in vmcs12\n", __func__); vcpu->run->exit_reason = KVM_EXIT_INTERNAL_ERROR; vcpu->run->internal.suberror = @@ -3374,11 +3371,13 @@ enum nvmx_vmentry_status nested_vmx_enter_non_root_mode(struct kvm_vcpu *vcpu, if (likely(!evaluate_pending_interrupts) && kvm_vcpu_apicv_active(vcpu)) evaluate_pending_interrupts |= vmx_has_apicv_interrupt(vcpu); - if (!(vmcs12->vm_entry_controls & VM_ENTRY_LOAD_DEBUG_CONTROLS)) - vmx->nested.vmcs01_debugctl = vmcs_read64(GUEST_IA32_DEBUGCTL); + if (!vmx->nested.nested_run_pending || + !(vmcs12->vm_entry_controls & VM_ENTRY_LOAD_DEBUG_CONTROLS)) + vmx->nested.pre_vmenter_debugctl = vmcs_read64(GUEST_IA32_DEBUGCTL); if (kvm_mpx_supported() && - !(vmcs12->vm_entry_controls & VM_ENTRY_LOAD_BNDCFGS)) - vmx->nested.vmcs01_guest_bndcfgs = vmcs_read64(GUEST_BNDCFGS); + (!vmx->nested.nested_run_pending || + !(vmcs12->vm_entry_controls & VM_ENTRY_LOAD_BNDCFGS))) + vmx->nested.pre_vmenter_bndcfgs = vmcs_read64(GUEST_BNDCFGS); /* * Overwrite vmcs01.GUEST_CR3 with L1's CR3 if EPT is disabled *and* @@ -4097,8 +4096,6 @@ static void sync_vmcs02_to_vmcs12_rare(struct kvm_vcpu *vcpu, vmcs12->guest_idtr_base = vmcs_readl(GUEST_IDTR_BASE); vmcs12->guest_pending_dbg_exceptions = vmcs_readl(GUEST_PENDING_DBG_EXCEPTIONS); - if (kvm_mpx_supported()) - vmcs12->guest_bndcfgs = vmcs_read64(GUEST_BNDCFGS); vmx->nested.need_sync_vmcs02_to_vmcs12_rare = false; } @@ -4337,7 +4334,8 @@ static void load_vmcs12_host_state(struct kvm_vcpu *vcpu, vmcs_write64(GUEST_IA32_PAT, vmcs12->host_ia32_pat); vcpu->arch.pat = vmcs12->host_ia32_pat; } - if (vmcs12->vm_exit_controls & VM_EXIT_LOAD_IA32_PERF_GLOBAL_CTRL) + if ((vmcs12->vm_exit_controls & VM_EXIT_LOAD_IA32_PERF_GLOBAL_CTRL) && + intel_pmu_has_perf_global_ctrl(vcpu_to_pmu(vcpu))) WARN_ON_ONCE(kvm_set_msr(vcpu, MSR_CORE_PERF_GLOBAL_CTRL, vmcs12->host_ia32_perf_global_ctrl)); @@ -4610,7 +4608,7 @@ void nested_vmx_vmexit(struct kvm_vcpu *vcpu, u32 vm_exit_reason, vmcs_write32(VM_EXIT_MSR_LOAD_COUNT, vmx->msr_autoload.host.nr); vmcs_write32(VM_ENTRY_MSR_LOAD_COUNT, vmx->msr_autoload.guest.nr); vmcs_write64(TSC_OFFSET, vcpu->arch.tsc_offset); - if (kvm_has_tsc_control) + if (kvm_caps.has_tsc_control) vmcs_write64(TSC_MULTIPLIER, vcpu->arch.tsc_scaling_ratio); if (vmx->nested.l1_tpr_threshold != -1) @@ -4627,10 +4625,7 @@ void nested_vmx_vmexit(struct kvm_vcpu *vcpu, u32 vm_exit_reason, } /* Unpin physical memory we referred to in vmcs02 */ - if (vmx->nested.apic_access_page) { - kvm_release_page_clean(vmx->nested.apic_access_page); - vmx->nested.apic_access_page = NULL; - } + kvm_vcpu_unmap(vcpu, &vmx->nested.apic_access_page_map, false); kvm_vcpu_unmap(vcpu, &vmx->nested.virtual_apic_map, true); kvm_vcpu_unmap(vcpu, &vmx->nested.pi_desc_map, true); vmx->nested.pi_desc = NULL; @@ -4829,28 +4824,6 @@ int get_vmx_mem_address(struct kvm_vcpu *vcpu, unsigned long exit_qualification, return 0; } -void nested_vmx_pmu_refresh(struct kvm_vcpu *vcpu, - bool vcpu_has_perf_global_ctrl) -{ - struct vcpu_vmx *vmx; - - if (!nested_vmx_allowed(vcpu)) - return; - - vmx = to_vmx(vcpu); - if (vcpu_has_perf_global_ctrl) { - vmx->nested.msrs.entry_ctls_high |= - VM_ENTRY_LOAD_IA32_PERF_GLOBAL_CTRL; - vmx->nested.msrs.exit_ctls_high |= - VM_EXIT_LOAD_IA32_PERF_GLOBAL_CTRL; - } else { - vmx->nested.msrs.entry_ctls_high &= - ~VM_ENTRY_LOAD_IA32_PERF_GLOBAL_CTRL; - vmx->nested.msrs.exit_ctls_high &= - ~VM_EXIT_LOAD_IA32_PERF_GLOBAL_CTRL; - } -} - static int nested_vmx_get_vmptr(struct kvm_vcpu *vcpu, gpa_t *vmpointer, int *ret) { @@ -4953,7 +4926,7 @@ out_vmcs02: } /* Emulate the VMXON instruction. */ -static int handle_vmon(struct kvm_vcpu *vcpu) +static int handle_vmxon(struct kvm_vcpu *vcpu) { int ret; gpa_t vmptr; @@ -4963,20 +4936,25 @@ static int handle_vmon(struct kvm_vcpu *vcpu) | FEAT_CTL_VMX_ENABLED_OUTSIDE_SMX; /* - * The Intel VMX Instruction Reference lists a bunch of bits that are - * prerequisite to running VMXON, most notably cr4.VMXE must be set to - * 1 (see vmx_is_valid_cr4() for when we allow the guest to set this). - * Otherwise, we should fail with #UD. But most faulting conditions - * have already been checked by hardware, prior to the VM-exit for - * VMXON. We do test guest cr4.VMXE because processor CR4 always has - * that bit set to 1 in non-root mode. + * Note, KVM cannot rely on hardware to perform the CR0/CR4 #UD checks + * that have higher priority than VM-Exit (see Intel SDM's pseudocode + * for VMXON), as KVM must load valid CR0/CR4 values into hardware while + * running the guest, i.e. KVM needs to check the _guest_ values. + * + * Rely on hardware for the other two pre-VM-Exit checks, !VM86 and + * !COMPATIBILITY modes. KVM may run the guest in VM86 to emulate Real + * Mode, but KVM will never take the guest out of those modes. */ - if (!kvm_read_cr4_bits(vcpu, X86_CR4_VMXE)) { + if (!nested_host_cr0_valid(vcpu, kvm_read_cr0(vcpu)) || + !nested_host_cr4_valid(vcpu, kvm_read_cr4(vcpu))) { kvm_queue_exception(vcpu, UD_VECTOR); return 1; } - /* CPL=0 must be checked manually. */ + /* + * CPL=0 and all other checks that are lower priority than VM-Exit must + * be checked manually. + */ if (vmx_get_cpl(vcpu)) { kvm_inject_gp(vcpu, 0); return 1; @@ -5045,7 +5023,7 @@ static inline void nested_release_vmcs12(struct kvm_vcpu *vcpu) } /* Emulate the VMXOFF instruction */ -static int handle_vmoff(struct kvm_vcpu *vcpu) +static int handle_vmxoff(struct kvm_vcpu *vcpu) { if (!nested_vmx_check_permission(vcpu)) return 1; @@ -6112,6 +6090,9 @@ static bool nested_vmx_l1_wants_exit(struct kvm_vcpu *vcpu, SECONDARY_EXEC_ENABLE_USR_WAIT_PAUSE); case EXIT_REASON_ENCLS: return nested_vmx_exit_handled_encls(vcpu, vmcs12); + case EXIT_REASON_NOTIFY: + /* Notify VM exit is not exposed to L1 */ + return false; default: return true; } @@ -6776,6 +6757,9 @@ void nested_vmx_setup_ctls_msrs(struct nested_vmx_msrs *msrs, u32 ept_caps) rdmsrl(MSR_IA32_VMX_CR0_FIXED1, msrs->cr0_fixed1); rdmsrl(MSR_IA32_VMX_CR4_FIXED1, msrs->cr4_fixed1); + if (vmx_umip_emulated()) + msrs->cr4_fixed1 |= X86_CR4_UMIP; + msrs->vmcs_enum = nested_vmx_calc_vmcs_enum_msr(); } @@ -6819,8 +6803,8 @@ __init int nested_vmx_hardware_setup(int (*exit_handlers[])(struct kvm_vcpu *)) exit_handlers[EXIT_REASON_VMREAD] = handle_vmread; exit_handlers[EXIT_REASON_VMRESUME] = handle_vmresume; exit_handlers[EXIT_REASON_VMWRITE] = handle_vmwrite; - exit_handlers[EXIT_REASON_VMOFF] = handle_vmoff; - exit_handlers[EXIT_REASON_VMON] = handle_vmon; + exit_handlers[EXIT_REASON_VMOFF] = handle_vmxoff; + exit_handlers[EXIT_REASON_VMON] = handle_vmxon; exit_handlers[EXIT_REASON_INVEPT] = handle_invept; exit_handlers[EXIT_REASON_INVVPID] = handle_invvpid; exit_handlers[EXIT_REASON_VMFUNC] = handle_vmfunc; diff --git a/arch/x86/kvm/vmx/nested.h b/arch/x86/kvm/vmx/nested.h index c92cea0b8ccc..88b00a7359e4 100644 --- a/arch/x86/kvm/vmx/nested.h +++ b/arch/x86/kvm/vmx/nested.h @@ -32,8 +32,6 @@ int vmx_set_vmx_msr(struct kvm_vcpu *vcpu, u32 msr_index, u64 data); int vmx_get_vmx_msr(struct nested_vmx_msrs *msrs, u32 msr_index, u64 *pdata); int get_vmx_mem_address(struct kvm_vcpu *vcpu, unsigned long exit_qualification, u32 vmx_instruction_info, bool wr, int len, gva_t *ret); -void nested_vmx_pmu_refresh(struct kvm_vcpu *vcpu, - bool vcpu_has_perf_global_ctrl); void nested_mark_vmcs12_pages_dirty(struct kvm_vcpu *vcpu); bool nested_vmx_check_io_bitmaps(struct kvm_vcpu *vcpu, unsigned int port, int size); @@ -281,7 +279,8 @@ static inline bool nested_cr4_valid(struct kvm_vcpu *vcpu, unsigned long val) u64 fixed0 = to_vmx(vcpu)->nested.msrs.cr4_fixed0; u64 fixed1 = to_vmx(vcpu)->nested.msrs.cr4_fixed1; - return fixed_bits_valid(val, fixed0, fixed1); + return fixed_bits_valid(val, fixed0, fixed1) && + __kvm_is_valid_cr4(vcpu, val); } /* No difference in the restrictions on guest and host CR4 in VMX operation. */ diff --git a/arch/x86/kvm/vmx/pmu_intel.c b/arch/x86/kvm/vmx/pmu_intel.c index 37e9eb32e3d9..862c1a4d971b 100644 --- a/arch/x86/kvm/vmx/pmu_intel.c +++ b/arch/x86/kvm/vmx/pmu_intel.c @@ -37,23 +37,35 @@ static int fixed_pmc_events[] = {1, 0, 7}; static void reprogram_fixed_counters(struct kvm_pmu *pmu, u64 data) { + struct kvm_pmc *pmc; + u8 old_fixed_ctr_ctrl = pmu->fixed_ctr_ctrl; int i; + pmu->fixed_ctr_ctrl = data; for (i = 0; i < pmu->nr_arch_fixed_counters; i++) { u8 new_ctrl = fixed_ctrl_field(data, i); - u8 old_ctrl = fixed_ctrl_field(pmu->fixed_ctr_ctrl, i); - struct kvm_pmc *pmc; - - pmc = get_fixed_pmc(pmu, MSR_CORE_PERF_FIXED_CTR0 + i); + u8 old_ctrl = fixed_ctrl_field(old_fixed_ctr_ctrl, i); if (old_ctrl == new_ctrl) continue; + pmc = get_fixed_pmc(pmu, MSR_CORE_PERF_FIXED_CTR0 + i); + __set_bit(INTEL_PMC_IDX_FIXED + i, pmu->pmc_in_use); - reprogram_fixed_counter(pmc, new_ctrl, i); + reprogram_counter(pmc); } +} - pmu->fixed_ctr_ctrl = data; +static struct kvm_pmc *intel_pmc_idx_to_pmc(struct kvm_pmu *pmu, int pmc_idx) +{ + if (pmc_idx < INTEL_PMC_IDX_FIXED) { + return get_gp_pmc(pmu, MSR_P6_EVNTSEL0 + pmc_idx, + MSR_P6_EVNTSEL0); + } else { + u32 idx = pmc_idx - INTEL_PMC_IDX_FIXED; + + return get_fixed_pmc(pmu, idx + MSR_CORE_PERF_FIXED_CTR0); + } } /* function is called when global control register has been updated. */ @@ -61,14 +73,18 @@ static void global_ctrl_changed(struct kvm_pmu *pmu, u64 data) { int bit; u64 diff = pmu->global_ctrl ^ data; + struct kvm_pmc *pmc; pmu->global_ctrl = data; - for_each_set_bit(bit, (unsigned long *)&diff, X86_PMC_IDX_MAX) - reprogram_counter(pmu, bit); + for_each_set_bit(bit, (unsigned long *)&diff, X86_PMC_IDX_MAX) { + pmc = intel_pmc_idx_to_pmc(pmu, bit); + if (pmc) + reprogram_counter(pmc); + } } -static unsigned int intel_pmc_perf_hw_id(struct kvm_pmc *pmc) +static bool intel_hw_event_available(struct kvm_pmc *pmc) { struct kvm_pmu *pmu = pmc_to_pmu(pmc); u8 event_select = pmc->eventsel & ARCH_PERFMON_EVENTSEL_EVENT; @@ -82,15 +98,12 @@ static unsigned int intel_pmc_perf_hw_id(struct kvm_pmc *pmc) /* disable event that reported as not present by cpuid */ if ((i < 7) && !(pmu->available_event_types & (1 << i))) - return PERF_COUNT_HW_MAX + 1; + return false; break; } - if (i == ARRAY_SIZE(intel_arch_events)) - return PERF_COUNT_HW_MAX; - - return intel_arch_events[i].event_type; + return true; } /* check if a PMC is enabled by comparing it with globl_ctrl bits. */ @@ -98,19 +111,10 @@ static bool intel_pmc_is_enabled(struct kvm_pmc *pmc) { struct kvm_pmu *pmu = pmc_to_pmu(pmc); - return test_bit(pmc->idx, (unsigned long *)&pmu->global_ctrl); -} - -static struct kvm_pmc *intel_pmc_idx_to_pmc(struct kvm_pmu *pmu, int pmc_idx) -{ - if (pmc_idx < INTEL_PMC_IDX_FIXED) - return get_gp_pmc(pmu, MSR_P6_EVNTSEL0 + pmc_idx, - MSR_P6_EVNTSEL0); - else { - u32 idx = pmc_idx - INTEL_PMC_IDX_FIXED; + if (!intel_pmu_has_perf_global_ctrl(pmu)) + return true; - return get_fixed_pmc(pmu, idx + MSR_CORE_PERF_FIXED_CTR0); - } + return test_bit(pmc->idx, (unsigned long *)&pmu->global_ctrl); } static bool intel_is_valid_rdpmc_ecx(struct kvm_vcpu *vcpu, unsigned int idx) @@ -167,16 +171,6 @@ static inline struct kvm_pmc *get_fw_gp_pmc(struct kvm_pmu *pmu, u32 msr) return get_gp_pmc(pmu, msr, MSR_IA32_PMC0); } -bool intel_pmu_lbr_is_compatible(struct kvm_vcpu *vcpu) -{ - /* - * As a first step, a guest could only enable LBR feature if its - * cpu model is the same as the host because the LBR registers - * would be pass-through to the guest and they're model specific. - */ - return boot_cpu_data.x86_model == guest_cpuid_model(vcpu); -} - bool intel_pmu_lbr_is_enabled(struct kvm_vcpu *vcpu) { struct x86_pmu_lbr *lbr = vcpu_to_lbr_records(vcpu); @@ -205,6 +199,7 @@ static bool intel_pmu_is_valid_lbr_msr(struct kvm_vcpu *vcpu, u32 index) static bool intel_is_valid_msr(struct kvm_vcpu *vcpu, u32 msr) { struct kvm_pmu *pmu = vcpu_to_pmu(vcpu); + u64 perf_capabilities; int ret; switch (msr) { @@ -212,7 +207,18 @@ static bool intel_is_valid_msr(struct kvm_vcpu *vcpu, u32 msr) case MSR_CORE_PERF_GLOBAL_STATUS: case MSR_CORE_PERF_GLOBAL_CTRL: case MSR_CORE_PERF_GLOBAL_OVF_CTRL: - ret = pmu->version > 1; + return intel_pmu_has_perf_global_ctrl(pmu); + break; + case MSR_IA32_PEBS_ENABLE: + ret = vcpu_get_perf_capabilities(vcpu) & PERF_CAP_PEBS_FORMAT; + break; + case MSR_IA32_DS_AREA: + ret = guest_cpuid_has(vcpu, X86_FEATURE_DS); + break; + case MSR_PEBS_DATA_CFG: + perf_capabilities = vcpu_get_perf_capabilities(vcpu); + ret = (perf_capabilities & PERF_CAP_PEBS_BASELINE) && + ((perf_capabilities & PERF_CAP_PEBS_FORMAT) > 3); break; default: ret = get_gp_pmc(pmu, msr, MSR_IA32_PERFCTR0) || @@ -361,6 +367,15 @@ static int intel_pmu_get_msr(struct kvm_vcpu *vcpu, struct msr_data *msr_info) case MSR_CORE_PERF_GLOBAL_OVF_CTRL: msr_info->data = 0; return 0; + case MSR_IA32_PEBS_ENABLE: + msr_info->data = pmu->pebs_enable; + return 0; + case MSR_IA32_DS_AREA: + msr_info->data = pmu->ds_area; + return 0; + case MSR_PEBS_DATA_CFG: + msr_info->data = pmu->pebs_data_cfg; + return 0; default: if ((pmc = get_gp_pmc(pmu, msr, MSR_IA32_PERFCTR0)) || (pmc = get_gp_pmc(pmu, msr, MSR_IA32_PMC0))) { @@ -395,7 +410,7 @@ static int intel_pmu_set_msr(struct kvm_vcpu *vcpu, struct msr_data *msr_info) case MSR_CORE_PERF_FIXED_CTR_CTRL: if (pmu->fixed_ctr_ctrl == data) return 0; - if (!(data & 0xfffffffffffff444ull)) { + if (!(data & pmu->fixed_ctr_ctrl_mask)) { reprogram_fixed_counters(pmu, data); return 0; } @@ -421,6 +436,29 @@ static int intel_pmu_set_msr(struct kvm_vcpu *vcpu, struct msr_data *msr_info) return 0; } break; + case MSR_IA32_PEBS_ENABLE: + if (pmu->pebs_enable == data) + return 0; + if (!(data & pmu->pebs_enable_mask)) { + pmu->pebs_enable = data; + return 0; + } + break; + case MSR_IA32_DS_AREA: + if (msr_info->host_initiated && data && !guest_cpuid_has(vcpu, X86_FEATURE_DS)) + return 1; + if (is_noncanonical_address(data, vcpu)) + return 1; + pmu->ds_area = data; + return 0; + case MSR_PEBS_DATA_CFG: + if (pmu->pebs_data_cfg == data) + return 0; + if (!(data & pmu->pebs_data_cfg_mask)) { + pmu->pebs_data_cfg = data; + return 0; + } + break; default: if ((pmc = get_gp_pmc(pmu, msr, MSR_IA32_PERFCTR0)) || (pmc = get_gp_pmc(pmu, msr, MSR_IA32_PMC0))) { @@ -445,7 +483,8 @@ static int intel_pmu_set_msr(struct kvm_vcpu *vcpu, struct msr_data *msr_info) (pmu->raw_event_mask & HSW_IN_TX_CHECKPOINTED)) reserved_bits ^= HSW_IN_TX_CHECKPOINTED; if (!(data & reserved_bits)) { - reprogram_gp_counter(pmc, data); + pmc->eventsel = data; + reprogram_counter(pmc); return 0; } } else if (intel_pmu_handle_lbr_msrs_access(vcpu, msr_info, false)) @@ -474,11 +513,12 @@ static void intel_pmu_refresh(struct kvm_vcpu *vcpu) { struct kvm_pmu *pmu = vcpu_to_pmu(vcpu); struct lbr_desc *lbr_desc = vcpu_to_lbr_desc(vcpu); - - struct x86_pmu_capability x86_pmu; struct kvm_cpuid_entry2 *entry; union cpuid10_eax eax; union cpuid10_edx edx; + u64 perf_capabilities; + u64 counter_mask; + int i; pmu->nr_arch_gp_counters = 0; pmu->nr_arch_fixed_counters = 0; @@ -487,8 +527,13 @@ static void intel_pmu_refresh(struct kvm_vcpu *vcpu) pmu->version = 0; pmu->reserved_bits = 0xffffffff00200000ull; pmu->raw_event_mask = X86_RAW_EVENT_MASK; + pmu->global_ctrl_mask = ~0ull; + pmu->global_ovf_ctrl_mask = ~0ull; + pmu->fixed_ctr_ctrl_mask = ~0ull; + pmu->pebs_enable_mask = ~0ull; + pmu->pebs_data_cfg_mask = ~0ull; - entry = kvm_find_cpuid_entry(vcpu, 0xa, 0); + entry = kvm_find_cpuid_entry(vcpu, 0xa); if (!entry || !vcpu->kvm->arch.enable_pmu) return; eax.full = entry->eax; @@ -498,13 +543,13 @@ static void intel_pmu_refresh(struct kvm_vcpu *vcpu) if (!pmu->version) return; - perf_get_x86_pmu_capability(&x86_pmu); - pmu->nr_arch_gp_counters = min_t(int, eax.split.num_counters, - x86_pmu.num_counters_gp); - eax.split.bit_width = min_t(int, eax.split.bit_width, x86_pmu.bit_width_gp); + kvm_pmu_cap.num_counters_gp); + eax.split.bit_width = min_t(int, eax.split.bit_width, + kvm_pmu_cap.bit_width_gp); pmu->counter_bitmask[KVM_PMC_GP] = ((u64)1 << eax.split.bit_width) - 1; - eax.split.mask_length = min_t(int, eax.split.mask_length, x86_pmu.events_mask_len); + eax.split.mask_length = min_t(int, eax.split.mask_length, + kvm_pmu_cap.events_mask_len); pmu->available_event_types = ~entry->ebx & ((1ull << eax.split.mask_length) - 1); @@ -514,17 +559,19 @@ static void intel_pmu_refresh(struct kvm_vcpu *vcpu) pmu->nr_arch_fixed_counters = min3(ARRAY_SIZE(fixed_pmc_events), (size_t) edx.split.num_counters_fixed, - (size_t) x86_pmu.num_counters_fixed); - edx.split.bit_width_fixed = min_t(int, - edx.split.bit_width_fixed, x86_pmu.bit_width_fixed); + (size_t)kvm_pmu_cap.num_counters_fixed); + edx.split.bit_width_fixed = min_t(int, edx.split.bit_width_fixed, + kvm_pmu_cap.bit_width_fixed); pmu->counter_bitmask[KVM_PMC_FIXED] = ((u64)1 << edx.split.bit_width_fixed) - 1; setup_fixed_pmc_eventsel(pmu); } - pmu->global_ctrl = ((1ull << pmu->nr_arch_gp_counters) - 1) | - (((1ull << pmu->nr_arch_fixed_counters) - 1) << INTEL_PMC_IDX_FIXED); - pmu->global_ctrl_mask = ~pmu->global_ctrl; + for (i = 0; i < pmu->nr_arch_fixed_counters; i++) + pmu->fixed_ctr_ctrl_mask &= ~(0xbull << (i * 4)); + counter_mask = ~(((1ull << pmu->nr_arch_gp_counters) - 1) | + (((1ull << pmu->nr_arch_fixed_counters) - 1) << INTEL_PMC_IDX_FIXED)); + pmu->global_ctrl_mask = counter_mask; pmu->global_ovf_ctrl_mask = pmu->global_ctrl_mask & ~(MSR_CORE_PERF_GLOBAL_OVF_CTRL_OVF_BUF | MSR_CORE_PERF_GLOBAL_OVF_CTRL_COND_CHGD); @@ -532,7 +579,7 @@ static void intel_pmu_refresh(struct kvm_vcpu *vcpu) pmu->global_ovf_ctrl_mask &= ~MSR_CORE_PERF_GLOBAL_OVF_CTRL_TRACE_TOPA_PMI; - entry = kvm_find_cpuid_entry(vcpu, 7, 0); + entry = kvm_find_cpuid_entry_index(vcpu, 7, 0); if (entry && (boot_cpu_has(X86_FEATURE_HLE) || boot_cpu_has(X86_FEATURE_RTM)) && (entry->ebx & (X86_FEATURE_HLE|X86_FEATURE_RTM))) { @@ -545,16 +592,29 @@ static void intel_pmu_refresh(struct kvm_vcpu *vcpu) bitmap_set(pmu->all_valid_pmc_idx, INTEL_PMC_MAX_GENERIC, pmu->nr_arch_fixed_counters); - nested_vmx_pmu_refresh(vcpu, - intel_is_valid_msr(vcpu, MSR_CORE_PERF_GLOBAL_CTRL)); - - if (intel_pmu_lbr_is_compatible(vcpu)) + if (cpuid_model_is_consistent(vcpu)) x86_perf_get_lbr(&lbr_desc->records); else lbr_desc->records.nr = 0; if (lbr_desc->records.nr) bitmap_set(pmu->all_valid_pmc_idx, INTEL_PMC_IDX_FIXED_VLBR, 1); + + perf_capabilities = vcpu_get_perf_capabilities(vcpu); + if (perf_capabilities & PERF_CAP_PEBS_FORMAT) { + if (perf_capabilities & PERF_CAP_PEBS_BASELINE) { + pmu->pebs_enable_mask = counter_mask; + pmu->reserved_bits &= ~ICL_EVENTSEL_ADAPTIVE; + for (i = 0; i < pmu->nr_arch_fixed_counters; i++) { + pmu->fixed_ctr_ctrl_mask &= + ~(1ULL << (INTEL_PMC_IDX_FIXED + i * 4)); + } + pmu->pebs_data_cfg_mask = ~0xff00000full; + } else { + pmu->pebs_enable_mask = + ~((1ull << pmu->nr_arch_gp_counters) - 1); + } + } } static void intel_pmu_init(struct kvm_vcpu *vcpu) @@ -719,8 +779,28 @@ static void intel_pmu_cleanup(struct kvm_vcpu *vcpu) intel_pmu_release_guest_lbr_event(vcpu); } +void intel_pmu_cross_mapped_check(struct kvm_pmu *pmu) +{ + struct kvm_pmc *pmc = NULL; + int bit; + + for_each_set_bit(bit, (unsigned long *)&pmu->global_ctrl, + X86_PMC_IDX_MAX) { + pmc = intel_pmc_idx_to_pmc(pmu, bit); + + if (!pmc || !pmc_speculative_in_use(pmc) || + !intel_pmc_is_enabled(pmc)) + continue; + + if (pmc->perf_event && pmc->idx != pmc->perf_event->hw.idx) { + pmu->host_cross_mapped_mask |= + BIT_ULL(pmc->perf_event->hw.idx); + } + } +} + struct kvm_pmu_ops intel_pmu_ops __initdata = { - .pmc_perf_hw_id = intel_pmc_perf_hw_id, + .hw_event_available = intel_hw_event_available, .pmc_is_enabled = intel_pmc_is_enabled, .pmc_idx_to_pmc = intel_pmc_idx_to_pmc, .rdpmc_ecx_to_pmc = intel_rdpmc_ecx_to_pmc, diff --git a/arch/x86/kvm/vmx/posted_intr.c b/arch/x86/kvm/vmx/posted_intr.c index 07e5fcf5a5aa..1b56c5e5c9fb 100644 --- a/arch/x86/kvm/vmx/posted_intr.c +++ b/arch/x86/kvm/vmx/posted_intr.c @@ -34,7 +34,7 @@ static inline struct pi_desc *vcpu_to_pi_desc(struct kvm_vcpu *vcpu) return &(to_vmx(vcpu)->pi_desc); } -static int pi_try_set_control(struct pi_desc *pi_desc, u64 old, u64 new) +static int pi_try_set_control(struct pi_desc *pi_desc, u64 *pold, u64 new) { /* * PID.ON can be set at any time by a different vCPU or by hardware, @@ -42,7 +42,7 @@ static int pi_try_set_control(struct pi_desc *pi_desc, u64 old, u64 new) * update must be retried with a fresh snapshot an ON change causes * the cmpxchg to fail. */ - if (cmpxchg64(&pi_desc->control, old, new) != old) + if (!try_cmpxchg64(&pi_desc->control, pold, new)) return -EBUSY; return 0; @@ -96,8 +96,9 @@ void vmx_vcpu_pi_load(struct kvm_vcpu *vcpu, int cpu) if (!x2apic_mode) dest = (dest << 8) & 0xFF00; + old.control = READ_ONCE(pi_desc->control); do { - old.control = new.control = READ_ONCE(pi_desc->control); + new.control = old.control; /* * Clear SN (as above) and refresh the destination APIC ID to @@ -111,7 +112,7 @@ void vmx_vcpu_pi_load(struct kvm_vcpu *vcpu, int cpu) * descriptor was modified on "put" to use the wakeup vector. */ new.nv = POSTED_INTR_VECTOR; - } while (pi_try_set_control(pi_desc, old.control, new.control)); + } while (pi_try_set_control(pi_desc, &old.control, new.control)); local_irq_restore(flags); @@ -156,12 +157,12 @@ static void pi_enable_wakeup_handler(struct kvm_vcpu *vcpu) WARN(pi_desc->sn, "PI descriptor SN field set before blocking"); + old.control = READ_ONCE(pi_desc->control); do { - old.control = new.control = READ_ONCE(pi_desc->control); - /* set 'NV' to 'wakeup vector' */ + new.control = old.control; new.nv = POSTED_INTR_WAKEUP_VECTOR; - } while (pi_try_set_control(pi_desc, old.control, new.control)); + } while (pi_try_set_control(pi_desc, &old.control, new.control)); /* * Send a wakeup IPI to this CPU if an interrupt may have been posted @@ -177,11 +178,24 @@ static void pi_enable_wakeup_handler(struct kvm_vcpu *vcpu) local_irq_restore(flags); } +static bool vmx_needs_pi_wakeup(struct kvm_vcpu *vcpu) +{ + /* + * The default posted interrupt vector does nothing when + * invoked outside guest mode. Return whether a blocked vCPU + * can be the target of posted interrupts, as is the case when + * using either IPI virtualization or VT-d PI, so that the + * notification vector is switched to the one that calls + * back to the pi_wakeup_handler() function. + */ + return vmx_can_use_ipiv(vcpu) || vmx_can_use_vtd_pi(vcpu->kvm); +} + void vmx_vcpu_pi_put(struct kvm_vcpu *vcpu) { struct pi_desc *pi_desc = vcpu_to_pi_desc(vcpu); - if (!vmx_can_use_vtd_pi(vcpu->kvm)) + if (!vmx_needs_pi_wakeup(vcpu)) return; if (kvm_vcpu_is_blocking(vcpu) && !vmx_interrupt_blocked(vcpu)) diff --git a/arch/x86/kvm/vmx/posted_intr.h b/arch/x86/kvm/vmx/posted_intr.h index 9a45d5c9f116..26992076552e 100644 --- a/arch/x86/kvm/vmx/posted_intr.h +++ b/arch/x86/kvm/vmx/posted_intr.h @@ -5,6 +5,8 @@ #define POSTED_INTR_ON 0 #define POSTED_INTR_SN 1 +#define PID_TABLE_ENTRY_VALID 1 + /* Posted-Interrupt Descriptor */ struct pi_desc { u32 pir[8]; /* Posted interrupt requested */ diff --git a/arch/x86/kvm/vmx/run_flags.h b/arch/x86/kvm/vmx/run_flags.h new file mode 100644 index 000000000000..edc3f16cc189 --- /dev/null +++ b/arch/x86/kvm/vmx/run_flags.h @@ -0,0 +1,8 @@ +/* SPDX-License-Identifier: GPL-2.0 */ +#ifndef __KVM_X86_VMX_RUN_FLAGS_H +#define __KVM_X86_VMX_RUN_FLAGS_H + +#define VMX_RUN_VMRESUME (1 << 0) +#define VMX_RUN_SAVE_SPEC_CTRL (1 << 1) + +#endif /* __KVM_X86_VMX_RUN_FLAGS_H */ diff --git a/arch/x86/kvm/vmx/sgx.c b/arch/x86/kvm/vmx/sgx.c index 35e7ec91ae86..aba8cebdc587 100644 --- a/arch/x86/kvm/vmx/sgx.c +++ b/arch/x86/kvm/vmx/sgx.c @@ -79,7 +79,7 @@ static int sgx_gva_to_gpa(struct kvm_vcpu *vcpu, gva_t gva, bool write, else *gpa = kvm_mmu_gva_to_gpa_read(vcpu, gva, &ex); - if (*gpa == UNMAPPED_GVA) { + if (*gpa == INVALID_GPA) { kvm_inject_emulated_page_fault(vcpu, &ex); return -EFAULT; } @@ -148,8 +148,8 @@ static int __handle_encls_ecreate(struct kvm_vcpu *vcpu, u8 max_size_log2; int trapnr, ret; - sgx_12_0 = kvm_find_cpuid_entry(vcpu, 0x12, 0); - sgx_12_1 = kvm_find_cpuid_entry(vcpu, 0x12, 1); + sgx_12_0 = kvm_find_cpuid_entry_index(vcpu, 0x12, 0); + sgx_12_1 = kvm_find_cpuid_entry_index(vcpu, 0x12, 1); if (!sgx_12_0 || !sgx_12_1) { kvm_prepare_emulation_failure_exit(vcpu); return 0; @@ -431,7 +431,7 @@ static bool sgx_intercept_encls_ecreate(struct kvm_vcpu *vcpu) if (!vcpu->kvm->arch.sgx_provisioning_allowed) return true; - guest_cpuid = kvm_find_cpuid_entry(vcpu, 0x12, 0); + guest_cpuid = kvm_find_cpuid_entry_index(vcpu, 0x12, 0); if (!guest_cpuid) return true; @@ -439,7 +439,7 @@ static bool sgx_intercept_encls_ecreate(struct kvm_vcpu *vcpu) if (guest_cpuid->ebx != ebx || guest_cpuid->edx != edx) return true; - guest_cpuid = kvm_find_cpuid_entry(vcpu, 0x12, 1); + guest_cpuid = kvm_find_cpuid_entry_index(vcpu, 0x12, 1); if (!guest_cpuid) return true; diff --git a/arch/x86/kvm/vmx/vmcs.h b/arch/x86/kvm/vmx/vmcs.h index 2b9d7a7e83f7..ac290a44a693 100644 --- a/arch/x86/kvm/vmx/vmcs.h +++ b/arch/x86/kvm/vmx/vmcs.h @@ -50,6 +50,7 @@ struct vmcs_controls_shadow { u32 pin; u32 exec; u32 secondary_exec; + u64 tertiary_exec; }; /* diff --git a/arch/x86/kvm/vmx/vmenter.S b/arch/x86/kvm/vmx/vmenter.S index 435c187927c4..4182c7ffc909 100644 --- a/arch/x86/kvm/vmx/vmenter.S +++ b/arch/x86/kvm/vmx/vmenter.S @@ -1,10 +1,13 @@ /* SPDX-License-Identifier: GPL-2.0 */ #include <linux/linkage.h> #include <asm/asm.h> +#include <asm/asm-offsets.h> #include <asm/bitsperlong.h> #include <asm/kvm_vcpu_regs.h> #include <asm/nospec-branch.h> +#include <asm/percpu.h> #include <asm/segment.h> +#include "run_flags.h" #define WORD_SIZE (BITS_PER_LONG / 8) @@ -31,72 +34,11 @@ .section .noinstr.text, "ax" /** - * vmx_vmenter - VM-Enter the current loaded VMCS - * - * %RFLAGS.ZF: !VMCS.LAUNCHED, i.e. controls VMLAUNCH vs. VMRESUME - * - * Returns: - * %RFLAGS.CF is set on VM-Fail Invalid - * %RFLAGS.ZF is set on VM-Fail Valid - * %RFLAGS.{CF,ZF} are cleared on VM-Success, i.e. VM-Exit - * - * Note that VMRESUME/VMLAUNCH fall-through and return directly if - * they VM-Fail, whereas a successful VM-Enter + VM-Exit will jump - * to vmx_vmexit. - */ -SYM_FUNC_START_LOCAL(vmx_vmenter) - /* EFLAGS.ZF is set if VMCS.LAUNCHED == 0 */ - je 2f - -1: vmresume - RET - -2: vmlaunch - RET - -3: cmpb $0, kvm_rebooting - je 4f - RET -4: ud2 - - _ASM_EXTABLE(1b, 3b) - _ASM_EXTABLE(2b, 3b) - -SYM_FUNC_END(vmx_vmenter) - -/** - * vmx_vmexit - Handle a VMX VM-Exit - * - * Returns: - * %RFLAGS.{CF,ZF} are cleared on VM-Success, i.e. VM-Exit - * - * This is vmx_vmenter's partner in crime. On a VM-Exit, control will jump - * here after hardware loads the host's state, i.e. this is the destination - * referred to by VMCS.HOST_RIP. - */ -SYM_FUNC_START(vmx_vmexit) -#ifdef CONFIG_RETPOLINE - ALTERNATIVE "jmp .Lvmexit_skip_rsb", "", X86_FEATURE_RETPOLINE - /* Preserve guest's RAX, it's used to stuff the RSB. */ - push %_ASM_AX - - /* IMPORTANT: Stuff the RSB immediately after VM-Exit, before RET! */ - FILL_RETURN_BUFFER %_ASM_AX, RSB_CLEAR_LOOPS, X86_FEATURE_RETPOLINE - - /* Clear RFLAGS.CF and RFLAGS.ZF to preserve VM-Exit, i.e. !VM-Fail. */ - or $1, %_ASM_AX - - pop %_ASM_AX -.Lvmexit_skip_rsb: -#endif - RET -SYM_FUNC_END(vmx_vmexit) - -/** * __vmx_vcpu_run - Run a vCPU via a transition to VMX guest mode - * @vmx: struct vcpu_vmx * (forwarded to vmx_update_host_rsp) + * @vmx: struct vcpu_vmx * * @regs: unsigned long * (to guest registers) - * @launched: %true if the VMCS has been launched + * @flags: VMX_RUN_VMRESUME: use VMRESUME instead of VMLAUNCH + * VMX_RUN_SAVE_SPEC_CTRL: save guest SPEC_CTRL into vmx->spec_ctrl * * Returns: * 0 on VM-Exit, 1 on VM-Fail @@ -115,24 +57,56 @@ SYM_FUNC_START(__vmx_vcpu_run) #endif push %_ASM_BX + /* Save @vmx for SPEC_CTRL handling */ + push %_ASM_ARG1 + + /* Save @flags for SPEC_CTRL handling */ + push %_ASM_ARG3 + /* * Save @regs, _ASM_ARG2 may be modified by vmx_update_host_rsp() and * @regs is needed after VM-Exit to save the guest's register values. */ push %_ASM_ARG2 - /* Copy @launched to BL, _ASM_ARG3 is volatile. */ + /* Copy @flags to BL, _ASM_ARG3 is volatile. */ mov %_ASM_ARG3B, %bl - /* Adjust RSP to account for the CALL to vmx_vmenter(). */ - lea -WORD_SIZE(%_ASM_SP), %_ASM_ARG2 + lea (%_ASM_SP), %_ASM_ARG2 call vmx_update_host_rsp + ALTERNATIVE "jmp .Lspec_ctrl_done", "", X86_FEATURE_MSR_SPEC_CTRL + + /* + * SPEC_CTRL handling: if the guest's SPEC_CTRL value differs from the + * host's, write the MSR. + * + * IMPORTANT: To avoid RSB underflow attacks and any other nastiness, + * there must not be any returns or indirect branches between this code + * and vmentry. + */ + mov 2*WORD_SIZE(%_ASM_SP), %_ASM_DI + movl VMX_spec_ctrl(%_ASM_DI), %edi + movl PER_CPU_VAR(x86_spec_ctrl_current), %esi + cmp %edi, %esi + je .Lspec_ctrl_done + mov $MSR_IA32_SPEC_CTRL, %ecx + xor %edx, %edx + mov %edi, %eax + wrmsr + +.Lspec_ctrl_done: + + /* + * Since vmentry is serializing on affected CPUs, there's no need for + * an LFENCE to stop speculation from skipping the wrmsr. + */ + /* Load @regs to RAX. */ mov (%_ASM_SP), %_ASM_AX /* Check if vmlaunch or vmresume is needed */ - testb %bl, %bl + testb $VMX_RUN_VMRESUME, %bl /* Load guest registers. Don't clobber flags. */ mov VCPU_RCX(%_ASM_AX), %_ASM_CX @@ -154,11 +128,37 @@ SYM_FUNC_START(__vmx_vcpu_run) /* Load guest RAX. This kills the @regs pointer! */ mov VCPU_RAX(%_ASM_AX), %_ASM_AX - /* Enter guest mode */ - call vmx_vmenter + /* Check EFLAGS.ZF from 'testb' above */ + jz .Lvmlaunch + + /* + * After a successful VMRESUME/VMLAUNCH, control flow "magically" + * resumes below at 'vmx_vmexit' due to the VMCS HOST_RIP setting. + * So this isn't a typical function and objtool needs to be told to + * save the unwind state here and restore it below. + */ + UNWIND_HINT_SAVE + +/* + * If VMRESUME/VMLAUNCH and corresponding vmexit succeed, execution resumes at + * the 'vmx_vmexit' label below. + */ +.Lvmresume: + vmresume + jmp .Lvmfail + +.Lvmlaunch: + vmlaunch + jmp .Lvmfail - /* Jump on VM-Fail. */ - jbe 2f + _ASM_EXTABLE(.Lvmresume, .Lfixup) + _ASM_EXTABLE(.Lvmlaunch, .Lfixup) + +SYM_INNER_LABEL(vmx_vmexit, SYM_L_GLOBAL) + + /* Restore unwind state from before the VMRESUME/VMLAUNCH. */ + UNWIND_HINT_RESTORE + ENDBR /* Temporarily save guest's RAX. */ push %_ASM_AX @@ -185,21 +185,23 @@ SYM_FUNC_START(__vmx_vcpu_run) mov %r15, VCPU_R15(%_ASM_AX) #endif - /* Clear RAX to indicate VM-Exit (as opposed to VM-Fail). */ - xor %eax, %eax + /* Clear return value to indicate VM-Exit (as opposed to VM-Fail). */ + xor %ebx, %ebx +.Lclear_regs: /* - * Clear all general purpose registers except RSP and RAX to prevent + * Clear all general purpose registers except RSP and RBX to prevent * speculative use of the guest's values, even those that are reloaded * via the stack. In theory, an L1 cache miss when restoring registers * could lead to speculative execution with the guest's values. * Zeroing XORs are dirt cheap, i.e. the extra paranoia is essentially * free. RSP and RAX are exempt as RSP is restored by hardware during - * VM-Exit and RAX is explicitly loaded with 0 or 1 to return VM-Fail. + * VM-Exit and RBX is explicitly loaded with 0 or 1 to hold the return + * value. */ -1: xor %ecx, %ecx + xor %eax, %eax + xor %ecx, %ecx xor %edx, %edx - xor %ebx, %ebx xor %ebp, %ebp xor %esi, %esi xor %edi, %edi @@ -216,8 +218,30 @@ SYM_FUNC_START(__vmx_vcpu_run) /* "POP" @regs. */ add $WORD_SIZE, %_ASM_SP - pop %_ASM_BX + /* + * IMPORTANT: RSB filling and SPEC_CTRL handling must be done before + * the first unbalanced RET after vmexit! + * + * For retpoline or IBRS, RSB filling is needed to prevent poisoned RSB + * entries and (in some cases) RSB underflow. + * + * eIBRS has its own protection against poisoned RSB, so it doesn't + * need the RSB filling sequence. But it does need to be enabled + * before the first unbalanced RET. + */ + + FILL_RETURN_BUFFER %_ASM_CX, RSB_CLEAR_LOOPS, X86_FEATURE_RSB_VMEXIT + + pop %_ASM_ARG2 /* @flags */ + pop %_ASM_ARG1 /* @vmx */ + + call vmx_spec_ctrl_restore_host + + /* Put return value in AX */ + mov %_ASM_BX, %_ASM_AX + + pop %_ASM_BX #ifdef CONFIG_X86_64 pop %r12 pop %r13 @@ -230,9 +254,15 @@ SYM_FUNC_START(__vmx_vcpu_run) pop %_ASM_BP RET - /* VM-Fail. Out-of-line to avoid a taken Jcc after VM-Exit. */ -2: mov $1, %eax - jmp 1b +.Lfixup: + cmpb $0, kvm_rebooting + jne .Lvmfail + ud2 +.Lvmfail: + /* VM-Fail: set return value to 1 */ + mov $1, %_ASM_BX + jmp .Lclear_regs + SYM_FUNC_END(__vmx_vcpu_run) diff --git a/arch/x86/kvm/vmx/vmx.c b/arch/x86/kvm/vmx/vmx.c index 3a919e49129b..d7f8331d6f7e 100644 --- a/arch/x86/kvm/vmx/vmx.c +++ b/arch/x86/kvm/vmx/vmx.c @@ -105,6 +105,9 @@ module_param(fasteoi, bool, S_IRUGO); module_param(enable_apicv, bool, S_IRUGO); +bool __read_mostly enable_ipiv = true; +module_param(enable_ipiv, bool, 0444); + /* * If nested=1, nested virtualization is supported, i.e., guests may use * VMX and be a hypervisor for its own guests. If nested=0, guests may not @@ -116,6 +119,9 @@ module_param(nested, bool, S_IRUGO); bool __read_mostly enable_pml = 1; module_param_named(pml, enable_pml, bool, S_IRUGO); +static bool __read_mostly error_on_inconsistent_vmcs_config = true; +module_param(error_on_inconsistent_vmcs_config, bool, 0444); + static bool __read_mostly dump_invalid_vmcs = 0; module_param(dump_invalid_vmcs, bool, 0644); @@ -383,9 +389,9 @@ static __always_inline void vmx_disable_fb_clear(struct vcpu_vmx *vmx) if (!vmx->disable_fb_clear) return; - rdmsrl(MSR_IA32_MCU_OPT_CTRL, msr); + msr = __rdmsr(MSR_IA32_MCU_OPT_CTRL); msr |= FB_CLEAR_DIS; - wrmsrl(MSR_IA32_MCU_OPT_CTRL, msr); + native_wrmsrl(MSR_IA32_MCU_OPT_CTRL, msr); /* Cache the MSR value to avoid reading it later */ vmx->msr_ia32_mcu_opt_ctrl = msr; } @@ -396,7 +402,7 @@ static __always_inline void vmx_enable_fb_clear(struct vcpu_vmx *vmx) return; vmx->msr_ia32_mcu_opt_ctrl &= ~FB_CLEAR_DIS; - wrmsrl(MSR_IA32_MCU_OPT_CTRL, vmx->msr_ia32_mcu_opt_ctrl); + native_wrmsrl(MSR_IA32_MCU_OPT_CTRL, vmx->msr_ia32_mcu_opt_ctrl); } static void vmx_update_fb_clear_dis(struct kvm_vcpu *vcpu, struct vcpu_vmx *vmx) @@ -443,18 +449,20 @@ asmlinkage void vmread_error(unsigned long field, bool fault) noinline void vmwrite_error(unsigned long field, unsigned long value) { - vmx_insn_failed("kvm: vmwrite failed: field=%lx val=%lx err=%d\n", + vmx_insn_failed("kvm: vmwrite failed: field=%lx val=%lx err=%u\n", field, value, vmcs_read32(VM_INSTRUCTION_ERROR)); } noinline void vmclear_error(struct vmcs *vmcs, u64 phys_addr) { - vmx_insn_failed("kvm: vmclear failed: %p/%llx\n", vmcs, phys_addr); + vmx_insn_failed("kvm: vmclear failed: %p/%llx err=%u\n", + vmcs, phys_addr, vmcs_read32(VM_INSTRUCTION_ERROR)); } noinline void vmptrld_error(struct vmcs *vmcs, u64 phys_addr) { - vmx_insn_failed("kvm: vmptrld failed: %p/%llx\n", vmcs, phys_addr); + vmx_insn_failed("kvm: vmptrld failed: %p/%llx err=%u\n", + vmcs, phys_addr, vmcs_read32(VM_INSTRUCTION_ERROR)); } noinline void invvpid_error(unsigned long ext, u16 vpid, gva_t gva) @@ -839,6 +847,24 @@ static bool msr_write_intercepted(struct vcpu_vmx *vmx, u32 msr) MSR_IA32_SPEC_CTRL); } +unsigned int __vmx_vcpu_run_flags(struct vcpu_vmx *vmx) +{ + unsigned int flags = 0; + + if (vmx->loaded_vmcs->launched) + flags |= VMX_RUN_VMRESUME; + + /* + * If writes to the SPEC_CTRL MSR aren't intercepted, the guest is free + * to change it directly without causing a vmexit. In that case read + * it after vmexit and store it in vmx->spec_ctrl. + */ + if (unlikely(!msr_write_intercepted(vmx, MSR_IA32_SPEC_CTRL))) + flags |= VMX_RUN_SAVE_SPEC_CTRL; + + return flags; +} + static void clear_atomic_switch_msr_special(struct vcpu_vmx *vmx, unsigned long entry, unsigned long exit) { @@ -1769,7 +1795,7 @@ u64 vmx_get_l2_tsc_multiplier(struct kvm_vcpu *vcpu) nested_cpu_has2(vmcs12, SECONDARY_EXEC_TSC_SCALING)) return vmcs12->tsc_multiplier; - return kvm_default_tsc_scaling_ratio; + return kvm_caps.default_tsc_scaling_ratio; } static void vmx_write_tsc_offset(struct kvm_vcpu *vcpu, u64 offset) @@ -2093,6 +2119,12 @@ static int vmx_set_msr(struct kvm_vcpu *vcpu, struct msr_data *msr_info) if (is_noncanonical_address(data & PAGE_MASK, vcpu) || (data & MSR_IA32_BNDCFGS_RSVD)) return 1; + + if (is_guest_mode(vcpu) && + ((vmx->nested.msrs.entry_ctls_high & VM_ENTRY_LOAD_BNDCFGS) || + (vmx->nested.msrs.exit_ctls_high & VM_EXIT_CLEAR_BNDCFGS))) + get_vmcs12(vcpu)->guest_bndcfgs = data; + vmcs_write64(GUEST_BNDCFGS, data); break; case MSR_IA32_UMWAIT_CONTROL: @@ -2294,7 +2326,18 @@ static int vmx_set_msr(struct kvm_vcpu *vcpu, struct msr_data *msr_info) if ((data & PMU_CAP_LBR_FMT) != (vmx_get_perf_capabilities() & PMU_CAP_LBR_FMT)) return 1; - if (!intel_pmu_lbr_is_compatible(vcpu)) + if (!cpuid_model_is_consistent(vcpu)) + return 1; + } + if (data & PERF_CAP_PEBS_FORMAT) { + if ((data & PERF_CAP_PEBS_MASK) != + (vmx_get_perf_capabilities() & PERF_CAP_PEBS_MASK)) + return 1; + if (!guest_cpuid_has(vcpu, X86_FEATURE_DS)) + return 1; + if (!guest_cpuid_has(vcpu, X86_FEATURE_DTES64)) + return 1; + if (!cpuid_model_is_consistent(vcpu)) return 1; } ret = kvm_set_msr_common(vcpu, msr_info); @@ -2471,6 +2514,15 @@ static __init int adjust_vmx_controls(u32 ctl_min, u32 ctl_opt, return 0; } +static __init u64 adjust_vmx_controls64(u64 ctl_opt, u32 msr) +{ + u64 allowed; + + rdmsrl(msr, allowed); + + return ctl_opt & allowed; +} + static __init int setup_vmcs_config(struct vmcs_config *vmcs_conf, struct vmx_capability *vmx_cap) { @@ -2479,8 +2531,26 @@ static __init int setup_vmcs_config(struct vmcs_config *vmcs_conf, u32 _pin_based_exec_control = 0; u32 _cpu_based_exec_control = 0; u32 _cpu_based_2nd_exec_control = 0; + u64 _cpu_based_3rd_exec_control = 0; u32 _vmexit_control = 0; u32 _vmentry_control = 0; + int i; + + /* + * LOAD/SAVE_DEBUG_CONTROLS are absent because both are mandatory. + * SAVE_IA32_PAT and SAVE_IA32_EFER are absent because KVM always + * intercepts writes to PAT and EFER, i.e. never enables those controls. + */ + struct { + u32 entry_control; + u32 exit_control; + } const vmcs_entry_exit_pairs[] = { + { VM_ENTRY_LOAD_IA32_PERF_GLOBAL_CTRL, VM_EXIT_LOAD_IA32_PERF_GLOBAL_CTRL }, + { VM_ENTRY_LOAD_IA32_PAT, VM_EXIT_LOAD_IA32_PAT }, + { VM_ENTRY_LOAD_IA32_EFER, VM_EXIT_LOAD_IA32_EFER }, + { VM_ENTRY_LOAD_BNDCFGS, VM_EXIT_CLEAR_BNDCFGS }, + { VM_ENTRY_LOAD_IA32_RTIT_CTL, VM_EXIT_CLEAR_IA32_RTIT_CTL }, + }; memset(vmcs_conf, 0, sizeof(*vmcs_conf)); min = CPU_BASED_HLT_EXITING | @@ -2500,7 +2570,8 @@ static __init int setup_vmcs_config(struct vmcs_config *vmcs_conf, opt = CPU_BASED_TPR_SHADOW | CPU_BASED_USE_MSR_BITMAPS | - CPU_BASED_ACTIVATE_SECONDARY_CONTROLS; + CPU_BASED_ACTIVATE_SECONDARY_CONTROLS | + CPU_BASED_ACTIVATE_TERTIARY_CONTROLS; if (adjust_vmx_controls(min, opt, MSR_IA32_VMX_PROCBASED_CTLS, &_cpu_based_exec_control) < 0) return -EIO; @@ -2533,7 +2604,8 @@ static __init int setup_vmcs_config(struct vmcs_config *vmcs_conf, SECONDARY_EXEC_PT_USE_GPA | SECONDARY_EXEC_PT_CONCEAL_VMX | SECONDARY_EXEC_ENABLE_VMFUNC | - SECONDARY_EXEC_BUS_LOCK_DETECTION; + SECONDARY_EXEC_BUS_LOCK_DETECTION | + SECONDARY_EXEC_NOTIFY_VM_EXITING; if (cpu_has_sgx()) opt2 |= SECONDARY_EXEC_ENCLS_EXITING; if (adjust_vmx_controls(min2, opt2, @@ -2563,15 +2635,30 @@ static __init int setup_vmcs_config(struct vmcs_config *vmcs_conf, CPU_BASED_CR3_STORE_EXITING | CPU_BASED_INVLPG_EXITING); } else if (vmx_cap->ept) { - vmx_cap->ept = 0; pr_warn_once("EPT CAP should not exist if not support " "1-setting enable EPT VM-execution control\n"); + + if (error_on_inconsistent_vmcs_config) + return -EIO; + + vmx_cap->ept = 0; } if (!(_cpu_based_2nd_exec_control & SECONDARY_EXEC_ENABLE_VPID) && - vmx_cap->vpid) { - vmx_cap->vpid = 0; + vmx_cap->vpid) { pr_warn_once("VPID CAP should not exist if not support " "1-setting enable VPID VM-execution control\n"); + + if (error_on_inconsistent_vmcs_config) + return -EIO; + + vmx_cap->vpid = 0; + } + + if (_cpu_based_exec_control & CPU_BASED_ACTIVATE_TERTIARY_CONTROLS) { + u64 opt3 = TERTIARY_EXEC_IPI_VIRT; + + _cpu_based_3rd_exec_control = adjust_vmx_controls64(opt3, + MSR_IA32_VMX_PROCBASED_CTLS3); } min = VM_EXIT_SAVE_DEBUG_CONTROLS | VM_EXIT_ACK_INTR_ON_EXIT; @@ -2612,6 +2699,23 @@ static __init int setup_vmcs_config(struct vmcs_config *vmcs_conf, &_vmentry_control) < 0) return -EIO; + for (i = 0; i < ARRAY_SIZE(vmcs_entry_exit_pairs); i++) { + u32 n_ctrl = vmcs_entry_exit_pairs[i].entry_control; + u32 x_ctrl = vmcs_entry_exit_pairs[i].exit_control; + + if (!(_vmentry_control & n_ctrl) == !(_vmexit_control & x_ctrl)) + continue; + + pr_warn_once("Inconsistent VM-Entry/VM-Exit pair, entry = %x, exit = %x\n", + _vmentry_control & n_ctrl, _vmexit_control & x_ctrl); + + if (error_on_inconsistent_vmcs_config) + return -EIO; + + _vmentry_control &= ~n_ctrl; + _vmexit_control &= ~x_ctrl; + } + /* * Some cpus support VM_{ENTRY,EXIT}_IA32_PERF_GLOBAL_CTRL but they * can't be used due to an errata where VM Exit may incorrectly clear @@ -2660,6 +2764,7 @@ static __init int setup_vmcs_config(struct vmcs_config *vmcs_conf, vmcs_conf->pin_based_exec_ctrl = _pin_based_exec_control; vmcs_conf->cpu_based_exec_ctrl = _cpu_based_exec_control; vmcs_conf->cpu_based_2nd_exec_ctrl = _cpu_based_2nd_exec_control; + vmcs_conf->cpu_based_3rd_exec_ctrl = _cpu_based_3rd_exec_control; vmcs_conf->vmexit_ctrl = _vmexit_control; vmcs_conf->vmentry_ctrl = _vmentry_control; @@ -3212,8 +3317,8 @@ static bool vmx_is_valid_cr4(struct kvm_vcpu *vcpu, unsigned long cr4) { /* * We operate under the default treatment of SMM, so VMX cannot be - * enabled under SMM. Note, whether or not VMXE is allowed at all is - * handled by kvm_is_valid_cr4(). + * enabled under SMM. Note, whether or not VMXE is allowed at all, + * i.e. is a reserved bit, is handled by common x86 code. */ if ((cr4 & X86_CR4_VMXE) && is_smm(vcpu)) return false; @@ -3684,7 +3789,7 @@ static int init_rmode_identity_map(struct kvm *kvm) } /* Set up identity-mapping pagetable for EPT in real mode */ - for (i = 0; i < PT32_ENT_PER_PAGE; i++) { + for (i = 0; i < (PAGE_SIZE / sizeof(tmp)); i++) { tmp = (i << 22) + (_PAGE_PRESENT | _PAGE_RW | _PAGE_USER | _PAGE_ACCESSED | _PAGE_DIRTY | _PAGE_PSE); if (__copy_to_user(uaddr + i * sizeof(tmp), &tmp, sizeof(tmp))) { @@ -3914,6 +4019,8 @@ static void vmx_update_msr_bitmap_x2apic(struct kvm_vcpu *vcpu) vmx_enable_intercept_for_msr(vcpu, X2APIC_MSR(APIC_TMCCT), MSR_TYPE_RW); vmx_disable_intercept_for_msr(vcpu, X2APIC_MSR(APIC_EOI), MSR_TYPE_W); vmx_disable_intercept_for_msr(vcpu, X2APIC_MSR(APIC_SELF_IPI), MSR_TYPE_W); + if (enable_ipiv) + vmx_disable_intercept_for_msr(vcpu, X2APIC_MSR(APIC_ICR), MSR_TYPE_RW); } } @@ -3959,20 +4066,26 @@ static void vmx_msr_filter_changed(struct kvm_vcpu *vcpu) u32 i; /* - * Set intercept permissions for all potentially passed through MSRs - * again. They will automatically get filtered through the MSR filter, - * so we are back in sync after this. + * Redo intercept permissions for MSRs that KVM is passing through to + * the guest. Disabling interception will check the new MSR filter and + * ensure that KVM enables interception if usersepace wants to filter + * the MSR. MSRs that KVM is already intercepting don't need to be + * refreshed since KVM is going to intercept them regardless of what + * userspace wants. */ for (i = 0; i < ARRAY_SIZE(vmx_possible_passthrough_msrs); i++) { u32 msr = vmx_possible_passthrough_msrs[i]; - bool read = test_bit(i, vmx->shadow_msr_intercept.read); - bool write = test_bit(i, vmx->shadow_msr_intercept.write); - vmx_set_intercept_for_msr(vcpu, msr, MSR_TYPE_R, read); - vmx_set_intercept_for_msr(vcpu, msr, MSR_TYPE_W, write); + if (!test_bit(i, vmx->shadow_msr_intercept.read)) + vmx_disable_intercept_for_msr(vcpu, msr, MSR_TYPE_R); + + if (!test_bit(i, vmx->shadow_msr_intercept.write)) + vmx_disable_intercept_for_msr(vcpu, msr, MSR_TYPE_W); } - pt_update_intercept_for_msr(vcpu); + /* PT MSRs can be passed through iff PT is exposed to the guest. */ + if (vmx_pt_mode_is_host_guest()) + pt_update_intercept_for_msr(vcpu); } static inline void kvm_vcpu_trigger_posted_interrupt(struct kvm_vcpu *vcpu, @@ -4067,7 +4180,8 @@ static int vmx_deliver_posted_interrupt(struct kvm_vcpu *vcpu, int vector) if (!r) return 0; - if (!vcpu->arch.apicv_active) + /* Note, this is called iff the local APIC is in-kernel. */ + if (!vcpu->arch.apic->apicv_active) return -1; if (pi_test_and_set_pir(vector, &vmx->pi_desc)) @@ -4241,15 +4355,19 @@ static void vmx_refresh_apicv_exec_ctrl(struct kvm_vcpu *vcpu) } pin_controls_set(vmx, vmx_pin_based_exec_ctrl(vmx)); - if (cpu_has_secondary_exec_ctrls()) { - if (kvm_vcpu_apicv_active(vcpu)) - secondary_exec_controls_setbit(vmx, - SECONDARY_EXEC_APIC_REGISTER_VIRT | - SECONDARY_EXEC_VIRTUAL_INTR_DELIVERY); - else - secondary_exec_controls_clearbit(vmx, - SECONDARY_EXEC_APIC_REGISTER_VIRT | - SECONDARY_EXEC_VIRTUAL_INTR_DELIVERY); + + if (kvm_vcpu_apicv_active(vcpu)) { + secondary_exec_controls_setbit(vmx, + SECONDARY_EXEC_APIC_REGISTER_VIRT | + SECONDARY_EXEC_VIRTUAL_INTR_DELIVERY); + if (enable_ipiv) + tertiary_exec_controls_setbit(vmx, TERTIARY_EXEC_IPI_VIRT); + } else { + secondary_exec_controls_clearbit(vmx, + SECONDARY_EXEC_APIC_REGISTER_VIRT | + SECONDARY_EXEC_VIRTUAL_INTR_DELIVERY); + if (enable_ipiv) + tertiary_exec_controls_clearbit(vmx, TERTIARY_EXEC_IPI_VIRT); } vmx_update_msr_bitmap_x2apic(vcpu); @@ -4281,6 +4399,20 @@ static u32 vmx_exec_control(struct vcpu_vmx *vmx) return exec_control; } +static u64 vmx_tertiary_exec_control(struct vcpu_vmx *vmx) +{ + u64 exec_control = vmcs_config.cpu_based_3rd_exec_ctrl; + + /* + * IPI virtualization relies on APICv. Disable IPI virtualization if + * APICv is inhibited. + */ + if (!enable_ipiv || !kvm_vcpu_apicv_active(&vmx->vcpu)) + exec_control &= ~TERTIARY_EXEC_IPI_VIRT; + + return exec_control; +} + /* * Adjust a single secondary execution control bit to intercept/allow an * instruction in the guest. This is usually done based on whether or not a @@ -4423,13 +4555,48 @@ static u32 vmx_secondary_exec_control(struct vcpu_vmx *vmx) if (!vcpu->kvm->arch.bus_lock_detection_enabled) exec_control &= ~SECONDARY_EXEC_BUS_LOCK_DETECTION; + if (!kvm_notify_vmexit_enabled(vcpu->kvm)) + exec_control &= ~SECONDARY_EXEC_NOTIFY_VM_EXITING; + return exec_control; } +static inline int vmx_get_pid_table_order(struct kvm *kvm) +{ + return get_order(kvm->arch.max_vcpu_ids * sizeof(*to_kvm_vmx(kvm)->pid_table)); +} + +static int vmx_alloc_ipiv_pid_table(struct kvm *kvm) +{ + struct page *pages; + struct kvm_vmx *kvm_vmx = to_kvm_vmx(kvm); + + if (!irqchip_in_kernel(kvm) || !enable_ipiv) + return 0; + + if (kvm_vmx->pid_table) + return 0; + + pages = alloc_pages(GFP_KERNEL | __GFP_ZERO, vmx_get_pid_table_order(kvm)); + if (!pages) + return -ENOMEM; + + kvm_vmx->pid_table = (void *)page_address(pages); + return 0; +} + +static int vmx_vcpu_precreate(struct kvm *kvm) +{ + return vmx_alloc_ipiv_pid_table(kvm); +} + #define VMX_XSS_EXIT_BITMAP 0 static void init_vmcs(struct vcpu_vmx *vmx) { + struct kvm *kvm = vmx->vcpu.kvm; + struct kvm_vmx *kvm_vmx = to_kvm_vmx(kvm); + if (nested) nested_vmx_set_vmcs_shadowing_bitmap(); @@ -4446,6 +4613,9 @@ static void init_vmcs(struct vcpu_vmx *vmx) if (cpu_has_secondary_exec_ctrls()) secondary_exec_controls_set(vmx, vmx_secondary_exec_control(vmx)); + if (cpu_has_tertiary_exec_ctrls()) + tertiary_exec_controls_set(vmx, vmx_tertiary_exec_control(vmx)); + if (enable_apicv && lapic_in_kernel(&vmx->vcpu)) { vmcs_write64(EOI_EXIT_BITMAP0, 0); vmcs_write64(EOI_EXIT_BITMAP1, 0); @@ -4458,12 +4628,20 @@ static void init_vmcs(struct vcpu_vmx *vmx) vmcs_write64(POSTED_INTR_DESC_ADDR, __pa((&vmx->pi_desc))); } - if (!kvm_pause_in_guest(vmx->vcpu.kvm)) { + if (vmx_can_use_ipiv(&vmx->vcpu)) { + vmcs_write64(PID_POINTER_TABLE, __pa(kvm_vmx->pid_table)); + vmcs_write16(LAST_PID_POINTER_INDEX, kvm->arch.max_vcpu_ids - 1); + } + + if (!kvm_pause_in_guest(kvm)) { vmcs_write32(PLE_GAP, ple_gap); vmx->ple_window = ple_window; vmx->ple_window_dirty = true; } + if (kvm_notify_vmexit_enabled(kvm)) + vmcs_write32(NOTIFY_WINDOW, kvm->arch.notify_window); + vmcs_write32(PAGE_FAULT_ERROR_CODE_MASK, 0); vmcs_write32(PAGE_FAULT_ERROR_CODE_MATCH, 0); vmcs_write32(CR3_TARGET_COUNT, 0); /* 22.2.1 */ @@ -4634,13 +4812,13 @@ static void vmx_enable_nmi_window(struct kvm_vcpu *vcpu) exec_controls_setbit(to_vmx(vcpu), CPU_BASED_NMI_WINDOW_EXITING); } -static void vmx_inject_irq(struct kvm_vcpu *vcpu) +static void vmx_inject_irq(struct kvm_vcpu *vcpu, bool reinjected) { struct vcpu_vmx *vmx = to_vmx(vcpu); uint32_t intr; int irq = vcpu->arch.interrupt.nr; - trace_kvm_inj_virq(irq); + trace_kvm_inj_virq(irq, vcpu->arch.interrupt.soft, reinjected); ++vcpu->stat.irq_injections; if (vmx->rmode.vm86_active) { @@ -5752,6 +5930,32 @@ static int handle_bus_lock_vmexit(struct kvm_vcpu *vcpu) return 1; } +static int handle_notify(struct kvm_vcpu *vcpu) +{ + unsigned long exit_qual = vmx_get_exit_qual(vcpu); + bool context_invalid = exit_qual & NOTIFY_VM_CONTEXT_INVALID; + + ++vcpu->stat.notify_window_exits; + + /* + * Notify VM exit happened while executing iret from NMI, + * "blocked by NMI" bit has to be set before next VM entry. + */ + if (enable_vnmi && (exit_qual & INTR_INFO_UNBLOCK_NMI)) + vmcs_set_bits(GUEST_INTERRUPTIBILITY_INFO, + GUEST_INTR_STATE_NMI); + + if (vcpu->kvm->arch.notify_vmexit_flags & KVM_X86_NOTIFY_VMEXIT_USER || + context_invalid) { + vcpu->run->exit_reason = KVM_EXIT_NOTIFY; + vcpu->run->notify.flags = context_invalid ? + KVM_NOTIFY_CONTEXT_INVALID : 0; + return 0; + } + + return 1; +} + /* * The exit handlers return 1 if the exit was handled fully and guest execution * may resume. Otherwise they set the kvm_run parameter to indicate what needs @@ -5809,6 +6013,7 @@ static int (*kvm_vmx_exit_handlers[])(struct kvm_vcpu *vcpu) = { [EXIT_REASON_PREEMPTION_TIMER] = handle_preemption_timer, [EXIT_REASON_ENCLS] = handle_encls, [EXIT_REASON_BUS_LOCK] = handle_bus_lock_vmexit, + [EXIT_REASON_NOTIFY] = handle_notify, }; static const int kvm_vmx_max_exit_handlers = @@ -5906,6 +6111,7 @@ void dump_vmcs(struct kvm_vcpu *vcpu) struct vcpu_vmx *vmx = to_vmx(vcpu); u32 vmentry_ctl, vmexit_ctl; u32 cpu_based_exec_ctrl, pin_based_exec_ctrl, secondary_exec_control; + u64 tertiary_exec_control; unsigned long cr4; int efer_slot; @@ -5919,9 +6125,16 @@ void dump_vmcs(struct kvm_vcpu *vcpu) cpu_based_exec_ctrl = vmcs_read32(CPU_BASED_VM_EXEC_CONTROL); pin_based_exec_ctrl = vmcs_read32(PIN_BASED_VM_EXEC_CONTROL); cr4 = vmcs_readl(GUEST_CR4); - secondary_exec_control = 0; + if (cpu_has_secondary_exec_ctrls()) secondary_exec_control = vmcs_read32(SECONDARY_VM_EXEC_CONTROL); + else + secondary_exec_control = 0; + + if (cpu_has_tertiary_exec_ctrls()) + tertiary_exec_control = vmcs_read64(TERTIARY_VM_EXEC_CONTROL); + else + tertiary_exec_control = 0; pr_err("VMCS %p, last attempted VM-entry on CPU %d\n", vmx->loaded_vmcs->vmcs, vcpu->arch.last_vmentry_cpu); @@ -6021,9 +6234,10 @@ void dump_vmcs(struct kvm_vcpu *vcpu) vmx_dump_msrs("host autoload", &vmx->msr_autoload.host); pr_err("*** Control State ***\n"); - pr_err("PinBased=%08x CPUBased=%08x SecondaryExec=%08x\n", - pin_based_exec_ctrl, cpu_based_exec_ctrl, secondary_exec_control); - pr_err("EntryControls=%08x ExitControls=%08x\n", vmentry_ctl, vmexit_ctl); + pr_err("CPUBased=0x%08x SecondaryExec=0x%08x TertiaryExec=0x%016llx\n", + cpu_based_exec_ctrl, secondary_exec_control, tertiary_exec_control); + pr_err("PinBased=0x%08x EntryControls=%08x ExitControls=%08x\n", + pin_based_exec_ctrl, vmentry_ctl, vmexit_ctl); pr_err("ExceptionBitmap=%08x PFECmask=%08x PFECmatch=%08x\n", vmcs_read32(EXCEPTION_BITMAP), vmcs_read32(PAGE_FAULT_ERROR_CODE_MASK), @@ -6173,7 +6387,8 @@ static int __vmx_handle_exit(struct kvm_vcpu *vcpu, fastpath_t exit_fastpath) exit_reason.basic != EXIT_REASON_EPT_VIOLATION && exit_reason.basic != EXIT_REASON_PML_FULL && exit_reason.basic != EXIT_REASON_APIC_ACCESS && - exit_reason.basic != EXIT_REASON_TASK_SWITCH)) { + exit_reason.basic != EXIT_REASON_TASK_SWITCH && + exit_reason.basic != EXIT_REASON_NOTIFY)) { int ndata = 3; vcpu->run->exit_reason = KVM_EXIT_INTERNAL_ERROR; @@ -6435,7 +6650,7 @@ static void vmx_set_apic_access_page_addr(struct kvm_vcpu *vcpu) put_page(page); } -static void vmx_hwapic_isr_update(struct kvm_vcpu *vcpu, int max_isr) +static void vmx_hwapic_isr_update(int max_isr) { u16 status; u8 old; @@ -6765,9 +6980,14 @@ static void atomic_switch_perf_msrs(struct vcpu_vmx *vmx) { int i, nr_msrs; struct perf_guest_switch_msr *msrs; + struct kvm_pmu *pmu = vcpu_to_pmu(&vmx->vcpu); + + pmu->host_cross_mapped_mask = 0; + if (pmu->pebs_enable & pmu->global_ctrl) + intel_pmu_cross_mapped_check(pmu); /* Note, nr_msrs may be garbage if perf_guest_get_msrs() returns NULL. */ - msrs = perf_guest_get_msrs(&nr_msrs); + msrs = perf_guest_get_msrs(&nr_msrs, (void *)pmu); if (!msrs) return; @@ -6813,6 +7033,31 @@ void noinstr vmx_update_host_rsp(struct vcpu_vmx *vmx, unsigned long host_rsp) } } +void noinstr vmx_spec_ctrl_restore_host(struct vcpu_vmx *vmx, + unsigned int flags) +{ + u64 hostval = this_cpu_read(x86_spec_ctrl_current); + + if (!cpu_feature_enabled(X86_FEATURE_MSR_SPEC_CTRL)) + return; + + if (flags & VMX_RUN_SAVE_SPEC_CTRL) + vmx->spec_ctrl = __rdmsr(MSR_IA32_SPEC_CTRL); + + /* + * If the guest/host SPEC_CTRL values differ, restore the host value. + * + * For legacy IBRS, the IBRS bit always needs to be written after + * transitioning from a less privileged predictor mode, regardless of + * whether the guest/host values differ. + */ + if (cpu_feature_enabled(X86_FEATURE_KERNEL_IBRS) || + vmx->spec_ctrl != hostval) + native_wrmsrl(MSR_IA32_SPEC_CTRL, hostval); + + barrier_nospec(); +} + static fastpath_t vmx_exit_handlers_fastpath(struct kvm_vcpu *vcpu) { switch (to_vmx(vcpu)->exit_reason.basic) { @@ -6826,7 +7071,8 @@ static fastpath_t vmx_exit_handlers_fastpath(struct kvm_vcpu *vcpu) } static noinstr void vmx_vcpu_enter_exit(struct kvm_vcpu *vcpu, - struct vcpu_vmx *vmx) + struct vcpu_vmx *vmx, + unsigned long flags) { guest_state_enter_irqoff(); @@ -6845,7 +7091,7 @@ static noinstr void vmx_vcpu_enter_exit(struct kvm_vcpu *vcpu, native_write_cr2(vcpu->arch.cr2); vmx->fail = __vmx_vcpu_run(vmx, (unsigned long *)&vcpu->arch.regs, - vmx->loaded_vmcs->launched); + flags); vcpu->arch.cr2 = native_read_cr2(); @@ -6944,36 +7190,8 @@ static fastpath_t vmx_vcpu_run(struct kvm_vcpu *vcpu) kvm_wait_lapic_expire(vcpu); - /* - * If this vCPU has touched SPEC_CTRL, restore the guest's value if - * it's non-zero. Since vmentry is serialising on affected CPUs, there - * is no need to worry about the conditional branch over the wrmsr - * being speculatively taken. - */ - x86_spec_ctrl_set_guest(vmx->spec_ctrl, 0); - /* The actual VMENTER/EXIT is in the .noinstr.text section. */ - vmx_vcpu_enter_exit(vcpu, vmx); - - /* - * We do not use IBRS in the kernel. If this vCPU has used the - * SPEC_CTRL MSR it may have left it on; save the value and - * turn it off. This is much more efficient than blindly adding - * it to the atomic save/restore list. Especially as the former - * (Saving guest MSRs on vmexit) doesn't even exist in KVM. - * - * For non-nested case: - * If the L01 MSR bitmap does not intercept the MSR, then we need to - * save it. - * - * For nested case: - * If the L02 MSR bitmap does not intercept the MSR, then we need to - * save it. - */ - if (unlikely(!msr_write_intercepted(vmx, MSR_IA32_SPEC_CTRL))) - vmx->spec_ctrl = native_read_msr(MSR_IA32_SPEC_CTRL); - - x86_spec_ctrl_restore_host(vmx->spec_ctrl, 0); + vmx_vcpu_enter_exit(vcpu, vmx, __vmx_vcpu_run_flags(vmx)); /* All fields are clean at this point */ if (static_branch_unlikely(&enable_evmcs)) { @@ -7150,6 +7368,10 @@ static int vmx_vcpu_create(struct kvm_vcpu *vcpu) goto free_vmcs; } + if (vmx_can_use_ipiv(vcpu)) + WRITE_ONCE(to_kvm_vmx(vcpu->kvm)->pid_table[vcpu->vcpu_id], + __pa(&vmx->pi_desc) | PID_TABLE_ENTRY_VALID); + return 0; free_vmcs: @@ -7218,7 +7440,7 @@ static int __init vmx_check_processor_compat(void) return 0; } -static u64 vmx_get_mt_mask(struct kvm_vcpu *vcpu, gfn_t gfn, bool is_mmio) +static u8 vmx_get_mt_mask(struct kvm_vcpu *vcpu, gfn_t gfn, bool is_mmio) { u8 cache; @@ -7294,7 +7516,7 @@ static void nested_vmx_cr_fixed1_bits_update(struct kvm_vcpu *vcpu) vmx->nested.msrs.cr4_fixed1 |= (_cr4_mask); \ } while (0) - entry = kvm_find_cpuid_entry(vcpu, 0x1, 0); + entry = kvm_find_cpuid_entry(vcpu, 0x1); cr4_fixed1_update(X86_CR4_VME, edx, feature_bit(VME)); cr4_fixed1_update(X86_CR4_PVI, edx, feature_bit(VME)); cr4_fixed1_update(X86_CR4_TSD, edx, feature_bit(TSC)); @@ -7310,7 +7532,7 @@ static void nested_vmx_cr_fixed1_bits_update(struct kvm_vcpu *vcpu) cr4_fixed1_update(X86_CR4_PCIDE, ecx, feature_bit(PCID)); cr4_fixed1_update(X86_CR4_OSXSAVE, ecx, feature_bit(XSAVE)); - entry = kvm_find_cpuid_entry(vcpu, 0x7, 0); + entry = kvm_find_cpuid_entry_index(vcpu, 0x7, 0); cr4_fixed1_update(X86_CR4_FSGSBASE, ebx, feature_bit(FSGSBASE)); cr4_fixed1_update(X86_CR4_SMEP, ebx, feature_bit(SMEP)); cr4_fixed1_update(X86_CR4_SMAP, ebx, feature_bit(SMAP)); @@ -7321,23 +7543,6 @@ static void nested_vmx_cr_fixed1_bits_update(struct kvm_vcpu *vcpu) #undef cr4_fixed1_update } -static void nested_vmx_entry_exit_ctls_update(struct kvm_vcpu *vcpu) -{ - struct vcpu_vmx *vmx = to_vmx(vcpu); - - if (kvm_mpx_supported()) { - bool mpx_enabled = guest_cpuid_has(vcpu, X86_FEATURE_MPX); - - if (mpx_enabled) { - vmx->nested.msrs.entry_ctls_high |= VM_ENTRY_LOAD_BNDCFGS; - vmx->nested.msrs.exit_ctls_high |= VM_EXIT_CLEAR_BNDCFGS; - } else { - vmx->nested.msrs.entry_ctls_high &= ~VM_ENTRY_LOAD_BNDCFGS; - vmx->nested.msrs.exit_ctls_high &= ~VM_EXIT_CLEAR_BNDCFGS; - } - } -} - static void update_intel_pt_cfg(struct kvm_vcpu *vcpu) { struct vcpu_vmx *vmx = to_vmx(vcpu); @@ -7345,7 +7550,7 @@ static void update_intel_pt_cfg(struct kvm_vcpu *vcpu) int i; for (i = 0; i < PT_CPUID_LEAVES; i++) { - best = kvm_find_cpuid_entry(vcpu, 0x14, i); + best = kvm_find_cpuid_entry_index(vcpu, 0x14, i); if (!best) return; vmx->pt_desc.caps[CPUID_EAX + i*PT_CPUID_REGS_NUM] = best->eax; @@ -7429,10 +7634,8 @@ static void vmx_vcpu_after_set_cpuid(struct kvm_vcpu *vcpu) ~(FEAT_CTL_VMX_ENABLED_INSIDE_SMX | FEAT_CTL_VMX_ENABLED_OUTSIDE_SMX); - if (nested_vmx_allowed(vcpu)) { + if (nested_vmx_allowed(vcpu)) nested_vmx_cr_fixed1_bits_update(vcpu); - nested_vmx_entry_exit_ctls_update(vcpu); - } if (boot_cpu_has(X86_FEATURE_INTEL_PT) && guest_cpuid_has(vcpu, X86_FEATURE_INTEL_PT)) @@ -7486,6 +7689,13 @@ static __init void vmx_set_cpu_caps(void) kvm_cpu_cap_clear(X86_FEATURE_INVPCID); if (vmx_pt_mode_is_host_guest()) kvm_cpu_cap_check_and_set(X86_FEATURE_INTEL_PT); + if (vmx_pebs_supported()) { + kvm_cpu_cap_check_and_set(X86_FEATURE_DS); + kvm_cpu_cap_check_and_set(X86_FEATURE_DTES64); + } + + if (!enable_pmu) + kvm_cpu_cap_clear(X86_FEATURE_PDCM); if (!enable_sgx) { kvm_cpu_cap_clear(X86_FEATURE_SGX); @@ -7498,7 +7708,7 @@ static __init void vmx_set_cpu_caps(void) kvm_cpu_cap_set(X86_FEATURE_UMIP); /* CPUID 0xD.1 */ - supported_xss = 0; + kvm_caps.supported_xss = 0; if (!cpu_has_vmx_xsaves()) kvm_cpu_cap_clear(X86_FEATURE_XSAVES); @@ -7639,9 +7849,9 @@ static int vmx_set_hv_timer(struct kvm_vcpu *vcpu, u64 guest_deadline_tsc, delta_tsc = 0; /* Convert to host delta tsc if tsc scaling is enabled */ - if (vcpu->arch.l1_tsc_scaling_ratio != kvm_default_tsc_scaling_ratio && + if (vcpu->arch.l1_tsc_scaling_ratio != kvm_caps.default_tsc_scaling_ratio && delta_tsc && u64_shl_div_u64(delta_tsc, - kvm_tsc_scaling_ratio_frac_bits, + kvm_caps.tsc_scaling_ratio_frac_bits, vcpu->arch.l1_tsc_scaling_ratio, &delta_tsc)) return -ERANGE; @@ -7713,6 +7923,13 @@ static int vmx_enter_smm(struct kvm_vcpu *vcpu, char *smstate) { struct vcpu_vmx *vmx = to_vmx(vcpu); + /* + * TODO: Implement custom flows for forcing the vCPU out/in of L2 on + * SMI and RSM. Using the common VM-Exit + VM-Enter routines is wrong + * SMI and RSM only modify state that is saved and restored via SMRAM. + * E.g. most MSRs are left untouched, but many are modified by VM-Exit + * and VM-Enter, and thus L2's values may be corrupted on SMI+RSM. + */ vmx->nested.smm.guest_mode = is_guest_mode(vcpu); if (vmx->nested.smm.guest_mode) nested_vmx_vmexit(vcpu, -1, 0, 0); @@ -7786,6 +8003,13 @@ static bool vmx_check_apicv_inhibit_reasons(enum kvm_apicv_inhibit reason) return supported & BIT(reason); } +static void vmx_vm_destroy(struct kvm *kvm) +{ + struct kvm_vmx *kvm_vmx = to_kvm_vmx(kvm); + + free_pages((unsigned long)kvm_vmx->pid_table, vmx_get_pid_table_order(kvm)); +} + static struct kvm_x86_ops vmx_x86_ops __initdata = { .name = "kvm_intel", @@ -7797,7 +8021,9 @@ static struct kvm_x86_ops vmx_x86_ops __initdata = { .vm_size = sizeof(struct kvm_vmx), .vm_init = vmx_vm_init, + .vm_destroy = vmx_vm_destroy, + .vcpu_precreate = vmx_vcpu_precreate, .vcpu_create = vmx_vcpu_create, .vcpu_free = vmx_vcpu_free, .vcpu_reset = vmx_vcpu_reset, @@ -8011,8 +8237,8 @@ static __init int hardware_setup(void) } if (!cpu_has_vmx_mpx()) - supported_xcr0 &= ~(XFEATURE_MASK_BNDREGS | - XFEATURE_MASK_BNDCSR); + kvm_caps.supported_xcr0 &= ~(XFEATURE_MASK_BNDREGS | + XFEATURE_MASK_BNDCSR); if (!cpu_has_vmx_vpid() || !cpu_has_vmx_invvpid() || !(cpu_has_vmx_invvpid_single() || cpu_has_vmx_invvpid_global())) @@ -8075,12 +8301,16 @@ static __init int hardware_setup(void) if (!enable_apicv) vmx_x86_ops.sync_pir_to_irr = NULL; + if (!enable_apicv || !cpu_has_vmx_ipiv()) + enable_ipiv = false; + if (cpu_has_vmx_tsc_scaling()) - kvm_has_tsc_control = true; + kvm_caps.has_tsc_control = true; - kvm_max_tsc_scaling_ratio = KVM_VMX_TSC_MULTIPLIER_MAX; - kvm_tsc_scaling_ratio_frac_bits = 48; - kvm_has_bus_lock_exit = cpu_has_vmx_bus_lock_detection(); + kvm_caps.max_tsc_scaling_ratio = KVM_VMX_TSC_MULTIPLIER_MAX; + kvm_caps.tsc_scaling_ratio_frac_bits = 48; + kvm_caps.has_bus_lock_exit = cpu_has_vmx_bus_lock_detection(); + kvm_caps.has_notify_vmexit = cpu_has_notify_vmexit(); set_bit(0, vmx_vpid_bitmap); /* 0 is reserved for host */ @@ -8137,11 +8367,12 @@ static __init int hardware_setup(void) vmx_x86_ops.request_immediate_exit = __kvm_request_immediate_exit; } - kvm_mce_cap_supported |= MCG_LMCE_P; + kvm_caps.supported_mce_cap |= MCG_LMCE_P; + kvm_caps.supported_mce_cap |= MCG_CMCI_P; if (pt_mode != PT_MODE_SYSTEM && pt_mode != PT_MODE_HOST_GUEST) return -EINVAL; - if (!enable_ept || !cpu_has_vmx_intel_pt()) + if (!enable_ept || !enable_pmu || !cpu_has_vmx_intel_pt()) pt_mode = PT_MODE_SYSTEM; if (pt_mode == PT_MODE_HOST_GUEST) vmx_init_ops.handle_intel_pt_intr = vmx_handle_intel_pt_intr; diff --git a/arch/x86/kvm/vmx/vmx.h b/arch/x86/kvm/vmx/vmx.h index 8d2342ede0c5..fb8e3480a9d7 100644 --- a/arch/x86/kvm/vmx/vmx.h +++ b/arch/x86/kvm/vmx/vmx.h @@ -8,11 +8,12 @@ #include <asm/intel_pt.h> #include "capabilities.h" -#include "kvm_cache_regs.h" +#include "../kvm_cache_regs.h" #include "posted_intr.h" #include "vmcs.h" #include "vmx_ops.h" -#include "cpuid.h" +#include "../cpuid.h" +#include "run_flags.h" #define MSR_TYPE_R 1 #define MSR_TYPE_W 2 @@ -91,10 +92,22 @@ union vmx_exit_reason { u32 full; }; +static inline bool intel_pmu_has_perf_global_ctrl(struct kvm_pmu *pmu) +{ + /* + * Architecturally, Intel's SDM states that IA32_PERF_GLOBAL_CTRL is + * supported if "CPUID.0AH: EAX[7:0] > 0", i.e. if the PMU version is + * greater than zero. However, KVM only exposes and emulates the MSR + * to/for the guest if the guest PMU supports at least "Architectural + * Performance Monitoring Version 2". + */ + return pmu->version > 1; +} + #define vcpu_to_lbr_desc(vcpu) (&to_vmx(vcpu)->lbr_desc) #define vcpu_to_lbr_records(vcpu) (&to_vmx(vcpu)->lbr_desc.records) -bool intel_pmu_lbr_is_compatible(struct kvm_vcpu *vcpu); +void intel_pmu_cross_mapped_check(struct kvm_pmu *pmu); bool intel_pmu_lbr_is_enabled(struct kvm_vcpu *vcpu); int intel_pmu_create_guest_lbr_event(struct kvm_vcpu *vcpu); @@ -204,7 +217,7 @@ struct nested_vmx { * Guest pages referred to in the vmcs02 with host-physical * pointers, so we must keep them pinned while L2 runs. */ - struct page *apic_access_page; + struct kvm_host_map apic_access_page_map; struct kvm_host_map virtual_apic_map; struct kvm_host_map pi_desc_map; @@ -219,9 +232,18 @@ struct nested_vmx { bool has_preemption_timer_deadline; bool preemption_timer_expired; - /* to migrate it to L2 if VM_ENTRY_LOAD_DEBUG_CONTROLS is off */ - u64 vmcs01_debugctl; - u64 vmcs01_guest_bndcfgs; + /* + * Used to snapshot MSRs that are conditionally loaded on VM-Enter in + * order to propagate the guest's pre-VM-Enter value into vmcs02. For + * emulation of VMLAUNCH/VMRESUME, the snapshot will be of L1's value. + * For KVM_SET_NESTED_STATE, the snapshot is of L2's value, _if_ + * userspace restores MSRs before nested state. If userspace restores + * MSRs after nested state, the snapshot holds garbage, but KVM can't + * detect that, and the garbage value in vmcs02 will be overwritten by + * MSR restoration in any case. + */ + u64 pre_vmenter_debugctl; + u64 pre_vmenter_bndcfgs; /* to migrate it to L1 if L2 writes to L1's CR8 directly */ int l1_tpr_threshold; @@ -368,6 +390,8 @@ struct kvm_vmx { unsigned int tss_addr; bool ept_identity_pagetable_done; gpa_t ept_identity_map_addr; + /* Posted Interrupt Descriptor (PID) table for IPI virtualization */ + u64 *pid_table; }; bool nested_vmx_allowed(struct kvm_vcpu *vcpu); @@ -404,7 +428,10 @@ void vmx_set_virtual_apic_mode(struct kvm_vcpu *vcpu); struct vmx_uret_msr *vmx_find_uret_msr(struct vcpu_vmx *vmx, u32 msr); void pt_update_intercept_for_msr(struct kvm_vcpu *vcpu); void vmx_update_host_rsp(struct vcpu_vmx *vmx, unsigned long host_rsp); -bool __vmx_vcpu_run(struct vcpu_vmx *vmx, unsigned long *regs, bool launched); +void vmx_spec_ctrl_restore_host(struct vcpu_vmx *vmx, unsigned int flags); +unsigned int __vmx_vcpu_run_flags(struct vcpu_vmx *vmx); +bool __vmx_vcpu_run(struct vcpu_vmx *vmx, unsigned long *regs, + unsigned int flags); int vmx_find_loadstore_msr_slot(struct vmx_msrs *m, u32 msr); void vmx_ept_load_pdptrs(struct kvm_vcpu *vcpu); @@ -458,35 +485,36 @@ static inline u8 vmx_get_rvi(void) return vmcs_read16(GUEST_INTR_STATUS) & 0xff; } -#define BUILD_CONTROLS_SHADOW(lname, uname) \ -static inline void lname##_controls_set(struct vcpu_vmx *vmx, u32 val) \ -{ \ - if (vmx->loaded_vmcs->controls_shadow.lname != val) { \ - vmcs_write32(uname, val); \ - vmx->loaded_vmcs->controls_shadow.lname = val; \ - } \ -} \ -static inline u32 __##lname##_controls_get(struct loaded_vmcs *vmcs) \ -{ \ - return vmcs->controls_shadow.lname; \ -} \ -static inline u32 lname##_controls_get(struct vcpu_vmx *vmx) \ -{ \ - return __##lname##_controls_get(vmx->loaded_vmcs); \ -} \ -static inline void lname##_controls_setbit(struct vcpu_vmx *vmx, u32 val) \ -{ \ - lname##_controls_set(vmx, lname##_controls_get(vmx) | val); \ -} \ -static inline void lname##_controls_clearbit(struct vcpu_vmx *vmx, u32 val) \ -{ \ - lname##_controls_set(vmx, lname##_controls_get(vmx) & ~val); \ +#define BUILD_CONTROLS_SHADOW(lname, uname, bits) \ +static inline void lname##_controls_set(struct vcpu_vmx *vmx, u##bits val) \ +{ \ + if (vmx->loaded_vmcs->controls_shadow.lname != val) { \ + vmcs_write##bits(uname, val); \ + vmx->loaded_vmcs->controls_shadow.lname = val; \ + } \ +} \ +static inline u##bits __##lname##_controls_get(struct loaded_vmcs *vmcs) \ +{ \ + return vmcs->controls_shadow.lname; \ +} \ +static inline u##bits lname##_controls_get(struct vcpu_vmx *vmx) \ +{ \ + return __##lname##_controls_get(vmx->loaded_vmcs); \ +} \ +static inline void lname##_controls_setbit(struct vcpu_vmx *vmx, u##bits val) \ +{ \ + lname##_controls_set(vmx, lname##_controls_get(vmx) | val); \ +} \ +static inline void lname##_controls_clearbit(struct vcpu_vmx *vmx, u##bits val) \ +{ \ + lname##_controls_set(vmx, lname##_controls_get(vmx) & ~val); \ } -BUILD_CONTROLS_SHADOW(vm_entry, VM_ENTRY_CONTROLS) -BUILD_CONTROLS_SHADOW(vm_exit, VM_EXIT_CONTROLS) -BUILD_CONTROLS_SHADOW(pin, PIN_BASED_VM_EXEC_CONTROL) -BUILD_CONTROLS_SHADOW(exec, CPU_BASED_VM_EXEC_CONTROL) -BUILD_CONTROLS_SHADOW(secondary_exec, SECONDARY_VM_EXEC_CONTROL) +BUILD_CONTROLS_SHADOW(vm_entry, VM_ENTRY_CONTROLS, 32) +BUILD_CONTROLS_SHADOW(vm_exit, VM_EXIT_CONTROLS, 32) +BUILD_CONTROLS_SHADOW(pin, PIN_BASED_VM_EXEC_CONTROL, 32) +BUILD_CONTROLS_SHADOW(exec, CPU_BASED_VM_EXEC_CONTROL, 32) +BUILD_CONTROLS_SHADOW(secondary_exec, SECONDARY_VM_EXEC_CONTROL, 32) +BUILD_CONTROLS_SHADOW(tertiary_exec, TERTIARY_VM_EXEC_CONTROL, 64) /* * VMX_REGS_LAZY_LOAD_SET - The set of registers that will be updated in the @@ -582,4 +610,9 @@ static inline int vmx_get_instr_info_reg2(u32 vmx_instr_info) return (vmx_instr_info >> 28) & 0xf; } +static inline bool vmx_can_use_ipiv(struct kvm_vcpu *vcpu) +{ + return lapic_in_kernel(vcpu) && enable_ipiv; +} + #endif /* __KVM_X86_VMX_H */ diff --git a/arch/x86/kvm/vmx/vmx_ops.h b/arch/x86/kvm/vmx/vmx_ops.h index 5e7f41225780..5cfc49ddb1b4 100644 --- a/arch/x86/kvm/vmx/vmx_ops.h +++ b/arch/x86/kvm/vmx/vmx_ops.h @@ -8,7 +8,7 @@ #include "evmcs.h" #include "vmcs.h" -#include "x86.h" +#include "../x86.h" asmlinkage void vmread_error(unsigned long field, bool fault); __attribute__((regparm(0))) void vmread_error_trampoline(unsigned long field, diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c index 1910e1e78b15..79a8a74b6b2a 100644 --- a/arch/x86/kvm/x86.c +++ b/arch/x86/kvm/x86.c @@ -41,7 +41,6 @@ #include <linux/mman.h> #include <linux/highmem.h> #include <linux/iommu.h> -#include <linux/intel-iommu.h> #include <linux/cpufreq.h> #include <linux/user-return-notifier.h> #include <linux/srcu.h> @@ -87,8 +86,11 @@ #define MAX_IO_MSRS 256 #define KVM_MAX_MCE_BANKS 32 -u64 __read_mostly kvm_mce_cap_supported = MCG_CTL_P | MCG_SER_P; -EXPORT_SYMBOL_GPL(kvm_mce_cap_supported); + +struct kvm_caps kvm_caps __read_mostly = { + .supported_mce_cap = MCG_CTL_P | MCG_SER_P, +}; +EXPORT_SYMBOL_GPL(kvm_caps); #define ERR_PTR_USR(e) ((void __user *)ERR_PTR(e)) @@ -151,19 +153,6 @@ module_param(min_timer_period_us, uint, S_IRUGO | S_IWUSR); static bool __read_mostly kvmclock_periodic_sync = true; module_param(kvmclock_periodic_sync, bool, S_IRUGO); -bool __read_mostly kvm_has_tsc_control; -EXPORT_SYMBOL_GPL(kvm_has_tsc_control); -u32 __read_mostly kvm_max_guest_tsc_khz; -EXPORT_SYMBOL_GPL(kvm_max_guest_tsc_khz); -u8 __read_mostly kvm_tsc_scaling_ratio_frac_bits; -EXPORT_SYMBOL_GPL(kvm_tsc_scaling_ratio_frac_bits); -u64 __read_mostly kvm_max_tsc_scaling_ratio; -EXPORT_SYMBOL_GPL(kvm_max_tsc_scaling_ratio); -u64 __read_mostly kvm_default_tsc_scaling_ratio; -EXPORT_SYMBOL_GPL(kvm_default_tsc_scaling_ratio); -bool __read_mostly kvm_has_bus_lock_exit; -EXPORT_SYMBOL_GPL(kvm_has_bus_lock_exit); - /* tsc tolerance in parts per million - default to 1/2 of the NTP threshold */ static u32 __read_mostly tsc_tolerance_ppm = 250; module_param(tsc_tolerance_ppm, uint, S_IRUGO | S_IWUSR); @@ -235,8 +224,6 @@ EXPORT_SYMBOL_GPL(enable_apicv); u64 __read_mostly host_xss; EXPORT_SYMBOL_GPL(host_xss); -u64 __read_mostly supported_xss; -EXPORT_SYMBOL_GPL(supported_xss); const struct _kvm_stats_desc kvm_vm_stats_desc[] = { KVM_GENERIC_VM_STATS(), @@ -298,7 +285,8 @@ const struct _kvm_stats_desc kvm_vcpu_stats_desc[] = { STATS_DESC_COUNTER(VCPU, directed_yield_successful), STATS_DESC_COUNTER(VCPU, preemption_reported), STATS_DESC_COUNTER(VCPU, preemption_other), - STATS_DESC_ICOUNTER(VCPU, guest_mode) + STATS_DESC_IBOOLEAN(VCPU, guest_mode), + STATS_DESC_COUNTER(VCPU, notify_window_exits), }; const struct kvm_stats_header kvm_vcpu_stats_header = { @@ -311,8 +299,6 @@ const struct kvm_stats_header kvm_vcpu_stats_header = { }; u64 __read_mostly host_xcr0; -u64 __read_mostly supported_xcr0; -EXPORT_SYMBOL_GPL(supported_xcr0); static struct kmem_cache *x86_emulator_cache; @@ -862,7 +848,7 @@ int load_pdptrs(struct kvm_vcpu *vcpu, unsigned long cr3) */ real_gpa = kvm_translate_gpa(vcpu, mmu, gfn_to_gpa(pdpt_gfn), PFERR_USER_MASK | PFERR_WRITE_MASK, NULL); - if (real_gpa == UNMAPPED_GVA) + if (real_gpa == INVALID_GPA) return 0; /* Note the offset, PDPTRs are 32 byte aligned when using PAE paging. */ @@ -1094,7 +1080,7 @@ int kvm_emulate_xsetbv(struct kvm_vcpu *vcpu) } EXPORT_SYMBOL_GPL(kvm_emulate_xsetbv); -bool kvm_is_valid_cr4(struct kvm_vcpu *vcpu, unsigned long cr4) +bool __kvm_is_valid_cr4(struct kvm_vcpu *vcpu, unsigned long cr4) { if (cr4 & cr4_reserved_bits) return false; @@ -1102,9 +1088,15 @@ bool kvm_is_valid_cr4(struct kvm_vcpu *vcpu, unsigned long cr4) if (cr4 & vcpu->arch.cr4_guest_rsvd_bits) return false; - return static_call(kvm_x86_is_valid_cr4)(vcpu, cr4); + return true; +} +EXPORT_SYMBOL_GPL(__kvm_is_valid_cr4); + +static bool kvm_is_valid_cr4(struct kvm_vcpu *vcpu, unsigned long cr4) +{ + return __kvm_is_valid_cr4(vcpu, cr4) && + static_call(kvm_x86_is_valid_cr4)(vcpu, cr4); } -EXPORT_SYMBOL_GPL(kvm_is_valid_cr4); void kvm_post_set_cr4(struct kvm_vcpu *vcpu, unsigned long old_cr4, unsigned long cr4) { @@ -1450,6 +1442,7 @@ static const u32 msrs_to_save_all[] = { MSR_ARCH_PERFMON_EVENTSEL0 + 12, MSR_ARCH_PERFMON_EVENTSEL0 + 13, MSR_ARCH_PERFMON_EVENTSEL0 + 14, MSR_ARCH_PERFMON_EVENTSEL0 + 15, MSR_ARCH_PERFMON_EVENTSEL0 + 16, MSR_ARCH_PERFMON_EVENTSEL0 + 17, + MSR_IA32_PEBS_ENABLE, MSR_IA32_DS_AREA, MSR_PEBS_DATA_CFG, MSR_K7_EVNTSEL0, MSR_K7_EVNTSEL1, MSR_K7_EVNTSEL2, MSR_K7_EVNTSEL3, MSR_K7_PERFCTR0, MSR_K7_PERFCTR1, MSR_K7_PERFCTR2, MSR_K7_PERFCTR3, @@ -2051,13 +2044,6 @@ int kvm_emulate_invd(struct kvm_vcpu *vcpu) } EXPORT_SYMBOL_GPL(kvm_emulate_invd); -int kvm_emulate_mwait(struct kvm_vcpu *vcpu) -{ - pr_warn_once("kvm: MWAIT instruction emulated as NOP!\n"); - return kvm_emulate_as_nop(vcpu); -} -EXPORT_SYMBOL_GPL(kvm_emulate_mwait); - int kvm_handle_invalid_op(struct kvm_vcpu *vcpu) { kvm_queue_exception(vcpu, UD_VECTOR); @@ -2065,11 +2051,26 @@ int kvm_handle_invalid_op(struct kvm_vcpu *vcpu) } EXPORT_SYMBOL_GPL(kvm_handle_invalid_op); -int kvm_emulate_monitor(struct kvm_vcpu *vcpu) + +static int kvm_emulate_monitor_mwait(struct kvm_vcpu *vcpu, const char *insn) { - pr_warn_once("kvm: MONITOR instruction emulated as NOP!\n"); + if (!kvm_check_has_quirk(vcpu->kvm, KVM_X86_QUIRK_MWAIT_NEVER_UD_FAULTS) && + !guest_cpuid_has(vcpu, X86_FEATURE_MWAIT)) + return kvm_handle_invalid_op(vcpu); + + pr_warn_once("kvm: %s instruction emulated as NOP!\n", insn); return kvm_emulate_as_nop(vcpu); } +int kvm_emulate_mwait(struct kvm_vcpu *vcpu) +{ + return kvm_emulate_monitor_mwait(vcpu, "MWAIT"); +} +EXPORT_SYMBOL_GPL(kvm_emulate_mwait); + +int kvm_emulate_monitor(struct kvm_vcpu *vcpu) +{ + return kvm_emulate_monitor_mwait(vcpu, "MONITOR"); +} EXPORT_SYMBOL_GPL(kvm_emulate_monitor); static inline bool kvm_vcpu_exit_request(struct kvm_vcpu *vcpu) @@ -2349,12 +2350,12 @@ static int set_tsc_khz(struct kvm_vcpu *vcpu, u32 user_tsc_khz, bool scale) /* Guest TSC same frequency as host TSC? */ if (!scale) { - kvm_vcpu_write_tsc_multiplier(vcpu, kvm_default_tsc_scaling_ratio); + kvm_vcpu_write_tsc_multiplier(vcpu, kvm_caps.default_tsc_scaling_ratio); return 0; } /* TSC scaling supported? */ - if (!kvm_has_tsc_control) { + if (!kvm_caps.has_tsc_control) { if (user_tsc_khz > tsc_khz) { vcpu->arch.tsc_catchup = 1; vcpu->arch.tsc_always_catchup = 1; @@ -2366,10 +2367,10 @@ static int set_tsc_khz(struct kvm_vcpu *vcpu, u32 user_tsc_khz, bool scale) } /* TSC scaling required - calculate ratio */ - ratio = mul_u64_u32_div(1ULL << kvm_tsc_scaling_ratio_frac_bits, + ratio = mul_u64_u32_div(1ULL << kvm_caps.tsc_scaling_ratio_frac_bits, user_tsc_khz, tsc_khz); - if (ratio == 0 || ratio >= kvm_max_tsc_scaling_ratio) { + if (ratio == 0 || ratio >= kvm_caps.max_tsc_scaling_ratio) { pr_warn_ratelimited("Invalid TSC scaling ratio - virtual-tsc-khz=%u\n", user_tsc_khz); return -1; @@ -2387,7 +2388,7 @@ static int kvm_set_tsc_khz(struct kvm_vcpu *vcpu, u32 user_tsc_khz) /* tsc_khz can be zero if TSC calibration fails */ if (user_tsc_khz == 0) { /* set tsc_scaling_ratio to a safe value */ - kvm_vcpu_write_tsc_multiplier(vcpu, kvm_default_tsc_scaling_ratio); + kvm_vcpu_write_tsc_multiplier(vcpu, kvm_caps.default_tsc_scaling_ratio); return -1; } @@ -2464,18 +2465,18 @@ static void kvm_track_tsc_matching(struct kvm_vcpu *vcpu) * (frac) represent the fractional part, ie. ratio represents a fixed * point number (mult + frac * 2^(-N)). * - * N equals to kvm_tsc_scaling_ratio_frac_bits. + * N equals to kvm_caps.tsc_scaling_ratio_frac_bits. */ static inline u64 __scale_tsc(u64 ratio, u64 tsc) { - return mul_u64_u64_shr(tsc, ratio, kvm_tsc_scaling_ratio_frac_bits); + return mul_u64_u64_shr(tsc, ratio, kvm_caps.tsc_scaling_ratio_frac_bits); } u64 kvm_scale_tsc(u64 tsc, u64 ratio) { u64 _tsc = tsc; - if (ratio != kvm_default_tsc_scaling_ratio) + if (ratio != kvm_caps.default_tsc_scaling_ratio) _tsc = __scale_tsc(ratio, tsc); return _tsc; @@ -2502,11 +2503,11 @@ u64 kvm_calc_nested_tsc_offset(u64 l1_offset, u64 l2_offset, u64 l2_multiplier) { u64 nested_offset; - if (l2_multiplier == kvm_default_tsc_scaling_ratio) + if (l2_multiplier == kvm_caps.default_tsc_scaling_ratio) nested_offset = l1_offset; else nested_offset = mul_s64_u64_shr((s64) l1_offset, l2_multiplier, - kvm_tsc_scaling_ratio_frac_bits); + kvm_caps.tsc_scaling_ratio_frac_bits); nested_offset += l2_offset; return nested_offset; @@ -2515,9 +2516,9 @@ EXPORT_SYMBOL_GPL(kvm_calc_nested_tsc_offset); u64 kvm_calc_nested_tsc_multiplier(u64 l1_multiplier, u64 l2_multiplier) { - if (l2_multiplier != kvm_default_tsc_scaling_ratio) + if (l2_multiplier != kvm_caps.default_tsc_scaling_ratio) return mul_u64_u64_shr(l1_multiplier, l2_multiplier, - kvm_tsc_scaling_ratio_frac_bits); + kvm_caps.tsc_scaling_ratio_frac_bits); return l1_multiplier; } @@ -2559,7 +2560,7 @@ static void kvm_vcpu_write_tsc_multiplier(struct kvm_vcpu *vcpu, u64 l1_multipli else vcpu->arch.tsc_scaling_ratio = l1_multiplier; - if (kvm_has_tsc_control) + if (kvm_caps.has_tsc_control) static_call(kvm_x86_write_tsc_multiplier)( vcpu, vcpu->arch.tsc_scaling_ratio); } @@ -2695,7 +2696,7 @@ static inline void adjust_tsc_offset_guest(struct kvm_vcpu *vcpu, static inline void adjust_tsc_offset_host(struct kvm_vcpu *vcpu, s64 adjustment) { - if (vcpu->arch.l1_tsc_scaling_ratio != kvm_default_tsc_scaling_ratio) + if (vcpu->arch.l1_tsc_scaling_ratio != kvm_caps.default_tsc_scaling_ratio) WARN_ON(adjustment < 0); adjustment = kvm_scale_tsc((u64) adjustment, vcpu->arch.l1_tsc_scaling_ratio); @@ -3108,7 +3109,7 @@ static int kvm_guest_time_update(struct kvm_vcpu *v) /* With all the info we got, fill in the values */ - if (kvm_has_tsc_control) + if (kvm_caps.has_tsc_control) tgt_tsc_khz = kvm_scale_tsc(tgt_tsc_khz, v->arch.l1_tsc_scaling_ratio); @@ -3198,6 +3199,16 @@ static void kvmclock_sync_fn(struct work_struct *work) KVMCLOCK_SYNC_PERIOD); } +/* These helpers are safe iff @msr is known to be an MCx bank MSR. */ +static bool is_mci_control_msr(u32 msr) +{ + return (msr & 3) == 0; +} +static bool is_mci_status_msr(u32 msr) +{ + return (msr & 3) == 1; +} + /* * On AMD, HWCR[McStatusWrEn] controls whether setting MCi_STATUS results in #GP. */ @@ -3216,6 +3227,7 @@ static int set_msr_mce(struct kvm_vcpu *vcpu, struct msr_data *msr_info) unsigned bank_num = mcg_cap & 0xff; u32 msr = msr_info->index; u64 data = msr_info->data; + u32 offset, last_msr; switch (msr) { case MSR_IA32_MCG_STATUS: @@ -3229,32 +3241,53 @@ static int set_msr_mce(struct kvm_vcpu *vcpu, struct msr_data *msr_info) return 1; vcpu->arch.mcg_ctl = data; break; - default: - if (msr >= MSR_IA32_MC0_CTL && - msr < MSR_IA32_MCx_CTL(bank_num)) { - u32 offset = array_index_nospec( - msr - MSR_IA32_MC0_CTL, - MSR_IA32_MCx_CTL(bank_num) - MSR_IA32_MC0_CTL); - - /* only 0 or all 1s can be written to IA32_MCi_CTL - * some Linux kernels though clear bit 10 in bank 4 to - * workaround a BIOS/GART TBL issue on AMD K8s, ignore - * this to avoid an uncatched #GP in the guest - */ - if ((offset & 0x3) == 0 && - data != 0 && (data | (1 << 10)) != ~(u64)0) - return -1; - - /* MCi_STATUS */ - if (!msr_info->host_initiated && - (offset & 0x3) == 1 && data != 0) { - if (!can_set_mci_status(vcpu)) - return -1; - } + case MSR_IA32_MC0_CTL2 ... MSR_IA32_MCx_CTL2(KVM_MAX_MCE_BANKS) - 1: + last_msr = MSR_IA32_MCx_CTL2(bank_num) - 1; + if (msr > last_msr) + return 1; - vcpu->arch.mce_banks[offset] = data; - break; - } + if (!(mcg_cap & MCG_CMCI_P) && (data || !msr_info->host_initiated)) + return 1; + /* An attempt to write a 1 to a reserved bit raises #GP */ + if (data & ~(MCI_CTL2_CMCI_EN | MCI_CTL2_CMCI_THRESHOLD_MASK)) + return 1; + offset = array_index_nospec(msr - MSR_IA32_MC0_CTL2, + last_msr + 1 - MSR_IA32_MC0_CTL2); + vcpu->arch.mci_ctl2_banks[offset] = data; + break; + case MSR_IA32_MC0_CTL ... MSR_IA32_MCx_CTL(KVM_MAX_MCE_BANKS) - 1: + last_msr = MSR_IA32_MCx_CTL(bank_num) - 1; + if (msr > last_msr) + return 1; + + /* + * Only 0 or all 1s can be written to IA32_MCi_CTL, all other + * values are architecturally undefined. But, some Linux + * kernels clear bit 10 in bank 4 to workaround a BIOS/GART TLB + * issue on AMD K8s, allow bit 10 to be clear when setting all + * other bits in order to avoid an uncaught #GP in the guest. + * + * UNIXWARE clears bit 0 of MC1_CTL to ignore correctable, + * single-bit ECC data errors. + */ + if (is_mci_control_msr(msr) && + data != 0 && (data | (1 << 10) | 1) != ~(u64)0) + return 1; + + /* + * All CPUs allow writing 0 to MCi_STATUS MSRs to clear the MSR. + * AMD-based CPUs allow non-zero values, but if and only if + * HWCR[McStatusWrEn] is set. + */ + if (!msr_info->host_initiated && is_mci_status_msr(msr) && + data != 0 && !can_set_mci_status(vcpu)) + return 1; + + offset = array_index_nospec(msr - MSR_IA32_MC0_CTL, + last_msr + 1 - MSR_IA32_MC0_CTL); + vcpu->arch.mce_banks[offset] = data; + break; + default: return 1; } return 0; @@ -3538,7 +3571,8 @@ int kvm_set_msr_common(struct kvm_vcpu *vcpu, struct msr_data *msr_info) return 1; } break; - case 0x200 ... 0x2ff: + case 0x200 ... MSR_IA32_MC0_CTL2 - 1: + case MSR_IA32_MCx_CTL2(KVM_MAX_MCE_BANKS) ... 0x2ff: return kvm_mtrr_set_msr(vcpu, msr, data); case MSR_IA32_APICBASE: return kvm_set_apic_base(vcpu, msr_info); @@ -3560,9 +3594,21 @@ int kvm_set_msr_common(struct kvm_vcpu *vcpu, struct msr_data *msr_info) vcpu->arch.ia32_tsc_adjust_msr = data; } break; - case MSR_IA32_MISC_ENABLE: + case MSR_IA32_MISC_ENABLE: { + u64 old_val = vcpu->arch.ia32_misc_enable_msr; + + if (!msr_info->host_initiated) { + /* RO bits */ + if ((old_val ^ data) & MSR_IA32_MISC_ENABLE_PMU_RO_MASK) + return 1; + + /* R bits, i.e. writes are ignored, but don't fault. */ + data = data & ~MSR_IA32_MISC_ENABLE_EMON; + data |= old_val & MSR_IA32_MISC_ENABLE_EMON; + } + if (!kvm_check_has_quirk(vcpu->kvm, KVM_X86_QUIRK_MISC_ENABLE_NO_MWAIT) && - ((vcpu->arch.ia32_misc_enable_msr ^ data) & MSR_IA32_MISC_ENABLE_MWAIT)) { + ((old_val ^ data) & MSR_IA32_MISC_ENABLE_MWAIT)) { if (!guest_cpuid_has(vcpu, X86_FEATURE_XMM3)) return 1; vcpu->arch.ia32_misc_enable_msr = data; @@ -3571,6 +3617,7 @@ int kvm_set_msr_common(struct kvm_vcpu *vcpu, struct msr_data *msr_info) vcpu->arch.ia32_misc_enable_msr = data; } break; + } case MSR_IA32_SMBASE: if (!msr_info->host_initiated) return 1; @@ -3597,7 +3644,7 @@ int kvm_set_msr_common(struct kvm_vcpu *vcpu, struct msr_data *msr_info) * IA32_XSS[bit 8]. Guests have to use RDMSR/WRMSR rather than * XSAVES/XRSTORS to save/restore PT MSRs. */ - if (data & ~supported_xss) + if (data & ~kvm_caps.supported_xss) return 1; vcpu->arch.ia32_xss = data; kvm_update_cpuid_runtime(vcpu); @@ -3695,6 +3742,7 @@ int kvm_set_msr_common(struct kvm_vcpu *vcpu, struct msr_data *msr_info) case MSR_IA32_MCG_CTL: case MSR_IA32_MCG_STATUS: case MSR_IA32_MC0_CTL ... MSR_IA32_MCx_CTL(KVM_MAX_MCE_BANKS) - 1: + case MSR_IA32_MC0_CTL2 ... MSR_IA32_MCx_CTL2(KVM_MAX_MCE_BANKS) - 1: return set_msr_mce(vcpu, msr_info); case MSR_K7_PERFCTR0 ... MSR_K7_PERFCTR3: @@ -3785,6 +3833,17 @@ int kvm_set_msr_common(struct kvm_vcpu *vcpu, struct msr_data *msr_info) vcpu->arch.guest_fpu.xfd_err = data; break; #endif + case MSR_IA32_PEBS_ENABLE: + case MSR_IA32_DS_AREA: + case MSR_PEBS_DATA_CFG: + case MSR_F15H_PERF_CTL0 ... MSR_F15H_PERF_CTR5: + if (kvm_pmu_is_valid_msr(vcpu, msr)) + return kvm_pmu_set_msr(vcpu, msr_info); + /* + * Userspace is allowed to write '0' to MSRs that KVM reports + * as to-be-saved, even if an MSRs isn't fully supported. + */ + return !msr_info->host_initiated || data; default: if (kvm_pmu_is_valid_msr(vcpu, msr)) return kvm_pmu_set_msr(vcpu, msr_info); @@ -3799,6 +3858,7 @@ static int get_msr_mce(struct kvm_vcpu *vcpu, u32 msr, u64 *pdata, bool host) u64 data; u64 mcg_cap = vcpu->arch.mcg_cap; unsigned bank_num = mcg_cap & 0xff; + u32 offset, last_msr; switch (msr) { case MSR_IA32_P5_MC_ADDR: @@ -3816,16 +3876,27 @@ static int get_msr_mce(struct kvm_vcpu *vcpu, u32 msr, u64 *pdata, bool host) case MSR_IA32_MCG_STATUS: data = vcpu->arch.mcg_status; break; - default: - if (msr >= MSR_IA32_MC0_CTL && - msr < MSR_IA32_MCx_CTL(bank_num)) { - u32 offset = array_index_nospec( - msr - MSR_IA32_MC0_CTL, - MSR_IA32_MCx_CTL(bank_num) - MSR_IA32_MC0_CTL); + case MSR_IA32_MC0_CTL2 ... MSR_IA32_MCx_CTL2(KVM_MAX_MCE_BANKS) - 1: + last_msr = MSR_IA32_MCx_CTL2(bank_num) - 1; + if (msr > last_msr) + return 1; - data = vcpu->arch.mce_banks[offset]; - break; - } + if (!(mcg_cap & MCG_CMCI_P) && !host) + return 1; + offset = array_index_nospec(msr - MSR_IA32_MC0_CTL2, + last_msr + 1 - MSR_IA32_MC0_CTL2); + data = vcpu->arch.mci_ctl2_banks[offset]; + break; + case MSR_IA32_MC0_CTL ... MSR_IA32_MCx_CTL(KVM_MAX_MCE_BANKS) - 1: + last_msr = MSR_IA32_MCx_CTL(bank_num) - 1; + if (msr > last_msr) + return 1; + + offset = array_index_nospec(msr - MSR_IA32_MC0_CTL, + last_msr + 1 - MSR_IA32_MC0_CTL); + data = vcpu->arch.mce_banks[offset]; + break; + default: return 1; } *pdata = data; @@ -3865,9 +3936,16 @@ int kvm_get_msr_common(struct kvm_vcpu *vcpu, struct msr_data *msr_info) case MSR_DRAM_ENERGY_STATUS: /* DRAM controller */ msr_info->data = 0; break; + case MSR_IA32_PEBS_ENABLE: + case MSR_IA32_DS_AREA: + case MSR_PEBS_DATA_CFG: case MSR_F15H_PERF_CTL0 ... MSR_F15H_PERF_CTR5: if (kvm_pmu_is_valid_msr(vcpu, msr_info->index)) return kvm_pmu_get_msr(vcpu, msr_info); + /* + * Userspace is allowed to read MSRs that KVM reports as + * to-be-saved, even if an MSR isn't fully supported. + */ if (!msr_info->host_initiated) return 1; msr_info->data = 0; @@ -3922,7 +4000,8 @@ int kvm_get_msr_common(struct kvm_vcpu *vcpu, struct msr_data *msr_info) break; } case MSR_MTRRcap: - case 0x200 ... 0x2ff: + case 0x200 ... MSR_IA32_MC0_CTL2 - 1: + case MSR_IA32_MCx_CTL2(KVM_MAX_MCE_BANKS) ... 0x2ff: return kvm_mtrr_get_msr(vcpu, msr_info->index, &msr_info->data); case 0xcd: /* fsb frequency */ msr_info->data = 3; @@ -4038,6 +4117,7 @@ int kvm_get_msr_common(struct kvm_vcpu *vcpu, struct msr_data *msr_info) case MSR_IA32_MCG_CTL: case MSR_IA32_MCG_STATUS: case MSR_IA32_MC0_CTL ... MSR_IA32_MCx_CTL(KVM_MAX_MCE_BANKS) - 1: + case MSR_IA32_MC0_CTL2 ... MSR_IA32_MCx_CTL2(KVM_MAX_MCE_BANKS) - 1: return get_msr_mce(vcpu, msr_info->index, &msr_info->data, msr_info->host_initiated); case MSR_IA32_XSS: @@ -4280,6 +4360,7 @@ int kvm_vm_ioctl_check_extension(struct kvm *kvm, long ext) case KVM_CAP_GET_MSR_FEATURES: case KVM_CAP_MSR_PLATFORM_INFO: case KVM_CAP_EXCEPTION_PAYLOAD: + case KVM_CAP_X86_TRIPLE_FAULT_EVENT: case KVM_CAP_SET_GUEST_DEBUG: case KVM_CAP_LAST_CPU: case KVM_CAP_X86_USER_SPACE_MSR: @@ -4296,6 +4377,7 @@ int kvm_vm_ioctl_check_extension(struct kvm *kvm, long ext) case KVM_CAP_SYS_ATTRIBUTES: case KVM_CAP_VAPIC: case KVM_CAP_ENABLE_CAP: + case KVM_CAP_VM_DISABLE_NX_HUGE_PAGES: r = 1; break; case KVM_CAP_EXIT_HYPERCALL: @@ -4357,7 +4439,7 @@ int kvm_vm_ioctl_check_extension(struct kvm *kvm, long ext) break; case KVM_CAP_TSC_CONTROL: case KVM_CAP_VM_TSC_CONTROL: - r = kvm_has_tsc_control; + r = kvm_caps.has_tsc_control; break; case KVM_CAP_X2APIC_API: r = KVM_X2APIC_API_VALID_FLAGS; @@ -4379,7 +4461,7 @@ int kvm_vm_ioctl_check_extension(struct kvm *kvm, long ext) r = sched_info_on(); break; case KVM_CAP_X86_BUS_LOCK_EXIT: - if (kvm_has_bus_lock_exit) + if (kvm_caps.has_bus_lock_exit) r = KVM_BUS_LOCK_DETECTION_OFF | KVM_BUS_LOCK_DETECTION_EXIT; else @@ -4388,17 +4470,20 @@ int kvm_vm_ioctl_check_extension(struct kvm *kvm, long ext) case KVM_CAP_XSAVE2: { u64 guest_perm = xstate_get_guest_group_perm(); - r = xstate_required_size(supported_xcr0 & guest_perm, false); + r = xstate_required_size(kvm_caps.supported_xcr0 & guest_perm, false); if (r < sizeof(struct kvm_xsave)) r = sizeof(struct kvm_xsave); break; + } case KVM_CAP_PMU_CAPABILITY: r = enable_pmu ? KVM_CAP_PMU_VALID_MASK : 0; break; - } case KVM_CAP_DISABLE_QUIRKS2: r = KVM_X86_VALID_QUIRKS; break; + case KVM_CAP_X86_NOTIFY_VMEXIT: + r = kvm_caps.has_notify_vmexit; + break; default: break; } @@ -4426,7 +4511,7 @@ static int kvm_x86_dev_get_attr(struct kvm_device_attr *attr) switch (attr->attr) { case KVM_X86_XCOMP_GUEST_SUPP: - if (put_user(supported_xcr0, uaddr)) + if (put_user(kvm_caps.supported_xcr0, uaddr)) return -EFAULT; return 0; default: @@ -4503,8 +4588,8 @@ long kvm_arch_dev_ioctl(struct file *filp, } case KVM_X86_GET_MCE_CAP_SUPPORTED: r = -EFAULT; - if (copy_to_user(argp, &kvm_mce_cap_supported, - sizeof(kvm_mce_cap_supported))) + if (copy_to_user(argp, &kvm_caps.supported_mce_cap, + sizeof(kvm_caps.supported_mce_cap))) goto out; r = 0; break; @@ -4803,22 +4888,63 @@ static int kvm_vcpu_ioctl_x86_setup_mce(struct kvm_vcpu *vcpu, r = -EINVAL; if (!bank_num || bank_num > KVM_MAX_MCE_BANKS) goto out; - if (mcg_cap & ~(kvm_mce_cap_supported | 0xff | 0xff0000)) + if (mcg_cap & ~(kvm_caps.supported_mce_cap | 0xff | 0xff0000)) goto out; r = 0; vcpu->arch.mcg_cap = mcg_cap; /* Init IA32_MCG_CTL to all 1s */ if (mcg_cap & MCG_CTL_P) vcpu->arch.mcg_ctl = ~(u64)0; - /* Init IA32_MCi_CTL to all 1s */ - for (bank = 0; bank < bank_num; bank++) + /* Init IA32_MCi_CTL to all 1s, IA32_MCi_CTL2 to all 0s */ + for (bank = 0; bank < bank_num; bank++) { vcpu->arch.mce_banks[bank*4] = ~(u64)0; + if (mcg_cap & MCG_CMCI_P) + vcpu->arch.mci_ctl2_banks[bank] = 0; + } + + kvm_apic_after_set_mcg_cap(vcpu); static_call(kvm_x86_setup_mce)(vcpu); out: return r; } +/* + * Validate this is an UCNA (uncorrectable no action) error by checking the + * MCG_STATUS and MCi_STATUS registers: + * - none of the bits for Machine Check Exceptions are set + * - both the VAL (valid) and UC (uncorrectable) bits are set + * MCI_STATUS_PCC - Processor Context Corrupted + * MCI_STATUS_S - Signaled as a Machine Check Exception + * MCI_STATUS_AR - Software recoverable Action Required + */ +static bool is_ucna(struct kvm_x86_mce *mce) +{ + return !mce->mcg_status && + !(mce->status & (MCI_STATUS_PCC | MCI_STATUS_S | MCI_STATUS_AR)) && + (mce->status & MCI_STATUS_VAL) && + (mce->status & MCI_STATUS_UC); +} + +static int kvm_vcpu_x86_set_ucna(struct kvm_vcpu *vcpu, struct kvm_x86_mce *mce, u64* banks) +{ + u64 mcg_cap = vcpu->arch.mcg_cap; + + banks[1] = mce->status; + banks[2] = mce->addr; + banks[3] = mce->misc; + vcpu->arch.mcg_status = mce->mcg_status; + + if (!(mcg_cap & MCG_CMCI_P) || + !(vcpu->arch.mci_ctl2_banks[mce->bank] & MCI_CTL2_CMCI_EN)) + return 0; + + if (lapic_in_kernel(vcpu)) + kvm_apic_local_deliver(vcpu->arch.apic, APIC_LVTCMCI); + + return 0; +} + static int kvm_vcpu_ioctl_x86_set_mce(struct kvm_vcpu *vcpu, struct kvm_x86_mce *mce) { @@ -4828,6 +4954,12 @@ static int kvm_vcpu_ioctl_x86_set_mce(struct kvm_vcpu *vcpu, if (mce->bank >= bank_num || !(mce->status & MCI_STATUS_VAL)) return -EINVAL; + + banks += array_index_nospec(4 * mce->bank, 4 * bank_num); + + if (is_ucna(mce)) + return kvm_vcpu_x86_set_ucna(vcpu, mce, banks); + /* * if IA32_MCG_CTL is not all 1s, the uncorrected error * reporting is disabled @@ -4835,7 +4967,6 @@ static int kvm_vcpu_ioctl_x86_set_mce(struct kvm_vcpu *vcpu, if ((mce->status & MCI_STATUS_UC) && (mcg_cap & MCG_CTL_P) && vcpu->arch.mcg_ctl != ~(u64)0) return 0; - banks += 4 * mce->bank; /* * if IA32_MCi_CTL is not all 1s, the uncorrected error * reporting is disabled for the bank @@ -4941,6 +5072,10 @@ static void kvm_vcpu_ioctl_x86_get_vcpu_events(struct kvm_vcpu *vcpu, | KVM_VCPUEVENT_VALID_SMM); if (vcpu->kvm->arch.exception_payload_enabled) events->flags |= KVM_VCPUEVENT_VALID_PAYLOAD; + if (vcpu->kvm->arch.triple_fault_event) { + events->triple_fault.pending = kvm_test_request(KVM_REQ_TRIPLE_FAULT, vcpu); + events->flags |= KVM_VCPUEVENT_VALID_TRIPLE_FAULT; + } memset(&events->reserved, 0, sizeof(events->reserved)); } @@ -4954,7 +5089,8 @@ static int kvm_vcpu_ioctl_x86_set_vcpu_events(struct kvm_vcpu *vcpu, | KVM_VCPUEVENT_VALID_SIPI_VECTOR | KVM_VCPUEVENT_VALID_SHADOW | KVM_VCPUEVENT_VALID_SMM - | KVM_VCPUEVENT_VALID_PAYLOAD)) + | KVM_VCPUEVENT_VALID_PAYLOAD + | KVM_VCPUEVENT_VALID_TRIPLE_FAULT)) return -EINVAL; if (events->flags & KVM_VCPUEVENT_VALID_PAYLOAD) { @@ -5027,6 +5163,15 @@ static int kvm_vcpu_ioctl_x86_set_vcpu_events(struct kvm_vcpu *vcpu, } } + if (events->flags & KVM_VCPUEVENT_VALID_TRIPLE_FAULT) { + if (!vcpu->kvm->arch.triple_fault_event) + return -EINVAL; + if (events->triple_fault.pending) + kvm_make_request(KVM_REQ_TRIPLE_FAULT, vcpu); + else + kvm_clear_request(KVM_REQ_TRIPLE_FAULT, vcpu); + } + kvm_make_request(KVM_REQ_EVENT, vcpu); return 0; @@ -5095,7 +5240,8 @@ static int kvm_vcpu_ioctl_x86_set_xsave(struct kvm_vcpu *vcpu, return fpu_copy_uabi_to_guest_fpstate(&vcpu->arch.guest_fpu, guest_xsave->region, - supported_xcr0, &vcpu->arch.pkru); + kvm_caps.supported_xcr0, + &vcpu->arch.pkru); } static void kvm_vcpu_ioctl_x86_get_xcrs(struct kvm_vcpu *vcpu, @@ -5600,8 +5746,8 @@ long kvm_arch_vcpu_ioctl(struct file *filp, r = -EINVAL; user_tsc_khz = (u32)arg; - if (kvm_has_tsc_control && - user_tsc_khz >= kvm_max_guest_tsc_khz) + if (kvm_caps.has_tsc_control && + user_tsc_khz >= kvm_caps.max_guest_tsc_khz) goto out; if (user_tsc_khz == 0) @@ -6028,7 +6174,16 @@ split_irqchip_unlock: kvm->arch.exception_payload_enabled = cap->args[0]; r = 0; break; + case KVM_CAP_X86_TRIPLE_FAULT_EVENT: + kvm->arch.triple_fault_event = cap->args[0]; + r = 0; + break; case KVM_CAP_X86_USER_SPACE_MSR: + r = -EINVAL; + if (cap->args[0] & ~(KVM_MSR_EXIT_REASON_INVAL | + KVM_MSR_EXIT_REASON_UNKNOWN | + KVM_MSR_EXIT_REASON_FILTER)) + break; kvm->arch.user_space_msr_mask = cap->args[0]; r = 0; break; @@ -6041,7 +6196,7 @@ split_irqchip_unlock: (cap->args[0] & KVM_BUS_LOCK_DETECTION_EXIT)) break; - if (kvm_has_bus_lock_exit && + if (kvm_caps.has_bus_lock_exit && cap->args[0] & KVM_BUS_LOCK_DETECTION_EXIT) kvm->arch.bus_lock_detection_enabled = true; r = 0; @@ -6104,6 +6259,65 @@ split_irqchip_unlock: } mutex_unlock(&kvm->lock); break; + case KVM_CAP_MAX_VCPU_ID: + r = -EINVAL; + if (cap->args[0] > KVM_MAX_VCPU_IDS) + break; + + mutex_lock(&kvm->lock); + if (kvm->arch.max_vcpu_ids == cap->args[0]) { + r = 0; + } else if (!kvm->arch.max_vcpu_ids) { + kvm->arch.max_vcpu_ids = cap->args[0]; + r = 0; + } + mutex_unlock(&kvm->lock); + break; + case KVM_CAP_X86_NOTIFY_VMEXIT: + r = -EINVAL; + if ((u32)cap->args[0] & ~KVM_X86_NOTIFY_VMEXIT_VALID_BITS) + break; + if (!kvm_caps.has_notify_vmexit) + break; + if (!((u32)cap->args[0] & KVM_X86_NOTIFY_VMEXIT_ENABLED)) + break; + mutex_lock(&kvm->lock); + if (!kvm->created_vcpus) { + kvm->arch.notify_window = cap->args[0] >> 32; + kvm->arch.notify_vmexit_flags = (u32)cap->args[0]; + r = 0; + } + mutex_unlock(&kvm->lock); + break; + case KVM_CAP_VM_DISABLE_NX_HUGE_PAGES: + r = -EINVAL; + + /* + * Since the risk of disabling NX hugepages is a guest crashing + * the system, ensure the userspace process has permission to + * reboot the system. + * + * Note that unlike the reboot() syscall, the process must have + * this capability in the root namespace because exposing + * /dev/kvm into a container does not limit the scope of the + * iTLB multihit bug to that container. In other words, + * this must use capable(), not ns_capable(). + */ + if (!capable(CAP_SYS_BOOT)) { + r = -EPERM; + break; + } + + if (cap->args[0]) + break; + + mutex_lock(&kvm->lock); + if (!kvm->created_vcpus) { + kvm->arch.disable_nx_huge_pages = true; + r = 0; + } + mutex_unlock(&kvm->lock); + break; default: r = -EINVAL; break; @@ -6183,6 +6397,9 @@ static int kvm_vm_ioctl_set_msr_filter(struct kvm *kvm, void __user *argp) if (copy_from_user(&filter, user_msr_filter, sizeof(filter))) return -EFAULT; + if (filter.flags & ~KVM_MSR_FILTER_DEFAULT_DENY) + return -EINVAL; + for (i = 0; i < ARRAY_SIZE(filter.ranges); i++) empty &= !filter.ranges[i].nmsrs; @@ -6576,8 +6793,8 @@ set_pit2_out: r = -EINVAL; user_tsc_khz = (u32)arg; - if (kvm_has_tsc_control && - user_tsc_khz >= kvm_max_guest_tsc_khz) + if (kvm_caps.has_tsc_control && + user_tsc_khz >= kvm_caps.max_guest_tsc_khz) goto out; if (user_tsc_khz == 0) @@ -6652,15 +6869,12 @@ out: static void kvm_init_msr_list(void) { - struct x86_pmu_capability x86_pmu; u32 dummy[2]; unsigned i; BUILD_BUG_ON_MSG(KVM_PMC_MAX_FIXED != 3, "Please update the fixed PMCs in msrs_to_saved_all[]"); - perf_get_x86_pmu_capability(&x86_pmu); - num_msrs_to_save = 0; num_emulated_msrs = 0; num_msr_based_features = 0; @@ -6712,12 +6926,12 @@ static void kvm_init_msr_list(void) break; case MSR_ARCH_PERFMON_PERFCTR0 ... MSR_ARCH_PERFMON_PERFCTR0 + 17: if (msrs_to_save_all[i] - MSR_ARCH_PERFMON_PERFCTR0 >= - min(INTEL_PMC_MAX_GENERIC, x86_pmu.num_counters_gp)) + min(INTEL_PMC_MAX_GENERIC, kvm_pmu_cap.num_counters_gp)) continue; break; case MSR_ARCH_PERFMON_EVENTSEL0 ... MSR_ARCH_PERFMON_EVENTSEL0 + 17: if (msrs_to_save_all[i] - MSR_ARCH_PERFMON_EVENTSEL0 >= - min(INTEL_PMC_MAX_GENERIC, x86_pmu.num_counters_gp)) + min(INTEL_PMC_MAX_GENERIC, kvm_pmu_cap.num_counters_gp)) continue; break; case MSR_IA32_XFD: @@ -6874,7 +7088,7 @@ static int kvm_read_guest_virt_helper(gva_t addr, void *val, unsigned int bytes, unsigned toread = min(bytes, (unsigned)PAGE_SIZE - offset); int ret; - if (gpa == UNMAPPED_GVA) + if (gpa == INVALID_GPA) return X86EMUL_PROPAGATE_FAULT; ret = kvm_vcpu_read_guest_page(vcpu, gpa >> PAGE_SHIFT, data, offset, toread); @@ -6905,7 +7119,7 @@ static int kvm_fetch_guest_virt(struct x86_emulate_ctxt *ctxt, /* Inline kvm_read_guest_virt_helper for speed. */ gpa_t gpa = mmu->gva_to_gpa(vcpu, mmu, addr, access|PFERR_FETCH_MASK, exception); - if (unlikely(gpa == UNMAPPED_GVA)) + if (unlikely(gpa == INVALID_GPA)) return X86EMUL_PROPAGATE_FAULT; offset = addr & (PAGE_SIZE-1); @@ -6975,7 +7189,7 @@ static int kvm_write_guest_virt_helper(gva_t addr, void *val, unsigned int bytes unsigned towrite = min(bytes, (unsigned)PAGE_SIZE - offset); int ret; - if (gpa == UNMAPPED_GVA) + if (gpa == INVALID_GPA) return X86EMUL_PROPAGATE_FAULT; ret = kvm_vcpu_write_guest(vcpu, gpa, data, towrite); if (ret < 0) { @@ -7086,7 +7300,7 @@ static int vcpu_mmio_gva_to_gpa(struct kvm_vcpu *vcpu, unsigned long gva, *gpa = mmu->gva_to_gpa(vcpu, mmu, gva, access, exception); - if (*gpa == UNMAPPED_GVA) + if (*gpa == INVALID_GPA) return -1; return vcpu_is_mmio_gpa(vcpu, gva, *gpa, write); @@ -7323,7 +7537,7 @@ static int emulator_cmpxchg_emulated(struct x86_emulate_ctxt *ctxt, gpa = kvm_mmu_gva_to_gpa_write(vcpu, addr, NULL); - if (gpa == UNMAPPED_GVA || + if (gpa == INVALID_GPA || (gpa & PAGE_MASK) == APIC_DEFAULT_PHYS_BASE) goto emul_write; @@ -7377,36 +7591,47 @@ emul_write: return emulator_write_emulated(ctxt, addr, new, bytes, exception); } -static int kernel_pio(struct kvm_vcpu *vcpu, void *pd) +static int emulator_pio_in_out(struct kvm_vcpu *vcpu, int size, + unsigned short port, void *data, + unsigned int count, bool in) { - int r = 0, i; + unsigned i; + int r; - for (i = 0; i < vcpu->arch.pio.count; i++) { - if (vcpu->arch.pio.in) - r = kvm_io_bus_read(vcpu, KVM_PIO_BUS, vcpu->arch.pio.port, - vcpu->arch.pio.size, pd); + WARN_ON_ONCE(vcpu->arch.pio.count); + for (i = 0; i < count; i++) { + if (in) + r = kvm_io_bus_read(vcpu, KVM_PIO_BUS, port, size, data); else - r = kvm_io_bus_write(vcpu, KVM_PIO_BUS, - vcpu->arch.pio.port, vcpu->arch.pio.size, - pd); - if (r) + r = kvm_io_bus_write(vcpu, KVM_PIO_BUS, port, size, data); + + if (r) { + if (i == 0) + goto userspace_io; + + /* + * Userspace must have unregistered the device while PIO + * was running. Drop writes / read as 0. + */ + if (in) + memset(data, 0, size * (count - i)); break; - pd += vcpu->arch.pio.size; + } + + data += size; } - return r; -} + return 1; -static int emulator_pio_in_out(struct kvm_vcpu *vcpu, int size, - unsigned short port, - unsigned int count, bool in) -{ +userspace_io: vcpu->arch.pio.port = port; vcpu->arch.pio.in = in; - vcpu->arch.pio.count = count; + vcpu->arch.pio.count = count; vcpu->arch.pio.size = size; - if (!kernel_pio(vcpu, vcpu->arch.pio_data)) - return 1; + if (in) + memset(vcpu->arch.pio_data, 0, size * count); + else + memcpy(vcpu->arch.pio_data, data, size * count); vcpu->run->exit_reason = KVM_EXIT_IO; vcpu->run->io.direction = in ? KVM_EXIT_IO_IN : KVM_EXIT_IO_OUT; @@ -7414,30 +7639,33 @@ static int emulator_pio_in_out(struct kvm_vcpu *vcpu, int size, vcpu->run->io.data_offset = KVM_PIO_PAGE_OFFSET * PAGE_SIZE; vcpu->run->io.count = count; vcpu->run->io.port = port; - return 0; } -static int __emulator_pio_in(struct kvm_vcpu *vcpu, int size, - unsigned short port, unsigned int count) +static int emulator_pio_in(struct kvm_vcpu *vcpu, int size, + unsigned short port, void *val, unsigned int count) { - WARN_ON(vcpu->arch.pio.count); - memset(vcpu->arch.pio_data, 0, size * count); - return emulator_pio_in_out(vcpu, size, port, count, true); + int r = emulator_pio_in_out(vcpu, size, port, val, count, true); + if (r) + trace_kvm_pio(KVM_PIO_IN, port, size, count, val); + + return r; } static void complete_emulator_pio_in(struct kvm_vcpu *vcpu, void *val) { int size = vcpu->arch.pio.size; - unsigned count = vcpu->arch.pio.count; + unsigned int count = vcpu->arch.pio.count; memcpy(val, vcpu->arch.pio_data, size * count); trace_kvm_pio(KVM_PIO_IN, vcpu->arch.pio.port, size, count, vcpu->arch.pio_data); vcpu->arch.pio.count = 0; } -static int emulator_pio_in(struct kvm_vcpu *vcpu, int size, - unsigned short port, void *val, unsigned int count) +static int emulator_pio_in_emulated(struct x86_emulate_ctxt *ctxt, + int size, unsigned short port, void *val, + unsigned int count) { + struct kvm_vcpu *vcpu = emul_to_vcpu(ctxt); if (vcpu->arch.pio.count) { /* * Complete a previous iteration that required userspace I/O. @@ -7446,39 +7674,19 @@ static int emulator_pio_in(struct kvm_vcpu *vcpu, int size, * shenanigans as KVM doesn't support modifying the rep count, * and the emulator ensures @count doesn't overflow the buffer. */ - } else { - int r = __emulator_pio_in(vcpu, size, port, count); - if (!r) - return r; - - /* Results already available, fall through. */ + complete_emulator_pio_in(vcpu, val); + return 1; } - complete_emulator_pio_in(vcpu, val); - return 1; -} - -static int emulator_pio_in_emulated(struct x86_emulate_ctxt *ctxt, - int size, unsigned short port, void *val, - unsigned int count) -{ - return emulator_pio_in(emul_to_vcpu(ctxt), size, port, val, count); - + return emulator_pio_in(vcpu, size, port, val, count); } static int emulator_pio_out(struct kvm_vcpu *vcpu, int size, unsigned short port, const void *val, unsigned int count) { - int ret; - - memcpy(vcpu->arch.pio_data, val, size * count); - trace_kvm_pio(KVM_PIO_OUT, port, size, count, vcpu->arch.pio_data); - ret = emulator_pio_in_out(vcpu, size, port, count, false); - if (ret) - vcpu->arch.pio.count = 0; - - return ret; + trace_kvm_pio(KVM_PIO_OUT, port, size, count, val); + return emulator_pio_in_out(vcpu, size, port, (void *)val, count, false); } static int emulator_pio_out_emulated(struct x86_emulate_ctxt *ctxt, @@ -7860,7 +8068,16 @@ static int emulator_set_xcr(struct x86_emulate_ctxt *ctxt, u32 index, u64 xcr) return __kvm_set_xcr(emul_to_vcpu(ctxt), index, xcr); } +static void emulator_vm_bugged(struct x86_emulate_ctxt *ctxt) +{ + struct kvm *kvm = emul_to_vcpu(ctxt)->kvm; + + if (!kvm->vm_bugged) + kvm_vm_bugged(kvm); +} + static const struct x86_emulate_ops emulate_ops = { + .vm_bugged = emulator_vm_bugged, .read_gpr = emulator_read_gpr, .write_gpr = emulator_write_gpr, .read_std = emulator_read_std, @@ -8137,7 +8354,7 @@ static bool reexecute_instruction(struct kvm_vcpu *vcpu, gpa_t cr2_or_gpa, * If the mapping is invalid in guest, let cpu retry * it to generate fault. */ - if (gpa == UNMAPPED_GVA) + if (gpa == INVALID_GPA) return true; } @@ -8664,11 +8881,7 @@ static int complete_fast_pio_in(struct kvm_vcpu *vcpu) /* For size less than 4 we merge, else we zero extend */ val = (vcpu->arch.pio.size < 4) ? kvm_rax_read(vcpu) : 0; - /* - * Since vcpu->arch.pio.count == 1 let emulator_pio_in perform - * the copy and tracing - */ - emulator_pio_in(vcpu, vcpu->arch.pio.size, vcpu->arch.pio.port, &val, 1); + complete_emulator_pio_in(vcpu, &val); kvm_rax_write(vcpu, val); return kvm_skip_emulated_instruction(vcpu); @@ -8743,7 +8956,7 @@ static void kvm_hyperv_tsc_notifier(void) /* TSC frequency always matches when on Hyper-V */ for_each_present_cpu(cpu) per_cpu(cpu_tsc_khz, cpu) = tsc_khz; - kvm_max_guest_tsc_khz = tsc_khz; + kvm_caps.max_guest_tsc_khz = tsc_khz; list_for_each_entry(kvm, &vm_list, vm_list) { __kvm_start_pvclock_update(kvm); @@ -8944,25 +9157,23 @@ static struct notifier_block pvclock_gtod_notifier = { int kvm_arch_init(void *opaque) { struct kvm_x86_init_ops *ops = opaque; + u64 host_pat; int r; if (kvm_x86_ops.hardware_enable) { pr_err("kvm: already loaded vendor module '%s'\n", kvm_x86_ops.name); - r = -EEXIST; - goto out; + return -EEXIST; } if (!ops->cpu_has_kvm_support()) { pr_err_ratelimited("kvm: no hardware support for '%s'\n", ops->runtime_ops->name); - r = -EOPNOTSUPP; - goto out; + return -EOPNOTSUPP; } if (ops->disabled_by_bios()) { pr_err_ratelimited("kvm: support for '%s' disabled by bios\n", ops->runtime_ops->name); - r = -EOPNOTSUPP; - goto out; + return -EOPNOTSUPP; } /* @@ -8972,27 +9183,37 @@ int kvm_arch_init(void *opaque) */ if (!boot_cpu_has(X86_FEATURE_FPU) || !boot_cpu_has(X86_FEATURE_FXSR)) { printk(KERN_ERR "kvm: inadequate fpu\n"); - r = -EOPNOTSUPP; - goto out; + return -EOPNOTSUPP; } if (IS_ENABLED(CONFIG_PREEMPT_RT) && !boot_cpu_has(X86_FEATURE_CONSTANT_TSC)) { pr_err("RT requires X86_FEATURE_CONSTANT_TSC\n"); - r = -EOPNOTSUPP; - goto out; + return -EOPNOTSUPP; } - r = -ENOMEM; + /* + * KVM assumes that PAT entry '0' encodes WB memtype and simply zeroes + * the PAT bits in SPTEs. Bail if PAT[0] is programmed to something + * other than WB. Note, EPT doesn't utilize the PAT, but don't bother + * with an exception. PAT[0] is set to WB on RESET and also by the + * kernel, i.e. failure indicates a kernel bug or broken firmware. + */ + if (rdmsrl_safe(MSR_IA32_CR_PAT, &host_pat) || + (host_pat & GENMASK(2, 0)) != 6) { + pr_err("kvm: host PAT[0] is not WB\n"); + return -EIO; + } x86_emulator_cache = kvm_alloc_emulator_cache(); if (!x86_emulator_cache) { pr_err("kvm: failed to allocate cache for x86 emulator\n"); - goto out; + return -ENOMEM; } user_return_msrs = alloc_percpu(struct kvm_user_return_msrs); if (!user_return_msrs) { printk(KERN_ERR "kvm: failed to allocate percpu kvm_user_return_msrs\n"); + r = -ENOMEM; goto out_free_x86_emulator_cache; } kvm_nr_uret_msrs = 0; @@ -9005,7 +9226,7 @@ int kvm_arch_init(void *opaque) if (boot_cpu_has(X86_FEATURE_XSAVE)) { host_xcr0 = xgetbv(XCR_XFEATURE_ENABLED_MASK); - supported_xcr0 = host_xcr0 & KVM_SUPPORTED_XCR0; + kvm_caps.supported_xcr0 = host_xcr0 & KVM_SUPPORTED_XCR0; } if (pi_inject_timer == -1) @@ -9023,7 +9244,6 @@ out_free_percpu: free_percpu(user_return_msrs); out_free_x86_emulator_cache: kmem_cache_destroy(x86_emulator_cache); -out: return r; } @@ -9143,15 +9363,17 @@ static int kvm_pv_clock_pairing(struct kvm_vcpu *vcpu, gpa_t paddr, */ static void kvm_pv_kick_cpu_op(struct kvm *kvm, int apicid) { - struct kvm_lapic_irq lapic_irq; - - lapic_irq.shorthand = APIC_DEST_NOSHORT; - lapic_irq.dest_mode = APIC_DEST_PHYSICAL; - lapic_irq.level = 0; - lapic_irq.dest_id = apicid; - lapic_irq.msi_redir_hint = false; + /* + * All other fields are unused for APIC_DM_REMRD, but may be consumed by + * common code, e.g. for tracing. Defer initialization to the compiler. + */ + struct kvm_lapic_irq lapic_irq = { + .delivery_mode = APIC_DM_REMRD, + .dest_mode = APIC_DEST_PHYSICAL, + .shorthand = APIC_DEST_NOSHORT, + .dest_id = apicid, + }; - lapic_irq.delivery_mode = APIC_DM_REMRD; kvm_irq_delivery_to_apic(kvm, NULL, &lapic_irq, NULL); } @@ -9396,7 +9618,7 @@ static void update_cr8_intercept(struct kvm_vcpu *vcpu) if (!lapic_in_kernel(vcpu)) return; - if (vcpu->arch.apicv_active) + if (vcpu->arch.apic->apicv_active) return; if (!vcpu->arch.apic->vapic_addr) @@ -9425,6 +9647,11 @@ int kvm_check_nested_events(struct kvm_vcpu *vcpu) static void kvm_inject_exception(struct kvm_vcpu *vcpu) { + trace_kvm_inj_exception(vcpu->arch.exception.nr, + vcpu->arch.exception.has_error_code, + vcpu->arch.exception.error_code, + vcpu->arch.exception.injected); + if (vcpu->arch.exception.error_code && !is_protmode(vcpu)) vcpu->arch.exception.error_code = false; static_call(kvm_x86_queue_exception)(vcpu); @@ -9460,7 +9687,7 @@ static int inject_pending_event(struct kvm_vcpu *vcpu, bool *req_immediate_exit) static_call(kvm_x86_inject_nmi)(vcpu); can_inject = false; } else if (vcpu->arch.interrupt.injected) { - static_call(kvm_x86_inject_irq)(vcpu); + static_call(kvm_x86_inject_irq)(vcpu, true); can_inject = false; } } @@ -9482,13 +9709,6 @@ static int inject_pending_event(struct kvm_vcpu *vcpu, bool *req_immediate_exit) /* try to inject new event if pending */ if (vcpu->arch.exception.pending) { - trace_kvm_inj_exception(vcpu->arch.exception.nr, - vcpu->arch.exception.has_error_code, - vcpu->arch.exception.error_code); - - vcpu->arch.exception.pending = false; - vcpu->arch.exception.injected = true; - if (exception_type(vcpu->arch.exception.nr) == EXCPT_FAULT) __kvm_set_rflags(vcpu, kvm_get_rflags(vcpu) | X86_EFLAGS_RF); @@ -9502,6 +9722,10 @@ static int inject_pending_event(struct kvm_vcpu *vcpu, bool *req_immediate_exit) } kvm_inject_exception(vcpu); + + vcpu->arch.exception.pending = false; + vcpu->arch.exception.injected = true; + can_inject = false; } @@ -9554,7 +9778,7 @@ static int inject_pending_event(struct kvm_vcpu *vcpu, bool *req_immediate_exit) goto out; if (r) { kvm_queue_interrupt(vcpu, kvm_cpu_get_interrupt(vcpu), false); - static_call(kvm_x86_inject_irq)(vcpu); + static_call(kvm_x86_inject_irq)(vcpu, false); WARN_ON(static_call(kvm_x86_interrupt_allowed)(vcpu, true) < 0); } if (kvm_cpu_has_injectable_intr(vcpu)) @@ -9847,6 +10071,7 @@ void kvm_make_scan_ioapic_request(struct kvm *kvm) void kvm_vcpu_update_apicv(struct kvm_vcpu *vcpu) { + struct kvm_lapic *apic = vcpu->arch.apic; bool activate; if (!lapic_in_kernel(vcpu)) @@ -9855,12 +10080,14 @@ void kvm_vcpu_update_apicv(struct kvm_vcpu *vcpu) down_read(&vcpu->kvm->arch.apicv_update_lock); preempt_disable(); - activate = kvm_vcpu_apicv_activated(vcpu); + /* Do not activate APICV when APIC is disabled */ + activate = kvm_vcpu_apicv_activated(vcpu) && + (kvm_get_apic_mode(vcpu) != LAPIC_MODE_DISABLED); - if (vcpu->arch.apicv_active == activate) + if (apic->apicv_active == activate) goto out; - vcpu->arch.apicv_active = activate; + apic->apicv_active = activate; kvm_apic_update_apicv(vcpu); static_call(kvm_x86_refresh_apicv_exec_ctrl)(vcpu); @@ -9870,7 +10097,7 @@ void kvm_vcpu_update_apicv(struct kvm_vcpu *vcpu) * still active when the interrupt got accepted. Make sure * inject_pending_event() is called to check for that. */ - if (!vcpu->arch.apicv_active) + if (!apic->apicv_active) kvm_make_request(KVM_REQ_EVENT, vcpu); out: @@ -10265,7 +10492,8 @@ static int vcpu_enter_guest(struct kvm_vcpu *vcpu) * per-VM state, and responsing vCPUs must wait for the update * to complete before servicing KVM_REQ_APICV_UPDATE. */ - WARN_ON_ONCE(kvm_vcpu_apicv_activated(vcpu) != kvm_vcpu_apicv_active(vcpu)); + WARN_ON_ONCE((kvm_vcpu_apicv_activated(vcpu) != kvm_vcpu_apicv_active(vcpu)) && + (kvm_get_apic_mode(vcpu) != LAPIC_MODE_DISABLED)); exit_fastpath = static_call(kvm_x86_vcpu_run)(vcpu); if (likely(exit_fastpath != EXIT_FASTPATH_REENTER_GUEST)) @@ -10644,8 +10872,10 @@ int kvm_arch_vcpu_ioctl_run(struct kvm_vcpu *vcpu) r = cui(vcpu); if (r <= 0) goto out; - } else - WARN_ON(vcpu->arch.pio.count || vcpu->mmio_needed); + } else { + WARN_ON_ONCE(vcpu->arch.pio.count); + WARN_ON_ONCE(vcpu->mmio_needed); + } if (kvm_run->immediate_exit) { r = -EINTR; @@ -11173,7 +11403,7 @@ int kvm_arch_vcpu_ioctl_translate(struct kvm_vcpu *vcpu, gpa = kvm_mmu_gva_to_gpa_system(vcpu, vaddr, NULL); srcu_read_unlock(&vcpu->kvm->srcu, idx); tr->physical_address = gpa; - tr->valid = gpa != UNMAPPED_GVA; + tr->valid = gpa != INVALID_GPA; tr->writeable = 1; tr->usermode = 0; @@ -11266,11 +11496,17 @@ static int sync_regs(struct kvm_vcpu *vcpu) int kvm_arch_vcpu_precreate(struct kvm *kvm, unsigned int id) { - if (kvm_check_tsc_unstable() && atomic_read(&kvm->online_vcpus) != 0) + if (kvm_check_tsc_unstable() && kvm->created_vcpus) pr_warn_once("kvm: SMP vm created on host with unstable TSC; " "guest TSC will not be reliable\n"); - return 0; + if (!kvm->arch.max_vcpu_ids) + kvm->arch.max_vcpu_ids = KVM_MAX_VCPU_IDS; + + if (id >= kvm->arch.max_vcpu_ids) + return -EINVAL; + + return static_call(kvm_x86_vcpu_precreate)(kvm); } int kvm_arch_vcpu_create(struct kvm_vcpu *vcpu) @@ -11307,7 +11543,7 @@ int kvm_arch_vcpu_create(struct kvm_vcpu *vcpu) * will ensure the vCPU gets the correct state before VM-Entry. */ if (enable_apicv) { - vcpu->arch.apicv_active = true; + vcpu->arch.apic->apicv_active = true; kvm_make_request(KVM_REQ_APICV_UPDATE, vcpu); } } else @@ -11320,9 +11556,11 @@ int kvm_arch_vcpu_create(struct kvm_vcpu *vcpu) goto fail_free_lapic; vcpu->arch.pio_data = page_address(page); - vcpu->arch.mce_banks = kzalloc(KVM_MAX_MCE_BANKS * sizeof(u64) * 4, + vcpu->arch.mce_banks = kcalloc(KVM_MAX_MCE_BANKS * 4, sizeof(u64), GFP_KERNEL_ACCOUNT); - if (!vcpu->arch.mce_banks) + vcpu->arch.mci_ctl2_banks = kcalloc(KVM_MAX_MCE_BANKS, sizeof(u64), + GFP_KERNEL_ACCOUNT); + if (!vcpu->arch.mce_banks || !vcpu->arch.mci_ctl2_banks) goto fail_free_pio_data; vcpu->arch.mcg_cap = KVM_MAX_MCE_BANKS; @@ -11376,6 +11614,7 @@ free_wbinvd_dirty_mask: free_cpumask_var(vcpu->arch.wbinvd_dirty_mask); fail_free_mce_banks: kfree(vcpu->arch.mce_banks); + kfree(vcpu->arch.mci_ctl2_banks); fail_free_pio_data: free_page((unsigned long)vcpu->arch.pio_data); fail_free_lapic: @@ -11421,6 +11660,7 @@ void kvm_arch_vcpu_destroy(struct kvm_vcpu *vcpu) kvm_hv_vcpu_uninit(vcpu); kvm_pmu_destroy(vcpu); kfree(vcpu->arch.mce_banks); + kfree(vcpu->arch.mci_ctl2_banks); kvm_free_lapic(vcpu); idx = srcu_read_lock(&vcpu->kvm->srcu); kvm_mmu_destroy(vcpu); @@ -11500,6 +11740,8 @@ void kvm_vcpu_reset(struct kvm_vcpu *vcpu, bool init_event) vcpu->arch.smbase = 0x30000; vcpu->arch.msr_misc_features_enables = 0; + vcpu->arch.ia32_misc_enable_msr = MSR_IA32_MISC_ENABLE_PEBS_UNAVAIL | + MSR_IA32_MISC_ENABLE_BTS_UNAVAIL; __kvm_set_xcr(vcpu, 0, XFEATURE_MASK_FP); __kvm_set_msr(vcpu, MSR_IA32_XSS, 0, true); @@ -11516,7 +11758,7 @@ void kvm_vcpu_reset(struct kvm_vcpu *vcpu, bool init_event) * i.e. it's impossible for kvm_find_cpuid_entry() to find a valid entry * on RESET. But, go through the motions in case that's ever remedied. */ - cpuid_0x1 = kvm_find_cpuid_entry(vcpu, 1, 0); + cpuid_0x1 = kvm_find_cpuid_entry(vcpu, 1); kvm_rdx_write(vcpu, cpuid_0x1 ? cpuid_0x1->eax : 0x600); static_call(kvm_x86_vcpu_reset)(vcpu, init_event); @@ -11707,6 +11949,8 @@ int kvm_arch_hardware_setup(void *opaque) if (boot_cpu_has(X86_FEATURE_XSAVES)) rdmsrl(MSR_IA32_XSS, host_xss); + kvm_init_pmu_capability(); + r = ops->hardware_setup(); if (r != 0) return r; @@ -11716,13 +11960,13 @@ int kvm_arch_hardware_setup(void *opaque) kvm_register_perf_callbacks(ops->handle_intel_pt_intr); if (!kvm_cpu_cap_has(X86_FEATURE_XSAVES)) - supported_xss = 0; + kvm_caps.supported_xss = 0; #define __kvm_cpu_cap_has(UNUSED_, f) kvm_cpu_cap_has(f) cr4_reserved_bits = __cr4_reserved_bits(__kvm_cpu_cap_has, UNUSED_); #undef __kvm_cpu_cap_has - if (kvm_has_tsc_control) { + if (kvm_caps.has_tsc_control) { /* * Make sure the user can only configure tsc_khz values that * fit into a signed integer. @@ -11730,10 +11974,10 @@ int kvm_arch_hardware_setup(void *opaque) * be 1 on all machines. */ u64 max = min(0x7fffffffULL, - __scale_tsc(kvm_max_tsc_scaling_ratio, tsc_khz)); - kvm_max_guest_tsc_khz = max; + __scale_tsc(kvm_caps.max_tsc_scaling_ratio, tsc_khz)); + kvm_caps.max_guest_tsc_khz = max; } - kvm_default_tsc_scaling_ratio = 1ULL << kvm_tsc_scaling_ratio_frac_bits; + kvm_caps.default_tsc_scaling_ratio = 1ULL << kvm_caps.tsc_scaling_ratio_frac_bits; kvm_init_msr_list(); return 0; } @@ -12321,7 +12565,8 @@ int kvm_arch_vcpu_runnable(struct kvm_vcpu *vcpu) bool kvm_arch_dy_has_pending_interrupt(struct kvm_vcpu *vcpu) { - if (vcpu->arch.apicv_active && static_call(kvm_x86_dy_apicv_has_pending_interrupt)(vcpu)) + if (kvm_vcpu_apicv_active(vcpu) && + static_call(kvm_x86_dy_apicv_has_pending_interrupt)(vcpu)) return true; return false; @@ -12631,9 +12876,9 @@ void kvm_arch_end_assignment(struct kvm *kvm) } EXPORT_SYMBOL_GPL(kvm_arch_end_assignment); -bool kvm_arch_has_assigned_device(struct kvm *kvm) +bool noinstr kvm_arch_has_assigned_device(struct kvm *kvm) { - return atomic_read(&kvm->arch.assigned_device_count); + return arch_atomic_read(&kvm->arch.assigned_device_count); } EXPORT_SYMBOL_GPL(kvm_arch_has_assigned_device); @@ -12763,7 +13008,7 @@ void kvm_fixup_and_inject_pf_error(struct kvm_vcpu *vcpu, gva_t gva, u16 error_c (PFERR_WRITE_MASK | PFERR_FETCH_MASK | PFERR_USER_MASK); if (!(error_code & PFERR_PRESENT_MASK) || - mmu->gva_to_gpa(vcpu, mmu, gva, access, &fault) != UNMAPPED_GVA) { + mmu->gva_to_gpa(vcpu, mmu, gva, access, &fault) != INVALID_GPA) { /* * If vcpu->arch.walk_mmu->gva_to_gpa succeeded, the page * tables probably do not match the TLB. Just proceed @@ -12988,6 +13233,12 @@ int kvm_sev_es_mmio_read(struct kvm_vcpu *vcpu, gpa_t gpa, unsigned int bytes, } EXPORT_SYMBOL_GPL(kvm_sev_es_mmio_read); +static void advance_sev_es_emulated_pio(struct kvm_vcpu *vcpu, unsigned count, int size) +{ + vcpu->arch.sev_pio_count -= count; + vcpu->arch.sev_pio_data += count * size; +} + static int kvm_sev_es_outs(struct kvm_vcpu *vcpu, unsigned int size, unsigned int port); @@ -13011,8 +13262,7 @@ static int kvm_sev_es_outs(struct kvm_vcpu *vcpu, unsigned int size, int ret = emulator_pio_out(vcpu, size, port, vcpu->arch.sev_pio_data, count); /* memcpy done already by emulator_pio_out. */ - vcpu->arch.sev_pio_count -= count; - vcpu->arch.sev_pio_data += count * vcpu->arch.pio.size; + advance_sev_es_emulated_pio(vcpu, count, size); if (!ret) break; @@ -13028,20 +13278,14 @@ static int kvm_sev_es_outs(struct kvm_vcpu *vcpu, unsigned int size, static int kvm_sev_es_ins(struct kvm_vcpu *vcpu, unsigned int size, unsigned int port); -static void advance_sev_es_emulated_ins(struct kvm_vcpu *vcpu) -{ - unsigned count = vcpu->arch.pio.count; - complete_emulator_pio_in(vcpu, vcpu->arch.sev_pio_data); - vcpu->arch.sev_pio_count -= count; - vcpu->arch.sev_pio_data += count * vcpu->arch.pio.size; -} - static int complete_sev_es_emulated_ins(struct kvm_vcpu *vcpu) { + unsigned count = vcpu->arch.pio.count; int size = vcpu->arch.pio.size; int port = vcpu->arch.pio.port; - advance_sev_es_emulated_ins(vcpu); + complete_emulator_pio_in(vcpu, vcpu->arch.sev_pio_data); + advance_sev_es_emulated_pio(vcpu, count, size); if (vcpu->arch.sev_pio_count) return kvm_sev_es_ins(vcpu, size, port); return 1; @@ -13053,11 +13297,11 @@ static int kvm_sev_es_ins(struct kvm_vcpu *vcpu, unsigned int size, for (;;) { unsigned int count = min_t(unsigned int, PAGE_SIZE / size, vcpu->arch.sev_pio_count); - if (!__emulator_pio_in(vcpu, size, port, count)) + if (!emulator_pio_in(vcpu, size, port, vcpu->arch.sev_pio_data, count)) break; /* Emulation done by the kernel. */ - advance_sev_es_emulated_ins(vcpu); + advance_sev_es_emulated_pio(vcpu, count, size); if (!vcpu->arch.sev_pio_count) return 1; } @@ -13100,6 +13344,7 @@ EXPORT_TRACEPOINT_SYMBOL_GPL(kvm_avic_unaccelerated_access); EXPORT_TRACEPOINT_SYMBOL_GPL(kvm_avic_incomplete_ipi); EXPORT_TRACEPOINT_SYMBOL_GPL(kvm_avic_ga_log); EXPORT_TRACEPOINT_SYMBOL_GPL(kvm_avic_kick_vcpu_slowpath); +EXPORT_TRACEPOINT_SYMBOL_GPL(kvm_avic_doorbell); EXPORT_TRACEPOINT_SYMBOL_GPL(kvm_apicv_accept_irq); EXPORT_TRACEPOINT_SYMBOL_GPL(kvm_vmgexit_enter); EXPORT_TRACEPOINT_SYMBOL_GPL(kvm_vmgexit_exit); diff --git a/arch/x86/kvm/x86.h b/arch/x86/kvm/x86.h index 588792f00334..1926d2cb8e79 100644 --- a/arch/x86/kvm/x86.h +++ b/arch/x86/kvm/x86.h @@ -8,6 +8,27 @@ #include "kvm_cache_regs.h" #include "kvm_emulate.h" +struct kvm_caps { + /* control of guest tsc rate supported? */ + bool has_tsc_control; + /* maximum supported tsc_khz for guests */ + u32 max_guest_tsc_khz; + /* number of bits of the fractional part of the TSC scaling ratio */ + u8 tsc_scaling_ratio_frac_bits; + /* maximum allowed value of TSC scaling ratio */ + u64 max_tsc_scaling_ratio; + /* 1ull << kvm_caps.tsc_scaling_ratio_frac_bits */ + u64 default_tsc_scaling_ratio; + /* bus lock detection supported? */ + bool has_bus_lock_exit; + /* notify VM exit supported? */ + bool has_notify_vmexit; + + u64 supported_mce_cap; + u64 supported_xcr0; + u64 supported_xss; +}; + void kvm_spurious_fault(void); #define KVM_NESTED_VMENTER_CONSISTENCY_CHECK(consistency_check) \ @@ -283,14 +304,15 @@ int x86_emulate_instruction(struct kvm_vcpu *vcpu, gpa_t cr2_or_gpa, fastpath_t handle_fastpath_set_msr_irqoff(struct kvm_vcpu *vcpu); extern u64 host_xcr0; -extern u64 supported_xcr0; extern u64 host_xss; -extern u64 supported_xss; + +extern struct kvm_caps kvm_caps; + extern bool enable_pmu; static inline bool kvm_mpx_supported(void) { - return (supported_xcr0 & (XFEATURE_MASK_BNDREGS | XFEATURE_MASK_BNDCSR)) + return (kvm_caps.supported_xcr0 & (XFEATURE_MASK_BNDREGS | XFEATURE_MASK_BNDCSR)) == (XFEATURE_MASK_BNDREGS | XFEATURE_MASK_BNDCSR); } @@ -344,6 +366,11 @@ static inline bool kvm_cstate_in_guest(struct kvm *kvm) return kvm->arch.cstate_in_guest; } +static inline bool kvm_notify_vmexit_enabled(struct kvm *kvm) +{ + return kvm->arch.notify_vmexit_flags & KVM_X86_NOTIFY_VMEXIT_ENABLED; +} + enum kvm_intr_type { /* Values are arbitrary, but must be non-zero. */ KVM_HANDLING_IRQ = 1, @@ -407,7 +434,7 @@ static inline void kvm_machine_check(void) void kvm_load_guest_xsave_state(struct kvm_vcpu *vcpu); void kvm_load_host_xsave_state(struct kvm_vcpu *vcpu); int kvm_spec_ctrl_test_value(u64 value); -bool kvm_is_valid_cr4(struct kvm_vcpu *vcpu, unsigned long cr4); +bool __kvm_is_valid_cr4(struct kvm_vcpu *vcpu, unsigned long cr4); int kvm_handle_memory_failure(struct kvm_vcpu *vcpu, int r, struct x86_exception *e); int kvm_handle_invpcid(struct kvm_vcpu *vcpu, unsigned long type, gva_t gva); diff --git a/arch/x86/kvm/xen.c b/arch/x86/kvm/xen.c index 610beba35907..a0c05ccbf4b1 100644 --- a/arch/x86/kvm/xen.c +++ b/arch/x86/kvm/xen.c @@ -1049,7 +1049,7 @@ static bool kvm_xen_schedop_poll(struct kvm_vcpu *vcpu, bool longmode, else vcpu->arch.xen.poll_evtchn = -1; - set_bit(kvm_vcpu_get_idx(vcpu), vcpu->kvm->arch.xen.poll_mask); + set_bit(vcpu->vcpu_idx, vcpu->kvm->arch.xen.poll_mask); if (!wait_pending_event(vcpu, sched_poll.nr_ports, ports)) { vcpu->arch.mp_state = KVM_MP_STATE_HALTED; @@ -1071,7 +1071,7 @@ static bool kvm_xen_schedop_poll(struct kvm_vcpu *vcpu, bool longmode, *r = 0; out: /* Really, this is only needed in case of timeout */ - clear_bit(kvm_vcpu_get_idx(vcpu), vcpu->kvm->arch.xen.poll_mask); + clear_bit(vcpu->vcpu_idx, vcpu->kvm->arch.xen.poll_mask); if (unlikely(sched_poll.nr_ports > 1)) kfree(ports); @@ -1311,7 +1311,7 @@ static void kvm_xen_check_poller(struct kvm_vcpu *vcpu, int port) int poll_evtchn = vcpu->arch.xen.poll_evtchn; if ((poll_evtchn == port || poll_evtchn == -1) && - test_and_clear_bit(kvm_vcpu_get_idx(vcpu), vcpu->kvm->arch.xen.poll_mask)) { + test_and_clear_bit(vcpu->vcpu_idx, vcpu->kvm->arch.xen.poll_mask)) { kvm_make_request(KVM_REQ_UNBLOCK, vcpu); kvm_vcpu_kick(vcpu); } @@ -1344,7 +1344,7 @@ int kvm_xen_set_evtchn_fast(struct kvm_xen_evtchn *xe, struct kvm *kvm) vcpu = kvm_get_vcpu_by_id(kvm, xe->vcpu_id); if (!vcpu) return -EINVAL; - WRITE_ONCE(xe->vcpu_idx, kvm_vcpu_get_idx(vcpu)); + WRITE_ONCE(xe->vcpu_idx, vcpu->vcpu_idx); } if (!vcpu->arch.xen.vcpu_info_cache.active) @@ -1540,7 +1540,7 @@ int kvm_xen_setup_evtchn(struct kvm *kvm, */ vcpu = kvm_get_vcpu_by_id(kvm, ue->u.xen_evtchn.vcpu); if (vcpu) - e->xen_evtchn.vcpu_idx = kvm_vcpu_get_idx(vcpu); + e->xen_evtchn.vcpu_idx = vcpu->vcpu_idx; else e->xen_evtchn.vcpu_idx = -1; diff --git a/arch/x86/lib/memmove_64.S b/arch/x86/lib/memmove_64.S index d83cba364e31..724bbf83eb5b 100644 --- a/arch/x86/lib/memmove_64.S +++ b/arch/x86/lib/memmove_64.S @@ -39,7 +39,7 @@ SYM_FUNC_START(__memmove) /* FSRM implies ERMS => no length checks, do the copy directly */ .Lmemmove_begin_forward: ALTERNATIVE "cmp $0x20, %rdx; jb 1f", "", X86_FEATURE_FSRM - ALTERNATIVE "", __stringify(movq %rdx, %rcx; rep movsb; RET), X86_FEATURE_ERMS + ALTERNATIVE "", "jmp .Lmemmove_erms", X86_FEATURE_ERMS /* * movsq instruction have many startup latency @@ -205,6 +205,11 @@ SYM_FUNC_START(__memmove) movb %r11b, (%rdi) 13: RET + +.Lmemmove_erms: + movq %rdx, %rcx + rep movsb + RET SYM_FUNC_END(__memmove) EXPORT_SYMBOL(__memmove) diff --git a/arch/x86/lib/retpoline.S b/arch/x86/lib/retpoline.S index b2b2366885a2..073289a55f84 100644 --- a/arch/x86/lib/retpoline.S +++ b/arch/x86/lib/retpoline.S @@ -33,9 +33,9 @@ SYM_INNER_LABEL(__x86_indirect_thunk_\reg, SYM_L_GLOBAL) UNWIND_HINT_EMPTY ANNOTATE_NOENDBR - ALTERNATIVE_2 __stringify(ANNOTATE_RETPOLINE_SAFE; jmp *%\reg), \ - __stringify(RETPOLINE \reg), X86_FEATURE_RETPOLINE, \ - __stringify(lfence; ANNOTATE_RETPOLINE_SAFE; jmp *%\reg; int3), X86_FEATURE_RETPOLINE_LFENCE + ALTERNATIVE_2 __stringify(RETPOLINE \reg), \ + __stringify(lfence; ANNOTATE_RETPOLINE_SAFE; jmp *%\reg; int3), X86_FEATURE_RETPOLINE_LFENCE, \ + __stringify(ANNOTATE_RETPOLINE_SAFE; jmp *%\reg), ALT_NOT(X86_FEATURE_RETPOLINE) .endm @@ -67,3 +67,76 @@ SYM_CODE_END(__x86_indirect_thunk_array) #define GEN(reg) EXPORT_THUNK(reg) #include <asm/GEN-for-each-reg.h> #undef GEN + +/* + * This function name is magical and is used by -mfunction-return=thunk-extern + * for the compiler to generate JMPs to it. + */ +#ifdef CONFIG_RETHUNK + + .section .text.__x86.return_thunk + +/* + * Safety details here pertain to the AMD Zen{1,2} microarchitecture: + * 1) The RET at __x86_return_thunk must be on a 64 byte boundary, for + * alignment within the BTB. + * 2) The instruction at zen_untrain_ret must contain, and not + * end with, the 0xc3 byte of the RET. + * 3) STIBP must be enabled, or SMT disabled, to prevent the sibling thread + * from re-poisioning the BTB prediction. + */ + .align 64 + .skip 63, 0xcc +SYM_FUNC_START_NOALIGN(zen_untrain_ret); + + /* + * As executed from zen_untrain_ret, this is: + * + * TEST $0xcc, %bl + * LFENCE + * JMP __x86_return_thunk + * + * Executing the TEST instruction has a side effect of evicting any BTB + * prediction (potentially attacker controlled) attached to the RET, as + * __x86_return_thunk + 1 isn't an instruction boundary at the moment. + */ + .byte 0xf6 + + /* + * As executed from __x86_return_thunk, this is a plain RET. + * + * As part of the TEST above, RET is the ModRM byte, and INT3 the imm8. + * + * We subsequently jump backwards and architecturally execute the RET. + * This creates a correct BTB prediction (type=ret), but in the + * meantime we suffer Straight Line Speculation (because the type was + * no branch) which is halted by the INT3. + * + * With SMT enabled and STIBP active, a sibling thread cannot poison + * RET's prediction to a type of its choice, but can evict the + * prediction due to competitive sharing. If the prediction is + * evicted, __x86_return_thunk will suffer Straight Line Speculation + * which will be contained safely by the INT3. + */ +SYM_INNER_LABEL(__x86_return_thunk, SYM_L_GLOBAL) + ret + int3 +SYM_CODE_END(__x86_return_thunk) + + /* + * Ensure the TEST decoding / BTB invalidation is complete. + */ + lfence + + /* + * Jump back and execute the RET in the middle of the TEST instruction. + * INT3 is for SLS protection. + */ + jmp __x86_return_thunk + int3 +SYM_FUNC_END(zen_untrain_ret) +__EXPORT_THUNK(zen_untrain_ret) + +EXPORT_SYMBOL(__x86_return_thunk) + +#endif /* CONFIG_RETHUNK */ diff --git a/arch/x86/mm/extable.c b/arch/x86/mm/extable.c index dba2197c05c3..331310c29349 100644 --- a/arch/x86/mm/extable.c +++ b/arch/x86/mm/extable.c @@ -94,16 +94,18 @@ static bool ex_handler_copy(const struct exception_table_entry *fixup, static bool ex_handler_msr(const struct exception_table_entry *fixup, struct pt_regs *regs, bool wrmsr, bool safe, int reg) { - if (!safe && wrmsr && - pr_warn_once("unchecked MSR access error: WRMSR to 0x%x (tried to write 0x%08x%08x) at rIP: 0x%lx (%pS)\n", - (unsigned int)regs->cx, (unsigned int)regs->dx, - (unsigned int)regs->ax, regs->ip, (void *)regs->ip)) + if (__ONCE_LITE_IF(!safe && wrmsr)) { + pr_warn("unchecked MSR access error: WRMSR to 0x%x (tried to write 0x%08x%08x) at rIP: 0x%lx (%pS)\n", + (unsigned int)regs->cx, (unsigned int)regs->dx, + (unsigned int)regs->ax, regs->ip, (void *)regs->ip); show_stack_regs(regs); + } - if (!safe && !wrmsr && - pr_warn_once("unchecked MSR access error: RDMSR from 0x%x at rIP: 0x%lx (%pS)\n", - (unsigned int)regs->cx, regs->ip, (void *)regs->ip)) + if (__ONCE_LITE_IF(!safe && !wrmsr)) { + pr_warn("unchecked MSR access error: RDMSR from 0x%x at rIP: 0x%lx (%pS)\n", + (unsigned int)regs->cx, regs->ip, (void *)regs->ip); show_stack_regs(regs); + } if (!wrmsr) { /* Pretend that the read succeeded and returned 0. */ diff --git a/arch/x86/mm/fault.c b/arch/x86/mm/fault.c index fad8faa29d04..fa71a5d12e87 100644 --- a/arch/x86/mm/fault.c +++ b/arch/x86/mm/fault.c @@ -1408,6 +1408,10 @@ good_area: return; } + /* The fault is fully completed (including releasing mmap lock) */ + if (fault & VM_FAULT_COMPLETED) + return; + /* * If we need to retry the mmap_lock has already been released, * and if there is a fatal signal pending there is no guarantee @@ -1526,7 +1530,7 @@ DEFINE_IDTENTRY_RAW_ERRORCODE(exc_page_fault) /* * Entry handling for valid #PF from kernel mode is slightly - * different: RCU is already watching and rcu_irq_enter() must not + * different: RCU is already watching and ct_irq_enter() must not * be invoked because a kernel fault on a user space address might * sleep. * diff --git a/arch/x86/mm/hugetlbpage.c b/arch/x86/mm/hugetlbpage.c index a0d023cb4292..509408da0da1 100644 --- a/arch/x86/mm/hugetlbpage.c +++ b/arch/x86/mm/hugetlbpage.c @@ -19,44 +19,6 @@ #include <asm/tlbflush.h> #include <asm/elf.h> -#if 0 /* This is just for testing */ -struct page * -follow_huge_addr(struct mm_struct *mm, unsigned long address, int write) -{ - unsigned long start = address; - int length = 1; - int nr; - struct page *page; - struct vm_area_struct *vma; - - vma = find_vma(mm, addr); - if (!vma || !is_vm_hugetlb_page(vma)) - return ERR_PTR(-EINVAL); - - pte = huge_pte_offset(mm, address, vma_mmu_pagesize(vma)); - - /* hugetlb should be locked, and hence, prefaulted */ - WARN_ON(!pte || pte_none(*pte)); - - page = &pte_page(*pte)[vpfn % (HPAGE_SIZE/PAGE_SIZE)]; - - WARN_ON(!PageHead(page)); - - return page; -} - -int pmd_huge(pmd_t pmd) -{ - return 0; -} - -int pud_huge(pud_t pud) -{ - return 0; -} - -#else - /* * pmd_huge() returns 1 if @pmd is hugetlb related entry, that is normal * hugetlb entry or non-present (migration or hwpoisoned) hugetlb entry. @@ -72,7 +34,6 @@ int pud_huge(pud_t pud) { return !!(pud_val(pud) & _PAGE_PSE); } -#endif #ifdef CONFIG_HUGETLB_PAGE static unsigned long hugetlb_get_unmapped_area_bottomup(struct file *file, diff --git a/arch/x86/mm/init.c b/arch/x86/mm/init.c index d8cfce221275..82a042c03824 100644 --- a/arch/x86/mm/init.c +++ b/arch/x86/mm/init.c @@ -77,10 +77,20 @@ static uint8_t __pte2cachemode_tbl[8] = { [__pte2cm_idx(_PAGE_PWT | _PAGE_PCD | _PAGE_PAT)] = _PAGE_CACHE_MODE_UC, }; -/* Check that the write-protect PAT entry is set for write-protect */ +/* + * Check that the write-protect PAT entry is set for write-protect. + * To do this without making assumptions how PAT has been set up (Xen has + * another layout than the kernel), translate the _PAGE_CACHE_MODE_WP cache + * mode via the __cachemode2pte_tbl[] into protection bits (those protection + * bits will select a cache mode of WP or better), and then translate the + * protection bits back into the cache mode using __pte2cm_idx() and the + * __pte2cachemode_tbl[] array. This will return the really used cache mode. + */ bool x86_has_pat_wp(void) { - return __pte2cachemode_tbl[_PAGE_CACHE_MODE_WP] == _PAGE_CACHE_MODE_WP; + uint16_t prot = __cachemode2pte_tbl[_PAGE_CACHE_MODE_WP]; + + return __pte2cachemode_tbl[__pte2cm_idx(prot)] == _PAGE_CACHE_MODE_WP; } enum page_cache_mode pgprot2cachemode(pgprot_t pgprot) @@ -846,7 +856,7 @@ int devmem_is_allowed(unsigned long pagenr) /* * This must follow RAM test, since System RAM is considered a - * restricted resource under CONFIG_STRICT_IOMEM. + * restricted resource under CONFIG_STRICT_DEVMEM. */ if (iomem_is_exclusive(pagenr << PAGE_SHIFT)) { /* Low 1MB bypasses iomem restrictions. */ diff --git a/arch/x86/mm/mem_encrypt_amd.c b/arch/x86/mm/mem_encrypt_amd.c index f6d038e2cd8e..9c4d8dbcb129 100644 --- a/arch/x86/mm/mem_encrypt_amd.c +++ b/arch/x86/mm/mem_encrypt_amd.c @@ -20,12 +20,13 @@ #include <linux/bitops.h> #include <linux/dma-mapping.h> #include <linux/virtio_config.h> +#include <linux/virtio_anchor.h> #include <linux/cc_platform.h> -#include <linux/platform-feature.h> #include <asm/tlbflush.h> #include <asm/fixmap.h> #include <asm/setup.h> +#include <asm/mem_encrypt.h> #include <asm/bootparam.h> #include <asm/set_memory.h> #include <asm/cacheflush.h> @@ -245,7 +246,7 @@ void __init sev_setup_arch(void) swiotlb_adjust_size(size); /* Set restricted memory access for virtio. */ - platform_set(PLATFORM_VIRTIO_RESTRICTED_MEM_ACCESS); + virtio_set_mem_acc_cb(virtio_require_restricted_mem_acc); } static unsigned long pg_level_to_pfn(int level, pte_t *kpte, pgprot_t *ret_prot) @@ -486,8 +487,6 @@ void __init early_set_mem_enc_dec_hypercall(unsigned long vaddr, int npages, boo void __init sme_early_init(void) { - unsigned int i; - if (!sme_me_mask) return; @@ -496,8 +495,7 @@ void __init sme_early_init(void) __supported_pte_mask = __sme_set(__supported_pte_mask); /* Update the protection map with memory encryption mask */ - for (i = 0; i < ARRAY_SIZE(protection_map); i++) - protection_map[i] = pgprot_encrypted(protection_map[i]); + add_encrypt_protection_map(); x86_platform.guest.enc_status_change_prepare = amd_enc_status_change_prepare; x86_platform.guest.enc_status_change_finish = amd_enc_status_change_finish; diff --git a/arch/x86/mm/mem_encrypt_boot.S b/arch/x86/mm/mem_encrypt_boot.S index 3d1dba05fce4..9de3d900bc92 100644 --- a/arch/x86/mm/mem_encrypt_boot.S +++ b/arch/x86/mm/mem_encrypt_boot.S @@ -65,7 +65,10 @@ SYM_FUNC_START(sme_encrypt_execute) movq %rbp, %rsp /* Restore original stack pointer */ pop %rbp - RET + /* Offset to __x86_return_thunk would be wrong here */ + ANNOTATE_UNRET_SAFE + ret + int3 SYM_FUNC_END(sme_encrypt_execute) SYM_FUNC_START(__enc_copy) @@ -151,6 +154,9 @@ SYM_FUNC_START(__enc_copy) pop %r12 pop %r15 - RET + /* Offset to __x86_return_thunk would be wrong here */ + ANNOTATE_UNRET_SAFE + ret + int3 .L__enc_copy_end: SYM_FUNC_END(__enc_copy) diff --git a/arch/x86/mm/numa.c b/arch/x86/mm/numa.c index e8b061557887..2aadb2019b4f 100644 --- a/arch/x86/mm/numa.c +++ b/arch/x86/mm/numa.c @@ -867,7 +867,7 @@ void debug_cpumask_set_cpu(int cpu, int node, bool enable) return; } mask = node_to_cpumask_map[node]; - if (!mask) { + if (!cpumask_available(mask)) { pr_err("node_to_cpumask_map[%i] NULL\n", node); dump_stack(); return; @@ -913,7 +913,7 @@ const struct cpumask *cpumask_of_node(int node) dump_stack(); return cpu_none_mask; } - if (node_to_cpumask_map[node] == NULL) { + if (!cpumask_available(node_to_cpumask_map[node])) { printk(KERN_WARNING "cpumask_of_node(%d): no node_to_cpumask_map!\n", node); diff --git a/arch/x86/mm/pgprot.c b/arch/x86/mm/pgprot.c index 763742782286..c84bd9540b16 100644 --- a/arch/x86/mm/pgprot.c +++ b/arch/x86/mm/pgprot.c @@ -3,6 +3,34 @@ #include <linux/export.h> #include <linux/mm.h> #include <asm/pgtable.h> +#include <asm/mem_encrypt.h> + +static pgprot_t protection_map[16] __ro_after_init = { + [VM_NONE] = PAGE_NONE, + [VM_READ] = PAGE_READONLY, + [VM_WRITE] = PAGE_COPY, + [VM_WRITE | VM_READ] = PAGE_COPY, + [VM_EXEC] = PAGE_READONLY_EXEC, + [VM_EXEC | VM_READ] = PAGE_READONLY_EXEC, + [VM_EXEC | VM_WRITE] = PAGE_COPY_EXEC, + [VM_EXEC | VM_WRITE | VM_READ] = PAGE_COPY_EXEC, + [VM_SHARED] = PAGE_NONE, + [VM_SHARED | VM_READ] = PAGE_READONLY, + [VM_SHARED | VM_WRITE] = PAGE_SHARED, + [VM_SHARED | VM_WRITE | VM_READ] = PAGE_SHARED, + [VM_SHARED | VM_EXEC] = PAGE_READONLY_EXEC, + [VM_SHARED | VM_EXEC | VM_READ] = PAGE_READONLY_EXEC, + [VM_SHARED | VM_EXEC | VM_WRITE] = PAGE_SHARED_EXEC, + [VM_SHARED | VM_EXEC | VM_WRITE | VM_READ] = PAGE_SHARED_EXEC +}; + +void add_encrypt_protection_map(void) +{ + unsigned int i; + + for (i = 0; i < ARRAY_SIZE(protection_map); i++) + protection_map[i] = pgprot_encrypted(protection_map[i]); +} pgprot_t vm_get_page_prot(unsigned long vm_flags) { diff --git a/arch/x86/mm/pkeys.c b/arch/x86/mm/pkeys.c index e44e938885b7..7418c367e328 100644 --- a/arch/x86/mm/pkeys.c +++ b/arch/x86/mm/pkeys.c @@ -110,7 +110,7 @@ int __arch_override_mprotect_pkey(struct vm_area_struct *vma, int prot, int pkey return vma_pkey(vma); } -#define PKRU_AD_KEY(pkey) (PKRU_AD_BIT << ((pkey) * PKRU_BITS_PER_PKEY)) +#define PKRU_AD_MASK(pkey) (PKRU_AD_BIT << ((pkey) * PKRU_BITS_PER_PKEY)) /* * Make the default PKRU value (at execve() time) as restrictive @@ -118,11 +118,14 @@ int __arch_override_mprotect_pkey(struct vm_area_struct *vma, int prot, int pkey * in the process's lifetime will not accidentally get access * to data which is pkey-protected later on. */ -u32 init_pkru_value = PKRU_AD_KEY( 1) | PKRU_AD_KEY( 2) | PKRU_AD_KEY( 3) | - PKRU_AD_KEY( 4) | PKRU_AD_KEY( 5) | PKRU_AD_KEY( 6) | - PKRU_AD_KEY( 7) | PKRU_AD_KEY( 8) | PKRU_AD_KEY( 9) | - PKRU_AD_KEY(10) | PKRU_AD_KEY(11) | PKRU_AD_KEY(12) | - PKRU_AD_KEY(13) | PKRU_AD_KEY(14) | PKRU_AD_KEY(15); +u32 init_pkru_value = PKRU_AD_MASK( 1) | PKRU_AD_MASK( 2) | + PKRU_AD_MASK( 3) | PKRU_AD_MASK( 4) | + PKRU_AD_MASK( 5) | PKRU_AD_MASK( 6) | + PKRU_AD_MASK( 7) | PKRU_AD_MASK( 8) | + PKRU_AD_MASK( 9) | PKRU_AD_MASK(10) | + PKRU_AD_MASK(11) | PKRU_AD_MASK(12) | + PKRU_AD_MASK(13) | PKRU_AD_MASK(14) | + PKRU_AD_MASK(15); static ssize_t init_pkru_read_file(struct file *file, char __user *user_buf, size_t count, loff_t *ppos) diff --git a/arch/x86/mm/tlb.c b/arch/x86/mm/tlb.c index d400b6d9d246..c1e31e9a85d7 100644 --- a/arch/x86/mm/tlb.c +++ b/arch/x86/mm/tlb.c @@ -734,10 +734,10 @@ static void flush_tlb_func(void *info) const struct flush_tlb_info *f = info; struct mm_struct *loaded_mm = this_cpu_read(cpu_tlbstate.loaded_mm); u32 loaded_mm_asid = this_cpu_read(cpu_tlbstate.loaded_mm_asid); - u64 mm_tlb_gen = atomic64_read(&loaded_mm->context.tlb_gen); u64 local_tlb_gen = this_cpu_read(cpu_tlbstate.ctxs[loaded_mm_asid].tlb_gen); bool local = smp_processor_id() == f->initiating_cpu; unsigned long nr_invalidate = 0; + u64 mm_tlb_gen; /* This code cannot presently handle being reentered. */ VM_WARN_ON(!irqs_disabled()); @@ -771,6 +771,23 @@ static void flush_tlb_func(void *info) return; } + if (unlikely(f->new_tlb_gen != TLB_GENERATION_INVALID && + f->new_tlb_gen <= local_tlb_gen)) { + /* + * The TLB is already up to date in respect to f->new_tlb_gen. + * While the core might be still behind mm_tlb_gen, checking + * mm_tlb_gen unnecessarily would have negative caching effects + * so avoid it. + */ + return; + } + + /* + * Defer mm_tlb_gen reading as long as possible to avoid cache + * contention. + */ + mm_tlb_gen = atomic64_read(&loaded_mm->context.tlb_gen); + if (unlikely(local_tlb_gen == mm_tlb_gen)) { /* * There's nothing to do: we're already up to date. This can @@ -827,6 +844,12 @@ static void flush_tlb_func(void *info) /* Partial flush */ unsigned long addr = f->start; + /* Partial flush cannot have invalid generations */ + VM_WARN_ON(f->new_tlb_gen == TLB_GENERATION_INVALID); + + /* Partial flush must have valid mm */ + VM_WARN_ON(f->mm == NULL); + nr_invalidate = (f->end - f->start) >> f->stride_shift; while (addr < f->end) { @@ -1029,7 +1052,8 @@ void flush_tlb_kernel_range(unsigned long start, unsigned long end) struct flush_tlb_info *info; preempt_disable(); - info = get_flush_tlb_info(NULL, start, end, 0, false, 0); + info = get_flush_tlb_info(NULL, start, end, 0, false, + TLB_GENERATION_INVALID); on_each_cpu(do_kernel_range_flush, info, 1); @@ -1198,7 +1222,8 @@ void arch_tlbbatch_flush(struct arch_tlbflush_unmap_batch *batch) int cpu = get_cpu(); - info = get_flush_tlb_info(NULL, 0, TLB_FLUSH_ALL, 0, false, 0); + info = get_flush_tlb_info(NULL, 0, TLB_FLUSH_ALL, 0, false, + TLB_GENERATION_INVALID); /* * flush_tlb_multi() is not optimized for the common case in which only * a local TLB flush is needed. Optimize this use-case by calling diff --git a/arch/x86/net/bpf_jit_comp.c b/arch/x86/net/bpf_jit_comp.c index c98b8c0ed3b8..c1f6c1c51d99 100644 --- a/arch/x86/net/bpf_jit_comp.c +++ b/arch/x86/net/bpf_jit_comp.c @@ -412,16 +412,30 @@ static void emit_indirect_jump(u8 **pprog, int reg, u8 *ip) { u8 *prog = *pprog; -#ifdef CONFIG_RETPOLINE if (cpu_feature_enabled(X86_FEATURE_RETPOLINE_LFENCE)) { EMIT_LFENCE(); EMIT2(0xFF, 0xE0 + reg); } else if (cpu_feature_enabled(X86_FEATURE_RETPOLINE)) { OPTIMIZER_HIDE_VAR(reg); emit_jump(&prog, &__x86_indirect_thunk_array[reg], ip); - } else -#endif - EMIT2(0xFF, 0xE0 + reg); + } else { + EMIT2(0xFF, 0xE0 + reg); + } + + *pprog = prog; +} + +static void emit_return(u8 **pprog, u8 *ip) +{ + u8 *prog = *pprog; + + if (cpu_feature_enabled(X86_FEATURE_RETHUNK)) { + emit_jump(&prog, &__x86_return_thunk, ip); + } else { + EMIT1(0xC3); /* ret */ + if (IS_ENABLED(CONFIG_SLS)) + EMIT1(0xCC); /* int3 */ + } *pprog = prog; } @@ -1686,7 +1700,7 @@ emit_jmp: ctx->cleanup_addr = proglen; pop_callee_regs(&prog, callee_regs_used); EMIT1(0xC9); /* leave */ - EMIT1(0xC3); /* ret */ + emit_return(&prog, image + addrs[i - 1] + (prog - temp)); break; default: @@ -1771,6 +1785,10 @@ static int invoke_bpf_prog(const struct btf_func_model *m, u8 **pprog, struct bpf_tramp_link *l, int stack_size, int run_ctx_off, bool save_ret) { + void (*exit)(struct bpf_prog *prog, u64 start, + struct bpf_tramp_run_ctx *run_ctx) = __bpf_prog_exit; + u64 (*enter)(struct bpf_prog *prog, + struct bpf_tramp_run_ctx *run_ctx) = __bpf_prog_enter; u8 *prog = *pprog; u8 *jmp_insn; int ctx_cookie_off = offsetof(struct bpf_tramp_run_ctx, bpf_cookie); @@ -1789,15 +1807,21 @@ static int invoke_bpf_prog(const struct btf_func_model *m, u8 **pprog, */ emit_stx(&prog, BPF_DW, BPF_REG_FP, BPF_REG_1, -run_ctx_off + ctx_cookie_off); + if (p->aux->sleepable) { + enter = __bpf_prog_enter_sleepable; + exit = __bpf_prog_exit_sleepable; + } else if (p->expected_attach_type == BPF_LSM_CGROUP) { + enter = __bpf_prog_enter_lsm_cgroup; + exit = __bpf_prog_exit_lsm_cgroup; + } + /* arg1: mov rdi, progs[i] */ emit_mov_imm64(&prog, BPF_REG_1, (long) p >> 32, (u32) (long) p); /* arg2: lea rsi, [rbp - ctx_cookie_off] */ EMIT4(0x48, 0x8D, 0x75, -run_ctx_off); - if (emit_call(&prog, - p->aux->sleepable ? __bpf_prog_enter_sleepable : - __bpf_prog_enter, prog)) - return -EINVAL; + if (emit_call(&prog, enter, prog)) + return -EINVAL; /* remember prog start time returned by __bpf_prog_enter */ emit_mov_reg(&prog, true, BPF_REG_6, BPF_REG_0); @@ -1841,10 +1865,8 @@ static int invoke_bpf_prog(const struct btf_func_model *m, u8 **pprog, emit_mov_reg(&prog, true, BPF_REG_2, BPF_REG_6); /* arg3: lea rdx, [rbp - run_ctx_off] */ EMIT4(0x48, 0x8D, 0x55, -run_ctx_off); - if (emit_call(&prog, - p->aux->sleepable ? __bpf_prog_exit_sleepable : - __bpf_prog_exit, prog)) - return -EINVAL; + if (emit_call(&prog, exit, prog)) + return -EINVAL; *pprog = prog; return 0; @@ -1928,23 +1950,6 @@ static int invoke_bpf_mod_ret(const struct btf_func_model *m, u8 **pprog, return 0; } -static bool is_valid_bpf_tramp_flags(unsigned int flags) -{ - if ((flags & BPF_TRAMP_F_RESTORE_REGS) && - (flags & BPF_TRAMP_F_SKIP_FRAME)) - return false; - - /* - * BPF_TRAMP_F_RET_FENTRY_RET is only used by bpf_struct_ops, - * and it must be used alone. - */ - if ((flags & BPF_TRAMP_F_RET_FENTRY_RET) && - (flags & ~BPF_TRAMP_F_RET_FENTRY_RET)) - return false; - - return true; -} - /* Example: * __be16 eth_type_trans(struct sk_buff *skb, struct net_device *dev); * its 'struct btf_func_model' will be nr_args=2 @@ -2023,9 +2028,6 @@ int arch_prepare_bpf_trampoline(struct bpf_tramp_image *im, void *image, void *i if (nr_args > 6) return -ENOTSUPP; - if (!is_valid_bpf_tramp_flags(flags)) - return -EINVAL; - /* Generated trampoline stack layout: * * RBP + 8 [ return address ] @@ -2131,10 +2133,15 @@ int arch_prepare_bpf_trampoline(struct bpf_tramp_image *im, void *image, void *i if (flags & BPF_TRAMP_F_CALL_ORIG) { restore_regs(m, &prog, nr_args, regs_off); - /* call original function */ - if (emit_call(&prog, orig_call, prog)) { - ret = -EINVAL; - goto cleanup; + if (flags & BPF_TRAMP_F_ORIG_STACK) { + emit_ldx(&prog, BPF_DW, BPF_REG_0, BPF_REG_FP, 8); + EMIT2(0xff, 0xd0); /* call *rax */ + } else { + /* call original function */ + if (emit_call(&prog, orig_call, prog)) { + ret = -EINVAL; + goto cleanup; + } } /* remember return value in a stack for bpf prog to access */ emit_stx(&prog, BPF_DW, BPF_REG_FP, BPF_REG_0, -8); @@ -2189,7 +2196,7 @@ int arch_prepare_bpf_trampoline(struct bpf_tramp_image *im, void *image, void *i if (flags & BPF_TRAMP_F_SKIP_FRAME) /* skip our return address and return to parent */ EMIT4(0x48, 0x83, 0xC4, 8); /* add rsp, 8 */ - EMIT1(0xC3); /* ret */ + emit_return(&prog, prog); /* Make sure the trampoline generation logic doesn't overflow */ if (WARN_ON_ONCE(prog > (u8 *)image_end - BPF_INSN_SAFETY)) { ret = -EFAULT; @@ -2492,3 +2499,34 @@ void *bpf_arch_text_copy(void *dst, void *src, size_t len) return ERR_PTR(-EINVAL); return dst; } + +/* Indicate the JIT backend supports mixing bpf2bpf and tailcalls. */ +bool bpf_jit_supports_subprog_tailcalls(void) +{ + return true; +} + +void bpf_jit_free(struct bpf_prog *prog) +{ + if (prog->jited) { + struct x64_jit_data *jit_data = prog->aux->jit_data; + struct bpf_binary_header *hdr; + + /* + * If we fail the final pass of JIT (from jit_subprogs), + * the program may not be finalized yet. Call finalize here + * before freeing it. + */ + if (jit_data) { + bpf_jit_binary_pack_finalize(prog, jit_data->header, + jit_data->rw_header); + kvfree(jit_data->addrs); + kfree(jit_data); + } + hdr = bpf_jit_binary_pack_hdr(prog); + bpf_jit_binary_pack_free(hdr, NULL); + WARN_ON_ONCE(!bpf_prog_kallsyms_verify_off(prog)); + } + + bpf_prog_unlock_free(prog); +} diff --git a/arch/x86/platform/efi/efi.c b/arch/x86/platform/efi/efi.c index 1591d67e0bcd..6e598bd78eef 100644 --- a/arch/x86/platform/efi/efi.c +++ b/arch/x86/platform/efi/efi.c @@ -108,29 +108,6 @@ static int __init setup_add_efi_memmap(char *arg) } early_param("add_efi_memmap", setup_add_efi_memmap); -void __init efi_find_mirror(void) -{ - efi_memory_desc_t *md; - u64 mirror_size = 0, total_size = 0; - - if (!efi_enabled(EFI_MEMMAP)) - return; - - for_each_efi_memory_desc(md) { - unsigned long long start = md->phys_addr; - unsigned long long size = md->num_pages << EFI_PAGE_SHIFT; - - total_size += size; - if (md->attribute & EFI_MEMORY_MORE_RELIABLE) { - memblock_mark_mirror(start, size); - mirror_size += size; - } - } - if (mirror_size) - pr_info("Memory: %lldM/%lldM mirrored memory\n", - mirror_size>>20, total_size>>20); -} - /* * Tell the kernel about the EFI memory map. This might include * more than the max 128 entries that can fit in the passed in e820 diff --git a/arch/x86/platform/efi/efi_thunk_64.S b/arch/x86/platform/efi/efi_thunk_64.S index 9ffe2bad27d5..4e5257a4811b 100644 --- a/arch/x86/platform/efi/efi_thunk_64.S +++ b/arch/x86/platform/efi/efi_thunk_64.S @@ -23,6 +23,7 @@ #include <linux/objtool.h> #include <asm/page_types.h> #include <asm/segment.h> +#include <asm/nospec-branch.h> .text .code64 @@ -75,7 +76,9 @@ STACK_FRAME_NON_STANDARD __efi64_thunk 1: movq 0x20(%rsp), %rsp pop %rbx pop %rbp - RET + ANNOTATE_UNRET_SAFE + ret + int3 .code32 2: pushl $__KERNEL_CS diff --git a/arch/x86/purgatory/Makefile b/arch/x86/purgatory/Makefile index ae53d54d7959..31c634a22818 100644 --- a/arch/x86/purgatory/Makefile +++ b/arch/x86/purgatory/Makefile @@ -73,12 +73,6 @@ $(obj)/purgatory.ro: $(PURGATORY_OBJS) FORCE $(obj)/purgatory.chk: $(obj)/purgatory.ro FORCE $(call if_changed,ld) -targets += kexec-purgatory.c +$(obj)/kexec-purgatory.o: $(obj)/purgatory.ro $(obj)/purgatory.chk -quiet_cmd_bin2c = BIN2C $@ - cmd_bin2c = $(objtree)/scripts/bin2c kexec_purgatory < $< > $@ - -$(obj)/kexec-purgatory.c: $(obj)/purgatory.ro $(obj)/purgatory.chk FORCE - $(call if_changed,bin2c) - -obj-$(CONFIG_KEXEC_FILE) += kexec-purgatory.o +obj-y += kexec-purgatory.o diff --git a/arch/x86/purgatory/kexec-purgatory.S b/arch/x86/purgatory/kexec-purgatory.S new file mode 100644 index 000000000000..8530fe93b718 --- /dev/null +++ b/arch/x86/purgatory/kexec-purgatory.S @@ -0,0 +1,14 @@ +/* SPDX-License-Identifier: GPL-2.0 */ + + .section .rodata, "a" + + .align 8 +kexec_purgatory: + .globl kexec_purgatory + .incbin "arch/x86/purgatory/purgatory.ro" +.Lkexec_purgatory_end: + + .align 8 +kexec_purgatory_size: + .globl kexec_purgatory_size + .quad .Lkexec_purgatory_end - kexec_purgatory diff --git a/arch/x86/um/Kconfig b/arch/x86/um/Kconfig index 1bcd42c53039..186f13268401 100644 --- a/arch/x86/um/Kconfig +++ b/arch/x86/um/Kconfig @@ -32,12 +32,12 @@ config 3_LEVEL_PGTABLES bool "Three-level pagetables" if !64BIT default 64BIT help - Three-level pagetables will let UML have more than 4G of physical - memory. All the memory that can't be mapped directly will be treated - as high memory. + Three-level pagetables will let UML have more than 4G of physical + memory. All the memory that can't be mapped directly will be treated + as high memory. - However, this it experimental on 32-bit architectures, so if unsure say - N (on x86-64 it's automatically enabled, instead, as it's safe there). + However, this it experimental on 32-bit architectures, so if unsure say + N (on x86-64 it's automatically enabled, instead, as it's safe there). config ARCH_HAS_SC_SIGNALS def_bool !64BIT diff --git a/arch/x86/um/Makefile b/arch/x86/um/Makefile index ba5789c35809..3d5cd2e57820 100644 --- a/arch/x86/um/Makefile +++ b/arch/x86/um/Makefile @@ -28,7 +28,9 @@ else obj-y += syscalls_64.o vdso/ -subarch-y = ../lib/csum-partial_64.o ../lib/memcpy_64.o ../entry/thunk_64.o +subarch-y = ../lib/csum-partial_64.o ../lib/memcpy_64.o \ + ../lib/memmove_64.o ../lib/memset_64.o +subarch-$(CONFIG_PREEMPTION) += ../entry/thunk_64.o endif diff --git a/arch/x86/um/mem_32.c b/arch/x86/um/mem_32.c index 19c5dbd46770..cafd01f730da 100644 --- a/arch/x86/um/mem_32.c +++ b/arch/x86/um/mem_32.c @@ -17,7 +17,7 @@ static int __init gate_vma_init(void) gate_vma.vm_start = FIXADDR_USER_START; gate_vma.vm_end = FIXADDR_USER_END; gate_vma.vm_flags = VM_READ | VM_MAYREAD | VM_EXEC | VM_MAYEXEC; - gate_vma.vm_page_prot = __P101; + gate_vma.vm_page_prot = PAGE_READONLY; return 0; } diff --git a/arch/x86/um/shared/sysdep/stub_64.h b/arch/x86/um/shared/sysdep/stub_64.h index e9c4b2b38803..92ea1670cf1c 100644 --- a/arch/x86/um/shared/sysdep/stub_64.h +++ b/arch/x86/um/shared/sysdep/stub_64.h @@ -8,6 +8,7 @@ #include <sysdep/ptrace_user.h> #include <generated/asm-offsets.h> +#include <linux/stddef.h> #define STUB_MMAP_NR __NR_mmap #define MMAP_OFFSET(o) (o) diff --git a/arch/x86/um/sysrq_64.c b/arch/x86/um/sysrq_64.c index 903ad91b624f..ef1eb4f4f612 100644 --- a/arch/x86/um/sysrq_64.c +++ b/arch/x86/um/sysrq_64.c @@ -19,8 +19,8 @@ void show_regs(struct pt_regs *regs) print_modules(); printk(KERN_INFO "Pid: %d, comm: %.20s %s %s\n", task_pid_nr(current), current->comm, print_tainted(), init_utsname()->release); - printk(KERN_INFO "RIP: %04lx:[<%016lx>]\n", PT_REGS_CS(regs) & 0xffff, - PT_REGS_IP(regs)); + printk(KERN_INFO "RIP: %04lx:%pS\n", PT_REGS_CS(regs) & 0xffff, + (void *)PT_REGS_IP(regs)); printk(KERN_INFO "RSP: %016lx EFLAGS: %08lx\n", PT_REGS_SP(regs), PT_REGS_EFLAGS(regs)); printk(KERN_INFO "RAX: %016lx RBX: %016lx RCX: %016lx\n", diff --git a/arch/x86/um/vdso/Makefile b/arch/x86/um/vdso/Makefile index 5943387e3f35..8c0396fd0e6f 100644 --- a/arch/x86/um/vdso/Makefile +++ b/arch/x86/um/vdso/Makefile @@ -3,6 +3,9 @@ # Building vDSO images for x86. # +# do not instrument on vdso because KASAN is not compatible with user mode +KASAN_SANITIZE := n + # Prevents link failures: __sanitizer_cov_trace_pc() is not linked in. KCOV_INSTRUMENT := n diff --git a/arch/x86/xen/enlighten_hvm.c b/arch/x86/xen/enlighten_hvm.c index 8b71b1dd7639..28762f800596 100644 --- a/arch/x86/xen/enlighten_hvm.c +++ b/arch/x86/xen/enlighten_hvm.c @@ -4,6 +4,7 @@ #include <linux/cpu.h> #include <linux/kexec.h> #include <linux/memblock.h> +#include <linux/virtio_anchor.h> #include <xen/features.h> #include <xen/events.h> @@ -195,7 +196,8 @@ static void __init xen_hvm_guest_init(void) if (xen_pv_domain()) return; - xen_set_restricted_virtio_memory_access(); + if (IS_ENABLED(CONFIG_XEN_VIRTIO_FORCE_GRANT)) + virtio_set_mem_acc_cb(virtio_require_restricted_mem_acc); init_hvm_pv_info(); diff --git a/arch/x86/xen/enlighten_pv.c b/arch/x86/xen/enlighten_pv.c index e3297b15701c..0ed2e487a693 100644 --- a/arch/x86/xen/enlighten_pv.c +++ b/arch/x86/xen/enlighten_pv.c @@ -31,6 +31,7 @@ #include <linux/gfp.h> #include <linux/edd.h> #include <linux/reboot.h> +#include <linux/virtio_anchor.h> #include <xen/xen.h> #include <xen/events.h> @@ -109,7 +110,9 @@ static DEFINE_PER_CPU(struct tls_descs, shadow_tls_desc); static void __init xen_pv_init_platform(void) { - xen_set_restricted_virtio_memory_access(); + /* PV guests can't operate virtio devices without grants. */ + if (IS_ENABLED(CONFIG_XEN_VIRTIO)) + virtio_set_mem_acc_cb(virtio_require_restricted_mem_acc); populate_extra_pte(fix_to_virt(FIX_PARAVIRT_BOOTMAP)); @@ -1183,15 +1186,19 @@ static void __init xen_domu_set_legacy_features(void) extern void early_xen_iret_patch(void); /* First C function to be called on Xen boot */ -asmlinkage __visible void __init xen_start_kernel(void) +asmlinkage __visible void __init xen_start_kernel(struct start_info *si) { struct physdev_set_iopl set_iopl; unsigned long initrd_start = 0; int rc; - if (!xen_start_info) + if (!si) return; + clear_bss(); + + xen_start_info = si; + __text_gen_insn(&early_xen_iret_patch, JMP32_INSN_OPCODE, &early_xen_iret_patch, &xen_iret, JMP32_INSN_SIZE); diff --git a/arch/x86/xen/setup.c b/arch/x86/xen/setup.c index 81aa46f770c5..cfa99e8f054b 100644 --- a/arch/x86/xen/setup.c +++ b/arch/x86/xen/setup.c @@ -918,7 +918,7 @@ void xen_enable_sysenter(void) if (!boot_cpu_has(sysenter_feature)) return; - ret = register_callback(CALLBACKTYPE_sysenter, xen_sysenter_target); + ret = register_callback(CALLBACKTYPE_sysenter, xen_entry_SYSENTER_compat); if(ret != 0) setup_clear_cpu_cap(sysenter_feature); } @@ -927,7 +927,7 @@ void xen_enable_syscall(void) { int ret; - ret = register_callback(CALLBACKTYPE_syscall, xen_syscall_target); + ret = register_callback(CALLBACKTYPE_syscall, xen_entry_SYSCALL_64); if (ret != 0) { printk(KERN_ERR "Failed to set syscall callback: %d\n", ret); /* Pretty fatal; 64-bit userspace has no other @@ -936,7 +936,7 @@ void xen_enable_syscall(void) if (boot_cpu_has(X86_FEATURE_SYSCALL32)) { ret = register_callback(CALLBACKTYPE_syscall32, - xen_syscall32_target); + xen_entry_SYSCALL_compat); if (ret != 0) setup_clear_cpu_cap(X86_FEATURE_SYSCALL32); } diff --git a/arch/x86/xen/xen-asm.S b/arch/x86/xen/xen-asm.S index caa9bc2fa100..6b4fdf6b9542 100644 --- a/arch/x86/xen/xen-asm.S +++ b/arch/x86/xen/xen-asm.S @@ -121,7 +121,7 @@ SYM_FUNC_END(xen_read_cr2_direct); .macro xen_pv_trap name SYM_CODE_START(xen_\name) - UNWIND_HINT_EMPTY + UNWIND_HINT_ENTRY ENDBR pop %rcx pop %r11 @@ -234,8 +234,8 @@ SYM_CODE_END(xenpv_restore_regs_and_return_to_usermode) */ /* Normal 64-bit system call target */ -SYM_CODE_START(xen_syscall_target) - UNWIND_HINT_EMPTY +SYM_CODE_START(xen_entry_SYSCALL_64) + UNWIND_HINT_ENTRY ENDBR popq %rcx popq %r11 @@ -249,13 +249,13 @@ SYM_CODE_START(xen_syscall_target) movq $__USER_CS, 1*8(%rsp) jmp entry_SYSCALL_64_after_hwframe -SYM_CODE_END(xen_syscall_target) +SYM_CODE_END(xen_entry_SYSCALL_64) #ifdef CONFIG_IA32_EMULATION /* 32-bit compat syscall target */ -SYM_CODE_START(xen_syscall32_target) - UNWIND_HINT_EMPTY +SYM_CODE_START(xen_entry_SYSCALL_compat) + UNWIND_HINT_ENTRY ENDBR popq %rcx popq %r11 @@ -269,11 +269,11 @@ SYM_CODE_START(xen_syscall32_target) movq $__USER32_CS, 1*8(%rsp) jmp entry_SYSCALL_compat_after_hwframe -SYM_CODE_END(xen_syscall32_target) +SYM_CODE_END(xen_entry_SYSCALL_compat) /* 32-bit compat sysenter target */ -SYM_CODE_START(xen_sysenter_target) - UNWIND_HINT_EMPTY +SYM_CODE_START(xen_entry_SYSENTER_compat) + UNWIND_HINT_ENTRY ENDBR /* * NB: Xen is polite and clears TF from EFLAGS for us. This means @@ -291,19 +291,19 @@ SYM_CODE_START(xen_sysenter_target) movq $__USER32_CS, 1*8(%rsp) jmp entry_SYSENTER_compat_after_hwframe -SYM_CODE_END(xen_sysenter_target) +SYM_CODE_END(xen_entry_SYSENTER_compat) #else /* !CONFIG_IA32_EMULATION */ -SYM_CODE_START(xen_syscall32_target) -SYM_CODE_START(xen_sysenter_target) - UNWIND_HINT_EMPTY +SYM_CODE_START(xen_entry_SYSCALL_compat) +SYM_CODE_START(xen_entry_SYSENTER_compat) + UNWIND_HINT_ENTRY ENDBR lea 16(%rsp), %rsp /* strip %rcx, %r11 */ mov $-ENOSYS, %rax pushq $0 jmp hypercall_iret -SYM_CODE_END(xen_sysenter_target) -SYM_CODE_END(xen_syscall32_target) +SYM_CODE_END(xen_entry_SYSENTER_compat) +SYM_CODE_END(xen_entry_SYSCALL_compat) #endif /* CONFIG_IA32_EMULATION */ diff --git a/arch/x86/xen/xen-head.S b/arch/x86/xen/xen-head.S index 3a2cd93bf059..ffaa62167f6e 100644 --- a/arch/x86/xen/xen-head.S +++ b/arch/x86/xen/xen-head.S @@ -26,6 +26,7 @@ SYM_CODE_START(hypercall_page) .rept (PAGE_SIZE / 32) UNWIND_HINT_FUNC ANNOTATE_NOENDBR + ANNOTATE_UNRET_SAFE ret /* * Xen will write the hypercall page, and sort out ENDBR. @@ -48,15 +49,6 @@ SYM_CODE_START(startup_xen) ANNOTATE_NOENDBR cld - /* Clear .bss */ - xor %eax,%eax - mov $__bss_start, %rdi - mov $__bss_stop, %rcx - sub %rdi, %rcx - shr $3, %rcx - rep stosq - - mov %rsi, xen_start_info mov initial_stack(%rip), %rsp /* Set up %gs. @@ -71,6 +63,7 @@ SYM_CODE_START(startup_xen) cdq wrmsr + mov %rsi, %rdi call xen_start_kernel SYM_CODE_END(startup_xen) __FINIT diff --git a/arch/x86/xen/xen-ops.h b/arch/x86/xen/xen-ops.h index fd0fec6e92f4..9a8bb972193d 100644 --- a/arch/x86/xen/xen-ops.h +++ b/arch/x86/xen/xen-ops.h @@ -10,10 +10,10 @@ /* These are code, but not functions. Defined in entry.S */ extern const char xen_failsafe_callback[]; -void xen_sysenter_target(void); +void xen_entry_SYSENTER_compat(void); #ifdef CONFIG_X86_64 -void xen_syscall_target(void); -void xen_syscall32_target(void); +void xen_entry_SYSCALL_64(void); +void xen_entry_SYSCALL_compat(void); #endif extern void *xen_initial_gdt; |