diff options
Diffstat (limited to 'arch/x86/kernel/cpu')
-rw-r--r-- | arch/x86/kernel/cpu/amd.c | 48 | ||||
-rw-r--r-- | arch/x86/kernel/cpu/bugs.c | 488 | ||||
-rw-r--r-- | arch/x86/kernel/cpu/common.c | 61 | ||||
-rw-r--r-- | arch/x86/kernel/cpu/cpu.h | 2 | ||||
-rw-r--r-- | arch/x86/kernel/cpu/cyrix.c | 1 | ||||
-rw-r--r-- | arch/x86/kernel/cpu/feat_ctl.c | 9 | ||||
-rw-r--r-- | arch/x86/kernel/cpu/hygon.c | 6 | ||||
-rw-r--r-- | arch/x86/kernel/cpu/intel.c | 31 | ||||
-rw-r--r-- | arch/x86/kernel/cpu/mce/inject.c | 47 | ||||
-rw-r--r-- | arch/x86/kernel/cpu/mce/internal.h | 2 | ||||
-rw-r--r-- | arch/x86/kernel/cpu/rdrand.c | 59 | ||||
-rw-r--r-- | arch/x86/kernel/cpu/scattered.c | 1 | ||||
-rw-r--r-- | arch/x86/kernel/cpu/sgx/encl.c | 330 | ||||
-rw-r--r-- | arch/x86/kernel/cpu/sgx/encl.h | 16 | ||||
-rw-r--r-- | arch/x86/kernel/cpu/sgx/encls.h | 33 | ||||
-rw-r--r-- | arch/x86/kernel/cpu/sgx/ioctl.c | 641 | ||||
-rw-r--r-- | arch/x86/kernel/cpu/sgx/main.c | 75 | ||||
-rw-r--r-- | arch/x86/kernel/cpu/sgx/sgx.h | 3 | ||||
-rw-r--r-- | arch/x86/kernel/cpu/vmware.c | 4 |
19 files changed, 1571 insertions, 286 deletions
diff --git a/arch/x86/kernel/cpu/amd.c b/arch/x86/kernel/cpu/amd.c index 0c0b09796ced..48276c0e479d 100644 --- a/arch/x86/kernel/cpu/amd.c +++ b/arch/x86/kernel/cpu/amd.c @@ -808,7 +808,7 @@ static void clear_rdrand_cpuid_bit(struct cpuinfo_x86 *c) return; /* - * The nordrand option can clear X86_FEATURE_RDRAND, so check for + * The self-test can clear X86_FEATURE_RDRAND, so check for * RDRAND support using the CPUID function directly. */ if (!(cpuid_ecx(1) & BIT(30)) || rdrand_force) @@ -862,6 +862,28 @@ static void init_amd_bd(struct cpuinfo_x86 *c) clear_rdrand_cpuid_bit(c); } +void init_spectral_chicken(struct cpuinfo_x86 *c) +{ +#ifdef CONFIG_CPU_UNRET_ENTRY + u64 value; + + /* + * On Zen2 we offer this chicken (bit) on the altar of Speculation. + * + * This suppresses speculation from the middle of a basic block, i.e. it + * suppresses non-branch predictions. + * + * We use STIBP as a heuristic to filter out Zen2 from the rest of F17H + */ + if (!cpu_has(c, X86_FEATURE_HYPERVISOR) && cpu_has(c, X86_FEATURE_AMD_STIBP)) { + if (!rdmsrl_safe(MSR_ZEN2_SPECTRAL_CHICKEN, &value)) { + value |= MSR_ZEN2_SPECTRAL_CHICKEN_BIT; + wrmsrl_safe(MSR_ZEN2_SPECTRAL_CHICKEN, value); + } + } +#endif +} + static void init_amd_zn(struct cpuinfo_x86 *c) { set_cpu_cap(c, X86_FEATURE_ZEN); @@ -870,12 +892,21 @@ static void init_amd_zn(struct cpuinfo_x86 *c) node_reclaim_distance = 32; #endif - /* - * Fix erratum 1076: CPB feature bit not being set in CPUID. - * Always set it, except when running under a hypervisor. - */ - if (!cpu_has(c, X86_FEATURE_HYPERVISOR) && !cpu_has(c, X86_FEATURE_CPB)) - set_cpu_cap(c, X86_FEATURE_CPB); + /* Fix up CPUID bits, but only if not virtualised. */ + if (!cpu_has(c, X86_FEATURE_HYPERVISOR)) { + + /* Erratum 1076: CPB feature bit not being set in CPUID. */ + if (!cpu_has(c, X86_FEATURE_CPB)) + set_cpu_cap(c, X86_FEATURE_CPB); + + /* + * Zen3 (Fam19 model < 0x10) parts are not susceptible to + * Branch Type Confusion, but predate the allocation of the + * BTC_NO bit. + */ + if (c->x86 == 0x19 && !cpu_has(c, X86_FEATURE_BTC_NO)) + set_cpu_cap(c, X86_FEATURE_BTC_NO); + } } static void init_amd(struct cpuinfo_x86 *c) @@ -907,7 +938,8 @@ static void init_amd(struct cpuinfo_x86 *c) case 0x12: init_amd_ln(c); break; case 0x15: init_amd_bd(c); break; case 0x16: init_amd_jg(c); break; - case 0x17: fallthrough; + case 0x17: init_spectral_chicken(c); + fallthrough; case 0x19: init_amd_zn(c); break; } diff --git a/arch/x86/kernel/cpu/bugs.c b/arch/x86/kernel/cpu/bugs.c index 74c62cc47a5f..6761668100b9 100644 --- a/arch/x86/kernel/cpu/bugs.c +++ b/arch/x86/kernel/cpu/bugs.c @@ -38,6 +38,8 @@ static void __init spectre_v1_select_mitigation(void); static void __init spectre_v2_select_mitigation(void); +static void __init retbleed_select_mitigation(void); +static void __init spectre_v2_user_select_mitigation(void); static void __init ssb_select_mitigation(void); static void __init l1tf_select_mitigation(void); static void __init mds_select_mitigation(void); @@ -48,16 +50,40 @@ static void __init mmio_select_mitigation(void); static void __init srbds_select_mitigation(void); static void __init l1d_flush_select_mitigation(void); -/* The base value of the SPEC_CTRL MSR that always has to be preserved. */ +/* The base value of the SPEC_CTRL MSR without task-specific bits set */ u64 x86_spec_ctrl_base; EXPORT_SYMBOL_GPL(x86_spec_ctrl_base); + +/* The current value of the SPEC_CTRL MSR with task-specific bits set */ +DEFINE_PER_CPU(u64, x86_spec_ctrl_current); +EXPORT_SYMBOL_GPL(x86_spec_ctrl_current); + static DEFINE_MUTEX(spec_ctrl_mutex); /* - * The vendor and possibly platform specific bits which can be modified in - * x86_spec_ctrl_base. + * Keep track of the SPEC_CTRL MSR value for the current task, which may differ + * from x86_spec_ctrl_base due to STIBP/SSB in __speculation_ctrl_update(). */ -static u64 __ro_after_init x86_spec_ctrl_mask = SPEC_CTRL_IBRS; +void write_spec_ctrl_current(u64 val, bool force) +{ + if (this_cpu_read(x86_spec_ctrl_current) == val) + return; + + this_cpu_write(x86_spec_ctrl_current, val); + + /* + * When KERNEL_IBRS this MSR is written on return-to-user, unless + * forced the update can be delayed until that time. + */ + if (force || !cpu_feature_enabled(X86_FEATURE_KERNEL_IBRS)) + wrmsrl(MSR_IA32_SPEC_CTRL, val); +} + +u64 spec_ctrl_current(void) +{ + return this_cpu_read(x86_spec_ctrl_current); +} +EXPORT_SYMBOL_GPL(spec_ctrl_current); /* * AMD specific MSR info for Speculative Store Bypass control. @@ -114,13 +140,21 @@ void __init check_bugs(void) if (boot_cpu_has(X86_FEATURE_MSR_SPEC_CTRL)) rdmsrl(MSR_IA32_SPEC_CTRL, x86_spec_ctrl_base); - /* Allow STIBP in MSR_SPEC_CTRL if supported */ - if (boot_cpu_has(X86_FEATURE_STIBP)) - x86_spec_ctrl_mask |= SPEC_CTRL_STIBP; - /* Select the proper CPU mitigations before patching alternatives: */ spectre_v1_select_mitigation(); spectre_v2_select_mitigation(); + /* + * retbleed_select_mitigation() relies on the state set by + * spectre_v2_select_mitigation(); specifically it wants to know about + * spectre_v2=ibrs. + */ + retbleed_select_mitigation(); + /* + * spectre_v2_user_select_mitigation() relies on the state set by + * retbleed_select_mitigation(); specifically the STIBP selection is + * forced for UNRET. + */ + spectre_v2_user_select_mitigation(); ssb_select_mitigation(); l1tf_select_mitigation(); md_clear_select_mitigation(); @@ -161,31 +195,17 @@ void __init check_bugs(void) #endif } +/* + * NOTE: This function is *only* called for SVM. VMX spec_ctrl handling is + * done in vmenter.S. + */ void x86_virt_spec_ctrl(u64 guest_spec_ctrl, u64 guest_virt_spec_ctrl, bool setguest) { - u64 msrval, guestval, hostval = x86_spec_ctrl_base; + u64 msrval, guestval = guest_spec_ctrl, hostval = spec_ctrl_current(); struct thread_info *ti = current_thread_info(); - /* Is MSR_SPEC_CTRL implemented ? */ if (static_cpu_has(X86_FEATURE_MSR_SPEC_CTRL)) { - /* - * Restrict guest_spec_ctrl to supported values. Clear the - * modifiable bits in the host base value and or the - * modifiable bits from the guest value. - */ - guestval = hostval & ~x86_spec_ctrl_mask; - guestval |= guest_spec_ctrl & x86_spec_ctrl_mask; - - /* SSBD controlled in MSR_SPEC_CTRL */ - if (static_cpu_has(X86_FEATURE_SPEC_CTRL_SSBD) || - static_cpu_has(X86_FEATURE_AMD_SSBD)) - hostval |= ssbd_tif_to_spec_ctrl(ti->flags); - - /* Conditional STIBP enabled? */ - if (static_branch_unlikely(&switch_to_cond_stibp)) - hostval |= stibp_tif_to_spec_ctrl(ti->flags); - if (hostval != guestval) { msrval = setguest ? guestval : hostval; wrmsrl(MSR_IA32_SPEC_CTRL, msrval); @@ -752,12 +772,180 @@ static int __init nospectre_v1_cmdline(char *str) } early_param("nospectre_v1", nospectre_v1_cmdline); -#undef pr_fmt -#define pr_fmt(fmt) "Spectre V2 : " fmt - static enum spectre_v2_mitigation spectre_v2_enabled __ro_after_init = SPECTRE_V2_NONE; +#undef pr_fmt +#define pr_fmt(fmt) "RETBleed: " fmt + +enum retbleed_mitigation { + RETBLEED_MITIGATION_NONE, + RETBLEED_MITIGATION_UNRET, + RETBLEED_MITIGATION_IBPB, + RETBLEED_MITIGATION_IBRS, + RETBLEED_MITIGATION_EIBRS, +}; + +enum retbleed_mitigation_cmd { + RETBLEED_CMD_OFF, + RETBLEED_CMD_AUTO, + RETBLEED_CMD_UNRET, + RETBLEED_CMD_IBPB, +}; + +static const char * const retbleed_strings[] = { + [RETBLEED_MITIGATION_NONE] = "Vulnerable", + [RETBLEED_MITIGATION_UNRET] = "Mitigation: untrained return thunk", + [RETBLEED_MITIGATION_IBPB] = "Mitigation: IBPB", + [RETBLEED_MITIGATION_IBRS] = "Mitigation: IBRS", + [RETBLEED_MITIGATION_EIBRS] = "Mitigation: Enhanced IBRS", +}; + +static enum retbleed_mitigation retbleed_mitigation __ro_after_init = + RETBLEED_MITIGATION_NONE; +static enum retbleed_mitigation_cmd retbleed_cmd __ro_after_init = + RETBLEED_CMD_AUTO; + +static int __ro_after_init retbleed_nosmt = false; + +static int __init retbleed_parse_cmdline(char *str) +{ + if (!str) + return -EINVAL; + + while (str) { + char *next = strchr(str, ','); + if (next) { + *next = 0; + next++; + } + + if (!strcmp(str, "off")) { + retbleed_cmd = RETBLEED_CMD_OFF; + } else if (!strcmp(str, "auto")) { + retbleed_cmd = RETBLEED_CMD_AUTO; + } else if (!strcmp(str, "unret")) { + retbleed_cmd = RETBLEED_CMD_UNRET; + } else if (!strcmp(str, "ibpb")) { + retbleed_cmd = RETBLEED_CMD_IBPB; + } else if (!strcmp(str, "nosmt")) { + retbleed_nosmt = true; + } else { + pr_err("Ignoring unknown retbleed option (%s).", str); + } + + str = next; + } + + return 0; +} +early_param("retbleed", retbleed_parse_cmdline); + +#define RETBLEED_UNTRAIN_MSG "WARNING: BTB untrained return thunk mitigation is only effective on AMD/Hygon!\n" +#define RETBLEED_INTEL_MSG "WARNING: Spectre v2 mitigation leaves CPU vulnerable to RETBleed attacks, data leaks possible!\n" + +static void __init retbleed_select_mitigation(void) +{ + bool mitigate_smt = false; + + if (!boot_cpu_has_bug(X86_BUG_RETBLEED) || cpu_mitigations_off()) + return; + + switch (retbleed_cmd) { + case RETBLEED_CMD_OFF: + return; + + case RETBLEED_CMD_UNRET: + if (IS_ENABLED(CONFIG_CPU_UNRET_ENTRY)) { + retbleed_mitigation = RETBLEED_MITIGATION_UNRET; + } else { + pr_err("WARNING: kernel not compiled with CPU_UNRET_ENTRY.\n"); + goto do_cmd_auto; + } + break; + + case RETBLEED_CMD_IBPB: + if (!boot_cpu_has(X86_FEATURE_IBPB)) { + pr_err("WARNING: CPU does not support IBPB.\n"); + goto do_cmd_auto; + } else if (IS_ENABLED(CONFIG_CPU_IBPB_ENTRY)) { + retbleed_mitigation = RETBLEED_MITIGATION_IBPB; + } else { + pr_err("WARNING: kernel not compiled with CPU_IBPB_ENTRY.\n"); + goto do_cmd_auto; + } + break; + +do_cmd_auto: + case RETBLEED_CMD_AUTO: + default: + if (boot_cpu_data.x86_vendor == X86_VENDOR_AMD || + boot_cpu_data.x86_vendor == X86_VENDOR_HYGON) { + if (IS_ENABLED(CONFIG_CPU_UNRET_ENTRY)) + retbleed_mitigation = RETBLEED_MITIGATION_UNRET; + else if (IS_ENABLED(CONFIG_CPU_IBPB_ENTRY) && boot_cpu_has(X86_FEATURE_IBPB)) + retbleed_mitigation = RETBLEED_MITIGATION_IBPB; + } + + /* + * The Intel mitigation (IBRS or eIBRS) was already selected in + * spectre_v2_select_mitigation(). 'retbleed_mitigation' will + * be set accordingly below. + */ + + break; + } + + switch (retbleed_mitigation) { + case RETBLEED_MITIGATION_UNRET: + setup_force_cpu_cap(X86_FEATURE_RETHUNK); + setup_force_cpu_cap(X86_FEATURE_UNRET); + + if (boot_cpu_data.x86_vendor != X86_VENDOR_AMD && + boot_cpu_data.x86_vendor != X86_VENDOR_HYGON) + pr_err(RETBLEED_UNTRAIN_MSG); + + mitigate_smt = true; + break; + + case RETBLEED_MITIGATION_IBPB: + setup_force_cpu_cap(X86_FEATURE_ENTRY_IBPB); + mitigate_smt = true; + break; + + default: + break; + } + + if (mitigate_smt && !boot_cpu_has(X86_FEATURE_STIBP) && + (retbleed_nosmt || cpu_mitigations_auto_nosmt())) + cpu_smt_disable(false); + + /* + * Let IBRS trump all on Intel without affecting the effects of the + * retbleed= cmdline option. + */ + if (boot_cpu_data.x86_vendor == X86_VENDOR_INTEL) { + switch (spectre_v2_enabled) { + case SPECTRE_V2_IBRS: + retbleed_mitigation = RETBLEED_MITIGATION_IBRS; + break; + case SPECTRE_V2_EIBRS: + case SPECTRE_V2_EIBRS_RETPOLINE: + case SPECTRE_V2_EIBRS_LFENCE: + retbleed_mitigation = RETBLEED_MITIGATION_EIBRS; + break; + default: + pr_err(RETBLEED_INTEL_MSG); + } + } + + pr_info("%s\n", retbleed_strings[retbleed_mitigation]); +} + +#undef pr_fmt +#define pr_fmt(fmt) "Spectre V2 : " fmt + static enum spectre_v2_user_mitigation spectre_v2_user_stibp __ro_after_init = SPECTRE_V2_USER_NONE; static enum spectre_v2_user_mitigation spectre_v2_user_ibpb __ro_after_init = @@ -787,6 +975,7 @@ static inline const char *spectre_v2_module_string(void) { return ""; } #define SPECTRE_V2_LFENCE_MSG "WARNING: LFENCE mitigation is not recommended for this CPU, data leaks possible!\n" #define SPECTRE_V2_EIBRS_EBPF_MSG "WARNING: Unprivileged eBPF is enabled with eIBRS on, data leaks possible via Spectre v2 BHB attacks!\n" #define SPECTRE_V2_EIBRS_LFENCE_EBPF_SMT_MSG "WARNING: Unprivileged eBPF is enabled with eIBRS+LFENCE mitigation and SMT, data leaks possible via Spectre v2 BHB attacks!\n" +#define SPECTRE_V2_IBRS_PERF_MSG "WARNING: IBRS mitigation selected on Enhanced IBRS CPU, this may cause unnecessary performance loss\n" #ifdef CONFIG_BPF_SYSCALL void unpriv_ebpf_notify(int new_state) @@ -828,6 +1017,7 @@ enum spectre_v2_mitigation_cmd { SPECTRE_V2_CMD_EIBRS, SPECTRE_V2_CMD_EIBRS_RETPOLINE, SPECTRE_V2_CMD_EIBRS_LFENCE, + SPECTRE_V2_CMD_IBRS, }; enum spectre_v2_user_cmd { @@ -868,13 +1058,15 @@ static void __init spec_v2_user_print_cond(const char *reason, bool secure) pr_info("spectre_v2_user=%s forced on command line.\n", reason); } +static __ro_after_init enum spectre_v2_mitigation_cmd spectre_v2_cmd; + static enum spectre_v2_user_cmd __init -spectre_v2_parse_user_cmdline(enum spectre_v2_mitigation_cmd v2_cmd) +spectre_v2_parse_user_cmdline(void) { char arg[20]; int ret, i; - switch (v2_cmd) { + switch (spectre_v2_cmd) { case SPECTRE_V2_CMD_NONE: return SPECTRE_V2_USER_CMD_NONE; case SPECTRE_V2_CMD_FORCE: @@ -900,15 +1092,16 @@ spectre_v2_parse_user_cmdline(enum spectre_v2_mitigation_cmd v2_cmd) return SPECTRE_V2_USER_CMD_AUTO; } -static inline bool spectre_v2_in_eibrs_mode(enum spectre_v2_mitigation mode) +static inline bool spectre_v2_in_ibrs_mode(enum spectre_v2_mitigation mode) { - return (mode == SPECTRE_V2_EIBRS || - mode == SPECTRE_V2_EIBRS_RETPOLINE || - mode == SPECTRE_V2_EIBRS_LFENCE); + return mode == SPECTRE_V2_IBRS || + mode == SPECTRE_V2_EIBRS || + mode == SPECTRE_V2_EIBRS_RETPOLINE || + mode == SPECTRE_V2_EIBRS_LFENCE; } static void __init -spectre_v2_user_select_mitigation(enum spectre_v2_mitigation_cmd v2_cmd) +spectre_v2_user_select_mitigation(void) { enum spectre_v2_user_mitigation mode = SPECTRE_V2_USER_NONE; bool smt_possible = IS_ENABLED(CONFIG_SMP); @@ -921,7 +1114,7 @@ spectre_v2_user_select_mitigation(enum spectre_v2_mitigation_cmd v2_cmd) cpu_smt_control == CPU_SMT_NOT_SUPPORTED) smt_possible = false; - cmd = spectre_v2_parse_user_cmdline(v2_cmd); + cmd = spectre_v2_parse_user_cmdline(); switch (cmd) { case SPECTRE_V2_USER_CMD_NONE: goto set_mode; @@ -969,12 +1162,12 @@ spectre_v2_user_select_mitigation(enum spectre_v2_mitigation_cmd v2_cmd) } /* - * If no STIBP, enhanced IBRS is enabled or SMT impossible, STIBP is not - * required. + * If no STIBP, IBRS or enhanced IBRS is enabled, or SMT impossible, + * STIBP is not required. */ if (!boot_cpu_has(X86_FEATURE_STIBP) || !smt_possible || - spectre_v2_in_eibrs_mode(spectre_v2_enabled)) + spectre_v2_in_ibrs_mode(spectre_v2_enabled)) return; /* @@ -986,6 +1179,13 @@ spectre_v2_user_select_mitigation(enum spectre_v2_mitigation_cmd v2_cmd) boot_cpu_has(X86_FEATURE_AMD_STIBP_ALWAYS_ON)) mode = SPECTRE_V2_USER_STRICT_PREFERRED; + if (retbleed_mitigation == RETBLEED_MITIGATION_UNRET) { + if (mode != SPECTRE_V2_USER_STRICT && + mode != SPECTRE_V2_USER_STRICT_PREFERRED) + pr_info("Selecting STIBP always-on mode to complement retbleed mitigation\n"); + mode = SPECTRE_V2_USER_STRICT_PREFERRED; + } + spectre_v2_user_stibp = mode; set_mode: @@ -999,6 +1199,7 @@ static const char * const spectre_v2_strings[] = { [SPECTRE_V2_EIBRS] = "Mitigation: Enhanced IBRS", [SPECTRE_V2_EIBRS_LFENCE] = "Mitigation: Enhanced IBRS + LFENCE", [SPECTRE_V2_EIBRS_RETPOLINE] = "Mitigation: Enhanced IBRS + Retpolines", + [SPECTRE_V2_IBRS] = "Mitigation: IBRS", }; static const struct { @@ -1016,6 +1217,7 @@ static const struct { { "eibrs,lfence", SPECTRE_V2_CMD_EIBRS_LFENCE, false }, { "eibrs,retpoline", SPECTRE_V2_CMD_EIBRS_RETPOLINE, false }, { "auto", SPECTRE_V2_CMD_AUTO, false }, + { "ibrs", SPECTRE_V2_CMD_IBRS, false }, }; static void __init spec_v2_print_cond(const char *reason, bool secure) @@ -1078,6 +1280,30 @@ static enum spectre_v2_mitigation_cmd __init spectre_v2_parse_cmdline(void) return SPECTRE_V2_CMD_AUTO; } + if (cmd == SPECTRE_V2_CMD_IBRS && !IS_ENABLED(CONFIG_CPU_IBRS_ENTRY)) { + pr_err("%s selected but not compiled in. Switching to AUTO select\n", + mitigation_options[i].option); + return SPECTRE_V2_CMD_AUTO; + } + + if (cmd == SPECTRE_V2_CMD_IBRS && boot_cpu_data.x86_vendor != X86_VENDOR_INTEL) { + pr_err("%s selected but not Intel CPU. Switching to AUTO select\n", + mitigation_options[i].option); + return SPECTRE_V2_CMD_AUTO; + } + + if (cmd == SPECTRE_V2_CMD_IBRS && !boot_cpu_has(X86_FEATURE_IBRS)) { + pr_err("%s selected but CPU doesn't have IBRS. Switching to AUTO select\n", + mitigation_options[i].option); + return SPECTRE_V2_CMD_AUTO; + } + + if (cmd == SPECTRE_V2_CMD_IBRS && boot_cpu_has(X86_FEATURE_XENPV)) { + pr_err("%s selected but running as XenPV guest. Switching to AUTO select\n", + mitigation_options[i].option); + return SPECTRE_V2_CMD_AUTO; + } + spec_v2_print_cond(mitigation_options[i].option, mitigation_options[i].secure); return cmd; @@ -1093,6 +1319,22 @@ static enum spectre_v2_mitigation __init spectre_v2_select_retpoline(void) return SPECTRE_V2_RETPOLINE; } +/* Disable in-kernel use of non-RSB RET predictors */ +static void __init spec_ctrl_disable_kernel_rrsba(void) +{ + u64 ia32_cap; + + if (!boot_cpu_has(X86_FEATURE_RRSBA_CTRL)) + return; + + ia32_cap = x86_read_arch_cap_msr(); + + if (ia32_cap & ARCH_CAP_RRSBA) { + x86_spec_ctrl_base |= SPEC_CTRL_RRSBA_DIS_S; + write_spec_ctrl_current(x86_spec_ctrl_base, true); + } +} + static void __init spectre_v2_select_mitigation(void) { enum spectre_v2_mitigation_cmd cmd = spectre_v2_parse_cmdline(); @@ -1117,6 +1359,15 @@ static void __init spectre_v2_select_mitigation(void) break; } + if (IS_ENABLED(CONFIG_CPU_IBRS_ENTRY) && + boot_cpu_has_bug(X86_BUG_RETBLEED) && + retbleed_cmd != RETBLEED_CMD_OFF && + boot_cpu_has(X86_FEATURE_IBRS) && + boot_cpu_data.x86_vendor == X86_VENDOR_INTEL) { + mode = SPECTRE_V2_IBRS; + break; + } + mode = spectre_v2_select_retpoline(); break; @@ -1133,6 +1384,10 @@ static void __init spectre_v2_select_mitigation(void) mode = spectre_v2_select_retpoline(); break; + case SPECTRE_V2_CMD_IBRS: + mode = SPECTRE_V2_IBRS; + break; + case SPECTRE_V2_CMD_EIBRS: mode = SPECTRE_V2_EIBRS; break; @@ -1149,10 +1404,9 @@ static void __init spectre_v2_select_mitigation(void) if (mode == SPECTRE_V2_EIBRS && unprivileged_ebpf_enabled()) pr_err(SPECTRE_V2_EIBRS_EBPF_MSG); - if (spectre_v2_in_eibrs_mode(mode)) { - /* Force it so VMEXIT will restore correctly */ + if (spectre_v2_in_ibrs_mode(mode)) { x86_spec_ctrl_base |= SPEC_CTRL_IBRS; - wrmsrl(MSR_IA32_SPEC_CTRL, x86_spec_ctrl_base); + write_spec_ctrl_current(x86_spec_ctrl_base, true); } switch (mode) { @@ -1160,6 +1414,12 @@ static void __init spectre_v2_select_mitigation(void) case SPECTRE_V2_EIBRS: break; + case SPECTRE_V2_IBRS: + setup_force_cpu_cap(X86_FEATURE_KERNEL_IBRS); + if (boot_cpu_has(X86_FEATURE_IBRS_ENHANCED)) + pr_warn(SPECTRE_V2_IBRS_PERF_MSG); + break; + case SPECTRE_V2_LFENCE: case SPECTRE_V2_EIBRS_LFENCE: setup_force_cpu_cap(X86_FEATURE_RETPOLINE_LFENCE); @@ -1171,43 +1431,117 @@ static void __init spectre_v2_select_mitigation(void) break; } + /* + * Disable alternate RSB predictions in kernel when indirect CALLs and + * JMPs gets protection against BHI and Intramode-BTI, but RET + * prediction from a non-RSB predictor is still a risk. + */ + if (mode == SPECTRE_V2_EIBRS_LFENCE || + mode == SPECTRE_V2_EIBRS_RETPOLINE || + mode == SPECTRE_V2_RETPOLINE) + spec_ctrl_disable_kernel_rrsba(); + spectre_v2_enabled = mode; pr_info("%s\n", spectre_v2_strings[mode]); /* - * If spectre v2 protection has been enabled, unconditionally fill - * RSB during a context switch; this protects against two independent - * issues: + * If Spectre v2 protection has been enabled, fill the RSB during a + * context switch. In general there are two types of RSB attacks + * across context switches, for which the CALLs/RETs may be unbalanced. + * + * 1) RSB underflow + * + * Some Intel parts have "bottomless RSB". When the RSB is empty, + * speculated return targets may come from the branch predictor, + * which could have a user-poisoned BTB or BHB entry. * - * - RSB underflow (and switch to BTB) on Skylake+ - * - SpectreRSB variant of spectre v2 on X86_BUG_SPECTRE_V2 CPUs + * AMD has it even worse: *all* returns are speculated from the BTB, + * regardless of the state of the RSB. + * + * When IBRS or eIBRS is enabled, the "user -> kernel" attack + * scenario is mitigated by the IBRS branch prediction isolation + * properties, so the RSB buffer filling wouldn't be necessary to + * protect against this type of attack. + * + * The "user -> user" attack scenario is mitigated by RSB filling. + * + * 2) Poisoned RSB entry + * + * If the 'next' in-kernel return stack is shorter than 'prev', + * 'next' could be tricked into speculating with a user-poisoned RSB + * entry. + * + * The "user -> kernel" attack scenario is mitigated by SMEP and + * eIBRS. + * + * The "user -> user" scenario, also known as SpectreBHB, requires + * RSB clearing. + * + * So to mitigate all cases, unconditionally fill RSB on context + * switches. + * + * FIXME: Is this pointless for retbleed-affected AMD? */ setup_force_cpu_cap(X86_FEATURE_RSB_CTXSW); pr_info("Spectre v2 / SpectreRSB mitigation: Filling RSB on context switch\n"); /* - * Retpoline means the kernel is safe because it has no indirect - * branches. Enhanced IBRS protects firmware too, so, enable restricted - * speculation around firmware calls only when Enhanced IBRS isn't - * supported. + * Similar to context switches, there are two types of RSB attacks + * after vmexit: + * + * 1) RSB underflow + * + * 2) Poisoned RSB entry + * + * When retpoline is enabled, both are mitigated by filling/clearing + * the RSB. + * + * When IBRS is enabled, while #1 would be mitigated by the IBRS branch + * prediction isolation protections, RSB still needs to be cleared + * because of #2. Note that SMEP provides no protection here, unlike + * user-space-poisoned RSB entries. + * + * eIBRS, on the other hand, has RSB-poisoning protections, so it + * doesn't need RSB clearing after vmexit. + */ + if (boot_cpu_has(X86_FEATURE_RETPOLINE) || + boot_cpu_has(X86_FEATURE_KERNEL_IBRS)) + setup_force_cpu_cap(X86_FEATURE_RSB_VMEXIT); + + /* + * Retpoline protects the kernel, but doesn't protect firmware. IBRS + * and Enhanced IBRS protect firmware too, so enable IBRS around + * firmware calls only when IBRS / Enhanced IBRS aren't otherwise + * enabled. * * Use "mode" to check Enhanced IBRS instead of boot_cpu_has(), because * the user might select retpoline on the kernel command line and if * the CPU supports Enhanced IBRS, kernel might un-intentionally not * enable IBRS around firmware calls. */ - if (boot_cpu_has(X86_FEATURE_IBRS) && !spectre_v2_in_eibrs_mode(mode)) { + if (boot_cpu_has_bug(X86_BUG_RETBLEED) && + boot_cpu_has(X86_FEATURE_IBPB) && + (boot_cpu_data.x86_vendor == X86_VENDOR_AMD || + boot_cpu_data.x86_vendor == X86_VENDOR_HYGON)) { + + if (retbleed_cmd != RETBLEED_CMD_IBPB) { + setup_force_cpu_cap(X86_FEATURE_USE_IBPB_FW); + pr_info("Enabling Speculation Barrier for firmware calls\n"); + } + + } else if (boot_cpu_has(X86_FEATURE_IBRS) && !spectre_v2_in_ibrs_mode(mode)) { setup_force_cpu_cap(X86_FEATURE_USE_IBRS_FW); pr_info("Enabling Restricted Speculation for firmware calls\n"); } /* Set up IBPB and STIBP depending on the general spectre V2 command */ - spectre_v2_user_select_mitigation(cmd); + spectre_v2_cmd = cmd; } static void update_stibp_msr(void * __unused) { - wrmsrl(MSR_IA32_SPEC_CTRL, x86_spec_ctrl_base); + u64 val = spec_ctrl_current() | (x86_spec_ctrl_base & SPEC_CTRL_STIBP); + write_spec_ctrl_current(val, true); } /* Update x86_spec_ctrl_base in case SMT state changed. */ @@ -1424,16 +1758,6 @@ static enum ssb_mitigation __init __ssb_select_mitigation(void) } /* - * If SSBD is controlled by the SPEC_CTRL MSR, then set the proper - * bit in the mask to allow guests to use the mitigation even in the - * case where the host does not enable it. - */ - if (static_cpu_has(X86_FEATURE_SPEC_CTRL_SSBD) || - static_cpu_has(X86_FEATURE_AMD_SSBD)) { - x86_spec_ctrl_mask |= SPEC_CTRL_SSBD; - } - - /* * We have three CPU feature flags that are in play here: * - X86_BUG_SPEC_STORE_BYPASS - CPU is susceptible. * - X86_FEATURE_SSBD - CPU is able to turn off speculative store bypass @@ -1450,7 +1774,7 @@ static enum ssb_mitigation __init __ssb_select_mitigation(void) x86_amd_ssb_disable(); } else { x86_spec_ctrl_base |= SPEC_CTRL_SSBD; - wrmsrl(MSR_IA32_SPEC_CTRL, x86_spec_ctrl_base); + write_spec_ctrl_current(x86_spec_ctrl_base, true); } } @@ -1701,7 +2025,7 @@ int arch_prctl_spec_ctrl_get(struct task_struct *task, unsigned long which) void x86_spec_ctrl_setup_ap(void) { if (boot_cpu_has(X86_FEATURE_MSR_SPEC_CTRL)) - wrmsrl(MSR_IA32_SPEC_CTRL, x86_spec_ctrl_base); + write_spec_ctrl_current(x86_spec_ctrl_base, true); if (ssb_mode == SPEC_STORE_BYPASS_DISABLE) x86_amd_ssb_disable(); @@ -1938,7 +2262,7 @@ static ssize_t mmio_stale_data_show_state(char *buf) static char *stibp_state(void) { - if (spectre_v2_in_eibrs_mode(spectre_v2_enabled)) + if (spectre_v2_in_ibrs_mode(spectre_v2_enabled)) return ""; switch (spectre_v2_user_stibp) { @@ -1994,6 +2318,24 @@ static ssize_t srbds_show_state(char *buf) return sprintf(buf, "%s\n", srbds_strings[srbds_mitigation]); } +static ssize_t retbleed_show_state(char *buf) +{ + if (retbleed_mitigation == RETBLEED_MITIGATION_UNRET) { + if (boot_cpu_data.x86_vendor != X86_VENDOR_AMD && + boot_cpu_data.x86_vendor != X86_VENDOR_HYGON) + return sprintf(buf, "Vulnerable: untrained return thunk on non-Zen uarch\n"); + + return sprintf(buf, "%s; SMT %s\n", + retbleed_strings[retbleed_mitigation], + !sched_smt_active() ? "disabled" : + spectre_v2_user_stibp == SPECTRE_V2_USER_STRICT || + spectre_v2_user_stibp == SPECTRE_V2_USER_STRICT_PREFERRED ? + "enabled with STIBP protection" : "vulnerable"); + } + + return sprintf(buf, "%s\n", retbleed_strings[retbleed_mitigation]); +} + static ssize_t cpu_show_common(struct device *dev, struct device_attribute *attr, char *buf, unsigned int bug) { @@ -2039,6 +2381,9 @@ static ssize_t cpu_show_common(struct device *dev, struct device_attribute *attr case X86_BUG_MMIO_STALE_DATA: return mmio_stale_data_show_state(buf); + case X86_BUG_RETBLEED: + return retbleed_show_state(buf); + default: break; } @@ -2095,4 +2440,9 @@ ssize_t cpu_show_mmio_stale_data(struct device *dev, struct device_attribute *at { return cpu_show_common(dev, attr, buf, X86_BUG_MMIO_STALE_DATA); } + +ssize_t cpu_show_retbleed(struct device *dev, struct device_attribute *attr, char *buf) +{ + return cpu_show_common(dev, attr, buf, X86_BUG_RETBLEED); +} #endif diff --git a/arch/x86/kernel/cpu/common.c b/arch/x86/kernel/cpu/common.c index 4730b0a58f24..736262a76a12 100644 --- a/arch/x86/kernel/cpu/common.c +++ b/arch/x86/kernel/cpu/common.c @@ -1205,48 +1205,60 @@ static const __initconst struct x86_cpu_id cpu_vuln_whitelist[] = { {} }; +#define VULNBL(vendor, family, model, blacklist) \ + X86_MATCH_VENDOR_FAM_MODEL(vendor, family, model, blacklist) + #define VULNBL_INTEL_STEPPINGS(model, steppings, issues) \ X86_MATCH_VENDOR_FAM_MODEL_STEPPINGS_FEATURE(INTEL, 6, \ INTEL_FAM6_##model, steppings, \ X86_FEATURE_ANY, issues) +#define VULNBL_AMD(family, blacklist) \ + VULNBL(AMD, family, X86_MODEL_ANY, blacklist) + +#define VULNBL_HYGON(family, blacklist) \ + VULNBL(HYGON, family, X86_MODEL_ANY, blacklist) + #define SRBDS BIT(0) /* CPU is affected by X86_BUG_MMIO_STALE_DATA */ #define MMIO BIT(1) /* CPU is affected by Shared Buffers Data Sampling (SBDS), a variant of X86_BUG_MMIO_STALE_DATA */ #define MMIO_SBDS BIT(2) +/* CPU is affected by RETbleed, speculating where you would not expect it */ +#define RETBLEED BIT(3) static const struct x86_cpu_id cpu_vuln_blacklist[] __initconst = { VULNBL_INTEL_STEPPINGS(IVYBRIDGE, X86_STEPPING_ANY, SRBDS), VULNBL_INTEL_STEPPINGS(HASWELL, X86_STEPPING_ANY, SRBDS), VULNBL_INTEL_STEPPINGS(HASWELL_L, X86_STEPPING_ANY, SRBDS), VULNBL_INTEL_STEPPINGS(HASWELL_G, X86_STEPPING_ANY, SRBDS), - VULNBL_INTEL_STEPPINGS(HASWELL_X, BIT(2) | BIT(4), MMIO), - VULNBL_INTEL_STEPPINGS(BROADWELL_D, X86_STEPPINGS(0x3, 0x5), MMIO), + VULNBL_INTEL_STEPPINGS(HASWELL_X, X86_STEPPING_ANY, MMIO), + VULNBL_INTEL_STEPPINGS(BROADWELL_D, X86_STEPPING_ANY, MMIO), VULNBL_INTEL_STEPPINGS(BROADWELL_G, X86_STEPPING_ANY, SRBDS), VULNBL_INTEL_STEPPINGS(BROADWELL_X, X86_STEPPING_ANY, MMIO), VULNBL_INTEL_STEPPINGS(BROADWELL, X86_STEPPING_ANY, SRBDS), - VULNBL_INTEL_STEPPINGS(SKYLAKE_L, X86_STEPPINGS(0x3, 0x3), SRBDS | MMIO), - VULNBL_INTEL_STEPPINGS(SKYLAKE_L, X86_STEPPING_ANY, SRBDS), - VULNBL_INTEL_STEPPINGS(SKYLAKE_X, BIT(3) | BIT(4) | BIT(6) | - BIT(7) | BIT(0xB), MMIO), - VULNBL_INTEL_STEPPINGS(SKYLAKE, X86_STEPPINGS(0x3, 0x3), SRBDS | MMIO), - VULNBL_INTEL_STEPPINGS(SKYLAKE, X86_STEPPING_ANY, SRBDS), - VULNBL_INTEL_STEPPINGS(KABYLAKE_L, X86_STEPPINGS(0x9, 0xC), SRBDS | MMIO), - VULNBL_INTEL_STEPPINGS(KABYLAKE_L, X86_STEPPINGS(0x0, 0x8), SRBDS), - VULNBL_INTEL_STEPPINGS(KABYLAKE, X86_STEPPINGS(0x9, 0xD), SRBDS | MMIO), - VULNBL_INTEL_STEPPINGS(KABYLAKE, X86_STEPPINGS(0x0, 0x8), SRBDS), - VULNBL_INTEL_STEPPINGS(ICELAKE_L, X86_STEPPINGS(0x5, 0x5), MMIO | MMIO_SBDS), - VULNBL_INTEL_STEPPINGS(ICELAKE_D, X86_STEPPINGS(0x1, 0x1), MMIO), - VULNBL_INTEL_STEPPINGS(ICELAKE_X, X86_STEPPINGS(0x4, 0x6), MMIO), - VULNBL_INTEL_STEPPINGS(COMETLAKE, BIT(2) | BIT(3) | BIT(5), MMIO | MMIO_SBDS), - VULNBL_INTEL_STEPPINGS(COMETLAKE_L, X86_STEPPINGS(0x1, 0x1), MMIO | MMIO_SBDS), - VULNBL_INTEL_STEPPINGS(COMETLAKE_L, X86_STEPPINGS(0x0, 0x0), MMIO), - VULNBL_INTEL_STEPPINGS(LAKEFIELD, X86_STEPPINGS(0x1, 0x1), MMIO | MMIO_SBDS), - VULNBL_INTEL_STEPPINGS(ROCKETLAKE, X86_STEPPINGS(0x1, 0x1), MMIO), - VULNBL_INTEL_STEPPINGS(ATOM_TREMONT, X86_STEPPINGS(0x1, 0x1), MMIO | MMIO_SBDS), + VULNBL_INTEL_STEPPINGS(SKYLAKE_L, X86_STEPPING_ANY, SRBDS | MMIO | RETBLEED), + VULNBL_INTEL_STEPPINGS(SKYLAKE_X, X86_STEPPING_ANY, MMIO | RETBLEED), + VULNBL_INTEL_STEPPINGS(SKYLAKE, X86_STEPPING_ANY, SRBDS | MMIO | RETBLEED), + VULNBL_INTEL_STEPPINGS(KABYLAKE_L, X86_STEPPING_ANY, SRBDS | MMIO | RETBLEED), + VULNBL_INTEL_STEPPINGS(KABYLAKE, X86_STEPPING_ANY, SRBDS | MMIO | RETBLEED), + VULNBL_INTEL_STEPPINGS(CANNONLAKE_L, X86_STEPPING_ANY, RETBLEED), + VULNBL_INTEL_STEPPINGS(ICELAKE_L, X86_STEPPING_ANY, MMIO | MMIO_SBDS | RETBLEED), + VULNBL_INTEL_STEPPINGS(ICELAKE_D, X86_STEPPING_ANY, MMIO), + VULNBL_INTEL_STEPPINGS(ICELAKE_X, X86_STEPPING_ANY, MMIO), + VULNBL_INTEL_STEPPINGS(COMETLAKE, X86_STEPPING_ANY, MMIO | MMIO_SBDS | RETBLEED), + VULNBL_INTEL_STEPPINGS(COMETLAKE_L, X86_STEPPINGS(0x0, 0x0), MMIO | RETBLEED), + VULNBL_INTEL_STEPPINGS(COMETLAKE_L, X86_STEPPING_ANY, MMIO | MMIO_SBDS | RETBLEED), + VULNBL_INTEL_STEPPINGS(LAKEFIELD, X86_STEPPING_ANY, MMIO | MMIO_SBDS | RETBLEED), + VULNBL_INTEL_STEPPINGS(ROCKETLAKE, X86_STEPPING_ANY, MMIO | RETBLEED), + VULNBL_INTEL_STEPPINGS(ATOM_TREMONT, X86_STEPPING_ANY, MMIO | MMIO_SBDS), VULNBL_INTEL_STEPPINGS(ATOM_TREMONT_D, X86_STEPPING_ANY, MMIO), - VULNBL_INTEL_STEPPINGS(ATOM_TREMONT_L, X86_STEPPINGS(0x0, 0x0), MMIO | MMIO_SBDS), + VULNBL_INTEL_STEPPINGS(ATOM_TREMONT_L, X86_STEPPING_ANY, MMIO | MMIO_SBDS), + + VULNBL_AMD(0x15, RETBLEED), + VULNBL_AMD(0x16, RETBLEED), + VULNBL_AMD(0x17, RETBLEED), + VULNBL_HYGON(0x18, RETBLEED), {} }; @@ -1348,6 +1360,11 @@ static void __init cpu_set_bug_bits(struct cpuinfo_x86 *c) !arch_cap_mmio_immune(ia32_cap)) setup_force_cpu_bug(X86_BUG_MMIO_STALE_DATA); + if (!cpu_has(c, X86_FEATURE_BTC_NO)) { + if (cpu_matches(cpu_vuln_blacklist, RETBLEED) || (ia32_cap & ARCH_CAP_RSBA)) + setup_force_cpu_bug(X86_BUG_RETBLEED); + } + if (cpu_matches(cpu_vuln_whitelist, NO_MELTDOWN)) return; diff --git a/arch/x86/kernel/cpu/cpu.h b/arch/x86/kernel/cpu/cpu.h index 2a8e584fc991..7c9b5893c30a 100644 --- a/arch/x86/kernel/cpu/cpu.h +++ b/arch/x86/kernel/cpu/cpu.h @@ -61,6 +61,8 @@ static inline void tsx_init(void) { } static inline void tsx_ap_init(void) { } #endif /* CONFIG_CPU_SUP_INTEL */ +extern void init_spectral_chicken(struct cpuinfo_x86 *c); + extern void get_cpu_cap(struct cpuinfo_x86 *c); extern void get_cpu_address_sizes(struct cpuinfo_x86 *c); extern void cpu_detect_cache_sizes(struct cpuinfo_x86 *c); diff --git a/arch/x86/kernel/cpu/cyrix.c b/arch/x86/kernel/cpu/cyrix.c index 7227c15299d0..9651275aecd1 100644 --- a/arch/x86/kernel/cpu/cyrix.c +++ b/arch/x86/kernel/cpu/cyrix.c @@ -1,6 +1,7 @@ // SPDX-License-Identifier: GPL-2.0 #include <linux/bitops.h> #include <linux/delay.h> +#include <linux/isa-dma.h> #include <linux/pci.h> #include <asm/dma.h> #include <linux/io.h> diff --git a/arch/x86/kernel/cpu/feat_ctl.c b/arch/x86/kernel/cpu/feat_ctl.c index da696eb4821a..993697e71854 100644 --- a/arch/x86/kernel/cpu/feat_ctl.c +++ b/arch/x86/kernel/cpu/feat_ctl.c @@ -15,6 +15,8 @@ enum vmx_feature_leafs { MISC_FEATURES = 0, PRIMARY_CTLS, SECONDARY_CTLS, + TERTIARY_CTLS_LOW, + TERTIARY_CTLS_HIGH, NR_VMX_FEATURE_WORDS, }; @@ -22,7 +24,7 @@ enum vmx_feature_leafs { static void init_vmx_capabilities(struct cpuinfo_x86 *c) { - u32 supported, funcs, ept, vpid, ign; + u32 supported, funcs, ept, vpid, ign, low, high; BUILD_BUG_ON(NVMXINTS != NR_VMX_FEATURE_WORDS); @@ -42,6 +44,11 @@ static void init_vmx_capabilities(struct cpuinfo_x86 *c) rdmsr_safe(MSR_IA32_VMX_PROCBASED_CTLS2, &ign, &supported); c->vmx_capability[SECONDARY_CTLS] = supported; + /* All 64 bits of tertiary controls MSR are allowed-1 settings. */ + rdmsr_safe(MSR_IA32_VMX_PROCBASED_CTLS3, &low, &high); + c->vmx_capability[TERTIARY_CTLS_LOW] = low; + c->vmx_capability[TERTIARY_CTLS_HIGH] = high; + rdmsr(MSR_IA32_VMX_PINBASED_CTLS, ign, supported); rdmsr_safe(MSR_IA32_VMX_VMFUNC, &ign, &funcs); diff --git a/arch/x86/kernel/cpu/hygon.c b/arch/x86/kernel/cpu/hygon.c index 3fcdda4c1e11..21fd425088fe 100644 --- a/arch/x86/kernel/cpu/hygon.c +++ b/arch/x86/kernel/cpu/hygon.c @@ -302,6 +302,12 @@ static void init_hygon(struct cpuinfo_x86 *c) /* get apicid instead of initial apic id from cpuid */ c->apicid = hard_smp_processor_id(); + /* + * XXX someone from Hygon needs to confirm this DTRT + * + init_spectral_chicken(c); + */ + set_cpu_cap(c, X86_FEATURE_ZEN); set_cpu_cap(c, X86_FEATURE_CPB); diff --git a/arch/x86/kernel/cpu/intel.c b/arch/x86/kernel/cpu/intel.c index fd5dead8371c..2d7ea5480ec3 100644 --- a/arch/x86/kernel/cpu/intel.c +++ b/arch/x86/kernel/cpu/intel.c @@ -682,9 +682,9 @@ static void init_intel(struct cpuinfo_x86 *c) unsigned int l1, l2; rdmsr(MSR_IA32_MISC_ENABLE, l1, l2); - if (!(l1 & (1<<11))) + if (!(l1 & MSR_IA32_MISC_ENABLE_BTS_UNAVAIL)) set_cpu_cap(c, X86_FEATURE_BTS); - if (!(l1 & (1<<12))) + if (!(l1 & MSR_IA32_MISC_ENABLE_PEBS_UNAVAIL)) set_cpu_cap(c, X86_FEATURE_PEBS); } @@ -1216,22 +1216,23 @@ static void bus_lock_init(void) { u64 val; - /* - * Warn and fatal are handled by #AC for split lock if #AC for - * split lock is supported. - */ - if (!boot_cpu_has(X86_FEATURE_BUS_LOCK_DETECT) || - (boot_cpu_has(X86_FEATURE_SPLIT_LOCK_DETECT) && - (sld_state == sld_warn || sld_state == sld_fatal)) || - sld_state == sld_off) + if (!boot_cpu_has(X86_FEATURE_BUS_LOCK_DETECT)) return; - /* - * Enable #DB for bus lock. All bus locks are handled in #DB except - * split locks are handled in #AC in the fatal case. - */ rdmsrl(MSR_IA32_DEBUGCTLMSR, val); - val |= DEBUGCTLMSR_BUS_LOCK_DETECT; + + if ((boot_cpu_has(X86_FEATURE_SPLIT_LOCK_DETECT) && + (sld_state == sld_warn || sld_state == sld_fatal)) || + sld_state == sld_off) { + /* + * Warn and fatal are handled by #AC for split lock if #AC for + * split lock is supported. + */ + val &= ~DEBUGCTLMSR_BUS_LOCK_DETECT; + } else { + val |= DEBUGCTLMSR_BUS_LOCK_DETECT; + } + wrmsrl(MSR_IA32_DEBUGCTLMSR, val); } diff --git a/arch/x86/kernel/cpu/mce/inject.c b/arch/x86/kernel/cpu/mce/inject.c index 5fbd7ffb3233..12cf2e7ca33c 100644 --- a/arch/x86/kernel/cpu/mce/inject.c +++ b/arch/x86/kernel/cpu/mce/inject.c @@ -33,6 +33,8 @@ #include "internal.h" +static bool hw_injection_possible = true; + /* * Collect all the MCi_XXX settings */ @@ -339,6 +341,8 @@ static int __set_inj(const char *buf) for (i = 0; i < N_INJ_TYPES; i++) { if (!strncmp(flags_options[i], buf, strlen(flags_options[i]))) { + if (i > SW_INJ && !hw_injection_possible) + continue; inj_type = i; return 0; } @@ -717,11 +721,54 @@ static void __init debugfs_init(void) &i_mce, dfs_fls[i].fops); } +static void check_hw_inj_possible(void) +{ + int cpu; + u8 bank; + + /* + * This behavior exists only on SMCA systems though its not directly + * related to SMCA. + */ + if (!cpu_feature_enabled(X86_FEATURE_SMCA)) + return; + + cpu = get_cpu(); + + for (bank = 0; bank < MAX_NR_BANKS; ++bank) { + u64 status = MCI_STATUS_VAL, ipid; + + /* Check whether bank is populated */ + rdmsrl(MSR_AMD64_SMCA_MCx_IPID(bank), ipid); + if (!ipid) + continue; + + toggle_hw_mce_inject(cpu, true); + + wrmsrl_safe(mca_msr_reg(bank, MCA_STATUS), status); + rdmsrl_safe(mca_msr_reg(bank, MCA_STATUS), &status); + + if (!status) { + hw_injection_possible = false; + pr_warn("Platform does not allow *hardware* error injection." + "Try using APEI EINJ instead.\n"); + } + + toggle_hw_mce_inject(cpu, false); + + break; + } + + put_cpu(); +} + static int __init inject_init(void) { if (!alloc_cpumask_var(&mce_inject_cpumask, GFP_KERNEL)) return -ENOMEM; + check_hw_inj_possible(); + debugfs_init(); register_nmi_handler(NMI_LOCAL, mce_raise_notify, 0, "mce_notify"); diff --git a/arch/x86/kernel/cpu/mce/internal.h b/arch/x86/kernel/cpu/mce/internal.h index 4ae0e603f7fa..7e03f5b7f6bd 100644 --- a/arch/x86/kernel/cpu/mce/internal.h +++ b/arch/x86/kernel/cpu/mce/internal.h @@ -211,7 +211,7 @@ noinstr u64 mce_rdmsrl(u32 msr); static __always_inline u32 mca_msr_reg(int bank, enum mca_msr reg) { - if (mce_flags.smca) { + if (cpu_feature_enabled(X86_FEATURE_SMCA)) { switch (reg) { case MCA_CTL: return MSR_AMD64_SMCA_MCx_CTL(bank); case MCA_ADDR: return MSR_AMD64_SMCA_MCx_ADDR(bank); diff --git a/arch/x86/kernel/cpu/rdrand.c b/arch/x86/kernel/cpu/rdrand.c index c4be62058dd9..26a427fa84ea 100644 --- a/arch/x86/kernel/cpu/rdrand.c +++ b/arch/x86/kernel/cpu/rdrand.c @@ -11,56 +11,39 @@ #include <asm/archrandom.h> #include <asm/sections.h> -static int __init x86_rdrand_setup(char *s) -{ - setup_clear_cpu_cap(X86_FEATURE_RDRAND); - setup_clear_cpu_cap(X86_FEATURE_RDSEED); - return 1; -} -__setup("nordrand", x86_rdrand_setup); - /* * RDRAND has Built-In-Self-Test (BIST) that runs on every invocation. - * Run the instruction a few times as a sanity check. - * If it fails, it is simple to disable RDRAND here. + * Run the instruction a few times as a sanity check. Also make sure + * it's not outputting the same value over and over, which has happened + * as a result of past CPU bugs. + * + * If it fails, it is simple to disable RDRAND and RDSEED here. */ -#define SANITY_CHECK_LOOPS 8 -#ifdef CONFIG_ARCH_RANDOM void x86_init_rdrand(struct cpuinfo_x86 *c) { - unsigned int changed = 0; - unsigned long tmp, prev; - int i; + enum { SAMPLES = 8, MIN_CHANGE = 5 }; + unsigned long sample, prev; + bool failure = false; + size_t i, changed; if (!cpu_has(c, X86_FEATURE_RDRAND)) return; - for (i = 0; i < SANITY_CHECK_LOOPS; i++) { - if (!rdrand_long(&tmp)) { - clear_cpu_cap(c, X86_FEATURE_RDRAND); - pr_warn_once("rdrand: disabled\n"); - return; + for (changed = 0, i = 0; i < SAMPLES; ++i) { + if (!rdrand_long(&sample)) { + failure = true; + break; } + changed += i && sample != prev; + prev = sample; } + if (changed < MIN_CHANGE) + failure = true; - /* - * Stupid sanity-check whether RDRAND does *actually* generate - * some at least random-looking data. - */ - prev = tmp; - for (i = 0; i < SANITY_CHECK_LOOPS; i++) { - if (rdrand_long(&tmp)) { - if (prev != tmp) - changed++; - - prev = tmp; - } + if (failure) { + clear_cpu_cap(c, X86_FEATURE_RDRAND); + clear_cpu_cap(c, X86_FEATURE_RDSEED); + pr_emerg("RDRAND is not reliable on this platform; disabling.\n"); } - - if (WARN_ON_ONCE(!changed)) - pr_emerg( -"RDRAND gives funky smelling output, might consider not using it by booting with \"nordrand\""); - } -#endif diff --git a/arch/x86/kernel/cpu/scattered.c b/arch/x86/kernel/cpu/scattered.c index dbaa8326d6f2..fd44b54c90d5 100644 --- a/arch/x86/kernel/cpu/scattered.c +++ b/arch/x86/kernel/cpu/scattered.c @@ -27,6 +27,7 @@ static const struct cpuid_bit cpuid_bits[] = { { X86_FEATURE_APERFMPERF, CPUID_ECX, 0, 0x00000006, 0 }, { X86_FEATURE_EPB, CPUID_ECX, 3, 0x00000006, 0 }, { X86_FEATURE_INTEL_PPIN, CPUID_EBX, 0, 0x00000007, 1 }, + { X86_FEATURE_RRSBA_CTRL, CPUID_EDX, 2, 0x00000007, 2 }, { X86_FEATURE_CQM_LLC, CPUID_EDX, 1, 0x0000000f, 0 }, { X86_FEATURE_CQM_OCCUP_LLC, CPUID_EDX, 0, 0x0000000f, 1 }, { X86_FEATURE_CQM_MBM_TOTAL, CPUID_EDX, 1, 0x0000000f, 1 }, diff --git a/arch/x86/kernel/cpu/sgx/encl.c b/arch/x86/kernel/cpu/sgx/encl.c index 19876ebfb504..24c1bb8eb196 100644 --- a/arch/x86/kernel/cpu/sgx/encl.c +++ b/arch/x86/kernel/cpu/sgx/encl.c @@ -232,25 +232,10 @@ static struct sgx_epc_page *sgx_encl_eldu(struct sgx_encl_page *encl_page, return epc_page; } -static struct sgx_encl_page *sgx_encl_load_page(struct sgx_encl *encl, - unsigned long addr, - unsigned long vm_flags) +static struct sgx_encl_page *__sgx_encl_load_page(struct sgx_encl *encl, + struct sgx_encl_page *entry) { - unsigned long vm_prot_bits = vm_flags & (VM_READ | VM_WRITE | VM_EXEC); struct sgx_epc_page *epc_page; - struct sgx_encl_page *entry; - - entry = xa_load(&encl->page_array, PFN_DOWN(addr)); - if (!entry) - return ERR_PTR(-EFAULT); - - /* - * Verify that the faulted page has equal or higher build time - * permissions than the VMA permissions (i.e. the subset of {VM_READ, - * VM_WRITE, VM_EXECUTE} in vma->vm_flags). - */ - if ((entry->vm_max_prot_bits & vm_prot_bits) != vm_prot_bits) - return ERR_PTR(-EFAULT); /* Entry successfully located. */ if (entry->epc_page) { @@ -276,6 +261,146 @@ static struct sgx_encl_page *sgx_encl_load_page(struct sgx_encl *encl, return entry; } +static struct sgx_encl_page *sgx_encl_load_page_in_vma(struct sgx_encl *encl, + unsigned long addr, + unsigned long vm_flags) +{ + unsigned long vm_prot_bits = vm_flags & (VM_READ | VM_WRITE | VM_EXEC); + struct sgx_encl_page *entry; + + entry = xa_load(&encl->page_array, PFN_DOWN(addr)); + if (!entry) + return ERR_PTR(-EFAULT); + + /* + * Verify that the page has equal or higher build time + * permissions than the VMA permissions (i.e. the subset of {VM_READ, + * VM_WRITE, VM_EXECUTE} in vma->vm_flags). + */ + if ((entry->vm_max_prot_bits & vm_prot_bits) != vm_prot_bits) + return ERR_PTR(-EFAULT); + + return __sgx_encl_load_page(encl, entry); +} + +struct sgx_encl_page *sgx_encl_load_page(struct sgx_encl *encl, + unsigned long addr) +{ + struct sgx_encl_page *entry; + + entry = xa_load(&encl->page_array, PFN_DOWN(addr)); + if (!entry) + return ERR_PTR(-EFAULT); + + return __sgx_encl_load_page(encl, entry); +} + +/** + * sgx_encl_eaug_page() - Dynamically add page to initialized enclave + * @vma: VMA obtained from fault info from where page is accessed + * @encl: enclave accessing the page + * @addr: address that triggered the page fault + * + * When an initialized enclave accesses a page with no backing EPC page + * on a SGX2 system then the EPC can be added dynamically via the SGX2 + * ENCLS[EAUG] instruction. + * + * Returns: Appropriate vm_fault_t: VM_FAULT_NOPAGE when PTE was installed + * successfully, VM_FAULT_SIGBUS or VM_FAULT_OOM as error otherwise. + */ +static vm_fault_t sgx_encl_eaug_page(struct vm_area_struct *vma, + struct sgx_encl *encl, unsigned long addr) +{ + vm_fault_t vmret = VM_FAULT_SIGBUS; + struct sgx_pageinfo pginfo = {0}; + struct sgx_encl_page *encl_page; + struct sgx_epc_page *epc_page; + struct sgx_va_page *va_page; + unsigned long phys_addr; + u64 secinfo_flags; + int ret; + + if (!test_bit(SGX_ENCL_INITIALIZED, &encl->flags)) + return VM_FAULT_SIGBUS; + + /* + * Ignore internal permission checking for dynamically added pages. + * They matter only for data added during the pre-initialization + * phase. The enclave decides the permissions by the means of + * EACCEPT, EACCEPTCOPY and EMODPE. + */ + secinfo_flags = SGX_SECINFO_R | SGX_SECINFO_W | SGX_SECINFO_X; + encl_page = sgx_encl_page_alloc(encl, addr - encl->base, secinfo_flags); + if (IS_ERR(encl_page)) + return VM_FAULT_OOM; + + mutex_lock(&encl->lock); + + epc_page = sgx_alloc_epc_page(encl_page, false); + if (IS_ERR(epc_page)) { + if (PTR_ERR(epc_page) == -EBUSY) + vmret = VM_FAULT_NOPAGE; + goto err_out_unlock; + } + + va_page = sgx_encl_grow(encl, false); + if (IS_ERR(va_page)) + goto err_out_epc; + + if (va_page) + list_add(&va_page->list, &encl->va_pages); + + ret = xa_insert(&encl->page_array, PFN_DOWN(encl_page->desc), + encl_page, GFP_KERNEL); + /* + * If ret == -EBUSY then page was created in another flow while + * running without encl->lock + */ + if (ret) + goto err_out_shrink; + + pginfo.secs = (unsigned long)sgx_get_epc_virt_addr(encl->secs.epc_page); + pginfo.addr = encl_page->desc & PAGE_MASK; + pginfo.metadata = 0; + + ret = __eaug(&pginfo, sgx_get_epc_virt_addr(epc_page)); + if (ret) + goto err_out; + + encl_page->encl = encl; + encl_page->epc_page = epc_page; + encl_page->type = SGX_PAGE_TYPE_REG; + encl->secs_child_cnt++; + + sgx_mark_page_reclaimable(encl_page->epc_page); + + phys_addr = sgx_get_epc_phys_addr(epc_page); + /* + * Do not undo everything when creating PTE entry fails - next #PF + * would find page ready for a PTE. + */ + vmret = vmf_insert_pfn(vma, addr, PFN_DOWN(phys_addr)); + if (vmret != VM_FAULT_NOPAGE) { + mutex_unlock(&encl->lock); + return VM_FAULT_SIGBUS; + } + mutex_unlock(&encl->lock); + return VM_FAULT_NOPAGE; + +err_out: + xa_erase(&encl->page_array, PFN_DOWN(encl_page->desc)); + +err_out_shrink: + sgx_encl_shrink(encl, va_page); +err_out_epc: + sgx_encl_free_epc_page(epc_page); +err_out_unlock: + mutex_unlock(&encl->lock); + kfree(encl_page); + + return vmret; +} + static vm_fault_t sgx_vma_fault(struct vm_fault *vmf) { unsigned long addr = (unsigned long)vmf->address; @@ -295,9 +420,20 @@ static vm_fault_t sgx_vma_fault(struct vm_fault *vmf) if (unlikely(!encl)) return VM_FAULT_SIGBUS; + /* + * The page_array keeps track of all enclave pages, whether they + * are swapped out or not. If there is no entry for this page and + * the system supports SGX2 then it is possible to dynamically add + * a new enclave page. This is only possible for an initialized + * enclave that will be checked for right away. + */ + if (cpu_feature_enabled(X86_FEATURE_SGX2) && + (!xa_load(&encl->page_array, PFN_DOWN(addr)))) + return sgx_encl_eaug_page(vma, encl, addr); + mutex_lock(&encl->lock); - entry = sgx_encl_load_page(encl, addr, vma->vm_flags); + entry = sgx_encl_load_page_in_vma(encl, addr, vma->vm_flags); if (IS_ERR(entry)) { mutex_unlock(&encl->lock); @@ -367,6 +503,11 @@ int sgx_encl_may_map(struct sgx_encl *encl, unsigned long start, XA_STATE(xas, &encl->page_array, PFN_DOWN(start)); + /* Disallow mapping outside enclave's address range. */ + if (test_bit(SGX_ENCL_INITIALIZED, &encl->flags) && + (start < encl->base || end > encl->base + encl->size)) + return -EACCES; + /* * Disallow READ_IMPLIES_EXEC tasks as their VMA permissions might * conflict with the enclave page permissions. @@ -445,7 +586,7 @@ static struct sgx_encl_page *sgx_encl_reserve_page(struct sgx_encl *encl, for ( ; ; ) { mutex_lock(&encl->lock); - entry = sgx_encl_load_page(encl, addr, vm_flags); + entry = sgx_encl_load_page_in_vma(encl, addr, vm_flags); if (PTR_ERR(entry) != -EBUSY) break; @@ -687,7 +828,7 @@ int sgx_encl_mm_add(struct sgx_encl *encl, struct mm_struct *mm) spin_lock(&encl->mm_lock); list_add_rcu(&encl_mm->list, &encl->mm_list); - /* Pairs with smp_rmb() in sgx_reclaimer_block(). */ + /* Pairs with smp_rmb() in sgx_zap_enclave_ptes(). */ smp_wmb(); encl->mm_list_version++; spin_unlock(&encl->mm_lock); @@ -695,6 +836,73 @@ int sgx_encl_mm_add(struct sgx_encl *encl, struct mm_struct *mm) return 0; } +/** + * sgx_encl_cpumask() - Query which CPUs might be accessing the enclave + * @encl: the enclave + * + * Some SGX functions require that no cached linear-to-physical address + * mappings are present before they can succeed. For example, ENCLS[EWB] + * copies a page from the enclave page cache to regular main memory but + * it fails if it cannot ensure that there are no cached + * linear-to-physical address mappings referring to the page. + * + * SGX hardware flushes all cached linear-to-physical mappings on a CPU + * when an enclave is exited via ENCLU[EEXIT] or an Asynchronous Enclave + * Exit (AEX). Exiting an enclave will thus ensure cached linear-to-physical + * address mappings are cleared but coordination with the tracking done within + * the SGX hardware is needed to support the SGX functions that depend on this + * cache clearing. + * + * When the ENCLS[ETRACK] function is issued on an enclave the hardware + * tracks threads operating inside the enclave at that time. The SGX + * hardware tracking require that all the identified threads must have + * exited the enclave in order to flush the mappings before a function such + * as ENCLS[EWB] will be permitted + * + * The following flow is used to support SGX functions that require that + * no cached linear-to-physical address mappings are present: + * 1) Execute ENCLS[ETRACK] to initiate hardware tracking. + * 2) Use this function (sgx_encl_cpumask()) to query which CPUs might be + * accessing the enclave. + * 3) Send IPI to identified CPUs, kicking them out of the enclave and + * thus flushing all locally cached linear-to-physical address mappings. + * 4) Execute SGX function. + * + * Context: It is required to call this function after ENCLS[ETRACK]. + * This will ensure that if any new mm appears (racing with + * sgx_encl_mm_add()) then the new mm will enter into the + * enclave with fresh linear-to-physical address mappings. + * + * It is required that all IPIs are completed before a new + * ENCLS[ETRACK] is issued so be sure to protect steps 1 to 3 + * of the above flow with the enclave's mutex. + * + * Return: cpumask of CPUs that might be accessing @encl + */ +const cpumask_t *sgx_encl_cpumask(struct sgx_encl *encl) +{ + cpumask_t *cpumask = &encl->cpumask; + struct sgx_encl_mm *encl_mm; + int idx; + + cpumask_clear(cpumask); + + idx = srcu_read_lock(&encl->srcu); + + list_for_each_entry_rcu(encl_mm, &encl->mm_list, list) { + if (!mmget_not_zero(encl_mm->mm)) + continue; + + cpumask_or(cpumask, cpumask, mm_cpumask(encl_mm->mm)); + + mmput_async(encl_mm->mm); + } + + srcu_read_unlock(&encl->srcu, idx); + + return cpumask; +} + static struct page *sgx_encl_get_backing_page(struct sgx_encl *encl, pgoff_t index) { @@ -735,7 +943,6 @@ static int sgx_encl_get_backing(struct sgx_encl *encl, unsigned long page_index, return PTR_ERR(pcmd); } - backing->page_index = page_index; backing->contents = contents; backing->pcmd = pcmd; backing->pcmd_offset = page_pcmd_off & (PAGE_SIZE - 1); @@ -902,8 +1109,85 @@ int sgx_encl_test_and_clear_young(struct mm_struct *mm, return ret; } +struct sgx_encl_page *sgx_encl_page_alloc(struct sgx_encl *encl, + unsigned long offset, + u64 secinfo_flags) +{ + struct sgx_encl_page *encl_page; + unsigned long prot; + + encl_page = kzalloc(sizeof(*encl_page), GFP_KERNEL); + if (!encl_page) + return ERR_PTR(-ENOMEM); + + encl_page->desc = encl->base + offset; + encl_page->encl = encl; + + prot = _calc_vm_trans(secinfo_flags, SGX_SECINFO_R, PROT_READ) | + _calc_vm_trans(secinfo_flags, SGX_SECINFO_W, PROT_WRITE) | + _calc_vm_trans(secinfo_flags, SGX_SECINFO_X, PROT_EXEC); + + /* + * TCS pages must always RW set for CPU access while the SECINFO + * permissions are *always* zero - the CPU ignores the user provided + * values and silently overwrites them with zero permissions. + */ + if ((secinfo_flags & SGX_SECINFO_PAGE_TYPE_MASK) == SGX_SECINFO_TCS) + prot |= PROT_READ | PROT_WRITE; + + /* Calculate maximum of the VM flags for the page. */ + encl_page->vm_max_prot_bits = calc_vm_prot_bits(prot, 0); + + return encl_page; +} + +/** + * sgx_zap_enclave_ptes() - remove PTEs mapping the address from enclave + * @encl: the enclave + * @addr: page aligned pointer to single page for which PTEs will be removed + * + * Multiple VMAs may have an enclave page mapped. Remove the PTE mapping + * @addr from each VMA. Ensure that page fault handler is ready to handle + * new mappings of @addr before calling this function. + */ +void sgx_zap_enclave_ptes(struct sgx_encl *encl, unsigned long addr) +{ + unsigned long mm_list_version; + struct sgx_encl_mm *encl_mm; + struct vm_area_struct *vma; + int idx, ret; + + do { + mm_list_version = encl->mm_list_version; + + /* Pairs with smp_wmb() in sgx_encl_mm_add(). */ + smp_rmb(); + + idx = srcu_read_lock(&encl->srcu); + + list_for_each_entry_rcu(encl_mm, &encl->mm_list, list) { + if (!mmget_not_zero(encl_mm->mm)) + continue; + + mmap_read_lock(encl_mm->mm); + + ret = sgx_encl_find(encl_mm->mm, addr, &vma); + if (!ret && encl == vma->vm_private_data) + zap_vma_ptes(vma, addr, PAGE_SIZE); + + mmap_read_unlock(encl_mm->mm); + + mmput_async(encl_mm->mm); + } + + srcu_read_unlock(&encl->srcu, idx); + } while (unlikely(encl->mm_list_version != mm_list_version)); +} + /** * sgx_alloc_va_page() - Allocate a Version Array (VA) page + * @reclaim: Reclaim EPC pages directly if none available. Enclave + * mutex should not be held if this is set. * * Allocate a free EPC page and convert it to a Version Array (VA) page. * @@ -911,12 +1195,12 @@ int sgx_encl_test_and_clear_young(struct mm_struct *mm, * a VA page, * -errno otherwise */ -struct sgx_epc_page *sgx_alloc_va_page(void) +struct sgx_epc_page *sgx_alloc_va_page(bool reclaim) { struct sgx_epc_page *epc_page; int ret; - epc_page = sgx_alloc_epc_page(NULL, true); + epc_page = sgx_alloc_epc_page(NULL, reclaim); if (IS_ERR(epc_page)) return ERR_CAST(epc_page); diff --git a/arch/x86/kernel/cpu/sgx/encl.h b/arch/x86/kernel/cpu/sgx/encl.h index 332ef3568267..a65a952116fd 100644 --- a/arch/x86/kernel/cpu/sgx/encl.h +++ b/arch/x86/kernel/cpu/sgx/encl.h @@ -27,7 +27,8 @@ struct sgx_encl_page { unsigned long desc; - unsigned long vm_max_prot_bits; + unsigned long vm_max_prot_bits:8; + enum sgx_page_type type:16; struct sgx_epc_page *epc_page; struct sgx_encl *encl; struct sgx_va_page *va_page; @@ -78,7 +79,6 @@ struct sgx_va_page { }; struct sgx_backing { - pgoff_t page_index; struct page *contents; struct page *pcmd; unsigned long pcmd_offset; @@ -106,6 +106,7 @@ int sgx_encl_may_map(struct sgx_encl *encl, unsigned long start, bool current_is_ksgxd(void); void sgx_encl_release(struct kref *ref); int sgx_encl_mm_add(struct sgx_encl *encl, struct mm_struct *mm); +const cpumask_t *sgx_encl_cpumask(struct sgx_encl *encl); int sgx_encl_lookup_backing(struct sgx_encl *encl, unsigned long page_index, struct sgx_backing *backing); int sgx_encl_alloc_backing(struct sgx_encl *encl, unsigned long page_index, @@ -113,11 +114,18 @@ int sgx_encl_alloc_backing(struct sgx_encl *encl, unsigned long page_index, void sgx_encl_put_backing(struct sgx_backing *backing); int sgx_encl_test_and_clear_young(struct mm_struct *mm, struct sgx_encl_page *page); - -struct sgx_epc_page *sgx_alloc_va_page(void); +struct sgx_encl_page *sgx_encl_page_alloc(struct sgx_encl *encl, + unsigned long offset, + u64 secinfo_flags); +void sgx_zap_enclave_ptes(struct sgx_encl *encl, unsigned long addr); +struct sgx_epc_page *sgx_alloc_va_page(bool reclaim); unsigned int sgx_alloc_va_slot(struct sgx_va_page *va_page); void sgx_free_va_slot(struct sgx_va_page *va_page, unsigned int offset); bool sgx_va_page_full(struct sgx_va_page *va_page); void sgx_encl_free_epc_page(struct sgx_epc_page *page); +struct sgx_encl_page *sgx_encl_load_page(struct sgx_encl *encl, + unsigned long addr); +struct sgx_va_page *sgx_encl_grow(struct sgx_encl *encl, bool reclaim); +void sgx_encl_shrink(struct sgx_encl *encl, struct sgx_va_page *va_page); #endif /* _X86_ENCL_H */ diff --git a/arch/x86/kernel/cpu/sgx/encls.h b/arch/x86/kernel/cpu/sgx/encls.h index fa04a73daf9c..99004b02e2ed 100644 --- a/arch/x86/kernel/cpu/sgx/encls.h +++ b/arch/x86/kernel/cpu/sgx/encls.h @@ -136,57 +136,71 @@ static inline bool encls_failed(int ret) ret; \ }) +/* Initialize an EPC page into an SGX Enclave Control Structure (SECS) page. */ static inline int __ecreate(struct sgx_pageinfo *pginfo, void *secs) { return __encls_2(ECREATE, pginfo, secs); } +/* Hash a 256 byte region of an enclave page to SECS:MRENCLAVE. */ static inline int __eextend(void *secs, void *addr) { return __encls_2(EEXTEND, secs, addr); } +/* + * Associate an EPC page to an enclave either as a REG or TCS page + * populated with the provided data. + */ static inline int __eadd(struct sgx_pageinfo *pginfo, void *addr) { return __encls_2(EADD, pginfo, addr); } +/* Finalize enclave build, initialize enclave for user code execution. */ static inline int __einit(void *sigstruct, void *token, void *secs) { return __encls_ret_3(EINIT, sigstruct, secs, token); } +/* Disassociate EPC page from its enclave and mark it as unused. */ static inline int __eremove(void *addr) { return __encls_ret_1(EREMOVE, addr); } +/* Copy data to an EPC page belonging to a debug enclave. */ static inline int __edbgwr(void *addr, unsigned long *data) { return __encls_2(EDGBWR, *data, addr); } +/* Copy data from an EPC page belonging to a debug enclave. */ static inline int __edbgrd(void *addr, unsigned long *data) { return __encls_1_1(EDGBRD, *data, addr); } +/* Track that software has completed the required TLB address clears. */ static inline int __etrack(void *addr) { return __encls_ret_1(ETRACK, addr); } +/* Load, verify, and unblock an EPC page. */ static inline int __eldu(struct sgx_pageinfo *pginfo, void *addr, void *va) { return __encls_ret_3(ELDU, pginfo, addr, va); } +/* Make EPC page inaccessible to enclave, ready to be written to memory. */ static inline int __eblock(void *addr) { return __encls_ret_1(EBLOCK, addr); } +/* Initialize an EPC page into a Version Array (VA) page. */ static inline int __epa(void *addr) { unsigned long rbx = SGX_PAGE_TYPE_VA; @@ -194,10 +208,29 @@ static inline int __epa(void *addr) return __encls_2(EPA, rbx, addr); } +/* Invalidate an EPC page and write it out to main memory. */ static inline int __ewb(struct sgx_pageinfo *pginfo, void *addr, void *va) { return __encls_ret_3(EWB, pginfo, addr, va); } +/* Restrict the EPCM permissions of an EPC page. */ +static inline int __emodpr(struct sgx_secinfo *secinfo, void *addr) +{ + return __encls_ret_2(EMODPR, secinfo, addr); +} + +/* Change the type of an EPC page. */ +static inline int __emodt(struct sgx_secinfo *secinfo, void *addr) +{ + return __encls_ret_2(EMODT, secinfo, addr); +} + +/* Zero a page of EPC memory and add it to an initialized enclave. */ +static inline int __eaug(struct sgx_pageinfo *pginfo, void *addr) +{ + return __encls_2(EAUG, pginfo, addr); +} + #endif /* _X86_ENCLS_H */ diff --git a/arch/x86/kernel/cpu/sgx/ioctl.c b/arch/x86/kernel/cpu/sgx/ioctl.c index 83df20e3e633..ebe79d60619f 100644 --- a/arch/x86/kernel/cpu/sgx/ioctl.c +++ b/arch/x86/kernel/cpu/sgx/ioctl.c @@ -17,7 +17,7 @@ #include "encl.h" #include "encls.h" -static struct sgx_va_page *sgx_encl_grow(struct sgx_encl *encl) +struct sgx_va_page *sgx_encl_grow(struct sgx_encl *encl, bool reclaim) { struct sgx_va_page *va_page = NULL; void *err; @@ -30,7 +30,7 @@ static struct sgx_va_page *sgx_encl_grow(struct sgx_encl *encl) if (!va_page) return ERR_PTR(-ENOMEM); - va_page->epc_page = sgx_alloc_va_page(); + va_page->epc_page = sgx_alloc_va_page(reclaim); if (IS_ERR(va_page->epc_page)) { err = ERR_CAST(va_page->epc_page); kfree(va_page); @@ -43,7 +43,7 @@ static struct sgx_va_page *sgx_encl_grow(struct sgx_encl *encl) return va_page; } -static void sgx_encl_shrink(struct sgx_encl *encl, struct sgx_va_page *va_page) +void sgx_encl_shrink(struct sgx_encl *encl, struct sgx_va_page *va_page) { encl->page_cnt--; @@ -64,7 +64,7 @@ static int sgx_encl_create(struct sgx_encl *encl, struct sgx_secs *secs) struct file *backing; long ret; - va_page = sgx_encl_grow(encl); + va_page = sgx_encl_grow(encl, true); if (IS_ERR(va_page)) return PTR_ERR(va_page); else if (va_page) @@ -107,6 +107,7 @@ static int sgx_encl_create(struct sgx_encl *encl, struct sgx_secs *secs) set_bit(SGX_ENCL_DEBUG, &encl->flags); encl->secs.encl = encl; + encl->secs.type = SGX_PAGE_TYPE_SECS; encl->base = secs->base; encl->size = secs->size; encl->attributes = secs->attributes; @@ -168,38 +169,6 @@ static long sgx_ioc_enclave_create(struct sgx_encl *encl, void __user *arg) return ret; } -static struct sgx_encl_page *sgx_encl_page_alloc(struct sgx_encl *encl, - unsigned long offset, - u64 secinfo_flags) -{ - struct sgx_encl_page *encl_page; - unsigned long prot; - - encl_page = kzalloc(sizeof(*encl_page), GFP_KERNEL); - if (!encl_page) - return ERR_PTR(-ENOMEM); - - encl_page->desc = encl->base + offset; - encl_page->encl = encl; - - prot = _calc_vm_trans(secinfo_flags, SGX_SECINFO_R, PROT_READ) | - _calc_vm_trans(secinfo_flags, SGX_SECINFO_W, PROT_WRITE) | - _calc_vm_trans(secinfo_flags, SGX_SECINFO_X, PROT_EXEC); - - /* - * TCS pages must always RW set for CPU access while the SECINFO - * permissions are *always* zero - the CPU ignores the user provided - * values and silently overwrites them with zero permissions. - */ - if ((secinfo_flags & SGX_SECINFO_PAGE_TYPE_MASK) == SGX_SECINFO_TCS) - prot |= PROT_READ | PROT_WRITE; - - /* Calculate maximum of the VM flags for the page. */ - encl_page->vm_max_prot_bits = calc_vm_prot_bits(prot, 0); - - return encl_page; -} - static int sgx_validate_secinfo(struct sgx_secinfo *secinfo) { u64 perm = secinfo->flags & SGX_SECINFO_PERMISSION_MASK; @@ -306,7 +275,7 @@ static int sgx_encl_add_page(struct sgx_encl *encl, unsigned long src, return PTR_ERR(epc_page); } - va_page = sgx_encl_grow(encl); + va_page = sgx_encl_grow(encl, true); if (IS_ERR(va_page)) { ret = PTR_ERR(va_page); goto err_out_free; @@ -344,6 +313,7 @@ static int sgx_encl_add_page(struct sgx_encl *encl, unsigned long src, */ encl_page->encl = encl; encl_page->epc_page = epc_page; + encl_page->type = (secinfo->flags & SGX_SECINFO_PAGE_TYPE_MASK) >> 8; encl->secs_child_cnt++; if (flags & SGX_PAGE_MEASURE) { @@ -372,6 +342,26 @@ err_out_free: return ret; } +/* + * Ensure user provided offset and length values are valid for + * an enclave. + */ +static int sgx_validate_offset_length(struct sgx_encl *encl, + unsigned long offset, + unsigned long length) +{ + if (!IS_ALIGNED(offset, PAGE_SIZE)) + return -EINVAL; + + if (!length || !IS_ALIGNED(length, PAGE_SIZE)) + return -EINVAL; + + if (offset + length - PAGE_SIZE >= encl->size) + return -EINVAL; + + return 0; +} + /** * sgx_ioc_enclave_add_pages() - The handler for %SGX_IOC_ENCLAVE_ADD_PAGES * @encl: an enclave pointer @@ -425,14 +415,10 @@ static long sgx_ioc_enclave_add_pages(struct sgx_encl *encl, void __user *arg) if (copy_from_user(&add_arg, arg, sizeof(add_arg))) return -EFAULT; - if (!IS_ALIGNED(add_arg.offset, PAGE_SIZE) || - !IS_ALIGNED(add_arg.src, PAGE_SIZE)) - return -EINVAL; - - if (!add_arg.length || add_arg.length & (PAGE_SIZE - 1)) + if (!IS_ALIGNED(add_arg.src, PAGE_SIZE)) return -EINVAL; - if (add_arg.offset + add_arg.length - PAGE_SIZE >= encl->size) + if (sgx_validate_offset_length(encl, add_arg.offset, add_arg.length)) return -EINVAL; if (copy_from_user(&secinfo, (void __user *)add_arg.secinfo, @@ -674,6 +660,565 @@ static long sgx_ioc_enclave_provision(struct sgx_encl *encl, void __user *arg) return sgx_set_attribute(&encl->attributes_mask, params.fd); } +/* + * Ensure enclave is ready for SGX2 functions. Readiness is checked + * by ensuring the hardware supports SGX2 and the enclave is initialized + * and thus able to handle requests to modify pages within it. + */ +static int sgx_ioc_sgx2_ready(struct sgx_encl *encl) +{ + if (!(cpu_feature_enabled(X86_FEATURE_SGX2))) + return -ENODEV; + + if (!test_bit(SGX_ENCL_INITIALIZED, &encl->flags)) + return -EINVAL; + + return 0; +} + +/* + * Some SGX functions require that no cached linear-to-physical address + * mappings are present before they can succeed. Collaborate with + * hardware via ENCLS[ETRACK] to ensure that all cached + * linear-to-physical address mappings belonging to all threads of + * the enclave are cleared. See sgx_encl_cpumask() for details. + * + * Must be called with enclave's mutex held from the time the + * SGX function requiring that no cached linear-to-physical mappings + * are present is executed until this ETRACK flow is complete. + */ +static int sgx_enclave_etrack(struct sgx_encl *encl) +{ + void *epc_virt; + int ret; + + epc_virt = sgx_get_epc_virt_addr(encl->secs.epc_page); + ret = __etrack(epc_virt); + if (ret) { + /* + * ETRACK only fails when there is an OS issue. For + * example, two consecutive ETRACK was sent without + * completed IPI between. + */ + pr_err_once("ETRACK returned %d (0x%x)", ret, ret); + /* + * Send IPIs to kick CPUs out of the enclave and + * try ETRACK again. + */ + on_each_cpu_mask(sgx_encl_cpumask(encl), sgx_ipi_cb, NULL, 1); + ret = __etrack(epc_virt); + if (ret) { + pr_err_once("ETRACK repeat returned %d (0x%x)", + ret, ret); + return -EFAULT; + } + } + on_each_cpu_mask(sgx_encl_cpumask(encl), sgx_ipi_cb, NULL, 1); + + return 0; +} + +/** + * sgx_enclave_restrict_permissions() - Restrict EPCM permissions + * @encl: Enclave to which the pages belong. + * @modp: Checked parameters from user on which pages need modifying and + * their new permissions. + * + * Return: + * - 0: Success. + * - -errno: Otherwise. + */ +static long +sgx_enclave_restrict_permissions(struct sgx_encl *encl, + struct sgx_enclave_restrict_permissions *modp) +{ + struct sgx_encl_page *entry; + struct sgx_secinfo secinfo; + unsigned long addr; + unsigned long c; + void *epc_virt; + int ret; + + memset(&secinfo, 0, sizeof(secinfo)); + secinfo.flags = modp->permissions & SGX_SECINFO_PERMISSION_MASK; + + for (c = 0 ; c < modp->length; c += PAGE_SIZE) { + addr = encl->base + modp->offset + c; + + sgx_reclaim_direct(); + + mutex_lock(&encl->lock); + + entry = sgx_encl_load_page(encl, addr); + if (IS_ERR(entry)) { + ret = PTR_ERR(entry) == -EBUSY ? -EAGAIN : -EFAULT; + goto out_unlock; + } + + /* + * Changing EPCM permissions is only supported on regular + * SGX pages. Attempting this change on other pages will + * result in #PF. + */ + if (entry->type != SGX_PAGE_TYPE_REG) { + ret = -EINVAL; + goto out_unlock; + } + + /* + * Apart from ensuring that read-access remains, do not verify + * the permission bits requested. Kernel has no control over + * how EPCM permissions can be relaxed from within the enclave. + * ENCLS[EMODPR] can only remove existing EPCM permissions, + * attempting to set new permissions will be ignored by the + * hardware. + */ + + /* Change EPCM permissions. */ + epc_virt = sgx_get_epc_virt_addr(entry->epc_page); + ret = __emodpr(&secinfo, epc_virt); + if (encls_faulted(ret)) { + /* + * All possible faults should be avoidable: + * parameters have been checked, will only change + * permissions of a regular page, and no concurrent + * SGX1/SGX2 ENCLS instructions since these + * are protected with mutex. + */ + pr_err_once("EMODPR encountered exception %d\n", + ENCLS_TRAPNR(ret)); + ret = -EFAULT; + goto out_unlock; + } + if (encls_failed(ret)) { + modp->result = ret; + ret = -EFAULT; + goto out_unlock; + } + + ret = sgx_enclave_etrack(encl); + if (ret) { + ret = -EFAULT; + goto out_unlock; + } + + mutex_unlock(&encl->lock); + } + + ret = 0; + goto out; + +out_unlock: + mutex_unlock(&encl->lock); +out: + modp->count = c; + + return ret; +} + +/** + * sgx_ioc_enclave_restrict_permissions() - handler for + * %SGX_IOC_ENCLAVE_RESTRICT_PERMISSIONS + * @encl: an enclave pointer + * @arg: userspace pointer to a &struct sgx_enclave_restrict_permissions + * instance + * + * SGX2 distinguishes between relaxing and restricting the enclave page + * permissions maintained by the hardware (EPCM permissions) of pages + * belonging to an initialized enclave (after SGX_IOC_ENCLAVE_INIT). + * + * EPCM permissions cannot be restricted from within the enclave, the enclave + * requires the kernel to run the privileged level 0 instructions ENCLS[EMODPR] + * and ENCLS[ETRACK]. An attempt to relax EPCM permissions with this call + * will be ignored by the hardware. + * + * Return: + * - 0: Success + * - -errno: Otherwise + */ +static long sgx_ioc_enclave_restrict_permissions(struct sgx_encl *encl, + void __user *arg) +{ + struct sgx_enclave_restrict_permissions params; + long ret; + + ret = sgx_ioc_sgx2_ready(encl); + if (ret) + return ret; + + if (copy_from_user(¶ms, arg, sizeof(params))) + return -EFAULT; + + if (sgx_validate_offset_length(encl, params.offset, params.length)) + return -EINVAL; + + if (params.permissions & ~SGX_SECINFO_PERMISSION_MASK) + return -EINVAL; + + /* + * Fail early if invalid permissions requested to prevent ENCLS[EMODPR] + * from faulting later when the CPU does the same check. + */ + if ((params.permissions & SGX_SECINFO_W) && + !(params.permissions & SGX_SECINFO_R)) + return -EINVAL; + + if (params.result || params.count) + return -EINVAL; + + ret = sgx_enclave_restrict_permissions(encl, ¶ms); + + if (copy_to_user(arg, ¶ms, sizeof(params))) + return -EFAULT; + + return ret; +} + +/** + * sgx_enclave_modify_types() - Modify type of SGX enclave pages + * @encl: Enclave to which the pages belong. + * @modt: Checked parameters from user about which pages need modifying + * and their new page type. + * + * Return: + * - 0: Success + * - -errno: Otherwise + */ +static long sgx_enclave_modify_types(struct sgx_encl *encl, + struct sgx_enclave_modify_types *modt) +{ + unsigned long max_prot_restore; + enum sgx_page_type page_type; + struct sgx_encl_page *entry; + struct sgx_secinfo secinfo; + unsigned long prot; + unsigned long addr; + unsigned long c; + void *epc_virt; + int ret; + + page_type = modt->page_type & SGX_PAGE_TYPE_MASK; + + /* + * The only new page types allowed by hardware are PT_TCS and PT_TRIM. + */ + if (page_type != SGX_PAGE_TYPE_TCS && page_type != SGX_PAGE_TYPE_TRIM) + return -EINVAL; + + memset(&secinfo, 0, sizeof(secinfo)); + + secinfo.flags = page_type << 8; + + for (c = 0 ; c < modt->length; c += PAGE_SIZE) { + addr = encl->base + modt->offset + c; + + sgx_reclaim_direct(); + + mutex_lock(&encl->lock); + + entry = sgx_encl_load_page(encl, addr); + if (IS_ERR(entry)) { + ret = PTR_ERR(entry) == -EBUSY ? -EAGAIN : -EFAULT; + goto out_unlock; + } + + /* + * Borrow the logic from the Intel SDM. Regular pages + * (SGX_PAGE_TYPE_REG) can change type to SGX_PAGE_TYPE_TCS + * or SGX_PAGE_TYPE_TRIM but TCS pages can only be trimmed. + * CET pages not supported yet. + */ + if (!(entry->type == SGX_PAGE_TYPE_REG || + (entry->type == SGX_PAGE_TYPE_TCS && + page_type == SGX_PAGE_TYPE_TRIM))) { + ret = -EINVAL; + goto out_unlock; + } + + max_prot_restore = entry->vm_max_prot_bits; + + /* + * Once a regular page becomes a TCS page it cannot be + * changed back. So the maximum allowed protection reflects + * the TCS page that is always RW from kernel perspective but + * will be inaccessible from within enclave. Before doing + * so, do make sure that the new page type continues to + * respect the originally vetted page permissions. + */ + if (entry->type == SGX_PAGE_TYPE_REG && + page_type == SGX_PAGE_TYPE_TCS) { + if (~entry->vm_max_prot_bits & (VM_READ | VM_WRITE)) { + ret = -EPERM; + goto out_unlock; + } + prot = PROT_READ | PROT_WRITE; + entry->vm_max_prot_bits = calc_vm_prot_bits(prot, 0); + + /* + * Prevent page from being reclaimed while mutex + * is released. + */ + if (sgx_unmark_page_reclaimable(entry->epc_page)) { + ret = -EAGAIN; + goto out_entry_changed; + } + + /* + * Do not keep encl->lock because of dependency on + * mmap_lock acquired in sgx_zap_enclave_ptes(). + */ + mutex_unlock(&encl->lock); + + sgx_zap_enclave_ptes(encl, addr); + + mutex_lock(&encl->lock); + + sgx_mark_page_reclaimable(entry->epc_page); + } + + /* Change EPC type */ + epc_virt = sgx_get_epc_virt_addr(entry->epc_page); + ret = __emodt(&secinfo, epc_virt); + if (encls_faulted(ret)) { + /* + * All possible faults should be avoidable: + * parameters have been checked, will only change + * valid page types, and no concurrent + * SGX1/SGX2 ENCLS instructions since these are + * protected with mutex. + */ + pr_err_once("EMODT encountered exception %d\n", + ENCLS_TRAPNR(ret)); + ret = -EFAULT; + goto out_entry_changed; + } + if (encls_failed(ret)) { + modt->result = ret; + ret = -EFAULT; + goto out_entry_changed; + } + + ret = sgx_enclave_etrack(encl); + if (ret) { + ret = -EFAULT; + goto out_unlock; + } + + entry->type = page_type; + + mutex_unlock(&encl->lock); + } + + ret = 0; + goto out; + +out_entry_changed: + entry->vm_max_prot_bits = max_prot_restore; +out_unlock: + mutex_unlock(&encl->lock); +out: + modt->count = c; + + return ret; +} + +/** + * sgx_ioc_enclave_modify_types() - handler for %SGX_IOC_ENCLAVE_MODIFY_TYPES + * @encl: an enclave pointer + * @arg: userspace pointer to a &struct sgx_enclave_modify_types instance + * + * Ability to change the enclave page type supports the following use cases: + * + * * It is possible to add TCS pages to an enclave by changing the type of + * regular pages (%SGX_PAGE_TYPE_REG) to TCS (%SGX_PAGE_TYPE_TCS) pages. + * With this support the number of threads supported by an initialized + * enclave can be increased dynamically. + * + * * Regular or TCS pages can dynamically be removed from an initialized + * enclave by changing the page type to %SGX_PAGE_TYPE_TRIM. Changing the + * page type to %SGX_PAGE_TYPE_TRIM marks the page for removal with actual + * removal done by handler of %SGX_IOC_ENCLAVE_REMOVE_PAGES ioctl() called + * after ENCLU[EACCEPT] is run on %SGX_PAGE_TYPE_TRIM page from within the + * enclave. + * + * Return: + * - 0: Success + * - -errno: Otherwise + */ +static long sgx_ioc_enclave_modify_types(struct sgx_encl *encl, + void __user *arg) +{ + struct sgx_enclave_modify_types params; + long ret; + + ret = sgx_ioc_sgx2_ready(encl); + if (ret) + return ret; + + if (copy_from_user(¶ms, arg, sizeof(params))) + return -EFAULT; + + if (sgx_validate_offset_length(encl, params.offset, params.length)) + return -EINVAL; + + if (params.page_type & ~SGX_PAGE_TYPE_MASK) + return -EINVAL; + + if (params.result || params.count) + return -EINVAL; + + ret = sgx_enclave_modify_types(encl, ¶ms); + + if (copy_to_user(arg, ¶ms, sizeof(params))) + return -EFAULT; + + return ret; +} + +/** + * sgx_encl_remove_pages() - Remove trimmed pages from SGX enclave + * @encl: Enclave to which the pages belong + * @params: Checked parameters from user on which pages need to be removed + * + * Return: + * - 0: Success. + * - -errno: Otherwise. + */ +static long sgx_encl_remove_pages(struct sgx_encl *encl, + struct sgx_enclave_remove_pages *params) +{ + struct sgx_encl_page *entry; + struct sgx_secinfo secinfo; + unsigned long addr; + unsigned long c; + void *epc_virt; + int ret; + + memset(&secinfo, 0, sizeof(secinfo)); + secinfo.flags = SGX_SECINFO_R | SGX_SECINFO_W | SGX_SECINFO_X; + + for (c = 0 ; c < params->length; c += PAGE_SIZE) { + addr = encl->base + params->offset + c; + + sgx_reclaim_direct(); + + mutex_lock(&encl->lock); + + entry = sgx_encl_load_page(encl, addr); + if (IS_ERR(entry)) { + ret = PTR_ERR(entry) == -EBUSY ? -EAGAIN : -EFAULT; + goto out_unlock; + } + + if (entry->type != SGX_PAGE_TYPE_TRIM) { + ret = -EPERM; + goto out_unlock; + } + + /* + * ENCLS[EMODPR] is a no-op instruction used to inform if + * ENCLU[EACCEPT] was run from within the enclave. If + * ENCLS[EMODPR] is run with RWX on a trimmed page that is + * not yet accepted then it will return + * %SGX_PAGE_NOT_MODIFIABLE, after the trimmed page is + * accepted the instruction will encounter a page fault. + */ + epc_virt = sgx_get_epc_virt_addr(entry->epc_page); + ret = __emodpr(&secinfo, epc_virt); + if (!encls_faulted(ret) || ENCLS_TRAPNR(ret) != X86_TRAP_PF) { + ret = -EPERM; + goto out_unlock; + } + + if (sgx_unmark_page_reclaimable(entry->epc_page)) { + ret = -EBUSY; + goto out_unlock; + } + + /* + * Do not keep encl->lock because of dependency on + * mmap_lock acquired in sgx_zap_enclave_ptes(). + */ + mutex_unlock(&encl->lock); + + sgx_zap_enclave_ptes(encl, addr); + + mutex_lock(&encl->lock); + + sgx_encl_free_epc_page(entry->epc_page); + encl->secs_child_cnt--; + entry->epc_page = NULL; + xa_erase(&encl->page_array, PFN_DOWN(entry->desc)); + sgx_encl_shrink(encl, NULL); + kfree(entry); + + mutex_unlock(&encl->lock); + } + + ret = 0; + goto out; + +out_unlock: + mutex_unlock(&encl->lock); +out: + params->count = c; + + return ret; +} + +/** + * sgx_ioc_enclave_remove_pages() - handler for %SGX_IOC_ENCLAVE_REMOVE_PAGES + * @encl: an enclave pointer + * @arg: userspace pointer to &struct sgx_enclave_remove_pages instance + * + * Final step of the flow removing pages from an initialized enclave. The + * complete flow is: + * + * 1) User changes the type of the pages to be removed to %SGX_PAGE_TYPE_TRIM + * using the %SGX_IOC_ENCLAVE_MODIFY_TYPES ioctl(). + * 2) User approves the page removal by running ENCLU[EACCEPT] from within + * the enclave. + * 3) User initiates actual page removal using the + * %SGX_IOC_ENCLAVE_REMOVE_PAGES ioctl() that is handled here. + * + * First remove any page table entries pointing to the page and then proceed + * with the actual removal of the enclave page and data in support of it. + * + * VA pages are not affected by this removal. It is thus possible that the + * enclave may end up with more VA pages than needed to support all its + * pages. + * + * Return: + * - 0: Success + * - -errno: Otherwise + */ +static long sgx_ioc_enclave_remove_pages(struct sgx_encl *encl, + void __user *arg) +{ + struct sgx_enclave_remove_pages params; + long ret; + + ret = sgx_ioc_sgx2_ready(encl); + if (ret) + return ret; + + if (copy_from_user(¶ms, arg, sizeof(params))) + return -EFAULT; + + if (sgx_validate_offset_length(encl, params.offset, params.length)) + return -EINVAL; + + if (params.count) + return -EINVAL; + + ret = sgx_encl_remove_pages(encl, ¶ms); + + if (copy_to_user(arg, ¶ms, sizeof(params))) + return -EFAULT; + + return ret; +} + long sgx_ioctl(struct file *filep, unsigned int cmd, unsigned long arg) { struct sgx_encl *encl = filep->private_data; @@ -695,6 +1240,16 @@ long sgx_ioctl(struct file *filep, unsigned int cmd, unsigned long arg) case SGX_IOC_ENCLAVE_PROVISION: ret = sgx_ioc_enclave_provision(encl, (void __user *)arg); break; + case SGX_IOC_ENCLAVE_RESTRICT_PERMISSIONS: + ret = sgx_ioc_enclave_restrict_permissions(encl, + (void __user *)arg); + break; + case SGX_IOC_ENCLAVE_MODIFY_TYPES: + ret = sgx_ioc_enclave_modify_types(encl, (void __user *)arg); + break; + case SGX_IOC_ENCLAVE_REMOVE_PAGES: + ret = sgx_ioc_enclave_remove_pages(encl, (void __user *)arg); + break; default: ret = -ENOIOCTLCMD; break; diff --git a/arch/x86/kernel/cpu/sgx/main.c b/arch/x86/kernel/cpu/sgx/main.c index a78652d43e61..515e2a5f25bb 100644 --- a/arch/x86/kernel/cpu/sgx/main.c +++ b/arch/x86/kernel/cpu/sgx/main.c @@ -137,36 +137,9 @@ static void sgx_reclaimer_block(struct sgx_epc_page *epc_page) struct sgx_encl_page *page = epc_page->owner; unsigned long addr = page->desc & PAGE_MASK; struct sgx_encl *encl = page->encl; - unsigned long mm_list_version; - struct sgx_encl_mm *encl_mm; - struct vm_area_struct *vma; - int idx, ret; - - do { - mm_list_version = encl->mm_list_version; - - /* Pairs with smp_rmb() in sgx_encl_mm_add(). */ - smp_rmb(); - - idx = srcu_read_lock(&encl->srcu); - - list_for_each_entry_rcu(encl_mm, &encl->mm_list, list) { - if (!mmget_not_zero(encl_mm->mm)) - continue; - - mmap_read_lock(encl_mm->mm); - - ret = sgx_encl_find(encl_mm->mm, addr, &vma); - if (!ret && encl == vma->vm_private_data) - zap_vma_ptes(vma, addr, PAGE_SIZE); - - mmap_read_unlock(encl_mm->mm); - - mmput_async(encl_mm->mm); - } + int ret; - srcu_read_unlock(&encl->srcu, idx); - } while (unlikely(encl->mm_list_version != mm_list_version)); + sgx_zap_enclave_ptes(encl, addr); mutex_lock(&encl->lock); @@ -201,39 +174,10 @@ static int __sgx_encl_ewb(struct sgx_epc_page *epc_page, void *va_slot, return ret; } -static void sgx_ipi_cb(void *info) +void sgx_ipi_cb(void *info) { } -static const cpumask_t *sgx_encl_ewb_cpumask(struct sgx_encl *encl) -{ - cpumask_t *cpumask = &encl->cpumask; - struct sgx_encl_mm *encl_mm; - int idx; - - /* - * Can race with sgx_encl_mm_add(), but ETRACK has already been - * executed, which means that the CPUs running in the new mm will enter - * into the enclave with a fresh epoch. - */ - cpumask_clear(cpumask); - - idx = srcu_read_lock(&encl->srcu); - - list_for_each_entry_rcu(encl_mm, &encl->mm_list, list) { - if (!mmget_not_zero(encl_mm->mm)) - continue; - - cpumask_or(cpumask, cpumask, mm_cpumask(encl_mm->mm)); - - mmput_async(encl_mm->mm); - } - - srcu_read_unlock(&encl->srcu, idx); - - return cpumask; -} - /* * Swap page to the regular memory transformed to the blocked state by using * EBLOCK, which means that it can no longer be referenced (no new TLB entries). @@ -280,7 +224,7 @@ static void sgx_encl_ewb(struct sgx_epc_page *epc_page, * miss cpus that entered the enclave between * generating the mask and incrementing epoch. */ - on_each_cpu_mask(sgx_encl_ewb_cpumask(encl), + on_each_cpu_mask(sgx_encl_cpumask(encl), sgx_ipi_cb, NULL, 1); ret = __sgx_encl_ewb(epc_page, va_slot, backing); } @@ -431,6 +375,17 @@ static bool sgx_should_reclaim(unsigned long watermark) !list_empty(&sgx_active_page_list); } +/* + * sgx_reclaim_direct() should be called (without enclave's mutex held) + * in locations where SGX memory resources might be low and might be + * needed in order to make forward progress. + */ +void sgx_reclaim_direct(void) +{ + if (sgx_should_reclaim(SGX_NR_LOW_PAGES)) + sgx_reclaim_pages(); +} + static int ksgxd(void *p) { set_freezable(); diff --git a/arch/x86/kernel/cpu/sgx/sgx.h b/arch/x86/kernel/cpu/sgx/sgx.h index 0f17def9fe6f..0f2020653fba 100644 --- a/arch/x86/kernel/cpu/sgx/sgx.h +++ b/arch/x86/kernel/cpu/sgx/sgx.h @@ -86,10 +86,13 @@ static inline void *sgx_get_epc_virt_addr(struct sgx_epc_page *page) struct sgx_epc_page *__sgx_alloc_epc_page(void); void sgx_free_epc_page(struct sgx_epc_page *page); +void sgx_reclaim_direct(void); void sgx_mark_page_reclaimable(struct sgx_epc_page *page); int sgx_unmark_page_reclaimable(struct sgx_epc_page *page); struct sgx_epc_page *sgx_alloc_epc_page(void *owner, bool reclaim); +void sgx_ipi_cb(void *info); + #ifdef CONFIG_X86_SGX_KVM int __init sgx_vepc_init(void); #else diff --git a/arch/x86/kernel/cpu/vmware.c b/arch/x86/kernel/cpu/vmware.c index c04b933f48d3..02039ec3597d 100644 --- a/arch/x86/kernel/cpu/vmware.c +++ b/arch/x86/kernel/cpu/vmware.c @@ -476,8 +476,8 @@ static bool __init vmware_legacy_x2apic_available(void) { uint32_t eax, ebx, ecx, edx; VMWARE_CMD(GETVCPU_INFO, eax, ebx, ecx, edx); - return (eax & (1 << VMWARE_CMD_VCPU_RESERVED)) == 0 && - (eax & (1 << VMWARE_CMD_LEGACY_X2APIC)) != 0; + return !(eax & BIT(VMWARE_CMD_VCPU_RESERVED)) && + (eax & BIT(VMWARE_CMD_LEGACY_X2APIC)); } #ifdef CONFIG_AMD_MEM_ENCRYPT |