Diffstat (limited to 'arch/x86/kernel/cpu')
-rw-r--r--  arch/x86/kernel/cpu/amd.c          |  48
-rw-r--r--  arch/x86/kernel/cpu/bugs.c         | 488
-rw-r--r--  arch/x86/kernel/cpu/common.c       |  61
-rw-r--r--  arch/x86/kernel/cpu/cpu.h          |   2
-rw-r--r--  arch/x86/kernel/cpu/cyrix.c        |   1
-rw-r--r--  arch/x86/kernel/cpu/feat_ctl.c     |   9
-rw-r--r--  arch/x86/kernel/cpu/hygon.c        |   6
-rw-r--r--  arch/x86/kernel/cpu/intel.c        |  31
-rw-r--r--  arch/x86/kernel/cpu/mce/inject.c   |  47
-rw-r--r--  arch/x86/kernel/cpu/mce/internal.h |   2
-rw-r--r--  arch/x86/kernel/cpu/rdrand.c       |  59
-rw-r--r--  arch/x86/kernel/cpu/scattered.c    |   1
-rw-r--r--  arch/x86/kernel/cpu/sgx/encl.c     | 330
-rw-r--r--  arch/x86/kernel/cpu/sgx/encl.h     |  16
-rw-r--r--  arch/x86/kernel/cpu/sgx/encls.h    |  33
-rw-r--r--  arch/x86/kernel/cpu/sgx/ioctl.c    | 641
-rw-r--r--  arch/x86/kernel/cpu/sgx/main.c     |  75
-rw-r--r--  arch/x86/kernel/cpu/sgx/sgx.h      |   3
-rw-r--r--  arch/x86/kernel/cpu/vmware.c       |   4
19 files changed, 1571 insertions(+), 286 deletions(-)
diff --git a/arch/x86/kernel/cpu/amd.c b/arch/x86/kernel/cpu/amd.c
index 0c0b09796ced..48276c0e479d 100644
--- a/arch/x86/kernel/cpu/amd.c
+++ b/arch/x86/kernel/cpu/amd.c
@@ -808,7 +808,7 @@ static void clear_rdrand_cpuid_bit(struct cpuinfo_x86 *c)
return;
/*
- * The nordrand option can clear X86_FEATURE_RDRAND, so check for
+ * The self-test can clear X86_FEATURE_RDRAND, so check for
* RDRAND support using the CPUID function directly.
*/
if (!(cpuid_ecx(1) & BIT(30)) || rdrand_force)
@@ -862,6 +862,28 @@ static void init_amd_bd(struct cpuinfo_x86 *c)
clear_rdrand_cpuid_bit(c);
}
+void init_spectral_chicken(struct cpuinfo_x86 *c)
+{
+#ifdef CONFIG_CPU_UNRET_ENTRY
+ u64 value;
+
+ /*
+ * On Zen2 we offer this chicken (bit) on the altar of Speculation.
+ *
+ * This suppresses speculation from the middle of a basic block, i.e. it
+ * suppresses non-branch predictions.
+ *
+ * We use STIBP as a heuristic to filter out Zen2 from the rest of F17H
+ */
+ if (!cpu_has(c, X86_FEATURE_HYPERVISOR) && cpu_has(c, X86_FEATURE_AMD_STIBP)) {
+ if (!rdmsrl_safe(MSR_ZEN2_SPECTRAL_CHICKEN, &value)) {
+ value |= MSR_ZEN2_SPECTRAL_CHICKEN_BIT;
+ wrmsrl_safe(MSR_ZEN2_SPECTRAL_CHICKEN, value);
+ }
+ }
+#endif
+}
+
static void init_amd_zn(struct cpuinfo_x86 *c)
{
set_cpu_cap(c, X86_FEATURE_ZEN);
@@ -870,12 +892,21 @@ static void init_amd_zn(struct cpuinfo_x86 *c)
node_reclaim_distance = 32;
#endif
- /*
- * Fix erratum 1076: CPB feature bit not being set in CPUID.
- * Always set it, except when running under a hypervisor.
- */
- if (!cpu_has(c, X86_FEATURE_HYPERVISOR) && !cpu_has(c, X86_FEATURE_CPB))
- set_cpu_cap(c, X86_FEATURE_CPB);
+ /* Fix up CPUID bits, but only if not virtualised. */
+ if (!cpu_has(c, X86_FEATURE_HYPERVISOR)) {
+
+ /* Erratum 1076: CPB feature bit not being set in CPUID. */
+ if (!cpu_has(c, X86_FEATURE_CPB))
+ set_cpu_cap(c, X86_FEATURE_CPB);
+
+ /*
+ * Zen3 (Fam19 model < 0x10) parts are not susceptible to
+ * Branch Type Confusion, but predate the allocation of the
+ * BTC_NO bit.
+ */
+ if (c->x86 == 0x19 && !cpu_has(c, X86_FEATURE_BTC_NO))
+ set_cpu_cap(c, X86_FEATURE_BTC_NO);
+ }
}
static void init_amd(struct cpuinfo_x86 *c)
@@ -907,7 +938,8 @@ static void init_amd(struct cpuinfo_x86 *c)
case 0x12: init_amd_ln(c); break;
case 0x15: init_amd_bd(c); break;
case 0x16: init_amd_jg(c); break;
- case 0x17: fallthrough;
+ case 0x17: init_spectral_chicken(c);
+ fallthrough;
case 0x19: init_amd_zn(c); break;
}
diff --git a/arch/x86/kernel/cpu/bugs.c b/arch/x86/kernel/cpu/bugs.c
index 74c62cc47a5f..6761668100b9 100644
--- a/arch/x86/kernel/cpu/bugs.c
+++ b/arch/x86/kernel/cpu/bugs.c
@@ -38,6 +38,8 @@
static void __init spectre_v1_select_mitigation(void);
static void __init spectre_v2_select_mitigation(void);
+static void __init retbleed_select_mitigation(void);
+static void __init spectre_v2_user_select_mitigation(void);
static void __init ssb_select_mitigation(void);
static void __init l1tf_select_mitigation(void);
static void __init mds_select_mitigation(void);
@@ -48,16 +50,40 @@ static void __init mmio_select_mitigation(void);
static void __init srbds_select_mitigation(void);
static void __init l1d_flush_select_mitigation(void);
-/* The base value of the SPEC_CTRL MSR that always has to be preserved. */
+/* The base value of the SPEC_CTRL MSR without task-specific bits set */
u64 x86_spec_ctrl_base;
EXPORT_SYMBOL_GPL(x86_spec_ctrl_base);
+
+/* The current value of the SPEC_CTRL MSR with task-specific bits set */
+DEFINE_PER_CPU(u64, x86_spec_ctrl_current);
+EXPORT_SYMBOL_GPL(x86_spec_ctrl_current);
+
static DEFINE_MUTEX(spec_ctrl_mutex);
/*
- * The vendor and possibly platform specific bits which can be modified in
- * x86_spec_ctrl_base.
+ * Keep track of the SPEC_CTRL MSR value for the current task, which may differ
+ * from x86_spec_ctrl_base due to STIBP/SSB in __speculation_ctrl_update().
*/
-static u64 __ro_after_init x86_spec_ctrl_mask = SPEC_CTRL_IBRS;
+void write_spec_ctrl_current(u64 val, bool force)
+{
+ if (this_cpu_read(x86_spec_ctrl_current) == val)
+ return;
+
+ this_cpu_write(x86_spec_ctrl_current, val);
+
+ /*
+ * When KERNEL_IBRS this MSR is written on return-to-user, unless
+ * forced the update can be delayed until that time.
+ */
+ if (force || !cpu_feature_enabled(X86_FEATURE_KERNEL_IBRS))
+ wrmsrl(MSR_IA32_SPEC_CTRL, val);
+}
+
+u64 spec_ctrl_current(void)
+{
+ return this_cpu_read(x86_spec_ctrl_current);
+}
+EXPORT_SYMBOL_GPL(spec_ctrl_current);
/*
* AMD specific MSR info for Speculative Store Bypass control.
@@ -114,13 +140,21 @@ void __init check_bugs(void)
if (boot_cpu_has(X86_FEATURE_MSR_SPEC_CTRL))
rdmsrl(MSR_IA32_SPEC_CTRL, x86_spec_ctrl_base);
- /* Allow STIBP in MSR_SPEC_CTRL if supported */
- if (boot_cpu_has(X86_FEATURE_STIBP))
- x86_spec_ctrl_mask |= SPEC_CTRL_STIBP;
-
/* Select the proper CPU mitigations before patching alternatives: */
spectre_v1_select_mitigation();
spectre_v2_select_mitigation();
+ /*
+ * retbleed_select_mitigation() relies on the state set by
+ * spectre_v2_select_mitigation(); specifically it wants to know about
+ * spectre_v2=ibrs.
+ */
+ retbleed_select_mitigation();
+ /*
+ * spectre_v2_user_select_mitigation() relies on the state set by
+ * retbleed_select_mitigation(); specifically the STIBP selection is
+ * forced for UNRET.
+ */
+ spectre_v2_user_select_mitigation();
ssb_select_mitigation();
l1tf_select_mitigation();
md_clear_select_mitigation();
@@ -161,31 +195,17 @@ void __init check_bugs(void)
#endif
}
+/*
+ * NOTE: This function is *only* called for SVM. VMX spec_ctrl handling is
+ * done in vmenter.S.
+ */
void
x86_virt_spec_ctrl(u64 guest_spec_ctrl, u64 guest_virt_spec_ctrl, bool setguest)
{
- u64 msrval, guestval, hostval = x86_spec_ctrl_base;
+ u64 msrval, guestval = guest_spec_ctrl, hostval = spec_ctrl_current();
struct thread_info *ti = current_thread_info();
- /* Is MSR_SPEC_CTRL implemented ? */
if (static_cpu_has(X86_FEATURE_MSR_SPEC_CTRL)) {
- /*
- * Restrict guest_spec_ctrl to supported values. Clear the
- * modifiable bits in the host base value and or the
- * modifiable bits from the guest value.
- */
- guestval = hostval & ~x86_spec_ctrl_mask;
- guestval |= guest_spec_ctrl & x86_spec_ctrl_mask;
-
- /* SSBD controlled in MSR_SPEC_CTRL */
- if (static_cpu_has(X86_FEATURE_SPEC_CTRL_SSBD) ||
- static_cpu_has(X86_FEATURE_AMD_SSBD))
- hostval |= ssbd_tif_to_spec_ctrl(ti->flags);
-
- /* Conditional STIBP enabled? */
- if (static_branch_unlikely(&switch_to_cond_stibp))
- hostval |= stibp_tif_to_spec_ctrl(ti->flags);
-
if (hostval != guestval) {
msrval = setguest ? guestval : hostval;
wrmsrl(MSR_IA32_SPEC_CTRL, msrval);
@@ -752,12 +772,180 @@ static int __init nospectre_v1_cmdline(char *str)
}
early_param("nospectre_v1", nospectre_v1_cmdline);
-#undef pr_fmt
-#define pr_fmt(fmt) "Spectre V2 : " fmt
-
static enum spectre_v2_mitigation spectre_v2_enabled __ro_after_init =
SPECTRE_V2_NONE;
+#undef pr_fmt
+#define pr_fmt(fmt) "RETBleed: " fmt
+
+enum retbleed_mitigation {
+ RETBLEED_MITIGATION_NONE,
+ RETBLEED_MITIGATION_UNRET,
+ RETBLEED_MITIGATION_IBPB,
+ RETBLEED_MITIGATION_IBRS,
+ RETBLEED_MITIGATION_EIBRS,
+};
+
+enum retbleed_mitigation_cmd {
+ RETBLEED_CMD_OFF,
+ RETBLEED_CMD_AUTO,
+ RETBLEED_CMD_UNRET,
+ RETBLEED_CMD_IBPB,
+};
+
+static const char * const retbleed_strings[] = {
+ [RETBLEED_MITIGATION_NONE] = "Vulnerable",
+ [RETBLEED_MITIGATION_UNRET] = "Mitigation: untrained return thunk",
+ [RETBLEED_MITIGATION_IBPB] = "Mitigation: IBPB",
+ [RETBLEED_MITIGATION_IBRS] = "Mitigation: IBRS",
+ [RETBLEED_MITIGATION_EIBRS] = "Mitigation: Enhanced IBRS",
+};
+
+static enum retbleed_mitigation retbleed_mitigation __ro_after_init =
+ RETBLEED_MITIGATION_NONE;
+static enum retbleed_mitigation_cmd retbleed_cmd __ro_after_init =
+ RETBLEED_CMD_AUTO;
+
+static int __ro_after_init retbleed_nosmt = false;
+
+static int __init retbleed_parse_cmdline(char *str)
+{
+ if (!str)
+ return -EINVAL;
+
+ while (str) {
+ char *next = strchr(str, ',');
+ if (next) {
+ *next = 0;
+ next++;
+ }
+
+ if (!strcmp(str, "off")) {
+ retbleed_cmd = RETBLEED_CMD_OFF;
+ } else if (!strcmp(str, "auto")) {
+ retbleed_cmd = RETBLEED_CMD_AUTO;
+ } else if (!strcmp(str, "unret")) {
+ retbleed_cmd = RETBLEED_CMD_UNRET;
+ } else if (!strcmp(str, "ibpb")) {
+ retbleed_cmd = RETBLEED_CMD_IBPB;
+ } else if (!strcmp(str, "nosmt")) {
+ retbleed_nosmt = true;
+ } else {
+ pr_err("Ignoring unknown retbleed option (%s).", str);
+ }
+
+ str = next;
+ }
+
+ return 0;
+}
+early_param("retbleed", retbleed_parse_cmdline);
+
+#define RETBLEED_UNTRAIN_MSG "WARNING: BTB untrained return thunk mitigation is only effective on AMD/Hygon!\n"
+#define RETBLEED_INTEL_MSG "WARNING: Spectre v2 mitigation leaves CPU vulnerable to RETBleed attacks, data leaks possible!\n"
+
+static void __init retbleed_select_mitigation(void)
+{
+ bool mitigate_smt = false;
+
+ if (!boot_cpu_has_bug(X86_BUG_RETBLEED) || cpu_mitigations_off())
+ return;
+
+ switch (retbleed_cmd) {
+ case RETBLEED_CMD_OFF:
+ return;
+
+ case RETBLEED_CMD_UNRET:
+ if (IS_ENABLED(CONFIG_CPU_UNRET_ENTRY)) {
+ retbleed_mitigation = RETBLEED_MITIGATION_UNRET;
+ } else {
+ pr_err("WARNING: kernel not compiled with CPU_UNRET_ENTRY.\n");
+ goto do_cmd_auto;
+ }
+ break;
+
+ case RETBLEED_CMD_IBPB:
+ if (!boot_cpu_has(X86_FEATURE_IBPB)) {
+ pr_err("WARNING: CPU does not support IBPB.\n");
+ goto do_cmd_auto;
+ } else if (IS_ENABLED(CONFIG_CPU_IBPB_ENTRY)) {
+ retbleed_mitigation = RETBLEED_MITIGATION_IBPB;
+ } else {
+ pr_err("WARNING: kernel not compiled with CPU_IBPB_ENTRY.\n");
+ goto do_cmd_auto;
+ }
+ break;
+
+do_cmd_auto:
+ case RETBLEED_CMD_AUTO:
+ default:
+ if (boot_cpu_data.x86_vendor == X86_VENDOR_AMD ||
+ boot_cpu_data.x86_vendor == X86_VENDOR_HYGON) {
+ if (IS_ENABLED(CONFIG_CPU_UNRET_ENTRY))
+ retbleed_mitigation = RETBLEED_MITIGATION_UNRET;
+ else if (IS_ENABLED(CONFIG_CPU_IBPB_ENTRY) && boot_cpu_has(X86_FEATURE_IBPB))
+ retbleed_mitigation = RETBLEED_MITIGATION_IBPB;
+ }
+
+ /*
+ * The Intel mitigation (IBRS or eIBRS) was already selected in
+ * spectre_v2_select_mitigation(). 'retbleed_mitigation' will
+ * be set accordingly below.
+ */
+
+ break;
+ }
+
+ switch (retbleed_mitigation) {
+ case RETBLEED_MITIGATION_UNRET:
+ setup_force_cpu_cap(X86_FEATURE_RETHUNK);
+ setup_force_cpu_cap(X86_FEATURE_UNRET);
+
+ if (boot_cpu_data.x86_vendor != X86_VENDOR_AMD &&
+ boot_cpu_data.x86_vendor != X86_VENDOR_HYGON)
+ pr_err(RETBLEED_UNTRAIN_MSG);
+
+ mitigate_smt = true;
+ break;
+
+ case RETBLEED_MITIGATION_IBPB:
+ setup_force_cpu_cap(X86_FEATURE_ENTRY_IBPB);
+ mitigate_smt = true;
+ break;
+
+ default:
+ break;
+ }
+
+ if (mitigate_smt && !boot_cpu_has(X86_FEATURE_STIBP) &&
+ (retbleed_nosmt || cpu_mitigations_auto_nosmt()))
+ cpu_smt_disable(false);
+
+ /*
+ * Let IBRS trump all on Intel without affecting the effects of the
+ * retbleed= cmdline option.
+ */
+ if (boot_cpu_data.x86_vendor == X86_VENDOR_INTEL) {
+ switch (spectre_v2_enabled) {
+ case SPECTRE_V2_IBRS:
+ retbleed_mitigation = RETBLEED_MITIGATION_IBRS;
+ break;
+ case SPECTRE_V2_EIBRS:
+ case SPECTRE_V2_EIBRS_RETPOLINE:
+ case SPECTRE_V2_EIBRS_LFENCE:
+ retbleed_mitigation = RETBLEED_MITIGATION_EIBRS;
+ break;
+ default:
+ pr_err(RETBLEED_INTEL_MSG);
+ }
+ }
+
+ pr_info("%s\n", retbleed_strings[retbleed_mitigation]);
+}
+
+#undef pr_fmt
+#define pr_fmt(fmt) "Spectre V2 : " fmt
+
static enum spectre_v2_user_mitigation spectre_v2_user_stibp __ro_after_init =
SPECTRE_V2_USER_NONE;
static enum spectre_v2_user_mitigation spectre_v2_user_ibpb __ro_after_init =
@@ -787,6 +975,7 @@ static inline const char *spectre_v2_module_string(void) { return ""; }
#define SPECTRE_V2_LFENCE_MSG "WARNING: LFENCE mitigation is not recommended for this CPU, data leaks possible!\n"
#define SPECTRE_V2_EIBRS_EBPF_MSG "WARNING: Unprivileged eBPF is enabled with eIBRS on, data leaks possible via Spectre v2 BHB attacks!\n"
#define SPECTRE_V2_EIBRS_LFENCE_EBPF_SMT_MSG "WARNING: Unprivileged eBPF is enabled with eIBRS+LFENCE mitigation and SMT, data leaks possible via Spectre v2 BHB attacks!\n"
+#define SPECTRE_V2_IBRS_PERF_MSG "WARNING: IBRS mitigation selected on Enhanced IBRS CPU, this may cause unnecessary performance loss\n"
#ifdef CONFIG_BPF_SYSCALL
void unpriv_ebpf_notify(int new_state)
@@ -828,6 +1017,7 @@ enum spectre_v2_mitigation_cmd {
SPECTRE_V2_CMD_EIBRS,
SPECTRE_V2_CMD_EIBRS_RETPOLINE,
SPECTRE_V2_CMD_EIBRS_LFENCE,
+ SPECTRE_V2_CMD_IBRS,
};
enum spectre_v2_user_cmd {
@@ -868,13 +1058,15 @@ static void __init spec_v2_user_print_cond(const char *reason, bool secure)
pr_info("spectre_v2_user=%s forced on command line.\n", reason);
}
+static __ro_after_init enum spectre_v2_mitigation_cmd spectre_v2_cmd;
+
static enum spectre_v2_user_cmd __init
-spectre_v2_parse_user_cmdline(enum spectre_v2_mitigation_cmd v2_cmd)
+spectre_v2_parse_user_cmdline(void)
{
char arg[20];
int ret, i;
- switch (v2_cmd) {
+ switch (spectre_v2_cmd) {
case SPECTRE_V2_CMD_NONE:
return SPECTRE_V2_USER_CMD_NONE;
case SPECTRE_V2_CMD_FORCE:
@@ -900,15 +1092,16 @@ spectre_v2_parse_user_cmdline(enum spectre_v2_mitigation_cmd v2_cmd)
return SPECTRE_V2_USER_CMD_AUTO;
}
-static inline bool spectre_v2_in_eibrs_mode(enum spectre_v2_mitigation mode)
+static inline bool spectre_v2_in_ibrs_mode(enum spectre_v2_mitigation mode)
{
- return (mode == SPECTRE_V2_EIBRS ||
- mode == SPECTRE_V2_EIBRS_RETPOLINE ||
- mode == SPECTRE_V2_EIBRS_LFENCE);
+ return mode == SPECTRE_V2_IBRS ||
+ mode == SPECTRE_V2_EIBRS ||
+ mode == SPECTRE_V2_EIBRS_RETPOLINE ||
+ mode == SPECTRE_V2_EIBRS_LFENCE;
}
static void __init
-spectre_v2_user_select_mitigation(enum spectre_v2_mitigation_cmd v2_cmd)
+spectre_v2_user_select_mitigation(void)
{
enum spectre_v2_user_mitigation mode = SPECTRE_V2_USER_NONE;
bool smt_possible = IS_ENABLED(CONFIG_SMP);
@@ -921,7 +1114,7 @@ spectre_v2_user_select_mitigation(enum spectre_v2_mitigation_cmd v2_cmd)
cpu_smt_control == CPU_SMT_NOT_SUPPORTED)
smt_possible = false;
- cmd = spectre_v2_parse_user_cmdline(v2_cmd);
+ cmd = spectre_v2_parse_user_cmdline();
switch (cmd) {
case SPECTRE_V2_USER_CMD_NONE:
goto set_mode;
@@ -969,12 +1162,12 @@ spectre_v2_user_select_mitigation(enum spectre_v2_mitigation_cmd v2_cmd)
}
/*
- * If no STIBP, enhanced IBRS is enabled or SMT impossible, STIBP is not
- * required.
+ * If no STIBP, IBRS or enhanced IBRS is enabled, or SMT impossible,
+ * STIBP is not required.
*/
if (!boot_cpu_has(X86_FEATURE_STIBP) ||
!smt_possible ||
- spectre_v2_in_eibrs_mode(spectre_v2_enabled))
+ spectre_v2_in_ibrs_mode(spectre_v2_enabled))
return;
/*
@@ -986,6 +1179,13 @@ spectre_v2_user_select_mitigation(enum spectre_v2_mitigation_cmd v2_cmd)
boot_cpu_has(X86_FEATURE_AMD_STIBP_ALWAYS_ON))
mode = SPECTRE_V2_USER_STRICT_PREFERRED;
+ if (retbleed_mitigation == RETBLEED_MITIGATION_UNRET) {
+ if (mode != SPECTRE_V2_USER_STRICT &&
+ mode != SPECTRE_V2_USER_STRICT_PREFERRED)
+ pr_info("Selecting STIBP always-on mode to complement retbleed mitigation\n");
+ mode = SPECTRE_V2_USER_STRICT_PREFERRED;
+ }
+
spectre_v2_user_stibp = mode;
set_mode:
@@ -999,6 +1199,7 @@ static const char * const spectre_v2_strings[] = {
[SPECTRE_V2_EIBRS] = "Mitigation: Enhanced IBRS",
[SPECTRE_V2_EIBRS_LFENCE] = "Mitigation: Enhanced IBRS + LFENCE",
[SPECTRE_V2_EIBRS_RETPOLINE] = "Mitigation: Enhanced IBRS + Retpolines",
+ [SPECTRE_V2_IBRS] = "Mitigation: IBRS",
};
static const struct {
@@ -1016,6 +1217,7 @@ static const struct {
{ "eibrs,lfence", SPECTRE_V2_CMD_EIBRS_LFENCE, false },
{ "eibrs,retpoline", SPECTRE_V2_CMD_EIBRS_RETPOLINE, false },
{ "auto", SPECTRE_V2_CMD_AUTO, false },
+ { "ibrs", SPECTRE_V2_CMD_IBRS, false },
};
static void __init spec_v2_print_cond(const char *reason, bool secure)
@@ -1078,6 +1280,30 @@ static enum spectre_v2_mitigation_cmd __init spectre_v2_parse_cmdline(void)
return SPECTRE_V2_CMD_AUTO;
}
+ if (cmd == SPECTRE_V2_CMD_IBRS && !IS_ENABLED(CONFIG_CPU_IBRS_ENTRY)) {
+ pr_err("%s selected but not compiled in. Switching to AUTO select\n",
+ mitigation_options[i].option);
+ return SPECTRE_V2_CMD_AUTO;
+ }
+
+ if (cmd == SPECTRE_V2_CMD_IBRS && boot_cpu_data.x86_vendor != X86_VENDOR_INTEL) {
+ pr_err("%s selected but not Intel CPU. Switching to AUTO select\n",
+ mitigation_options[i].option);
+ return SPECTRE_V2_CMD_AUTO;
+ }
+
+ if (cmd == SPECTRE_V2_CMD_IBRS && !boot_cpu_has(X86_FEATURE_IBRS)) {
+ pr_err("%s selected but CPU doesn't have IBRS. Switching to AUTO select\n",
+ mitigation_options[i].option);
+ return SPECTRE_V2_CMD_AUTO;
+ }
+
+ if (cmd == SPECTRE_V2_CMD_IBRS && boot_cpu_has(X86_FEATURE_XENPV)) {
+ pr_err("%s selected but running as XenPV guest. Switching to AUTO select\n",
+ mitigation_options[i].option);
+ return SPECTRE_V2_CMD_AUTO;
+ }
+
spec_v2_print_cond(mitigation_options[i].option,
mitigation_options[i].secure);
return cmd;
@@ -1093,6 +1319,22 @@ static enum spectre_v2_mitigation __init spectre_v2_select_retpoline(void)
return SPECTRE_V2_RETPOLINE;
}
+/* Disable in-kernel use of non-RSB RET predictors */
+static void __init spec_ctrl_disable_kernel_rrsba(void)
+{
+ u64 ia32_cap;
+
+ if (!boot_cpu_has(X86_FEATURE_RRSBA_CTRL))
+ return;
+
+ ia32_cap = x86_read_arch_cap_msr();
+
+ if (ia32_cap & ARCH_CAP_RRSBA) {
+ x86_spec_ctrl_base |= SPEC_CTRL_RRSBA_DIS_S;
+ write_spec_ctrl_current(x86_spec_ctrl_base, true);
+ }
+}
+
static void __init spectre_v2_select_mitigation(void)
{
enum spectre_v2_mitigation_cmd cmd = spectre_v2_parse_cmdline();
@@ -1117,6 +1359,15 @@ static void __init spectre_v2_select_mitigation(void)
break;
}
+ if (IS_ENABLED(CONFIG_CPU_IBRS_ENTRY) &&
+ boot_cpu_has_bug(X86_BUG_RETBLEED) &&
+ retbleed_cmd != RETBLEED_CMD_OFF &&
+ boot_cpu_has(X86_FEATURE_IBRS) &&
+ boot_cpu_data.x86_vendor == X86_VENDOR_INTEL) {
+ mode = SPECTRE_V2_IBRS;
+ break;
+ }
+
mode = spectre_v2_select_retpoline();
break;
@@ -1133,6 +1384,10 @@ static void __init spectre_v2_select_mitigation(void)
mode = spectre_v2_select_retpoline();
break;
+ case SPECTRE_V2_CMD_IBRS:
+ mode = SPECTRE_V2_IBRS;
+ break;
+
case SPECTRE_V2_CMD_EIBRS:
mode = SPECTRE_V2_EIBRS;
break;
@@ -1149,10 +1404,9 @@ static void __init spectre_v2_select_mitigation(void)
if (mode == SPECTRE_V2_EIBRS && unprivileged_ebpf_enabled())
pr_err(SPECTRE_V2_EIBRS_EBPF_MSG);
- if (spectre_v2_in_eibrs_mode(mode)) {
- /* Force it so VMEXIT will restore correctly */
+ if (spectre_v2_in_ibrs_mode(mode)) {
x86_spec_ctrl_base |= SPEC_CTRL_IBRS;
- wrmsrl(MSR_IA32_SPEC_CTRL, x86_spec_ctrl_base);
+ write_spec_ctrl_current(x86_spec_ctrl_base, true);
}
switch (mode) {
@@ -1160,6 +1414,12 @@ static void __init spectre_v2_select_mitigation(void)
case SPECTRE_V2_EIBRS:
break;
+ case SPECTRE_V2_IBRS:
+ setup_force_cpu_cap(X86_FEATURE_KERNEL_IBRS);
+ if (boot_cpu_has(X86_FEATURE_IBRS_ENHANCED))
+ pr_warn(SPECTRE_V2_IBRS_PERF_MSG);
+ break;
+
case SPECTRE_V2_LFENCE:
case SPECTRE_V2_EIBRS_LFENCE:
setup_force_cpu_cap(X86_FEATURE_RETPOLINE_LFENCE);
@@ -1171,43 +1431,117 @@ static void __init spectre_v2_select_mitigation(void)
break;
}
+ /*
+ * Disable alternate RSB predictions in kernel when indirect CALLs and
+ * JMPs gets protection against BHI and Intramode-BTI, but RET
+ * prediction from a non-RSB predictor is still a risk.
+ */
+ if (mode == SPECTRE_V2_EIBRS_LFENCE ||
+ mode == SPECTRE_V2_EIBRS_RETPOLINE ||
+ mode == SPECTRE_V2_RETPOLINE)
+ spec_ctrl_disable_kernel_rrsba();
+
spectre_v2_enabled = mode;
pr_info("%s\n", spectre_v2_strings[mode]);
/*
- * If spectre v2 protection has been enabled, unconditionally fill
- * RSB during a context switch; this protects against two independent
- * issues:
+ * If Spectre v2 protection has been enabled, fill the RSB during a
+ * context switch. In general there are two types of RSB attacks
+ * across context switches, for which the CALLs/RETs may be unbalanced.
+ *
+ * 1) RSB underflow
+ *
+ * Some Intel parts have "bottomless RSB". When the RSB is empty,
+ * speculated return targets may come from the branch predictor,
+ * which could have a user-poisoned BTB or BHB entry.
*
- * - RSB underflow (and switch to BTB) on Skylake+
- * - SpectreRSB variant of spectre v2 on X86_BUG_SPECTRE_V2 CPUs
+ * AMD has it even worse: *all* returns are speculated from the BTB,
+ * regardless of the state of the RSB.
+ *
+ * When IBRS or eIBRS is enabled, the "user -> kernel" attack
+ * scenario is mitigated by the IBRS branch prediction isolation
+ * properties, so the RSB buffer filling wouldn't be necessary to
+ * protect against this type of attack.
+ *
+ * The "user -> user" attack scenario is mitigated by RSB filling.
+ *
+ * 2) Poisoned RSB entry
+ *
+ * If the 'next' in-kernel return stack is shorter than 'prev',
+ * 'next' could be tricked into speculating with a user-poisoned RSB
+ * entry.
+ *
+ * The "user -> kernel" attack scenario is mitigated by SMEP and
+ * eIBRS.
+ *
+ * The "user -> user" scenario, also known as SpectreBHB, requires
+ * RSB clearing.
+ *
+ * So to mitigate all cases, unconditionally fill RSB on context
+ * switches.
+ *
+ * FIXME: Is this pointless for retbleed-affected AMD?
*/
setup_force_cpu_cap(X86_FEATURE_RSB_CTXSW);
pr_info("Spectre v2 / SpectreRSB mitigation: Filling RSB on context switch\n");
/*
- * Retpoline means the kernel is safe because it has no indirect
- * branches. Enhanced IBRS protects firmware too, so, enable restricted
- * speculation around firmware calls only when Enhanced IBRS isn't
- * supported.
+ * Similar to context switches, there are two types of RSB attacks
+ * after vmexit:
+ *
+ * 1) RSB underflow
+ *
+ * 2) Poisoned RSB entry
+ *
+ * When retpoline is enabled, both are mitigated by filling/clearing
+ * the RSB.
+ *
+ * When IBRS is enabled, while #1 would be mitigated by the IBRS branch
+ * prediction isolation protections, RSB still needs to be cleared
+ * because of #2. Note that SMEP provides no protection here, unlike
+ * user-space-poisoned RSB entries.
+ *
+ * eIBRS, on the other hand, has RSB-poisoning protections, so it
+ * doesn't need RSB clearing after vmexit.
+ */
+ if (boot_cpu_has(X86_FEATURE_RETPOLINE) ||
+ boot_cpu_has(X86_FEATURE_KERNEL_IBRS))
+ setup_force_cpu_cap(X86_FEATURE_RSB_VMEXIT);
+
+ /*
+ * Retpoline protects the kernel, but doesn't protect firmware. IBRS
+ * and Enhanced IBRS protect firmware too, so enable IBRS around
+ * firmware calls only when IBRS / Enhanced IBRS aren't otherwise
+ * enabled.
*
* Use "mode" to check Enhanced IBRS instead of boot_cpu_has(), because
* the user might select retpoline on the kernel command line and if
* the CPU supports Enhanced IBRS, kernel might un-intentionally not
* enable IBRS around firmware calls.
*/
- if (boot_cpu_has(X86_FEATURE_IBRS) && !spectre_v2_in_eibrs_mode(mode)) {
+ if (boot_cpu_has_bug(X86_BUG_RETBLEED) &&
+ boot_cpu_has(X86_FEATURE_IBPB) &&
+ (boot_cpu_data.x86_vendor == X86_VENDOR_AMD ||
+ boot_cpu_data.x86_vendor == X86_VENDOR_HYGON)) {
+
+ if (retbleed_cmd != RETBLEED_CMD_IBPB) {
+ setup_force_cpu_cap(X86_FEATURE_USE_IBPB_FW);
+ pr_info("Enabling Speculation Barrier for firmware calls\n");
+ }
+
+ } else if (boot_cpu_has(X86_FEATURE_IBRS) && !spectre_v2_in_ibrs_mode(mode)) {
setup_force_cpu_cap(X86_FEATURE_USE_IBRS_FW);
pr_info("Enabling Restricted Speculation for firmware calls\n");
}
/* Set up IBPB and STIBP depending on the general spectre V2 command */
- spectre_v2_user_select_mitigation(cmd);
+ spectre_v2_cmd = cmd;
}
static void update_stibp_msr(void * __unused)
{
- wrmsrl(MSR_IA32_SPEC_CTRL, x86_spec_ctrl_base);
+ u64 val = spec_ctrl_current() | (x86_spec_ctrl_base & SPEC_CTRL_STIBP);
+ write_spec_ctrl_current(val, true);
}
/* Update x86_spec_ctrl_base in case SMT state changed. */
@@ -1424,16 +1758,6 @@ static enum ssb_mitigation __init __ssb_select_mitigation(void)
}
/*
- * If SSBD is controlled by the SPEC_CTRL MSR, then set the proper
- * bit in the mask to allow guests to use the mitigation even in the
- * case where the host does not enable it.
- */
- if (static_cpu_has(X86_FEATURE_SPEC_CTRL_SSBD) ||
- static_cpu_has(X86_FEATURE_AMD_SSBD)) {
- x86_spec_ctrl_mask |= SPEC_CTRL_SSBD;
- }
-
- /*
* We have three CPU feature flags that are in play here:
* - X86_BUG_SPEC_STORE_BYPASS - CPU is susceptible.
* - X86_FEATURE_SSBD - CPU is able to turn off speculative store bypass
@@ -1450,7 +1774,7 @@ static enum ssb_mitigation __init __ssb_select_mitigation(void)
x86_amd_ssb_disable();
} else {
x86_spec_ctrl_base |= SPEC_CTRL_SSBD;
- wrmsrl(MSR_IA32_SPEC_CTRL, x86_spec_ctrl_base);
+ write_spec_ctrl_current(x86_spec_ctrl_base, true);
}
}
@@ -1701,7 +2025,7 @@ int arch_prctl_spec_ctrl_get(struct task_struct *task, unsigned long which)
void x86_spec_ctrl_setup_ap(void)
{
if (boot_cpu_has(X86_FEATURE_MSR_SPEC_CTRL))
- wrmsrl(MSR_IA32_SPEC_CTRL, x86_spec_ctrl_base);
+ write_spec_ctrl_current(x86_spec_ctrl_base, true);
if (ssb_mode == SPEC_STORE_BYPASS_DISABLE)
x86_amd_ssb_disable();
@@ -1938,7 +2262,7 @@ static ssize_t mmio_stale_data_show_state(char *buf)
static char *stibp_state(void)
{
- if (spectre_v2_in_eibrs_mode(spectre_v2_enabled))
+ if (spectre_v2_in_ibrs_mode(spectre_v2_enabled))
return "";
switch (spectre_v2_user_stibp) {
@@ -1994,6 +2318,24 @@ static ssize_t srbds_show_state(char *buf)
return sprintf(buf, "%s\n", srbds_strings[srbds_mitigation]);
}
+static ssize_t retbleed_show_state(char *buf)
+{
+ if (retbleed_mitigation == RETBLEED_MITIGATION_UNRET) {
+ if (boot_cpu_data.x86_vendor != X86_VENDOR_AMD &&
+ boot_cpu_data.x86_vendor != X86_VENDOR_HYGON)
+ return sprintf(buf, "Vulnerable: untrained return thunk on non-Zen uarch\n");
+
+ return sprintf(buf, "%s; SMT %s\n",
+ retbleed_strings[retbleed_mitigation],
+ !sched_smt_active() ? "disabled" :
+ spectre_v2_user_stibp == SPECTRE_V2_USER_STRICT ||
+ spectre_v2_user_stibp == SPECTRE_V2_USER_STRICT_PREFERRED ?
+ "enabled with STIBP protection" : "vulnerable");
+ }
+
+ return sprintf(buf, "%s\n", retbleed_strings[retbleed_mitigation]);
+}
+
static ssize_t cpu_show_common(struct device *dev, struct device_attribute *attr,
char *buf, unsigned int bug)
{
@@ -2039,6 +2381,9 @@ static ssize_t cpu_show_common(struct device *dev, struct device_attribute *attr
case X86_BUG_MMIO_STALE_DATA:
return mmio_stale_data_show_state(buf);
+ case X86_BUG_RETBLEED:
+ return retbleed_show_state(buf);
+
default:
break;
}
@@ -2095,4 +2440,9 @@ ssize_t cpu_show_mmio_stale_data(struct device *dev, struct device_attribute *at
{
return cpu_show_common(dev, attr, buf, X86_BUG_MMIO_STALE_DATA);
}
+
+ssize_t cpu_show_retbleed(struct device *dev, struct device_attribute *attr, char *buf)
+{
+ return cpu_show_common(dev, attr, buf, X86_BUG_RETBLEED);
+}
#endif
diff --git a/arch/x86/kernel/cpu/common.c b/arch/x86/kernel/cpu/common.c
index 4730b0a58f24..736262a76a12 100644
--- a/arch/x86/kernel/cpu/common.c
+++ b/arch/x86/kernel/cpu/common.c
@@ -1205,48 +1205,60 @@ static const __initconst struct x86_cpu_id cpu_vuln_whitelist[] = {
{}
};
+#define VULNBL(vendor, family, model, blacklist) \
+ X86_MATCH_VENDOR_FAM_MODEL(vendor, family, model, blacklist)
+
#define VULNBL_INTEL_STEPPINGS(model, steppings, issues) \
X86_MATCH_VENDOR_FAM_MODEL_STEPPINGS_FEATURE(INTEL, 6, \
INTEL_FAM6_##model, steppings, \
X86_FEATURE_ANY, issues)
+#define VULNBL_AMD(family, blacklist) \
+ VULNBL(AMD, family, X86_MODEL_ANY, blacklist)
+
+#define VULNBL_HYGON(family, blacklist) \
+ VULNBL(HYGON, family, X86_MODEL_ANY, blacklist)
+
#define SRBDS BIT(0)
/* CPU is affected by X86_BUG_MMIO_STALE_DATA */
#define MMIO BIT(1)
/* CPU is affected by Shared Buffers Data Sampling (SBDS), a variant of X86_BUG_MMIO_STALE_DATA */
#define MMIO_SBDS BIT(2)
+/* CPU is affected by RETbleed, speculating where you would not expect it */
+#define RETBLEED BIT(3)
static const struct x86_cpu_id cpu_vuln_blacklist[] __initconst = {
VULNBL_INTEL_STEPPINGS(IVYBRIDGE, X86_STEPPING_ANY, SRBDS),
VULNBL_INTEL_STEPPINGS(HASWELL, X86_STEPPING_ANY, SRBDS),
VULNBL_INTEL_STEPPINGS(HASWELL_L, X86_STEPPING_ANY, SRBDS),
VULNBL_INTEL_STEPPINGS(HASWELL_G, X86_STEPPING_ANY, SRBDS),
- VULNBL_INTEL_STEPPINGS(HASWELL_X, BIT(2) | BIT(4), MMIO),
- VULNBL_INTEL_STEPPINGS(BROADWELL_D, X86_STEPPINGS(0x3, 0x5), MMIO),
+ VULNBL_INTEL_STEPPINGS(HASWELL_X, X86_STEPPING_ANY, MMIO),
+ VULNBL_INTEL_STEPPINGS(BROADWELL_D, X86_STEPPING_ANY, MMIO),
VULNBL_INTEL_STEPPINGS(BROADWELL_G, X86_STEPPING_ANY, SRBDS),
VULNBL_INTEL_STEPPINGS(BROADWELL_X, X86_STEPPING_ANY, MMIO),
VULNBL_INTEL_STEPPINGS(BROADWELL, X86_STEPPING_ANY, SRBDS),
- VULNBL_INTEL_STEPPINGS(SKYLAKE_L, X86_STEPPINGS(0x3, 0x3), SRBDS | MMIO),
- VULNBL_INTEL_STEPPINGS(SKYLAKE_L, X86_STEPPING_ANY, SRBDS),
- VULNBL_INTEL_STEPPINGS(SKYLAKE_X, BIT(3) | BIT(4) | BIT(6) |
- BIT(7) | BIT(0xB), MMIO),
- VULNBL_INTEL_STEPPINGS(SKYLAKE, X86_STEPPINGS(0x3, 0x3), SRBDS | MMIO),
- VULNBL_INTEL_STEPPINGS(SKYLAKE, X86_STEPPING_ANY, SRBDS),
- VULNBL_INTEL_STEPPINGS(KABYLAKE_L, X86_STEPPINGS(0x9, 0xC), SRBDS | MMIO),
- VULNBL_INTEL_STEPPINGS(KABYLAKE_L, X86_STEPPINGS(0x0, 0x8), SRBDS),
- VULNBL_INTEL_STEPPINGS(KABYLAKE, X86_STEPPINGS(0x9, 0xD), SRBDS | MMIO),
- VULNBL_INTEL_STEPPINGS(KABYLAKE, X86_STEPPINGS(0x0, 0x8), SRBDS),
- VULNBL_INTEL_STEPPINGS(ICELAKE_L, X86_STEPPINGS(0x5, 0x5), MMIO | MMIO_SBDS),
- VULNBL_INTEL_STEPPINGS(ICELAKE_D, X86_STEPPINGS(0x1, 0x1), MMIO),
- VULNBL_INTEL_STEPPINGS(ICELAKE_X, X86_STEPPINGS(0x4, 0x6), MMIO),
- VULNBL_INTEL_STEPPINGS(COMETLAKE, BIT(2) | BIT(3) | BIT(5), MMIO | MMIO_SBDS),
- VULNBL_INTEL_STEPPINGS(COMETLAKE_L, X86_STEPPINGS(0x1, 0x1), MMIO | MMIO_SBDS),
- VULNBL_INTEL_STEPPINGS(COMETLAKE_L, X86_STEPPINGS(0x0, 0x0), MMIO),
- VULNBL_INTEL_STEPPINGS(LAKEFIELD, X86_STEPPINGS(0x1, 0x1), MMIO | MMIO_SBDS),
- VULNBL_INTEL_STEPPINGS(ROCKETLAKE, X86_STEPPINGS(0x1, 0x1), MMIO),
- VULNBL_INTEL_STEPPINGS(ATOM_TREMONT, X86_STEPPINGS(0x1, 0x1), MMIO | MMIO_SBDS),
+ VULNBL_INTEL_STEPPINGS(SKYLAKE_L, X86_STEPPING_ANY, SRBDS | MMIO | RETBLEED),
+ VULNBL_INTEL_STEPPINGS(SKYLAKE_X, X86_STEPPING_ANY, MMIO | RETBLEED),
+ VULNBL_INTEL_STEPPINGS(SKYLAKE, X86_STEPPING_ANY, SRBDS | MMIO | RETBLEED),
+ VULNBL_INTEL_STEPPINGS(KABYLAKE_L, X86_STEPPING_ANY, SRBDS | MMIO | RETBLEED),
+ VULNBL_INTEL_STEPPINGS(KABYLAKE, X86_STEPPING_ANY, SRBDS | MMIO | RETBLEED),
+ VULNBL_INTEL_STEPPINGS(CANNONLAKE_L, X86_STEPPING_ANY, RETBLEED),
+ VULNBL_INTEL_STEPPINGS(ICELAKE_L, X86_STEPPING_ANY, MMIO | MMIO_SBDS | RETBLEED),
+ VULNBL_INTEL_STEPPINGS(ICELAKE_D, X86_STEPPING_ANY, MMIO),
+ VULNBL_INTEL_STEPPINGS(ICELAKE_X, X86_STEPPING_ANY, MMIO),
+ VULNBL_INTEL_STEPPINGS(COMETLAKE, X86_STEPPING_ANY, MMIO | MMIO_SBDS | RETBLEED),
+ VULNBL_INTEL_STEPPINGS(COMETLAKE_L, X86_STEPPINGS(0x0, 0x0), MMIO | RETBLEED),
+ VULNBL_INTEL_STEPPINGS(COMETLAKE_L, X86_STEPPING_ANY, MMIO | MMIO_SBDS | RETBLEED),
+ VULNBL_INTEL_STEPPINGS(LAKEFIELD, X86_STEPPING_ANY, MMIO | MMIO_SBDS | RETBLEED),
+ VULNBL_INTEL_STEPPINGS(ROCKETLAKE, X86_STEPPING_ANY, MMIO | RETBLEED),
+ VULNBL_INTEL_STEPPINGS(ATOM_TREMONT, X86_STEPPING_ANY, MMIO | MMIO_SBDS),
VULNBL_INTEL_STEPPINGS(ATOM_TREMONT_D, X86_STEPPING_ANY, MMIO),
- VULNBL_INTEL_STEPPINGS(ATOM_TREMONT_L, X86_STEPPINGS(0x0, 0x0), MMIO | MMIO_SBDS),
+ VULNBL_INTEL_STEPPINGS(ATOM_TREMONT_L, X86_STEPPING_ANY, MMIO | MMIO_SBDS),
+
+ VULNBL_AMD(0x15, RETBLEED),
+ VULNBL_AMD(0x16, RETBLEED),
+ VULNBL_AMD(0x17, RETBLEED),
+ VULNBL_HYGON(0x18, RETBLEED),
{}
};
@@ -1348,6 +1360,11 @@ static void __init cpu_set_bug_bits(struct cpuinfo_x86 *c)
!arch_cap_mmio_immune(ia32_cap))
setup_force_cpu_bug(X86_BUG_MMIO_STALE_DATA);
+ if (!cpu_has(c, X86_FEATURE_BTC_NO)) {
+ if (cpu_matches(cpu_vuln_blacklist, RETBLEED) || (ia32_cap & ARCH_CAP_RSBA))
+ setup_force_cpu_bug(X86_BUG_RETBLEED);
+ }
+
if (cpu_matches(cpu_vuln_whitelist, NO_MELTDOWN))
return;
diff --git a/arch/x86/kernel/cpu/cpu.h b/arch/x86/kernel/cpu/cpu.h
index 2a8e584fc991..7c9b5893c30a 100644
--- a/arch/x86/kernel/cpu/cpu.h
+++ b/arch/x86/kernel/cpu/cpu.h
@@ -61,6 +61,8 @@ static inline void tsx_init(void) { }
static inline void tsx_ap_init(void) { }
#endif /* CONFIG_CPU_SUP_INTEL */
+extern void init_spectral_chicken(struct cpuinfo_x86 *c);
+
extern void get_cpu_cap(struct cpuinfo_x86 *c);
extern void get_cpu_address_sizes(struct cpuinfo_x86 *c);
extern void cpu_detect_cache_sizes(struct cpuinfo_x86 *c);
diff --git a/arch/x86/kernel/cpu/cyrix.c b/arch/x86/kernel/cpu/cyrix.c
index 7227c15299d0..9651275aecd1 100644
--- a/arch/x86/kernel/cpu/cyrix.c
+++ b/arch/x86/kernel/cpu/cyrix.c
@@ -1,6 +1,7 @@
// SPDX-License-Identifier: GPL-2.0
#include <linux/bitops.h>
#include <linux/delay.h>
+#include <linux/isa-dma.h>
#include <linux/pci.h>
#include <asm/dma.h>
#include <linux/io.h>
diff --git a/arch/x86/kernel/cpu/feat_ctl.c b/arch/x86/kernel/cpu/feat_ctl.c
index da696eb4821a..993697e71854 100644
--- a/arch/x86/kernel/cpu/feat_ctl.c
+++ b/arch/x86/kernel/cpu/feat_ctl.c
@@ -15,6 +15,8 @@ enum vmx_feature_leafs {
MISC_FEATURES = 0,
PRIMARY_CTLS,
SECONDARY_CTLS,
+ TERTIARY_CTLS_LOW,
+ TERTIARY_CTLS_HIGH,
NR_VMX_FEATURE_WORDS,
};
@@ -22,7 +24,7 @@ enum vmx_feature_leafs {
static void init_vmx_capabilities(struct cpuinfo_x86 *c)
{
- u32 supported, funcs, ept, vpid, ign;
+ u32 supported, funcs, ept, vpid, ign, low, high;
BUILD_BUG_ON(NVMXINTS != NR_VMX_FEATURE_WORDS);
@@ -42,6 +44,11 @@ static void init_vmx_capabilities(struct cpuinfo_x86 *c)
rdmsr_safe(MSR_IA32_VMX_PROCBASED_CTLS2, &ign, &supported);
c->vmx_capability[SECONDARY_CTLS] = supported;
+ /* All 64 bits of tertiary controls MSR are allowed-1 settings. */
+ rdmsr_safe(MSR_IA32_VMX_PROCBASED_CTLS3, &low, &high);
+ c->vmx_capability[TERTIARY_CTLS_LOW] = low;
+ c->vmx_capability[TERTIARY_CTLS_HIGH] = high;
+
rdmsr(MSR_IA32_VMX_PINBASED_CTLS, ign, supported);
rdmsr_safe(MSR_IA32_VMX_VMFUNC, &ign, &funcs);
diff --git a/arch/x86/kernel/cpu/hygon.c b/arch/x86/kernel/cpu/hygon.c
index 3fcdda4c1e11..21fd425088fe 100644
--- a/arch/x86/kernel/cpu/hygon.c
+++ b/arch/x86/kernel/cpu/hygon.c
@@ -302,6 +302,12 @@ static void init_hygon(struct cpuinfo_x86 *c)
/* get apicid instead of initial apic id from cpuid */
c->apicid = hard_smp_processor_id();
+ /*
+ * XXX someone from Hygon needs to confirm this DTRT
+ *
+ init_spectral_chicken(c);
+ */
+
set_cpu_cap(c, X86_FEATURE_ZEN);
set_cpu_cap(c, X86_FEATURE_CPB);
diff --git a/arch/x86/kernel/cpu/intel.c b/arch/x86/kernel/cpu/intel.c
index fd5dead8371c..2d7ea5480ec3 100644
--- a/arch/x86/kernel/cpu/intel.c
+++ b/arch/x86/kernel/cpu/intel.c
@@ -682,9 +682,9 @@ static void init_intel(struct cpuinfo_x86 *c)
unsigned int l1, l2;
rdmsr(MSR_IA32_MISC_ENABLE, l1, l2);
- if (!(l1 & (1<<11)))
+ if (!(l1 & MSR_IA32_MISC_ENABLE_BTS_UNAVAIL))
set_cpu_cap(c, X86_FEATURE_BTS);
- if (!(l1 & (1<<12)))
+ if (!(l1 & MSR_IA32_MISC_ENABLE_PEBS_UNAVAIL))
set_cpu_cap(c, X86_FEATURE_PEBS);
}
@@ -1216,22 +1216,23 @@ static void bus_lock_init(void)
{
u64 val;
- /*
- * Warn and fatal are handled by #AC for split lock if #AC for
- * split lock is supported.
- */
- if (!boot_cpu_has(X86_FEATURE_BUS_LOCK_DETECT) ||
- (boot_cpu_has(X86_FEATURE_SPLIT_LOCK_DETECT) &&
- (sld_state == sld_warn || sld_state == sld_fatal)) ||
- sld_state == sld_off)
+ if (!boot_cpu_has(X86_FEATURE_BUS_LOCK_DETECT))
return;
- /*
- * Enable #DB for bus lock. All bus locks are handled in #DB except
- * split locks are handled in #AC in the fatal case.
- */
rdmsrl(MSR_IA32_DEBUGCTLMSR, val);
- val |= DEBUGCTLMSR_BUS_LOCK_DETECT;
+
+ if ((boot_cpu_has(X86_FEATURE_SPLIT_LOCK_DETECT) &&
+ (sld_state == sld_warn || sld_state == sld_fatal)) ||
+ sld_state == sld_off) {
+ /*
+ * Warn and fatal are handled by #AC for split lock if #AC for
+ * split lock is supported.
+ */
+ val &= ~DEBUGCTLMSR_BUS_LOCK_DETECT;
+ } else {
+ val |= DEBUGCTLMSR_BUS_LOCK_DETECT;
+ }
+
wrmsrl(MSR_IA32_DEBUGCTLMSR, val);
}
diff --git a/arch/x86/kernel/cpu/mce/inject.c b/arch/x86/kernel/cpu/mce/inject.c
index 5fbd7ffb3233..12cf2e7ca33c 100644
--- a/arch/x86/kernel/cpu/mce/inject.c
+++ b/arch/x86/kernel/cpu/mce/inject.c
@@ -33,6 +33,8 @@
#include "internal.h"
+static bool hw_injection_possible = true;
+
/*
* Collect all the MCi_XXX settings
*/
@@ -339,6 +341,8 @@ static int __set_inj(const char *buf)
for (i = 0; i < N_INJ_TYPES; i++) {
if (!strncmp(flags_options[i], buf, strlen(flags_options[i]))) {
+ if (i > SW_INJ && !hw_injection_possible)
+ continue;
inj_type = i;
return 0;
}
@@ -717,11 +721,54 @@ static void __init debugfs_init(void)
&i_mce, dfs_fls[i].fops);
}
+static void check_hw_inj_possible(void)
+{
+ int cpu;
+ u8 bank;
+
+ /*
+ * This behavior exists only on SMCA systems though its not directly
+ * related to SMCA.
+ */
+ if (!cpu_feature_enabled(X86_FEATURE_SMCA))
+ return;
+
+ cpu = get_cpu();
+
+ for (bank = 0; bank < MAX_NR_BANKS; ++bank) {
+ u64 status = MCI_STATUS_VAL, ipid;
+
+ /* Check whether bank is populated */
+ rdmsrl(MSR_AMD64_SMCA_MCx_IPID(bank), ipid);
+ if (!ipid)
+ continue;
+
+ toggle_hw_mce_inject(cpu, true);
+
+ wrmsrl_safe(mca_msr_reg(bank, MCA_STATUS), status);
+ rdmsrl_safe(mca_msr_reg(bank, MCA_STATUS), &status);
+
+ if (!status) {
+ hw_injection_possible = false;
+ pr_warn("Platform does not allow *hardware* error injection."
+ "Try using APEI EINJ instead.\n");
+ }
+
+ toggle_hw_mce_inject(cpu, false);
+
+ break;
+ }
+
+ put_cpu();
+}
+
static int __init inject_init(void)
{
if (!alloc_cpumask_var(&mce_inject_cpumask, GFP_KERNEL))
return -ENOMEM;
+ check_hw_inj_possible();
+
debugfs_init();
register_nmi_handler(NMI_LOCAL, mce_raise_notify, 0, "mce_notify");
diff --git a/arch/x86/kernel/cpu/mce/internal.h b/arch/x86/kernel/cpu/mce/internal.h
index 4ae0e603f7fa..7e03f5b7f6bd 100644
--- a/arch/x86/kernel/cpu/mce/internal.h
+++ b/arch/x86/kernel/cpu/mce/internal.h
@@ -211,7 +211,7 @@ noinstr u64 mce_rdmsrl(u32 msr);
static __always_inline u32 mca_msr_reg(int bank, enum mca_msr reg)
{
- if (mce_flags.smca) {
+ if (cpu_feature_enabled(X86_FEATURE_SMCA)) {
switch (reg) {
case MCA_CTL: return MSR_AMD64_SMCA_MCx_CTL(bank);
case MCA_ADDR: return MSR_AMD64_SMCA_MCx_ADDR(bank);
diff --git a/arch/x86/kernel/cpu/rdrand.c b/arch/x86/kernel/cpu/rdrand.c
index c4be62058dd9..26a427fa84ea 100644
--- a/arch/x86/kernel/cpu/rdrand.c
+++ b/arch/x86/kernel/cpu/rdrand.c
@@ -11,56 +11,39 @@
#include <asm/archrandom.h>
#include <asm/sections.h>
-static int __init x86_rdrand_setup(char *s)
-{
- setup_clear_cpu_cap(X86_FEATURE_RDRAND);
- setup_clear_cpu_cap(X86_FEATURE_RDSEED);
- return 1;
-}
-__setup("nordrand", x86_rdrand_setup);
-
/*
* RDRAND has Built-In-Self-Test (BIST) that runs on every invocation.
- * Run the instruction a few times as a sanity check.
- * If it fails, it is simple to disable RDRAND here.
+ * Run the instruction a few times as a sanity check. Also make sure
+ * it's not outputting the same value over and over, which has happened
+ * as a result of past CPU bugs.
+ *
+ * If it fails, it is simple to disable RDRAND and RDSEED here.
*/
-#define SANITY_CHECK_LOOPS 8
-#ifdef CONFIG_ARCH_RANDOM
void x86_init_rdrand(struct cpuinfo_x86 *c)
{
- unsigned int changed = 0;
- unsigned long tmp, prev;
- int i;
+ enum { SAMPLES = 8, MIN_CHANGE = 5 };
+ unsigned long sample, prev;
+ bool failure = false;
+ size_t i, changed;
if (!cpu_has(c, X86_FEATURE_RDRAND))
return;
- for (i = 0; i < SANITY_CHECK_LOOPS; i++) {
- if (!rdrand_long(&tmp)) {
- clear_cpu_cap(c, X86_FEATURE_RDRAND);
- pr_warn_once("rdrand: disabled\n");
- return;
+ for (changed = 0, i = 0; i < SAMPLES; ++i) {
+ if (!rdrand_long(&sample)) {
+ failure = true;
+ break;
}
+ changed += i && sample != prev;
+ prev = sample;
}
+ if (changed < MIN_CHANGE)
+ failure = true;
- /*
- * Stupid sanity-check whether RDRAND does *actually* generate
- * some at least random-looking data.
- */
- prev = tmp;
- for (i = 0; i < SANITY_CHECK_LOOPS; i++) {
- if (rdrand_long(&tmp)) {
- if (prev != tmp)
- changed++;
-
- prev = tmp;
- }
+ if (failure) {
+ clear_cpu_cap(c, X86_FEATURE_RDRAND);
+ clear_cpu_cap(c, X86_FEATURE_RDSEED);
+ pr_emerg("RDRAND is not reliable on this platform; disabling.\n");
}
-
- if (WARN_ON_ONCE(!changed))
- pr_emerg(
-"RDRAND gives funky smelling output, might consider not using it by booting with \"nordrand\"");
-
}
-#endif
diff --git a/arch/x86/kernel/cpu/scattered.c b/arch/x86/kernel/cpu/scattered.c
index dbaa8326d6f2..fd44b54c90d5 100644
--- a/arch/x86/kernel/cpu/scattered.c
+++ b/arch/x86/kernel/cpu/scattered.c
@@ -27,6 +27,7 @@ static const struct cpuid_bit cpuid_bits[] = {
{ X86_FEATURE_APERFMPERF, CPUID_ECX, 0, 0x00000006, 0 },
{ X86_FEATURE_EPB, CPUID_ECX, 3, 0x00000006, 0 },
{ X86_FEATURE_INTEL_PPIN, CPUID_EBX, 0, 0x00000007, 1 },
+ { X86_FEATURE_RRSBA_CTRL, CPUID_EDX, 2, 0x00000007, 2 },
{ X86_FEATURE_CQM_LLC, CPUID_EDX, 1, 0x0000000f, 0 },
{ X86_FEATURE_CQM_OCCUP_LLC, CPUID_EDX, 0, 0x0000000f, 1 },
{ X86_FEATURE_CQM_MBM_TOTAL, CPUID_EDX, 1, 0x0000000f, 1 },
diff --git a/arch/x86/kernel/cpu/sgx/encl.c b/arch/x86/kernel/cpu/sgx/encl.c
index 19876ebfb504..24c1bb8eb196 100644
--- a/arch/x86/kernel/cpu/sgx/encl.c
+++ b/arch/x86/kernel/cpu/sgx/encl.c
@@ -232,25 +232,10 @@ static struct sgx_epc_page *sgx_encl_eldu(struct sgx_encl_page *encl_page,
return epc_page;
}
-static struct sgx_encl_page *sgx_encl_load_page(struct sgx_encl *encl,
- unsigned long addr,
- unsigned long vm_flags)
+static struct sgx_encl_page *__sgx_encl_load_page(struct sgx_encl *encl,
+ struct sgx_encl_page *entry)
{
- unsigned long vm_prot_bits = vm_flags & (VM_READ | VM_WRITE | VM_EXEC);
struct sgx_epc_page *epc_page;
- struct sgx_encl_page *entry;
-
- entry = xa_load(&encl->page_array, PFN_DOWN(addr));
- if (!entry)
- return ERR_PTR(-EFAULT);
-
- /*
- * Verify that the faulted page has equal or higher build time
- * permissions than the VMA permissions (i.e. the subset of {VM_READ,
- * VM_WRITE, VM_EXECUTE} in vma->vm_flags).
- */
- if ((entry->vm_max_prot_bits & vm_prot_bits) != vm_prot_bits)
- return ERR_PTR(-EFAULT);
/* Entry successfully located. */
if (entry->epc_page) {
@@ -276,6 +261,146 @@ static struct sgx_encl_page *sgx_encl_load_page(struct sgx_encl *encl,
return entry;
}
+static struct sgx_encl_page *sgx_encl_load_page_in_vma(struct sgx_encl *encl,
+ unsigned long addr,
+ unsigned long vm_flags)
+{
+ unsigned long vm_prot_bits = vm_flags & (VM_READ | VM_WRITE | VM_EXEC);
+ struct sgx_encl_page *entry;
+
+ entry = xa_load(&encl->page_array, PFN_DOWN(addr));
+ if (!entry)
+ return ERR_PTR(-EFAULT);
+
+ /*
+ * Verify that the page has equal or higher build time
+ * permissions than the VMA permissions (i.e. the subset of {VM_READ,
+ * VM_WRITE, VM_EXECUTE} in vma->vm_flags).
+ */
+ if ((entry->vm_max_prot_bits & vm_prot_bits) != vm_prot_bits)
+ return ERR_PTR(-EFAULT);
+
+ return __sgx_encl_load_page(encl, entry);
+}
+
+struct sgx_encl_page *sgx_encl_load_page(struct sgx_encl *encl,
+ unsigned long addr)
+{
+ struct sgx_encl_page *entry;
+
+ entry = xa_load(&encl->page_array, PFN_DOWN(addr));
+ if (!entry)
+ return ERR_PTR(-EFAULT);
+
+ return __sgx_encl_load_page(encl, entry);
+}
+
+/**
+ * sgx_encl_eaug_page() - Dynamically add page to initialized enclave
+ * @vma: VMA obtained from fault info from where page is accessed
+ * @encl: enclave accessing the page
+ * @addr: address that triggered the page fault
+ *
+ * When an initialized enclave accesses a page with no backing EPC page
+ * on a SGX2 system then the EPC can be added dynamically via the SGX2
+ * ENCLS[EAUG] instruction.
+ *
+ * Returns: Appropriate vm_fault_t: VM_FAULT_NOPAGE when PTE was installed
+ * successfully, VM_FAULT_SIGBUS or VM_FAULT_OOM as error otherwise.
+ */
+static vm_fault_t sgx_encl_eaug_page(struct vm_area_struct *vma,
+ struct sgx_encl *encl, unsigned long addr)
+{
+ vm_fault_t vmret = VM_FAULT_SIGBUS;
+ struct sgx_pageinfo pginfo = {0};
+ struct sgx_encl_page *encl_page;
+ struct sgx_epc_page *epc_page;
+ struct sgx_va_page *va_page;
+ unsigned long phys_addr;
+ u64 secinfo_flags;
+ int ret;
+
+ if (!test_bit(SGX_ENCL_INITIALIZED, &encl->flags))
+ return VM_FAULT_SIGBUS;
+
+ /*
+ * Ignore internal permission checking for dynamically added pages.
+ * They matter only for data added during the pre-initialization
+ * phase. The enclave decides the permissions by the means of
+ * EACCEPT, EACCEPTCOPY and EMODPE.
+ */
+ secinfo_flags = SGX_SECINFO_R | SGX_SECINFO_W | SGX_SECINFO_X;
+ encl_page = sgx_encl_page_alloc(encl, addr - encl->base, secinfo_flags);
+ if (IS_ERR(encl_page))
+ return VM_FAULT_OOM;
+
+ mutex_lock(&encl->lock);
+
+ epc_page = sgx_alloc_epc_page(encl_page, false);
+ if (IS_ERR(epc_page)) {
+ if (PTR_ERR(epc_page) == -EBUSY)
+ vmret = VM_FAULT_NOPAGE;
+ goto err_out_unlock;
+ }
+
+ va_page = sgx_encl_grow(encl, false);
+ if (IS_ERR(va_page))
+ goto err_out_epc;
+
+ if (va_page)
+ list_add(&va_page->list, &encl->va_pages);
+
+ ret = xa_insert(&encl->page_array, PFN_DOWN(encl_page->desc),
+ encl_page, GFP_KERNEL);
+ /*
+ * If ret == -EBUSY then page was created in another flow while
+ * running without encl->lock
+ */
+ if (ret)
+ goto err_out_shrink;
+
+ pginfo.secs = (unsigned long)sgx_get_epc_virt_addr(encl->secs.epc_page);
+ pginfo.addr = encl_page->desc & PAGE_MASK;
+ pginfo.metadata = 0;
+
+ ret = __eaug(&pginfo, sgx_get_epc_virt_addr(epc_page));
+ if (ret)
+ goto err_out;
+
+ encl_page->encl = encl;
+ encl_page->epc_page = epc_page;
+ encl_page->type = SGX_PAGE_TYPE_REG;
+ encl->secs_child_cnt++;
+
+ sgx_mark_page_reclaimable(encl_page->epc_page);
+
+ phys_addr = sgx_get_epc_phys_addr(epc_page);
+ /*
+ * Do not undo everything when creating PTE entry fails - next #PF
+ * would find page ready for a PTE.
+ */
+ vmret = vmf_insert_pfn(vma, addr, PFN_DOWN(phys_addr));
+ if (vmret != VM_FAULT_NOPAGE) {
+ mutex_unlock(&encl->lock);
+ return VM_FAULT_SIGBUS;
+ }
+ mutex_unlock(&encl->lock);
+ return VM_FAULT_NOPAGE;
+
+err_out:
+ xa_erase(&encl->page_array, PFN_DOWN(encl_page->desc));
+
+err_out_shrink:
+ sgx_encl_shrink(encl, va_page);
+err_out_epc:
+ sgx_encl_free_epc_page(epc_page);
+err_out_unlock:
+ mutex_unlock(&encl->lock);
+ kfree(encl_page);
+
+ return vmret;
+}
+
static vm_fault_t sgx_vma_fault(struct vm_fault *vmf)
{
unsigned long addr = (unsigned long)vmf->address;
@@ -295,9 +420,20 @@ static vm_fault_t sgx_vma_fault(struct vm_fault *vmf)
if (unlikely(!encl))
return VM_FAULT_SIGBUS;
+ /*
+ * The page_array keeps track of all enclave pages, whether they
+ * are swapped out or not. If there is no entry for this page and
+ * the system supports SGX2 then it is possible to dynamically add
+ * a new enclave page. This is only possible for an initialized
+ * enclave that will be checked for right away.
+ */
+ if (cpu_feature_enabled(X86_FEATURE_SGX2) &&
+ (!xa_load(&encl->page_array, PFN_DOWN(addr))))
+ return sgx_encl_eaug_page(vma, encl, addr);
+
mutex_lock(&encl->lock);
- entry = sgx_encl_load_page(encl, addr, vma->vm_flags);
+ entry = sgx_encl_load_page_in_vma(encl, addr, vma->vm_flags);
if (IS_ERR(entry)) {
mutex_unlock(&encl->lock);
@@ -367,6 +503,11 @@ int sgx_encl_may_map(struct sgx_encl *encl, unsigned long start,
XA_STATE(xas, &encl->page_array, PFN_DOWN(start));
+ /* Disallow mapping outside enclave's address range. */
+ if (test_bit(SGX_ENCL_INITIALIZED, &encl->flags) &&
+ (start < encl->base || end > encl->base + encl->size))
+ return -EACCES;
+
/*
* Disallow READ_IMPLIES_EXEC tasks as their VMA permissions might
* conflict with the enclave page permissions.
@@ -445,7 +586,7 @@ static struct sgx_encl_page *sgx_encl_reserve_page(struct sgx_encl *encl,
for ( ; ; ) {
mutex_lock(&encl->lock);
- entry = sgx_encl_load_page(encl, addr, vm_flags);
+ entry = sgx_encl_load_page_in_vma(encl, addr, vm_flags);
if (PTR_ERR(entry) != -EBUSY)
break;
@@ -687,7 +828,7 @@ int sgx_encl_mm_add(struct sgx_encl *encl, struct mm_struct *mm)
spin_lock(&encl->mm_lock);
list_add_rcu(&encl_mm->list, &encl->mm_list);
- /* Pairs with smp_rmb() in sgx_reclaimer_block(). */
+ /* Pairs with smp_rmb() in sgx_zap_enclave_ptes(). */
smp_wmb();
encl->mm_list_version++;
spin_unlock(&encl->mm_lock);
@@ -695,6 +836,73 @@ int sgx_encl_mm_add(struct sgx_encl *encl, struct mm_struct *mm)
return 0;
}
+/**
+ * sgx_encl_cpumask() - Query which CPUs might be accessing the enclave
+ * @encl: the enclave
+ *
+ * Some SGX functions require that no cached linear-to-physical address
+ * mappings are present before they can succeed. For example, ENCLS[EWB]
+ * copies a page from the enclave page cache to regular main memory but
+ * it fails if it cannot ensure that there are no cached
+ * linear-to-physical address mappings referring to the page.
+ *
+ * SGX hardware flushes all cached linear-to-physical mappings on a CPU
+ * when an enclave is exited via ENCLU[EEXIT] or an Asynchronous Enclave
+ * Exit (AEX). Exiting an enclave will thus ensure cached linear-to-physical
+ * address mappings are cleared but coordination with the tracking done within
+ * the SGX hardware is needed to support the SGX functions that depend on this
+ * cache clearing.
+ *
+ * When the ENCLS[ETRACK] function is issued on an enclave the hardware
+ * tracks threads operating inside the enclave at that time. The SGX
+ * hardware tracking require that all the identified threads must have
+ * exited the enclave in order to flush the mappings before a function such
+ * as ENCLS[EWB] will be permitted
+ *
+ * The following flow is used to support SGX functions that require that
+ * no cached linear-to-physical address mappings are present:
+ * 1) Execute ENCLS[ETRACK] to initiate hardware tracking.
+ * 2) Use this function (sgx_encl_cpumask()) to query which CPUs might be
+ * accessing the enclave.
+ * 3) Send IPI to identified CPUs, kicking them out of the enclave and
+ * thus flushing all locally cached linear-to-physical address mappings.
+ * 4) Execute SGX function.
+ *
+ * Context: It is required to call this function after ENCLS[ETRACK].
+ * This will ensure that if any new mm appears (racing with
+ * sgx_encl_mm_add()) then the new mm will enter into the
+ * enclave with fresh linear-to-physical address mappings.
+ *
+ * It is required that all IPIs are completed before a new
+ * ENCLS[ETRACK] is issued so be sure to protect steps 1 to 3
+ * of the above flow with the enclave's mutex.
+ *
+ * Return: cpumask of CPUs that might be accessing @encl
+ */
+const cpumask_t *sgx_encl_cpumask(struct sgx_encl *encl)
+{
+ cpumask_t *cpumask = &encl->cpumask;
+ struct sgx_encl_mm *encl_mm;
+ int idx;
+
+ cpumask_clear(cpumask);
+
+ idx = srcu_read_lock(&encl->srcu);
+
+ list_for_each_entry_rcu(encl_mm, &encl->mm_list, list) {
+ if (!mmget_not_zero(encl_mm->mm))
+ continue;
+
+ cpumask_or(cpumask, cpumask, mm_cpumask(encl_mm->mm));
+
+ mmput_async(encl_mm->mm);
+ }
+
+ srcu_read_unlock(&encl->srcu, idx);
+
+ return cpumask;
+}
+
static struct page *sgx_encl_get_backing_page(struct sgx_encl *encl,
pgoff_t index)
{
@@ -735,7 +943,6 @@ static int sgx_encl_get_backing(struct sgx_encl *encl, unsigned long page_index,
return PTR_ERR(pcmd);
}
- backing->page_index = page_index;
backing->contents = contents;
backing->pcmd = pcmd;
backing->pcmd_offset = page_pcmd_off & (PAGE_SIZE - 1);
@@ -902,8 +1109,85 @@ int sgx_encl_test_and_clear_young(struct mm_struct *mm,
return ret;
}
+struct sgx_encl_page *sgx_encl_page_alloc(struct sgx_encl *encl,
+ unsigned long offset,
+ u64 secinfo_flags)
+{
+ struct sgx_encl_page *encl_page;
+ unsigned long prot;
+
+ encl_page = kzalloc(sizeof(*encl_page), GFP_KERNEL);
+ if (!encl_page)
+ return ERR_PTR(-ENOMEM);
+
+ encl_page->desc = encl->base + offset;
+ encl_page->encl = encl;
+
+ prot = _calc_vm_trans(secinfo_flags, SGX_SECINFO_R, PROT_READ) |
+ _calc_vm_trans(secinfo_flags, SGX_SECINFO_W, PROT_WRITE) |
+ _calc_vm_trans(secinfo_flags, SGX_SECINFO_X, PROT_EXEC);
+
+ /*
+ * TCS pages must always RW set for CPU access while the SECINFO
+ * permissions are *always* zero - the CPU ignores the user provided
+ * values and silently overwrites them with zero permissions.
+ */
+ if ((secinfo_flags & SGX_SECINFO_PAGE_TYPE_MASK) == SGX_SECINFO_TCS)
+ prot |= PROT_READ | PROT_WRITE;
+
+ /* Calculate maximum of the VM flags for the page. */
+ encl_page->vm_max_prot_bits = calc_vm_prot_bits(prot, 0);
+
+ return encl_page;
+}
+
+/**
+ * sgx_zap_enclave_ptes() - remove PTEs mapping the address from enclave
+ * @encl: the enclave
+ * @addr: page aligned pointer to single page for which PTEs will be removed
+ *
+ * Multiple VMAs may have an enclave page mapped. Remove the PTE mapping
+ * @addr from each VMA. Ensure that page fault handler is ready to handle
+ * new mappings of @addr before calling this function.
+ */
+void sgx_zap_enclave_ptes(struct sgx_encl *encl, unsigned long addr)
+{
+ unsigned long mm_list_version;
+ struct sgx_encl_mm *encl_mm;
+ struct vm_area_struct *vma;
+ int idx, ret;
+
+ do {
+ mm_list_version = encl->mm_list_version;
+
+ /* Pairs with smp_wmb() in sgx_encl_mm_add(). */
+ smp_rmb();
+
+ idx = srcu_read_lock(&encl->srcu);
+
+ list_for_each_entry_rcu(encl_mm, &encl->mm_list, list) {
+ if (!mmget_not_zero(encl_mm->mm))
+ continue;
+
+ mmap_read_lock(encl_mm->mm);
+
+ ret = sgx_encl_find(encl_mm->mm, addr, &vma);
+ if (!ret && encl == vma->vm_private_data)
+ zap_vma_ptes(vma, addr, PAGE_SIZE);
+
+ mmap_read_unlock(encl_mm->mm);
+
+ mmput_async(encl_mm->mm);
+ }
+
+ srcu_read_unlock(&encl->srcu, idx);
+ } while (unlikely(encl->mm_list_version != mm_list_version));
+}
+
/**
* sgx_alloc_va_page() - Allocate a Version Array (VA) page
+ * @reclaim: Reclaim EPC pages directly if none available. Enclave
+ * mutex should not be held if this is set.
*
* Allocate a free EPC page and convert it to a Version Array (VA) page.
*
@@ -911,12 +1195,12 @@ int sgx_encl_test_and_clear_young(struct mm_struct *mm,
* a VA page,
* -errno otherwise
*/
-struct sgx_epc_page *sgx_alloc_va_page(void)
+struct sgx_epc_page *sgx_alloc_va_page(bool reclaim)
{
struct sgx_epc_page *epc_page;
int ret;
- epc_page = sgx_alloc_epc_page(NULL, true);
+ epc_page = sgx_alloc_epc_page(NULL, reclaim);
if (IS_ERR(epc_page))
return ERR_CAST(epc_page);
diff --git a/arch/x86/kernel/cpu/sgx/encl.h b/arch/x86/kernel/cpu/sgx/encl.h
index 332ef3568267..a65a952116fd 100644
--- a/arch/x86/kernel/cpu/sgx/encl.h
+++ b/arch/x86/kernel/cpu/sgx/encl.h
@@ -27,7 +27,8 @@
struct sgx_encl_page {
unsigned long desc;
- unsigned long vm_max_prot_bits;
+ unsigned long vm_max_prot_bits:8;
+ enum sgx_page_type type:16;
struct sgx_epc_page *epc_page;
struct sgx_encl *encl;
struct sgx_va_page *va_page;
@@ -78,7 +79,6 @@ struct sgx_va_page {
};
struct sgx_backing {
- pgoff_t page_index;
struct page *contents;
struct page *pcmd;
unsigned long pcmd_offset;
@@ -106,6 +106,7 @@ int sgx_encl_may_map(struct sgx_encl *encl, unsigned long start,
bool current_is_ksgxd(void);
void sgx_encl_release(struct kref *ref);
int sgx_encl_mm_add(struct sgx_encl *encl, struct mm_struct *mm);
+const cpumask_t *sgx_encl_cpumask(struct sgx_encl *encl);
int sgx_encl_lookup_backing(struct sgx_encl *encl, unsigned long page_index,
struct sgx_backing *backing);
int sgx_encl_alloc_backing(struct sgx_encl *encl, unsigned long page_index,
@@ -113,11 +114,18 @@ int sgx_encl_alloc_backing(struct sgx_encl *encl, unsigned long page_index,
void sgx_encl_put_backing(struct sgx_backing *backing);
int sgx_encl_test_and_clear_young(struct mm_struct *mm,
struct sgx_encl_page *page);
-
-struct sgx_epc_page *sgx_alloc_va_page(void);
+struct sgx_encl_page *sgx_encl_page_alloc(struct sgx_encl *encl,
+ unsigned long offset,
+ u64 secinfo_flags);
+void sgx_zap_enclave_ptes(struct sgx_encl *encl, unsigned long addr);
+struct sgx_epc_page *sgx_alloc_va_page(bool reclaim);
unsigned int sgx_alloc_va_slot(struct sgx_va_page *va_page);
void sgx_free_va_slot(struct sgx_va_page *va_page, unsigned int offset);
bool sgx_va_page_full(struct sgx_va_page *va_page);
void sgx_encl_free_epc_page(struct sgx_epc_page *page);
+struct sgx_encl_page *sgx_encl_load_page(struct sgx_encl *encl,
+ unsigned long addr);
+struct sgx_va_page *sgx_encl_grow(struct sgx_encl *encl, bool reclaim);
+void sgx_encl_shrink(struct sgx_encl *encl, struct sgx_va_page *va_page);
#endif /* _X86_ENCL_H */
diff --git a/arch/x86/kernel/cpu/sgx/encls.h b/arch/x86/kernel/cpu/sgx/encls.h
index fa04a73daf9c..99004b02e2ed 100644
--- a/arch/x86/kernel/cpu/sgx/encls.h
+++ b/arch/x86/kernel/cpu/sgx/encls.h
@@ -136,57 +136,71 @@ static inline bool encls_failed(int ret)
ret; \
})
+/* Initialize an EPC page into an SGX Enclave Control Structure (SECS) page. */
static inline int __ecreate(struct sgx_pageinfo *pginfo, void *secs)
{
return __encls_2(ECREATE, pginfo, secs);
}
+/* Hash a 256 byte region of an enclave page to SECS:MRENCLAVE. */
static inline int __eextend(void *secs, void *addr)
{
return __encls_2(EEXTEND, secs, addr);
}
+/*
+ * Associate an EPC page to an enclave either as a REG or TCS page
+ * populated with the provided data.
+ */
static inline int __eadd(struct sgx_pageinfo *pginfo, void *addr)
{
return __encls_2(EADD, pginfo, addr);
}
+/* Finalize enclave build, initialize enclave for user code execution. */
static inline int __einit(void *sigstruct, void *token, void *secs)
{
return __encls_ret_3(EINIT, sigstruct, secs, token);
}
+/* Disassociate EPC page from its enclave and mark it as unused. */
static inline int __eremove(void *addr)
{
return __encls_ret_1(EREMOVE, addr);
}
+/* Copy data to an EPC page belonging to a debug enclave. */
static inline int __edbgwr(void *addr, unsigned long *data)
{
return __encls_2(EDGBWR, *data, addr);
}
+/* Copy data from an EPC page belonging to a debug enclave. */
static inline int __edbgrd(void *addr, unsigned long *data)
{
return __encls_1_1(EDGBRD, *data, addr);
}
+/* Track that software has completed the required TLB address clears. */
static inline int __etrack(void *addr)
{
return __encls_ret_1(ETRACK, addr);
}
+/* Load, verify, and unblock an EPC page. */
static inline int __eldu(struct sgx_pageinfo *pginfo, void *addr,
void *va)
{
return __encls_ret_3(ELDU, pginfo, addr, va);
}
+/* Make EPC page inaccessible to enclave, ready to be written to memory. */
static inline int __eblock(void *addr)
{
return __encls_ret_1(EBLOCK, addr);
}
+/* Initialize an EPC page into a Version Array (VA) page. */
static inline int __epa(void *addr)
{
unsigned long rbx = SGX_PAGE_TYPE_VA;
@@ -194,10 +208,29 @@ static inline int __epa(void *addr)
return __encls_2(EPA, rbx, addr);
}
+/* Invalidate an EPC page and write it out to main memory. */
static inline int __ewb(struct sgx_pageinfo *pginfo, void *addr,
void *va)
{
return __encls_ret_3(EWB, pginfo, addr, va);
}
+/* Restrict the EPCM permissions of an EPC page. */
+static inline int __emodpr(struct sgx_secinfo *secinfo, void *addr)
+{
+ return __encls_ret_2(EMODPR, secinfo, addr);
+}
+
+/* Change the type of an EPC page. */
+static inline int __emodt(struct sgx_secinfo *secinfo, void *addr)
+{
+ return __encls_ret_2(EMODT, secinfo, addr);
+}
+
+/* Zero a page of EPC memory and add it to an initialized enclave. */
+static inline int __eaug(struct sgx_pageinfo *pginfo, void *addr)
+{
+ return __encls_2(EAUG, pginfo, addr);
+}
+
#endif /* _X86_ENCLS_H */
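For reference, the three wrappers added above correspond to the SGX2 leaf functions of ENCLS; the leaf numbers below follow the Intel SDM and are assumed to match the kernel's enum sgx_encls_function (standalone listing, illustrative only):

#include <stdio.h>

struct encls_leaf { const char *name; unsigned int leaf; int sgx2; };

int main(void)
{
	static const struct encls_leaf leaves[] = {
		{ "ECREATE", 0x00, 0 }, { "EADD",    0x01, 0 },
		{ "EINIT",   0x02, 0 }, { "EREMOVE", 0x03, 0 },
		{ "EDGBRD",  0x04, 0 }, { "EDGBWR",  0x05, 0 },
		{ "EEXTEND", 0x06, 0 }, { "ELDU",    0x08, 0 },
		{ "EBLOCK",  0x09, 0 }, { "EPA",     0x0A, 0 },
		{ "EWB",     0x0B, 0 }, { "ETRACK",  0x0C, 0 },
		{ "EAUG",    0x0D, 1 }, { "EMODPR",  0x0E, 1 },
		{ "EMODT",   0x0F, 1 },
	};

	for (size_t i = 0; i < sizeof(leaves) / sizeof(leaves[0]); i++)
		printf("%-8s leaf 0x%02x%s\n", leaves[i].name, leaves[i].leaf,
		       leaves[i].sgx2 ? "  (SGX2)" : "");
	return 0;
}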
diff --git a/arch/x86/kernel/cpu/sgx/ioctl.c b/arch/x86/kernel/cpu/sgx/ioctl.c
index 83df20e3e633..ebe79d60619f 100644
--- a/arch/x86/kernel/cpu/sgx/ioctl.c
+++ b/arch/x86/kernel/cpu/sgx/ioctl.c
@@ -17,7 +17,7 @@
#include "encl.h"
#include "encls.h"
-static struct sgx_va_page *sgx_encl_grow(struct sgx_encl *encl)
+struct sgx_va_page *sgx_encl_grow(struct sgx_encl *encl, bool reclaim)
{
struct sgx_va_page *va_page = NULL;
void *err;
@@ -30,7 +30,7 @@ static struct sgx_va_page *sgx_encl_grow(struct sgx_encl *encl)
if (!va_page)
return ERR_PTR(-ENOMEM);
- va_page->epc_page = sgx_alloc_va_page();
+ va_page->epc_page = sgx_alloc_va_page(reclaim);
if (IS_ERR(va_page->epc_page)) {
err = ERR_CAST(va_page->epc_page);
kfree(va_page);
@@ -43,7 +43,7 @@ static struct sgx_va_page *sgx_encl_grow(struct sgx_encl *encl)
return va_page;
}
-static void sgx_encl_shrink(struct sgx_encl *encl, struct sgx_va_page *va_page)
+void sgx_encl_shrink(struct sgx_encl *encl, struct sgx_va_page *va_page)
{
encl->page_cnt--;
@@ -64,7 +64,7 @@ static int sgx_encl_create(struct sgx_encl *encl, struct sgx_secs *secs)
struct file *backing;
long ret;
- va_page = sgx_encl_grow(encl);
+ va_page = sgx_encl_grow(encl, true);
if (IS_ERR(va_page))
return PTR_ERR(va_page);
else if (va_page)
@@ -107,6 +107,7 @@ static int sgx_encl_create(struct sgx_encl *encl, struct sgx_secs *secs)
set_bit(SGX_ENCL_DEBUG, &encl->flags);
encl->secs.encl = encl;
+ encl->secs.type = SGX_PAGE_TYPE_SECS;
encl->base = secs->base;
encl->size = secs->size;
encl->attributes = secs->attributes;
@@ -168,38 +169,6 @@ static long sgx_ioc_enclave_create(struct sgx_encl *encl, void __user *arg)
return ret;
}
-static struct sgx_encl_page *sgx_encl_page_alloc(struct sgx_encl *encl,
- unsigned long offset,
- u64 secinfo_flags)
-{
- struct sgx_encl_page *encl_page;
- unsigned long prot;
-
- encl_page = kzalloc(sizeof(*encl_page), GFP_KERNEL);
- if (!encl_page)
- return ERR_PTR(-ENOMEM);
-
- encl_page->desc = encl->base + offset;
- encl_page->encl = encl;
-
- prot = _calc_vm_trans(secinfo_flags, SGX_SECINFO_R, PROT_READ) |
- _calc_vm_trans(secinfo_flags, SGX_SECINFO_W, PROT_WRITE) |
- _calc_vm_trans(secinfo_flags, SGX_SECINFO_X, PROT_EXEC);
-
- /*
- * TCS pages must always RW set for CPU access while the SECINFO
- * permissions are *always* zero - the CPU ignores the user provided
- * values and silently overwrites them with zero permissions.
- */
- if ((secinfo_flags & SGX_SECINFO_PAGE_TYPE_MASK) == SGX_SECINFO_TCS)
- prot |= PROT_READ | PROT_WRITE;
-
- /* Calculate maximum of the VM flags for the page. */
- encl_page->vm_max_prot_bits = calc_vm_prot_bits(prot, 0);
-
- return encl_page;
-}
-
static int sgx_validate_secinfo(struct sgx_secinfo *secinfo)
{
u64 perm = secinfo->flags & SGX_SECINFO_PERMISSION_MASK;
@@ -306,7 +275,7 @@ static int sgx_encl_add_page(struct sgx_encl *encl, unsigned long src,
return PTR_ERR(epc_page);
}
- va_page = sgx_encl_grow(encl);
+ va_page = sgx_encl_grow(encl, true);
if (IS_ERR(va_page)) {
ret = PTR_ERR(va_page);
goto err_out_free;
@@ -344,6 +313,7 @@ static int sgx_encl_add_page(struct sgx_encl *encl, unsigned long src,
*/
encl_page->encl = encl;
encl_page->epc_page = epc_page;
+ encl_page->type = (secinfo->flags & SGX_SECINFO_PAGE_TYPE_MASK) >> 8;
encl->secs_child_cnt++;
if (flags & SGX_PAGE_MEASURE) {
@@ -372,6 +342,26 @@ err_out_free:
return ret;
}
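The new encl_page->type assignment above relies on the SECINFO flags layout: permission bits occupy the low byte and the page type occupies bits 15:8, which is why the type is extracted with a shift by 8 here and rebuilt with a shift by 8 in sgx_enclave_modify_types() below. A standalone sketch of that encoding (constants mirror the expected uapi values and should be treated as illustrative):

#include <assert.h>
#include <stdint.h>
#include <stdio.h>

#define SGX_SECINFO_R			0x01
#define SGX_SECINFO_W			0x02
#define SGX_SECINFO_X			0x04
#define SGX_SECINFO_PAGE_TYPE_MASK	0xFF00
#define SGX_PAGE_TYPE_REG		2
#define SGX_PAGE_TYPE_TRIM		4

int main(void)
{
	/* What SGX_IOC_ENCLAVE_ADD_PAGES sees for an RW regular page. */
	uint64_t flags = SGX_SECINFO_R | SGX_SECINFO_W |
			 ((uint64_t)SGX_PAGE_TYPE_REG << 8);

	/* encl_page->type = (secinfo->flags & SGX_SECINFO_PAGE_TYPE_MASK) >> 8 */
	unsigned int type = (flags & SGX_SECINFO_PAGE_TYPE_MASK) >> 8;
	assert(type == SGX_PAGE_TYPE_REG);

	/* sgx_enclave_modify_types() rebuilds secinfo.flags the other way. */
	uint64_t trim_flags = (uint64_t)SGX_PAGE_TYPE_TRIM << 8;
	assert(((trim_flags & SGX_SECINFO_PAGE_TYPE_MASK) >> 8) == SGX_PAGE_TYPE_TRIM);

	printf("page type %u, trim secinfo flags 0x%llx\n",
	       type, (unsigned long long)trim_flags);
	return 0;
}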
+/*
+ * Ensure user provided offset and length values are valid for
+ * an enclave.
+ */
+static int sgx_validate_offset_length(struct sgx_encl *encl,
+ unsigned long offset,
+ unsigned long length)
+{
+ if (!IS_ALIGNED(offset, PAGE_SIZE))
+ return -EINVAL;
+
+ if (!length || !IS_ALIGNED(length, PAGE_SIZE))
+ return -EINVAL;
+
+ if (offset + length - PAGE_SIZE >= encl->size)
+ return -EINVAL;
+
+ return 0;
+}
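sgx_validate_offset_length() accepts only page-aligned, non-zero ranges whose last page still lies inside the enclave; the offset + length - PAGE_SIZE comparison checks the start of the final page rather than the first byte past the range. A standalone mirror of the check with a few probe values (hard-coded PAGE_SIZE and enclave size, for illustration only):

#include <stdio.h>

#define PAGE_SIZE 4096UL

static int validate_offset_length(unsigned long encl_size,
				  unsigned long offset, unsigned long length)
{
	if (offset % PAGE_SIZE)
		return -1;		/* -EINVAL: unaligned offset */

	if (!length || (length % PAGE_SIZE))
		return -1;		/* -EINVAL: empty or unaligned length */

	/* The last page of the range must still be inside the enclave. */
	if (offset + length - PAGE_SIZE >= encl_size)
		return -1;

	return 0;
}

int main(void)
{
	unsigned long encl_size = 16 * PAGE_SIZE;

	printf("%d\n", validate_offset_length(encl_size, PAGE_SIZE, 4 * PAGE_SIZE));      /* 0: valid range */
	printf("%d\n", validate_offset_length(encl_size, PAGE_SIZE, 0));                  /* -1: empty length */
	printf("%d\n", validate_offset_length(encl_size, 15 * PAGE_SIZE, PAGE_SIZE));     /* 0: last page */
	printf("%d\n", validate_offset_length(encl_size, 15 * PAGE_SIZE, 2 * PAGE_SIZE)); /* -1: runs past the end */
	return 0;
}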
+
/**
* sgx_ioc_enclave_add_pages() - The handler for %SGX_IOC_ENCLAVE_ADD_PAGES
* @encl: an enclave pointer
@@ -425,14 +415,10 @@ static long sgx_ioc_enclave_add_pages(struct sgx_encl *encl, void __user *arg)
if (copy_from_user(&add_arg, arg, sizeof(add_arg)))
return -EFAULT;
- if (!IS_ALIGNED(add_arg.offset, PAGE_SIZE) ||
- !IS_ALIGNED(add_arg.src, PAGE_SIZE))
- return -EINVAL;
-
- if (!add_arg.length || add_arg.length & (PAGE_SIZE - 1))
+ if (!IS_ALIGNED(add_arg.src, PAGE_SIZE))
return -EINVAL;
- if (add_arg.offset + add_arg.length - PAGE_SIZE >= encl->size)
+ if (sgx_validate_offset_length(encl, add_arg.offset, add_arg.length))
return -EINVAL;
if (copy_from_user(&secinfo, (void __user *)add_arg.secinfo,
@@ -674,6 +660,565 @@ static long sgx_ioc_enclave_provision(struct sgx_encl *encl, void __user *arg)
return sgx_set_attribute(&encl->attributes_mask, params.fd);
}
+/*
+ * Ensure the enclave is ready for SGX2 functions. Readiness is checked
+ * by verifying that the hardware supports SGX2 and that the enclave is
+ * initialized and thus able to handle requests to modify pages within it.
+ */
+static int sgx_ioc_sgx2_ready(struct sgx_encl *encl)
+{
+ if (!(cpu_feature_enabled(X86_FEATURE_SGX2)))
+ return -ENODEV;
+
+ if (!test_bit(SGX_ENCL_INITIALIZED, &encl->flags))
+ return -EINVAL;
+
+ return 0;
+}
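sgx_ioc_sgx2_ready() gates the new ioctls on X86_FEATURE_SGX2 and on the enclave already being initialized. From userspace, hardware SGX2 support can be probed via CPUID leaf 0x12 as sketched below (x86 only, GCC/Clang cpuid.h assumed; the CPUID bit reflects hardware capability and does not guarantee the kernel has SGX enabled):

#include <cpuid.h>
#include <stdio.h>

int main(void)
{
	unsigned int eax, ebx, ecx, edx;

	/* CPUID.(EAX=07H, ECX=0):EBX[2] enumerates SGX support. */
	if (!__get_cpuid_count(0x7, 0, &eax, &ebx, &ecx, &edx) || !(ebx & (1u << 2))) {
		printf("SGX not enumerated\n");
		return 1;
	}

	/* CPUID.(EAX=12H, ECX=0):EAX[0] = SGX1, EAX[1] = SGX2. */
	if (!__get_cpuid_count(0x12, 0, &eax, &ebx, &ecx, &edx)) {
		printf("SGX leaf not available\n");
		return 1;
	}

	printf("SGX1: %s, SGX2: %s\n",
	       (eax & 1u) ? "yes" : "no",
	       (eax & 2u) ? "yes" : "no");
	return 0;
}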
+
+/*
+ * Some SGX functions require that no cached linear-to-physical address
+ * mappings are present before they can succeed. Collaborate with
+ * hardware via ENCLS[ETRACK] to ensure that all cached
+ * linear-to-physical address mappings belonging to all threads of
+ * the enclave are cleared. See sgx_encl_cpumask() for details.
+ *
+ * Must be called with enclave's mutex held from the time the
+ * SGX function requiring that no cached linear-to-physical mappings
+ * are present is executed until this ETRACK flow is complete.
+ */
+static int sgx_enclave_etrack(struct sgx_encl *encl)
+{
+ void *epc_virt;
+ int ret;
+
+ epc_virt = sgx_get_epc_virt_addr(encl->secs.epc_page);
+ ret = __etrack(epc_virt);
+ if (ret) {
+ /*
+ * ETRACK only fails when there is an OS issue. For
+		 * example, two consecutive ETRACKs were sent without a
+		 * completing IPI in between.
+ */
+		pr_err_once("ETRACK returned %d (0x%x)\n", ret, ret);
+ /*
+ * Send IPIs to kick CPUs out of the enclave and
+ * try ETRACK again.
+ */
+ on_each_cpu_mask(sgx_encl_cpumask(encl), sgx_ipi_cb, NULL, 1);
+ ret = __etrack(epc_virt);
+ if (ret) {
+			pr_err_once("ETRACK repeat returned %d (0x%x)\n",
+ ret, ret);
+ return -EFAULT;
+ }
+ }
+ on_each_cpu_mask(sgx_encl_cpumask(encl), sgx_ipi_cb, NULL, 1);
+
+ return 0;
+}
+
+/**
+ * sgx_enclave_restrict_permissions() - Restrict EPCM permissions
+ * @encl: Enclave to which the pages belong.
+ * @modp: Checked parameters from user on which pages need modifying and
+ * their new permissions.
+ *
+ * Return:
+ * - 0: Success.
+ * - -errno: Otherwise.
+ */
+static long
+sgx_enclave_restrict_permissions(struct sgx_encl *encl,
+ struct sgx_enclave_restrict_permissions *modp)
+{
+ struct sgx_encl_page *entry;
+ struct sgx_secinfo secinfo;
+ unsigned long addr;
+ unsigned long c;
+ void *epc_virt;
+ int ret;
+
+ memset(&secinfo, 0, sizeof(secinfo));
+ secinfo.flags = modp->permissions & SGX_SECINFO_PERMISSION_MASK;
+
+ for (c = 0 ; c < modp->length; c += PAGE_SIZE) {
+ addr = encl->base + modp->offset + c;
+
+ sgx_reclaim_direct();
+
+ mutex_lock(&encl->lock);
+
+ entry = sgx_encl_load_page(encl, addr);
+ if (IS_ERR(entry)) {
+ ret = PTR_ERR(entry) == -EBUSY ? -EAGAIN : -EFAULT;
+ goto out_unlock;
+ }
+
+ /*
+ * Changing EPCM permissions is only supported on regular
+ * SGX pages. Attempting this change on other pages will
+ * result in #PF.
+ */
+ if (entry->type != SGX_PAGE_TYPE_REG) {
+ ret = -EINVAL;
+ goto out_unlock;
+ }
+
+ /*
+ * Apart from ensuring that read-access remains, do not verify
+		 * the permission bits requested. The kernel has no control
+		 * over how EPCM permissions can be relaxed from within the
+		 * enclave. ENCLS[EMODPR] can only remove existing EPCM
+		 * permissions; attempting to set new permissions will be
+		 * ignored by the hardware.
+ */
+
+ /* Change EPCM permissions. */
+ epc_virt = sgx_get_epc_virt_addr(entry->epc_page);
+ ret = __emodpr(&secinfo, epc_virt);
+ if (encls_faulted(ret)) {
+ /*
+			 * All possible faults should be avoidable: the
+			 * parameters have been checked, only the permissions
+			 * of a regular page are changed, and no SGX1/SGX2
+			 * ENCLS instructions can run concurrently because
+			 * they are serialized by the enclave's mutex.
+ */
+ pr_err_once("EMODPR encountered exception %d\n",
+ ENCLS_TRAPNR(ret));
+ ret = -EFAULT;
+ goto out_unlock;
+ }
+ if (encls_failed(ret)) {
+ modp->result = ret;
+ ret = -EFAULT;
+ goto out_unlock;
+ }
+
+ ret = sgx_enclave_etrack(encl);
+ if (ret) {
+ ret = -EFAULT;
+ goto out_unlock;
+ }
+
+ mutex_unlock(&encl->lock);
+ }
+
+ ret = 0;
+ goto out;
+
+out_unlock:
+ mutex_unlock(&encl->lock);
+out:
+ modp->count = c;
+
+ return ret;
+}
+
+/**
+ * sgx_ioc_enclave_restrict_permissions() - handler for
+ * %SGX_IOC_ENCLAVE_RESTRICT_PERMISSIONS
+ * @encl: an enclave pointer
+ * @arg: userspace pointer to a &struct sgx_enclave_restrict_permissions
+ * instance
+ *
+ * SGX2 distinguishes between relaxing and restricting the enclave page
+ * permissions maintained by the hardware (EPCM permissions) of pages
+ * belonging to an initialized enclave (after SGX_IOC_ENCLAVE_INIT).
+ *
+ * EPCM permissions cannot be restricted from within the enclave; the enclave
+ * requires the kernel to run the privileged level 0 instructions ENCLS[EMODPR]
+ * and ENCLS[ETRACK]. An attempt to relax EPCM permissions with this call
+ * will be ignored by the hardware.
+ *
+ * Return:
+ * - 0: Success
+ * - -errno: Otherwise
+ */
+static long sgx_ioc_enclave_restrict_permissions(struct sgx_encl *encl,
+ void __user *arg)
+{
+ struct sgx_enclave_restrict_permissions params;
+ long ret;
+
+ ret = sgx_ioc_sgx2_ready(encl);
+ if (ret)
+ return ret;
+
+ if (copy_from_user(&params, arg, sizeof(params)))
+ return -EFAULT;
+
+ if (sgx_validate_offset_length(encl, params.offset, params.length))
+ return -EINVAL;
+
+ if (params.permissions & ~SGX_SECINFO_PERMISSION_MASK)
+ return -EINVAL;
+
+ /*
+ * Fail early if invalid permissions requested to prevent ENCLS[EMODPR]
+ * from faulting later when the CPU does the same check.
+ */
+ if ((params.permissions & SGX_SECINFO_W) &&
+ !(params.permissions & SGX_SECINFO_R))
+ return -EINVAL;
+
+ if (params.result || params.count)
+ return -EINVAL;
+
+ ret = sgx_enclave_restrict_permissions(encl, &params);
+
+ if (copy_to_user(arg, &params, sizeof(params)))
+ return -EFAULT;
+
+ return ret;
+}
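A hedged userspace sketch of driving SGX_IOC_ENCLAVE_RESTRICT_PERMISSIONS follows. It assumes the uapi definitions added alongside this series are installed as asm/sgx.h; the enclave fd, offset and length are placeholders, and the enclave must still run ENCLU[EACCEPT] on each page before the restricted permissions take effect inside it:

/* Sketch only: restrict a page range to read-only EPCM permissions.
 * Assumes the SGX2 uapi additions from this series are installed as
 * <asm/sgx.h>; enclave_fd, offset and length are hypothetical placeholders.
 */
#include <asm/sgx.h>
#include <errno.h>
#include <stdio.h>
#include <string.h>
#include <sys/ioctl.h>

#ifndef SGX_SECINFO_R
#define SGX_SECINFO_R 0x01ULL	/* SECINFO read-permission bit */
#endif

static int restrict_to_read_only(int enclave_fd, unsigned long offset,
				 unsigned long length)
{
	struct sgx_enclave_restrict_permissions params;

	memset(&params, 0, sizeof(params));
	params.offset = offset;			/* page-aligned, inside the enclave */
	params.length = length;			/* page-aligned, non-zero */
	params.permissions = SGX_SECINFO_R;	/* EMODPR can only drop permissions */

	if (ioctl(enclave_fd, SGX_IOC_ENCLAVE_RESTRICT_PERMISSIONS, &params)) {
		fprintf(stderr, "restrict failed at byte %llu: %s (EMODPR result %llu)\n",
			(unsigned long long)params.count, strerror(errno),
			(unsigned long long)params.result);
		return -1;
	}

	/* The enclave must now EACCEPT each page before relying on the change. */
	return 0;
}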
+
+/**
+ * sgx_enclave_modify_types() - Modify type of SGX enclave pages
+ * @encl: Enclave to which the pages belong.
+ * @modt: Checked parameters from user about which pages need modifying
+ * and their new page type.
+ *
+ * Return:
+ * - 0: Success
+ * - -errno: Otherwise
+ */
+static long sgx_enclave_modify_types(struct sgx_encl *encl,
+ struct sgx_enclave_modify_types *modt)
+{
+ unsigned long max_prot_restore;
+ enum sgx_page_type page_type;
+ struct sgx_encl_page *entry;
+ struct sgx_secinfo secinfo;
+ unsigned long prot;
+ unsigned long addr;
+ unsigned long c;
+ void *epc_virt;
+ int ret;
+
+ page_type = modt->page_type & SGX_PAGE_TYPE_MASK;
+
+ /*
+ * The only new page types allowed by hardware are PT_TCS and PT_TRIM.
+ */
+ if (page_type != SGX_PAGE_TYPE_TCS && page_type != SGX_PAGE_TYPE_TRIM)
+ return -EINVAL;
+
+ memset(&secinfo, 0, sizeof(secinfo));
+
+ secinfo.flags = page_type << 8;
+
+ for (c = 0 ; c < modt->length; c += PAGE_SIZE) {
+ addr = encl->base + modt->offset + c;
+
+ sgx_reclaim_direct();
+
+ mutex_lock(&encl->lock);
+
+ entry = sgx_encl_load_page(encl, addr);
+ if (IS_ERR(entry)) {
+ ret = PTR_ERR(entry) == -EBUSY ? -EAGAIN : -EFAULT;
+ goto out_unlock;
+ }
+
+ /*
+ * Borrow the logic from the Intel SDM. Regular pages
+ * (SGX_PAGE_TYPE_REG) can change type to SGX_PAGE_TYPE_TCS
+ * or SGX_PAGE_TYPE_TRIM but TCS pages can only be trimmed.
+		 * CET pages are not yet supported.
+ */
+ if (!(entry->type == SGX_PAGE_TYPE_REG ||
+ (entry->type == SGX_PAGE_TYPE_TCS &&
+ page_type == SGX_PAGE_TYPE_TRIM))) {
+ ret = -EINVAL;
+ goto out_unlock;
+ }
+
+ max_prot_restore = entry->vm_max_prot_bits;
+
+ /*
+ * Once a regular page becomes a TCS page it cannot be
+		 * changed back. So the maximum allowed protection reflects
+		 * the TCS page, which is always RW from the kernel's
+		 * perspective but inaccessible from within the enclave.
+		 * Before doing so, make sure that the new page type
+		 * continues to respect the originally vetted page
+		 * permissions.
+ */
+ if (entry->type == SGX_PAGE_TYPE_REG &&
+ page_type == SGX_PAGE_TYPE_TCS) {
+ if (~entry->vm_max_prot_bits & (VM_READ | VM_WRITE)) {
+ ret = -EPERM;
+ goto out_unlock;
+ }
+ prot = PROT_READ | PROT_WRITE;
+ entry->vm_max_prot_bits = calc_vm_prot_bits(prot, 0);
+
+ /*
+ * Prevent page from being reclaimed while mutex
+ * is released.
+ */
+ if (sgx_unmark_page_reclaimable(entry->epc_page)) {
+ ret = -EAGAIN;
+ goto out_entry_changed;
+ }
+
+ /*
+ * Do not keep encl->lock because of dependency on
+ * mmap_lock acquired in sgx_zap_enclave_ptes().
+ */
+ mutex_unlock(&encl->lock);
+
+ sgx_zap_enclave_ptes(encl, addr);
+
+ mutex_lock(&encl->lock);
+
+ sgx_mark_page_reclaimable(entry->epc_page);
+ }
+
+ /* Change EPC type */
+ epc_virt = sgx_get_epc_virt_addr(entry->epc_page);
+ ret = __emodt(&secinfo, epc_virt);
+ if (encls_faulted(ret)) {
+ /*
+			 * All possible faults should be avoidable: the
+			 * parameters have been checked, only valid page
+			 * types are changed, and no SGX1/SGX2 ENCLS
+			 * instructions can run concurrently because they
+			 * are serialized by the enclave's mutex.
+ */
+ pr_err_once("EMODT encountered exception %d\n",
+ ENCLS_TRAPNR(ret));
+ ret = -EFAULT;
+ goto out_entry_changed;
+ }
+ if (encls_failed(ret)) {
+ modt->result = ret;
+ ret = -EFAULT;
+ goto out_entry_changed;
+ }
+
+ ret = sgx_enclave_etrack(encl);
+ if (ret) {
+ ret = -EFAULT;
+ goto out_unlock;
+ }
+
+ entry->type = page_type;
+
+ mutex_unlock(&encl->lock);
+ }
+
+ ret = 0;
+ goto out;
+
+out_entry_changed:
+ entry->vm_max_prot_bits = max_prot_restore;
+out_unlock:
+ mutex_unlock(&encl->lock);
+out:
+ modt->count = c;
+
+ return ret;
+}
+
+/**
+ * sgx_ioc_enclave_modify_types() - handler for %SGX_IOC_ENCLAVE_MODIFY_TYPES
+ * @encl: an enclave pointer
+ * @arg: userspace pointer to a &struct sgx_enclave_modify_types instance
+ *
+ * The ability to change the enclave page type supports the following use cases:
+ *
+ * * It is possible to add TCS pages to an enclave by changing the type of
+ * regular pages (%SGX_PAGE_TYPE_REG) to TCS (%SGX_PAGE_TYPE_TCS) pages.
+ * With this support the number of threads supported by an initialized
+ * enclave can be increased dynamically.
+ *
+ * * Regular or TCS pages can dynamically be removed from an initialized
+ * enclave by changing the page type to %SGX_PAGE_TYPE_TRIM. Changing the
+ * page type to %SGX_PAGE_TYPE_TRIM marks the page for removal with actual
+ * removal done by handler of %SGX_IOC_ENCLAVE_REMOVE_PAGES ioctl() called
+ * after ENCLU[EACCEPT] is run on %SGX_PAGE_TYPE_TRIM page from within the
+ * enclave.
+ *
+ * Return:
+ * - 0: Success
+ * - -errno: Otherwise
+ */
+static long sgx_ioc_enclave_modify_types(struct sgx_encl *encl,
+ void __user *arg)
+{
+ struct sgx_enclave_modify_types params;
+ long ret;
+
+ ret = sgx_ioc_sgx2_ready(encl);
+ if (ret)
+ return ret;
+
+ if (copy_from_user(&params, arg, sizeof(params)))
+ return -EFAULT;
+
+ if (sgx_validate_offset_length(encl, params.offset, params.length))
+ return -EINVAL;
+
+ if (params.page_type & ~SGX_PAGE_TYPE_MASK)
+ return -EINVAL;
+
+ if (params.result || params.count)
+ return -EINVAL;
+
+ ret = sgx_enclave_modify_types(encl, &params);
+
+ if (copy_to_user(arg, &params, sizeof(params)))
+ return -EFAULT;
+
+ return ret;
+}
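A similar hedged sketch for SGX_IOC_ENCLAVE_MODIFY_TYPES, converting one dynamically added regular page into a TCS page (asm/sgx.h from this series assumed; the fd and offset are placeholders, and the enclave must EACCEPT the page as a TCS before a thread can EENTER through it):

/* Sketch only: turn one RW regular page into a TCS page at runtime.
 * Assumes <asm/sgx.h> from this series; enclave_fd and tcs_offset are
 * hypothetical placeholders.
 */
#include <asm/sgx.h>
#include <errno.h>
#include <stdio.h>
#include <string.h>
#include <sys/ioctl.h>

#ifndef SGX_PAGE_TYPE_TCS
#define SGX_PAGE_TYPE_TCS 1	/* architectural page type encoding */
#endif

static int make_tcs_page(int enclave_fd, unsigned long tcs_offset)
{
	struct sgx_enclave_modify_types params;

	memset(&params, 0, sizeof(params));
	params.offset = tcs_offset;		/* page-aligned offset of an RW PT_REG page */
	params.length = 4096;
	params.page_type = SGX_PAGE_TYPE_TCS;

	if (ioctl(enclave_fd, SGX_IOC_ENCLAVE_MODIFY_TYPES, &params)) {
		fprintf(stderr, "modify_types failed at byte %llu: %s (EMODT result %llu)\n",
			(unsigned long long)params.count, strerror(errno),
			(unsigned long long)params.result);
		return -1;
	}

	/*
	 * The enclave must EACCEPT the page as PT_TCS (with its TCS contents
	 * prepared beforehand) before a thread can EENTER through it.
	 */
	return 0;
}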
+
+/**
+ * sgx_encl_remove_pages() - Remove trimmed pages from SGX enclave
+ * @encl: Enclave to which the pages belong
+ * @params: Checked parameters from user on which pages need to be removed
+ *
+ * Return:
+ * - 0: Success.
+ * - -errno: Otherwise.
+ */
+static long sgx_encl_remove_pages(struct sgx_encl *encl,
+ struct sgx_enclave_remove_pages *params)
+{
+ struct sgx_encl_page *entry;
+ struct sgx_secinfo secinfo;
+ unsigned long addr;
+ unsigned long c;
+ void *epc_virt;
+ int ret;
+
+ memset(&secinfo, 0, sizeof(secinfo));
+ secinfo.flags = SGX_SECINFO_R | SGX_SECINFO_W | SGX_SECINFO_X;
+
+ for (c = 0 ; c < params->length; c += PAGE_SIZE) {
+ addr = encl->base + params->offset + c;
+
+ sgx_reclaim_direct();
+
+ mutex_lock(&encl->lock);
+
+ entry = sgx_encl_load_page(encl, addr);
+ if (IS_ERR(entry)) {
+ ret = PTR_ERR(entry) == -EBUSY ? -EAGAIN : -EFAULT;
+ goto out_unlock;
+ }
+
+ if (entry->type != SGX_PAGE_TYPE_TRIM) {
+ ret = -EPERM;
+ goto out_unlock;
+ }
+
+ /*
+		 * ENCLS[EMODPR] is used here as a no-op to detect whether
+		 * ENCLU[EACCEPT] was run from within the enclave. If
+		 * ENCLS[EMODPR] is run with RWX on a trimmed page that has
+		 * not yet been accepted, it returns
+		 * %SGX_PAGE_NOT_MODIFIABLE; after the trimmed page is
+		 * accepted the instruction encounters a page fault.
+ */
+ epc_virt = sgx_get_epc_virt_addr(entry->epc_page);
+ ret = __emodpr(&secinfo, epc_virt);
+ if (!encls_faulted(ret) || ENCLS_TRAPNR(ret) != X86_TRAP_PF) {
+ ret = -EPERM;
+ goto out_unlock;
+ }
+
+ if (sgx_unmark_page_reclaimable(entry->epc_page)) {
+ ret = -EBUSY;
+ goto out_unlock;
+ }
+
+ /*
+ * Do not keep encl->lock because of dependency on
+ * mmap_lock acquired in sgx_zap_enclave_ptes().
+ */
+ mutex_unlock(&encl->lock);
+
+ sgx_zap_enclave_ptes(encl, addr);
+
+ mutex_lock(&encl->lock);
+
+ sgx_encl_free_epc_page(entry->epc_page);
+ encl->secs_child_cnt--;
+ entry->epc_page = NULL;
+ xa_erase(&encl->page_array, PFN_DOWN(entry->desc));
+ sgx_encl_shrink(encl, NULL);
+ kfree(entry);
+
+ mutex_unlock(&encl->lock);
+ }
+
+ ret = 0;
+ goto out;
+
+out_unlock:
+ mutex_unlock(&encl->lock);
+out:
+ params->count = c;
+
+ return ret;
+}
+
+/**
+ * sgx_ioc_enclave_remove_pages() - handler for %SGX_IOC_ENCLAVE_REMOVE_PAGES
+ * @encl: an enclave pointer
+ * @arg: userspace pointer to &struct sgx_enclave_remove_pages instance
+ *
+ * Final step of the flow removing pages from an initialized enclave. The
+ * complete flow is:
+ *
+ * 1) User changes the type of the pages to be removed to %SGX_PAGE_TYPE_TRIM
+ * using the %SGX_IOC_ENCLAVE_MODIFY_TYPES ioctl().
+ * 2) User approves the page removal by running ENCLU[EACCEPT] from within
+ * the enclave.
+ * 3) User initiates actual page removal using the
+ * %SGX_IOC_ENCLAVE_REMOVE_PAGES ioctl() that is handled here.
+ *
+ * First remove any page table entries pointing to the page and then proceed
+ * with the actual removal of the enclave page and data in support of it.
+ *
+ * VA pages are not affected by this removal. It is thus possible that the
+ * enclave may end up with more VA pages than needed to support all its
+ * pages.
+ *
+ * Return:
+ * - 0: Success
+ * - -errno: Otherwise
+ */
+static long sgx_ioc_enclave_remove_pages(struct sgx_encl *encl,
+ void __user *arg)
+{
+ struct sgx_enclave_remove_pages params;
+ long ret;
+
+ ret = sgx_ioc_sgx2_ready(encl);
+ if (ret)
+ return ret;
+
+ if (copy_from_user(&params, arg, sizeof(params)))
+ return -EFAULT;
+
+ if (sgx_validate_offset_length(encl, params.offset, params.length))
+ return -EINVAL;
+
+ if (params.count)
+ return -EINVAL;
+
+ ret = sgx_encl_remove_pages(encl, &params);
+
+ if (copy_to_user(arg, &params, sizeof(params)))
+ return -EFAULT;
+
+ return ret;
+}
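Putting the three steps from the kernel-doc above together, a hedged userspace sketch of the complete removal flow (asm/sgx.h from this series assumed; enclave_fd, offset and length are placeholders, and the EACCEPT step runs inside the enclave so it is only indicated as a comment):

/* Sketch only: the three-step dynamic page removal flow described above. */
#include <asm/sgx.h>
#include <stdio.h>
#include <string.h>
#include <sys/ioctl.h>

#ifndef SGX_PAGE_TYPE_TRIM
#define SGX_PAGE_TYPE_TRIM 4	/* architectural page type encoding */
#endif

static int remove_range(int enclave_fd, unsigned long offset, unsigned long length)
{
	struct sgx_enclave_modify_types modt;
	struct sgx_enclave_remove_pages remove;

	/* 1) Mark the pages for removal. */
	memset(&modt, 0, sizeof(modt));
	modt.offset = offset;
	modt.length = length;
	modt.page_type = SGX_PAGE_TYPE_TRIM;
	if (ioctl(enclave_fd, SGX_IOC_ENCLAVE_MODIFY_TYPES, &modt))
		return -1;

	/*
	 * 2) The enclave approves the removal by running ENCLU[EACCEPT] on
	 *    every trimmed page; that step happens inside the enclave and is
	 *    only sketched here as a comment.
	 */

	/* 3) Ask the kernel to actually free the EPC pages. */
	memset(&remove, 0, sizeof(remove));
	remove.offset = offset;
	remove.length = length;
	if (ioctl(enclave_fd, SGX_IOC_ENCLAVE_REMOVE_PAGES, &remove))
		return -1;

	printf("removed %llu bytes of enclave pages\n",
	       (unsigned long long)remove.count);
	return 0;
}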
+
long sgx_ioctl(struct file *filep, unsigned int cmd, unsigned long arg)
{
struct sgx_encl *encl = filep->private_data;
@@ -695,6 +1240,16 @@ long sgx_ioctl(struct file *filep, unsigned int cmd, unsigned long arg)
case SGX_IOC_ENCLAVE_PROVISION:
ret = sgx_ioc_enclave_provision(encl, (void __user *)arg);
break;
+ case SGX_IOC_ENCLAVE_RESTRICT_PERMISSIONS:
+ ret = sgx_ioc_enclave_restrict_permissions(encl,
+ (void __user *)arg);
+ break;
+ case SGX_IOC_ENCLAVE_MODIFY_TYPES:
+ ret = sgx_ioc_enclave_modify_types(encl, (void __user *)arg);
+ break;
+ case SGX_IOC_ENCLAVE_REMOVE_PAGES:
+ ret = sgx_ioc_enclave_remove_pages(encl, (void __user *)arg);
+ break;
default:
ret = -ENOIOCTLCMD;
break;
diff --git a/arch/x86/kernel/cpu/sgx/main.c b/arch/x86/kernel/cpu/sgx/main.c
index a78652d43e61..515e2a5f25bb 100644
--- a/arch/x86/kernel/cpu/sgx/main.c
+++ b/arch/x86/kernel/cpu/sgx/main.c
@@ -137,36 +137,9 @@ static void sgx_reclaimer_block(struct sgx_epc_page *epc_page)
struct sgx_encl_page *page = epc_page->owner;
unsigned long addr = page->desc & PAGE_MASK;
struct sgx_encl *encl = page->encl;
- unsigned long mm_list_version;
- struct sgx_encl_mm *encl_mm;
- struct vm_area_struct *vma;
- int idx, ret;
-
- do {
- mm_list_version = encl->mm_list_version;
-
- /* Pairs with smp_rmb() in sgx_encl_mm_add(). */
- smp_rmb();
-
- idx = srcu_read_lock(&encl->srcu);
-
- list_for_each_entry_rcu(encl_mm, &encl->mm_list, list) {
- if (!mmget_not_zero(encl_mm->mm))
- continue;
-
- mmap_read_lock(encl_mm->mm);
-
- ret = sgx_encl_find(encl_mm->mm, addr, &vma);
- if (!ret && encl == vma->vm_private_data)
- zap_vma_ptes(vma, addr, PAGE_SIZE);
-
- mmap_read_unlock(encl_mm->mm);
-
- mmput_async(encl_mm->mm);
- }
+ int ret;
- srcu_read_unlock(&encl->srcu, idx);
- } while (unlikely(encl->mm_list_version != mm_list_version));
+ sgx_zap_enclave_ptes(encl, addr);
mutex_lock(&encl->lock);
@@ -201,39 +174,10 @@ static int __sgx_encl_ewb(struct sgx_epc_page *epc_page, void *va_slot,
return ret;
}
-static void sgx_ipi_cb(void *info)
+void sgx_ipi_cb(void *info)
{
}
-static const cpumask_t *sgx_encl_ewb_cpumask(struct sgx_encl *encl)
-{
- cpumask_t *cpumask = &encl->cpumask;
- struct sgx_encl_mm *encl_mm;
- int idx;
-
- /*
- * Can race with sgx_encl_mm_add(), but ETRACK has already been
- * executed, which means that the CPUs running in the new mm will enter
- * into the enclave with a fresh epoch.
- */
- cpumask_clear(cpumask);
-
- idx = srcu_read_lock(&encl->srcu);
-
- list_for_each_entry_rcu(encl_mm, &encl->mm_list, list) {
- if (!mmget_not_zero(encl_mm->mm))
- continue;
-
- cpumask_or(cpumask, cpumask, mm_cpumask(encl_mm->mm));
-
- mmput_async(encl_mm->mm);
- }
-
- srcu_read_unlock(&encl->srcu, idx);
-
- return cpumask;
-}
-
/*
* Swap page to the regular memory transformed to the blocked state by using
* EBLOCK, which means that it can no longer be referenced (no new TLB entries).
@@ -280,7 +224,7 @@ static void sgx_encl_ewb(struct sgx_epc_page *epc_page,
* miss cpus that entered the enclave between
* generating the mask and incrementing epoch.
*/
- on_each_cpu_mask(sgx_encl_ewb_cpumask(encl),
+ on_each_cpu_mask(sgx_encl_cpumask(encl),
sgx_ipi_cb, NULL, 1);
ret = __sgx_encl_ewb(epc_page, va_slot, backing);
}
@@ -431,6 +375,17 @@ static bool sgx_should_reclaim(unsigned long watermark)
!list_empty(&sgx_active_page_list);
}
+/*
+ * sgx_reclaim_direct() should be called (without the enclave's mutex held)
+ * in locations where SGX memory resources might be low and reclaiming
+ * them might be needed in order to make forward progress.
+ */
+void sgx_reclaim_direct(void)
+{
+ if (sgx_should_reclaim(SGX_NR_LOW_PAGES))
+ sgx_reclaim_pages();
+}
+
static int ksgxd(void *p)
{
set_freezable();
diff --git a/arch/x86/kernel/cpu/sgx/sgx.h b/arch/x86/kernel/cpu/sgx/sgx.h
index 0f17def9fe6f..0f2020653fba 100644
--- a/arch/x86/kernel/cpu/sgx/sgx.h
+++ b/arch/x86/kernel/cpu/sgx/sgx.h
@@ -86,10 +86,13 @@ static inline void *sgx_get_epc_virt_addr(struct sgx_epc_page *page)
struct sgx_epc_page *__sgx_alloc_epc_page(void);
void sgx_free_epc_page(struct sgx_epc_page *page);
+void sgx_reclaim_direct(void);
void sgx_mark_page_reclaimable(struct sgx_epc_page *page);
int sgx_unmark_page_reclaimable(struct sgx_epc_page *page);
struct sgx_epc_page *sgx_alloc_epc_page(void *owner, bool reclaim);
+void sgx_ipi_cb(void *info);
+
#ifdef CONFIG_X86_SGX_KVM
int __init sgx_vepc_init(void);
#else
diff --git a/arch/x86/kernel/cpu/vmware.c b/arch/x86/kernel/cpu/vmware.c
index c04b933f48d3..02039ec3597d 100644
--- a/arch/x86/kernel/cpu/vmware.c
+++ b/arch/x86/kernel/cpu/vmware.c
@@ -476,8 +476,8 @@ static bool __init vmware_legacy_x2apic_available(void)
{
uint32_t eax, ebx, ecx, edx;
VMWARE_CMD(GETVCPU_INFO, eax, ebx, ecx, edx);
- return (eax & (1 << VMWARE_CMD_VCPU_RESERVED)) == 0 &&
- (eax & (1 << VMWARE_CMD_LEGACY_X2APIC)) != 0;
+ return !(eax & BIT(VMWARE_CMD_VCPU_RESERVED)) &&
+ (eax & BIT(VMWARE_CMD_LEGACY_X2APIC));
}
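The vmware.c hunk is a readability cleanup: open-coded 1 << n tests become BIT() and the explicit comparisons become boolean expressions. A tiny standalone check that the two spellings agree for every bit position (BIT() is redefined locally since linux/bits.h is kernel-only; the bit positions are placeholders, not the real VMWARE_CMD_* values):

#include <assert.h>
#include <stdint.h>
#include <stdio.h>

#define BIT(n) (1UL << (n))	/* local stand-in for the kernel's BIT() */

int main(void)
{
	uint32_t eax = 0xA5A5A5A5u;	/* arbitrary stand-in for the CPUID result */

	for (unsigned int reserved = 0; reserved < 32; reserved++) {
		for (unsigned int x2apic = 0; x2apic < 32; x2apic++) {
			int old_style = (eax & (1u << reserved)) == 0 &&
					(eax & (1u << x2apic)) != 0;
			int new_style = !(eax & BIT(reserved)) &&
					(eax & BIT(x2apic));
			assert(old_style == new_style);
		}
	}

	printf("old and new spellings agree for all bit positions\n");
	return 0;
}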
#ifdef CONFIG_AMD_MEM_ENCRYPT