Diffstat (limited to 'arch/x86/kernel')
44 files changed, 2633 insertions, 945 deletions
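The largest functional addition in the diff below is the ACPI MADT multiprocessor-wakeup path (acpi_wakeup_cpu() in arch/x86/kernel/acpi/boot.c): the OS writes apic_id and wakeup_vector into a shared mailbox, publishes the wakeup command with smp_store_release(), and then spins with READ_ONCE()/cpu_relax() until the firmware acknowledges by zeroing the command word. The standalone C sketch that follows models that handshake in userspace with two threads; struct wake_mailbox, firmware_side(), and the command value are simplified illustrative stand-ins, not the real ACPI mailbox layout or kernel code.

/*
 * Illustrative userspace model of the MADT multiprocessor-wakeup
 * mailbox handshake (not kernel code): the "OS" thread publishes
 * apic_id/wakeup_vector, then release-stores the command word; the
 * "firmware" thread acquire-loads it and acks by clearing it.
 *
 * Build: cc -std=c11 -pthread mailbox_demo.c
 */
#include <inttypes.h>
#include <pthread.h>
#include <stdatomic.h>
#include <stdint.h>
#include <stdio.h>

#define MP_WAKE_COMMAND_WAKEUP 1	/* stand-in for ACPI_MP_WAKE_COMMAND_WAKEUP */

struct wake_mailbox {
	_Atomic uint16_t command;
	uint32_t apic_id;
	uint64_t wakeup_vector;
};

static struct wake_mailbox mbox;

static void *firmware_side(void *arg)
{
	/* Poll for the wakeup command; acquire pairs with the release below. */
	while (atomic_load_explicit(&mbox.command, memory_order_acquire) !=
	       MP_WAKE_COMMAND_WAKEUP)
		;

	printf("firmware: waking APIC %" PRIu32 " at 0x%" PRIx64 "\n",
	       mbox.apic_id, mbox.wakeup_vector);

	/* Acknowledge by clearing the command word, like the real firmware. */
	atomic_store_explicit(&mbox.command, 0, memory_order_release);
	return NULL;
}

int main(void)
{
	pthread_t fw;

	pthread_create(&fw, NULL, firmware_side, NULL);

	/* OS side: payload first, then the command with release semantics. */
	mbox.apic_id = 1;
	mbox.wakeup_vector = 0x9a000;
	atomic_store_explicit(&mbox.command, MP_WAKE_COMMAND_WAKEUP,
			      memory_order_release);

	/* As in acpi_wakeup_cpu(): spin until the firmware acks with zero. */
	while (atomic_load_explicit(&mbox.command, memory_order_acquire))
		;

	pthread_join(fw, NULL);
	return 0;
}

The release/acquire pairing mirrors the kernel's smp_store_release()/READ_ONCE() usage in the hunk: the payload fields must be visible before the command word the firmware polls, and the spin on the command going back to zero is the only completion signal the spec provides.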
diff --git a/arch/x86/kernel/Makefile b/arch/x86/kernel/Makefile index c41ef42adbe8..1a2dc328cb5e 100644 --- a/arch/x86/kernel/Makefile +++ b/arch/x86/kernel/Makefile @@ -46,8 +46,6 @@ endif # non-deterministic coverage. KCOV_INSTRUMENT := n -CFLAGS_head$(BITS).o += -fno-stack-protector - CFLAGS_irq.o := -I $(srctree)/$(src)/../include/asm/trace obj-y := process_$(BITS).o signal.o diff --git a/arch/x86/kernel/acpi/boot.c b/arch/x86/kernel/acpi/boot.c index 0d01e7f5078c..6d2c50819501 100644 --- a/arch/x86/kernel/acpi/boot.c +++ b/arch/x86/kernel/acpi/boot.c @@ -65,6 +65,13 @@ static u64 acpi_lapic_addr __initdata = APIC_DEFAULT_PHYS_BASE; static bool acpi_support_online_capable; #endif +#ifdef CONFIG_X86_64 +/* Physical address of the Multiprocessor Wakeup Structure mailbox */ +static u64 acpi_mp_wake_mailbox_paddr; +/* Virtual address of the Multiprocessor Wakeup Structure mailbox */ +static struct acpi_madt_multiproc_wakeup_mailbox *acpi_mp_wake_mailbox; +#endif + #ifdef CONFIG_X86_IO_APIC /* * Locks related to IOAPIC hotplug @@ -336,7 +343,60 @@ acpi_parse_lapic_nmi(union acpi_subtable_headers * header, const unsigned long e return 0; } -#endif /*CONFIG_X86_LOCAL_APIC */ +#ifdef CONFIG_X86_64 +static int acpi_wakeup_cpu(int apicid, unsigned long start_ip) +{ + /* + * Remap mailbox memory only for the first call to acpi_wakeup_cpu(). + * + * Wakeup of secondary CPUs is fully serialized in the core code. + * No need to protect acpi_mp_wake_mailbox from concurrent accesses. + */ + if (!acpi_mp_wake_mailbox) { + acpi_mp_wake_mailbox = memremap(acpi_mp_wake_mailbox_paddr, + sizeof(*acpi_mp_wake_mailbox), + MEMREMAP_WB); + } + + /* + * Mailbox memory is shared between the firmware and OS. Firmware will + * listen on mailbox command address, and once it receives the wakeup + * command, the CPU associated with the given apicid will be booted. + * + * The value of 'apic_id' and 'wakeup_vector' must be visible to the + * firmware before the wakeup command is visible. smp_store_release() + * ensures ordering and visibility. + */ + acpi_mp_wake_mailbox->apic_id = apicid; + acpi_mp_wake_mailbox->wakeup_vector = start_ip; + smp_store_release(&acpi_mp_wake_mailbox->command, + ACPI_MP_WAKE_COMMAND_WAKEUP); + + /* + * Wait for the CPU to wake up. + * + * The CPU being woken up is essentially in a spin loop waiting to be + * woken up. It should not take long for it wake up and acknowledge by + * zeroing out ->command. + * + * ACPI specification doesn't provide any guidance on how long kernel + * has to wait for a wake up acknowledgement. It also doesn't provide + * a way to cancel a wake up request if it takes too long. + * + * In TDX environment, the VMM has control over how long it takes to + * wake up secondary. It can postpone scheduling secondary vCPU + * indefinitely. Giving up on wake up request and reporting error opens + * possible attack vector for VMM: it can wake up a secondary CPU when + * kernel doesn't expect it. Wait until positive result of the wake up + * request. 
+ */ + while (READ_ONCE(acpi_mp_wake_mailbox->command)) + cpu_relax(); + + return 0; +} +#endif /* CONFIG_X86_64 */ +#endif /* CONFIG_X86_LOCAL_APIC */ #ifdef CONFIG_X86_IO_APIC #define MP_ISA_BUS 0 @@ -1083,6 +1143,29 @@ static int __init acpi_parse_madt_lapic_entries(void) } return 0; } + +#ifdef CONFIG_X86_64 +static int __init acpi_parse_mp_wake(union acpi_subtable_headers *header, + const unsigned long end) +{ + struct acpi_madt_multiproc_wakeup *mp_wake; + + if (!IS_ENABLED(CONFIG_SMP)) + return -ENODEV; + + mp_wake = (struct acpi_madt_multiproc_wakeup *)header; + if (BAD_MADT_ENTRY(mp_wake, end)) + return -EINVAL; + + acpi_table_print_madt_entry(&header->common); + + acpi_mp_wake_mailbox_paddr = mp_wake->base_address; + + acpi_wake_cpu_handler_update(acpi_wakeup_cpu); + + return 0; +} +#endif /* CONFIG_X86_64 */ #endif /* CONFIG_X86_LOCAL_APIC */ #ifdef CONFIG_X86_IO_APIC @@ -1278,6 +1361,14 @@ static void __init acpi_process_madt(void) smp_found_config = 1; } + +#ifdef CONFIG_X86_64 + /* + * Parse MADT MP Wake entry. + */ + acpi_table_parse_madt(ACPI_MADT_TYPE_MULTIPROC_WAKEUP, + acpi_parse_mp_wake, 1); +#endif } if (error == -EINVAL) { /* diff --git a/arch/x86/kernel/acpi/cppc.c b/arch/x86/kernel/acpi/cppc.c index df1644d9b3b6..8b8cbf22461a 100644 --- a/arch/x86/kernel/acpi/cppc.c +++ b/arch/x86/kernel/acpi/cppc.c @@ -50,20 +50,17 @@ int cpc_write_ffh(int cpunum, struct cpc_reg *reg, u64 val) return err; } -bool amd_set_max_freq_ratio(u64 *ratio) +static void amd_set_max_freq_ratio(void) { struct cppc_perf_caps perf_caps; u64 highest_perf, nominal_perf; u64 perf_ratio; int rc; - if (!ratio) - return false; - rc = cppc_get_perf_caps(0, &perf_caps); if (rc) { pr_debug("Could not retrieve perf counters (%d)\n", rc); - return false; + return; } highest_perf = amd_get_highest_perf(); @@ -71,7 +68,7 @@ bool amd_set_max_freq_ratio(u64 *ratio) if (!highest_perf || !nominal_perf) { pr_debug("Could not retrieve highest or nominal performance\n"); - return false; + return; } perf_ratio = div_u64(highest_perf * SCHED_CAPACITY_SCALE, nominal_perf); @@ -79,25 +76,27 @@ bool amd_set_max_freq_ratio(u64 *ratio) perf_ratio = (perf_ratio + SCHED_CAPACITY_SCALE) >> 1; if (!perf_ratio) { pr_debug("Non-zero highest/nominal perf values led to a 0 ratio\n"); - return false; + return; } - *ratio = perf_ratio; - arch_set_max_freq_ratio(false); - - return true; + freq_invariance_set_perf_ratio(perf_ratio, false); } static DEFINE_MUTEX(freq_invariance_lock); void init_freq_invariance_cppc(void) { - static bool secondary; + static bool init_done; - mutex_lock(&freq_invariance_lock); + if (!cpu_feature_enabled(X86_FEATURE_APERFMPERF)) + return; - init_freq_invariance(secondary, true); - secondary = true; + if (boot_cpu_data.x86_vendor != X86_VENDOR_AMD) + return; + mutex_lock(&freq_invariance_lock); + if (!init_done) + amd_set_max_freq_ratio(); + init_done = true; mutex_unlock(&freq_invariance_lock); } diff --git a/arch/x86/kernel/amd_nb.c b/arch/x86/kernel/amd_nb.c index 020c906f7934..190e0f763375 100644 --- a/arch/x86/kernel/amd_nb.c +++ b/arch/x86/kernel/amd_nb.c @@ -188,7 +188,7 @@ int amd_smn_write(u16 node, u32 address, u32 value) EXPORT_SYMBOL_GPL(amd_smn_write); -int amd_cache_northbridges(void) +static int amd_cache_northbridges(void) { const struct pci_device_id *misc_ids = amd_nb_misc_ids; const struct pci_device_id *link_ids = amd_nb_link_ids; @@ -210,14 +210,14 @@ int amd_cache_northbridges(void) } misc = NULL; - while ((misc = next_northbridge(misc, misc_ids)) != NULL) + while ((misc = 
next_northbridge(misc, misc_ids))) misc_count++; if (!misc_count) return -ENODEV; root = NULL; - while ((root = next_northbridge(root, root_ids)) != NULL) + while ((root = next_northbridge(root, root_ids))) root_count++; if (root_count) { @@ -290,7 +290,6 @@ int amd_cache_northbridges(void) return 0; } -EXPORT_SYMBOL_GPL(amd_cache_northbridges); /* * Ignores subdevice/subvendor but as far as I can figure out diff --git a/arch/x86/kernel/apic/apic.c b/arch/x86/kernel/apic/apic.c index b70344bf6600..189d3a5e471a 100644 --- a/arch/x86/kernel/apic/apic.c +++ b/arch/x86/kernel/apic/apic.c @@ -170,7 +170,7 @@ static __init int setup_apicpmtimer(char *s) { apic_calibrate_pmtmr = 1; notsc_setup(NULL); - return 0; + return 1; } __setup("apicpmtimer", setup_apicpmtimer); #endif @@ -320,6 +320,9 @@ int lapic_get_maxlvt(void) #define APIC_DIVISOR 16 #define TSC_DIVISOR 8 +/* i82489DX specific */ +#define I82489DX_BASE_DIVIDER (((0x2) << 18)) + /* * This function sets up the local APIC timer, with a timeout of * 'clocks' APIC bus clock. During calibration we actually call @@ -340,8 +343,14 @@ static void __setup_APIC_LVTT(unsigned int clocks, int oneshot, int irqen) else if (boot_cpu_has(X86_FEATURE_TSC_DEADLINE_TIMER)) lvtt_value |= APIC_LVT_TIMER_TSCDEADLINE; + /* + * The i82489DX APIC uses bit 18 and 19 for the base divider. This + * overlaps with bit 18 on integrated APICs, but is not documented + * in the SDM. No problem though. i82489DX equipped systems do not + * have TSC deadline timer. + */ if (!lapic_is_integrated()) - lvtt_value |= SET_APIC_TIMER_BASE(APIC_TIMER_BASE_DIV); + lvtt_value |= I82489DX_BASE_DIVIDER; if (!irqen) lvtt_value |= APIC_LVT_MASKED; @@ -1419,22 +1428,21 @@ void __init apic_intr_mode_init(void) return; case APIC_VIRTUAL_WIRE: pr_info("APIC: Switch to virtual wire mode setup\n"); - default_setup_apic_routing(); break; case APIC_VIRTUAL_WIRE_NO_CONFIG: pr_info("APIC: Switch to virtual wire mode setup with no configuration\n"); upmode = true; - default_setup_apic_routing(); break; case APIC_SYMMETRIC_IO: pr_info("APIC: Switch to symmetric I/O mode setup\n"); - default_setup_apic_routing(); break; case APIC_SYMMETRIC_IO_NO_ROUTING: pr_info("APIC: Switch to symmetric I/O mode setup in no SMP routine\n"); break; } + default_setup_apic_routing(); + if (x86_platform.apic_post_init) x86_platform.apic_post_init(); @@ -2551,6 +2559,16 @@ u32 x86_msi_msg_get_destid(struct msi_msg *msg, bool extid) } EXPORT_SYMBOL_GPL(x86_msi_msg_get_destid); +#ifdef CONFIG_X86_64 +void __init acpi_wake_cpu_handler_update(wakeup_cpu_handler handler) +{ + struct apic **drv; + + for (drv = __apicdrivers; drv < __apicdrivers_end; drv++) + (*drv)->wakeup_secondary_cpu_64 = handler; +} +#endif + /* * Override the generic EOI implementation with an optimized version. 
* Only called during early boot when only one CPU is active and with diff --git a/arch/x86/kernel/apic/io_apic.c b/arch/x86/kernel/apic/io_apic.c index c1bb384935b0..a868b76cd3d4 100644 --- a/arch/x86/kernel/apic/io_apic.c +++ b/arch/x86/kernel/apic/io_apic.c @@ -65,6 +65,7 @@ #include <asm/irq_remapping.h> #include <asm/hw_irq.h> #include <asm/apic.h> +#include <asm/pgtable.h> #define for_each_ioapic(idx) \ for ((idx) = 0; (idx) < nr_ioapics; (idx)++) @@ -2677,6 +2678,19 @@ static struct resource * __init ioapic_setup_resources(void) return res; } +static void io_apic_set_fixmap(enum fixed_addresses idx, phys_addr_t phys) +{ + pgprot_t flags = FIXMAP_PAGE_NOCACHE; + + /* + * Ensure fixmaps for IOAPIC MMIO respect memory encryption pgprot + * bits, just like normal ioremap(): + */ + flags = pgprot_decrypted(flags); + + __set_fixmap(idx, phys, flags); +} + void __init io_apic_init_mappings(void) { unsigned long ioapic_phys, idx = FIX_IO_APIC_BASE_0; @@ -2709,7 +2723,7 @@ fake_ioapic_page: __func__, PAGE_SIZE, PAGE_SIZE); ioapic_phys = __pa(ioapic_phys); } - set_fixmap_nocache(idx, ioapic_phys); + io_apic_set_fixmap(idx, ioapic_phys); apic_printk(APIC_VERBOSE, "mapped IOAPIC to %08lx (%08lx)\n", __fix_to_virt(idx) + (ioapic_phys & ~PAGE_MASK), ioapic_phys); @@ -2838,7 +2852,7 @@ int mp_register_ioapic(int id, u32 address, u32 gsi_base, ioapics[idx].mp_config.flags = MPC_APIC_USABLE; ioapics[idx].mp_config.apicaddr = address; - set_fixmap_nocache(FIX_IO_APIC_BASE_0 + idx, address); + io_apic_set_fixmap(FIX_IO_APIC_BASE_0 + idx, address); if (bad_ioapic_register(idx)) { clear_fixmap(FIX_IO_APIC_BASE_0 + idx); return -ENODEV; diff --git a/arch/x86/kernel/apic/x2apic_uv_x.c b/arch/x86/kernel/apic/x2apic_uv_x.c index f5a48e66e4f5..482855227964 100644 --- a/arch/x86/kernel/apic/x2apic_uv_x.c +++ b/arch/x86/kernel/apic/x2apic_uv_x.c @@ -199,7 +199,13 @@ static void __init uv_tsc_check_sync(void) int mmr_shift; char *state; - /* Different returns from different UV BIOS versions */ + /* UV5 guarantees synced TSCs; do not zero TSC_ADJUST */ + if (!is_uv(UV2|UV3|UV4)) { + mark_tsc_async_resets("UV5+"); + return; + } + + /* UV2,3,4, UV BIOS TSC sync state available */ mmr = uv_early_read_mmr(UVH_TSC_SYNC_MMR); mmr_shift = is_uv2_hub() ? 
UVH_TSC_SYNC_SHIFT_UV2K : UVH_TSC_SYNC_SHIFT; @@ -1340,7 +1346,7 @@ static void __init decode_gam_params(unsigned long ptr) static void __init decode_gam_rng_tbl(unsigned long ptr) { struct uv_gam_range_entry *gre = (struct uv_gam_range_entry *)ptr; - unsigned long lgre = 0; + unsigned long lgre = 0, gend = 0; int index = 0; int sock_min = 999999, pnode_min = 99999; int sock_max = -1, pnode_max = -1; @@ -1374,6 +1380,9 @@ static void __init decode_gam_rng_tbl(unsigned long ptr) flag, size, suffix[order], gre->type, gre->nasid, gre->sockid, gre->pnode); + if (gre->type == UV_GAM_RANGE_TYPE_HOLE) + gend = (unsigned long)gre->limit << UV_GAM_RANGE_SHFT; + /* update to next range start */ lgre = gre->limit; if (sock_min > gre->sockid) @@ -1391,7 +1400,8 @@ static void __init decode_gam_rng_tbl(unsigned long ptr) _max_pnode = pnode_max; _gr_table_len = index; - pr_info("UV: GRT: %d entries, sockets(min:%x,max:%x) pnodes(min:%x,max:%x)\n", index, _min_socket, _max_socket, _min_pnode, _max_pnode); + pr_info("UV: GRT: %d entries, sockets(min:%x,max:%x), pnodes(min:%x,max:%x), gap_end(%d)\n", + index, _min_socket, _max_socket, _min_pnode, _max_pnode, fls64(gend)); } /* Walk through UVsystab decoding the fields */ diff --git a/arch/x86/kernel/asm-offsets.c b/arch/x86/kernel/asm-offsets.c index 9fb0a2f8b62a..437308004ef2 100644 --- a/arch/x86/kernel/asm-offsets.c +++ b/arch/x86/kernel/asm-offsets.c @@ -18,6 +18,7 @@ #include <asm/bootparam.h> #include <asm/suspend.h> #include <asm/tlbflush.h> +#include <asm/tdx.h> #ifdef CONFIG_XEN #include <xen/interface/xen.h> @@ -66,6 +67,22 @@ static void __used common(void) #endif BLANK(); + OFFSET(TDX_MODULE_rcx, tdx_module_output, rcx); + OFFSET(TDX_MODULE_rdx, tdx_module_output, rdx); + OFFSET(TDX_MODULE_r8, tdx_module_output, r8); + OFFSET(TDX_MODULE_r9, tdx_module_output, r9); + OFFSET(TDX_MODULE_r10, tdx_module_output, r10); + OFFSET(TDX_MODULE_r11, tdx_module_output, r11); + + BLANK(); + OFFSET(TDX_HYPERCALL_r10, tdx_hypercall_args, r10); + OFFSET(TDX_HYPERCALL_r11, tdx_hypercall_args, r11); + OFFSET(TDX_HYPERCALL_r12, tdx_hypercall_args, r12); + OFFSET(TDX_HYPERCALL_r13, tdx_hypercall_args, r13); + OFFSET(TDX_HYPERCALL_r14, tdx_hypercall_args, r14); + OFFSET(TDX_HYPERCALL_r15, tdx_hypercall_args, r15); + + BLANK(); OFFSET(BP_scratch, boot_params, scratch); OFFSET(BP_secure_boot, boot_params, secure_boot); OFFSET(BP_loadflags, boot_params, hdr.loadflags); diff --git a/arch/x86/kernel/cpu/aperfmperf.c b/arch/x86/kernel/cpu/aperfmperf.c index 9ca008f9e9b1..1f60a2b27936 100644 --- a/arch/x86/kernel/cpu/aperfmperf.c +++ b/arch/x86/kernel/cpu/aperfmperf.c @@ -6,146 +6,446 @@ * Copyright (C) 2017 Intel Corp. 
* Author: Len Brown <len.brown@intel.com> */ - +#include <linux/cpufreq.h> #include <linux/delay.h> #include <linux/ktime.h> #include <linux/math64.h> #include <linux/percpu.h> -#include <linux/cpufreq.h> -#include <linux/smp.h> -#include <linux/sched/isolation.h> #include <linux/rcupdate.h> +#include <linux/sched/isolation.h> +#include <linux/sched/topology.h> +#include <linux/smp.h> +#include <linux/syscore_ops.h> + +#include <asm/cpu.h> +#include <asm/cpu_device_id.h> +#include <asm/intel-family.h> #include "cpu.h" -struct aperfmperf_sample { - unsigned int khz; - atomic_t scfpending; - ktime_t time; - u64 aperf; - u64 mperf; +struct aperfmperf { + seqcount_t seq; + unsigned long last_update; + u64 acnt; + u64 mcnt; + u64 aperf; + u64 mperf; }; -static DEFINE_PER_CPU(struct aperfmperf_sample, samples); +static DEFINE_PER_CPU_SHARED_ALIGNED(struct aperfmperf, cpu_samples) = { + .seq = SEQCNT_ZERO(cpu_samples.seq) +}; -#define APERFMPERF_CACHE_THRESHOLD_MS 10 -#define APERFMPERF_REFRESH_DELAY_MS 10 -#define APERFMPERF_STALE_THRESHOLD_MS 1000 +static void init_counter_refs(void) +{ + u64 aperf, mperf; + + rdmsrl(MSR_IA32_APERF, aperf); + rdmsrl(MSR_IA32_MPERF, mperf); + this_cpu_write(cpu_samples.aperf, aperf); + this_cpu_write(cpu_samples.mperf, mperf); +} + +#if defined(CONFIG_X86_64) && defined(CONFIG_SMP) /* - * aperfmperf_snapshot_khz() - * On the current CPU, snapshot APERF, MPERF, and jiffies - * unless we already did it within 10ms - * calculate kHz, save snapshot + * APERF/MPERF frequency ratio computation. + * + * The scheduler wants to do frequency invariant accounting and needs a <1 + * ratio to account for the 'current' frequency, corresponding to + * freq_curr / freq_max. + * + * Since the frequency freq_curr on x86 is controlled by micro-controller and + * our P-state setting is little more than a request/hint, we need to observe + * the effective frequency 'BusyMHz', i.e. the average frequency over a time + * interval after discarding idle time. This is given by: + * + * BusyMHz = delta_APERF / delta_MPERF * freq_base + * + * where freq_base is the max non-turbo P-state. + * + * The freq_max term has to be set to a somewhat arbitrary value, because we + * can't know which turbo states will be available at a given point in time: + * it all depends on the thermal headroom of the entire package. We set it to + * the turbo level with 4 cores active. + * + * Benchmarks show that's a good compromise between the 1C turbo ratio + * (freq_curr/freq_max would rarely reach 1) and something close to freq_base, + * which would ignore the entire turbo range (a conspicuous part, making + * freq_curr/freq_max always maxed out). + * + * An exception to the heuristic above is the Atom uarch, where we choose the + * highest turbo level for freq_max since Atom's are generally oriented towards + * power efficiency. + * + * Setting freq_max to anything less than the 1C turbo ratio makes the ratio + * freq_curr / freq_max to eventually grow >1, in which case we clip it to 1. */ -static void aperfmperf_snapshot_khz(void *dummy) + +DEFINE_STATIC_KEY_FALSE(arch_scale_freq_key); + +static u64 arch_turbo_freq_ratio = SCHED_CAPACITY_SCALE; +static u64 arch_max_freq_ratio = SCHED_CAPACITY_SCALE; + +void arch_set_max_freq_ratio(bool turbo_disabled) { - u64 aperf, aperf_delta; - u64 mperf, mperf_delta; - struct aperfmperf_sample *s = this_cpu_ptr(&samples); - unsigned long flags; + arch_max_freq_ratio = turbo_disabled ? 
SCHED_CAPACITY_SCALE : + arch_turbo_freq_ratio; +} +EXPORT_SYMBOL_GPL(arch_set_max_freq_ratio); - local_irq_save(flags); - rdmsrl(MSR_IA32_APERF, aperf); - rdmsrl(MSR_IA32_MPERF, mperf); - local_irq_restore(flags); +static bool __init turbo_disabled(void) +{ + u64 misc_en; + int err; + + err = rdmsrl_safe(MSR_IA32_MISC_ENABLE, &misc_en); + if (err) + return false; + + return (misc_en & MSR_IA32_MISC_ENABLE_TURBO_DISABLE); +} + +static bool __init slv_set_max_freq_ratio(u64 *base_freq, u64 *turbo_freq) +{ + int err; + + err = rdmsrl_safe(MSR_ATOM_CORE_RATIOS, base_freq); + if (err) + return false; + + err = rdmsrl_safe(MSR_ATOM_CORE_TURBO_RATIOS, turbo_freq); + if (err) + return false; + + *base_freq = (*base_freq >> 16) & 0x3F; /* max P state */ + *turbo_freq = *turbo_freq & 0x3F; /* 1C turbo */ + + return true; +} + +#define X86_MATCH(model) \ + X86_MATCH_VENDOR_FAM_MODEL_FEATURE(INTEL, 6, \ + INTEL_FAM6_##model, X86_FEATURE_APERFMPERF, NULL) + +static const struct x86_cpu_id has_knl_turbo_ratio_limits[] __initconst = { + X86_MATCH(XEON_PHI_KNL), + X86_MATCH(XEON_PHI_KNM), + {} +}; + +static const struct x86_cpu_id has_skx_turbo_ratio_limits[] __initconst = { + X86_MATCH(SKYLAKE_X), + {} +}; + +static const struct x86_cpu_id has_glm_turbo_ratio_limits[] __initconst = { + X86_MATCH(ATOM_GOLDMONT), + X86_MATCH(ATOM_GOLDMONT_D), + X86_MATCH(ATOM_GOLDMONT_PLUS), + {} +}; + +static bool __init knl_set_max_freq_ratio(u64 *base_freq, u64 *turbo_freq, + int num_delta_fratio) +{ + int fratio, delta_fratio, found; + int err, i; + u64 msr; + + err = rdmsrl_safe(MSR_PLATFORM_INFO, base_freq); + if (err) + return false; + + *base_freq = (*base_freq >> 8) & 0xFF; /* max P state */ + + err = rdmsrl_safe(MSR_TURBO_RATIO_LIMIT, &msr); + if (err) + return false; + + fratio = (msr >> 8) & 0xFF; + i = 16; + found = 0; + do { + if (found >= num_delta_fratio) { + *turbo_freq = fratio; + return true; + } + + delta_fratio = (msr >> (i + 5)) & 0x7; + + if (delta_fratio) { + found += 1; + fratio -= delta_fratio; + } + + i += 8; + } while (i < 64); + + return true; +} + +static bool __init skx_set_max_freq_ratio(u64 *base_freq, u64 *turbo_freq, int size) +{ + u64 ratios, counts; + u32 group_size; + int err, i; + + err = rdmsrl_safe(MSR_PLATFORM_INFO, base_freq); + if (err) + return false; + + *base_freq = (*base_freq >> 8) & 0xFF; /* max P state */ + + err = rdmsrl_safe(MSR_TURBO_RATIO_LIMIT, &ratios); + if (err) + return false; + + err = rdmsrl_safe(MSR_TURBO_RATIO_LIMIT1, &counts); + if (err) + return false; + + for (i = 0; i < 64; i += 8) { + group_size = (counts >> i) & 0xFF; + if (group_size >= size) { + *turbo_freq = (ratios >> i) & 0xFF; + return true; + } + } + + return false; +} - aperf_delta = aperf - s->aperf; - mperf_delta = mperf - s->mperf; +static bool __init core_set_max_freq_ratio(u64 *base_freq, u64 *turbo_freq) +{ + u64 msr; + int err; + + err = rdmsrl_safe(MSR_PLATFORM_INFO, base_freq); + if (err) + return false; + + err = rdmsrl_safe(MSR_TURBO_RATIO_LIMIT, &msr); + if (err) + return false; + + *base_freq = (*base_freq >> 8) & 0xFF; /* max P state */ + *turbo_freq = (msr >> 24) & 0xFF; /* 4C turbo */ + + /* The CPU may have less than 4 cores */ + if (!*turbo_freq) + *turbo_freq = msr & 0xFF; /* 1C turbo */ + + return true; +} + +static bool __init intel_set_max_freq_ratio(void) +{ + u64 base_freq, turbo_freq; + u64 turbo_ratio; + if (slv_set_max_freq_ratio(&base_freq, &turbo_freq)) + goto out; + + if (x86_match_cpu(has_glm_turbo_ratio_limits) && + skx_set_max_freq_ratio(&base_freq, &turbo_freq, 
1)) + goto out; + + if (x86_match_cpu(has_knl_turbo_ratio_limits) && + knl_set_max_freq_ratio(&base_freq, &turbo_freq, 1)) + goto out; + + if (x86_match_cpu(has_skx_turbo_ratio_limits) && + skx_set_max_freq_ratio(&base_freq, &turbo_freq, 4)) + goto out; + + if (core_set_max_freq_ratio(&base_freq, &turbo_freq)) + goto out; + + return false; + +out: /* - * There is no architectural guarantee that MPERF - * increments faster than we can read it. + * Some hypervisors advertise X86_FEATURE_APERFMPERF + * but then fill all MSR's with zeroes. + * Some CPUs have turbo boost but don't declare any turbo ratio + * in MSR_TURBO_RATIO_LIMIT. */ - if (mperf_delta == 0) - return; + if (!base_freq || !turbo_freq) { + pr_debug("Couldn't determine cpu base or turbo frequency, necessary for scale-invariant accounting.\n"); + return false; + } - s->time = ktime_get(); - s->aperf = aperf; - s->mperf = mperf; - s->khz = div64_u64((cpu_khz * aperf_delta), mperf_delta); - atomic_set_release(&s->scfpending, 0); + turbo_ratio = div_u64(turbo_freq * SCHED_CAPACITY_SCALE, base_freq); + if (!turbo_ratio) { + pr_debug("Non-zero turbo and base frequencies led to a 0 ratio.\n"); + return false; + } + + arch_turbo_freq_ratio = turbo_ratio; + arch_set_max_freq_ratio(turbo_disabled()); + + return true; } -static bool aperfmperf_snapshot_cpu(int cpu, ktime_t now, bool wait) +#ifdef CONFIG_PM_SLEEP +static struct syscore_ops freq_invariance_syscore_ops = { + .resume = init_counter_refs, +}; + +static void register_freq_invariance_syscore_ops(void) { - s64 time_delta = ktime_ms_delta(now, per_cpu(samples.time, cpu)); - struct aperfmperf_sample *s = per_cpu_ptr(&samples, cpu); + register_syscore_ops(&freq_invariance_syscore_ops); +} +#else +static inline void register_freq_invariance_syscore_ops(void) {} +#endif - /* Don't bother re-computing within the cache threshold time. */ - if (time_delta < APERFMPERF_CACHE_THRESHOLD_MS) - return true; +static void freq_invariance_enable(void) +{ + if (static_branch_unlikely(&arch_scale_freq_key)) { + WARN_ON_ONCE(1); + return; + } + static_branch_enable(&arch_scale_freq_key); + register_freq_invariance_syscore_ops(); + pr_info("Estimated ratio of average max frequency by base frequency (times 1024): %llu\n", arch_max_freq_ratio); +} + +void freq_invariance_set_perf_ratio(u64 ratio, bool turbo_disabled) +{ + arch_turbo_freq_ratio = ratio; + arch_set_max_freq_ratio(turbo_disabled); + freq_invariance_enable(); +} + +static void __init bp_init_freq_invariance(void) +{ + if (boot_cpu_data.x86_vendor != X86_VENDOR_INTEL) + return; - if (!atomic_xchg(&s->scfpending, 1) || wait) - smp_call_function_single(cpu, aperfmperf_snapshot_khz, NULL, wait); + if (intel_set_max_freq_ratio()) + freq_invariance_enable(); +} - /* Return false if the previous iteration was too long ago. 
*/ - return time_delta <= APERFMPERF_STALE_THRESHOLD_MS; +static void disable_freq_invariance_workfn(struct work_struct *work) +{ + static_branch_disable(&arch_scale_freq_key); } -unsigned int aperfmperf_get_khz(int cpu) +static DECLARE_WORK(disable_freq_invariance_work, + disable_freq_invariance_workfn); + +DEFINE_PER_CPU(unsigned long, arch_freq_scale) = SCHED_CAPACITY_SCALE; + +static void scale_freq_tick(u64 acnt, u64 mcnt) { - if (!cpu_khz) - return 0; + u64 freq_scale; - if (!boot_cpu_has(X86_FEATURE_APERFMPERF)) - return 0; + if (!arch_scale_freq_invariant()) + return; - if (!housekeeping_cpu(cpu, HK_TYPE_MISC)) - return 0; + if (check_shl_overflow(acnt, 2*SCHED_CAPACITY_SHIFT, &acnt)) + goto error; - if (rcu_is_idle_cpu(cpu)) - return 0; /* Idle CPUs are completely uninteresting. */ + if (check_mul_overflow(mcnt, arch_max_freq_ratio, &mcnt) || !mcnt) + goto error; - aperfmperf_snapshot_cpu(cpu, ktime_get(), true); - return per_cpu(samples.khz, cpu); + freq_scale = div64_u64(acnt, mcnt); + if (!freq_scale) + goto error; + + if (freq_scale > SCHED_CAPACITY_SCALE) + freq_scale = SCHED_CAPACITY_SCALE; + + this_cpu_write(arch_freq_scale, freq_scale); + return; + +error: + pr_warn("Scheduler frequency invariance went wobbly, disabling!\n"); + schedule_work(&disable_freq_invariance_work); } +#else +static inline void bp_init_freq_invariance(void) { } +static inline void scale_freq_tick(u64 acnt, u64 mcnt) { } +#endif /* CONFIG_X86_64 && CONFIG_SMP */ -void arch_freq_prepare_all(void) +void arch_scale_freq_tick(void) { - ktime_t now = ktime_get(); - bool wait = false; - int cpu; + struct aperfmperf *s = this_cpu_ptr(&cpu_samples); + u64 acnt, mcnt, aperf, mperf; - if (!cpu_khz) + if (!cpu_feature_enabled(X86_FEATURE_APERFMPERF)) return; - if (!boot_cpu_has(X86_FEATURE_APERFMPERF)) - return; + rdmsrl(MSR_IA32_APERF, aperf); + rdmsrl(MSR_IA32_MPERF, mperf); + acnt = aperf - s->aperf; + mcnt = mperf - s->mperf; - for_each_online_cpu(cpu) { - if (!housekeeping_cpu(cpu, HK_TYPE_MISC)) - continue; - if (rcu_is_idle_cpu(cpu)) - continue; /* Idle CPUs are completely uninteresting. */ - if (!aperfmperf_snapshot_cpu(cpu, now, false)) - wait = true; - } + s->aperf = aperf; + s->mperf = mperf; + + raw_write_seqcount_begin(&s->seq); + s->last_update = jiffies; + s->acnt = acnt; + s->mcnt = mcnt; + raw_write_seqcount_end(&s->seq); - if (wait) - msleep(APERFMPERF_REFRESH_DELAY_MS); + scale_freq_tick(acnt, mcnt); } +/* + * Discard samples older than the define maximum sample age of 20ms. There + * is no point in sending IPIs in such a case. If the scheduler tick was + * not running then the CPU is either idle or isolated. + */ +#define MAX_SAMPLE_AGE ((unsigned long)HZ / 50) + unsigned int arch_freq_get_on_cpu(int cpu) { - struct aperfmperf_sample *s = per_cpu_ptr(&samples, cpu); + struct aperfmperf *s = per_cpu_ptr(&cpu_samples, cpu); + unsigned int seq, freq; + unsigned long last; + u64 acnt, mcnt; - if (!cpu_khz) - return 0; + if (!cpu_feature_enabled(X86_FEATURE_APERFMPERF)) + goto fallback; - if (!boot_cpu_has(X86_FEATURE_APERFMPERF)) - return 0; + do { + seq = raw_read_seqcount_begin(&s->seq); + last = s->last_update; + acnt = s->acnt; + mcnt = s->mcnt; + } while (read_seqcount_retry(&s->seq, seq)); - if (!housekeeping_cpu(cpu, HK_TYPE_MISC)) - return 0; + /* + * Bail on invalid count and when the last update was too long ago, + * which covers idle and NOHZ full CPUs. 
+ */ + if (!mcnt || (jiffies - last) > MAX_SAMPLE_AGE) + goto fallback; + + return div64_u64((cpu_khz * acnt), mcnt); + +fallback: + freq = cpufreq_quick_get(cpu); + return freq ? freq : cpu_khz; +} - if (aperfmperf_snapshot_cpu(cpu, ktime_get(), true)) - return per_cpu(samples.khz, cpu); +static int __init bp_init_aperfmperf(void) +{ + if (!cpu_feature_enabled(X86_FEATURE_APERFMPERF)) + return 0; - msleep(APERFMPERF_REFRESH_DELAY_MS); - atomic_set(&s->scfpending, 1); - smp_mb(); /* ->scfpending before smp_call_function_single(). */ - smp_call_function_single(cpu, aperfmperf_snapshot_khz, NULL, 1); + init_counter_refs(); + bp_init_freq_invariance(); + return 0; +} +early_initcall(bp_init_aperfmperf); - return per_cpu(samples.khz, cpu); +void ap_init_aperfmperf(void) +{ + if (cpu_feature_enabled(X86_FEATURE_APERFMPERF)) + init_counter_refs(); } diff --git a/arch/x86/kernel/cpu/bugs.c b/arch/x86/kernel/cpu/bugs.c index 6296e1ebed1d..d879a6c93609 100644 --- a/arch/x86/kernel/cpu/bugs.c +++ b/arch/x86/kernel/cpu/bugs.c @@ -446,6 +446,13 @@ void update_srbds_msr(void) if (srbds_mitigation == SRBDS_MITIGATION_UCODE_NEEDED) return; + /* + * A MDS_NO CPU for which SRBDS mitigation is not needed due to TSX + * being disabled and it hasn't received the SRBDS MSR microcode. + */ + if (!boot_cpu_has(X86_FEATURE_SRBDS_CTRL)) + return; + rdmsrl(MSR_IA32_MCU_OPT_CTRL, mcu_ctrl); switch (srbds_mitigation) { diff --git a/arch/x86/kernel/cpu/common.c b/arch/x86/kernel/cpu/common.c index e342ae4db3c4..2e9142797c99 100644 --- a/arch/x86/kernel/cpu/common.c +++ b/arch/x86/kernel/cpu/common.c @@ -60,6 +60,7 @@ #include <asm/uv/uv.h> #include <asm/sigframe.h> #include <asm/traps.h> +#include <asm/sev.h> #include "cpu.h" @@ -298,13 +299,6 @@ static int __init cachesize_setup(char *str) } __setup("cachesize=", cachesize_setup); -static int __init x86_sep_setup(char *s) -{ - setup_clear_cpu_cap(X86_FEATURE_SEP); - return 1; -} -__setup("nosep", x86_sep_setup); - /* Standard macro to see if a specific flag is changeable */ static inline int flag_is_changeable_p(u32 flag) { @@ -376,26 +370,12 @@ static inline void squash_the_stupid_serial_number(struct cpuinfo_x86 *c) } #endif -static __init int setup_disable_smep(char *arg) -{ - setup_clear_cpu_cap(X86_FEATURE_SMEP); - return 1; -} -__setup("nosmep", setup_disable_smep); - static __always_inline void setup_smep(struct cpuinfo_x86 *c) { if (cpu_has(c, X86_FEATURE_SMEP)) cr4_set_bits(X86_CR4_SMEP); } -static __init int setup_disable_smap(char *arg) -{ - setup_clear_cpu_cap(X86_FEATURE_SMAP); - return 1; -} -__setup("nosmap", setup_disable_smap); - static __always_inline void setup_smap(struct cpuinfo_x86 *c) { unsigned long eflags = native_save_fl(); @@ -403,14 +383,8 @@ static __always_inline void setup_smap(struct cpuinfo_x86 *c) /* This should have been cleared long ago */ BUG_ON(eflags & X86_EFLAGS_AC); - if (cpu_has(c, X86_FEATURE_SMAP)) { -#ifdef CONFIG_X86_SMAP + if (cpu_has(c, X86_FEATURE_SMAP)) cr4_set_bits(X86_CR4_SMAP); -#else - clear_cpu_cap(c, X86_FEATURE_SMAP); - cr4_clear_bits(X86_CR4_SMAP); -#endif - } } static __always_inline void setup_umip(struct cpuinfo_x86 *c) @@ -1368,8 +1342,8 @@ static void detect_nopl(void) static void __init cpu_parse_early_param(void) { char arg[128]; - char *argptr = arg; - int arglen, res, bit; + char *argptr = arg, *opt; + int arglen, taint = 0; #ifdef CONFIG_X86_32 if (cmdline_find_option_bool(boot_command_line, "no387")) @@ -1397,21 +1371,61 @@ static void __init cpu_parse_early_param(void) return; pr_info("Clearing 
CPUID bits:"); - do { - res = get_option(&argptr, &bit); - if (res == 0 || res == 3) - break; - /* If the argument was too long, the last bit may be cut off */ - if (res == 1 && arglen >= sizeof(arg)) - break; + while (argptr) { + bool found __maybe_unused = false; + unsigned int bit; + + opt = strsep(&argptr, ","); + + /* + * Handle naked numbers first for feature flags which don't + * have names. + */ + if (!kstrtouint(opt, 10, &bit)) { + if (bit < NCAPINTS * 32) { + +#ifdef CONFIG_X86_FEATURE_NAMES + /* empty-string, i.e., ""-defined feature flags */ + if (!x86_cap_flags[bit]) + pr_cont(" " X86_CAP_FMT_NUM, x86_cap_flag_num(bit)); + else +#endif + pr_cont(" " X86_CAP_FMT, x86_cap_flag(bit)); + + setup_clear_cpu_cap(bit); + taint++; + } + /* + * The assumption is that there are no feature names with only + * numbers in the name thus go to the next argument. + */ + continue; + } + +#ifdef CONFIG_X86_FEATURE_NAMES + for (bit = 0; bit < 32 * NCAPINTS; bit++) { + if (!x86_cap_flag(bit)) + continue; - if (bit >= 0 && bit < NCAPINTS * 32) { - pr_cont(" " X86_CAP_FMT, x86_cap_flag(bit)); + if (strcmp(x86_cap_flag(bit), opt)) + continue; + + pr_cont(" %s", opt); setup_clear_cpu_cap(bit); + taint++; + found = true; + break; } - } while (res == 2); + + if (!found) + pr_cont(" (unknown: %s)", opt); +#endif + } pr_cont("\n"); + + if (taint) + add_taint(TAINT_CPU_OUT_OF_SPEC, LOCKDEP_STILL_OK); } /* @@ -1859,14 +1873,6 @@ void identify_secondary_cpu(struct cpuinfo_x86 *c) tsx_ap_init(); } -static __init int setup_noclflush(char *arg) -{ - setup_clear_cpu_cap(X86_FEATURE_CLFLUSH); - setup_clear_cpu_cap(X86_FEATURE_CLFLUSHOPT); - return 1; -} -__setup("noclflush", setup_noclflush); - void print_cpu_info(struct cpuinfo_x86 *c) { const char *vendor = NULL; @@ -2126,6 +2132,9 @@ void cpu_init_exception_handling(void) load_TR_desc(); + /* GHCB needs to be setup to handle #VC. 
*/ + setup_ghcb(); + /* Finally load the IDT */ load_current_idt(); } diff --git a/arch/x86/kernel/cpu/intel.c b/arch/x86/kernel/cpu/intel.c index f7a5370a9b3b..fd5dead8371c 100644 --- a/arch/x86/kernel/cpu/intel.c +++ b/arch/x86/kernel/cpu/intel.c @@ -7,10 +7,13 @@ #include <linux/smp.h> #include <linux/sched.h> #include <linux/sched/clock.h> +#include <linux/semaphore.h> #include <linux/thread_info.h> #include <linux/init.h> #include <linux/uaccess.h> +#include <linux/workqueue.h> #include <linux/delay.h> +#include <linux/cpuhotplug.h> #include <asm/cpufeature.h> #include <asm/msr.h> @@ -91,7 +94,7 @@ static bool ring3mwait_disabled __read_mostly; static int __init ring3mwait_disable(char *__unused) { ring3mwait_disabled = true; - return 0; + return 1; } __setup("ring3mwait=disable", ring3mwait_disable); @@ -181,6 +184,38 @@ static bool bad_spectre_microcode(struct cpuinfo_x86 *c) return false; } +int intel_cpu_collect_info(struct ucode_cpu_info *uci) +{ + unsigned int val[2]; + unsigned int family, model; + struct cpu_signature csig = { 0 }; + unsigned int eax, ebx, ecx, edx; + + memset(uci, 0, sizeof(*uci)); + + eax = 0x00000001; + ecx = 0; + native_cpuid(&eax, &ebx, &ecx, &edx); + csig.sig = eax; + + family = x86_family(eax); + model = x86_model(eax); + + if (model >= 5 || family > 6) { + /* get processor flags from MSR 0x17 */ + native_rdmsr(MSR_IA32_PLATFORM_ID, val[0], val[1]); + csig.pf = 1 << ((val[1] >> 18) & 7); + } + + csig.rev = intel_get_microcode_revision(); + + uci->cpu_sig = csig; + uci->valid = 1; + + return 0; +} +EXPORT_SYMBOL_GPL(intel_cpu_collect_info); + static void early_init_intel(struct cpuinfo_x86 *c) { u64 misc_enable; @@ -999,6 +1034,8 @@ static const struct { static struct ratelimit_state bld_ratelimit; +static DEFINE_SEMAPHORE(buslock_sem); + static inline bool match_option(const char *arg, int arglen, const char *opt) { int len = strlen(opt), ratelimit; @@ -1109,18 +1146,52 @@ static void split_lock_init(void) split_lock_verify_msr(sld_state != sld_off); } +static void __split_lock_reenable(struct work_struct *work) +{ + sld_update_msr(true); + up(&buslock_sem); +} + +/* + * If a CPU goes offline with pending delayed work to re-enable split lock + * detection then the delayed work will be executed on some other CPU. That + * handles releasing the buslock_sem, but because it executes on a + * different CPU probably won't re-enable split lock detection. This is a + * problem on HT systems since the sibling CPU on the same core may then be + * left running with split lock detection disabled. + * + * Unconditionally re-enable detection here. + */ +static int splitlock_cpu_offline(unsigned int cpu) +{ + sld_update_msr(true); + + return 0; +} + +static DECLARE_DELAYED_WORK(split_lock_reenable, __split_lock_reenable); + static void split_lock_warn(unsigned long ip) { - pr_warn_ratelimited("#AC: %s/%d took a split_lock trap at address: 0x%lx\n", - current->comm, current->pid, ip); + int cpu; - /* - * Disable the split lock detection for this task so it can make - * progress and set TIF_SLD so the detection is re-enabled via - * switch_to_sld() when the task is scheduled out. 
- */ + if (!current->reported_split_lock) + pr_warn_ratelimited("#AC: %s/%d took a split_lock trap at address: 0x%lx\n", + current->comm, current->pid, ip); + current->reported_split_lock = 1; + + /* misery factor #1, sleep 10ms before trying to execute split lock */ + if (msleep_interruptible(10) > 0) + return; + /* Misery factor #2, only allow one buslocked disabled core at a time */ + if (down_interruptible(&buslock_sem) == -EINTR) + return; + cpu = get_cpu(); + schedule_delayed_work_on(cpu, &split_lock_reenable, 2); + + /* Disable split lock detection on this CPU to make progress */ sld_update_msr(false); - set_tsk_thread_flag(current, TIF_SLD); + put_cpu(); } bool handle_guest_split_lock(unsigned long ip) @@ -1194,18 +1265,6 @@ void handle_bus_lock(struct pt_regs *regs) } /* - * This function is called only when switching between tasks with - * different split-lock detection modes. It sets the MSR for the - * mode of the new task. This is right most of the time, but since - * the MSR is shared by hyperthreads on a physical core there can - * be glitches when the two threads need different modes. - */ -void switch_to_sld(unsigned long tifn) -{ - sld_update_msr(!(tifn & _TIF_SLD)); -} - -/* * Bits in the IA32_CORE_CAPABILITIES are not architectural, so they should * only be trusted if it is confirmed that a CPU model implements a * specific feature at a particular bit position. @@ -1230,6 +1289,7 @@ static const struct x86_cpu_id split_lock_cpu_ids[] __initconst = { X86_MATCH_INTEL_FAM6_MODEL(SAPPHIRERAPIDS_X, 1), X86_MATCH_INTEL_FAM6_MODEL(ALDERLAKE, 1), X86_MATCH_INTEL_FAM6_MODEL(ALDERLAKE_L, 1), + X86_MATCH_INTEL_FAM6_MODEL(RAPTORLAKE, 1), {} }; @@ -1274,10 +1334,14 @@ static void sld_state_show(void) pr_info("disabled\n"); break; case sld_warn: - if (boot_cpu_has(X86_FEATURE_SPLIT_LOCK_DETECT)) + if (boot_cpu_has(X86_FEATURE_SPLIT_LOCK_DETECT)) { pr_info("#AC: crashing the kernel on kernel split_locks and warning on user-space split_locks\n"); - else if (boot_cpu_has(X86_FEATURE_BUS_LOCK_DETECT)) + if (cpuhp_setup_state(CPUHP_AP_ONLINE_DYN, + "x86/splitlock", NULL, splitlock_cpu_offline) < 0) + pr_warn("No splitlock CPU offline handler\n"); + } else if (boot_cpu_has(X86_FEATURE_BUS_LOCK_DETECT)) { pr_info("#DB: warning on user-space bus_locks\n"); + } break; case sld_fatal: if (boot_cpu_has(X86_FEATURE_SPLIT_LOCK_DETECT)) { diff --git a/arch/x86/kernel/cpu/mce/amd.c b/arch/x86/kernel/cpu/mce/amd.c index 1940d305db1c..1c87501e0fa3 100644 --- a/arch/x86/kernel/cpu/mce/amd.c +++ b/arch/x86/kernel/cpu/mce/amd.c @@ -1294,10 +1294,23 @@ out_free: kfree(bank); } +static void __threshold_remove_device(struct threshold_bank **bp) +{ + unsigned int bank, numbanks = this_cpu_read(mce_num_banks); + + for (bank = 0; bank < numbanks; bank++) { + if (!bp[bank]) + continue; + + threshold_remove_bank(bp[bank]); + bp[bank] = NULL; + } + kfree(bp); +} + int mce_threshold_remove_device(unsigned int cpu) { struct threshold_bank **bp = this_cpu_read(threshold_banks); - unsigned int bank, numbanks = this_cpu_read(mce_num_banks); if (!bp) return 0; @@ -1308,13 +1321,7 @@ int mce_threshold_remove_device(unsigned int cpu) */ this_cpu_write(threshold_banks, NULL); - for (bank = 0; bank < numbanks; bank++) { - if (bp[bank]) { - threshold_remove_bank(bp[bank]); - bp[bank] = NULL; - } - } - kfree(bp); + __threshold_remove_device(bp); return 0; } @@ -1351,15 +1358,14 @@ int mce_threshold_create_device(unsigned int cpu) if (!(this_cpu_read(bank_map) & (1 << bank))) continue; err = threshold_create_bank(bp, cpu, 
bank); - if (err) - goto out_err; + if (err) { + __threshold_remove_device(bp); + return err; + } } this_cpu_write(threshold_banks, bp); if (thresholding_irq_en) mce_threshold_vector = amd_threshold_interrupt; return 0; -out_err: - mce_threshold_remove_device(cpu); - return err; } diff --git a/arch/x86/kernel/cpu/mce/core.c b/arch/x86/kernel/cpu/mce/core.c index 981496e6bc0e..d775fcd74e98 100644 --- a/arch/x86/kernel/cpu/mce/core.c +++ b/arch/x86/kernel/cpu/mce/core.c @@ -69,7 +69,9 @@ DEFINE_PER_CPU_READ_MOSTLY(unsigned int, mce_num_banks); struct mce_bank { u64 ctl; /* subevents to enable */ - bool init; /* initialise bank? */ + + __u64 init : 1, /* initialise bank? */ + __reserved_1 : 63; }; static DEFINE_PER_CPU_READ_MOSTLY(struct mce_bank[MAX_NR_BANKS], mce_banks_array); diff --git a/arch/x86/kernel/cpu/mce/severity.c b/arch/x86/kernel/cpu/mce/severity.c index 1add86935349..00483d1c27e4 100644 --- a/arch/x86/kernel/cpu/mce/severity.c +++ b/arch/x86/kernel/cpu/mce/severity.c @@ -301,85 +301,65 @@ static noinstr int error_context(struct mce *m, struct pt_regs *regs) } } -static __always_inline int mce_severity_amd_smca(struct mce *m, enum context err_ctx) +/* See AMD PPR(s) section Machine Check Error Handling. */ +static noinstr int mce_severity_amd(struct mce *m, struct pt_regs *regs, char **msg, bool is_excp) { - u64 mcx_cfg; + char *panic_msg = NULL; + int ret; /* - * We need to look at the following bits: - * - "succor" bit (data poisoning support), and - * - TCC bit (Task Context Corrupt) - * in MCi_STATUS to determine error severity. + * Default return value: Action required, the error must be handled + * immediately. */ - if (!mce_flags.succor) - return MCE_PANIC_SEVERITY; - - mcx_cfg = mce_rdmsrl(MSR_AMD64_SMCA_MCx_CONFIG(m->bank)); - - /* TCC (Task context corrupt). If set and if IN_KERNEL, panic. */ - if ((mcx_cfg & MCI_CONFIG_MCAX) && - (m->status & MCI_STATUS_TCC) && - (err_ctx == IN_KERNEL)) - return MCE_PANIC_SEVERITY; - - /* ...otherwise invoke hwpoison handler. */ - return MCE_AR_SEVERITY; -} - -/* - * See AMD Error Scope Hierarchy table in a newer BKDG. For example - * 49125_15h_Models_30h-3Fh_BKDG.pdf, section "RAS Features" - */ -static noinstr int mce_severity_amd(struct mce *m, struct pt_regs *regs, char **msg, bool is_excp) -{ - enum context ctx = error_context(m, regs); + ret = MCE_AR_SEVERITY; /* Processor Context Corrupt, no need to fumble too much, die! */ - if (m->status & MCI_STATUS_PCC) - return MCE_PANIC_SEVERITY; - - if (m->status & MCI_STATUS_UC) { - - if (ctx == IN_KERNEL) - return MCE_PANIC_SEVERITY; + if (m->status & MCI_STATUS_PCC) { + panic_msg = "Processor Context Corrupt"; + ret = MCE_PANIC_SEVERITY; + goto out; + } - /* - * On older systems where overflow_recov flag is not present, we - * should simply panic if an error overflow occurs. If - * overflow_recov flag is present and set, then software can try - * to at least kill process to prolong system operation. - */ - if (mce_flags.overflow_recov) { - if (mce_flags.smca) - return mce_severity_amd_smca(m, ctx); - - /* kill current process */ - return MCE_AR_SEVERITY; - } else { - /* at least one error was not logged */ - if (m->status & MCI_STATUS_OVER) - return MCE_PANIC_SEVERITY; - } - - /* - * For any other case, return MCE_UC_SEVERITY so that we log the - * error and exit #MC handler. 
- */ - return MCE_UC_SEVERITY; + if (m->status & MCI_STATUS_DEFERRED) { + ret = MCE_DEFERRED_SEVERITY; + goto out; } /* - * deferred error: poll handler catches these and adds to mce_ring so - * memory-failure can take recovery actions. + * If the UC bit is not set, the system either corrected or deferred + * the error. No action will be required after logging the error. */ - if (m->status & MCI_STATUS_DEFERRED) - return MCE_DEFERRED_SEVERITY; + if (!(m->status & MCI_STATUS_UC)) { + ret = MCE_KEEP_SEVERITY; + goto out; + } /* - * corrected error: poll handler catches these and passes responsibility - * of decoding the error to EDAC + * On MCA overflow, without the MCA overflow recovery feature the + * system will not be able to recover, panic. */ - return MCE_KEEP_SEVERITY; + if ((m->status & MCI_STATUS_OVER) && !mce_flags.overflow_recov) { + panic_msg = "Overflowed uncorrected error without MCA Overflow Recovery"; + ret = MCE_PANIC_SEVERITY; + goto out; + } + + if (!mce_flags.succor) { + panic_msg = "Uncorrected error without MCA Recovery"; + ret = MCE_PANIC_SEVERITY; + goto out; + } + + if (error_context(m, regs) == IN_KERNEL) { + panic_msg = "Uncorrected unrecoverable error in kernel context"; + ret = MCE_PANIC_SEVERITY; + } + +out: + if (msg && panic_msg) + *msg = panic_msg; + + return ret; } static noinstr int mce_severity_intel(struct mce *m, struct pt_regs *regs, char **msg, bool is_excp) diff --git a/arch/x86/kernel/cpu/microcode/core.c b/arch/x86/kernel/cpu/microcode/core.c index f955d25076ba..239ff5fcec6a 100644 --- a/arch/x86/kernel/cpu/microcode/core.c +++ b/arch/x86/kernel/cpu/microcode/core.c @@ -758,9 +758,9 @@ static struct subsys_interface mc_cpu_interface = { }; /** - * mc_bp_resume - Update boot CPU microcode during resume. + * microcode_bsp_resume - Update boot CPU microcode during resume. */ -static void mc_bp_resume(void) +void microcode_bsp_resume(void) { int cpu = smp_processor_id(); struct ucode_cpu_info *uci = ucode_cpu_info + cpu; @@ -772,7 +772,7 @@ static void mc_bp_resume(void) } static struct syscore_ops mc_syscore_ops = { - .resume = mc_bp_resume, + .resume = microcode_bsp_resume, }; static int mc_cpu_starting(unsigned int cpu) diff --git a/arch/x86/kernel/cpu/microcode/intel.c b/arch/x86/kernel/cpu/microcode/intel.c index d28a9f8f3fec..025c8f0cd948 100644 --- a/arch/x86/kernel/cpu/microcode/intel.c +++ b/arch/x86/kernel/cpu/microcode/intel.c @@ -45,20 +45,6 @@ static struct microcode_intel *intel_ucode_patch; /* last level cache size per core */ static int llc_size_per_core; -static inline bool cpu_signatures_match(unsigned int s1, unsigned int p1, - unsigned int s2, unsigned int p2) -{ - if (s1 != s2) - return false; - - /* Processor flags are either both 0 ... */ - if (!p1 && !p2) - return true; - - /* ... or they intersect. */ - return p1 & p2; -} - /* * Returns 1 if update has been found, 0 otherwise. */ @@ -69,7 +55,7 @@ static int find_matching_signature(void *mc, unsigned int csig, int cpf) struct extended_signature *ext_sig; int i; - if (cpu_signatures_match(csig, cpf, mc_hdr->sig, mc_hdr->pf)) + if (intel_cpu_signatures_match(csig, cpf, mc_hdr->sig, mc_hdr->pf)) return 1; /* Look for ext. 
headers: */ @@ -80,7 +66,7 @@ static int find_matching_signature(void *mc, unsigned int csig, int cpf) ext_sig = (void *)ext_hdr + EXT_HEADER_SIZE; for (i = 0; i < ext_hdr->count; i++) { - if (cpu_signatures_match(csig, cpf, ext_sig->sig, ext_sig->pf)) + if (intel_cpu_signatures_match(csig, cpf, ext_sig->sig, ext_sig->pf)) return 1; ext_sig++; } @@ -342,37 +328,6 @@ next: return patch; } -static int collect_cpu_info_early(struct ucode_cpu_info *uci) -{ - unsigned int val[2]; - unsigned int family, model; - struct cpu_signature csig = { 0 }; - unsigned int eax, ebx, ecx, edx; - - memset(uci, 0, sizeof(*uci)); - - eax = 0x00000001; - ecx = 0; - native_cpuid(&eax, &ebx, &ecx, &edx); - csig.sig = eax; - - family = x86_family(eax); - model = x86_model(eax); - - if ((model >= 5) || (family > 6)) { - /* get processor flags from MSR 0x17 */ - native_rdmsr(MSR_IA32_PLATFORM_ID, val[0], val[1]); - csig.pf = 1 << ((val[1] >> 18) & 7); - } - - csig.rev = intel_get_microcode_revision(); - - uci->cpu_sig = csig; - uci->valid = 1; - - return 0; -} - static void show_saved_mc(void) { #ifdef DEBUG @@ -386,7 +341,7 @@ static void show_saved_mc(void) return; } - collect_cpu_info_early(&uci); + intel_cpu_collect_info(&uci); sig = uci.cpu_sig.sig; pf = uci.cpu_sig.pf; @@ -502,7 +457,7 @@ void show_ucode_info_early(void) struct ucode_cpu_info uci; if (delay_ucode_info) { - collect_cpu_info_early(&uci); + intel_cpu_collect_info(&uci); print_ucode_info(&uci, current_mc_date); delay_ucode_info = 0; } @@ -604,7 +559,7 @@ int __init save_microcode_in_initrd_intel(void) if (!(cp.data && cp.size)) return 0; - collect_cpu_info_early(&uci); + intel_cpu_collect_info(&uci); scan_microcode(cp.data, cp.size, &uci, true); @@ -637,7 +592,7 @@ static struct microcode_intel *__load_ucode_intel(struct ucode_cpu_info *uci) if (!(cp.data && cp.size)) return NULL; - collect_cpu_info_early(uci); + intel_cpu_collect_info(uci); return scan_microcode(cp.data, cp.size, uci, false); } @@ -712,7 +667,7 @@ void reload_ucode_intel(void) struct microcode_intel *p; struct ucode_cpu_info uci; - collect_cpu_info_early(&uci); + intel_cpu_collect_info(&uci); p = find_patch(&uci); if (!p) diff --git a/arch/x86/kernel/cpu/proc.c b/arch/x86/kernel/cpu/proc.c index 4eec8889b0ff..099b6f0d96bd 100644 --- a/arch/x86/kernel/cpu/proc.c +++ b/arch/x86/kernel/cpu/proc.c @@ -84,14 +84,9 @@ static int show_cpuinfo(struct seq_file *m, void *v) seq_printf(m, "microcode\t: 0x%x\n", c->microcode); if (cpu_has(c, X86_FEATURE_TSC)) { - unsigned int freq = aperfmperf_get_khz(cpu); - - if (!freq) - freq = cpufreq_quick_get(cpu); - if (!freq) - freq = cpu_khz; - seq_printf(m, "cpu MHz\t\t: %u.%03u\n", - freq / 1000, (freq % 1000)); + unsigned int freq = arch_freq_get_on_cpu(cpu); + + seq_printf(m, "cpu MHz\t\t: %u.%03u\n", freq / 1000, (freq % 1000)); } /* Cache size */ diff --git a/arch/x86/kernel/cpu/resctrl/rdtgroup.c b/arch/x86/kernel/cpu/resctrl/rdtgroup.c index 83f901e2c2df..f276aff521e8 100644 --- a/arch/x86/kernel/cpu/resctrl/rdtgroup.c +++ b/arch/x86/kernel/cpu/resctrl/rdtgroup.c @@ -341,14 +341,14 @@ static int cpus_mon_write(struct rdtgroup *rdtgrp, cpumask_var_t newmask, /* Check whether cpus belong to parent ctrl group */ cpumask_andnot(tmpmask, newmask, &prgrp->cpu_mask); - if (cpumask_weight(tmpmask)) { + if (!cpumask_empty(tmpmask)) { rdt_last_cmd_puts("Can only add CPUs to mongroup that belong to parent\n"); return -EINVAL; } /* Check whether cpus are dropped from this group */ cpumask_andnot(tmpmask, &rdtgrp->cpu_mask, newmask); - if 
(cpumask_weight(tmpmask)) { + if (!cpumask_empty(tmpmask)) { /* Give any dropped cpus to parent rdtgroup */ cpumask_or(&prgrp->cpu_mask, &prgrp->cpu_mask, tmpmask); update_closid_rmid(tmpmask, prgrp); @@ -359,7 +359,7 @@ static int cpus_mon_write(struct rdtgroup *rdtgrp, cpumask_var_t newmask, * and update per-cpu rmid */ cpumask_andnot(tmpmask, newmask, &rdtgrp->cpu_mask); - if (cpumask_weight(tmpmask)) { + if (!cpumask_empty(tmpmask)) { head = &prgrp->mon.crdtgrp_list; list_for_each_entry(crgrp, head, mon.crdtgrp_list) { if (crgrp == rdtgrp) @@ -394,7 +394,7 @@ static int cpus_ctrl_write(struct rdtgroup *rdtgrp, cpumask_var_t newmask, /* Check whether cpus are dropped from this group */ cpumask_andnot(tmpmask, &rdtgrp->cpu_mask, newmask); - if (cpumask_weight(tmpmask)) { + if (!cpumask_empty(tmpmask)) { /* Can't drop from default group */ if (rdtgrp == &rdtgroup_default) { rdt_last_cmd_puts("Can't drop CPUs from default group\n"); @@ -413,12 +413,12 @@ static int cpus_ctrl_write(struct rdtgroup *rdtgrp, cpumask_var_t newmask, * and update per-cpu closid/rmid. */ cpumask_andnot(tmpmask, newmask, &rdtgrp->cpu_mask); - if (cpumask_weight(tmpmask)) { + if (!cpumask_empty(tmpmask)) { list_for_each_entry(r, &rdt_all_groups, rdtgroup_list) { if (r == rdtgrp) continue; cpumask_and(tmpmask1, &r->cpu_mask, tmpmask); - if (cpumask_weight(tmpmask1)) + if (!cpumask_empty(tmpmask1)) cpumask_rdtgrp_clear(r, tmpmask1); } update_closid_rmid(tmpmask, rdtgrp); @@ -488,7 +488,7 @@ static ssize_t rdtgroup_cpus_write(struct kernfs_open_file *of, /* check that user didn't specify any offline cpus */ cpumask_andnot(tmpmask, newmask, cpu_online_mask); - if (cpumask_weight(tmpmask)) { + if (!cpumask_empty(tmpmask)) { ret = -EINVAL; rdt_last_cmd_puts("Can only assign online CPUs\n"); goto unlock; diff --git a/arch/x86/kernel/cpu/sgx/encl.c b/arch/x86/kernel/cpu/sgx/encl.c index 7c63a1911fae..3c24e6124d95 100644 --- a/arch/x86/kernel/cpu/sgx/encl.c +++ b/arch/x86/kernel/cpu/sgx/encl.c @@ -12,6 +12,92 @@ #include "encls.h" #include "sgx.h" +#define PCMDS_PER_PAGE (PAGE_SIZE / sizeof(struct sgx_pcmd)) +/* + * 32 PCMD entries share a PCMD page. PCMD_FIRST_MASK is used to + * determine the page index associated with the first PCMD entry + * within a PCMD page. + */ +#define PCMD_FIRST_MASK GENMASK(4, 0) + +/** + * reclaimer_writing_to_pcmd() - Query if any enclave page associated with + * a PCMD page is in process of being reclaimed. + * @encl: Enclave to which PCMD page belongs + * @start_addr: Address of enclave page using first entry within the PCMD page + * + * When an enclave page is reclaimed some Paging Crypto MetaData (PCMD) is + * stored. The PCMD data of a reclaimed enclave page contains enough + * information for the processor to verify the page at the time + * it is loaded back into the Enclave Page Cache (EPC). + * + * The backing storage to which enclave pages are reclaimed is laid out as + * follows: + * Encrypted enclave pages:SECS page:PCMD pages + * + * Each PCMD page contains the PCMD metadata of + * PAGE_SIZE/sizeof(struct sgx_pcmd) enclave pages. + * + * A PCMD page can only be truncated if it is (a) empty, and (b) not in the + * process of getting data (and thus soon being non-empty). (b) is tested with + * a check if an enclave page sharing the PCMD page is in the process of being + * reclaimed. 
+ * + * The reclaimer sets the SGX_ENCL_PAGE_BEING_RECLAIMED flag when it + * intends to reclaim that enclave page - it means that the PCMD page + * associated with that enclave page is about to get some data and thus + * even if the PCMD page is empty, it should not be truncated. + * + * Context: Enclave mutex (&sgx_encl->lock) must be held. + * Return: 1 if the reclaimer is about to write to the PCMD page + * 0 if the reclaimer has no intention to write to the PCMD page + */ +static int reclaimer_writing_to_pcmd(struct sgx_encl *encl, + unsigned long start_addr) +{ + int reclaimed = 0; + int i; + + /* + * PCMD_FIRST_MASK is based on number of PCMD entries within + * PCMD page being 32. + */ + BUILD_BUG_ON(PCMDS_PER_PAGE != 32); + + for (i = 0; i < PCMDS_PER_PAGE; i++) { + struct sgx_encl_page *entry; + unsigned long addr; + + addr = start_addr + i * PAGE_SIZE; + + /* + * Stop when reaching the SECS page - it does not + * have a page_array entry and its reclaim is + * started and completed with enclave mutex held so + * it does not use the SGX_ENCL_PAGE_BEING_RECLAIMED + * flag. + */ + if (addr == encl->base + encl->size) + break; + + entry = xa_load(&encl->page_array, PFN_DOWN(addr)); + if (!entry) + continue; + + /* + * VA page slot ID uses same bit as the flag so it is important + * to ensure that the page is not already in backing store. + */ + if (entry->epc_page && + (entry->desc & SGX_ENCL_PAGE_BEING_RECLAIMED)) { + reclaimed = 1; + break; + } + } + + return reclaimed; +} + /* * Calculate byte offset of a PCMD struct associated with an enclave page. PCMD's * follow right after the EPC data in the backing storage. In addition to the @@ -47,6 +133,7 @@ static int __sgx_encl_eldu(struct sgx_encl_page *encl_page, unsigned long va_offset = encl_page->desc & SGX_ENCL_PAGE_VA_OFFSET_MASK; struct sgx_encl *encl = encl_page->encl; pgoff_t page_index, page_pcmd_off; + unsigned long pcmd_first_page; struct sgx_pageinfo pginfo; struct sgx_backing b; bool pcmd_page_empty; @@ -58,6 +145,11 @@ static int __sgx_encl_eldu(struct sgx_encl_page *encl_page, else page_index = PFN_DOWN(encl->size); + /* + * Address of enclave page using the first entry within the PCMD page. + */ + pcmd_first_page = PFN_PHYS(page_index & ~PCMD_FIRST_MASK) + encl->base; + page_pcmd_off = sgx_encl_get_backing_page_pcmd_offset(encl, page_index); ret = sgx_encl_get_backing(encl, page_index, &b); @@ -84,6 +176,7 @@ static int __sgx_encl_eldu(struct sgx_encl_page *encl_page, } memset(pcmd_page + b.pcmd_offset, 0, sizeof(struct sgx_pcmd)); + set_page_dirty(b.pcmd); /* * The area for the PCMD in the page was zeroed above. 
Check if the @@ -94,12 +187,20 @@ static int __sgx_encl_eldu(struct sgx_encl_page *encl_page, kunmap_atomic(pcmd_page); kunmap_atomic((void *)(unsigned long)pginfo.contents); - sgx_encl_put_backing(&b, false); + get_page(b.pcmd); + sgx_encl_put_backing(&b); sgx_encl_truncate_backing_page(encl, page_index); - if (pcmd_page_empty) + if (pcmd_page_empty && !reclaimer_writing_to_pcmd(encl, pcmd_first_page)) { sgx_encl_truncate_backing_page(encl, PFN_DOWN(page_pcmd_off)); + pcmd_page = kmap_atomic(b.pcmd); + if (memchr_inv(pcmd_page, 0, PAGE_SIZE)) + pr_warn("PCMD page not empty after truncate.\n"); + kunmap_atomic(pcmd_page); + } + + put_page(b.pcmd); return ret; } @@ -645,15 +746,9 @@ int sgx_encl_get_backing(struct sgx_encl *encl, unsigned long page_index, /** * sgx_encl_put_backing() - Unpin the backing storage * @backing: data for accessing backing storage for the page - * @do_write: mark pages dirty */ -void sgx_encl_put_backing(struct sgx_backing *backing, bool do_write) +void sgx_encl_put_backing(struct sgx_backing *backing) { - if (do_write) { - set_page_dirty(backing->pcmd); - set_page_dirty(backing->contents); - } - put_page(backing->pcmd); put_page(backing->contents); } diff --git a/arch/x86/kernel/cpu/sgx/encl.h b/arch/x86/kernel/cpu/sgx/encl.h index fec43ca65065..d44e7372151f 100644 --- a/arch/x86/kernel/cpu/sgx/encl.h +++ b/arch/x86/kernel/cpu/sgx/encl.h @@ -107,7 +107,7 @@ void sgx_encl_release(struct kref *ref); int sgx_encl_mm_add(struct sgx_encl *encl, struct mm_struct *mm); int sgx_encl_get_backing(struct sgx_encl *encl, unsigned long page_index, struct sgx_backing *backing); -void sgx_encl_put_backing(struct sgx_backing *backing, bool do_write); +void sgx_encl_put_backing(struct sgx_backing *backing); int sgx_encl_test_and_clear_young(struct mm_struct *mm, struct sgx_encl_page *page); diff --git a/arch/x86/kernel/cpu/sgx/main.c b/arch/x86/kernel/cpu/sgx/main.c index 8e4bc6453d26..ab4ec54bbdd9 100644 --- a/arch/x86/kernel/cpu/sgx/main.c +++ b/arch/x86/kernel/cpu/sgx/main.c @@ -191,6 +191,8 @@ static int __sgx_encl_ewb(struct sgx_epc_page *epc_page, void *va_slot, backing->pcmd_offset; ret = __ewb(&pginfo, sgx_get_epc_virt_addr(epc_page), va_slot); + set_page_dirty(backing->pcmd); + set_page_dirty(backing->contents); kunmap_atomic((void *)(unsigned long)(pginfo.metadata - backing->pcmd_offset)); @@ -308,6 +310,7 @@ static void sgx_reclaimer_write(struct sgx_epc_page *epc_page, sgx_encl_ewb(epc_page, backing); encl_page->epc_page = NULL; encl->secs_child_cnt--; + sgx_encl_put_backing(backing); if (!encl->secs_child_cnt && test_bit(SGX_ENCL_INITIALIZED, &encl->flags)) { ret = sgx_encl_get_backing(encl, PFN_DOWN(encl->size), @@ -320,7 +323,7 @@ static void sgx_reclaimer_write(struct sgx_epc_page *epc_page, sgx_encl_free_epc_page(encl->secs.epc_page); encl->secs.epc_page = NULL; - sgx_encl_put_backing(&secs_backing, true); + sgx_encl_put_backing(&secs_backing); } out: @@ -379,11 +382,14 @@ static void sgx_reclaim_pages(void) goto skip; page_index = PFN_DOWN(encl_page->desc - encl_page->encl->base); + + mutex_lock(&encl_page->encl->lock); ret = sgx_encl_get_backing(encl_page->encl, page_index, &backing[i]); - if (ret) + if (ret) { + mutex_unlock(&encl_page->encl->lock); goto skip; + } - mutex_lock(&encl_page->encl->lock); encl_page->desc |= SGX_ENCL_PAGE_BEING_RECLAIMED; mutex_unlock(&encl_page->encl->lock); continue; @@ -411,7 +417,6 @@ skip: encl_page = epc_page->owner; sgx_reclaimer_write(epc_page, &backing[i]); - sgx_encl_put_backing(&backing[i], true); 
kref_put(&encl_page->encl->refcount, sgx_encl_release); epc_page->flags &= ~SGX_EPC_PAGE_RECLAIMER_TRACKED; diff --git a/arch/x86/kernel/crash.c b/arch/x86/kernel/crash.c index e8326a8d1c5d..9730c88530fc 100644 --- a/arch/x86/kernel/crash.c +++ b/arch/x86/kernel/crash.c @@ -407,7 +407,7 @@ int crash_load_segments(struct kimage *image) } image->elf_load_addr = kbuf.mem; pr_debug("Loaded ELF headers at 0x%lx bufsz=0x%lx memsz=0x%lx\n", - image->elf_load_addr, kbuf.bufsz, kbuf.bufsz); + image->elf_load_addr, kbuf.bufsz, kbuf.memsz); return ret; } diff --git a/arch/x86/kernel/fpu/core.c b/arch/x86/kernel/fpu/core.c index c049561f373a..e28ab0ecc537 100644 --- a/arch/x86/kernel/fpu/core.c +++ b/arch/x86/kernel/fpu/core.c @@ -41,17 +41,7 @@ struct fpu_state_config fpu_user_cfg __ro_after_init; */ struct fpstate init_fpstate __ro_after_init; -/* - * Track whether the kernel is using the FPU state - * currently. - * - * This flag is used: - * - * - by IRQ context code to potentially use the FPU - * if it's unused. - * - * - to debug kernel_fpu_begin()/end() correctness - */ +/* Track in-kernel FPU usage */ static DEFINE_PER_CPU(bool, in_kernel_fpu); /* @@ -59,42 +49,37 @@ static DEFINE_PER_CPU(bool, in_kernel_fpu); */ DEFINE_PER_CPU(struct fpu *, fpu_fpregs_owner_ctx); -static bool kernel_fpu_disabled(void) -{ - return this_cpu_read(in_kernel_fpu); -} - -static bool interrupted_kernel_fpu_idle(void) -{ - return !kernel_fpu_disabled(); -} - -/* - * Were we in user mode (or vm86 mode) when we were - * interrupted? - * - * Doing kernel_fpu_begin/end() is ok if we are running - * in an interrupt context from user mode - we'll just - * save the FPU state as required. - */ -static bool interrupted_user_mode(void) -{ - struct pt_regs *regs = get_irq_regs(); - return regs && user_mode(regs); -} - /* * Can we use the FPU in kernel mode with the * whole "kernel_fpu_begin/end()" sequence? - * - * It's always ok in process context (ie "not interrupt") - * but it is sometimes ok even from an irq. */ bool irq_fpu_usable(void) { - return !in_interrupt() || - interrupted_user_mode() || - interrupted_kernel_fpu_idle(); + if (WARN_ON_ONCE(in_nmi())) + return false; + + /* In kernel FPU usage already active? */ + if (this_cpu_read(in_kernel_fpu)) + return false; + + /* + * When not in NMI or hard interrupt context, FPU can be used in: + * + * - Task context except from within fpregs_lock()'ed critical + * regions. + * + * - Soft interrupt processing context which cannot happen + * while in a fpregs_lock()'ed critical region. + */ + if (!in_hardirq()) + return true; + + /* + * In hard interrupt context it's safe when soft interrupts + * are enabled, which means the interrupt did not hit in + * a fpregs_lock()'ed critical region. + */ + return !softirq_count(); } EXPORT_SYMBOL(irq_fpu_usable); diff --git a/arch/x86/kernel/fpu/xstate.c b/arch/x86/kernel/fpu/xstate.c index 39e1c8626ab9..c8340156bfd2 100644 --- a/arch/x86/kernel/fpu/xstate.c +++ b/arch/x86/kernel/fpu/xstate.c @@ -142,7 +142,8 @@ static unsigned int xfeature_get_offset(u64 xcomp_bv, int xfeature) * Non-compacted format and legacy features use the cached fixed * offsets. */ - if (!cpu_feature_enabled(X86_FEATURE_XSAVES) || xfeature <= XFEATURE_SSE) + if (!cpu_feature_enabled(X86_FEATURE_XCOMPACTED) || + xfeature <= XFEATURE_SSE) return xstate_offsets[xfeature]; /* @@ -369,12 +370,12 @@ static void __init setup_init_fpu_buf(void) /* * All components are now in init state. Read the state back so * that init_fpstate contains all non-zero init state. 
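For context on the irq_fpu_usable() rework above, the usual caller pattern looks roughly like the sketch below. do_crc() and its fallback are invented for illustration; this is a kernel-style pattern sketch, not a standalone program:

#include <linux/types.h>
#include <asm/fpu/api.h>

static u32 do_crc(const void *buf, unsigned int len)
{
        u32 crc;

        if (irq_fpu_usable()) {
                kernel_fpu_begin();
                crc = 0;        /* SIMD-accelerated implementation would go here */
                kernel_fpu_end();
        } else {
                crc = 0;        /* scalar fallback, safe in any context */
        }
        return crc;
}

With the rewritten helper, NMI context and nested kernel-FPU sections still report false, so callers keep needing a scalar fallback of this shape.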
This only - * works with XSAVE, but not with XSAVEOPT and XSAVES because + * works with XSAVE, but not with XSAVEOPT and XSAVEC/S because * those use the init optimization which skips writing data for * components in init state. * * XSAVE could be used, but that would require to reshuffle the - * data when XSAVES is available because XSAVES uses xstate + * data when XSAVEC/S is available because XSAVEC/S uses xstate * compaction. But doing so is a pointless exercise because most * components have an all zeros init state except for the legacy * ones (FP and SSE). Those can be saved with FXSAVE into the @@ -584,7 +585,8 @@ static unsigned int xstate_calculate_size(u64 xfeatures, bool compacted) */ static bool __init paranoid_xstate_size_valid(unsigned int kernel_size) { - bool compacted = cpu_feature_enabled(X86_FEATURE_XSAVES); + bool compacted = cpu_feature_enabled(X86_FEATURE_XCOMPACTED); + bool xsaves = cpu_feature_enabled(X86_FEATURE_XSAVES); unsigned int size = FXSAVE_SIZE + XSAVE_HDR_SIZE; int i; @@ -595,7 +597,7 @@ static bool __init paranoid_xstate_size_valid(unsigned int kernel_size) * Supervisor state components can be managed only by * XSAVES. */ - if (!compacted && xfeature_is_supervisor(i)) { + if (!xsaves && xfeature_is_supervisor(i)) { XSTATE_WARN_ON(1); return false; } @@ -612,8 +614,11 @@ static bool __init paranoid_xstate_size_valid(unsigned int kernel_size) * the size of the *user* states. If we use it to size a buffer * that we use 'XSAVES' on, we could potentially overflow the * buffer because 'XSAVES' saves system states too. + * + * This also takes compaction into account. So this works for + * XSAVEC as well. */ -static unsigned int __init get_xsaves_size(void) +static unsigned int __init get_compacted_size(void) { unsigned int eax, ebx, ecx, edx; /* @@ -623,6 +628,10 @@ static unsigned int __init get_xsaves_size(void) * containing all the state components * corresponding to bits currently set in * XCR0 | IA32_XSS. + * + * When XSAVES is not available but XSAVEC is (virt), then there + * are no supervisor states, but XSAVEC still uses compacted + * format. */ cpuid_count(XSTATE_CPUID, 1, &eax, &ebx, &ecx, &edx); return ebx; @@ -632,13 +641,13 @@ static unsigned int __init get_xsaves_size(void) * Get the total size of the enabled xstates without the independent supervisor * features. */ -static unsigned int __init get_xsaves_size_no_independent(void) +static unsigned int __init get_xsave_compacted_size(void) { u64 mask = xfeatures_mask_independent(); unsigned int size; if (!mask) - return get_xsaves_size(); + return get_compacted_size(); /* Disable independent features. */ wrmsrl(MSR_IA32_XSS, xfeatures_mask_supervisor()); @@ -647,7 +656,7 @@ static unsigned int __init get_xsaves_size_no_independent(void) * Ask the hardware what size is required of the buffer. * This is the size required for the task->fpu buffer. */ - size = get_xsaves_size(); + size = get_compacted_size(); /* Re-enable independent features so XSAVES will work on them again. */ wrmsrl(MSR_IA32_XSS, xfeatures_mask_supervisor() | mask); @@ -687,20 +696,21 @@ static int __init init_xstate_size(void) { /* Recompute the context size for enabled features: */ unsigned int user_size, kernel_size, kernel_default_size; - bool compacted = cpu_feature_enabled(X86_FEATURE_XSAVES); + bool compacted = cpu_feature_enabled(X86_FEATURE_XCOMPACTED); /* Uncompacted user space size */ user_size = get_xsave_size_user(); /* - * XSAVES kernel size includes supervisor states and - * uses compacted format when available. 
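get_compacted_size() above reads CPUID.(EAX=0xD,ECX=1):EBX. The same leaf can be peeked at from userspace, where it reflects XCR0 only since IA32_XSS is not visible there; a small illustrative program, not part of the patch:

#include <cpuid.h>
#include <stdio.h>

int main(void)
{
        unsigned int eax, ebx, ecx, edx;

        if (!__get_cpuid_count(0xD, 1, &eax, &ebx, &ecx, &edx))
                return 1;

        printf("compacted XSAVE area for currently enabled features: %u bytes\n", ebx);
        printf("XSAVEOPT=%u XSAVEC=%u XSAVES=%u\n",
               eax & 1, (eax >> 1) & 1, (eax >> 3) & 1);
        return 0;
}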
+ * XSAVES kernel size includes supervisor states and uses compacted + * format. XSAVEC uses compacted format, but does not save + * supervisor states. * - * XSAVE does not support supervisor states so - * kernel and user size is identical. + * XSAVE[OPT] do not support supervisor states so kernel and user + * size is identical. */ if (compacted) - kernel_size = get_xsaves_size_no_independent(); + kernel_size = get_xsave_compacted_size(); else kernel_size = user_size; @@ -813,8 +823,11 @@ void __init fpu__init_system_xstate(unsigned int legacy_size) if (!cpu_feature_enabled(X86_FEATURE_XFD)) fpu_kernel_cfg.max_features &= ~XFEATURE_MASK_USER_DYNAMIC; - fpu_kernel_cfg.max_features &= XFEATURE_MASK_USER_SUPPORTED | - XFEATURE_MASK_SUPERVISOR_SUPPORTED; + if (!cpu_feature_enabled(X86_FEATURE_XSAVES)) + fpu_kernel_cfg.max_features &= XFEATURE_MASK_USER_SUPPORTED; + else + fpu_kernel_cfg.max_features &= XFEATURE_MASK_USER_SUPPORTED | + XFEATURE_MASK_SUPERVISOR_SUPPORTED; fpu_user_cfg.max_features = fpu_kernel_cfg.max_features; fpu_user_cfg.max_features &= XFEATURE_MASK_USER_SUPPORTED; @@ -837,6 +850,11 @@ void __init fpu__init_system_xstate(unsigned int legacy_size) */ init_fpstate.xfd = fpu_user_cfg.max_features & XFEATURE_MASK_USER_DYNAMIC; + /* Set up compaction feature bit */ + if (cpu_feature_enabled(X86_FEATURE_XSAVEC) || + cpu_feature_enabled(X86_FEATURE_XSAVES)) + setup_force_cpu_cap(X86_FEATURE_XCOMPACTED); + /* Enable xstate instructions to be able to continue with initialization: */ fpu__init_cpu_xstate(); @@ -873,7 +891,7 @@ void __init fpu__init_system_xstate(unsigned int legacy_size) pr_info("x86/fpu: Enabled xstate features 0x%llx, context size is %d bytes, using '%s' format.\n", fpu_kernel_cfg.max_features, fpu_kernel_cfg.max_size, - boot_cpu_has(X86_FEATURE_XSAVES) ? "compacted" : "standard"); + boot_cpu_has(X86_FEATURE_XCOMPACTED) ? "compacted" : "standard"); return; out_disable: @@ -917,7 +935,7 @@ static void *__raw_xsave_addr(struct xregs_state *xsave, int xfeature_nr) if (WARN_ON_ONCE(!xfeature_enabled(xfeature_nr))) return NULL; - if (cpu_feature_enabled(X86_FEATURE_XSAVES)) { + if (cpu_feature_enabled(X86_FEATURE_XCOMPACTED)) { if (WARN_ON_ONCE(!(xcomp_bv & BIT_ULL(xfeature_nr)))) return NULL; } @@ -1215,7 +1233,7 @@ static int copy_uabi_to_xstate(struct fpstate *fpstate, const void *kbuf, } for (i = 0; i < XFEATURE_MAX; i++) { - u64 mask = ((u64)1 << i); + mask = BIT_ULL(i); if (hdr.xfeatures & mask) { void *dst = __raw_xsave_addr(xsave, i); @@ -1525,7 +1543,7 @@ static int __xstate_request_perm(u64 permitted, u64 requested, bool guest) * vendors into extending XFD for the pre AMX states, especially * AVX512. */ - bool compacted = cpu_feature_enabled(X86_FEATURE_XSAVES); + bool compacted = cpu_feature_enabled(X86_FEATURE_XCOMPACTED); struct fpu *fpu = ¤t->group_leader->thread.fpu; struct fpu_state_perm *perm; unsigned int ksize, usize; @@ -1687,16 +1705,13 @@ EXPORT_SYMBOL_GPL(xstate_get_guest_group_perm); * e.g. for AMX which requires XFEATURE_XTILE_CFG(17) and * XFEATURE_XTILE_DATA(18) this would be XFEATURE_XTILE_DATA(18). 
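A compressed view of the policy these hunks implement: the synthetic X86_FEATURE_XCOMPACTED bit is set whenever XSAVEC or XSAVES is present, while supervisor states stay enabled only with XSAVES. The struct and sample configurations below are illustrative only:

#include <stdbool.h>
#include <stdio.h>

struct xsave_caps { bool xsavec, xsaves; };

static void describe(const char *name, struct xsave_caps c)
{
        bool compacted  = c.xsavec || c.xsaves;   /* -> X86_FEATURE_XCOMPACTED */
        bool supervisor = c.xsaves;               /* supervisor states need XSAVES */

        printf("%s: %s format, supervisor states %s\n", name,
               compacted ? "compacted" : "standard",
               supervisor ? "kept" : "masked off");
}

int main(void)
{
        describe("bare metal (XSAVES)", (struct xsave_caps){ .xsavec = true, .xsaves = true });
        describe("guest (XSAVEC only)", (struct xsave_caps){ .xsavec = true, .xsaves = false });
        return 0;
}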
*/ -long fpu_xstate_prctl(struct task_struct *tsk, int option, unsigned long arg2) +long fpu_xstate_prctl(int option, unsigned long arg2) { u64 __user *uptr = (u64 __user *)arg2; u64 permitted, supported; unsigned long idx = arg2; bool guest = false; - if (tsk != current) - return -EPERM; - switch (option) { case ARCH_GET_XCOMP_SUPP: supported = fpu_user_cfg.max_features | fpu_user_cfg.legacy_features; diff --git a/arch/x86/kernel/fpu/xstate.h b/arch/x86/kernel/fpu/xstate.h index d22ace092ca2..5ad47031383b 100644 --- a/arch/x86/kernel/fpu/xstate.h +++ b/arch/x86/kernel/fpu/xstate.h @@ -16,7 +16,7 @@ static inline void xstate_init_xcomp_bv(struct xregs_state *xsave, u64 mask) * XRSTORS requires these bits set in xcomp_bv, or it will * trigger #GP: */ - if (cpu_feature_enabled(X86_FEATURE_XSAVES)) + if (cpu_feature_enabled(X86_FEATURE_XCOMPACTED)) xsave->header.xcomp_bv = mask | XCOMP_BV_COMPACTED_FORMAT; } @@ -79,6 +79,7 @@ static inline u64 xfeatures_mask_independent(void) /* These macros all use (%edi)/(%rdi) as the single memory argument. */ #define XSAVE ".byte " REX_PREFIX "0x0f,0xae,0x27" #define XSAVEOPT ".byte " REX_PREFIX "0x0f,0xae,0x37" +#define XSAVEC ".byte " REX_PREFIX "0x0f,0xc7,0x27" #define XSAVES ".byte " REX_PREFIX "0x0f,0xc7,0x2f" #define XRSTOR ".byte " REX_PREFIX "0x0f,0xae,0x2f" #define XRSTORS ".byte " REX_PREFIX "0x0f,0xc7,0x1f" @@ -97,9 +98,11 @@ static inline u64 xfeatures_mask_independent(void) : "memory") /* - * If XSAVES is enabled, it replaces XSAVEOPT because it supports a compact - * format and supervisor states in addition to modified optimization in - * XSAVEOPT. + * If XSAVES is enabled, it replaces XSAVEC because it supports supervisor + * states in addition to XSAVEC. + * + * Otherwise if XSAVEC is enabled, it replaces XSAVEOPT because it supports + * compacted storage format in addition to XSAVEOPT. * * Otherwise, if XSAVEOPT is enabled, XSAVEOPT replaces XSAVE because XSAVEOPT * supports modified optimization which is not supported by XSAVE. @@ -111,8 +114,9 @@ static inline u64 xfeatures_mask_independent(void) * address of the instruction where we might get an exception at. */ #define XSTATE_XSAVE(st, lmask, hmask, err) \ - asm volatile(ALTERNATIVE_2(XSAVE, \ + asm volatile(ALTERNATIVE_3(XSAVE, \ XSAVEOPT, X86_FEATURE_XSAVEOPT, \ + XSAVEC, X86_FEATURE_XSAVEC, \ XSAVES, X86_FEATURE_XSAVES) \ "\n" \ "xor %[err], %[err]\n" \ diff --git a/arch/x86/kernel/head64.c b/arch/x86/kernel/head64.c index 4f5ecbbaae77..bd4a34100ed0 100644 --- a/arch/x86/kernel/head64.c +++ b/arch/x86/kernel/head64.c @@ -40,6 +40,7 @@ #include <asm/extable.h> #include <asm/trapnr.h> #include <asm/sev.h> +#include <asm/tdx.h> /* * Manage page tables very early on. @@ -143,7 +144,20 @@ static unsigned long __head sme_postprocess_startup(struct boot_params *bp, pmdv if (sme_get_me_mask()) { vaddr = (unsigned long)__start_bss_decrypted; vaddr_end = (unsigned long)__end_bss_decrypted; + for (; vaddr < vaddr_end; vaddr += PMD_SIZE) { + /* + * On SNP, transition the page to shared in the RMP table so that + * it is consistent with the page table attribute change. + * + * __start_bss_decrypted has a virtual address in the high range + * mapping (kernel .text). PVALIDATE, by way of + * early_snp_set_memory_shared(), requires a valid virtual + * address but the kernel is currently running off of the identity + * mapping so use __pa() to get a *currently* valid virtual address. 
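The loop above walks the decrypted region in PMD-sized steps and passes PTRS_PER_PMD pages per call; this only covers the range exactly because one 2 MB PMD maps 512 pages of 4 KB. A trivial arithmetic check with a hypothetical 6 MB region:

#include <assert.h>

int main(void)
{
        unsigned long pmd_size = 2UL << 20, page_size = 4096;
        unsigned long range = 6UL << 20;                /* hypothetical __bss_decrypted size */

        assert(pmd_size / page_size == 512);            /* == PTRS_PER_PMD, the npages per call */
        assert((range / pmd_size) * pmd_size == range); /* region is PMD-aligned and PMD-sized */
        return 0;
}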
+ */ + early_snp_set_memory_shared(__pa(vaddr), __pa(vaddr), PTRS_PER_PMD); + i = pmd_index(vaddr); pmd[i] -= sme_get_me_mask(); } @@ -192,9 +206,6 @@ unsigned long __head __startup_64(unsigned long physaddr, if (load_delta & ~PMD_PAGE_MASK) for (;;); - /* Activate Secure Memory Encryption (SME) if supported and enabled */ - sme_enable(bp); - /* Include the SME encryption mask in the fixup value */ load_delta += sme_get_me_mask(); @@ -308,15 +319,6 @@ unsigned long __head __startup_64(unsigned long physaddr, return sme_postprocess_startup(bp, pmd); } -unsigned long __startup_secondary_64(void) -{ - /* - * Return the SME encryption mask (if SME is active) to be used as a - * modifier for the initial pgdir entry programmed into CR3. - */ - return sme_get_me_mask(); -} - /* Wipe all early page tables except for the kernel symbol map */ static void __init reset_early_page_tables(void) { @@ -416,6 +418,9 @@ void __init do_early_exception(struct pt_regs *regs, int trapnr) trapnr == X86_TRAP_VC && handle_vc_boot_ghcb(regs)) return; + if (trapnr == X86_TRAP_VE && tdx_early_handle_ve(regs)) + return; + early_fixup_exception(regs, trapnr); } @@ -514,6 +519,9 @@ asmlinkage __visible void __init x86_64_start_kernel(char * real_mode_data) idt_setup_early_handler(); + /* Needed before cc_platform_has() can be used for TDX */ + tdx_early_init(); + copy_bootdata(__va(real_mode_data)); /* @@ -600,8 +608,10 @@ static void startup_64_load_idt(unsigned long physbase) void early_setup_idt(void) { /* VMM Communication Exception */ - if (IS_ENABLED(CONFIG_AMD_MEM_ENCRYPT)) + if (IS_ENABLED(CONFIG_AMD_MEM_ENCRYPT)) { + setup_ghcb(); set_bringup_idt_handler(bringup_idt_table, X86_TRAP_VC, vc_boot_ghcb); + } bringup_idt_descr.address = (unsigned long)bringup_idt_table; native_load_idt(&bringup_idt_descr); diff --git a/arch/x86/kernel/head_64.S b/arch/x86/kernel/head_64.S index b8e3019547a5..92c4afa2b729 100644 --- a/arch/x86/kernel/head_64.S +++ b/arch/x86/kernel/head_64.S @@ -65,10 +65,39 @@ SYM_CODE_START_NOALIGN(startup_64) leaq (__end_init_task - FRAME_SIZE)(%rip), %rsp leaq _text(%rip), %rdi + + /* + * initial_gs points to initial fixed_percpu_data struct with storage for + * the stack protector canary. Global pointer fixups are needed at this + * stage, so apply them as is done in fixup_pointer(), and initialize %gs + * such that the canary can be accessed at %gs:40 for subsequent C calls. + */ + movl $MSR_GS_BASE, %ecx + movq initial_gs(%rip), %rax + movq $_text, %rdx + subq %rdx, %rax + addq %rdi, %rax + movq %rax, %rdx + shrq $32, %rdx + wrmsr + pushq %rsi call startup_64_setup_env popq %rsi +#ifdef CONFIG_AMD_MEM_ENCRYPT + /* + * Activate SEV/SME memory encryption if supported/enabled. This needs to + * be done now, since this also includes setup of the SEV-SNP CPUID table, + * which needs to be done before any CPUID instructions are executed in + * subsequent code. + */ + movq %rsi, %rdi + pushq %rsi + call sme_enable + popq %rsi +#endif + /* Now switch to __KERNEL_CS so IRET works reliably */ pushq $__KERNEL_CS leaq .Lon_kernel_cs(%rip), %rax @@ -134,16 +163,32 @@ SYM_INNER_LABEL(secondary_startup_64_no_verify, SYM_L_GLOBAL) * Retrieve the modifier (SME encryption mask if SME is active) to be * added to the initial pgdir entry that will be programmed into CR3. 
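The modifier described just above is simply the SME encryption mask added to the physical address of the initial top-level page table (zero when SME is off). A tiny sketch with made-up addresses and a made-up C-bit position:

#include <stdint.h>
#include <stdio.h>

int main(void)
{
        uint64_t init_top_pgt_pa = 0x0000000002c00000ULL; /* hypothetical phys addr of init_top_pgt */
        uint64_t sme_me_mask     = 1ULL << 47;            /* hypothetical C-bit position */

        printf("CR3 = 0x%016llx\n",
               (unsigned long long)(init_top_pgt_pa + sme_me_mask));
        return 0;
}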
*/ - pushq %rsi - call __startup_secondary_64 - popq %rsi +#ifdef CONFIG_AMD_MEM_ENCRYPT + movq sme_me_mask, %rax +#else + xorq %rax, %rax +#endif /* Form the CR3 value being sure to include the CR3 modifier */ addq $(init_top_pgt - __START_KERNEL_map), %rax 1: +#ifdef CONFIG_X86_MCE + /* + * Preserve CR4.MCE if the kernel will enable #MC support. + * Clearing MCE may fault in some environments (that also force #MC + * support). Any machine check that occurs before #MC support is fully + * configured will crash the system regardless of the CR4.MCE value set + * here. + */ + movq %cr4, %rcx + andl $X86_CR4_MCE, %ecx +#else + movl $0, %ecx +#endif + /* Enable PAE mode, PGE and LA57 */ - movl $(X86_CR4_PAE | X86_CR4_PGE), %ecx + orl $(X86_CR4_PAE | X86_CR4_PGE), %ecx #ifdef CONFIG_X86_5LEVEL testl $1, __pgtable_l5_enabled(%rip) jz 1f @@ -249,13 +294,23 @@ SYM_INNER_LABEL(secondary_startup_64_no_verify, SYM_L_GLOBAL) /* Setup EFER (Extended Feature Enable Register) */ movl $MSR_EFER, %ecx rdmsr + /* + * Preserve current value of EFER for comparison and to skip + * EFER writes if no change was made (for TDX guest) + */ + movl %eax, %edx btsl $_EFER_SCE, %eax /* Enable System Call */ btl $20,%edi /* No Execute supported? */ jnc 1f btsl $_EFER_NX, %eax btsq $_PAGE_BIT_NX,early_pmd_flags(%rip) -1: wrmsr /* Make changes effective */ + /* Avoid writing EFER if no change was made (for TDX guest) */ +1: cmpl %edx, %eax + je 1f + xor %edx, %edx + wrmsr /* Make changes effective */ +1: /* Setup cr0 */ movl $CR0_STATE, %eax /* Make changes effective */ diff --git a/arch/x86/kernel/idt.c b/arch/x86/kernel/idt.c index 608eb63bf044..a58c6bc1cd68 100644 --- a/arch/x86/kernel/idt.c +++ b/arch/x86/kernel/idt.c @@ -69,6 +69,9 @@ static const __initconst struct idt_data early_idts[] = { */ INTG(X86_TRAP_PF, asm_exc_page_fault), #endif +#ifdef CONFIG_INTEL_TDX_GUEST + INTG(X86_TRAP_VE, asm_exc_virtualization_exception), +#endif }; /* diff --git a/arch/x86/kernel/nmi.c b/arch/x86/kernel/nmi.c index e73f7df362f5..cec0bfa3bc04 100644 --- a/arch/x86/kernel/nmi.c +++ b/arch/x86/kernel/nmi.c @@ -157,7 +157,7 @@ int __register_nmi_handler(unsigned int type, struct nmiaction *action) struct nmi_desc *desc = nmi_to_desc(type); unsigned long flags; - if (!action->handler) + if (WARN_ON_ONCE(!action->handler || !list_empty(&action->list))) return -EINVAL; raw_spin_lock_irqsave(&desc->lock, flags); @@ -177,7 +177,7 @@ int __register_nmi_handler(unsigned int type, struct nmiaction *action) list_add_rcu(&action->list, &desc->head); else list_add_tail_rcu(&action->list, &desc->head); - + raw_spin_unlock_irqrestore(&desc->lock, flags); return 0; } @@ -186,7 +186,7 @@ EXPORT_SYMBOL(__register_nmi_handler); void unregister_nmi_handler(unsigned int type, const char *name) { struct nmi_desc *desc = nmi_to_desc(type); - struct nmiaction *n; + struct nmiaction *n, *found = NULL; unsigned long flags; raw_spin_lock_irqsave(&desc->lock, flags); @@ -200,12 +200,16 @@ void unregister_nmi_handler(unsigned int type, const char *name) WARN(in_nmi(), "Trying to free NMI (%s) from NMI context!\n", n->name); list_del_rcu(&n->list); + found = n; break; } } raw_spin_unlock_irqrestore(&desc->lock, flags); - synchronize_rcu(); + if (found) { + synchronize_rcu(); + INIT_LIST_HEAD(&found->list); + } } EXPORT_SYMBOL_GPL(unregister_nmi_handler); diff --git a/arch/x86/kernel/probe_roms.c b/arch/x86/kernel/probe_roms.c index 36e84d904260..319fef37d9dc 100644 --- a/arch/x86/kernel/probe_roms.c +++ b/arch/x86/kernel/probe_roms.c @@ -21,6 +21,7 @@ #include 
<asm/sections.h> #include <asm/io.h> #include <asm/setup_arch.h> +#include <asm/sev.h> static struct resource system_rom_resource = { .name = "System ROM", @@ -197,11 +198,21 @@ static int __init romchecksum(const unsigned char *rom, unsigned long length) void __init probe_roms(void) { - const unsigned char *rom; unsigned long start, length, upper; + const unsigned char *rom; unsigned char c; int i; + /* + * The ROM memory range is not part of the e820 table and is therefore not + * pre-validated by BIOS. The kernel page table maps the ROM region as encrypted + * memory, and SNP requires encrypted memory to be validated before access. + * Do that here. + */ + snp_prep_memory(video_rom_resource.start, + ((system_rom_resource.end + 1) - video_rom_resource.start), + SNP_PAGE_STATE_PRIVATE); + /* video rom */ upper = adapter_rom_resources[0].start; for (start = video_rom_resource.start; start < upper; start += 2048) { diff --git a/arch/x86/kernel/process.c b/arch/x86/kernel/process.c index b370767f5b19..58fb48d3004f 100644 --- a/arch/x86/kernel/process.c +++ b/arch/x86/kernel/process.c @@ -46,6 +46,7 @@ #include <asm/proto.h> #include <asm/frame.h> #include <asm/unwind.h> +#include <asm/tdx.h> #include "process.h" @@ -160,6 +161,7 @@ int copy_thread(unsigned long clone_flags, unsigned long sp, unsigned long arg, savesegment(ds, p->thread.ds); #else p->thread.sp0 = (unsigned long) (childregs + 1); + savesegment(gs, p->thread.gs); /* * Clear all status flags including IF and set fixed bit. 64bit * does not have this initialization as the frame does not contain @@ -191,10 +193,6 @@ int copy_thread(unsigned long clone_flags, unsigned long sp, unsigned long arg, if (sp) childregs->sp = sp; -#ifdef CONFIG_X86_32 - task_user_gs(p) = get_user_gs(current_pt_regs()); -#endif - if (unlikely(p->flags & PF_IO_WORKER)) { /* * An IO thread is a user space thread, but it doesn't @@ -334,7 +332,7 @@ static int get_cpuid_mode(void) return !test_thread_flag(TIF_NOCPUID); } -static int set_cpuid_mode(struct task_struct *task, unsigned long cpuid_enabled) +static int set_cpuid_mode(unsigned long cpuid_enabled) { if (!boot_cpu_has(X86_FEATURE_CPUID_FAULT)) return -ENODEV; @@ -405,7 +403,7 @@ static void tss_copy_io_bitmap(struct tss_struct *tss, struct io_bitmap *iobm) } /** - * tss_update_io_bitmap - Update I/O bitmap before exiting to usermode + * native_tss_update_io_bitmap - Update I/O bitmap before exiting to user mode */ void native_tss_update_io_bitmap(void) { @@ -686,9 +684,6 @@ void __switch_to_xtra(struct task_struct *prev_p, struct task_struct *next_p) /* Enforce MSR update to ensure consistent state */ __speculation_ctrl_update(~tifn, tifn); } - - if ((tifp ^ tifn) & _TIF_SLD) - switch_to_sld(tifn); } /* @@ -873,6 +868,9 @@ void select_idle_routine(const struct cpuinfo_x86 *c) } else if (prefer_mwait_c1_over_halt(c)) { pr_info("using mwait in idle threads\n"); x86_idle = mwait_idle; + } else if (cpu_feature_enabled(X86_FEATURE_TDX_GUEST)) { + pr_info("using TDX aware idle routine\n"); + x86_idle = tdx_safe_halt; } else x86_idle = default_idle; } @@ -985,20 +983,19 @@ unsigned long __get_wchan(struct task_struct *p) return addr; } -long do_arch_prctl_common(struct task_struct *task, int option, - unsigned long arg2) +long do_arch_prctl_common(int option, unsigned long arg2) { switch (option) { case ARCH_GET_CPUID: return get_cpuid_mode(); case ARCH_SET_CPUID: - return set_cpuid_mode(task, arg2); + return set_cpuid_mode(arg2); case ARCH_GET_XCOMP_SUPP: case ARCH_GET_XCOMP_PERM: case ARCH_REQ_XCOMP_PERM: 
case ARCH_GET_XCOMP_GUEST_PERM: case ARCH_REQ_XCOMP_GUEST_PERM: - return fpu_xstate_prctl(task, option, arg2); + return fpu_xstate_prctl(option, arg2); } return -EINVAL; diff --git a/arch/x86/kernel/process_32.c b/arch/x86/kernel/process_32.c index 26edb1cd07a4..2f314b170c9f 100644 --- a/arch/x86/kernel/process_32.c +++ b/arch/x86/kernel/process_32.c @@ -63,10 +63,7 @@ void __show_regs(struct pt_regs *regs, enum show_regs_mode mode, unsigned long d0, d1, d2, d3, d6, d7; unsigned short gs; - if (user_mode(regs)) - gs = get_user_gs(regs); - else - savesegment(gs, gs); + savesegment(gs, gs); show_ip(regs, log_lvl); @@ -114,7 +111,7 @@ void release_thread(struct task_struct *dead_task) void start_thread(struct pt_regs *regs, unsigned long new_ip, unsigned long new_sp) { - set_user_gs(regs, 0); + loadsegment(gs, 0); regs->fs = 0; regs->ds = __USER_DS; regs->es = __USER_DS; @@ -177,7 +174,7 @@ __switch_to(struct task_struct *prev_p, struct task_struct *next_p) * used %fs or %gs (it does not today), or if the kernel is * running inside of a hypervisor layer. */ - lazy_save_gs(prev->gs); + savesegment(gs, prev->gs); /* * Load the per-thread Thread-Local Storage descriptor. @@ -208,7 +205,7 @@ __switch_to(struct task_struct *prev_p, struct task_struct *next_p) * Restore %gs if needed (which is common) */ if (prev->gs | next->gs) - lazy_load_gs(next->gs); + loadsegment(gs, next->gs); this_cpu_write(current_task, next_p); @@ -222,5 +219,5 @@ __switch_to(struct task_struct *prev_p, struct task_struct *next_p) SYSCALL_DEFINE2(arch_prctl, int, option, unsigned long, arg2) { - return do_arch_prctl_common(current, option, arg2); + return do_arch_prctl_common(option, arg2); } diff --git a/arch/x86/kernel/process_64.c b/arch/x86/kernel/process_64.c index e459253649be..1962008fe743 100644 --- a/arch/x86/kernel/process_64.c +++ b/arch/x86/kernel/process_64.c @@ -844,7 +844,7 @@ SYSCALL_DEFINE2(arch_prctl, int, option, unsigned long, arg2) ret = do_arch_prctl_64(current, option, arg2); if (ret == -EINVAL) - ret = do_arch_prctl_common(current, option, arg2); + ret = do_arch_prctl_common(option, arg2); return ret; } @@ -852,7 +852,7 @@ SYSCALL_DEFINE2(arch_prctl, int, option, unsigned long, arg2) #ifdef CONFIG_IA32_EMULATION COMPAT_SYSCALL_DEFINE2(arch_prctl, int, option, unsigned long, arg2) { - return do_arch_prctl_common(current, option, arg2); + return do_arch_prctl_common(option, arg2); } #endif diff --git a/arch/x86/kernel/ptrace.c b/arch/x86/kernel/ptrace.c index 98d10ef60571..37c12fb92906 100644 --- a/arch/x86/kernel/ptrace.c +++ b/arch/x86/kernel/ptrace.c @@ -170,9 +170,9 @@ static u16 get_segment_reg(struct task_struct *task, unsigned long offset) retval = *pt_regs_access(task_pt_regs(task), offset); else { if (task == current) - retval = get_user_gs(task_pt_regs(task)); + savesegment(gs, retval); else - retval = task_user_gs(task); + retval = task->thread.gs; } return retval; } @@ -210,7 +210,7 @@ static int set_segment_reg(struct task_struct *task, break; case offsetof(struct user_regs_struct, gs): - task_user_gs(task) = value; + task->thread.gs = value; } return 0; diff --git a/arch/x86/kernel/setup.c b/arch/x86/kernel/setup.c index c95b9ac5a457..249981bf3d8a 100644 --- a/arch/x86/kernel/setup.c +++ b/arch/x86/kernel/setup.c @@ -756,6 +756,30 @@ dump_kernel_offset(struct notifier_block *self, unsigned long v, void *p) return 0; } +void x86_configure_nx(void) +{ + if (boot_cpu_has(X86_FEATURE_NX)) + __supported_pte_mask |= _PAGE_NX; + else + __supported_pte_mask &= ~_PAGE_NX; +} + +static void __init 
x86_report_nx(void) +{ + if (!boot_cpu_has(X86_FEATURE_NX)) { + printk(KERN_NOTICE "Notice: NX (Execute Disable) protection " + "missing in CPU!\n"); + } else { +#if defined(CONFIG_X86_64) || defined(CONFIG_X86_PAE) + printk(KERN_INFO "NX (Execute Disable) protection: active\n"); +#else + /* 32bit non-PAE kernel, NX cannot be used */ + printk(KERN_NOTICE "Notice: NX (Execute Disable) protection " + "cannot be enabled: non-PAE kernel!\n"); +#endif + } +} + /* * Determine if we were loaded by an EFI loader. If so, then we have also been * passed the efi memmap, systab, etc., so we should use these data structures @@ -896,9 +920,7 @@ void __init setup_arch(char **cmdline_p) /* * x86_configure_nx() is called before parse_early_param() to detect * whether hardware doesn't support NX (so that the early EHCI debug - * console setup can safely call set_fixmap()). It may then be called - * again from within noexec_setup() during parsing early parameters - * to honor the respective command line option. + * console setup can safely call set_fixmap()). */ x86_configure_nx(); diff --git a/arch/x86/kernel/sev-shared.c b/arch/x86/kernel/sev-shared.c index ce987688bbc0..b478edf43bec 100644 --- a/arch/x86/kernel/sev-shared.c +++ b/arch/x86/kernel/sev-shared.c @@ -14,6 +14,68 @@ #define has_cpuflag(f) boot_cpu_has(f) #endif +/* I/O parameters for CPUID-related helpers */ +struct cpuid_leaf { + u32 fn; + u32 subfn; + u32 eax; + u32 ebx; + u32 ecx; + u32 edx; +}; + +/* + * Individual entries of the SNP CPUID table, as defined by the SNP + * Firmware ABI, Revision 0.9, Section 7.1, Table 14. + */ +struct snp_cpuid_fn { + u32 eax_in; + u32 ecx_in; + u64 xcr0_in; + u64 xss_in; + u32 eax; + u32 ebx; + u32 ecx; + u32 edx; + u64 __reserved; +} __packed; + +/* + * SNP CPUID table, as defined by the SNP Firmware ABI, Revision 0.9, + * Section 8.14.2.6. Also noted there is the SNP firmware-enforced limit + * of 64 entries per CPUID table. + */ +#define SNP_CPUID_COUNT_MAX 64 + +struct snp_cpuid_table { + u32 count; + u32 __reserved1; + u64 __reserved2; + struct snp_cpuid_fn fn[SNP_CPUID_COUNT_MAX]; +} __packed; + +/* + * Since feature negotiation related variables are set early in the boot + * process they must reside in the .data section so as not to be zeroed + * out when the .bss section is later cleared. + * + * GHCB protocol version negotiated with the hypervisor. + */ +static u16 ghcb_version __ro_after_init; + +/* Copy of the SNP firmware's CPUID page. */ +static struct snp_cpuid_table cpuid_table_copy __ro_after_init; + +/* + * These will be initialized based on CPUID table so that non-present + * all-zero leaves (for sparse tables) can be differentiated from + * invalid/out-of-range leaves. This is needed since all-zero leaves + * still need to be post-processed. + */ +static u32 cpuid_std_range_max __ro_after_init; +static u32 cpuid_hyp_range_max __ro_after_init; +static u32 cpuid_ext_range_max __ro_after_init; + static bool __init sev_es_check_cpu_features(void) { if (!has_cpuflag(X86_FEATURE_RDRAND)) { @@ -24,15 +86,12 @@ static bool __init sev_es_check_cpu_features(void) return true; } -static void __noreturn sev_es_terminate(unsigned int reason) +static void __noreturn sev_es_terminate(unsigned int set, unsigned int reason) { u64 val = GHCB_MSR_TERM_REQ; - /* - * Tell the hypervisor what went wrong - only reason-set 0 is - * currently supported. - */ - val |= GHCB_SEV_TERM_REASON(0, reason); + /* Tell the hypervisor what went wrong. 
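The SNP CPUID table structures introduced above have a fixed ABI layout; the quick size check below (a plain C11 mirror of those structs, illustrative only) shows that 64 packed 48-byte entries plus the 16-byte header come to 3088 bytes and so fit in the single page the firmware provides:

#include <assert.h>
#include <stdint.h>

struct fn  { uint32_t eax_in, ecx_in; uint64_t xcr0_in, xss_in;
             uint32_t eax, ebx, ecx, edx; uint64_t reserved; } __attribute__((packed));
struct tbl { uint32_t count, reserved1; uint64_t reserved2;
             struct fn fn[64]; } __attribute__((packed));

static_assert(sizeof(struct fn) == 48, "one table entry");
static_assert(sizeof(struct tbl) == 16 + 64 * 48, "3088 bytes, below PAGE_SIZE");

int main(void) { return 0; }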
*/ + val |= GHCB_SEV_TERM_REASON(set, reason); /* Request Guest Termination from Hypvervisor */ sev_es_wr_ghcb_msr(val); @@ -42,6 +101,42 @@ static void __noreturn sev_es_terminate(unsigned int reason) asm volatile("hlt\n" : : : "memory"); } +/* + * The hypervisor features are available from GHCB version 2 onward. + */ +static u64 get_hv_features(void) +{ + u64 val; + + if (ghcb_version < 2) + return 0; + + sev_es_wr_ghcb_msr(GHCB_MSR_HV_FT_REQ); + VMGEXIT(); + + val = sev_es_rd_ghcb_msr(); + if (GHCB_RESP_CODE(val) != GHCB_MSR_HV_FT_RESP) + return 0; + + return GHCB_MSR_HV_FT_RESP_VAL(val); +} + +static void snp_register_ghcb_early(unsigned long paddr) +{ + unsigned long pfn = paddr >> PAGE_SHIFT; + u64 val; + + sev_es_wr_ghcb_msr(GHCB_MSR_REG_GPA_REQ_VAL(pfn)); + VMGEXIT(); + + val = sev_es_rd_ghcb_msr(); + + /* If the response GPA is not ours then abort the guest */ + if ((GHCB_RESP_CODE(val) != GHCB_MSR_REG_GPA_RESP) || + (GHCB_MSR_REG_GPA_RESP_VAL(val) != pfn)) + sev_es_terminate(SEV_TERM_SET_LINUX, GHCB_TERM_REGISTER); +} + static bool sev_es_negotiate_protocol(void) { u64 val; @@ -54,10 +149,12 @@ static bool sev_es_negotiate_protocol(void) if (GHCB_MSR_INFO(val) != GHCB_MSR_SEV_INFO_RESP) return false; - if (GHCB_MSR_PROTO_MAX(val) < GHCB_PROTO_OUR || - GHCB_MSR_PROTO_MIN(val) > GHCB_PROTO_OUR) + if (GHCB_MSR_PROTO_MAX(val) < GHCB_PROTOCOL_MIN || + GHCB_MSR_PROTO_MIN(val) > GHCB_PROTOCOL_MAX) return false; + ghcb_version = min_t(size_t, GHCB_MSR_PROTO_MAX(val), GHCB_PROTOCOL_MAX); + return true; } @@ -104,10 +201,7 @@ static enum es_result verify_exception_info(struct ghcb *ghcb, struct es_em_ctxt if (ret == 1) { u64 info = ghcb->save.sw_exit_info_2; - unsigned long v; - - info = ghcb->save.sw_exit_info_2; - v = info & SVM_EVTINJ_VEC_MASK; + unsigned long v = info & SVM_EVTINJ_VEC_MASK; /* Check if exception information from hypervisor is sane. */ if ((info & SVM_EVTINJ_VALID) && @@ -130,7 +224,7 @@ enum es_result sev_es_ghcb_hv_call(struct ghcb *ghcb, bool set_ghcb_msr, u64 exit_info_1, u64 exit_info_2) { /* Fill in protocol and format specifiers */ - ghcb->protocol_version = GHCB_PROTOCOL_MAX; + ghcb->protocol_version = ghcb_version; ghcb->ghcb_usage = GHCB_DEFAULT_USAGE; ghcb_set_sw_exit_code(ghcb, exit_code); @@ -150,6 +244,290 @@ enum es_result sev_es_ghcb_hv_call(struct ghcb *ghcb, bool set_ghcb_msr, return verify_exception_info(ghcb, ctxt); } +static int __sev_cpuid_hv(u32 fn, int reg_idx, u32 *reg) +{ + u64 val; + + sev_es_wr_ghcb_msr(GHCB_CPUID_REQ(fn, reg_idx)); + VMGEXIT(); + val = sev_es_rd_ghcb_msr(); + if (GHCB_RESP_CODE(val) != GHCB_MSR_CPUID_RESP) + return -EIO; + + *reg = (val >> 32); + + return 0; +} + +static int sev_cpuid_hv(struct cpuid_leaf *leaf) +{ + int ret; + + /* + * MSR protocol does not support fetching non-zero subfunctions, but is + * sufficient to handle current early-boot cases. Should that change, + * make sure to report an error rather than ignoring the index and + * grabbing random values. If this issue arises in the future, handling + * can be added here to use GHCB-page protocol for cases that occur late + * enough in boot that GHCB page is available. + */ + if (cpuid_function_is_indexed(leaf->fn) && leaf->subfn) + return -EINVAL; + + ret = __sev_cpuid_hv(leaf->fn, GHCB_CPUID_REQ_EAX, &leaf->eax); + ret = ret ? : __sev_cpuid_hv(leaf->fn, GHCB_CPUID_REQ_EBX, &leaf->ebx); + ret = ret ? : __sev_cpuid_hv(leaf->fn, GHCB_CPUID_REQ_ECX, &leaf->ecx); + ret = ret ? 
: __sev_cpuid_hv(leaf->fn, GHCB_CPUID_REQ_EDX, &leaf->edx); + + return ret; +} + +/* + * This may be called early while still running on the initial identity + * mapping. Use RIP-relative addressing to obtain the correct address + * while running with the initial identity mapping as well as the + * switch-over to kernel virtual addresses later. + */ +static const struct snp_cpuid_table *snp_cpuid_get_table(void) +{ + void *ptr; + + asm ("lea cpuid_table_copy(%%rip), %0" + : "=r" (ptr) + : "p" (&cpuid_table_copy)); + + return ptr; +} + +/* + * The SNP Firmware ABI, Revision 0.9, Section 7.1, details the use of + * XCR0_IN and XSS_IN to encode multiple versions of 0xD subfunctions 0 + * and 1 based on the corresponding features enabled by a particular + * combination of XCR0 and XSS registers so that a guest can look up the + * version corresponding to the features currently enabled in its XCR0/XSS + * registers. The only values that differ between these versions/table + * entries is the enabled XSAVE area size advertised via EBX. + * + * While hypervisors may choose to make use of this support, it is more + * robust/secure for a guest to simply find the entry corresponding to the + * base/legacy XSAVE area size (XCR0=1 or XCR0=3), and then calculate the + * XSAVE area size using subfunctions 2 through 64, as documented in APM + * Volume 3, Rev 3.31, Appendix E.3.8, which is what is done here. + * + * Since base/legacy XSAVE area size is documented as 0x240, use that value + * directly rather than relying on the base size in the CPUID table. + * + * Return: XSAVE area size on success, 0 otherwise. + */ +static u32 snp_cpuid_calc_xsave_size(u64 xfeatures_en, bool compacted) +{ + const struct snp_cpuid_table *cpuid_table = snp_cpuid_get_table(); + u64 xfeatures_found = 0; + u32 xsave_size = 0x240; + int i; + + for (i = 0; i < cpuid_table->count; i++) { + const struct snp_cpuid_fn *e = &cpuid_table->fn[i]; + + if (!(e->eax_in == 0xD && e->ecx_in > 1 && e->ecx_in < 64)) + continue; + if (!(xfeatures_en & (BIT_ULL(e->ecx_in)))) + continue; + if (xfeatures_found & (BIT_ULL(e->ecx_in))) + continue; + + xfeatures_found |= (BIT_ULL(e->ecx_in)); + + if (compacted) + xsave_size += e->eax; + else + xsave_size = max(xsave_size, e->eax + e->ebx); + } + + /* + * Either the guest set unsupported XCR0/XSS bits, or the corresponding + * entries in the CPUID table were not present. This is not a valid + * state to be in. + */ + if (xfeatures_found != (xfeatures_en & GENMASK_ULL(63, 2))) + return 0; + + return xsave_size; +} + +static bool +snp_cpuid_get_validated_func(struct cpuid_leaf *leaf) +{ + const struct snp_cpuid_table *cpuid_table = snp_cpuid_get_table(); + int i; + + for (i = 0; i < cpuid_table->count; i++) { + const struct snp_cpuid_fn *e = &cpuid_table->fn[i]; + + if (e->eax_in != leaf->fn) + continue; + + if (cpuid_function_is_indexed(leaf->fn) && e->ecx_in != leaf->subfn) + continue; + + /* + * For 0xD subfunctions 0 and 1, only use the entry corresponding + * to the base/legacy XSAVE area size (XCR0=1 or XCR0=3, XSS=0). + * See the comments above snp_cpuid_calc_xsave_size() for more + * details. 
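snp_cpuid_calc_xsave_size() above applies two different rules depending on the format; the userspace restatement below mirrors that arithmetic (the struct, helper name, and sample entry are invented for illustration):

#include <stdint.h>
#include <stdio.h>

struct xfeat { uint32_t nr, size /* eax */, offset /* ebx */; };

static uint32_t xsave_size(const struct xfeat *t, int n, uint64_t enabled, int compacted)
{
        uint32_t total = 0x240;   /* base/legacy XSAVE area */

        for (int i = 0; i < n; i++) {
                if (!(enabled & (1ULL << t[i].nr)))
                        continue;
                if (compacted)
                        total += t[i].size;                 /* components packed back to back */
                else if (t[i].offset + t[i].size > total)
                        total = t[i].offset + t[i].size;    /* fixed offsets, furthest end wins */
        }
        return total;
}

int main(void)
{
        struct xfeat sample = { .nr = 7, .size = 1024, .offset = 1664 };  /* sample component */

        printf("compacted: %u, standard: %u\n",
               xsave_size(&sample, 1, 1ULL << 7, 1),
               xsave_size(&sample, 1, 1ULL << 7, 0));
        return 0;
}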
+ */ + if (e->eax_in == 0xD && (e->ecx_in == 0 || e->ecx_in == 1)) + if (!(e->xcr0_in == 1 || e->xcr0_in == 3) || e->xss_in) + continue; + + leaf->eax = e->eax; + leaf->ebx = e->ebx; + leaf->ecx = e->ecx; + leaf->edx = e->edx; + + return true; + } + + return false; +} + +static void snp_cpuid_hv(struct cpuid_leaf *leaf) +{ + if (sev_cpuid_hv(leaf)) + sev_es_terminate(SEV_TERM_SET_LINUX, GHCB_TERM_CPUID_HV); +} + +static int snp_cpuid_postprocess(struct cpuid_leaf *leaf) +{ + struct cpuid_leaf leaf_hv = *leaf; + + switch (leaf->fn) { + case 0x1: + snp_cpuid_hv(&leaf_hv); + + /* initial APIC ID */ + leaf->ebx = (leaf_hv.ebx & GENMASK(31, 24)) | (leaf->ebx & GENMASK(23, 0)); + /* APIC enabled bit */ + leaf->edx = (leaf_hv.edx & BIT(9)) | (leaf->edx & ~BIT(9)); + + /* OSXSAVE enabled bit */ + if (native_read_cr4() & X86_CR4_OSXSAVE) + leaf->ecx |= BIT(27); + break; + case 0x7: + /* OSPKE enabled bit */ + leaf->ecx &= ~BIT(4); + if (native_read_cr4() & X86_CR4_PKE) + leaf->ecx |= BIT(4); + break; + case 0xB: + leaf_hv.subfn = 0; + snp_cpuid_hv(&leaf_hv); + + /* extended APIC ID */ + leaf->edx = leaf_hv.edx; + break; + case 0xD: { + bool compacted = false; + u64 xcr0 = 1, xss = 0; + u32 xsave_size; + + if (leaf->subfn != 0 && leaf->subfn != 1) + return 0; + + if (native_read_cr4() & X86_CR4_OSXSAVE) + xcr0 = xgetbv(XCR_XFEATURE_ENABLED_MASK); + if (leaf->subfn == 1) { + /* Get XSS value if XSAVES is enabled. */ + if (leaf->eax & BIT(3)) { + unsigned long lo, hi; + + asm volatile("rdmsr" : "=a" (lo), "=d" (hi) + : "c" (MSR_IA32_XSS)); + xss = (hi << 32) | lo; + } + + /* + * The PPR and APM aren't clear on what size should be + * encoded in 0xD:0x1:EBX when compaction is not enabled + * by either XSAVEC (feature bit 1) or XSAVES (feature + * bit 3) since SNP-capable hardware has these feature + * bits fixed as 1. KVM sets it to 0 in this case, but + * to avoid this becoming an issue it's safer to simply + * treat this as unsupported for SNP guests. + */ + if (!(leaf->eax & (BIT(1) | BIT(3)))) + return -EINVAL; + + compacted = true; + } + + xsave_size = snp_cpuid_calc_xsave_size(xcr0 | xss, compacted); + if (!xsave_size) + return -EINVAL; + + leaf->ebx = xsave_size; + } + break; + case 0x8000001E: + snp_cpuid_hv(&leaf_hv); + + /* extended APIC ID */ + leaf->eax = leaf_hv.eax; + /* compute ID */ + leaf->ebx = (leaf->ebx & GENMASK(31, 8)) | (leaf_hv.ebx & GENMASK(7, 0)); + /* node ID */ + leaf->ecx = (leaf->ecx & GENMASK(31, 8)) | (leaf_hv.ecx & GENMASK(7, 0)); + break; + default: + /* No fix-ups needed, use values as-is. */ + break; + } + + return 0; +} + +/* + * Returns -EOPNOTSUPP if feature not enabled. Any other non-zero return value + * should be treated as fatal by caller. + */ +static int snp_cpuid(struct cpuid_leaf *leaf) +{ + const struct snp_cpuid_table *cpuid_table = snp_cpuid_get_table(); + + if (!cpuid_table->count) + return -EOPNOTSUPP; + + if (!snp_cpuid_get_validated_func(leaf)) { + /* + * Some hypervisors will avoid keeping track of CPUID entries + * where all values are zero, since they can be handled the + * same as out-of-range values (all-zero). This is useful here + * as well as it allows virtually all guest configurations to + * work using a single SNP CPUID table. + * + * To allow for this, there is a need to distinguish between + * out-of-range entries and in-range zero entries, since the + * CPUID table entries are only a template that may need to be + * augmented with additional values for things like + * CPU-specific information during post-processing. 
So if it's + * not in the table, set the values to zero. Then, if they are + * within a valid CPUID range, proceed with post-processing + * using zeros as the initial values. Otherwise, skip + * post-processing and just return zeros immediately. + */ + leaf->eax = leaf->ebx = leaf->ecx = leaf->edx = 0; + + /* Skip post-processing for out-of-range zero leafs. */ + if (!(leaf->fn <= cpuid_std_range_max || + (leaf->fn >= 0x40000000 && leaf->fn <= cpuid_hyp_range_max) || + (leaf->fn >= 0x80000000 && leaf->fn <= cpuid_ext_range_max))) + return 0; + } + + return snp_cpuid_postprocess(leaf); +} + /* * Boot VC Handler - This is the first VC handler during boot, there is no GHCB * page yet, so it only supports the MSR based communication with the @@ -157,40 +535,33 @@ enum es_result sev_es_ghcb_hv_call(struct ghcb *ghcb, bool set_ghcb_msr, */ void __init do_vc_no_ghcb(struct pt_regs *regs, unsigned long exit_code) { + unsigned int subfn = lower_bits(regs->cx, 32); unsigned int fn = lower_bits(regs->ax, 32); - unsigned long val; + struct cpuid_leaf leaf; + int ret; /* Only CPUID is supported via MSR protocol */ if (exit_code != SVM_EXIT_CPUID) goto fail; - sev_es_wr_ghcb_msr(GHCB_CPUID_REQ(fn, GHCB_CPUID_REQ_EAX)); - VMGEXIT(); - val = sev_es_rd_ghcb_msr(); - if (GHCB_RESP_CODE(val) != GHCB_MSR_CPUID_RESP) - goto fail; - regs->ax = val >> 32; + leaf.fn = fn; + leaf.subfn = subfn; - sev_es_wr_ghcb_msr(GHCB_CPUID_REQ(fn, GHCB_CPUID_REQ_EBX)); - VMGEXIT(); - val = sev_es_rd_ghcb_msr(); - if (GHCB_RESP_CODE(val) != GHCB_MSR_CPUID_RESP) - goto fail; - regs->bx = val >> 32; + ret = snp_cpuid(&leaf); + if (!ret) + goto cpuid_done; - sev_es_wr_ghcb_msr(GHCB_CPUID_REQ(fn, GHCB_CPUID_REQ_ECX)); - VMGEXIT(); - val = sev_es_rd_ghcb_msr(); - if (GHCB_RESP_CODE(val) != GHCB_MSR_CPUID_RESP) + if (ret != -EOPNOTSUPP) goto fail; - regs->cx = val >> 32; - sev_es_wr_ghcb_msr(GHCB_CPUID_REQ(fn, GHCB_CPUID_REQ_EDX)); - VMGEXIT(); - val = sev_es_rd_ghcb_msr(); - if (GHCB_RESP_CODE(val) != GHCB_MSR_CPUID_RESP) + if (sev_cpuid_hv(&leaf)) goto fail; - regs->dx = val >> 32; + +cpuid_done: + regs->ax = leaf.eax; + regs->bx = leaf.ebx; + regs->cx = leaf.ecx; + regs->dx = leaf.edx; /* * This is a VC handler and the #VC is only raised when SEV-ES is @@ -221,7 +592,7 @@ void __init do_vc_no_ghcb(struct pt_regs *regs, unsigned long exit_code) fail: /* Terminate the guest */ - sev_es_terminate(GHCB_SEV_ES_GEN_REQ); + sev_es_terminate(SEV_TERM_SET_GEN, GHCB_SEV_ES_GEN_REQ); } static enum es_result vc_insn_string_read(struct es_em_ctxt *ctxt, @@ -481,12 +852,37 @@ static enum es_result vc_handle_ioio(struct ghcb *ghcb, struct es_em_ctxt *ctxt) return ret; } +static int vc_handle_cpuid_snp(struct pt_regs *regs) +{ + struct cpuid_leaf leaf; + int ret; + + leaf.fn = regs->ax; + leaf.subfn = regs->cx; + ret = snp_cpuid(&leaf); + if (!ret) { + regs->ax = leaf.eax; + regs->bx = leaf.ebx; + regs->cx = leaf.ecx; + regs->dx = leaf.edx; + } + + return ret; +} + static enum es_result vc_handle_cpuid(struct ghcb *ghcb, struct es_em_ctxt *ctxt) { struct pt_regs *regs = ctxt->regs; u32 cr4 = native_read_cr4(); enum es_result ret; + int snp_cpuid_ret; + + snp_cpuid_ret = vc_handle_cpuid_snp(regs); + if (!snp_cpuid_ret) + return ES_OK; + if (snp_cpuid_ret != -EOPNOTSUPP) + return ES_VMM_ERROR; ghcb_set_rax(ghcb, regs->ax); ghcb_set_rcx(ghcb, regs->cx); @@ -538,3 +934,67 @@ static enum es_result vc_handle_rdtsc(struct ghcb *ghcb, return ES_OK; } + +struct cc_setup_data { + struct setup_data header; + u32 cc_blob_address; +}; + +/* + * Search for a 
Confidential Computing blob passed in as a setup_data entry + * via the Linux Boot Protocol. + */ +static struct cc_blob_sev_info *find_cc_blob_setup_data(struct boot_params *bp) +{ + struct cc_setup_data *sd = NULL; + struct setup_data *hdr; + + hdr = (struct setup_data *)bp->hdr.setup_data; + + while (hdr) { + if (hdr->type == SETUP_CC_BLOB) { + sd = (struct cc_setup_data *)hdr; + return (struct cc_blob_sev_info *)(unsigned long)sd->cc_blob_address; + } + hdr = (struct setup_data *)hdr->next; + } + + return NULL; +} + +/* + * Initialize the kernel's copy of the SNP CPUID table, and set up the + * pointer that will be used to access it. + * + * Maintaining a direct mapping of the SNP CPUID table used by firmware would + * be possible as an alternative, but the approach is brittle since the + * mapping needs to be updated in sync with all the changes to virtual memory + * layout and related mapping facilities throughout the boot process. + */ +static void __init setup_cpuid_table(const struct cc_blob_sev_info *cc_info) +{ + const struct snp_cpuid_table *cpuid_table_fw, *cpuid_table; + int i; + + if (!cc_info || !cc_info->cpuid_phys || cc_info->cpuid_len < PAGE_SIZE) + sev_es_terminate(SEV_TERM_SET_LINUX, GHCB_TERM_CPUID); + + cpuid_table_fw = (const struct snp_cpuid_table *)cc_info->cpuid_phys; + if (!cpuid_table_fw->count || cpuid_table_fw->count > SNP_CPUID_COUNT_MAX) + sev_es_terminate(SEV_TERM_SET_LINUX, GHCB_TERM_CPUID); + + cpuid_table = snp_cpuid_get_table(); + memcpy((void *)cpuid_table, cpuid_table_fw, sizeof(*cpuid_table)); + + /* Initialize CPUID ranges for range-checking. */ + for (i = 0; i < cpuid_table->count; i++) { + const struct snp_cpuid_fn *fn = &cpuid_table->fn[i]; + + if (fn->eax_in == 0x0) + cpuid_std_range_max = fn->eax; + else if (fn->eax_in == 0x40000000) + cpuid_hyp_range_max = fn->eax; + else if (fn->eax_in == 0x80000000) + cpuid_ext_range_max = fn->eax; + } +} diff --git a/arch/x86/kernel/sev.c b/arch/x86/kernel/sev.c index e6d316a01fdd..c05f0124c410 100644 --- a/arch/x86/kernel/sev.c +++ b/arch/x86/kernel/sev.c @@ -18,6 +18,10 @@ #include <linux/memblock.h> #include <linux/kernel.h> #include <linux/mm.h> +#include <linux/cpumask.h> +#include <linux/efi.h> +#include <linux/platform_device.h> +#include <linux/io.h> #include <asm/cpu_entry_area.h> #include <asm/stacktrace.h> @@ -31,9 +35,28 @@ #include <asm/svm.h> #include <asm/smp.h> #include <asm/cpu.h> +#include <asm/apic.h> +#include <asm/cpuid.h> +#include <asm/cmdline.h> #define DR7_RESET_VALUE 0x400 +/* AP INIT values as documented in the APM2 section "Processor Initialization State" */ +#define AP_INIT_CS_LIMIT 0xffff +#define AP_INIT_DS_LIMIT 0xffff +#define AP_INIT_LDTR_LIMIT 0xffff +#define AP_INIT_GDTR_LIMIT 0xffff +#define AP_INIT_IDTR_LIMIT 0xffff +#define AP_INIT_TR_LIMIT 0xffff +#define AP_INIT_RFLAGS_DEFAULT 0x2 +#define AP_INIT_DR6_DEFAULT 0xffff0ff0 +#define AP_INIT_GPAT_DEFAULT 0x0007040600070406ULL +#define AP_INIT_XCR0_DEFAULT 0x1 +#define AP_INIT_X87_FTW_DEFAULT 0x5555 +#define AP_INIT_X87_FCW_DEFAULT 0x0040 +#define AP_INIT_CR0_DEFAULT 0x60000010 +#define AP_INIT_MXCSR_DEFAULT 0x1f80 + /* For early boot hypervisor communication in SEV-ES enabled guests */ static struct ghcb boot_ghcb_page __bss_decrypted __aligned(PAGE_SIZE); @@ -41,7 +64,10 @@ static struct ghcb boot_ghcb_page __bss_decrypted __aligned(PAGE_SIZE); * Needs to be in the .data section because we need it NULL before bss is * cleared */ -static struct ghcb __initdata *boot_ghcb; +static struct ghcb *boot_ghcb 
__section(".data"); + +/* Bitmap of SEV features supported by the hypervisor */ +static u64 sev_hv_features __ro_after_init; /* #VC handler runtime per-CPU data */ struct sev_es_runtime_data { @@ -87,6 +113,15 @@ struct ghcb_state { static DEFINE_PER_CPU(struct sev_es_runtime_data*, runtime_data); DEFINE_STATIC_KEY_FALSE(sev_es_enable_key); +static DEFINE_PER_CPU(struct sev_es_save_area *, sev_vmsa); + +struct sev_config { + __u64 debug : 1, + __reserved : 63; +}; + +static struct sev_config sev_cfg __read_mostly; + static __always_inline bool on_vc_stack(struct pt_regs *regs) { unsigned long sp = regs->sp; @@ -523,13 +558,68 @@ void noinstr __sev_es_nmi_complete(void) __sev_put_ghcb(&state); } -static u64 get_jump_table_addr(void) +static u64 __init get_secrets_page(void) +{ + u64 pa_data = boot_params.cc_blob_address; + struct cc_blob_sev_info info; + void *map; + + /* + * The CC blob contains the address of the secrets page, check if the + * blob is present. + */ + if (!pa_data) + return 0; + + map = early_memremap(pa_data, sizeof(info)); + if (!map) { + pr_err("Unable to locate SNP secrets page: failed to map the Confidential Computing blob.\n"); + return 0; + } + memcpy(&info, map, sizeof(info)); + early_memunmap(map, sizeof(info)); + + /* smoke-test the secrets page passed */ + if (!info.secrets_phys || info.secrets_len != PAGE_SIZE) + return 0; + + return info.secrets_phys; +} + +static u64 __init get_snp_jump_table_addr(void) +{ + struct snp_secrets_page_layout *layout; + void __iomem *mem; + u64 pa, addr; + + pa = get_secrets_page(); + if (!pa) + return 0; + + mem = ioremap_encrypted(pa, PAGE_SIZE); + if (!mem) { + pr_err("Unable to locate AP jump table address: failed to map the SNP secrets page.\n"); + return 0; + } + + layout = (__force struct snp_secrets_page_layout *)mem; + + addr = layout->os_area.ap_jump_table_pa; + iounmap(mem); + + return addr; +} + +static u64 __init get_jump_table_addr(void) { struct ghcb_state state; unsigned long flags; struct ghcb *ghcb; u64 ret = 0; + if (cc_platform_has(CC_ATTR_GUEST_SEV_SNP)) + return get_snp_jump_table_addr(); + local_irq_save(flags); ghcb = __sev_get_ghcb(&state); @@ -553,7 +643,496 @@ static u64 get_jump_table_addr(void) return ret; } -int sev_es_setup_ap_jump_table(struct real_mode_header *rmh) +static void pvalidate_pages(unsigned long vaddr, unsigned int npages, bool validate) +{ + unsigned long vaddr_end; + int rc; + + vaddr = vaddr & PAGE_MASK; + vaddr_end = vaddr + (npages << PAGE_SHIFT); + + while (vaddr < vaddr_end) { + rc = pvalidate(vaddr, RMP_PG_SIZE_4K, validate); + if (WARN(rc, "Failed to validate address 0x%lx ret %d", vaddr, rc)) + sev_es_terminate(SEV_TERM_SET_LINUX, GHCB_TERM_PVALIDATE); + + vaddr = vaddr + PAGE_SIZE; + } +} + +static void __init early_set_pages_state(unsigned long paddr, unsigned int npages, enum psc_op op) +{ + unsigned long paddr_end; + u64 val; + + paddr = paddr & PAGE_MASK; + paddr_end = paddr + (npages << PAGE_SHIFT); + + while (paddr < paddr_end) { + /* + * Use the MSR protocol because this function can be called before + * the GHCB is established. + */ + sev_es_wr_ghcb_msr(GHCB_MSR_PSC_REQ_GFN(paddr >> PAGE_SHIFT, op)); + VMGEXIT(); + + val = sev_es_rd_ghcb_msr(); + + if (WARN(GHCB_RESP_CODE(val) != GHCB_MSR_PSC_RESP, + "Wrong PSC response code: 0x%x\n", + (unsigned int)GHCB_RESP_CODE(val))) + goto e_term; + + if (WARN(GHCB_MSR_PSC_RESP_VAL(val), + "Failed to change page state to '%s' paddr 0x%lx error 0x%llx\n", + op == SNP_PAGE_STATE_PRIVATE ? 
"private" : "shared", + paddr, GHCB_MSR_PSC_RESP_VAL(val))) + goto e_term; + + paddr = paddr + PAGE_SIZE; + } + + return; + +e_term: + sev_es_terminate(SEV_TERM_SET_LINUX, GHCB_TERM_PSC); +} + +void __init early_snp_set_memory_private(unsigned long vaddr, unsigned long paddr, + unsigned int npages) +{ + if (!cc_platform_has(CC_ATTR_GUEST_SEV_SNP)) + return; + + /* + * Ask the hypervisor to mark the memory pages as private in the RMP + * table. + */ + early_set_pages_state(paddr, npages, SNP_PAGE_STATE_PRIVATE); + + /* Validate the memory pages after they've been added in the RMP table. */ + pvalidate_pages(vaddr, npages, true); +} + +void __init early_snp_set_memory_shared(unsigned long vaddr, unsigned long paddr, + unsigned int npages) +{ + if (!cc_platform_has(CC_ATTR_GUEST_SEV_SNP)) + return; + + /* Invalidate the memory pages before they are marked shared in the RMP table. */ + pvalidate_pages(vaddr, npages, false); + + /* Ask hypervisor to mark the memory pages shared in the RMP table. */ + early_set_pages_state(paddr, npages, SNP_PAGE_STATE_SHARED); +} + +void __init snp_prep_memory(unsigned long paddr, unsigned int sz, enum psc_op op) +{ + unsigned long vaddr, npages; + + vaddr = (unsigned long)__va(paddr); + npages = PAGE_ALIGN(sz) >> PAGE_SHIFT; + + if (op == SNP_PAGE_STATE_PRIVATE) + early_snp_set_memory_private(vaddr, paddr, npages); + else if (op == SNP_PAGE_STATE_SHARED) + early_snp_set_memory_shared(vaddr, paddr, npages); + else + WARN(1, "invalid memory op %d\n", op); +} + +static int vmgexit_psc(struct snp_psc_desc *desc) +{ + int cur_entry, end_entry, ret = 0; + struct snp_psc_desc *data; + struct ghcb_state state; + struct es_em_ctxt ctxt; + unsigned long flags; + struct ghcb *ghcb; + + /* + * __sev_get_ghcb() needs to run with IRQs disabled because it is using + * a per-CPU GHCB. + */ + local_irq_save(flags); + + ghcb = __sev_get_ghcb(&state); + if (!ghcb) { + ret = 1; + goto out_unlock; + } + + /* Copy the input desc into GHCB shared buffer */ + data = (struct snp_psc_desc *)ghcb->shared_buffer; + memcpy(ghcb->shared_buffer, desc, min_t(int, GHCB_SHARED_BUF_SIZE, sizeof(*desc))); + + /* + * As per the GHCB specification, the hypervisor can resume the guest + * before processing all the entries. Check whether all the entries + * are processed. If not, then keep retrying. Note, the hypervisor + * will update the data memory directly to indicate the status, so + * reference the data->hdr everywhere. + * + * The strategy here is to wait for the hypervisor to change the page + * state in the RMP table before guest accesses the memory pages. If the + * page state change was not successful, then later memory access will + * result in a crash. + */ + cur_entry = data->hdr.cur_entry; + end_entry = data->hdr.end_entry; + + while (data->hdr.cur_entry <= data->hdr.end_entry) { + ghcb_set_sw_scratch(ghcb, (u64)__pa(data)); + + /* This will advance the shared buffer data points to. */ + ret = sev_es_ghcb_hv_call(ghcb, true, &ctxt, SVM_VMGEXIT_PSC, 0, 0); + + /* + * Page State Change VMGEXIT can pass error code through + * exit_info_2. + */ + if (WARN(ret || ghcb->save.sw_exit_info_2, + "SNP: PSC failed ret=%d exit_info_2=%llx\n", + ret, ghcb->save.sw_exit_info_2)) { + ret = 1; + goto out; + } + + /* Verify that reserved bit is not set */ + if (WARN(data->hdr.reserved, "Reserved bit is set in the PSC header\n")) { + ret = 1; + goto out; + } + + /* + * Sanity check that entry processing is not going backwards. + * This will happen only if hypervisor is tricking us. 
+ */ + if (WARN(data->hdr.end_entry > end_entry || cur_entry > data->hdr.cur_entry, +"SNP: PSC processing going backward, end_entry %d (got %d) cur_entry %d (got %d)\n", + end_entry, data->hdr.end_entry, cur_entry, data->hdr.cur_entry)) { + ret = 1; + goto out; + } + } + +out: + __sev_put_ghcb(&state); + +out_unlock: + local_irq_restore(flags); + + return ret; +} + +static void __set_pages_state(struct snp_psc_desc *data, unsigned long vaddr, + unsigned long vaddr_end, int op) +{ + struct psc_hdr *hdr; + struct psc_entry *e; + unsigned long pfn; + int i; + + hdr = &data->hdr; + e = data->entries; + + memset(data, 0, sizeof(*data)); + i = 0; + + while (vaddr < vaddr_end) { + if (is_vmalloc_addr((void *)vaddr)) + pfn = vmalloc_to_pfn((void *)vaddr); + else + pfn = __pa(vaddr) >> PAGE_SHIFT; + + e->gfn = pfn; + e->operation = op; + hdr->end_entry = i; + + /* + * Current SNP implementation doesn't keep track of the RMP page + * size so use 4K for simplicity. + */ + e->pagesize = RMP_PG_SIZE_4K; + + vaddr = vaddr + PAGE_SIZE; + e++; + i++; + } + + if (vmgexit_psc(data)) + sev_es_terminate(SEV_TERM_SET_LINUX, GHCB_TERM_PSC); +} + +static void set_pages_state(unsigned long vaddr, unsigned int npages, int op) +{ + unsigned long vaddr_end, next_vaddr; + struct snp_psc_desc *desc; + + desc = kmalloc(sizeof(*desc), GFP_KERNEL_ACCOUNT); + if (!desc) + panic("SNP: failed to allocate memory for PSC descriptor\n"); + + vaddr = vaddr & PAGE_MASK; + vaddr_end = vaddr + (npages << PAGE_SHIFT); + + while (vaddr < vaddr_end) { + /* Calculate the last vaddr that fits in one struct snp_psc_desc. */ + next_vaddr = min_t(unsigned long, vaddr_end, + (VMGEXIT_PSC_MAX_ENTRY * PAGE_SIZE) + vaddr); + + __set_pages_state(desc, vaddr, next_vaddr, op); + + vaddr = next_vaddr; + } + + kfree(desc); +} + +void snp_set_memory_shared(unsigned long vaddr, unsigned int npages) +{ + if (!cc_platform_has(CC_ATTR_GUEST_SEV_SNP)) + return; + + pvalidate_pages(vaddr, npages, false); + + set_pages_state(vaddr, npages, SNP_PAGE_STATE_SHARED); +} + +void snp_set_memory_private(unsigned long vaddr, unsigned int npages) +{ + if (!cc_platform_has(CC_ATTR_GUEST_SEV_SNP)) + return; + + set_pages_state(vaddr, npages, SNP_PAGE_STATE_PRIVATE); + + pvalidate_pages(vaddr, npages, true); +} + +static int snp_set_vmsa(void *va, bool vmsa) +{ + u64 attrs; + + /* + * Running at VMPL0 allows the kernel to change the VMSA bit for a page + * using the RMPADJUST instruction. However, for the instruction to + * succeed it must target the permissions of a lesser privileged + * (higher numbered) VMPL level, so use VMPL1 (refer to the RMPADJUST + * instruction in the AMD64 APM Volume 3). + */ + attrs = 1; + if (vmsa) + attrs |= RMPADJUST_VMSA_PAGE_BIT; + + return rmpadjust((unsigned long)va, RMP_PG_SIZE_4K, attrs); +} + +#define __ATTR_BASE (SVM_SELECTOR_P_MASK | SVM_SELECTOR_S_MASK) +#define INIT_CS_ATTRIBS (__ATTR_BASE | SVM_SELECTOR_READ_MASK | SVM_SELECTOR_CODE_MASK) +#define INIT_DS_ATTRIBS (__ATTR_BASE | SVM_SELECTOR_WRITE_MASK) + +#define INIT_LDTR_ATTRIBS (SVM_SELECTOR_P_MASK | 2) +#define INIT_TR_ATTRIBS (SVM_SELECTOR_P_MASK | 3) + +static void *snp_alloc_vmsa_page(void) +{ + struct page *p; + + /* + * Allocate VMSA page to work around the SNP erratum where the CPU will + * incorrectly signal an RMP violation #PF if a large page (2MB or 1GB) + * collides with the RMP entry of VMSA page. The recommended workaround + * is to not use a large page. + * + * Allocate an 8k page which is also 8k-aligned. 
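The allocation trick described above works because the second half of an 8 KB-aligned pair can never itself be 2 MB (or 1 GB) aligned; a throwaway arithmetic check with a deliberately worst-case starting address:

#include <assert.h>

int main(void)
{
        unsigned long first  = 0x40000000UL;   /* 8 KB aligned and, worst case, also 1 GB/2 MB aligned */
        unsigned long second = first + 4096;   /* the page actually kept for the VMSA */

        assert((second & ((2UL << 20) - 1)) != 0);   /* never 2 MB aligned */
        assert((second & ((1UL << 30) - 1)) != 0);   /* never 1 GB aligned */
        return 0;
}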
+ */ + p = alloc_pages(GFP_KERNEL_ACCOUNT | __GFP_ZERO, 1); + if (!p) + return NULL; + + split_page(p, 1); + + /* Free the first 4k. This page may be 2M/1G aligned and cannot be used. */ + __free_page(p); + + return page_address(p + 1); +} + +static void snp_cleanup_vmsa(struct sev_es_save_area *vmsa) +{ + int err; + + err = snp_set_vmsa(vmsa, false); + if (err) + pr_err("clear VMSA page failed (%u), leaking page\n", err); + else + free_page((unsigned long)vmsa); +} + +static int wakeup_cpu_via_vmgexit(int apic_id, unsigned long start_ip) +{ + struct sev_es_save_area *cur_vmsa, *vmsa; + struct ghcb_state state; + unsigned long flags; + struct ghcb *ghcb; + u8 sipi_vector; + int cpu, ret; + u64 cr4; + + /* + * The hypervisor SNP feature support check has happened earlier, just check + * the AP_CREATION one here. + */ + if (!(sev_hv_features & GHCB_HV_FT_SNP_AP_CREATION)) + return -EOPNOTSUPP; + + /* + * Verify the desired start IP against the known trampoline start IP + * to catch any future new trampolines that may be introduced that + * would require a new protected guest entry point. + */ + if (WARN_ONCE(start_ip != real_mode_header->trampoline_start, + "Unsupported SNP start_ip: %lx\n", start_ip)) + return -EINVAL; + + /* Override start_ip with known protected guest start IP */ + start_ip = real_mode_header->sev_es_trampoline_start; + + /* Find the logical CPU for the APIC ID */ + for_each_present_cpu(cpu) { + if (arch_match_cpu_phys_id(cpu, apic_id)) + break; + } + if (cpu >= nr_cpu_ids) + return -EINVAL; + + cur_vmsa = per_cpu(sev_vmsa, cpu); + + /* + * A new VMSA is created each time because there is no guarantee that + * the current VMSA is the kernels or that the vCPU is not running. If + * an attempt was done to use the current VMSA with a running vCPU, a + * #VMEXIT of that vCPU would wipe out all of the settings being done + * here. + */ + vmsa = (struct sev_es_save_area *)snp_alloc_vmsa_page(); + if (!vmsa) + return -ENOMEM; + + /* CR4 should maintain the MCE value */ + cr4 = native_read_cr4() & X86_CR4_MCE; + + /* Set the CS value based on the start_ip converted to a SIPI vector */ + sipi_vector = (start_ip >> 12); + vmsa->cs.base = sipi_vector << 12; + vmsa->cs.limit = AP_INIT_CS_LIMIT; + vmsa->cs.attrib = INIT_CS_ATTRIBS; + vmsa->cs.selector = sipi_vector << 8; + + /* Set the RIP value based on start_ip */ + vmsa->rip = start_ip & 0xfff; + + /* Set AP INIT defaults as documented in the APM */ + vmsa->ds.limit = AP_INIT_DS_LIMIT; + vmsa->ds.attrib = INIT_DS_ATTRIBS; + vmsa->es = vmsa->ds; + vmsa->fs = vmsa->ds; + vmsa->gs = vmsa->ds; + vmsa->ss = vmsa->ds; + + vmsa->gdtr.limit = AP_INIT_GDTR_LIMIT; + vmsa->ldtr.limit = AP_INIT_LDTR_LIMIT; + vmsa->ldtr.attrib = INIT_LDTR_ATTRIBS; + vmsa->idtr.limit = AP_INIT_IDTR_LIMIT; + vmsa->tr.limit = AP_INIT_TR_LIMIT; + vmsa->tr.attrib = INIT_TR_ATTRIBS; + + vmsa->cr4 = cr4; + vmsa->cr0 = AP_INIT_CR0_DEFAULT; + vmsa->dr7 = DR7_RESET_VALUE; + vmsa->dr6 = AP_INIT_DR6_DEFAULT; + vmsa->rflags = AP_INIT_RFLAGS_DEFAULT; + vmsa->g_pat = AP_INIT_GPAT_DEFAULT; + vmsa->xcr0 = AP_INIT_XCR0_DEFAULT; + vmsa->mxcsr = AP_INIT_MXCSR_DEFAULT; + vmsa->x87_ftw = AP_INIT_X87_FTW_DEFAULT; + vmsa->x87_fcw = AP_INIT_X87_FCW_DEFAULT; + + /* SVME must be set. 
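+ * Per the VMRUN consistency checks in the AMD64 APM, guest state with
+ * EFER.SVME clear is illegal, so the VMSA built here has to carry
+ * EFER_SVME or the vCPU created below could not be run.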
*/ + vmsa->efer = EFER_SVME; + + /* + * Set the SNP-specific fields for this VMSA: + * VMPL level + * SEV_FEATURES (matches the SEV STATUS MSR right shifted 2 bits) + */ + vmsa->vmpl = 0; + vmsa->sev_features = sev_status >> 2; + + /* Switch the page over to a VMSA page now that it is initialized */ + ret = snp_set_vmsa(vmsa, true); + if (ret) { + pr_err("set VMSA page failed (%u)\n", ret); + free_page((unsigned long)vmsa); + + return -EINVAL; + } + + /* Issue VMGEXIT AP Creation NAE event */ + local_irq_save(flags); + + ghcb = __sev_get_ghcb(&state); + + vc_ghcb_invalidate(ghcb); + ghcb_set_rax(ghcb, vmsa->sev_features); + ghcb_set_sw_exit_code(ghcb, SVM_VMGEXIT_AP_CREATION); + ghcb_set_sw_exit_info_1(ghcb, ((u64)apic_id << 32) | SVM_VMGEXIT_AP_CREATE); + ghcb_set_sw_exit_info_2(ghcb, __pa(vmsa)); + + sev_es_wr_ghcb_msr(__pa(ghcb)); + VMGEXIT(); + + if (!ghcb_sw_exit_info_1_is_valid(ghcb) || + lower_32_bits(ghcb->save.sw_exit_info_1)) { + pr_err("SNP AP Creation error\n"); + ret = -EINVAL; + } + + __sev_put_ghcb(&state); + + local_irq_restore(flags); + + /* Perform cleanup if there was an error */ + if (ret) { + snp_cleanup_vmsa(vmsa); + vmsa = NULL; + } + + /* Free up any previous VMSA page */ + if (cur_vmsa) + snp_cleanup_vmsa(cur_vmsa); + + /* Record the current VMSA page */ + per_cpu(sev_vmsa, cpu) = vmsa; + + return ret; +} + +void snp_set_wakeup_secondary_cpu(void) +{ + if (!cc_platform_has(CC_ATTR_GUEST_SEV_SNP)) + return; + + /* + * Always set this override if SNP is enabled. This makes it the + * required method to start APs under SNP. If the hypervisor does + * not support AP creation, then no APs will be started. + */ + apic->wakeup_secondary_cpu = wakeup_cpu_via_vmgexit; +} + +int __init sev_es_setup_ap_jump_table(struct real_mode_header *rmh) { u16 startup_cs, startup_ip; phys_addr_t jump_table_pa; @@ -644,15 +1223,39 @@ static enum es_result vc_handle_msr(struct ghcb *ghcb, struct es_em_ctxt *ctxt) return ret; } -/* - * This function runs on the first #VC exception after the kernel - * switched to virtual addresses. - */ -static bool __init sev_es_setup_ghcb(void) +static void snp_register_per_cpu_ghcb(void) +{ + struct sev_es_runtime_data *data; + struct ghcb *ghcb; + + data = this_cpu_read(runtime_data); + ghcb = &data->ghcb_page; + + snp_register_ghcb_early(__pa(ghcb)); +} + +void setup_ghcb(void) { + if (!cc_platform_has(CC_ATTR_GUEST_STATE_ENCRYPT)) + return; + /* First make sure the hypervisor talks a supported protocol. */ if (!sev_es_negotiate_protocol()) - return false; + sev_es_terminate(SEV_TERM_SET_GEN, GHCB_SEV_ES_GEN_REQ); + + /* + * Check whether the runtime #VC exception handler is active. It uses + * the per-CPU GHCB page which is set up by sev_es_init_vc_handling(). + * + * If SNP is active, register the per-CPU GHCB page so that the runtime + * exception handler can use it. + */ + if (initial_vc_handler == (unsigned long)kernel_exc_vmm_communication) { + if (cc_platform_has(CC_ATTR_GUEST_SEV_SNP)) + snp_register_per_cpu_ghcb(); + + return; + } /* * Clear the boot_ghcb. The first exception comes in before the bss @@ -663,7 +1266,9 @@ static bool __init sev_es_setup_ghcb(void) /* Alright - Make the boot-ghcb public */ boot_ghcb = &boot_ghcb_page; - return true; + /* SNP guest requires that GHCB GPA must be registered. 
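+ * Version 2 of the GHCB specification requires an SNP guest to register
+ * the GPA of its GHCB with the hypervisor (via the GHCB MSR protocol)
+ * before using the page; snp_register_ghcb_early() issues that
+ * registration request for the boot GHCB.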
*/ + if (cc_platform_has(CC_ATTR_GUEST_SEV_SNP)) + snp_register_ghcb_early(__pa(&boot_ghcb_page)); } #ifdef CONFIG_HOTPLUG_CPU @@ -766,6 +1371,17 @@ void __init sev_es_init_vc_handling(void) if (!sev_es_check_cpu_features()) panic("SEV-ES CPU Features missing"); + /* + * SNP is supported in v2 of the GHCB spec which mandates support for HV + * features. + */ + if (cc_platform_has(CC_ATTR_GUEST_SEV_SNP)) { + sev_hv_features = get_hv_features(); + + if (!(sev_hv_features & GHCB_HV_FT_SNP)) + sev_es_terminate(SEV_TERM_SET_GEN, GHCB_SNP_UNSUPPORTED); + } + /* Enable SEV-ES special handling */ static_branch_enable(&sev_es_enable_key); @@ -1337,7 +1953,7 @@ DEFINE_IDTENTRY_VC_KERNEL(exc_vmm_communication) show_regs(regs); /* Ask hypervisor to sev_es_terminate */ - sev_es_terminate(GHCB_SEV_ES_GEN_REQ); + sev_es_terminate(SEV_TERM_SET_GEN, GHCB_SEV_ES_GEN_REQ); /* If that fails and we get here - just panic */ panic("Returned from Terminate-Request to Hypervisor\n"); @@ -1383,10 +1999,6 @@ bool __init handle_vc_boot_ghcb(struct pt_regs *regs) struct es_em_ctxt ctxt; enum es_result result; - /* Do initial setup or terminate the guest */ - if (unlikely(boot_ghcb == NULL && !sev_es_setup_ghcb())) - sev_es_terminate(GHCB_SEV_ES_GEN_REQ); - vc_ghcb_invalidate(boot_ghcb); result = vc_init_em_ctxt(&ctxt, regs, exit_code); @@ -1425,6 +2037,215 @@ bool __init handle_vc_boot_ghcb(struct pt_regs *regs) fail: show_regs(regs); - while (true) - halt(); + sev_es_terminate(SEV_TERM_SET_GEN, GHCB_SEV_ES_GEN_REQ); +} + +/* + * Initial set up of SNP relies on information provided by the + * Confidential Computing blob, which can be passed to the kernel + * in the following ways, depending on how it is booted: + * + * - when booted via the boot/decompress kernel: + * - via boot_params + * + * - when booted directly by firmware/bootloader (e.g. CONFIG_PVH): + * - via a setup_data entry, as defined by the Linux Boot Protocol + * + * Scan for the blob in that order. + */ +static __init struct cc_blob_sev_info *find_cc_blob(struct boot_params *bp) +{ + struct cc_blob_sev_info *cc_info; + + /* Boot kernel would have passed the CC blob via boot_params. */ + if (bp->cc_blob_address) { + cc_info = (struct cc_blob_sev_info *)(unsigned long)bp->cc_blob_address; + goto found_cc_info; + } + + /* + * If kernel was booted directly, without the use of the + * boot/decompression kernel, the CC blob may have been passed via + * setup_data instead. + */ + cc_info = find_cc_blob_setup_data(bp); + if (!cc_info) + return NULL; + +found_cc_info: + if (cc_info->magic != CC_BLOB_SEV_HDR_MAGIC) + snp_abort(); + + return cc_info; +} + +bool __init snp_init(struct boot_params *bp) +{ + struct cc_blob_sev_info *cc_info; + + if (!bp) + return false; + + cc_info = find_cc_blob(bp); + if (!cc_info) + return false; + + setup_cpuid_table(cc_info); + + /* + * The CC blob will be used later to access the secrets page. Cache + * it here like the boot kernel does. 
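+ * Stashing the blob address back into boot_params gives later
+ * consumers (e.g. the secrets page lookup) one well-known location to
+ * find it, regardless of whether it originally arrived via boot_params
+ * or via setup_data.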
+ */ + bp->cc_blob_address = (u32)(unsigned long)cc_info; + + return true; +} + +void __init snp_abort(void) +{ + sev_es_terminate(SEV_TERM_SET_GEN, GHCB_SNP_UNSUPPORTED); +} + +static void dump_cpuid_table(void) +{ + const struct snp_cpuid_table *cpuid_table = snp_cpuid_get_table(); + int i = 0; + + pr_info("count=%d reserved=0x%x reserved2=0x%llx\n", + cpuid_table->count, cpuid_table->__reserved1, cpuid_table->__reserved2); + + for (i = 0; i < SNP_CPUID_COUNT_MAX; i++) { + const struct snp_cpuid_fn *fn = &cpuid_table->fn[i]; + + pr_info("index=%3d fn=0x%08x subfn=0x%08x: eax=0x%08x ebx=0x%08x ecx=0x%08x edx=0x%08x xcr0_in=0x%016llx xss_in=0x%016llx reserved=0x%016llx\n", + i, fn->eax_in, fn->ecx_in, fn->eax, fn->ebx, fn->ecx, + fn->edx, fn->xcr0_in, fn->xss_in, fn->__reserved); + } +} + +/* + * It is useful from an auditing/testing perspective to provide an easy way + * for the guest owner to know that the CPUID table has been initialized as + * expected, but that initialization happens too early in boot to print any + * sort of indicator, and there's not really any other good place to do it, + * so do it here. + */ +static int __init report_cpuid_table(void) +{ + const struct snp_cpuid_table *cpuid_table = snp_cpuid_get_table(); + + if (!cpuid_table->count) + return 0; + + pr_info("Using SNP CPUID table, %d entries present.\n", + cpuid_table->count); + + if (sev_cfg.debug) + dump_cpuid_table(); + + return 0; +} +arch_initcall(report_cpuid_table); + +static int __init init_sev_config(char *str) +{ + char *s; + + while ((s = strsep(&str, ","))) { + if (!strcmp(s, "debug")) { + sev_cfg.debug = true; + continue; + } + + pr_info("SEV command-line option '%s' was not recognized\n", s); + } + + return 1; +} +__setup("sev=", init_sev_config); + +int snp_issue_guest_request(u64 exit_code, struct snp_req_data *input, unsigned long *fw_err) +{ + struct ghcb_state state; + struct es_em_ctxt ctxt; + unsigned long flags; + struct ghcb *ghcb; + int ret; + + if (!cc_platform_has(CC_ATTR_GUEST_SEV_SNP)) + return -ENODEV; + + if (!fw_err) + return -EINVAL; + + /* + * __sev_get_ghcb() needs to run with IRQs disabled because it is using + * a per-CPU GHCB. 
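+ * Taking an interrupt in this window could raise a nested #VC that
+ * needs the same per-CPU GHCB while this request is being built, and
+ * keeping IRQs off also prevents the task from migrating to another
+ * CPU in the middle of the call.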
+ */ + local_irq_save(flags); + + ghcb = __sev_get_ghcb(&state); + if (!ghcb) { + ret = -EIO; + goto e_restore_irq; + } + + vc_ghcb_invalidate(ghcb); + + if (exit_code == SVM_VMGEXIT_EXT_GUEST_REQUEST) { + ghcb_set_rax(ghcb, input->data_gpa); + ghcb_set_rbx(ghcb, input->data_npages); + } + + ret = sev_es_ghcb_hv_call(ghcb, true, &ctxt, exit_code, input->req_gpa, input->resp_gpa); + if (ret) + goto e_put; + + if (ghcb->save.sw_exit_info_2) { + /* Number of expected pages are returned in RBX */ + if (exit_code == SVM_VMGEXIT_EXT_GUEST_REQUEST && + ghcb->save.sw_exit_info_2 == SNP_GUEST_REQ_INVALID_LEN) + input->data_npages = ghcb_get_rbx(ghcb); + + *fw_err = ghcb->save.sw_exit_info_2; + + ret = -EIO; + } + +e_put: + __sev_put_ghcb(&state); +e_restore_irq: + local_irq_restore(flags); + + return ret; +} +EXPORT_SYMBOL_GPL(snp_issue_guest_request); + +static struct platform_device sev_guest_device = { + .name = "sev-guest", + .id = -1, +}; + +static int __init snp_init_platform_device(void) +{ + struct sev_guest_platform_data data; + u64 gpa; + + if (!cc_platform_has(CC_ATTR_GUEST_SEV_SNP)) + return -ENODEV; + + gpa = get_secrets_page(); + if (!gpa) + return -ENODEV; + + data.secrets_gpa = gpa; + if (platform_device_add_data(&sev_guest_device, &data, sizeof(data))) + return -ENODEV; + + if (platform_device_register(&sev_guest_device)) + return -ENODEV; + + pr_info("SNP guest platform device initialized.\n"); + return 0; } +device_initcall(snp_init_platform_device); diff --git a/arch/x86/kernel/signal.c b/arch/x86/kernel/signal.c index e439eb14325f..9c7265b524c7 100644 --- a/arch/x86/kernel/signal.c +++ b/arch/x86/kernel/signal.c @@ -93,7 +93,7 @@ static bool restore_sigcontext(struct pt_regs *regs, return false; #ifdef CONFIG_X86_32 - set_user_gs(regs, sc.gs); + loadsegment(gs, sc.gs); regs->fs = sc.fs; regs->es = sc.es; regs->ds = sc.ds; @@ -146,8 +146,10 @@ __unsafe_setup_sigcontext(struct sigcontext __user *sc, void __user *fpstate, struct pt_regs *regs, unsigned long mask) { #ifdef CONFIG_X86_32 - unsafe_put_user(get_user_gs(regs), - (unsigned int __user *)&sc->gs, Efault); + unsigned int gs; + savesegment(gs, gs); + + unsafe_put_user(gs, (unsigned int __user *)&sc->gs, Efault); unsafe_put_user(regs->fs, (unsigned int __user *)&sc->fs, Efault); unsafe_put_user(regs->es, (unsigned int __user *)&sc->es, Efault); unsafe_put_user(regs->ds, (unsigned int __user *)&sc->ds, Efault); diff --git a/arch/x86/kernel/smpboot.c b/arch/x86/kernel/smpboot.c index 2ef14772dc04..5e7f9532a10d 100644 --- a/arch/x86/kernel/smpboot.c +++ b/arch/x86/kernel/smpboot.c @@ -56,7 +56,6 @@ #include <linux/numa.h> #include <linux/pgtable.h> #include <linux/overflow.h> -#include <linux/syscore_ops.h> #include <asm/acpi.h> #include <asm/desc.h> @@ -82,6 +81,7 @@ #include <asm/spec-ctrl.h> #include <asm/hw_irq.h> #include <asm/stackprotector.h> +#include <asm/sev.h> /* representing HT siblings of each logical CPU */ DEFINE_PER_CPU_READ_MOSTLY(cpumask_var_t, cpu_sibling_map); @@ -187,7 +187,7 @@ static void smp_callin(void) */ set_cpu_sibling_map(raw_smp_processor_id()); - init_freq_invariance(true, false); + ap_init_aperfmperf(); /* * Get our bogomips. 
@@ -1082,6 +1082,11 @@ static int do_boot_cpu(int apicid, int cpu, struct task_struct *idle, unsigned long boot_error = 0; unsigned long timeout; +#ifdef CONFIG_X86_64 + /* If 64-bit wakeup method exists, use the 64-bit mode trampoline IP */ + if (apic->wakeup_secondary_cpu_64) + start_ip = real_mode_header->trampoline_start64; +#endif idle->thread.sp = (unsigned long)task_pt_regs(idle); early_gdt_descr.address = (unsigned long)get_cpu_gdt_rw(cpu); initial_code = (unsigned long)start_secondary; @@ -1123,11 +1128,14 @@ static int do_boot_cpu(int apicid, int cpu, struct task_struct *idle, /* * Wake up a CPU in difference cases: - * - Use the method in the APIC driver if it's defined + * - Use a method from the APIC driver if one defined, with wakeup + * straight to 64-bit mode preferred over wakeup to RM. * Otherwise, * - Use an INIT boot APIC message for APs or NMI for BSP. */ - if (apic->wakeup_secondary_cpu) + if (apic->wakeup_secondary_cpu_64) + boot_error = apic->wakeup_secondary_cpu_64(apicid, start_ip); + else if (apic->wakeup_secondary_cpu) boot_error = apic->wakeup_secondary_cpu(apicid, start_ip); else boot_error = wakeup_cpu_via_init_nmi(cpu, start_ip, apicid, @@ -1397,7 +1405,6 @@ void __init native_smp_prepare_cpus(unsigned int max_cpus) { smp_prepare_cpus_common(); - init_freq_invariance(false, false); smp_sanity_check(); switch (apic_intr_mode) { @@ -1430,6 +1437,8 @@ void __init native_smp_prepare_cpus(unsigned int max_cpus) smp_quirk_init_udelay(); speculative_store_bypass_ht_init(); + + snp_set_wakeup_secondary_cpu(); } void arch_thaw_secondary_cpus_begin(void) @@ -1847,357 +1856,3 @@ void native_play_dead(void) } #endif - -#ifdef CONFIG_X86_64 -/* - * APERF/MPERF frequency ratio computation. - * - * The scheduler wants to do frequency invariant accounting and needs a <1 - * ratio to account for the 'current' frequency, corresponding to - * freq_curr / freq_max. - * - * Since the frequency freq_curr on x86 is controlled by micro-controller and - * our P-state setting is little more than a request/hint, we need to observe - * the effective frequency 'BusyMHz', i.e. the average frequency over a time - * interval after discarding idle time. This is given by: - * - * BusyMHz = delta_APERF / delta_MPERF * freq_base - * - * where freq_base is the max non-turbo P-state. - * - * The freq_max term has to be set to a somewhat arbitrary value, because we - * can't know which turbo states will be available at a given point in time: - * it all depends on the thermal headroom of the entire package. We set it to - * the turbo level with 4 cores active. - * - * Benchmarks show that's a good compromise between the 1C turbo ratio - * (freq_curr/freq_max would rarely reach 1) and something close to freq_base, - * which would ignore the entire turbo range (a conspicuous part, making - * freq_curr/freq_max always maxed out). - * - * An exception to the heuristic above is the Atom uarch, where we choose the - * highest turbo level for freq_max since Atom's are generally oriented towards - * power efficiency. - * - * Setting freq_max to anything less than the 1C turbo ratio makes the ratio - * freq_curr / freq_max to eventually grow >1, in which case we clip it to 1. 
- */ - -DEFINE_STATIC_KEY_FALSE(arch_scale_freq_key); - -static DEFINE_PER_CPU(u64, arch_prev_aperf); -static DEFINE_PER_CPU(u64, arch_prev_mperf); -static u64 arch_turbo_freq_ratio = SCHED_CAPACITY_SCALE; -static u64 arch_max_freq_ratio = SCHED_CAPACITY_SCALE; - -void arch_set_max_freq_ratio(bool turbo_disabled) -{ - arch_max_freq_ratio = turbo_disabled ? SCHED_CAPACITY_SCALE : - arch_turbo_freq_ratio; -} -EXPORT_SYMBOL_GPL(arch_set_max_freq_ratio); - -static bool turbo_disabled(void) -{ - u64 misc_en; - int err; - - err = rdmsrl_safe(MSR_IA32_MISC_ENABLE, &misc_en); - if (err) - return false; - - return (misc_en & MSR_IA32_MISC_ENABLE_TURBO_DISABLE); -} - -static bool slv_set_max_freq_ratio(u64 *base_freq, u64 *turbo_freq) -{ - int err; - - err = rdmsrl_safe(MSR_ATOM_CORE_RATIOS, base_freq); - if (err) - return false; - - err = rdmsrl_safe(MSR_ATOM_CORE_TURBO_RATIOS, turbo_freq); - if (err) - return false; - - *base_freq = (*base_freq >> 16) & 0x3F; /* max P state */ - *turbo_freq = *turbo_freq & 0x3F; /* 1C turbo */ - - return true; -} - -#define X86_MATCH(model) \ - X86_MATCH_VENDOR_FAM_MODEL_FEATURE(INTEL, 6, \ - INTEL_FAM6_##model, X86_FEATURE_APERFMPERF, NULL) - -static const struct x86_cpu_id has_knl_turbo_ratio_limits[] = { - X86_MATCH(XEON_PHI_KNL), - X86_MATCH(XEON_PHI_KNM), - {} -}; - -static const struct x86_cpu_id has_skx_turbo_ratio_limits[] = { - X86_MATCH(SKYLAKE_X), - {} -}; - -static const struct x86_cpu_id has_glm_turbo_ratio_limits[] = { - X86_MATCH(ATOM_GOLDMONT), - X86_MATCH(ATOM_GOLDMONT_D), - X86_MATCH(ATOM_GOLDMONT_PLUS), - {} -}; - -static bool knl_set_max_freq_ratio(u64 *base_freq, u64 *turbo_freq, - int num_delta_fratio) -{ - int fratio, delta_fratio, found; - int err, i; - u64 msr; - - err = rdmsrl_safe(MSR_PLATFORM_INFO, base_freq); - if (err) - return false; - - *base_freq = (*base_freq >> 8) & 0xFF; /* max P state */ - - err = rdmsrl_safe(MSR_TURBO_RATIO_LIMIT, &msr); - if (err) - return false; - - fratio = (msr >> 8) & 0xFF; - i = 16; - found = 0; - do { - if (found >= num_delta_fratio) { - *turbo_freq = fratio; - return true; - } - - delta_fratio = (msr >> (i + 5)) & 0x7; - - if (delta_fratio) { - found += 1; - fratio -= delta_fratio; - } - - i += 8; - } while (i < 64); - - return true; -} - -static bool skx_set_max_freq_ratio(u64 *base_freq, u64 *turbo_freq, int size) -{ - u64 ratios, counts; - u32 group_size; - int err, i; - - err = rdmsrl_safe(MSR_PLATFORM_INFO, base_freq); - if (err) - return false; - - *base_freq = (*base_freq >> 8) & 0xFF; /* max P state */ - - err = rdmsrl_safe(MSR_TURBO_RATIO_LIMIT, &ratios); - if (err) - return false; - - err = rdmsrl_safe(MSR_TURBO_RATIO_LIMIT1, &counts); - if (err) - return false; - - for (i = 0; i < 64; i += 8) { - group_size = (counts >> i) & 0xFF; - if (group_size >= size) { - *turbo_freq = (ratios >> i) & 0xFF; - return true; - } - } - - return false; -} - -static bool core_set_max_freq_ratio(u64 *base_freq, u64 *turbo_freq) -{ - u64 msr; - int err; - - err = rdmsrl_safe(MSR_PLATFORM_INFO, base_freq); - if (err) - return false; - - err = rdmsrl_safe(MSR_TURBO_RATIO_LIMIT, &msr); - if (err) - return false; - - *base_freq = (*base_freq >> 8) & 0xFF; /* max P state */ - *turbo_freq = (msr >> 24) & 0xFF; /* 4C turbo */ - - /* The CPU may have less than 4 cores */ - if (!*turbo_freq) - *turbo_freq = msr & 0xFF; /* 1C turbo */ - - return true; -} - -static bool intel_set_max_freq_ratio(void) -{ - u64 base_freq, turbo_freq; - u64 turbo_ratio; - - if (slv_set_max_freq_ratio(&base_freq, &turbo_freq)) - goto out; - - 
if (x86_match_cpu(has_glm_turbo_ratio_limits) && - skx_set_max_freq_ratio(&base_freq, &turbo_freq, 1)) - goto out; - - if (x86_match_cpu(has_knl_turbo_ratio_limits) && - knl_set_max_freq_ratio(&base_freq, &turbo_freq, 1)) - goto out; - - if (x86_match_cpu(has_skx_turbo_ratio_limits) && - skx_set_max_freq_ratio(&base_freq, &turbo_freq, 4)) - goto out; - - if (core_set_max_freq_ratio(&base_freq, &turbo_freq)) - goto out; - - return false; - -out: - /* - * Some hypervisors advertise X86_FEATURE_APERFMPERF - * but then fill all MSR's with zeroes. - * Some CPUs have turbo boost but don't declare any turbo ratio - * in MSR_TURBO_RATIO_LIMIT. - */ - if (!base_freq || !turbo_freq) { - pr_debug("Couldn't determine cpu base or turbo frequency, necessary for scale-invariant accounting.\n"); - return false; - } - - turbo_ratio = div_u64(turbo_freq * SCHED_CAPACITY_SCALE, base_freq); - if (!turbo_ratio) { - pr_debug("Non-zero turbo and base frequencies led to a 0 ratio.\n"); - return false; - } - - arch_turbo_freq_ratio = turbo_ratio; - arch_set_max_freq_ratio(turbo_disabled()); - - return true; -} - -static void init_counter_refs(void) -{ - u64 aperf, mperf; - - rdmsrl(MSR_IA32_APERF, aperf); - rdmsrl(MSR_IA32_MPERF, mperf); - - this_cpu_write(arch_prev_aperf, aperf); - this_cpu_write(arch_prev_mperf, mperf); -} - -#ifdef CONFIG_PM_SLEEP -static struct syscore_ops freq_invariance_syscore_ops = { - .resume = init_counter_refs, -}; - -static void register_freq_invariance_syscore_ops(void) -{ - /* Bail out if registered already. */ - if (freq_invariance_syscore_ops.node.prev) - return; - - register_syscore_ops(&freq_invariance_syscore_ops); -} -#else -static inline void register_freq_invariance_syscore_ops(void) {} -#endif - -void init_freq_invariance(bool secondary, bool cppc_ready) -{ - bool ret = false; - - if (!boot_cpu_has(X86_FEATURE_APERFMPERF)) - return; - - if (secondary) { - if (static_branch_likely(&arch_scale_freq_key)) { - init_counter_refs(); - } - return; - } - - if (boot_cpu_data.x86_vendor == X86_VENDOR_INTEL) - ret = intel_set_max_freq_ratio(); - else if (boot_cpu_data.x86_vendor == X86_VENDOR_AMD) { - if (!cppc_ready) { - return; - } - ret = amd_set_max_freq_ratio(&arch_turbo_freq_ratio); - } - - if (ret) { - init_counter_refs(); - static_branch_enable(&arch_scale_freq_key); - register_freq_invariance_syscore_ops(); - pr_info("Estimated ratio of average max frequency by base frequency (times 1024): %llu\n", arch_max_freq_ratio); - } else { - pr_debug("Couldn't determine max cpu frequency, necessary for scale-invariant accounting.\n"); - } -} - -static void disable_freq_invariance_workfn(struct work_struct *work) -{ - static_branch_disable(&arch_scale_freq_key); -} - -static DECLARE_WORK(disable_freq_invariance_work, - disable_freq_invariance_workfn); - -DEFINE_PER_CPU(unsigned long, arch_freq_scale) = SCHED_CAPACITY_SCALE; - -void arch_scale_freq_tick(void) -{ - u64 freq_scale; - u64 aperf, mperf; - u64 acnt, mcnt; - - if (!arch_scale_freq_invariant()) - return; - - rdmsrl(MSR_IA32_APERF, aperf); - rdmsrl(MSR_IA32_MPERF, mperf); - - acnt = aperf - this_cpu_read(arch_prev_aperf); - mcnt = mperf - this_cpu_read(arch_prev_mperf); - - this_cpu_write(arch_prev_aperf, aperf); - this_cpu_write(arch_prev_mperf, mperf); - - if (check_shl_overflow(acnt, 2*SCHED_CAPACITY_SHIFT, &acnt)) - goto error; - - if (check_mul_overflow(mcnt, arch_max_freq_ratio, &mcnt) || !mcnt) - goto error; - - freq_scale = div64_u64(acnt, mcnt); - if (!freq_scale) - goto error; - - if (freq_scale > SCHED_CAPACITY_SCALE) 
- freq_scale = SCHED_CAPACITY_SCALE; - - this_cpu_write(arch_freq_scale, freq_scale); - return; - -error: - pr_warn("Scheduler frequency invariance went wobbly, disabling!\n"); - schedule_work(&disable_freq_invariance_work); -} -#endif /* CONFIG_X86_64 */ diff --git a/arch/x86/kernel/sys_x86_64.c b/arch/x86/kernel/sys_x86_64.c index 660b78827638..8cc653ffdccd 100644 --- a/arch/x86/kernel/sys_x86_64.c +++ b/arch/x86/kernel/sys_x86_64.c @@ -68,9 +68,6 @@ static int __init control_va_addr_alignment(char *str) if (*str == 0) return 1; - if (*str == '=') - str++; - if (!strcmp(str, "32")) va_align.flags = ALIGN_VA_32; else if (!strcmp(str, "64")) @@ -80,11 +77,11 @@ static int __init control_va_addr_alignment(char *str) else if (!strcmp(str, "on")) va_align.flags = ALIGN_VA_32 | ALIGN_VA_64; else - return 0; + pr_warn("invalid option value: 'align_va_addr=%s'\n", str); return 1; } -__setup("align_va_addr", control_va_addr_alignment); +__setup("align_va_addr=", control_va_addr_alignment); SYSCALL_DEFINE6(mmap, unsigned long, addr, unsigned long, len, unsigned long, prot, unsigned long, flags, diff --git a/arch/x86/kernel/traps.c b/arch/x86/kernel/traps.c index 1563fb995005..d62b2cb85cea 100644 --- a/arch/x86/kernel/traps.c +++ b/arch/x86/kernel/traps.c @@ -62,6 +62,7 @@ #include <asm/insn.h> #include <asm/insn-eval.h> #include <asm/vdso.h> +#include <asm/tdx.h> #ifdef CONFIG_X86_64 #include <asm/x86_init.h> @@ -686,13 +687,40 @@ static bool try_fixup_enqcmd_gp(void) #endif } +static bool gp_try_fixup_and_notify(struct pt_regs *regs, int trapnr, + unsigned long error_code, const char *str) +{ + if (fixup_exception(regs, trapnr, error_code, 0)) + return true; + + current->thread.error_code = error_code; + current->thread.trap_nr = trapnr; + + /* + * To be potentially processing a kprobe fault and to trust the result + * from kprobe_running(), we have to be non-preemptible. + */ + if (!preemptible() && kprobe_running() && + kprobe_fault_handler(regs, trapnr)) + return true; + + return notify_die(DIE_GPF, str, regs, error_code, trapnr, SIGSEGV) == NOTIFY_STOP; +} + +static void gp_user_force_sig_segv(struct pt_regs *regs, int trapnr, + unsigned long error_code, const char *str) +{ + current->thread.error_code = error_code; + current->thread.trap_nr = trapnr; + show_signal(current, SIGSEGV, "", str, regs, error_code); + force_sig(SIGSEGV); +} + DEFINE_IDTENTRY_ERRORCODE(exc_general_protection) { char desc[sizeof(GPFSTR) + 50 + 2*sizeof(unsigned long) + 1] = GPFSTR; enum kernel_gp_hint hint = GP_NO_HINT; - struct task_struct *tsk; unsigned long gp_addr; - int ret; if (user_mode(regs) && try_fixup_enqcmd_gp()) return; @@ -711,40 +739,18 @@ DEFINE_IDTENTRY_ERRORCODE(exc_general_protection) return; } - tsk = current; - if (user_mode(regs)) { if (fixup_iopl_exception(regs)) goto exit; - tsk->thread.error_code = error_code; - tsk->thread.trap_nr = X86_TRAP_GP; - if (fixup_vdso_exception(regs, X86_TRAP_GP, error_code, 0)) goto exit; - show_signal(tsk, SIGSEGV, "", desc, regs, error_code); - force_sig(SIGSEGV); + gp_user_force_sig_segv(regs, X86_TRAP_GP, error_code, desc); goto exit; } - if (fixup_exception(regs, X86_TRAP_GP, error_code, 0)) - goto exit; - - tsk->thread.error_code = error_code; - tsk->thread.trap_nr = X86_TRAP_GP; - - /* - * To be potentially processing a kprobe fault and to trust the result - * from kprobe_running(), we have to be non-preemptible. 
- */ - if (!preemptible() && - kprobe_running() && - kprobe_fault_handler(regs, X86_TRAP_GP)) - goto exit; - - ret = notify_die(DIE_GPF, desc, regs, error_code, X86_TRAP_GP, SIGSEGV); - if (ret == NOTIFY_STOP) + if (gp_try_fixup_and_notify(regs, X86_TRAP_GP, error_code, desc)) goto exit; if (error_code) @@ -892,14 +898,10 @@ sync: } #endif -struct bad_iret_stack { - void *error_entry_ret; - struct pt_regs regs; -}; - -asmlinkage __visible noinstr -struct bad_iret_stack *fixup_bad_iret(struct bad_iret_stack *s) +asmlinkage __visible noinstr struct pt_regs *fixup_bad_iret(struct pt_regs *bad_regs) { + struct pt_regs tmp, *new_stack; + /* * This is called from entry_64.S early in handling a fault * caused by a bad iret to user mode. To handle the fault @@ -908,19 +910,18 @@ struct bad_iret_stack *fixup_bad_iret(struct bad_iret_stack *s) * just below the IRET frame) and we want to pretend that the * exception came from the IRET target. */ - struct bad_iret_stack tmp, *new_stack = - (struct bad_iret_stack *)__this_cpu_read(cpu_tss_rw.x86_tss.sp0) - 1; + new_stack = (struct pt_regs *)__this_cpu_read(cpu_tss_rw.x86_tss.sp0) - 1; /* Copy the IRET target to the temporary storage. */ - __memcpy(&tmp.regs.ip, (void *)s->regs.sp, 5*8); + __memcpy(&tmp.ip, (void *)bad_regs->sp, 5*8); /* Copy the remainder of the stack from the current stack. */ - __memcpy(&tmp, s, offsetof(struct bad_iret_stack, regs.ip)); + __memcpy(&tmp, bad_regs, offsetof(struct pt_regs, ip)); /* Update the entry stack */ __memcpy(new_stack, &tmp, sizeof(tmp)); - BUG_ON(!user_mode(&new_stack->regs)); + BUG_ON(!user_mode(new_stack)); return new_stack; } #endif @@ -1343,6 +1344,91 @@ DEFINE_IDTENTRY(exc_device_not_available) } } +#ifdef CONFIG_INTEL_TDX_GUEST + +#define VE_FAULT_STR "VE fault" + +static void ve_raise_fault(struct pt_regs *regs, long error_code) +{ + if (user_mode(regs)) { + gp_user_force_sig_segv(regs, X86_TRAP_VE, error_code, VE_FAULT_STR); + return; + } + + if (gp_try_fixup_and_notify(regs, X86_TRAP_VE, error_code, VE_FAULT_STR)) + return; + + die_addr(VE_FAULT_STR, regs, error_code, 0); +} + +/* + * Virtualization Exceptions (#VE) are delivered to TDX guests due to + * specific guest actions which may happen in either user space or the + * kernel: + * + * * Specific instructions (WBINVD, for example) + * * Specific MSR accesses + * * Specific CPUID leaf accesses + * * Access to specific guest physical addresses + * + * In the settings that Linux will run in, virtualization exceptions are + * never generated on accesses to normal, TD-private memory that has been + * accepted (by BIOS or with tdx_enc_status_changed()). + * + * Syscall entry code has a critical window where the kernel stack is not + * yet set up. Any exception in this window leads to hard to debug issues + * and can be exploited for privilege escalation. Exceptions in the NMI + * entry code also cause issues. Returning from the exception handler with + * IRET will re-enable NMIs and nested NMI will corrupt the NMI stack. + * + * For these reasons, the kernel avoids #VEs during the syscall gap and + * the NMI entry code. Entry code paths do not access TD-shared memory, + * MMIO regions, use #VE triggering MSRs, instructions, or CPUID leaves + * that might generate #VE. VMM can remove memory from TD at any point, + * but access to unaccepted (or missing) private memory leads to VM + * termination, not to #VE. + * + * Similarly to page faults and breakpoints, #VEs are allowed in NMI + * handlers once the kernel is ready to deal with nested NMIs. 
+ * + * During #VE delivery, all interrupts, including NMIs, are blocked until + * TDGETVEINFO is called. It prevents #VE nesting until the kernel reads + * the VE info. + * + * If a guest kernel action which would normally cause a #VE occurs in + * the interrupt-disabled region before TDGETVEINFO, a #DF (fault + * exception) is delivered to the guest which will result in an oops. + * + * The entry code has been audited carefully for following these expectations. + * Changes in the entry code have to be audited for correctness vs. this + * aspect. Similarly to #PF, #VE in these places will expose kernel to + * privilege escalation or may lead to random crashes. + */ +DEFINE_IDTENTRY(exc_virtualization_exception) +{ + struct ve_info ve; + + /* + * NMIs/Machine-checks/Interrupts will be in a disabled state + * till TDGETVEINFO TDCALL is executed. This ensures that VE + * info cannot be overwritten by a nested #VE. + */ + tdx_get_ve_info(&ve); + + cond_local_irq_enable(regs); + + /* + * If tdx_handle_virt_exception() could not process + * it successfully, treat it as #GP(0) and handle it. + */ + if (!tdx_handle_virt_exception(regs, &ve)) + ve_raise_fault(regs, 0); + + cond_local_irq_disable(regs); +} + +#endif + #ifdef CONFIG_X86_32 DEFINE_IDTENTRY_SW(iret_error) { diff --git a/arch/x86/kernel/unwind_orc.c b/arch/x86/kernel/unwind_orc.c index 794fdef2501a..38185aedf7d1 100644 --- a/arch/x86/kernel/unwind_orc.c +++ b/arch/x86/kernel/unwind_orc.c @@ -339,11 +339,11 @@ static bool stack_access_ok(struct unwind_state *state, unsigned long _addr, struct stack_info *info = &state->stack_info; void *addr = (void *)_addr; - if (!on_stack(info, addr, len) && - (get_stack_info(addr, state->task, info, &state->stack_mask))) - return false; + if (on_stack(info, addr, len)) + return true; - return true; + return !get_stack_info(addr, state->task, info, &state->stack_mask) && + on_stack(info, addr, len); } static bool deref_stack_reg(struct unwind_state *state, unsigned long addr, diff --git a/arch/x86/kernel/vm86_32.c b/arch/x86/kernel/vm86_32.c index c21bcd668284..e9e803a4d44c 100644 --- a/arch/x86/kernel/vm86_32.c +++ b/arch/x86/kernel/vm86_32.c @@ -151,7 +151,7 @@ exit_vm86: memcpy(®s->pt, &vm86->regs32, sizeof(struct pt_regs)); - lazy_load_gs(vm86->regs32.gs); + loadsegment(gs, vm86->regs32.gs); regs->pt.ax = retval; return; @@ -325,7 +325,7 @@ static long do_sys_vm86(struct vm86plus_struct __user *user_vm86, bool plus) * Save old state */ vm86->saved_sp0 = tsk->thread.sp0; - lazy_save_gs(vm86->regs32.gs); + savesegment(gs, vm86->regs32.gs); /* make room for real-mode segments */ preempt_disable(); |