diff options
Diffstat (limited to 'arch/x86/kernel')
28 files changed, 394 insertions, 385 deletions
diff --git a/arch/x86/kernel/Makefile b/arch/x86/kernel/Makefile index 02d6f5cf4e70..8824d01c0c35 100644 --- a/arch/x86/kernel/Makefile +++ b/arch/x86/kernel/Makefile @@ -61,6 +61,7 @@ obj-y += alternative.o i8253.o hw_breakpoint.o obj-y += tsc.o tsc_msr.o io_delay.o rtc.o obj-y += pci-iommu_table.o obj-y += resource.o +obj-y += irqflags.o obj-y += process.o obj-y += fpu/ diff --git a/arch/x86/kernel/apic/apic.c b/arch/x86/kernel/apic/apic.c index 2aabd4cb0e3f..07fa222f0c52 100644 --- a/arch/x86/kernel/apic/apic.c +++ b/arch/x86/kernel/apic/apic.c @@ -573,6 +573,9 @@ static u32 skx_deadline_rev(void) case 0x04: return 0x02000014; } + if (boot_cpu_data.x86_stepping > 4) + return 0; + return ~0U; } @@ -937,7 +940,7 @@ static int __init calibrate_APIC_clock(void) if (levt->features & CLOCK_EVT_FEAT_DUMMY) { pr_warning("APIC timer disabled due to verification failure\n"); - return -1; + return -1; } return 0; diff --git a/arch/x86/kernel/apic/vector.c b/arch/x86/kernel/apic/vector.c index 35aaee4fc028..0954315842c0 100644 --- a/arch/x86/kernel/apic/vector.c +++ b/arch/x86/kernel/apic/vector.c @@ -218,7 +218,8 @@ static int reserve_irq_vector(struct irq_data *irqd) return 0; } -static int allocate_vector(struct irq_data *irqd, const struct cpumask *dest) +static int +assign_vector_locked(struct irq_data *irqd, const struct cpumask *dest) { struct apic_chip_data *apicd = apic_chip_data(irqd); bool resvd = apicd->has_reserved; @@ -245,22 +246,12 @@ static int allocate_vector(struct irq_data *irqd, const struct cpumask *dest) return -EBUSY; vector = irq_matrix_alloc(vector_matrix, dest, resvd, &cpu); - if (vector > 0) - apic_update_vector(irqd, vector, cpu); trace_vector_alloc(irqd->irq, vector, resvd, vector); - return vector; -} - -static int assign_vector_locked(struct irq_data *irqd, - const struct cpumask *dest) -{ - struct apic_chip_data *apicd = apic_chip_data(irqd); - int vector = allocate_vector(irqd, dest); - if (vector < 0) return vector; + apic_update_vector(irqd, vector, cpu); + apic_update_irq_cfg(irqd, vector, cpu); - apic_update_irq_cfg(irqd, apicd->vector, apicd->cpu); return 0; } @@ -433,7 +424,7 @@ static int activate_managed(struct irq_data *irqd) pr_err("Managed startup irq %u, no vector available\n", irqd->irq); } - return ret; + return ret; } static int x86_vector_activate(struct irq_domain *dom, struct irq_data *irqd, diff --git a/arch/x86/kernel/apic/x2apic_uv_x.c b/arch/x86/kernel/apic/x2apic_uv_x.c index efaf2d4f9c3c..d492752f79e1 100644 --- a/arch/x86/kernel/apic/x2apic_uv_x.c +++ b/arch/x86/kernel/apic/x2apic_uv_x.c @@ -26,6 +26,7 @@ #include <linux/delay.h> #include <linux/crash_dump.h> #include <linux/reboot.h> +#include <linux/memory.h> #include <asm/uv/uv_mmrs.h> #include <asm/uv/uv_hub.h> @@ -392,6 +393,51 @@ extern int uv_hub_info_version(void) } EXPORT_SYMBOL(uv_hub_info_version); +/* Default UV memory block size is 2GB */ +static unsigned long mem_block_size = (2UL << 30); + +/* Kernel parameter to specify UV mem block size */ +static int parse_mem_block_size(char *ptr) +{ + unsigned long size = memparse(ptr, NULL); + + /* Size will be rounded down by set_block_size() below */ + mem_block_size = size; + return 0; +} +early_param("uv_memblksize", parse_mem_block_size); + +static __init int adj_blksize(u32 lgre) +{ + unsigned long base = (unsigned long)lgre << UV_GAM_RANGE_SHFT; + unsigned long size; + + for (size = mem_block_size; size > MIN_MEMORY_BLOCK_SIZE; size >>= 1) + if (IS_ALIGNED(base, size)) + break; + + if (size >= mem_block_size) + return 0; + + mem_block_size = size; + return 1; +} + +static __init void set_block_size(void) +{ + unsigned int order = ffs(mem_block_size); + + if (order) { + /* adjust for ffs return of 1..64 */ + set_memory_block_size_order(order - 1); + pr_info("UV: mem_block_size set to 0x%lx\n", mem_block_size); + } else { + /* bad or zero value, default to 1UL << 31 (2GB) */ + pr_err("UV: mem_block_size error with 0x%lx\n", mem_block_size); + set_memory_block_size_order(31); + } +} + /* Build GAM range lookup table: */ static __init void build_uv_gr_table(void) { @@ -1180,23 +1226,30 @@ static void __init decode_gam_rng_tbl(unsigned long ptr) << UV_GAM_RANGE_SHFT); int order = 0; char suffix[] = " KMGTPE"; + int flag = ' '; while (size > 9999 && order < sizeof(suffix)) { size /= 1024; order++; } + /* adjust max block size to current range start */ + if (gre->type == 1 || gre->type == 2) + if (adj_blksize(lgre)) + flag = '*'; + if (!index) { pr_info("UV: GAM Range Table...\n"); - pr_info("UV: # %20s %14s %5s %4s %5s %3s %2s\n", "Range", "", "Size", "Type", "NASID", "SID", "PN"); + pr_info("UV: # %20s %14s %6s %4s %5s %3s %2s\n", "Range", "", "Size", "Type", "NASID", "SID", "PN"); } - pr_info("UV: %2d: 0x%014lx-0x%014lx %5lu%c %3d %04x %02x %02x\n", + pr_info("UV: %2d: 0x%014lx-0x%014lx%c %5lu%c %3d %04x %02x %02x\n", index++, (unsigned long)lgre << UV_GAM_RANGE_SHFT, (unsigned long)gre->limit << UV_GAM_RANGE_SHFT, - size, suffix[order], + flag, size, suffix[order], gre->type, gre->nasid, gre->sockid, gre->pnode); + /* update to next range start */ lgre = gre->limit; if (sock_min > gre->sockid) sock_min = gre->sockid; @@ -1427,6 +1480,7 @@ static void __init uv_system_init_hub(void) build_socket_tables(); build_uv_gr_table(); + set_block_size(); uv_init_hub_info(&hub_info); uv_possible_blades = num_possible_nodes(); if (!_node_to_pnode) diff --git a/arch/x86/kernel/apm_32.c b/arch/x86/kernel/apm_32.c index 5d0de79fdab0..ec00d1ff5098 100644 --- a/arch/x86/kernel/apm_32.c +++ b/arch/x86/kernel/apm_32.c @@ -240,6 +240,7 @@ #include <asm/olpc.h> #include <asm/paravirt.h> #include <asm/reboot.h> +#include <asm/nospec-branch.h> #if defined(CONFIG_APM_DISPLAY_BLANK) && defined(CONFIG_VT) extern int (*console_blank_hook)(int); @@ -614,11 +615,13 @@ static long __apm_bios_call(void *_call) gdt[0x40 / 8] = bad_bios_desc; apm_irq_save(flags); + firmware_restrict_branch_speculation_start(); APM_DO_SAVE_SEGS; apm_bios_call_asm(call->func, call->ebx, call->ecx, &call->eax, &call->ebx, &call->ecx, &call->edx, &call->esi); APM_DO_RESTORE_SEGS; + firmware_restrict_branch_speculation_end(); apm_irq_restore(flags); gdt[0x40 / 8] = save_desc_40; put_cpu(); @@ -690,10 +693,12 @@ static long __apm_bios_call_simple(void *_call) gdt[0x40 / 8] = bad_bios_desc; apm_irq_save(flags); + firmware_restrict_branch_speculation_start(); APM_DO_SAVE_SEGS; error = apm_bios_call_simple_asm(call->func, call->ebx, call->ecx, &call->eax); APM_DO_RESTORE_SEGS; + firmware_restrict_branch_speculation_end(); apm_irq_restore(flags); gdt[0x40 / 8] = save_desc_40; put_cpu(); diff --git a/arch/x86/kernel/cpu/amd.c b/arch/x86/kernel/cpu/amd.c index 082d7875cef8..38915fbfae73 100644 --- a/arch/x86/kernel/cpu/amd.c +++ b/arch/x86/kernel/cpu/amd.c @@ -543,7 +543,9 @@ static void bsp_init_amd(struct cpuinfo_x86 *c) nodes_per_socket = ((value >> 3) & 7) + 1; } - if (c->x86 >= 0x15 && c->x86 <= 0x17) { + if (!boot_cpu_has(X86_FEATURE_AMD_SSBD) && + !boot_cpu_has(X86_FEATURE_VIRT_SSBD) && + c->x86 >= 0x15 && c->x86 <= 0x17) { unsigned int bit; switch (c->x86) { diff --git a/arch/x86/kernel/cpu/bugs.c b/arch/x86/kernel/cpu/bugs.c index cd0fda1fff6d..5c0ea39311fe 100644 --- a/arch/x86/kernel/cpu/bugs.c +++ b/arch/x86/kernel/cpu/bugs.c @@ -27,6 +27,7 @@ #include <asm/pgtable.h> #include <asm/set_memory.h> #include <asm/intel-family.h> +#include <asm/hypervisor.h> static void __init spectre_v2_select_mitigation(void); static void __init ssb_select_mitigation(void); @@ -154,7 +155,8 @@ x86_virt_spec_ctrl(u64 guest_spec_ctrl, u64 guest_virt_spec_ctrl, bool setguest) guestval |= guest_spec_ctrl & x86_spec_ctrl_mask; /* SSBD controlled in MSR_SPEC_CTRL */ - if (static_cpu_has(X86_FEATURE_SPEC_CTRL_SSBD)) + if (static_cpu_has(X86_FEATURE_SPEC_CTRL_SSBD) || + static_cpu_has(X86_FEATURE_AMD_SSBD)) hostval |= ssbd_tif_to_spec_ctrl(ti->flags); if (hostval != guestval) { @@ -532,9 +534,10 @@ static enum ssb_mitigation __init __ssb_select_mitigation(void) * Intel uses the SPEC CTRL MSR Bit(2) for this, while AMD may * use a completely different MSR and bit dependent on family. */ - if (!static_cpu_has(X86_FEATURE_MSR_SPEC_CTRL)) + if (!static_cpu_has(X86_FEATURE_SPEC_CTRL_SSBD) && + !static_cpu_has(X86_FEATURE_AMD_SSBD)) { x86_amd_ssb_disable(); - else { + } else { x86_spec_ctrl_base |= SPEC_CTRL_SSBD; x86_spec_ctrl_mask |= SPEC_CTRL_SSBD; wrmsrl(MSR_IA32_SPEC_CTRL, x86_spec_ctrl_base); @@ -664,6 +667,9 @@ static ssize_t cpu_show_common(struct device *dev, struct device_attribute *attr if (boot_cpu_has(X86_FEATURE_PTI)) return sprintf(buf, "Mitigation: PTI\n"); + if (hypervisor_is_type(X86_HYPER_XEN_PV)) + return sprintf(buf, "Unknown (XEN PV detected, hypervisor mitigation required)\n"); + break; case X86_BUG_SPECTRE_V1: diff --git a/arch/x86/kernel/cpu/cacheinfo.c b/arch/x86/kernel/cpu/cacheinfo.c index 38354c66df81..0c5fcbd998cf 100644 --- a/arch/x86/kernel/cpu/cacheinfo.c +++ b/arch/x86/kernel/cpu/cacheinfo.c @@ -671,7 +671,7 @@ void cacheinfo_amd_init_llc_id(struct cpuinfo_x86 *c, int cpu, u8 node_id) num_sharing_cache = ((eax >> 14) & 0xfff) + 1; if (num_sharing_cache) { - int bits = get_count_order(num_sharing_cache) - 1; + int bits = get_count_order(num_sharing_cache); per_cpu(cpu_llc_id, cpu) = c->apicid >> bits; } diff --git a/arch/x86/kernel/cpu/common.c b/arch/x86/kernel/cpu/common.c index 0df7151cfef4..eb4cb3efd20e 100644 --- a/arch/x86/kernel/cpu/common.c +++ b/arch/x86/kernel/cpu/common.c @@ -1,3 +1,6 @@ +/* cpu_feature_enabled() cannot be used this early */ +#define USE_EARLY_PGTABLE_L5 + #include <linux/bootmem.h> #include <linux/linkage.h> #include <linux/bitops.h> diff --git a/arch/x86/kernel/cpu/mcheck/mce-severity.c b/arch/x86/kernel/cpu/mcheck/mce-severity.c index 5bbd06f38ff6..f34d89c01edc 100644 --- a/arch/x86/kernel/cpu/mcheck/mce-severity.c +++ b/arch/x86/kernel/cpu/mcheck/mce-severity.c @@ -160,6 +160,11 @@ static struct severity { SER, MASK(MCI_STATUS_OVER|MCI_UC_SAR|MCI_ADDR|MCACOD, MCI_UC_SAR|MCI_ADDR|MCACOD_INSTR), USER ), + MCESEV( + PANIC, "Data load in unrecoverable area of kernel", + SER, MASK(MCI_STATUS_OVER|MCI_UC_SAR|MCI_ADDR|MCACOD, MCI_UC_SAR|MCI_ADDR|MCACOD_DATA), + KERNEL + ), #endif MCESEV( PANIC, "Action required: unknown MCACOD", diff --git a/arch/x86/kernel/cpu/mcheck/mce.c b/arch/x86/kernel/cpu/mcheck/mce.c index e4cf6ff1c2e1..4b767284b7f5 100644 --- a/arch/x86/kernel/cpu/mcheck/mce.c +++ b/arch/x86/kernel/cpu/mcheck/mce.c @@ -123,8 +123,8 @@ void mce_setup(struct mce *m) { memset(m, 0, sizeof(struct mce)); m->cpu = m->extcpu = smp_processor_id(); - /* We hope get_seconds stays lockless */ - m->time = get_seconds(); + /* need the internal __ version to avoid deadlocks */ + m->time = __ktime_get_real_seconds(); m->cpuvendor = boot_cpu_data.x86_vendor; m->cpuid = cpuid_eax(1); m->socketid = cpu_data(m->extcpu).phys_proc_id; @@ -772,23 +772,25 @@ EXPORT_SYMBOL_GPL(machine_check_poll); static int mce_no_way_out(struct mce *m, char **msg, unsigned long *validp, struct pt_regs *regs) { - int i, ret = 0; char *tmp; + int i; for (i = 0; i < mca_cfg.banks; i++) { m->status = mce_rdmsrl(msr_ops.status(i)); - if (m->status & MCI_STATUS_VAL) { - __set_bit(i, validp); - if (quirk_no_way_out) - quirk_no_way_out(i, m, regs); - } + if (!(m->status & MCI_STATUS_VAL)) + continue; + + __set_bit(i, validp); + if (quirk_no_way_out) + quirk_no_way_out(i, m, regs); if (mce_severity(m, mca_cfg.tolerant, &tmp, true) >= MCE_PANIC_SEVERITY) { + mce_read_aux(m, i); *msg = tmp; - ret = 1; + return 1; } } - return ret; + return 0; } /* @@ -1102,6 +1104,101 @@ static void mce_unmap_kpfn(unsigned long pfn) } #endif + +/* + * Cases where we avoid rendezvous handler timeout: + * 1) If this CPU is offline. + * + * 2) If crashing_cpu was set, e.g. we're entering kdump and we need to + * skip those CPUs which remain looping in the 1st kernel - see + * crash_nmi_callback(). + * + * Note: there still is a small window between kexec-ing and the new, + * kdump kernel establishing a new #MC handler where a broadcasted MCE + * might not get handled properly. + */ +static bool __mc_check_crashing_cpu(int cpu) +{ + if (cpu_is_offline(cpu) || + (crashing_cpu != -1 && crashing_cpu != cpu)) { + u64 mcgstatus; + + mcgstatus = mce_rdmsrl(MSR_IA32_MCG_STATUS); + if (mcgstatus & MCG_STATUS_RIPV) { + mce_wrmsrl(MSR_IA32_MCG_STATUS, 0); + return true; + } + } + return false; +} + +static void __mc_scan_banks(struct mce *m, struct mce *final, + unsigned long *toclear, unsigned long *valid_banks, + int no_way_out, int *worst) +{ + struct mca_config *cfg = &mca_cfg; + int severity, i; + + for (i = 0; i < cfg->banks; i++) { + __clear_bit(i, toclear); + if (!test_bit(i, valid_banks)) + continue; + + if (!mce_banks[i].ctl) + continue; + + m->misc = 0; + m->addr = 0; + m->bank = i; + + m->status = mce_rdmsrl(msr_ops.status(i)); + if (!(m->status & MCI_STATUS_VAL)) + continue; + + /* + * Corrected or non-signaled errors are handled by + * machine_check_poll(). Leave them alone, unless this panics. + */ + if (!(m->status & (cfg->ser ? MCI_STATUS_S : MCI_STATUS_UC)) && + !no_way_out) + continue; + + /* Set taint even when machine check was not enabled. */ + add_taint(TAINT_MACHINE_CHECK, LOCKDEP_NOW_UNRELIABLE); + + severity = mce_severity(m, cfg->tolerant, NULL, true); + + /* + * When machine check was for corrected/deferred handler don't + * touch, unless we're panicking. + */ + if ((severity == MCE_KEEP_SEVERITY || + severity == MCE_UCNA_SEVERITY) && !no_way_out) + continue; + + __set_bit(i, toclear); + + /* Machine check event was not enabled. Clear, but ignore. */ + if (severity == MCE_NO_SEVERITY) + continue; + + mce_read_aux(m, i); + + /* assuming valid severity level != 0 */ + m->severity = severity; + + mce_log(m); + + if (severity > *worst) { + *final = *m; + *worst = severity; + } + } + + /* mce_clear_state will clear *final, save locally for use later */ + *m = *final; +} + /* * The actual machine check handler. This only handles real * exceptions when something got corrupted coming in through int 18. @@ -1116,68 +1213,45 @@ static void mce_unmap_kpfn(unsigned long pfn) */ void do_machine_check(struct pt_regs *regs, long error_code) { + DECLARE_BITMAP(valid_banks, MAX_NR_BANKS); + DECLARE_BITMAP(toclear, MAX_NR_BANKS); struct mca_config *cfg = &mca_cfg; + int cpu = smp_processor_id(); + char *msg = "Unknown"; struct mce m, *final; - int i; int worst = 0; - int severity; /* * Establish sequential order between the CPUs entering the machine * check handler. */ int order = -1; + /* * If no_way_out gets set, there is no safe way to recover from this * MCE. If mca_cfg.tolerant is cranked up, we'll try anyway. */ int no_way_out = 0; + /* * If kill_it gets set, there might be a way to recover from this * error. */ int kill_it = 0; - DECLARE_BITMAP(toclear, MAX_NR_BANKS); - DECLARE_BITMAP(valid_banks, MAX_NR_BANKS); - char *msg = "Unknown"; /* * MCEs are always local on AMD. Same is determined by MCG_STATUS_LMCES * on Intel. */ int lmce = 1; - int cpu = smp_processor_id(); - - /* - * Cases where we avoid rendezvous handler timeout: - * 1) If this CPU is offline. - * - * 2) If crashing_cpu was set, e.g. we're entering kdump and we need to - * skip those CPUs which remain looping in the 1st kernel - see - * crash_nmi_callback(). - * - * Note: there still is a small window between kexec-ing and the new, - * kdump kernel establishing a new #MC handler where a broadcasted MCE - * might not get handled properly. - */ - if (cpu_is_offline(cpu) || - (crashing_cpu != -1 && crashing_cpu != cpu)) { - u64 mcgstatus; - mcgstatus = mce_rdmsrl(MSR_IA32_MCG_STATUS); - if (mcgstatus & MCG_STATUS_RIPV) { - mce_wrmsrl(MSR_IA32_MCG_STATUS, 0); - return; - } - } + if (__mc_check_crashing_cpu(cpu)) + return; ist_enter(regs); this_cpu_inc(mce_exception_count); - if (!cfg->banks) - goto out; - mce_gather_info(&m, regs); m.tsc = rdtsc(); @@ -1205,75 +1279,20 @@ void do_machine_check(struct pt_regs *regs, long error_code) lmce = m.mcgstatus & MCG_STATUS_LMCES; /* + * Local machine check may already know that we have to panic. + * Broadcast machine check begins rendezvous in mce_start() * Go through all banks in exclusion of the other CPUs. This way we * don't report duplicated events on shared banks because the first one - * to see it will clear it. If this is a Local MCE, then no need to - * perform rendezvous. + * to see it will clear it. */ - if (!lmce) + if (lmce) { + if (no_way_out) + mce_panic("Fatal local machine check", &m, msg); + } else { order = mce_start(&no_way_out); - - for (i = 0; i < cfg->banks; i++) { - __clear_bit(i, toclear); - if (!test_bit(i, valid_banks)) - continue; - if (!mce_banks[i].ctl) - continue; - - m.misc = 0; - m.addr = 0; - m.bank = i; - - m.status = mce_rdmsrl(msr_ops.status(i)); - if ((m.status & MCI_STATUS_VAL) == 0) - continue; - - /* - * Non uncorrected or non signaled errors are handled by - * machine_check_poll. Leave them alone, unless this panics. - */ - if (!(m.status & (cfg->ser ? MCI_STATUS_S : MCI_STATUS_UC)) && - !no_way_out) - continue; - - /* - * Set taint even when machine check was not enabled. - */ - add_taint(TAINT_MACHINE_CHECK, LOCKDEP_NOW_UNRELIABLE); - - severity = mce_severity(&m, cfg->tolerant, NULL, true); - - /* - * When machine check was for corrected/deferred handler don't - * touch, unless we're panicing. - */ - if ((severity == MCE_KEEP_SEVERITY || - severity == MCE_UCNA_SEVERITY) && !no_way_out) - continue; - __set_bit(i, toclear); - if (severity == MCE_NO_SEVERITY) { - /* - * Machine check event was not enabled. Clear, but - * ignore. - */ - continue; - } - - mce_read_aux(&m, i); - - /* assuming valid severity level != 0 */ - m.severity = severity; - - mce_log(&m); - - if (severity > worst) { - *final = m; - worst = severity; - } } - /* mce_clear_state will clear *final, save locally for use later */ - m = *final; + __mc_scan_banks(&m, final, toclear, valid_banks, no_way_out, &worst); if (!no_way_out) mce_clear_state(toclear); @@ -1287,12 +1306,17 @@ void do_machine_check(struct pt_regs *regs, long error_code) no_way_out = worst >= MCE_PANIC_SEVERITY; } else { /* - * Local MCE skipped calling mce_reign() - * If we found a fatal error, we need to panic here. + * If there was a fatal machine check we should have + * already called mce_panic earlier in this function. + * Since we re-read the banks, we might have found + * something new. Check again to see if we found a + * fatal error. We call "mce_severity()" again to + * make sure we have the right "msg". */ - if (worst >= MCE_PANIC_SEVERITY && mca_cfg.tolerant < 3) - mce_panic("Machine check from unknown source", - NULL, NULL); + if (worst >= MCE_PANIC_SEVERITY && mca_cfg.tolerant < 3) { + mce_severity(&m, cfg->tolerant, &msg, true); + mce_panic("Local fatal machine check!", &m, msg); + } } /* @@ -1307,7 +1331,7 @@ void do_machine_check(struct pt_regs *regs, long error_code) if (worst > 0) mce_report_event(regs); mce_wrmsrl(MSR_IA32_MCG_STATUS, 0); -out: + sync_core(); if (worst != MCE_AR_SEVERITY && !kill_it) @@ -2153,9 +2177,6 @@ static ssize_t store_int_with_restart(struct device *s, if (check_interval == old_check_interval) return ret; - if (check_interval < 1) - check_interval = 1; - mutex_lock(&mce_sysfs_mutex); mce_restart(); mutex_unlock(&mce_sysfs_mutex); diff --git a/arch/x86/kernel/cpu/microcode/intel.c b/arch/x86/kernel/cpu/microcode/intel.c index 1c2cfa0644aa..97ccf4c3b45b 100644 --- a/arch/x86/kernel/cpu/microcode/intel.c +++ b/arch/x86/kernel/cpu/microcode/intel.c @@ -190,8 +190,11 @@ static void save_microcode_patch(void *data, unsigned int size) p = memdup_patch(data, size); if (!p) pr_err("Error allocating buffer %p\n", data); - else + else { list_replace(&iter->plist, &p->plist); + kfree(iter->data); + kfree(iter); + } } } diff --git a/arch/x86/kernel/cpu/mtrr/if.c b/arch/x86/kernel/cpu/mtrr/if.c index 4021d3859499..40eee6cc4124 100644 --- a/arch/x86/kernel/cpu/mtrr/if.c +++ b/arch/x86/kernel/cpu/mtrr/if.c @@ -106,7 +106,8 @@ mtrr_write(struct file *file, const char __user *buf, size_t len, loff_t * ppos) memset(line, 0, LINE_SIZE); - length = strncpy_from_user(line, buf, LINE_SIZE - 1); + len = min_t(size_t, len, LINE_SIZE - 1); + length = strncpy_from_user(line, buf, len); if (length < 0) return length; diff --git a/arch/x86/kernel/e820.c b/arch/x86/kernel/e820.c index d1f25c831447..c88c23c658c1 100644 --- a/arch/x86/kernel/e820.c +++ b/arch/x86/kernel/e820.c @@ -1248,6 +1248,7 @@ void __init e820__memblock_setup(void) { int i; u64 end; + u64 addr = 0; /* * The bootstrap memblock region count maximum is 128 entries @@ -1264,13 +1265,21 @@ void __init e820__memblock_setup(void) struct e820_entry *entry = &e820_table->entries[i]; end = entry->addr + entry->size; + if (addr < entry->addr) + memblock_reserve(addr, entry->addr - addr); + addr = end; if (end != (resource_size_t)end) continue; + /* + * all !E820_TYPE_RAM ranges (including gap ranges) are put + * into memblock.reserved to make sure that struct pages in + * such regions are not left uninitialized after bootup. + */ if (entry->type != E820_TYPE_RAM && entry->type != E820_TYPE_RESERVED_KERN) - continue; - - memblock_add(entry->addr, entry->size); + memblock_reserve(entry->addr, entry->size); + else + memblock_add(entry->addr, entry->size); } /* Throw away partial pages: */ diff --git a/arch/x86/kernel/head64.c b/arch/x86/kernel/head64.c index a21d6ace648e..8047379e575a 100644 --- a/arch/x86/kernel/head64.c +++ b/arch/x86/kernel/head64.c @@ -44,7 +44,7 @@ static unsigned int __initdata next_early_pgt; pmdval_t early_pmd_flags = __PAGE_KERNEL_LARGE & ~(_PAGE_GLOBAL | _PAGE_NX); #ifdef CONFIG_X86_5LEVEL -unsigned int __pgtable_l5_enabled __initdata; +unsigned int __pgtable_l5_enabled __ro_after_init; unsigned int pgdir_shift __ro_after_init = 39; EXPORT_SYMBOL(pgdir_shift); unsigned int ptrs_per_p4d __ro_after_init = 1; diff --git a/arch/x86/kernel/hw_breakpoint.c b/arch/x86/kernel/hw_breakpoint.c index 8771766d46b6..34a5c1715148 100644 --- a/arch/x86/kernel/hw_breakpoint.c +++ b/arch/x86/kernel/hw_breakpoint.c @@ -169,28 +169,29 @@ void arch_uninstall_hw_breakpoint(struct perf_event *bp) set_dr_addr_mask(0, i); } -/* - * Check for virtual address in kernel space. - */ -int arch_check_bp_in_kernelspace(struct perf_event *bp) +static int arch_bp_generic_len(int x86_len) { - unsigned int len; - unsigned long va; - struct arch_hw_breakpoint *info = counter_arch_bp(bp); - - va = info->address; - len = bp->attr.bp_len; - - /* - * We don't need to worry about va + len - 1 overflowing: - * we already require that va is aligned to a multiple of len. - */ - return (va >= TASK_SIZE_MAX) || ((va + len - 1) >= TASK_SIZE_MAX); + switch (x86_len) { + case X86_BREAKPOINT_LEN_1: + return HW_BREAKPOINT_LEN_1; + case X86_BREAKPOINT_LEN_2: + return HW_BREAKPOINT_LEN_2; + case X86_BREAKPOINT_LEN_4: + return HW_BREAKPOINT_LEN_4; +#ifdef CONFIG_X86_64 + case X86_BREAKPOINT_LEN_8: + return HW_BREAKPOINT_LEN_8; +#endif + default: + return -EINVAL; + } } int arch_bp_generic_fields(int x86_len, int x86_type, int *gen_len, int *gen_type) { + int len; + /* Type */ switch (x86_type) { case X86_BREAKPOINT_EXECUTE: @@ -211,42 +212,47 @@ int arch_bp_generic_fields(int x86_len, int x86_type, } /* Len */ - switch (x86_len) { - case X86_BREAKPOINT_LEN_1: - *gen_len = HW_BREAKPOINT_LEN_1; - break; - case X86_BREAKPOINT_LEN_2: - *gen_len = HW_BREAKPOINT_LEN_2; - break; - case X86_BREAKPOINT_LEN_4: - *gen_len = HW_BREAKPOINT_LEN_4; - break; -#ifdef CONFIG_X86_64 - case X86_BREAKPOINT_LEN_8: - *gen_len = HW_BREAKPOINT_LEN_8; - break; -#endif - default: + len = arch_bp_generic_len(x86_len); + if (len < 0) return -EINVAL; - } + *gen_len = len; return 0; } - -static int arch_build_bp_info(struct perf_event *bp) +/* + * Check for virtual address in kernel space. + */ +int arch_check_bp_in_kernelspace(struct arch_hw_breakpoint *hw) { - struct arch_hw_breakpoint *info = counter_arch_bp(bp); + unsigned long va; + int len; - info->address = bp->attr.bp_addr; + va = hw->address; + len = arch_bp_generic_len(hw->len); + WARN_ON_ONCE(len < 0); + + /* + * We don't need to worry about va + len - 1 overflowing: + * we already require that va is aligned to a multiple of len. + */ + return (va >= TASK_SIZE_MAX) || ((va + len - 1) >= TASK_SIZE_MAX); +} + +static int arch_build_bp_info(struct perf_event *bp, + const struct perf_event_attr *attr, + struct arch_hw_breakpoint *hw) +{ + hw->address = attr->bp_addr; + hw->mask = 0; /* Type */ - switch (bp->attr.bp_type) { + switch (attr->bp_type) { case HW_BREAKPOINT_W: - info->type = X86_BREAKPOINT_WRITE; + hw->type = X86_BREAKPOINT_WRITE; break; case HW_BREAKPOINT_W | HW_BREAKPOINT_R: - info->type = X86_BREAKPOINT_RW; + hw->type = X86_BREAKPOINT_RW; break; case HW_BREAKPOINT_X: /* @@ -254,23 +260,23 @@ static int arch_build_bp_info(struct perf_event *bp) * acceptable for kprobes. On non-kprobes kernels, we don't * allow kernel breakpoints at all. */ - if (bp->attr.bp_addr >= TASK_SIZE_MAX) { + if (attr->bp_addr >= TASK_SIZE_MAX) { #ifdef CONFIG_KPROBES - if (within_kprobe_blacklist(bp->attr.bp_addr)) + if (within_kprobe_blacklist(attr->bp_addr)) return -EINVAL; #else return -EINVAL; #endif } - info->type = X86_BREAKPOINT_EXECUTE; + hw->type = X86_BREAKPOINT_EXECUTE; /* * x86 inst breakpoints need to have a specific undefined len. * But we still need to check userspace is not trying to setup * an unsupported length, to get a range breakpoint for example. */ - if (bp->attr.bp_len == sizeof(long)) { - info->len = X86_BREAKPOINT_LEN_X; + if (attr->bp_len == sizeof(long)) { + hw->len = X86_BREAKPOINT_LEN_X; return 0; } default: @@ -278,28 +284,26 @@ static int arch_build_bp_info(struct perf_event *bp) } /* Len */ - info->mask = 0; - - switch (bp->attr.bp_len) { + switch (attr->bp_len) { case HW_BREAKPOINT_LEN_1: - info->len = X86_BREAKPOINT_LEN_1; + hw->len = X86_BREAKPOINT_LEN_1; break; case HW_BREAKPOINT_LEN_2: - info->len = X86_BREAKPOINT_LEN_2; + hw->len = X86_BREAKPOINT_LEN_2; break; case HW_BREAKPOINT_LEN_4: - info->len = X86_BREAKPOINT_LEN_4; + hw->len = X86_BREAKPOINT_LEN_4; break; #ifdef CONFIG_X86_64 case HW_BREAKPOINT_LEN_8: - info->len = X86_BREAKPOINT_LEN_8; + hw->len = X86_BREAKPOINT_LEN_8; break; #endif default: /* AMD range breakpoint */ - if (!is_power_of_2(bp->attr.bp_len)) + if (!is_power_of_2(attr->bp_len)) return -EINVAL; - if (bp->attr.bp_addr & (bp->attr.bp_len - 1)) + if (attr->bp_addr & (attr->bp_len - 1)) return -EINVAL; if (!boot_cpu_has(X86_FEATURE_BPEXT)) @@ -312,8 +316,8 @@ static int arch_build_bp_info(struct perf_event *bp) * breakpoints, then we'll have to check for kprobe-blacklisted * addresses anywhere in the range. */ - info->mask = bp->attr.bp_len - 1; - info->len = X86_BREAKPOINT_LEN_1; + hw->mask = attr->bp_len - 1; + hw->len = X86_BREAKPOINT_LEN_1; } return 0; @@ -322,22 +326,23 @@ static int arch_build_bp_info(struct perf_event *bp) /* * Validate the arch-specific HW Breakpoint register settings */ -int arch_validate_hwbkpt_settings(struct perf_event *bp) +int hw_breakpoint_arch_parse(struct perf_event *bp, + const struct perf_event_attr *attr, + struct arch_hw_breakpoint *hw) { - struct arch_hw_breakpoint *info = counter_arch_bp(bp); unsigned int align; int ret; - ret = arch_build_bp_info(bp); + ret = arch_build_bp_info(bp, attr, hw); if (ret) return ret; - switch (info->len) { + switch (hw->len) { case X86_BREAKPOINT_LEN_1: align = 0; - if (info->mask) - align = info->mask; + if (hw->mask) + align = hw->mask; break; case X86_BREAKPOINT_LEN_2: align = 1; @@ -358,7 +363,7 @@ int arch_validate_hwbkpt_settings(struct perf_event *bp) * Check that the low-order bits of the address are appropriate * for the alignment implied by len. */ - if (info->address & align) + if (hw->address & align) return -EINVAL; return 0; diff --git a/arch/x86/kernel/irqflags.S b/arch/x86/kernel/irqflags.S new file mode 100644 index 000000000000..ddeeaac8adda --- /dev/null +++ b/arch/x86/kernel/irqflags.S @@ -0,0 +1,26 @@ +/* SPDX-License-Identifier: GPL-2.0 */ + +#include <asm/asm.h> +#include <asm/export.h> +#include <linux/linkage.h> + +/* + * unsigned long native_save_fl(void) + */ +ENTRY(native_save_fl) + pushf + pop %_ASM_AX + ret +ENDPROC(native_save_fl) +EXPORT_SYMBOL(native_save_fl) + +/* + * void native_restore_fl(unsigned long flags) + * %eax/%rdi: flags + */ +ENTRY(native_restore_fl) + push %_ASM_ARG1 + popf + ret +ENDPROC(native_restore_fl) +EXPORT_SYMBOL(native_restore_fl) diff --git a/arch/x86/kernel/kprobes/common.h b/arch/x86/kernel/kprobes/common.h index ae38dccf0c8f..2b949f4fd4d8 100644 --- a/arch/x86/kernel/kprobes/common.h +++ b/arch/x86/kernel/kprobes/common.h @@ -105,14 +105,4 @@ static inline unsigned long __recover_optprobed_insn(kprobe_opcode_t *buf, unsig } #endif -#ifdef CONFIG_KPROBES_ON_FTRACE -extern int skip_singlestep(struct kprobe *p, struct pt_regs *regs, - struct kprobe_ctlblk *kcb); -#else -static inline int skip_singlestep(struct kprobe *p, struct pt_regs *regs, - struct kprobe_ctlblk *kcb) -{ - return 0; -} -#endif #endif diff --git a/arch/x86/kernel/kprobes/core.c b/arch/x86/kernel/kprobes/core.c index 6f4d42377fe5..b0d1e81c96bb 100644 --- a/arch/x86/kernel/kprobes/core.c +++ b/arch/x86/kernel/kprobes/core.c @@ -66,8 +66,6 @@ #include "common.h" -void jprobe_return_end(void); - DEFINE_PER_CPU(struct kprobe *, current_kprobe) = NULL; DEFINE_PER_CPU(struct kprobe_ctlblk, kprobe_ctlblk); @@ -395,8 +393,6 @@ int __copy_instruction(u8 *dest, u8 *src, u8 *real, struct insn *insn) - (u8 *) real; if ((s64) (s32) newdisp != newdisp) { pr_err("Kprobes error: new displacement does not fit into s32 (%llx)\n", newdisp); - pr_err("\tSrc: %p, Dest: %p, old disp: %x\n", - src, real, insn->displacement.value); return 0; } disp = (u8 *) dest + insn_offset_displacement(insn); @@ -596,7 +592,6 @@ static void setup_singlestep(struct kprobe *p, struct pt_regs *regs, * stepping. */ regs->ip = (unsigned long)p->ainsn.insn; - preempt_enable_no_resched(); return; } #endif @@ -640,8 +635,7 @@ static int reenter_kprobe(struct kprobe *p, struct pt_regs *regs, * Raise a BUG or we'll continue in an endless reentering loop * and eventually a stack overflow. */ - printk(KERN_WARNING "Unrecoverable kprobe detected at %p.\n", - p->addr); + pr_err("Unrecoverable kprobe detected.\n"); dump_kprobe(p); BUG(); default: @@ -669,12 +663,10 @@ int kprobe_int3_handler(struct pt_regs *regs) addr = (kprobe_opcode_t *)(regs->ip - sizeof(kprobe_opcode_t)); /* - * We don't want to be preempted for the entire - * duration of kprobe processing. We conditionally - * re-enable preemption at the end of this function, - * and also in reenter_kprobe() and setup_singlestep(). + * We don't want to be preempted for the entire duration of kprobe + * processing. Since int3 and debug trap disables irqs and we clear + * IF while singlestepping, it must be no preemptible. */ - preempt_disable(); kcb = get_kprobe_ctlblk(); p = get_kprobe(addr); @@ -690,13 +682,14 @@ int kprobe_int3_handler(struct pt_regs *regs) /* * If we have no pre-handler or it returned 0, we * continue with normal processing. If we have a - * pre-handler and it returned non-zero, it prepped - * for calling the break_handler below on re-entry - * for jprobe processing, so get out doing nothing - * more here. + * pre-handler and it returned non-zero, that means + * user handler setup registers to exit to another + * instruction, we must skip the single stepping. */ if (!p->pre_handler || !p->pre_handler(p, regs)) setup_singlestep(p, regs, kcb, 0); + else + reset_current_kprobe(); return 1; } } else if (*addr != BREAKPOINT_INSTRUCTION) { @@ -710,18 +703,9 @@ int kprobe_int3_handler(struct pt_regs *regs) * the original instruction. */ regs->ip = (unsigned long)addr; - preempt_enable_no_resched(); return 1; - } else if (kprobe_running()) { - p = __this_cpu_read(current_kprobe); - if (p->break_handler && p->break_handler(p, regs)) { - if (!skip_singlestep(p, regs, kcb)) - setup_singlestep(p, regs, kcb, 0); - return 1; - } } /* else: not a kprobe fault; let the kernel handle it */ - preempt_enable_no_resched(); return 0; } NOKPROBE_SYMBOL(kprobe_int3_handler); @@ -972,8 +956,6 @@ int kprobe_debug_handler(struct pt_regs *regs) } reset_current_kprobe(); out: - preempt_enable_no_resched(); - /* * if somebody else is singlestepping across a probe point, flags * will have TF set, in which case, continue the remaining processing @@ -1020,7 +1002,6 @@ int kprobe_fault_handler(struct pt_regs *regs, int trapnr) restore_previous_kprobe(kcb); else reset_current_kprobe(); - preempt_enable_no_resched(); } else if (kcb->kprobe_status == KPROBE_HIT_ACTIVE || kcb->kprobe_status == KPROBE_HIT_SSDONE) { /* @@ -1083,93 +1064,6 @@ int kprobe_exceptions_notify(struct notifier_block *self, unsigned long val, } NOKPROBE_SYMBOL(kprobe_exceptions_notify); -int setjmp_pre_handler(struct kprobe *p, struct pt_regs *regs) -{ - struct jprobe *jp = container_of(p, struct jprobe, kp); - unsigned long addr; - struct kprobe_ctlblk *kcb = get_kprobe_ctlblk(); - - kcb->jprobe_saved_regs = *regs; - kcb->jprobe_saved_sp = stack_addr(regs); - addr = (unsigned long)(kcb->jprobe_saved_sp); - - /* - * As Linus pointed out, gcc assumes that the callee - * owns the argument space and could overwrite it, e.g. - * tailcall optimization. So, to be absolutely safe - * we also save and restore enough stack bytes to cover - * the argument area. - * Use __memcpy() to avoid KASAN stack out-of-bounds reports as we copy - * raw stack chunk with redzones: - */ - __memcpy(kcb->jprobes_stack, (kprobe_opcode_t *)addr, MIN_STACK_SIZE(addr)); - regs->ip = (unsigned long)(jp->entry); - - /* - * jprobes use jprobe_return() which skips the normal return - * path of the function, and this messes up the accounting of the - * function graph tracer to get messed up. - * - * Pause function graph tracing while performing the jprobe function. - */ - pause_graph_tracing(); - return 1; -} -NOKPROBE_SYMBOL(setjmp_pre_handler); - -void jprobe_return(void) -{ - struct kprobe_ctlblk *kcb = get_kprobe_ctlblk(); - - /* Unpoison stack redzones in the frames we are going to jump over. */ - kasan_unpoison_stack_above_sp_to(kcb->jprobe_saved_sp); - - asm volatile ( -#ifdef CONFIG_X86_64 - " xchg %%rbx,%%rsp \n" -#else - " xchgl %%ebx,%%esp \n" -#endif - " int3 \n" - " .globl jprobe_return_end\n" - " jprobe_return_end: \n" - " nop \n"::"b" - (kcb->jprobe_saved_sp):"memory"); -} -NOKPROBE_SYMBOL(jprobe_return); -NOKPROBE_SYMBOL(jprobe_return_end); - -int longjmp_break_handler(struct kprobe *p, struct pt_regs *regs) -{ - struct kprobe_ctlblk *kcb = get_kprobe_ctlblk(); - u8 *addr = (u8 *) (regs->ip - 1); - struct jprobe *jp = container_of(p, struct jprobe, kp); - void *saved_sp = kcb->jprobe_saved_sp; - - if ((addr > (u8 *) jprobe_return) && - (addr < (u8 *) jprobe_return_end)) { - if (stack_addr(regs) != saved_sp) { - struct pt_regs *saved_regs = &kcb->jprobe_saved_regs; - printk(KERN_ERR - "current sp %p does not match saved sp %p\n", - stack_addr(regs), saved_sp); - printk(KERN_ERR "Saved registers for jprobe %p\n", jp); - show_regs(saved_regs); - printk(KERN_ERR "Current registers\n"); - show_regs(regs); - BUG(); - } - /* It's OK to start function graph tracing again */ - unpause_graph_tracing(); - *regs = kcb->jprobe_saved_regs; - __memcpy(saved_sp, kcb->jprobes_stack, MIN_STACK_SIZE(saved_sp)); - preempt_enable_no_resched(); - return 1; - } - return 0; -} -NOKPROBE_SYMBOL(longjmp_break_handler); - bool arch_within_kprobe_blacklist(unsigned long addr) { bool is_in_entry_trampoline_section = false; diff --git a/arch/x86/kernel/kprobes/ftrace.c b/arch/x86/kernel/kprobes/ftrace.c index 8dc0161cec8f..ef819e19650b 100644 --- a/arch/x86/kernel/kprobes/ftrace.c +++ b/arch/x86/kernel/kprobes/ftrace.c @@ -25,36 +25,6 @@ #include "common.h" -static nokprobe_inline -void __skip_singlestep(struct kprobe *p, struct pt_regs *regs, - struct kprobe_ctlblk *kcb, unsigned long orig_ip) -{ - /* - * Emulate singlestep (and also recover regs->ip) - * as if there is a 5byte nop - */ - regs->ip = (unsigned long)p->addr + MCOUNT_INSN_SIZE; - if (unlikely(p->post_handler)) { - kcb->kprobe_status = KPROBE_HIT_SSDONE; - p->post_handler(p, regs, 0); - } - __this_cpu_write(current_kprobe, NULL); - if (orig_ip) - regs->ip = orig_ip; -} - -int skip_singlestep(struct kprobe *p, struct pt_regs *regs, - struct kprobe_ctlblk *kcb) -{ - if (kprobe_ftrace(p)) { - __skip_singlestep(p, regs, kcb, 0); - preempt_enable_no_resched(); - return 1; - } - return 0; -} -NOKPROBE_SYMBOL(skip_singlestep); - /* Ftrace callback handler for kprobes -- called under preepmt disabed */ void kprobe_ftrace_handler(unsigned long ip, unsigned long parent_ip, struct ftrace_ops *ops, struct pt_regs *regs) @@ -75,18 +45,25 @@ void kprobe_ftrace_handler(unsigned long ip, unsigned long parent_ip, /* Kprobe handler expects regs->ip = ip + 1 as breakpoint hit */ regs->ip = ip + sizeof(kprobe_opcode_t); - /* To emulate trap based kprobes, preempt_disable here */ - preempt_disable(); __this_cpu_write(current_kprobe, p); kcb->kprobe_status = KPROBE_HIT_ACTIVE; if (!p->pre_handler || !p->pre_handler(p, regs)) { - __skip_singlestep(p, regs, kcb, orig_ip); - preempt_enable_no_resched(); + /* + * Emulate singlestep (and also recover regs->ip) + * as if there is a 5byte nop + */ + regs->ip = (unsigned long)p->addr + MCOUNT_INSN_SIZE; + if (unlikely(p->post_handler)) { + kcb->kprobe_status = KPROBE_HIT_SSDONE; + p->post_handler(p, regs, 0); + } + regs->ip = orig_ip; } /* - * If pre_handler returns !0, it sets regs->ip and - * resets current kprobe, and keep preempt count +1. + * If pre_handler returns !0, it changes regs->ip. We have to + * skip emulating post_handler. */ + __this_cpu_write(current_kprobe, NULL); } } NOKPROBE_SYMBOL(kprobe_ftrace_handler); diff --git a/arch/x86/kernel/kprobes/opt.c b/arch/x86/kernel/kprobes/opt.c index 203d398802a3..eaf02f2e7300 100644 --- a/arch/x86/kernel/kprobes/opt.c +++ b/arch/x86/kernel/kprobes/opt.c @@ -491,7 +491,6 @@ int setup_detour_execution(struct kprobe *p, struct pt_regs *regs, int reenter) regs->ip = (unsigned long)op->optinsn.insn + TMPL_END_IDX; if (!reenter) reset_current_kprobe(); - preempt_enable_no_resched(); return 1; } return 0; diff --git a/arch/x86/kernel/kvm.c b/arch/x86/kernel/kvm.c index 5b2300b818af..a37bda38d205 100644 --- a/arch/x86/kernel/kvm.c +++ b/arch/x86/kernel/kvm.c @@ -154,7 +154,7 @@ void kvm_async_pf_task_wait(u32 token, int interrupt_kernel) for (;;) { if (!n.halted) - prepare_to_swait(&n.wq, &wait, TASK_UNINTERRUPTIBLE); + prepare_to_swait_exclusive(&n.wq, &wait, TASK_UNINTERRUPTIBLE); if (hlist_unhashed(&n.link)) break; @@ -188,7 +188,7 @@ static void apf_task_wake_one(struct kvm_task_sleep_node *n) if (n->halted) smp_send_reschedule(n->cpu); else if (swq_has_sleeper(&n->wq)) - swake_up(&n->wq); + swake_up_one(&n->wq); } static void apf_task_wake_all(void) diff --git a/arch/x86/kernel/kvmclock.c b/arch/x86/kernel/kvmclock.c index bf8d1eb7fca3..3b8e7c13c614 100644 --- a/arch/x86/kernel/kvmclock.c +++ b/arch/x86/kernel/kvmclock.c @@ -138,6 +138,7 @@ static unsigned long kvm_get_tsc_khz(void) src = &hv_clock[cpu].pvti; tsc_khz = pvclock_tsc_khz(src); put_cpu(); + setup_force_cpu_cap(X86_FEATURE_TSC_KNOWN_FREQ); return tsc_khz; } @@ -319,6 +320,8 @@ void __init kvmclock_init(void) printk(KERN_INFO "kvm-clock: Using msrs %x and %x", msr_kvm_system_time, msr_kvm_wall_clock); + pvclock_set_pvti_cpu0_va(hv_clock); + if (kvm_para_has_feature(KVM_FEATURE_CLOCKSOURCE_STABLE_BIT)) pvclock_set_flags(PVCLOCK_TSC_STABLE_BIT); @@ -366,14 +369,11 @@ int __init kvm_setup_vsyscall_timeinfo(void) vcpu_time = &hv_clock[cpu].pvti; flags = pvclock_read_flags(vcpu_time); - if (!(flags & PVCLOCK_TSC_STABLE_BIT)) { - put_cpu(); - return 1; - } - - pvclock_set_pvti_cpu0_va(hv_clock); put_cpu(); + if (!(flags & PVCLOCK_TSC_STABLE_BIT)) + return 1; + kvm_clock.archdata.vclock_mode = VCLOCK_PVCLOCK; #endif return 0; diff --git a/arch/x86/kernel/quirks.c b/arch/x86/kernel/quirks.c index 697a4ce04308..736348ead421 100644 --- a/arch/x86/kernel/quirks.c +++ b/arch/x86/kernel/quirks.c @@ -645,12 +645,19 @@ static void quirk_intel_brickland_xeon_ras_cap(struct pci_dev *pdev) /* Skylake */ static void quirk_intel_purley_xeon_ras_cap(struct pci_dev *pdev) { - u32 capid0; + u32 capid0, capid5; pci_read_config_dword(pdev, 0x84, &capid0); + pci_read_config_dword(pdev, 0x98, &capid5); - if ((capid0 & 0xc0) == 0xc0) + /* + * CAPID0{7:6} indicate whether this is an advanced RAS SKU + * CAPID5{8:5} indicate that various NVDIMM usage modes are + * enabled, so memory machine check recovery is also enabled. + */ + if ((capid0 & 0xc0) == 0xc0 || (capid5 & 0x1e0)) static_branch_inc(&mcsafe_key); + } DECLARE_PCI_FIXUP_EARLY(PCI_VENDOR_ID_INTEL, 0x0ec3, quirk_intel_brickland_xeon_ras_cap); DECLARE_PCI_FIXUP_EARLY(PCI_VENDOR_ID_INTEL, 0x2fc0, quirk_intel_brickland_xeon_ras_cap); diff --git a/arch/x86/kernel/signal.c b/arch/x86/kernel/signal.c index 445ca11ff863..92a3b312a53c 100644 --- a/arch/x86/kernel/signal.c +++ b/arch/x86/kernel/signal.c @@ -692,7 +692,7 @@ setup_rt_frame(struct ksignal *ksig, struct pt_regs *regs) * Increment event counter and perform fixup for the pre-signal * frame. */ - rseq_signal_deliver(regs); + rseq_signal_deliver(ksig, regs); /* Set up the stack frame */ if (is_ia32_frame(ksig)) { diff --git a/arch/x86/kernel/smpboot.c b/arch/x86/kernel/smpboot.c index c2f7d1d2a5c3..db9656e13ea0 100644 --- a/arch/x86/kernel/smpboot.c +++ b/arch/x86/kernel/smpboot.c @@ -221,6 +221,11 @@ static void notrace start_secondary(void *unused) #ifdef CONFIG_X86_32 /* switch away from the initial page table */ load_cr3(swapper_pg_dir); + /* + * Initialize the CR4 shadow before doing anything that could + * try to read it. + */ + cr4_init_shadow(); __flush_tlb_all(); #endif load_current_idt(); diff --git a/arch/x86/kernel/traps.c b/arch/x86/kernel/traps.c index a535dd64de63..e6db475164ed 100644 --- a/arch/x86/kernel/traps.c +++ b/arch/x86/kernel/traps.c @@ -835,16 +835,18 @@ static void math_error(struct pt_regs *regs, int error_code, int trapnr) char *str = (trapnr == X86_TRAP_MF) ? "fpu exception" : "simd exception"; - if (notify_die(DIE_TRAP, str, regs, error_code, trapnr, SIGFPE) == NOTIFY_STOP) - return; cond_local_irq_enable(regs); if (!user_mode(regs)) { - if (!fixup_exception(regs, trapnr)) { - task->thread.error_code = error_code; - task->thread.trap_nr = trapnr; + if (fixup_exception(regs, trapnr)) + return; + + task->thread.error_code = error_code; + task->thread.trap_nr = trapnr; + + if (notify_die(DIE_TRAP, str, regs, error_code, + trapnr, SIGFPE) != NOTIFY_STOP) die(str, regs, error_code); - } return; } diff --git a/arch/x86/kernel/uprobes.c b/arch/x86/kernel/uprobes.c index 58d8d800875d..deb576b23b7c 100644 --- a/arch/x86/kernel/uprobes.c +++ b/arch/x86/kernel/uprobes.c @@ -293,7 +293,7 @@ static int uprobe_init_insn(struct arch_uprobe *auprobe, struct insn *insn, bool insn_init(insn, auprobe->insn, sizeof(auprobe->insn), x86_64); /* has the side-effect of processing the entire instruction */ insn_get_length(insn); - if (WARN_ON_ONCE(!insn_complete(insn))) + if (!insn_complete(insn)) return -ENOEXEC; if (is_prefix_bad(insn)) |