diff options
Diffstat (limited to 'arch/x86/kernel')
-rw-r--r-- | arch/x86/kernel/Makefile | 1 | ||||
-rw-r--r-- | arch/x86/kernel/apic/apic.c | 5 | ||||
-rw-r--r-- | arch/x86/kernel/apic/vector.c | 19 | ||||
-rw-r--r-- | arch/x86/kernel/apm_32.c | 5 | ||||
-rw-r--r-- | arch/x86/kernel/cpu/amd.c | 4 | ||||
-rw-r--r-- | arch/x86/kernel/cpu/bugs.c | 8 | ||||
-rw-r--r-- | arch/x86/kernel/cpu/mcheck/mce.c | 205 | ||||
-rw-r--r-- | arch/x86/kernel/cpu/mtrr/if.c | 3 | ||||
-rw-r--r-- | arch/x86/kernel/hw_breakpoint.c | 131 | ||||
-rw-r--r-- | arch/x86/kernel/irqflags.S | 26 | ||||
-rw-r--r-- | arch/x86/kernel/kprobes/common.h | 10 | ||||
-rw-r--r-- | arch/x86/kernel/kprobes/core.c | 124 | ||||
-rw-r--r-- | arch/x86/kernel/kprobes/ftrace.c | 49 | ||||
-rw-r--r-- | arch/x86/kernel/kprobes/opt.c | 1 | ||||
-rw-r--r-- | arch/x86/kernel/kvm.c | 4 | ||||
-rw-r--r-- | arch/x86/kernel/kvmclock.c | 12 | ||||
-rw-r--r-- | arch/x86/kernel/smpboot.c | 5 |
17 files changed, 261 insertions, 351 deletions
diff --git a/arch/x86/kernel/Makefile b/arch/x86/kernel/Makefile index 02d6f5cf4e70..8824d01c0c35 100644 --- a/arch/x86/kernel/Makefile +++ b/arch/x86/kernel/Makefile @@ -61,6 +61,7 @@ obj-y += alternative.o i8253.o hw_breakpoint.o obj-y += tsc.o tsc_msr.o io_delay.o rtc.o obj-y += pci-iommu_table.o obj-y += resource.o +obj-y += irqflags.o obj-y += process.o obj-y += fpu/ diff --git a/arch/x86/kernel/apic/apic.c b/arch/x86/kernel/apic/apic.c index 2aabd4cb0e3f..07fa222f0c52 100644 --- a/arch/x86/kernel/apic/apic.c +++ b/arch/x86/kernel/apic/apic.c @@ -573,6 +573,9 @@ static u32 skx_deadline_rev(void) case 0x04: return 0x02000014; } + if (boot_cpu_data.x86_stepping > 4) + return 0; + return ~0U; } @@ -937,7 +940,7 @@ static int __init calibrate_APIC_clock(void) if (levt->features & CLOCK_EVT_FEAT_DUMMY) { pr_warning("APIC timer disabled due to verification failure\n"); - return -1; + return -1; } return 0; diff --git a/arch/x86/kernel/apic/vector.c b/arch/x86/kernel/apic/vector.c index 35aaee4fc028..0954315842c0 100644 --- a/arch/x86/kernel/apic/vector.c +++ b/arch/x86/kernel/apic/vector.c @@ -218,7 +218,8 @@ static int reserve_irq_vector(struct irq_data *irqd) return 0; } -static int allocate_vector(struct irq_data *irqd, const struct cpumask *dest) +static int +assign_vector_locked(struct irq_data *irqd, const struct cpumask *dest) { struct apic_chip_data *apicd = apic_chip_data(irqd); bool resvd = apicd->has_reserved; @@ -245,22 +246,12 @@ static int allocate_vector(struct irq_data *irqd, const struct cpumask *dest) return -EBUSY; vector = irq_matrix_alloc(vector_matrix, dest, resvd, &cpu); - if (vector > 0) - apic_update_vector(irqd, vector, cpu); trace_vector_alloc(irqd->irq, vector, resvd, vector); - return vector; -} - -static int assign_vector_locked(struct irq_data *irqd, - const struct cpumask *dest) -{ - struct apic_chip_data *apicd = apic_chip_data(irqd); - int vector = allocate_vector(irqd, dest); - if (vector < 0) return vector; + apic_update_vector(irqd, vector, cpu); + apic_update_irq_cfg(irqd, vector, cpu); - apic_update_irq_cfg(irqd, apicd->vector, apicd->cpu); return 0; } @@ -433,7 +424,7 @@ static int activate_managed(struct irq_data *irqd) pr_err("Managed startup irq %u, no vector available\n", irqd->irq); } - return ret; + return ret; } static int x86_vector_activate(struct irq_domain *dom, struct irq_data *irqd, diff --git a/arch/x86/kernel/apm_32.c b/arch/x86/kernel/apm_32.c index 5d0de79fdab0..ec00d1ff5098 100644 --- a/arch/x86/kernel/apm_32.c +++ b/arch/x86/kernel/apm_32.c @@ -240,6 +240,7 @@ #include <asm/olpc.h> #include <asm/paravirt.h> #include <asm/reboot.h> +#include <asm/nospec-branch.h> #if defined(CONFIG_APM_DISPLAY_BLANK) && defined(CONFIG_VT) extern int (*console_blank_hook)(int); @@ -614,11 +615,13 @@ static long __apm_bios_call(void *_call) gdt[0x40 / 8] = bad_bios_desc; apm_irq_save(flags); + firmware_restrict_branch_speculation_start(); APM_DO_SAVE_SEGS; apm_bios_call_asm(call->func, call->ebx, call->ecx, &call->eax, &call->ebx, &call->ecx, &call->edx, &call->esi); APM_DO_RESTORE_SEGS; + firmware_restrict_branch_speculation_end(); apm_irq_restore(flags); gdt[0x40 / 8] = save_desc_40; put_cpu(); @@ -690,10 +693,12 @@ static long __apm_bios_call_simple(void *_call) gdt[0x40 / 8] = bad_bios_desc; apm_irq_save(flags); + firmware_restrict_branch_speculation_start(); APM_DO_SAVE_SEGS; error = apm_bios_call_simple_asm(call->func, call->ebx, call->ecx, &call->eax); APM_DO_RESTORE_SEGS; + firmware_restrict_branch_speculation_end(); apm_irq_restore(flags); gdt[0x40 / 8] = save_desc_40; put_cpu(); diff --git a/arch/x86/kernel/cpu/amd.c b/arch/x86/kernel/cpu/amd.c index 082d7875cef8..38915fbfae73 100644 --- a/arch/x86/kernel/cpu/amd.c +++ b/arch/x86/kernel/cpu/amd.c @@ -543,7 +543,9 @@ static void bsp_init_amd(struct cpuinfo_x86 *c) nodes_per_socket = ((value >> 3) & 7) + 1; } - if (c->x86 >= 0x15 && c->x86 <= 0x17) { + if (!boot_cpu_has(X86_FEATURE_AMD_SSBD) && + !boot_cpu_has(X86_FEATURE_VIRT_SSBD) && + c->x86 >= 0x15 && c->x86 <= 0x17) { unsigned int bit; switch (c->x86) { diff --git a/arch/x86/kernel/cpu/bugs.c b/arch/x86/kernel/cpu/bugs.c index 404df26b7de8..5c0ea39311fe 100644 --- a/arch/x86/kernel/cpu/bugs.c +++ b/arch/x86/kernel/cpu/bugs.c @@ -155,7 +155,8 @@ x86_virt_spec_ctrl(u64 guest_spec_ctrl, u64 guest_virt_spec_ctrl, bool setguest) guestval |= guest_spec_ctrl & x86_spec_ctrl_mask; /* SSBD controlled in MSR_SPEC_CTRL */ - if (static_cpu_has(X86_FEATURE_SPEC_CTRL_SSBD)) + if (static_cpu_has(X86_FEATURE_SPEC_CTRL_SSBD) || + static_cpu_has(X86_FEATURE_AMD_SSBD)) hostval |= ssbd_tif_to_spec_ctrl(ti->flags); if (hostval != guestval) { @@ -533,9 +534,10 @@ static enum ssb_mitigation __init __ssb_select_mitigation(void) * Intel uses the SPEC CTRL MSR Bit(2) for this, while AMD may * use a completely different MSR and bit dependent on family. */ - if (!static_cpu_has(X86_FEATURE_MSR_SPEC_CTRL)) + if (!static_cpu_has(X86_FEATURE_SPEC_CTRL_SSBD) && + !static_cpu_has(X86_FEATURE_AMD_SSBD)) { x86_amd_ssb_disable(); - else { + } else { x86_spec_ctrl_base |= SPEC_CTRL_SSBD; x86_spec_ctrl_mask |= SPEC_CTRL_SSBD; wrmsrl(MSR_IA32_SPEC_CTRL, x86_spec_ctrl_base); diff --git a/arch/x86/kernel/cpu/mcheck/mce.c b/arch/x86/kernel/cpu/mcheck/mce.c index c102ad51025e..4b767284b7f5 100644 --- a/arch/x86/kernel/cpu/mcheck/mce.c +++ b/arch/x86/kernel/cpu/mcheck/mce.c @@ -123,8 +123,8 @@ void mce_setup(struct mce *m) { memset(m, 0, sizeof(struct mce)); m->cpu = m->extcpu = smp_processor_id(); - /* We hope get_seconds stays lockless */ - m->time = get_seconds(); + /* need the internal __ version to avoid deadlocks */ + m->time = __ktime_get_real_seconds(); m->cpuvendor = boot_cpu_data.x86_vendor; m->cpuid = cpuid_eax(1); m->socketid = cpu_data(m->extcpu).phys_proc_id; @@ -1104,6 +1104,101 @@ static void mce_unmap_kpfn(unsigned long pfn) } #endif + +/* + * Cases where we avoid rendezvous handler timeout: + * 1) If this CPU is offline. + * + * 2) If crashing_cpu was set, e.g. we're entering kdump and we need to + * skip those CPUs which remain looping in the 1st kernel - see + * crash_nmi_callback(). + * + * Note: there still is a small window between kexec-ing and the new, + * kdump kernel establishing a new #MC handler where a broadcasted MCE + * might not get handled properly. + */ +static bool __mc_check_crashing_cpu(int cpu) +{ + if (cpu_is_offline(cpu) || + (crashing_cpu != -1 && crashing_cpu != cpu)) { + u64 mcgstatus; + + mcgstatus = mce_rdmsrl(MSR_IA32_MCG_STATUS); + if (mcgstatus & MCG_STATUS_RIPV) { + mce_wrmsrl(MSR_IA32_MCG_STATUS, 0); + return true; + } + } + return false; +} + +static void __mc_scan_banks(struct mce *m, struct mce *final, + unsigned long *toclear, unsigned long *valid_banks, + int no_way_out, int *worst) +{ + struct mca_config *cfg = &mca_cfg; + int severity, i; + + for (i = 0; i < cfg->banks; i++) { + __clear_bit(i, toclear); + if (!test_bit(i, valid_banks)) + continue; + + if (!mce_banks[i].ctl) + continue; + + m->misc = 0; + m->addr = 0; + m->bank = i; + + m->status = mce_rdmsrl(msr_ops.status(i)); + if (!(m->status & MCI_STATUS_VAL)) + continue; + + /* + * Corrected or non-signaled errors are handled by + * machine_check_poll(). Leave them alone, unless this panics. + */ + if (!(m->status & (cfg->ser ? MCI_STATUS_S : MCI_STATUS_UC)) && + !no_way_out) + continue; + + /* Set taint even when machine check was not enabled. */ + add_taint(TAINT_MACHINE_CHECK, LOCKDEP_NOW_UNRELIABLE); + + severity = mce_severity(m, cfg->tolerant, NULL, true); + + /* + * When machine check was for corrected/deferred handler don't + * touch, unless we're panicking. + */ + if ((severity == MCE_KEEP_SEVERITY || + severity == MCE_UCNA_SEVERITY) && !no_way_out) + continue; + + __set_bit(i, toclear); + + /* Machine check event was not enabled. Clear, but ignore. */ + if (severity == MCE_NO_SEVERITY) + continue; + + mce_read_aux(m, i); + + /* assuming valid severity level != 0 */ + m->severity = severity; + + mce_log(m); + + if (severity > *worst) { + *final = *m; + *worst = severity; + } + } + + /* mce_clear_state will clear *final, save locally for use later */ + *m = *final; +} + /* * The actual machine check handler. This only handles real * exceptions when something got corrupted coming in through int 18. @@ -1118,68 +1213,45 @@ static void mce_unmap_kpfn(unsigned long pfn) */ void do_machine_check(struct pt_regs *regs, long error_code) { + DECLARE_BITMAP(valid_banks, MAX_NR_BANKS); + DECLARE_BITMAP(toclear, MAX_NR_BANKS); struct mca_config *cfg = &mca_cfg; + int cpu = smp_processor_id(); + char *msg = "Unknown"; struct mce m, *final; - int i; int worst = 0; - int severity; /* * Establish sequential order between the CPUs entering the machine * check handler. */ int order = -1; + /* * If no_way_out gets set, there is no safe way to recover from this * MCE. If mca_cfg.tolerant is cranked up, we'll try anyway. */ int no_way_out = 0; + /* * If kill_it gets set, there might be a way to recover from this * error. */ int kill_it = 0; - DECLARE_BITMAP(toclear, MAX_NR_BANKS); - DECLARE_BITMAP(valid_banks, MAX_NR_BANKS); - char *msg = "Unknown"; /* * MCEs are always local on AMD. Same is determined by MCG_STATUS_LMCES * on Intel. */ int lmce = 1; - int cpu = smp_processor_id(); - - /* - * Cases where we avoid rendezvous handler timeout: - * 1) If this CPU is offline. - * - * 2) If crashing_cpu was set, e.g. we're entering kdump and we need to - * skip those CPUs which remain looping in the 1st kernel - see - * crash_nmi_callback(). - * - * Note: there still is a small window between kexec-ing and the new, - * kdump kernel establishing a new #MC handler where a broadcasted MCE - * might not get handled properly. - */ - if (cpu_is_offline(cpu) || - (crashing_cpu != -1 && crashing_cpu != cpu)) { - u64 mcgstatus; - mcgstatus = mce_rdmsrl(MSR_IA32_MCG_STATUS); - if (mcgstatus & MCG_STATUS_RIPV) { - mce_wrmsrl(MSR_IA32_MCG_STATUS, 0); - return; - } - } + if (__mc_check_crashing_cpu(cpu)) + return; ist_enter(regs); this_cpu_inc(mce_exception_count); - if (!cfg->banks) - goto out; - mce_gather_info(&m, regs); m.tsc = rdtsc(); @@ -1220,67 +1292,7 @@ void do_machine_check(struct pt_regs *regs, long error_code) order = mce_start(&no_way_out); } - for (i = 0; i < cfg->banks; i++) { - __clear_bit(i, toclear); - if (!test_bit(i, valid_banks)) - continue; - if (!mce_banks[i].ctl) - continue; - - m.misc = 0; - m.addr = 0; - m.bank = i; - - m.status = mce_rdmsrl(msr_ops.status(i)); - if ((m.status & MCI_STATUS_VAL) == 0) - continue; - - /* - * Non uncorrected or non signaled errors are handled by - * machine_check_poll. Leave them alone, unless this panics. - */ - if (!(m.status & (cfg->ser ? MCI_STATUS_S : MCI_STATUS_UC)) && - !no_way_out) - continue; - - /* - * Set taint even when machine check was not enabled. - */ - add_taint(TAINT_MACHINE_CHECK, LOCKDEP_NOW_UNRELIABLE); - - severity = mce_severity(&m, cfg->tolerant, NULL, true); - - /* - * When machine check was for corrected/deferred handler don't - * touch, unless we're panicing. - */ - if ((severity == MCE_KEEP_SEVERITY || - severity == MCE_UCNA_SEVERITY) && !no_way_out) - continue; - __set_bit(i, toclear); - if (severity == MCE_NO_SEVERITY) { - /* - * Machine check event was not enabled. Clear, but - * ignore. - */ - continue; - } - - mce_read_aux(&m, i); - - /* assuming valid severity level != 0 */ - m.severity = severity; - - mce_log(&m); - - if (severity > worst) { - *final = m; - worst = severity; - } - } - - /* mce_clear_state will clear *final, save locally for use later */ - m = *final; + __mc_scan_banks(&m, final, toclear, valid_banks, no_way_out, &worst); if (!no_way_out) mce_clear_state(toclear); @@ -1319,7 +1331,7 @@ void do_machine_check(struct pt_regs *regs, long error_code) if (worst > 0) mce_report_event(regs); mce_wrmsrl(MSR_IA32_MCG_STATUS, 0); -out: + sync_core(); if (worst != MCE_AR_SEVERITY && !kill_it) @@ -2165,9 +2177,6 @@ static ssize_t store_int_with_restart(struct device *s, if (check_interval == old_check_interval) return ret; - if (check_interval < 1) - check_interval = 1; - mutex_lock(&mce_sysfs_mutex); mce_restart(); mutex_unlock(&mce_sysfs_mutex); diff --git a/arch/x86/kernel/cpu/mtrr/if.c b/arch/x86/kernel/cpu/mtrr/if.c index 4021d3859499..40eee6cc4124 100644 --- a/arch/x86/kernel/cpu/mtrr/if.c +++ b/arch/x86/kernel/cpu/mtrr/if.c @@ -106,7 +106,8 @@ mtrr_write(struct file *file, const char __user *buf, size_t len, loff_t * ppos) memset(line, 0, LINE_SIZE); - length = strncpy_from_user(line, buf, LINE_SIZE - 1); + len = min_t(size_t, len, LINE_SIZE - 1); + length = strncpy_from_user(line, buf, len); if (length < 0) return length; diff --git a/arch/x86/kernel/hw_breakpoint.c b/arch/x86/kernel/hw_breakpoint.c index 8771766d46b6..34a5c1715148 100644 --- a/arch/x86/kernel/hw_breakpoint.c +++ b/arch/x86/kernel/hw_breakpoint.c @@ -169,28 +169,29 @@ void arch_uninstall_hw_breakpoint(struct perf_event *bp) set_dr_addr_mask(0, i); } -/* - * Check for virtual address in kernel space. - */ -int arch_check_bp_in_kernelspace(struct perf_event *bp) +static int arch_bp_generic_len(int x86_len) { - unsigned int len; - unsigned long va; - struct arch_hw_breakpoint *info = counter_arch_bp(bp); - - va = info->address; - len = bp->attr.bp_len; - - /* - * We don't need to worry about va + len - 1 overflowing: - * we already require that va is aligned to a multiple of len. - */ - return (va >= TASK_SIZE_MAX) || ((va + len - 1) >= TASK_SIZE_MAX); + switch (x86_len) { + case X86_BREAKPOINT_LEN_1: + return HW_BREAKPOINT_LEN_1; + case X86_BREAKPOINT_LEN_2: + return HW_BREAKPOINT_LEN_2; + case X86_BREAKPOINT_LEN_4: + return HW_BREAKPOINT_LEN_4; +#ifdef CONFIG_X86_64 + case X86_BREAKPOINT_LEN_8: + return HW_BREAKPOINT_LEN_8; +#endif + default: + return -EINVAL; + } } int arch_bp_generic_fields(int x86_len, int x86_type, int *gen_len, int *gen_type) { + int len; + /* Type */ switch (x86_type) { case X86_BREAKPOINT_EXECUTE: @@ -211,42 +212,47 @@ int arch_bp_generic_fields(int x86_len, int x86_type, } /* Len */ - switch (x86_len) { - case X86_BREAKPOINT_LEN_1: - *gen_len = HW_BREAKPOINT_LEN_1; - break; - case X86_BREAKPOINT_LEN_2: - *gen_len = HW_BREAKPOINT_LEN_2; - break; - case X86_BREAKPOINT_LEN_4: - *gen_len = HW_BREAKPOINT_LEN_4; - break; -#ifdef CONFIG_X86_64 - case X86_BREAKPOINT_LEN_8: - *gen_len = HW_BREAKPOINT_LEN_8; - break; -#endif - default: + len = arch_bp_generic_len(x86_len); + if (len < 0) return -EINVAL; - } + *gen_len = len; return 0; } - -static int arch_build_bp_info(struct perf_event *bp) +/* + * Check for virtual address in kernel space. + */ +int arch_check_bp_in_kernelspace(struct arch_hw_breakpoint *hw) { - struct arch_hw_breakpoint *info = counter_arch_bp(bp); + unsigned long va; + int len; - info->address = bp->attr.bp_addr; + va = hw->address; + len = arch_bp_generic_len(hw->len); + WARN_ON_ONCE(len < 0); + + /* + * We don't need to worry about va + len - 1 overflowing: + * we already require that va is aligned to a multiple of len. + */ + return (va >= TASK_SIZE_MAX) || ((va + len - 1) >= TASK_SIZE_MAX); +} + +static int arch_build_bp_info(struct perf_event *bp, + const struct perf_event_attr *attr, + struct arch_hw_breakpoint *hw) +{ + hw->address = attr->bp_addr; + hw->mask = 0; /* Type */ - switch (bp->attr.bp_type) { + switch (attr->bp_type) { case HW_BREAKPOINT_W: - info->type = X86_BREAKPOINT_WRITE; + hw->type = X86_BREAKPOINT_WRITE; break; case HW_BREAKPOINT_W | HW_BREAKPOINT_R: - info->type = X86_BREAKPOINT_RW; + hw->type = X86_BREAKPOINT_RW; break; case HW_BREAKPOINT_X: /* @@ -254,23 +260,23 @@ static int arch_build_bp_info(struct perf_event *bp) * acceptable for kprobes. On non-kprobes kernels, we don't * allow kernel breakpoints at all. */ - if (bp->attr.bp_addr >= TASK_SIZE_MAX) { + if (attr->bp_addr >= TASK_SIZE_MAX) { #ifdef CONFIG_KPROBES - if (within_kprobe_blacklist(bp->attr.bp_addr)) + if (within_kprobe_blacklist(attr->bp_addr)) return -EINVAL; #else return -EINVAL; #endif } - info->type = X86_BREAKPOINT_EXECUTE; + hw->type = X86_BREAKPOINT_EXECUTE; /* * x86 inst breakpoints need to have a specific undefined len. * But we still need to check userspace is not trying to setup * an unsupported length, to get a range breakpoint for example. */ - if (bp->attr.bp_len == sizeof(long)) { - info->len = X86_BREAKPOINT_LEN_X; + if (attr->bp_len == sizeof(long)) { + hw->len = X86_BREAKPOINT_LEN_X; return 0; } default: @@ -278,28 +284,26 @@ static int arch_build_bp_info(struct perf_event *bp) } /* Len */ - info->mask = 0; - - switch (bp->attr.bp_len) { + switch (attr->bp_len) { case HW_BREAKPOINT_LEN_1: - info->len = X86_BREAKPOINT_LEN_1; + hw->len = X86_BREAKPOINT_LEN_1; break; case HW_BREAKPOINT_LEN_2: - info->len = X86_BREAKPOINT_LEN_2; + hw->len = X86_BREAKPOINT_LEN_2; break; case HW_BREAKPOINT_LEN_4: - info->len = X86_BREAKPOINT_LEN_4; + hw->len = X86_BREAKPOINT_LEN_4; break; #ifdef CONFIG_X86_64 case HW_BREAKPOINT_LEN_8: - info->len = X86_BREAKPOINT_LEN_8; + hw->len = X86_BREAKPOINT_LEN_8; break; #endif default: /* AMD range breakpoint */ - if (!is_power_of_2(bp->attr.bp_len)) + if (!is_power_of_2(attr->bp_len)) return -EINVAL; - if (bp->attr.bp_addr & (bp->attr.bp_len - 1)) + if (attr->bp_addr & (attr->bp_len - 1)) return -EINVAL; if (!boot_cpu_has(X86_FEATURE_BPEXT)) @@ -312,8 +316,8 @@ static int arch_build_bp_info(struct perf_event *bp) * breakpoints, then we'll have to check for kprobe-blacklisted * addresses anywhere in the range. */ - info->mask = bp->attr.bp_len - 1; - info->len = X86_BREAKPOINT_LEN_1; + hw->mask = attr->bp_len - 1; + hw->len = X86_BREAKPOINT_LEN_1; } return 0; @@ -322,22 +326,23 @@ static int arch_build_bp_info(struct perf_event *bp) /* * Validate the arch-specific HW Breakpoint register settings */ -int arch_validate_hwbkpt_settings(struct perf_event *bp) +int hw_breakpoint_arch_parse(struct perf_event *bp, + const struct perf_event_attr *attr, + struct arch_hw_breakpoint *hw) { - struct arch_hw_breakpoint *info = counter_arch_bp(bp); unsigned int align; int ret; - ret = arch_build_bp_info(bp); + ret = arch_build_bp_info(bp, attr, hw); if (ret) return ret; - switch (info->len) { + switch (hw->len) { case X86_BREAKPOINT_LEN_1: align = 0; - if (info->mask) - align = info->mask; + if (hw->mask) + align = hw->mask; break; case X86_BREAKPOINT_LEN_2: align = 1; @@ -358,7 +363,7 @@ int arch_validate_hwbkpt_settings(struct perf_event *bp) * Check that the low-order bits of the address are appropriate * for the alignment implied by len. */ - if (info->address & align) + if (hw->address & align) return -EINVAL; return 0; diff --git a/arch/x86/kernel/irqflags.S b/arch/x86/kernel/irqflags.S new file mode 100644 index 000000000000..ddeeaac8adda --- /dev/null +++ b/arch/x86/kernel/irqflags.S @@ -0,0 +1,26 @@ +/* SPDX-License-Identifier: GPL-2.0 */ + +#include <asm/asm.h> +#include <asm/export.h> +#include <linux/linkage.h> + +/* + * unsigned long native_save_fl(void) + */ +ENTRY(native_save_fl) + pushf + pop %_ASM_AX + ret +ENDPROC(native_save_fl) +EXPORT_SYMBOL(native_save_fl) + +/* + * void native_restore_fl(unsigned long flags) + * %eax/%rdi: flags + */ +ENTRY(native_restore_fl) + push %_ASM_ARG1 + popf + ret +ENDPROC(native_restore_fl) +EXPORT_SYMBOL(native_restore_fl) diff --git a/arch/x86/kernel/kprobes/common.h b/arch/x86/kernel/kprobes/common.h index ae38dccf0c8f..2b949f4fd4d8 100644 --- a/arch/x86/kernel/kprobes/common.h +++ b/arch/x86/kernel/kprobes/common.h @@ -105,14 +105,4 @@ static inline unsigned long __recover_optprobed_insn(kprobe_opcode_t *buf, unsig } #endif -#ifdef CONFIG_KPROBES_ON_FTRACE -extern int skip_singlestep(struct kprobe *p, struct pt_regs *regs, - struct kprobe_ctlblk *kcb); -#else -static inline int skip_singlestep(struct kprobe *p, struct pt_regs *regs, - struct kprobe_ctlblk *kcb) -{ - return 0; -} -#endif #endif diff --git a/arch/x86/kernel/kprobes/core.c b/arch/x86/kernel/kprobes/core.c index 6f4d42377fe5..b0d1e81c96bb 100644 --- a/arch/x86/kernel/kprobes/core.c +++ b/arch/x86/kernel/kprobes/core.c @@ -66,8 +66,6 @@ #include "common.h" -void jprobe_return_end(void); - DEFINE_PER_CPU(struct kprobe *, current_kprobe) = NULL; DEFINE_PER_CPU(struct kprobe_ctlblk, kprobe_ctlblk); @@ -395,8 +393,6 @@ int __copy_instruction(u8 *dest, u8 *src, u8 *real, struct insn *insn) - (u8 *) real; if ((s64) (s32) newdisp != newdisp) { pr_err("Kprobes error: new displacement does not fit into s32 (%llx)\n", newdisp); - pr_err("\tSrc: %p, Dest: %p, old disp: %x\n", - src, real, insn->displacement.value); return 0; } disp = (u8 *) dest + insn_offset_displacement(insn); @@ -596,7 +592,6 @@ static void setup_singlestep(struct kprobe *p, struct pt_regs *regs, * stepping. */ regs->ip = (unsigned long)p->ainsn.insn; - preempt_enable_no_resched(); return; } #endif @@ -640,8 +635,7 @@ static int reenter_kprobe(struct kprobe *p, struct pt_regs *regs, * Raise a BUG or we'll continue in an endless reentering loop * and eventually a stack overflow. */ - printk(KERN_WARNING "Unrecoverable kprobe detected at %p.\n", - p->addr); + pr_err("Unrecoverable kprobe detected.\n"); dump_kprobe(p); BUG(); default: @@ -669,12 +663,10 @@ int kprobe_int3_handler(struct pt_regs *regs) addr = (kprobe_opcode_t *)(regs->ip - sizeof(kprobe_opcode_t)); /* - * We don't want to be preempted for the entire - * duration of kprobe processing. We conditionally - * re-enable preemption at the end of this function, - * and also in reenter_kprobe() and setup_singlestep(). + * We don't want to be preempted for the entire duration of kprobe + * processing. Since int3 and debug trap disables irqs and we clear + * IF while singlestepping, it must be no preemptible. */ - preempt_disable(); kcb = get_kprobe_ctlblk(); p = get_kprobe(addr); @@ -690,13 +682,14 @@ int kprobe_int3_handler(struct pt_regs *regs) /* * If we have no pre-handler or it returned 0, we * continue with normal processing. If we have a - * pre-handler and it returned non-zero, it prepped - * for calling the break_handler below on re-entry - * for jprobe processing, so get out doing nothing - * more here. + * pre-handler and it returned non-zero, that means + * user handler setup registers to exit to another + * instruction, we must skip the single stepping. */ if (!p->pre_handler || !p->pre_handler(p, regs)) setup_singlestep(p, regs, kcb, 0); + else + reset_current_kprobe(); return 1; } } else if (*addr != BREAKPOINT_INSTRUCTION) { @@ -710,18 +703,9 @@ int kprobe_int3_handler(struct pt_regs *regs) * the original instruction. */ regs->ip = (unsigned long)addr; - preempt_enable_no_resched(); return 1; - } else if (kprobe_running()) { - p = __this_cpu_read(current_kprobe); - if (p->break_handler && p->break_handler(p, regs)) { - if (!skip_singlestep(p, regs, kcb)) - setup_singlestep(p, regs, kcb, 0); - return 1; - } } /* else: not a kprobe fault; let the kernel handle it */ - preempt_enable_no_resched(); return 0; } NOKPROBE_SYMBOL(kprobe_int3_handler); @@ -972,8 +956,6 @@ int kprobe_debug_handler(struct pt_regs *regs) } reset_current_kprobe(); out: - preempt_enable_no_resched(); - /* * if somebody else is singlestepping across a probe point, flags * will have TF set, in which case, continue the remaining processing @@ -1020,7 +1002,6 @@ int kprobe_fault_handler(struct pt_regs *regs, int trapnr) restore_previous_kprobe(kcb); else reset_current_kprobe(); - preempt_enable_no_resched(); } else if (kcb->kprobe_status == KPROBE_HIT_ACTIVE || kcb->kprobe_status == KPROBE_HIT_SSDONE) { /* @@ -1083,93 +1064,6 @@ int kprobe_exceptions_notify(struct notifier_block *self, unsigned long val, } NOKPROBE_SYMBOL(kprobe_exceptions_notify); -int setjmp_pre_handler(struct kprobe *p, struct pt_regs *regs) -{ - struct jprobe *jp = container_of(p, struct jprobe, kp); - unsigned long addr; - struct kprobe_ctlblk *kcb = get_kprobe_ctlblk(); - - kcb->jprobe_saved_regs = *regs; - kcb->jprobe_saved_sp = stack_addr(regs); - addr = (unsigned long)(kcb->jprobe_saved_sp); - - /* - * As Linus pointed out, gcc assumes that the callee - * owns the argument space and could overwrite it, e.g. - * tailcall optimization. So, to be absolutely safe - * we also save and restore enough stack bytes to cover - * the argument area. - * Use __memcpy() to avoid KASAN stack out-of-bounds reports as we copy - * raw stack chunk with redzones: - */ - __memcpy(kcb->jprobes_stack, (kprobe_opcode_t *)addr, MIN_STACK_SIZE(addr)); - regs->ip = (unsigned long)(jp->entry); - - /* - * jprobes use jprobe_return() which skips the normal return - * path of the function, and this messes up the accounting of the - * function graph tracer to get messed up. - * - * Pause function graph tracing while performing the jprobe function. - */ - pause_graph_tracing(); - return 1; -} -NOKPROBE_SYMBOL(setjmp_pre_handler); - -void jprobe_return(void) -{ - struct kprobe_ctlblk *kcb = get_kprobe_ctlblk(); - - /* Unpoison stack redzones in the frames we are going to jump over. */ - kasan_unpoison_stack_above_sp_to(kcb->jprobe_saved_sp); - - asm volatile ( -#ifdef CONFIG_X86_64 - " xchg %%rbx,%%rsp \n" -#else - " xchgl %%ebx,%%esp \n" -#endif - " int3 \n" - " .globl jprobe_return_end\n" - " jprobe_return_end: \n" - " nop \n"::"b" - (kcb->jprobe_saved_sp):"memory"); -} -NOKPROBE_SYMBOL(jprobe_return); -NOKPROBE_SYMBOL(jprobe_return_end); - -int longjmp_break_handler(struct kprobe *p, struct pt_regs *regs) -{ - struct kprobe_ctlblk *kcb = get_kprobe_ctlblk(); - u8 *addr = (u8 *) (regs->ip - 1); - struct jprobe *jp = container_of(p, struct jprobe, kp); - void *saved_sp = kcb->jprobe_saved_sp; - - if ((addr > (u8 *) jprobe_return) && - (addr < (u8 *) jprobe_return_end)) { - if (stack_addr(regs) != saved_sp) { - struct pt_regs *saved_regs = &kcb->jprobe_saved_regs; - printk(KERN_ERR - "current sp %p does not match saved sp %p\n", - stack_addr(regs), saved_sp); - printk(KERN_ERR "Saved registers for jprobe %p\n", jp); - show_regs(saved_regs); - printk(KERN_ERR "Current registers\n"); - show_regs(regs); - BUG(); - } - /* It's OK to start function graph tracing again */ - unpause_graph_tracing(); - *regs = kcb->jprobe_saved_regs; - __memcpy(saved_sp, kcb->jprobes_stack, MIN_STACK_SIZE(saved_sp)); - preempt_enable_no_resched(); - return 1; - } - return 0; -} -NOKPROBE_SYMBOL(longjmp_break_handler); - bool arch_within_kprobe_blacklist(unsigned long addr) { bool is_in_entry_trampoline_section = false; diff --git a/arch/x86/kernel/kprobes/ftrace.c b/arch/x86/kernel/kprobes/ftrace.c index 8dc0161cec8f..ef819e19650b 100644 --- a/arch/x86/kernel/kprobes/ftrace.c +++ b/arch/x86/kernel/kprobes/ftrace.c @@ -25,36 +25,6 @@ #include "common.h" -static nokprobe_inline -void __skip_singlestep(struct kprobe *p, struct pt_regs *regs, - struct kprobe_ctlblk *kcb, unsigned long orig_ip) -{ - /* - * Emulate singlestep (and also recover regs->ip) - * as if there is a 5byte nop - */ - regs->ip = (unsigned long)p->addr + MCOUNT_INSN_SIZE; - if (unlikely(p->post_handler)) { - kcb->kprobe_status = KPROBE_HIT_SSDONE; - p->post_handler(p, regs, 0); - } - __this_cpu_write(current_kprobe, NULL); - if (orig_ip) - regs->ip = orig_ip; -} - -int skip_singlestep(struct kprobe *p, struct pt_regs *regs, - struct kprobe_ctlblk *kcb) -{ - if (kprobe_ftrace(p)) { - __skip_singlestep(p, regs, kcb, 0); - preempt_enable_no_resched(); - return 1; - } - return 0; -} -NOKPROBE_SYMBOL(skip_singlestep); - /* Ftrace callback handler for kprobes -- called under preepmt disabed */ void kprobe_ftrace_handler(unsigned long ip, unsigned long parent_ip, struct ftrace_ops *ops, struct pt_regs *regs) @@ -75,18 +45,25 @@ void kprobe_ftrace_handler(unsigned long ip, unsigned long parent_ip, /* Kprobe handler expects regs->ip = ip + 1 as breakpoint hit */ regs->ip = ip + sizeof(kprobe_opcode_t); - /* To emulate trap based kprobes, preempt_disable here */ - preempt_disable(); __this_cpu_write(current_kprobe, p); kcb->kprobe_status = KPROBE_HIT_ACTIVE; if (!p->pre_handler || !p->pre_handler(p, regs)) { - __skip_singlestep(p, regs, kcb, orig_ip); - preempt_enable_no_resched(); + /* + * Emulate singlestep (and also recover regs->ip) + * as if there is a 5byte nop + */ + regs->ip = (unsigned long)p->addr + MCOUNT_INSN_SIZE; + if (unlikely(p->post_handler)) { + kcb->kprobe_status = KPROBE_HIT_SSDONE; + p->post_handler(p, regs, 0); + } + regs->ip = orig_ip; } /* - * If pre_handler returns !0, it sets regs->ip and - * resets current kprobe, and keep preempt count +1. + * If pre_handler returns !0, it changes regs->ip. We have to + * skip emulating post_handler. */ + __this_cpu_write(current_kprobe, NULL); } } NOKPROBE_SYMBOL(kprobe_ftrace_handler); diff --git a/arch/x86/kernel/kprobes/opt.c b/arch/x86/kernel/kprobes/opt.c index 203d398802a3..eaf02f2e7300 100644 --- a/arch/x86/kernel/kprobes/opt.c +++ b/arch/x86/kernel/kprobes/opt.c @@ -491,7 +491,6 @@ int setup_detour_execution(struct kprobe *p, struct pt_regs *regs, int reenter) regs->ip = (unsigned long)op->optinsn.insn + TMPL_END_IDX; if (!reenter) reset_current_kprobe(); - preempt_enable_no_resched(); return 1; } return 0; diff --git a/arch/x86/kernel/kvm.c b/arch/x86/kernel/kvm.c index 5b2300b818af..a37bda38d205 100644 --- a/arch/x86/kernel/kvm.c +++ b/arch/x86/kernel/kvm.c @@ -154,7 +154,7 @@ void kvm_async_pf_task_wait(u32 token, int interrupt_kernel) for (;;) { if (!n.halted) - prepare_to_swait(&n.wq, &wait, TASK_UNINTERRUPTIBLE); + prepare_to_swait_exclusive(&n.wq, &wait, TASK_UNINTERRUPTIBLE); if (hlist_unhashed(&n.link)) break; @@ -188,7 +188,7 @@ static void apf_task_wake_one(struct kvm_task_sleep_node *n) if (n->halted) smp_send_reschedule(n->cpu); else if (swq_has_sleeper(&n->wq)) - swake_up(&n->wq); + swake_up_one(&n->wq); } static void apf_task_wake_all(void) diff --git a/arch/x86/kernel/kvmclock.c b/arch/x86/kernel/kvmclock.c index bf8d1eb7fca3..3b8e7c13c614 100644 --- a/arch/x86/kernel/kvmclock.c +++ b/arch/x86/kernel/kvmclock.c @@ -138,6 +138,7 @@ static unsigned long kvm_get_tsc_khz(void) src = &hv_clock[cpu].pvti; tsc_khz = pvclock_tsc_khz(src); put_cpu(); + setup_force_cpu_cap(X86_FEATURE_TSC_KNOWN_FREQ); return tsc_khz; } @@ -319,6 +320,8 @@ void __init kvmclock_init(void) printk(KERN_INFO "kvm-clock: Using msrs %x and %x", msr_kvm_system_time, msr_kvm_wall_clock); + pvclock_set_pvti_cpu0_va(hv_clock); + if (kvm_para_has_feature(KVM_FEATURE_CLOCKSOURCE_STABLE_BIT)) pvclock_set_flags(PVCLOCK_TSC_STABLE_BIT); @@ -366,14 +369,11 @@ int __init kvm_setup_vsyscall_timeinfo(void) vcpu_time = &hv_clock[cpu].pvti; flags = pvclock_read_flags(vcpu_time); - if (!(flags & PVCLOCK_TSC_STABLE_BIT)) { - put_cpu(); - return 1; - } - - pvclock_set_pvti_cpu0_va(hv_clock); put_cpu(); + if (!(flags & PVCLOCK_TSC_STABLE_BIT)) + return 1; + kvm_clock.archdata.vclock_mode = VCLOCK_PVCLOCK; #endif return 0; diff --git a/arch/x86/kernel/smpboot.c b/arch/x86/kernel/smpboot.c index c2f7d1d2a5c3..db9656e13ea0 100644 --- a/arch/x86/kernel/smpboot.c +++ b/arch/x86/kernel/smpboot.c @@ -221,6 +221,11 @@ static void notrace start_secondary(void *unused) #ifdef CONFIG_X86_32 /* switch away from the initial page table */ load_cr3(swapper_pg_dir); + /* + * Initialize the CR4 shadow before doing anything that could + * try to read it. + */ + cr4_init_shadow(); __flush_tlb_all(); #endif load_current_idt(); |