Diffstat (limited to 'arch/x86/kernel')
-rw-r--r--  arch/x86/kernel/Makefile                   |   1
-rw-r--r--  arch/x86/kernel/apic/apic.c                |   5
-rw-r--r--  arch/x86/kernel/apic/vector.c              |  19
-rw-r--r--  arch/x86/kernel/apic/x2apic_uv_x.c         |  60
-rw-r--r--  arch/x86/kernel/apm_32.c                   |   5
-rw-r--r--  arch/x86/kernel/cpu/amd.c                  |   4
-rw-r--r--  arch/x86/kernel/cpu/bugs.c                 |  12
-rw-r--r--  arch/x86/kernel/cpu/cacheinfo.c            |   2
-rw-r--r--  arch/x86/kernel/cpu/common.c               |   3
-rw-r--r--  arch/x86/kernel/cpu/mcheck/mce-severity.c  |   5
-rw-r--r--  arch/x86/kernel/cpu/mcheck/mce.c           | 247
-rw-r--r--  arch/x86/kernel/cpu/microcode/intel.c      |   5
-rw-r--r--  arch/x86/kernel/cpu/mtrr/if.c              |   3
-rw-r--r--  arch/x86/kernel/e820.c                     |  15
-rw-r--r--  arch/x86/kernel/head64.c                   |   2
-rw-r--r--  arch/x86/kernel/hw_breakpoint.c            | 131
-rw-r--r--  arch/x86/kernel/irqflags.S                 |  26
-rw-r--r--  arch/x86/kernel/kprobes/common.h           |  10
-rw-r--r--  arch/x86/kernel/kprobes/core.c             | 124
-rw-r--r--  arch/x86/kernel/kprobes/ftrace.c           |  49
-rw-r--r--  arch/x86/kernel/kprobes/opt.c              |   1
-rw-r--r--  arch/x86/kernel/kvm.c                      |   4
-rw-r--r--  arch/x86/kernel/kvmclock.c                 |  12
-rw-r--r--  arch/x86/kernel/quirks.c                   |  11
-rw-r--r--  arch/x86/kernel/signal.c                   |   2
-rw-r--r--  arch/x86/kernel/smpboot.c                  |   5
-rw-r--r--  arch/x86/kernel/traps.c                    |  14
-rw-r--r--  arch/x86/kernel/uprobes.c                  |   2
28 files changed, 394 insertions(+), 385 deletions(-)
diff --git a/arch/x86/kernel/Makefile b/arch/x86/kernel/Makefile
index 02d6f5cf4e70..8824d01c0c35 100644
--- a/arch/x86/kernel/Makefile
+++ b/arch/x86/kernel/Makefile
@@ -61,6 +61,7 @@ obj-y += alternative.o i8253.o hw_breakpoint.o
obj-y += tsc.o tsc_msr.o io_delay.o rtc.o
obj-y += pci-iommu_table.o
obj-y += resource.o
+obj-y += irqflags.o
obj-y += process.o
obj-y += fpu/
diff --git a/arch/x86/kernel/apic/apic.c b/arch/x86/kernel/apic/apic.c
index 2aabd4cb0e3f..07fa222f0c52 100644
--- a/arch/x86/kernel/apic/apic.c
+++ b/arch/x86/kernel/apic/apic.c
@@ -573,6 +573,9 @@ static u32 skx_deadline_rev(void)
case 0x04: return 0x02000014;
}
+ if (boot_cpu_data.x86_stepping > 4)
+ return 0;
+
return ~0U;
}
@@ -937,7 +940,7 @@ static int __init calibrate_APIC_clock(void)
if (levt->features & CLOCK_EVT_FEAT_DUMMY) {
pr_warning("APIC timer disabled due to verification failure\n");
- return -1;
+ return -1;
}
return 0;
diff --git a/arch/x86/kernel/apic/vector.c b/arch/x86/kernel/apic/vector.c
index 35aaee4fc028..0954315842c0 100644
--- a/arch/x86/kernel/apic/vector.c
+++ b/arch/x86/kernel/apic/vector.c
@@ -218,7 +218,8 @@ static int reserve_irq_vector(struct irq_data *irqd)
return 0;
}
-static int allocate_vector(struct irq_data *irqd, const struct cpumask *dest)
+static int
+assign_vector_locked(struct irq_data *irqd, const struct cpumask *dest)
{
struct apic_chip_data *apicd = apic_chip_data(irqd);
bool resvd = apicd->has_reserved;
@@ -245,22 +246,12 @@ static int allocate_vector(struct irq_data *irqd, const struct cpumask *dest)
return -EBUSY;
vector = irq_matrix_alloc(vector_matrix, dest, resvd, &cpu);
- if (vector > 0)
- apic_update_vector(irqd, vector, cpu);
trace_vector_alloc(irqd->irq, vector, resvd, vector);
- return vector;
-}
-
-static int assign_vector_locked(struct irq_data *irqd,
- const struct cpumask *dest)
-{
- struct apic_chip_data *apicd = apic_chip_data(irqd);
- int vector = allocate_vector(irqd, dest);
-
if (vector < 0)
return vector;
+ apic_update_vector(irqd, vector, cpu);
+ apic_update_irq_cfg(irqd, vector, cpu);
- apic_update_irq_cfg(irqd, apicd->vector, apicd->cpu);
return 0;
}
@@ -433,7 +424,7 @@ static int activate_managed(struct irq_data *irqd)
pr_err("Managed startup irq %u, no vector available\n",
irqd->irq);
}
- return ret;
+ return ret;
}
static int x86_vector_activate(struct irq_domain *dom, struct irq_data *irqd,
diff --git a/arch/x86/kernel/apic/x2apic_uv_x.c b/arch/x86/kernel/apic/x2apic_uv_x.c
index efaf2d4f9c3c..d492752f79e1 100644
--- a/arch/x86/kernel/apic/x2apic_uv_x.c
+++ b/arch/x86/kernel/apic/x2apic_uv_x.c
@@ -26,6 +26,7 @@
#include <linux/delay.h>
#include <linux/crash_dump.h>
#include <linux/reboot.h>
+#include <linux/memory.h>
#include <asm/uv/uv_mmrs.h>
#include <asm/uv/uv_hub.h>
@@ -392,6 +393,51 @@ extern int uv_hub_info_version(void)
}
EXPORT_SYMBOL(uv_hub_info_version);
+/* Default UV memory block size is 2GB */
+static unsigned long mem_block_size = (2UL << 30);
+
+/* Kernel parameter to specify UV mem block size */
+static int parse_mem_block_size(char *ptr)
+{
+ unsigned long size = memparse(ptr, NULL);
+
+ /* Size will be rounded down by set_block_size() below */
+ mem_block_size = size;
+ return 0;
+}
+early_param("uv_memblksize", parse_mem_block_size);
+
+static __init int adj_blksize(u32 lgre)
+{
+ unsigned long base = (unsigned long)lgre << UV_GAM_RANGE_SHFT;
+ unsigned long size;
+
+ for (size = mem_block_size; size > MIN_MEMORY_BLOCK_SIZE; size >>= 1)
+ if (IS_ALIGNED(base, size))
+ break;
+
+ if (size >= mem_block_size)
+ return 0;
+
+ mem_block_size = size;
+ return 1;
+}
+
+static __init void set_block_size(void)
+{
+ unsigned int order = ffs(mem_block_size);
+
+ if (order) {
+ /* adjust for ffs return of 1..64 */
+ set_memory_block_size_order(order - 1);
+ pr_info("UV: mem_block_size set to 0x%lx\n", mem_block_size);
+ } else {
+ /* bad or zero value, default to 1UL << 31 (2GB) */
+ pr_err("UV: mem_block_size error with 0x%lx\n", mem_block_size);
+ set_memory_block_size_order(31);
+ }
+}
+
/* Build GAM range lookup table: */
static __init void build_uv_gr_table(void)
{
@@ -1180,23 +1226,30 @@ static void __init decode_gam_rng_tbl(unsigned long ptr)
<< UV_GAM_RANGE_SHFT);
int order = 0;
char suffix[] = " KMGTPE";
+ int flag = ' ';
while (size > 9999 && order < sizeof(suffix)) {
size /= 1024;
order++;
}
+ /* adjust max block size to current range start */
+ if (gre->type == 1 || gre->type == 2)
+ if (adj_blksize(lgre))
+ flag = '*';
+
if (!index) {
pr_info("UV: GAM Range Table...\n");
- pr_info("UV: # %20s %14s %5s %4s %5s %3s %2s\n", "Range", "", "Size", "Type", "NASID", "SID", "PN");
+ pr_info("UV: # %20s %14s %6s %4s %5s %3s %2s\n", "Range", "", "Size", "Type", "NASID", "SID", "PN");
}
- pr_info("UV: %2d: 0x%014lx-0x%014lx %5lu%c %3d %04x %02x %02x\n",
+ pr_info("UV: %2d: 0x%014lx-0x%014lx%c %5lu%c %3d %04x %02x %02x\n",
index++,
(unsigned long)lgre << UV_GAM_RANGE_SHFT,
(unsigned long)gre->limit << UV_GAM_RANGE_SHFT,
- size, suffix[order],
+ flag, size, suffix[order],
gre->type, gre->nasid, gre->sockid, gre->pnode);
+ /* update to next range start */
lgre = gre->limit;
if (sock_min > gre->sockid)
sock_min = gre->sockid;
@@ -1427,6 +1480,7 @@ static void __init uv_system_init_hub(void)
build_socket_tables();
build_uv_gr_table();
+ set_block_size();
uv_init_hub_info(&hub_info);
uv_possible_blades = num_possible_nodes();
if (!_node_to_pnode)
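The adj_blksize() hunk above halves the candidate block size until it divides the range start, so after the GAM table walk the kernel settles on the largest power of two that aligns every range. A user-space sketch of just that rounding, with a stand-in floor constant instead of the kernel's MIN_MEMORY_BLOCK_SIZE:

#include <stdio.h>

#define MIN_BLOCK_SIZE (1UL << 27)	/* stand-in for MIN_MEMORY_BLOCK_SIZE */

/* Largest power-of-two size <= max that the range base is aligned to. */
static unsigned long round_block_size(unsigned long base, unsigned long max)
{
	unsigned long size;

	for (size = max; size > MIN_BLOCK_SIZE; size >>= 1)
		if ((base & (size - 1)) == 0)	/* IS_ALIGNED(base, size) */
			break;
	return size;
}

int main(void)
{
	/* a range starting at 1.25GB forces the 2GB default down to 256MB */
	printf("0x%lx\n", round_block_size(0x50000000UL, 2UL << 30));
	return 0;
}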
diff --git a/arch/x86/kernel/apm_32.c b/arch/x86/kernel/apm_32.c
index 5d0de79fdab0..ec00d1ff5098 100644
--- a/arch/x86/kernel/apm_32.c
+++ b/arch/x86/kernel/apm_32.c
@@ -240,6 +240,7 @@
#include <asm/olpc.h>
#include <asm/paravirt.h>
#include <asm/reboot.h>
+#include <asm/nospec-branch.h>
#if defined(CONFIG_APM_DISPLAY_BLANK) && defined(CONFIG_VT)
extern int (*console_blank_hook)(int);
@@ -614,11 +615,13 @@ static long __apm_bios_call(void *_call)
gdt[0x40 / 8] = bad_bios_desc;
apm_irq_save(flags);
+ firmware_restrict_branch_speculation_start();
APM_DO_SAVE_SEGS;
apm_bios_call_asm(call->func, call->ebx, call->ecx,
&call->eax, &call->ebx, &call->ecx, &call->edx,
&call->esi);
APM_DO_RESTORE_SEGS;
+ firmware_restrict_branch_speculation_end();
apm_irq_restore(flags);
gdt[0x40 / 8] = save_desc_40;
put_cpu();
@@ -690,10 +693,12 @@ static long __apm_bios_call_simple(void *_call)
gdt[0x40 / 8] = bad_bios_desc;
apm_irq_save(flags);
+ firmware_restrict_branch_speculation_start();
APM_DO_SAVE_SEGS;
error = apm_bios_call_simple_asm(call->func, call->ebx, call->ecx,
&call->eax);
APM_DO_RESTORE_SEGS;
+ firmware_restrict_branch_speculation_end();
apm_irq_restore(flags);
gdt[0x40 / 8] = save_desc_40;
put_cpu();
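Both hunks above apply the same bracketing: every drop into BIOS code is wrapped by the speculation-restriction helpers from <asm/nospec-branch.h>. A hedged sketch of the shape; apm_do_firmware_call() is a hypothetical stand-in for the real asm trampoline and segment save/restore:

/* Sketch only: the firmware_restrict_* helpers are real, the callee is not. */
static long call_firmware_safely(struct apm_bios_call *call)
{
	long error;

	firmware_restrict_branch_speculation_start();	/* e.g. enable IBRS */
	error = apm_do_firmware_call(call);		/* untrusted BIOS code */
	firmware_restrict_branch_speculation_end();

	return error;
}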
diff --git a/arch/x86/kernel/cpu/amd.c b/arch/x86/kernel/cpu/amd.c
index 082d7875cef8..38915fbfae73 100644
--- a/arch/x86/kernel/cpu/amd.c
+++ b/arch/x86/kernel/cpu/amd.c
@@ -543,7 +543,9 @@ static void bsp_init_amd(struct cpuinfo_x86 *c)
nodes_per_socket = ((value >> 3) & 7) + 1;
}
- if (c->x86 >= 0x15 && c->x86 <= 0x17) {
+ if (!boot_cpu_has(X86_FEATURE_AMD_SSBD) &&
+ !boot_cpu_has(X86_FEATURE_VIRT_SSBD) &&
+ c->x86 >= 0x15 && c->x86 <= 0x17) {
unsigned int bit;
switch (c->x86) {
diff --git a/arch/x86/kernel/cpu/bugs.c b/arch/x86/kernel/cpu/bugs.c
index cd0fda1fff6d..5c0ea39311fe 100644
--- a/arch/x86/kernel/cpu/bugs.c
+++ b/arch/x86/kernel/cpu/bugs.c
@@ -27,6 +27,7 @@
#include <asm/pgtable.h>
#include <asm/set_memory.h>
#include <asm/intel-family.h>
+#include <asm/hypervisor.h>
static void __init spectre_v2_select_mitigation(void);
static void __init ssb_select_mitigation(void);
@@ -154,7 +155,8 @@ x86_virt_spec_ctrl(u64 guest_spec_ctrl, u64 guest_virt_spec_ctrl, bool setguest)
guestval |= guest_spec_ctrl & x86_spec_ctrl_mask;
/* SSBD controlled in MSR_SPEC_CTRL */
- if (static_cpu_has(X86_FEATURE_SPEC_CTRL_SSBD))
+ if (static_cpu_has(X86_FEATURE_SPEC_CTRL_SSBD) ||
+ static_cpu_has(X86_FEATURE_AMD_SSBD))
hostval |= ssbd_tif_to_spec_ctrl(ti->flags);
if (hostval != guestval) {
@@ -532,9 +534,10 @@ static enum ssb_mitigation __init __ssb_select_mitigation(void)
* Intel uses the SPEC CTRL MSR Bit(2) for this, while AMD may
* use a completely different MSR and bit dependent on family.
*/
- if (!static_cpu_has(X86_FEATURE_MSR_SPEC_CTRL))
+ if (!static_cpu_has(X86_FEATURE_SPEC_CTRL_SSBD) &&
+ !static_cpu_has(X86_FEATURE_AMD_SSBD)) {
x86_amd_ssb_disable();
- else {
+ } else {
x86_spec_ctrl_base |= SPEC_CTRL_SSBD;
x86_spec_ctrl_mask |= SPEC_CTRL_SSBD;
wrmsrl(MSR_IA32_SPEC_CTRL, x86_spec_ctrl_base);
@@ -664,6 +667,9 @@ static ssize_t cpu_show_common(struct device *dev, struct device_attribute *attr
if (boot_cpu_has(X86_FEATURE_PTI))
return sprintf(buf, "Mitigation: PTI\n");
+ if (hypervisor_is_type(X86_HYPER_XEN_PV))
+ return sprintf(buf, "Unknown (XEN PV detected, hypervisor mitigation required)\n");
+
break;
case X86_BUG_SPECTRE_V1:
diff --git a/arch/x86/kernel/cpu/cacheinfo.c b/arch/x86/kernel/cpu/cacheinfo.c
index 38354c66df81..0c5fcbd998cf 100644
--- a/arch/x86/kernel/cpu/cacheinfo.c
+++ b/arch/x86/kernel/cpu/cacheinfo.c
@@ -671,7 +671,7 @@ void cacheinfo_amd_init_llc_id(struct cpuinfo_x86 *c, int cpu, u8 node_id)
num_sharing_cache = ((eax >> 14) & 0xfff) + 1;
if (num_sharing_cache) {
- int bits = get_count_order(num_sharing_cache) - 1;
+ int bits = get_count_order(num_sharing_cache);
per_cpu(cpu_llc_id, cpu) = c->apicid >> bits;
}
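The one-character fix above works because get_count_order() already rounds up to the next power of two: it yields the number of APIC-id bits covered by the sharing mask, so the extra "- 1" halved the derived LLC domain. A user-space sketch with made-up values:

#include <stdio.h>

/* Rough equivalent of the kernel's get_count_order(). */
static int count_order(unsigned int n)
{
	int order = 0;

	while ((1U << order) < n)
		order++;
	return order;
}

int main(void)
{
	unsigned int num_sharing_cache = 8;	/* 8 threads share the LLC */
	unsigned int apicid = 0x1a;
	int bits = count_order(num_sharing_cache);	/* 3, not 2 as before */

	printf("llc_id = %u\n", apicid >> bits);	/* 26 >> 3 = 3 */
	return 0;
}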
diff --git a/arch/x86/kernel/cpu/common.c b/arch/x86/kernel/cpu/common.c
index 0df7151cfef4..eb4cb3efd20e 100644
--- a/arch/x86/kernel/cpu/common.c
+++ b/arch/x86/kernel/cpu/common.c
@@ -1,3 +1,6 @@
+/* cpu_feature_enabled() cannot be used this early */
+#define USE_EARLY_PGTABLE_L5
+
#include <linux/bootmem.h>
#include <linux/linkage.h>
#include <linux/bitops.h>
diff --git a/arch/x86/kernel/cpu/mcheck/mce-severity.c b/arch/x86/kernel/cpu/mcheck/mce-severity.c
index 5bbd06f38ff6..f34d89c01edc 100644
--- a/arch/x86/kernel/cpu/mcheck/mce-severity.c
+++ b/arch/x86/kernel/cpu/mcheck/mce-severity.c
@@ -160,6 +160,11 @@ static struct severity {
SER, MASK(MCI_STATUS_OVER|MCI_UC_SAR|MCI_ADDR|MCACOD, MCI_UC_SAR|MCI_ADDR|MCACOD_INSTR),
USER
),
+ MCESEV(
+ PANIC, "Data load in unrecoverable area of kernel",
+ SER, MASK(MCI_STATUS_OVER|MCI_UC_SAR|MCI_ADDR|MCACOD, MCI_UC_SAR|MCI_ADDR|MCACOD_DATA),
+ KERNEL
+ ),
#endif
MCESEV(
PANIC, "Action required: unknown MCACOD",
diff --git a/arch/x86/kernel/cpu/mcheck/mce.c b/arch/x86/kernel/cpu/mcheck/mce.c
index e4cf6ff1c2e1..4b767284b7f5 100644
--- a/arch/x86/kernel/cpu/mcheck/mce.c
+++ b/arch/x86/kernel/cpu/mcheck/mce.c
@@ -123,8 +123,8 @@ void mce_setup(struct mce *m)
{
memset(m, 0, sizeof(struct mce));
m->cpu = m->extcpu = smp_processor_id();
- /* We hope get_seconds stays lockless */
- m->time = get_seconds();
+ /* need the internal __ version to avoid deadlocks */
+ m->time = __ktime_get_real_seconds();
m->cpuvendor = boot_cpu_data.x86_vendor;
m->cpuid = cpuid_eax(1);
m->socketid = cpu_data(m->extcpu).phys_proc_id;
@@ -772,23 +772,25 @@ EXPORT_SYMBOL_GPL(machine_check_poll);
static int mce_no_way_out(struct mce *m, char **msg, unsigned long *validp,
struct pt_regs *regs)
{
- int i, ret = 0;
char *tmp;
+ int i;
for (i = 0; i < mca_cfg.banks; i++) {
m->status = mce_rdmsrl(msr_ops.status(i));
- if (m->status & MCI_STATUS_VAL) {
- __set_bit(i, validp);
- if (quirk_no_way_out)
- quirk_no_way_out(i, m, regs);
- }
+ if (!(m->status & MCI_STATUS_VAL))
+ continue;
+
+ __set_bit(i, validp);
+ if (quirk_no_way_out)
+ quirk_no_way_out(i, m, regs);
if (mce_severity(m, mca_cfg.tolerant, &tmp, true) >= MCE_PANIC_SEVERITY) {
+ mce_read_aux(m, i);
*msg = tmp;
- ret = 1;
+ return 1;
}
}
- return ret;
+ return 0;
}
/*
@@ -1102,6 +1104,101 @@ static void mce_unmap_kpfn(unsigned long pfn)
}
#endif
+
+/*
+ * Cases where we avoid rendezvous handler timeout:
+ * 1) If this CPU is offline.
+ *
+ * 2) If crashing_cpu was set, e.g. we're entering kdump and we need to
+ * skip those CPUs which remain looping in the 1st kernel - see
+ * crash_nmi_callback().
+ *
+ * Note: there still is a small window between kexec-ing and the new,
+ * kdump kernel establishing a new #MC handler where a broadcasted MCE
+ * might not get handled properly.
+ */
+static bool __mc_check_crashing_cpu(int cpu)
+{
+ if (cpu_is_offline(cpu) ||
+ (crashing_cpu != -1 && crashing_cpu != cpu)) {
+ u64 mcgstatus;
+
+ mcgstatus = mce_rdmsrl(MSR_IA32_MCG_STATUS);
+ if (mcgstatus & MCG_STATUS_RIPV) {
+ mce_wrmsrl(MSR_IA32_MCG_STATUS, 0);
+ return true;
+ }
+ }
+ return false;
+}
+
+static void __mc_scan_banks(struct mce *m, struct mce *final,
+ unsigned long *toclear, unsigned long *valid_banks,
+ int no_way_out, int *worst)
+{
+ struct mca_config *cfg = &mca_cfg;
+ int severity, i;
+
+ for (i = 0; i < cfg->banks; i++) {
+ __clear_bit(i, toclear);
+ if (!test_bit(i, valid_banks))
+ continue;
+
+ if (!mce_banks[i].ctl)
+ continue;
+
+ m->misc = 0;
+ m->addr = 0;
+ m->bank = i;
+
+ m->status = mce_rdmsrl(msr_ops.status(i));
+ if (!(m->status & MCI_STATUS_VAL))
+ continue;
+
+ /*
+ * Corrected or non-signaled errors are handled by
+ * machine_check_poll(). Leave them alone, unless this panics.
+ */
+ if (!(m->status & (cfg->ser ? MCI_STATUS_S : MCI_STATUS_UC)) &&
+ !no_way_out)
+ continue;
+
+ /* Set taint even when machine check was not enabled. */
+ add_taint(TAINT_MACHINE_CHECK, LOCKDEP_NOW_UNRELIABLE);
+
+ severity = mce_severity(m, cfg->tolerant, NULL, true);
+
+ /*
+ * When machine check was for corrected/deferred handler don't
+ * touch, unless we're panicking.
+ */
+ if ((severity == MCE_KEEP_SEVERITY ||
+ severity == MCE_UCNA_SEVERITY) && !no_way_out)
+ continue;
+
+ __set_bit(i, toclear);
+
+ /* Machine check event was not enabled. Clear, but ignore. */
+ if (severity == MCE_NO_SEVERITY)
+ continue;
+
+ mce_read_aux(m, i);
+
+ /* assuming valid severity level != 0 */
+ m->severity = severity;
+
+ mce_log(m);
+
+ if (severity > *worst) {
+ *final = *m;
+ *worst = severity;
+ }
+ }
+
+ /* mce_clear_state will clear *final, save locally for use later */
+ *m = *final;
+}
+
/*
* The actual machine check handler. This only handles real
* exceptions when something got corrupted coming in through int 18.
@@ -1116,68 +1213,45 @@ static void mce_unmap_kpfn(unsigned long pfn)
*/
void do_machine_check(struct pt_regs *regs, long error_code)
{
+ DECLARE_BITMAP(valid_banks, MAX_NR_BANKS);
+ DECLARE_BITMAP(toclear, MAX_NR_BANKS);
struct mca_config *cfg = &mca_cfg;
+ int cpu = smp_processor_id();
+ char *msg = "Unknown";
struct mce m, *final;
- int i;
int worst = 0;
- int severity;
/*
* Establish sequential order between the CPUs entering the machine
* check handler.
*/
int order = -1;
+
/*
* If no_way_out gets set, there is no safe way to recover from this
* MCE. If mca_cfg.tolerant is cranked up, we'll try anyway.
*/
int no_way_out = 0;
+
/*
* If kill_it gets set, there might be a way to recover from this
* error.
*/
int kill_it = 0;
- DECLARE_BITMAP(toclear, MAX_NR_BANKS);
- DECLARE_BITMAP(valid_banks, MAX_NR_BANKS);
- char *msg = "Unknown";
/*
* MCEs are always local on AMD. Same is determined by MCG_STATUS_LMCES
* on Intel.
*/
int lmce = 1;
- int cpu = smp_processor_id();
-
- /*
- * Cases where we avoid rendezvous handler timeout:
- * 1) If this CPU is offline.
- *
- * 2) If crashing_cpu was set, e.g. we're entering kdump and we need to
- * skip those CPUs which remain looping in the 1st kernel - see
- * crash_nmi_callback().
- *
- * Note: there still is a small window between kexec-ing and the new,
- * kdump kernel establishing a new #MC handler where a broadcasted MCE
- * might not get handled properly.
- */
- if (cpu_is_offline(cpu) ||
- (crashing_cpu != -1 && crashing_cpu != cpu)) {
- u64 mcgstatus;
- mcgstatus = mce_rdmsrl(MSR_IA32_MCG_STATUS);
- if (mcgstatus & MCG_STATUS_RIPV) {
- mce_wrmsrl(MSR_IA32_MCG_STATUS, 0);
- return;
- }
- }
+ if (__mc_check_crashing_cpu(cpu))
+ return;
ist_enter(regs);
this_cpu_inc(mce_exception_count);
- if (!cfg->banks)
- goto out;
-
mce_gather_info(&m, regs);
m.tsc = rdtsc();
@@ -1205,75 +1279,20 @@ void do_machine_check(struct pt_regs *regs, long error_code)
lmce = m.mcgstatus & MCG_STATUS_LMCES;
/*
+ * Local machine check may already know that we have to panic.
+ * Broadcast machine check begins rendezvous in mce_start().
* Go through all banks in exclusion of the other CPUs. This way we
* don't report duplicated events on shared banks because the first one
- * to see it will clear it. If this is a Local MCE, then no need to
- * perform rendezvous.
+ * to see it will clear it.
*/
- if (!lmce)
+ if (lmce) {
+ if (no_way_out)
+ mce_panic("Fatal local machine check", &m, msg);
+ } else {
order = mce_start(&no_way_out);
-
- for (i = 0; i < cfg->banks; i++) {
- __clear_bit(i, toclear);
- if (!test_bit(i, valid_banks))
- continue;
- if (!mce_banks[i].ctl)
- continue;
-
- m.misc = 0;
- m.addr = 0;
- m.bank = i;
-
- m.status = mce_rdmsrl(msr_ops.status(i));
- if ((m.status & MCI_STATUS_VAL) == 0)
- continue;
-
- /*
- * Non uncorrected or non signaled errors are handled by
- * machine_check_poll. Leave them alone, unless this panics.
- */
- if (!(m.status & (cfg->ser ? MCI_STATUS_S : MCI_STATUS_UC)) &&
- !no_way_out)
- continue;
-
- /*
- * Set taint even when machine check was not enabled.
- */
- add_taint(TAINT_MACHINE_CHECK, LOCKDEP_NOW_UNRELIABLE);
-
- severity = mce_severity(&m, cfg->tolerant, NULL, true);
-
- /*
- * When machine check was for corrected/deferred handler don't
- * touch, unless we're panicing.
- */
- if ((severity == MCE_KEEP_SEVERITY ||
- severity == MCE_UCNA_SEVERITY) && !no_way_out)
- continue;
- __set_bit(i, toclear);
- if (severity == MCE_NO_SEVERITY) {
- /*
- * Machine check event was not enabled. Clear, but
- * ignore.
- */
- continue;
- }
-
- mce_read_aux(&m, i);
-
- /* assuming valid severity level != 0 */
- m.severity = severity;
-
- mce_log(&m);
-
- if (severity > worst) {
- *final = m;
- worst = severity;
- }
}
- /* mce_clear_state will clear *final, save locally for use later */
- m = *final;
+ __mc_scan_banks(&m, final, toclear, valid_banks, no_way_out, &worst);
if (!no_way_out)
mce_clear_state(toclear);
@@ -1287,12 +1306,17 @@ void do_machine_check(struct pt_regs *regs, long error_code)
no_way_out = worst >= MCE_PANIC_SEVERITY;
} else {
/*
- * Local MCE skipped calling mce_reign()
- * If we found a fatal error, we need to panic here.
+ * If there was a fatal machine check we should have
+ * already called mce_panic earlier in this function.
+ * Since we re-read the banks, we might have found
+ * something new. Check again to see if we found a
+ * fatal error. We call "mce_severity()" again to
+ * make sure we have the right "msg".
*/
- if (worst >= MCE_PANIC_SEVERITY && mca_cfg.tolerant < 3)
- mce_panic("Machine check from unknown source",
- NULL, NULL);
+ if (worst >= MCE_PANIC_SEVERITY && mca_cfg.tolerant < 3) {
+ mce_severity(&m, cfg->tolerant, &msg, true);
+ mce_panic("Local fatal machine check!", &m, msg);
+ }
}
/*
@@ -1307,7 +1331,7 @@ void do_machine_check(struct pt_regs *regs, long error_code)
if (worst > 0)
mce_report_event(regs);
mce_wrmsrl(MSR_IA32_MCG_STATUS, 0);
-out:
+
sync_core();
if (worst != MCE_AR_SEVERITY && !kill_it)
@@ -2153,9 +2177,6 @@ static ssize_t store_int_with_restart(struct device *s,
if (check_interval == old_check_interval)
return ret;
- if (check_interval < 1)
- check_interval = 1;
-
mutex_lock(&mce_sysfs_mutex);
mce_restart();
mutex_unlock(&mce_sysfs_mutex);
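The refactored handler threads its per-bank state through two on-stack bitmaps (valid_banks, toclear). Nothing exotic is involved, just the stock <linux/bitops.h> helpers; a compact sketch of the pattern, using 32 banks purely as an illustrative size:

DECLARE_BITMAP(valid_banks, 32);	/* one bit per MCA bank */
DECLARE_BITMAP(toclear, 32);

static void demo_scan(int nbanks)
{
	int i;

	bitmap_zero(valid_banks, 32);
	bitmap_zero(toclear, 32);
	__set_bit(3, valid_banks);	/* pretend bank 3 logged an event */

	for (i = 0; i < nbanks; i++) {
		__clear_bit(i, toclear);
		if (!test_bit(i, valid_banks))
			continue;
		__set_bit(i, toclear);	/* this bank's status gets cleared later */
	}
}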
diff --git a/arch/x86/kernel/cpu/microcode/intel.c b/arch/x86/kernel/cpu/microcode/intel.c
index 1c2cfa0644aa..97ccf4c3b45b 100644
--- a/arch/x86/kernel/cpu/microcode/intel.c
+++ b/arch/x86/kernel/cpu/microcode/intel.c
@@ -190,8 +190,11 @@ static void save_microcode_patch(void *data, unsigned int size)
p = memdup_patch(data, size);
if (!p)
pr_err("Error allocating buffer %p\n", data);
- else
+ else {
list_replace(&iter->plist, &p->plist);
+ kfree(iter->data);
+ kfree(iter);
+ }
}
}
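The leak fixed above is a general property of list_replace(): it only rewires the links, so the replaced node (and whatever it owns) must still be freed by hand. A hedged sketch with a hypothetical node type shaped like the driver's patch struct:

struct patch_node {
	struct list_head plist;
	void *data;
};

static void replace_patch(struct patch_node *old, struct patch_node *new)
{
	list_replace(&old->plist, &new->plist);	/* unlinks 'old', nothing more */

	kfree(old->data);	/* ...so free it explicitly, as the fix does */
	kfree(old);
}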
diff --git a/arch/x86/kernel/cpu/mtrr/if.c b/arch/x86/kernel/cpu/mtrr/if.c
index 4021d3859499..40eee6cc4124 100644
--- a/arch/x86/kernel/cpu/mtrr/if.c
+++ b/arch/x86/kernel/cpu/mtrr/if.c
@@ -106,7 +106,8 @@ mtrr_write(struct file *file, const char __user *buf, size_t len, loff_t * ppos)
memset(line, 0, LINE_SIZE);
- length = strncpy_from_user(line, buf, LINE_SIZE - 1);
+ len = min_t(size_t, len, LINE_SIZE - 1);
+ length = strncpy_from_user(line, buf, len);
if (length < 0)
return length;
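The bug here: strncpy_from_user() stops at a NUL, but a user buffer of len bytes need not contain one, so always asking for LINE_SIZE - 1 bytes could read past the end of the user mapping. Clamping to min(len, LINE_SIZE - 1) bounds the copy by both buffers. The same shape in a hedged write() handler sketch (names hypothetical):

static ssize_t demo_write(struct file *file, const char __user *buf,
			  size_t len, loff_t *ppos)
{
	char line[128];
	ssize_t length;

	memset(line, 0, sizeof(line));
	len = min_t(size_t, len, sizeof(line) - 1);	/* bound by both buffers */

	length = strncpy_from_user(line, buf, len);
	if (length < 0)
		return length;		/* -EFAULT on a bad user pointer */

	/* parse 'line' ... */
	return len;			/* consume what was accepted */
}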
diff --git a/arch/x86/kernel/e820.c b/arch/x86/kernel/e820.c
index d1f25c831447..c88c23c658c1 100644
--- a/arch/x86/kernel/e820.c
+++ b/arch/x86/kernel/e820.c
@@ -1248,6 +1248,7 @@ void __init e820__memblock_setup(void)
{
int i;
u64 end;
+ u64 addr = 0;
/*
* The bootstrap memblock region count maximum is 128 entries
@@ -1264,13 +1265,21 @@ void __init e820__memblock_setup(void)
struct e820_entry *entry = &e820_table->entries[i];
end = entry->addr + entry->size;
+ if (addr < entry->addr)
+ memblock_reserve(addr, entry->addr - addr);
+ addr = end;
if (end != (resource_size_t)end)
continue;
+ /*
+ * all !E820_TYPE_RAM ranges (including gap ranges) are put
+ * into memblock.reserved to make sure that struct pages in
+ * such regions are not left uninitialized after bootup.
+ */
if (entry->type != E820_TYPE_RAM && entry->type != E820_TYPE_RESERVED_KERN)
- continue;
-
- memblock_add(entry->addr, entry->size);
+ memblock_reserve(entry->addr, entry->size);
+ else
+ memblock_add(entry->addr, entry->size);
}
/* Throw away partial pages: */
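The new loop depends on e820 entries being sorted by address: addr carries the end of the previous entry forward, so any hole between consecutive entries lands in memblock.reserved as well. A user-space sketch of the gap walk with made-up ranges:

#include <stdio.h>

struct range { unsigned long addr, size; };

int main(void)
{
	/* sorted, with a hole between 0x9f000 and 0x100000 */
	struct range table[] = {
		{ 0x0,      0x9f000   },
		{ 0x100000, 0x7ff0000 },
	};
	unsigned long addr = 0;

	for (int i = 0; i < 2; i++) {
		unsigned long end = table[i].addr + table[i].size;

		if (addr < table[i].addr)	/* gap before this entry */
			printf("reserve 0x%lx-0x%lx\n", addr, table[i].addr);
		addr = end;
	}
	return 0;
}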
diff --git a/arch/x86/kernel/head64.c b/arch/x86/kernel/head64.c
index a21d6ace648e..8047379e575a 100644
--- a/arch/x86/kernel/head64.c
+++ b/arch/x86/kernel/head64.c
@@ -44,7 +44,7 @@ static unsigned int __initdata next_early_pgt;
pmdval_t early_pmd_flags = __PAGE_KERNEL_LARGE & ~(_PAGE_GLOBAL | _PAGE_NX);
#ifdef CONFIG_X86_5LEVEL
-unsigned int __pgtable_l5_enabled __initdata;
+unsigned int __pgtable_l5_enabled __ro_after_init;
unsigned int pgdir_shift __ro_after_init = 39;
EXPORT_SYMBOL(pgdir_shift);
unsigned int ptrs_per_p4d __ro_after_init = 1;
diff --git a/arch/x86/kernel/hw_breakpoint.c b/arch/x86/kernel/hw_breakpoint.c
index 8771766d46b6..34a5c1715148 100644
--- a/arch/x86/kernel/hw_breakpoint.c
+++ b/arch/x86/kernel/hw_breakpoint.c
@@ -169,28 +169,29 @@ void arch_uninstall_hw_breakpoint(struct perf_event *bp)
set_dr_addr_mask(0, i);
}
-/*
- * Check for virtual address in kernel space.
- */
-int arch_check_bp_in_kernelspace(struct perf_event *bp)
+static int arch_bp_generic_len(int x86_len)
{
- unsigned int len;
- unsigned long va;
- struct arch_hw_breakpoint *info = counter_arch_bp(bp);
-
- va = info->address;
- len = bp->attr.bp_len;
-
- /*
- * We don't need to worry about va + len - 1 overflowing:
- * we already require that va is aligned to a multiple of len.
- */
- return (va >= TASK_SIZE_MAX) || ((va + len - 1) >= TASK_SIZE_MAX);
+ switch (x86_len) {
+ case X86_BREAKPOINT_LEN_1:
+ return HW_BREAKPOINT_LEN_1;
+ case X86_BREAKPOINT_LEN_2:
+ return HW_BREAKPOINT_LEN_2;
+ case X86_BREAKPOINT_LEN_4:
+ return HW_BREAKPOINT_LEN_4;
+#ifdef CONFIG_X86_64
+ case X86_BREAKPOINT_LEN_8:
+ return HW_BREAKPOINT_LEN_8;
+#endif
+ default:
+ return -EINVAL;
+ }
}
int arch_bp_generic_fields(int x86_len, int x86_type,
int *gen_len, int *gen_type)
{
+ int len;
+
/* Type */
switch (x86_type) {
case X86_BREAKPOINT_EXECUTE:
@@ -211,42 +212,47 @@ int arch_bp_generic_fields(int x86_len, int x86_type,
}
/* Len */
- switch (x86_len) {
- case X86_BREAKPOINT_LEN_1:
- *gen_len = HW_BREAKPOINT_LEN_1;
- break;
- case X86_BREAKPOINT_LEN_2:
- *gen_len = HW_BREAKPOINT_LEN_2;
- break;
- case X86_BREAKPOINT_LEN_4:
- *gen_len = HW_BREAKPOINT_LEN_4;
- break;
-#ifdef CONFIG_X86_64
- case X86_BREAKPOINT_LEN_8:
- *gen_len = HW_BREAKPOINT_LEN_8;
- break;
-#endif
- default:
+ len = arch_bp_generic_len(x86_len);
+ if (len < 0)
return -EINVAL;
- }
+ *gen_len = len;
return 0;
}
-
-static int arch_build_bp_info(struct perf_event *bp)
+/*
+ * Check for virtual address in kernel space.
+ */
+int arch_check_bp_in_kernelspace(struct arch_hw_breakpoint *hw)
{
- struct arch_hw_breakpoint *info = counter_arch_bp(bp);
+ unsigned long va;
+ int len;
- info->address = bp->attr.bp_addr;
+ va = hw->address;
+ len = arch_bp_generic_len(hw->len);
+ WARN_ON_ONCE(len < 0);
+
+ /*
+ * We don't need to worry about va + len - 1 overflowing:
+ * we already require that va is aligned to a multiple of len.
+ */
+ return (va >= TASK_SIZE_MAX) || ((va + len - 1) >= TASK_SIZE_MAX);
+}
+
+static int arch_build_bp_info(struct perf_event *bp,
+ const struct perf_event_attr *attr,
+ struct arch_hw_breakpoint *hw)
+{
+ hw->address = attr->bp_addr;
+ hw->mask = 0;
/* Type */
- switch (bp->attr.bp_type) {
+ switch (attr->bp_type) {
case HW_BREAKPOINT_W:
- info->type = X86_BREAKPOINT_WRITE;
+ hw->type = X86_BREAKPOINT_WRITE;
break;
case HW_BREAKPOINT_W | HW_BREAKPOINT_R:
- info->type = X86_BREAKPOINT_RW;
+ hw->type = X86_BREAKPOINT_RW;
break;
case HW_BREAKPOINT_X:
/*
@@ -254,23 +260,23 @@ static int arch_build_bp_info(struct perf_event *bp)
* acceptable for kprobes. On non-kprobes kernels, we don't
* allow kernel breakpoints at all.
*/
- if (bp->attr.bp_addr >= TASK_SIZE_MAX) {
+ if (attr->bp_addr >= TASK_SIZE_MAX) {
#ifdef CONFIG_KPROBES
- if (within_kprobe_blacklist(bp->attr.bp_addr))
+ if (within_kprobe_blacklist(attr->bp_addr))
return -EINVAL;
#else
return -EINVAL;
#endif
}
- info->type = X86_BREAKPOINT_EXECUTE;
+ hw->type = X86_BREAKPOINT_EXECUTE;
/*
* x86 inst breakpoints need to have a specific undefined len.
* But we still need to check userspace is not trying to setup
* an unsupported length, to get a range breakpoint for example.
*/
- if (bp->attr.bp_len == sizeof(long)) {
- info->len = X86_BREAKPOINT_LEN_X;
+ if (attr->bp_len == sizeof(long)) {
+ hw->len = X86_BREAKPOINT_LEN_X;
return 0;
}
default:
@@ -278,28 +284,26 @@ static int arch_build_bp_info(struct perf_event *bp)
}
/* Len */
- info->mask = 0;
-
- switch (bp->attr.bp_len) {
+ switch (attr->bp_len) {
case HW_BREAKPOINT_LEN_1:
- info->len = X86_BREAKPOINT_LEN_1;
+ hw->len = X86_BREAKPOINT_LEN_1;
break;
case HW_BREAKPOINT_LEN_2:
- info->len = X86_BREAKPOINT_LEN_2;
+ hw->len = X86_BREAKPOINT_LEN_2;
break;
case HW_BREAKPOINT_LEN_4:
- info->len = X86_BREAKPOINT_LEN_4;
+ hw->len = X86_BREAKPOINT_LEN_4;
break;
#ifdef CONFIG_X86_64
case HW_BREAKPOINT_LEN_8:
- info->len = X86_BREAKPOINT_LEN_8;
+ hw->len = X86_BREAKPOINT_LEN_8;
break;
#endif
default:
/* AMD range breakpoint */
- if (!is_power_of_2(bp->attr.bp_len))
+ if (!is_power_of_2(attr->bp_len))
return -EINVAL;
- if (bp->attr.bp_addr & (bp->attr.bp_len - 1))
+ if (attr->bp_addr & (attr->bp_len - 1))
return -EINVAL;
if (!boot_cpu_has(X86_FEATURE_BPEXT))
@@ -312,8 +316,8 @@ static int arch_build_bp_info(struct perf_event *bp)
* breakpoints, then we'll have to check for kprobe-blacklisted
* addresses anywhere in the range.
*/
- info->mask = bp->attr.bp_len - 1;
- info->len = X86_BREAKPOINT_LEN_1;
+ hw->mask = attr->bp_len - 1;
+ hw->len = X86_BREAKPOINT_LEN_1;
}
return 0;
@@ -322,22 +326,23 @@ static int arch_build_bp_info(struct perf_event *bp)
/*
* Validate the arch-specific HW Breakpoint register settings
*/
-int arch_validate_hwbkpt_settings(struct perf_event *bp)
+int hw_breakpoint_arch_parse(struct perf_event *bp,
+ const struct perf_event_attr *attr,
+ struct arch_hw_breakpoint *hw)
{
- struct arch_hw_breakpoint *info = counter_arch_bp(bp);
unsigned int align;
int ret;
- ret = arch_build_bp_info(bp);
+ ret = arch_build_bp_info(bp, attr, hw);
if (ret)
return ret;
- switch (info->len) {
+ switch (hw->len) {
case X86_BREAKPOINT_LEN_1:
align = 0;
- if (info->mask)
- align = info->mask;
+ if (hw->mask)
+ align = hw->mask;
break;
case X86_BREAKPOINT_LEN_2:
align = 1;
@@ -358,7 +363,7 @@ int arch_validate_hwbkpt_settings(struct perf_event *bp)
* Check that the low-order bits of the address are appropriate
* for the alignment implied by len.
*/
- if (info->address & align)
+ if (hw->address & align)
return -EINVAL;
return 0;
diff --git a/arch/x86/kernel/irqflags.S b/arch/x86/kernel/irqflags.S
new file mode 100644
index 000000000000..ddeeaac8adda
--- /dev/null
+++ b/arch/x86/kernel/irqflags.S
@@ -0,0 +1,26 @@
+/* SPDX-License-Identifier: GPL-2.0 */
+
+#include <asm/asm.h>
+#include <asm/export.h>
+#include <linux/linkage.h>
+
+/*
+ * unsigned long native_save_fl(void)
+ */
+ENTRY(native_save_fl)
+ pushf
+ pop %_ASM_AX
+ ret
+ENDPROC(native_save_fl)
+EXPORT_SYMBOL(native_save_fl)
+
+/*
+ * void native_restore_fl(unsigned long flags)
+ * %eax/%rdi: flags
+ */
+ENTRY(native_restore_fl)
+ push %_ASM_ARG1
+ popf
+ ret
+ENDPROC(native_restore_fl)
+EXPORT_SYMBOL(native_restore_fl)
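These out-of-line copies give native_save_fl()/native_restore_fl() real symbol addresses, so they can be used as function pointers (e.g. by the paravirt code), while C callers keep using them as ordinary functions. A hedged sketch of a caller; the declarations mirror the ones in asm/irqflags.h:

extern unsigned long native_save_fl(void);
extern void native_restore_fl(unsigned long flags);

static void demo_flags_roundtrip(void)
{
	unsigned long flags = native_save_fl();	/* pushf; pop %rax */

	/* ... work with a known EFLAGS snapshot ... */

	native_restore_fl(flags);		/* push %rdi (32-bit: %eax); popf */
}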
diff --git a/arch/x86/kernel/kprobes/common.h b/arch/x86/kernel/kprobes/common.h
index ae38dccf0c8f..2b949f4fd4d8 100644
--- a/arch/x86/kernel/kprobes/common.h
+++ b/arch/x86/kernel/kprobes/common.h
@@ -105,14 +105,4 @@ static inline unsigned long __recover_optprobed_insn(kprobe_opcode_t *buf, unsig
}
#endif
-#ifdef CONFIG_KPROBES_ON_FTRACE
-extern int skip_singlestep(struct kprobe *p, struct pt_regs *regs,
- struct kprobe_ctlblk *kcb);
-#else
-static inline int skip_singlestep(struct kprobe *p, struct pt_regs *regs,
- struct kprobe_ctlblk *kcb)
-{
- return 0;
-}
-#endif
#endif
diff --git a/arch/x86/kernel/kprobes/core.c b/arch/x86/kernel/kprobes/core.c
index 6f4d42377fe5..b0d1e81c96bb 100644
--- a/arch/x86/kernel/kprobes/core.c
+++ b/arch/x86/kernel/kprobes/core.c
@@ -66,8 +66,6 @@
#include "common.h"
-void jprobe_return_end(void);
-
DEFINE_PER_CPU(struct kprobe *, current_kprobe) = NULL;
DEFINE_PER_CPU(struct kprobe_ctlblk, kprobe_ctlblk);
@@ -395,8 +393,6 @@ int __copy_instruction(u8 *dest, u8 *src, u8 *real, struct insn *insn)
- (u8 *) real;
if ((s64) (s32) newdisp != newdisp) {
pr_err("Kprobes error: new displacement does not fit into s32 (%llx)\n", newdisp);
- pr_err("\tSrc: %p, Dest: %p, old disp: %x\n",
- src, real, insn->displacement.value);
return 0;
}
disp = (u8 *) dest + insn_offset_displacement(insn);
@@ -596,7 +592,6 @@ static void setup_singlestep(struct kprobe *p, struct pt_regs *regs,
* stepping.
*/
regs->ip = (unsigned long)p->ainsn.insn;
- preempt_enable_no_resched();
return;
}
#endif
@@ -640,8 +635,7 @@ static int reenter_kprobe(struct kprobe *p, struct pt_regs *regs,
* Raise a BUG or we'll continue in an endless reentering loop
* and eventually a stack overflow.
*/
- printk(KERN_WARNING "Unrecoverable kprobe detected at %p.\n",
- p->addr);
+ pr_err("Unrecoverable kprobe detected.\n");
dump_kprobe(p);
BUG();
default:
@@ -669,12 +663,10 @@ int kprobe_int3_handler(struct pt_regs *regs)
addr = (kprobe_opcode_t *)(regs->ip - sizeof(kprobe_opcode_t));
/*
- * We don't want to be preempted for the entire
- * duration of kprobe processing. We conditionally
- * re-enable preemption at the end of this function,
- * and also in reenter_kprobe() and setup_singlestep().
+ * We don't want to be preempted for the entire duration of kprobe
+ * processing. Since int3 and the debug trap disable irqs and we clear
+ * IF while single-stepping, it must not be preemptible.
*/
- preempt_disable();
kcb = get_kprobe_ctlblk();
p = get_kprobe(addr);
@@ -690,13 +682,14 @@ int kprobe_int3_handler(struct pt_regs *regs)
/*
* If we have no pre-handler or it returned 0, we
* continue with normal processing. If we have a
- * pre-handler and it returned non-zero, it prepped
- * for calling the break_handler below on re-entry
- * for jprobe processing, so get out doing nothing
- * more here.
+ * pre-handler and it returned non-zero, that means the
+ * user handler set up registers to exit to another
+ * instruction, so we must skip the single stepping.
*/
if (!p->pre_handler || !p->pre_handler(p, regs))
setup_singlestep(p, regs, kcb, 0);
+ else
+ reset_current_kprobe();
return 1;
}
} else if (*addr != BREAKPOINT_INSTRUCTION) {
@@ -710,18 +703,9 @@ int kprobe_int3_handler(struct pt_regs *regs)
* the original instruction.
*/
regs->ip = (unsigned long)addr;
- preempt_enable_no_resched();
return 1;
- } else if (kprobe_running()) {
- p = __this_cpu_read(current_kprobe);
- if (p->break_handler && p->break_handler(p, regs)) {
- if (!skip_singlestep(p, regs, kcb))
- setup_singlestep(p, regs, kcb, 0);
- return 1;
- }
} /* else: not a kprobe fault; let the kernel handle it */
- preempt_enable_no_resched();
return 0;
}
NOKPROBE_SYMBOL(kprobe_int3_handler);
@@ -972,8 +956,6 @@ int kprobe_debug_handler(struct pt_regs *regs)
}
reset_current_kprobe();
out:
- preempt_enable_no_resched();
-
/*
* if somebody else is singlestepping across a probe point, flags
* will have TF set, in which case, continue the remaining processing
@@ -1020,7 +1002,6 @@ int kprobe_fault_handler(struct pt_regs *regs, int trapnr)
restore_previous_kprobe(kcb);
else
reset_current_kprobe();
- preempt_enable_no_resched();
} else if (kcb->kprobe_status == KPROBE_HIT_ACTIVE ||
kcb->kprobe_status == KPROBE_HIT_SSDONE) {
/*
@@ -1083,93 +1064,6 @@ int kprobe_exceptions_notify(struct notifier_block *self, unsigned long val,
}
NOKPROBE_SYMBOL(kprobe_exceptions_notify);
-int setjmp_pre_handler(struct kprobe *p, struct pt_regs *regs)
-{
- struct jprobe *jp = container_of(p, struct jprobe, kp);
- unsigned long addr;
- struct kprobe_ctlblk *kcb = get_kprobe_ctlblk();
-
- kcb->jprobe_saved_regs = *regs;
- kcb->jprobe_saved_sp = stack_addr(regs);
- addr = (unsigned long)(kcb->jprobe_saved_sp);
-
- /*
- * As Linus pointed out, gcc assumes that the callee
- * owns the argument space and could overwrite it, e.g.
- * tailcall optimization. So, to be absolutely safe
- * we also save and restore enough stack bytes to cover
- * the argument area.
- * Use __memcpy() to avoid KASAN stack out-of-bounds reports as we copy
- * raw stack chunk with redzones:
- */
- __memcpy(kcb->jprobes_stack, (kprobe_opcode_t *)addr, MIN_STACK_SIZE(addr));
- regs->ip = (unsigned long)(jp->entry);
-
- /*
- * jprobes use jprobe_return() which skips the normal return
- * path of the function, and this messes up the accounting of the
- * function graph tracer to get messed up.
- *
- * Pause function graph tracing while performing the jprobe function.
- */
- pause_graph_tracing();
- return 1;
-}
-NOKPROBE_SYMBOL(setjmp_pre_handler);
-
-void jprobe_return(void)
-{
- struct kprobe_ctlblk *kcb = get_kprobe_ctlblk();
-
- /* Unpoison stack redzones in the frames we are going to jump over. */
- kasan_unpoison_stack_above_sp_to(kcb->jprobe_saved_sp);
-
- asm volatile (
-#ifdef CONFIG_X86_64
- " xchg %%rbx,%%rsp \n"
-#else
- " xchgl %%ebx,%%esp \n"
-#endif
- " int3 \n"
- " .globl jprobe_return_end\n"
- " jprobe_return_end: \n"
- " nop \n"::"b"
- (kcb->jprobe_saved_sp):"memory");
-}
-NOKPROBE_SYMBOL(jprobe_return);
-NOKPROBE_SYMBOL(jprobe_return_end);
-
-int longjmp_break_handler(struct kprobe *p, struct pt_regs *regs)
-{
- struct kprobe_ctlblk *kcb = get_kprobe_ctlblk();
- u8 *addr = (u8 *) (regs->ip - 1);
- struct jprobe *jp = container_of(p, struct jprobe, kp);
- void *saved_sp = kcb->jprobe_saved_sp;
-
- if ((addr > (u8 *) jprobe_return) &&
- (addr < (u8 *) jprobe_return_end)) {
- if (stack_addr(regs) != saved_sp) {
- struct pt_regs *saved_regs = &kcb->jprobe_saved_regs;
- printk(KERN_ERR
- "current sp %p does not match saved sp %p\n",
- stack_addr(regs), saved_sp);
- printk(KERN_ERR "Saved registers for jprobe %p\n", jp);
- show_regs(saved_regs);
- printk(KERN_ERR "Current registers\n");
- show_regs(regs);
- BUG();
- }
- /* It's OK to start function graph tracing again */
- unpause_graph_tracing();
- *regs = kcb->jprobe_saved_regs;
- __memcpy(saved_sp, kcb->jprobes_stack, MIN_STACK_SIZE(saved_sp));
- preempt_enable_no_resched();
- return 1;
- }
- return 0;
-}
-NOKPROBE_SYMBOL(longjmp_break_handler);
-
bool arch_within_kprobe_blacklist(unsigned long addr)
{
bool is_in_entry_trampoline_section = false;
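With the jprobe machinery deleted above, the supported way to hook a function entry is a plain kprobe pre-handler, and after this series it runs under the int3 handler's own non-preemptible context instead of an explicit preempt_disable(). A minimal registration sketch; the probed symbol is only an example:

#include <linux/kprobes.h>
#include <linux/module.h>

static int demo_pre(struct kprobe *p, struct pt_regs *regs)
{
	pr_info("hit %s, ip=%lx\n", p->symbol_name, regs->ip);
	return 0;	/* 0: proceed with single-stepping as usual */
}

static struct kprobe demo_kp = {
	.symbol_name = "do_sys_open",	/* example target */
	.pre_handler = demo_pre,
};

static int __init demo_init(void)
{
	return register_kprobe(&demo_kp);
}

static void __exit demo_exit(void)
{
	unregister_kprobe(&demo_kp);
}

module_init(demo_init);
module_exit(demo_exit);
MODULE_LICENSE("GPL");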
diff --git a/arch/x86/kernel/kprobes/ftrace.c b/arch/x86/kernel/kprobes/ftrace.c
index 8dc0161cec8f..ef819e19650b 100644
--- a/arch/x86/kernel/kprobes/ftrace.c
+++ b/arch/x86/kernel/kprobes/ftrace.c
@@ -25,36 +25,6 @@
#include "common.h"
-static nokprobe_inline
-void __skip_singlestep(struct kprobe *p, struct pt_regs *regs,
- struct kprobe_ctlblk *kcb, unsigned long orig_ip)
-{
- /*
- * Emulate singlestep (and also recover regs->ip)
- * as if there is a 5byte nop
- */
- regs->ip = (unsigned long)p->addr + MCOUNT_INSN_SIZE;
- if (unlikely(p->post_handler)) {
- kcb->kprobe_status = KPROBE_HIT_SSDONE;
- p->post_handler(p, regs, 0);
- }
- __this_cpu_write(current_kprobe, NULL);
- if (orig_ip)
- regs->ip = orig_ip;
-}
-
-int skip_singlestep(struct kprobe *p, struct pt_regs *regs,
- struct kprobe_ctlblk *kcb)
-{
- if (kprobe_ftrace(p)) {
- __skip_singlestep(p, regs, kcb, 0);
- preempt_enable_no_resched();
- return 1;
- }
- return 0;
-}
-NOKPROBE_SYMBOL(skip_singlestep);
-
/* Ftrace callback handler for kprobes -- called under preempt disabled */
void kprobe_ftrace_handler(unsigned long ip, unsigned long parent_ip,
struct ftrace_ops *ops, struct pt_regs *regs)
@@ -75,18 +45,25 @@ void kprobe_ftrace_handler(unsigned long ip, unsigned long parent_ip,
/* Kprobe handler expects regs->ip = ip + 1 as breakpoint hit */
regs->ip = ip + sizeof(kprobe_opcode_t);
- /* To emulate trap based kprobes, preempt_disable here */
- preempt_disable();
__this_cpu_write(current_kprobe, p);
kcb->kprobe_status = KPROBE_HIT_ACTIVE;
if (!p->pre_handler || !p->pre_handler(p, regs)) {
- __skip_singlestep(p, regs, kcb, orig_ip);
- preempt_enable_no_resched();
+ /*
+ * Emulate singlestep (and also recover regs->ip)
+ * as if there is a 5byte nop
+ */
+ regs->ip = (unsigned long)p->addr + MCOUNT_INSN_SIZE;
+ if (unlikely(p->post_handler)) {
+ kcb->kprobe_status = KPROBE_HIT_SSDONE;
+ p->post_handler(p, regs, 0);
+ }
+ regs->ip = orig_ip;
}
/*
- * If pre_handler returns !0, it sets regs->ip and
- * resets current kprobe, and keep preempt count +1.
+ * If pre_handler returns !0, it changes regs->ip. We have to
+ * skip emulating the post_handler.
*/
+ __this_cpu_write(current_kprobe, NULL);
}
}
NOKPROBE_SYMBOL(kprobe_ftrace_handler);
diff --git a/arch/x86/kernel/kprobes/opt.c b/arch/x86/kernel/kprobes/opt.c
index 203d398802a3..eaf02f2e7300 100644
--- a/arch/x86/kernel/kprobes/opt.c
+++ b/arch/x86/kernel/kprobes/opt.c
@@ -491,7 +491,6 @@ int setup_detour_execution(struct kprobe *p, struct pt_regs *regs, int reenter)
regs->ip = (unsigned long)op->optinsn.insn + TMPL_END_IDX;
if (!reenter)
reset_current_kprobe();
- preempt_enable_no_resched();
return 1;
}
return 0;
diff --git a/arch/x86/kernel/kvm.c b/arch/x86/kernel/kvm.c
index 5b2300b818af..a37bda38d205 100644
--- a/arch/x86/kernel/kvm.c
+++ b/arch/x86/kernel/kvm.c
@@ -154,7 +154,7 @@ void kvm_async_pf_task_wait(u32 token, int interrupt_kernel)
for (;;) {
if (!n.halted)
- prepare_to_swait(&n.wq, &wait, TASK_UNINTERRUPTIBLE);
+ prepare_to_swait_exclusive(&n.wq, &wait, TASK_UNINTERRUPTIBLE);
if (hlist_unhashed(&n.link))
break;
@@ -188,7 +188,7 @@ static void apf_task_wake_one(struct kvm_task_sleep_node *n)
if (n->halted)
smp_send_reschedule(n->cpu);
else if (swq_has_sleeper(&n->wq))
- swake_up(&n->wq);
+ swake_up_one(&n->wq);
}
static void apf_task_wake_all(void)
diff --git a/arch/x86/kernel/kvmclock.c b/arch/x86/kernel/kvmclock.c
index bf8d1eb7fca3..3b8e7c13c614 100644
--- a/arch/x86/kernel/kvmclock.c
+++ b/arch/x86/kernel/kvmclock.c
@@ -138,6 +138,7 @@ static unsigned long kvm_get_tsc_khz(void)
src = &hv_clock[cpu].pvti;
tsc_khz = pvclock_tsc_khz(src);
put_cpu();
+ setup_force_cpu_cap(X86_FEATURE_TSC_KNOWN_FREQ);
return tsc_khz;
}
@@ -319,6 +320,8 @@ void __init kvmclock_init(void)
printk(KERN_INFO "kvm-clock: Using msrs %x and %x",
msr_kvm_system_time, msr_kvm_wall_clock);
+ pvclock_set_pvti_cpu0_va(hv_clock);
+
if (kvm_para_has_feature(KVM_FEATURE_CLOCKSOURCE_STABLE_BIT))
pvclock_set_flags(PVCLOCK_TSC_STABLE_BIT);
@@ -366,14 +369,11 @@ int __init kvm_setup_vsyscall_timeinfo(void)
vcpu_time = &hv_clock[cpu].pvti;
flags = pvclock_read_flags(vcpu_time);
- if (!(flags & PVCLOCK_TSC_STABLE_BIT)) {
- put_cpu();
- return 1;
- }
-
- pvclock_set_pvti_cpu0_va(hv_clock);
put_cpu();
+ if (!(flags & PVCLOCK_TSC_STABLE_BIT))
+ return 1;
+
kvm_clock.archdata.vclock_mode = VCLOCK_PVCLOCK;
#endif
return 0;
diff --git a/arch/x86/kernel/quirks.c b/arch/x86/kernel/quirks.c
index 697a4ce04308..736348ead421 100644
--- a/arch/x86/kernel/quirks.c
+++ b/arch/x86/kernel/quirks.c
@@ -645,12 +645,19 @@ static void quirk_intel_brickland_xeon_ras_cap(struct pci_dev *pdev)
/* Skylake */
static void quirk_intel_purley_xeon_ras_cap(struct pci_dev *pdev)
{
- u32 capid0;
+ u32 capid0, capid5;
pci_read_config_dword(pdev, 0x84, &capid0);
+ pci_read_config_dword(pdev, 0x98, &capid5);
- if ((capid0 & 0xc0) == 0xc0)
+ /*
+ * CAPID0{7:6} indicate whether this is an advanced RAS SKU
+ * CAPID5{8:5} indicate that various NVDIMM usage modes are
+ * enabled, so memory machine check recovery is also enabled.
+ */
+ if ((capid0 & 0xc0) == 0xc0 || (capid5 & 0x1e0))
static_branch_inc(&mcsafe_key);
+
}
DECLARE_PCI_FIXUP_EARLY(PCI_VENDOR_ID_INTEL, 0x0ec3, quirk_intel_brickland_xeon_ras_cap);
DECLARE_PCI_FIXUP_EARLY(PCI_VENDOR_ID_INTEL, 0x2fc0, quirk_intel_brickland_xeon_ras_cap);
diff --git a/arch/x86/kernel/signal.c b/arch/x86/kernel/signal.c
index 445ca11ff863..92a3b312a53c 100644
--- a/arch/x86/kernel/signal.c
+++ b/arch/x86/kernel/signal.c
@@ -692,7 +692,7 @@ setup_rt_frame(struct ksignal *ksig, struct pt_regs *regs)
* Increment event counter and perform fixup for the pre-signal
* frame.
*/
- rseq_signal_deliver(regs);
+ rseq_signal_deliver(ksig, regs);
/* Set up the stack frame */
if (is_ia32_frame(ksig)) {
diff --git a/arch/x86/kernel/smpboot.c b/arch/x86/kernel/smpboot.c
index c2f7d1d2a5c3..db9656e13ea0 100644
--- a/arch/x86/kernel/smpboot.c
+++ b/arch/x86/kernel/smpboot.c
@@ -221,6 +221,11 @@ static void notrace start_secondary(void *unused)
#ifdef CONFIG_X86_32
/* switch away from the initial page table */
load_cr3(swapper_pg_dir);
+ /*
+ * Initialize the CR4 shadow before doing anything that could
+ * try to read it.
+ */
+ cr4_init_shadow();
__flush_tlb_all();
#endif
load_current_idt();
diff --git a/arch/x86/kernel/traps.c b/arch/x86/kernel/traps.c
index a535dd64de63..e6db475164ed 100644
--- a/arch/x86/kernel/traps.c
+++ b/arch/x86/kernel/traps.c
@@ -835,16 +835,18 @@ static void math_error(struct pt_regs *regs, int error_code, int trapnr)
char *str = (trapnr == X86_TRAP_MF) ? "fpu exception" :
"simd exception";
- if (notify_die(DIE_TRAP, str, regs, error_code, trapnr, SIGFPE) == NOTIFY_STOP)
- return;
cond_local_irq_enable(regs);
if (!user_mode(regs)) {
- if (!fixup_exception(regs, trapnr)) {
- task->thread.error_code = error_code;
- task->thread.trap_nr = trapnr;
+ if (fixup_exception(regs, trapnr))
+ return;
+
+ task->thread.error_code = error_code;
+ task->thread.trap_nr = trapnr;
+
+ if (notify_die(DIE_TRAP, str, regs, error_code,
+ trapnr, SIGFPE) != NOTIFY_STOP)
die(str, regs, error_code);
- }
return;
}
diff --git a/arch/x86/kernel/uprobes.c b/arch/x86/kernel/uprobes.c
index 58d8d800875d..deb576b23b7c 100644
--- a/arch/x86/kernel/uprobes.c
+++ b/arch/x86/kernel/uprobes.c
@@ -293,7 +293,7 @@ static int uprobe_init_insn(struct arch_uprobe *auprobe, struct insn *insn, bool
insn_init(insn, auprobe->insn, sizeof(auprobe->insn), x86_64);
/* has the side-effect of processing the entire instruction */
insn_get_length(insn);
- if (WARN_ON_ONCE(!insn_complete(insn)))
+ if (!insn_complete(insn))
return -ENOEXEC;
if (is_prefix_bad(insn))