diff options
author | Greg Kroah-Hartman <gregkh@linuxfoundation.org> | 2019-07-29 08:25:19 +0200 |
---|---|---|
committer | Greg Kroah-Hartman <gregkh@linuxfoundation.org> | 2019-07-29 08:25:19 +0200 |
commit | 6de465a50a900b29793aaba67f41f98f0e5354a0 (patch) | |
tree | f13eaeb75f0bfa3f7e609a58bcb891a869164042 /kernel | |
parent | 41db5f8397eee75afff82655a4884b5786a1d302 (diff) | |
parent | 609488bc979f99f805f34e9a32c1e3b71179d10b (diff) | |
download | linux-6de465a50a900b29793aaba67f41f98f0e5354a0.tar.gz linux-6de465a50a900b29793aaba67f41f98f0e5354a0.tar.bz2 linux-6de465a50a900b29793aaba67f41f98f0e5354a0.zip |
Merge 5.3-rc2 into char-misc-next
We want the char/misc fixes in here as well.
Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
Diffstat (limited to 'kernel')
-rw-r--r-- | kernel/Kconfig.preempt | 8 | ||||
-rw-r--r-- | kernel/cred.c | 21 | ||||
-rw-r--r-- | kernel/events/core.c | 2 | ||||
-rw-r--r-- | kernel/exit.c | 1 | ||||
-rw-r--r-- | kernel/fork.c | 2 | ||||
-rw-r--r-- | kernel/locking/lockdep.c | 13 | ||||
-rw-r--r-- | kernel/locking/lockdep_proc.c | 3 | ||||
-rw-r--r-- | kernel/locking/mutex.c | 11 | ||||
-rw-r--r-- | kernel/locking/rwsem.c | 28 | ||||
-rw-r--r-- | kernel/sched/fair.c | 144 |
10 files changed, 167 insertions, 66 deletions
diff --git a/kernel/Kconfig.preempt b/kernel/Kconfig.preempt index fc020c09b7e8..deff97217496 100644 --- a/kernel/Kconfig.preempt +++ b/kernel/Kconfig.preempt @@ -35,10 +35,10 @@ config PREEMPT_VOLUNTARY Select this if you are building a kernel for a desktop system. -config PREEMPT_LL +config PREEMPT bool "Preemptible Kernel (Low-Latency Desktop)" depends on !ARCH_NO_PREEMPT - select PREEMPT + select PREEMPTION select UNINLINE_SPIN_UNLOCK if !ARCH_INLINE_SPIN_UNLOCK help This option reduces the latency of the kernel by making @@ -58,7 +58,7 @@ config PREEMPT_LL config PREEMPT_RT bool "Fully Preemptible Kernel (Real-Time)" depends on EXPERT && ARCH_SUPPORTS_RT - select PREEMPT + select PREEMPTION help This option turns the kernel into a real-time kernel by replacing various locking primitives (spinlocks, rwlocks, etc.) with @@ -77,6 +77,6 @@ endchoice config PREEMPT_COUNT bool -config PREEMPT +config PREEMPTION bool select PREEMPT_COUNT diff --git a/kernel/cred.c b/kernel/cred.c index f9a0ce66c9c3..c0a4c12d38b2 100644 --- a/kernel/cred.c +++ b/kernel/cred.c @@ -144,7 +144,10 @@ void __put_cred(struct cred *cred) BUG_ON(cred == current->cred); BUG_ON(cred == current->real_cred); - call_rcu(&cred->rcu, put_cred_rcu); + if (cred->non_rcu) + put_cred_rcu(&cred->rcu); + else + call_rcu(&cred->rcu, put_cred_rcu); } EXPORT_SYMBOL(__put_cred); @@ -261,6 +264,7 @@ struct cred *prepare_creds(void) old = task->cred; memcpy(new, old, sizeof(struct cred)); + new->non_rcu = 0; atomic_set(&new->usage, 1); set_cred_subscribers(new, 0); get_group_info(new->group_info); @@ -544,7 +548,19 @@ const struct cred *override_creds(const struct cred *new) validate_creds(old); validate_creds(new); - get_cred(new); + + /* + * NOTE! This uses 'get_new_cred()' rather than 'get_cred()'. + * + * That means that we do not clear the 'non_rcu' flag, since + * we are only installing the cred into the thread-synchronous + * '->cred' pointer, not the '->real_cred' pointer that is + * visible to other threads under RCU. + * + * Also note that we did validate_creds() manually, not depending + * on the validation in 'get_cred()'. + */ + get_new_cred((struct cred *)new); alter_cred_subscribers(new, 1); rcu_assign_pointer(current->cred, new); alter_cred_subscribers(old, -1); @@ -681,6 +697,7 @@ struct cred *prepare_kernel_cred(struct task_struct *daemon) validate_creds(old); *new = *old; + new->non_rcu = 0; atomic_set(&new->usage, 1); set_cred_subscribers(new, 0); get_uid(new->user); diff --git a/kernel/events/core.c b/kernel/events/core.c index 026a14541a38..0463c1151bae 100644 --- a/kernel/events/core.c +++ b/kernel/events/core.c @@ -11274,7 +11274,7 @@ perf_event_create_kernel_counter(struct perf_event_attr *attr, int cpu, goto err_unlock; } - perf_install_in_context(ctx, event, cpu); + perf_install_in_context(ctx, event, event->cpu); perf_unpin_context(ctx); mutex_unlock(&ctx->mutex); diff --git a/kernel/exit.c b/kernel/exit.c index a75b6a7f458a..4436158a6d30 100644 --- a/kernel/exit.c +++ b/kernel/exit.c @@ -720,6 +720,7 @@ static void exit_notify(struct task_struct *tsk, int group_dead) if (group_dead) kill_orphaned_pgrp(tsk->group_leader, NULL); + tsk->exit_state = EXIT_ZOMBIE; if (unlikely(tsk->ptrace)) { int sig = thread_group_leader(tsk) && thread_group_empty(tsk) && diff --git a/kernel/fork.c b/kernel/fork.c index d8ae0f1b4148..2852d0e76ea3 100644 --- a/kernel/fork.c +++ b/kernel/fork.c @@ -726,7 +726,7 @@ void __put_task_struct(struct task_struct *tsk) WARN_ON(tsk == current); cgroup_free(tsk); - task_numa_free(tsk); + task_numa_free(tsk, true); security_task_free(tsk); exit_creds(tsk); delayacct_tsk_free(tsk); diff --git a/kernel/locking/lockdep.c b/kernel/locking/lockdep.c index 341f52117f88..4861cf8e274b 100644 --- a/kernel/locking/lockdep.c +++ b/kernel/locking/lockdep.c @@ -448,7 +448,7 @@ static void print_lockdep_off(const char *bug_msg) unsigned long nr_stack_trace_entries; -#if defined(CONFIG_TRACE_IRQFLAGS) && defined(CONFIG_PROVE_LOCKING) +#ifdef CONFIG_PROVE_LOCKING /* * Stack-trace: tightly packed array of stack backtrace * addresses. Protected by the graph_lock. @@ -491,7 +491,7 @@ unsigned int max_lockdep_depth; DEFINE_PER_CPU(struct lockdep_stats, lockdep_stats); #endif -#if defined(CONFIG_TRACE_IRQFLAGS) && defined(CONFIG_PROVE_LOCKING) +#ifdef CONFIG_PROVE_LOCKING /* * Locking printouts: */ @@ -2969,7 +2969,7 @@ static void check_chain_key(struct task_struct *curr) #endif } -#if defined(CONFIG_TRACE_IRQFLAGS) && defined(CONFIG_PROVE_LOCKING) +#ifdef CONFIG_PROVE_LOCKING static int mark_lock(struct task_struct *curr, struct held_lock *this, enum lock_usage_bit new_bit); @@ -3608,7 +3608,7 @@ static int mark_lock(struct task_struct *curr, struct held_lock *this, return ret; } -#else /* defined(CONFIG_TRACE_IRQFLAGS) && defined(CONFIG_PROVE_LOCKING) */ +#else /* CONFIG_PROVE_LOCKING */ static inline int mark_usage(struct task_struct *curr, struct held_lock *hlock, int check) @@ -3627,7 +3627,7 @@ static inline int separate_irq_context(struct task_struct *curr, return 0; } -#endif /* defined(CONFIG_TRACE_IRQFLAGS) && defined(CONFIG_PROVE_LOCKING) */ +#endif /* CONFIG_PROVE_LOCKING */ /* * Initialize a lock instance's lock-class mapping info: @@ -4321,8 +4321,7 @@ static void __lock_unpin_lock(struct lockdep_map *lock, struct pin_cookie cookie */ static void check_flags(unsigned long flags) { -#if defined(CONFIG_PROVE_LOCKING) && defined(CONFIG_DEBUG_LOCKDEP) && \ - defined(CONFIG_TRACE_IRQFLAGS) +#if defined(CONFIG_PROVE_LOCKING) && defined(CONFIG_DEBUG_LOCKDEP) if (!debug_locks) return; diff --git a/kernel/locking/lockdep_proc.c b/kernel/locking/lockdep_proc.c index 65b6a1600c8f..bda006f8a88b 100644 --- a/kernel/locking/lockdep_proc.c +++ b/kernel/locking/lockdep_proc.c @@ -200,7 +200,6 @@ static void lockdep_stats_debug_show(struct seq_file *m) static int lockdep_stats_show(struct seq_file *m, void *v) { - struct lock_class *class; unsigned long nr_unused = 0, nr_uncategorized = 0, nr_irq_safe = 0, nr_irq_unsafe = 0, nr_softirq_safe = 0, nr_softirq_unsafe = 0, @@ -211,6 +210,8 @@ static int lockdep_stats_show(struct seq_file *m, void *v) sum_forward_deps = 0; #ifdef CONFIG_PROVE_LOCKING + struct lock_class *class; + list_for_each_entry(class, &all_lock_classes, lock_entry) { if (class->usage_mask == 0) diff --git a/kernel/locking/mutex.c b/kernel/locking/mutex.c index edd1c082dbf5..5e069734363c 100644 --- a/kernel/locking/mutex.c +++ b/kernel/locking/mutex.c @@ -908,6 +908,10 @@ __mutex_lock_common(struct mutex *lock, long state, unsigned int subclass, might_sleep(); +#ifdef CONFIG_DEBUG_MUTEXES + DEBUG_LOCKS_WARN_ON(lock->magic != lock); +#endif + ww = container_of(lock, struct ww_mutex, base); if (use_ww_ctx && ww_ctx) { if (unlikely(ww_ctx == READ_ONCE(ww->ctx))) @@ -1379,8 +1383,13 @@ __ww_mutex_lock_interruptible_slowpath(struct ww_mutex *lock, */ int __sched mutex_trylock(struct mutex *lock) { - bool locked = __mutex_trylock(lock); + bool locked; + +#ifdef CONFIG_DEBUG_MUTEXES + DEBUG_LOCKS_WARN_ON(lock->magic != lock); +#endif + locked = __mutex_trylock(lock); if (locked) mutex_acquire(&lock->dep_map, 0, 1, _RET_IP_); diff --git a/kernel/locking/rwsem.c b/kernel/locking/rwsem.c index 37524a47f002..bd0f0d05724c 100644 --- a/kernel/locking/rwsem.c +++ b/kernel/locking/rwsem.c @@ -666,7 +666,11 @@ static inline bool rwsem_can_spin_on_owner(struct rw_semaphore *sem, preempt_disable(); rcu_read_lock(); owner = rwsem_owner_flags(sem, &flags); - if ((flags & nonspinnable) || (owner && !owner_on_cpu(owner))) + /* + * Don't check the read-owner as the entry may be stale. + */ + if ((flags & nonspinnable) || + (owner && !(flags & RWSEM_READER_OWNED) && !owner_on_cpu(owner))) ret = false; rcu_read_unlock(); preempt_enable(); @@ -1000,6 +1004,7 @@ rwsem_down_read_slowpath(struct rw_semaphore *sem, int state) atomic_long_add(-RWSEM_READER_BIAS, &sem->count); adjustment = 0; if (rwsem_optimistic_spin(sem, false)) { + /* rwsem_optimistic_spin() implies ACQUIRE on success */ /* * Wake up other readers in the wait list if the front * waiter is a reader. @@ -1014,6 +1019,7 @@ rwsem_down_read_slowpath(struct rw_semaphore *sem, int state) } return sem; } else if (rwsem_reader_phase_trylock(sem, waiter.last_rowner)) { + /* rwsem_reader_phase_trylock() implies ACQUIRE on success */ return sem; } @@ -1032,6 +1038,8 @@ queue: */ if (adjustment && !(atomic_long_read(&sem->count) & (RWSEM_WRITER_MASK | RWSEM_FLAG_HANDOFF))) { + /* Provide lock ACQUIRE */ + smp_acquire__after_ctrl_dep(); raw_spin_unlock_irq(&sem->wait_lock); rwsem_set_reader_owned(sem); lockevent_inc(rwsem_rlock_fast); @@ -1065,15 +1073,18 @@ queue: wake_up_q(&wake_q); /* wait to be given the lock */ - while (true) { + for (;;) { set_current_state(state); - if (!waiter.task) + if (!smp_load_acquire(&waiter.task)) { + /* Matches rwsem_mark_wake()'s smp_store_release(). */ break; + } if (signal_pending_state(state, current)) { raw_spin_lock_irq(&sem->wait_lock); if (waiter.task) goto out_nolock; raw_spin_unlock_irq(&sem->wait_lock); + /* Ordered by sem->wait_lock against rwsem_mark_wake(). */ break; } schedule(); @@ -1083,6 +1094,7 @@ queue: __set_current_state(TASK_RUNNING); lockevent_inc(rwsem_rlock); return sem; + out_nolock: list_del(&waiter.list); if (list_empty(&sem->wait_list)) { @@ -1123,8 +1135,10 @@ rwsem_down_write_slowpath(struct rw_semaphore *sem, int state) /* do optimistic spinning and steal lock if possible */ if (rwsem_can_spin_on_owner(sem, RWSEM_WR_NONSPINNABLE) && - rwsem_optimistic_spin(sem, true)) + rwsem_optimistic_spin(sem, true)) { + /* rwsem_optimistic_spin() implies ACQUIRE on success */ return sem; + } /* * Disable reader optimistic spinning for this rwsem after @@ -1184,9 +1198,11 @@ rwsem_down_write_slowpath(struct rw_semaphore *sem, int state) wait: /* wait until we successfully acquire the lock */ set_current_state(state); - while (true) { - if (rwsem_try_write_lock(sem, wstate)) + for (;;) { + if (rwsem_try_write_lock(sem, wstate)) { + /* rwsem_try_write_lock() implies ACQUIRE on success */ break; + } raw_spin_unlock_irq(&sem->wait_lock); diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c index 036be95a87e9..bc9cfeaac8bd 100644 --- a/kernel/sched/fair.c +++ b/kernel/sched/fair.c @@ -1086,6 +1086,21 @@ struct numa_group { unsigned long faults[0]; }; +/* + * For functions that can be called in multiple contexts that permit reading + * ->numa_group (see struct task_struct for locking rules). + */ +static struct numa_group *deref_task_numa_group(struct task_struct *p) +{ + return rcu_dereference_check(p->numa_group, p == current || + (lockdep_is_held(&task_rq(p)->lock) && !READ_ONCE(p->on_cpu))); +} + +static struct numa_group *deref_curr_numa_group(struct task_struct *p) +{ + return rcu_dereference_protected(p->numa_group, p == current); +} + static inline unsigned long group_faults_priv(struct numa_group *ng); static inline unsigned long group_faults_shared(struct numa_group *ng); @@ -1129,10 +1144,12 @@ static unsigned int task_scan_start(struct task_struct *p) { unsigned long smin = task_scan_min(p); unsigned long period = smin; + struct numa_group *ng; /* Scale the maximum scan period with the amount of shared memory. */ - if (p->numa_group) { - struct numa_group *ng = p->numa_group; + rcu_read_lock(); + ng = rcu_dereference(p->numa_group); + if (ng) { unsigned long shared = group_faults_shared(ng); unsigned long private = group_faults_priv(ng); @@ -1140,6 +1157,7 @@ static unsigned int task_scan_start(struct task_struct *p) period *= shared + 1; period /= private + shared + 1; } + rcu_read_unlock(); return max(smin, period); } @@ -1148,13 +1166,14 @@ static unsigned int task_scan_max(struct task_struct *p) { unsigned long smin = task_scan_min(p); unsigned long smax; + struct numa_group *ng; /* Watch for min being lower than max due to floor calculations */ smax = sysctl_numa_balancing_scan_period_max / task_nr_scan_windows(p); /* Scale the maximum scan period with the amount of shared memory. */ - if (p->numa_group) { - struct numa_group *ng = p->numa_group; + ng = deref_curr_numa_group(p); + if (ng) { unsigned long shared = group_faults_shared(ng); unsigned long private = group_faults_priv(ng); unsigned long period = smax; @@ -1186,7 +1205,7 @@ void init_numa_balancing(unsigned long clone_flags, struct task_struct *p) p->numa_scan_period = sysctl_numa_balancing_scan_delay; p->numa_work.next = &p->numa_work; p->numa_faults = NULL; - p->numa_group = NULL; + RCU_INIT_POINTER(p->numa_group, NULL); p->last_task_numa_placement = 0; p->last_sum_exec_runtime = 0; @@ -1233,7 +1252,16 @@ static void account_numa_dequeue(struct rq *rq, struct task_struct *p) pid_t task_numa_group_id(struct task_struct *p) { - return p->numa_group ? p->numa_group->gid : 0; + struct numa_group *ng; + pid_t gid = 0; + + rcu_read_lock(); + ng = rcu_dereference(p->numa_group); + if (ng) + gid = ng->gid; + rcu_read_unlock(); + + return gid; } /* @@ -1258,11 +1286,13 @@ static inline unsigned long task_faults(struct task_struct *p, int nid) static inline unsigned long group_faults(struct task_struct *p, int nid) { - if (!p->numa_group) + struct numa_group *ng = deref_task_numa_group(p); + + if (!ng) return 0; - return p->numa_group->faults[task_faults_idx(NUMA_MEM, nid, 0)] + - p->numa_group->faults[task_faults_idx(NUMA_MEM, nid, 1)]; + return ng->faults[task_faults_idx(NUMA_MEM, nid, 0)] + + ng->faults[task_faults_idx(NUMA_MEM, nid, 1)]; } static inline unsigned long group_faults_cpu(struct numa_group *group, int nid) @@ -1400,12 +1430,13 @@ static inline unsigned long task_weight(struct task_struct *p, int nid, static inline unsigned long group_weight(struct task_struct *p, int nid, int dist) { + struct numa_group *ng = deref_task_numa_group(p); unsigned long faults, total_faults; - if (!p->numa_group) + if (!ng) return 0; - total_faults = p->numa_group->total_faults; + total_faults = ng->total_faults; if (!total_faults) return 0; @@ -1419,7 +1450,7 @@ static inline unsigned long group_weight(struct task_struct *p, int nid, bool should_numa_migrate_memory(struct task_struct *p, struct page * page, int src_nid, int dst_cpu) { - struct numa_group *ng = p->numa_group; + struct numa_group *ng = deref_curr_numa_group(p); int dst_nid = cpu_to_node(dst_cpu); int last_cpupid, this_cpupid; @@ -1600,13 +1631,14 @@ static bool load_too_imbalanced(long src_load, long dst_load, static void task_numa_compare(struct task_numa_env *env, long taskimp, long groupimp, bool maymove) { + struct numa_group *cur_ng, *p_ng = deref_curr_numa_group(env->p); struct rq *dst_rq = cpu_rq(env->dst_cpu); + long imp = p_ng ? groupimp : taskimp; struct task_struct *cur; long src_load, dst_load; - long load; - long imp = env->p->numa_group ? groupimp : taskimp; - long moveimp = imp; int dist = env->dist; + long moveimp = imp; + long load; if (READ_ONCE(dst_rq->numa_migrate_on)) return; @@ -1645,21 +1677,22 @@ static void task_numa_compare(struct task_numa_env *env, * If dst and source tasks are in the same NUMA group, or not * in any group then look only at task weights. */ - if (cur->numa_group == env->p->numa_group) { + cur_ng = rcu_dereference(cur->numa_group); + if (cur_ng == p_ng) { imp = taskimp + task_weight(cur, env->src_nid, dist) - task_weight(cur, env->dst_nid, dist); /* * Add some hysteresis to prevent swapping the * tasks within a group over tiny differences. */ - if (cur->numa_group) + if (cur_ng) imp -= imp / 16; } else { /* * Compare the group weights. If a task is all by itself * (not part of a group), use the task weight instead. */ - if (cur->numa_group && env->p->numa_group) + if (cur_ng && p_ng) imp += group_weight(cur, env->src_nid, dist) - group_weight(cur, env->dst_nid, dist); else @@ -1757,11 +1790,12 @@ static int task_numa_migrate(struct task_struct *p) .best_imp = 0, .best_cpu = -1, }; + unsigned long taskweight, groupweight; struct sched_domain *sd; + long taskimp, groupimp; + struct numa_group *ng; struct rq *best_rq; - unsigned long taskweight, groupweight; int nid, ret, dist; - long taskimp, groupimp; /* * Pick the lowest SD_NUMA domain, as that would have the smallest @@ -1807,7 +1841,8 @@ static int task_numa_migrate(struct task_struct *p) * multiple NUMA nodes; in order to better consolidate the group, * we need to check other locations. */ - if (env.best_cpu == -1 || (p->numa_group && p->numa_group->active_nodes > 1)) { + ng = deref_curr_numa_group(p); + if (env.best_cpu == -1 || (ng && ng->active_nodes > 1)) { for_each_online_node(nid) { if (nid == env.src_nid || nid == p->numa_preferred_nid) continue; @@ -1840,7 +1875,7 @@ static int task_numa_migrate(struct task_struct *p) * A task that migrated to a second choice node will be better off * trying for a better one later. Do not set the preferred node here. */ - if (p->numa_group) { + if (ng) { if (env.best_cpu == -1) nid = env.src_nid; else @@ -2135,6 +2170,7 @@ static void task_numa_placement(struct task_struct *p) unsigned long total_faults; u64 runtime, period; spinlock_t *group_lock = NULL; + struct numa_group *ng; /* * The p->mm->numa_scan_seq field gets updated without @@ -2152,8 +2188,9 @@ static void task_numa_placement(struct task_struct *p) runtime = numa_get_avg_runtime(p, &period); /* If the task is part of a group prevent parallel updates to group stats */ - if (p->numa_group) { - group_lock = &p->numa_group->lock; + ng = deref_curr_numa_group(p); + if (ng) { + group_lock = &ng->lock; spin_lock_irq(group_lock); } @@ -2194,7 +2231,7 @@ static void task_numa_placement(struct task_struct *p) p->numa_faults[cpu_idx] += f_diff; faults += p->numa_faults[mem_idx]; p->total_numa_faults += diff; - if (p->numa_group) { + if (ng) { /* * safe because we can only change our own group * @@ -2202,14 +2239,14 @@ static void task_numa_placement(struct task_struct *p) * nid and priv in a specific region because it * is at the beginning of the numa_faults array. */ - p->numa_group->faults[mem_idx] += diff; - p->numa_group->faults_cpu[mem_idx] += f_diff; - p->numa_group->total_faults += diff; - group_faults += p->numa_group->faults[mem_idx]; + ng->faults[mem_idx] += diff; + ng->faults_cpu[mem_idx] += f_diff; + ng->total_faults += diff; + group_faults += ng->faults[mem_idx]; } } - if (!p->numa_group) { + if (!ng) { if (faults > max_faults) { max_faults = faults; max_nid = nid; @@ -2220,8 +2257,8 @@ static void task_numa_placement(struct task_struct *p) } } - if (p->numa_group) { - numa_group_count_active_nodes(p->numa_group); + if (ng) { + numa_group_count_active_nodes(ng); spin_unlock_irq(group_lock); max_nid = preferred_group_nid(p, max_nid); } @@ -2255,7 +2292,7 @@ static void task_numa_group(struct task_struct *p, int cpupid, int flags, int cpu = cpupid_to_cpu(cpupid); int i; - if (unlikely(!p->numa_group)) { + if (unlikely(!deref_curr_numa_group(p))) { unsigned int size = sizeof(struct numa_group) + 4*nr_node_ids*sizeof(unsigned long); @@ -2291,7 +2328,7 @@ static void task_numa_group(struct task_struct *p, int cpupid, int flags, if (!grp) goto no_join; - my_grp = p->numa_group; + my_grp = deref_curr_numa_group(p); if (grp == my_grp) goto no_join; @@ -2353,13 +2390,24 @@ no_join: return; } -void task_numa_free(struct task_struct *p) +/* + * Get rid of NUMA staticstics associated with a task (either current or dead). + * If @final is set, the task is dead and has reached refcount zero, so we can + * safely free all relevant data structures. Otherwise, there might be + * concurrent reads from places like load balancing and procfs, and we should + * reset the data back to default state without freeing ->numa_faults. + */ +void task_numa_free(struct task_struct *p, bool final) { - struct numa_group *grp = p->numa_group; - void *numa_faults = p->numa_faults; + /* safe: p either is current or is being freed by current */ + struct numa_group *grp = rcu_dereference_raw(p->numa_group); + unsigned long *numa_faults = p->numa_faults; unsigned long flags; int i; + if (!numa_faults) + return; + if (grp) { spin_lock_irqsave(&grp->lock, flags); for (i = 0; i < NR_NUMA_HINT_FAULT_STATS * nr_node_ids; i++) @@ -2372,8 +2420,14 @@ void task_numa_free(struct task_struct *p) put_numa_group(grp); } - p->numa_faults = NULL; - kfree(numa_faults); + if (final) { + p->numa_faults = NULL; + kfree(numa_faults); + } else { + p->total_numa_faults = 0; + for (i = 0; i < NR_NUMA_HINT_FAULT_STATS * nr_node_ids; i++) + numa_faults[i] = 0; + } } /* @@ -2426,7 +2480,7 @@ void task_numa_fault(int last_cpupid, int mem_node, int pages, int flags) * actively using should be counted as local. This allows the * scan rate to slow down when a workload has settled down. */ - ng = p->numa_group; + ng = deref_curr_numa_group(p); if (!priv && !local && ng && ng->active_nodes > 1 && numa_is_active_node(cpu_node, ng) && numa_is_active_node(mem_node, ng)) @@ -10444,18 +10498,22 @@ void show_numa_stats(struct task_struct *p, struct seq_file *m) { int node; unsigned long tsf = 0, tpf = 0, gsf = 0, gpf = 0; + struct numa_group *ng; + rcu_read_lock(); + ng = rcu_dereference(p->numa_group); for_each_online_node(node) { if (p->numa_faults) { tsf = p->numa_faults[task_faults_idx(NUMA_MEM, node, 0)]; tpf = p->numa_faults[task_faults_idx(NUMA_MEM, node, 1)]; } - if (p->numa_group) { - gsf = p->numa_group->faults[task_faults_idx(NUMA_MEM, node, 0)], - gpf = p->numa_group->faults[task_faults_idx(NUMA_MEM, node, 1)]; + if (ng) { + gsf = ng->faults[task_faults_idx(NUMA_MEM, node, 0)], + gpf = ng->faults[task_faults_idx(NUMA_MEM, node, 1)]; } print_numa_stats(m, node, tsf, tpf, gsf, gpf); } + rcu_read_unlock(); } #endif /* CONFIG_NUMA_BALANCING */ #endif /* CONFIG_SCHED_DEBUG */ |