summaryrefslogtreecommitdiff
diff options
context:
space:
mode:
-rw-r--r--Documentation/admin-guide/kernel-parameters.txt68
-rw-r--r--fs/btrfs/extent_io.c2
-rw-r--r--include/linux/rculist.h2
-rw-r--r--include/linux/rcupdate.h53
-rw-r--r--include/linux/rcupdate_trace.h4
-rw-r--r--include/linux/rcutiny.h20
-rw-r--r--include/linux/rcutree.h2
-rw-r--r--include/linux/torture.h5
-rw-r--r--include/trace/events/rcu.h19
-rw-r--r--kernel/locking/lockdep.c4
-rw-r--r--kernel/locking/locktorture.c14
-rw-r--r--kernel/rcu/Kconfig.debug19
-rw-r--r--kernel/rcu/Makefile1
-rw-r--r--kernel/rcu/rcuperf.c25
-rw-r--r--kernel/rcu/rcutorture.c117
-rw-r--r--kernel/rcu/refscale.c717
-rw-r--r--kernel/rcu/srcutree.c16
-rw-r--r--kernel/rcu/tasks.h37
-rw-r--r--kernel/rcu/tiny.c7
-rw-r--r--kernel/rcu/tree.c407
-rw-r--r--kernel/rcu/tree.h15
-rw-r--r--kernel/rcu/tree_exp.h2
-rw-r--r--kernel/rcu/tree_plugin.h4
-rw-r--r--kernel/rcu/tree_stall.h5
-rw-r--r--kernel/rcu/update.c16
-rw-r--r--kernel/time/tick-sched.c22
-rw-r--r--kernel/torture.c6
-rw-r--r--lib/test_vmalloc.c103
-rw-r--r--mm/list_lru.c6
-rw-r--r--mm/mmap.c1
-rwxr-xr-xtools/testing/selftests/rcutorture/bin/configinit.sh4
-rwxr-xr-xtools/testing/selftests/rcutorture/bin/console-badness.sh16
-rw-r--r--tools/testing/selftests/rcutorture/bin/functions.sh23
-rwxr-xr-xtools/testing/selftests/rcutorture/bin/jitter.sh6
-rwxr-xr-xtools/testing/selftests/rcutorture/bin/kvm-build.sh6
-rwxr-xr-xtools/testing/selftests/rcutorture/bin/kvm-check-branches.sh108
-rwxr-xr-xtools/testing/selftests/rcutorture/bin/kvm-recheck-refscale.sh71
-rwxr-xr-xtools/testing/selftests/rcutorture/bin/kvm-recheck.sh20
-rwxr-xr-xtools/testing/selftests/rcutorture/bin/kvm-test-1-run.sh27
-rwxr-xr-xtools/testing/selftests/rcutorture/bin/kvm-transform.sh51
-rwxr-xr-xtools/testing/selftests/rcutorture/bin/kvm.sh19
-rwxr-xr-xtools/testing/selftests/rcutorture/bin/parse-console.sh27
-rw-r--r--tools/testing/selftests/rcutorture/configs/refscale/CFLIST2
-rw-r--r--tools/testing/selftests/rcutorture/configs/refscale/CFcommon2
-rw-r--r--tools/testing/selftests/rcutorture/configs/refscale/NOPREEMPT18
-rw-r--r--tools/testing/selftests/rcutorture/configs/refscale/PREEMPT18
-rw-r--r--tools/testing/selftests/rcutorture/configs/refscale/ver_functions.sh16
47 files changed, 1886 insertions, 267 deletions
diff --git a/Documentation/admin-guide/kernel-parameters.txt b/Documentation/admin-guide/kernel-parameters.txt
index fb95fad81c79..d35fd3ced0db 100644
--- a/Documentation/admin-guide/kernel-parameters.txt
+++ b/Documentation/admin-guide/kernel-parameters.txt
@@ -4038,6 +4038,14 @@
latencies, which will choose a value aligned
with the appropriate hardware boundaries.
+ rcutree.rcu_min_cached_objs= [KNL]
+ Minimum number of objects which are cached and
+ maintained per one CPU. Object size is equal
+ to PAGE_SIZE. The cache allows to reduce the
+ pressure to page allocator, also it makes the
+ whole algorithm to behave better in low memory
+ condition.
+
rcutree.jiffies_till_first_fqs= [KNL]
Set delay from grace-period initialization to
first attempt to force quiescent states.
@@ -4258,6 +4266,20 @@
Set time (jiffies) between CPU-hotplug operations,
or zero to disable CPU-hotplug testing.
+ rcutorture.read_exit= [KNL]
+ Set the number of read-then-exit kthreads used
+ to test the interaction of RCU updaters and
+ task-exit processing.
+
+ rcutorture.read_exit_burst= [KNL]
+ The number of times in a given read-then-exit
+ episode that a set of read-then-exit kthreads
+ is spawned.
+
+ rcutorture.read_exit_delay= [KNL]
+ The delay, in seconds, between successive
+ read-then-exit testing episodes.
+
rcutorture.shuffle_interval= [KNL]
Set task-shuffle interval (s). Shuffling tasks
allows some CPUs to go into dyntick-idle mode
@@ -4407,6 +4429,45 @@
reboot_cpu is s[mp]#### with #### being the processor
to be used for rebooting.
+ refscale.holdoff= [KNL]
+ Set test-start holdoff period. The purpose of
+ this parameter is to delay the start of the
+ test until boot completes in order to avoid
+ interference.
+
+ refscale.loops= [KNL]
+ Set the number of loops over the synchronization
+ primitive under test. Increasing this number
+ reduces noise due to loop start/end overhead,
+ but the default has already reduced the per-pass
+ noise to a handful of picoseconds on ca. 2020
+ x86 laptops.
+
+ refscale.nreaders= [KNL]
+ Set number of readers. The default value of -1
+ selects N, where N is roughly 75% of the number
+ of CPUs. A value of zero is an interesting choice.
+
+ refscale.nruns= [KNL]
+ Set number of runs, each of which is dumped onto
+ the console log.
+
+ refscale.readdelay= [KNL]
+ Set the read-side critical-section duration,
+ measured in microseconds.
+
+ refscale.scale_type= [KNL]
+ Specify the read-protection implementation to test.
+
+ refscale.shutdown= [KNL]
+ Shut down the system at the end of the performance
+ test. This defaults to 1 (shut it down) when
+ rcuperf is built into the kernel and to 0 (leave
+ it running) when rcuperf is built as a module.
+
+ refscale.verbose= [KNL]
+ Enable additional printk() statements.
+
relax_domain_level=
[KNL, SMP] Set scheduler's default relax_domain_level.
See Documentation/admin-guide/cgroup-v1/cpusets.rst.
@@ -5082,6 +5143,13 @@
Prevent the CPU-hotplug component of torturing
until after init has spawned.
+ torture.ftrace_dump_at_shutdown= [KNL]
+ Dump the ftrace buffer at torture-test shutdown,
+ even if there were no errors. This can be a
+ very costly operation when many torture tests
+ are running concurrently, especially on systems
+ with rotating-rust storage.
+
tp720= [HW,PS2]
tpm_suspend_pcr=[HW,TPM]
diff --git a/fs/btrfs/extent_io.c b/fs/btrfs/extent_io.c
index 68c96057ad2d..704239546093 100644
--- a/fs/btrfs/extent_io.c
+++ b/fs/btrfs/extent_io.c
@@ -4515,6 +4515,8 @@ int try_release_extent_mapping(struct page *page, gfp_t mask)
/* once for us */
free_extent_map(em);
+
+ cond_resched(); /* Allow large-extent preemption. */
}
}
return try_release_extent_state(tree, page, mask);
diff --git a/include/linux/rculist.h b/include/linux/rculist.h
index df587d181844..7eed65b5f713 100644
--- a/include/linux/rculist.h
+++ b/include/linux/rculist.h
@@ -512,7 +512,7 @@ static inline void hlist_replace_rcu(struct hlist_node *old,
* @right: The hlist head on the right
*
* The lists start out as [@left ][node1 ... ] and
- [@right ][node2 ... ]
+ * [@right ][node2 ... ]
* The lists end up as [@left ][node2 ... ]
* [@right ][node1 ... ]
*/
diff --git a/include/linux/rcupdate.h b/include/linux/rcupdate.h
index 659cbfa7581a..d15d46db61f7 100644
--- a/include/linux/rcupdate.h
+++ b/include/linux/rcupdate.h
@@ -828,17 +828,17 @@ static inline notrace void rcu_read_unlock_sched_notrace(void)
/*
* Does the specified offset indicate that the corresponding rcu_head
- * structure can be handled by kfree_rcu()?
+ * structure can be handled by kvfree_rcu()?
*/
-#define __is_kfree_rcu_offset(offset) ((offset) < 4096)
+#define __is_kvfree_rcu_offset(offset) ((offset) < 4096)
/*
* Helper macro for kfree_rcu() to prevent argument-expansion eyestrain.
*/
-#define __kfree_rcu(head, offset) \
+#define __kvfree_rcu(head, offset) \
do { \
- BUILD_BUG_ON(!__is_kfree_rcu_offset(offset)); \
- kfree_call_rcu(head, (rcu_callback_t)(unsigned long)(offset)); \
+ BUILD_BUG_ON(!__is_kvfree_rcu_offset(offset)); \
+ kvfree_call_rcu(head, (rcu_callback_t)(unsigned long)(offset)); \
} while (0)
/**
@@ -857,7 +857,7 @@ static inline notrace void rcu_read_unlock_sched_notrace(void)
* Because the functions are not allowed in the low-order 4096 bytes of
* kernel virtual memory, offsets up to 4095 bytes can be accommodated.
* If the offset is larger than 4095 bytes, a compile-time error will
- * be generated in __kfree_rcu(). If this error is triggered, you can
+ * be generated in __kvfree_rcu(). If this error is triggered, you can
* either fall back to use of call_rcu() or rearrange the structure to
* position the rcu_head structure into the first 4096 bytes.
*
@@ -872,7 +872,46 @@ do { \
typeof (ptr) ___p = (ptr); \
\
if (___p) \
- __kfree_rcu(&((___p)->rhf), offsetof(typeof(*(ptr)), rhf)); \
+ __kvfree_rcu(&((___p)->rhf), offsetof(typeof(*(ptr)), rhf)); \
+} while (0)
+
+/**
+ * kvfree_rcu() - kvfree an object after a grace period.
+ *
+ * This macro consists of one or two arguments and it is
+ * based on whether an object is head-less or not. If it
+ * has a head then a semantic stays the same as it used
+ * to be before:
+ *
+ * kvfree_rcu(ptr, rhf);
+ *
+ * where @ptr is a pointer to kvfree(), @rhf is the name
+ * of the rcu_head structure within the type of @ptr.
+ *
+ * When it comes to head-less variant, only one argument
+ * is passed and that is just a pointer which has to be
+ * freed after a grace period. Therefore the semantic is
+ *
+ * kvfree_rcu(ptr);
+ *
+ * where @ptr is a pointer to kvfree().
+ *
+ * Please note, head-less way of freeing is permitted to
+ * use from a context that has to follow might_sleep()
+ * annotation. Otherwise, please switch and embed the
+ * rcu_head structure within the type of @ptr.
+ */
+#define kvfree_rcu(...) KVFREE_GET_MACRO(__VA_ARGS__, \
+ kvfree_rcu_arg_2, kvfree_rcu_arg_1)(__VA_ARGS__)
+
+#define KVFREE_GET_MACRO(_1, _2, NAME, ...) NAME
+#define kvfree_rcu_arg_2(ptr, rhf) kfree_rcu(ptr, rhf)
+#define kvfree_rcu_arg_1(ptr) \
+do { \
+ typeof(ptr) ___p = (ptr); \
+ \
+ if (___p) \
+ kvfree_call_rcu(NULL, (rcu_callback_t) (___p)); \
} while (0)
/*
diff --git a/include/linux/rcupdate_trace.h b/include/linux/rcupdate_trace.h
index 4c25a41f8b27..d9015aac78c6 100644
--- a/include/linux/rcupdate_trace.h
+++ b/include/linux/rcupdate_trace.h
@@ -36,8 +36,8 @@ void rcu_read_unlock_trace_special(struct task_struct *t, int nesting);
/**
* rcu_read_lock_trace - mark beginning of RCU-trace read-side critical section
*
- * When synchronize_rcu_trace() is invoked by one task, then that task
- * is guaranteed to block until all other tasks exit their read-side
+ * When synchronize_rcu_tasks_trace() is invoked by one task, then that
+ * task is guaranteed to block until all other tasks exit their read-side
* critical sections. Similarly, if call_rcu_trace() is invoked on one
* task while other tasks are within RCU read-side critical sections,
* invocation of the corresponding RCU callback is deferred until after
diff --git a/include/linux/rcutiny.h b/include/linux/rcutiny.h
index 8512caeb7682..5cc9637cac16 100644
--- a/include/linux/rcutiny.h
+++ b/include/linux/rcutiny.h
@@ -34,9 +34,25 @@ static inline void synchronize_rcu_expedited(void)
synchronize_rcu();
}
-static inline void kfree_call_rcu(struct rcu_head *head, rcu_callback_t func)
+/*
+ * Add one more declaration of kvfree() here. It is
+ * not so straight forward to just include <linux/mm.h>
+ * where it is defined due to getting many compile
+ * errors caused by that include.
+ */
+extern void kvfree(const void *addr);
+
+static inline void kvfree_call_rcu(struct rcu_head *head, rcu_callback_t func)
{
- call_rcu(head, func);
+ if (head) {
+ call_rcu(head, func);
+ return;
+ }
+
+ // kvfree_rcu(one_arg) call.
+ might_sleep();
+ synchronize_rcu();
+ kvfree((void *) func);
}
void rcu_qs(void);
diff --git a/include/linux/rcutree.h b/include/linux/rcutree.h
index d5cc9d675987..d2f4064ebd1d 100644
--- a/include/linux/rcutree.h
+++ b/include/linux/rcutree.h
@@ -33,7 +33,7 @@ static inline void rcu_virt_note_context_switch(int cpu)
}
void synchronize_rcu_expedited(void);
-void kfree_call_rcu(struct rcu_head *head, rcu_callback_t func);
+void kvfree_call_rcu(struct rcu_head *head, rcu_callback_t func);
void rcu_barrier(void);
bool rcu_eqs_special_set(int cpu);
diff --git a/include/linux/torture.h b/include/linux/torture.h
index 629b66e6c161..7f65bd1dd307 100644
--- a/include/linux/torture.h
+++ b/include/linux/torture.h
@@ -55,6 +55,11 @@ struct torture_random_state {
#define DEFINE_TORTURE_RANDOM_PERCPU(name) \
DEFINE_PER_CPU(struct torture_random_state, name)
unsigned long torture_random(struct torture_random_state *trsp);
+static inline void torture_random_init(struct torture_random_state *trsp)
+{
+ trsp->trs_state = 0;
+ trsp->trs_count = 0;
+}
/* Task shuffler, which causes CPUs to occasionally go idle. */
void torture_shuffle_task_register(struct task_struct *tp);
diff --git a/include/trace/events/rcu.h b/include/trace/events/rcu.h
index f9a7811148e2..ced71237b7e4 100644
--- a/include/trace/events/rcu.h
+++ b/include/trace/events/rcu.h
@@ -435,11 +435,12 @@ TRACE_EVENT_RCU(rcu_fqs,
#endif /* #if defined(CONFIG_TREE_RCU) */
/*
- * Tracepoint for dyntick-idle entry/exit events. These take a string
- * as argument: "Start" for entering dyntick-idle mode, "Startirq" for
- * entering it from irq/NMI, "End" for leaving it, "Endirq" for leaving it
- * to irq/NMI, "--=" for events moving towards idle, and "++=" for events
- * moving away from idle.
+ * Tracepoint for dyntick-idle entry/exit events. These take 2 strings
+ * as argument:
+ * polarity: "Start", "End", "StillNonIdle" for entering, exiting or still not
+ * being in dyntick-idle mode.
+ * context: "USER" or "IDLE" or "IRQ".
+ * NMIs nested in IRQs are inferred with dynticks_nesting > 1 in IRQ context.
*
* These events also take a pair of numbers, which indicate the nesting
* depth before and after the event of interest, and a third number that is
@@ -506,13 +507,13 @@ TRACE_EVENT_RCU(rcu_callback,
/*
* Tracepoint for the registration of a single RCU callback of the special
- * kfree() form. The first argument is the RCU type, the second argument
+ * kvfree() form. The first argument is the RCU type, the second argument
* is a pointer to the RCU callback, the third argument is the offset
* of the callback within the enclosing RCU-protected data structure,
* the fourth argument is the number of lazy callbacks queued, and the
* fifth argument is the total number of callbacks queued.
*/
-TRACE_EVENT_RCU(rcu_kfree_callback,
+TRACE_EVENT_RCU(rcu_kvfree_callback,
TP_PROTO(const char *rcuname, struct rcu_head *rhp, unsigned long offset,
long qlen),
@@ -596,12 +597,12 @@ TRACE_EVENT_RCU(rcu_invoke_callback,
/*
* Tracepoint for the invocation of a single RCU callback of the special
- * kfree() form. The first argument is the RCU flavor, the second
+ * kvfree() form. The first argument is the RCU flavor, the second
* argument is a pointer to the RCU callback, and the third argument
* is the offset of the callback within the enclosing RCU-protected
* data structure.
*/
-TRACE_EVENT_RCU(rcu_invoke_kfree_callback,
+TRACE_EVENT_RCU(rcu_invoke_kvfree_callback,
TP_PROTO(const char *rcuname, struct rcu_head *rhp, unsigned long offset),
diff --git a/kernel/locking/lockdep.c b/kernel/locking/lockdep.c
index 29a8de4c50b9..0a7549d159ed 100644
--- a/kernel/locking/lockdep.c
+++ b/kernel/locking/lockdep.c
@@ -5851,9 +5851,7 @@ void lockdep_rcu_suspicious(const char *file, const int line, const char *s)
pr_warn("\n%srcu_scheduler_active = %d, debug_locks = %d\n",
!rcu_lockdep_current_cpu_online()
? "RCU used illegally from offline CPU!\n"
- : !rcu_is_watching()
- ? "RCU used illegally from idle CPU!\n"
- : "",
+ : "",
rcu_scheduler_active, debug_locks);
/*
diff --git a/kernel/locking/locktorture.c b/kernel/locking/locktorture.c
index 5efbfc68ce99..8ff6f50e06a0 100644
--- a/kernel/locking/locktorture.c
+++ b/kernel/locking/locktorture.c
@@ -631,13 +631,13 @@ static int lock_torture_writer(void *arg)
cxt.cur_ops->writelock();
if (WARN_ON_ONCE(lock_is_write_held))
lwsp->n_lock_fail++;
- lock_is_write_held = 1;
+ lock_is_write_held = true;
if (WARN_ON_ONCE(lock_is_read_held))
lwsp->n_lock_fail++; /* rare, but... */
lwsp->n_lock_acquired++;
cxt.cur_ops->write_delay(&rand);
- lock_is_write_held = 0;
+ lock_is_write_held = false;
cxt.cur_ops->writeunlock();
stutter_wait("lock_torture_writer");
@@ -665,13 +665,13 @@ static int lock_torture_reader(void *arg)
schedule_timeout_uninterruptible(1);
cxt.cur_ops->readlock();
- lock_is_read_held = 1;
+ lock_is_read_held = true;
if (WARN_ON_ONCE(lock_is_write_held))
lrsp->n_lock_fail++; /* rare, but... */
lrsp->n_lock_acquired++;
cxt.cur_ops->read_delay(&rand);
- lock_is_read_held = 0;
+ lock_is_read_held = false;
cxt.cur_ops->readunlock();
stutter_wait("lock_torture_reader");
@@ -686,7 +686,7 @@ static int lock_torture_reader(void *arg)
static void __torture_print_stats(char *page,
struct lock_stress_stats *statp, bool write)
{
- bool fail = 0;
+ bool fail = false;
int i, n_stress;
long max = 0, min = statp ? statp[0].n_lock_acquired : 0;
long long sum = 0;
@@ -904,7 +904,7 @@ static int __init lock_torture_init(void)
/* Initialize the statistics so that each run gets its own numbers. */
if (nwriters_stress) {
- lock_is_write_held = 0;
+ lock_is_write_held = false;
cxt.lwsa = kmalloc_array(cxt.nrealwriters_stress,
sizeof(*cxt.lwsa),
GFP_KERNEL);
@@ -935,7 +935,7 @@ static int __init lock_torture_init(void)
}
if (nreaders_stress) {
- lock_is_read_held = 0;
+ lock_is_read_held = false;
cxt.lrsa = kmalloc_array(cxt.nrealreaders_stress,
sizeof(*cxt.lrsa),
GFP_KERNEL);
diff --git a/kernel/rcu/Kconfig.debug b/kernel/rcu/Kconfig.debug
index 452feae8de20..3cf6132a4bb9 100644
--- a/kernel/rcu/Kconfig.debug
+++ b/kernel/rcu/Kconfig.debug
@@ -61,6 +61,25 @@ config RCU_TORTURE_TEST
Say M if you want the RCU torture tests to build as a module.
Say N if you are unsure.
+config RCU_REF_SCALE_TEST
+ tristate "Scalability tests for read-side synchronization (RCU and others)"
+ depends on DEBUG_KERNEL
+ select TORTURE_TEST
+ select SRCU
+ select TASKS_RCU
+ select TASKS_RUDE_RCU
+ select TASKS_TRACE_RCU
+ default n
+ help
+ This option provides a kernel module that runs performance tests
+ useful comparing RCU with various read-side synchronization mechanisms.
+ The kernel module may be built after the fact on the running kernel to be
+ tested, if desired.
+
+ Say Y here if you want these performance tests built into the kernel.
+ Say M if you want to build it as a module instead.
+ Say N if you are unsure.
+
config RCU_CPU_STALL_TIMEOUT
int "RCU CPU stall timeout in seconds"
depends on RCU_STALL_COMMON
diff --git a/kernel/rcu/Makefile b/kernel/rcu/Makefile
index f91f2c2cf138..95f5117ef8da 100644
--- a/kernel/rcu/Makefile
+++ b/kernel/rcu/Makefile
@@ -12,6 +12,7 @@ obj-$(CONFIG_TREE_SRCU) += srcutree.o
obj-$(CONFIG_TINY_SRCU) += srcutiny.o
obj-$(CONFIG_RCU_TORTURE_TEST) += rcutorture.o
obj-$(CONFIG_RCU_PERF_TEST) += rcuperf.o
+obj-$(CONFIG_RCU_REF_SCALE_TEST) += refscale.o
obj-$(CONFIG_TREE_RCU) += tree.o
obj-$(CONFIG_TINY_RCU) += tiny.o
obj-$(CONFIG_RCU_NEED_SEGCBLIST) += rcu_segcblist.o
diff --git a/kernel/rcu/rcuperf.c b/kernel/rcu/rcuperf.c
index 16dd1e6b7c09..d906ca987936 100644
--- a/kernel/rcu/rcuperf.c
+++ b/kernel/rcu/rcuperf.c
@@ -69,6 +69,11 @@ MODULE_AUTHOR("Paul E. McKenney <paulmck@linux.ibm.com>");
* value specified by nr_cpus for a read-only test.
*
* Various other use cases may of course be specified.
+ *
+ * Note that this test's readers are intended only as a test load for
+ * the writers. The reader performance statistics will be overly
+ * pessimistic due to the per-critical-section interrupt disabling,
+ * test-end checks, and the pair of calls through pointers.
*/
#ifdef MODULE
@@ -309,8 +314,10 @@ static void rcu_perf_wait_shutdown(void)
}
/*
- * RCU perf reader kthread. Repeatedly does empty RCU read-side
- * critical section, minimizing update-side interference.
+ * RCU perf reader kthread. Repeatedly does empty RCU read-side critical
+ * section, minimizing update-side interference. However, the point of
+ * this test is not to evaluate reader performance, but instead to serve
+ * as a test load for update-side performance testing.
*/
static int
rcu_perf_reader(void *arg)
@@ -576,11 +583,8 @@ static int compute_real(int n)
static int
rcu_perf_shutdown(void *arg)
{
- do {
- wait_event(shutdown_wq,
- atomic_read(&n_rcu_perf_writer_finished) >=
- nrealwriters);
- } while (atomic_read(&n_rcu_perf_writer_finished) < nrealwriters);
+ wait_event(shutdown_wq,
+ atomic_read(&n_rcu_perf_writer_finished) >= nrealwriters);
smp_mb(); /* Wake before output. */
rcu_perf_cleanup();
kernel_power_off();
@@ -693,11 +697,8 @@ kfree_perf_cleanup(void)
static int
kfree_perf_shutdown(void *arg)
{
- do {
- wait_event(shutdown_wq,
- atomic_read(&n_kfree_perf_thread_ended) >=
- kfree_nrealthreads);
- } while (atomic_read(&n_kfree_perf_thread_ended) < kfree_nrealthreads);
+ wait_event(shutdown_wq,
+ atomic_read(&n_kfree_perf_thread_ended) >= kfree_nrealthreads);
smp_mb(); /* Wake before output. */
diff --git a/kernel/rcu/rcutorture.c b/kernel/rcu/rcutorture.c
index 8205295fc33e..d0d265304d14 100644
--- a/kernel/rcu/rcutorture.c
+++ b/kernel/rcu/rcutorture.c
@@ -109,6 +109,10 @@ torture_param(int, object_debug, 0,
torture_param(int, onoff_holdoff, 0, "Time after boot before CPU hotplugs (s)");
torture_param(int, onoff_interval, 0,
"Time between CPU hotplugs (jiffies), 0=disable");
+torture_param(int, read_exit_delay, 13,
+ "Delay between read-then-exit episodes (s)");
+torture_param(int, read_exit_burst, 16,
+ "# of read-then-exit bursts per episode, zero to disable");
torture_param(int, shuffle_interval, 3, "Number of seconds between shuffles");
torture_param(int, shutdown_secs, 0, "Shutdown time (s), <= zero to disable.");
torture_param(int, stall_cpu, 0, "Stall duration (s), zero to disable.");
@@ -146,6 +150,7 @@ static struct task_struct *stall_task;
static struct task_struct *fwd_prog_task;
static struct task_struct **barrier_cbs_tasks;
static struct task_struct *barrier_task;
+static struct task_struct *read_exit_task;
#define RCU_TORTURE_PIPE_LEN 10
@@ -177,6 +182,7 @@ static long n_rcu_torture_boosts;
static atomic_long_t n_rcu_torture_timers;
static long n_barrier_attempts;
static long n_barrier_successes; /* did rcu_barrier test succeed? */
+static unsigned long n_read_exits;
static struct list_head rcu_torture_removed;
static unsigned long shutdown_jiffies;
@@ -1166,6 +1172,7 @@ rcu_torture_writer(void *arg)
WARN(1, "%s: rtort_pipe_count: %d\n", __func__, rcu_tortures[i].rtort_pipe_count);
}
} while (!torture_must_stop());
+ rcu_torture_current = NULL; // Let stats task know that we are done.
/* Reset expediting back to unexpedited. */
if (expediting > 0)
expediting = -expediting;
@@ -1370,6 +1377,7 @@ static bool rcu_torture_one_read(struct torture_random_state *trsp)
struct rt_read_seg *rtrsp1;
unsigned long long ts;
+ WARN_ON_ONCE(!rcu_is_watching());
newstate = rcutorture_extend_mask(readstate, trsp);
rcutorture_one_extend(&readstate, newstate, trsp, rtrsp++);
started = cur_ops->get_gp_seq();
@@ -1539,10 +1547,11 @@ rcu_torture_stats_print(void)
n_rcu_torture_boosts,
atomic_long_read(&n_rcu_torture_timers));
torture_onoff_stats();
- pr_cont("barrier: %ld/%ld:%ld\n",
+ pr_cont("barrier: %ld/%ld:%ld ",
data_race(n_barrier_successes),
data_race(n_barrier_attempts),
data_race(n_rcu_torture_barrier_error));
+ pr_cont("read-exits: %ld\n", data_race(n_read_exits));
pr_alert("%s%s ", torture_type, TORTURE_FLAG);
if (atomic_read(&n_rcu_torture_mberror) ||
@@ -1634,7 +1643,8 @@ rcu_torture_print_module_parms(struct rcu_torture_ops *cur_ops, const char *tag)
"stall_cpu=%d stall_cpu_holdoff=%d stall_cpu_irqsoff=%d "
"stall_cpu_block=%d "
"n_barrier_cbs=%d "
- "onoff_interval=%d onoff_holdoff=%d\n",
+ "onoff_interval=%d onoff_holdoff=%d "
+ "read_exit_delay=%d read_exit_burst=%d\n",
torture_type, tag, nrealreaders, nfakewriters,
stat_interval, verbose, test_no_idle_hz, shuffle_interval,
stutter, irqreader, fqs_duration, fqs_holdoff, fqs_stutter,
@@ -1643,7 +1653,8 @@ rcu_torture_print_module_parms(struct rcu_torture_ops *cur_ops, const char *tag)
stall_cpu, stall_cpu_holdoff, stall_cpu_irqsoff,
stall_cpu_block,
n_barrier_cbs,
- onoff_interval, onoff_holdoff);
+ onoff_interval, onoff_holdoff,
+ read_exit_delay, read_exit_burst);
}
static int rcutorture_booster_cleanup(unsigned int cpu)
@@ -2175,7 +2186,7 @@ static void rcu_torture_barrier1cb(void *rcu_void)
static int rcu_torture_barrier_cbs(void *arg)
{
long myid = (long)arg;
- bool lastphase = 0;
+ bool lastphase = false;
bool newphase;
struct rcu_head rcu;
@@ -2338,6 +2349,99 @@ static bool rcu_torture_can_boost(void)
return true;
}
+static bool read_exit_child_stop;
+static bool read_exit_child_stopped;
+static wait_queue_head_t read_exit_wq;
+
+// Child kthread which just does an rcutorture reader and exits.
+static int rcu_torture_read_exit_child(void *trsp_in)
+{
+ struct torture_random_state *trsp = trsp_in;
+
+ set_user_nice(current, MAX_NICE);
+ // Minimize time between reading and exiting.
+ while (!kthread_should_stop())
+ schedule_timeout_uninterruptible(1);
+ (void)rcu_torture_one_read(trsp);
+ return 0;
+}
+
+// Parent kthread which creates and destroys read-exit child kthreads.
+static int rcu_torture_read_exit(void *unused)
+{
+ int count = 0;
+ bool errexit = false;
+ int i;
+ struct task_struct *tsp;
+ DEFINE_TORTURE_RANDOM(trs);
+
+ // Allocate and initialize.
+ set_user_nice(current, MAX_NICE);
+ VERBOSE_TOROUT_STRING("rcu_torture_read_exit: Start of test");
+
+ // Each pass through this loop does one read-exit episode.
+ do {
+ if (++count > read_exit_burst) {
+ VERBOSE_TOROUT_STRING("rcu_torture_read_exit: End of episode");
+ rcu_barrier(); // Wait for task_struct free, avoid OOM.
+ for (i = 0; i < read_exit_delay; i++) {
+ schedule_timeout_uninterruptible(HZ);
+ if (READ_ONCE(read_exit_child_stop))
+ break;
+ }
+ if (!READ_ONCE(read_exit_child_stop))
+ VERBOSE_TOROUT_STRING("rcu_torture_read_exit: Start of episode");
+ count = 0;
+ }
+ if (READ_ONCE(read_exit_child_stop))
+ break;
+ // Spawn child.
+ tsp = kthread_run(rcu_torture_read_exit_child,
+ &trs, "%s",
+ "rcu_torture_read_exit_child");
+ if (IS_ERR(tsp)) {
+ VERBOSE_TOROUT_ERRSTRING("out of memory");
+ errexit = true;
+ tsp = NULL;
+ break;
+ }
+ cond_resched();
+ kthread_stop(tsp);
+ n_read_exits ++;
+ stutter_wait("rcu_torture_read_exit");
+ } while (!errexit && !READ_ONCE(read_exit_child_stop));
+
+ // Clean up and exit.
+ smp_store_release(&read_exit_child_stopped, true); // After reaping.
+ smp_mb(); // Store before wakeup.
+ wake_up(&read_exit_wq);
+ while (!torture_must_stop())
+ schedule_timeout_uninterruptible(1);
+ torture_kthread_stopping("rcu_torture_read_exit");
+ return 0;
+}
+
+static int rcu_torture_read_exit_init(void)
+{
+ if (read_exit_burst <= 0)
+ return -EINVAL;
+ init_waitqueue_head(&read_exit_wq);
+ read_exit_child_stop = false;
+ read_exit_child_stopped = false;
+ return torture_create_kthread(rcu_torture_read_exit, NULL,
+ read_exit_task);
+}
+
+static void rcu_torture_read_exit_cleanup(void)
+{
+ if (!read_exit_task)
+ return;
+ WRITE_ONCE(read_exit_child_stop, true);
+ smp_mb(); // Above write before wait.
+ wait_event(read_exit_wq, smp_load_acquire(&read_exit_child_stopped));
+ torture_stop_kthread(rcutorture_read_exit, read_exit_task);
+}
+
static enum cpuhp_state rcutor_hp;
static void
@@ -2359,6 +2463,7 @@ rcu_torture_cleanup(void)
}
show_rcu_gp_kthreads();
+ rcu_torture_read_exit_cleanup();
rcu_torture_barrier_cleanup();
torture_stop_kthread(rcu_torture_fwd_prog, fwd_prog_task);
torture_stop_kthread(rcu_torture_stall, stall_task);
@@ -2370,7 +2475,6 @@ rcu_torture_cleanup(void)
reader_tasks[i]);
kfree(reader_tasks);
}
- rcu_torture_current = NULL;
if (fakewriter_tasks) {
for (i = 0; i < nfakewriters; i++) {
@@ -2682,6 +2786,9 @@ rcu_torture_init(void)
firsterr = rcu_torture_barrier_init();
if (firsterr)
goto unwind;
+ firsterr = rcu_torture_read_exit_init();
+ if (firsterr)
+ goto unwind;
if (object_debug)
rcu_test_debug_objects();
torture_init_end();
diff --git a/kernel/rcu/refscale.c b/kernel/rcu/refscale.c
new file mode 100644
index 000000000000..d9291f883b54
--- /dev/null
+++ b/kernel/rcu/refscale.c
@@ -0,0 +1,717 @@
+// SPDX-License-Identifier: GPL-2.0+
+//
+// Scalability test comparing RCU vs other mechanisms
+// for acquiring references on objects.
+//
+// Copyright (C) Google, 2020.
+//
+// Author: Joel Fernandes <joel@joelfernandes.org>
+
+#define pr_fmt(fmt) fmt
+
+#include <linux/atomic.h>
+#include <linux/bitops.h>
+#include <linux/completion.h>
+#include <linux/cpu.h>
+#include <linux/delay.h>
+#include <linux/err.h>
+#include <linux/init.h>
+#include <linux/interrupt.h>
+#include <linux/kthread.h>
+#include <linux/kernel.h>
+#include <linux/mm.h>
+#include <linux/module.h>
+#include <linux/moduleparam.h>
+#include <linux/notifier.h>
+#include <linux/percpu.h>
+#include <linux/rcupdate.h>
+#include <linux/rcupdate_trace.h>
+#include <linux/reboot.h>
+#include <linux/sched.h>
+#include <linux/spinlock.h>
+#include <linux/smp.h>
+#include <linux/stat.h>
+#include <linux/srcu.h>
+#include <linux/slab.h>
+#include <linux/torture.h>
+#include <linux/types.h>
+
+#include "rcu.h"
+
+#define SCALE_FLAG "-ref-scale: "
+
+#define SCALEOUT(s, x...) \
+ pr_alert("%s" SCALE_FLAG s, scale_type, ## x)
+
+#define VERBOSE_SCALEOUT(s, x...) \
+ do { if (verbose) pr_alert("%s" SCALE_FLAG s, scale_type, ## x); } while (0)
+
+#define VERBOSE_SCALEOUT_ERRSTRING(s, x...) \
+ do { if (verbose) pr_alert("%s" SCALE_FLAG "!!! " s, scale_type, ## x); } while (0)
+
+MODULE_LICENSE("GPL");
+MODULE_AUTHOR("Joel Fernandes (Google) <joel@joelfernandes.org>");
+
+static char *scale_type = "rcu";
+module_param(scale_type, charp, 0444);
+MODULE_PARM_DESC(scale_type, "Type of test (rcu, srcu, refcnt, rwsem, rwlock.");
+
+torture_param(int, verbose, 0, "Enable verbose debugging printk()s");
+
+// Wait until there are multiple CPUs before starting test.
+torture_param(int, holdoff, IS_BUILTIN(CONFIG_RCU_REF_SCALE_TEST) ? 10 : 0,
+ "Holdoff time before test start (s)");
+// Number of loops per experiment, all readers execute operations concurrently.
+torture_param(long, loops, 10000, "Number of loops per experiment.");
+// Number of readers, with -1 defaulting to about 75% of the CPUs.
+torture_param(int, nreaders, -1, "Number of readers, -1 for 75% of CPUs.");
+// Number of runs.
+torture_param(int, nruns, 30, "Number of experiments to run.");
+// Reader delay in nanoseconds, 0 for no delay.
+torture_param(int, readdelay, 0, "Read-side delay in nanoseconds.");
+
+#ifdef MODULE
+# define REFSCALE_SHUTDOWN 0
+#else
+# define REFSCALE_SHUTDOWN 1
+#endif
+
+torture_param(bool, shutdown, REFSCALE_SHUTDOWN,
+ "Shutdown at end of scalability tests.");
+
+struct reader_task {
+ struct task_struct *task;
+ int start_reader;
+ wait_queue_head_t wq;
+ u64 last_duration_ns;
+};
+
+static struct task_struct *shutdown_task;
+static wait_queue_head_t shutdown_wq;
+
+static struct task_struct *main_task;
+static wait_queue_head_t main_wq;
+static int shutdown_start;
+
+static struct reader_task *reader_tasks;
+
+// Number of readers that are part of the current experiment.
+static atomic_t nreaders_exp;
+
+// Use to wait for all threads to start.
+static atomic_t n_init;
+static atomic_t n_started;
+static atomic_t n_warmedup;
+static atomic_t n_cooleddown;
+
+// Track which experiment is currently running.
+static int exp_idx;
+
+// Operations vector for selecting different types of tests.
+struct ref_scale_ops {
+ void (*init)(void);
+ void (*cleanup)(void);
+ void (*readsection)(const int nloops);
+ void (*delaysection)(const int nloops, const int udl, const int ndl);
+ const char *name;
+};
+
+static struct ref_scale_ops *cur_ops;
+
+static void un_delay(const int udl, const int ndl)
+{
+ if (udl)
+ udelay(udl);
+ if (ndl)
+ ndelay(ndl);
+}
+
+static void ref_rcu_read_section(const int nloops)
+{
+ int i;
+
+ for (i = nloops; i >= 0; i--) {
+ rcu_read_lock();
+ rcu_read_unlock();
+ }
+}
+
+static void ref_rcu_delay_section(const int nloops, const int udl, const int ndl)
+{
+ int i;
+
+ for (i = nloops; i >= 0; i--) {
+ rcu_read_lock();
+ un_delay(udl, ndl);
+ rcu_read_unlock();
+ }
+}
+
+static void rcu_sync_scale_init(void)
+{
+}
+
+static struct ref_scale_ops rcu_ops = {
+ .init = rcu_sync_scale_init,
+ .readsection = ref_rcu_read_section,
+ .delaysection = ref_rcu_delay_section,
+ .name = "rcu"
+};
+
+// Definitions for SRCU ref scale testing.
+DEFINE_STATIC_SRCU(srcu_refctl_scale);
+static struct srcu_struct *srcu_ctlp = &srcu_refctl_scale;
+
+static void srcu_ref_scale_read_section(const int nloops)
+{
+ int i;
+ int idx;
+
+ for (i = nloops; i >= 0; i--) {
+ idx = srcu_read_lock(srcu_ctlp);
+ srcu_read_unlock(srcu_ctlp, idx);
+ }
+}
+
+static void srcu_ref_scale_delay_section(const int nloops, const int udl, const int ndl)
+{
+ int i;
+ int idx;
+
+ for (i = nloops; i >= 0; i--) {
+ idx = srcu_read_lock(srcu_ctlp);
+ un_delay(udl, ndl);
+ srcu_read_unlock(srcu_ctlp, idx);
+ }
+}
+
+static struct ref_scale_ops srcu_ops = {
+ .init = rcu_sync_scale_init,
+ .readsection = srcu_ref_scale_read_section,
+ .delaysection = srcu_ref_scale_delay_section,
+ .name = "srcu"
+};
+
+// Definitions for RCU Tasks ref scale testing: Empty read markers.
+// These definitions also work for RCU Rude readers.
+static void rcu_tasks_ref_scale_read_section(const int nloops)
+{
+ int i;
+
+ for (i = nloops; i >= 0; i--)
+ continue;
+}
+
+static void rcu_tasks_ref_scale_delay_section(const int nloops, const int udl, const int ndl)
+{
+ int i;
+
+ for (i = nloops; i >= 0; i--)
+ un_delay(udl, ndl);
+}
+
+static struct ref_scale_ops rcu_tasks_ops = {
+ .init = rcu_sync_scale_init,
+ .readsection = rcu_tasks_ref_scale_read_section,
+ .delaysection = rcu_tasks_ref_scale_delay_section,
+ .name = "rcu-tasks"
+};
+
+// Definitions for RCU Tasks Trace ref scale testing.
+static void rcu_trace_ref_scale_read_section(const int nloops)
+{
+ int i;
+
+ for (i = nloops; i >= 0; i--) {
+ rcu_read_lock_trace();
+ rcu_read_unlock_trace();
+ }
+}
+
+static void rcu_trace_ref_scale_delay_section(const int nloops, const int udl, const int ndl)
+{
+ int i;
+
+ for (i = nloops; i >= 0; i--) {
+ rcu_read_lock_trace();
+ un_delay(udl, ndl);
+ rcu_read_unlock_trace();
+ }
+}
+
+static struct ref_scale_ops rcu_trace_ops = {
+ .init = rcu_sync_scale_init,
+ .readsection = rcu_trace_ref_scale_read_section,
+ .delaysection = rcu_trace_ref_scale_delay_section,
+ .name = "rcu-trace"
+};
+
+// Definitions for reference count
+static atomic_t refcnt;
+
+static void ref_refcnt_section(const int nloops)
+{
+ int i;
+
+ for (i = nloops; i >= 0; i--) {
+ atomic_inc(&refcnt);
+ atomic_dec(&refcnt);
+ }
+}
+
+static void ref_refcnt_delay_section(const int nloops, const int udl, const int ndl)
+{
+ int i;
+
+ for (i = nloops; i >= 0; i--) {
+ atomic_inc(&refcnt);
+ un_delay(udl, ndl);
+ atomic_dec(&refcnt);
+ }
+}
+
+static struct ref_scale_ops refcnt_ops = {
+ .init = rcu_sync_scale_init,
+ .readsection = ref_refcnt_section,
+ .delaysection = ref_refcnt_delay_section,
+ .name = "refcnt"
+};
+
+// Definitions for rwlock
+static rwlock_t test_rwlock;
+
+static void ref_rwlock_init(void)
+{
+ rwlock_init(&test_rwlock);
+}
+
+static void ref_rwlock_section(const int nloops)
+{
+ int i;
+
+ for (i = nloops; i >= 0; i--) {
+ read_lock(&test_rwlock);
+ read_unlock(&test_rwlock);
+ }
+}
+
+static void ref_rwlock_delay_section(const int nloops, const int udl, const int ndl)
+{
+ int i;
+
+ for (i = nloops; i >= 0; i--) {
+ read_lock(&test_rwlock);
+ un_delay(udl, ndl);
+ read_unlock(&test_rwlock);
+ }
+}
+
+static struct ref_scale_ops rwlock_ops = {
+ .init = ref_rwlock_init,
+ .readsection = ref_rwlock_section,
+ .delaysection = ref_rwlock_delay_section,
+ .name = "rwlock"
+};
+
+// Definitions for rwsem
+static struct rw_semaphore test_rwsem;
+
+static void ref_rwsem_init(void)
+{
+ init_rwsem(&test_rwsem);
+}
+
+static void ref_rwsem_section(const int nloops)
+{
+ int i;
+
+ for (i = nloops; i >= 0; i--) {
+ down_read(&test_rwsem);
+ up_read(&test_rwsem);
+ }
+}
+
+static void ref_rwsem_delay_section(const int nloops, const int udl, const int ndl)
+{
+ int i;
+
+ for (i = nloops; i >= 0; i--) {
+ down_read(&test_rwsem);
+ un_delay(udl, ndl);
+ up_read(&test_rwsem);
+ }
+}
+
+static struct ref_scale_ops rwsem_ops = {
+ .init = ref_rwsem_init,
+ .readsection = ref_rwsem_section,
+ .delaysection = ref_rwsem_delay_section,
+ .name = "rwsem"
+};
+
+static void rcu_scale_one_reader(void)
+{
+ if (readdelay <= 0)
+ cur_ops->readsection(loops);
+ else
+ cur_ops->delaysection(loops, readdelay / 1000, readdelay % 1000);
+}
+
+// Reader kthread. Repeatedly does empty RCU read-side
+// critical section, minimizing update-side interference.
+static int
+ref_scale_reader(void *arg)
+{
+ unsigned long flags;
+ long me = (long)arg;
+ struct reader_task *rt = &(reader_tasks[me]);
+ u64 start;
+ s64 duration;
+
+ VERBOSE_SCALEOUT("ref_scale_reader %ld: task started", me);
+ set_cpus_allowed_ptr(current, cpumask_of(me % nr_cpu_ids));
+ set_user_nice(current, MAX_NICE);
+ atomic_inc(&n_init);
+ if (holdoff)
+ schedule_timeout_interruptible(holdoff * HZ);
+repeat:
+ VERBOSE_SCALEOUT("ref_scale_reader %ld: waiting to start next experiment on cpu %d", me, smp_processor_id());
+
+ // Wait for signal that this reader can start.
+ wait_event(rt->wq, (atomic_read(&nreaders_exp) && smp_load_acquire(&rt->start_reader)) ||
+ torture_must_stop());
+
+ if (torture_must_stop())
+ goto end;
+
+ // Make sure that the CPU is affinitized appropriately during testing.
+ WARN_ON_ONCE(smp_processor_id() != me);
+
+ WRITE_ONCE(rt->start_reader, 0);
+ if (!atomic_dec_return(&n_started))
+ while (atomic_read_acquire(&n_started))
+ cpu_relax();
+
+ VERBOSE_SCALEOUT("ref_scale_reader %ld: experiment %d started", me, exp_idx);
+
+
+ // To reduce noise, do an initial cache-warming invocation, check
+ // in, and then keep warming until everyone has checked in.
+ rcu_scale_one_reader();
+ if (!atomic_dec_return(&n_warmedup))
+ while (atomic_read_acquire(&n_warmedup))
+ rcu_scale_one_reader();
+ // Also keep interrupts disabled. This also has the effect
+ // of preventing entries into slow path for rcu_read_unlock().
+ local_irq_save(flags);
+ start = ktime_get_mono_fast_ns();
+
+ rcu_scale_one_reader();
+
+ duration = ktime_get_mono_fast_ns() - start;
+ local_irq_restore(flags);
+
+ rt->last_duration_ns = WARN_ON_ONCE(duration < 0) ? 0 : duration;
+ // To reduce runtime-skew noise, do maintain-load invocations until
+ // everyone is done.
+ if (!atomic_dec_return(&n_cooleddown))
+ while (atomic_read_acquire(&n_cooleddown))
+ rcu_scale_one_reader();
+
+ if (atomic_dec_and_test(&nreaders_exp))
+ wake_up(&main_wq);
+
+ VERBOSE_SCALEOUT("ref_scale_reader %ld: experiment %d ended, (readers remaining=%d)",
+ me, exp_idx, atomic_read(&nreaders_exp));
+
+ if (!torture_must_stop())
+ goto repeat;
+end:
+ torture_kthread_stopping("ref_scale_reader");
+ return 0;
+}
+
+static void reset_readers(void)
+{
+ int i;
+ struct reader_task *rt;
+
+ for (i = 0; i < nreaders; i++) {
+ rt = &(reader_tasks[i]);
+
+ rt->last_duration_ns = 0;
+ }
+}
+
+// Print the results of each reader and return the sum of all their durations.
+static u64 process_durations(int n)
+{
+ int i;
+ struct reader_task *rt;
+ char buf1[64];
+ char *buf;
+ u64 sum = 0;
+
+ buf = kmalloc(128 + nreaders * 32, GFP_KERNEL);
+ if (!buf)
+ return 0;
+ buf[0] = 0;
+ sprintf(buf, "Experiment #%d (Format: <THREAD-NUM>:<Total loop time in ns>)",
+ exp_idx);
+
+ for (i = 0; i < n && !torture_must_stop(); i++) {
+ rt = &(reader_tasks[i]);
+ sprintf(buf1, "%d: %llu\t", i, rt->last_duration_ns);
+
+ if (i % 5 == 0)
+ strcat(buf, "\n");
+ strcat(buf, buf1);
+
+ sum += rt->last_duration_ns;
+ }
+ strcat(buf, "\n");
+
+ SCALEOUT("%s\n", buf);
+
+ kfree(buf);
+ return sum;
+}
+
+// The main_func is the main orchestrator, it performs a bunch of
+// experiments. For every experiment, it orders all the readers
+// involved to start and waits for them to finish the experiment. It
+// then reads their timestamps and starts the next experiment. Each
+// experiment progresses from 1 concurrent reader to N of them at which
+// point all the timestamps are printed.
+static int main_func(void *arg)
+{
+ bool errexit = false;
+ int exp, r;
+ char buf1[64];
+ char *buf;
+ u64 *result_avg;
+
+ set_cpus_allowed_ptr(current, cpumask_of(nreaders % nr_cpu_ids));
+ set_user_nice(current, MAX_NICE);
+
+ VERBOSE_SCALEOUT("main_func task started");
+ result_avg = kzalloc(nruns * sizeof(*result_avg), GFP_KERNEL);
+ buf = kzalloc(64 + nruns * 32, GFP_KERNEL);
+ if (!result_avg || !buf) {
+ VERBOSE_SCALEOUT_ERRSTRING("out of memory");
+ errexit = true;
+ }
+ if (holdoff)
+ schedule_timeout_interruptible(holdoff * HZ);
+
+ // Wait for all threads to start.
+ atomic_inc(&n_init);
+ while (atomic_read(&n_init) < nreaders + 1)
+ schedule_timeout_uninterruptible(1);
+
+ // Start exp readers up per experiment
+ for (exp = 0; exp < nruns && !torture_must_stop(); exp++) {
+ if (errexit)
+ break;
+ if (torture_must_stop())
+ goto end;
+
+ reset_readers();
+ atomic_set(&nreaders_exp, nreaders);
+ atomic_set(&n_started, nreaders);
+ atomic_set(&n_warmedup, nreaders);
+ atomic_set(&n_cooleddown, nreaders);
+
+ exp_idx = exp;
+
+ for (r = 0; r < nreaders; r++) {
+ smp_store_release(&reader_tasks[r].start_reader, 1);
+ wake_up(&reader_tasks[r].wq);
+ }
+
+ VERBOSE_SCALEOUT("main_func: experiment started, waiting for %d readers",
+ nreaders);
+
+ wait_event(main_wq,
+ !atomic_read(&nreaders_exp) || torture_must_stop());
+
+ VERBOSE_SCALEOUT("main_func: experiment ended");
+
+ if (torture_must_stop())
+ goto end;
+
+ result_avg[exp] = div_u64(1000 * process_durations(nreaders), nreaders * loops);
+ }
+
+ // Print the average of all experiments
+ SCALEOUT("END OF TEST. Calculating average duration per loop (nanoseconds)...\n");
+
+ buf[0] = 0;
+ strcat(buf, "\n");
+ strcat(buf, "Runs\tTime(ns)\n");
+
+ for (exp = 0; exp < nruns; exp++) {
+ u64 avg;
+ u32 rem;
+
+ if (errexit)
+ break;
+ avg = div_u64_rem(result_avg[exp], 1000, &rem);
+ sprintf(buf1, "%d\t%llu.%03u\n", exp + 1, avg, rem);
+ strcat(buf, buf1);
+ }
+
+ if (!errexit)
+ SCALEOUT("%s", buf);
+
+ // This will shutdown everything including us.
+ if (shutdown) {
+ shutdown_start = 1;
+ wake_up(&shutdown_wq);
+ }
+
+ // Wait for torture to stop us
+ while (!torture_must_stop())
+ schedule_timeout_uninterruptible(1);
+
+end:
+ torture_kthread_stopping("main_func");
+ kfree(result_avg);
+ kfree(buf);
+ return 0;
+}
+
+static void
+ref_scale_print_module_parms(struct ref_scale_ops *cur_ops, const char *tag)
+{
+ pr_alert("%s" SCALE_FLAG
+ "--- %s: verbose=%d shutdown=%d holdoff=%d loops=%ld nreaders=%d nruns=%d readdelay=%d\n", scale_type, tag,
+ verbose, shutdown, holdoff, loops, nreaders, nruns, readdelay);
+}
+
+static void
+ref_scale_cleanup(void)
+{
+ int i;
+
+ if (torture_cleanup_begin())
+ return;
+
+ if (!cur_ops) {
+ torture_cleanup_end();
+ return;
+ }
+
+ if (reader_tasks) {
+ for (i = 0; i < nreaders; i++)
+ torture_stop_kthread("ref_scale_reader",
+ reader_tasks[i].task);
+ }
+ kfree(reader_tasks);
+
+ torture_stop_kthread("main_task", main_task);
+ kfree(main_task);
+
+ // Do scale-type-specific cleanup operations.
+ if (cur_ops->cleanup != NULL)
+ cur_ops->cleanup();
+
+ torture_cleanup_end();
+}
+
+// Shutdown kthread. Just waits to be awakened, then shuts down system.
+static int
+ref_scale_shutdown(void *arg)
+{
+ wait_event(shutdown_wq, shutdown_start);
+
+ smp_mb(); // Wake before output.
+ ref_scale_cleanup();
+ kernel_power_off();
+
+ return -EINVAL;
+}
+
+static int __init
+ref_scale_init(void)
+{
+ long i;
+ int firsterr = 0;
+ static struct ref_scale_ops *scale_ops[] = {
+ &rcu_ops, &srcu_ops, &rcu_trace_ops, &rcu_tasks_ops,
+ &refcnt_ops, &rwlock_ops, &rwsem_ops,
+ };
+
+ if (!torture_init_begin(scale_type, verbose))
+ return -EBUSY;
+
+ for (i = 0; i < ARRAY_SIZE(scale_ops); i++) {
+ cur_ops = scale_ops[i];
+ if (strcmp(scale_type, cur_ops->name) == 0)
+ break;
+ }
+ if (i == ARRAY_SIZE(scale_ops)) {
+ pr_alert("rcu-scale: invalid scale type: \"%s\"\n", scale_type);
+ pr_alert("rcu-scale types:");
+ for (i = 0; i < ARRAY_SIZE(scale_ops); i++)
+ pr_cont(" %s", scale_ops[i]->name);
+ pr_cont("\n");
+ WARN_ON(!IS_MODULE(CONFIG_RCU_REF_SCALE_TEST));
+ firsterr = -EINVAL;
+ cur_ops = NULL;
+ goto unwind;
+ }
+ if (cur_ops->init)
+ cur_ops->init();
+
+ ref_scale_print_module_parms(cur_ops, "Start of test");
+
+ // Shutdown task
+ if (shutdown) {
+ init_waitqueue_head(&shutdown_wq);
+ firsterr = torture_create_kthread(ref_scale_shutdown, NULL,
+ shutdown_task);
+ if (firsterr)
+ goto unwind;
+ schedule_timeout_uninterruptible(1);
+ }
+
+ // Reader tasks (default to ~75% of online CPUs).
+ if (nreaders < 0)
+ nreaders = (num_online_cpus() >> 1) + (num_online_cpus() >> 2);
+ reader_tasks = kcalloc(nreaders, sizeof(reader_tasks[0]),
+ GFP_KERNEL);
+ if (!reader_tasks) {
+ VERBOSE_SCALEOUT_ERRSTRING("out of memory");
+ firsterr = -ENOMEM;
+ goto unwind;
+ }
+
+ VERBOSE_SCALEOUT("Starting %d reader threads\n", nreaders);
+
+ for (i = 0; i < nreaders; i++) {
+ firsterr = torture_create_kthread(ref_scale_reader, (void *)i,
+ reader_tasks[i].task);
+ if (firsterr)
+ goto unwind;
+
+ init_waitqueue_head(&(reader_tasks[i].wq));
+ }
+
+ // Main Task
+ init_waitqueue_head(&main_wq);
+ firsterr = torture_create_kthread(main_func, NULL, main_task);
+ if (firsterr)
+ goto unwind;
+
+ torture_init_end();
+ return 0;
+
+unwind:
+ torture_init_end();
+ ref_scale_cleanup();
+ return firsterr;
+}
+
+module_init(ref_scale_init);
+module_exit(ref_scale_cleanup);
diff --git a/kernel/rcu/srcutree.c b/kernel/rcu/srcutree.c
index 6d3ef700fb0e..c100acf332ed 100644
--- a/kernel/rcu/srcutree.c
+++ b/kernel/rcu/srcutree.c
@@ -766,7 +766,7 @@ static void srcu_flip(struct srcu_struct *ssp)
* it, if this function was preempted for enough time for the counters
* to wrap, it really doesn't matter whether or not we expedite the grace
* period. The extra overhead of a needlessly expedited grace period is
- * negligible when amoritized over that time period, and the extra latency
+ * negligible when amortized over that time period, and the extra latency
* of a needlessly non-expedited grace period is similarly negligible.
*/
static bool srcu_might_be_idle(struct srcu_struct *ssp)
@@ -777,14 +777,15 @@ static bool srcu_might_be_idle(struct srcu_struct *ssp)
unsigned long t;
unsigned long tlast;
+ check_init_srcu_struct(ssp);
/* If the local srcu_data structure has callbacks, not idle. */
- local_irq_save(flags);
- sdp = this_cpu_ptr(ssp->sda);
+ sdp = raw_cpu_ptr(ssp->sda);
+ spin_lock_irqsave_rcu_node(sdp, flags);
if (rcu_segcblist_pend_cbs(&sdp->srcu_cblist)) {
- local_irq_restore(flags);
+ spin_unlock_irqrestore_rcu_node(sdp, flags);
return false; /* Callbacks already present, so not idle. */
}
- local_irq_restore(flags);
+ spin_unlock_irqrestore_rcu_node(sdp, flags);
/*
* No local callbacks, so probabalistically probe global state.
@@ -864,9 +865,8 @@ static void __call_srcu(struct srcu_struct *ssp, struct rcu_head *rhp,
}
rhp->func = func;
idx = srcu_read_lock(ssp);
- local_irq_save(flags);
- sdp = this_cpu_ptr(ssp->sda);
- spin_lock_rcu_node(sdp);
+ sdp = raw_cpu_ptr(ssp->sda);
+ spin_lock_irqsave_rcu_node(sdp, flags);
rcu_segcblist_enqueue(&sdp->srcu_cblist, rhp);
rcu_segcblist_advance(&sdp->srcu_cblist,
rcu_seq_current(&ssp->srcu_gp_seq));
diff --git a/kernel/rcu/tasks.h b/kernel/rcu/tasks.h
index ce23f6cc5043..835e2df8590a 100644
--- a/kernel/rcu/tasks.h
+++ b/kernel/rcu/tasks.h
@@ -103,6 +103,7 @@ module_param(rcu_task_stall_timeout, int, 0644);
#define RTGS_WAIT_READERS 9
#define RTGS_INVOKE_CBS 10
#define RTGS_WAIT_CBS 11
+#ifndef CONFIG_TINY_RCU
static const char * const rcu_tasks_gp_state_names[] = {
"RTGS_INIT",
"RTGS_WAIT_WAIT_CBS",
@@ -117,6 +118,7 @@ static const char * const rcu_tasks_gp_state_names[] = {
"RTGS_INVOKE_CBS",
"RTGS_WAIT_CBS",
};
+#endif /* #ifndef CONFIG_TINY_RCU */
////////////////////////////////////////////////////////////////////////
//
@@ -129,6 +131,7 @@ static void set_tasks_gp_state(struct rcu_tasks *rtp, int newstate)
rtp->gp_jiffies = jiffies;
}
+#ifndef CONFIG_TINY_RCU
/* Return state name. */
static const char *tasks_gp_state_getname(struct rcu_tasks *rtp)
{
@@ -139,6 +142,7 @@ static const char *tasks_gp_state_getname(struct rcu_tasks *rtp)
return "???";
return rcu_tasks_gp_state_names[j];
}
+#endif /* #ifndef CONFIG_TINY_RCU */
// Enqueue a callback for the specified flavor of Tasks RCU.
static void call_rcu_tasks_generic(struct rcu_head *rhp, rcu_callback_t func,
@@ -205,7 +209,7 @@ static int __noreturn rcu_tasks_kthread(void *arg)
if (!rtp->cbs_head) {
WARN_ON(signal_pending(current));
set_tasks_gp_state(rtp, RTGS_WAIT_WAIT_CBS);
- schedule_timeout_interruptible(HZ/10);
+ schedule_timeout_idle(HZ/10);
}
continue;
}
@@ -227,7 +231,7 @@ static int __noreturn rcu_tasks_kthread(void *arg)
cond_resched();
}
/* Paranoid sleep to keep this from entering a tight loop */
- schedule_timeout_uninterruptible(HZ/10);
+ schedule_timeout_idle(HZ/10);
set_tasks_gp_state(rtp, RTGS_WAIT_CBS);
}
@@ -268,6 +272,7 @@ static void __init rcu_tasks_bootup_oddness(void)
#endif /* #ifndef CONFIG_TINY_RCU */
+#ifndef CONFIG_TINY_RCU
/* Dump out rcutorture-relevant state common to all RCU-tasks flavors. */
static void show_rcu_tasks_generic_gp_kthread(struct rcu_tasks *rtp, char *s)
{
@@ -281,6 +286,7 @@ static void show_rcu_tasks_generic_gp_kthread(struct rcu_tasks *rtp, char *s)
".C"[!!data_race(rtp->cbs_head)],
s);
}
+#endif /* #ifndef CONFIG_TINY_RCU */
static void exit_tasks_rcu_finish_trace(struct task_struct *t);
@@ -336,7 +342,7 @@ static void rcu_tasks_wait_gp(struct rcu_tasks *rtp)
/* Slowly back off waiting for holdouts */
set_tasks_gp_state(rtp, RTGS_WAIT_SCAN_HOLDOUTS);
- schedule_timeout_interruptible(HZ/fract);
+ schedule_timeout_idle(HZ/fract);
if (fract > 1)
fract--;
@@ -402,7 +408,7 @@ static void rcu_tasks_pertask(struct task_struct *t, struct list_head *hop)
}
/* Processing between scanning taskslist and draining the holdout list. */
-void rcu_tasks_postscan(struct list_head *hop)
+static void rcu_tasks_postscan(struct list_head *hop)
{
/*
* Wait for tasks that are in the process of exiting. This
@@ -557,10 +563,12 @@ static int __init rcu_spawn_tasks_kthread(void)
}
core_initcall(rcu_spawn_tasks_kthread);
+#ifndef CONFIG_TINY_RCU
static void show_rcu_tasks_classic_gp_kthread(void)
{
show_rcu_tasks_generic_gp_kthread(&rcu_tasks, "");
}
+#endif /* #ifndef CONFIG_TINY_RCU */
/* Do the srcu_read_lock() for the above synchronize_srcu(). */
void exit_tasks_rcu_start(void) __acquires(&tasks_rcu_exit_srcu)
@@ -682,10 +690,12 @@ static int __init rcu_spawn_tasks_rude_kthread(void)
}
core_initcall(rcu_spawn_tasks_rude_kthread);
+#ifndef CONFIG_TINY_RCU
static void show_rcu_tasks_rude_gp_kthread(void)
{
show_rcu_tasks_generic_gp_kthread(&rcu_tasks_rude, "");
}
+#endif /* #ifndef CONFIG_TINY_RCU */
#else /* #ifdef CONFIG_TASKS_RUDE_RCU */
static void show_rcu_tasks_rude_gp_kthread(void) {}
@@ -727,8 +737,8 @@ EXPORT_SYMBOL_GPL(rcu_trace_lock_map);
#ifdef CONFIG_TASKS_TRACE_RCU
-atomic_t trc_n_readers_need_end; // Number of waited-for readers.
-DECLARE_WAIT_QUEUE_HEAD(trc_wait); // List of holdout tasks.
+static atomic_t trc_n_readers_need_end; // Number of waited-for readers.
+static DECLARE_WAIT_QUEUE_HEAD(trc_wait); // List of holdout tasks.
// Record outstanding IPIs to each CPU. No point in sending two...
static DEFINE_PER_CPU(bool, trc_ipi_to_cpu);
@@ -835,7 +845,7 @@ static bool trc_inspect_reader(struct task_struct *t, void *arg)
bool ofl = cpu_is_offline(cpu);
if (task_curr(t)) {
- WARN_ON_ONCE(ofl & !is_idle_task(t));
+ WARN_ON_ONCE(ofl && !is_idle_task(t));
// If no chance of heavyweight readers, do it the hard way.
if (!ofl && !IS_ENABLED(CONFIG_TASKS_TRACE_RCU_READ_MB))
@@ -1118,11 +1128,10 @@ EXPORT_SYMBOL_GPL(call_rcu_tasks_trace);
* synchronize_rcu_tasks_trace - wait for a trace rcu-tasks grace period
*
* Control will return to the caller some time after a trace rcu-tasks
- * grace period has elapsed, in other words after all currently
- * executing rcu-tasks read-side critical sections have elapsed. These
- * read-side critical sections are delimited by calls to schedule(),
- * cond_resched_tasks_rcu_qs(), userspace execution, and (in theory,
- * anyway) cond_resched().
+ * grace period has elapsed, in other words after all currently executing
+ * rcu-tasks read-side critical sections have elapsed. These read-side
+ * critical sections are delimited by calls to rcu_read_lock_trace()
+ * and rcu_read_unlock_trace().
*
* This is a very specialized primitive, intended only for a few uses in
* tracing and other situations requiring manipulation of function preambles
@@ -1164,6 +1173,7 @@ static int __init rcu_spawn_tasks_trace_kthread(void)
}
core_initcall(rcu_spawn_tasks_trace_kthread);
+#ifndef CONFIG_TINY_RCU
static void show_rcu_tasks_trace_gp_kthread(void)
{
char buf[64];
@@ -1174,18 +1184,21 @@ static void show_rcu_tasks_trace_gp_kthread(void)
data_race(n_heavy_reader_attempts));
show_rcu_tasks_generic_gp_kthread(&rcu_tasks_trace, buf);
}
+#endif /* #ifndef CONFIG_TINY_RCU */
#else /* #ifdef CONFIG_TASKS_TRACE_RCU */
static void exit_tasks_rcu_finish_trace(struct task_struct *t) { }
static inline void show_rcu_tasks_trace_gp_kthread(void) {}
#endif /* #else #ifdef CONFIG_TASKS_TRACE_RCU */
+#ifndef CONFIG_TINY_RCU
void show_rcu_tasks_gp_kthreads(void)
{
show_rcu_tasks_classic_gp_kthread();
show_rcu_tasks_rude_gp_kthread();
show_rcu_tasks_trace_gp_kthread();
}
+#endif /* #ifndef CONFIG_TINY_RCU */
#else /* #ifdef CONFIG_TASKS_RCU_GENERIC */
static inline void rcu_tasks_bootup_oddness(void) {}
diff --git a/kernel/rcu/tiny.c b/kernel/rcu/tiny.c
index dd572ce7c747..aa897c3f2e92 100644
--- a/kernel/rcu/tiny.c
+++ b/kernel/rcu/tiny.c
@@ -23,6 +23,7 @@
#include <linux/cpu.h>
#include <linux/prefetch.h>
#include <linux/slab.h>
+#include <linux/mm.h>
#include "rcu.h"
@@ -84,9 +85,9 @@ static inline bool rcu_reclaim_tiny(struct rcu_head *head)
unsigned long offset = (unsigned long)head->func;
rcu_lock_acquire(&rcu_callback_map);
- if (__is_kfree_rcu_offset(offset)) {
- trace_rcu_invoke_kfree_callback("", head, offset);
- kfree((void *)head - offset);
+ if (__is_kvfree_rcu_offset(offset)) {
+ trace_rcu_invoke_kvfree_callback("", head, offset);
+ kvfree((void *)head - offset);
rcu_lock_release(&rcu_callback_map);
return true;
}
diff --git a/kernel/rcu/tree.c b/kernel/rcu/tree.c
index 6c6569e0586c..ac7198ed3197 100644
--- a/kernel/rcu/tree.c
+++ b/kernel/rcu/tree.c
@@ -57,6 +57,8 @@
#include <linux/slab.h>
#include <linux/sched/isolation.h>
#include <linux/sched/clock.h>
+#include <linux/vmalloc.h>
+#include <linux/mm.h>
#include "../time/tick-internal.h"
#include "tree.h"
@@ -175,6 +177,15 @@ module_param(gp_init_delay, int, 0444);
static int gp_cleanup_delay;
module_param(gp_cleanup_delay, int, 0444);
+/*
+ * This rcu parameter is runtime-read-only. It reflects
+ * a minimum allowed number of objects which can be cached
+ * per-CPU. Object size is equal to one page. This value
+ * can be changed at boot time.
+ */
+static int rcu_min_cached_objs = 2;
+module_param(rcu_min_cached_objs, int, 0444);
+
/* Retrieve RCU kthreads priority for rcutorture */
int rcu_get_gp_kthreads_prio(void)
{
@@ -954,7 +965,6 @@ void __rcu_irq_enter_check_tick(void)
/**
* rcu_nmi_enter - inform RCU of entry to NMI context
- * @irq: Is this call from rcu_irq_enter?
*
* If the CPU was idle from RCU's viewpoint, update rdp->dynticks and
* rdp->dynticks_nmi_nesting to let the RCU grace-period handling know
@@ -990,8 +1000,11 @@ noinstr void rcu_nmi_enter(void)
rcu_dynticks_eqs_exit();
// ... but is watching here.
- if (!in_nmi())
+ if (!in_nmi()) {
+ instrumentation_begin();
rcu_cleanup_after_idle();
+ instrumentation_end();
+ }
instrumentation_begin();
// instrumentation for the noinstr rcu_dynticks_curr_cpu_in_eqs()
@@ -1638,7 +1651,7 @@ static void rcu_gp_slow(int delay)
if (delay > 0 &&
!(rcu_seq_ctr(rcu_state.gp_seq) %
(rcu_num_nodes * PER_RCU_NODE_PERIOD * delay)))
- schedule_timeout_uninterruptible(delay);
+ schedule_timeout_idle(delay);
}
static unsigned long sleep_duration;
@@ -1661,7 +1674,7 @@ static void rcu_gp_torture_wait(void)
duration = xchg(&sleep_duration, 0UL);
if (duration > 0) {
pr_alert("%s: Waiting %lu jiffies\n", __func__, duration);
- schedule_timeout_uninterruptible(duration);
+ schedule_timeout_idle(duration);
pr_alert("%s: Wait complete\n", __func__);
}
}
@@ -2443,6 +2456,7 @@ static void rcu_do_batch(struct rcu_data *rdp)
local_irq_save(flags);
rcu_nocb_lock(rdp);
count = -rcl.len;
+ rdp->n_cbs_invoked += count;
trace_rcu_batch_end(rcu_state.name, count, !!rcl.head, need_resched(),
is_idle_task(current), rcu_is_callbacks_kthread());
@@ -2726,7 +2740,7 @@ static void rcu_cpu_kthread(unsigned int cpu)
}
*statusp = RCU_KTHREAD_YIELDING;
trace_rcu_utilization(TPS("Start CPU kthread@rcu_yield"));
- schedule_timeout_interruptible(2);
+ schedule_timeout_idle(2);
trace_rcu_utilization(TPS("End CPU kthread@rcu_yield"));
*statusp = RCU_KTHREAD_WAITING;
}
@@ -2894,8 +2908,8 @@ __call_rcu(struct rcu_head *head, rcu_callback_t func)
return; // Enqueued onto ->nocb_bypass, so just leave.
// If no-CBs CPU gets here, rcu_nocb_try_bypass() acquired ->nocb_lock.
rcu_segcblist_enqueue(&rdp->cblist, head);
- if (__is_kfree_rcu_offset((unsigned long)func))
- trace_rcu_kfree_callback(rcu_state.name, head,
+ if (__is_kvfree_rcu_offset((unsigned long)func))
+ trace_rcu_kvfree_callback(rcu_state.name, head,
(unsigned long)func,
rcu_segcblist_n_cbs(&rdp->cblist));
else
@@ -2957,53 +2971,53 @@ EXPORT_SYMBOL_GPL(call_rcu);
/* Maximum number of jiffies to wait before draining a batch. */
#define KFREE_DRAIN_JIFFIES (HZ / 50)
#define KFREE_N_BATCHES 2
-
-/*
- * This macro defines how many entries the "records" array
- * will contain. It is based on the fact that the size of
- * kfree_rcu_bulk_data structure becomes exactly one page.
- */
-#define KFREE_BULK_MAX_ENTR ((PAGE_SIZE / sizeof(void *)) - 3)
+#define FREE_N_CHANNELS 2
/**
- * struct kfree_rcu_bulk_data - single block to store kfree_rcu() pointers
+ * struct kvfree_rcu_bulk_data - single block to store kvfree_rcu() pointers
* @nr_records: Number of active pointers in the array
- * @records: Array of the kfree_rcu() pointers
* @next: Next bulk object in the block chain
- * @head_free_debug: For debug, when CONFIG_DEBUG_OBJECTS_RCU_HEAD is set
+ * @records: Array of the kvfree_rcu() pointers
*/
-struct kfree_rcu_bulk_data {
+struct kvfree_rcu_bulk_data {
unsigned long nr_records;
- void *records[KFREE_BULK_MAX_ENTR];
- struct kfree_rcu_bulk_data *next;
- struct rcu_head *head_free_debug;
+ struct kvfree_rcu_bulk_data *next;
+ void *records[];
};
+/*
+ * This macro defines how many entries the "records" array
+ * will contain. It is based on the fact that the size of
+ * kvfree_rcu_bulk_data structure becomes exactly one page.
+ */
+#define KVFREE_BULK_MAX_ENTR \
+ ((PAGE_SIZE - sizeof(struct kvfree_rcu_bulk_data)) / sizeof(void *))
+
/**
* struct kfree_rcu_cpu_work - single batch of kfree_rcu() requests
* @rcu_work: Let queue_rcu_work() invoke workqueue handler after grace period
* @head_free: List of kfree_rcu() objects waiting for a grace period
- * @bhead_free: Bulk-List of kfree_rcu() objects waiting for a grace period
+ * @bkvhead_free: Bulk-List of kvfree_rcu() objects waiting for a grace period
* @krcp: Pointer to @kfree_rcu_cpu structure
*/
struct kfree_rcu_cpu_work {
struct rcu_work rcu_work;
struct rcu_head *head_free;
- struct kfree_rcu_bulk_data *bhead_free;
+ struct kvfree_rcu_bulk_data *bkvhead_free[FREE_N_CHANNELS];
struct kfree_rcu_cpu *krcp;
};
/**
* struct kfree_rcu_cpu - batch up kfree_rcu() requests for RCU grace period
* @head: List of kfree_rcu() objects not yet waiting for a grace period
- * @bhead: Bulk-List of kfree_rcu() objects not yet waiting for a grace period
- * @bcached: Keeps at most one object for later reuse when build chain blocks
+ * @bkvhead: Bulk-List of kvfree_rcu() objects not yet waiting for a grace period
* @krw_arr: Array of batches of kfree_rcu() objects waiting for a grace period
* @lock: Synchronize access to this structure
* @monitor_work: Promote @head to @head_free after KFREE_DRAIN_JIFFIES
* @monitor_todo: Tracks whether a @monitor_work delayed work is pending
- * @initialized: The @lock and @rcu_work fields have been initialized
+ * @initialized: The @rcu_work fields have been initialized
+ * @count: Number of objects for which GP not started
*
* This is a per-CPU structure. The reason that it is not included in
* the rcu_data structure is to permit this code to be extracted from
@@ -3012,28 +3026,84 @@ struct kfree_rcu_cpu_work {
*/
struct kfree_rcu_cpu {
struct rcu_head *head;
- struct kfree_rcu_bulk_data *bhead;
- struct kfree_rcu_bulk_data *bcached;
+ struct kvfree_rcu_bulk_data *bkvhead[FREE_N_CHANNELS];
struct kfree_rcu_cpu_work krw_arr[KFREE_N_BATCHES];
- spinlock_t lock;
+ raw_spinlock_t lock;
struct delayed_work monitor_work;
bool monitor_todo;
bool initialized;
- // Number of objects for which GP not started
int count;
+
+ /*
+ * A simple cache list that contains objects for
+ * reuse purpose. In order to save some per-cpu
+ * space the list is singular. Even though it is
+ * lockless an access has to be protected by the
+ * per-cpu lock.
+ */
+ struct llist_head bkvcache;
+ int nr_bkv_objs;
};
-static DEFINE_PER_CPU(struct kfree_rcu_cpu, krc);
+static DEFINE_PER_CPU(struct kfree_rcu_cpu, krc) = {
+ .lock = __RAW_SPIN_LOCK_UNLOCKED(krc.lock),
+};
static __always_inline void
-debug_rcu_head_unqueue_bulk(struct rcu_head *head)
+debug_rcu_bhead_unqueue(struct kvfree_rcu_bulk_data *bhead)
{
#ifdef CONFIG_DEBUG_OBJECTS_RCU_HEAD
- for (; head; head = head->next)
- debug_rcu_head_unqueue(head);
+ int i;
+
+ for (i = 0; i < bhead->nr_records; i++)
+ debug_rcu_head_unqueue((struct rcu_head *)(bhead->records[i]));
#endif
}
+static inline struct kfree_rcu_cpu *
+krc_this_cpu_lock(unsigned long *flags)
+{
+ struct kfree_rcu_cpu *krcp;
+
+ local_irq_save(*flags); // For safely calling this_cpu_ptr().
+ krcp = this_cpu_ptr(&krc);
+ raw_spin_lock(&krcp->lock);
+
+ return krcp;
+}
+
+static inline void
+krc_this_cpu_unlock(struct kfree_rcu_cpu *krcp, unsigned long flags)
+{
+ raw_spin_unlock(&krcp->lock);
+ local_irq_restore(flags);
+}
+
+static inline struct kvfree_rcu_bulk_data *
+get_cached_bnode(struct kfree_rcu_cpu *krcp)
+{
+ if (!krcp->nr_bkv_objs)
+ return NULL;
+
+ krcp->nr_bkv_objs--;
+ return (struct kvfree_rcu_bulk_data *)
+ llist_del_first(&krcp->bkvcache);
+}
+
+static inline bool
+put_cached_bnode(struct kfree_rcu_cpu *krcp,
+ struct kvfree_rcu_bulk_data *bnode)
+{
+ // Check the limit.
+ if (krcp->nr_bkv_objs >= rcu_min_cached_objs)
+ return false;
+
+ llist_add((struct llist_node *) bnode, &krcp->bkvcache);
+ krcp->nr_bkv_objs++;
+ return true;
+
+}
+
/*
* This function is invoked in workqueue context after a grace period.
* It frees all the objects queued on ->bhead_free or ->head_free.
@@ -3041,38 +3111,63 @@ debug_rcu_head_unqueue_bulk(struct rcu_head *head)
static void kfree_rcu_work(struct work_struct *work)
{
unsigned long flags;
+ struct kvfree_rcu_bulk_data *bkvhead[FREE_N_CHANNELS], *bnext;
struct rcu_head *head, *next;
- struct kfree_rcu_bulk_data *bhead, *bnext;
struct kfree_rcu_cpu *krcp;
struct kfree_rcu_cpu_work *krwp;
+ int i, j;
krwp = container_of(to_rcu_work(work),
struct kfree_rcu_cpu_work, rcu_work);
krcp = krwp->krcp;
- spin_lock_irqsave(&krcp->lock, flags);
- head = krwp->head_free;
- krwp->head_free = NULL;
- bhead = krwp->bhead_free;
- krwp->bhead_free = NULL;
- spin_unlock_irqrestore(&krcp->lock, flags);
-
- /* "bhead" is now private, so traverse locklessly. */
- for (; bhead; bhead = bnext) {
- bnext = bhead->next;
- debug_rcu_head_unqueue_bulk(bhead->head_free_debug);
+ raw_spin_lock_irqsave(&krcp->lock, flags);
+ // Channels 1 and 2.
+ for (i = 0; i < FREE_N_CHANNELS; i++) {
+ bkvhead[i] = krwp->bkvhead_free[i];
+ krwp->bkvhead_free[i] = NULL;
+ }
- rcu_lock_acquire(&rcu_callback_map);
- trace_rcu_invoke_kfree_bulk_callback(rcu_state.name,
- bhead->nr_records, bhead->records);
+ // Channel 3.
+ head = krwp->head_free;
+ krwp->head_free = NULL;
+ raw_spin_unlock_irqrestore(&krcp->lock, flags);
+
+ // Handle two first channels.
+ for (i = 0; i < FREE_N_CHANNELS; i++) {
+ for (; bkvhead[i]; bkvhead[i] = bnext) {
+ bnext = bkvhead[i]->next;
+ debug_rcu_bhead_unqueue(bkvhead[i]);
+
+ rcu_lock_acquire(&rcu_callback_map);
+ if (i == 0) { // kmalloc() / kfree().
+ trace_rcu_invoke_kfree_bulk_callback(
+ rcu_state.name, bkvhead[i]->nr_records,
+ bkvhead[i]->records);
+
+ kfree_bulk(bkvhead[i]->nr_records,
+ bkvhead[i]->records);
+ } else { // vmalloc() / vfree().
+ for (j = 0; j < bkvhead[i]->nr_records; j++) {
+ trace_rcu_invoke_kvfree_callback(
+ rcu_state.name,
+ bkvhead[i]->records[j], 0);
+
+ vfree(bkvhead[i]->records[j]);
+ }
+ }
+ rcu_lock_release(&rcu_callback_map);
- kfree_bulk(bhead->nr_records, bhead->records);
- rcu_lock_release(&rcu_callback_map);
+ krcp = krc_this_cpu_lock(&flags);
+ if (put_cached_bnode(krcp, bkvhead[i]))
+ bkvhead[i] = NULL;
+ krc_this_cpu_unlock(krcp, flags);
- if (cmpxchg(&krcp->bcached, NULL, bhead))
- free_page((unsigned long) bhead);
+ if (bkvhead[i])
+ free_page((unsigned long) bkvhead[i]);
- cond_resched_tasks_rcu_qs();
+ cond_resched_tasks_rcu_qs();
+ }
}
/*
@@ -3082,14 +3177,15 @@ static void kfree_rcu_work(struct work_struct *work)
*/
for (; head; head = next) {
unsigned long offset = (unsigned long)head->func;
+ void *ptr = (void *)head - offset;
next = head->next;
- debug_rcu_head_unqueue(head);
+ debug_rcu_head_unqueue((struct rcu_head *)ptr);
rcu_lock_acquire(&rcu_callback_map);
- trace_rcu_invoke_kfree_callback(rcu_state.name, head, offset);
+ trace_rcu_invoke_kvfree_callback(rcu_state.name, head, offset);
- if (!WARN_ON_ONCE(!__is_kfree_rcu_offset(offset)))
- kfree((void *)head - offset);
+ if (!WARN_ON_ONCE(!__is_kvfree_rcu_offset(offset)))
+ kvfree(ptr);
rcu_lock_release(&rcu_callback_map);
cond_resched_tasks_rcu_qs();
@@ -3105,8 +3201,8 @@ static void kfree_rcu_work(struct work_struct *work)
static inline bool queue_kfree_rcu_work(struct kfree_rcu_cpu *krcp)
{
struct kfree_rcu_cpu_work *krwp;
- bool queued = false;
- int i;
+ bool repeat = false;
+ int i, j;
lockdep_assert_held(&krcp->lock);
@@ -3114,21 +3210,25 @@ static inline bool queue_kfree_rcu_work(struct kfree_rcu_cpu *krcp)
krwp = &(krcp->krw_arr[i]);
/*
- * Try to detach bhead or head and attach it over any
+ * Try to detach bkvhead or head and attach it over any
* available corresponding free channel. It can be that
* a previous RCU batch is in progress, it means that
* immediately to queue another one is not possible so
* return false to tell caller to retry.
*/
- if ((krcp->bhead && !krwp->bhead_free) ||
+ if ((krcp->bkvhead[0] && !krwp->bkvhead_free[0]) ||
+ (krcp->bkvhead[1] && !krwp->bkvhead_free[1]) ||
(krcp->head && !krwp->head_free)) {
- /* Channel 1. */
- if (!krwp->bhead_free) {
- krwp->bhead_free = krcp->bhead;
- krcp->bhead = NULL;
+ // Channel 1 corresponds to SLAB ptrs.
+ // Channel 2 corresponds to vmalloc ptrs.
+ for (j = 0; j < FREE_N_CHANNELS; j++) {
+ if (!krwp->bkvhead_free[j]) {
+ krwp->bkvhead_free[j] = krcp->bkvhead[j];
+ krcp->bkvhead[j] = NULL;
+ }
}
- /* Channel 2. */
+ // Channel 3 corresponds to emergency path.
if (!krwp->head_free) {
krwp->head_free = krcp->head;
krcp->head = NULL;
@@ -3137,17 +3237,21 @@ static inline bool queue_kfree_rcu_work(struct kfree_rcu_cpu *krcp)
WRITE_ONCE(krcp->count, 0);
/*
- * One work is per one batch, so there are two "free channels",
- * "bhead_free" and "head_free" the batch can handle. It can be
- * that the work is in the pending state when two channels have
- * been detached following each other, one by one.
+ * One work is per one batch, so there are three
+ * "free channels", the batch can handle. It can
+ * be that the work is in the pending state when
+ * channels have been detached following by each
+ * other.
*/
queue_rcu_work(system_wq, &krwp->rcu_work);
- queued = true;
}
+
+ // Repeat if any "free" corresponding channel is still busy.
+ if (krcp->bkvhead[0] || krcp->bkvhead[1] || krcp->head)
+ repeat = true;
}
- return queued;
+ return !repeat;
}
static inline void kfree_rcu_drain_unlock(struct kfree_rcu_cpu *krcp,
@@ -3157,14 +3261,14 @@ static inline void kfree_rcu_drain_unlock(struct kfree_rcu_cpu *krcp,
krcp->monitor_todo = false;
if (queue_kfree_rcu_work(krcp)) {
// Success! Our job is done here.
- spin_unlock_irqrestore(&krcp->lock, flags);
+ raw_spin_unlock_irqrestore(&krcp->lock, flags);
return;
}
// Previous RCU batch still in progress, try again later.
krcp->monitor_todo = true;
schedule_delayed_work(&krcp->monitor_work, KFREE_DRAIN_JIFFIES);
- spin_unlock_irqrestore(&krcp->lock, flags);
+ raw_spin_unlock_irqrestore(&krcp->lock, flags);
}
/*
@@ -3177,32 +3281,50 @@ static void kfree_rcu_monitor(struct work_struct *work)
struct kfree_rcu_cpu *krcp = container_of(work, struct kfree_rcu_cpu,
monitor_work.work);
- spin_lock_irqsave(&krcp->lock, flags);
+ raw_spin_lock_irqsave(&krcp->lock, flags);
if (krcp->monitor_todo)
kfree_rcu_drain_unlock(krcp, flags);
else
- spin_unlock_irqrestore(&krcp->lock, flags);
+ raw_spin_unlock_irqrestore(&krcp->lock, flags);
}
static inline bool
-kfree_call_rcu_add_ptr_to_bulk(struct kfree_rcu_cpu *krcp,
- struct rcu_head *head, rcu_callback_t func)
+kvfree_call_rcu_add_ptr_to_bulk(struct kfree_rcu_cpu *krcp, void *ptr)
{
- struct kfree_rcu_bulk_data *bnode;
+ struct kvfree_rcu_bulk_data *bnode;
+ int idx;
if (unlikely(!krcp->initialized))
return false;
lockdep_assert_held(&krcp->lock);
+ idx = !!is_vmalloc_addr(ptr);
/* Check if a new block is required. */
- if (!krcp->bhead ||
- krcp->bhead->nr_records == KFREE_BULK_MAX_ENTR) {
- bnode = xchg(&krcp->bcached, NULL);
+ if (!krcp->bkvhead[idx] ||
+ krcp->bkvhead[idx]->nr_records == KVFREE_BULK_MAX_ENTR) {
+ bnode = get_cached_bnode(krcp);
if (!bnode) {
- WARN_ON_ONCE(sizeof(struct kfree_rcu_bulk_data) > PAGE_SIZE);
+ /*
+ * To keep this path working on raw non-preemptible
+ * sections, prevent the optional entry into the
+ * allocator as it uses sleeping locks. In fact, even
+ * if the caller of kfree_rcu() is preemptible, this
+ * path still is not, as krcp->lock is a raw spinlock.
+ * With additional page pre-allocation in the works,
+ * hitting this return is going to be much less likely.
+ */
+ if (IS_ENABLED(CONFIG_PREEMPT_RT))
+ return false;
- bnode = (struct kfree_rcu_bulk_data *)
+ /*
+ * NOTE: For one argument of kvfree_rcu() we can
+ * drop the lock and get the page in sleepable
+ * context. That would allow to maintain an array
+ * for the CONFIG_PREEMPT_RT as well if no cached
+ * pages are available.
+ */
+ bnode = (struct kvfree_rcu_bulk_data *)
__get_free_page(GFP_NOWAIT | __GFP_NOWARN);
}
@@ -3212,53 +3334,62 @@ kfree_call_rcu_add_ptr_to_bulk(struct kfree_rcu_cpu *krcp,
/* Initialize the new block. */
bnode->nr_records = 0;
- bnode->next = krcp->bhead;
- bnode->head_free_debug = NULL;
+ bnode->next = krcp->bkvhead[idx];
/* Attach it to the head. */
- krcp->bhead = bnode;
+ krcp->bkvhead[idx] = bnode;
}
-#ifdef CONFIG_DEBUG_OBJECTS_RCU_HEAD
- head->func = func;
- head->next = krcp->bhead->head_free_debug;
- krcp->bhead->head_free_debug = head;
-#endif
-
/* Finally insert. */
- krcp->bhead->records[krcp->bhead->nr_records++] =
- (void *) head - (unsigned long) func;
+ krcp->bkvhead[idx]->records
+ [krcp->bkvhead[idx]->nr_records++] = ptr;
return true;
}
/*
- * Queue a request for lazy invocation of kfree_bulk()/kfree() after a grace
- * period. Please note there are two paths are maintained, one is the main one
- * that uses kfree_bulk() interface and second one is emergency one, that is
- * used only when the main path can not be maintained temporary, due to memory
- * pressure.
+ * Queue a request for lazy invocation of appropriate free routine after a
+ * grace period. Please note there are three paths are maintained, two are the
+ * main ones that use array of pointers interface and third one is emergency
+ * one, that is used only when the main path can not be maintained temporary,
+ * due to memory pressure.
*
- * Each kfree_call_rcu() request is added to a batch. The batch will be drained
+ * Each kvfree_call_rcu() request is added to a batch. The batch will be drained
* every KFREE_DRAIN_JIFFIES number of jiffies. All the objects in the batch will
* be free'd in workqueue context. This allows us to: batch requests together to
- * reduce the number of grace periods during heavy kfree_rcu() load.
+ * reduce the number of grace periods during heavy kfree_rcu()/kvfree_rcu() load.
*/
-void kfree_call_rcu(struct rcu_head *head, rcu_callback_t func)
+void kvfree_call_rcu(struct rcu_head *head, rcu_callback_t func)
{
unsigned long flags;
struct kfree_rcu_cpu *krcp;
+ bool success;
+ void *ptr;
- local_irq_save(flags); // For safely calling this_cpu_ptr().
- krcp = this_cpu_ptr(&krc);
- if (krcp->initialized)
- spin_lock(&krcp->lock);
+ if (head) {
+ ptr = (void *) head - (unsigned long) func;
+ } else {
+ /*
+ * Please note there is a limitation for the head-less
+ * variant, that is why there is a clear rule for such
+ * objects: it can be used from might_sleep() context
+ * only. For other places please embed an rcu_head to
+ * your data.
+ */
+ might_sleep();
+ ptr = (unsigned long *) func;
+ }
+
+ krcp = krc_this_cpu_lock(&flags);
// Queue the object but don't yet schedule the batch.
- if (debug_rcu_head_queue(head)) {
+ if (debug_rcu_head_queue(ptr)) {
// Probable double kfree_rcu(), just leak.
WARN_ONCE(1, "%s(): Double-freed call. rcu_head %p\n",
__func__, head);
+
+ // Mark as success and leave.
+ success = true;
goto unlock_return;
}
@@ -3266,10 +3397,16 @@ void kfree_call_rcu(struct rcu_head *head, rcu_callback_t func)
* Under high memory pressure GFP_NOWAIT can fail,
* in that case the emergency path is maintained.
*/
- if (unlikely(!kfree_call_rcu_add_ptr_to_bulk(krcp, head, func))) {
+ success = kvfree_call_rcu_add_ptr_to_bulk(krcp, ptr);
+ if (!success) {
+ if (head == NULL)
+ // Inline if kvfree_rcu(one_arg) call.
+ goto unlock_return;
+
head->func = func;
head->next = krcp->head;
krcp->head = head;
+ success = true;
}
WRITE_ONCE(krcp->count, krcp->count + 1);
@@ -3282,11 +3419,20 @@ void kfree_call_rcu(struct rcu_head *head, rcu_callback_t func)
}
unlock_return:
- if (krcp->initialized)
- spin_unlock(&krcp->lock);
- local_irq_restore(flags);
+ krc_this_cpu_unlock(krcp, flags);
+
+ /*
+ * Inline kvfree() after synchronize_rcu(). We can do
+ * it from might_sleep() context only, so the current
+ * CPU can pass the QS state.
+ */
+ if (!success) {
+ debug_rcu_head_unqueue((struct rcu_head *) ptr);
+ synchronize_rcu();
+ kvfree(ptr);
+ }
}
-EXPORT_SYMBOL_GPL(kfree_call_rcu);
+EXPORT_SYMBOL_GPL(kvfree_call_rcu);
static unsigned long
kfree_rcu_shrink_count(struct shrinker *shrink, struct shrink_control *sc)
@@ -3315,11 +3461,11 @@ kfree_rcu_shrink_scan(struct shrinker *shrink, struct shrink_control *sc)
struct kfree_rcu_cpu *krcp = per_cpu_ptr(&krc, cpu);
count = krcp->count;
- spin_lock_irqsave(&krcp->lock, flags);
+ raw_spin_lock_irqsave(&krcp->lock, flags);
if (krcp->monitor_todo)
kfree_rcu_drain_unlock(krcp, flags);
else
- spin_unlock_irqrestore(&krcp->lock, flags);
+ raw_spin_unlock_irqrestore(&krcp->lock, flags);
sc->nr_to_scan -= count;
freed += count;
@@ -3328,7 +3474,7 @@ kfree_rcu_shrink_scan(struct shrinker *shrink, struct shrink_control *sc)
break;
}
- return freed;
+ return freed == 0 ? SHRINK_STOP : freed;
}
static struct shrinker kfree_rcu_shrinker = {
@@ -3346,15 +3492,15 @@ void __init kfree_rcu_scheduler_running(void)
for_each_online_cpu(cpu) {
struct kfree_rcu_cpu *krcp = per_cpu_ptr(&krc, cpu);
- spin_lock_irqsave(&krcp->lock, flags);
+ raw_spin_lock_irqsave(&krcp->lock, flags);
if (!krcp->head || krcp->monitor_todo) {
- spin_unlock_irqrestore(&krcp->lock, flags);
+ raw_spin_unlock_irqrestore(&krcp->lock, flags);
continue;
}
krcp->monitor_todo = true;
schedule_delayed_work_on(cpu, &krcp->monitor_work,
KFREE_DRAIN_JIFFIES);
- spin_unlock_irqrestore(&krcp->lock, flags);
+ raw_spin_unlock_irqrestore(&krcp->lock, flags);
}
}
@@ -3842,10 +3988,9 @@ void rcu_cpu_starting(unsigned int cpu)
{
unsigned long flags;
unsigned long mask;
- int nbits;
- unsigned long oldmask;
struct rcu_data *rdp;
struct rcu_node *rnp;
+ bool newcpu;
if (per_cpu(rcu_cpu_started, cpu))
return;
@@ -3857,12 +4002,10 @@ void rcu_cpu_starting(unsigned int cpu)
mask = rdp->grpmask;
raw_spin_lock_irqsave_rcu_node(rnp, flags);
WRITE_ONCE(rnp->qsmaskinitnext, rnp->qsmaskinitnext | mask);
- oldmask = rnp->expmaskinitnext;
+ newcpu = !(rnp->expmaskinitnext & mask);
rnp->expmaskinitnext |= mask;
- oldmask ^= rnp->expmaskinitnext;
- nbits = bitmap_weight(&oldmask, BITS_PER_LONG);
/* Allow lockless access for expedited grace periods. */
- smp_store_release(&rcu_state.ncpus, rcu_state.ncpus + nbits); /* ^^^ */
+ smp_store_release(&rcu_state.ncpus, rcu_state.ncpus + newcpu); /* ^^^ */
ASSERT_EXCLUSIVE_WRITER(rcu_state.ncpus);
rcu_gpnum_ovf(rnp, rdp); /* Offline-induced counter wrap? */
rdp->rcu_onl_gp_seq = READ_ONCE(rcu_state.gp_seq);
@@ -4249,13 +4392,23 @@ static void __init kfree_rcu_batch_init(void)
for_each_possible_cpu(cpu) {
struct kfree_rcu_cpu *krcp = per_cpu_ptr(&krc, cpu);
+ struct kvfree_rcu_bulk_data *bnode;
- spin_lock_init(&krcp->lock);
for (i = 0; i < KFREE_N_BATCHES; i++) {
INIT_RCU_WORK(&krcp->krw_arr[i].rcu_work, kfree_rcu_work);
krcp->krw_arr[i].krcp = krcp;
}
+ for (i = 0; i < rcu_min_cached_objs; i++) {
+ bnode = (struct kvfree_rcu_bulk_data *)
+ __get_free_page(GFP_NOWAIT | __GFP_NOWARN);
+
+ if (bnode)
+ put_cached_bnode(krcp, bnode);
+ else
+ pr_err("Failed to preallocate for %d CPU!\n", cpu);
+ }
+
INIT_DELAYED_WORK(&krcp->monitor_work, kfree_rcu_monitor);
krcp->initialized = true;
}
diff --git a/kernel/rcu/tree.h b/kernel/rcu/tree.h
index 43991a40b084..c96ae351688b 100644
--- a/kernel/rcu/tree.h
+++ b/kernel/rcu/tree.h
@@ -41,7 +41,7 @@ struct rcu_node {
raw_spinlock_t __private lock; /* Root rcu_node's lock protects */
/* some rcu_state fields as well as */
/* following. */
- unsigned long gp_seq; /* Track rsp->rcu_gp_seq. */
+ unsigned long gp_seq; /* Track rsp->gp_seq. */
unsigned long gp_seq_needed; /* Track furthest future GP request. */
unsigned long completedqs; /* All QSes done for this node. */
unsigned long qsmask; /* CPUs or groups that need to switch in */
@@ -73,9 +73,9 @@ struct rcu_node {
unsigned long ffmask; /* Fully functional CPUs. */
unsigned long grpmask; /* Mask to apply to parent qsmask. */
/* Only one bit will be set in this mask. */
- int grplo; /* lowest-numbered CPU or group here. */
- int grphi; /* highest-numbered CPU or group here. */
- u8 grpnum; /* CPU/group number for next level up. */
+ int grplo; /* lowest-numbered CPU here. */
+ int grphi; /* highest-numbered CPU here. */
+ u8 grpnum; /* group number for next level up. */
u8 level; /* root is at level 0. */
bool wait_blkd_tasks;/* Necessary to wait for blocked tasks to */
/* exit RCU read-side critical sections */
@@ -149,7 +149,7 @@ union rcu_noqs {
/* Per-CPU data for read-copy update. */
struct rcu_data {
/* 1) quiescent-state and grace-period handling : */
- unsigned long gp_seq; /* Track rsp->rcu_gp_seq counter. */
+ unsigned long gp_seq; /* Track rsp->gp_seq counter. */
unsigned long gp_seq_needed; /* Track furthest future GP request. */
union rcu_noqs cpu_no_qs; /* No QSes yet for this CPU. */
bool core_needs_qs; /* Core waits for quiesc state. */
@@ -171,6 +171,7 @@ struct rcu_data {
/* different grace periods. */
long qlen_last_fqs_check;
/* qlen at last check for QS forcing */
+ unsigned long n_cbs_invoked; /* # callbacks invoked since boot. */
unsigned long n_force_qs_snap;
/* did other CPU force QS recently? */
long blimit; /* Upper limit on a processed batch */
@@ -301,6 +302,8 @@ struct rcu_state {
u8 boost ____cacheline_internodealigned_in_smp;
/* Subject to priority boost. */
unsigned long gp_seq; /* Grace-period sequence #. */
+ unsigned long gp_max; /* Maximum GP duration in */
+ /* jiffies. */
struct task_struct *gp_kthread; /* Task for grace periods. */
struct swait_queue_head gp_wq; /* Where GP task waits. */
short gp_flags; /* Commands for GP task. */
@@ -346,8 +349,6 @@ struct rcu_state {
/* a reluctant CPU. */
unsigned long n_force_qs_gpstart; /* Snapshot of n_force_qs at */
/* GP start. */
- unsigned long gp_max; /* Maximum GP duration in */
- /* jiffies. */
const char *name; /* Name of structure. */
char abbr; /* Abbreviated name. */
diff --git a/kernel/rcu/tree_exp.h b/kernel/rcu/tree_exp.h
index 72952edad1e4..1888c0eb1216 100644
--- a/kernel/rcu/tree_exp.h
+++ b/kernel/rcu/tree_exp.h
@@ -403,7 +403,7 @@ retry_ipi:
/* Online, so delay for a bit and try again. */
raw_spin_unlock_irqrestore_rcu_node(rnp, flags);
trace_rcu_exp_grace_period(rcu_state.name, rcu_exp_gp_seq_endval(), TPS("selectofl"));
- schedule_timeout_uninterruptible(1);
+ schedule_timeout_idle(1);
goto retry_ipi;
}
/* CPU really is offline, so we must report its QS. */
diff --git a/kernel/rcu/tree_plugin.h b/kernel/rcu/tree_plugin.h
index 352223664ebd..982fc5be5269 100644
--- a/kernel/rcu/tree_plugin.h
+++ b/kernel/rcu/tree_plugin.h
@@ -1033,7 +1033,7 @@ static int rcu_boost_kthread(void *arg)
if (spincnt > 10) {
WRITE_ONCE(rnp->boost_kthread_status, RCU_KTHREAD_YIELDING);
trace_rcu_utilization(TPS("End boost kthread@rcu_yield"));
- schedule_timeout_interruptible(2);
+ schedule_timeout_idle(2);
trace_rcu_utilization(TPS("Start boost kthread@rcu_yield"));
spincnt = 0;
}
@@ -2005,7 +2005,7 @@ static void nocb_gp_wait(struct rcu_data *my_rdp)
/* Polling, so trace if first poll in the series. */
if (gotcbs)
trace_rcu_nocb_wake(rcu_state.name, cpu, TPS("Poll"));
- schedule_timeout_interruptible(1);
+ schedule_timeout_idle(1);
} else if (!needwait_gp) {
/* Wait for callbacks to appear. */
trace_rcu_nocb_wake(rcu_state.name, cpu, TPS("Sleep"));
diff --git a/kernel/rcu/tree_stall.h b/kernel/rcu/tree_stall.h
index b04256cd7e12..b5d3b4794db4 100644
--- a/kernel/rcu/tree_stall.h
+++ b/kernel/rcu/tree_stall.h
@@ -237,14 +237,12 @@ struct rcu_stall_chk_rdr {
*/
static bool check_slow_task(struct task_struct *t, void *arg)
{
- struct rcu_node *rnp;
struct rcu_stall_chk_rdr *rscrp = arg;
if (task_curr(t))
return false; // It is running, so decline to inspect it.
rscrp->nesting = t->rcu_read_lock_nesting;
rscrp->rs = t->rcu_read_unlock_special;
- rnp = t->rcu_blocked_node;
rscrp->on_blkd_list = !list_empty(&t->rcu_node_entry);
return true;
}
@@ -649,6 +647,7 @@ static void check_cpu_stall(struct rcu_data *rdp)
*/
void show_rcu_gp_kthreads(void)
{
+ unsigned long cbs = 0;
int cpu;
unsigned long j;
unsigned long ja;
@@ -690,9 +689,11 @@ void show_rcu_gp_kthreads(void)
}
for_each_possible_cpu(cpu) {
rdp = per_cpu_ptr(&rcu_data, cpu);
+ cbs += data_race(rdp->n_cbs_invoked);
if (rcu_segcblist_is_offloaded(&rdp->cblist))
show_rcu_nocb_state(rdp);
}
+ pr_info("RCU callbacks invoked since boot: %lu\n", cbs);
show_rcu_tasks_gp_kthreads();
}
EXPORT_SYMBOL_GPL(show_rcu_gp_kthreads);
diff --git a/kernel/rcu/update.c b/kernel/rcu/update.c
index 84843adfd939..2de49b5d8dd2 100644
--- a/kernel/rcu/update.c
+++ b/kernel/rcu/update.c
@@ -42,6 +42,7 @@
#include <linux/kprobes.h>
#include <linux/slab.h>
#include <linux/irq_work.h>
+#include <linux/rcupdate_trace.h>
#define CREATE_TRACE_POINTS
@@ -207,7 +208,7 @@ void rcu_end_inkernel_boot(void)
rcu_unexpedite_gp();
if (rcu_normal_after_boot)
WRITE_ONCE(rcu_normal, 1);
- rcu_boot_ended = 1;
+ rcu_boot_ended = true;
}
/*
@@ -279,6 +280,7 @@ struct lockdep_map rcu_sched_lock_map = {
};
EXPORT_SYMBOL_GPL(rcu_sched_lock_map);
+// Tell lockdep when RCU callbacks are being invoked.
static struct lock_class_key rcu_callback_key;
struct lockdep_map rcu_callback_map =
STATIC_LOCKDEP_MAP_INIT("rcu_callback", &rcu_callback_key);
@@ -390,13 +392,14 @@ void __wait_rcu_gp(bool checktiny, int n, call_rcu_func_t *crcu_array,
might_sleep();
continue;
}
- init_rcu_head_on_stack(&rs_array[i].head);
- init_completion(&rs_array[i].completion);
for (j = 0; j < i; j++)
if (crcu_array[j] == crcu_array[i])
break;
- if (j == i)
+ if (j == i) {
+ init_rcu_head_on_stack(&rs_array[i].head);
+ init_completion(&rs_array[i].completion);
(crcu_array[i])(&rs_array[i].head, wakeme_after_rcu);
+ }
}
/* Wait for all callbacks to be invoked. */
@@ -407,9 +410,10 @@ void __wait_rcu_gp(bool checktiny, int n, call_rcu_func_t *crcu_array,
for (j = 0; j < i; j++)
if (crcu_array[j] == crcu_array[i])
break;
- if (j == i)
+ if (j == i) {
wait_for_completion(&rs_array[i].completion);
- destroy_rcu_head_on_stack(&rs_array[i].head);
+ destroy_rcu_head_on_stack(&rs_array[i].head);
+ }
}
}
EXPORT_SYMBOL_GPL(__wait_rcu_gp);
diff --git a/kernel/time/tick-sched.c b/kernel/time/tick-sched.c
index 3e2dc9b8858c..f0199a4ba1ad 100644
--- a/kernel/time/tick-sched.c
+++ b/kernel/time/tick-sched.c
@@ -351,16 +351,24 @@ void tick_nohz_dep_clear_cpu(int cpu, enum tick_dep_bits bit)
EXPORT_SYMBOL_GPL(tick_nohz_dep_clear_cpu);
/*
- * Set a per-task tick dependency. Posix CPU timers need this in order to elapse
- * per task timers.
+ * Set a per-task tick dependency. RCU need this. Also posix CPU timers
+ * in order to elapse per task timers.
*/
void tick_nohz_dep_set_task(struct task_struct *tsk, enum tick_dep_bits bit)
{
- /*
- * We could optimize this with just kicking the target running the task
- * if that noise matters for nohz full users.
- */
- tick_nohz_dep_set_all(&tsk->tick_dep_mask, bit);
+ if (!atomic_fetch_or(BIT(bit), &tsk->tick_dep_mask)) {
+ if (tsk == current) {
+ preempt_disable();
+ tick_nohz_full_kick();
+ preempt_enable();
+ } else {
+ /*
+ * Some future tick_nohz_full_kick_task()
+ * should optimize this.
+ */
+ tick_nohz_full_kick_all();
+ }
+ }
}
EXPORT_SYMBOL_GPL(tick_nohz_dep_set_task);
diff --git a/kernel/torture.c b/kernel/torture.c
index a1a41484ff6d..1061492f14bd 100644
--- a/kernel/torture.c
+++ b/kernel/torture.c
@@ -45,6 +45,9 @@ MODULE_AUTHOR("Paul E. McKenney <paulmck@linux.ibm.com>");
static bool disable_onoff_at_boot;
module_param(disable_onoff_at_boot, bool, 0444);
+static bool ftrace_dump_at_shutdown;
+module_param(ftrace_dump_at_shutdown, bool, 0444);
+
static char *torture_type;
static int verbose;
@@ -527,7 +530,8 @@ static int torture_shutdown(void *arg)
torture_shutdown_hook();
else
VERBOSE_TOROUT_STRING("No torture_shutdown_hook(), skipping.");
- rcu_ftrace_dump(DUMP_ALL);
+ if (ftrace_dump_at_shutdown)
+ rcu_ftrace_dump(DUMP_ALL);
kernel_power_off(); /* Shut down the system. */
return 0;
}
diff --git a/lib/test_vmalloc.c b/lib/test_vmalloc.c
index ddc9685702b1..5cf2fe9aab9e 100644
--- a/lib/test_vmalloc.c
+++ b/lib/test_vmalloc.c
@@ -15,6 +15,8 @@
#include <linux/delay.h>
#include <linux/rwsem.h>
#include <linux/mm.h>
+#include <linux/rcupdate.h>
+#include <linux/slab.h>
#define __param(type, name, init, msg) \
static type name = init; \
@@ -35,14 +37,18 @@ __param(int, test_loop_count, 1000000,
__param(int, run_test_mask, INT_MAX,
"Set tests specified in the mask.\n\n"
- "\t\tid: 1, name: fix_size_alloc_test\n"
- "\t\tid: 2, name: full_fit_alloc_test\n"
- "\t\tid: 4, name: long_busy_list_alloc_test\n"
- "\t\tid: 8, name: random_size_alloc_test\n"
- "\t\tid: 16, name: fix_align_alloc_test\n"
- "\t\tid: 32, name: random_size_align_alloc_test\n"
- "\t\tid: 64, name: align_shift_alloc_test\n"
- "\t\tid: 128, name: pcpu_alloc_test\n"
+ "\t\tid: 1, name: fix_size_alloc_test\n"
+ "\t\tid: 2, name: full_fit_alloc_test\n"
+ "\t\tid: 4, name: long_busy_list_alloc_test\n"
+ "\t\tid: 8, name: random_size_alloc_test\n"
+ "\t\tid: 16, name: fix_align_alloc_test\n"
+ "\t\tid: 32, name: random_size_align_alloc_test\n"
+ "\t\tid: 64, name: align_shift_alloc_test\n"
+ "\t\tid: 128, name: pcpu_alloc_test\n"
+ "\t\tid: 256, name: kvfree_rcu_1_arg_vmalloc_test\n"
+ "\t\tid: 512, name: kvfree_rcu_2_arg_vmalloc_test\n"
+ "\t\tid: 1024, name: kvfree_rcu_1_arg_slab_test\n"
+ "\t\tid: 2048, name: kvfree_rcu_2_arg_slab_test\n"
/* Add a new test case description here. */
);
@@ -316,6 +322,83 @@ pcpu_alloc_test(void)
return rv;
}
+struct test_kvfree_rcu {
+ struct rcu_head rcu;
+ unsigned char array[20];
+};
+
+static int
+kvfree_rcu_1_arg_vmalloc_test(void)
+{
+ struct test_kvfree_rcu *p;
+ int i;
+
+ for (i = 0; i < test_loop_count; i++) {
+ p = vmalloc(1 * PAGE_SIZE);
+ if (!p)
+ return -1;
+
+ p->array[0] = 'a';
+ kvfree_rcu(p);
+ }
+
+ return 0;
+}
+
+static int
+kvfree_rcu_2_arg_vmalloc_test(void)
+{
+ struct test_kvfree_rcu *p;
+ int i;
+
+ for (i = 0; i < test_loop_count; i++) {
+ p = vmalloc(1 * PAGE_SIZE);
+ if (!p)
+ return -1;
+
+ p->array[0] = 'a';
+ kvfree_rcu(p, rcu);
+ }
+
+ return 0;
+}
+
+static int
+kvfree_rcu_1_arg_slab_test(void)
+{
+ struct test_kvfree_rcu *p;
+ int i;
+
+ for (i = 0; i < test_loop_count; i++) {
+ p = kmalloc(sizeof(*p), GFP_KERNEL);
+ if (!p)
+ return -1;
+
+ p->array[0] = 'a';
+ kvfree_rcu(p);
+ }
+
+ return 0;
+}
+
+static int
+kvfree_rcu_2_arg_slab_test(void)
+{
+ struct test_kvfree_rcu *p;
+ int i;
+
+ for (i = 0; i < test_loop_count; i++) {
+ p = kmalloc(sizeof(*p), GFP_KERNEL);
+ if (!p)
+ return -1;
+
+ p->array[0] = 'a';
+ kvfree_rcu(p, rcu);
+ }
+
+ return 0;
+}
+
struct test_case_desc {
const char *test_name;
int (*test_func)(void);
@@ -330,6 +413,10 @@ static struct test_case_desc test_case_array[] = {
{ "random_size_align_alloc_test", random_size_align_alloc_test },
{ "align_shift_alloc_test", align_shift_alloc_test },
{ "pcpu_alloc_test", pcpu_alloc_test },
+ { "kvfree_rcu_1_arg_vmalloc_test", kvfree_rcu_1_arg_vmalloc_test },
+ { "kvfree_rcu_2_arg_vmalloc_test", kvfree_rcu_2_arg_vmalloc_test },
+ { "kvfree_rcu_1_arg_slab_test", kvfree_rcu_1_arg_slab_test },
+ { "kvfree_rcu_2_arg_slab_test", kvfree_rcu_2_arg_slab_test },
/* Add a new test case here. */
};
diff --git a/mm/list_lru.c b/mm/list_lru.c
index 9222910ab1cb..e825804b3928 100644
--- a/mm/list_lru.c
+++ b/mm/list_lru.c
@@ -373,14 +373,14 @@ static void memcg_destroy_list_lru_node(struct list_lru_node *nlru)
struct list_lru_memcg *memcg_lrus;
/*
* This is called when shrinker has already been unregistered,
- * and nobody can use it. So, there is no need to use kvfree_rcu().
+ * and nobody can use it. So, there is no need to use kvfree_rcu_local().
*/
memcg_lrus = rcu_dereference_protected(nlru->memcg_lrus, true);
__memcg_destroy_list_lru_node(memcg_lrus, 0, memcg_nr_cache_ids);
kvfree(memcg_lrus);
}
-static void kvfree_rcu(struct rcu_head *head)
+static void kvfree_rcu_local(struct rcu_head *head)
{
struct list_lru_memcg *mlru;
@@ -419,7 +419,7 @@ static int memcg_update_list_lru_node(struct list_lru_node *nlru,
rcu_assign_pointer(nlru->memcg_lrus, new);
spin_unlock_irq(&nlru->lock);
- call_rcu(&old->rcu, kvfree_rcu);
+ call_rcu(&old->rcu, kvfree_rcu_local);
return 0;
}
diff --git a/mm/mmap.c b/mm/mmap.c
index 59a4682ebf3f..972f839c6ec8 100644
--- a/mm/mmap.c
+++ b/mm/mmap.c
@@ -3159,6 +3159,7 @@ void exit_mmap(struct mm_struct *mm)
if (vma->vm_flags & VM_ACCOUNT)
nr_accounted += vma_pages(vma);
vma = remove_vma(vma);
+ cond_resched();
}
vm_unacct_memory(nr_accounted);
}
diff --git a/tools/testing/selftests/rcutorture/bin/configinit.sh b/tools/testing/selftests/rcutorture/bin/configinit.sh
index 93e80a42249a..d6e5ce084b1c 100755
--- a/tools/testing/selftests/rcutorture/bin/configinit.sh
+++ b/tools/testing/selftests/rcutorture/bin/configinit.sh
@@ -32,11 +32,11 @@ if test -z "$TORTURE_TRUST_MAKE"
then
make clean > $resdir/Make.clean 2>&1
fi
-make $TORTURE_DEFCONFIG > $resdir/Make.defconfig.out 2>&1
+make $TORTURE_KMAKE_ARG $TORTURE_DEFCONFIG > $resdir/Make.defconfig.out 2>&1
mv .config .config.sav
sh $T/upd.sh < .config.sav > .config
cp .config .config.new
-yes '' | make oldconfig > $resdir/Make.oldconfig.out 2> $resdir/Make.oldconfig.err
+yes '' | make $TORTURE_KMAKE_ARG oldconfig > $resdir/Make.oldconfig.out 2> $resdir/Make.oldconfig.err
# verify new config matches specification.
configcheck.sh .config $c
diff --git a/tools/testing/selftests/rcutorture/bin/console-badness.sh b/tools/testing/selftests/rcutorture/bin/console-badness.sh
new file mode 100755
index 000000000000..0e4c0b2eb7f0
--- /dev/null
+++ b/tools/testing/selftests/rcutorture/bin/console-badness.sh
@@ -0,0 +1,16 @@
+#!/bin/bash
+# SPDX-License-Identifier: GPL-2.0+
+#
+# Scan standard input for error messages, dumping any found to standard
+# output.
+#
+# Usage: console-badness.sh
+#
+# Copyright (C) 2020 Facebook, Inc.
+#
+# Authors: Paul E. McKenney <paulmck@kernel.org>
+
+egrep 'Badness|WARNING:|Warn|BUG|===========|Call Trace:|Oops:|detected stalls on CPUs/tasks:|self-detected stall on CPU|Stall ended before state dump start|\?\?\? Writer stall state|rcu_.*kthread starved for|!!!' |
+grep -v 'ODEBUG: ' |
+grep -v 'This means that this is a DEBUG kernel and it is' |
+grep -v 'Warning: unable to open an initial console'
diff --git a/tools/testing/selftests/rcutorture/bin/functions.sh b/tools/testing/selftests/rcutorture/bin/functions.sh
index 12810229fddc..51f3464b96d3 100644
--- a/tools/testing/selftests/rcutorture/bin/functions.sh
+++ b/tools/testing/selftests/rcutorture/bin/functions.sh
@@ -215,9 +215,6 @@ identify_qemu_args () {
then
echo -device spapr-vlan,netdev=net0,mac=$TORTURE_QEMU_MAC
echo -netdev bridge,br=br0,id=net0
- elif test -n "$TORTURE_QEMU_INTERACTIVE"
- then
- echo -net nic -net user
fi
;;
esac
@@ -234,7 +231,7 @@ identify_qemu_args () {
# Returns the number of virtual CPUs available to the aggregate of the
# guest OSes.
identify_qemu_vcpus () {
- lscpu | grep '^CPU(s):' | sed -e 's/CPU(s)://'
+ lscpu | grep '^CPU(s):' | sed -e 's/CPU(s)://' -e 's/[ ]*//g'
}
# print_bug
@@ -275,3 +272,21 @@ specify_qemu_cpus () {
esac
fi
}
+
+# specify_qemu_net qemu-args
+#
+# Appends a string containing "-net none" to qemu-args, unless the incoming
+# qemu-args already contains "-smp" or unless the TORTURE_QEMU_INTERACTIVE
+# environment variable is set, in which case the string that is be added is
+# instead "-net nic -net user".
+specify_qemu_net () {
+ if echo $1 | grep -q -e -net
+ then
+ echo $1
+ elif test -n "$TORTURE_QEMU_INTERACTIVE"
+ then
+ echo $1 -net nic -net user
+ else
+ echo $1 -net none
+ fi
+}
diff --git a/tools/testing/selftests/rcutorture/bin/jitter.sh b/tools/testing/selftests/rcutorture/bin/jitter.sh
index 30cb5b27d32e..188b864bc4bf 100755
--- a/tools/testing/selftests/rcutorture/bin/jitter.sh
+++ b/tools/testing/selftests/rcutorture/bin/jitter.sh
@@ -46,6 +46,12 @@ do
exit 0;
fi
+ # Check for stop request.
+ if test -f "$TORTURE_STOPFILE"
+ then
+ exit 1;
+ fi
+
# Set affinity to randomly selected online CPU
if cpus=`grep 1 /sys/devices/system/cpu/*/online 2>&1 |
sed -e 's,/[^/]*$,,' -e 's/^[^0-9]*//'`
diff --git a/tools/testing/selftests/rcutorture/bin/kvm-build.sh b/tools/testing/selftests/rcutorture/bin/kvm-build.sh
index 18d6518504ee..115e1822b26f 100755
--- a/tools/testing/selftests/rcutorture/bin/kvm-build.sh
+++ b/tools/testing/selftests/rcutorture/bin/kvm-build.sh
@@ -9,6 +9,12 @@
#
# Authors: Paul E. McKenney <paulmck@linux.ibm.com>
+if test -f "$TORTURE_STOPFILE"
+then
+ echo "kvm-build.sh early exit due to run STOP request"
+ exit 1
+fi
+
config_template=${1}
if test -z "$config_template" -o ! -f "$config_template" -o ! -r "$config_template"
then
diff --git a/tools/testing/selftests/rcutorture/bin/kvm-check-branches.sh b/tools/testing/selftests/rcutorture/bin/kvm-check-branches.sh
new file mode 100755
index 000000000000..6e65c134e5f1
--- /dev/null
+++ b/tools/testing/selftests/rcutorture/bin/kvm-check-branches.sh
@@ -0,0 +1,108 @@
+#!/bin/sh
+# SPDX-License-Identifier: GPL-2.0+
+#
+# Run a group of kvm.sh tests on the specified commits. This currently
+# unconditionally does three-minute runs on each scenario in CFLIST,
+# taking advantage of all available CPUs and trusting the "make" utility.
+# In the short term, adjustments can be made by editing this script and
+# CFLIST. If some adjustments appear to have ongoing value, this script
+# might grow some command-line arguments.
+#
+# Usage: kvm-check-branches.sh commit1 commit2..commit3 commit4 ...
+#
+# This script considers its arguments one at a time. If more elaborate
+# specification of commits is needed, please use "git rev-list" to
+# produce something that this simple script can understand. The reason
+# for retaining the simplicity is that it allows the user to more easily
+# see which commit came from which branch.
+#
+# This script creates a yyyy.mm.dd-hh.mm.ss-group entry in the "res"
+# directory. The calls to kvm.sh create the usual entries, but this script
+# moves them under the yyyy.mm.dd-hh.mm.ss-group entry, each in its own
+# directory numbered in run order, that is, "0001", "0002", and so on.
+# For successful runs, the large build artifacts are removed. Doing this
+# reduces the disk space required by about two orders of magnitude for
+# successful runs.
+#
+# Copyright (C) Facebook, 2020
+#
+# Authors: Paul E. McKenney <paulmck@kernel.org>
+
+if ! git status > /dev/null 2>&1
+then
+ echo '!!!' This script needs to run in a git archive. 1>&2
+ echo '!!!' Giving up. 1>&2
+ exit 1
+fi
+
+# Remember where we started so that we can get back and the end.
+curcommit="`git status | head -1 | awk '{ print $NF }'`"
+
+nfail=0
+ntry=0
+resdir="tools/testing/selftests/rcutorture/res"
+ds="`date +%Y.%m.%d-%H.%M.%S`-group"
+if ! test -e $resdir
+then
+ mkdir $resdir || :
+fi
+mkdir $resdir/$ds
+echo Results directory: $resdir/$ds
+
+KVM="`pwd`/tools/testing/selftests/rcutorture"; export KVM
+PATH=${KVM}/bin:$PATH; export PATH
+. functions.sh
+cpus="`identify_qemu_vcpus`"
+echo Using up to $cpus CPUs.
+
+# Each pass through this loop does one command-line argument.
+for gitbr in $@
+do
+ echo ' --- git branch ' $gitbr
+
+ # Each pass through this loop tests one commit.
+ for i in `git rev-list "$gitbr"`
+ do
+ ntry=`expr $ntry + 1`
+ idir=`awk -v ntry="$ntry" 'END { printf "%04d", ntry; }' < /dev/null`
+ echo ' --- commit ' $i from branch $gitbr
+ date
+ mkdir $resdir/$ds/$idir
+ echo $gitbr > $resdir/$ds/$idir/gitbr
+ echo $i >> $resdir/$ds/$idir/gitbr
+
+ # Test the specified commit.
+ git checkout $i > $resdir/$ds/$idir/git-checkout.out 2>&1
+ echo git checkout return code: $? "(Commit $ntry: $i)"
+ kvm.sh --cpus $cpus --duration 3 --trust-make > $resdir/$ds/$idir/kvm.sh.out 2>&1
+ ret=$?
+ echo kvm.sh return code $ret for commit $i from branch $gitbr
+
+ # Move the build products to their resting place.
+ runresdir="`grep -m 1 '^Results directory:' < $resdir/$ds/$idir/kvm.sh.out | sed -e 's/^Results directory://'`"
+ mv $runresdir $resdir/$ds/$idir
+ rrd="`echo $runresdir | sed -e 's,^.*/,,'`"
+ echo Run results: $resdir/$ds/$idir/$rrd
+ if test "$ret" -ne 0
+ then
+ # Failure, so leave all evidence intact.
+ nfail=`expr $nfail + 1`
+ else
+ # Success, so remove large files to save about 1GB.
+ ( cd $resdir/$ds/$idir/$rrd; rm -f */vmlinux */bzImage */System.map */Module.symvers )
+ fi
+ done
+done
+date
+
+# Go back to the original commit.
+git checkout "$curcommit"
+
+if test $nfail -ne 0
+then
+ echo '!!! ' $nfail failures in $ntry 'runs!!!'
+ exit 1
+else
+ echo No failures in $ntry runs.
+ exit 0
+fi
diff --git a/tools/testing/selftests/rcutorture/bin/kvm-recheck-refscale.sh b/tools/testing/selftests/rcutorture/bin/kvm-recheck-refscale.sh
new file mode 100755
index 000000000000..35a463dddffe
--- /dev/null
+++ b/tools/testing/selftests/rcutorture/bin/kvm-recheck-refscale.sh
@@ -0,0 +1,71 @@
+#!/bin/bash
+# SPDX-License-Identifier: GPL-2.0+
+#
+# Analyze a given results directory for refscale performance measurements.
+#
+# Usage: kvm-recheck-refscale.sh resdir
+#
+# Copyright (C) IBM Corporation, 2016
+#
+# Authors: Paul E. McKenney <paulmck@linux.ibm.com>
+
+i="$1"
+if test -d "$i" -a -r "$i"
+then
+ :
+else
+ echo Unreadable results directory: $i
+ exit 1
+fi
+PATH=`pwd`/tools/testing/selftests/rcutorture/bin:$PATH; export PATH
+. functions.sh
+
+configfile=`echo $i | sed -e 's/^.*\///'`
+
+sed -e 's/^\[[^]]*]//' < $i/console.log | tr -d '\015' |
+awk -v configfile="$configfile" '
+/^[ ]*Runs Time\(ns\) *$/ {
+ if (dataphase + 0 == 0) {
+ dataphase = 1;
+ # print configfile, $0;
+ }
+ next;
+}
+
+/[^ ]*[0-9][0-9]* [0-9][0-9]*\.[0-9][0-9]*$/ {
+ if (dataphase == 1) {
+ # print $0;
+ readertimes[++n] = $2;
+ sum += $2;
+ }
+ next;
+}
+
+{
+ if (dataphase == 1)
+ dataphase == 2;
+ next;
+}
+
+END {
+ print configfile " results:";
+ newNR = asort(readertimes);
+ if (newNR <= 0) {
+ print "No refscale records found???"
+ exit;
+ }
+ medianidx = int(newNR / 2);
+ if (newNR == medianidx * 2)
+ medianvalue = (readertimes[medianidx - 1] + readertimes[medianidx]) / 2;
+ else
+ medianvalue = readertimes[medianidx];
+ points = "Points:";
+ for (i = 1; i <= newNR; i++)
+ points = points " " readertimes[i];
+ print points;
+ print "Average reader duration: " sum / newNR " nanoseconds";
+ print "Minimum reader duration: " readertimes[1];
+ print "Median reader duration: " medianvalue;
+ print "Maximum reader duration: " readertimes[newNR];
+ print "Computed from refscale printk output.";
+}'
diff --git a/tools/testing/selftests/rcutorture/bin/kvm-recheck.sh b/tools/testing/selftests/rcutorture/bin/kvm-recheck.sh
index 736f04749b90..840a4679a0d7 100755
--- a/tools/testing/selftests/rcutorture/bin/kvm-recheck.sh
+++ b/tools/testing/selftests/rcutorture/bin/kvm-recheck.sh
@@ -31,6 +31,7 @@ do
head -1 $resdir/log
fi
TORTURE_SUITE="`cat $i/../TORTURE_SUITE`"
+ configfile=`echo $i | sed -e 's,^.*/,,'`
rm -f $i/console.log.*.diags
kvm-recheck-${TORTURE_SUITE}.sh $i
if test -f "$i/qemu-retval" && test "`cat $i/qemu-retval`" -ne 0 && test "`cat $i/qemu-retval`" -ne 137
@@ -43,7 +44,8 @@ do
then
echo QEMU killed
fi
- configcheck.sh $i/.config $i/ConfigFragment
+ configcheck.sh $i/.config $i/ConfigFragment > $T 2>&1
+ cat $T
if test -r $i/Make.oldconfig.err
then
cat $i/Make.oldconfig.err
@@ -55,15 +57,15 @@ do
cat $i/Warnings
fi
else
- if test -f "$i/qemu-cmd"
- then
- print_bug qemu failed
- echo " $i"
- elif test -f "$i/buildonly"
+ if test -f "$i/buildonly"
then
echo Build-only run, no boot/test
configcheck.sh $i/.config $i/ConfigFragment
parse-build.sh $i/Make.out $configfile
+ elif test -f "$i/qemu-cmd"
+ then
+ print_bug qemu failed
+ echo " $i"
else
print_bug Build failed
echo " $i"
@@ -72,7 +74,11 @@ do
done
if test -f "$rd/kcsan.sum"
then
- if test -s "$rd/kcsan.sum"
+ if grep -q CONFIG_KCSAN=y $T
+ then
+ echo "Compiler or architecture does not support KCSAN!"
+ echo Did you forget to switch your compiler with '--kmake-arg CC=<cc-that-supports-kcsan>'?
+ elif test -s "$rd/kcsan.sum"
then
echo KCSAN summary in $rd/kcsan.sum
else
diff --git a/tools/testing/selftests/rcutorture/bin/kvm-test-1-run.sh b/tools/testing/selftests/rcutorture/bin/kvm-test-1-run.sh
index 6ff611c630d1..e07779a62634 100755
--- a/tools/testing/selftests/rcutorture/bin/kvm-test-1-run.sh
+++ b/tools/testing/selftests/rcutorture/bin/kvm-test-1-run.sh
@@ -124,7 +124,6 @@ seconds=$4
qemu_args=$5
boot_args=$6
-cd $KVM
kstarttime=`gawk 'BEGIN { print systime() }' < /dev/null`
if test -z "$TORTURE_BUILDONLY"
then
@@ -141,6 +140,7 @@ then
cpu_count=$TORTURE_ALLOTED_CPUS
fi
qemu_args="`specify_qemu_cpus "$QEMU" "$qemu_args" "$cpu_count"`"
+qemu_args="`specify_qemu_net "$qemu_args"`"
# Generate architecture-specific and interaction-specific qemu arguments
qemu_args="$qemu_args `identify_qemu_args "$QEMU" "$resdir/console.log"`"
@@ -152,6 +152,7 @@ qemu_append="`identify_qemu_append "$QEMU"`"
boot_args="`configfrag_boot_params "$boot_args" "$config_template"`"
# Generate kernel-version-specific boot parameters
boot_args="`per_version_boot_params "$boot_args" $resdir/.config $seconds`"
+echo $QEMU $qemu_args -m $TORTURE_QEMU_MEM -kernel $KERNEL -append \"$qemu_append $boot_args\" > $resdir/qemu-cmd
if test -n "$TORTURE_BUILDONLY"
then
@@ -159,9 +160,16 @@ then
touch $resdir/buildonly
exit 0
fi
+
+# Decorate qemu-cmd with redirection, backgrounding, and PID capture
+sed -e 's/$/ 2>\&1 \&/' < $resdir/qemu-cmd > $T/qemu-cmd
+echo 'echo $! > $resdir/qemu_pid' >> $T/qemu-cmd
+
+# In case qemu refuses to run...
echo "NOTE: $QEMU either did not run or was interactive" > $resdir/console.log
-echo $QEMU $qemu_args -m $TORTURE_QEMU_MEM -kernel $KERNEL -append \"$qemu_append $boot_args\" > $resdir/qemu-cmd
-( $QEMU $qemu_args -m $TORTURE_QEMU_MEM -kernel $KERNEL -append "$qemu_append $boot_args" > $resdir/qemu-output 2>&1 & echo $! > $resdir/qemu_pid; wait `cat $resdir/qemu_pid`; echo $? > $resdir/qemu-retval ) &
+
+# Attempt to run qemu
+( . $T/qemu-cmd; wait `cat $resdir/qemu_pid`; echo $? > $resdir/qemu-retval ) &
commandcompleted=0
sleep 10 # Give qemu's pid a chance to reach the file
if test -s "$resdir/qemu_pid"
@@ -181,7 +189,7 @@ do
kruntime=`gawk 'BEGIN { print systime() - '"$kstarttime"' }' < /dev/null`
if test -z "$qemu_pid" || kill -0 "$qemu_pid" > /dev/null 2>&1
then
- if test $kruntime -ge $seconds
+ if test $kruntime -ge $seconds -o -f "$TORTURE_STOPFILE"
then
break;
fi
@@ -210,10 +218,19 @@ then
fi
if test $commandcompleted -eq 0 -a -n "$qemu_pid"
then
- echo Grace period for qemu job at pid $qemu_pid
+ if ! test -f "$TORTURE_STOPFILE"
+ then
+ echo Grace period for qemu job at pid $qemu_pid
+ fi
oldline="`tail $resdir/console.log`"
while :
do
+ if test -f "$TORTURE_STOPFILE"
+ then
+ echo "PID $qemu_pid killed due to run STOP request" >> $resdir/Warnings 2>&1
+ kill -KILL $qemu_pid
+ break
+ fi
kruntime=`gawk 'BEGIN { print systime() - '"$kstarttime"' }' < /dev/null`
if kill -0 $qemu_pid > /dev/null 2>&1
then
diff --git a/tools/testing/selftests/rcutorture/bin/kvm-transform.sh b/tools/testing/selftests/rcutorture/bin/kvm-transform.sh
new file mode 100755
index 000000000000..c45a953ef393
--- /dev/null
+++ b/tools/testing/selftests/rcutorture/bin/kvm-transform.sh
@@ -0,0 +1,51 @@
+#!/bin/bash
+# SPDX-License-Identifier: GPL-2.0+
+#
+# Transform a qemu-cmd file to allow reuse.
+#
+# Usage: kvm-transform.sh bzImage console.log < qemu-cmd-in > qemu-cmd-out
+#
+# bzImage: Kernel and initrd from the same prior kvm.sh run.
+# console.log: File into which to place console output.
+#
+# The original qemu-cmd file is provided on standard input.
+# The transformed qemu-cmd file is on standard output.
+# The transformation assumes that the qemu command is confined to a
+# single line. It also assumes no whitespace in filenames.
+#
+# Copyright (C) 2020 Facebook, Inc.
+#
+# Authors: Paul E. McKenney <paulmck@kernel.org>
+
+image="$1"
+if test -z "$image"
+then
+ echo Need kernel image file.
+ exit 1
+fi
+consolelog="$2"
+if test -z "$consolelog"
+then
+ echo "Need console log file name."
+ exit 1
+fi
+
+awk -v image="$image" -v consolelog="$consolelog" '
+{
+ line = "";
+ for (i = 1; i <= NF; i++) {
+ if (line == "")
+ line = $i;
+ else
+ line = line " " $i;
+ if ($i == "-serial") {
+ i++;
+ line = line " file:" consolelog;
+ }
+ if ($i == "-kernel") {
+ i++;
+ line = line " " image;
+ }
+ }
+ print line;
+}'
diff --git a/tools/testing/selftests/rcutorture/bin/kvm.sh b/tools/testing/selftests/rcutorture/bin/kvm.sh
index c279cf9cb010..e655983b7429 100755
--- a/tools/testing/selftests/rcutorture/bin/kvm.sh
+++ b/tools/testing/selftests/rcutorture/bin/kvm.sh
@@ -73,6 +73,10 @@ usage () {
while test $# -gt 0
do
case "$1" in
+ --allcpus)
+ cpus=$TORTURE_ALLOTED_CPUS
+ max_cpus=$TORTURE_ALLOTED_CPUS
+ ;;
--bootargs|--bootarg)
checkarg --bootargs "(list of kernel boot arguments)" "$#" "$2" '.*' '^--'
TORTURE_BOOTARGS="$2"
@@ -180,13 +184,14 @@ do
shift
;;
--torture)
- checkarg --torture "(suite name)" "$#" "$2" '^\(lock\|rcu\|rcuperf\)$' '^--'
+ checkarg --torture "(suite name)" "$#" "$2" '^\(lock\|rcu\|rcuperf\|refscale\)$' '^--'
TORTURE_SUITE=$2
shift
- if test "$TORTURE_SUITE" = rcuperf
+ if test "$TORTURE_SUITE" = rcuperf || test "$TORTURE_SUITE" = refscale
then
- # If you really want jitter for rcuperf, specify
- # it after specifying rcuperf. (But why?)
+ # If you really want jitter for refscale or
+ # rcuperf, specify it after specifying the rcuperf
+ # or the refscale. (But why jitter in these cases?)
jitter=0
fi
;;
@@ -333,6 +338,8 @@ then
mkdir -p "$resdir" || :
fi
mkdir $resdir/$ds
+TORTURE_RESDIR="$resdir/$ds"; export TORTURE_RESDIR
+TORTURE_STOPFILE="$resdir/$ds/STOP"; export TORTURE_STOPFILE
echo Results directory: $resdir/$ds
echo $scriptname $args
touch $resdir/$ds/log
@@ -497,3 +504,7 @@ fi
# Tracing: trace_event=rcu:rcu_grace_period,rcu:rcu_future_grace_period,rcu:rcu_grace_period_init,rcu:rcu_nocb_wake,rcu:rcu_preempt_task,rcu:rcu_unlock_preempted_task,rcu:rcu_quiescent_state_report,rcu:rcu_fqs,rcu:rcu_callback,rcu:rcu_kfree_callback,rcu:rcu_batch_start,rcu:rcu_invoke_callback,rcu:rcu_invoke_kfree_callback,rcu:rcu_batch_end,rcu:rcu_torture_read,rcu:rcu_barrier
# Function-graph tracing: ftrace=function_graph ftrace_graph_filter=sched_setaffinity,migration_cpu_stop
# Also --kconfig "CONFIG_FUNCTION_TRACER=y CONFIG_FUNCTION_GRAPH_TRACER=y"
+# Control buffer size: --bootargs trace_buf_size=3k
+# Get trace-buffer dumps on all oopses: --bootargs ftrace_dump_on_oops
+# Ditto, but dump only the oopsing CPU: --bootargs ftrace_dump_on_oops=orig_cpu
+# Heavy-handed way to also dump on warnings: --bootargs panic_on_warn
diff --git a/tools/testing/selftests/rcutorture/bin/parse-console.sh b/tools/testing/selftests/rcutorture/bin/parse-console.sh
index 4bf62d7b1cbc..71a9f43a3918 100755
--- a/tools/testing/selftests/rcutorture/bin/parse-console.sh
+++ b/tools/testing/selftests/rcutorture/bin/parse-console.sh
@@ -33,8 +33,8 @@ then
fi
cat /dev/null > $file.diags
-# Check for proper termination, except that rcuperf runs don't indicate this.
-if test "$TORTURE_SUITE" != rcuperf
+# Check for proper termination, except for rcuperf and refscale.
+if test "$TORTURE_SUITE" != rcuperf && test "$TORTURE_SUITE" != refscale
then
# check for abject failure
@@ -44,11 +44,23 @@ then
tail -1 |
awk '
{
- for (i=NF-8;i<=NF;i++)
+ normalexit = 1;
+ for (i=NF-8;i<=NF;i++) {
+ if (i <= 0 || i !~ /^[0-9]*$/) {
+ bangstring = $0;
+ gsub(/^\[[^]]*] /, "", bangstring);
+ print bangstring;
+ normalexit = 0;
+ exit 0;
+ }
sum+=$i;
+ }
}
- END { print sum }'`
- print_bug $title FAILURE, $nerrs instances
+ END {
+ if (normalexit)
+ print sum " instances"
+ }'`
+ print_bug $title FAILURE, $nerrs
exit
fi
@@ -104,10 +116,7 @@ then
fi
fi | tee -a $file.diags
-egrep 'Badness|WARNING:|Warn|BUG|===========|Call Trace:|Oops:|detected stalls on CPUs/tasks:|self-detected stall on CPU|Stall ended before state dump start|\?\?\? Writer stall state|rcu_.*kthread starved for' < $file |
-grep -v 'ODEBUG: ' |
-grep -v 'This means that this is a DEBUG kernel and it is' |
-grep -v 'Warning: unable to open an initial console' > $T.diags
+console-badness.sh < $file > $T.diags
if test -s $T.diags
then
print_warning "Assertion failure in $file $title"
diff --git a/tools/testing/selftests/rcutorture/configs/refscale/CFLIST b/tools/testing/selftests/rcutorture/configs/refscale/CFLIST
new file mode 100644
index 000000000000..4d62eb4a39f9
--- /dev/null
+++ b/tools/testing/selftests/rcutorture/configs/refscale/CFLIST
@@ -0,0 +1,2 @@
+NOPREEMPT
+PREEMPT
diff --git a/tools/testing/selftests/rcutorture/configs/refscale/CFcommon b/tools/testing/selftests/rcutorture/configs/refscale/CFcommon
new file mode 100644
index 000000000000..a98b58b54bb1
--- /dev/null
+++ b/tools/testing/selftests/rcutorture/configs/refscale/CFcommon
@@ -0,0 +1,2 @@
+CONFIG_RCU_REF_SCALE_TEST=y
+CONFIG_PRINTK_TIME=y
diff --git a/tools/testing/selftests/rcutorture/configs/refscale/NOPREEMPT b/tools/testing/selftests/rcutorture/configs/refscale/NOPREEMPT
new file mode 100644
index 000000000000..1cd25b7314e3
--- /dev/null
+++ b/tools/testing/selftests/rcutorture/configs/refscale/NOPREEMPT
@@ -0,0 +1,18 @@
+CONFIG_SMP=y
+CONFIG_PREEMPT_NONE=y
+CONFIG_PREEMPT_VOLUNTARY=n
+CONFIG_PREEMPT=n
+#CHECK#CONFIG_PREEMPT_RCU=n
+CONFIG_HZ_PERIODIC=n
+CONFIG_NO_HZ_IDLE=y
+CONFIG_NO_HZ_FULL=n
+CONFIG_RCU_FAST_NO_HZ=n
+CONFIG_HOTPLUG_CPU=n
+CONFIG_SUSPEND=n
+CONFIG_HIBERNATION=n
+CONFIG_RCU_NOCB_CPU=n
+CONFIG_DEBUG_LOCK_ALLOC=n
+CONFIG_PROVE_LOCKING=n
+CONFIG_RCU_BOOST=n
+CONFIG_DEBUG_OBJECTS_RCU_HEAD=n
+CONFIG_RCU_EXPERT=y
diff --git a/tools/testing/selftests/rcutorture/configs/refscale/PREEMPT b/tools/testing/selftests/rcutorture/configs/refscale/PREEMPT
new file mode 100644
index 000000000000..d10bc694f42c
--- /dev/null
+++ b/tools/testing/selftests/rcutorture/configs/refscale/PREEMPT
@@ -0,0 +1,18 @@
+CONFIG_SMP=y
+CONFIG_PREEMPT_NONE=n
+CONFIG_PREEMPT_VOLUNTARY=n
+CONFIG_PREEMPT=y
+#CHECK#CONFIG_PREEMPT_RCU=y
+CONFIG_HZ_PERIODIC=n
+CONFIG_NO_HZ_IDLE=y
+CONFIG_NO_HZ_FULL=n
+CONFIG_RCU_FAST_NO_HZ=n
+CONFIG_HOTPLUG_CPU=n
+CONFIG_SUSPEND=n
+CONFIG_HIBERNATION=n
+CONFIG_RCU_NOCB_CPU=n
+CONFIG_DEBUG_LOCK_ALLOC=n
+CONFIG_PROVE_LOCKING=n
+CONFIG_RCU_BOOST=n
+CONFIG_DEBUG_OBJECTS_RCU_HEAD=n
+CONFIG_RCU_EXPERT=y
diff --git a/tools/testing/selftests/rcutorture/configs/refscale/ver_functions.sh b/tools/testing/selftests/rcutorture/configs/refscale/ver_functions.sh
new file mode 100644
index 000000000000..321e82641287
--- /dev/null
+++ b/tools/testing/selftests/rcutorture/configs/refscale/ver_functions.sh
@@ -0,0 +1,16 @@
+#!/bin/bash
+# SPDX-License-Identifier: GPL-2.0+
+#
+# Torture-suite-dependent shell functions for the rest of the scripts.
+#
+# Copyright (C) IBM Corporation, 2015
+#
+# Authors: Paul E. McKenney <paulmck@linux.ibm.com>
+
+# per_version_boot_params bootparam-string config-file seconds
+#
+# Adds per-version torture-module parameters to kernels supporting them.
+per_version_boot_params () {
+ echo $1 refscale.shutdown=1 \
+ refscale.verbose=1
+}