summaryrefslogtreecommitdiff
path: root/kernel/rcu/srcutree.c
diff options
context:
space:
mode:
Diffstat (limited to 'kernel/rcu/srcutree.c')
-rw-r--r--kernel/rcu/srcutree.c639
1 files changed, 472 insertions, 167 deletions
diff --git a/kernel/rcu/srcutree.c b/kernel/rcu/srcutree.c
index 6833d8887181..50ba70f019de 100644
--- a/kernel/rcu/srcutree.c
+++ b/kernel/rcu/srcutree.c
@@ -24,6 +24,7 @@
#include <linux/smp.h>
#include <linux/delay.h>
#include <linux/module.h>
+#include <linux/slab.h>
#include <linux/srcu.h>
#include "rcu.h"
@@ -38,6 +39,35 @@ module_param(exp_holdoff, ulong, 0444);
static ulong counter_wrap_check = (ULONG_MAX >> 2);
module_param(counter_wrap_check, ulong, 0444);
+/*
+ * Control conversion to SRCU_SIZE_BIG:
+ * 0: Don't convert at all.
+ * 1: Convert at init_srcu_struct() time.
+ * 2: Convert when rcutorture invokes srcu_torture_stats_print().
+ * 3: Decide at boot time based on system shape (default).
+ * 0x1x: Convert when excessive contention encountered.
+ */
+#define SRCU_SIZING_NONE 0
+#define SRCU_SIZING_INIT 1
+#define SRCU_SIZING_TORTURE 2
+#define SRCU_SIZING_AUTO 3
+#define SRCU_SIZING_CONTEND 0x10
+#define SRCU_SIZING_IS(x) ((convert_to_big & ~SRCU_SIZING_CONTEND) == x)
+#define SRCU_SIZING_IS_NONE() (SRCU_SIZING_IS(SRCU_SIZING_NONE))
+#define SRCU_SIZING_IS_INIT() (SRCU_SIZING_IS(SRCU_SIZING_INIT))
+#define SRCU_SIZING_IS_TORTURE() (SRCU_SIZING_IS(SRCU_SIZING_TORTURE))
+#define SRCU_SIZING_IS_CONTEND() (convert_to_big & SRCU_SIZING_CONTEND)
+static int convert_to_big = SRCU_SIZING_AUTO;
+module_param(convert_to_big, int, 0444);
+
+/* Number of CPUs to trigger init_srcu_struct()-time transition to big. */
+static int big_cpu_lim __read_mostly = 128;
+module_param(big_cpu_lim, int, 0444);
+
+/* Contention events per jiffy to initiate transition to big. */
+static int small_contention_lim __read_mostly = 100;
+module_param(small_contention_lim, int, 0444);
+
/* Early-boot callback-management, so early that no lock is required! */
static LIST_HEAD(srcu_boot_list);
static bool __read_mostly srcu_init_done;
@@ -48,39 +78,90 @@ static void process_srcu(struct work_struct *work);
static void srcu_delay_timer(struct timer_list *t);
/* Wrappers for lock acquisition and release, see raw_spin_lock_rcu_node(). */
-#define spin_lock_rcu_node(p) \
-do { \
- spin_lock(&ACCESS_PRIVATE(p, lock)); \
- smp_mb__after_unlock_lock(); \
+#define spin_lock_rcu_node(p) \
+do { \
+ spin_lock(&ACCESS_PRIVATE(p, lock)); \
+ smp_mb__after_unlock_lock(); \
} while (0)
#define spin_unlock_rcu_node(p) spin_unlock(&ACCESS_PRIVATE(p, lock))
-#define spin_lock_irq_rcu_node(p) \
-do { \
- spin_lock_irq(&ACCESS_PRIVATE(p, lock)); \
- smp_mb__after_unlock_lock(); \
+#define spin_lock_irq_rcu_node(p) \
+do { \
+ spin_lock_irq(&ACCESS_PRIVATE(p, lock)); \
+ smp_mb__after_unlock_lock(); \
} while (0)
-#define spin_unlock_irq_rcu_node(p) \
+#define spin_unlock_irq_rcu_node(p) \
spin_unlock_irq(&ACCESS_PRIVATE(p, lock))
-#define spin_lock_irqsave_rcu_node(p, flags) \
-do { \
- spin_lock_irqsave(&ACCESS_PRIVATE(p, lock), flags); \
- smp_mb__after_unlock_lock(); \
+#define spin_lock_irqsave_rcu_node(p, flags) \
+do { \
+ spin_lock_irqsave(&ACCESS_PRIVATE(p, lock), flags); \
+ smp_mb__after_unlock_lock(); \
} while (0)
-#define spin_unlock_irqrestore_rcu_node(p, flags) \
- spin_unlock_irqrestore(&ACCESS_PRIVATE(p, lock), flags) \
+#define spin_trylock_irqsave_rcu_node(p, flags) \
+({ \
+ bool ___locked = spin_trylock_irqsave(&ACCESS_PRIVATE(p, lock), flags); \
+ \
+ if (___locked) \
+ smp_mb__after_unlock_lock(); \
+ ___locked; \
+})
+
+#define spin_unlock_irqrestore_rcu_node(p, flags) \
+ spin_unlock_irqrestore(&ACCESS_PRIVATE(p, lock), flags) \
/*
- * Initialize SRCU combining tree. Note that statically allocated
+ * Initialize SRCU per-CPU data. Note that statically allocated
* srcu_struct structures might already have srcu_read_lock() and
* srcu_read_unlock() running against them. So if the is_static parameter
* is set, don't initialize ->srcu_lock_count[] and ->srcu_unlock_count[].
*/
-static void init_srcu_struct_nodes(struct srcu_struct *ssp)
+static void init_srcu_struct_data(struct srcu_struct *ssp)
+{
+ int cpu;
+ struct srcu_data *sdp;
+
+ /*
+ * Initialize the per-CPU srcu_data array, which feeds into the
+ * leaves of the srcu_node tree.
+ */
+ WARN_ON_ONCE(ARRAY_SIZE(sdp->srcu_lock_count) !=
+ ARRAY_SIZE(sdp->srcu_unlock_count));
+ for_each_possible_cpu(cpu) {
+ sdp = per_cpu_ptr(ssp->sda, cpu);
+ spin_lock_init(&ACCESS_PRIVATE(sdp, lock));
+ rcu_segcblist_init(&sdp->srcu_cblist);
+ sdp->srcu_cblist_invoking = false;
+ sdp->srcu_gp_seq_needed = ssp->srcu_gp_seq;
+ sdp->srcu_gp_seq_needed_exp = ssp->srcu_gp_seq;
+ sdp->mynode = NULL;
+ sdp->cpu = cpu;
+ INIT_WORK(&sdp->work, srcu_invoke_callbacks);
+ timer_setup(&sdp->delay_work, srcu_delay_timer, 0);
+ sdp->ssp = ssp;
+ }
+}
+
+/* Invalid seq state, used during snp node initialization */
+#define SRCU_SNP_INIT_SEQ 0x2
+
+/*
+ * Check whether sequence number corresponding to snp node,
+ * is invalid.
+ */
+static inline bool srcu_invl_snp_seq(unsigned long s)
+{
+ return rcu_seq_state(s) == SRCU_SNP_INIT_SEQ;
+}
+
+/*
+ * Allocated and initialize SRCU combining tree. Returns @true if
+ * allocation succeeded and @false otherwise.
+ */
+static bool init_srcu_struct_nodes(struct srcu_struct *ssp, gfp_t gfp_flags)
{
int cpu;
int i;
@@ -92,6 +173,9 @@ static void init_srcu_struct_nodes(struct srcu_struct *ssp)
/* Initialize geometry if it has not already been initialized. */
rcu_init_geometry();
+ ssp->node = kcalloc(rcu_num_nodes, sizeof(*ssp->node), gfp_flags);
+ if (!ssp->node)
+ return false;
/* Work out the overall tree geometry. */
ssp->level[0] = &ssp->node[0];
@@ -105,10 +189,10 @@ static void init_srcu_struct_nodes(struct srcu_struct *ssp)
WARN_ON_ONCE(ARRAY_SIZE(snp->srcu_have_cbs) !=
ARRAY_SIZE(snp->srcu_data_have_cbs));
for (i = 0; i < ARRAY_SIZE(snp->srcu_have_cbs); i++) {
- snp->srcu_have_cbs[i] = 0;
+ snp->srcu_have_cbs[i] = SRCU_SNP_INIT_SEQ;
snp->srcu_data_have_cbs[i] = 0;
}
- snp->srcu_gp_seq_needed_exp = 0;
+ snp->srcu_gp_seq_needed_exp = SRCU_SNP_INIT_SEQ;
snp->grplo = -1;
snp->grphi = -1;
if (snp == &ssp->node[0]) {
@@ -129,39 +213,31 @@ static void init_srcu_struct_nodes(struct srcu_struct *ssp)
* Initialize the per-CPU srcu_data array, which feeds into the
* leaves of the srcu_node tree.
*/
- WARN_ON_ONCE(ARRAY_SIZE(sdp->srcu_lock_count) !=
- ARRAY_SIZE(sdp->srcu_unlock_count));
level = rcu_num_lvls - 1;
snp_first = ssp->level[level];
for_each_possible_cpu(cpu) {
sdp = per_cpu_ptr(ssp->sda, cpu);
- spin_lock_init(&ACCESS_PRIVATE(sdp, lock));
- rcu_segcblist_init(&sdp->srcu_cblist);
- sdp->srcu_cblist_invoking = false;
- sdp->srcu_gp_seq_needed = ssp->srcu_gp_seq;
- sdp->srcu_gp_seq_needed_exp = ssp->srcu_gp_seq;
sdp->mynode = &snp_first[cpu / levelspread[level]];
for (snp = sdp->mynode; snp != NULL; snp = snp->srcu_parent) {
if (snp->grplo < 0)
snp->grplo = cpu;
snp->grphi = cpu;
}
- sdp->cpu = cpu;
- INIT_WORK(&sdp->work, srcu_invoke_callbacks);
- timer_setup(&sdp->delay_work, srcu_delay_timer, 0);
- sdp->ssp = ssp;
sdp->grpmask = 1 << (cpu - sdp->mynode->grplo);
}
+ smp_store_release(&ssp->srcu_size_state, SRCU_SIZE_WAIT_BARRIER);
+ return true;
}
/*
* Initialize non-compile-time initialized fields, including the
- * associated srcu_node and srcu_data structures. The is_static
- * parameter is passed through to init_srcu_struct_nodes(), and
- * also tells us that ->sda has already been wired up to srcu_data.
+ * associated srcu_node and srcu_data structures. The is_static parameter
+ * tells us that ->sda has already been wired up to srcu_data.
*/
static int init_srcu_struct_fields(struct srcu_struct *ssp, bool is_static)
{
+ ssp->srcu_size_state = SRCU_SIZE_SMALL;
+ ssp->node = NULL;
mutex_init(&ssp->srcu_cb_mutex);
mutex_init(&ssp->srcu_gp_mutex);
ssp->srcu_idx = 0;
@@ -170,13 +246,25 @@ static int init_srcu_struct_fields(struct srcu_struct *ssp, bool is_static)
mutex_init(&ssp->srcu_barrier_mutex);
atomic_set(&ssp->srcu_barrier_cpu_cnt, 0);
INIT_DELAYED_WORK(&ssp->work, process_srcu);
+ ssp->sda_is_static = is_static;
if (!is_static)
ssp->sda = alloc_percpu(struct srcu_data);
if (!ssp->sda)
return -ENOMEM;
- init_srcu_struct_nodes(ssp);
+ init_srcu_struct_data(ssp);
ssp->srcu_gp_seq_needed_exp = 0;
ssp->srcu_last_gp_end = ktime_get_mono_fast_ns();
+ if (READ_ONCE(ssp->srcu_size_state) == SRCU_SIZE_SMALL && SRCU_SIZING_IS_INIT()) {
+ if (!init_srcu_struct_nodes(ssp, GFP_ATOMIC)) {
+ if (!ssp->sda_is_static) {
+ free_percpu(ssp->sda);
+ ssp->sda = NULL;
+ return -ENOMEM;
+ }
+ } else {
+ WRITE_ONCE(ssp->srcu_size_state, SRCU_SIZE_BIG);
+ }
+ }
smp_store_release(&ssp->srcu_gp_seq_needed, 0); /* Init done. */
return 0;
}
@@ -214,6 +302,86 @@ EXPORT_SYMBOL_GPL(init_srcu_struct);
#endif /* #else #ifdef CONFIG_DEBUG_LOCK_ALLOC */
/*
+ * Initiate a transition to SRCU_SIZE_BIG with lock held.
+ */
+static void __srcu_transition_to_big(struct srcu_struct *ssp)
+{
+ lockdep_assert_held(&ACCESS_PRIVATE(ssp, lock));
+ smp_store_release(&ssp->srcu_size_state, SRCU_SIZE_ALLOC);
+}
+
+/*
+ * Initiate an idempotent transition to SRCU_SIZE_BIG.
+ */
+static void srcu_transition_to_big(struct srcu_struct *ssp)
+{
+ unsigned long flags;
+
+ /* Double-checked locking on ->srcu_size-state. */
+ if (smp_load_acquire(&ssp->srcu_size_state) != SRCU_SIZE_SMALL)
+ return;
+ spin_lock_irqsave_rcu_node(ssp, flags);
+ if (smp_load_acquire(&ssp->srcu_size_state) != SRCU_SIZE_SMALL) {
+ spin_unlock_irqrestore_rcu_node(ssp, flags);
+ return;
+ }
+ __srcu_transition_to_big(ssp);
+ spin_unlock_irqrestore_rcu_node(ssp, flags);
+}
+
+/*
+ * Check to see if the just-encountered contention event justifies
+ * a transition to SRCU_SIZE_BIG.
+ */
+static void spin_lock_irqsave_check_contention(struct srcu_struct *ssp)
+{
+ unsigned long j;
+
+ if (!SRCU_SIZING_IS_CONTEND() || ssp->srcu_size_state)
+ return;
+ j = jiffies;
+ if (ssp->srcu_size_jiffies != j) {
+ ssp->srcu_size_jiffies = j;
+ ssp->srcu_n_lock_retries = 0;
+ }
+ if (++ssp->srcu_n_lock_retries <= small_contention_lim)
+ return;
+ __srcu_transition_to_big(ssp);
+}
+
+/*
+ * Acquire the specified srcu_data structure's ->lock, but check for
+ * excessive contention, which results in initiation of a transition
+ * to SRCU_SIZE_BIG. But only if the srcutree.convert_to_big module
+ * parameter permits this.
+ */
+static void spin_lock_irqsave_sdp_contention(struct srcu_data *sdp, unsigned long *flags)
+{
+ struct srcu_struct *ssp = sdp->ssp;
+
+ if (spin_trylock_irqsave_rcu_node(sdp, *flags))
+ return;
+ spin_lock_irqsave_rcu_node(ssp, *flags);
+ spin_lock_irqsave_check_contention(ssp);
+ spin_unlock_irqrestore_rcu_node(ssp, *flags);
+ spin_lock_irqsave_rcu_node(sdp, *flags);
+}
+
+/*
+ * Acquire the specified srcu_struct structure's ->lock, but check for
+ * excessive contention, which results in initiation of a transition
+ * to SRCU_SIZE_BIG. But only if the srcutree.convert_to_big module
+ * parameter permits this.
+ */
+static void spin_lock_irqsave_ssp_contention(struct srcu_struct *ssp, unsigned long *flags)
+{
+ if (spin_trylock_irqsave_rcu_node(ssp, *flags))
+ return;
+ spin_lock_irqsave_rcu_node(ssp, *flags);
+ spin_lock_irqsave_check_contention(ssp);
+}
+
+/*
* First-use initialization of statically allocated srcu_struct
* structure. Wiring up the combining tree is more than can be
* done with compile-time initialization, so this check is added
@@ -343,7 +511,10 @@ static bool srcu_readers_active(struct srcu_struct *ssp)
return sum;
}
-#define SRCU_INTERVAL 1
+#define SRCU_INTERVAL 1 // Base delay if no expedited GPs pending.
+#define SRCU_MAX_INTERVAL 10 // Maximum incremental delay from slow readers.
+#define SRCU_MAX_NODELAY_PHASE 1 // Maximum per-GP-phase consecutive no-delay instances.
+#define SRCU_MAX_NODELAY 100 // Maximum consecutive no-delay instances.
/*
* Return grace-period delay, zero if there are expedited grace
@@ -351,10 +522,18 @@ static bool srcu_readers_active(struct srcu_struct *ssp)
*/
static unsigned long srcu_get_delay(struct srcu_struct *ssp)
{
- if (ULONG_CMP_LT(READ_ONCE(ssp->srcu_gp_seq),
- READ_ONCE(ssp->srcu_gp_seq_needed_exp)))
- return 0;
- return SRCU_INTERVAL;
+ unsigned long jbase = SRCU_INTERVAL;
+
+ if (ULONG_CMP_LT(READ_ONCE(ssp->srcu_gp_seq), READ_ONCE(ssp->srcu_gp_seq_needed_exp)))
+ jbase = 0;
+ if (rcu_seq_state(READ_ONCE(ssp->srcu_gp_seq)))
+ jbase += jiffies - READ_ONCE(ssp->srcu_gp_start);
+ if (!jbase) {
+ WRITE_ONCE(ssp->srcu_n_exp_nodelay, READ_ONCE(ssp->srcu_n_exp_nodelay) + 1);
+ if (READ_ONCE(ssp->srcu_n_exp_nodelay) > SRCU_MAX_NODELAY_PHASE)
+ jbase = 1;
+ }
+ return jbase > SRCU_MAX_INTERVAL ? SRCU_MAX_INTERVAL : jbase;
}
/**
@@ -382,13 +561,20 @@ void cleanup_srcu_struct(struct srcu_struct *ssp)
return; /* Forgot srcu_barrier(), so just leak it! */
}
if (WARN_ON(rcu_seq_state(READ_ONCE(ssp->srcu_gp_seq)) != SRCU_STATE_IDLE) ||
+ WARN_ON(rcu_seq_current(&ssp->srcu_gp_seq) != ssp->srcu_gp_seq_needed) ||
WARN_ON(srcu_readers_active(ssp))) {
- pr_info("%s: Active srcu_struct %p state: %d\n",
- __func__, ssp, rcu_seq_state(READ_ONCE(ssp->srcu_gp_seq)));
+ pr_info("%s: Active srcu_struct %p read state: %d gp state: %lu/%lu\n",
+ __func__, ssp, rcu_seq_state(READ_ONCE(ssp->srcu_gp_seq)),
+ rcu_seq_current(&ssp->srcu_gp_seq), ssp->srcu_gp_seq_needed);
return; /* Caller forgot to stop doing call_srcu()? */
}
- free_percpu(ssp->sda);
- ssp->sda = NULL;
+ if (!ssp->sda_is_static) {
+ free_percpu(ssp->sda);
+ ssp->sda = NULL;
+ }
+ kfree(ssp->node);
+ ssp->node = NULL;
+ ssp->srcu_size_state = SRCU_SIZE_SMALL;
}
EXPORT_SYMBOL_GPL(cleanup_srcu_struct);
@@ -434,9 +620,13 @@ EXPORT_SYMBOL_GPL(__srcu_read_unlock);
*/
static void srcu_gp_start(struct srcu_struct *ssp)
{
- struct srcu_data *sdp = this_cpu_ptr(ssp->sda);
+ struct srcu_data *sdp;
int state;
+ if (smp_load_acquire(&ssp->srcu_size_state) < SRCU_SIZE_WAIT_BARRIER)
+ sdp = per_cpu_ptr(ssp->sda, 0);
+ else
+ sdp = this_cpu_ptr(ssp->sda);
lockdep_assert_held(&ACCESS_PRIVATE(ssp, lock));
WARN_ON_ONCE(ULONG_CMP_GE(ssp->srcu_gp_seq, ssp->srcu_gp_seq_needed));
spin_lock_rcu_node(sdp); /* Interrupts already disabled. */
@@ -445,6 +635,8 @@ static void srcu_gp_start(struct srcu_struct *ssp)
(void)rcu_segcblist_accelerate(&sdp->srcu_cblist,
rcu_seq_snap(&ssp->srcu_gp_seq));
spin_unlock_rcu_node(sdp); /* Interrupts remain disabled. */
+ WRITE_ONCE(ssp->srcu_gp_start, jiffies);
+ WRITE_ONCE(ssp->srcu_n_exp_nodelay, 0);
smp_mb(); /* Order prior store to ->srcu_gp_seq_needed vs. GP start. */
rcu_seq_start(&ssp->srcu_gp_seq);
state = rcu_seq_state(ssp->srcu_gp_seq);
@@ -517,7 +709,9 @@ static void srcu_gp_end(struct srcu_struct *ssp)
int idx;
unsigned long mask;
struct srcu_data *sdp;
+ unsigned long sgsne;
struct srcu_node *snp;
+ int ss_state;
/* Prevent more than one additional grace period. */
mutex_lock(&ssp->srcu_cb_mutex);
@@ -526,7 +720,7 @@ static void srcu_gp_end(struct srcu_struct *ssp)
spin_lock_irq_rcu_node(ssp);
idx = rcu_seq_state(ssp->srcu_gp_seq);
WARN_ON_ONCE(idx != SRCU_STATE_SCAN2);
- cbdelay = srcu_get_delay(ssp);
+ cbdelay = !!srcu_get_delay(ssp);
WRITE_ONCE(ssp->srcu_last_gp_end, ktime_get_mono_fast_ns());
rcu_seq_end(&ssp->srcu_gp_seq);
gpseq = rcu_seq_current(&ssp->srcu_gp_seq);
@@ -537,38 +731,45 @@ static void srcu_gp_end(struct srcu_struct *ssp)
/* A new grace period can start at this point. But only one. */
/* Initiate callback invocation as needed. */
- idx = rcu_seq_ctr(gpseq) % ARRAY_SIZE(snp->srcu_have_cbs);
- srcu_for_each_node_breadth_first(ssp, snp) {
- spin_lock_irq_rcu_node(snp);
- cbs = false;
- last_lvl = snp >= ssp->level[rcu_num_lvls - 1];
- if (last_lvl)
- cbs = snp->srcu_have_cbs[idx] == gpseq;
- snp->srcu_have_cbs[idx] = gpseq;
- rcu_seq_set_state(&snp->srcu_have_cbs[idx], 1);
- if (ULONG_CMP_LT(snp->srcu_gp_seq_needed_exp, gpseq))
- WRITE_ONCE(snp->srcu_gp_seq_needed_exp, gpseq);
- mask = snp->srcu_data_have_cbs[idx];
- snp->srcu_data_have_cbs[idx] = 0;
- spin_unlock_irq_rcu_node(snp);
- if (cbs)
- srcu_schedule_cbs_snp(ssp, snp, mask, cbdelay);
-
- /* Occasionally prevent srcu_data counter wrap. */
- if (!(gpseq & counter_wrap_check) && last_lvl)
- for (cpu = snp->grplo; cpu <= snp->grphi; cpu++) {
- sdp = per_cpu_ptr(ssp->sda, cpu);
- spin_lock_irqsave_rcu_node(sdp, flags);
- if (ULONG_CMP_GE(gpseq,
- sdp->srcu_gp_seq_needed + 100))
- sdp->srcu_gp_seq_needed = gpseq;
- if (ULONG_CMP_GE(gpseq,
- sdp->srcu_gp_seq_needed_exp + 100))
- sdp->srcu_gp_seq_needed_exp = gpseq;
- spin_unlock_irqrestore_rcu_node(sdp, flags);
- }
+ ss_state = smp_load_acquire(&ssp->srcu_size_state);
+ if (ss_state < SRCU_SIZE_WAIT_BARRIER) {
+ srcu_schedule_cbs_sdp(per_cpu_ptr(ssp->sda, 0), cbdelay);
+ } else {
+ idx = rcu_seq_ctr(gpseq) % ARRAY_SIZE(snp->srcu_have_cbs);
+ srcu_for_each_node_breadth_first(ssp, snp) {
+ spin_lock_irq_rcu_node(snp);
+ cbs = false;
+ last_lvl = snp >= ssp->level[rcu_num_lvls - 1];
+ if (last_lvl)
+ cbs = ss_state < SRCU_SIZE_BIG || snp->srcu_have_cbs[idx] == gpseq;
+ snp->srcu_have_cbs[idx] = gpseq;
+ rcu_seq_set_state(&snp->srcu_have_cbs[idx], 1);
+ sgsne = snp->srcu_gp_seq_needed_exp;
+ if (srcu_invl_snp_seq(sgsne) || ULONG_CMP_LT(sgsne, gpseq))
+ WRITE_ONCE(snp->srcu_gp_seq_needed_exp, gpseq);
+ if (ss_state < SRCU_SIZE_BIG)
+ mask = ~0;
+ else
+ mask = snp->srcu_data_have_cbs[idx];
+ snp->srcu_data_have_cbs[idx] = 0;
+ spin_unlock_irq_rcu_node(snp);
+ if (cbs)
+ srcu_schedule_cbs_snp(ssp, snp, mask, cbdelay);
+ }
}
+ /* Occasionally prevent srcu_data counter wrap. */
+ if (!(gpseq & counter_wrap_check))
+ for_each_possible_cpu(cpu) {
+ sdp = per_cpu_ptr(ssp->sda, cpu);
+ spin_lock_irqsave_rcu_node(sdp, flags);
+ if (ULONG_CMP_GE(gpseq, sdp->srcu_gp_seq_needed + 100))
+ sdp->srcu_gp_seq_needed = gpseq;
+ if (ULONG_CMP_GE(gpseq, sdp->srcu_gp_seq_needed_exp + 100))
+ sdp->srcu_gp_seq_needed_exp = gpseq;
+ spin_unlock_irqrestore_rcu_node(sdp, flags);
+ }
+
/* Callback initiation done, allow grace periods after next. */
mutex_unlock(&ssp->srcu_cb_mutex);
@@ -583,6 +784,14 @@ static void srcu_gp_end(struct srcu_struct *ssp)
} else {
spin_unlock_irq_rcu_node(ssp);
}
+
+ /* Transition to big if needed. */
+ if (ss_state != SRCU_SIZE_SMALL && ss_state != SRCU_SIZE_BIG) {
+ if (ss_state == SRCU_SIZE_ALLOC)
+ init_srcu_struct_nodes(ssp, GFP_KERNEL);
+ else
+ smp_store_release(&ssp->srcu_size_state, ss_state + 1);
+ }
}
/*
@@ -596,20 +805,24 @@ static void srcu_funnel_exp_start(struct srcu_struct *ssp, struct srcu_node *snp
unsigned long s)
{
unsigned long flags;
+ unsigned long sgsne;
- for (; snp != NULL; snp = snp->srcu_parent) {
- if (rcu_seq_done(&ssp->srcu_gp_seq, s) ||
- ULONG_CMP_GE(READ_ONCE(snp->srcu_gp_seq_needed_exp), s))
- return;
- spin_lock_irqsave_rcu_node(snp, flags);
- if (ULONG_CMP_GE(snp->srcu_gp_seq_needed_exp, s)) {
+ if (snp)
+ for (; snp != NULL; snp = snp->srcu_parent) {
+ sgsne = READ_ONCE(snp->srcu_gp_seq_needed_exp);
+ if (rcu_seq_done(&ssp->srcu_gp_seq, s) ||
+ (!srcu_invl_snp_seq(sgsne) && ULONG_CMP_GE(sgsne, s)))
+ return;
+ spin_lock_irqsave_rcu_node(snp, flags);
+ sgsne = snp->srcu_gp_seq_needed_exp;
+ if (!srcu_invl_snp_seq(sgsne) && ULONG_CMP_GE(sgsne, s)) {
+ spin_unlock_irqrestore_rcu_node(snp, flags);
+ return;
+ }
+ WRITE_ONCE(snp->srcu_gp_seq_needed_exp, s);
spin_unlock_irqrestore_rcu_node(snp, flags);
- return;
}
- WRITE_ONCE(snp->srcu_gp_seq_needed_exp, s);
- spin_unlock_irqrestore_rcu_node(snp, flags);
- }
- spin_lock_irqsave_rcu_node(ssp, flags);
+ spin_lock_irqsave_ssp_contention(ssp, &flags);
if (ULONG_CMP_LT(ssp->srcu_gp_seq_needed_exp, s))
WRITE_ONCE(ssp->srcu_gp_seq_needed_exp, s);
spin_unlock_irqrestore_rcu_node(ssp, flags);
@@ -630,39 +843,47 @@ static void srcu_funnel_gp_start(struct srcu_struct *ssp, struct srcu_data *sdp,
{
unsigned long flags;
int idx = rcu_seq_ctr(s) % ARRAY_SIZE(sdp->mynode->srcu_have_cbs);
- struct srcu_node *snp = sdp->mynode;
+ unsigned long sgsne;
+ struct srcu_node *snp;
+ struct srcu_node *snp_leaf;
unsigned long snp_seq;
- /* Each pass through the loop does one level of the srcu_node tree. */
- for (; snp != NULL; snp = snp->srcu_parent) {
- if (rcu_seq_done(&ssp->srcu_gp_seq, s) && snp != sdp->mynode)
- return; /* GP already done and CBs recorded. */
- spin_lock_irqsave_rcu_node(snp, flags);
- if (ULONG_CMP_GE(snp->srcu_have_cbs[idx], s)) {
+ /* Ensure that snp node tree is fully initialized before traversing it */
+ if (smp_load_acquire(&ssp->srcu_size_state) < SRCU_SIZE_WAIT_BARRIER)
+ snp_leaf = NULL;
+ else
+ snp_leaf = sdp->mynode;
+
+ if (snp_leaf)
+ /* Each pass through the loop does one level of the srcu_node tree. */
+ for (snp = snp_leaf; snp != NULL; snp = snp->srcu_parent) {
+ if (rcu_seq_done(&ssp->srcu_gp_seq, s) && snp != snp_leaf)
+ return; /* GP already done and CBs recorded. */
+ spin_lock_irqsave_rcu_node(snp, flags);
snp_seq = snp->srcu_have_cbs[idx];
- if (snp == sdp->mynode && snp_seq == s)
- snp->srcu_data_have_cbs[idx] |= sdp->grpmask;
- spin_unlock_irqrestore_rcu_node(snp, flags);
- if (snp == sdp->mynode && snp_seq != s) {
- srcu_schedule_cbs_sdp(sdp, do_norm
- ? SRCU_INTERVAL
- : 0);
+ if (!srcu_invl_snp_seq(snp_seq) && ULONG_CMP_GE(snp_seq, s)) {
+ if (snp == snp_leaf && snp_seq == s)
+ snp->srcu_data_have_cbs[idx] |= sdp->grpmask;
+ spin_unlock_irqrestore_rcu_node(snp, flags);
+ if (snp == snp_leaf && snp_seq != s) {
+ srcu_schedule_cbs_sdp(sdp, do_norm ? SRCU_INTERVAL : 0);
+ return;
+ }
+ if (!do_norm)
+ srcu_funnel_exp_start(ssp, snp, s);
return;
}
- if (!do_norm)
- srcu_funnel_exp_start(ssp, snp, s);
- return;
+ snp->srcu_have_cbs[idx] = s;
+ if (snp == snp_leaf)
+ snp->srcu_data_have_cbs[idx] |= sdp->grpmask;
+ sgsne = snp->srcu_gp_seq_needed_exp;
+ if (!do_norm && (srcu_invl_snp_seq(sgsne) || ULONG_CMP_LT(sgsne, s)))
+ WRITE_ONCE(snp->srcu_gp_seq_needed_exp, s);
+ spin_unlock_irqrestore_rcu_node(snp, flags);
}
- snp->srcu_have_cbs[idx] = s;
- if (snp == sdp->mynode)
- snp->srcu_data_have_cbs[idx] |= sdp->grpmask;
- if (!do_norm && ULONG_CMP_LT(snp->srcu_gp_seq_needed_exp, s))
- WRITE_ONCE(snp->srcu_gp_seq_needed_exp, s);
- spin_unlock_irqrestore_rcu_node(snp, flags);
- }
/* Top of tree, must ensure the grace period will be started. */
- spin_lock_irqsave_rcu_node(ssp, flags);
+ spin_lock_irqsave_ssp_contention(ssp, &flags);
if (ULONG_CMP_LT(ssp->srcu_gp_seq_needed, s)) {
/*
* Record need for grace period s. Pair with load
@@ -678,9 +899,15 @@ static void srcu_funnel_gp_start(struct srcu_struct *ssp, struct srcu_data *sdp,
rcu_seq_state(ssp->srcu_gp_seq) == SRCU_STATE_IDLE) {
WARN_ON_ONCE(ULONG_CMP_GE(ssp->srcu_gp_seq, ssp->srcu_gp_seq_needed));
srcu_gp_start(ssp);
+
+ // And how can that list_add() in the "else" clause
+ // possibly be safe for concurrent execution? Well,
+ // it isn't. And it does not have to be. After all, it
+ // can only be executed during early boot when there is only
+ // the one boot CPU running with interrupts still disabled.
if (likely(srcu_init_done))
queue_delayed_work(rcu_gp_wq, &ssp->work,
- srcu_get_delay(ssp));
+ !!srcu_get_delay(ssp));
else if (list_empty(&ssp->work.work.entry))
list_add(&ssp->work.work.entry, &srcu_boot_list);
}
@@ -814,11 +1041,17 @@ static unsigned long srcu_gp_start_if_needed(struct srcu_struct *ssp,
bool needgp = false;
unsigned long s;
struct srcu_data *sdp;
+ struct srcu_node *sdp_mynode;
+ int ss_state;
check_init_srcu_struct(ssp);
idx = srcu_read_lock(ssp);
- sdp = raw_cpu_ptr(ssp->sda);
- spin_lock_irqsave_rcu_node(sdp, flags);
+ ss_state = smp_load_acquire(&ssp->srcu_size_state);
+ if (ss_state < SRCU_SIZE_WAIT_CALL)
+ sdp = per_cpu_ptr(ssp->sda, 0);
+ else
+ sdp = raw_cpu_ptr(ssp->sda);
+ spin_lock_irqsave_sdp_contention(sdp, &flags);
if (rhp)
rcu_segcblist_enqueue(&sdp->srcu_cblist, rhp);
rcu_segcblist_advance(&sdp->srcu_cblist,
@@ -834,10 +1067,17 @@ static unsigned long srcu_gp_start_if_needed(struct srcu_struct *ssp,
needexp = true;
}
spin_unlock_irqrestore_rcu_node(sdp, flags);
+
+ /* Ensure that snp node tree is fully initialized before traversing it */
+ if (ss_state < SRCU_SIZE_WAIT_BARRIER)
+ sdp_mynode = NULL;
+ else
+ sdp_mynode = sdp->mynode;
+
if (needgp)
srcu_funnel_gp_start(ssp, sdp, s, do_norm);
else if (needexp)
- srcu_funnel_exp_start(ssp, sdp->mynode, s);
+ srcu_funnel_exp_start(ssp, sdp_mynode, s);
srcu_read_unlock(ssp, idx);
return s;
}
@@ -1097,6 +1337,28 @@ static void srcu_barrier_cb(struct rcu_head *rhp)
complete(&ssp->srcu_barrier_completion);
}
+/*
+ * Enqueue an srcu_barrier() callback on the specified srcu_data
+ * structure's ->cblist. but only if that ->cblist already has at least one
+ * callback enqueued. Note that if a CPU already has callbacks enqueue,
+ * it must have already registered the need for a future grace period,
+ * so all we need do is enqueue a callback that will use the same grace
+ * period as the last callback already in the queue.
+ */
+static void srcu_barrier_one_cpu(struct srcu_struct *ssp, struct srcu_data *sdp)
+{
+ spin_lock_irq_rcu_node(sdp);
+ atomic_inc(&ssp->srcu_barrier_cpu_cnt);
+ sdp->srcu_barrier_head.func = srcu_barrier_cb;
+ debug_rcu_head_queue(&sdp->srcu_barrier_head);
+ if (!rcu_segcblist_entrain(&sdp->srcu_cblist,
+ &sdp->srcu_barrier_head)) {
+ debug_rcu_head_unqueue(&sdp->srcu_barrier_head);
+ atomic_dec(&ssp->srcu_barrier_cpu_cnt);
+ }
+ spin_unlock_irq_rcu_node(sdp);
+}
+
/**
* srcu_barrier - Wait until all in-flight call_srcu() callbacks complete.
* @ssp: srcu_struct on which to wait for in-flight callbacks.
@@ -1104,7 +1366,7 @@ static void srcu_barrier_cb(struct rcu_head *rhp)
void srcu_barrier(struct srcu_struct *ssp)
{
int cpu;
- struct srcu_data *sdp;
+ int idx;
unsigned long s = rcu_seq_snap(&ssp->srcu_barrier_seq);
check_init_srcu_struct(ssp);
@@ -1120,27 +1382,13 @@ void srcu_barrier(struct srcu_struct *ssp)
/* Initial count prevents reaching zero until all CBs are posted. */
atomic_set(&ssp->srcu_barrier_cpu_cnt, 1);
- /*
- * Each pass through this loop enqueues a callback, but only
- * on CPUs already having callbacks enqueued. Note that if
- * a CPU already has callbacks enqueue, it must have already
- * registered the need for a future grace period, so all we
- * need do is enqueue a callback that will use the same
- * grace period as the last callback already in the queue.
- */
- for_each_possible_cpu(cpu) {
- sdp = per_cpu_ptr(ssp->sda, cpu);
- spin_lock_irq_rcu_node(sdp);
- atomic_inc(&ssp->srcu_barrier_cpu_cnt);
- sdp->srcu_barrier_head.func = srcu_barrier_cb;
- debug_rcu_head_queue(&sdp->srcu_barrier_head);
- if (!rcu_segcblist_entrain(&sdp->srcu_cblist,
- &sdp->srcu_barrier_head)) {
- debug_rcu_head_unqueue(&sdp->srcu_barrier_head);
- atomic_dec(&ssp->srcu_barrier_cpu_cnt);
- }
- spin_unlock_irq_rcu_node(sdp);
- }
+ idx = srcu_read_lock(ssp);
+ if (smp_load_acquire(&ssp->srcu_size_state) < SRCU_SIZE_WAIT_BARRIER)
+ srcu_barrier_one_cpu(ssp, per_cpu_ptr(ssp->sda, 0));
+ else
+ for_each_possible_cpu(cpu)
+ srcu_barrier_one_cpu(ssp, per_cpu_ptr(ssp->sda, cpu));
+ srcu_read_unlock(ssp, idx);
/* Remove the initial count, at which point reaching zero can happen. */
if (atomic_dec_and_test(&ssp->srcu_barrier_cpu_cnt))
@@ -1214,6 +1462,7 @@ static void srcu_advance_state(struct srcu_struct *ssp)
srcu_flip(ssp);
spin_lock_irq_rcu_node(ssp);
rcu_seq_set_state(&ssp->srcu_gp_seq, SRCU_STATE_SCAN2);
+ ssp->srcu_n_exp_nodelay = 0;
spin_unlock_irq_rcu_node(ssp);
}
@@ -1228,6 +1477,7 @@ static void srcu_advance_state(struct srcu_struct *ssp)
mutex_unlock(&ssp->srcu_gp_mutex);
return; /* readers present, retry later. */
}
+ ssp->srcu_n_exp_nodelay = 0;
srcu_gp_end(ssp); /* Releases ->srcu_gp_mutex. */
}
}
@@ -1318,12 +1568,28 @@ static void srcu_reschedule(struct srcu_struct *ssp, unsigned long delay)
*/
static void process_srcu(struct work_struct *work)
{
+ unsigned long curdelay;
+ unsigned long j;
struct srcu_struct *ssp;
ssp = container_of(work, struct srcu_struct, work.work);
srcu_advance_state(ssp);
- srcu_reschedule(ssp, srcu_get_delay(ssp));
+ curdelay = srcu_get_delay(ssp);
+ if (curdelay) {
+ WRITE_ONCE(ssp->reschedule_count, 0);
+ } else {
+ j = jiffies;
+ if (READ_ONCE(ssp->reschedule_jiffies) == j) {
+ WRITE_ONCE(ssp->reschedule_count, READ_ONCE(ssp->reschedule_count) + 1);
+ if (READ_ONCE(ssp->reschedule_count) > SRCU_MAX_NODELAY)
+ curdelay = 1;
+ } else {
+ WRITE_ONCE(ssp->reschedule_count, 1);
+ WRITE_ONCE(ssp->reschedule_jiffies, j);
+ }
+ }
+ srcu_reschedule(ssp, curdelay);
}
void srcutorture_get_gp_data(enum rcutorture_type test_type,
@@ -1337,43 +1603,69 @@ void srcutorture_get_gp_data(enum rcutorture_type test_type,
}
EXPORT_SYMBOL_GPL(srcutorture_get_gp_data);
+static const char * const srcu_size_state_name[] = {
+ "SRCU_SIZE_SMALL",
+ "SRCU_SIZE_ALLOC",
+ "SRCU_SIZE_WAIT_BARRIER",
+ "SRCU_SIZE_WAIT_CALL",
+ "SRCU_SIZE_WAIT_CBS1",
+ "SRCU_SIZE_WAIT_CBS2",
+ "SRCU_SIZE_WAIT_CBS3",
+ "SRCU_SIZE_WAIT_CBS4",
+ "SRCU_SIZE_BIG",
+ "SRCU_SIZE_???",
+};
+
void srcu_torture_stats_print(struct srcu_struct *ssp, char *tt, char *tf)
{
int cpu;
int idx;
unsigned long s0 = 0, s1 = 0;
+ int ss_state = READ_ONCE(ssp->srcu_size_state);
+ int ss_state_idx = ss_state;
idx = ssp->srcu_idx & 0x1;
- pr_alert("%s%s Tree SRCU g%ld per-CPU(idx=%d):",
- tt, tf, rcu_seq_current(&ssp->srcu_gp_seq), idx);
- for_each_possible_cpu(cpu) {
- unsigned long l0, l1;
- unsigned long u0, u1;
- long c0, c1;
- struct srcu_data *sdp;
-
- sdp = per_cpu_ptr(ssp->sda, cpu);
- u0 = data_race(sdp->srcu_unlock_count[!idx]);
- u1 = data_race(sdp->srcu_unlock_count[idx]);
-
- /*
- * Make sure that a lock is always counted if the corresponding
- * unlock is counted.
- */
- smp_rmb();
-
- l0 = data_race(sdp->srcu_lock_count[!idx]);
- l1 = data_race(sdp->srcu_lock_count[idx]);
-
- c0 = l0 - u0;
- c1 = l1 - u1;
- pr_cont(" %d(%ld,%ld %c)",
- cpu, c0, c1,
- "C."[rcu_segcblist_empty(&sdp->srcu_cblist)]);
- s0 += c0;
- s1 += c1;
+ if (ss_state < 0 || ss_state >= ARRAY_SIZE(srcu_size_state_name))
+ ss_state_idx = ARRAY_SIZE(srcu_size_state_name) - 1;
+ pr_alert("%s%s Tree SRCU g%ld state %d (%s)",
+ tt, tf, rcu_seq_current(&ssp->srcu_gp_seq), ss_state,
+ srcu_size_state_name[ss_state_idx]);
+ if (!ssp->sda) {
+ // Called after cleanup_srcu_struct(), perhaps.
+ pr_cont(" No per-CPU srcu_data structures (->sda == NULL).\n");
+ } else {
+ pr_cont(" per-CPU(idx=%d):", idx);
+ for_each_possible_cpu(cpu) {
+ unsigned long l0, l1;
+ unsigned long u0, u1;
+ long c0, c1;
+ struct srcu_data *sdp;
+
+ sdp = per_cpu_ptr(ssp->sda, cpu);
+ u0 = data_race(sdp->srcu_unlock_count[!idx]);
+ u1 = data_race(sdp->srcu_unlock_count[idx]);
+
+ /*
+ * Make sure that a lock is always counted if the corresponding
+ * unlock is counted.
+ */
+ smp_rmb();
+
+ l0 = data_race(sdp->srcu_lock_count[!idx]);
+ l1 = data_race(sdp->srcu_lock_count[idx]);
+
+ c0 = l0 - u0;
+ c1 = l1 - u1;
+ pr_cont(" %d(%ld,%ld %c)",
+ cpu, c0, c1,
+ "C."[rcu_segcblist_empty(&sdp->srcu_cblist)]);
+ s0 += c0;
+ s1 += c1;
+ }
+ pr_cont(" T(%ld,%ld)\n", s0, s1);
}
- pr_cont(" T(%ld,%ld)\n", s0, s1);
+ if (SRCU_SIZING_IS_TORTURE())
+ srcu_transition_to_big(ssp);
}
EXPORT_SYMBOL_GPL(srcu_torture_stats_print);
@@ -1390,6 +1682,17 @@ void __init srcu_init(void)
{
struct srcu_struct *ssp;
+ /* Decide on srcu_struct-size strategy. */
+ if (SRCU_SIZING_IS(SRCU_SIZING_AUTO)) {
+ if (nr_cpu_ids >= big_cpu_lim) {
+ convert_to_big = SRCU_SIZING_INIT; // Don't bother waiting for contention.
+ pr_info("%s: Setting srcu_struct sizes to big.\n", __func__);
+ } else {
+ convert_to_big = SRCU_SIZING_NONE | SRCU_SIZING_CONTEND;
+ pr_info("%s: Setting srcu_struct sizes based on contention.\n", __func__);
+ }
+ }
+
/*
* Once that is set, call_srcu() can follow the normal path and
* queue delayed work. This must follow RCU workqueues creation
@@ -1400,6 +1703,8 @@ void __init srcu_init(void)
ssp = list_first_entry(&srcu_boot_list, struct srcu_struct,
work.work.entry);
list_del_init(&ssp->work.work.entry);
+ if (SRCU_SIZING_IS(SRCU_SIZING_INIT) && ssp->srcu_size_state == SRCU_SIZE_SMALL)
+ ssp->srcu_size_state = SRCU_SIZE_ALLOC;
queue_work(rcu_gp_wq, &ssp->work.work);
}
}