Add batching to arena bins.

This adds a fast path for threads that free a small number of allocations to
bins which are not their "home base" and that encounter lock contention in
attempting to do so. In producer-consumer workloads, even these short lock
hold times can cause lock convoying that greatly increases overall bin mutex
contention.
David Goldblatt 2024-02-09 16:08:45 -08:00 committed by David Goldblatt
parent 44d91cf243
commit fc615739cb
16 changed files with 722 additions and 78 deletions
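In outline, the mechanism the diffs below implement: a thread flushing to a bin shard other than its own first trylocks the bin; if that fails (and the flush is small enough to batch), it pushes its (ptr, slab) pairs into a small per-bin batcher under the batcher's own mutex, and whichever thread next holds the bin lock drains the batch. The following is a minimal, self-contained pthreads sketch of that pattern; the toy_* names are invented for illustration, while the real implementation uses the batcher_push_begin()/batcher_pop_begin() API and the remote_free_max/remote_free_max_batch limits shown in the changes below.

/* Toy illustration only -- invented names, not jemalloc internals. */
#include <pthread.h>
#include <stddef.h>

#define TOY_BATCH_MAX 4

typedef struct toy_bin_s {
    pthread_mutex_t lock;       /* The contended per-bin mutex. */
    pthread_mutex_t batch_lock; /* Cheap mutex guarding the batch. */
    void *batch[TOY_BATCH_MAX]; /* Frees queued by remote threads. */
    size_t batch_count;
} toy_bin_t;

/* Stand-in for the real per-slab bookkeeping done under bin->lock. */
static void
toy_dalloc_locked(toy_bin_t *bin, void *ptr) {
    (void)bin;
    (void)ptr;
}

/* Called with bin->lock held: process frees queued by other threads. */
static void
toy_bin_drain_batch(toy_bin_t *bin) {
    pthread_mutex_lock(&bin->batch_lock);
    for (size_t i = 0; i < bin->batch_count; i++) {
        toy_dalloc_locked(bin, bin->batch[i]);
    }
    bin->batch_count = 0;
    pthread_mutex_unlock(&bin->batch_lock);
}

/* Remote-free fast path: don't wait on a contended bin lock if avoidable. */
static void
toy_bin_dalloc_remote(toy_bin_t *bin, void *ptr) {
    if (pthread_mutex_trylock(&bin->lock) != 0) {
        /* Bin lock contended; try to hand the free off instead. */
        pthread_mutex_lock(&bin->batch_lock);
        if (bin->batch_count < TOY_BATCH_MAX) {
            bin->batch[bin->batch_count++] = ptr;
            pthread_mutex_unlock(&bin->batch_lock);
            return;
        }
        pthread_mutex_unlock(&bin->batch_lock);
        /* Batch full: fall back to blocking on the bin lock. */
        pthread_mutex_lock(&bin->lock);
    }
    toy_bin_drain_batch(bin); /* Piggy-back on the lock we now hold. */
    toy_dalloc_locked(bin, ptr);
    pthread_mutex_unlock(&bin->lock);
}

int
main(void) {
    toy_bin_t bin = {PTHREAD_MUTEX_INITIALIZER, PTHREAD_MUTEX_INITIALIZER,
        {NULL}, 0};
    int obj;
    toy_bin_dalloc_remote(&bin, &obj);
    return 0;
}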

View File

@ -206,6 +206,7 @@ TESTS_UNIT := \
$(srcroot)test/unit/base.c \
$(srcroot)test/unit/batch_alloc.c \
$(srcroot)test/unit/batcher.c \
$(srcroot)test/unit/bin_batching.c \
$(srcroot)test/unit/binshard.c \
$(srcroot)test/unit/bitmap.c \
$(srcroot)test/unit/bit_util.c \

View File

@ -563,10 +563,11 @@ arena_dalloc_bin_locked_begin(arena_dalloc_bin_locked_info_t *info,
* stats updates, which happen during finish (this lets running counts get left
* in a register).
*/
JEMALLOC_ALWAYS_INLINE bool
JEMALLOC_ALWAYS_INLINE void
arena_dalloc_bin_locked_step(tsdn_t *tsdn, arena_t *arena, bin_t *bin,
arena_dalloc_bin_locked_info_t *info, szind_t binind, edata_t *slab,
void *ptr) {
void *ptr, edata_t **dalloc_slabs, unsigned ndalloc_slabs,
unsigned *dalloc_slabs_count, edata_list_active_t *dalloc_slabs_extra) {
const bin_info_t *bin_info = &bin_infos[binind];
size_t regind = arena_slab_regind(info, binind, slab, ptr);
slab_data_t *slab_data = edata_slab_data_get(slab);
@ -586,12 +587,17 @@ arena_dalloc_bin_locked_step(tsdn_t *tsdn, arena_t *arena, bin_t *bin,
if (nfree == bin_info->nregs) {
arena_dalloc_bin_locked_handle_newly_empty(tsdn, arena, slab,
bin);
return true;
if (*dalloc_slabs_count < ndalloc_slabs) {
dalloc_slabs[*dalloc_slabs_count] = slab;
(*dalloc_slabs_count)++;
} else {
edata_list_active_append(dalloc_slabs_extra, slab);
}
} else if (nfree == 1 && slab != bin->slabcur) {
arena_dalloc_bin_locked_handle_newly_nonempty(tsdn, arena, slab,
bin);
}
return false;
}
JEMALLOC_ALWAYS_INLINE void
@ -604,11 +610,129 @@ arena_dalloc_bin_locked_finish(tsdn_t *tsdn, arena_t *arena, bin_t *bin,
}
}
JEMALLOC_ALWAYS_INLINE void
arena_bin_flush_batch_impl(tsdn_t *tsdn, arena_t *arena, bin_t *bin,
arena_dalloc_bin_locked_info_t *dalloc_bin_info, unsigned binind,
edata_t **dalloc_slabs, unsigned ndalloc_slabs, unsigned *dalloc_count,
edata_list_active_t *dalloc_slabs_extra) {
assert(binind < bin_info_nbatched_sizes);
bin_with_batch_t *batched_bin = (bin_with_batch_t *)bin;
size_t nelems_to_pop = batcher_pop_begin(tsdn,
&batched_bin->remote_frees);
bin_batching_test_mid_pop(nelems_to_pop);
if (nelems_to_pop == BATCHER_NO_IDX) {
malloc_mutex_assert_not_owner(tsdn,
&batched_bin->remote_frees.mtx);
return;
} else {
malloc_mutex_assert_owner(tsdn,
&batched_bin->remote_frees.mtx);
}
bin_remote_free_data_t remote_free_data[BIN_REMOTE_FREE_ELEMS_MAX];
for (size_t i = 0; i < nelems_to_pop; i++) {
remote_free_data[i] = batched_bin->remote_free_data[i];
}
batcher_pop_end(tsdn, &batched_bin->remote_frees);
for (size_t i = 0; i < nelems_to_pop; i++) {
arena_dalloc_bin_locked_step(tsdn, arena, bin, dalloc_bin_info,
binind, remote_free_data[i].slab, remote_free_data[i].ptr,
dalloc_slabs, ndalloc_slabs, dalloc_count,
dalloc_slabs_extra);
}
}
typedef struct arena_bin_flush_batch_state_s arena_bin_flush_batch_state_t;
struct arena_bin_flush_batch_state_s {
arena_dalloc_bin_locked_info_t info;
/*
* Bin batching is subtle in that there are unusual edge cases in which
* it can trigger the deallocation of more slabs than there were items
* flushed (say, if every original deallocation triggered a slab
* deallocation, and so did every batched one). So we keep a small
* backup array for any "extra" slabs, as well as a list to allow a
* dynamic number of them beyond that array.
*/
edata_t *dalloc_slabs[8];
unsigned dalloc_slab_count;
edata_list_active_t dalloc_slabs_extra;
};
JEMALLOC_ALWAYS_INLINE unsigned
arena_bin_batch_get_ndalloc_slabs(unsigned preallocated_slabs) {
if (preallocated_slabs > bin_batching_test_ndalloc_slabs_max) {
return bin_batching_test_ndalloc_slabs_max;
}
return preallocated_slabs;
}
JEMALLOC_ALWAYS_INLINE void
arena_bin_flush_batch_after_lock(tsdn_t *tsdn, arena_t *arena, bin_t *bin,
unsigned binind, arena_bin_flush_batch_state_t *state) {
if (binind >= bin_info_nbatched_sizes) {
return;
}
arena_dalloc_bin_locked_begin(&state->info, binind);
state->dalloc_slab_count = 0;
edata_list_active_init(&state->dalloc_slabs_extra);
unsigned preallocated_slabs = (unsigned)(sizeof(state->dalloc_slabs)
/ sizeof(state->dalloc_slabs[0]));
unsigned ndalloc_slabs = arena_bin_batch_get_ndalloc_slabs(
preallocated_slabs);
arena_bin_flush_batch_impl(tsdn, arena, bin, &state->info, binind,
state->dalloc_slabs, ndalloc_slabs,
&state->dalloc_slab_count, &state->dalloc_slabs_extra);
}
JEMALLOC_ALWAYS_INLINE void
arena_bin_flush_batch_before_unlock(tsdn_t *tsdn, arena_t *arena, bin_t *bin,
unsigned binind, arena_bin_flush_batch_state_t *state) {
if (binind >= bin_info_nbatched_sizes) {
return;
}
arena_dalloc_bin_locked_finish(tsdn, arena, bin, &state->info);
}
static inline bool
arena_bin_has_batch(szind_t binind) {
return binind < bin_info_nbatched_sizes;
}
JEMALLOC_ALWAYS_INLINE void
arena_bin_flush_batch_after_unlock(tsdn_t *tsdn, arena_t *arena, bin_t *bin,
unsigned binind, arena_bin_flush_batch_state_t *state) {
if (!arena_bin_has_batch(binind)) {
return;
}
/*
* The initialization of dalloc_slabs_extra is guarded by an
* arena_bin_has_batch check higher up the stack. But the clang
* analyzer forgets this down the stack, triggering a spurious error
* reported here.
*/
JEMALLOC_CLANG_ANALYZER_SUPPRESS {
bin_batching_test_after_unlock(state->dalloc_slab_count,
edata_list_active_empty(&state->dalloc_slabs_extra));
}
for (unsigned i = 0; i < state->dalloc_slab_count; i++) {
edata_t *slab = state->dalloc_slabs[i];
arena_slab_dalloc(tsdn, arena_get_from_edata(slab), slab);
}
while (!edata_list_active_empty(&state->dalloc_slabs_extra)) {
edata_t *slab = edata_list_active_first(
&state->dalloc_slabs_extra);
edata_list_active_remove(&state->dalloc_slabs_extra, slab);
arena_slab_dalloc(tsdn, arena_get_from_edata(slab), slab);
}
}
static inline bin_t *
arena_get_bin(arena_t *arena, szind_t binind, unsigned binshard) {
bin_t *shard0 = (bin_t *)((byte_t *)arena + arena_bin_offsets[binind]);

View File

@ -11,6 +11,51 @@
#define BIN_REMOTE_FREE_ELEMS_MAX 16
#ifdef JEMALLOC_JET
extern void (*bin_batching_test_after_push_hook)(size_t idx);
extern void (*bin_batching_test_mid_pop_hook)(size_t elems_to_pop);
extern void (*bin_batching_test_after_unlock_hook)(unsigned slab_dalloc_count,
bool list_empty);
#endif
#ifdef JEMALLOC_JET
extern unsigned bin_batching_test_ndalloc_slabs_max;
#else
static const unsigned bin_batching_test_ndalloc_slabs_max = (unsigned)-1;
#endif
JEMALLOC_ALWAYS_INLINE void
bin_batching_test_after_push(size_t idx) {
(void)idx;
#ifdef JEMALLOC_JET
if (bin_batching_test_after_push_hook != NULL) {
bin_batching_test_after_push_hook(idx);
}
#endif
}
JEMALLOC_ALWAYS_INLINE void
bin_batching_test_mid_pop(size_t elems_to_pop) {
(void)elems_to_pop;
#ifdef JEMALLOC_JET
if (bin_batching_test_mid_pop_hook != NULL) {
bin_batching_test_mid_pop_hook(elems_to_pop);
}
#endif
}
JEMALLOC_ALWAYS_INLINE void
bin_batching_test_after_unlock(unsigned slab_dalloc_count, bool list_empty) {
(void)slab_dalloc_count;
(void)list_empty;
#ifdef JEMALLOC_JET
if (bin_batching_test_after_unlock_hook != NULL) {
bin_batching_test_after_unlock_hook(slab_dalloc_count,
list_empty);
}
#endif
}
/*
* A bin contains a set of extents that are currently being used for slab
* allocations.
@ -70,7 +115,7 @@ bool bin_update_shard_size(unsigned bin_shards[SC_NBINS], size_t start_size,
size_t end_size, size_t nshards);
/* Initializes a bin to empty. Returns true on error. */
bool bin_init(bin_t *bin);
bool bin_init(bin_t *bin, unsigned binind);
/* Forking. */
void bin_prefork(tsdn_t *tsdn, bin_t *bin, bool has_batch);

View File

@ -48,6 +48,8 @@ struct bin_info_s {
extern size_t opt_bin_info_max_batched_size;
/* The number of batches per batched size class. */
extern size_t opt_bin_info_remote_free_max_batch;
/* The max number of pending elems (across all batches). */
extern size_t opt_bin_info_remote_free_max;
extern szind_t bin_info_nbatched_sizes;
extern unsigned bin_info_nbatched_bins;

View File

@ -661,10 +661,17 @@ arena_bin_slabs_full_remove(arena_t *arena, bin_t *bin, edata_t *slab) {
}
static void
arena_bin_reset(tsd_t *tsd, arena_t *arena, bin_t *bin) {
arena_bin_reset(tsd_t *tsd, arena_t *arena, bin_t *bin, unsigned binind) {
edata_t *slab;
malloc_mutex_lock(tsd_tsdn(tsd), &bin->lock);
if (arena_bin_has_batch(binind)) {
bin_with_batch_t *batched_bin = (bin_with_batch_t *)bin;
batcher_init(&batched_bin->remote_frees,
BIN_REMOTE_FREE_ELEMS_MAX);
}
if (bin->slabcur != NULL) {
slab = bin->slabcur;
bin->slabcur = NULL;
@ -815,7 +822,8 @@ arena_reset(tsd_t *tsd, arena_t *arena) {
/* Bins. */
for (unsigned i = 0; i < SC_NBINS; i++) {
for (unsigned j = 0; j < bin_infos[i].n_shards; j++) {
arena_bin_reset(tsd, arena, arena_get_bin(arena, i, j));
arena_bin_reset(tsd, arena, arena_get_bin(arena, i, j),
i);
}
}
pa_shard_reset(tsd_tsdn(tsd), &arena->pa_shard);
@ -1080,8 +1088,18 @@ arena_cache_bin_fill_small(tsdn_t *tsdn, arena_t *arena,
unsigned binshard;
bin_t *bin = arena_bin_choose(tsdn, arena, binind, &binshard);
/*
* This has some fields that are conditionally initialized down batch
* flush pathways. This can trigger static analysis warnings deeper
* down in the stack. The accesses are guarded by the same checks as
* the initialization, but the analysis isn't able to track that across
* multiple stack frames.
*/
arena_bin_flush_batch_state_t batch_flush_state
JEMALLOC_CLANG_ANALYZER_SILENCE_INIT({0});
label_refill:
malloc_mutex_lock(tsdn, &bin->lock);
arena_bin_flush_batch_after_lock(tsdn, arena, bin, binind, &batch_flush_state);
while (filled < nfill) {
/* Try batch-fill from slabcur first. */
@ -1136,7 +1154,11 @@ label_refill:
cache_bin->tstats.nrequests = 0;
}
arena_bin_flush_batch_before_unlock(tsdn, arena, bin, binind,
&batch_flush_state);
malloc_mutex_unlock(tsdn, &bin->lock);
arena_bin_flush_batch_after_unlock(tsdn, arena, bin, binind,
&batch_flush_state);
if (alloc_and_retry) {
assert(fresh_slab == NULL);
@ -1427,12 +1449,16 @@ arena_dalloc_bin(tsdn_t *tsdn, arena_t *arena, edata_t *edata, void *ptr) {
malloc_mutex_lock(tsdn, &bin->lock);
arena_dalloc_bin_locked_info_t info;
arena_dalloc_bin_locked_begin(&info, binind);
bool ret = arena_dalloc_bin_locked_step(tsdn, arena, bin,
&info, binind, edata, ptr);
edata_t *dalloc_slabs[1];
unsigned dalloc_slabs_count = 0;
arena_dalloc_bin_locked_step(tsdn, arena, bin, &info, binind, edata,
ptr, dalloc_slabs, /* ndalloc_slabs */ 1, &dalloc_slabs_count,
/* dalloc_slabs_extra */ NULL);
arena_dalloc_bin_locked_finish(tsdn, arena, bin, &info);
malloc_mutex_unlock(tsdn, &bin->lock);
if (ret) {
if (dalloc_slabs_count != 0) {
assert(dalloc_slabs[0] == edata);
arena_slab_dalloc(tsdn, arena, edata);
}
}
@ -1731,7 +1757,7 @@ arena_new(tsdn_t *tsdn, unsigned ind, const arena_config_t *config) {
for (unsigned i = 0; i < SC_NBINS; i++) {
for (unsigned j = 0; j < bin_infos[i].n_shards; j++) {
bin_t *bin = arena_get_bin(arena, i, j);
bool err = bin_init(bin);
bool err = bin_init(bin, i);
if (err) {
goto label_error;
}

View File

@ -6,6 +6,14 @@
#include "jemalloc/internal/sc.h"
#include "jemalloc/internal/witness.h"
#ifdef JEMALLOC_JET
unsigned bin_batching_test_ndalloc_slabs_max = (unsigned)-1;
void (*bin_batching_test_after_push_hook)(size_t push_idx);
void (*bin_batching_test_mid_pop_hook)(size_t nelems_to_pop);
void (*bin_batching_test_after_unlock_hook)(unsigned slab_dalloc_count,
bool list_empty);
#endif
bool
bin_update_shard_size(unsigned bin_shard_sizes[SC_NBINS], size_t start_size,
size_t end_size, size_t nshards) {
@ -39,7 +47,7 @@ bin_shard_sizes_boot(unsigned bin_shard_sizes[SC_NBINS]) {
}
bool
bin_init(bin_t *bin) {
bin_init(bin_t *bin, unsigned binind) {
if (malloc_mutex_init(&bin->lock, "bin", WITNESS_RANK_BIN,
malloc_mutex_rank_exclusive)) {
return true;
@ -50,6 +58,11 @@ bin_init(bin_t *bin) {
if (config_stats) {
memset(&bin->stats, 0, sizeof(bin_stats_t));
}
if (arena_bin_has_batch(binind)) {
bin_with_batch_t *batched_bin = (bin_with_batch_t *)bin;
batcher_init(&batched_bin->remote_frees,
opt_bin_info_remote_free_max);
}
return false;
}
@ -57,8 +70,23 @@ void
bin_prefork(tsdn_t *tsdn, bin_t *bin, bool has_batch) {
malloc_mutex_prefork(tsdn, &bin->lock);
if (has_batch) {
/*
* The batch mutex has lower rank than the bin mutex (as it must
* -- it's acquired later). But during forking, we go
* bin-at-a-time, so that we acquire mutex on bin 0, then on
* the bin 0 batcher, then on bin 1. This is a safe ordering
* (it's ordered by the index of arenas and bins within those
* arenas), but will trigger witness errors that would
* otherwise force another level of arena forking that breaks
* bin encapsulation (because the witness API doesn't "know"
* about arena or bin ordering -- it just sees that the batcher
* has a lower rank than the bin). So instead we exclude the
* batcher mutex from witness checking during fork (which is
* the only time we touch multiple bins at once) by passing
* TSDN_NULL.
*/
bin_with_batch_t *batched = (bin_with_batch_t *)bin;
batcher_prefork(tsdn, &batched->remote_frees);
batcher_prefork(TSDN_NULL, &batched->remote_frees);
}
}
@ -67,7 +95,7 @@ bin_postfork_parent(tsdn_t *tsdn, bin_t *bin, bool has_batch) {
malloc_mutex_postfork_parent(tsdn, &bin->lock);
if (has_batch) {
bin_with_batch_t *batched = (bin_with_batch_t *)bin;
batcher_postfork_parent(tsdn, &batched->remote_frees);
batcher_postfork_parent(TSDN_NULL, &batched->remote_frees);
}
}
@ -76,6 +104,6 @@ bin_postfork_child(tsdn_t *tsdn, bin_t *bin, bool has_batch) {
malloc_mutex_postfork_child(tsdn, &bin->lock);
if (has_batch) {
bin_with_batch_t *batched = (bin_with_batch_t *)bin;
batcher_postfork_child(tsdn, &batched->remote_frees);
batcher_postfork_child(TSDN_NULL, &batched->remote_frees);
}
}

View File

@ -3,8 +3,19 @@
#include "jemalloc/internal/bin_info.h"
size_t opt_bin_info_max_batched_size;
size_t opt_bin_info_remote_free_max_batch;
/*
* We leave bin-batching disabled by default, with other settings chosen mostly
* empirically; across the test programs I looked at they provided the most bang
* for the buck. With other default settings, these choices for bin batching
* result in them consuming far less memory (even in the worst case) than the
* tcaches themselves, the arena, etc.
* Note that we always try to pop all bins on every arena cache bin lock
* operation, so the typical memory waste is far less than this (and only on
* hot bins, which tend to be large anyways).
*/
size_t opt_bin_info_max_batched_size = 0; /* 192 is a good default. */
size_t opt_bin_info_remote_free_max_batch = 4;
size_t opt_bin_info_remote_free_max = BIN_REMOTE_FREE_ELEMS_MAX;
bin_info_t bin_infos[SC_NBINS];

View File

@ -130,6 +130,7 @@ CTL_PROTO(opt_utrace)
CTL_PROTO(opt_xmalloc)
CTL_PROTO(opt_experimental_infallible_new)
CTL_PROTO(opt_max_batched_size)
CTL_PROTO(opt_remote_free_max)
CTL_PROTO(opt_remote_free_max_batch)
CTL_PROTO(opt_tcache)
CTL_PROTO(opt_tcache_max)
@ -483,6 +484,7 @@ static const ctl_named_node_t opt_node[] = {
{NAME("experimental_infallible_new"),
CTL(opt_experimental_infallible_new)},
{NAME("max_batched_size"), CTL(opt_max_batched_size)},
{NAME("remote_free_max"), CTL(opt_remote_free_max)},
{NAME("remote_free_max_batch"), CTL(opt_remote_free_max_batch)},
{NAME("tcache"), CTL(opt_tcache)},
{NAME("tcache_max"), CTL(opt_tcache_max)},
@ -2208,6 +2210,8 @@ CTL_RO_NL_CGEN(config_xmalloc, opt_xmalloc, opt_xmalloc, bool)
CTL_RO_NL_CGEN(config_enable_cxx, opt_experimental_infallible_new,
opt_experimental_infallible_new, bool)
CTL_RO_NL_GEN(opt_max_batched_size, opt_bin_info_max_batched_size, size_t)
CTL_RO_NL_GEN(opt_remote_free_max, opt_bin_info_remote_free_max,
size_t)
CTL_RO_NL_GEN(opt_remote_free_max_batch, opt_bin_info_remote_free_max_batch,
size_t)
CTL_RO_NL_GEN(opt_tcache, opt_tcache, bool)
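As a usage sketch (not part of this change): with the entries above registered, the new options read back through the standard mallctl interface like any other opt.* setting. The program below assumes a build that includes this patch.

#include <stdio.h>
#include <jemalloc/jemalloc.h>

int
main(void) {
    size_t max_batched_size, remote_free_max, remote_free_max_batch;
    size_t sz = sizeof(size_t);

    /* These names mirror the CTL_RO_NL_GEN entries registered above. */
    if (mallctl("opt.max_batched_size", &max_batched_size, &sz, NULL, 0) != 0
        || mallctl("opt.remote_free_max", &remote_free_max, &sz, NULL, 0) != 0
        || mallctl("opt.remote_free_max_batch", &remote_free_max_batch, &sz,
        NULL, 0) != 0) {
        fprintf(stderr, "option not available in this build\n");
        return 1;
    }
    printf("max_batched_size=%zu remote_free_max=%zu "
        "remote_free_max_batch=%zu\n", max_batched_size, remote_free_max,
        remote_free_max_batch);
    return 0;
}

At startup the same knobs are set via MALLOC_CONF; the bin_batching.sh script further down, for example, sets remote_free_max:4 and remote_free_max_batch:1.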

View File

@ -1334,6 +1334,11 @@ malloc_conf_init_helper(sc_data_t *sc_data, unsigned bin_shard_sizes[SC_NBINS],
BIN_REMOTE_FREE_ELEMS_MAX,
CONF_DONT_CHECK_MIN, CONF_CHECK_MAX,
/* clip */ true)
CONF_HANDLE_SIZE_T(opt_bin_info_remote_free_max,
"remote_free_max", 0,
BIN_REMOTE_FREE_ELEMS_MAX,
CONF_DONT_CHECK_MIN, CONF_CHECK_MAX,
/* clip */ true)
if (CONF_MATCH("tcache_ncached_max")) {
bool err = tcache_bin_info_default_init(

View File

@ -1556,6 +1556,7 @@ stats_general_print(emitter_t *emitter) {
OPT_WRITE_BOOL("xmalloc")
OPT_WRITE_BOOL("experimental_infallible_new")
OPT_WRITE_SIZE_T("max_batched_size")
OPT_WRITE_SIZE_T("remote_free_max")
OPT_WRITE_SIZE_T("remote_free_max_batch")
OPT_WRITE_BOOL("tcache")
OPT_WRITE_SIZE_T("tcache_max")

View File

@ -325,6 +325,7 @@ tcache_bin_flush_impl_small(tsd_t *tsd, tcache_t *tcache, cache_bin_t *cache_bin
assert(binind < SC_NBINS);
arena_t *tcache_arena = tcache_slow->arena;
assert(tcache_arena != NULL);
unsigned tcache_binshard = tsd_binshardsp_get(tsdn_tsd(tsdn))->binshard[binind];
/*
* Variable length array must have > 0 length; the last element is never
@ -341,6 +342,18 @@ tcache_bin_flush_impl_small(tsd_t *tsd, tcache_t *tcache, cache_bin_t *cache_bin
unsigned dalloc_count = 0;
VARIABLE_ARRAY(edata_t *, dalloc_slabs, nflush + 1);
/*
* There's an edge case where we need to deallocate more slabs than we
* have elements of dalloc_slabs. This can happen if we end up deallocating
* items batched by another thread in addition to ones flushed from the
* cache. Since this is not very likely (most small object
* deallocations don't free up a whole slab), we don't want to burn the
* stack space to keep those excess slabs in an array. Instead we'll
* maintain an overflow list.
*/
edata_list_active_t dalloc_slabs_extra;
edata_list_active_init(&dalloc_slabs_extra);
/*
* We're about to grab a bunch of locks. If one of them happens to be
* the one guarding the arena-level stats counters we flush our
@ -418,40 +431,136 @@ tcache_bin_flush_impl_small(tsd_t *tsd, tcache_t *tcache, cache_bin_t *cache_bin
}
}
/* Actually do the flushing. */
malloc_mutex_lock(tsdn, &cur_bin->lock);
/*
* Flush stats first, if that was the right lock. Note that we
* don't actually have to flush stats into the current thread's
* binshard. Flushing into any binshard in the same arena is
* enough; we don't expose stats on per-binshard basis (just
* per-bin).
* We never batch when flushing to our home-base bin shard,
* since it's likely that we'll have to acquire that lock anyway
* when flushing stats.
*
* A plausible check we could add to can_batch is
* '&& arena_is_auto(cur_arena)'. The motivation would be that
* we have a higher tolerance for dubious user assumptions
* around non-auto arenas (e.g. "if I deallocate every object I
* allocated, and then call tcache.flush, then the arena stats
* must reflect zero live allocations").
*
* This is dubious for a couple reasons:
* - We already don't provide perfect fidelity for stats
* counting (e.g. for profiled allocations, whose size can
* inflate in stats).
* - Hanging load-bearing guarantees around stats impedes
* scalability in general.
*
* There are some "complete" strategies we could do instead:
* - Add an arena.<i>.quiesce call to pop all bins for users who
* do want those stats accounted for.
* - Make batchability a user-controllable per-arena option.
* - Do a batch pop after every mutex acquisition for which we
* want to provide accurate stats. This gives perfectly
* accurate stats, but can cause weird performance effects
* (because doing stats collection can now result in slabs
* becoming empty, and therefore purging, large mutex
* acquisition, etc.).
* - Propagate the "why" behind a flush down to the level of the
* batcher, and include a batch pop attempt down full tcache
* flushing pathways. This is just a lot of plumbing and
* internal complexity.
*
* We don't do any of these right now, but the decision calculus
* and tradeoffs are subtle enough that the reasoning was worth
* leaving in this comment.
*/
if (config_stats && tcache_arena == cur_arena
&& !merged_stats) {
merged_stats = true;
cur_bin->stats.nflushes++;
cur_bin->stats.nrequests +=
cache_bin->tstats.nrequests;
cache_bin->tstats.nrequests = 0;
bool bin_is_batched = arena_bin_has_batch(binind);
bool home_binshard = (cur_arena == tcache_arena
&& cur_binshard == tcache_binshard);
bool can_batch = (flush_start - prev_flush_start
<= opt_bin_info_remote_free_max_batch)
&& !home_binshard && bin_is_batched;
/*
* We try to avoid the batching pathway if we can, so we always
* at least *try* to lock.
*/
bool locked = false;
bool batched = false;
if (can_batch) {
locked = !malloc_mutex_trylock(tsdn, &cur_bin->lock);
}
/* Next flush objects. */
/* Init only to avoid used-uninitialized warning. */
arena_dalloc_bin_locked_info_t dalloc_bin_info = {0};
arena_dalloc_bin_locked_begin(&dalloc_bin_info, binind);
for (unsigned i = prev_flush_start; i < flush_start; i++) {
void *ptr = ptrs->ptr[i];
edata_t *edata = item_edata[i].edata;
if (arena_dalloc_bin_locked_step(tsdn,
cur_arena, cur_bin, &dalloc_bin_info,
binind, edata, ptr)) {
dalloc_slabs[dalloc_count] = edata;
dalloc_count++;
if (can_batch && !locked) {
bin_with_batch_t *batched_bin =
(bin_with_batch_t *)cur_bin;
size_t push_idx = batcher_push_begin(tsdn,
&batched_bin->remote_frees,
flush_start - prev_flush_start);
bin_batching_test_after_push(push_idx);
if (push_idx != BATCHER_NO_IDX) {
batched = true;
unsigned nbatched
= flush_start - prev_flush_start;
for (unsigned i = 0; i < nbatched; i++) {
unsigned src_ind = prev_flush_start + i;
batched_bin->remote_free_data[
push_idx + i].ptr
= ptrs->ptr[src_ind];
batched_bin->remote_free_data[
push_idx + i].slab
= item_edata[src_ind].edata;
}
batcher_push_end(tsdn,
&batched_bin->remote_frees);
}
}
arena_dalloc_bin_locked_finish(tsdn, cur_arena, cur_bin,
&dalloc_bin_info);
malloc_mutex_unlock(tsdn, &cur_bin->lock);
if (!batched) {
if (!locked) {
malloc_mutex_lock(tsdn, &cur_bin->lock);
}
/*
* Flush stats first, if that was the right lock. Note
* that we don't actually have to flush stats into the
* current thread's binshard. Flushing into any binshard
* in the same arena is enough; we don't expose stats on
* per-binshard basis (just per-bin).
*/
if (config_stats && tcache_arena == cur_arena
&& !merged_stats) {
merged_stats = true;
cur_bin->stats.nflushes++;
cur_bin->stats.nrequests +=
cache_bin->tstats.nrequests;
cache_bin->tstats.nrequests = 0;
}
unsigned preallocated_slabs = nflush;
unsigned ndalloc_slabs = arena_bin_batch_get_ndalloc_slabs(
preallocated_slabs);
/* Next, flush our own objects. */
/* Init only to avoid used-uninitialized warning. */
arena_dalloc_bin_locked_info_t dalloc_bin_info = {0};
arena_dalloc_bin_locked_begin(&dalloc_bin_info, binind);
for (unsigned i = prev_flush_start; i < flush_start;
i++) {
void *ptr = ptrs->ptr[i];
edata_t *edata = item_edata[i].edata;
arena_dalloc_bin_locked_step(tsdn, cur_arena,
cur_bin, &dalloc_bin_info, binind, edata,
ptr, dalloc_slabs, ndalloc_slabs,
&dalloc_count, &dalloc_slabs_extra);
}
/*
* Lastly, flush any batched objects (from other
* threads).
*/
if (bin_is_batched) {
arena_bin_flush_batch_impl(tsdn, cur_arena,
cur_bin, &dalloc_bin_info, binind,
dalloc_slabs, ndalloc_slabs,
&dalloc_count, &dalloc_slabs_extra);
}
arena_dalloc_bin_locked_finish(tsdn, cur_arena, cur_bin,
&dalloc_bin_info);
malloc_mutex_unlock(tsdn, &cur_bin->lock);
}
arena_decay_ticks(tsdn, cur_arena,
flush_start - prev_flush_start);
}
@ -460,7 +569,11 @@ tcache_bin_flush_impl_small(tsd_t *tsd, tcache_t *tcache, cache_bin_t *cache_bin
for (unsigned i = 0; i < dalloc_count; i++) {
edata_t *slab = dalloc_slabs[i];
arena_slab_dalloc(tsdn, arena_get_from_edata(slab), slab);
}
while (!edata_list_active_empty(&dalloc_slabs_extra)) {
edata_t *slab = edata_list_active_first(&dalloc_slabs_extra);
edata_list_active_remove(&dalloc_slabs_extra, slab);
arena_slab_dalloc(tsdn, arena_get_from_edata(slab), slab);
}
if (config_stats && !merged_stats) {

test/include/test/fork.h Normal file (32 lines)
View File

@ -0,0 +1,32 @@
#ifndef JEMALLOC_TEST_FORK_H
#define JEMALLOC_TEST_FORK_H
#ifndef _WIN32
#include <sys/wait.h>
static inline void
fork_wait_for_child_exit(int pid) {
int status;
while (true) {
if (waitpid(pid, &status, 0) == -1) {
test_fail("Unexpected waitpid() failure.");
}
if (WIFSIGNALED(status)) {
test_fail("Unexpected child termination due to "
"signal %d", WTERMSIG(status));
break;
}
if (WIFEXITED(status)) {
if (WEXITSTATUS(status) != 0) {
test_fail("Unexpected child exit value %d",
WEXITSTATUS(status));
}
break;
}
}
}
#endif
#endif /* JEMALLOC_TEST_FORK_H */

View File

@ -1,3 +1,6 @@
#ifndef JEMALLOC_TEST_H
#define JEMALLOC_TEST_H
#ifdef __cplusplus
extern "C" {
#endif
@ -172,3 +175,5 @@ extern "C" {
#ifdef __cplusplus
}
#endif
#endif

test/unit/bin_batching.c Normal file (264 lines)
View File

@ -0,0 +1,264 @@
#include "test/jemalloc_test.h"
#include "test/fork.h"
enum {
STRESS_THREADS = 3,
STRESS_OBJECTS_PER_THREAD = 1000,
STRESS_ALLOC_SZ = PAGE / 2,
};
typedef struct stress_thread_data_s stress_thread_data_t;
struct stress_thread_data_s {
unsigned thd_id;
atomic_zu_t *ready_thds;
atomic_zu_t *done_thds;
void **to_dalloc;
};
static atomic_zu_t push_failure_count;
static atomic_zu_t pop_attempt_results[2];
static atomic_zu_t dalloc_zero_slab_count;
static atomic_zu_t dalloc_nonzero_slab_count;
static atomic_zu_t dalloc_nonempty_list_count;
static bool
should_skip() {
return
/*
* We do batching operations on tcache flush pathways; we can't if
* caching is disabled.
*/
!opt_tcache ||
/* We rely on tcache fill/flush operations of the size we use. */
opt_tcache_max < STRESS_ALLOC_SZ
/*
* Some of the races we want to trigger are fiddly enough that they
* only show up under real concurrency. We add 1 to account for the
* main thread, which also does some work.
*/
|| ncpus < STRESS_THREADS + 1;
}
static void
increment_push_failure(size_t push_idx) {
if (push_idx == BATCHER_NO_IDX) {
atomic_fetch_add_zu(&push_failure_count, 1, ATOMIC_RELAXED);
} else {
assert_zu_lt(push_idx, 4, "Only 4 elems");
volatile int x = 10000;
while (--x) {
/* Spin for a while, to try to provoke a failure. */
}
}
}
static void
increment_pop_attempt(size_t elems_to_pop) {
bool elems = (elems_to_pop != BATCHER_NO_IDX);
atomic_fetch_add_zu(&pop_attempt_results[elems], 1, ATOMIC_RELAXED);
}
static void
increment_slab_dalloc_count(unsigned slab_dalloc_count, bool list_empty) {
if (slab_dalloc_count > 0) {
atomic_fetch_add_zu(&dalloc_nonzero_slab_count, 1,
ATOMIC_RELAXED);
} else {
atomic_fetch_add_zu(&dalloc_zero_slab_count, 1,
ATOMIC_RELAXED);
}
if (!list_empty) {
atomic_fetch_add_zu(&dalloc_nonempty_list_count, 1,
ATOMIC_RELAXED);
}
}
static void flush_tcache() {
assert_d_eq(0, mallctl("thread.tcache.flush", NULL, NULL, NULL, 0),
"Unexpected mallctl failure");
}
static void *
stress_thread(void *arg) {
stress_thread_data_t *data = arg;
uint64_t prng_state = data->thd_id;
atomic_fetch_add_zu(data->ready_thds, 1, ATOMIC_RELAXED);
while (atomic_load_zu(data->ready_thds, ATOMIC_RELAXED)
!= STRESS_THREADS) {
/* Spin */
}
for (int i = 0; i < STRESS_OBJECTS_PER_THREAD; i++) {
dallocx(data->to_dalloc[i], 0);
if (prng_range_u64(&prng_state, 3) == 0) {
flush_tcache();
}
}
flush_tcache();
atomic_fetch_add_zu(data->done_thds, 1, ATOMIC_RELAXED);
return NULL;
}
/*
* Run main_thread_fn in conditions that trigger all the various edge cases and
* subtle race conditions.
*/
static void
stress_run(void (*main_thread_fn)(), int nruns) {
bin_batching_test_ndalloc_slabs_max = 1;
bin_batching_test_after_push_hook = &increment_push_failure;
bin_batching_test_mid_pop_hook = &increment_pop_attempt;
bin_batching_test_after_unlock_hook = &increment_slab_dalloc_count;
atomic_store_zu(&push_failure_count, 0, ATOMIC_RELAXED);
atomic_store_zu(&pop_attempt_results[0], 0, ATOMIC_RELAXED);
atomic_store_zu(&pop_attempt_results[1], 0, ATOMIC_RELAXED);
atomic_store_zu(&dalloc_zero_slab_count, 0, ATOMIC_RELAXED);
atomic_store_zu(&dalloc_nonzero_slab_count, 0, ATOMIC_RELAXED);
atomic_store_zu(&dalloc_nonempty_list_count, 0, ATOMIC_RELAXED);
for (int run = 0; run < nruns; run++) {
thd_t thds[STRESS_THREADS];
stress_thread_data_t thd_datas[STRESS_THREADS];
atomic_zu_t ready_thds;
atomic_store_zu(&ready_thds, 0, ATOMIC_RELAXED);
atomic_zu_t done_thds;
atomic_store_zu(&done_thds, 0, ATOMIC_RELAXED);
void *ptrs[STRESS_THREADS][STRESS_OBJECTS_PER_THREAD];
for (int i = 0; i < STRESS_THREADS; i++) {
thd_datas[i].thd_id = i;
thd_datas[i].ready_thds = &ready_thds;
thd_datas[i].done_thds = &done_thds;
thd_datas[i].to_dalloc = ptrs[i];
for (int j = 0; j < STRESS_OBJECTS_PER_THREAD; j++) {
void *ptr = mallocx(STRESS_ALLOC_SZ, 0);
assert_ptr_not_null(ptr, "alloc failure");
ptrs[i][j] = ptr;
}
}
for (int i = 0; i < STRESS_THREADS; i++) {
thd_create(&thds[i], stress_thread, &thd_datas[i]);
}
while (atomic_load_zu(&done_thds, ATOMIC_RELAXED)
!= STRESS_THREADS) {
main_thread_fn();
}
for (int i = 0; i < STRESS_THREADS; i++) {
thd_join(thds[i], NULL);
}
}
bin_batching_test_ndalloc_slabs_max = (unsigned)-1;
bin_batching_test_after_push_hook = NULL;
bin_batching_test_mid_pop_hook = NULL;
bin_batching_test_after_unlock_hook = NULL;
}
static void
do_allocs_frees() {
enum {NALLOCS = 32};
flush_tcache();
void *ptrs[NALLOCS];
for (int i = 0; i < NALLOCS; i++) {
ptrs[i] = mallocx(STRESS_ALLOC_SZ, 0);
}
for (int i = 0; i < NALLOCS; i++) {
dallocx(ptrs[i], 0);
}
flush_tcache();
}
static void
test_arena_reset_main_fn() {
do_allocs_frees();
}
TEST_BEGIN(test_arena_reset) {
int err;
unsigned arena;
unsigned old_arena;
test_skip_if(should_skip());
test_skip_if(opt_percpu_arena != percpu_arena_disabled);
size_t arena_sz = sizeof(arena);
err = mallctl("arenas.create", (void *)&arena, &arena_sz, NULL, 0);
assert_d_eq(0, err, "Arena creation failed");
err = mallctl("thread.arena", &old_arena, &arena_sz, &arena, arena_sz);
assert_d_eq(0, err, "changing arena failed");
stress_run(&test_arena_reset_main_fn, /* nruns */ 10);
flush_tcache();
char buf[100];
malloc_snprintf(buf, sizeof(buf), "arena.%u.reset", arena);
err = mallctl(buf, NULL, NULL, NULL, 0);
assert_d_eq(0, err, "Couldn't change arena");
do_allocs_frees();
err = mallctl("thread.arena", NULL, NULL, &old_arena, arena_sz);
assert_d_eq(0, err, "changing arena failed");
}
TEST_END
static void
test_fork_main_fn() {
#ifndef _WIN32
pid_t pid = fork();
if (pid == -1) {
test_fail("Fork failure!");
} else if (pid == 0) {
/* Child */
do_allocs_frees();
_exit(0);
} else {
fork_wait_for_child_exit(pid);
do_allocs_frees();
}
#endif
}
TEST_BEGIN(test_fork) {
#ifdef _WIN32
test_skip("No fork on windows");
#endif
test_skip_if(should_skip());
stress_run(&test_fork_main_fn, /* nruns */ 10);
}
TEST_END
static void
test_races_main_fn() {
do_allocs_frees();
}
TEST_BEGIN(test_races) {
test_skip_if(should_skip());
stress_run(&test_races_main_fn, /* nruns */ 400);
assert_zu_lt(0, atomic_load_zu(&push_failure_count, ATOMIC_RELAXED),
"Should have seen some push failures");
assert_zu_lt(0, atomic_load_zu(&pop_attempt_results[0], ATOMIC_RELAXED),
"Should have seen some pop failures");
assert_zu_lt(0, atomic_load_zu(&pop_attempt_results[1], ATOMIC_RELAXED),
"Should have seen some pop successes");
assert_zu_lt(0, atomic_load_zu(&dalloc_zero_slab_count, ATOMIC_RELAXED),
"Expected some frees that didn't empty a slab");
assert_zu_lt(0, atomic_load_zu(&dalloc_nonzero_slab_count,
ATOMIC_RELAXED), "Expected some frees that emptied a slab");
assert_zu_lt(0, atomic_load_zu(&dalloc_nonempty_list_count,
ATOMIC_RELAXED), "Expected some frees that used the slab overflow list");
}
TEST_END
int
main(void) {
return test_no_reentrancy(
test_arena_reset,
test_races,
test_fork);
}

test/unit/bin_batching.sh Normal file (10 lines)
View File

@ -0,0 +1,10 @@
#!/bin/sh
# This value of max_batched_size effectively requires all bins to be batched;
# our page limits are fuzzy, but we bound slab item counts to 2**32, so a bin
# could only exceed this size if minimum page sizes were multi-gigabyte.
# The reason for this sort of hacky approach is that we want to
# allocate/deallocate PAGE/2-sized objects (to trigger the "non-empty" ->
# "empty" and "non-empty"-> "full" transitions often, which have special
# handling). But the value of PAGE isn't easily available in test scripts.
export MALLOC_CONF="narenas:2,bin_shards:1-1000000000:3,max_batched_size:1000000000,remote_free_max_batch:1,remote_free_max:4"

View File

@ -1,32 +1,5 @@
#include "test/jemalloc_test.h"
#ifndef _WIN32
#include <sys/wait.h>
#endif
#ifndef _WIN32
static void
wait_for_child_exit(int pid) {
int status;
while (true) {
if (waitpid(pid, &status, 0) == -1) {
test_fail("Unexpected waitpid() failure.");
}
if (WIFSIGNALED(status)) {
test_fail("Unexpected child termination due to "
"signal %d", WTERMSIG(status));
break;
}
if (WIFEXITED(status)) {
if (WEXITSTATUS(status) != 0) {
test_fail("Unexpected child exit value %d",
WEXITSTATUS(status));
}
break;
}
}
}
#endif
#include "test/fork.h"
TEST_BEGIN(test_fork) {
#ifndef _WIN32
@ -64,7 +37,7 @@ TEST_BEGIN(test_fork) {
/* Child. */
_exit(0);
} else {
wait_for_child_exit(pid);
fork_wait_for_child_exit(pid);
}
#else
test_skip("fork(2) is irrelevant to Windows");
@ -87,7 +60,7 @@ do_fork_thd(void *arg) {
test_fail("Exec failed");
} else {
/* Parent */
wait_for_child_exit(pid);
fork_wait_for_child_exit(pid);
}
return NULL;
}
@ -124,7 +97,7 @@ TEST_BEGIN(test_fork_multithreaded) {
do_test_fork_multithreaded();
_exit(0);
} else {
wait_for_child_exit(pid);
fork_wait_for_child_exit(pid);
}
}
#else