Limit maximum number of purged slabs with option

The `experimental_hpa_max_purge_nhp` option is introduced for backward
compatibility: it makes it possible to keep behaviour similar to the
buggy `hpa_strict_min_purge_interval` implementation.

When `experimental_hpa_max_purge_nhp` is set to -1, there is no limit
on the number of slabs we'll purge on each iteration. Otherwise, we'll
purge no more than `experimental_hpa_max_purge_nhp` hugepages (slabs).
This in turn means we might not purge enough dirty pages to satisfy
the `hpa_dirty_mult` requirement.
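As a minimal sketch of how an application might set this limit, assuming an
unprefixed jemalloc build with HPA support (a prefixed build would use
`je_malloc_conf` instead), and assuming the HPA pages allocator itself is
enabled through the `hpa` option; the option name matches the conf handler
added in this change:

#include <stdlib.h>

/*
 * Read by jemalloc at initialization. -1 (the default) leaves purging
 * unlimited; a positive value caps the number of hugepages (slabs) purged
 * per purging attempt.
 */
const char *malloc_conf = "hpa:true,experimental_hpa_max_purge_nhp:4";

int
main(void) {
	/* With HPA enabled, eligible allocations are served by HPA shards. */
	void *p = malloc(4096);
	free(p);
	return 0;
}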

The combination of the `hpa_dirty_mult`, `experimental_hpa_max_purge_nhp`,
and `hpa_strict_min_purge_interval` options allows us to maintain a steady
rate of pages returned to the system. This gives stricter latency
guarantees, since the number of `madvise` calls is bounded (and hence the
number of TLB shootdowns is limited), in exchange for weaker memory usage
guarantees.
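For instance, with `hpa_strict_min_purge_interval` enabled and
`experimental_hpa_max_purge_nhp` set to N, at most N hugepages are purged per
`hpa_min_purge_interval_ms` window, which is what bounds the `madvise` rate.
The effective settings can be read back at run time; below is a sketch
assuming a build that exposes the unprefixed `mallctl` API via
`<jemalloc/jemalloc.h>` (the `opt.*` names and types mirror the ctl nodes
added in this change):

#include <inttypes.h>
#include <stdbool.h>
#include <stdio.h>
#include <sys/types.h>
#include <jemalloc/jemalloc.h>

int
main(void) {
	ssize_t max_purge_nhp;
	uint64_t interval_ms;
	bool strict;
	size_t sz;

	/* All three are read-only opt.* mallctls. */
	sz = sizeof(max_purge_nhp);
	mallctl("opt.experimental_hpa_max_purge_nhp", &max_purge_nhp, &sz,
	    NULL, 0);
	sz = sizeof(interval_ms);
	mallctl("opt.hpa_min_purge_interval_ms", &interval_ms, &sz, NULL, 0);
	sz = sizeof(strict);
	mallctl("opt.hpa_strict_min_purge_interval", &strict, &sz, NULL, 0);

	/*
	 * With a positive max_purge_nhp and strict intervals, at most
	 * max_purge_nhp hugepages are purged per interval_ms window.
	 */
	printf("max_purge_nhp=%zd interval_ms=%" PRIu64 " strict=%d\n",
	    max_purge_nhp, interval_ms, (int)strict);
	return 0;
}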
Author: Dmitry Ilvokhin, 2024-08-06 08:47:57 -07:00; committed by Qi Wang
parent 143f458188
commit aaa29003ab
7 changed files with 109 additions and 6 deletions

@ -57,6 +57,11 @@ struct hpa_shard_opts_s {
* purging logic fix.
*/
bool strict_min_purge_interval;
/*
* Maximum number of hugepages to purge on each purging attempt.
*/
ssize_t experimental_max_purge_nhp;
};
#define HPA_SHARD_OPTS_DEFAULT { \
@ -79,7 +84,9 @@ struct hpa_shard_opts_s {
/* min_purge_interval_ms */ \
5 * 1000, \
/* strict_min_purge_interval */ \
false \
false, \
/* experimental_max_purge_nhp */ \
-1 \
}
#endif /* JEMALLOC_INTERNAL_HPA_OPTS_H */

@ -104,6 +104,7 @@ CTL_PROTO(opt_hpa_hugification_threshold)
CTL_PROTO(opt_hpa_hugify_delay_ms)
CTL_PROTO(opt_hpa_min_purge_interval_ms)
CTL_PROTO(opt_hpa_strict_min_purge_interval)
CTL_PROTO(opt_experimental_hpa_max_purge_nhp)
CTL_PROTO(opt_hpa_dirty_mult)
CTL_PROTO(opt_hpa_sec_nshards)
CTL_PROTO(opt_hpa_sec_max_alloc)
@ -460,7 +461,10 @@ static const ctl_named_node_t opt_node[] = {
CTL(opt_hpa_hugification_threshold)},
{NAME("hpa_hugify_delay_ms"), CTL(opt_hpa_hugify_delay_ms)},
{NAME("hpa_min_purge_interval_ms"), CTL(opt_hpa_min_purge_interval_ms)},
{NAME("hpa_strict_min_purge_interval"), CTL(opt_hpa_strict_min_purge_interval)},
{NAME("hpa_strict_min_purge_interval"),
CTL(opt_hpa_strict_min_purge_interval)},
{NAME("experimental_hpa_max_purge_nhp"),
CTL(opt_experimental_hpa_max_purge_nhp)},
{NAME("hpa_dirty_mult"), CTL(opt_hpa_dirty_mult)},
{NAME("hpa_sec_nshards"), CTL(opt_hpa_sec_nshards)},
{NAME("hpa_sec_max_alloc"), CTL(opt_hpa_sec_max_alloc)},
@ -2197,6 +2201,8 @@ CTL_RO_NL_GEN(opt_hpa_min_purge_interval_ms, opt_hpa_opts.min_purge_interval_ms,
uint64_t)
CTL_RO_NL_GEN(opt_hpa_strict_min_purge_interval,
opt_hpa_opts.strict_min_purge_interval, bool)
CTL_RO_NL_GEN(opt_experimental_hpa_max_purge_nhp,
opt_hpa_opts.experimental_max_purge_nhp, ssize_t)
/*
* This will have to change before we publicly document this option; fxp_t and

@ -552,7 +552,22 @@ hpa_shard_maybe_do_deferred_work(tsdn_t *tsdn, hpa_shard_t *shard,
* too frequently.
*/
if (hpa_min_purge_interval_passed(tsdn, shard)) {
while (hpa_should_purge(tsdn, shard) && nops < max_ops) {
size_t max_purges = max_ops;
/*
 * Limit the number of hugepages (slabs) to purge.
 * When the experimental_max_purge_nhp option is used, there is
 * no guarantee we'll always respect the dirty_mult option. The
 * experimental_max_purge_nhp option provides a way to configure
 * the same behaviour that was possible before with the buggy
 * implementation of the purging algorithm.
 */
ssize_t max_purge_nhp = shard->opts.experimental_max_purge_nhp;
if (max_purge_nhp != -1 &&
max_purges > (size_t)max_purge_nhp) {
max_purges = max_purge_nhp;
}
while (hpa_should_purge(tsdn, shard) && nops < max_purges) {
if (!hpa_try_purge(tsdn, shard)) {
/*
* It is fine if we couldn't purge as sometimes

@ -1558,6 +1558,10 @@ malloc_conf_init_helper(sc_data_t *sc_data, unsigned bin_shard_sizes[SC_NBINS],
opt_hpa_opts.strict_min_purge_interval,
"hpa_strict_min_purge_interval");
CONF_HANDLE_SSIZE_T(
opt_hpa_opts.experimental_max_purge_nhp,
"experimental_hpa_max_purge_nhp", -1, SSIZE_MAX);
if (CONF_MATCH("hpa_dirty_mult")) {
if (CONF_MATCH_VALUE("-1")) {
opt_hpa_opts.dirty_mult = (fxp_t)-1;

@ -1565,6 +1565,7 @@ stats_general_print(emitter_t *emitter) {
OPT_WRITE_UINT64("hpa_hugify_delay_ms")
OPT_WRITE_UINT64("hpa_min_purge_interval_ms")
OPT_WRITE_BOOL("hpa_strict_min_purge_interval")
OPT_WRITE_SSIZE_T("experimental_hpa_max_purge_nhp")
if (je_mallctl("opt.hpa_dirty_mult", (void *)&u32v, &u32sz, NULL, 0)
== 0) {
/*

@ -35,7 +35,9 @@ static hpa_shard_opts_t test_hpa_shard_opts_default = {
/* min_purge_interval_ms */
5 * 1000,
/* strict_min_purge_interval */
false
false,
/* experimental_max_purge_nhp */
-1
};
static hpa_shard_opts_t test_hpa_shard_opts_purge = {
@ -52,7 +54,9 @@ static hpa_shard_opts_t test_hpa_shard_opts_purge = {
/* min_purge_interval_ms */
5 * 1000,
/* strict_min_purge_interval */
false
false,
/* experimental_max_purge_nhp */
-1
};
static hpa_shard_t *
@ -653,6 +657,70 @@ TEST_BEGIN(test_purge) {
}
TEST_END
TEST_BEGIN(test_experimental_max_purge_nhp) {
test_skip_if(!hpa_supported());
hpa_hooks_t hooks;
hooks.map = &defer_test_map;
hooks.unmap = &defer_test_unmap;
hooks.purge = &defer_test_purge;
hooks.hugify = &defer_test_hugify;
hooks.dehugify = &defer_test_dehugify;
hooks.curtime = &defer_test_curtime;
hooks.ms_since = &defer_test_ms_since;
hpa_shard_opts_t opts = test_hpa_shard_opts_default;
opts.deferral_allowed = true;
opts.experimental_max_purge_nhp = 1;
hpa_shard_t *shard = create_test_data(&hooks, &opts);
bool deferred_work_generated = false;
nstime_init(&defer_curtime, 0);
tsdn_t *tsdn = tsd_tsdn(tsd_fetch());
enum {NALLOCS = 8 * HUGEPAGE_PAGES};
edata_t *edatas[NALLOCS];
for (int i = 0; i < NALLOCS; i++) {
edatas[i] = pai_alloc(tsdn, &shard->pai, PAGE, PAGE, false,
false, false, &deferred_work_generated);
expect_ptr_not_null(edatas[i], "Unexpected null edata");
}
/* Deallocate 3 hugepages out of 8. */
for (int i = 0; i < 3 * (int)HUGEPAGE_PAGES; i++) {
pai_dalloc(tsdn, &shard->pai, edatas[i],
&deferred_work_generated);
}
hpa_shard_do_deferred_work(tsdn, shard);
expect_zu_eq(0, ndefer_hugify_calls, "Hugified too early");
expect_zu_eq(0, ndefer_dehugify_calls, "Dehugified too early");
/*
* Expect only one purge call, because opts.experimental_max_purge_nhp
* is set to 1.
*/
expect_zu_eq(1, ndefer_purge_calls, "Expect purges");
ndefer_purge_calls = 0;
hpa_shard_do_deferred_work(tsdn, shard);
expect_zu_eq(0, ndefer_hugify_calls, "Hugified too early");
expect_zu_eq(0, ndefer_dehugify_calls, "Dehugified too early");
/* We are still above the limit for dirty pages. */
expect_zu_eq(1, ndefer_purge_calls, "Expect purge");
ndefer_purge_calls = 0;
hpa_shard_do_deferred_work(tsdn, shard);
expect_zu_eq(0, ndefer_hugify_calls, "Hugified too early");
expect_zu_eq(0, ndefer_dehugify_calls, "Dehugified too early");
/* Finally, we are below the limit, so no purges are expected. */
expect_zu_eq(0, ndefer_purge_calls, "Purged too early");
destroy_test_data(shard);
}
TEST_END
int
main(void) {
/*
@ -675,5 +743,6 @@ main(void) {
test_purge_no_infinite_loop,
test_strict_no_min_purge_interval,
test_strict_min_purge_interval,
test_purge);
test_purge,
test_experimental_max_purge_nhp);
}

@ -292,6 +292,7 @@ TEST_BEGIN(test_mallctl_opt) {
TEST_MALLCTL_OPT(size_t, hpa_sec_max_bytes, always);
TEST_MALLCTL_OPT(size_t, hpa_sec_bytes_after_flush, always);
TEST_MALLCTL_OPT(size_t, hpa_sec_batch_fill_extra, always);
TEST_MALLCTL_OPT(ssize_t, experimental_hpa_max_purge_nhp, always);
TEST_MALLCTL_OPT(unsigned, narenas, always);
TEST_MALLCTL_OPT(const char *, percpu_arena, always);
TEST_MALLCTL_OPT(size_t, oversize_threshold, always);