Reland "[scudo] Manage free blocks in BatchGroup."

This is not a pure revert of c929bcb7d8.
It also includes a bug fix.

Differential Revision: https://reviews.llvm.org/D136029
Chia-hung Duan 2022-10-17 20:38:37 +00:00
parent 8c8775e938
commit c0f91856a3
8 changed files with 640 additions and 67 deletions

View File

@ -34,6 +34,14 @@ namespace scudo {
// typedef SizeClassAllocator64<ExampleConfig> Primary;
// // Log2 of the size of a size class region, as used by the Primary.
// static const uptr PrimaryRegionSizeLog = 30U;
// // Log2 of the size of a block group, as used by the Primary. Each group
// // covers a range of memory addresses, and blocks within that range belong
// // to the same group. In general, a Primary that uses a single large region
// // per size class uses a 1 or 2 MB group size, whereas a Primary that uses
// // multiple regions sets the group size equal to the region size, because
// // its regions are usually smaller than 1 MB.
// // A smaller value gives finer-grained control of memory usage, but the
// // trade-off is that deallocation may take longer.
// static const uptr PrimaryGroupSizeLog = 20U;
// // Defines the type and scale of a compact pointer. A compact pointer can
// // be understood as the offset of a pointer within the region it belongs
// // to, in increments of a power-of-2 scale.
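
For illustration, here is a minimal standalone sketch (not part of this patch; the constant below is an assumed value) of how a group id is derived from a block address once a group size log is chosen, mirroring what compactPtrGroup() does in the 32-bit primary:

#include <cstdint>
#include <cstdio>

// Assumed value for the sketch; DefaultConfig above uses 21 (64-bit) or 19 (32-bit).
constexpr uint64_t GroupSizeLog = 20;

// Blocks whose addresses fall in the same (1 << GroupSizeLog)-sized window
// share a group id.
constexpr uint64_t groupIdOf(uint64_t BlockAddress) {
  return BlockAddress >> GroupSizeLog;
}

int main() {
  printf("%llu %llu %llu\n",
         (unsigned long long)groupIdOf(0x0FFFFF),  // 0
         (unsigned long long)groupIdOf(0x100000),  // 1
         (unsigned long long)groupIdOf(0x1FFFFF)); // 1
  return 0;
}
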
@ -65,6 +73,7 @@ struct DefaultConfig {
#if SCUDO_CAN_USE_PRIMARY64
typedef SizeClassAllocator64<DefaultConfig> Primary;
static const uptr PrimaryRegionSizeLog = 32U;
static const uptr PrimaryGroupSizeLog = 21U;
typedef uptr PrimaryCompactPtrT;
static const uptr PrimaryCompactPtrScale = 0;
static const bool PrimaryEnableRandomOffset = true;
@ -72,6 +81,7 @@ struct DefaultConfig {
#else
typedef SizeClassAllocator32<DefaultConfig> Primary;
static const uptr PrimaryRegionSizeLog = 19U;
static const uptr PrimaryGroupSizeLog = 19U;
typedef uptr PrimaryCompactPtrT;
#endif
static const s32 PrimaryMinReleaseToOsIntervalMs = INT32_MIN;
@ -96,11 +106,13 @@ struct AndroidConfig {
static const uptr PrimaryRegionSizeLog = 28U;
typedef u32 PrimaryCompactPtrT;
static const uptr PrimaryCompactPtrScale = SCUDO_MIN_ALIGNMENT_LOG;
static const uptr PrimaryGroupSizeLog = 20U;
static const bool PrimaryEnableRandomOffset = true;
static const uptr PrimaryMapSizeIncrement = 1UL << 18;
#else
typedef SizeClassAllocator32<AndroidConfig> Primary;
static const uptr PrimaryRegionSizeLog = 18U;
static const uptr PrimaryGroupSizeLog = 18U;
typedef uptr PrimaryCompactPtrT;
#endif
static const s32 PrimaryMinReleaseToOsIntervalMs = 1000;
@ -127,11 +139,13 @@ struct AndroidSvelteConfig {
static const uptr PrimaryRegionSizeLog = 27U;
typedef u32 PrimaryCompactPtrT;
static const uptr PrimaryCompactPtrScale = SCUDO_MIN_ALIGNMENT_LOG;
static const uptr PrimaryGroupSizeLog = 18U;
static const bool PrimaryEnableRandomOffset = true;
static const uptr PrimaryMapSizeIncrement = 1UL << 18;
#else
typedef SizeClassAllocator32<AndroidSvelteConfig> Primary;
static const uptr PrimaryRegionSizeLog = 16U;
static const uptr PrimaryGroupSizeLog = 16U;
typedef uptr PrimaryCompactPtrT;
#endif
static const s32 PrimaryMinReleaseToOsIntervalMs = 1000;
@ -156,6 +170,7 @@ struct FuchsiaConfig {
typedef SizeClassAllocator64<FuchsiaConfig> Primary;
static const uptr PrimaryRegionSizeLog = 30U;
static const uptr PrimaryGroupSizeLog = 30U;
typedef u32 PrimaryCompactPtrT;
static const bool PrimaryEnableRandomOffset = true;
static const uptr PrimaryMapSizeIncrement = 1UL << 18;
@ -175,6 +190,7 @@ struct TrustyConfig {
typedef SizeClassAllocator64<TrustyConfig> Primary;
// Some apps have 1 page of heap total so small regions are necessary.
static const uptr PrimaryRegionSizeLog = 10U;
static const uptr PrimaryGroupSizeLog = 10U;
typedef u32 PrimaryCompactPtrT;
static const bool PrimaryEnableRandomOffset = false;
// Trusty is extremely memory-constrained so minimally round up map calls.

View File

@ -110,6 +110,18 @@ template <class T> struct SinglyLinkedList : public IntrusiveList<T> {
Size--;
}
// Insert X right after Prev.
void insert(T *Prev, T *X) {
DCHECK(!empty());
DCHECK_NE(Prev, nullptr);
DCHECK_NE(X, nullptr);
X->Next = Prev->Next;
Prev->Next = X;
if (Last == Prev)
Last = X;
++Size;
}
void extract(T *Prev, T *X) {
DCHECK(!empty());
DCHECK_NE(Prev, nullptr);

View File

@ -10,6 +10,7 @@
#define SCUDO_LOCAL_CACHE_H_
#include "internal_defs.h"
#include "list.h"
#include "platform.h"
#include "report.h"
#include "stats.h"
@ -27,6 +28,12 @@ template <class SizeClassAllocator> struct SizeClassAllocatorLocalCache {
Count = N;
memcpy(Batch, Array, sizeof(Batch[0]) * Count);
}
void appendFromArray(CompactPtrT *Array, u16 N) {
DCHECK_LE(N, MaxNumCached - Count);
memcpy(Batch + Count, Array, sizeof(Batch[0]) * N);
// u16 will be promoted to int by arithmetic type conversion.
Count = static_cast<u16>(Count + N);
}
void clear() { Count = 0; }
void add(CompactPtrT P) {
DCHECK_LT(Count, MaxNumCached);
@ -50,6 +57,22 @@ template <class SizeClassAllocator> struct SizeClassAllocatorLocalCache {
u16 Count;
};
// A BatchGroup is used to collect blocks. Each group has a group id that
// identifies which range of addresses the blocks it contains belong to.
struct BatchGroup {
// `Next` is used by IntrusiveList.
BatchGroup *Next;
// The identifier of each group
uptr GroupId;
// Cached value of TransferBatch::getMaxCached()
u16 MaxCachedPerBatch;
// Blocks are managed by TransferBatch in a list.
SinglyLinkedList<TransferBatch> Batches;
};
static_assert(sizeof(BatchGroup) <= sizeof(TransferBatch),
"BatchGroup uses the same class size as TransferBatch");
void init(GlobalStats *S, SizeClassAllocator *A) {
DCHECK(isEmpty());
Stats.init();
@ -121,9 +144,18 @@ template <class SizeClassAllocator> struct SizeClassAllocatorLocalCache {
TransferBatch *createBatch(uptr ClassId, void *B) {
if (ClassId != BatchClassId)
B = allocate(BatchClassId);
if (UNLIKELY(!B))
reportOutOfMemory(SizeClassAllocator::getSizeByClassId(BatchClassId));
return reinterpret_cast<TransferBatch *>(B);
}
BatchGroup *createGroup() {
void *Ptr = allocate(BatchClassId);
if (UNLIKELY(!Ptr))
reportOutOfMemory(SizeClassAllocator::getSizeByClassId(BatchClassId));
return reinterpret_cast<BatchGroup *>(Ptr);
}
LocalStats &getStats() { return Stats; }
private:
@ -182,16 +214,11 @@ private:
NOINLINE void drain(PerClass *C, uptr ClassId) {
const u16 Count = Min(static_cast<u16>(C->MaxCount / 2), C->Count);
TransferBatch *B =
createBatch(ClassId, Allocator->decompactPtr(ClassId, C->Chunks[0]));
if (UNLIKELY(!B))
reportOutOfMemory(SizeClassAllocator::getSizeByClassId(BatchClassId));
B->setFromArray(&C->Chunks[0], Count);
Allocator->pushBlocks(this, ClassId, &C->Chunks[0], Count);
// u16 will be promoted to int by arithmetic type conversion.
C->Count = static_cast<u16>(C->Count - Count);
for (u16 I = 0; I < C->Count; I++)
C->Chunks[I] = C->Chunks[I + Count];
Allocator->pushBatch(ClassId, B);
}
};
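
To make the reworked drain() above concrete, a standalone sketch (toy signature, illustrative only) of the same flow: hand the first half of the cached pointers back to the allocator in a single pushBlocks-style call and compact the remainder to the front of the array:

#include <algorithm>
#include <cstdint>
#include <functional>

using u16 = uint16_t;
using u32 = uint32_t;

// PushBlocks is an assumed toy callback standing in for Allocator->pushBlocks().
void drainSketch(u32 *Chunks, u16 &Count, u16 MaxCount,
                 const std::function<void(u32 *, u16)> &PushBlocks) {
  const u16 ToPush = std::min<u16>(static_cast<u16>(MaxCount / 2), Count);
  PushBlocks(Chunks, ToPush);               // The whole half goes back in one call.
  Count = static_cast<u16>(Count - ToPush); // u16 arithmetic promotes to int.
  for (u16 I = 0; I < Count; I++)           // Keep the remaining pointers packed.
    Chunks[I] = Chunks[I + ToPush];
}

int main() {
  u32 Chunks[8] = {1, 2, 3, 4, 5, 6};
  u16 Count = 6;
  drainSketch(Chunks, Count, /*MaxCount=*/8,
              [](u32 *, u16 N) { (void)N; /* pretend the primary absorbed N blocks */ });
  // MaxCount / 2 == 4 blocks were pushed; Chunks[0..1] now hold {5, 6}.
  return 0;
}
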

View File

@ -43,6 +43,7 @@ template <typename Config> class SizeClassAllocator32 {
public:
typedef typename Config::PrimaryCompactPtrT CompactPtrT;
typedef typename Config::SizeClassMap SizeClassMap;
static const uptr GroupSizeLog = Config::PrimaryGroupSizeLog;
// The bytemap can only track UINT8_MAX - 1 classes.
static_assert(SizeClassMap::LargestClassId <= (UINT8_MAX - 1), "");
// Regions should be large enough to hold the largest Block.
@ -51,6 +52,7 @@ public:
typedef SizeClassAllocator32<Config> ThisT;
typedef SizeClassAllocatorLocalCache<ThisT> CacheT;
typedef typename CacheT::TransferBatch TransferBatch;
typedef typename CacheT::BatchGroup BatchGroup;
static uptr getSizeByClassId(uptr ClassId) {
return (ClassId == SizeClassMap::BatchClassId)
@ -111,30 +113,69 @@ public:
return reinterpret_cast<void *>(static_cast<uptr>(CompactPtr));
}
uptr compactPtrGroup(CompactPtrT CompactPtr) {
return CompactPtr >> GroupSizeLog;
}
TransferBatch *popBatch(CacheT *C, uptr ClassId) {
DCHECK_LT(ClassId, NumClasses);
SizeClassInfo *Sci = getSizeClassInfo(ClassId);
ScopedLock L(Sci->Mutex);
TransferBatch *B = Sci->FreeList.front();
if (B) {
Sci->FreeList.pop_front();
} else {
B = populateFreeList(C, ClassId, Sci);
if (UNLIKELY(!B))
TransferBatch *B = popBatchImpl(C, ClassId);
if (UNLIKELY(!B)) {
if (UNLIKELY(!populateFreeList(C, ClassId, Sci)))
return nullptr;
B = popBatchImpl(C, ClassId);
// If `populateFreeList` succeeded, we should have free blocks to pop.
DCHECK_NE(B, nullptr);
}
DCHECK_GT(B->getCount(), 0);
Sci->Stats.PoppedBlocks += B->getCount();
return B;
}
void pushBatch(uptr ClassId, TransferBatch *B) {
// Push the array of free blocks to the designated batch group.
void pushBlocks(CacheT *C, uptr ClassId, CompactPtrT *Array, u32 Size) {
DCHECK_LT(ClassId, NumClasses);
DCHECK_GT(B->getCount(), 0);
DCHECK_GT(Size, 0);
SizeClassInfo *Sci = getSizeClassInfo(ClassId);
if (ClassId == SizeClassMap::BatchClassId) {
ScopedLock L(Sci->Mutex);
// Constructing a batch group in the free list uses two blocks of
// BatchClassId. When pushing BatchClassId blocks, we use the blocks in
// the array directly (we can't go through the local cache, which would
// cause a recursive allocation). However, the number of free blocks may
// be less than two, so populate the free list before inserting the
// blocks.
if (Size == 1 && !populateFreeList(C, ClassId, Sci))
return;
pushBlocksImpl(C, ClassId, Array, Size);
Sci->Stats.PushedBlocks += Size;
return;
}
// TODO(chiahungduan): Consider skipping the grouping when the group size is
// not larger than the block size by a certain factor.
// Sort the blocks so that blocks belonging to the same group can be pushed
// together.
bool SameGroup = true;
for (u32 I = 1; I < Size; ++I) {
if (compactPtrGroup(Array[I - 1]) != compactPtrGroup(Array[I]))
SameGroup = false;
CompactPtrT Cur = Array[I];
u32 J = I;
while (J > 0 && compactPtrGroup(Cur) < compactPtrGroup(Array[J - 1])) {
Array[J] = Array[J - 1];
--J;
}
Array[J] = Cur;
}
ScopedLock L(Sci->Mutex);
Sci->FreeList.push_front(B);
Sci->Stats.PushedBlocks += B->getCount();
pushBlocksImpl(C, ClassId, Array, Size, SameGroup);
Sci->Stats.PushedBlocks += Size;
if (ClassId != SizeClassMap::BatchClassId)
releaseToOSMaybe(Sci, ClassId);
}
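
The sort step above, pulled out into a standalone sketch (illustrative only; GroupSizeLog is an assumed value): a stable insertion sort keyed on the group id that also reports whether every block already belongs to a single group:

#include <cassert>
#include <cstdint>

constexpr uint64_t GroupSizeLog = 20; // Assumed for the sketch.

uint64_t groupOf(uint64_t CompactPtr) { return CompactPtr >> GroupSizeLog; }

// Sorts Array by group id; returns true iff all blocks share one group id.
bool sortByGroup(uint64_t *Array, uint32_t Size) {
  bool SameGroup = true;
  for (uint32_t I = 1; I < Size; ++I) {
    if (groupOf(Array[I - 1]) != groupOf(Array[I]))
      SameGroup = false;
    uint64_t Cur = Array[I];
    uint32_t J = I;
    while (J > 0 && groupOf(Cur) < groupOf(Array[J - 1])) {
      Array[J] = Array[J - 1];
      --J;
    }
    Array[J] = Cur;
  }
  return SameGroup;
}

int main() {
  uint64_t Blocks[] = {0x300010, 0x100040, 0x300020, 0x100080};
  assert(!sortByGroup(Blocks, 4));
  // Now grouped: {0x100040, 0x100080, 0x300010, 0x300020}.
  assert(groupOf(Blocks[0]) == 1 && groupOf(Blocks[3]) == 3);
  return 0;
}
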
@ -256,7 +297,7 @@ private:
struct alignas(SCUDO_CACHE_LINE_SIZE) SizeClassInfo {
HybridMutex Mutex;
SinglyLinkedList<TransferBatch> FreeList;
SinglyLinkedList<BatchGroup> FreeList;
uptr CurrentRegion;
uptr CurrentRegionAllocated;
SizeClassStats Stats;
@ -328,8 +369,187 @@ private:
return &SizeClassInfoArray[ClassId];
}
NOINLINE TransferBatch *populateFreeList(CacheT *C, uptr ClassId,
SizeClassInfo *Sci) {
// Push the blocks to their batch group. The layout will be like:
//
//   FreeList -> BG -> BG -> BG
//               |     |     |
//               v     v     v
//               TB    TB    TB
//               |
//               v
//               TB
//
// Each BatchGroup (BG) is associated with a unique group id, and its free
// blocks are managed by a list of TransferBatches (TB). To reduce insertion
// time, the BGs are kept sorted and the input `Array` is expected to be
// sorted as well, so that the sorted order is cheap to maintain.
// Use `SameGroup=true` to indicate that all blocks in the array are from the
// same group; then the group id check on each block is skipped.
//
// Note that this aims at better management of dirty pages, i.e., the RSS
// usage won't grow indefinitely. There is one exception where we may not put
// a block into its associated group: while populating new blocks, the blocks
// may cross different groups. However, in most cases they fall into the same
// group and are expected to be popped soon, so it's not worth sorting an
// almost-sorted array. Therefore, we use `SameGroup=true` instead.
//
// The region mutex needs to be held while calling this method.
void pushBlocksImpl(CacheT *C, uptr ClassId, CompactPtrT *Array, u32 Size,
bool SameGroup = false) {
DCHECK_GT(Size, 0U);
SizeClassInfo *Sci = getSizeClassInfo(ClassId);
auto CreateGroup = [&](uptr GroupId) {
BatchGroup *BG = nullptr;
TransferBatch *TB = nullptr;
if (ClassId == SizeClassMap::BatchClassId) {
DCHECK_GE(Size, 2U);
BG = reinterpret_cast<BatchGroup *>(
decompactPtr(ClassId, Array[Size - 1]));
BG->Batches.clear();
TB = reinterpret_cast<TransferBatch *>(
decompactPtr(ClassId, Array[Size - 2]));
TB->clear();
} else {
BG = C->createGroup();
BG->Batches.clear();
TB = C->createBatch(ClassId, nullptr);
TB->clear();
}
BG->GroupId = GroupId;
BG->Batches.push_front(TB);
BG->MaxCachedPerBatch =
TransferBatch::getMaxCached(getSizeByClassId(ClassId));
return BG;
};
auto InsertBlocks = [&](BatchGroup *BG, CompactPtrT *Array, u32 Size) {
SinglyLinkedList<TransferBatch> &Batches = BG->Batches;
TransferBatch *CurBatch = Batches.front();
DCHECK_NE(CurBatch, nullptr);
for (u32 I = 0; I < Size;) {
DCHECK_GE(BG->MaxCachedPerBatch, CurBatch->getCount());
u16 UnusedSlots =
static_cast<u16>(BG->MaxCachedPerBatch - CurBatch->getCount());
if (UnusedSlots == 0) {
CurBatch = C->createBatch(
ClassId,
reinterpret_cast<void *>(decompactPtr(ClassId, Array[I])));
CurBatch->clear();
Batches.push_front(CurBatch);
UnusedSlots = BG->MaxCachedPerBatch;
}
// `UnusedSlots` is u16, so the result also fits in u16.
u16 AppendSize = static_cast<u16>(Min<u32>(UnusedSlots, Size - I));
CurBatch->appendFromArray(&Array[I], AppendSize);
I += AppendSize;
}
};
BatchGroup *Cur = Sci->FreeList.front();
if (ClassId == SizeClassMap::BatchClassId) {
if (Cur == nullptr) {
// Don't need to classify BatchClassId.
Cur = CreateGroup(/*GroupId=*/0);
Sci->FreeList.push_front(Cur);
}
InsertBlocks(Cur, Array, Size);
return;
}
// In the following, `Cur` always points to the BatchGroup for blocks that
// will be pushed next. `Prev` is the element right before `Cur`.
BatchGroup *Prev = nullptr;
while (Cur != nullptr && compactPtrGroup(Array[0]) > Cur->GroupId) {
Prev = Cur;
Cur = Cur->Next;
}
if (Cur == nullptr || compactPtrGroup(Array[0]) != Cur->GroupId) {
Cur = CreateGroup(compactPtrGroup(Array[0]));
if (Prev == nullptr)
Sci->FreeList.push_front(Cur);
else
Sci->FreeList.insert(Prev, Cur);
}
// All the blocks are from the same group; just push them without checking
// group ids.
if (SameGroup) {
InsertBlocks(Cur, Array, Size);
return;
}
// The blocks are sorted by group id. Determine each segment of blocks that
// share a group id and push the segment to its group in one shot.
u32 Count = 1;
for (u32 I = 1; I < Size; ++I) {
if (compactPtrGroup(Array[I - 1]) != compactPtrGroup(Array[I])) {
DCHECK_EQ(compactPtrGroup(Array[I - 1]), Cur->GroupId);
InsertBlocks(Cur, Array + I - Count, Count);
while (Cur != nullptr && compactPtrGroup(Array[I]) > Cur->GroupId) {
Prev = Cur;
Cur = Cur->Next;
}
if (Cur == nullptr || compactPtrGroup(Array[I]) != Cur->GroupId) {
Cur = CreateGroup(compactPtrGroup(Array[I]));
DCHECK_NE(Prev, nullptr);
Sci->FreeList.insert(Prev, Cur);
}
Count = 1;
} else {
++Count;
}
}
InsertBlocks(Cur, Array + Size - Count, Count);
}
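
The trailing run-splitting loop above is the subtle part; here is the same logic as a standalone sketch (illustrative only) that visits each run of equal group ids in a group-sorted array:

#include <cassert>
#include <cstdint>
#include <vector>

constexpr uint64_t GroupSizeLog = 20; // Assumed for the sketch.
uint64_t groupOf(uint64_t CompactPtr) { return CompactPtr >> GroupSizeLog; }

// Calls Visit(GroupId, RunStart, RunLength) once per run of equal group ids.
template <typename Fn>
void forEachGroupRun(const uint64_t *Array, uint32_t Size, Fn Visit) {
  uint32_t Count = 1;
  for (uint32_t I = 1; I < Size; ++I) {
    if (groupOf(Array[I - 1]) != groupOf(Array[I])) {
      Visit(groupOf(Array[I - 1]), Array + I - Count, Count);
      Count = 1;
    } else {
      ++Count;
    }
  }
  // The last run is flushed after the loop, just like the trailing
  // InsertBlocks(Cur, Array + Size - Count, Count) call above.
  Visit(groupOf(Array[Size - 1]), Array + Size - Count, Count);
}

int main() {
  const uint64_t Sorted[] = {0x100040, 0x100080, 0x300010, 0x300020, 0x300030};
  std::vector<uint32_t> RunLengths;
  forEachGroupRun(Sorted, 5, [&](uint64_t, const uint64_t *, uint32_t N) {
    RunLengths.push_back(N);
  });
  assert(RunLengths.size() == 2 && RunLengths[0] == 2 && RunLengths[1] == 3);
  return 0;
}
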
// Pop one TransferBatch from a BatchGroup. The BatchGroup with the smallest
// group id will be considered first.
//
// The region mutex needs to be held while calling this method.
TransferBatch *popBatchImpl(CacheT *C, uptr ClassId) {
SizeClassInfo *Sci = getSizeClassInfo(ClassId);
if (Sci->FreeList.empty())
return nullptr;
SinglyLinkedList<TransferBatch> &Batches = Sci->FreeList.front()->Batches;
DCHECK(!Batches.empty());
TransferBatch *B = Batches.front();
Batches.pop_front();
DCHECK_NE(B, nullptr);
DCHECK_GT(B->getCount(), 0U);
if (Batches.empty()) {
BatchGroup *BG = Sci->FreeList.front();
Sci->FreeList.pop_front();
// We don't keep a BatchGroup with zero blocks, to avoid an emptiness check
// while allocating. Note that the block used to construct the BatchGroup is
// recorded as a free block in the last element of BatchGroup::Batches,
// which means that once we pop the last TransferBatch, that block is
// implicitly deallocated.
if (ClassId != SizeClassMap::BatchClassId)
C->deallocate(SizeClassMap::BatchClassId, BG);
}
return B;
}
NOINLINE bool populateFreeList(CacheT *C, uptr ClassId, SizeClassInfo *Sci) {
uptr Region;
uptr Offset;
// If the size-class currently has a region associated to it, use it. The
@ -344,7 +564,7 @@ private:
DCHECK_EQ(Sci->CurrentRegionAllocated, 0U);
Region = allocateRegion(Sci, ClassId);
if (UNLIKELY(!Region))
return nullptr;
return false;
C->getStats().add(StatMapped, RegionSize);
Sci->CurrentRegion = Region;
Offset = 0;
@ -378,20 +598,15 @@ private:
if (ClassId != SizeClassMap::BatchClassId)
shuffle(ShuffleArray, NumberOfBlocks, &Sci->RandState);
for (u32 I = 0; I < NumberOfBlocks;) {
TransferBatch *B =
C->createBatch(ClassId, reinterpret_cast<void *>(ShuffleArray[I]));
if (UNLIKELY(!B))
return nullptr;
// `MaxCount` is u16 so the result will also fit in u16.
const u16 N = static_cast<u16>(Min<u32>(MaxCount, NumberOfBlocks - I));
B->setFromArray(&ShuffleArray[I], N);
Sci->FreeList.push_back(B);
// Note that the N blocks here may have different group ids, which only
// happens when the range crosses a group boundary. Instead of sorting them,
// treat them as belonging to the same group to avoid sorting an
// almost-sorted array.
pushBlocksImpl(C, ClassId, &ShuffleArray[I], N, /*SameGroup=*/true);
I += N;
}
TransferBatch *B = Sci->FreeList.front();
Sci->FreeList.pop_front();
DCHECK(B);
DCHECK_GT(B->getCount(), 0);
const uptr AllocatedUser = Size * NumberOfBlocks;
C->getStats().add(StatFree, AllocatedUser);
@ -407,7 +622,7 @@ private:
}
Sci->AllocatedUser += AllocatedUser;
return B;
return true;
}
void getStats(ScopedString *Str, uptr ClassId, uptr Rss) {
@ -477,8 +692,12 @@ private:
auto DecompactPtr = [](CompactPtrT CompactPtr) {
return reinterpret_cast<uptr>(CompactPtr);
};
releaseFreeMemoryToOS(Sci->FreeList, RegionSize, NumberOfRegions, BlockSize,
Recorder, DecompactPtr, SkipRegion);
PageReleaseContext Context(BlockSize, RegionSize, NumberOfRegions);
for (BatchGroup &BG : Sci->FreeList)
Context.markFreeBlocks(BG.Batches, DecompactPtr, Base);
releaseFreeMemoryToOS(Context, Recorder, SkipRegion);
if (Recorder.getReleasedRangesCount() > 0) {
Sci->ReleaseInfo.PushedBlocksAtLastRelease = Sci->Stats.PushedBlocks;
Sci->ReleaseInfo.RangesReleased += Recorder.getReleasedRangesCount();

View File

@ -45,10 +45,12 @@ template <typename Config> class SizeClassAllocator64 {
public:
typedef typename Config::PrimaryCompactPtrT CompactPtrT;
static const uptr CompactPtrScale = Config::PrimaryCompactPtrScale;
static const uptr GroupSizeLog = Config::PrimaryGroupSizeLog;
typedef typename Config::SizeClassMap SizeClassMap;
typedef SizeClassAllocator64<Config> ThisT;
typedef SizeClassAllocatorLocalCache<ThisT> CacheT;
typedef typename CacheT::TransferBatch TransferBatch;
typedef typename CacheT::BatchGroup BatchGroup;
static uptr getSizeByClassId(uptr ClassId) {
return (ClassId == SizeClassMap::BatchClassId)
@ -99,25 +101,61 @@ public:
DCHECK_LT(ClassId, NumClasses);
RegionInfo *Region = getRegionInfo(ClassId);
ScopedLock L(Region->Mutex);
TransferBatch *B = Region->FreeList.front();
if (B) {
Region->FreeList.pop_front();
} else {
B = populateFreeList(C, ClassId, Region);
if (UNLIKELY(!B))
TransferBatch *B = popBatchImpl(C, ClassId);
if (UNLIKELY(!B)) {
if (UNLIKELY(!populateFreeList(C, ClassId, Region)))
return nullptr;
B = popBatchImpl(C, ClassId);
// If `populateFreeList` succeeded, we should have free blocks to pop.
DCHECK_NE(B, nullptr);
}
DCHECK_GT(B->getCount(), 0);
Region->Stats.PoppedBlocks += B->getCount();
return B;
}
void pushBatch(uptr ClassId, TransferBatch *B) {
DCHECK_GT(B->getCount(), 0);
// Push the array of free blocks to the designated batch group.
void pushBlocks(CacheT *C, uptr ClassId, CompactPtrT *Array, u32 Size) {
DCHECK_LT(ClassId, NumClasses);
DCHECK_GT(Size, 0);
RegionInfo *Region = getRegionInfo(ClassId);
if (ClassId == SizeClassMap::BatchClassId) {
ScopedLock L(Region->Mutex);
// Constructing a batch group in the free list uses two blocks of
// BatchClassId. When pushing BatchClassId blocks, we use the blocks in
// the array directly (we can't go through the local cache, which would
// cause a recursive allocation). However, the number of free blocks may
// be less than two, so populate the free list before inserting the
// blocks.
if (Size == 1 && UNLIKELY(!populateFreeList(C, ClassId, Region)))
return;
pushBlocksImpl(C, ClassId, Array, Size);
Region->Stats.PushedBlocks += Size;
return;
}
// TODO(chiahungduan): Consider skipping the grouping when the group size is
// not larger than the block size by a certain factor.
// Sort the blocks so that blocks belonging to the same group can be pushed
// together.
bool SameGroup = true;
for (u32 I = 1; I < Size; ++I) {
if (compactPtrGroup(Array[I - 1]) != compactPtrGroup(Array[I]))
SameGroup = false;
CompactPtrT Cur = Array[I];
u32 J = I;
while (J > 0 && compactPtrGroup(Cur) < compactPtrGroup(Array[J - 1])) {
Array[J] = Array[J - 1];
--J;
}
Array[J] = Cur;
}
ScopedLock L(Region->Mutex);
Region->FreeList.push_front(B);
Region->Stats.PushedBlocks += B->getCount();
pushBlocksImpl(C, ClassId, Array, Size, SameGroup);
Region->Stats.PushedBlocks += Size;
if (ClassId != SizeClassMap::BatchClassId)
releaseToOSMaybe(Region, ClassId);
}
@ -292,7 +330,7 @@ private:
struct UnpaddedRegionInfo {
HybridMutex Mutex;
SinglyLinkedList<TransferBatch> FreeList;
SinglyLinkedList<BatchGroup> FreeList;
uptr RegionBeg = 0;
RegionStats Stats = {};
u32 RandState = 0;
@ -330,8 +368,192 @@ private:
return Base + (static_cast<uptr>(CompactPtr) << CompactPtrScale);
}
NOINLINE TransferBatch *populateFreeList(CacheT *C, uptr ClassId,
RegionInfo *Region) {
static uptr compactPtrGroup(CompactPtrT CompactPtr) {
return CompactPtr >> (GroupSizeLog - CompactPtrScale);
}
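
A short standalone check (illustrative only; both constants are assumed values) of why the shift here is GroupSizeLog - CompactPtrScale: the 64-bit primary stores compact pointers as region offsets already shifted right by CompactPtrScale, so the reduced shift yields the same grouping as dividing the raw offset by the group size:

#include <cassert>
#include <cstdint>

constexpr uint64_t GroupSizeLog = 20;   // Assumed, e.g. a 1 MB group.
constexpr uint64_t CompactPtrScale = 4; // Assumed, e.g. 16-byte granularity.

uint64_t compactPtrOf(uint64_t RegionOffset) {
  return RegionOffset >> CompactPtrScale;
}
uint64_t groupOfCompactPtr(uint64_t CompactPtr) {
  return CompactPtr >> (GroupSizeLog - CompactPtrScale);
}

int main() {
  // Shifting by Scale and then by (GroupSizeLog - Scale) is the same as
  // shifting the raw offset by GroupSizeLog.
  for (uint64_t Offset : {0x0FFFF0ull, 0x100000ull, 0x2ABCD0ull})
    assert(groupOfCompactPtr(compactPtrOf(Offset)) == Offset >> GroupSizeLog);
  return 0;
}
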
// Push the blocks to their batch group. The layout will be like:
//
//   FreeList -> BG -> BG -> BG
//               |     |     |
//               v     v     v
//               TB    TB    TB
//               |
//               v
//               TB
//
// Each BatchGroup (BG) is associated with a unique group id, and its free
// blocks are managed by a list of TransferBatches (TB). To reduce insertion
// time, the BGs are kept sorted and the input `Array` is expected to be
// sorted as well, so that the sorted order is cheap to maintain.
// Use `SameGroup=true` to indicate that all blocks in the array are from the
// same group; then the group id check on each block is skipped.
//
// Note that this aims at better management of dirty pages, i.e., the RSS
// usage won't grow indefinitely. There is one exception where we may not put
// a block into its associated group: while populating new blocks, the blocks
// may cross different groups. However, in most cases they fall into the same
// group and are expected to be popped soon, so it's not worth sorting an
// almost-sorted array. Therefore, we use `SameGroup=true` instead.
//
// The region mutex needs to be held while calling this method.
void pushBlocksImpl(CacheT *C, uptr ClassId, CompactPtrT *Array, u32 Size,
bool SameGroup = false) {
DCHECK_GT(Size, 0U);
RegionInfo *Region = getRegionInfo(ClassId);
auto CreateGroup = [&](uptr GroupId) {
BatchGroup *BG = nullptr;
TransferBatch *TB = nullptr;
if (ClassId == SizeClassMap::BatchClassId) {
DCHECK_GE(Size, 2U);
BG = reinterpret_cast<BatchGroup *>(
decompactPtr(ClassId, Array[Size - 1]));
BG->Batches.clear();
TB = reinterpret_cast<TransferBatch *>(
decompactPtr(ClassId, Array[Size - 2]));
TB->clear();
} else {
BG = C->createGroup();
BG->Batches.clear();
TB = C->createBatch(ClassId, nullptr);
TB->clear();
}
BG->GroupId = GroupId;
BG->Batches.push_front(TB);
BG->MaxCachedPerBatch =
TransferBatch::getMaxCached(getSizeByClassId(ClassId));
return BG;
};
auto InsertBlocks = [&](BatchGroup *BG, CompactPtrT *Array, u32 Size) {
SinglyLinkedList<TransferBatch> &Batches = BG->Batches;
TransferBatch *CurBatch = Batches.front();
DCHECK_NE(CurBatch, nullptr);
for (u32 I = 0; I < Size;) {
DCHECK_GE(BG->MaxCachedPerBatch, CurBatch->getCount());
u16 UnusedSlots =
static_cast<u16>(BG->MaxCachedPerBatch - CurBatch->getCount());
if (UnusedSlots == 0) {
CurBatch = C->createBatch(
ClassId,
reinterpret_cast<void *>(decompactPtr(ClassId, Array[I])));
CurBatch->clear();
Batches.push_front(CurBatch);
UnusedSlots = BG->MaxCachedPerBatch;
}
// `UnusedSlots` is u16, so the result also fits in u16.
u16 AppendSize = static_cast<u16>(Min<u32>(UnusedSlots, Size - I));
CurBatch->appendFromArray(&Array[I], AppendSize);
I += AppendSize;
}
};
BatchGroup *Cur = Region->FreeList.front();
if (ClassId == SizeClassMap::BatchClassId) {
if (Cur == nullptr) {
// Don't need to classify BatchClassId.
Cur = CreateGroup(/*GroupId=*/0);
Region->FreeList.push_front(Cur);
}
InsertBlocks(Cur, Array, Size);
return;
}
// In the following, `Cur` always points to the BatchGroup for blocks that
// will be pushed next. `Prev` is the element right before `Cur`.
BatchGroup *Prev = nullptr;
while (Cur != nullptr && compactPtrGroup(Array[0]) > Cur->GroupId) {
Prev = Cur;
Cur = Cur->Next;
}
if (Cur == nullptr || compactPtrGroup(Array[0]) != Cur->GroupId) {
Cur = CreateGroup(compactPtrGroup(Array[0]));
if (Prev == nullptr)
Region->FreeList.push_front(Cur);
else
Region->FreeList.insert(Prev, Cur);
}
// All the blocks are from the same group; just push them without checking
// group ids.
if (SameGroup) {
InsertBlocks(Cur, Array, Size);
return;
}
// The blocks are sorted by group id. Determine each segment of blocks that
// share a group id and push the segment to its group in one shot.
u32 Count = 1;
for (u32 I = 1; I < Size; ++I) {
if (compactPtrGroup(Array[I - 1]) != compactPtrGroup(Array[I])) {
DCHECK_EQ(compactPtrGroup(Array[I - 1]), Cur->GroupId);
InsertBlocks(Cur, Array + I - Count, Count);
while (Cur != nullptr && compactPtrGroup(Array[I]) > Cur->GroupId) {
Prev = Cur;
Cur = Cur->Next;
}
if (Cur == nullptr || compactPtrGroup(Array[I]) != Cur->GroupId) {
Cur = CreateGroup(compactPtrGroup(Array[I]));
DCHECK_NE(Prev, nullptr);
Region->FreeList.insert(Prev, Cur);
}
Count = 1;
} else {
++Count;
}
}
InsertBlocks(Cur, Array + Size - Count, Count);
}
// Pop one TransferBatch from a BatchGroup. The BatchGroup with the smallest
// group id will be considered first.
//
// The region mutex needs to be held while calling this method.
TransferBatch *popBatchImpl(CacheT *C, uptr ClassId) {
RegionInfo *Region = getRegionInfo(ClassId);
if (Region->FreeList.empty())
return nullptr;
SinglyLinkedList<TransferBatch> &Batches =
Region->FreeList.front()->Batches;
DCHECK(!Batches.empty());
TransferBatch *B = Batches.front();
Batches.pop_front();
DCHECK_NE(B, nullptr);
DCHECK_GT(B->getCount(), 0U);
if (Batches.empty()) {
BatchGroup *BG = Region->FreeList.front();
Region->FreeList.pop_front();
// We don't keep a BatchGroup with zero blocks, to avoid an emptiness check
// while allocating. Note that the block used to construct the BatchGroup is
// recorded as a free block in the last element of BatchGroup::Batches,
// which means that once we pop the last TransferBatch, that block is
// implicitly deallocated.
if (ClassId != SizeClassMap::BatchClassId)
C->deallocate(SizeClassMap::BatchClassId, BG);
}
return B;
}
NOINLINE bool populateFreeList(CacheT *C, uptr ClassId, RegionInfo *Region) {
const uptr Size = getSizeByClassId(ClassId);
const u16 MaxCount = TransferBatch::getMaxCached(Size);
@ -354,7 +576,7 @@ private:
RegionSize >> 20, Size);
Str.output();
}
return nullptr;
return false;
}
if (MappedUser == 0)
Region->Data = Data;
@ -363,8 +585,9 @@ private:
"scudo:primary",
MAP_ALLOWNOMEM | MAP_RESIZABLE |
(useMemoryTagging<Config>(Options.load()) ? MAP_MEMTAG : 0),
&Region->Data)))
return nullptr;
&Region->Data))) {
return false;
}
Region->MappedUser += MapSize;
C->getStats().add(StatMapped, MapSize);
}
@ -387,27 +610,21 @@ private:
if (ClassId != SizeClassMap::BatchClassId)
shuffle(ShuffleArray, NumberOfBlocks, &Region->RandState);
for (u32 I = 0; I < NumberOfBlocks;) {
TransferBatch *B =
C->createBatch(ClassId, reinterpret_cast<void *>(decompactPtrInternal(
CompactPtrBase, ShuffleArray[I])));
if (UNLIKELY(!B))
return nullptr;
// `MaxCount` is u16 so the result will also fit in u16.
const u16 N = static_cast<u16>(Min<u32>(MaxCount, NumberOfBlocks - I));
B->setFromArray(&ShuffleArray[I], N);
Region->FreeList.push_back(B);
// Note that the N blocks here may have different group ids, which only
// happens when the range crosses a group boundary. Instead of sorting them,
// treat them as belonging to the same group to avoid sorting an
// almost-sorted array.
pushBlocksImpl(C, ClassId, &ShuffleArray[I], N, /*SameGroup=*/true);
I += N;
}
TransferBatch *B = Region->FreeList.front();
Region->FreeList.pop_front();
DCHECK(B);
DCHECK_GT(B->getCount(), 0);
const uptr AllocatedUser = Size * NumberOfBlocks;
C->getStats().add(StatFree, AllocatedUser);
Region->AllocatedUser += AllocatedUser;
return B;
return true;
}
void getStats(ScopedString *Str, uptr ClassId, uptr Rss) {
@ -473,8 +690,11 @@ private:
return decompactPtrInternal(CompactPtrBase, CompactPtr);
};
auto SkipRegion = [](UNUSED uptr RegionIndex) { return false; };
releaseFreeMemoryToOS(Region->FreeList, Region->AllocatedUser, 1U,
BlockSize, Recorder, DecompactPtr, SkipRegion);
PageReleaseContext Context(BlockSize, RegionSize, /*NumberOfRegions=*/1U);
for (BatchGroup &BG : Region->FreeList)
Context.markFreeBlocks(BG.Batches, DecompactPtr, Region->RegionBeg);
releaseFreeMemoryToOS(Context, Recorder, SkipRegion);
if (Recorder.getReleasedRangesCount() > 0) {
Region->ReleaseInfo.PushedBlocksAtLastRelease =

View File

@ -525,6 +525,7 @@ struct DeathConfig {
static const scudo::uptr PrimaryCompactPtrScale = 0;
static const bool PrimaryEnableRandomOffset = true;
static const scudo::uptr PrimaryMapSizeIncrement = 1UL << 18;
static const scudo::uptr PrimaryGroupSizeLog = 18;
typedef scudo::MapAllocatorNoCache SecondaryCache;
template <class A> using TSDRegistryT = scudo::TSDRegistrySharedT<A, 1U, 1U>;

View File

@ -161,6 +161,10 @@ TEST(ScudoListTest, SinglyLinkedList) {
setList(&L1, X);
checkList(&L1, X);
setList(&L1, X, Y);
L1.insert(X, Z);
checkList(&L1, X, Z, Y);
setList(&L1, X, Y, Z);
setList(&L2, A, B, C);
L1.append_back(&L2);

View File

@ -12,8 +12,11 @@
#include "primary64.h"
#include "size_class_map.h"
#include <algorithm>
#include <chrono>
#include <condition_variable>
#include <mutex>
#include <random>
#include <stdlib.h>
#include <thread>
#include <vector>
@ -24,6 +27,7 @@
struct TestConfig1 {
static const scudo::uptr PrimaryRegionSizeLog = 18U;
static const scudo::uptr PrimaryGroupSizeLog = 18U;
static const scudo::s32 PrimaryMinReleaseToOsIntervalMs = INT32_MIN;
static const scudo::s32 PrimaryMaxReleaseToOsIntervalMs = INT32_MAX;
static const bool MaySupportMemoryTagging = false;
@ -40,6 +44,7 @@ struct TestConfig2 {
#else
static const scudo::uptr PrimaryRegionSizeLog = 24U;
#endif
static const scudo::uptr PrimaryGroupSizeLog = 20U;
static const scudo::s32 PrimaryMinReleaseToOsIntervalMs = INT32_MIN;
static const scudo::s32 PrimaryMaxReleaseToOsIntervalMs = INT32_MAX;
static const bool MaySupportMemoryTagging = false;
@ -56,6 +61,7 @@ struct TestConfig3 {
#else
static const scudo::uptr PrimaryRegionSizeLog = 24U;
#endif
static const scudo::uptr PrimaryGroupSizeLog = 20U;
static const scudo::s32 PrimaryMinReleaseToOsIntervalMs = INT32_MIN;
static const scudo::s32 PrimaryMaxReleaseToOsIntervalMs = INT32_MAX;
static const bool MaySupportMemoryTagging = true;
@ -65,6 +71,23 @@ struct TestConfig3 {
static const scudo::uptr PrimaryMapSizeIncrement = 1UL << 18;
};
struct TestConfig4 {
#if defined(__mips__)
// Unable to allocate greater size on QEMU-user.
static const scudo::uptr PrimaryRegionSizeLog = 23U;
#else
static const scudo::uptr PrimaryRegionSizeLog = 24U;
#endif
static const scudo::s32 PrimaryMinReleaseToOsIntervalMs = INT32_MIN;
static const scudo::s32 PrimaryMaxReleaseToOsIntervalMs = INT32_MAX;
static const bool MaySupportMemoryTagging = true;
static const scudo::uptr PrimaryCompactPtrScale = 3U;
static const scudo::uptr PrimaryGroupSizeLog = 20U;
typedef scudo::u32 PrimaryCompactPtrT;
static const bool PrimaryEnableRandomOffset = true;
static const scudo::uptr PrimaryMapSizeIncrement = 1UL << 18;
};
template <typename BaseConfig, typename SizeClassMapT>
struct Config : public BaseConfig {
using SizeClassMap = SizeClassMapT;
@ -100,7 +123,8 @@ template <class BaseConfig> struct ScudoPrimaryTest : public Test {};
#define SCUDO_TYPED_TEST_ALL_TYPES(FIXTURE, NAME) \
SCUDO_TYPED_TEST_TYPE(FIXTURE, NAME, TestConfig1) \
SCUDO_TYPED_TEST_TYPE(FIXTURE, NAME, TestConfig2) \
SCUDO_TYPED_TEST_TYPE(FIXTURE, NAME, TestConfig3)
SCUDO_TYPED_TEST_TYPE(FIXTURE, NAME, TestConfig3) \
SCUDO_TYPED_TEST_TYPE(FIXTURE, NAME, TestConfig4)
#endif
#define SCUDO_TYPED_TEST_TYPE(FIXTURE, NAME, TYPE) \
@ -153,6 +177,7 @@ struct SmallRegionsConfig {
static const scudo::uptr PrimaryCompactPtrScale = 0;
static const bool PrimaryEnableRandomOffset = true;
static const scudo::uptr PrimaryMapSizeIncrement = 1UL << 18;
static const scudo::uptr PrimaryGroupSizeLog = 20U;
};
// The 64-bit SizeClassAllocator can be easily OOM'd with small region sizes.
@ -170,6 +195,8 @@ TEST(ScudoPrimaryTest, Primary64OOM) {
std::vector<TransferBatch *> Batches;
const scudo::uptr ClassId = Primary::SizeClassMap::LargestClassId;
const scudo::uptr Size = Primary::getSizeByClassId(ClassId);
typename Primary::CacheT::CompactPtrT Blocks[TransferBatch::MaxNumCached];
for (scudo::uptr I = 0; I < 10000U; I++) {
TransferBatch *B = Allocator.popBatch(&Cache, ClassId);
if (!B) {
@ -181,8 +208,11 @@ TEST(ScudoPrimaryTest, Primary64OOM) {
Batches.push_back(B);
}
while (!Batches.empty()) {
Allocator.pushBatch(ClassId, Batches.back());
TransferBatch *B = Batches.back();
Batches.pop_back();
B->copyToArray(Blocks);
Allocator.pushBlocks(&Cache, ClassId, Blocks, B->getCount());
Cache.deallocate(Primary::SizeClassMap::BatchClassId, B);
}
Cache.destroy(nullptr);
Allocator.releaseToOS();
@ -294,3 +324,47 @@ SCUDO_TYPED_TEST(ScudoPrimaryTest, ReleaseToOS) {
Cache.destroy(nullptr);
EXPECT_GT(Allocator->releaseToOS(), 0U);
}
SCUDO_TYPED_TEST(ScudoPrimaryTest, MemoryGroup) {
using Primary = TestAllocator<TypeParam, scudo::DefaultSizeClassMap>;
std::unique_ptr<Primary> Allocator(new Primary);
Allocator->init(/*ReleaseToOsInterval=*/-1);
typename Primary::CacheT Cache;
Cache.init(nullptr, Allocator.get());
const scudo::uptr Size = 32U;
const scudo::uptr ClassId = Primary::SizeClassMap::getClassIdBySize(Size);
// Allocate 4 times the group size worth of memory and release all of it. We
// expect the free blocks to be classified into groups. Then allocate one
// group size worth of memory and expect the maximum address difference among
// the returned blocks to be at most 2 times the group size. Note that the
// blocks don't have to fit within a single group, because the group id is
// derived by shifting the compact pointer and, depending on the
// configuration, the compact pointer may not be aligned to the group size.
// As a result, the blocks can span at most two groups.
const scudo::uptr GroupSizeMem = (1ULL << Primary::GroupSizeLog);
const scudo::uptr PeakAllocationMem = 4 * GroupSizeMem;
const scudo::uptr PeakNumberOfAllocations = PeakAllocationMem / Size;
const scudo::uptr FinalNumberOfAllocations = GroupSizeMem / Size;
std::vector<scudo::uptr> Blocks;
std::mt19937 R;
for (scudo::uptr I = 0; I < PeakNumberOfAllocations; ++I)
Blocks.push_back(reinterpret_cast<scudo::uptr>(Cache.allocate(ClassId)));
std::shuffle(Blocks.begin(), Blocks.end(), R);
// Release all the allocated blocks, including those held by local cache.
while (!Blocks.empty()) {
Cache.deallocate(ClassId, reinterpret_cast<void *>(Blocks.back()));
Blocks.pop_back();
}
Cache.drain();
for (scudo::uptr I = 0; I < FinalNumberOfAllocations; ++I)
Blocks.push_back(reinterpret_cast<scudo::uptr>(Cache.allocate(ClassId)));
EXPECT_LE(*std::max_element(Blocks.begin(), Blocks.end()) -
*std::min_element(Blocks.begin(), Blocks.end()),
GroupSizeMem * 2);
}
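
For intuition, the arithmetic this test relies on, written out as a standalone sketch (assuming GroupSizeLog = 20 as in TestConfig2 above): re-allocating one group size worth of blocks after freeing four should draw from the lowest group ids first, so the address spread stays within two group sizes even if it straddles one group boundary:

#include <cassert>
#include <cstdint>

int main() {
  const uint64_t GroupSizeLog = 20; // Assumed, as in TestConfig2.
  const uint64_t Size = 32;         // Block size requested by the test.
  const uint64_t GroupSizeMem = 1ULL << GroupSizeLog;
  const uint64_t PeakNumberOfAllocations = 4 * GroupSizeMem / Size;  // 131072
  const uint64_t FinalNumberOfAllocations = GroupSizeMem / Size;     // 32768
  assert(PeakNumberOfAllocations == 131072 && FinalNumberOfAllocations == 32768);
  // One group size worth of blocks can touch at most two adjacent groups when
  // the grouping window isn't aligned with the first block, hence the
  // `GroupSizeMem * 2` bound on the max-min address difference.
  assert(FinalNumberOfAllocations * Size <= 2 * GroupSizeMem);
  return 0;
}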
}