perf: increase min buckets on very small types

morrisonlevi · morrisonlevi · commit f63722094b92 · 2024-05-07T22:16:35.000-06:00
Consider `HashSet<u8>` on x86_64 with SSE with various bucket sizes and how many bytes the allocation ends up being: | buckets | capacity | allocated bytes | | ------- | -------- | --------------- | | 4 | 3 | 36 | | 8 | 7 | 40 | | 16 | 14 | 48 | | 32 | 28 | 80 | In general, doubling the number of buckets should roughly double the number of bytes used. However, for small bucket sizes for these small TableLayouts (4 -> 8, 8 -> 16), it doesn't happen. This is an edge case which happens because of padding of the control bytes and adding the Group::WIDTH. Taking the buckets from 4 to 16 (4x) only takes the allocated bytes from 36 to 48 (~1.3x). This platform isn't the only one with edges. Here's aarch64 on an M1 for the same `HashSet<u8>`: | buckets | capacity | allocated bytes | | ------- | -------- | --------------- | | 4 | 3 | 20 | | 8 | 7 | 24 | | 16 | 14 | 40 | Notice 4 -> 8 buckets leading to only 4 more bytes (20 -> 24) instead of roughly doubling. Generalized, `buckets * table_layout.size` needs to be at least as big as `table_layout.ctrl_align`. For the cases I listed above, we'd get these new minimum bucket sizes: - x86_64 with SSE: 16 - aarch64: 8 This is a niche optimization. However, it also removes possible undefined behavior edge case in resize operations. In addition, it may be a useful property to utilize over-sized allocations (see #523).
diff --git a/src/raw/mod.rs b/src/raw/mod.rs
@@ -192,14 +192,40 @@ impl ProbeSeq {
 // Workaround for emscripten bug emscripten-core/emscripten-fastcomp#258
 #[cfg_attr(target_os = "emscripten", inline(never))]
 #[cfg_attr(not(target_os = "emscripten"), inline)]
-fn capacity_to_buckets(cap: usize) -> Option<usize> {
+fn capacity_to_buckets(cap: usize, table_layout: TableLayout) -> Option<usize> {
     debug_assert_ne!(cap, 0);
 
+    // todo: what happens with ZSTs?
+    // Consider a small layout like TableLayout { size: 1, ctrl_align: 16 } on
+    // a platform with Group::WIDTH of 16 (like x86_64 with SSE2). For small
+    // bucket sizes, this ends up wasting quite a few bytes just to pad to the
+    // relatively larger ctrl_align:
+    //
+    // | capacity | buckets | bytes allocated | bytes per item |
+    // | -------- | ------- | --------------- | -------------- |
+    // |        3 |       4 |              36 | (Yikes!)  12.0 |
+    // |        7 |       8 |              40 | (Poor)     5.7 |
+    // |       14 |      16 |              48 |            3.4 |
+    // |       28 |      32 |              80 |            3.3 |
+    //
+    // The ratio of ctrl_align / size is used to set a minimum buckets so
+    // that padding to the alignment isn't dominating the total allocation.
+    let cap = {
+        let q = table_layout.ctrl_align / table_layout.size.max(1);
+        match q.checked_mul(7) {
+            None => cap,
+            Some(x) => {
+                let adjusted = x / 8;
+                adjusted.max(cap)
+            }
+        }
+    };
+
     // For small tables we require at least 1 empty bucket so that lookups are
     // guaranteed to terminate if an element doesn't exist in the table.
     if cap < 8 {
         // We don't bother with a table size of 2 buckets since that can only
-        // hold a single element. Instead we skip directly to a 4 bucket table
+        // hold a single element. Instead, skip directly to a 4 bucket table
         // which can hold 3 elements.
         return Some(if cap < 4 { 4 } else { 8 });
     }
@@ -1126,7 +1152,7 @@ impl<T, A: Allocator> RawTable<T, A> {
         // elements. If the calculation overflows then the requested bucket
         // count must be larger than what we have right and nothing needs to be
         // done.
-        let min_buckets = match capacity_to_buckets(min_size) {
+        let min_buckets = match capacity_to_buckets(min_size, Self::TABLE_LAYOUT) {
             Some(buckets) => buckets,
             None => return,
         };
@@ -1257,14 +1283,8 @@ impl<T, A: Allocator> RawTable<T, A> {
     /// * If `self.table.items != 0`, calling of this function with `capacity`
     ///   equal to 0 (`capacity == 0`) results in [`undefined behavior`].
     ///
-    /// * If `capacity_to_buckets(capacity) < Group::WIDTH` and
-    ///   `self.table.items > capacity_to_buckets(capacity)`
-    ///   calling this function results in [`undefined behavior`].
-    ///
-    /// * If `capacity_to_buckets(capacity) >= Group::WIDTH` and
-    ///   `self.table.items > capacity_to_buckets(capacity)`
-    ///   calling this function are never return (will go into an
-    ///   infinite loop).
+    /// * If `self.table.items > capacity_to_buckets(capacity, Self::TABLE_LAYOUT)`
+    ///   calling this function are never return (will loop infinitely).
     ///
     /// See [`RawTableInner::find_insert_slot`] for more information.
     ///
@@ -1782,8 +1802,8 @@ impl RawTableInner {
             // SAFETY: We checked that we could successfully allocate the new table, and then
             // initialized all control bytes with the constant `EMPTY` byte.
             unsafe {
-                let buckets =
-                    capacity_to_buckets(capacity).ok_or_else(|| fallibility.capacity_overflow())?;
+                let buckets = capacity_to_buckets(capacity, table_layout)
+                    .ok_or_else(|| fallibility.capacity_overflow())?;
 
                 let result = Self::new_uninitialized(alloc, table_layout, buckets, fallibility)?;
                 // SAFETY: We checked that the table is allocated and therefore the table already has
@@ -4566,6 +4586,26 @@ impl<T, A: Allocator> RawExtractIf<'_, T, A> {
 mod test_map {
     use super::*;
 
+    #[test]
+    fn test_minimum_capacity_for_small_types() {
+        #[track_caller]
+        fn test_t<T>() {
+            let raw_table: RawTable<T> = RawTable::with_capacity(1);
+            let actual_buckets = raw_table.buckets();
+            let min_buckets = Group::WIDTH / core::mem::size_of::<T>();
+            assert!(
+                actual_buckets >= min_buckets,
+                "expected at least {min_buckets} buckets, got {actual_buckets} buckets"
+            );
+        }
+
+        test_t::<u8>();
+
+        // This is only "small" for some platforms, like x86_64 with SSE2, but
+        // there's no harm in running it on other platforms.
+        test_t::<u16>();
+    }
+
     fn rehash_in_place<T>(table: &mut RawTable<T>, hasher: impl Fn(&T) -> u64) {
         unsafe {
             table.table.rehash_in_place(