diff --git a/Makefile b/Makefile index 4582377..d9b4d8e 100644 --- a/Makefile +++ b/Makefile @@ -42,8 +42,10 @@ endif CFLAGS = $(CFLAGS_LANG) -g -Werror=return-type -fsanitize=undefined,alignment -fno-sanitize-recover=all $(CFLAGS_PLATFORM) CFLAGS_TEST = -DSP_IMPLEMENTATION -DSP_TEST_IMPLEMENTATION -I. -Itest/tools -Itest +CFLAGS_BENCH = $(CFLAGS_LANG) -g -Werror=return-type -O2 -DSP_IMPLEMENTATION -DUBENCH_ENABLE_PERF_COUNTERS -I. -Itest/bench -Itest/tools TESTS = amalg app array asset etc cv env format fmon fs glob ht io math process ps rb str thread time mem prompt leak +BENCHES = glob heap EXAMPLES = app array format hash_table io zero_copy ls palette prompt prompt_fancy signal wc TRIPLES = \ x86_64-linux-none x86_64-linux-gnu x86_64-linux-musl \ @@ -54,16 +56,19 @@ TRIPLES = \ TEST_DIR = $(BUILD_DIR)/test EXAMPLE_DIR = $(BUILD_DIR)/example +BENCH_DIR = $(BUILD_DIR)/bench TEST_BINARIES = $(addsuffix $(EXE),$(addprefix $(TEST_DIR)/,$(TESTS))) EXAMPLE_BINARIES = $(addsuffix $(EXE),$(addprefix $(EXAMPLE_DIR)/,$(EXAMPLES))) +BENCH_BINARIES = $(addsuffix $(EXE),$(addprefix $(BENCH_DIR)/,$(BENCHES))) SP_HEADERS = sp.h $(wildcard sp/*.h) TEST_SOURCES = $(wildcard test/*/*.c) $(wildcard test/*/*.h) $(wildcard test/*/*/*.c) $(wildcard test/*/*/*.h) -.PHONY: all clean tests examples smoke big c cpp gcc tcc $(TRIPLES) +.PHONY: all clean tests examples bench smoke big c cpp gcc tcc $(TRIPLES) all: examples tests tests: $(TEST_BINARIES) examples: $(EXAMPLE_BINARIES) +bench: $(BENCH_BINARIES) $(EXAMPLE_DIR)/%$(EXE): example/%.c $(SP_HEADERS) | $(EXAMPLE_DIR) $(CC) $(CFLAGS) -I. 
-o $@ $< @@ -71,6 +76,9 @@ $(EXAMPLE_DIR)/%$(EXE): example/%.c $(SP_HEADERS) | $(EXAMPLE_DIR) $(TEST_DIR)/%$(EXE): test/%.c $(SP_HEADERS) $(TEST_SOURCES) | $(TEST_DIR) $(CC) $(CFLAGS) $(CFLAGS_TEST) -o $@ $< +$(BENCH_DIR)/%$(EXE): test/bench/%.c $(SP_HEADERS) test/bench/ubench.h test/tools/table.h | $(BENCH_DIR) + $(CC) $(CFLAGS_BENCH) -o $@ $< + $(TRIPLES): +$(MAKE) TRIPLE=$@ examples tests @@ -83,7 +91,7 @@ wasm: +$(MAKE) wasm32-wasi wasm32-freestanding +$(MAKE) MODE=cpp wasm32-wasi wasm32-freestanding -$(BUILD_DIR) $(EXAMPLE_DIR) $(TEST_DIR): +$(BUILD_DIR) $(EXAMPLE_DIR) $(TEST_DIR) $(BENCH_DIR): mkdir -p $@ clean: diff --git a/sp.h b/sp.h index b5bb726..f20f908 100644 --- a/sp.h +++ b/sp.h @@ -67,7 +67,6 @@ sp_utf8 encode, decode, validation, iteration SP_RT_NUM_SPIN_LOCK - SP_MEM_ARENA_BLOCK_SIZE SP_PS_MAX_ARGS SP_PS_MAX_ENV @@ -498,9 +497,13 @@ #define SP_MEM_ALIGNMENT 16 #define SP_ALIGNED SP_ALIGN(SP_MEM_ALIGNMENT) -#define sp_align_up(ptr, align) ((void*)(((uintptr_t)(ptr) + ((uintptr_t)(align) - 1)) & ~((uintptr_t)(align) - 1))) #define sp_align_offset(val, align) ((((val) + ((u64)(align) - 1)) & ~((u64)(align) - 1))) +#define sp_uptr(ptr) ((uintptr_t)(ptr)) +#define sp_align_mask(align) (sp_uptr(align) - 1) +#define sp_align_down(ptr, align) (sp_uptr(ptr) & ~sp_align_mask(align)) +#define sp_align_up(ptr, align) sp_align_down(sp_uptr(ptr) + sp_align_mask(align), align) + #define sp_try(expr) \ do { \ sp_err_t _sp_result = (expr); \ @@ -1427,17 +1430,6 @@ typedef s32 (*sp_entry_fn_t)(s32, const c8**); #endif -// ██████████ ███████████ ███████████ ███████ ███████████ -// ░░███░░░░░█░░███░░░░░███ ░░███░░░░░███ ███░░░░░███ ░░███░░░░░███ -// ░███ █ ░ ░███ ░███ ░███ ░███ ███ ░░███ ░███ ░███ -// ░██████ ░██████████ ░██████████ ░███ ░███ ░██████████ -// ░███░░█ ░███░░░░░███ ░███░░░░░███ ░███ ░███ ░███░░░░░███ -// ░███ ░ █ ░███ ░███ ░███ ░███ ░░███ ███ ░███ ░███ -// ██████████ █████ █████ █████ █████ ░░░███████░ █████ █████ -// ░░░░░░░░░░ ░░░░░ ░░░░░ ░░░░░ ░░░░░ 
░░░░░░░ ░░░░░ ░░░░░ -// @error - - // ██████ ██████ ██████████ ██████ ██████ ███████ ███████████ █████ █████ // ░░██████ ██████ ░░███░░░░░█░░██████ ██████ ███░░░░░███ ░░███░░░░░███ ░░███ ░░███ // ░███░█████░███ ░███ █ ░ ░███░█████░███ ███ ░░███ ░███ ░███ ░░███ ███ @@ -1447,10 +1439,10 @@ typedef s32 (*sp_entry_fn_t)(s32, const c8**); // █████ █████ ██████████ █████ █████ ░░░███████░ █████ █████ █████ // ░░░░░ ░░░░░ ░░░░░░░░░░ ░░░░░ ░░░░░ ░░░░░░░ ░░░░░ ░░░░░ ░░░░░ // @memory -#ifndef SP_MEM_ARENA_BLOCK_SIZE - #define SP_MEM_ARENA_BLOCK_SIZE 4096 -#endif +/////////////// +// ALLOCATOR // +/////////////// typedef enum { SP_ALLOCATOR_MODE_ALLOC, SP_ALLOCATOR_MODE_FREE, @@ -1468,6 +1460,66 @@ typedef struct sp_allocator_t { void* user_data; } sp_mem_t; +SP_API void* sp_mem_allocator_alloc(sp_mem_t arena, u64 size); +SP_API void* sp_mem_allocator_realloc(sp_mem_t arena, void* ptr, u64 size); +SP_API void sp_mem_allocator_free(sp_mem_t arena, void* buffer); +SP_API void* sp_alloc(sp_mem_t mem, u64 size); +SP_API void* sp_realloc(sp_mem_t mem, void* memory, u64 size); +SP_API void sp_free(sp_mem_t mem, void* memory); +SP_API sp_mem_t sp_mem_get_scratch(); + +////////// +// CORE // +////////// +SP_API void sp_mem_copy(void* dest, const void* source, u64 num_bytes); +SP_API void sp_mem_move(void* dest, const void* source, u64 num_bytes); +SP_API bool sp_mem_is_equal(const void* a, const void* b, u64 len); +SP_API void sp_mem_fill(void* buffer, u64 bsize, void* fill, u64 fsize); +SP_API void sp_mem_fill_u8(void* buffer, u64 buffer_size, u8 fill); +SP_API void sp_mem_zero(void* buffer, u64 buffer_size); +#define sp_sys_alloc_n(T, n) (T*)sp_sys_alloc((n) * sizeof(T)) +#define sp_sys_alloc_type(T) sp_sys_alloc_n(T, 1) +#define sp_mem_allocator_alloc_n(a, T, n) (T*)sp_mem_allocator_alloc(a, (n) * sizeof(T)) +#define sp_mem_allocator_alloc_type(a, T) sp_mem_allocator_alloc_n(a, T, 1) +#define sp_mem_arena_alloc_n(a, T, n) (T*)sp_mem_arena_alloc((a), (n) * sizeof(T)) +#define 
sp_mem_arena_alloc_type(a, T) sp_mem_arena_alloc_n(a, T, 1) +#define sp_alloc_n(a, T, n) (T*)sp_alloc(a, (n) * sizeof(T)) +#define sp_alloc_type(a, T) sp_alloc_n(a, T, 1) + +///////////////////// +// FIXED ALLOCATOR // +///////////////////// +typedef struct { + u8* buffer; + u64 capacity; + u64 bytes_used; + u8 alignment; +} sp_mem_fixed_t; + +SP_API sp_mem_fixed_t sp_mem_fixed(void* buffer, u64 capacity); +SP_API sp_mem_fixed_t sp_mem_fixed_ex(void* buffer, u64 capacity, u8 alignment); +SP_API sp_mem_t sp_mem_fixed_as_allocator(sp_mem_fixed_t* fixed); +SP_API void sp_mem_fixed_clear(sp_mem_fixed_t* fixed); +SP_API u64 sp_mem_fixed_bytes_used(sp_mem_fixed_t* fixed); +SP_API void* sp_mem_fixed_on_alloc(void* ud, sp_mem_alloc_mode_t mode, u64 size, void* old); + +//////////////////// +// PAGE ALLOCATOR // +//////////////////// +typedef struct SP_ALIGNED { + u64 size; +} sp_mem_os_header_t; + +SP_API void* sp_mem_os_alloc(u64 size); +SP_API void* sp_mem_os_realloc(void* ptr, u64 size); +SP_API void sp_mem_os_free(void* ptr); +SP_API void* sp_mem_os_on_alloc(void* ud, sp_mem_alloc_mode_t mode, u64 size, void* ptr); +SP_API sp_mem_os_header_t* sp_mem_os_get_header(void* ptr); +SP_API sp_mem_t sp_mem_os_new(); + +///////////////////// +// ARENA ALLOCATOR // +///////////////////// typedef enum { SP_MEM_ARENA_MODE_DEFAULT, SP_MEM_ARENA_MODE_NO_REALLOC, @@ -1489,10 +1541,6 @@ typedef struct { u8 alignment; } sp_mem_arena_t; -typedef struct SP_ALIGNED { - u64 size; -} sp_mem_os_header_t; - typedef struct { sp_mem_arena_t* arena; sp_mem_arena_block_t* block; @@ -1500,35 +1548,9 @@ typedef struct { sp_mem_t mem; } sp_mem_arena_marker_t; -typedef struct { - u8* buffer; - u64 capacity; - u64 bytes_used; - u8 alignment; -} sp_mem_fixed_t; - -SP_API void sp_mem_copy(void* dest, const void* source, u64 num_bytes); -SP_API void sp_mem_move(void* dest, const void* source, u64 num_bytes); -SP_API bool sp_mem_is_equal(const void* a, const void* b, u64 len); -SP_API void 
sp_mem_fill(void* buffer, u64 bsize, void* fill, u64 fsize); -SP_API void sp_mem_fill_u8(void* buffer, u64 buffer_size, u8 fill); -SP_API void sp_mem_zero(void* buffer, u64 buffer_size); -SP_API void* sp_mem_allocator_alloc(sp_mem_t arena, u64 size); -SP_API void* sp_mem_allocator_realloc(sp_mem_t arena, void* ptr, u64 size); -SP_API void sp_mem_allocator_free(sp_mem_t arena, void* buffer); -SP_API void* sp_mem_os_alloc(u64 size); -SP_API void* sp_mem_os_alloc_zero(u64 size); -SP_API void* sp_mem_os_realloc(void* ptr, u64 size); -SP_API void sp_mem_os_free(void* ptr); -SP_API void* sp_mem_os_on_alloc(void* ud, sp_mem_alloc_mode_t mode, u64 size, void* ptr); -SP_API sp_mem_os_header_t* sp_mem_os_get_header(void* ptr); -SP_API sp_mem_t sp_mem_os_new(); -SP_API sp_mem_t sp_mem_arena_as_allocator(sp_mem_arena_t* arena); -SP_API void* sp_alloc(sp_mem_t mem, u64 size); -SP_API void* sp_realloc(sp_mem_t mem, void* memory, u64 size); -SP_API void sp_free(sp_mem_t mem, void* memory); SP_API sp_mem_arena_t* sp_mem_arena_new(sp_mem_t mem); SP_API sp_mem_arena_t* sp_mem_arena_new_ex(sp_mem_t mem, u64 block_size, sp_mem_arena_mode_t mode, u8 alignment); +SP_API sp_mem_t sp_mem_arena_as_allocator(sp_mem_arena_t* arena); SP_API void sp_mem_arena_clear(sp_mem_arena_t* arena); SP_API void sp_mem_arena_destroy(sp_mem_arena_t* arena); SP_API void* sp_mem_arena_on_alloc(void* ptr, sp_mem_alloc_mode_t mode, u64 n, void* old); @@ -1539,29 +1561,129 @@ SP_API u64 sp_mem_arena_bytes_used(sp_mem_arena_t* arena); SP_API void* sp_mem_arena_alloc(sp_mem_arena_t* arena, u64 size); SP_API void* sp_mem_arena_realloc(sp_mem_arena_t* arena, void* ptr, u64 size); SP_API void sp_mem_arena_free(sp_mem_arena_t* arena, void* ptr); -SP_API sp_mem_fixed_t sp_mem_fixed(void* buffer, u64 capacity); -SP_API sp_mem_fixed_t sp_mem_fixed_ex(void* buffer, u64 capacity, u8 alignment); -SP_API sp_mem_t sp_mem_fixed_as_allocator(sp_mem_fixed_t* fixed); -SP_API void sp_mem_fixed_clear(sp_mem_fixed_t* fixed); 
-SP_API u64 sp_mem_fixed_bytes_used(sp_mem_fixed_t* fixed); -SP_API void* sp_mem_fixed_on_alloc(void* ud, sp_mem_alloc_mode_t mode, u64 size, void* old); -SP_API sp_mem_t sp_mem_get_scratch(); SP_API sp_mem_arena_t* sp_mem_get_scratch_arena(); SP_API sp_mem_arena_t* sp_mem_get_scratch_arena_for(sp_mem_t mem); SP_API sp_mem_arena_marker_t sp_mem_begin_scratch(); SP_API sp_mem_arena_marker_t sp_mem_begin_scratch_for(sp_mem_t mem); SP_API void sp_mem_end_scratch(sp_mem_arena_marker_t marker); -#define sp_sys_alloc_n(T, n) (T*)sp_sys_alloc((n) * sizeof(T)) -#define sp_sys_alloc_type(T) sp_sys_alloc_n(T, 1) -#define sp_mem_allocator_alloc_n(a, T, n) (T*)sp_mem_allocator_alloc(a, (n) * sizeof(T)) -#define sp_mem_allocator_alloc_type(a, T) sp_mem_allocator_alloc_n(a, T, 1) -#define sp_mem_arena_alloc_n(a, T, n) (T*)sp_mem_arena_alloc((a), (n) * sizeof(T)) -#define sp_mem_arena_alloc_type(a, T) sp_mem_arena_alloc_n(a, T, 1) -#define sp_alloc_n(a, T, n) (T*)sp_alloc(a, (n) * sizeof(T)) -#define sp_alloc_type(a, T) sp_alloc_n(a, T, 1) +/* + HEAP ALLOCATOR + + sp_mem_heap_t is a general purpose heap allocator, like malloc(). This means + that it's suitable for most allocations that programs want to do, but not + necessarily optimal or ergonomic. + + Small allocations are rounded up to a bucket size and handed out from + bucket-sized chunks carved out of 4KB pages, called spans. Spans are in + turn carved out of 64KB segments, which are what the heap actually + requests from the OS; empty spans go onto a free list for reuse by any + bucket, and segments are only returned to the OS when the heap is + destroyed. Large allocations aren't carved out from anything; each large + allocation makes a syscall. + + ## Design + + The diagram below is a simplified visual of how that 4KB is laid out. First, + there's a header. Then, the remaining memory is split into bucket sized + chunks. 
The span keeps a free list of chunks by using the first 8 bytes + of each chunk as the pointer in a linked list. + + │ 48B │ 512B │ 512B │ 512B │ + │ header │ chunks[0] │ chunks[1] │ chunks[2] │ + ┌───────────┬────────┬──────────┬────────┬──────────┬───────────────────┐ + │ free_head │ next │ junk │ NULL │ junk │ user bytes │ + └─────┬─────┴─┬───┬──┴──────────┴─┬──────┴──────────┴───────────────────┘ + │ ▲ │ ▲ + └───────┘ └───────────────┘ + + Allocation simply calculates the bucket for the request, finds the span that + holds the chunks for that bucket size, and returns the first such chunk from + the span's free list. + + │ header │ chunks[0] │ chunks[1] │ chunks[2] │ + ┌───────────┬───────────────────┬────────┬──────────┬───────────────────┐ + │ free_head │ user bytes │ NULL │ junk │ user bytes │ + └─────┬─────┴───────────────────┴─┬──────┴──────────┴───────────────────┘ + │ ▲ + └───────────────────────────┘ + + Deallocation is identical, but in reverse; it sets the span's free list head + to the newly freed chunk and links it to the rest of the free list. + + ## Performance Characteristics + + Allocations are always rounded up to the nearest bucket. If you ask + for, say, 50 bytes, the allocator will use a 64 byte chunk. This is, + obviously, inefficient in overhead. Buckets are spaced roughly 1.2x to 2x + apart and sized so that a whole number of chunks packs a span almost + exactly; the worst rounding waste is bounded by the gap to the next + bucket, and the worst packing waste by the span's leftover tail. + + In exchange for this inefficiency, you get better utilization. A freed + chunk is immediately reusable, always. malloc() can fragment across a + program's lifetime no matter what. This heap allocator does not; overhead + is purely a function of your size distribution, not how long the program + has been running.
+ */ +#define SP_MEM_HEAP_NUM_BUCKETS 16 /* number of small size classes; sizes the table in sp_mem_heap_bucket_size */ +#define SP_MEM_HEAP_SEGMENT_SIZE 65536 /* bytes requested from the OS per segment */ +#define SP_MEM_HEAP_SPAN_SIZE 4096 /* bytes per span; a chunk pointer is rounded down to this to find its span header */ +#define SP_MEM_HEAP_MAX_SMALL 2016 /* chunk size of the largest bucket; bigger requests take the large path */ +#define SP_MEM_HEAP_SPAN_MAGIC 0x53504D48u /* ASCII "SPMH"; present only while a span is assigned to a bucket */ +#define SP_MEM_HEAP_LARGE_MAGIC 0x53504C47u /* ASCII "SPLG"; tags a large allocation header */ + +typedef struct SP_ALIGNED sp_mem_heap_span_t { /* header stored at the start of every span */ + u32 magic; /* SP_MEM_HEAP_SPAN_MAGIC while in use, 0 after release */ + u32 bucket; /* index of the bucket this span serves */ + u32 in_use; /* chunks currently handed out */ + void* free_head; /* first free chunk; each free chunk stores the next pointer in its first bytes */ + struct sp_mem_heap_span_t* prev; /* list links: bucket partial/full list, or the heap's recycled list */ + struct sp_mem_heap_span_t* next; + struct sp_mem_heap_t* heap; /* owning heap, checked by sp_mem_heap_find_span */ +} sp_mem_heap_span_t; + +typedef struct sp_mem_heap_segment_t { /* header stored at the start of every segment */ + struct sp_mem_heap_segment_t* next; /* singly linked list walked by sp_mem_heap_destroy */ +} sp_mem_heap_segment_t; + +typedef struct SP_ALIGNED sp_mem_heap_large_t { /* header placed directly in front of a large allocation's user bytes */ + u32 magic; /* SP_MEM_HEAP_LARGE_MAGIC */ + u32 pad; + u64 size; /* bytes the caller asked for */ + u64 capacity; /* bytes obtained from sp_sys_alloc, header included */ + struct sp_mem_heap_large_t* prev; /* links in the heap's list of large allocations */ + struct sp_mem_heap_large_t* next; + struct sp_mem_heap_t* heap; /* owning heap */ +} sp_mem_heap_large_t; +typedef struct { + sp_mem_heap_span_t* partial; /* spans with at least one free chunk */ + sp_mem_heap_span_t* full; /* spans with no free chunk */ +} sp_mem_heap_bucket_t; + +typedef struct sp_mem_heap_t { + sp_mem_heap_bucket_t buckets [SP_MEM_HEAP_NUM_BUCKETS]; + sp_mem_heap_large_t* larges; /* every live large allocation */ + sp_mem_heap_segment_t* segments; /* every segment mapped so far; freed only in sp_mem_heap_destroy */ + sp_mem_heap_span_t* recycled; /* empty spans available to any bucket */ + u64 bytes_used; /* small: chunk sizes handed out; large: requested sizes */ + u64 bytes_reserved; /* running total maintained by sp_mem_heap_track_reserve */ + u64 peak_reserved; /* high-water mark of bytes_reserved */ +} sp_mem_heap_t; + +SP_API sp_mem_heap_t* sp_mem_heap_new(); +SP_API void sp_mem_heap_destroy(sp_mem_heap_t* heap); +SP_API sp_mem_t sp_mem_heap_as_allocator(sp_mem_heap_t* heap); +SP_API void* sp_mem_heap_on_alloc(void* ud, sp_mem_alloc_mode_t mode, u64 size, void* ptr); +SP_API void* sp_mem_heap_alloc(sp_mem_heap_t* heap, u64 size); +SP_API void* sp_mem_heap_realloc(sp_mem_heap_t* heap, void* ptr, u64 size); +SP_API void sp_mem_heap_free(sp_mem_heap_t* heap, void* ptr); +SP_API sp_mem_heap_span_t* sp_mem_heap_find_span(sp_mem_heap_t* heap, void* ptr); +/////////// +// SLICE // +/////////// typedef struct { sp_mem_slice_t slice; u64 index; @@ -3624,10 +3746,14 @@ SP_IMP sp_hash_t sp_hash_str(sp_str_t str); // @memory SP_IMP sp_mem_arena_block_t* sp_mem_arena_block_new(sp_mem_arena_t* arena, u64 capacity); -SP_IMP void*
sp_mem_arena_align_block(sp_mem_arena_block_t* block, u8 alignment); +SP_IMP void* sp_mem_arena_align_block(sp_mem_arena_block_t* block, u8 alignment); SP_IMP sp_mem_arena_block_t* sp_mem_arena_get_block(sp_mem_arena_t* arena, u64 alloc_size); -SP_IMP void* sp_mem_arena_alloc_with_header(sp_mem_arena_t* arena, u64 size); -SP_IMP void* sp_mem_arena_alloc_no_header(sp_mem_arena_t* arena, u64 size); +SP_IMP void* sp_mem_arena_alloc_with_header(sp_mem_arena_t* arena, u64 size); +SP_IMP void* sp_mem_arena_alloc_no_header(sp_mem_arena_t* arena, u64 size); +SP_IMP u32 sp_mem_heap_bucket_of(u64 size); +SP_IMP u64 sp_mem_heap_bucket_size(u32 bucket); +SP_IMP void sp_mem_heap_track_reserve(sp_mem_heap_t* heap, u64 bytes); +SP_IMP void sp_mem_heap_span_release(sp_mem_heap_t* heap, sp_mem_heap_span_t* span); // @string SP_IMP bool sp_utf8_is_cont(u8 b); @@ -4055,6 +4181,7 @@ s32 errno; #define SP_SYSCALL_NUM_DUP3 292 #define SP_SYSCALL_NUM_PIPE2 293 #define SP_SYSCALL_NUM_INOTIFY_INIT1 294 + #define SP_SYSCALL_NUM_PERF_EVENT_OPEN 298 #elif defined(SP_ARM64) #define SP_SYSCALL_NUM_GETCWD 17 @@ -4100,6 +4227,7 @@ s32 errno; #define SP_SYSCALL_NUM_CLONE 220 #define SP_SYSCALL_NUM_EXECVE 221 #define SP_SYSCALL_NUM_MMAP 222 + #define SP_SYSCALL_NUM_PERF_EVENT_OPEN 241 #define SP_SYSCALL_NUM_WAIT4 260 #define SP_SYSCALL_NUM_SENDFILE 71 #define SP_SYSCALL_NUM_COPY_FILE_RANGE 285 @@ -8064,7 +8192,7 @@ sp_mem_arena_block_t* sp_mem_arena_block_new(sp_mem_arena_t* arena, u64 capacity } sp_mem_arena_t* sp_mem_arena_new(sp_mem_t mem) { - return sp_mem_arena_new_ex(mem, SP_MEM_ARENA_BLOCK_SIZE, SP_MEM_ARENA_MODE_DEFAULT, SP_MEM_ALIGNMENT); + return sp_mem_arena_new_ex(mem, 4096, SP_MEM_ARENA_MODE_DEFAULT, SP_MEM_ALIGNMENT); } sp_mem_arena_t* sp_mem_arena_new_ex(sp_mem_t mem, u64 block_size, sp_mem_arena_mode_t mode, u8 alignment) { @@ -8365,10 +8493,6 @@ void* sp_mem_os_alloc(u64 size) { return h + 1; } -void* sp_mem_os_alloc_zero(u64 size) { - return sp_mem_os_alloc(size); -} - void 
sp_mem_os_free(void* ptr) { if (!ptr) return; sp_mem_os_header_t* h = sp_mem_os_get_header(ptr); @@ -14246,7 +14370,7 @@ sp_mem_os_header_t* sp_mem_os_get_header(void* ptr) { void* sp_mem_os_on_alloc(void* user_data, sp_mem_alloc_mode_t mode, u64 size, void* ptr) { (void)user_data; switch (mode) { - case SP_ALLOCATOR_MODE_ALLOC: return sp_mem_os_alloc_zero(size); + case SP_ALLOCATOR_MODE_ALLOC: return sp_mem_os_alloc(size); case SP_ALLOCATOR_MODE_RESIZE: return sp_mem_os_realloc(ptr, size); case SP_ALLOCATOR_MODE_FREE: sp_mem_os_free(ptr); return SP_NULLPTR; default: return SP_NULLPTR; @@ -14260,6 +14384,252 @@ sp_mem_t sp_mem_os_new() { return allocator; } +//////////////////// +// HEAP ALLOCATOR // +//////////////////// +u32 sp_mem_heap_bucket_of(u64 size) { /* Index of the smallest bucket whose chunk size is >= size; SP_MEM_HEAP_NUM_BUCKETS means no bucket fits. */ + sp_for(it, SP_MEM_HEAP_NUM_BUCKETS) { + if (size <= sp_mem_heap_bucket_size(it)) return it; + } + return SP_MEM_HEAP_NUM_BUCKETS; +} + +u64 sp_mem_heap_bucket_size(u32 bucket) { /* Chunk size in bytes for a bucket index; bucket is not range-checked here. */ + static const u16 sizes [SP_MEM_HEAP_NUM_BUCKETS] = { + 16, 32, 48, 64, 96, 128, 192, 256, 336, 448, 576, 672, 800, 1008, 1344, SP_MEM_HEAP_MAX_SMALL + }; + return sizes[bucket]; +} + /* Pushes node onto the front of the doubly linked list whose head pointer is *head. */ +#define sp_mem_heap_list_push(head, node) do { \ + (node)->prev = SP_NULLPTR; \ + (node)->next = *(head); \ + if (*(head)) (*(head))->prev = (node); \ + *(head) = (node); \ + } while (0) + /* Removes node from the list rooted at *head and clears node's own links. */ +#define sp_mem_heap_list_unlink(head, node) do { \ + if ((node)->prev) (node)->prev->next = (node)->next; \ + else *(head) = (node)->next; \ + if ((node)->next) (node)->next->prev = (node)->prev; \ + (node)->prev = SP_NULLPTR; \ + (node)->next = SP_NULLPTR; \ + } while (0) + +void sp_mem_heap_track_reserve(sp_mem_heap_t* heap, u64 bytes) { /* Adds bytes to bytes_reserved and raises peak_reserved when the new total exceeds it. */ + heap->bytes_reserved += bytes; + heap->peak_reserved = sp_max(heap->peak_reserved, heap->bytes_reserved); +} + +sp_mem_heap_t* sp_mem_heap_new() { /* Allocates the heap struct with sp_sys_alloc; returns SP_NULLPTR on failure. NOTE(review): no field is initialized here, so this presumably relies on sp_sys_alloc returning zeroed memory - confirm. */ + sp_mem_heap_t* heap = sp_sys_alloc_type(sp_mem_heap_t); + if (!heap) return SP_NULLPTR; + sp_mem_heap_track_reserve(heap, sp_align_offset(sizeof(sp_mem_heap_t),
SP_MEM_HEAP_SPAN_SIZE)); + return heap; +} + +void sp_mem_heap_destroy(sp_mem_heap_t* heap) { /* Frees every live large allocation, then every segment, then the heap struct itself; a null heap is ignored. */ + if (!heap) return; + + sp_mem_heap_large_t* large = heap->larges; + while (large) { + sp_mem_heap_large_t* next = large->next; + sp_sys_free(large, large->capacity); + large = next; + } + + sp_mem_heap_segment_t* segment = heap->segments; + while (segment) { + sp_mem_heap_segment_t* next = segment->next; + sp_sys_free(segment, SP_MEM_HEAP_SEGMENT_SIZE); + segment = next; + } + + sp_sys_free(heap, sizeof(*heap)); +} + +sp_mem_heap_span_t* sp_mem_heap_find_span(sp_mem_heap_t* heap, void* ptr) { /* Returns the span header for ptr when it carries the span magic and belongs to heap, else SP_NULLPTR. The header is read at ptr rounded down to SP_MEM_HEAP_SPAN_SIZE, so that address must be readable. */ + if (!heap || !ptr) return SP_NULLPTR; + + sp_mem_heap_span_t* span = sp_ptr_cast(sp_mem_heap_span_t*, sp_align_down(ptr, SP_MEM_HEAP_SPAN_SIZE)); + if (span->magic != SP_MEM_HEAP_SPAN_MAGIC) return SP_NULLPTR; + if (span->heap != heap) return SP_NULLPTR; + return span; +} + +static sp_mem_heap_span_t* sp_mem_heap_span_new(sp_mem_heap_t* heap, u32 bucket) { /* Prepares an empty span for bucket and pushes it on that bucket's partial list; reuses a recycled span if one exists, otherwise allocates a new segment. Returns SP_NULLPTR if the segment allocation fails. */ + sp_mem_heap_span_t* span = heap->recycled; + if (span) { + sp_mem_heap_list_unlink(&heap->recycled, span); + } + else { + sp_mem_heap_segment_t* segment = sp_ptr_cast(sp_mem_heap_segment_t*, sp_sys_alloc(SP_MEM_HEAP_SEGMENT_SIZE)); + if (!segment) return SP_NULLPTR; + sp_assert(sp_align_down(segment, SP_MEM_HEAP_SPAN_SIZE) == sp_uptr(segment)); + segment->next = heap->segments; + heap->segments = segment; + sp_mem_heap_track_reserve(heap, SP_MEM_HEAP_SEGMENT_SIZE); + /* slot 0 holds the segment header, slot 1 is used right away, slots 2 and up go to the recycled list */ + span = sp_ptr_cast(sp_mem_heap_span_t*, (u8*)segment + SP_MEM_HEAP_SPAN_SIZE); + sp_for_range(it, 2, SP_MEM_HEAP_SEGMENT_SIZE / SP_MEM_HEAP_SPAN_SIZE) { + sp_mem_heap_span_t* slot = sp_ptr_cast(sp_mem_heap_span_t*, (u8*)segment + (it * SP_MEM_HEAP_SPAN_SIZE)); + sp_mem_heap_list_push(&heap->recycled, slot); + } + } + + u64 bucket_size = sp_mem_heap_bucket_size(bucket); + u8* base = (u8*)span + sizeof(sp_mem_heap_span_t); + u8* end = (u8*)span + SP_MEM_HEAP_SPAN_SIZE; + u32 num_chunks = (u32)((u64)(end - base) / bucket_size); + sp_assert(num_chunks); +
span->magic = SP_MEM_HEAP_SPAN_MAGIC; + span->bucket = bucket; + span->in_use = 0; + span->heap = heap; + /* thread the chunks back to front so free_head ends up on the lowest-addressed chunk */ + void* head = SP_NULLPTR; + sp_for(it, num_chunks) { + u8* chunk = base + ((num_chunks - 1 - it) * bucket_size); + *(void**)chunk = head; + head = chunk; + } + span->free_head = head; + + sp_mem_heap_list_push(&heap->buckets[bucket].partial, span); + return span; +} + +void sp_mem_heap_span_release(sp_mem_heap_t* heap, sp_mem_heap_span_t* span) { /* Moves span from its bucket's partial list to the heap's recycled list; clearing magic makes sp_mem_heap_find_span reject pointers into it. */ + sp_mem_heap_list_unlink(&heap->buckets[span->bucket].partial, span); + span->magic = 0; + sp_mem_heap_list_push(&heap->recycled, span); +} + +void* sp_mem_heap_alloc(sp_mem_heap_t* heap, u64 size) { /* Small sizes take a chunk from the head of their bucket's partial list and zero it; larger sizes get their own sp_sys_alloc block with a header in front. Returns SP_NULLPTR on a null heap or allocation failure. */ + if (!heap) return SP_NULLPTR; + + u32 bucket = sp_mem_heap_bucket_of(size); + if (bucket < SP_MEM_HEAP_NUM_BUCKETS) { + sp_mem_heap_span_t* span = heap->buckets[bucket].partial; + if (!span) span = sp_mem_heap_span_new(heap, bucket); + if (!span) return SP_NULLPTR; + + void* chunk = span->free_head; + span->free_head = *(void**)chunk; + span->in_use++; + if (!span->free_head) { /* no free chunk left: park the span on the full list */ + sp_mem_heap_list_unlink(&heap->buckets[bucket].partial, span); + sp_mem_heap_list_push(&heap->buckets[bucket].full, span); + } + + u64 bucket_size = sp_mem_heap_bucket_size(bucket); + heap->bytes_used += bucket_size; + sp_mem_zero(chunk, bucket_size); + return chunk; + } + + u64 capacity = sp_align_offset(size + sizeof(sp_mem_heap_large_t), SP_MEM_HEAP_SPAN_SIZE); + if (capacity <= size) return SP_NULLPTR; /* size plus header wrapped around u64 */ + sp_mem_heap_large_t* large = (sp_mem_heap_large_t*)sp_sys_alloc(capacity); + if (!large) return SP_NULLPTR; + large->magic = SP_MEM_HEAP_LARGE_MAGIC; + large->size = size; + large->capacity = capacity; + large->heap = heap; + sp_mem_heap_list_push(&heap->larges, large); + sp_mem_heap_track_reserve(heap, capacity); + heap->bytes_used += size; + return large + 1; +} + +void sp_mem_heap_free(sp_mem_heap_t* heap, void* ptr) { /* Releases ptr; a null heap or ptr is ignored. A pointer inside a span goes back on that span's free list, anything else is treated as a large allocation. */ + if (!heap || !ptr) return; + + sp_mem_heap_span_t* span = sp_mem_heap_find_span(heap, ptr); + if (span) { +
sp_assert(span->in_use); + bool was_full = !span->free_head; + *(void**)ptr = span->free_head; + span->free_head = ptr; + span->in_use--; + heap->bytes_used -= sp_mem_heap_bucket_size(span->bucket); + + if (was_full) { + sp_mem_heap_list_unlink(&heap->buckets[span->bucket].full, span); + sp_mem_heap_list_push(&heap->buckets[span->bucket].partial, span); + } + if (!span->in_use) { /* last chunk came back: recycle the whole span */ + sp_mem_heap_span_release(heap, span); + } + return; + } + + sp_mem_heap_large_t* large = ((sp_mem_heap_large_t*)ptr) - 1; + sp_assert(large->magic == SP_MEM_HEAP_LARGE_MAGIC); + sp_assert(large->heap == heap); + sp_mem_heap_list_unlink(&heap->larges, large); + heap->bytes_used -= large->size; + heap->bytes_reserved -= large->capacity; + sp_sys_free(large, large->capacity); +} + +void* sp_mem_heap_realloc(sp_mem_heap_t* heap, void* ptr, u64 size) { /* Resizes ptr. A null ptr behaves like alloc; size 0 frees ptr and returns SP_NULLPTR. Stays in place when the new size maps to the same bucket or fits the existing large block, otherwise allocates, copies and frees. */ + if (!heap) return SP_NULLPTR; + if (!ptr) return sp_mem_heap_alloc(heap, size); + if (!size) { + sp_mem_heap_free(heap, ptr); + return SP_NULLPTR; + } + + u64 old_size = 0; + sp_mem_heap_span_t* span = sp_mem_heap_find_span(heap, ptr); + if (span) { + u64 bucket_size = sp_mem_heap_bucket_size(span->bucket); + if (sp_mem_heap_bucket_of(size) == span->bucket) { + sp_mem_zero((u8*)ptr + size, bucket_size - size); /* clear the chunk beyond the new size */ + return ptr; + } + old_size = bucket_size; + } + else { + sp_mem_heap_large_t* large = ((sp_mem_heap_large_t*)ptr) - 1; + sp_assert(large->magic == SP_MEM_HEAP_LARGE_MAGIC); + sp_assert(large->heap == heap); + if (size > SP_MEM_HEAP_MAX_SMALL && size <= large->capacity - sizeof(sp_mem_heap_large_t)) { + if (size < large->size) sp_mem_zero((u8*)ptr + size, large->size - size); + heap->bytes_used -= large->size; + heap->bytes_used += size; + large->size = size; + return ptr; + } + old_size = large->size; + } + + void* fresh = sp_mem_heap_alloc(heap, size); + if (!fresh) return SP_NULLPTR; /* ptr is left untouched on failure */ + sp_mem_copy(fresh, ptr, sp_min(old_size, size)); + sp_mem_heap_free(heap, ptr); + return fresh; +} + +void* sp_mem_heap_on_alloc(void* user_data,
sp_mem_alloc_mode_t mode, u64 size, void* ptr) { /* sp_mem_t callback: treats user_data as the heap and dispatches on mode; any other mode returns SP_NULLPTR. */ + sp_mem_heap_t* heap = (sp_mem_heap_t*)user_data; + switch (mode) { + case SP_ALLOCATOR_MODE_ALLOC: return sp_mem_heap_alloc(heap, size); + case SP_ALLOCATOR_MODE_RESIZE: return sp_mem_heap_realloc(heap, ptr, size); + case SP_ALLOCATOR_MODE_FREE: sp_mem_heap_free(heap, ptr); return SP_NULLPTR; + } + return SP_NULLPTR; +} + +sp_mem_t sp_mem_heap_as_allocator(sp_mem_heap_t* heap) { /* Wraps heap in an sp_mem_t that routes through sp_mem_heap_on_alloc. */ + return (sp_mem_t) { + .on_alloc = sp_mem_heap_on_alloc, + .user_data = heap + }; +} + void* sp_alloc(sp_mem_t allocator, u64 size) { return sp_mem_allocator_alloc(allocator, size); } diff --git a/sp/sp_prompt.h b/sp/sp_prompt.h index 2df9f83..95c911a 100644 --- a/sp/sp_prompt.h +++ b/sp/sp_prompt.h @@ -1014,7 +1014,7 @@ void sp_prompt_ctx_init(sp_prompt_ctx_t* ctx, sp_mem_t mem, u32 cols, u32 rows) sp_da_init(ctx->mem, ctx->frames); sp_mutex_init(&ctx->channel.lock, SP_MUTEX_PLAIN); - ctx->channel.arena = sp_mem_arena_new_ex(mem, SP_MEM_ARENA_BLOCK_SIZE, SP_MEM_ARENA_MODE_DEFAULT, SP_MEM_ALIGNMENT); + ctx->channel.arena = sp_mem_arena_new_ex(mem, 4096, SP_MEM_ARENA_MODE_DEFAULT, SP_MEM_ALIGNMENT); // Write buffering is really important, because our rendering algorithm is extremely // naive.
It's not much more than this: diff --git a/test/bench/glob.c b/test/bench/glob.c index 8f237bb..e9811f5 100644 --- a/test/bench/glob.c +++ b/test/bench/glob.c @@ -1,144 +1,128 @@ -#define SP_APP -#include "sp.h" +#include "ubench.h" #include "sp/sp_glob.h" -#define BENCH_ITERATIONS 1000000 +#define GLOB_BENCH_MAX_PATTERNS 16 typedef struct { - const c8* pattern; - const c8* path; -} bench_case_t; + bool match; +} glob_bench_expect_t; typedef struct { - sp_str_t name; - f64 ns_per_op; -} bench_result_t; - -// From original rust globset benchmarks -static bench_case_t bench_cases[] = { - {.pattern = "*.txt", .path = "some/a/bigger/path/to/the/crazy/needle.txt"}, - {.pattern = "some/**/needle.txt", .path = "some/needle.txt"}, - {.pattern = "some/**/needle.txt", .path = "some/a/bigger/path/to/the/crazy/needle.txt"}, -}; - -static const c8* case_names[] = { - "ext", "short", "long", -}; - -// From original rust globset benchmarks -static const c8* many_short_patterns[] = { - ".*.swp", - "tags", - "target", - "*.lock", - "tmp", - "*.csv", - "*.fst", - "*-got", - "*.csv.idx", - "words", - "98m*", - "dict", - "test", - "months", -}; - -static const c8* many_short_path = "98m-blah.csv.idx"; - -static f64 run_glob_bench(sp_glob_t* g, sp_str_t path) { - for (u32 i = 0; i < 1000; i++) { - sp_glob_match(g, path); - } - - volatile bool result; - sp_tm_point_t start = sp_tm_now_point(); - for (u32 i = 0; i < BENCH_ITERATIONS; i++) { - result = sp_glob_match(g, path); + const c8* patterns[GLOB_BENCH_MAX_PATTERNS]; + const c8* path; + glob_bench_expect_t expect; +} glob_bench_t; + +static void run_glob_bench(ubench_run_state_t* ubench_run_state, glob_bench_t bench) { + sp_mem_arena_marker_t scratch = sp_mem_begin_scratch(); + sp_glob_t* glob = sp_glob_new(scratch.mem, bench.patterns[0]); + sp_str_t path = sp_str_view(bench.path); + + SP_ASSERT(glob != SP_NULLPTR); + SP_ASSERT(sp_glob_match(glob, path) == bench.expect.match); + + UBENCH_DO_BENCHMARK() { + UBENCH_LOOP { + bool 
matched = sp_glob_match(glob, path); + UBENCH_DO_NOT_OPTIMIZE(matched); + } } - sp_tm_point_t end = sp_tm_now_point(); - (void)result; - return (f64)sp_tm_point_diff(end, start) / (f64)BENCH_ITERATIONS; + sp_mem_end_scratch(scratch); } -static f64 run_glob_set_bench(sp_glob_set_t* set, sp_str_t path) { - for (u32 i = 0; i < 1000; i++) { - sp_glob_set_match(set, path); +static void run_glob_set_bench(ubench_run_state_t* ubench_run_state, glob_bench_t bench) { + sp_mem_arena_marker_t scratch = sp_mem_begin_scratch(); + sp_glob_set_t* set = sp_glob_set_new(scratch.mem); + sp_carr_for(bench.patterns, it) { + if (!bench.patterns[it]) break; + sp_glob_set_add(set, bench.patterns[it]); } + sp_glob_set_build(set); + sp_str_t path = sp_str_view(bench.path); + + SP_ASSERT(sp_glob_set_match(set, path) == bench.expect.match); - volatile bool result; - sp_tm_point_t start = sp_tm_now_point(); - for (u32 i = 0; i < BENCH_ITERATIONS; i++) { - result = sp_glob_set_match(set, path); + UBENCH_DO_BENCHMARK() { + UBENCH_LOOP { + bool matched = sp_glob_set_match(set, path); + UBENCH_DO_NOT_OPTIMIZE(matched); + } } - sp_tm_point_t end = sp_tm_now_point(); - (void)result; - return (f64)sp_tm_point_diff(end, start) / (f64)BENCH_ITERATIONS; + sp_mem_end_scratch(scratch); } -int main(int argc, char** argv) { - (void)argc; - (void)argv; - - sp_mem_arena_t* arena = sp_mem_arena_new_ex(sp_mem_os_new(), 4 * 1024 * 1024, SP_MEM_ARENA_MODE_DEFAULT, SP_MEM_ALIGNMENT); - sp_mem_t allocator = sp_mem_arena_as_allocator(arena); - (void)allocator; - - u32 num_cases = sizeof(bench_cases) / sizeof(bench_cases[0]); - sp_da(bench_result_t) results = sp_da_new(allocator, bench_result_t); - - // Pre-compile all globs - sp_da(sp_glob_t*) globs = sp_da_new(allocator, sp_glob_t*); - sp_da(sp_glob_set_t*) globsets = sp_da_new(allocator, sp_glob_set_t*); - sp_carr_for(bench_cases, i) { - sp_glob_t* g = sp_glob_new(allocator, bench_cases[i].pattern); - SP_ASSERT(g != SP_NULLPTR); - sp_da_push(globs, g); - - 
sp_glob_set_t* set = sp_glob_set_new(allocator); - sp_glob_set_add(set, bench_cases[i].pattern); - sp_glob_set_build(set); - sp_da_push(globsets, set); - } +UBENCH_EX(glob, ext) { + run_glob_bench(ubench_run_state, (glob_bench_t) { + .patterns = { "*.txt" }, + .path = "some/a/bigger/path/to/the/crazy/needle.txt", + .expect = { .match = true }, + }); +} - // Pre-compile many_short globset - sp_glob_set_t* many_short_set = sp_glob_set_new(allocator); - sp_carr_for(many_short_patterns, i) { - sp_glob_set_add(many_short_set, many_short_patterns[i]); - } - sp_glob_set_build(many_short_set); - - // Single glob benchmarks - for (u32 i = 0; i < num_cases; i++) { - sp_str_t path = sp_str_view(bench_cases[i].path); - SP_ASSERT(sp_glob_match(globs[i], path)); - f64 ns = run_glob_bench(globs[i], path); - sp_str_t name = sp_fmt(sp_mem_get_scratch(), "{}_glob", sp_fmt_cstr(case_names[i])).value; - sp_da_push(results, ((bench_result_t){.name = name, .ns_per_op = ns})); - } +UBENCH_EX(glob, short) { + run_glob_bench(ubench_run_state, (glob_bench_t) { + .patterns = { "some/**/needle.txt" }, + .path = "some/needle.txt", + .expect = { .match = true }, + }); +} - // GlobSet single pattern benchmarks - for (u32 i = 0; i < num_cases; i++) { - sp_str_t path = sp_str_view(bench_cases[i].path); - SP_ASSERT(sp_glob_set_match(globsets[i], path)); - f64 ns = run_glob_set_bench(globsets[i], path); - sp_str_t name = sp_fmt(sp_mem_get_scratch(), "{}_globset", sp_fmt_cstr(case_names[i])).value; - sp_da_push(results, ((bench_result_t){.name = name, .ns_per_op = ns})); - } +UBENCH_EX(glob, long) { + run_glob_bench(ubench_run_state, (glob_bench_t) { + .patterns = { "some/**/needle.txt" }, + .path = "some/a/bigger/path/to/the/crazy/needle.txt", + .expect = { .match = true }, + }); +} - // many_short benchmark (14 patterns, 2 matches expected) - { - sp_str_t path = sp_str_view(many_short_path); - f64 ns = run_glob_set_bench(many_short_set, path); - sp_da_push(results, ((bench_result_t){.name = 
sp_str_lit("many_short_globset"), .ns_per_op = ns})); - } +UBENCH_EX(globset, ext) { + run_glob_set_bench(ubench_run_state, (glob_bench_t) { + .patterns = { "*.txt" }, + .path = "some/a/bigger/path/to/the/crazy/needle.txt", + .expect = { .match = true }, + }); +} - // Print space-separated pairs - sp_da_for(results, i) { - sp_log("{} {}", sp_fmt_str(results[i].name), sp_fmt_float(results[i].ns_per_op)); - } +UBENCH_EX(globset, short) { + run_glob_set_bench(ubench_run_state, (glob_bench_t) { + .patterns = { "some/**/needle.txt" }, + .path = "some/needle.txt", + .expect = { .match = true }, + }); +} + +UBENCH_EX(globset, long) { + run_glob_set_bench(ubench_run_state, (glob_bench_t) { + .patterns = { "some/**/needle.txt" }, + .path = "some/a/bigger/path/to/the/crazy/needle.txt", + .expect = { .match = true }, + }); +} - return 0; +UBENCH_EX(globset, many_short) { + run_glob_set_bench(ubench_run_state, (glob_bench_t) { + .patterns = { + ".*.swp", + "tags", + "target", + "*.lock", + "tmp", + "*.csv", + "*.fst", + "*-got", + "*.csv.idx", + "words", + "98m*", + "dict", + "test", + "months", + }, + .path = "98m-blah.csv.idx", + .expect = { .match = true }, + }); } + +UBENCH_MAIN() diff --git a/test/bench/heap.c b/test/bench/heap.c new file mode 100644 index 0000000..7acf82f --- /dev/null +++ b/test/bench/heap.c @@ -0,0 +1,557 @@ +#include "sp.h" + +#define SP_TABLE_IMPLEMENTATION +#include "table.h" + +#if defined(__GLIBC__) + #include <malloc.h> /* declares struct mallinfo2 / mallinfo2(), read by bench_malloc_sample */ +#endif + +#define BENCH_MAX_SLOTS 262144 +#define BENCH_MAX_BACKENDS 3 +#define BENCH_MAX_PHASES 3 + +typedef enum { + DIST_FIXED, + DIST_UNIFORM, + DIST_LOG, +} bench_dist_t; + +typedef enum { + WORK_CHURN, + WORK_RAMP_LIFO, + WORK_RAMP_FIFO, + WORK_PIN, + WORK_REALLOC, +} bench_work_t; + +typedef struct { + const c8* name; + bench_work_t kind; + bench_dist_t dist; + u64 lo; + u64 hi; + u32 slots; + u32 ops; + u32 survive_pct; +} bench_workload_t; + +typedef struct { + const c8* name; + void* (*create)(); + void (*destroy)(void* ctx); +
sp_mem_t (*as_mem)(void* ctx); + bool (*sample)(void* ctx, u64* used, u64* reserved); +} bench_backend_t; + +typedef struct { + void* ptrs [BENCH_MAX_SLOTS]; + u64 sizes [BENCH_MAX_SLOTS]; + u64 live_req; + u64 rng; + sp_mem_t mem; +} bench_state_t; + +static bench_state_t state = sp_zero; + +static u64 bench_rng_next() { + u64 x = state.rng; + x ^= x >> 12; + x ^= x << 25; + x ^= x >> 27; + state.rng = x; + return x * 0x2545F4914F6CDD1DULL; +} + +static u64 bench_log2(u64 v) { + u64 r = 0; + while (v >>= 1) r++; + return r; +} + +static u64 bench_dist_size(const bench_workload_t* w) { + switch (w->dist) { + case DIST_FIXED: return w->lo; + case DIST_UNIFORM: return w->lo + (bench_rng_next() % (w->hi - w->lo + 1)); + case DIST_LOG: { + u64 lo_exp = bench_log2(w->lo); + u64 hi_exp = bench_log2(w->hi); + u64 e = lo_exp + (bench_rng_next() % (hi_exp - lo_exp + 1)); + u64 size = ((u64)1 << e) + (bench_rng_next() % ((u64)1 << e)); + return sp_min(sp_max(size, w->lo), w->hi); + } + } + return w->lo; +} + +static void bench_touch(void* ptr, u64 size) { + u8* bytes = (u8*)ptr; + u64 head = size & ~(u64)7; + for (u64 i = 0; i < head; i += 8) { + *(u64*)(bytes + i) = 0x5050505050505050ULL; + } + for (u64 i = head; i < size; i++) { + bytes[i] = 0x50; + } +} + +static void bench_alloc_slot(const bench_workload_t* w, u32 slot) { + u64 size = bench_dist_size(w); + state.ptrs[slot] = sp_alloc(state.mem, size); + SP_ASSERT(state.ptrs[slot]); + bench_touch(state.ptrs[slot], size); + state.sizes[slot] = size; + state.live_req += size; +} + +static void bench_free_slot(u32 slot) { + sp_free(state.mem, state.ptrs[slot]); + state.live_req -= state.sizes[slot]; + state.ptrs[slot] = SP_NULLPTR; + state.sizes[slot] = 0; +} + +static void bench_realloc_slot(u32 slot, u64 size) { + state.ptrs[slot] = sp_realloc(state.mem, state.ptrs[slot], size); + SP_ASSERT(state.ptrs[slot]); + bench_touch(state.ptrs[slot], size); + state.live_req -= state.sizes[slot]; + state.live_req += size; + 
state.sizes[slot] = size; +} + +static void* bench_heap_create() { + return sp_mem_heap_new(); +} + +static void bench_heap_destroy(void* ctx) { + sp_mem_heap_destroy((sp_mem_heap_t*)ctx); +} + +static sp_mem_t bench_heap_as_mem(void* ctx) { + return sp_mem_heap_as_allocator((sp_mem_heap_t*)ctx); +} + +static bool bench_heap_sample(void* ctx, u64* used, u64* reserved) { + sp_mem_heap_t* heap = (sp_mem_heap_t*)ctx; + *used = heap->bytes_used; + *reserved = heap->bytes_reserved; + return true; +} + +#if !defined(SP_FREESTANDING) +static void* bench_malloc_on_alloc(void* user_data, sp_mem_alloc_mode_t mode, u64 size, void* ptr) { + sp_unused(user_data); + switch (mode) { + case SP_ALLOCATOR_MODE_ALLOC: return malloc(size); + case SP_ALLOCATOR_MODE_RESIZE: return realloc(ptr, size); + case SP_ALLOCATOR_MODE_FREE: free(ptr); return SP_NULLPTR; + } + return SP_NULLPTR; +} + +static void* bench_malloc_create() { + return SP_NULLPTR; +} + +static void bench_malloc_destroy(void* ctx) { + sp_unused(ctx); +} + +static sp_mem_t bench_malloc_as_mem(void* ctx) { + sp_unused(ctx); + return (sp_mem_t) { + .on_alloc = bench_malloc_on_alloc + }; +} + +static bool bench_malloc_sample(void* ctx, u64* used, u64* reserved) { + sp_unused(ctx); + #if defined(__GLIBC__) + struct mallinfo2 info = mallinfo2(); + *used = info.uordblks; + *reserved = info.arena + info.hblkhd; + return true; + #else + *used = 0; + *reserved = 0; + return false; + #endif +} +#endif + +typedef struct { + u64 used; + u64 reserved; + u64 peak_reserved; +} bench_os_counters_t; + +static bench_os_counters_t bench_os_counters = sp_zero; + +static u64 bench_os_reservation(u64 size) { + return sp_align_offset(size + sizeof(sp_mem_os_header_t), 4096); +} + +static void* bench_os_on_alloc(void* user_data, sp_mem_alloc_mode_t mode, u64 size, void* ptr) { + bench_os_counters_t* counters = (bench_os_counters_t*)user_data; + switch (mode) { + case SP_ALLOCATOR_MODE_ALLOC: { + void* p = sp_mem_os_alloc(size); + if (p) { + 
counters->used += size; + counters->reserved += bench_os_reservation(size); + counters->peak_reserved = sp_max(counters->peak_reserved, counters->reserved); + } + return p; + } + case SP_ALLOCATOR_MODE_RESIZE: { + if (!ptr) return bench_os_on_alloc(user_data, SP_ALLOCATOR_MODE_ALLOC, size, SP_NULLPTR); + u64 old = sp_mem_os_get_header(ptr)->size; + void* p = sp_mem_os_realloc(ptr, size); + if (p) { + u64 now = sp_mem_os_get_header(p)->size; + counters->used -= old; + counters->used += now; + counters->reserved -= bench_os_reservation(old); + counters->reserved += bench_os_reservation(now); + counters->peak_reserved = sp_max(counters->peak_reserved, counters->reserved); + } + return p; + } + case SP_ALLOCATOR_MODE_FREE: { + if (ptr) { + u64 old = sp_mem_os_get_header(ptr)->size; + counters->used -= old; + counters->reserved -= bench_os_reservation(old); + sp_mem_os_free(ptr); + } + return SP_NULLPTR; + } + } + return SP_NULLPTR; +} + +static void* bench_os_create() { + bench_os_counters = sp_zero_s(bench_os_counters_t); + return &bench_os_counters; +} + +static void bench_os_destroy(void* ctx) { + sp_unused(ctx); +} + +static sp_mem_t bench_os_as_mem(void* ctx) { + return (sp_mem_t) { + .on_alloc = bench_os_on_alloc, + .user_data = ctx + }; +} + +static bool bench_os_sample(void* ctx, u64* used, u64* reserved) { + bench_os_counters_t* counters = (bench_os_counters_t*)ctx; + *used = counters->used; + *reserved = counters->reserved; + return true; +} + +static const bench_backend_t backends [] = { + { "sp_heap", bench_heap_create, bench_heap_destroy, bench_heap_as_mem, bench_heap_sample }, + #if !defined(SP_FREESTANDING) + { "malloc", bench_malloc_create, bench_malloc_destroy, bench_malloc_as_mem, bench_malloc_sample }, + #endif + // { "sp_os", bench_os_create, bench_os_destroy, bench_os_as_mem, bench_os_sample }, +}; + +static const bench_workload_t workloads [] = { + { .name = "fixed_16", .kind = WORK_CHURN, .dist = DIST_FIXED, .lo = 16, .slots = 65536, .ops = 
400000 }, + { .name = "fixed_64", .kind = WORK_CHURN, .dist = DIST_FIXED, .lo = 64, .slots = 65536, .ops = 400000 }, + { .name = "fixed_512", .kind = WORK_CHURN, .dist = DIST_FIXED, .lo = 512, .slots = 16384, .ops = 100000 }, + { .name = "uniform_small", .kind = WORK_CHURN, .dist = DIST_UNIFORM, .lo = 1, .hi = 1024, .slots = 32768, .ops = 200000 }, + { .name = "log_mixed", .kind = WORK_CHURN, .dist = DIST_LOG, .lo = 16, .hi = 16384, .slots = 8192, .ops = 100000 }, + { .name = "large", .kind = WORK_CHURN, .dist = DIST_UNIFORM, .lo = 4096, .hi = 65536, .slots = 1024, .ops = 20000 }, + { .name = "ramp_lifo", .kind = WORK_RAMP_LIFO, .dist = DIST_UNIFORM, .lo = 16, .hi = 512, .slots = 200000 }, + { .name = "ramp_fifo", .kind = WORK_RAMP_FIFO, .dist = DIST_UNIFORM, .lo = 16, .hi = 512, .slots = 200000 }, + { .name = "pin_5pct", .kind = WORK_PIN, .dist = DIST_UNIFORM, .lo = 16, .hi = 256, .slots = 200000, .survive_pct = 5 }, + { .name = "realloc_grow", .kind = WORK_REALLOC, .dist = DIST_FIXED, .lo = 16, .hi = 65536, .slots = 2048, .ops = 100000 }, +}; + +typedef struct { + const c8* phase; + u64 ns_per_op; + void* ctx; + const bench_backend_t* backend; +} bench_report_t; + +typedef struct { + const c8* phase; + const c8* backend; + u64 ns_per_op; + u64 req; + u64 used; + u64 reserved; + u64 util; + bool exact; +} bench_result_t; + +typedef struct { + bench_result_t rows [BENCH_MAX_BACKENDS][BENCH_MAX_PHASES]; + u32 num_phases [BENCH_MAX_BACKENDS]; + u32 num_backends; +} bench_results_t; + +static bench_results_t results = sp_zero; + +static void bench_report(bench_report_t r) { + u64 used = 0; + u64 reserved = 0; + bool exact = r.backend->sample(r.ctx, &used, &reserved); + + SP_ASSERT(results.num_backends < BENCH_MAX_BACKENDS); + SP_ASSERT(results.num_phases[results.num_backends] < BENCH_MAX_PHASES); + bench_result_t* result = &results.rows[results.num_backends][results.num_phases[results.num_backends]++]; + *result = (bench_result_t) { + .phase = r.phase, + .backend = 
r.backend->name, + .ns_per_op = r.ns_per_op, + .req = state.live_req, + .used = used, + .reserved = reserved, + .util = reserved ? (state.live_req * 100) / reserved : 0, + .exact = exact, + }; +} + +static void bench_write_ratio(sp_table_writer_t* table, u64 value, u64 best, bool valid) { + if (!valid) { + sp_table_write_cstr(table, "?"); + return; + } + if (value == best) sp_table_color(table, SP_ANSI_FG_GREEN); + if (!best) { + if (value) sp_table_write_cstr(table, "-"); + else sp_table_write_f64(table, 1.0); + return; + } + sp_table_write_f64(table, (f64)value / (f64)best); +} + +static void bench_render_results() { + if (!results.num_backends) return; + + sp_mem_arena_marker_t scratch = sp_mem_begin_scratch(); + sp_table_writer_t table = sp_zero; + sp_table_init(&table, scratch.mem); + sp_table_add_col(&table, (sp_table_col_t) { .header = sp_str_lit("phase") }); + sp_table_add_col(&table, (sp_table_col_t) { .header = sp_str_lit("backend") }); + sp_table_add_col(&table, (sp_table_col_t) { .header = sp_str_lit("ns/op"), .align = SP_FMT_ALIGN_RIGHT }); + sp_table_add_col(&table, (sp_table_col_t) { .header = sp_str_lit("req"), .fmt = "{.bytes}", .align = SP_FMT_ALIGN_RIGHT }); + sp_table_add_col(&table, (sp_table_col_t) { .header = sp_str_lit("used"), .fmt = "{.bytes}", .align = SP_FMT_ALIGN_RIGHT }); + sp_table_add_col(&table, (sp_table_col_t) { .header = sp_str_lit("rsvd"), .fmt = "{.bytes}", .align = SP_FMT_ALIGN_RIGHT }); + sp_table_add_col(&table, (sp_table_col_t) { .header = sp_str_lit("util"), .fmt = "{}%", .align = SP_FMT_ALIGN_RIGHT }); + sp_table_add_col(&table, (sp_table_col_t) { .header = sp_str_lit("ns/best"), .fmt = "{:.2}x", .align = SP_FMT_ALIGN_RIGHT }); + sp_table_add_col(&table, (sp_table_col_t) { .header = sp_str_lit("rsvd/best"), .fmt = "{:.2}x", .align = SP_FMT_ALIGN_RIGHT }); + + u32 num_phases = results.num_phases[0]; + sp_for(phase, num_phases) { + u64 best_ns = 0; + u64 best_rsvd = 0; + bool have_rsvd = false; + sp_for(b, 
results.num_backends) { + bench_result_t* result = &results.rows[b][phase]; + if (!b || result->ns_per_op < best_ns) best_ns = result->ns_per_op; + if (result->exact && (!have_rsvd || result->reserved < best_rsvd)) { + best_rsvd = result->reserved; + have_rsvd = true; + } + } + + sp_for(b, results.num_backends) { + SP_ASSERT(results.num_phases[b] == num_phases); + bench_result_t* result = &results.rows[b][phase]; + sp_table_begin(&table); + sp_table_write_cstr(&table, result->phase); + sp_table_write_cstr(&table, result->backend); + sp_table_write_u64(&table, result->ns_per_op); + sp_table_write_u64(&table, result->req); + if (result->exact) { + sp_table_write_u64(&table, result->used); + sp_table_write_u64(&table, result->reserved); + sp_table_write_u64(&table, result->util); + } + else { + sp_table_write_cstr(&table, "?"); + sp_table_write_cstr(&table, "?"); + sp_table_write_cstr(&table, "?"); + } + bench_write_ratio(&table, result->ns_per_op, best_ns, true); + bench_write_ratio(&table, result->reserved, best_rsvd, result->exact && have_rsvd); + } + } + sp_table_log(&table); + sp_mem_end_scratch(scratch); +} + +static void bench_run(const bench_workload_t* w, const bench_backend_t* backend) { /* runs workload w against one backend: timed fill, then the phase selected by w->kind, then a drain of whatever is still live */ + SP_ASSERT(w->slots > 0 && w->slots <= BENCH_MAX_SLOTS); /* slots sizes the zeroing below, indexes state.ptrs/state.sizes, and is the divisor of the fill timing */ + void* ctx = backend->create(); + state.mem = backend->as_mem(ctx); + state.live_req = 0; + state.rng = 0x5EED5EED5EED5EEDULL; + sp_mem_zero(state.ptrs, w->slots * sizeof(void*)); + sp_mem_zero(state.sizes, w->slots * sizeof(u64)); + + sp_tm_timer_t timer = sp_tm_start_timer(); + sp_for(it, w->slots) { + bench_alloc_slot(w, it); + } + bench_report((bench_report_t) { + .phase = "fill", + .ns_per_op = sp_tm_read_timer(&timer) / w->slots, + .ctx = ctx, + .backend = backend, + }); + + switch (w->kind) { + case WORK_CHURN: { + sp_tm_reset_timer(&timer); + sp_for(it, w->ops) { + u32 slot = (u32)(bench_rng_next() % w->slots); + if (state.ptrs[slot]) bench_free_slot(slot); + else bench_alloc_slot(w, slot); + } + bench_report((bench_report_t) { + .phase = "churn", + .ns_per_op =
sp_tm_read_timer(&timer) / w->ops, + .ctx = ctx, + .backend = backend, + }); + break; + } + case WORK_RAMP_LIFO: { + sp_tm_reset_timer(&timer); + for (u32 it = w->slots; it > 0; it--) { + bench_free_slot(it - 1); + } + bench_report((bench_report_t) { + .phase = "free", + .ns_per_op = sp_tm_read_timer(&timer) / w->slots, + .ctx = ctx, + .backend = backend, + }); + break; + } + case WORK_RAMP_FIFO: { + sp_tm_reset_timer(&timer); + sp_for(it, w->slots) { + bench_free_slot(it); + } + bench_report((bench_report_t) { + .phase = "free", + .ns_per_op = sp_tm_read_timer(&timer) / w->slots, + .ctx = ctx, + .backend = backend, + }); + break; + } + case WORK_PIN: { + sp_tm_reset_timer(&timer); + u32 freed = 0; + sp_for(it, w->slots) { + if (bench_rng_next() % 100 >= w->survive_pct) { + bench_free_slot(it); + freed++; + } + } + bench_report((bench_report_t) { + .phase = "pinned", + .ns_per_op = sp_tm_read_timer(&timer) / sp_max(freed, 1), + .ctx = ctx, + .backend = backend, + }); + break; + } + case WORK_REALLOC: { + sp_tm_reset_timer(&timer); + sp_for(it, w->ops) { + u32 slot = (u32)(bench_rng_next() % w->slots); + if (!state.ptrs[slot]) { + bench_alloc_slot(w, slot); + } + else if (state.sizes[slot] * 2 > w->hi) { + bench_free_slot(slot); + } + else { + bench_realloc_slot(slot, state.sizes[slot] * 2); + } + } + bench_report((bench_report_t) { + .phase = "grow", + .ns_per_op = sp_tm_read_timer(&timer) / w->ops, + .ctx = ctx, + .backend = backend, + }); + break; + } + } + + sp_tm_reset_timer(&timer); + u32 drained = 0; + sp_for(it, w->slots) { + if (state.ptrs[it]) { + bench_free_slot(it); + drained++; + } + } + if (drained) { + bench_report((bench_report_t) { + .phase = "drained", + .ns_per_op = sp_tm_read_timer(&timer) / drained, + .ctx = ctx, + .backend = backend, + }); + } + + backend->destroy(ctx); + results.num_backends++; +} + +s32 main(s32 argc, const c8** argv) { + sp_str_t workload_filter = argc > 1 ? 
sp_str_view(argv[1]) : sp_str_lit(""); + sp_str_t backend_filter = argc > 2 ? sp_str_view(argv[2]) : sp_str_lit(""); + + sp_mem_zero(state.ptrs, sizeof(state.ptrs)); + sp_mem_zero(state.sizes, sizeof(state.sizes)); + + sp_carr_for(workloads, w) { + const bench_workload_t* workload = &workloads[w]; + if (!sp_str_empty(workload_filter)) { + if (!sp_str_equal_cstr(workload_filter, workload->name)) { + continue; + } + } + + sp_log("> {.yellow}", sp_fmt_cstr(workload->name)); + sp_log( + "min={.cyan} max={.cyan} slots={.cyan} ops={.cyan}", + sp_fmt_uint(workload->lo), + sp_fmt_uint(sp_max(workload->lo, workload->hi)), + sp_fmt_uint(workload->slots), + sp_fmt_uint(workload->ops) + ); + + results = sp_zero_s(bench_results_t); + sp_carr_for(backends, b) { + const bench_backend_t* backend = &backends[b]; + if (!sp_str_empty(backend_filter)) { + if (!sp_str_equal(sp_str_view(backend->name), backend_filter)) { + continue; + } + } + bench_run(workload, backend); + } + bench_render_results(); + sp_log(""); + } + + return 0; +} diff --git a/test/bench/ubench.h b/test/bench/ubench.h new file mode 100644 index 0000000..956411b --- /dev/null +++ b/test/bench/ubench.h @@ -0,0 +1,1876 @@ + +#ifndef SP_BENCH_H +#define SP_BENCH_H + +#if defined(UBENCH_ENABLE_SQLITE) && defined(__linux__) && !defined(_GNU_SOURCE) +#define _GNU_SOURCE +#endif + +#ifndef SP_PRIVATE_HEADER +#define SP_PRIVATE_HEADER +#endif + +#include "sp.h" +SP_BEGIN_EXTERN_C() + +typedef u64 ubench_size_t; + +//////////// +// MACROS // +//////////// +#if defined(SP_CPP) + #define UBENCH_C_FUNC extern "C" +#else + #define UBENCH_C_FUNC +#endif + +#if defined(SP_MSVC) + #define UBENCH_UNUSED +#else + #define UBENCH_UNUSED SP_ATTRIBUTE(unused) +#endif + +#if defined(SP_CPP) + #define UBENCH_EXTERN extern "C" +#else + #define UBENCH_EXTERN extern +#endif + +#if defined(SP_MSVC) + #define UBENCH_DO_NOT_OPTIMIZE(x) \ + do { \ + _ReadWriteBarrier(); \ + ubench_do_nothing((void *)&(x)); \ + } while (0) + #define 
UBENCH_CLOBBER_MEMORY() _ReadWriteBarrier() + +#else + #define UBENCH_CLOBBER_MEMORY() __asm__ volatile("" : : : "memory") + + #if defined(SP_CLANG) + #define UBENCH_DO_NOT_OPTIMIZE(x) \ + __asm__ volatile("" : "+r,m"(x) : : "memory") + #else + #define UBENCH_DO_NOT_OPTIMIZE(x) \ + __asm__ volatile("" : "+m,r"(x) : : "memory") + #endif +#endif + + +typedef struct ubench_run_state_s { + s64 *ns; + s64 *pause_ns; + s64 size; + s64 sample; + s64 paused_ns; + s64 pause_start; + s64 bytes_processed; + s64 items_processed; + /* Auto-tuned per-sample batch size: each clock-bracketed sample executes + `batch` body invocations when the body uses UBENCH_LOOP. The runner + amortizes clock-call overhead so micro-bodies (sub-µs) become measurable. + batch_consumed is set non-zero by UBENCH_LOOP to signal that the body + opted into batching, so the runner knows whether to tune. */ + s64 batch; + s64 batch_consumed; +} ubench_run_state_t; + +struct ubench_benchmark_state_s; + +typedef void (*ubench_body_t)(void *fixture, struct ubench_run_state_s *ubs); +typedef void (*ubench_setup_t)(void *fixture); +typedef void (*ubench_teardown_t)(void *fixture); + +typedef struct ubench_fixture_ops_s { + ubench_setup_t setup; + ubench_teardown_t teardown; + size_t size; +} ubench_fixture_ops_t; + +typedef void (*ubench_dispatch_t)(struct ubench_benchmark_state_s *b, + struct ubench_run_state_s *ubs); + +typedef struct ubench_benchmark_state_s { + sp_str_t name; + ubench_body_t body; + const struct ubench_fixture_ops_s *ops; + ubench_dispatch_t dispatch; +} ubench_benchmark_state_t; + +typedef struct ubench_state_s { + ubench_benchmark_state_t* benchmarks; // @spader make this sp_da, get rid of len + ubench_size_t benchmarks_length; + f64 confidence; + sp_mem_t mem; +} ubench_state_t; + +typedef struct unbench_benchmark_config_s { + const c8* name; + ubench_body_t body; + const ubench_fixture_ops_t* ops; + ubench_dispatch_t dispatch; +} ubench_benchmark_config_t; + +SP_API struct 
ubench_state_s ubench_state; +SP_API void ubench_run_lifecycle(ubench_benchmark_state_t* b, ubench_run_state_t* run); +SP_API void ubench_invoke(ubench_benchmark_state_t* b, ubench_run_state_t* run); +SP_API s32 ubench_do_benchmark(ubench_run_state_t* const run); +SP_API void ubench_register_benchmark(sp_str_t name, ubench_body_t body, const ubench_fixture_ops_t* ops, ubench_dispatch_t dispatch); +SP_API void ubench_register_benchmark_s(ubench_benchmark_config_t config); +SP_API SP_INLINE void ubench_pause(ubench_run_state_t* const run); +SP_API SP_INLINE void ubench_resume(ubench_run_state_t* const run); +SP_API SP_INLINE void ubench_do_nothing(void* ptr); +SP_API sp_str_t sp_cpu_get_model_a(sp_mem_t mem); +SP_API u32 sp_cpu_get_thread_count(void); + +#define UBENCH_STATE() \ + struct ubench_state_s ubench_state = { \ + .benchmarks = SP_NULLPTR, \ + .benchmarks_length = 0, \ + .confidence = 2.5, \ + .mem = {sp_mem_os_on_alloc, SP_NULLPTR}} + +#define UBENCH_MAIN() \ + UBENCH_STATE(); \ + s32 main(s32 argc, const c8* const argv[]) { \ + return ubench_main(argc, argv); \ + } + + +typedef struct sqlite3 sqlite3; +typedef struct sqlite3_stmt sqlite3_stmt; +typedef long long int sqlite3_int64; +typedef void (*sqlite3_destructor_type)(void*); + +#define SQLITE_OK 0 +#define SQLITE_ROW 100 +#define SQLITE_DONE 101 +#define SQLITE_STATIC ((sqlite3_destructor_type)0) +#define SQLITE_TRANSIENT ((sqlite3_destructor_type)-1) + +int sqlite3_open(const char *filename, sqlite3 **ppDb); +int sqlite3_close(sqlite3*); +int sqlite3_exec(sqlite3*, const char *sql, int (*callback)(void*,int,char**,char**), void*, char **errmsg); +const char *sqlite3_errmsg(sqlite3*); +int sqlite3_prepare_v2(sqlite3 *db, const char *zSql, int nByte, sqlite3_stmt **ppStmt, const char **pzTail); +int sqlite3_step(sqlite3_stmt*); +int sqlite3_reset(sqlite3_stmt *pStmt); +int sqlite3_finalize(sqlite3_stmt *pStmt); +sqlite3_int64 sqlite3_last_insert_rowid(sqlite3*); +int sqlite3_bind_null(sqlite3_stmt*, 
int); +int sqlite3_bind_int(sqlite3_stmt*, int, int); +int sqlite3_bind_int64(sqlite3_stmt*, int, sqlite3_int64); +int sqlite3_bind_double(sqlite3_stmt*, int, double); +int sqlite3_bind_text(sqlite3_stmt*, int, const char*, int, void(*)(void*)); +sqlite3_int64 sqlite3_column_int64(sqlite3_stmt*, int iCol); + + +SP_IMP SP_INLINE s32 ubench_should_filter(const c8 *filter, const c8 *benchmark); +SP_IMP SP_INLINE s32 ubench_int64_cmp(const void* a, const void* b); +SP_IMP SP_INLINE f32 sp_sys_sqrtf(f32 x); +SP_IMP void ubench_fmt_tty_green(sp_io_writer_t *io, sp_fmt_arg_t *arg, sp_fmt_arg_t *params); +SP_IMP void ubench_fmt_tty_red(sp_io_writer_t *io, sp_fmt_arg_t *arg, sp_fmt_arg_t *params); +SP_IMP void ubench_fmt_tty_reset(sp_io_writer_t *io, sp_fmt_arg_t *arg, sp_fmt_arg_t *params); + +#define BENCH_STORE_SCHEMA \ + "CREATE TABLE IF NOT EXISTS machines (" \ + " id INTEGER PRIMARY KEY," \ + " fingerprint TEXT NOT NULL UNIQUE," \ + " hostname TEXT," \ + " os_name TEXT, os_version TEXT, arch TEXT," \ + " cpu_model TEXT, cpu_cores INTEGER, cpu_threads INTEGER," \ + " memory_bytes INTEGER);" \ + "CREATE TABLE IF NOT EXISTS runs (" \ + " id INTEGER PRIMARY KEY," \ + " machine_id INTEGER NOT NULL REFERENCES machines(id)," \ + " started_at TEXT NOT NULL," \ + " finished_at TEXT," \ + " executable_path TEXT," \ + " executable_size_bytes INTEGER," \ + " executable_mtime TEXT," \ + " confidence_threshold REAL," \ + " filter TEXT," \ + " has_perf_counters INTEGER," \ + " label TEXT," \ + " framework TEXT," \ + " metadata TEXT);" \ + "CREATE TABLE IF NOT EXISTS benchmarks (" \ + " id INTEGER PRIMARY KEY," \ + " name TEXT NOT NULL UNIQUE);" \ + "CREATE TABLE IF NOT EXISTS results (" \ + " id INTEGER PRIMARY KEY," \ + " run_id INTEGER NOT NULL REFERENCES runs(id)," \ + " benchmark_id INTEGER NOT NULL REFERENCES benchmarks(id)," \ + " iterations INTEGER NOT NULL," \ + " mean_ns REAL NOT NULL," \ + " median_ns REAL NOT NULL," \ + " min_ns REAL NOT NULL," \ + " max_ns REAL NOT 
NULL," \ + " stddev_ns REAL," \ + " stddev_pct REAL," \ + " ci_low_ns REAL," \ + " ci_high_ns REAL," \ + " ci_level_pct REAL," \ + " confidence_pct REAL," \ + " bytes_processed INTEGER," \ + " items_processed INTEGER," \ + " cycles_per_iter INTEGER," \ + " instructions_per_iter INTEGER," \ + " UNIQUE(run_id, benchmark_id));" \ + "CREATE INDEX IF NOT EXISTS idx_results_bench_run" \ + " ON results(benchmark_id, run_id);" + +#define BENCH_UNSET_I64 ((s64)-1) +#define BENCH_UNSET_F64 (-1.0) + +typedef struct bench_store bench_store; + +typedef struct { + c8 hostname[256]; + c8 os_name[64]; + c8 os_version[128]; + c8 arch[64]; + c8 cpu_model[256]; + s32 cpu_cores; + s32 cpu_threads; + s64 memory_bytes; +} bench_machine_info; + +typedef struct { + const c8 *executable_path; + s64 executable_size_bytes; + const c8 *executable_mtime; /* ISO 8601 UTC. */ + const c8 *filter; + const c8 *label; + const c8 *framework; + const c8 *metadata_json; + f64 confidence_threshold; /* < 0 => NULL. */ + s32 has_perf_counters; /* < 0 => NULL. 
*/ +} bench_run_info; + +typedef struct { + s64 iterations; + f64 mean_ns; + f64 median_ns; + f64 min_ns; + f64 max_ns; + f64 stddev_ns; + f64 stddev_pct; + f64 ci_low_ns; + f64 ci_high_ns; + f64 ci_level_pct; + f64 confidence_pct; + s64 bytes_processed; + s64 items_processed; + s64 cycles_per_iter; + s64 instructions_per_iter; +} bench_result; + +SP_API bench_store* bench_store_open(const c8 *path); +SP_API void bench_store_close(bench_store* s); +SP_API s32 bench_collect_machine_info(bench_machine_info* out); +SP_API s64 bench_store_begin_run(bench_store* s, const bench_machine_info* mi, const bench_run_info* ri); +SP_API s32 bench_store_record(bench_store* s, s64 run_id, const c8 *bench_name, const bench_result* r); +SP_API s32 bench_store_end_run(bench_store* s, s64 run_id); +SP_API s64 bench_simple_begin_run( + bench_store* s, + const c8* framework, const c8* label, + const c8* executable_path, s64 executable_size_bytes, const c8* executable_mtime, + f64 confidence_threshold, + s32 has_perf_counters +); +SP_API s32 bench_simple_record( + bench_store* s, s64 run_id, + const c8* name, + s64 iterations, + f64 mean_ns, f64 median_ns, + f64 min_ns, f64 max_ns, + f64 stddev_ns, f64 stddev_pct, + f64 ci_low_ns, f64 ci_high_ns, f64 ci_level_pct, + f64 confidence_pct, + s64 bytes_processed, s64 items_processed, + s64 cycles_per_iter, s64 instructions_per_iter +); + +SP_IMP s32 bench__get_or_insert_machine(sqlite3* db, const bench_machine_info* m, sqlite3_int64* out_id); +SP_IMP void bench__read_first_field(const c8* path, const c8* prefix, c8* dst, u32 dst_size); +SP_IMP void bench__make_fingerprint(const bench_machine_info* m, c8* dst, u32 dst_size); +SP_IMP s32 bench__get_or_insert_benchmark(sqlite3* db, const c8* name, sqlite3_int64* out_id); +SP_IMP void bench__bind_text_or_null(sqlite3_stmt *stmt, int idx, const c8 *s); +SP_IMP void bench__bind_i64_or_null(sqlite3_stmt *stmt, int idx, s64 v); +SP_IMP void bench__bind_f64_or_null(sqlite3_stmt *stmt, int idx, f64 
v); + +/////////////////////// +// UBENCH_DO_NOTHING // +/////////////////////// +#if defined(SP_MSVC) + UBENCH_C_FUNC void _ReadWriteBarrier(void); + + void ubench_do_nothing(void *ptr) { + (void)ptr; + _ReadWriteBarrier(); + } +#elif defined(SP_CLANG) + void ubench_do_nothing(void *ptr) { + _Pragma("clang diagnostic push") + _Pragma("clang diagnostic ignored \"-Wlanguage-extension-token\""); + __asm__ volatile("" : : "r"(ptr), "m"(ptr) : "memory"); + _Pragma("clang diagnostic pop"); + } +#else + void ubench_do_nothing(void *ptr) { + __asm__ volatile("" : : "r"(ptr), "m"(ptr) : "memory"); + } +#endif + + +//////////////////////// +// UBENCH_INITIALIZER // +//////////////////////// +#if defined(SP_CPP) + #if defined(SP_CLANG) + #define UBENCH_INITIALIZER_BEGIN_DISABLE_WARNINGS \ + _Pragma("clang diagnostic push") \ + _Pragma("clang diagnostic ignored \"-Wglobal-constructors\"") + + #define UBENCH_INITIALIZER_END_DISABLE_WARNINGS _Pragma("clang diagnostic pop") + #else + #define UBENCH_INITIALIZER_BEGIN_DISABLE_WARNINGS + #define UBENCH_INITIALIZER_END_DISABLE_WARNINGS + #endif + + #define UBENCH_INITIALIZER(f) \ + struct f##_cpp_struct { \ + f##_cpp_struct(); \ + }; \ + UBENCH_INITIALIZER_BEGIN_DISABLE_WARNINGS static f##_cpp_struct \ + f##_cpp_global UBENCH_INITIALIZER_END_DISABLE_WARNINGS; \ + f##_cpp_struct::f##_cpp_struct() + +#elif defined(SP_MSVC) + #define UBENCH_SYMBOL_PREFIX + + #if defined(SP_CLANG) + #define UBENCH_INITIALIZER_BEGIN_DISABLE_WARNINGS \ + _Pragma("clang diagnostic push") \ + _Pragma("clang diagnostic ignored \"-Wmissing-variable-declarations\"") + + #define UBENCH_INITIALIZER_END_DISABLE_WARNINGS _Pragma("clang diagnostic pop") + #else + #define UBENCH_INITIALIZER_BEGIN_DISABLE_WARNINGS + #define UBENCH_INITIALIZER_END_DISABLE_WARNINGS + #endif + + #pragma section(".CRT$XCU", read) + #define UBENCH_INITIALIZER(f) \ + static void __cdecl f(void); \ + UBENCH_INITIALIZER_BEGIN_DISABLE_WARNINGS __pragma( \ + comment(linker, "/include:" 
UBENCH_SYMBOL_PREFIX #f "_")) UBENCH_C_FUNC \ + __declspec(allocate(".CRT$XCU")) void(__cdecl * f##_)(void) = f; \ + UBENCH_INITIALIZER_END_DISABLE_WARNINGS static void __cdecl f(void) +#else + #define UBENCH_INITIALIZER(f) \ + static void f(void) SP_ATTRIBUTE(constructor); \ + static void f(void) +#endif + +////////////////////////////// +// UBENCH_SURPRESS_WARNINGS // +////////////////////////////// +#if defined(SP_CLANG) +#if __has_warning("-Wunsafe-buffer-usage") +#define UBENCH_SURPRESS_WARNINGS_BEGIN \ + _Pragma("clang diagnostic push") \ + _Pragma("clang diagnostic ignored \"-Wunsafe-buffer-usage\"") +#define UBENCH_SURPRESS_WARNINGS_END _Pragma("clang diagnostic pop") +#else +#define UBENCH_SURPRESS_WARNINGS_BEGIN +#define UBENCH_SURPRESS_WARNINGS_END +#endif +#elif defined(SP_GNUC) && __GNUC__ >= 8 && defined(SP_CPP) +#define UBENCH_SURPRESS_WARNINGS_BEGIN \ + _Pragma("GCC diagnostic push") \ + _Pragma("GCC diagnostic ignored \"-Wclass-memaccess\"") +#define UBENCH_SURPRESS_WARNINGS_END _Pragma("GCC diagnostic pop") +#else +#define UBENCH_SURPRESS_WARNINGS_BEGIN +#define UBENCH_SURPRESS_WARNINGS_END +#endif + +#define UBENCH_DO_BENCHMARK() \ + while (ubench_do_benchmark(ubench_run_state) > 0) + +#define UBENCH_DO_NOTHING(x) \ + ubench_do_nothing(x) + +#define UBENCH_EX(SET, NAME) \ + UBENCH_SURPRESS_WARNINGS_BEGIN \ + static void ubench_##SET##_##NAME(void *, \ + struct ubench_run_state_s *); \ + UBENCH_INITIALIZER(ubench_register_##SET##_##NAME) { \ + ubench_register_benchmark( \ + sp_str_lit(#SET "." #NAME), \ + &ubench_##SET##_##NAME, \ + SP_NULLPTR, SP_NULLPTR \ + ); \ + } \ + UBENCH_SURPRESS_WARNINGS_END \ + void ubench_##SET##_##NAME(void *ubench_fixture_unused UBENCH_UNUSED, \ + struct ubench_run_state_s *ubench_run_state) + +/* The user body receives `ubench_run_state` as a parameter so that + UBENCH_LOOP, UBENCH_PAUSE, UBENCH_RESUME, UBENCH_SET_BYTES_PROCESSED, etc. + can resolve the symbol from inside a UBENCH(...) body. 
The parameter has + a fixed name and is unused by callers that don't need it, so this is a + silent extension to the macro contract. */ +#define UBENCH(SET, NAME) \ + static void ubench_run_##SET##_##NAME(struct ubench_run_state_s *); \ + UBENCH_EX(SET, NAME) { \ + UBENCH_DO_BENCHMARK() { ubench_run_##SET##_##NAME(ubench_run_state); } \ + } \ + void ubench_run_##SET##_##NAME( \ + struct ubench_run_state_s *ubench_run_state UBENCH_UNUSED) + +#define UBENCH_F_SETUP(FIXTURE) \ + static void ubench_f_setup_impl_##FIXTURE(struct FIXTURE *ubench_fixture); \ + static void ubench_f_setup_##FIXTURE(void *ubench_fixture_void) { \ + ubench_f_setup_impl_##FIXTURE((struct FIXTURE *)ubench_fixture_void); \ + } \ + static void ubench_f_setup_impl_##FIXTURE(struct FIXTURE *ubench_fixture) + +#define UBENCH_F_TEARDOWN(FIXTURE) \ + static void ubench_f_teardown_impl_##FIXTURE(struct FIXTURE *ubench_fixture);\ + static void ubench_f_teardown_##FIXTURE(void *ubench_fixture_void) { \ + ubench_f_teardown_impl_##FIXTURE((struct FIXTURE *)ubench_fixture_void); \ + } \ + static void ubench_f_teardown_impl_##FIXTURE(struct FIXTURE *ubench_fixture) + +#define UBENCH_EX_F(FIXTURE, NAME) \ + UBENCH_SURPRESS_WARNINGS_BEGIN \ + static void ubench_f_setup_##FIXTURE(void *); \ + static void ubench_f_teardown_##FIXTURE(void *); \ + static void ubench_run_ex_##FIXTURE##_##NAME(struct FIXTURE *, \ + struct ubench_run_state_s *); \ + static void ubench_f_##FIXTURE##_##NAME( \ + void *ubench_fixture_void, \ + struct ubench_run_state_s *ubench_run_state) { \ + ubench_run_ex_##FIXTURE##_##NAME((struct FIXTURE *)ubench_fixture_void, \ + ubench_run_state); \ + } \ + UBENCH_INITIALIZER(ubench_register_##FIXTURE##_##NAME) { \ + static const struct ubench_fixture_ops_s ubench_ops_##FIXTURE##_##NAME = { \ + .setup = &ubench_f_setup_##FIXTURE, \ + .teardown = &ubench_f_teardown_##FIXTURE, \ + .size = sizeof(struct FIXTURE)}; \ + ubench_register_benchmark(sp_str_lit(#FIXTURE "." 
#NAME), \
+                              &ubench_f_##FIXTURE##_##NAME, \
+                              &ubench_ops_##FIXTURE##_##NAME, \
+                              SP_NULLPTR); \
+  } \
+  UBENCH_SURPRESS_WARNINGS_END \
+  void ubench_run_ex_##FIXTURE##_##NAME( \
+      struct FIXTURE *ubench_fixture, \
+      struct ubench_run_state_s *ubench_run_state)
+/*
+ * UBENCH_F(FIXTURE, NAME) { ... } is the fixture benchmark most users want:
+ * it wraps the user body in UBENCH_DO_BENCHMARK(), so the body is called
+ * once per sample with the fixture and the run state.
+ */
+#define UBENCH_F(FIXTURE, NAME) \
+  static void ubench_run_##FIXTURE##_##NAME(struct FIXTURE *, \
+                                            struct ubench_run_state_s *); \
+  UBENCH_EX_F(FIXTURE, NAME) { \
+    UBENCH_DO_BENCHMARK() { \
+      ubench_run_##FIXTURE##_##NAME(ubench_fixture, ubench_run_state); \
+    } \
+  } \
+  void ubench_run_##FIXTURE##_##NAME( \
+      struct FIXTURE *ubench_fixture, \
+      struct ubench_run_state_s *ubench_run_state UBENCH_UNUSED)
+
+// Prevent 64-bit integer overflow when computing a timestamp by using a trick
+// from Sokol:
+// https://github.com/floooh/sokol/blob/189843bf4f86969ca4cc4b6d94e793a37c5128a7/sokol_time.h#L204
+SP_IMP SP_INLINE s64 ubench_mul_div(const s64 value, const s64 numer, const s64 denom) {
+  const s64 q = value / denom; /* whole multiples of denom in value */
+  const s64 r = value % denom; /* remainder, always smaller than denom */
+  return q * numer + r * numer / denom; /* value * numer / denom, split to avoid the big product */
+}
+/*
+ * Current timestamp in nanoseconds, taken from a per-platform clock. Only
+ * differences between two calls are used by this file.
+ */
+static SP_INLINE s64 ubench_ns(void) {
+#if defined(SP_WIN32)
+  /* QPC frequency is constant for the lifetime of the process; query once. */
+  static s64 qpc_freq = 0;
+  LARGE_INTEGER counter;
+  if (qpc_freq == 0) {
+    LARGE_INTEGER f;
+    QueryPerformanceFrequency(&f);
+    qpc_freq = f.QuadPart;
+  }
+  QueryPerformanceCounter(&counter);
+  return ubench_mul_div(counter.QuadPart, 1000000000, qpc_freq);
+#elif defined(SP_LINUX)
+  /* Use a monotonic clock so NTP slew/step cannot corrupt deltas. Prefer
+     CLOCK_MONOTONIC_RAW where available (Linux >= 2.6.28) since it is also
+     immune to adjtimex frequency steering.
*/
+  struct timespec ts;
+#if defined(CLOCK_MONOTONIC_RAW)
+  const clockid_t cid = CLOCK_MONOTONIC_RAW;
+#else
+  const clockid_t cid = CLOCK_MONOTONIC;
+#endif
+  clock_gettime(cid, &ts);
+  return sp_cast(s64, ts.tv_sec) * 1000 * 1000 * 1000 +
+         ts.tv_nsec;
+#elif defined(SP_MACOS)
+  return sp_cast(s64, clock_gettime_nsec_np(CLOCK_UPTIME_RAW));
+#else
+#error Unsupported platform!
+#endif
+}
+/*
+ * Default way to run one benchmark body. With fixture ops, a fixture of
+ * ops->size bytes is taken from a scratch region, then setup -> body ->
+ * teardown run in that order and the scratch region is ended. Without ops
+ * the body is called with a null fixture.
+ */
+void ubench_run_lifecycle(ubench_benchmark_state_t* b, ubench_run_state_t* run) {
+  if (b->ops != SP_NULLPTR) {
+    sp_mem_arena_marker_t scratch = sp_mem_begin_scratch_for(ubench_state.mem);
+    void *fixture = sp_alloc(scratch.mem, b->ops->size);
+    b->ops->setup(fixture);
+    b->body(fixture, run);
+    b->ops->teardown(fixture);
+    sp_mem_end_scratch(scratch);
+  } else {
+    b->body(SP_NULLPTR, run);
+  }
+}
+/*
+ * Runs a benchmark through its custom dispatch hook when one was registered,
+ * otherwise through ubench_run_lifecycle().
+ */
+void ubench_invoke(ubench_benchmark_state_t* b, ubench_run_state_t* run) {
+  if (b->dispatch != SP_NULLPTR) {
+    b->dispatch(b, run);
+  } else {
+    ubench_run_lifecycle(b, run);
+  }
+}
+
+// @spader C string in the public API
+void ubench_register_benchmark(sp_str_t name, ubench_body_t body, const ubench_fixture_ops_t* ops, ubench_dispatch_t dispatch) {
+  const ubench_size_t i = ubench_state.benchmarks_length++; /* slot for the new entry */
+  /* Grows the table by exactly one element per registration.
+     NOTE(review): the sp_realloc result is used unchecked -- confirm it
+     cannot return null. */
+  ubench_state.benchmarks = sp_ptr_cast(
+    ubench_benchmark_state_t*,
+    sp_realloc(ubench_state.mem, ubench_state.benchmarks, sizeof(ubench_benchmark_state_t) * ubench_state.benchmarks_length)
+  );
+
+  ubench_state.benchmarks[i].name = name;
+  ubench_state.benchmarks[i].body = body;
+  ubench_state.benchmarks[i].ops = ops;
+  ubench_state.benchmarks[i].dispatch = dispatch;
+}
+/* Struct-argument form: converts config.name from a C string and registers. */
+void ubench_register_benchmark_s(ubench_benchmark_config_t config) {
+  ubench_register_benchmark(sp_cstr_as_str(config.name), config.body, config.ops, config.dispatch);
+}
+
+/*
+ * Loop condition behind UBENCH_DO_BENCHMARK(). Every call stamps
+ * run->ns[sample] with the current time; from the second call on it first
+ * stores the pause time accumulated during the previous sample in
+ * run->pause_ns[sample - 1]. It returns 1 while the sample index is below
+ * run->size and 0 on the call that records the final timestamp, so the ns
+ * array needs run->size + 1 slots.
+ */
+s32 ubench_do_benchmark(ubench_run_state_t* const run) {
+  const s64 curr_sample = run->sample++;
+  if (curr_sample > 0) {
+    run->pause_ns[curr_sample - 1] = run->paused_ns;
+  }
+  run->paused_ns = 0;
+  run->ns[curr_sample] =
ubench_ns();
+  return curr_sample < run->size ? 1 : 0;
+}
+/* Marks the start of a span that must not count towards the current sample. */
+void ubench_pause(ubench_run_state_t* const run) {
+  run->pause_start = ubench_ns();
+}
+/* Ends a paused span and adds its length to the sample's paused total. */
+void ubench_resume(ubench_run_state_t* const run) {
+  run->paused_ns += ubench_ns() - run->pause_start;
+}
+/*
+ * Convenience wrappers for use inside a benchmark body; all of them rely on
+ * the body's `ubench_run_state` parameter being in scope.
+ */
+#define UBENCH_PAUSE() \
+  ubench_pause(ubench_run_state)
+
+#define UBENCH_RESUME() \
+  ubench_resume(ubench_run_state)
+
+#define UBENCH_SET_BYTES_PROCESSED(N) \
+  (ubench_run_state->bytes_processed = (s64)(N))
+
+#define UBENCH_SET_ITEMS_PROCESSED(N) \
+  (ubench_run_state->items_processed = (s64)(N))
+/*
+ * UBENCH_LOOP { ... } repeats the following statement run_state->batch times
+ * inside one timed sample. Entering the loop sets batch_consumed, which is
+ * how the harness learns that the body opted in to batching.
+ */
+#define UBENCH_LOOP \
+  for ( \
+    s64 ubench_loop_i_ = ((ubench_run_state->batch_consumed = 1), \
+                          ubench_run_state->batch); \
+    ubench_loop_i_ > 0; ubench_loop_i_-- \
+  )
+/*
+ * Decides whether `benchmark` is excluded by `filter`.
+ * Returns 0 when the benchmark should run (filter is null or matches) and 1
+ * when it should be skipped. '*' in the filter is a wildcard; both arguments
+ * are read as NUL-terminated strings.
+ */
+s32 ubench_should_filter(const c8 *filter, const c8 *benchmark) {
+  if (filter) {
+    const c8 *filter_cur = filter;
+    const c8 *benchmark_cur = benchmark;
+    const c8 *filter_wildcard = SP_NULLPTR;
+
+    while (('\0' != *filter_cur) && ('\0' != *benchmark_cur)) {
+      if ('*' == *filter_cur) {
+        /* store the position of the wildcard */
+        filter_wildcard = filter_cur;
+
+        /* skip the wildcard character */
+        filter_cur++;
+
+        while (('\0' != *filter_cur) && ('\0' != *benchmark_cur)) {
+          if ('*' == *filter_cur) {
+            /*
+               we found another wildcard (filter is something like *foo*) so we
+               exit the current loop, and return to the parent loop to handle
+               the wildcard case
+            */
+            break;
+          } else if (*filter_cur != *benchmark_cur) {
+            /* otherwise our filter didn't match, so reset it */
+            filter_cur = filter_wildcard;
+          }
+
+          /* move benchmark along */
+          benchmark_cur++;
+
+          /* move filter along */
+          filter_cur++;
+        }
+
+        if (('\0' == *filter_cur) && ('\0' == *benchmark_cur)) {
+          return 0;
+        }
+
+        /* if the benchmarks have been exhausted, we don't have a match!
*/ + if ('\0' == *benchmark_cur) { + return 1; + } + } else { + if (*benchmark_cur != *filter_cur) { + /* benchmark doesn't match filter */ + return 1; + } else { + /* move our filter and benchmark forward */ + benchmark_cur++; + filter_cur++; + } + } + } + + if (('\0' != *filter_cur) || + (('\0' != *benchmark_cur) && + ((filter == filter_cur) || ('*' != filter_cur[-1])))) { + /* we have a mismatch! */ + return 1; + } + } + + return 0; +} + +s32 ubench_int64_cmp(const void *a, const void *b) { + const s64 aa = *sp_ptr_cast(const s64 *, a); + const s64 bb = *sp_ptr_cast(const s64 *, b); + return aa < bb ? -1 : (aa > bb ? 1 : 0); +} + +f32 sp_sys_sqrtf(f32 x) { + if (x < 0) return 0; + if (x == 0) return 0; + f32 guess = x / 2.0f; + for (s32 i = 0; i < 10; i++) { + guess = (guess + x / guess) / 2.0f; + } + return guess; +} + +void ubench_fmt_tty_green(sp_io_writer_t *io, sp_fmt_arg_t *arg, sp_fmt_arg_t *params) { + sp_unused(arg); sp_unused(params); + if (sp_os_is_tty(sp_sys_stdout)) { + sp_io_write_cstr(io, SP_ANSI_FG_GREEN, SP_NULLPTR); + } +} + +void ubench_fmt_tty_red(sp_io_writer_t *io, sp_fmt_arg_t *arg, sp_fmt_arg_t *params) { + sp_unused(arg); sp_unused(params); + if (sp_os_is_tty(sp_sys_stdout)) { + sp_io_write_cstr(io, SP_ANSI_FG_RED, SP_NULLPTR); + } +} + +void ubench_fmt_tty_reset(sp_io_writer_t *io, sp_fmt_arg_t *arg, sp_fmt_arg_t *params) { + sp_unused(arg); sp_unused(params); + if (sp_os_is_tty(sp_sys_stdout)) { + sp_io_write_cstr(io, SP_ANSI_RESET, SP_NULLPTR); + } +} + +#if defined(UBENCH_ENABLE_PERF_COUNTERS) && defined(SP_LINUX) +#include + +struct ubench_perf_s { + s32 group_fd; + s32 instr_fd; + /* Per-pair overhead introduced by the ioctl(RESET)+ioctl(ENABLE) ... + ioctl(DISABLE)+read() sequence itself, measured at startup with an empty + body. Subtracted from every measurement so reported counts approximate + just the user code. 
*/
+  u64 overhead_cycles;
+  u64 overhead_instructions;
+};
+/*
+ * Opens one hardware counter for `config` through the raw perf_event_open
+ * syscall and returns its fd (negative on failure). Pass leader == -1 to
+ * create the group leader, which starts disabled; pass the leader's fd to
+ * add a member to that group. Kernel and hypervisor activity is excluded.
+ */
+static s32 ubench_perf_open_event(s32 leader, u32 config) {
+  struct perf_event_attr pea;
+  sp_mem_zero(&pea, sizeof(pea));
+  pea.type = PERF_TYPE_HARDWARE;
+  pea.size = sizeof(pea);
+  pea.config = config;
+  pea.disabled = (leader == -1) ? 1 : 0;
+  pea.exclude_kernel = 1;
+  pea.exclude_hv = 1;
+  pea.read_format = PERF_FORMAT_GROUP; /* one read returns every counter in the group */
+  return sp_cast(s32,
+                 sp_syscall(SP_SYSCALL_NUM_PERF_EVENT_OPEN, &pea, 0, -1, leader, 0));
+}
+/* Resets and enables the whole counter group; does nothing when perf is unavailable. */
+static void ubench_perf_start(struct ubench_perf_s *p) {
+  if (p->group_fd < 0) {
+    return;
+  }
+  sp_syscall(SP_SYSCALL_NUM_IOCTL, p->group_fd, PERF_EVENT_IOC_RESET, PERF_IOC_FLAG_GROUP);
+  sp_syscall(SP_SYSCALL_NUM_IOCTL, p->group_fd, PERF_EVENT_IOC_ENABLE, PERF_IOC_FLAG_GROUP);
+}
+/*
+ * Disables the group and reads both counters without overhead correction.
+ * The outputs are 0 when perf is unavailable, when the read is short, or
+ * when the kernel reports a counter count other than 2.
+ */
+static void ubench_perf_stop_raw(struct ubench_perf_s *p,
+                                 u64 *cycles,
+                                 u64 *instructions) {
+  struct {
+    u64 nr;
+    u64 values[2];
+  } buf; /* layout matches the two-counter group opened in ubench_perf_init */
+  *cycles = 0;
+  *instructions = 0;
+  if (p->group_fd < 0) {
+    return;
+  }
+  sp_syscall(SP_SYSCALL_NUM_IOCTL, p->group_fd, PERF_EVENT_IOC_DISABLE, PERF_IOC_FLAG_GROUP);
+  if (sp_syscall(SP_SYSCALL_NUM_READ, p->group_fd, &buf, sizeof(buf)) ==
+          sp_cast(s64, sizeof(buf)) &&
+      buf.nr == 2) {
+    *cycles = buf.values[0];       /* leader: CPU cycles */
+    *instructions = buf.values[1]; /* member: retired instructions */
+  }
+}
+/* Same as ubench_perf_stop_raw(), minus the calibrated start/stop overhead. */
+static void ubench_perf_stop(struct ubench_perf_s *p, u64 *cycles,
+                             u64 *instructions) {
+  ubench_perf_stop_raw(p, cycles, instructions);
+  /* Subtract per-pair overhead measured at init. Saturate to zero rather than
+     wrap around if a single ultra-cheap measurement happens to undercount. */
+  *cycles = (*cycles > p->overhead_cycles) ? *cycles - p->overhead_cycles : 0;
+  *instructions = (*instructions > p->overhead_instructions)
+                      ?
*instructions - p->overhead_instructions
+                      : 0;
+}
+/*
+ * Opens the cycles + instructions counter group and calibrates the cost of
+ * an empty start/stop pair. On any failure group_fd is left at a negative
+ * value, which turns the other ubench_perf_* calls into no-ops.
+ */
+static void ubench_perf_init(struct ubench_perf_s *p) {
+  p->instr_fd = -1;
+  p->overhead_cycles = 0;
+  p->overhead_instructions = 0;
+  p->group_fd = ubench_perf_open_event(-1, PERF_COUNT_HW_CPU_CYCLES);
+  if (p->group_fd < 0) {
+    return;
+  }
+  p->instr_fd =
+      ubench_perf_open_event(p->group_fd, PERF_COUNT_HW_INSTRUCTIONS);
+  if (p->instr_fd < 0) {
+    sp_syscall(SP_SYSCALL_NUM_CLOSE, p->group_fd);
+    p->group_fd = -1;
+    return;
+  }
+
+  /* Calibrate per-pair start/stop overhead: take the minimum of N empty
+     start/stop pairs, mirroring nanobench's mCalibratedOverhead. */
+  {
+    u64 best_cycles = (u64)-1;
+    u64 best_instructions = (u64)-1;
+    s32 trial;
+    for (trial = 0; trial < 32; trial++) {
+      u64 c = 0, i = 0;
+      ubench_perf_start(p);
+      ubench_perf_stop_raw(p, &c, &i);
+      if (c < best_cycles) {
+        best_cycles = c;
+      }
+      if (i < best_instructions) {
+        best_instructions = i;
+      }
+    }
+    if (best_cycles == (u64)-1) {
+      best_cycles = 0;
+    }
+    if (best_instructions == (u64)-1) {
+      best_instructions = 0;
+    }
+    p->overhead_cycles = best_cycles;
+    p->overhead_instructions = best_instructions;
+  }
+}
+/* Closes both counter fds if the group was opened and marks them closed. */
+static void ubench_perf_close(struct ubench_perf_s *p) {
+  if (p->group_fd >= 0) {
+    sp_syscall(SP_SYSCALL_NUM_CLOSE, p->instr_fd);
+    sp_syscall(SP_SYSCALL_NUM_CLOSE, p->group_fd);
+    p->group_fd = -1;
+    p->instr_fd = -1;
+  }
+}
+#endif
+
+SP_END_EXTERN_C()
+
+#endif // SP_BENCH_H
+
+
+
+#if defined SP_IMPLEMENTATION && !defined(SP_BENCH_IMPLEMENTATION)
+  #define SP_BENCH_IMPLEMENTATION
+#endif
+
+#if defined(SP_BENCH_IMPLEMENTATION)
+/*
+ * NOTE(review): every #include below has lost its header name (the text in
+ * angle brackets is missing). The code that follows uses stdio, string and
+ * stdlib functions, the Win32 API, uname/gethostname and sysctlbyname, so
+ * the matching headers need to be restored here -- confirm against the
+ * original file.
+ */
+#include
+#include
+
+#if defined(SP_WIN32)
+#if !defined(WIN32_LEAN_AND_MEAN)
+#define WIN32_LEAN_AND_MEAN
+#endif
+#include
+#include
+#else
+#include
+#include
+#endif
+
+#if defined(__APPLE__)
+#include
+#include
+#endif
+
+SP_BEGIN_EXTERN_C()
+
+#if defined(UBENCH_ENABLE_SQLITE)
+struct bench_store {
+  sqlite3 * db;              /* open database handle */
+  sqlite3_stmt* result_stmt; /* cached INSERT into results; set by bench_store_begin_run */
+  sp_mem_arena_t* arena;     /* arena created in bench_store_open */
+  sp_mem_t mem;              /* allocator view of that arena */
+};
+
+/*
+ * Copies the value of the first line in `path` that starts with `prefix`
+ * into `dst`: leading spaces, tabs and colons after the prefix are skipped
+ * and the copy stops at end of line or when `dst` is full. `dst` is the
+ * empty string when the file cannot be opened or no line matches.
+ * NOTE(review): dst[0] is written unconditionally, so dst_size must be at
+ * least 1.
+ */
+void bench__read_first_field(const c8* path, const c8* prefix, c8* dst, u32 dst_size) {
+  FILE *f = fopen(path, "r");
+  c8 line[1024];
+  u32 plen = (u32)strlen(prefix);
+  dst[0] = '\0';
+  if (!f) return;
+  while (fgets(line, sizeof(line), f)) {
+    if (strncmp(line, prefix, plen) == 0) {
+      const c8 *p = line + plen;
+      u32 i = 0;
+      while (*p == ' ' || *p == '\t' || *p == ':') p++;
+      while (*p && *p != '\n' && i + 1 < dst_size) dst[i++] = *p++;
+      dst[i] = '\0';
+      break;
+    }
+  }
+  fclose(f);
+}
+/*
+ * Builds the machine identity string "os|arch|cpu model|threads|memory GiB"
+ * used to look machines up in the database. Memory is rounded up to whole
+ * GiB; empty text fields are replaced by "Unknown" / "unknown".
+ */
+void bench__make_fingerprint(const bench_machine_info* m, c8* dst, u32 dst_size) {
+  s64 mem_gb = (m->memory_bytes + (1LL << 30) - 1) / (1LL << 30);
+  snprintf(dst, dst_size, "%s|%s|%s|%d|%lld",
+           m->os_name[0] ? m->os_name : "Unknown",
+           m->arch[0] ? m->arch : "unknown",
+           m->cpu_model[0] ? m->cpu_model : "unknown",
+           (int)m->cpu_threads,
+           (long long)mem_gb);
+}
+
+
+#if defined(SP_LINUX)
+/*
+ * Reads a whole file into memory taken from `mem` using raw syscalls and
+ * returns it as a string (not NUL-terminated). Returns a zeroed string when
+ * the file cannot be opened; a read error simply ends the read early. The
+ * buffer doubles whenever it fills up, and outgrown buffers are not freed.
+ */
+static sp_str_t ubench_cpu_read_file_a(sp_mem_t mem, const c8 *path) {
+  sp_sys_fd_t fd = (sp_sys_fd_t)sp_syscall(
+      SP_SYSCALL_NUM_OPENAT, SP_AT_FDCWD, path, SP_O_RDONLY, 0);
+  if (fd < 0) return sp_zero_s(sp_str_t);
+
+  u64 cap = 4096;
+  c8 *buf = sp_alloc_n(mem, c8, cap);
+  u64 len = 0;
+  for (;;) {
+    if (len == cap) {
+      u64 new_cap = cap * 2;
+      c8 *grown = sp_alloc_n(mem, c8, new_cap);
+      /* NOTE(review): written as (destination, source, size). Confirm that
+         this is sp_mem_copy's argument order; sp_cstr_copy_to_n elsewhere in
+         this file takes the source first. */
+      sp_mem_copy(grown, buf, len);
+      buf = grown;
+      cap = new_cap;
+    }
+    s64 n = sp_syscall(SP_SYSCALL_NUM_READ, fd, buf + len, cap - len);
+    if (n <= 0) break;
+    len += (u64)n;
+  }
+  sp_syscall(SP_SYSCALL_NUM_CLOSE, fd);
+  return (sp_str_t) { .data = buf, .len = (u32)len };
+}
+/*
+ * Linux: returns the CPU model taken from the first "model name" or
+ * "Hardware" line of /proc/cpuinfo, copied into `mem`. The result is a
+ * zeroed string when no such line exists.
+ */
+sp_str_t sp_cpu_get_model_a(sp_mem_t mem) {
+  sp_mem_arena_marker_t s = sp_mem_begin_scratch_for(mem);
+  sp_str_t cpuinfo = ubench_cpu_read_file_a(s.mem, "/proc/cpuinfo");
+  sp_str_t result = sp_zero_s(sp_str_t);
+
+  sp_da(sp_str_t) lines = sp_str_split_c8(s.mem, cpuinfo, '\n');
+  sp_da_for(lines, i) {
+    sp_str_t line = lines[i];
+    if (!sp_str_starts_with(line, sp_str_lit("model name")) &&
+        !sp_str_starts_with(line,
sp_str_lit("Hardware"))) {
+      continue;
+    }
+    s32 colon = sp_str_find_c8(line, ':');
+    if (colon < 0) continue;
+    sp_str_t value = sp_str_sub(line, colon + 1, line.len - colon - 1); /* everything after the colon */
+    result = sp_str_copy(mem, sp_str_trim(value)); /* copied into the caller's allocator, not the scratch one */
+    break;
+  }
+
+  sp_mem_end_scratch(s);
+  return result;
+}
+/*
+ * Linux: number of CPUs in the calling thread's affinity mask, obtained by
+ * counting the set bits that the raw sched_getaffinity syscall returns.
+ * Falls back to 1 when the syscall fails or reports no CPU.
+ */
+u32 sp_cpu_get_thread_count(void) {
+  /* glibc cpu_set_t is 1024 bits; the kernel pads to 8-byte multiples. */
+  u8 mask[128] = sp_zero;
+  s64 rc = sp_syscall(SP_SYSCALL_NUM_SCHED_GETAFFINITY, 0, sizeof(mask), mask);
+  if (rc <= 0) return 1;
+  u32 count = 0;
+  for (u64 i = 0; i < (u64)rc; i++) { /* rc is used as the number of mask bytes filled in */
+    u8 b = mask[i];
+    while (b) { count += b & 1; b >>= 1; }
+  }
+  return count ? count : 1;
+}
+
+#elif defined(SP_MACOS)
+/* macOS: CPU model from the machdep.cpu.brand_string sysctl, copied into `mem`. */
+sp_str_t sp_cpu_get_model_a(sp_mem_t mem) {
+  c8 buf[256] = sp_zero;
+  size_t len = sizeof(buf);
+  if (sysctlbyname("machdep.cpu.brand_string", buf, &len, NULL, 0) != 0) {
+    return sp_zero_s(sp_str_t);
+  }
+  /* sysctlbyname returns len including the trailing NUL on success. */
+  if (len > 0 && buf[len - 1] == '\0') len--;
+  return sp_str_copy(mem, (sp_str_t){ .data = buf, .len = (u32)len });
+}
+/* macOS: logical CPU count from the hw.logicalcpu sysctl; 1 on failure. */
+u32 sp_cpu_get_thread_count(void) {
+  int v = 0;
+  size_t len = sizeof(v);
+  if (sysctlbyname("hw.logicalcpu", &v, &len, NULL, 0) == 0 && v > 0) {
+    return (u32)v;
+  }
+  return 1;
+}
+
+#elif defined(SP_WIN32)
+/*
+ * Windows: CPU model read from the ProcessorNameString value of the first
+ * CentralProcessor registry key, copied into `mem`. Returns a zeroed string
+ * when the key or the value cannot be read.
+ */
+sp_str_t sp_cpu_get_model_a(sp_mem_t mem) {
+  HKEY key;
+  c8 buf[256] = sp_zero;
+  DWORD len = (DWORD)sizeof(buf);
+  LONG rc;
+  if (RegOpenKeyExA(HKEY_LOCAL_MACHINE,
+                    "HARDWARE\\DESCRIPTION\\System\\CentralProcessor\\0",
+                    0, KEY_READ, &key) != ERROR_SUCCESS) {
+    return sp_zero_s(sp_str_t);
+  }
+  rc = RegQueryValueExA(key, "ProcessorNameString", NULL, NULL,
+                        (LPBYTE)buf, &len);
+  RegCloseKey(key);
+  if (rc != ERROR_SUCCESS) return sp_zero_s(sp_str_t);
+  /* RegQueryValueExA returns len including the trailing NUL for REG_SZ.
*/
+  if (len > 0 && buf[len - 1] == '\0') len--;
+  return sp_str_copy(mem, (sp_str_t){ .data = buf, .len = (u32)len });
+}
+/* Windows: logical processor count from GetNativeSystemInfo; at least 1. */
+u32 sp_cpu_get_thread_count(void) {
+  SYSTEM_INFO si;
+  GetNativeSystemInfo(&si);
+  return si.dwNumberOfProcessors > 0 ? (u32)si.dwNumberOfProcessors : 1;
+}
+
+#else
+  #error "ubench: sp_cpu_* impl missing for this platform"
+#endif
+/*
+ * Fills *m with a description of the current machine: hostname, OS name and
+ * version, architecture, physical memory, CPU model and thread count. The
+ * struct is zeroed first, so fields that cannot be determined stay empty or
+ * 0. Always returns 0.
+ */
+SP_API s32 bench_collect_machine_info(bench_machine_info *m) {
+  memset(m, 0, sizeof(*m));
+#if defined(SP_WIN32)
+  {
+    DWORD n = (DWORD)sizeof(m->hostname);
+    SYSTEM_INFO si;
+    MEMORYSTATUSEX mem;
+    if (!GetComputerNameA(m->hostname, &n)) m->hostname[0] = '\0';
+    snprintf(m->os_name, sizeof(m->os_name), "%s", "Windows");
+    /* os_version is intentionally left blank: GetVersionExA is deprecated and
+       lies about the running OS, and RtlGetVersion needs a runtime ntdll
+       resolve. The fingerprint is fine without it. */
+    GetNativeSystemInfo(&si);
+    switch (si.wProcessorArchitecture) {
+      case PROCESSOR_ARCHITECTURE_AMD64: snprintf(m->arch, sizeof(m->arch), "x86_64"); break;
+      case PROCESSOR_ARCHITECTURE_ARM64: snprintf(m->arch, sizeof(m->arch), "aarch64"); break;
+      case PROCESSOR_ARCHITECTURE_INTEL: snprintf(m->arch, sizeof(m->arch), "x86"); break;
+      default: snprintf(m->arch, sizeof(m->arch), "unknown");
+    }
+    mem.dwLength = sizeof(mem);
+    if (GlobalMemoryStatusEx(&mem)) m->memory_bytes = (s64)mem.ullTotalPhys;
+  }
+#else
+  {
+    struct utsname u;
+    if (uname(&u) == 0) {
+      snprintf(m->os_name, sizeof(m->os_name), "%s", u.sysname);
+      snprintf(m->os_version, sizeof(m->os_version), "%s", u.release);
+      snprintf(m->arch, sizeof(m->arch), "%s", u.machine);
+    }
+    if (gethostname(m->hostname, sizeof(m->hostname)) != 0) m->hostname[0] = '\0';
+    m->hostname[sizeof(m->hostname) - 1] = '\0'; /* force termination in case the name filled the buffer */
+  }
+#endif
+#if defined(__linux__)
+  {
+    c8 buf[64];
+    bench__read_first_field("/proc/meminfo", "MemTotal", buf, sizeof(buf));
+    m->memory_bytes = (s64)atoll(buf) * 1024; /* the value is scaled by 1024, i.e. read as KiB */
+  }
+#elif defined(__APPLE__)
+  {
+    s64 bytes = 0;
+    size_t len =
sizeof(bytes);
+    if (sysctlbyname("hw.memsize", &bytes, &len, NULL, 0) == 0) m->memory_bytes = bytes;
+  }
+#endif
+
+  {
+    sp_mem_arena_marker_t s = sp_mem_begin_scratch();
+    sp_str_t model = sp_cpu_get_model_a(s.mem);
+    sp_cstr_copy_to_n(model.data, model.len,
+                      m->cpu_model, sizeof(m->cpu_model));
+    sp_mem_end_scratch(s);
+  }
+  m->cpu_threads = (s32)sp_cpu_get_thread_count();
+  m->cpu_cores = m->cpu_threads; /* physical cores are not queried; the thread count is reused */
+  return 0;
+}
+/*
+ * Looks the machine up by fingerprint and inserts a new `machines` row when
+ * none exists. Stores the row id in *out_id and returns 0, or prints the
+ * SQLite error to stderr and returns -1.
+ */
+SP_IMP s32 bench__get_or_insert_machine(sqlite3 *db, const bench_machine_info *m, sqlite3_int64 *out_id) {
+  c8 fingerprint[512];
+  sqlite3_stmt *stmt = NULL;
+  s32 rc;
+
+  bench__make_fingerprint(m, fingerprint, sizeof(fingerprint));
+
+  rc = sqlite3_prepare_v2(db, "SELECT id FROM machines WHERE fingerprint=?",
+                          -1, &stmt, NULL);
+  if (rc != SQLITE_OK) goto fail;
+  sqlite3_bind_text(stmt, 1, fingerprint, -1, SQLITE_TRANSIENT);
+  if (sqlite3_step(stmt) == SQLITE_ROW) {
+    *out_id = sqlite3_column_int64(stmt, 0);
+    sqlite3_finalize(stmt);
+    return 0;
+  }
+  sqlite3_finalize(stmt); /* stmt is overwritten by the next prepare before `fail` can see it */
+
+  rc = sqlite3_prepare_v2(
+      db,
+      "INSERT INTO machines (fingerprint, hostname, os_name, os_version, "
+      "arch, cpu_model, cpu_cores, cpu_threads, memory_bytes) "
+      "VALUES (?,?,?,?,?,?,?,?,?)",
+      -1, &stmt, NULL);
+  if (rc != SQLITE_OK) goto fail;
+  sqlite3_bind_text (stmt, 1, fingerprint, -1, SQLITE_TRANSIENT);
+  sqlite3_bind_text (stmt, 2, m->hostname, -1, SQLITE_TRANSIENT);
+  sqlite3_bind_text (stmt, 3, m->os_name, -1, SQLITE_TRANSIENT);
+  sqlite3_bind_text (stmt, 4, m->os_version,-1, SQLITE_TRANSIENT);
+  sqlite3_bind_text (stmt, 5, m->arch, -1, SQLITE_TRANSIENT);
+  sqlite3_bind_text (stmt, 6, m->cpu_model, -1, SQLITE_TRANSIENT);
+  sqlite3_bind_int (stmt, 7, m->cpu_cores);
+  sqlite3_bind_int (stmt, 8, m->cpu_threads);
+  sqlite3_bind_int64(stmt, 9, m->memory_bytes);
+  if (sqlite3_step(stmt) != SQLITE_DONE) goto fail;
+  *out_id = sqlite3_last_insert_rowid(db);
+  sqlite3_finalize(stmt);
+  return 0;
+fail:
+  fprintf(stderr, "bench machine: %s\n",
sqlite3_errmsg(db));
+  if (stmt) sqlite3_finalize(stmt);
+  return -1;
+}
+/*
+ * Looks a benchmark up by name and inserts a new `benchmarks` row when none
+ * exists. Stores the row id in *out_id and returns 0, or prints the SQLite
+ * error to stderr and returns -1.
+ */
+s32 bench__get_or_insert_benchmark(sqlite3* db, const c8* name, sqlite3_int64* out_id) {
+  sqlite3_stmt *stmt = NULL;
+  s32 rc = sqlite3_prepare_v2(db, "SELECT id FROM benchmarks WHERE name=?",
+                              -1, &stmt, NULL);
+  if (rc != SQLITE_OK) goto fail;
+  sqlite3_bind_text(stmt, 1, name, -1, SQLITE_TRANSIENT);
+  if (sqlite3_step(stmt) == SQLITE_ROW) {
+    *out_id = sqlite3_column_int64(stmt, 0);
+    sqlite3_finalize(stmt);
+    return 0;
+  }
+  sqlite3_finalize(stmt); /* stmt is overwritten by the next prepare before `fail` can see it */
+
+  rc = sqlite3_prepare_v2(db, "INSERT INTO benchmarks (name) VALUES (?)",
+                          -1, &stmt, NULL);
+  if (rc != SQLITE_OK) goto fail;
+  sqlite3_bind_text(stmt, 1, name, -1, SQLITE_TRANSIENT);
+  if (sqlite3_step(stmt) != SQLITE_DONE) goto fail;
+  *out_id = sqlite3_last_insert_rowid(db);
+  sqlite3_finalize(stmt);
+  return 0;
+fail:
+  fprintf(stderr, "bench benchmark: %s\n", sqlite3_errmsg(db));
+  if (stmt) sqlite3_finalize(stmt);
+  return -1;
+}
+/* Binds `s` as text at parameter `idx`, or SQL NULL when `s` is a null pointer. */
+void bench__bind_text_or_null(sqlite3_stmt *stmt, int idx, const c8 *s) {
+  if (s) {
+    sqlite3_bind_text(stmt, idx, s, -1, SQLITE_TRANSIENT);
+  } else {
+    sqlite3_bind_null(stmt, idx);
+  }
+}
+/* Binds `v` as a 64-bit integer, or SQL NULL when it equals BENCH_UNSET_I64. */
+void bench__bind_i64_or_null(sqlite3_stmt *stmt, int idx, s64 v) {
+  if (v == BENCH_UNSET_I64) {
+    sqlite3_bind_null(stmt, idx);
+  } else {
+    sqlite3_bind_int64(stmt, idx, v);
+  }
+}
+/* Binds `v` as a double, or SQL NULL when it is at or below BENCH_UNSET_F64. */
+void bench__bind_f64_or_null(sqlite3_stmt *stmt, int idx, f64 v) {
+  if (v <= BENCH_UNSET_F64) {
+    sqlite3_bind_null(stmt, idx);
+  } else {
+    sqlite3_bind_double(stmt, idx, v);
+  }
+}
+/*
+ * Opens (or creates) the results database at `path`, switches it to WAL
+ * journalling with foreign keys enabled and applies BENCH_STORE_SCHEMA.
+ * Returns the new store, or a null pointer after printing the SQLite error.
+ * NOTE(review): `s` is used without a null check and result_stmt is never
+ * assigned here -- confirm that sp_alloc_type returns zeroed memory, since
+ * bench_store_close() tests result_stmt.
+ */
+bench_store* bench_store_open(const c8 *path) {
+  sp_mem_t mem = sp_mem_os_new();
+
+  bench_store* s = sp_alloc_type(mem, bench_store);
+  s->arena = sp_mem_arena_new(mem);
+  s->mem = sp_mem_arena_as_allocator(s->arena);
+
+  if (sqlite3_open(path, &s->db) != SQLITE_OK) {
+    fprintf(stderr, "bench open: %s\n", sqlite3_errmsg(s->db));
+    sqlite3_close(s->db);
+    goto error;
+  }
+  sqlite3_exec(s->db, "PRAGMA journal_mode=WAL", 0, 0, 0);
+
sqlite3_exec(s->db, "PRAGMA foreign_keys=ON", 0, 0, 0); + if (sqlite3_exec(s->db, BENCH_STORE_SCHEMA, 0, 0, 0) != SQLITE_OK) { + fprintf(stderr, "bench schema: %s\n", sqlite3_errmsg(s->db)); + sqlite3_close(s->db); + goto error; + } + + return s; + +error: + if (s) sp_mem_allocator_free(mem, s); + return SP_NULLPTR; +} + +void bench_store_close(bench_store* s) { + if (!s) return; + if (s->result_stmt) sqlite3_finalize(s->result_stmt); + if (s->db) sqlite3_close(s->db); + + sp_mem_t mem = sp_mem_os_new(); + sp_mem_allocator_free(mem, s); +} + +s64 bench_store_begin_run(bench_store* s, const bench_machine_info* mi, const bench_run_info* ri) { + sqlite3_int64 machine_id = 0, run_id = 0; + sqlite3_stmt *stmt = NULL; + c8 started_at[32]; + + if (!s || !s->db || !mi || !ri) return -1; + if (bench__get_or_insert_machine(s->db, mi, &machine_id) != 0) return -1; + + //bench__iso_time_now(started_at, sizeof(started_at)); + sp_mem_fixed_t mem = sp_mem_fixed(started_at, sizeof(started_at)); + sp_tm_epoch_to_iso8601(sp_mem_fixed_as_allocator(&mem), sp_tm_now_epoch()); + + if (sqlite3_prepare_v2( + s->db, + "INSERT INTO runs (machine_id, started_at, executable_path, " + "executable_size_bytes, executable_mtime, confidence_threshold, " + "filter, has_perf_counters, label, framework, metadata) " + "VALUES (?,?,?,?,?,?,?,?,?,?,?)", + -1, &stmt, NULL) != SQLITE_OK) goto fail; + + sqlite3_bind_int64(stmt, 1, machine_id); + sqlite3_bind_text (stmt, 2, started_at, -1, SQLITE_TRANSIENT); + bench__bind_text_or_null(stmt, 3, ri->executable_path); + if (ri->executable_size_bytes > 0) + sqlite3_bind_int64(stmt, 4, ri->executable_size_bytes); + else + sqlite3_bind_null(stmt, 4); + bench__bind_text_or_null(stmt, 5, ri->executable_mtime); + if (ri->confidence_threshold >= 0) + sqlite3_bind_double(stmt, 6, ri->confidence_threshold); + else + sqlite3_bind_null(stmt, 6); + bench__bind_text_or_null(stmt, 7, ri->filter); + if (ri->has_perf_counters >= 0) sqlite3_bind_int (stmt, 8, 
ri->has_perf_counters); + else sqlite3_bind_null(stmt, 8); + bench__bind_text_or_null(stmt, 9, ri->label); + bench__bind_text_or_null(stmt, 10, ri->framework); + bench__bind_text_or_null(stmt, 11, ri->metadata_json); + + if (sqlite3_step(stmt) != SQLITE_DONE) goto fail; + run_id = sqlite3_last_insert_rowid(s->db); + sqlite3_finalize(stmt); + + /* Batch all subsequent INSERTs into one transaction; bench_store_end_run + commits. The prepared INSERT is reused for every record. */ + sqlite3_exec(s->db, "BEGIN TRANSACTION", 0, 0, 0); + if (sqlite3_prepare_v2( + s->db, + "INSERT INTO results (run_id, benchmark_id, iterations, mean_ns, " + "median_ns, min_ns, max_ns, stddev_ns, stddev_pct, ci_low_ns, " + "ci_high_ns, ci_level_pct, confidence_pct, bytes_processed, " + "items_processed, cycles_per_iter, instructions_per_iter) " + "VALUES (?,?,?,?,?,?,?,?,?,?,?,?,?,?,?,?,?)", + -1, &s->result_stmt, NULL) != SQLITE_OK) goto fail; + + return (s64)run_id; +fail: + fprintf(stderr, "bench begin_run: %s\n", sqlite3_errmsg(s->db)); + if (stmt) sqlite3_finalize(stmt); + return -1; +} + +s32 bench_store_record(bench_store* s, s64 run_id, const c8* bench_name, const bench_result* r) { + sqlite3_int64 bench_id = 0; + if (!s || !s->result_stmt || !bench_name || !r) return -1; + if (bench__get_or_insert_benchmark(s->db, bench_name, &bench_id) != 0) return -1; + + sqlite3_reset(s->result_stmt); + sqlite3_bind_int64 (s->result_stmt, 1, run_id); + sqlite3_bind_int64 (s->result_stmt, 2, bench_id); + sqlite3_bind_int64 (s->result_stmt, 3, r->iterations); + sqlite3_bind_double(s->result_stmt, 4, r->mean_ns); + sqlite3_bind_double(s->result_stmt, 5, r->median_ns); + sqlite3_bind_double(s->result_stmt, 6, r->min_ns); + sqlite3_bind_double(s->result_stmt, 7, r->max_ns); + bench__bind_f64_or_null(s->result_stmt, 8, r->stddev_ns); + bench__bind_f64_or_null(s->result_stmt, 9, r->stddev_pct); + bench__bind_f64_or_null(s->result_stmt, 10, r->ci_low_ns); + bench__bind_f64_or_null(s->result_stmt, 11, 
r->ci_high_ns); + bench__bind_f64_or_null(s->result_stmt, 12, r->ci_level_pct); + bench__bind_f64_or_null(s->result_stmt, 13, r->confidence_pct); + bench__bind_i64_or_null(s->result_stmt, 14, r->bytes_processed); + bench__bind_i64_or_null(s->result_stmt, 15, r->items_processed); + bench__bind_i64_or_null(s->result_stmt, 16, r->cycles_per_iter); + bench__bind_i64_or_null(s->result_stmt, 17, r->instructions_per_iter); + + if (sqlite3_step(s->result_stmt) != SQLITE_DONE) { + fprintf(stderr, "bench record: %s\n", sqlite3_errmsg(s->db)); + return -1; + } + return 0; +} + +s32 bench_store_end_run(bench_store* s, s64 run_id) { + c8 finished_at[32]; + sqlite3_stmt *stmt = NULL; + if (!s || !s->db) return -1; + + if (s->result_stmt) { + sqlite3_finalize(s->result_stmt); + s->result_stmt = NULL; + sqlite3_exec(s->db, "COMMIT TRANSACTION", 0, 0, 0); + } + + sp_mem_fixed_t mem = sp_mem_fixed(finished_at, sizeof(finished_at)); + sp_tm_epoch_to_iso8601(sp_mem_fixed_as_allocator(&mem), sp_tm_now_epoch()); + sqlite3_prepare_v2(s->db, "UPDATE runs SET finished_at=? 
WHERE id=?", + -1, &stmt, NULL); + sqlite3_bind_text (stmt, 1, finished_at, -1, SQLITE_TRANSIENT); + sqlite3_bind_int64(stmt, 2, run_id); + sqlite3_step (stmt); + sqlite3_finalize(stmt); + return 0; +} + +s64 bench_simple_begin_run( + bench_store* s, + const c8* framework, const c8* label, + const c8* executable_path, s64 executable_size_bytes, const c8* executable_mtime, + f64 confidence_threshold, + s32 has_perf_counters +) { + bench_machine_info mi; + bench_run_info ri; + bench_collect_machine_info(&mi); + memset(&ri, 0, sizeof(ri)); + ri.framework = framework; + ri.label = label; + ri.executable_path = executable_path; + ri.executable_size_bytes = executable_size_bytes; + ri.executable_mtime = executable_mtime; + ri.confidence_threshold = confidence_threshold; + ri.has_perf_counters = has_perf_counters; + return bench_store_begin_run(s, &mi, &ri); +} + +s32 bench_simple_record( + bench_store* s, s64 run_id, + const c8* name, + s64 iterations, + f64 mean_ns, f64 median_ns, + f64 min_ns, f64 max_ns, + f64 stddev_ns, f64 stddev_pct, + f64 ci_low_ns, f64 ci_high_ns, f64 ci_level_pct, + f64 confidence_pct, + s64 bytes_processed, s64 items_processed, + s64 cycles_per_iter, s64 instructions_per_iter +) { + bench_result r; + r.iterations = iterations; + r.mean_ns = mean_ns; + r.median_ns = median_ns; + r.min_ns = min_ns; + r.max_ns = max_ns; + r.stddev_ns = stddev_ns; + r.stddev_pct = stddev_pct; + r.ci_low_ns = ci_low_ns; + r.ci_high_ns = ci_high_ns; + r.ci_level_pct = ci_level_pct; + r.confidence_pct = confidence_pct; + r.bytes_processed = bytes_processed; + r.items_processed = items_processed; + r.cycles_per_iter = cycles_per_iter; + r.instructions_per_iter = instructions_per_iter; + return bench_store_record(s, run_id, name, &r); +} +#endif + +SP_END_EXTERN_C() + +static SP_INLINE s32 ubench_main(s32 argc, const c8 *const argv[]); +s32 ubench_main(s32 argc, const c8 *const argv[]) { + u64 failed = 0; + ubench_size_t index = 0; + ubench_size_t *failed_benchmarks = 
SP_NULLPTR; + ubench_size_t failed_benchmarks_length = 0; + const c8 *filter = SP_NULLPTR; + u64 ran_benchmarks = 0; + + sp_fmt_register_decorator("green", ubench_fmt_tty_green, ubench_fmt_tty_reset); + sp_fmt_register_decorator("red", ubench_fmt_tty_red, ubench_fmt_tty_reset); + +#if defined(UBENCH_ENABLE_PERF_COUNTERS) && defined(SP_LINUX) + struct ubench_perf_s perf; + ubench_perf_init(&perf); +#endif +#if defined(UBENCH_ENABLE_SQLITE) + const c8 *db_path = "./ubench.db"; + bench_store *store = SP_NULLPTR; + s64 run_id = 0; +#endif + + /* loop through all arguments looking for our options */ + for (index = 1; index < sp_cast(ubench_size_t, argc); index++) { + /* Informational switches */ + const sp_str_t help_str = sp_str_lit("--help"); + const sp_str_t list_str = sp_str_lit("--list-benchmarks"); + /* Benchmark config switches */ + const sp_str_t filter_str = sp_str_lit("--filter="); +#if defined(UBENCH_ENABLE_SQLITE) + const sp_str_t output_str = sp_str_lit("--output="); +#endif + const sp_str_t confidence_str = sp_str_lit("--confidence="); + const sp_str_t arg = sp_cstr_as_str(argv[index]); + + if (sp_str_starts_with(arg, help_str)) { + sp_log("ubench.h - the single file benchmarking solution for C/C++!"); + sp_log("Command line Options:"); + sp_log(" --help Show this message and exit."); + sp_log(" --filter= Filter the benchmarks to run (EG. " + "MyBench*.a would run MyBenchmark.a but not MyBenchmark.b)."); + sp_log(" --list-benchmarks List benchmarks, one per line. " + "Output names can be passed to --filter."); +#if defined(UBENCH_ENABLE_SQLITE) + sp_log(" --output= SQLite database to write results to " + "(default ./ubench.db, 'none' disables)."); +#endif + sp_log(" --confidence= MdAPE (median absolute percent " + "error) cut-off above which a benchmark is reported as failed. " + "Defaults to 2.5%"); + goto cleanup; + } else if (sp_str_starts_with(arg, filter_str)) { + /* user wants to filter what benchmarks run! 
*/ + filter = argv[index] + filter_str.len; +#if defined(UBENCH_ENABLE_SQLITE) + } else if (sp_str_starts_with(arg, output_str)) { + const c8 *value = argv[index] + output_str.len; + if (sp_cstr_equal(value, "none")) { + db_path = SP_NULLPTR; + } else { + db_path = value; + } +#endif + } else if (sp_str_starts_with(arg, list_str)) { + for (index = 0; index < ubench_state.benchmarks_length; index++) { + sp_log("{}", sp_fmt_str(ubench_state.benchmarks[index].name)); + } + + /* when printing the benchmark list, don't actually run the benchmarks */ + goto cleanup; + } else if (sp_str_starts_with(arg, confidence_str)) { + /* user wants to specify a different confidence */ + ubench_state.confidence = + sp_parse_f64(sp_cstr_as_str(argv[index] + confidence_str.len)); + + /* must be between 0 and 100 */ + if ((ubench_state.confidence < 0) || (ubench_state.confidence > 100)) { + sp_print_err( + "Confidence must be in the range [0..100] (you specified {})\n", + sp_fmt_float(ubench_state.confidence)); + goto cleanup; + } + } + } + + for (index = 0; index < ubench_state.benchmarks_length; index++) { + if (ubench_should_filter(filter, ubench_state.benchmarks[index].name.data)) { + continue; + } + + ran_benchmarks++; + } + + sp_log("{.green} Running {} benchmarks.", sp_fmt_cstr("[==========]"), sp_fmt_uint(ran_benchmarks)); + +#if defined(UBENCH_ENABLE_SQLITE) + if (db_path) { + store = bench_store_open(db_path); + if (store) { + bench_machine_info mi; + bench_run_info ri; + c8 exe_path_buf[4096]; + c8 exe_mtime_buf[32]; + s64 exe_size = 0; + s32 has_perf = 0; +#if defined(UBENCH_ENABLE_PERF_COUNTERS) && defined(SP_LINUX) + has_perf = perf.group_fd >= 0 ? 1 : 0; +#endif + exe_path_buf[0] = '\0'; + exe_mtime_buf[0] = '\0'; + + bench_collect_machine_info(&mi); + + /* Collect executable info via sp_fs_*; bench_run_info just holds + pointers into these stack buffers (live until bench_store_begin_run + returns, which is what binds the strings into SQLite). 
*/ + { + sp_mem_arena_marker_t s = sp_mem_begin_scratch(); + sp_str_t exe = sp_fs_get_exe_path(s.mem); + sp_cstr_copy_to_n(exe.data, exe.len, + exe_path_buf, sizeof(exe_path_buf)); + if (exe_path_buf[0]) { + struct stat st; + if (stat(exe_path_buf, &st) == 0) { + exe_size = (s64)st.st_size; + sp_str_t mtime = sp_tm_epoch_to_iso8601( + s.mem, sp_fs_get_mod_time(exe)); + sp_cstr_copy_to_n(mtime.data, mtime.len, + exe_mtime_buf, sizeof(exe_mtime_buf)); + } + } + sp_mem_end_scratch(s); + } + + sp_mem_zero(&ri, sizeof(ri)); + ri.executable_path = exe_path_buf[0] ? exe_path_buf : SP_NULLPTR; + ri.executable_size_bytes = exe_size > 0 ? exe_size : BENCH_UNSET_I64; + ri.executable_mtime = exe_mtime_buf[0] ? exe_mtime_buf : SP_NULLPTR; + ri.filter = filter; + ri.framework = "ubench"; + ri.confidence_threshold = ubench_state.confidence; + ri.has_perf_counters = has_perf; + + run_id = bench_store_begin_run(store, &mi, &ri); + if (run_id < 0) { + bench_store_close(store); + store = SP_NULLPTR; + run_id = 0; + } + } + } +#endif + + for (index = 0; index < ubench_state.benchmarks_length; index++) { + s32 result = 0; + s64 kndex = 0; + s64 cal_ns = 0; + s64 epochs = 0; + /* Per-body times stored as doubles end-to-end so sub-nanosecond bodies + (UBENCH_LOOP-batched microbenchmarks) survive both the display path + and the SQL bind. The unit is nanoseconds. 
*/ + f64 best_avg_ns = 0; + f64 best_min_ns = 0; + f64 best_median_ns = 0; + f64 best_max_ns = 0; + f64 best_deviation = 0; + f64 best_confidence = 0; +#if defined(UBENCH_ENABLE_PERF_COUNTERS) && defined(SP_LINUX) + u64 best_cycles = 0; + u64 best_instructions = 0; + u64 pass_cycles = 0; + u64 pass_instructions = 0; +#endif + struct ubench_run_state_s ubs; + +#define UBENCH_MIN_EPOCHS 16 +#define UBENCH_MAX_EPOCHS 500 + const s64 max_epochs = UBENCH_MAX_EPOCHS; + const s64 min_epochs = UBENCH_MIN_EPOCHS; + /* Add one extra timestamp slot: each sample stores the timestamp at its + start, plus one final timestamp after the last sample exits. */ + s64 ns[UBENCH_MAX_EPOCHS + 1]; + s64 pause_ns[UBENCH_MAX_EPOCHS + 1]; + /* Scratch for MdAPE computation (relative deviations in ppm). */ + s64 mdape_scratch[UBENCH_MAX_EPOCHS]; +#undef UBENCH_MAX_EPOCHS +#undef UBENCH_MIN_EPOCHS + + if (ubench_should_filter(filter, ubench_state.benchmarks[index].name.data)) { + continue; + } + + sp_str_t name = ubench_state.benchmarks[index].name; + sp_log("[ {:<9 .green}] {}", sp_fmt_cstr("RUN"), sp_fmt_str(name)); + + ubs.ns = ns; + ubs.pause_ns = pause_ns; + ubs.size = 1; + ubs.sample = 0; + ubs.paused_ns = 0; + ubs.pause_start = 0; + ubs.bytes_processed = 0; + ubs.items_processed = 0; + ubs.batch = 1; + ubs.batch_consumed = 0; + + /* CALIBRATION: one body invocation, batch=1, to estimate single-body + cost. The body announces UBENCH_LOOP usage by setting batch_consumed. */ + ubench_invoke(&ubench_state.benchmarks[index], &ubs); + cal_ns = ns[1] - ns[0] - pause_ns[0]; + if (cal_ns <= 0) { + cal_ns = 1; + } + + /* Auto-tune batch only if the body opted in via UBENCH_LOOP. The clock- + bracketed window is sized to ~1 ms regardless of body cost, giving + (1 ms / clock_overhead) ~= 10^4..10^6 amortization for sub-µs bodies. 
*/ + ubs.batch = 1; + if (ubs.batch_consumed) { + const s64 target_batch_ns = 1 * 1000 * 1000; + s64 b = target_batch_ns / cal_ns; + if (b < 1) { + b = 1; + } + ubs.batch = b; + } + + /* Choose epoch count: target ~100 ms total wall-time across all samples, + but never fewer than min_epochs (so the median has a meaningful base). */ + { + const s64 target_total_ns = 100 * 1000 * 1000; + const s64 per_sample_ns = ubs.batch * cal_ns; + epochs = target_total_ns / (per_sample_ns > 0 ? per_sample_ns : 1); + if (epochs < min_epochs) { + epochs = min_epochs; + } + if (epochs > max_epochs) { + epochs = max_epochs; + } + } + + /* WARMUP: one throwaway sample at the chosen batch, to prime caches, + branch predictor, TLB, and demand-page any lazy allocations. */ + ubs.size = 1; + ubs.sample = 0; + ubs.paused_ns = 0; + ubs.pause_start = 0; + ubench_invoke(&ubench_state.benchmarks[index], &ubs); + + /* MEASUREMENT */ + ubs.size = epochs; + ubs.sample = 0; + ubs.paused_ns = 0; + ubs.pause_start = 0; +#if defined(UBENCH_ENABLE_PERF_COUNTERS) && defined(SP_LINUX) + ubench_perf_start(&perf); +#endif + ubench_invoke(&ubench_state.benchmarks[index], &ubs); +#if defined(UBENCH_ENABLE_PERF_COUNTERS) && defined(SP_LINUX) + ubench_perf_stop(&perf, &pass_cycles, &pass_instructions); +#endif + + /* Convert raw timestamps to per-batch deltas (in ns) in place. The /batch + split is deferred to f64-precision arithmetic below so that sub-ns + per-body times don't get truncated to zero. */ + for (kndex = 0; kndex < epochs; kndex++) { + s64 d = ns[kndex + 1] - ns[kndex] - pause_ns[kndex]; + if (d < 0) { + d = 0; + } + ns[kndex] = d; + } + + /* Mean per body in f64. */ + { + const f64 batch_d = sp_cast(f64, ubs.batch); + f64 sum = 0; + for (kndex = 0; kndex < epochs; kndex++) { + sum += sp_cast(f64, ns[kndex]); + } + best_avg_ns = sum / (sp_cast(f64, epochs) * batch_d); + + /* Sample stddev (kept for legacy reporting). 
*/ + { + f64 var = 0; + for (kndex = 0; kndex < epochs; kndex++) { + const f64 v = + sp_cast(f64, ns[kndex]) / batch_d - best_avg_ns; + var += v * v; + } + var /= sp_cast(f64, epochs); + best_deviation = + (best_avg_ns > 0) + ? ((f64)sp_sys_sqrtf((f32)var) / best_avg_ns) * 100.0 + : 0.0; + } + } + + /* Sort raw per-batch samples to derive median, min, max. MdAPE is + scale-invariant ((x-med)/med), so it works on raw samples without /batch + — the batch factor cancels in the ratio. */ + sp_os_qsort(ns, sp_cast(ubench_size_t, epochs), sizeof(*ns), ubench_int64_cmp); + { + const s64 raw_median = ns[epochs / 2]; + const f64 batch_d = sp_cast(f64, ubs.batch); + best_min_ns = sp_cast(f64, ns[0]) / batch_d; + best_median_ns = sp_cast(f64, raw_median) / batch_d; + best_max_ns = sp_cast(f64, ns[epochs - 1]) / batch_d; + + /* MdAPE = median of |x - median| / median, in percent. Robust against + the heavy-tailed one-sided noise typical of microbenchmark + distributions (preemption, page faults, IRQs, frequency steps). + Replaces the prior Gaussian CI, which was structurally wrong here. */ + best_confidence = 0.0; + if (raw_median > 0) { + for (kndex = 0; kndex < epochs; kndex++) { + s64 v = ns[kndex] - raw_median; + if (v < 0) { + v = -v; + } + mdape_scratch[kndex] = (v * 1000000) / raw_median; + } + sp_os_qsort(mdape_scratch, sp_cast(ubench_size_t, epochs), sizeof(*mdape_scratch), ubench_int64_cmp); + { + const s64 mid = epochs / 2; + best_confidence = + sp_cast(f64, mdape_scratch[mid]) / 10000.0; + } + } + } +#if defined(UBENCH_ENABLE_PERF_COUNTERS) && defined(SP_LINUX) + { + const u64 total_bodies = + sp_cast(u64, epochs) * + sp_cast(u64, ubs.batch); + if (total_bodies > 0) { + best_cycles = pass_cycles / total_bodies; + best_instructions = pass_instructions / total_bodies; + } + } +#endif + + /* Flag the benchmark as failed if MdAPE exceeds the user threshold. 
*/ + result = best_confidence > ubench_state.confidence; + + if (result) { + sp_log("MdAPE {}% exceeds maximum permitted {}%", + sp_fmt_float(best_confidence), + sp_fmt_float(ubench_state.confidence)); + } + + { + const f64 bps = (ubs.bytes_processed > 0 && best_avg_ns > 0) + ? sp_cast(f64, ubs.bytes_processed) * 1e9 / + sp_cast(f64, best_avg_ns) + : 0.0; + const f64 ips = (ubs.items_processed > 0 && best_avg_ns > 0) + ? sp_cast(f64, ubs.items_processed) * 1e9 / + sp_cast(f64, best_avg_ns) + : 0.0; + +#if defined(UBENCH_ENABLE_SQLITE) + if (store) { + bench_result br; + sp_mem_zero(&br, sizeof(br)); + br.iterations = epochs; + br.mean_ns = best_avg_ns; + br.median_ns = best_median_ns; + br.min_ns = best_min_ns; + br.max_ns = best_max_ns; + br.stddev_ns = best_deviation * best_avg_ns / 100.0; + br.stddev_pct = best_deviation; + br.ci_low_ns = BENCH_UNSET_F64; + br.ci_high_ns = BENCH_UNSET_F64; + br.ci_level_pct = BENCH_UNSET_F64; + br.confidence_pct = best_confidence; + br.bytes_processed = ubs.bytes_processed > 0 ? ubs.bytes_processed + : BENCH_UNSET_I64; + br.items_processed = ubs.items_processed > 0 ? ubs.items_processed + : BENCH_UNSET_I64; +#if defined(UBENCH_ENABLE_PERF_COUNTERS) && defined(SP_LINUX) + br.cycles_per_iter = perf.group_fd >= 0 ? (s64)best_cycles + : BENCH_UNSET_I64; + br.instructions_per_iter = perf.group_fd >= 0 ? 
(s64)best_instructions + : BENCH_UNSET_I64; +#else + br.cycles_per_iter = BENCH_UNSET_I64; + br.instructions_per_iter = BENCH_UNSET_I64; +#endif + bench_store_record(store, run_id, + ubench_state.benchmarks[index].name.data, &br); + } +#endif + + { + const c8 *unit = "us"; + f64 scale_div = 1.0; + + if (0 != result) { + const ubench_size_t failed_benchmark_index = failed_benchmarks_length++; + failed_benchmarks = sp_ptr_cast( + ubench_size_t *, + sp_realloc(ubench_state.mem, + sp_ptr_cast(void *, failed_benchmarks), + sizeof(ubench_size_t) * failed_benchmarks_length)); + failed_benchmarks[failed_benchmark_index] = index; + failed++; + } + + if (0 != result) { + sp_print("[{:^10 .red}] ", sp_fmt_cstr("FAILED")); + } else { + sp_print("[{:>9 .green} ] ", sp_fmt_cstr("OK")); + } + sp_print("{} (mean ", sp_fmt_str(ubench_state.benchmarks[index].name)); + + /* Auto-scale display: pick a unit so the mean prints in [1, 1000). */ + if (best_avg_ns >= 1e9) { + unit = "s"; + scale_div = 1e9; + } else if (best_avg_ns >= 1e6) { + unit = "ms"; + scale_div = 1e6; + } else if (best_avg_ns >= 1e3) { + unit = "us"; + scale_div = 1e3; + } else if (best_avg_ns >= 1.0) { + unit = "ns"; + scale_div = 1.0; + } else { + unit = "ps"; + scale_div = 1e-3; + } + sp_print("{:.3}{}, median {:.3}{}, min {:.3}{}, MdAPE {}%", + sp_fmt_float(best_avg_ns / scale_div), sp_fmt_cstr(unit), + sp_fmt_float(best_median_ns / scale_div), sp_fmt_cstr(unit), + sp_fmt_float(best_min_ns / scale_div), sp_fmt_cstr(unit), + sp_fmt_float(best_confidence)); + + if (bps > 0.0) { + const c8 *bps_unit; + f64 bps_scaled; + if (bps >= 1e9) { bps_unit = "GB/s"; bps_scaled = bps / 1e9; } + else if (bps >= 1e6) { bps_unit = "MB/s"; bps_scaled = bps / 1e6; } + else if (bps >= 1e3) { bps_unit = "KB/s"; bps_scaled = bps / 1e3; } + else { bps_unit = "B/s"; bps_scaled = bps; } + sp_print(", {:.3} {}", sp_fmt_float(bps_scaled), sp_fmt_cstr(bps_unit)); + } + if (ips > 0.0) { + const c8 *ips_unit; + f64 ips_scaled; + if (ips >= 
1e9) { ips_unit = "G items/s"; ips_scaled = ips / 1e9; } + else if (ips >= 1e6) { ips_unit = "M items/s"; ips_scaled = ips / 1e6; } + else if (ips >= 1e3) { ips_unit = "K items/s"; ips_scaled = ips / 1e3; } + else { ips_unit = "items/s"; ips_scaled = ips; } + sp_print(", {:.3} {}", sp_fmt_float(ips_scaled), sp_fmt_cstr(ips_unit)); + } +#if defined(UBENCH_ENABLE_PERF_COUNTERS) && defined(SP_LINUX) + if (perf.group_fd >= 0) { + sp_print(", {} cycles, {} instructions", + sp_fmt_uint(best_cycles), sp_fmt_uint(best_instructions)); + } +#endif + sp_log(")"); + } + } + } + + sp_log("{.green} {} benchmarks ran.", + sp_fmt_cstr("[==========]"), sp_fmt_uint(ran_benchmarks)); + sp_log("[{:^10 .green}] {} benchmarks.", + sp_fmt_cstr("PASSED"), sp_fmt_uint(ran_benchmarks - failed)); + + if (0 != failed) { + sp_log("[{:^10 .red}] {} benchmarks, listed below:", + sp_fmt_cstr("FAILED"), sp_fmt_uint(failed)); + for (index = 0; index < failed_benchmarks_length; index++) { + sp_log("[{:^10 .red}] {}", + sp_fmt_cstr("FAILED"), + sp_fmt_str(ubench_state.benchmarks[failed_benchmarks[index]].name)); + } + } + +cleanup: + sp_free(ubench_state.mem, sp_ptr_cast(void *, failed_benchmarks)); + sp_free(ubench_state.mem, sp_ptr_cast(void *, ubench_state.benchmarks)); + +#if defined(UBENCH_ENABLE_PERF_COUNTERS) && defined(SP_LINUX) + ubench_perf_close(&perf); +#endif +#if defined(UBENCH_ENABLE_SQLITE) + if (store) { + if (run_id > 0) bench_store_end_run(store, run_id); + bench_store_close(store); + } +#endif + + return sp_cast(s32, failed); +} + +#endif // SP_BENCH_C diff --git a/test/mem.c b/test/mem.c index 2880bff..7d0af90 100644 --- a/test/mem.c +++ b/test/mem.c @@ -6,6 +6,7 @@ #include "mem/builtin.c" #include "mem/arena.c" #include "mem/fixed.c" +#include "mem/heap.c" #include "mem/slice.c" #ifdef MEM_TEST_OWNS_MAIN diff --git a/test/mem/heap.c b/test/mem/heap.c new file mode 100644 index 0000000..735b05a --- /dev/null +++ b/test/mem/heap.c @@ -0,0 +1,696 @@ +#include "mem.h" + +typedef 
enum { + HEAP_OP_NONE = 0, + HEAP_OP_ALLOC, + HEAP_OP_FREE, + HEAP_OP_REALLOC, + HEAP_OP_WRITE, +} heap_op_kind_t; + +typedef struct { + heap_op_kind_t kind; + u32 ref; + u32 src; + u64 size; + u64 offset; + u64 value; +} heap_op_t; + +typedef struct { + u32 a; + u32 b; +} heap_pair_t; + +typedef struct { + u64 offset; + u64 value; +} heap_data_t; + +typedef struct { + u32 ref; + u64 bucket; + u64 zeroed; + bool large; + heap_data_t data [4]; +} heap_ref_check_t; + +typedef struct { + heap_pair_t same [8]; + heap_pair_t different [8]; + heap_pair_t same_ptr [8]; + heap_pair_t different_ptr [8]; + heap_ref_check_t refs [8]; + u64 num_spans; + u64 num_large; + u64 recycled; + struct { u64 used; u64 reserved; } bytes; +} heap_expect_t; + +#define HEAP_MAX_REFS 32 + +typedef struct { + heap_op_t ops [32]; + heap_expect_t expect; +} heap_test_t; + +static u64 count_heap_spans(sp_mem_heap_t* heap) { + u64 n = 0; + sp_for(b, SP_MEM_HEAP_NUM_BUCKETS) { + for (sp_mem_heap_span_t* s = heap->buckets[b].partial; s; s = s->next) n++; + for (sp_mem_heap_span_t* s = heap->buckets[b].full; s; s = s->next) n++; + } + return n; +} + +static void run_heap_test(s32* utest_result, heap_test_t t) { + sp_mem_heap_t* heap = sp_mem_heap_new(); + void* ptrs [HEAP_MAX_REFS] = sp_zero; + + sp_carr_for(t.ops, i) { + heap_op_t* op = &t.ops[i]; + if (op->kind == HEAP_OP_NONE) break; + switch (op->kind) { + case HEAP_OP_NONE: break; + case HEAP_OP_ALLOC: + ptrs[op->ref] = sp_mem_heap_alloc(heap, op->size); + EXPECT_NE(ptrs[op->ref], SP_NULLPTR); + EXPECT_EQ((uintptr_t)ptrs[op->ref] & (SP_MEM_ALIGNMENT - 1), 0u); + break; + case HEAP_OP_FREE: + sp_mem_heap_free(heap, ptrs[op->ref]); + ptrs[op->ref] = SP_NULLPTR; + break; + case HEAP_OP_REALLOC: + ptrs[op->ref] = sp_mem_heap_realloc(heap, ptrs[op->src], op->size); + if (op->size) { + EXPECT_NE(ptrs[op->ref], SP_NULLPTR); + EXPECT_EQ((uintptr_t)ptrs[op->ref] & (SP_MEM_ALIGNMENT - 1), 0u); + } + else { + EXPECT_EQ(ptrs[op->ref], SP_NULLPTR); + } + 
break; + case HEAP_OP_WRITE: + ((u8*)ptrs[op->ref])[op->offset] = (u8)op->value; + break; + } + } + + heap_expect_t* e = &t.expect; + + sp_carr_for(e->same, i) { + heap_pair_t p = e->same[i]; + if (!p.a && !p.b) break; + sp_mem_heap_span_t* sa = sp_mem_heap_find_span(heap, ptrs[p.a]); + sp_mem_heap_span_t* sb = sp_mem_heap_find_span(heap, ptrs[p.b]); + EXPECT_NE(sa, SP_NULLPTR); + EXPECT_EQ(sa, sb); + } + + sp_carr_for(e->different, i) { + heap_pair_t p = e->different[i]; + if (!p.a && !p.b) break; + sp_mem_heap_span_t* sa = sp_mem_heap_find_span(heap, ptrs[p.a]); + sp_mem_heap_span_t* sb = sp_mem_heap_find_span(heap, ptrs[p.b]); + EXPECT_NE(sa, sb); + } + + sp_carr_for(e->same_ptr, i) { + heap_pair_t p = e->same_ptr[i]; + if (!p.a && !p.b) break; + EXPECT_EQ(ptrs[p.a], ptrs[p.b]); + } + + sp_carr_for(e->different_ptr, i) { + heap_pair_t p = e->different_ptr[i]; + if (!p.a && !p.b) break; + EXPECT_NE(ptrs[p.a], ptrs[p.b]); + } + + sp_carr_for(e->refs, i) { + heap_ref_check_t* r = &e->refs[i]; + if (!r->ref && !r->bucket && !r->zeroed && !r->large && !r->data[0].offset && !r->data[0].value) break; + + u8* p = (u8*)ptrs[r->ref]; + EXPECT_NE(p, SP_NULLPTR); + if (!p) continue; + + if (r->bucket) { + sp_mem_heap_span_t* s = sp_mem_heap_find_span(heap, p); + EXPECT_NE(s, SP_NULLPTR); + if (s) EXPECT_EQ((u64)s->bucket, (u64)sp_mem_heap_bucket_of(r->bucket)); + } + if (r->zeroed) { + sp_for(j, r->zeroed) EXPECT_EQ(p[j], 0u); + } + if (r->large) { + EXPECT_EQ(sp_mem_heap_find_span(heap, p), SP_NULLPTR); + } + sp_carr_for(r->data, j) { + heap_data_t* d = &r->data[j]; + if (!d->offset && !d->value) break; + EXPECT_EQ(p[d->offset], (u8)d->value); + } + } + + u64 num_large = 0; + for (sp_mem_heap_large_t* l = heap->larges; l; l = l->next) num_large++; + + u64 num_recycled = 0; + for (sp_mem_heap_span_t* s = heap->recycled; s; s = s->next) num_recycled++; + + EXPECT_EQ(count_heap_spans(heap), e->num_spans); + EXPECT_EQ(num_large, e->num_large); + EXPECT_EQ(num_recycled, 
e->recycled); + EXPECT_EQ(heap->bytes_used, e->bytes.used); + EXPECT_EQ(heap->bytes_reserved, e->bytes.reserved); + + sp_mem_heap_destroy(heap); +} + +UTEST_F(mem, heap_alloc_returns_aligned_nonnull) { + run_heap_test(&ur, (heap_test_t){ + .ops = { + { .kind = HEAP_OP_ALLOC, .ref = 0, .size = 16 }, + { .kind = HEAP_OP_ALLOC, .ref = 1, .size = 64 }, + { .kind = HEAP_OP_ALLOC, .ref = 2, .size = 1024 }, + { .kind = HEAP_OP_ALLOC, .ref = 3, .size = 8192 }, + }, + .expect = { + .num_spans = 3, + .num_large = 1, + .recycled = 12, + .bytes = { .used = 16 + 64 + 1344 + 8192, .reserved = 4096 + SP_MEM_HEAP_SEGMENT_SIZE + 12288 }, + }, + }); +} + +UTEST_F(mem, heap_alloc_is_zeroed) { + run_heap_test(&ur, (heap_test_t){ + .ops = { + { .kind = HEAP_OP_ALLOC, .ref = 0, .size = 128 }, + }, + .expect = { + .refs = { + { .ref = 0, .zeroed = 128 }, + }, + .num_spans = 1, + .recycled = 14, + .bytes = { .used = 128, .reserved = 4096 + SP_MEM_HEAP_SEGMENT_SIZE }, + }, + }); +} + +UTEST_F(mem, heap_alloc_distinct_pointers) { + run_heap_test(&ur, (heap_test_t){ + .ops = { + { .kind = HEAP_OP_ALLOC, .ref = 0, .size = 32 }, + { .kind = HEAP_OP_ALLOC, .ref = 1, .size = 32 }, + { .kind = HEAP_OP_ALLOC, .ref = 2, .size = 32 }, + }, + .expect = { + .different_ptr = { + { 0, 1 }, { 1, 2 }, { 0, 2 }, + }, + .num_spans = 1, + .recycled = 14, + .bytes = { .used = 3 * 32, .reserved = 4096 + SP_MEM_HEAP_SEGMENT_SIZE }, + }, + }); +} + +UTEST_F(mem, heap_bucket_of_sizes) { + EXPECT_EQ(sp_mem_heap_bucket_of(0), 0u); + EXPECT_EQ(sp_mem_heap_bucket_of(1), 0u); + EXPECT_EQ(sp_mem_heap_bucket_of(16), 0u); + EXPECT_EQ(sp_mem_heap_bucket_of(17), 1u); + EXPECT_EQ(sp_mem_heap_bucket_of(32), 1u); + EXPECT_EQ(sp_mem_heap_bucket_of(48), 2u); + EXPECT_EQ(sp_mem_heap_bucket_of(64), 3u); + EXPECT_EQ(sp_mem_heap_bucket_of(96), 4u); + EXPECT_EQ(sp_mem_heap_bucket_of(128), 5u); + EXPECT_EQ(sp_mem_heap_bucket_of(192), 6u); + EXPECT_EQ(sp_mem_heap_bucket_of(256), 7u); + EXPECT_EQ(sp_mem_heap_bucket_of(257), 8u); + 
EXPECT_EQ(sp_mem_heap_bucket_of(336), 8u); + EXPECT_EQ(sp_mem_heap_bucket_of(448), 9u); + EXPECT_EQ(sp_mem_heap_bucket_of(576), 10u); + EXPECT_EQ(sp_mem_heap_bucket_of(672), 11u); + EXPECT_EQ(sp_mem_heap_bucket_of(800), 12u); + EXPECT_EQ(sp_mem_heap_bucket_of(1008), 13u); + EXPECT_EQ(sp_mem_heap_bucket_of(1009), 14u); + EXPECT_EQ(sp_mem_heap_bucket_of(1344), 14u); + EXPECT_EQ(sp_mem_heap_bucket_of(1345), 15u); + EXPECT_EQ(sp_mem_heap_bucket_of(2016), 15u); + EXPECT_EQ(sp_mem_heap_bucket_of(2017), (u32)SP_MEM_HEAP_NUM_BUCKETS); + EXPECT_EQ(sp_mem_heap_bucket_of(4096), (u32)SP_MEM_HEAP_NUM_BUCKETS); +} + +UTEST_F(mem, heap_small_allocs_share_a_span) { + run_heap_test(&ur, (heap_test_t){ + .ops = { + { .kind = HEAP_OP_ALLOC, .ref = 0, .size = 64 }, + { .kind = HEAP_OP_ALLOC, .ref = 1, .size = 64 }, + { .kind = HEAP_OP_ALLOC, .ref = 2, .size = 64 }, + }, + .expect = { + .same = { + { 0, 1 }, { 1, 2 }, + }, + .refs = { + { .ref = 0, .bucket = 64 }, + { .ref = 1, .bucket = 64 }, + { .ref = 2, .bucket = 64 }, + }, + .num_spans = 1, + .recycled = 14, + .bytes = { .used = 3 * 64, .reserved = 4096 + SP_MEM_HEAP_SEGMENT_SIZE }, + }, + }); +} + +UTEST_F(mem, heap_different_buckets_use_different_spans) { + run_heap_test(&ur, (heap_test_t){ + .ops = { + { .kind = HEAP_OP_ALLOC, .ref = 0, .size = 32 }, + { .kind = HEAP_OP_ALLOC, .ref = 1, .size = 128 }, + }, + .expect = { + .different = { + { 0, 1 }, + }, + .refs = { + { .ref = 0, .bucket = 32 }, + { .ref = 1, .bucket = 128 }, + }, + .num_spans = 2, + .recycled = 13, + .bytes = { .used = 32 + 128, .reserved = 4096 + SP_MEM_HEAP_SEGMENT_SIZE }, + }, + }); +} + +UTEST_F(mem, heap_large_alloc_bypasses_spans) { + run_heap_test(&ur, (heap_test_t){ + .ops = { + { .kind = HEAP_OP_ALLOC, .ref = 0, .size = 8192 }, + }, + .expect = { + .refs = { + { .ref = 0, .large = true }, + }, + .num_spans = 0, + .num_large = 1, + .bytes = { .used = 8192, .reserved = 4096 + 12288 }, + }, + }); +} + +UTEST_F(mem, 
heap_packed_bucket_fits_four_1008_chunks) { + run_heap_test(&ur, (heap_test_t){ + .ops = { + { .kind = HEAP_OP_ALLOC, .ref = 0, .size = 1008 }, + { .kind = HEAP_OP_ALLOC, .ref = 1, .size = 1008 }, + { .kind = HEAP_OP_ALLOC, .ref = 2, .size = 1008 }, + { .kind = HEAP_OP_ALLOC, .ref = 3, .size = 1008 }, + { .kind = HEAP_OP_ALLOC, .ref = 4, .size = 1008 }, + }, + .expect = { + .same = { + { 0, 1 }, { 1, 2 }, { 2, 3 }, + }, + .different = { + { 3, 4 }, + }, + .num_spans = 2, + .recycled = 13, + .bytes = { .used = 5 * 1008, .reserved = 4096 + SP_MEM_HEAP_SEGMENT_SIZE }, + }, + }); +} + +UTEST_F(mem, heap_full_span_overflows_to_new_span) { + run_heap_test(&ur, (heap_test_t){ + .ops = { + { .kind = HEAP_OP_ALLOC, .ref = 0, .size = 800 }, + { .kind = HEAP_OP_ALLOC, .ref = 1, .size = 800 }, + { .kind = HEAP_OP_ALLOC, .ref = 2, .size = 800 }, + { .kind = HEAP_OP_ALLOC, .ref = 3, .size = 800 }, + { .kind = HEAP_OP_ALLOC, .ref = 4, .size = 800 }, + { .kind = HEAP_OP_ALLOC, .ref = 5, .size = 800 }, + }, + .expect = { + .same = { + { 0, 1 }, { 1, 2 }, { 2, 3 }, { 3, 4 }, + }, + .different = { + { 4, 5 }, + }, + .num_spans = 2, + .recycled = 13, + .bytes = { .used = 6 * 800, .reserved = 4096 + SP_MEM_HEAP_SEGMENT_SIZE }, + }, + }); +} + +UTEST_F(mem, heap_free_from_full_span_reuses_slot) { + run_heap_test(&ur, (heap_test_t){ + .ops = { + { .kind = HEAP_OP_ALLOC, .ref = 0, .size = 1008 }, + { .kind = HEAP_OP_ALLOC, .ref = 1, .size = 1008 }, + { .kind = HEAP_OP_ALLOC, .ref = 2, .size = 1008 }, + { .kind = HEAP_OP_ALLOC, .ref = 3, .size = 1008 }, + { .kind = HEAP_OP_FREE, .ref = 1 }, + { .kind = HEAP_OP_ALLOC, .ref = 4, .size = 1008 }, + }, + .expect = { + .same = { + { 0, 4 }, + }, + .num_spans = 1, + .recycled = 14, + .bytes = { .used = 4 * 1008, .reserved = 4096 + SP_MEM_HEAP_SEGMENT_SIZE }, + }, + }); +} + +UTEST_F(mem, heap_empty_span_is_recycled) { + run_heap_test(&ur, (heap_test_t){ + .ops = { + { .kind = HEAP_OP_ALLOC, .ref = 0, .size = 64 }, + { .kind = HEAP_OP_FREE, .ref = 
0 }, + }, + .expect = { + .num_spans = 0, + .recycled = 15, + .bytes = { .used = 0, .reserved = 4096 + SP_MEM_HEAP_SEGMENT_SIZE }, + }, + }); +} + +UTEST_F(mem, heap_recycled_span_is_reused) { + run_heap_test(&ur, (heap_test_t){ + .ops = { + { .kind = HEAP_OP_ALLOC, .ref = 0, .size = 64 }, + { .kind = HEAP_OP_FREE, .ref = 0 }, + { .kind = HEAP_OP_ALLOC, .ref = 1, .size = 256 }, + }, + .expect = { + .refs = { + { .ref = 1, .bucket = 256 }, + }, + .num_spans = 1, + .recycled = 14, + .bytes = { .used = 256, .reserved = 4096 + SP_MEM_HEAP_SEGMENT_SIZE }, + }, + }); +} + +UTEST_F(mem, heap_chunk_reuse_is_zeroed) { + run_heap_test(&ur, (heap_test_t){ + .ops = { + { .kind = HEAP_OP_ALLOC, .ref = 0, .size = 32 }, + { .kind = HEAP_OP_WRITE, .ref = 0, .offset = 5, .value = 0xAB }, + { .kind = HEAP_OP_FREE, .ref = 0 }, + { .kind = HEAP_OP_ALLOC, .ref = 1, .size = 32 }, + }, + .expect = { + .refs = { + { .ref = 1, .zeroed = 32 }, + }, + .num_spans = 1, + .recycled = 14, + .bytes = { .used = 32, .reserved = 4096 + SP_MEM_HEAP_SEGMENT_SIZE }, + }, + }); +} + +UTEST_F(mem, heap_alloc_zero_returns_smallest_chunk) { + run_heap_test(&ur, (heap_test_t){ + .ops = { + { .kind = HEAP_OP_ALLOC, .ref = 0, .size = 0 }, + }, + .expect = { + .refs = { + { .ref = 0, .bucket = 16 }, + }, + .num_spans = 1, + .recycled = 14, + .bytes = { .used = 16, .reserved = 4096 + SP_MEM_HEAP_SEGMENT_SIZE }, + }, + }); +} + +UTEST_F(mem, heap_realloc_within_bucket_keeps_pointer) { + run_heap_test(&ur, (heap_test_t){ + .ops = { + { .kind = HEAP_OP_ALLOC, .ref = 0, .size = 20 }, + { .kind = HEAP_OP_WRITE, .ref = 0, .offset = 10, .value = 7 }, + { .kind = HEAP_OP_REALLOC, .ref = 1, .src = 0, .size = 25 }, + }, + .expect = { + .same_ptr = { + { 0, 1 }, + }, + .refs = { + { .ref = 1, .data = { { .offset = 10, .value = 7 } } }, + }, + .num_spans = 1, + .recycled = 14, + .bytes = { .used = 32, .reserved = 4096 + SP_MEM_HEAP_SEGMENT_SIZE }, + }, + }); +} + +UTEST_F(mem, heap_realloc_shrink_then_grow_reveals_zeroes) 
{ + run_heap_test(&ur, (heap_test_t){ + .ops = { + { .kind = HEAP_OP_ALLOC, .ref = 0, .size = 32 }, + { .kind = HEAP_OP_WRITE, .ref = 0, .offset = 10, .value = 7 }, + { .kind = HEAP_OP_WRITE, .ref = 0, .offset = 30, .value = 9 }, + { .kind = HEAP_OP_REALLOC, .ref = 1, .src = 0, .size = 20 }, + { .kind = HEAP_OP_REALLOC, .ref = 2, .src = 1, .size = 31 }, + }, + .expect = { + .same_ptr = { + { 0, 2 }, + }, + .refs = { + { .ref = 2, .data = { { .offset = 10, .value = 7 }, { .offset = 30, .value = 0 } } }, + }, + .num_spans = 1, + .recycled = 14, + .bytes = { .used = 32, .reserved = 4096 + SP_MEM_HEAP_SEGMENT_SIZE }, + }, + }); +} + +UTEST_F(mem, heap_realloc_grows_and_preserves_bytes) { + run_heap_test(&ur, (heap_test_t){ + .ops = { + { .kind = HEAP_OP_ALLOC, .ref = 0, .size = 16 }, + { .kind = HEAP_OP_WRITE, .ref = 0, .offset = 3, .value = 7 }, + { .kind = HEAP_OP_REALLOC, .ref = 1, .src = 0, .size = 64 }, + }, + .expect = { + .different_ptr = { + { 0, 1 }, + }, + .refs = { + { .ref = 1, .data = { { .offset = 3, .value = 7 } } }, + }, + .num_spans = 1, + .recycled = 14, + .bytes = { .used = 64, .reserved = 4096 + SP_MEM_HEAP_SEGMENT_SIZE }, + }, + }); +} + +UTEST_F(mem, heap_realloc_large_in_place) { + run_heap_test(&ur, (heap_test_t){ + .ops = { + { .kind = HEAP_OP_ALLOC, .ref = 0, .size = 5000 }, + { .kind = HEAP_OP_WRITE, .ref = 0, .offset = 4999, .value = 9 }, + { .kind = HEAP_OP_REALLOC, .ref = 1, .src = 0, .size = 6000 }, + }, + .expect = { + .same_ptr = { + { 0, 1 }, + }, + .refs = { + { .ref = 1, .large = true, .data = { { .offset = 4999, .value = 9 }, { .offset = 5999, .value = 0 } } }, + }, + .num_spans = 0, + .num_large = 1, + .bytes = { .used = 6000, .reserved = 4096 + 8192 }, + }, + }); +} + +UTEST_F(mem, heap_realloc_large_to_small_moves) { + run_heap_test(&ur, (heap_test_t){ + .ops = { + { .kind = HEAP_OP_ALLOC, .ref = 0, .size = 5000 }, + { .kind = HEAP_OP_WRITE, .ref = 0, .offset = 10, .value = 3 }, + { .kind = HEAP_OP_REALLOC, .ref = 1, .src = 0, 
.size = 64 }, + }, + .expect = { + .different_ptr = { + { 0, 1 }, + }, + .refs = { + { .ref = 1, .bucket = 64, .data = { { .offset = 10, .value = 3 } } }, + }, + .num_spans = 1, + .num_large = 0, + .recycled = 14, + .bytes = { .used = 64, .reserved = 4096 + SP_MEM_HEAP_SEGMENT_SIZE }, + }, + }); +} + +UTEST_F(mem, heap_realloc_null_acts_as_alloc) { + run_heap_test(&ur, (heap_test_t){ + .ops = { + { .kind = HEAP_OP_REALLOC, .ref = 0, .src = 31, .size = 64 }, + }, + .expect = { + .num_spans = 1, + .recycled = 14, + .bytes = { .used = 64, .reserved = 4096 + SP_MEM_HEAP_SEGMENT_SIZE }, + }, + }); +} + +UTEST_F(mem, heap_realloc_zero_frees) { + run_heap_test(&ur, (heap_test_t){ + .ops = { + { .kind = HEAP_OP_ALLOC, .ref = 0, .size = 64 }, + { .kind = HEAP_OP_REALLOC, .ref = 1, .src = 0, .size = 0 }, + }, + .expect = { + .num_spans = 0, + .num_large = 0, + .recycled = 15, + .bytes = { .used = 0, .reserved = 4096 + SP_MEM_HEAP_SEGMENT_SIZE }, + }, + }); +} + +UTEST_F(mem, heap_bytes_accounting) { + run_heap_test(&ur, (heap_test_t){ + .ops = { + { .kind = HEAP_OP_ALLOC, .ref = 0, .size = 64 }, + { .kind = HEAP_OP_ALLOC, .ref = 1, .size = 8192 }, + }, + .expect = { + .num_spans = 1, + .num_large = 1, + .recycled = 14, + .bytes = { .used = 64 + 8192, .reserved = 4096 + SP_MEM_HEAP_SEGMENT_SIZE + 12288 }, + }, + }); +} + +UTEST_F(mem, heap_drained_heap_retains_segment) { + run_heap_test(&ur, (heap_test_t){ + .ops = { + { .kind = HEAP_OP_ALLOC, .ref = 0, .size = 64 }, + { .kind = HEAP_OP_ALLOC, .ref = 1, .size = 8192 }, + { .kind = HEAP_OP_FREE, .ref = 0 }, + { .kind = HEAP_OP_FREE, .ref = 1 }, + }, + .expect = { + .num_spans = 0, + .recycled = 15, + .bytes = { .used = 0, .reserved = 4096 + SP_MEM_HEAP_SEGMENT_SIZE }, + }, + }); +} + +UTEST_F(mem, heap_exhausted_segment_grows_new_segment) { + run_heap_test(&ur, (heap_test_t){ + .ops = { + { .kind = HEAP_OP_ALLOC, .ref = 0, .size = 2016 }, + { .kind = HEAP_OP_ALLOC, .ref = 1, .size = 2016 }, + { .kind = HEAP_OP_ALLOC, .ref = 2, 
.size = 2016 }, + { .kind = HEAP_OP_ALLOC, .ref = 3, .size = 2016 }, + { .kind = HEAP_OP_ALLOC, .ref = 4, .size = 2016 }, + { .kind = HEAP_OP_ALLOC, .ref = 5, .size = 2016 }, + { .kind = HEAP_OP_ALLOC, .ref = 6, .size = 2016 }, + { .kind = HEAP_OP_ALLOC, .ref = 7, .size = 2016 }, + { .kind = HEAP_OP_ALLOC, .ref = 8, .size = 2016 }, + { .kind = HEAP_OP_ALLOC, .ref = 9, .size = 2016 }, + { .kind = HEAP_OP_ALLOC, .ref = 10, .size = 2016 }, + { .kind = HEAP_OP_ALLOC, .ref = 11, .size = 2016 }, + { .kind = HEAP_OP_ALLOC, .ref = 12, .size = 2016 }, + { .kind = HEAP_OP_ALLOC, .ref = 13, .size = 2016 }, + { .kind = HEAP_OP_ALLOC, .ref = 14, .size = 2016 }, + { .kind = HEAP_OP_ALLOC, .ref = 15, .size = 2016 }, + { .kind = HEAP_OP_ALLOC, .ref = 16, .size = 2016 }, + { .kind = HEAP_OP_ALLOC, .ref = 17, .size = 2016 }, + { .kind = HEAP_OP_ALLOC, .ref = 18, .size = 2016 }, + { .kind = HEAP_OP_ALLOC, .ref = 19, .size = 2016 }, + { .kind = HEAP_OP_ALLOC, .ref = 20, .size = 2016 }, + { .kind = HEAP_OP_ALLOC, .ref = 21, .size = 2016 }, + { .kind = HEAP_OP_ALLOC, .ref = 22, .size = 2016 }, + { .kind = HEAP_OP_ALLOC, .ref = 23, .size = 2016 }, + { .kind = HEAP_OP_ALLOC, .ref = 24, .size = 2016 }, + { .kind = HEAP_OP_ALLOC, .ref = 25, .size = 2016 }, + { .kind = HEAP_OP_ALLOC, .ref = 26, .size = 2016 }, + { .kind = HEAP_OP_ALLOC, .ref = 27, .size = 2016 }, + { .kind = HEAP_OP_ALLOC, .ref = 28, .size = 2016 }, + { .kind = HEAP_OP_ALLOC, .ref = 29, .size = 2016 }, + { .kind = HEAP_OP_ALLOC, .ref = 30, .size = 2016 }, + }, + .expect = { + .different_ptr = { + { 0, 30 }, + }, + .num_spans = 16, + .recycled = 14, + .bytes = { .used = 31 * 2016, .reserved = 4096 + 2 * SP_MEM_HEAP_SEGMENT_SIZE }, + }, + }); +} + +UTEST_F(mem, heap_realloc_huge_size_fails_cleanly) { + sp_mem_heap_t* heap = sp_mem_heap_new(); + u8* p = sp_ptr_cast(u8*, sp_mem_heap_alloc(heap, 5000)); + EXPECT_NE(p, SP_NULLPTR); + if (p) { + p[100] = 42; + EXPECT_EQ(sp_mem_heap_realloc(heap, p, SP_LIMIT_U64_MAX - 16), 
SP_NULLPTR);
+    EXPECT_EQ(p[100], 42u);
+    EXPECT_EQ(heap->bytes_used, (u64)5000);
+    sp_mem_heap_free(heap, p);
+  }
+  EXPECT_EQ(sp_mem_heap_alloc(heap, SP_LIMIT_U64_MAX - 16), SP_NULLPTR);
+  sp_mem_heap_destroy(heap);
+}
+
+UTEST_F(mem, heap_null_heap_ops_are_noops) {
+  u8 byte = 7;
+  EXPECT_EQ(sp_mem_heap_alloc(SP_NULLPTR, 64), SP_NULLPTR);
+  EXPECT_EQ(sp_mem_heap_realloc(SP_NULLPTR, &byte, 64), SP_NULLPTR);
+  EXPECT_EQ(sp_mem_heap_find_span(SP_NULLPTR, &byte), SP_NULLPTR);
+  sp_mem_heap_free(SP_NULLPTR, &byte);
+  EXPECT_EQ(byte, 7u);
+  sp_mem_heap_destroy(SP_NULLPTR);
+}
+
+UTEST_F(mem, heap_as_allocator_routes_through_sp_alloc) {
+  sp_mem_heap_t* heap = sp_mem_heap_new();
+  sp_mem_t mem = sp_mem_heap_as_allocator(heap);
+
+  u8* p = sp_ptr_cast(u8*, sp_alloc(mem, 64));
+  EXPECT_NE(p, SP_NULLPTR);
+  EXPECT_EQ((uintptr_t)p & (SP_MEM_ALIGNMENT - 1), 0u);
+
+  u8* q = sp_ptr_cast(u8*, sp_realloc(mem, p, 128));
+  EXPECT_NE(q, SP_NULLPTR);
+
+  sp_free(mem, q);
+  sp_mem_heap_destroy(heap);
+}
diff --git a/test/mem/mem.h b/test/mem/mem.h
index 2f73dee..8710174 100644
--- a/test/mem/mem.h
+++ b/test/mem/mem.h
@@ -9,7 +9,7 @@
 #define SP_MEM_ALIGNMENT 16
 #endif
 
-#define EXPECT_ALIGNED(ptr) EXPECT_EQ(sp_align_up(ptr, SP_MEM_ALIGNMENT), ptr)
+#define EXPECT_ALIGNED(ptr) EXPECT_EQ(sp_align_up(ptr, SP_MEM_ALIGNMENT), sp_uptr(ptr))
 
 struct mem {
   u8 placeholder;
diff --git a/test/tools/table.h b/test/tools/table.h
new file mode 100644
index 0000000..77214ac
--- /dev/null
+++ b/test/tools/table.h
@@ -0,0 +1,203 @@
+#ifndef SP_TABLE_H
+#define SP_TABLE_H
+
+#include "sp.h"
+
+// A column: header text, the format string applied by the numeric writers
+// (sp_table_add_col substitutes "{}" when fmt is null), and cell alignment.
+typedef struct {
+  sp_str_t header;
+  const c8* fmt;
+  sp_fmt_align_t align;
+} sp_table_col_t;
+
+// A formatted cell and the optional ANSI sequence emitted before it.
+typedef struct {
+  sp_str_t value;
+  const c8* color;
+} sp_table_cell_t;
+
+// A table under construction. Cells live in one flat array in row-major order;
+// color is the pending sequence consumed by the next cell that gets written.
+typedef struct {
+  sp_mem_t mem;
+  sp_da(sp_table_col_t) cols;
+  sp_da(sp_table_cell_t) cells;
+  const c8* color;
+} sp_table_writer_t;
+
+SP_API void     sp_table_init(sp_table_writer_t* table, sp_mem_t mem);
+SP_API void     sp_table_add_col(sp_table_writer_t* table, sp_table_col_t col);
+SP_API void     sp_table_begin(sp_table_writer_t* table);
+SP_API void     sp_table_color(sp_table_writer_t* table, const c8* ansi);
+SP_API void     sp_table_write_str(sp_table_writer_t* table, sp_str_t value);
+SP_API void     sp_table_write_cstr(sp_table_writer_t* table, const c8* value);
+SP_API void     sp_table_write_u64(sp_table_writer_t* table, u64 value);
+SP_API void     sp_table_write_s64(sp_table_writer_t* table, s64 value);
+SP_API void     sp_table_write_u32(sp_table_writer_t* table, u32 value);
+SP_API void     sp_table_write_s32(sp_table_writer_t* table, s32 value);
+SP_API void     sp_table_write_f64(sp_table_writer_t* table, f64 value);
+SP_API void     sp_table_write_f32(sp_table_writer_t* table, f32 value);
+SP_API sp_str_t sp_table_render(sp_table_writer_t* table, sp_mem_t mem);
+SP_API void     sp_table_log(sp_table_writer_t* table);
+
+#endif
+
+#if defined(SP_TABLE_IMPLEMENTATION) && !defined(SP_TABLE_C)
+#define SP_TABLE_C
+
+// Zeroes the writer and sets up empty column and cell arrays backed by mem.
+void sp_table_init(sp_table_writer_t* table, sp_mem_t mem) {
+  sp_mem_zero(table, sizeof(*table));
+  table->mem = mem;
+  sp_da_init(mem, table->cols);
+  sp_da_init(mem, table->cells);
+}
+
+// Appends a column. Columns must all be added before the first cell is written.
+void sp_table_add_col(sp_table_writer_t* table, sp_table_col_t col) {
+  SP_ASSERT(sp_da_empty(table->cells));
+  if (!col.fmt) col.fmt = "{}";
+  sp_da_push(table->cols, col);
+}
+
+// Marks the start of a row; only checks that the previous row was completed.
+void sp_table_begin(sp_table_writer_t* table) {
+  SP_ASSERT(!sp_da_empty(table->cols));
+  SP_ASSERT(sp_da_size(table->cells) % sp_da_size(table->cols) == 0);
+}
+
+// Returns the column that the next written cell falls into.
+static const sp_table_col_t* sp_table_current_col(sp_table_writer_t* table) {
+  // Without this check, writing a cell before any column exists divides by zero.
+  SP_ASSERT(!sp_da_empty(table->cols));
+  return &table->cols[sp_da_size(table->cells) % sp_da_size(table->cols)];
+}
+
+// Sets the ANSI sequence applied to the next cell only.
+void sp_table_color(sp_table_writer_t* table, const c8* ansi) {
+  table->color = ansi;
+}
+
+// Stores a cell with the pending color, then clears that color.
+static void sp_table_push_cell(sp_table_writer_t* table, sp_str_t value) {
+  SP_ASSERT(!sp_da_empty(table->cols));
+  sp_table_cell_t cell = {
+    .value = value,
+    .color = table->color,
+  };
+  sp_da_push(table->cells, cell);
+  table->color = SP_NULLPTR;
+}
+
+// Writes a string cell; the value is copied using the table's allocator.
+void sp_table_write_str(sp_table_writer_t* table, sp_str_t value) {
+  sp_table_push_cell(table, sp_str_copy(table->mem, value));
+}
+
+// Writes a C string cell.
+void sp_table_write_cstr(sp_table_writer_t* table, const c8* value) {
+  sp_table_write_str(table, sp_str_view(value));
+}
+
+// Numeric writers format the value with the current column's fmt string.
+void sp_table_write_u64(sp_table_writer_t* table, u64 value) {
+  sp_table_push_cell(table, sp_fmt(table->mem, sp_table_current_col(table)->fmt, sp_fmt_uint(value)).value);
+}
+
+void sp_table_write_s64(sp_table_writer_t* table, s64 value) {
+  sp_table_push_cell(table, sp_fmt(table->mem, sp_table_current_col(table)->fmt, sp_fmt_int(value)).value);
+}
+
+void sp_table_write_u32(sp_table_writer_t* table, u32 value) {
+  sp_table_write_u64(table, (u64)value);
+}
+
+void sp_table_write_s32(sp_table_writer_t* table, s32 value) {
+  sp_table_write_s64(table, (s64)value);
+}
+
+void sp_table_write_f64(sp_table_writer_t* table, f64 value) {
+  sp_table_push_cell(table, sp_fmt(table->mem, sp_table_current_col(table)->fmt, sp_fmt_float(value)).value);
+}
+
+void sp_table_write_f32(sp_table_writer_t* table, f32 value) {
+  sp_table_write_f64(table, (f64)value);
+}
+
+// Maps a column alignment to a padding format; NONE pads like LEFT.
+static const c8* sp_table_pad_fmt(sp_fmt_align_t align) {
+  switch (align) {
+    case SP_FMT_ALIGN_NONE:   return "{:<$}";
+    case SP_FMT_ALIGN_LEFT:   return "{:<$}";
+    case SP_FMT_ALIGN_CENTER: return "{:^$}";
+    case SP_FMT_ALIGN_RIGHT:  return "{:>$}";
+  }
+  return "{:<$}";
+}
+
+// Writes one cell padded to width, wrapped in its color if it has one.
+static void sp_table_render_cell(sp_io_writer_t* io, const sp_table_col_t* col, sp_table_cell_t cell, u32 width, bool last) {
+  if (cell.color) sp_io_write_cstr(io, cell.color, SP_NULLPTR);
+  // A left-aligned cell in the last column is written unpadded, so rows carry
+  // no trailing spaces.
+  if (last && (col->align == SP_FMT_ALIGN_NONE || col->align == SP_FMT_ALIGN_LEFT)) {
+    sp_io_write_str(io, cell.value, SP_NULLPTR);
+  }
+  else {
+    sp_fmt_io(io, sp_table_pad_fmt(col->align), sp_fmt_uint(width), sp_fmt_str(cell.value));
+  }
+  if (cell.color) sp_io_write_cstr(io, SP_ANSI_RESET, SP_NULLPTR);
+}
+
+// Renders the header row followed by every data row into a string built on mem.
+// Requires at least one column and a whole number of rows.
+sp_str_t sp_table_render(sp_table_writer_t* table, sp_mem_t mem) {
+  u32 num_cols = (u32)sp_da_size(table->cols);
+  SP_ASSERT(num_cols);
+  SP_ASSERT(sp_da_size(table->cells) % num_cols == 0);
+  u32 num_rows = (u32)(sp_da_size(table->cells) / num_cols);
+
+  // A column is as wide as its header or its longest cell, whichever is larger.
+  u32* widths = sp_alloc_n(mem, u32, num_cols);
+  sp_for(col, num_cols) {
+    widths[col] = table->cols[col].header.len;
+  }
+  sp_da_for(table->cells, it) {
+    u32 col = (u32)(it % num_cols);
+    widths[col] = sp_max(widths[col], table->cells[it].value.len);
+  }
+
+  sp_io_dyn_mem_writer_t builder = sp_zero;
+  sp_io_dyn_mem_writer_init(mem, &builder);
+
+  // Header row, written in bright black.
+  sp_io_write_cstr(&builder.base, SP_ANSI_FG_BRIGHT_BLACK, SP_NULLPTR);
+  sp_for(col, num_cols) {
+    if (col) sp_io_write_cstr(&builder.base, " ", SP_NULLPTR);
+    sp_table_cell_t header = { .value = table->cols[col].header };
+    sp_table_render_cell(&builder.base, &table->cols[col], header, widths[col], col == num_cols - 1);
+  }
+  sp_io_write_cstr(&builder.base, SP_ANSI_RESET, SP_NULLPTR);
+
+  sp_for(row, num_rows) {
+    sp_io_write_c8(&builder.base, '\n');
+    sp_for(col, num_cols) {
+      if (col) sp_io_write_cstr(&builder.base, " ", SP_NULLPTR);
+      sp_table_render_cell(&builder.base, &table->cols[col], table->cells[row * num_cols + col], widths[col], col == num_cols - 1);
+    }
+  }
+
+  // The width table is scratch; hand it back so a non-arena mem does not leak it.
+  sp_free(mem, widths);
+  return sp_io_dyn_mem_writer_as_str(&builder);
+}
+
+// Renders the table into scratch memory and logs it.
+void sp_table_log(sp_table_writer_t* table) {
+  sp_mem_arena_marker_t scratch = sp_mem_begin_scratch();
+  sp_log("{}", sp_fmt_str(sp_table_render(table, scratch.mem)));
+  sp_mem_end_scratch(scratch);
+}
+
+#endif
diff --git a/test/bench/ht.c b/tools/wip/bench/ht.c
similarity index 100%
rename from test/bench/ht.c
rename to tools/wip/bench/ht.c
diff --git a/test/bench/stb_ds.h b/tools/wip/bench/stb_ds.h
similarity index 100%
rename from test/bench/stb_ds.h
rename to tools/wip/bench/stb_ds.h