From c87e3973e3a517da3b237862e9b2bc467b3e2ea4 Mon Sep 17 00:00:00 2001
From: Jean Boussier
Date: Tue, 10 Oct 2023 15:32:12 +0200
Subject: [PATCH] rb_shape_transition_shape_capa: use optimal size transitions

Previously the growth was 3 (embed), 6, 12, 24, ...

With this change it's now 3 (embed), 8, 16, 32, 64, ... by default.

However, since a power of two isn't the best size for all allocators,
if `malloc_usable_size` is available, we use it to discover the best
offset. On Linux/glibc 2.35 for instance, the growth will be
3 (embed), 7, 15, 31, ... to avoid wasting 8B per object.

Test program:

```c
#include <stdio.h>
#include <stdlib.h>
#include <malloc.h>

#define VALUE_SIZE 8 /* assumed: sizeof(VALUE) on 64-bit, matching the outputs below */

size_t test(size_t slots) {
    size_t allocated = slots * VALUE_SIZE;
    void *test_ptr = malloc(allocated);
    size_t wasted = malloc_usable_size(test_ptr) - allocated;
    free(test_ptr);
    fprintf(stderr, "slots = %lu, wasted_bytes = %lu\n", slots, wasted);
    return wasted;
}

int main(int argc, char *argv[]) {
    size_t best_padding = 0;
    size_t padding = 0;
    for (padding = 0; padding <= 2; padding++) {
        size_t wasted = test(8 - padding);
        if (wasted == 0) {
            best_padding = padding;
            break;
        }
    }

    size_t index = 0;

    fprintf(stderr, "=============== naive ================\n");
    size_t list_size = 4;
    for (index = 0; index < 10; index++) {
        test(list_size);
        list_size *= 2;
    }

    fprintf(stderr, "=============== auto-padded (-%lu) ================\n", best_padding);
    list_size = 4;
    for (index = 0; index < 10; index++) {
        test(list_size - best_padding);
        list_size *= 2;
    }

    fprintf(stderr, "\n\n");
    return 0;
}
```

```
===== glibc ======
slots = 8, wasted_bytes = 8
slots = 7, wasted_bytes = 0
=============== naive ================
slots = 4, wasted_bytes = 8
slots = 8, wasted_bytes = 8
slots = 16, wasted_bytes = 8
slots = 32, wasted_bytes = 8
slots = 64, wasted_bytes = 8
slots = 128, wasted_bytes = 8
slots = 256, wasted_bytes = 8
slots = 512, wasted_bytes = 8
slots = 1024, wasted_bytes = 8
slots = 2048, wasted_bytes = 8
=============== auto-padded (-1) ================
slots = 3, wasted_bytes = 0
slots = 7, wasted_bytes = 0
slots = 15, wasted_bytes = 0
slots = 31, wasted_bytes = 0
slots = 63, wasted_bytes = 0
slots = 127, wasted_bytes = 0
slots = 255, wasted_bytes = 0
slots = 511, wasted_bytes = 0
slots = 1023, wasted_bytes = 0
slots = 2047, wasted_bytes = 0
```

```
========== jemalloc =======
slots = 8, wasted_bytes = 0
=============== naive ================
slots = 4, wasted_bytes = 0
slots = 8, wasted_bytes = 0
slots = 16, wasted_bytes = 0
slots = 32, wasted_bytes = 0
slots = 64, wasted_bytes = 0
slots = 128, wasted_bytes = 0
slots = 256, wasted_bytes = 0
slots = 512, wasted_bytes = 0
slots = 1024, wasted_bytes = 0
slots = 2048, wasted_bytes = 0
=============== auto-padded (-0) ================
slots = 4, wasted_bytes = 0
slots = 8, wasted_bytes = 0
slots = 16, wasted_bytes = 0
slots = 32, wasted_bytes = 0
slots = 64, wasted_bytes = 0
slots = 128, wasted_bytes = 0
slots = 256, wasted_bytes = 0
slots = 512, wasted_bytes = 0
slots = 1024, wasted_bytes = 0
slots = 2048, wasted_bytes = 0
```
---
 ext/objspace/objspace_dump.c |  2 +-
 gc.c                         | 47 ++++++++++++++++++++++++++++++++++++
 internal/gc.h                |  1 +
 shape.c                      |  8 +++---
 4 files changed, 54 insertions(+), 4 deletions(-)

diff --git a/ext/objspace/objspace_dump.c b/ext/objspace/objspace_dump.c
index bcee5312fc92d0..f5e69a861af54e 100644
--- a/ext/objspace/objspace_dump.c
+++ b/ext/objspace/objspace_dump.c
@@ -788,7 +788,7 @@ shape_i(rb_shape_t *shape, void *data)
         dump_append(dc, "\"OBJ_TOO_COMPLEX\"");
         break;
       default:
-        rb_bug("[objspace] unexpected shape type");
+        rb_bug("[objspace] unexpected shape type: %u", shape->type);
     }
 
     dump_append(dc, ", \"edges\":");

diff --git a/gc.c b/gc.c
index 273207c4b7deff..12141e6896944a 100644
--- a/gc.c
+++ b/gc.c
@@ -157,6 +157,51 @@
 #define MAP_ANONYMOUS MAP_ANON
 #endif
 
+
+static size_t malloc_offset = 0;
+#ifdef HAVE_MALLOC_USABLE_SIZE
+static size_t
+gc_compute_malloc_offset(void)
+{
+    // Different allocators use different metadata storage strategies which result in different
+    // ideal sizes.
+    // For instance malloc(64) will waste 8B with glibc, but waste 0B with jemalloc.
+    // But malloc(56) will waste 0B with glibc, but waste 8B with jemalloc.
+    // So we try allocating 64, 56 and 48 bytes and select the first offset that doesn't
+    // waste memory.
+    // This was tested on Linux with glibc 2.35 and jemalloc 5, and for both it results in
+    // no wasted memory.
+    size_t offset = 0;
+    for (offset = 0; offset <= 16; offset += 8) {
+        size_t allocated = (64 - offset);
+        void *test_ptr = malloc(allocated);
+        size_t wasted = malloc_usable_size(test_ptr) - allocated;
+        free(test_ptr);
+
+        if (wasted == 0) {
+            return offset;
+        }
+    }
+    return 0;
+}
+#else
+static size_t
+gc_compute_malloc_offset(void)
+{
+    // If we don't have malloc_usable_size, we use powers of 2.
+    return 0;
+}
+#endif
+
+size_t
+rb_malloc_grow_capa(size_t current_capacity, size_t type_size)
+{
+    current_capacity *= type_size;
+    size_t new_capacity = 1 << (CHAR_BIT*SIZEOF_SIZE_T - nlz_long((current_capacity * 2) - 1));
+    new_capacity -= malloc_offset;
+    return new_capacity / type_size;
+}
+
 static inline struct rbimpl_size_mul_overflow_tag
 size_add_overflow(size_t x, size_t y)
 {
@@ -13984,6 +14029,8 @@ void
 Init_GC(void)
 {
 #undef rb_intern
+    malloc_offset = gc_compute_malloc_offset();
+
     VALUE rb_mObjSpace;
     VALUE rb_mProfiler;
     VALUE gc_constants;
diff --git a/internal/gc.h b/internal/gc.h
index f8f88a41cba39c..188497b0078fce 100644
--- a/internal/gc.h
+++ b/internal/gc.h
@@ -227,6 +227,7 @@ __attribute__((__alloc_align__(1)))
 RUBY_ATTR_MALLOC void *rb_aligned_malloc(size_t, size_t) RUBY_ATTR_ALLOC_SIZE((2));
 size_t rb_size_mul_or_raise(size_t, size_t, VALUE); /* used in compile.c */
 size_t rb_size_mul_add_or_raise(size_t, size_t, size_t, VALUE); /* used in iseq.h */
+size_t rb_malloc_grow_capa(size_t current_capacity, size_t type_size);
 RUBY_ATTR_MALLOC void *rb_xmalloc_mul_add(size_t, size_t, size_t);
 RUBY_ATTR_MALLOC void *rb_xcalloc_mul_add(size_t, size_t, size_t);
 void *rb_xrealloc_mul_add(const void *, size_t, size_t, size_t);
diff --git a/shape.c b/shape.c
index 89a2c3bd0bef8a..68702bd5903e1c 100644
--- a/shape.c
+++ b/shape.c
@@ -418,8 +418,10 @@ rb_shape_get_next(rb_shape_t* shape, VALUE obj, ID id)
 }
 
 static inline rb_shape_t *
-rb_shape_transition_shape_capa_create(rb_shape_t* shape, uint32_t new_capacity)
+rb_shape_transition_shape_capa_create(rb_shape_t* shape, size_t new_capacity)
 {
+    RUBY_ASSERT(new_capacity < (size_t)MAX_IVARS);
+
     ID edge_name = rb_make_temporary_id(new_capacity);
     bool dont_care;
     rb_shape_t * new_shape = get_next_shape_internal(shape, edge_name, SHAPE_CAPACITY_CHANGE, &dont_care, true, false);
@@ -430,7 +432,7 @@ rb_shape_transition_shape_capa_create(rb_shape_t* shape, uint32_t new_capacity)
 rb_shape_t *
 rb_shape_transition_shape_capa(rb_shape_t* shape)
 {
-    return rb_shape_transition_shape_capa_create(shape, shape->capacity * 2);
+    return rb_shape_transition_shape_capa_create(shape, rb_malloc_grow_capa(shape->capacity, sizeof(VALUE)));
 }
 
 bool
@@ -833,7 +835,7 @@ Init_default_shapes(void)
 
     // Shapes by size pool
     for (int i = 1; i < SIZE_POOL_COUNT; i++) {
-        uint32_t capa = (uint32_t)((rb_size_pool_slot_size(i) - offsetof(struct RObject, as.ary)) / sizeof(VALUE));
+        size_t capa = ((rb_size_pool_slot_size(i) - offsetof(struct RObject, as.ary)) / sizeof(VALUE));
         rb_shape_t * new_shape = rb_shape_transition_shape_capa_create(root, capa);
         new_shape->type = SHAPE_INITIAL_CAPACITY;
         new_shape->size_pool_index = i;
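
For reference, below is a minimal standalone sketch (not part of the patch) of the growth
computation that `rb_malloc_grow_capa` performs, assuming a 64-bit `size_t`,
`sizeof(VALUE) == 8`, and the 8-byte `malloc_offset` that `gc_compute_malloc_offset`
reports on glibc 2.35. The GCC/Clang builtin `__builtin_clzll` stands in for Ruby's
internal `nlz_long`:

```c
#include <stdio.h>
#include <stddef.h>

#define MALLOC_OFFSET 8 /* assumed: the offset measured on glibc 2.35 */
#define VALUE_SIZE 8    /* assumed: sizeof(VALUE) on 64-bit */

/* Same idea as rb_malloc_grow_capa: double the byte size, round up to the
 * next power of two, then subtract the allocator's per-chunk overhead. */
static size_t
grow_capa(size_t current_capacity, size_t type_size)
{
    size_t bytes = current_capacity * type_size;
    size_t rounded = (size_t)1 << (64 - __builtin_clzll((bytes * 2) - 1));
    return (rounded - MALLOC_OFFSET) / type_size;
}

int
main(void)
{
    size_t capa = 3; /* embedded capacity */
    for (int i = 0; i < 4; i++) {
        capa = grow_capa(capa, VALUE_SIZE);
        printf("next capacity: %zu slots\n", capa);
    }
    /* prints 7, 15, 31, 63: the glibc sequence from the commit message */
    return 0;
}
```

With `MALLOC_OFFSET` set to 0, the same computation yields the plain power-of-two
sequence 8, 16, 32, 64 used when `malloc_usable_size` is unavailable.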