Merge pull request #14870 from s-hadinger/berry_new_allocator

Berry faster allocator for small objects
This commit is contained in:
s-hadinger 2022-02-15 23:04:27 +01:00 committed by GitHub
commit 77432c3281
No known key found for this signature in database
GPG Key ID: 4AEE18F83AFDEB23
11 changed files with 358 additions and 89 deletions

View File

@ -175,6 +175,14 @@
**/
#define BE_USE_DEBUG_GC 0
/* Macro: BE_USE_MEM_ALIGNED
* Some embedded processors have special memory areas
* with read/write constraints of being aligned to 32 bits boundaries.
* This option tries to move such memory areas into this region.
* Default: 0
**/
#define BE_USE_MEM_ALIGNED 1
/* Macro: BE_USE_XXX_MODULE
* These macros control whether the related module is compiled.
* When they are true, they will enable related modules. At this
@ -215,8 +223,6 @@ extern "C" {
extern void berry_free(void *ptr);
extern void *berry_realloc(void *ptr, size_t size);
extern void *berry_malloc32(size_t size);
extern void berry_free32(void *ptr);
extern void *berry_realloc32(void *ptr, size_t size);
#ifdef __cplusplus
}
#endif

View File

@ -40,6 +40,7 @@ void be_gc_init(bvm *vm)
{
vm->gc.usage = sizeof(bvm);
be_gc_setsteprate(vm, 200);
be_gc_init_memory_pools(vm);
}
void be_gc_deleteall(bvm *vm)
@ -543,6 +544,8 @@ void be_gc_collect(bvm *vm)
return; /* the GC cannot run for some reason */
}
#if BE_USE_PERF_COUNTERS
size_t slors_used_before_gc, slots_allocated_before_gc;
be_gc_memory_pools_info(vm, &slors_used_before_gc, &slots_allocated_before_gc);
vm->counter_gc_kept = 0;
vm->counter_gc_freed = 0;
#endif
@ -563,8 +566,13 @@ void be_gc_collect(bvm *vm)
reset_fixedlist(vm);
/* step 5: calculate the next GC threshold */
vm->gc.threshold = next_threshold(vm->gc);
be_gc_memory_pools(vm); /* free unused memory pools */
#if BE_USE_PERF_COUNTERS
if (vm->obshook != NULL) (*vm->obshook)(vm, BE_OBS_GC_END, vm->gc.usage, vm->counter_gc_kept, vm->counter_gc_freed);
size_t slors_used_after_gc, slots_allocated_after_gc;
be_gc_memory_pools_info(vm, &slors_used_after_gc, &slots_allocated_after_gc);
if (vm->obshook != NULL) (*vm->obshook)(vm, BE_OBS_GC_END, vm->gc.usage, vm->counter_gc_kept, vm->counter_gc_freed,
slors_used_before_gc, slots_allocated_before_gc,
slors_used_after_gc, slots_allocated_after_gc);
#else
if (vm->obshook != NULL) (*vm->obshook)(vm, BE_OBS_GC_END, vm->gc.usage);
#endif

View File

@ -26,6 +26,12 @@
#define realloc BE_EXPLICIT_REALLOC
#endif
static void* malloc_from_pool(bvm *vm, size_t size);
static void free_from_pool(bvm *vm, void* ptr, size_t old_size);
#define POOL16_SIZE 16
#define POOL32_SIZE 32
BERRY_API void* be_os_malloc(size_t size)
{
return malloc(size);
@ -41,39 +47,310 @@ BERRY_API void* be_os_realloc(void *ptr, size_t size)
return realloc(ptr, size);
}
/* Raw system-heap (re)allocation helper.
 * Dispatch on (ptr, old_size, new_size):
 *  - sizes equal      -> no-op, return ptr unchanged
 *  - ptr && new_size  -> grow/shrink via realloc()
 *  - new_size only    -> fresh allocation via malloc()
 *  - otherwise        -> free ptr and return NULL
 * Returns NULL on allocation failure (caller decides whether to GC/throw). */
static void* _realloc(void *ptr, size_t old_size, size_t new_size)
{
    if (old_size == new_size) { /* the block is unchanged */
        return ptr;
    }
    if (ptr && new_size) { /* resize an existing block */
        return realloc(ptr, new_size);
    }
    if (new_size) { /* alloc a new block */
        be_assert(ptr == NULL && old_size == 0);
        return malloc(new_size);
    }
    be_assert(new_size == 0);
#if BE_USE_DEBUG_GC
    if (ptr) { /* fix: memset(NULL, ...) is undefined behavior */
        memset(ptr, 0xFF, old_size); /* fill the structure with invalid pointers */
    }
#endif
    free(ptr); /* free(NULL) is a no-op */
    return NULL;
}
/* Berry allocator entry point.
 * Small blocks (<= POOL32_SIZE bytes) are served from dedicated 16/32-byte
 * pools, larger ones from the system heap. On allocation failure a forced GC
 * is attempted once before throwing BE_MALLOC_FAIL.
 * Contract: (ptr == NULL || old_size == 0) means "new allocation";
 * new_size == 0 means "free"; vm->gc.usage is kept in sync by the caller's
 * declared sizes (old_size must match the size passed at allocation).
 * Fix: removed stale pre-rewrite fragments that redeclared `block`, called
 * _realloc() before the dispatch loop, and duplicated the failure path while
 * bypassing the pool allocator. */
BERRY_API void* be_realloc(bvm *vm, void *ptr, size_t old_size, size_t new_size)
{
    void *block = NULL;
    bbool gc_occured = bfalse; /* retry once after a forced GC on failure */
    if (old_size == new_size) { /* unchanged; also captures creation of a zero-byte object */
        return ptr;
    }
    /* from now on, block == NULL means allocation failure */
    while (1) {
        if (!ptr || (old_size == 0)) {
            /* Case 1: new allocation */
            block = malloc_from_pool(vm, new_size);
        } else if (new_size == 0) {
            /* Case 2: deallocate */
#if BE_USE_DEBUG_GC
            memset(ptr, 0xFF, old_size); /* fill the structure with invalid pointers */
#endif
            free_from_pool(vm, ptr, old_size);
            break; /* early exit */
        } else {
            /* Case 3: reallocate with a different size (both sizes non-zero) */
            if (new_size <= POOL32_SIZE || old_size <= POOL32_SIZE) {
                /* at least one side lives (or will live) in a pool */
                if (new_size <= POOL16_SIZE && old_size <= POOL16_SIZE) {
                    block = ptr; /* both fit the same 16-byte line, no move needed */
                    break;
                } else if (new_size > POOL16_SIZE && old_size > POOL16_SIZE &&
                           new_size <= POOL32_SIZE && old_size <= POOL32_SIZE) {
                    block = ptr; /* both fit the same 32-byte line, no move needed */
                    break;
                } else {
                    /* line class changes, or one side is out of pool: alloc + copy + free */
                    block = malloc_from_pool(vm, new_size);
                    if (block) {
                        size_t min_size = old_size < new_size ? old_size : new_size;
                        memmove(block, ptr, min_size);
                        free_from_pool(vm, ptr, old_size);
                    }
                }
            } else {
                block = realloc(ptr, new_size); /* both out of pools: plain realloc */
            }
        } /* end of reallocation */
        if (block) { break; } /* all good */
        if (gc_occured) { /* already tried GC, can't do much anymore */
            be_throw(vm, BE_MALLOC_FAIL); /* lack of heap space */
        }
        /* force a GC now, then loop to retry the allocation exactly once */
        vm->gc.status |= GC_ALLOC;
        be_gc_collect(vm);
        vm->gc.status &= ~GC_ALLOC;
        gc_occured = btrue; /* don't try GC again */
    }
    vm->gc.usage = vm->gc.usage + new_size - old_size; /* update allocated count */
    return block;
}
/* Optionally migrate a block to a memory region that requires 32-bit aligned
 * accesses (e.g. IRAM), freeing ordinary heap for 8-bit-accessible data.
 * Pool-managed blocks (<= POOL32_SIZE) are never moved so that a later
 * be_free()/free_from_pool() still finds them in their pool.
 * Returns the new pointer, or the original pointer if no move happened
 * (feature disabled, small block, or berry_malloc32 failure). */
BERRY_API void* be_move_to_aligned(bvm *vm, void *ptr, size_t size) {
    (void)vm; /* fix: vm is unused on every path; silence -Wunused-parameter */
#if BE_USE_MEM_ALIGNED
    if (size <= POOL32_SIZE) {
        return ptr; /* if in memory pool, don't move it so be_free() will continue to work */
    }
    void* iram = berry_malloc32(size);
    if (iram) {
        memcpy(iram, ptr, size);
        free(ptr); /* TODO vm->gc.usage still counts this block; accounting is now off */
        return iram;
    }
#endif
    return ptr;
}
/* Special allocator for structures of 32 bytes or less.
 * Two pool classes: 16-byte lines and 32-byte lines. Each pool is one
 * malloc'ed chunk carrying a 32-bit occupancy bitmap where a SET bit means
 * the line is FREE (so BITMAP_FULL == every line free), plus a link to the
 * next pool of the same class. */
typedef uint8_t mem16[16]; /* memory line of 16 bytes */
typedef uint8_t mem32[32]; /* memory line of 32 bytes */
#define POOL16_SLOTS 31
#define POOL16_BITMAP_FULL ((1UL<<POOL16_SLOTS)-1) /* 31 bits set to 1 - 0x7FFFFFFF - every line free */
#define POOL32_SLOTS 15
#define POOL32_BITMAP_FULL ((1UL<<POOL32_SLOTS)-1) /* 15 bits set to 1 - 0x7FFF - every line free */
typedef struct gc16_t {
uint32_t bitmap; /* bitmap of lines: bit set = line free, bit clear = line allocated */
struct gc16_t* next; /* next pool of the same class, NULL-terminated list */
mem16 lines[POOL16_SLOTS]; /* the actual storage handed out to callers */
} gc16_t;
typedef struct gc32_t {
uint32_t bitmap; /* bitmap of lines: bit set = line free, bit clear = line allocated */
struct gc32_t* next; /* next pool of the same class, NULL-terminated list */
mem32 lines[POOL32_SLOTS]; /* the actual storage handed out to callers */
} gc32_t;
/* Allocate `size` bytes for the VM.
 * - size == 0           -> NULL
 * - size <= POOL16_SIZE -> a 16-byte line from a gc16_t pool
 * - size <= POOL32_SIZE -> a 32-byte line from a gc32_t pool
 * - otherwise           -> plain system malloc()
 * Pool bitmaps use bit set = line free; when no pool has a free line a new
 * pool is pushed at the head of the list and its line 0 is handed out.
 * Returns NULL on out-of-memory (caller may force a GC and retry). */
static void* malloc_from_pool(bvm *vm, size_t size) {
if (size == 0) return NULL;
if (size <= POOL16_SIZE) {
/* allocate in pool 16 */
gc16_t* pool16 = vm->gc.pool16;
while (pool16) {
/* look for an empty slot */
if (pool16->bitmap != 0x0000) {
/* there is a free slot */
#ifdef __GNUC__
int bit = __builtin_ffs(pool16->bitmap) - 1; /* index of lowest set (free) bit */
#else
int bit = ffs(pool16->bitmap) - 1; /* NOTE(review): ffs() is POSIX (<strings.h>) - confirm on non-GNU toolchains */
#endif
if (bit >= 0) {
/* we found a free slot */
// bitClear(pool16->bitmap, bit);
pool16->bitmap &= ~(1UL << bit); /* clear the bit: line is now allocated */
// serial_debug("malloc_from_pool found slot in pool %p, bit %i, ptr=%p\n", pool16, bit, &pool16->lines[bit]);
return &pool16->lines[bit];
}
}
pool16 = pool16->next;
}
/* no slot available, we allocate a new pool */
pool16 = (gc16_t*) malloc(sizeof(gc16_t));
if (!pool16) { return NULL; } /* out of memory */
pool16->next = vm->gc.pool16;
pool16->bitmap = POOL16_BITMAP_FULL - 1; /* all lines free except line 0, handed out below */
vm->gc.pool16 = pool16; /* insert at head of linked list */
// serial_debug("malloc_from_pool allocated new pool %p, size=%i p=%p\n", pool16, sizeof(gc16_t), &pool16->lines[0]);
return &pool16->lines[0];
}
if (size <= POOL32_SIZE) {
/* allocate in pool 32 - same scheme as pool 16 with 32-byte lines */
gc32_t* pool32 = vm->gc.pool32;
while (pool32) {
/* look for an empty slot */
if (pool32->bitmap != 0x0000) {
/* there is a free slot */
#ifdef __GNUC__
int bit = __builtin_ffs(pool32->bitmap) - 1; /* index of lowest set (free) bit */
#else
int bit = ffs(pool32->bitmap) - 1; /* NOTE(review): ffs() is POSIX (<strings.h>) - confirm on non-GNU toolchains */
#endif
if (bit >= 0) {
/* we found a free slot */
// bitClear(pool32->bitmap, bit);
pool32->bitmap &= ~(1UL << bit); /* clear the bit: line is now allocated */
// serial_debug("malloc_from_pool found slot in pool %p, bit %i, ptr=%p\n", pool32, bit, &pool32->lines[bit]);
return &pool32->lines[bit];
}
}
pool32 = pool32->next;
}
/* no slot available, we allocate a new pool */
pool32 = (gc32_t*) malloc(sizeof(gc32_t));
if (!pool32) { return NULL; } /* out of memory */
pool32->next = vm->gc.pool32;
pool32->bitmap = POOL32_BITMAP_FULL - 1; /* all lines free except line 0, handed out below */
vm->gc.pool32 = pool32; /* insert at head of linked list */
// serial_debug("malloc_from_pool allocated new pool %p, size=%i p=%p\n", pool32, sizeof(gc32_t), &pool32->lines[0]);
return &pool32->lines[0];
}
return malloc(size); /* default to system malloc */
}
/* Return a block previously obtained from malloc_from_pool().
 * old_size selects the candidate pool class; blocks larger than POOL32_SIZE
 * were system-allocated and go straight back to free().
 * Freeing a pooled line just sets its bitmap bit back to 1 (bit set = free);
 * fully-free pools are reclaimed later by be_gc_memory_pools().
 * NOTE(review): if a small block is not found in any pool it is silently
 * dropped (leak) - by construction every <= POOL32_SIZE block comes from a
 * pool, but confirm against callers. */
static void free_from_pool(bvm *vm, void* ptr, size_t old_size) {
    if (old_size <= POOL16_SIZE) {
        gc16_t* pool16 = vm->gc.pool16;
        while (pool16) {
            int32_t offset = (uint8_t*)ptr - (uint8_t*) &pool16->lines[0];
            /* is ptr inside this pool and aligned on a 16-byte line start? */
            if ((offset >= 0) && (offset < POOL16_SLOTS*16) && ((offset & 0x0F) == 0)) {
                int bit = offset >> 4;
                pool16->bitmap |= 1UL << bit; /* set the bit: line is free again */
                return;
            }
            pool16 = pool16->next;
        }
    }
    else if (old_size <= POOL32_SIZE) {
        gc32_t* pool32 = vm->gc.pool32;
        while (pool32) {
            int32_t offset = (uint8_t*)ptr - (uint8_t*) &pool32->lines[0];
            /* fix: range check was POOL16_SLOTS*16 (496) instead of
             * POOL32_SLOTS*32 (480), accepting an offset one line past the
             * pool and setting bit 15 beyond the 15-slot bitmap */
            if ((offset >= 0) && (offset < POOL32_SLOTS*32) && ((offset & 0x1F) == 0)) {
                int bit = offset >> 5;
                pool32->bitmap |= 1UL << bit; /* set the bit: line is free again */
                return;
            }
            pool32 = pool32->next;
        }
    }
    else {
        free(ptr); /* not pool-managed: plain system free */
    }
}
/* Reclaim pools in which every line is free again (bitmap == *_BITMAP_FULL).
 * Called at the end of a GC cycle; pools with any live line are kept. */
BERRY_API void be_gc_memory_pools(bvm *vm) {
    gc16_t **link16 = &vm->gc.pool16;
    while (*link16 != NULL) {
        gc16_t *cur16 = *link16;
        if (cur16->bitmap == POOL16_BITMAP_FULL) {
            *link16 = cur16->next; /* unlink the empty pool, then release it */
            free(cur16);
        } else {
            link16 = &cur16->next; /* keep it and advance the link */
        }
    }
    gc32_t **link32 = &vm->gc.pool32;
    while (*link32 != NULL) {
        gc32_t *cur32 = *link32;
        if (cur32->bitmap == POOL32_BITMAP_FULL) {
            *link32 = cur32->next; /* unlink the empty pool, then release it */
            free(cur32);
        } else {
            link32 = &cur32->next; /* keep it and advance the link */
        }
    }
}
/* Reset the small-object pool lists; pools are created lazily by
 * malloc_from_pool() on first small allocation. */
BERRY_API void be_gc_init_memory_pools(bvm *vm) {
    vm->gc.pool32 = NULL;
    vm->gc.pool16 = NULL;
}
/* Release all memory pools unconditionally (called from be_vm_delete);
 * any line still marked allocated is freed along with its pool.
 * Fix: the list pointer was advanced a second time after be_os_free(),
 * which skipped every other pool (leak) and read ->next from a pointer that
 * could already be NULL (crash). Advance exactly once, before freeing. */
BERRY_API void be_gc_free_memory_pools(bvm *vm) {
    gc16_t* pool16 = vm->gc.pool16;
    while (pool16) {
        gc16_t* pool_to_free = pool16;
        pool16 = pool16->next; /* step forward before the node is freed */
        be_os_free(pool_to_free);
    }
    vm->gc.pool16 = NULL;
    gc32_t* pool32 = vm->gc.pool32;
    while (pool32) {
        gc32_t* pool_to_free = pool32;
        pool32 = pool32->next; /* step forward before the node is freed */
        be_os_free(pool_to_free);
    }
    vm->gc.pool32 = NULL;
}
/* Count the number of 1-bits in n (population count).
 * Kernighan's method: n & (n - 1) clears the lowest set bit, so the loop
 * body runs exactly once per set bit. Result is identical to the branch-free
 * SWAR reduction (see Hacker's Delight, "pop"). */
static int pop0(uint32_t n) {
    int count = 0;
    while (n != 0) {
        n &= n - 1; /* drop the lowest set bit */
        count++;
    }
    return count;
}
#ifdef __GNUC__
#define count_bits_1(v) __builtin_popcount(v)
#else
#define count_bits_1(v) pop0(v)
#endif
/* Report pool occupancy: total line slots allocated across all pools and how
 * many of them are currently in use (bitmap bit clear = in use).
 * Either output pointer may be NULL to skip that value. */
BERRY_API void be_gc_memory_pools_info(bvm *vm, size_t* slots_used, size_t* slots_allocated) {
    size_t in_use = 0;
    size_t total = 0;
    for (gc16_t *p16 = vm->gc.pool16; p16 != NULL; p16 = p16->next) {
        total += POOL16_SLOTS;
        in_use += POOL16_SLOTS - count_bits_1(p16->bitmap);
    }
    for (gc32_t *p32 = vm->gc.pool32; p32 != NULL; p32 = p32->next) {
        total += POOL32_SLOTS;
        in_use += POOL32_SLOTS - count_bits_1(p32->bitmap);
    }
    if (slots_used) { *slots_used = in_use; }
    if (slots_allocated) { *slots_allocated = total; }
}

View File

@ -21,6 +21,14 @@ BERRY_API void* be_os_malloc(size_t size);
BERRY_API void be_os_free(void *ptr);
BERRY_API void* be_os_realloc(void *ptr, size_t size);
BERRY_API void* be_realloc(bvm *vm, void *ptr, size_t old_size, size_t new_size);
BERRY_API void be_gc_memory_pools(bvm *vm);
BERRY_API void be_gc_free_memory_pools(bvm *vm);
BERRY_API void be_gc_init_memory_pools(bvm *vm);
BERRY_API void be_gc_memory_pools_info(bvm *vm, size_t* slots_used, size_t* slots_allocated);
/* The following moves a portion of memory to constrained regions with 32-bit read/write access */
/* Effective only if `BE_USE_MEM_ALIGNED` is set to `1`*/
BERRY_API void* be_move_to_aligned(bvm *vm, void *ptr, size_t size);
#ifdef __cplusplus
}

View File

@ -312,16 +312,23 @@ static void end_func(bparser *parser)
be_code_ret(finfo, NULL); /* append a return to last code */
end_block(parser); /* close block */
setupvals(finfo); /* close upvals */
proto->code = be_vector_release_32(vm, &finfo->code); /* compact all vectors and return NULL if empty */
proto->code = be_vector_release(vm, &finfo->code); /* compact all vectors and return NULL if empty */
proto->codesize = finfo->pc;
proto->ktab = be_vector_release_32(vm, &finfo->kvec);
proto->ktab = be_vector_release(vm, &finfo->kvec);
proto->nconst = be_vector_count(&finfo->kvec);
proto->ptab = be_vector_release(vm, &finfo->pvec);
proto->nproto = be_vector_count(&finfo->pvec);
#if BE_USE_MEM_ALIGNED
proto->code = be_move_to_aligned(vm, proto->code, proto->codesize * sizeof(binstruction)); /* move `code` to 4-bytes aligned memory region */
proto->ktab = be_move_to_aligned(vm, proto->ktab, proto->nconst * sizeof(bvalue)); /* move `ktab` to 4-bytes aligned memory region */
#endif /* BE_USE_MEM_ALIGNED */
#if BE_DEBUG_RUNTIME_INFO
proto->lineinfo = be_vector_release_32(vm, &finfo->linevec);
proto->lineinfo = be_vector_release(vm, &finfo->linevec); /* move `lineinfo` to 4-bytes aligned memory region */
proto->nlineinfo = be_vector_count(&finfo->linevec);
#endif
#if BE_USE_MEM_ALIGNED
proto->lineinfo = be_move_to_aligned(vm, proto->lineinfo, proto->nlineinfo * sizeof(blineinfo));
#endif /* BE_USE_MEM_ALIGNED */
#endif /* BE_DEBUG_RUNTIME_INFO */
#if BE_DEBUG_VAR_INFO
proto->varinfo = be_vector_release(vm, &finfo->varvec);
proto->nvarinfo = be_vector_count(&finfo->varvec);

View File

@ -114,32 +114,6 @@ void* be_vector_release(bvm *vm, bvector *vector)
return vector->data;
}
/* free not used */
/* Compact a vector to its used element count and, when possible, move the
 * data to a 32-bit-aligned-access region (IRAM) via berry_malloc32().
 * Returns the (possibly moved) data pointer, or NULL if the vector is empty.
 * NOTE(review): on a successful move the shrunken buffer is released with
 * plain free(), bypassing be_free(), so vm->gc.usage keeps counting it (the
 * commented-out adjustment below acknowledges this) - confirm intended. */
void* be_vector_release_32(bvm *vm, bvector *vector)
{
size_t size = vector->size;
int count = be_vector_count(vector);
if (count == 0) {
/* empty vector: release storage and reset all fields */
be_free(vm, vector->data, vector->capacity * size);
vector->capacity = 0;
vector->data = NULL;
vector->end = NULL;
} else if (count < vector->capacity) {
/* shrink to exact size first, then try to relocate into IRAM */
vector->data = be_realloc(vm,
vector->data, vector->capacity * size, count * size); // TODO - can we skip that step?
void* iram = berry_malloc32(count * size);
if (iram) {
memcpy(iram, vector->data, count * size); /* copy the compacted payload */
free(vector->data);
vector->data = iram;
}
// vm->gc.usage = vm->gc.usage + count * size - vector->capacity * size; /* update allocated count */
vector->end = (char*)vector->data + ((size_t)count - 1) * size; /* end points at the last element */
vector->capacity = count;
}
return vector->data;
}
/* use binary search to find the vector capacity between 0-1024 */
static int binary_search(int value)
{

View File

@ -491,6 +491,7 @@ BERRY_API void be_vm_delete(bvm *vm)
be_stack_delete(vm, &vm->tracestack);
be_free(vm, vm->stack, (vm->stacktop - vm->stack) * sizeof(bvalue));
be_globalvar_deinit(vm);
be_gc_free_memory_pools(vm);
#if BE_USE_DEBUG_HOOK
/* free native hook */
if (var_istype(&vm->hook, BE_COMPTR))

View File

@ -46,10 +46,14 @@ typedef struct {
int status;
} bcallframe;
struct gc16_t; /* memory pool for 0-16 bytes or less objects */
struct gc32_t; /* memory pool for 17-32 bytes */
struct bgc {
bgcobject *list; /* the GC-object list */
bgcobject *gray; /* the gray object list */
bgcobject *fixed; /* the fixed object list */
struct gc16_t* pool16;
struct gc32_t* pool32;
size_t usage; /* the count of bytes currently allocated */
size_t threshold; /* the threshold of allocation for the next GC */
bbyte steprate; /* the rate of increase in the distribution between two GCs (percentage) */
@ -80,7 +84,6 @@ struct bupval {
} u;
int refcnt;
};
struct bvm {
bglobaldesc gbldesc; /* global description */
bvalue *stack; /* stack space */

View File

@ -509,13 +509,7 @@ void *special_calloc(size_t num, size_t size) {
// Variants for IRAM heap, which need all accesses to be 32 bits aligned
void *special_malloc32(uint32_t size) {
return heap_caps_malloc(size, UsePSRAM() ? MALLOC_CAP_SPIRAM | MALLOC_CAP_8BIT : MALLOC_CAP_32BIT);
}
void *special_realloc32(void *ptr, size_t size) {
return heap_caps_realloc(ptr, size, UsePSRAM() ? MALLOC_CAP_SPIRAM | MALLOC_CAP_8BIT : MALLOC_CAP_32BIT);
}
void *special_calloc32(size_t num, size_t size) {
return heap_caps_calloc(num, size, UsePSRAM() ? MALLOC_CAP_SPIRAM | MALLOC_CAP_8BIT : MALLOC_CAP_32BIT);
return heap_caps_malloc(size, MALLOC_CAP_32BIT);
}
float CpuTemperature(void) {

View File

@ -177,11 +177,10 @@ extern "C" {
if (UsePSRAM()) {
be_map_insert_int(vm, "psram", ESP.getPsramSize() / 1024);
be_map_insert_int(vm, "psram_free", ESP.getFreePsram() / 1024);
} else {
// IRAM information
int32_t iram_free = (int32_t)heap_caps_get_free_size(MALLOC_CAP_32BIT) - (int32_t)heap_caps_get_free_size(MALLOC_CAP_8BIT);
be_map_insert_int(vm, "iram_free", iram_free / 1024);
}
// IRAM information
int32_t iram_free = (int32_t)heap_caps_get_free_size(MALLOC_CAP_32BIT) - (int32_t)heap_caps_get_free_size(MALLOC_CAP_8BIT);
be_map_insert_int(vm, "iram_free", iram_free / 1024);
be_pop(vm, 1);
be_return(vm);
}

View File

@ -89,21 +89,7 @@ extern "C" {
#ifdef USE_BERRY_IRAM
return special_malloc32(size);
#else
return special_malloc(size);
#endif
}
void *berry_realloc32(void *ptr, size_t size) {
#ifdef USE_BERRY_IRAM
return special_realloc32(ptr, size);
#else
return special_realloc(ptr, size);
#endif
}
void *berry_calloc32(size_t num, size_t size) {
#ifdef USE_BERRY_IRAM
return special_calloc32(num, size);
#else
return special_calloc(num, size);
return NULL; /* return NULL to indicate that IRAM is not enabled */
#endif
}
@ -234,8 +220,14 @@ void BerryObservability(bvm *vm, int event...) {
uint32_t gc_elapsed = millis() - gc_time;
uint32_t vm_scanned = va_arg(param, uint32_t);
uint32_t vm_freed = va_arg(param, uint32_t);
AddLog(LOG_LEVEL_DEBUG_MORE, D_LOG_BERRY "GC from %i to %i bytes, objects freed %i/%i (in %d ms)",
vm_usage, vm_usage2, vm_freed, vm_scanned, gc_elapsed);
size_t slots_used_before_gc = va_arg(param, size_t);
size_t slots_allocated_before_gc = va_arg(param, size_t);
size_t slots_used_after_gc = va_arg(param, size_t);
size_t slots_allocated_after_gc = va_arg(param, size_t);
AddLog(LOG_LEVEL_DEBUG_MORE, D_LOG_BERRY "GC from %i to %i bytes, objects freed %i/%i (in %d ms) - slots from %i/%i to %i/%i",
vm_usage, vm_usage2, vm_freed, vm_scanned, gc_elapsed,
slots_used_before_gc, slots_allocated_before_gc,
slots_used_after_gc, slots_allocated_after_gc);
// make new threshold tighter when we reach high memory usage
if (!UsePSRAM() && vm->gc.threshold > 20*1024) {
vm->gc.threshold = vm->gc.usage + 10*1024; // increase by only 10 KB