From 8402fa5ae9532f13e7c717bf5b70a1036cf906dc Mon Sep 17 00:00:00 2001 From: Lorenzooone Date: Wed, 18 Sep 2019 16:51:53 +0200 Subject: [PATCH 1/3] Improve buffer storage performance by 1/3 Also improves buffer loading performance by 1/5. --- src/c/vwf.c | 91 +++++++++++++++++++++++++++--------------- src/m2-vwf-entries.asm | 2 + 2 files changed, 61 insertions(+), 32 deletions(-) diff --git a/src/c/vwf.c b/src/c/vwf.c index ba4d742..28e199e 100644 --- a/src/c/vwf.c +++ b/src/c/vwf.c @@ -1696,43 +1696,70 @@ void print_blankstr_buffer(int x, int y, int width, byte *dest) void load_pixels_overworld_buffer() { byte* buffer = (byte*)(OVERWORLD_BUFFER - ((*tile_offset) * TILESET_OFFSET_BUFFER_MULTIPLIER)); - for(int y = 0; y < 0x10; y++) - for(int x = 0; x < 0x1C; x++) - { - int tile_buffer = get_tile_number_with_offset_buffer(x + 1, y + 1); - int tile = get_tile_number_with_offset(x + 1, y + 1); - int foregroundRow = 0xFFFFFFFF; - buffer[(tile_buffer * 8) + 0] = reduce_bit_depth(vram[(tile * 8) + 0], foregroundRow); - buffer[(tile_buffer * 8) + 1] = reduce_bit_depth(vram[(tile * 8) + 1], foregroundRow); - buffer[(tile_buffer * 8) + 2] = reduce_bit_depth(vram[(tile * 8) + 2], foregroundRow); - buffer[(tile_buffer * 8) + 3] = reduce_bit_depth(vram[(tile * 8) + 3], foregroundRow); - buffer[(tile_buffer * 8) + 4] = reduce_bit_depth(vram[(tile * 8) + 4], foregroundRow); - buffer[(tile_buffer * 8) + 5] = reduce_bit_depth(vram[(tile * 8) + 5], foregroundRow); - buffer[(tile_buffer * 8) + 6] = reduce_bit_depth(vram[(tile * 8) + 6], foregroundRow); - buffer[(tile_buffer * 8) + 7] = reduce_bit_depth(vram[(tile * 8) + 7], foregroundRow); - } + for(int i = 0; i < 8 * 0x1C; i++) + { + //Doing this saves about 100k cycles during load + int tile = m2_coord_table[i] + *tile_offset; + int addedValue = (i >> 5) << 6; + int tile_buffer = (i & 0x1F) + addedValue + *tile_offset; + int foregroundRow = 0xFFFFFFFF; + buffer[(tile_buffer * 8) + 0] = reduce_bit_depth(vram[(tile * 8) + 0], foregroundRow); + buffer[(tile_buffer * 8) + 1] = reduce_bit_depth(vram[(tile * 8) + 1], foregroundRow); + buffer[(tile_buffer * 8) + 2] = reduce_bit_depth(vram[(tile * 8) + 2], foregroundRow); + buffer[(tile_buffer * 8) + 3] = reduce_bit_depth(vram[(tile * 8) + 3], foregroundRow); + buffer[(tile_buffer * 8) + 4] = reduce_bit_depth(vram[(tile * 8) + 4], foregroundRow); + buffer[(tile_buffer * 8) + 5] = reduce_bit_depth(vram[(tile * 8) + 5], foregroundRow); + buffer[(tile_buffer * 8) + 6] = reduce_bit_depth(vram[(tile * 8) + 6], foregroundRow); + buffer[(tile_buffer * 8) + 7] = reduce_bit_depth(vram[(tile * 8) + 7], foregroundRow); + tile_buffer += 0x20; + tile += 0x20; + buffer[(tile_buffer * 8) + 0] = reduce_bit_depth(vram[(tile * 8) + 0], foregroundRow); + buffer[(tile_buffer * 8) + 1] = reduce_bit_depth(vram[(tile * 8) + 1], foregroundRow); + buffer[(tile_buffer * 8) + 2] = reduce_bit_depth(vram[(tile * 8) + 2], foregroundRow); + buffer[(tile_buffer * 8) + 3] = reduce_bit_depth(vram[(tile * 8) + 3], foregroundRow); + buffer[(tile_buffer * 8) + 4] = reduce_bit_depth(vram[(tile * 8) + 4], foregroundRow); + buffer[(tile_buffer * 8) + 5] = reduce_bit_depth(vram[(tile * 8) + 5], foregroundRow); + buffer[(tile_buffer * 8) + 6] = reduce_bit_depth(vram[(tile * 8) + 6], foregroundRow); + buffer[(tile_buffer * 8) + 7] = reduce_bit_depth(vram[(tile * 8) + 7], foregroundRow); + } } void store_pixels_overworld_buffer(int totalYs) { byte* buffer = (byte*)(OVERWORLD_BUFFER - ((*tile_offset) * TILESET_OFFSET_BUFFER_MULTIPLIER)); - for(int y = 0; y < totalYs; y++) - for(int x = 0; x < 0x1C; x++) - { - int tile = get_tile_number_with_offset(x + 1, y + 1); - int tile_buffer = get_tile_number_with_offset_buffer(x + 1, y + 1); - int* bufferValues = (int*)(&buffer[(tile_buffer * 8)]); - unsigned int first_half = bufferValues[0]; - unsigned int second_half = bufferValues[1]; - vram[(tile * 8) + 0] = m2_bits_to_nybbles_fast[(first_half >> 0) & 0xFF]; - vram[(tile * 8) + 1] = m2_bits_to_nybbles_fast[(first_half >> 8) & 0xFF]; - vram[(tile * 8) + 2] = m2_bits_to_nybbles_fast[(first_half >> 0x10) & 0xFF]; - vram[(tile * 8) + 3] = m2_bits_to_nybbles_fast[(first_half >> 0x18) & 0xFF]; - vram[(tile * 8) + 4] = m2_bits_to_nybbles_fast[(second_half >> 0) & 0xFF]; - vram[(tile * 8) + 5] = m2_bits_to_nybbles_fast[(second_half >> 8) & 0xFF]; - vram[(tile * 8) + 6] = m2_bits_to_nybbles_fast[(second_half >> 0x10) & 0xFF]; - vram[(tile * 8) + 7] = m2_bits_to_nybbles_fast[(second_half >> 0x18) & 0xFF]; - } + totalYs >>= 1; + int total = totalYs * 0x1C; + for(int i = 0; i < total; i++) + { + //Not using functions for the tile values saves about 30k cycles on average + int tile = m2_coord_table[i] + *tile_offset; + int addedValue = (i >> 5) << 6; + int tile_buffer = (i & 0x1F) + addedValue + *tile_offset; + int* bufferValues = (int*)(&buffer[(tile_buffer * 8)]); + unsigned int first_half = bufferValues[0]; + unsigned int second_half = bufferValues[1]; + vram[(tile * 8) + 0] = m2_bits_to_nybbles_fast[(first_half >> 0) & 0xFF]; + vram[(tile * 8) + 1] = m2_bits_to_nybbles_fast[(first_half >> 8) & 0xFF]; + vram[(tile * 8) + 2] = m2_bits_to_nybbles_fast[(first_half >> 0x10) & 0xFF]; + vram[(tile * 8) + 3] = m2_bits_to_nybbles_fast[(first_half >> 0x18) & 0xFF]; + vram[(tile * 8) + 4] = m2_bits_to_nybbles_fast[(second_half >> 0) & 0xFF]; + vram[(tile * 8) + 5] = m2_bits_to_nybbles_fast[(second_half >> 8) & 0xFF]; + vram[(tile * 8) + 6] = m2_bits_to_nybbles_fast[(second_half >> 0x10) & 0xFF]; + vram[(tile * 8) + 7] = m2_bits_to_nybbles_fast[(second_half >> 0x18) & 0xFF]; + //Do the tile right below (Saves about 50k cycles on average) + tile += 0x20; + bufferValues += 0x40; + first_half = bufferValues[0]; + second_half = bufferValues[1]; + vram[(tile * 8) + 0] = m2_bits_to_nybbles_fast[(first_half >> 0) & 0xFF]; + vram[(tile * 8) + 1] = m2_bits_to_nybbles_fast[(first_half >> 8) & 0xFF]; + vram[(tile * 8) + 2] = m2_bits_to_nybbles_fast[(first_half >> 0x10) & 0xFF]; + vram[(tile * 8) + 3] = m2_bits_to_nybbles_fast[(first_half >> 0x18) & 0xFF]; + vram[(tile * 8) + 4] = m2_bits_to_nybbles_fast[(second_half >> 0) & 0xFF]; + vram[(tile * 8) + 5] = m2_bits_to_nybbles_fast[(second_half >> 8) & 0xFF]; + vram[(tile * 8) + 6] = m2_bits_to_nybbles_fast[(second_half >> 0x10) & 0xFF]; + vram[(tile * 8) + 7] = m2_bits_to_nybbles_fast[(second_half >> 0x18) & 0xFF]; + } } // x,y: tile coordinates diff --git a/src/m2-vwf-entries.asm b/src/m2-vwf-entries.asm index e2e1b2b..4603013 100644 --- a/src/m2-vwf-entries.asm +++ b/src/m2-vwf-entries.asm @@ -2977,6 +2977,7 @@ pop {r0-r3,pc} //Stores the buffer into the vram. This avoids screen tearing. store_pixels_overworld: push {r0-r3,lr} +swi #5 //The improved performances allow using a VBlank before the storage in order to prevent screen tearing effectively mov r0,#0x10 bl store_pixels_overworld_buffer pop {r0-r3,pc} @@ -2985,6 +2986,7 @@ pop {r0-r3,pc} //Stores the buffer into the vram. This avoids screen tearing. store_pixels_overworld_psi_window: push {r0-r3,lr} +swi #5 //The improved performances allow using a VBlank before the storage in order to prevent screen tearing effectively mov r0,#0xA bl store_pixels_overworld_buffer pop {r0-r3,pc} From b4f6f78981e3af9866ec299fd05a56ddb87d8653 Mon Sep 17 00:00:00 2001 From: Lorenzooone Date: Wed, 18 Sep 2019 16:59:50 +0200 Subject: [PATCH 2/3] Improve indentation --- src/c/vwf.c | 20 ++++++++++---------- 1 file changed, 10 insertions(+), 10 deletions(-) diff --git a/src/c/vwf.c b/src/c/vwf.c index 28e199e..8a4728c 100644 --- a/src/c/vwf.c +++ b/src/c/vwf.c @@ -1698,9 +1698,9 @@ void load_pixels_overworld_buffer() byte* buffer = (byte*)(OVERWORLD_BUFFER - ((*tile_offset) * TILESET_OFFSET_BUFFER_MULTIPLIER)); for(int i = 0; i < 8 * 0x1C; i++) { - //Doing this saves about 100k cycles during load + //Doing this saves about 100k cycles during load int tile = m2_coord_table[i] + *tile_offset; - int addedValue = (i >> 5) << 6; + int addedValue = (i >> 5) << 6; int tile_buffer = (i & 0x1F) + addedValue + *tile_offset; int foregroundRow = 0xFFFFFFFF; buffer[(tile_buffer * 8) + 0] = reduce_bit_depth(vram[(tile * 8) + 0], foregroundRow); @@ -1711,8 +1711,8 @@ void load_pixels_overworld_buffer() buffer[(tile_buffer * 8) + 5] = reduce_bit_depth(vram[(tile * 8) + 5], foregroundRow); buffer[(tile_buffer * 8) + 6] = reduce_bit_depth(vram[(tile * 8) + 6], foregroundRow); buffer[(tile_buffer * 8) + 7] = reduce_bit_depth(vram[(tile * 8) + 7], foregroundRow); - tile_buffer += 0x20; - tile += 0x20; + tile_buffer += 0x20; + tile += 0x20; buffer[(tile_buffer * 8) + 0] = reduce_bit_depth(vram[(tile * 8) + 0], foregroundRow); buffer[(tile_buffer * 8) + 1] = reduce_bit_depth(vram[(tile * 8) + 1], foregroundRow); buffer[(tile_buffer * 8) + 2] = reduce_bit_depth(vram[(tile * 8) + 2], foregroundRow); @@ -1727,13 +1727,13 @@ void load_pixels_overworld_buffer() void store_pixels_overworld_buffer(int totalYs) { byte* buffer = (byte*)(OVERWORLD_BUFFER - ((*tile_offset) * TILESET_OFFSET_BUFFER_MULTIPLIER)); - totalYs >>= 1; - int total = totalYs * 0x1C; + totalYs >>= 1; + int total = totalYs * 0x1C; for(int i = 0; i < total; i++) - { - //Not using functions for the tile values saves about 30k cycles on average + { + //Not using functions for the tile values saves about 30k cycles on average int tile = m2_coord_table[i] + *tile_offset; - int addedValue = (i >> 5) << 6; + int addedValue = (i >> 5) << 6; int tile_buffer = (i & 0x1F) + addedValue + *tile_offset; int* bufferValues = (int*)(&buffer[(tile_buffer * 8)]); unsigned int first_half = bufferValues[0]; @@ -1746,7 +1746,7 @@ void store_pixels_overworld_buffer(int totalYs) vram[(tile * 8) + 5] = m2_bits_to_nybbles_fast[(second_half >> 8) & 0xFF]; vram[(tile * 8) + 6] = m2_bits_to_nybbles_fast[(second_half >> 0x10) & 0xFF]; vram[(tile * 8) + 7] = m2_bits_to_nybbles_fast[(second_half >> 0x18) & 0xFF]; - //Do the tile right below (Saves about 50k cycles on average) + //Do the tile right below (Saves about 50k cycles on average) tile += 0x20; bufferValues += 0x40; first_half = bufferValues[0]; From c8e592ad66b99d5a7ca28cd87f86972690de32ff Mon Sep 17 00:00:00 2001 From: Lorenzooone Date: Wed, 18 Sep 2019 22:56:27 +0200 Subject: [PATCH 3/3] Improve buffer loading again Makes it 3/5 the initial speed --- src/c/vwf.c | 42 +++++++++++++++++++++++++----------------- 1 file changed, 25 insertions(+), 17 deletions(-) diff --git a/src/c/vwf.c b/src/c/vwf.c index 8a4728c..4a35c73 100644 --- a/src/c/vwf.c +++ b/src/c/vwf.c @@ -1703,24 +1703,32 @@ void load_pixels_overworld_buffer() int addedValue = (i >> 5) << 6; int tile_buffer = (i & 0x1F) + addedValue + *tile_offset; int foregroundRow = 0xFFFFFFFF; - buffer[(tile_buffer * 8) + 0] = reduce_bit_depth(vram[(tile * 8) + 0], foregroundRow); - buffer[(tile_buffer * 8) + 1] = reduce_bit_depth(vram[(tile * 8) + 1], foregroundRow); - buffer[(tile_buffer * 8) + 2] = reduce_bit_depth(vram[(tile * 8) + 2], foregroundRow); - buffer[(tile_buffer * 8) + 3] = reduce_bit_depth(vram[(tile * 8) + 3], foregroundRow); - buffer[(tile_buffer * 8) + 4] = reduce_bit_depth(vram[(tile * 8) + 4], foregroundRow); - buffer[(tile_buffer * 8) + 5] = reduce_bit_depth(vram[(tile * 8) + 5], foregroundRow); - buffer[(tile_buffer * 8) + 6] = reduce_bit_depth(vram[(tile * 8) + 6], foregroundRow); - buffer[(tile_buffer * 8) + 7] = reduce_bit_depth(vram[(tile * 8) + 7], foregroundRow); - tile_buffer += 0x20; + //Reduce total amount of stores from 16 to 4 + int* bufferValues = (int*)(&buffer[(tile_buffer * 8)]); + unsigned int first_half; + unsigned int second_half; + first_half = reduce_bit_depth(vram[(tile * 8) + 0], foregroundRow); + first_half |= reduce_bit_depth(vram[(tile * 8) + 1], foregroundRow) << 8; + first_half |= reduce_bit_depth(vram[(tile * 8) + 2], foregroundRow) << 0x10; + first_half |= reduce_bit_depth(vram[(tile * 8) + 3], foregroundRow) << 0x18; + second_half = reduce_bit_depth(vram[(tile * 8) + 4], foregroundRow); + second_half |= reduce_bit_depth(vram[(tile * 8) + 5], foregroundRow) << 8; + second_half |= reduce_bit_depth(vram[(tile * 8) + 6], foregroundRow) << 0x10; + second_half |= reduce_bit_depth(vram[(tile * 8) + 7], foregroundRow) << 0x18; + bufferValues[0] = first_half; + bufferValues[1] = second_half; + bufferValues += 0x40; tile += 0x20; - buffer[(tile_buffer * 8) + 0] = reduce_bit_depth(vram[(tile * 8) + 0], foregroundRow); - buffer[(tile_buffer * 8) + 1] = reduce_bit_depth(vram[(tile * 8) + 1], foregroundRow); - buffer[(tile_buffer * 8) + 2] = reduce_bit_depth(vram[(tile * 8) + 2], foregroundRow); - buffer[(tile_buffer * 8) + 3] = reduce_bit_depth(vram[(tile * 8) + 3], foregroundRow); - buffer[(tile_buffer * 8) + 4] = reduce_bit_depth(vram[(tile * 8) + 4], foregroundRow); - buffer[(tile_buffer * 8) + 5] = reduce_bit_depth(vram[(tile * 8) + 5], foregroundRow); - buffer[(tile_buffer * 8) + 6] = reduce_bit_depth(vram[(tile * 8) + 6], foregroundRow); - buffer[(tile_buffer * 8) + 7] = reduce_bit_depth(vram[(tile * 8) + 7], foregroundRow); + first_half = reduce_bit_depth(vram[(tile * 8) + 0], foregroundRow); + first_half |= reduce_bit_depth(vram[(tile * 8) + 1], foregroundRow) << 8; + first_half |= reduce_bit_depth(vram[(tile * 8) + 2], foregroundRow) << 0x10; + first_half |= reduce_bit_depth(vram[(tile * 8) + 3], foregroundRow) << 0x18; + second_half = reduce_bit_depth(vram[(tile * 8) + 4], foregroundRow); + second_half |= reduce_bit_depth(vram[(tile * 8) + 5], foregroundRow) << 8; + second_half |= reduce_bit_depth(vram[(tile * 8) + 6], foregroundRow) << 0x10; + second_half |= reduce_bit_depth(vram[(tile * 8) + 7], foregroundRow) << 0x18; + bufferValues[0] = first_half; + bufferValues[1] = second_half; } }