Merge pull request #78 from Lorenzooone/window-text-buffering

Improve 1bpp buffer performance
2019-09-18 17:58:33 -04:00 · 2019-09-18 17:58:33 -04:00 · a3f86fcf02
parent e7ef55de93 c8e592ad66
commit a3f86fcf02
2 changed files with 69 additions and 32 deletions
--- a/src/c/vwf.c
+++ b/src/c/vwf.c
@ -1696,43 +1696,78 @@ void print_blankstr_buffer(int x, int y, int width, byte *dest)
 void load_pixels_overworld_buffer()
 {
    byte* buffer = (byte*)(OVERWORLD_BUFFER - ((*tile_offset) * TILESET_OFFSET_BUFFER_MULTIPLIER));
-    for(int y = 0; y < 0x10; y++)
-        for(int x = 0; x < 0x1C; x++)
-        {
-            int tile_buffer = get_tile_number_with_offset_buffer(x + 1, y + 1);
-            int tile = get_tile_number_with_offset(x + 1, y + 1);
-            int foregroundRow = 0xFFFFFFFF;
-            buffer[(tile_buffer * 8) + 0] = reduce_bit_depth(vram[(tile * 8) + 0], foregroundRow);
-            buffer[(tile_buffer * 8) + 1] = reduce_bit_depth(vram[(tile * 8) + 1], foregroundRow);
-            buffer[(tile_buffer * 8) + 2] = reduce_bit_depth(vram[(tile * 8) + 2], foregroundRow);
-            buffer[(tile_buffer * 8) + 3] = reduce_bit_depth(vram[(tile * 8) + 3], foregroundRow);
-            buffer[(tile_buffer * 8) + 4] = reduce_bit_depth(vram[(tile * 8) + 4], foregroundRow);
-            buffer[(tile_buffer * 8) + 5] = reduce_bit_depth(vram[(tile * 8) + 5], foregroundRow);
-            buffer[(tile_buffer * 8) + 6] = reduce_bit_depth(vram[(tile * 8) + 6], foregroundRow);
-            buffer[(tile_buffer * 8) + 7] = reduce_bit_depth(vram[(tile * 8) + 7], foregroundRow);
-        }
+    for(int i = 0; i < 8 * 0x1C; i++)
+    {
+        //Doing this saves about 100k cycles during load
+        int tile = m2_coord_table[i] + *tile_offset;
+        int addedValue = (i >> 5) << 6;
+        int tile_buffer = (i & 0x1F) + addedValue + *tile_offset;
+        int foregroundRow = 0xFFFFFFFF;
+        //Reduce total amount of stores from 16 to 4
+        int* bufferValues = (int*)(&buffer[(tile_buffer * 8)]);
+        unsigned int first_half;
+        unsigned int second_half;
+        first_half = reduce_bit_depth(vram[(tile * 8) + 0], foregroundRow);
+        first_half |= reduce_bit_depth(vram[(tile * 8) + 1], foregroundRow) << 8;
+        first_half |= reduce_bit_depth(vram[(tile * 8) + 2], foregroundRow) << 0x10;
+        first_half |= reduce_bit_depth(vram[(tile * 8) + 3], foregroundRow) << 0x18;
+        second_half = reduce_bit_depth(vram[(tile * 8) + 4], foregroundRow);
+        second_half |= reduce_bit_depth(vram[(tile * 8) + 5], foregroundRow) << 8;
+        second_half |= reduce_bit_depth(vram[(tile * 8) + 6], foregroundRow) << 0x10;
+        second_half |= reduce_bit_depth(vram[(tile * 8) + 7], foregroundRow) << 0x18;
+        bufferValues[0] = first_half;
+        bufferValues[1] = second_half;
+        bufferValues += 0x40;
+        tile += 0x20;
+        first_half = reduce_bit_depth(vram[(tile * 8) + 0], foregroundRow);
+        first_half |= reduce_bit_depth(vram[(tile * 8) + 1], foregroundRow) << 8;
+        first_half |= reduce_bit_depth(vram[(tile * 8) + 2], foregroundRow) << 0x10;
+        first_half |= reduce_bit_depth(vram[(tile * 8) + 3], foregroundRow) << 0x18;
+        second_half = reduce_bit_depth(vram[(tile * 8) + 4], foregroundRow);
+        second_half |= reduce_bit_depth(vram[(tile * 8) + 5], foregroundRow) << 8;
+        second_half |= reduce_bit_depth(vram[(tile * 8) + 6], foregroundRow) << 0x10;
+        second_half |= reduce_bit_depth(vram[(tile * 8) + 7], foregroundRow) << 0x18;
+        bufferValues[0] = first_half;
+        bufferValues[1] = second_half;
+    }
 }

 void store_pixels_overworld_buffer(int totalYs)
 {
    byte* buffer = (byte*)(OVERWORLD_BUFFER - ((*tile_offset) * TILESET_OFFSET_BUFFER_MULTIPLIER));
-    for(int y = 0; y < totalYs; y++)
-        for(int x = 0; x < 0x1C; x++)
-        {
-            int tile = get_tile_number_with_offset(x + 1, y + 1);
-            int tile_buffer = get_tile_number_with_offset_buffer(x + 1, y + 1);
-            int* bufferValues = (int*)(&buffer[(tile_buffer * 8)]);
-            unsigned int first_half = bufferValues[0];
-            unsigned int second_half = bufferValues[1];
-            vram[(tile * 8) + 0] = m2_bits_to_nybbles_fast[(first_half >> 0) & 0xFF];
-            vram[(tile * 8) + 1] = m2_bits_to_nybbles_fast[(first_half >> 8) & 0xFF];
-            vram[(tile * 8) + 2] = m2_bits_to_nybbles_fast[(first_half >> 0x10) & 0xFF];
-            vram[(tile * 8) + 3] = m2_bits_to_nybbles_fast[(first_half >> 0x18) & 0xFF];
-            vram[(tile * 8) + 4] = m2_bits_to_nybbles_fast[(second_half >> 0) & 0xFF];
-            vram[(tile * 8) + 5] = m2_bits_to_nybbles_fast[(second_half >> 8) & 0xFF];
-            vram[(tile * 8) + 6] = m2_bits_to_nybbles_fast[(second_half >> 0x10) & 0xFF];
-            vram[(tile * 8) + 7] = m2_bits_to_nybbles_fast[(second_half >> 0x18) & 0xFF];
-        }
+    totalYs >>= 1;
+    int total = totalYs * 0x1C;
+    for(int i = 0; i < total; i++)
+    {
+        //Not using functions for the tile values saves about 30k cycles on average
+        int tile = m2_coord_table[i] + *tile_offset;
+        int addedValue = (i >> 5) << 6;
+        int tile_buffer = (i & 0x1F) + addedValue + *tile_offset;
+        int* bufferValues = (int*)(&buffer[(tile_buffer * 8)]);
+        unsigned int first_half = bufferValues[0];
+        unsigned int second_half = bufferValues[1];
+        vram[(tile * 8) + 0] = m2_bits_to_nybbles_fast[(first_half >> 0) & 0xFF];
+        vram[(tile * 8) + 1] = m2_bits_to_nybbles_fast[(first_half >> 8) & 0xFF];
+        vram[(tile * 8) + 2] = m2_bits_to_nybbles_fast[(first_half >> 0x10) & 0xFF];
+        vram[(tile * 8) + 3] = m2_bits_to_nybbles_fast[(first_half >> 0x18) & 0xFF];
+        vram[(tile * 8) + 4] = m2_bits_to_nybbles_fast[(second_half >> 0) & 0xFF];
+        vram[(tile * 8) + 5] = m2_bits_to_nybbles_fast[(second_half >> 8) & 0xFF];
+        vram[(tile * 8) + 6] = m2_bits_to_nybbles_fast[(second_half >> 0x10) & 0xFF];
+        vram[(tile * 8) + 7] = m2_bits_to_nybbles_fast[(second_half >> 0x18) & 0xFF];
+        //Do the tile right below (Saves about 50k cycles on average)
+        tile += 0x20;
+        bufferValues += 0x40;
+        first_half = bufferValues[0];
+        second_half = bufferValues[1];
+        vram[(tile * 8) + 0] = m2_bits_to_nybbles_fast[(first_half >> 0) & 0xFF];
+        vram[(tile * 8) + 1] = m2_bits_to_nybbles_fast[(first_half >> 8) & 0xFF];
+        vram[(tile * 8) + 2] = m2_bits_to_nybbles_fast[(first_half >> 0x10) & 0xFF];
+        vram[(tile * 8) + 3] = m2_bits_to_nybbles_fast[(first_half >> 0x18) & 0xFF];
+        vram[(tile * 8) + 4] = m2_bits_to_nybbles_fast[(second_half >> 0) & 0xFF];
+        vram[(tile * 8) + 5] = m2_bits_to_nybbles_fast[(second_half >> 8) & 0xFF];
+        vram[(tile * 8) + 6] = m2_bits_to_nybbles_fast[(second_half >> 0x10) & 0xFF];
+        vram[(tile * 8) + 7] = m2_bits_to_nybbles_fast[(second_half >> 0x18) & 0xFF];
+    }
 }

 // x,y: tile coordinates
--- a/src/m2-vwf-entries.asm
+++ b/src/m2-vwf-entries.asm
@ -2977,6 +2977,7 @@ pop     {r0-r3,pc}
 //Stores the buffer into the vram. This avoids screen tearing.
 store_pixels_overworld:
 push    {r0-r3,lr}
+swi #5 //The improved performance makes it possible to wait for VBlank before the store, which effectively prevents screen tearing
 mov     r0,#0x10
 bl      store_pixels_overworld_buffer
 pop     {r0-r3,pc}
@ -2985,6 +2986,7 @@ pop     {r0-r3,pc}
 //Stores the buffer into the vram. This avoids screen tearing.
 store_pixels_overworld_psi_window:
 push    {r0-r3,lr}
+swi #5 //The improved performance makes it possible to wait for VBlank before the store, which effectively prevents screen tearing
 mov     r0,#0xA
 bl      store_pixels_overworld_buffer
 pop     {r0-r3,pc}