Improve buffer storage performance by 1/3
Also improves buffer loading performance by 1/5.
This commit is contained in:
parent
2f2970aae0
commit
8402fa5ae9
43
src/c/vwf.c
43
src/c/vwf.c
|
@ -1696,11 +1696,12 @@ void print_blankstr_buffer(int x, int y, int width, byte *dest)
|
||||||
void load_pixels_overworld_buffer()
|
void load_pixels_overworld_buffer()
|
||||||
{
|
{
|
||||||
byte* buffer = (byte*)(OVERWORLD_BUFFER - ((*tile_offset) * TILESET_OFFSET_BUFFER_MULTIPLIER));
|
byte* buffer = (byte*)(OVERWORLD_BUFFER - ((*tile_offset) * TILESET_OFFSET_BUFFER_MULTIPLIER));
|
||||||
for(int y = 0; y < 0x10; y++)
|
for(int i = 0; i < 8 * 0x1C; i++)
|
||||||
for(int x = 0; x < 0x1C; x++)
|
|
||||||
{
|
{
|
||||||
int tile_buffer = get_tile_number_with_offset_buffer(x + 1, y + 1);
|
//Computing the tile numbers inline instead of calling the helper functions saves about 100k cycles during load
|
||||||
int tile = get_tile_number_with_offset(x + 1, y + 1);
|
int tile = m2_coord_table[i] + *tile_offset;
|
||||||
|
int addedValue = (i >> 5) << 6;
|
||||||
|
int tile_buffer = (i & 0x1F) + addedValue + *tile_offset;
|
||||||
int foregroundRow = 0xFFFFFFFF;
|
int foregroundRow = 0xFFFFFFFF;
|
||||||
buffer[(tile_buffer * 8) + 0] = reduce_bit_depth(vram[(tile * 8) + 0], foregroundRow);
|
buffer[(tile_buffer * 8) + 0] = reduce_bit_depth(vram[(tile * 8) + 0], foregroundRow);
|
||||||
buffer[(tile_buffer * 8) + 1] = reduce_bit_depth(vram[(tile * 8) + 1], foregroundRow);
|
buffer[(tile_buffer * 8) + 1] = reduce_bit_depth(vram[(tile * 8) + 1], foregroundRow);
|
||||||
|
@ -1709,6 +1710,16 @@ void load_pixels_overworld_buffer()
|
||||||
buffer[(tile_buffer * 8) + 4] = reduce_bit_depth(vram[(tile * 8) + 4], foregroundRow);
|
buffer[(tile_buffer * 8) + 4] = reduce_bit_depth(vram[(tile * 8) + 4], foregroundRow);
|
||||||
buffer[(tile_buffer * 8) + 5] = reduce_bit_depth(vram[(tile * 8) + 5], foregroundRow);
|
buffer[(tile_buffer * 8) + 5] = reduce_bit_depth(vram[(tile * 8) + 5], foregroundRow);
|
||||||
buffer[(tile_buffer * 8) + 6] = reduce_bit_depth(vram[(tile * 8) + 6], foregroundRow);
|
buffer[(tile_buffer * 8) + 6] = reduce_bit_depth(vram[(tile * 8) + 6], foregroundRow);
|
||||||
|
buffer[(tile_buffer * 8) + 7] = reduce_bit_depth(vram[(tile * 8) + 7], foregroundRow);
|
||||||
|
tile_buffer += 0x20;
|
||||||
|
tile += 0x20;
|
||||||
|
buffer[(tile_buffer * 8) + 0] = reduce_bit_depth(vram[(tile * 8) + 0], foregroundRow);
|
||||||
|
buffer[(tile_buffer * 8) + 1] = reduce_bit_depth(vram[(tile * 8) + 1], foregroundRow);
|
||||||
|
buffer[(tile_buffer * 8) + 2] = reduce_bit_depth(vram[(tile * 8) + 2], foregroundRow);
|
||||||
|
buffer[(tile_buffer * 8) + 3] = reduce_bit_depth(vram[(tile * 8) + 3], foregroundRow);
|
||||||
|
buffer[(tile_buffer * 8) + 4] = reduce_bit_depth(vram[(tile * 8) + 4], foregroundRow);
|
||||||
|
buffer[(tile_buffer * 8) + 5] = reduce_bit_depth(vram[(tile * 8) + 5], foregroundRow);
|
||||||
|
buffer[(tile_buffer * 8) + 6] = reduce_bit_depth(vram[(tile * 8) + 6], foregroundRow);
|
||||||
buffer[(tile_buffer * 8) + 7] = reduce_bit_depth(vram[(tile * 8) + 7], foregroundRow);
|
buffer[(tile_buffer * 8) + 7] = reduce_bit_depth(vram[(tile * 8) + 7], foregroundRow);
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
@ -1716,11 +1727,14 @@ void load_pixels_overworld_buffer()
|
||||||
void store_pixels_overworld_buffer(int totalYs)
|
void store_pixels_overworld_buffer(int totalYs)
|
||||||
{
|
{
|
||||||
byte* buffer = (byte*)(OVERWORLD_BUFFER - ((*tile_offset) * TILESET_OFFSET_BUFFER_MULTIPLIER));
|
byte* buffer = (byte*)(OVERWORLD_BUFFER - ((*tile_offset) * TILESET_OFFSET_BUFFER_MULTIPLIER));
|
||||||
for(int y = 0; y < totalYs; y++)
|
totalYs >>= 1;
|
||||||
for(int x = 0; x < 0x1C; x++)
|
int total = totalYs * 0x1C;
|
||||||
|
for(int i = 0; i < total; i++)
|
||||||
{
|
{
|
||||||
int tile = get_tile_number_with_offset(x + 1, y + 1);
|
//Not using functions for the tile values saves about 30k cycles on average
|
||||||
int tile_buffer = get_tile_number_with_offset_buffer(x + 1, y + 1);
|
int tile = m2_coord_table[i] + *tile_offset;
|
||||||
|
int addedValue = (i >> 5) << 6;
|
||||||
|
int tile_buffer = (i & 0x1F) + addedValue + *tile_offset;
|
||||||
int* bufferValues = (int*)(&buffer[(tile_buffer * 8)]);
|
int* bufferValues = (int*)(&buffer[(tile_buffer * 8)]);
|
||||||
unsigned int first_half = bufferValues[0];
|
unsigned int first_half = bufferValues[0];
|
||||||
unsigned int second_half = bufferValues[1];
|
unsigned int second_half = bufferValues[1];
|
||||||
|
@ -1731,6 +1745,19 @@ void store_pixels_overworld_buffer(int totalYs)
|
||||||
vram[(tile * 8) + 4] = m2_bits_to_nybbles_fast[(second_half >> 0) & 0xFF];
|
vram[(tile * 8) + 4] = m2_bits_to_nybbles_fast[(second_half >> 0) & 0xFF];
|
||||||
vram[(tile * 8) + 5] = m2_bits_to_nybbles_fast[(second_half >> 8) & 0xFF];
|
vram[(tile * 8) + 5] = m2_bits_to_nybbles_fast[(second_half >> 8) & 0xFF];
|
||||||
vram[(tile * 8) + 6] = m2_bits_to_nybbles_fast[(second_half >> 0x10) & 0xFF];
|
vram[(tile * 8) + 6] = m2_bits_to_nybbles_fast[(second_half >> 0x10) & 0xFF];
|
||||||
|
vram[(tile * 8) + 7] = m2_bits_to_nybbles_fast[(second_half >> 0x18) & 0xFF];
|
||||||
|
//Do the tile right below (Saves about 50k cycles on average)
|
||||||
|
tile += 0x20;
|
||||||
|
bufferValues += 0x40;
|
||||||
|
first_half = bufferValues[0];
|
||||||
|
second_half = bufferValues[1];
|
||||||
|
vram[(tile * 8) + 0] = m2_bits_to_nybbles_fast[(first_half >> 0) & 0xFF];
|
||||||
|
vram[(tile * 8) + 1] = m2_bits_to_nybbles_fast[(first_half >> 8) & 0xFF];
|
||||||
|
vram[(tile * 8) + 2] = m2_bits_to_nybbles_fast[(first_half >> 0x10) & 0xFF];
|
||||||
|
vram[(tile * 8) + 3] = m2_bits_to_nybbles_fast[(first_half >> 0x18) & 0xFF];
|
||||||
|
vram[(tile * 8) + 4] = m2_bits_to_nybbles_fast[(second_half >> 0) & 0xFF];
|
||||||
|
vram[(tile * 8) + 5] = m2_bits_to_nybbles_fast[(second_half >> 8) & 0xFF];
|
||||||
|
vram[(tile * 8) + 6] = m2_bits_to_nybbles_fast[(second_half >> 0x10) & 0xFF];
|
||||||
vram[(tile * 8) + 7] = m2_bits_to_nybbles_fast[(second_half >> 0x18) & 0xFF];
|
vram[(tile * 8) + 7] = m2_bits_to_nybbles_fast[(second_half >> 0x18) & 0xFF];
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
|
@ -2977,6 +2977,7 @@ pop {r0-r3,pc}
|
||||||
//Stores the buffer into the vram. This avoids screen tearing.
|
//Stores the buffer into the vram. This avoids screen tearing.
|
||||||
store_pixels_overworld:
|
store_pixels_overworld:
|
||||||
push {r0-r3,lr}
|
push {r0-r3,lr}
|
||||||
|
swi #5 //The improved performance allows waiting for VBlank before the storage, which effectively prevents screen tearing
|
||||||
mov r0,#0x10
|
mov r0,#0x10
|
||||||
bl store_pixels_overworld_buffer
|
bl store_pixels_overworld_buffer
|
||||||
pop {r0-r3,pc}
|
pop {r0-r3,pc}
|
||||||
|
@ -2985,6 +2986,7 @@ pop {r0-r3,pc}
|
||||||
//Stores the buffer into the vram. This avoids screen tearing.
|
//Stores the buffer into the vram. This avoids screen tearing.
|
||||||
store_pixels_overworld_psi_window:
|
store_pixels_overworld_psi_window:
|
||||||
push {r0-r3,lr}
|
push {r0-r3,lr}
|
||||||
|
swi #5 //The improved performance allows waiting for VBlank before the storage, which effectively prevents screen tearing
|
||||||
mov r0,#0xA
|
mov r0,#0xA
|
||||||
bl store_pixels_overworld_buffer
|
bl store_pixels_overworld_buffer
|
||||||
pop {r0-r3,pc}
|
pop {r0-r3,pc}
|
||||||
|
|
Loading…
Reference in New Issue