Improve 1bpp buffer storing performance by using an edited coord_table and (if useful) loading the m2_bits_to_nybbles table in IWRAM

This commit is contained in:
Lorenzooone 2019-09-24 19:14:01 +02:00
parent 24fbc3098e
commit b372862942
4 changed files with 182 additions and 67 deletions

View File

@ -799,7 +799,7 @@ int print_menu_string(WINDOW* window)
break; break;
default: default:
looping = false; looping = false;
window->menu_text = NULL; //Otherwise it will keep printing indefinitely window->menu_text = NULL; //Otherwise it will keep printing indefinitely
break; break;
} }
} }
@ -1518,14 +1518,14 @@ int highlight_string(WINDOW* window, byte* str, unsigned short x, unsigned short
//Highlights "Talk to" //Highlights "Talk to"
void highlight_talk_to() void highlight_talk_to()
{ {
char Talk_to[] = "Talk to"; char Talk_to[] = "Talk to";
byte str[0xA]; byte str[0xA];
int i; int i;
for(i = 0; i < (sizeof(Talk_to) - 1); i++) for(i = 0; i < (sizeof(Talk_to) - 1); i++)
str[i] = encode_ascii(Talk_to[i]); str[i] = encode_ascii(Talk_to[i]);
str[i++] = 0; str[i++] = 0;
str[i] = 0xFF; str[i] = 0xFF;
highlight_string(getWindow(0), str, 1, 0, true); highlight_string(getWindow(0), str, 1, 0, true);
} }
unsigned short printstr_hlight_buffer(WINDOW* window, byte* str, unsigned short x, unsigned short y, bool highlight) unsigned short printstr_hlight_buffer(WINDOW* window, byte* str, unsigned short x, unsigned short y, bool highlight)
@ -1926,75 +1926,185 @@ void load_pixels_overworld_buffer()
void store_pixels_overworld_buffer(int totalYs) void store_pixels_overworld_buffer(int totalYs)
{ {
byte* buffer = (byte*)(OVERWORLD_BUFFER - ((*tile_offset) * TILESET_OFFSET_BUFFER_MULTIPLIER)); int tile = *tile_offset;
byte* buffer = (byte*)(OVERWORLD_BUFFER - (tile * TILESET_OFFSET_BUFFER_MULTIPLIER));
totalYs >>= 1; totalYs >>= 1;
int total = totalYs * 0x1C; int total = totalYs * 0x1C;
for(int i = 0; i < total; i++) int* topBufferValues = (int*)(&buffer[tile * 8]);
int* bottomBufferValues = topBufferValues + 0x40;
int* topTilePointer;
int* bottomTilePointer;
int* bits_to_nybbles_pointer = m2_bits_to_nybbles_fast;
int bits_to_nybbles_array[0x100];
//It's convenient to copy the table in IWRAM (about 0x400 cycles) only if we have more than 0x40 total tiles to copy ((total * 0x10 * 2) = total cycles used reading from EWRAM vs. (total * 0x10) + 0x400 = total cycles used writing to and reading from IWRAM)
//From a full copy it saves about 15k cycles
if(total > 0x40)
{
cpufastset(bits_to_nybbles_pointer, bits_to_nybbles_array, 0x100);
bits_to_nybbles_pointer = bits_to_nybbles_array;
}
int nextValue = 0x20;
int i = 0;
while(i < total)
{ {
//Not using functions for the tile values saves about 30k cycles on average //Not using functions for the tile values saves about 30k cycles on average
int tile = m2_coord_table[i] + *tile_offset; //Using pointers + a way to keep track of subsequent tiles saves 50k cycles on average from a full copy
int addedValue = (i >> 5) << 6; //m2_coord_table_fast_progression has the tile number and the number of tiles used without interruption after it in a single short
int tile_buffer = (i & 0x1F) + addedValue + *tile_offset; tile = m2_coord_table_fast_progression[i];
int* bufferValues = (int*)(&buffer[(tile_buffer * 8)]); int remainingTiles = tile >> 0xB;
unsigned int first_half = bufferValues[0]; tile = (tile & 0x7FF) + (*tile_offset);
unsigned int second_half = bufferValues[1]; topTilePointer = &vram[(tile * 8)];
vram[(tile * 8) + 0] = m2_bits_to_nybbles_fast[(first_half >> 0) & 0xFF]; bottomTilePointer = topTilePointer + (0x20 * 8);
vram[(tile * 8) + 1] = m2_bits_to_nybbles_fast[(first_half >> 8) & 0xFF]; if(i == nextValue)
vram[(tile * 8) + 2] = m2_bits_to_nybbles_fast[(first_half >> 0x10) & 0xFF]; {
vram[(tile * 8) + 3] = m2_bits_to_nybbles_fast[(first_half >> 0x18) & 0xFF]; nextValue += 0x20;
vram[(tile * 8) + 4] = m2_bits_to_nybbles_fast[(second_half >> 0) & 0xFF]; topBufferValues += 0x40;
vram[(tile * 8) + 5] = m2_bits_to_nybbles_fast[(second_half >> 8) & 0xFF]; bottomBufferValues += 0x40;
vram[(tile * 8) + 6] = m2_bits_to_nybbles_fast[(second_half >> 0x10) & 0xFF]; }
vram[(tile * 8) + 7] = m2_bits_to_nybbles_fast[(second_half >> 0x18) & 0xFF]; i++;
//Do the tile right below (Saves about 50k cycles on average) unsigned int first_half = *(topBufferValues++);
tile += 0x20; unsigned int second_half = *(topBufferValues++);
bufferValues += 0x40; *(topTilePointer++) = bits_to_nybbles_pointer[(first_half >> 0) & 0xFF];
first_half = bufferValues[0]; *(topTilePointer++) = bits_to_nybbles_pointer[(first_half >> 8) & 0xFF];
second_half = bufferValues[1]; *(topTilePointer++) = bits_to_nybbles_pointer[(first_half >> 0x10) & 0xFF];
vram[(tile * 8) + 0] = m2_bits_to_nybbles_fast[(first_half >> 0) & 0xFF]; *(topTilePointer++) = bits_to_nybbles_pointer[(first_half >> 0x18) & 0xFF];
vram[(tile * 8) + 1] = m2_bits_to_nybbles_fast[(first_half >> 8) & 0xFF]; *(topTilePointer++) = bits_to_nybbles_pointer[(second_half >> 0) & 0xFF];
vram[(tile * 8) + 2] = m2_bits_to_nybbles_fast[(first_half >> 0x10) & 0xFF]; *(topTilePointer++) = bits_to_nybbles_pointer[(second_half >> 8) & 0xFF];
vram[(tile * 8) + 3] = m2_bits_to_nybbles_fast[(first_half >> 0x18) & 0xFF]; *(topTilePointer++) = bits_to_nybbles_pointer[(second_half >> 0x10) & 0xFF];
vram[(tile * 8) + 4] = m2_bits_to_nybbles_fast[(second_half >> 0) & 0xFF]; *(topTilePointer++) = bits_to_nybbles_pointer[(second_half >> 0x18) & 0xFF];
vram[(tile * 8) + 5] = m2_bits_to_nybbles_fast[(second_half >> 8) & 0xFF]; first_half = *(bottomBufferValues++);
vram[(tile * 8) + 6] = m2_bits_to_nybbles_fast[(second_half >> 0x10) & 0xFF]; second_half = *(bottomBufferValues++);
vram[(tile * 8) + 7] = m2_bits_to_nybbles_fast[(second_half >> 0x18) & 0xFF]; *(bottomTilePointer++) = bits_to_nybbles_pointer[(first_half >> 0) & 0xFF];
*(bottomTilePointer++) = bits_to_nybbles_pointer[(first_half >> 8) & 0xFF];
*(bottomTilePointer++) = bits_to_nybbles_pointer[(first_half >> 0x10) & 0xFF];
*(bottomTilePointer++) = bits_to_nybbles_pointer[(first_half >> 0x18) & 0xFF];
*(bottomTilePointer++) = bits_to_nybbles_pointer[(second_half >> 0) & 0xFF];
*(bottomTilePointer++) = bits_to_nybbles_pointer[(second_half >> 8) & 0xFF];
*(bottomTilePointer++) = bits_to_nybbles_pointer[(second_half >> 0x10) & 0xFF];
*(bottomTilePointer++) = bits_to_nybbles_pointer[(second_half >> 0x18) & 0xFF];
while(remainingTiles > 0)
{
if(i == nextValue)
{
nextValue += 0x20;
topBufferValues += 0x40;
bottomBufferValues += 0x40;
}
i++;
first_half = *(topBufferValues++);
second_half = *(topBufferValues++);
*(topTilePointer++) = bits_to_nybbles_pointer[(first_half >> 0) & 0xFF];
*(topTilePointer++) = bits_to_nybbles_pointer[(first_half >> 8) & 0xFF];
*(topTilePointer++) = bits_to_nybbles_pointer[(first_half >> 0x10) & 0xFF];
*(topTilePointer++) = bits_to_nybbles_pointer[(first_half >> 0x18) & 0xFF];
*(topTilePointer++) = bits_to_nybbles_pointer[(second_half >> 0) & 0xFF];
*(topTilePointer++) = bits_to_nybbles_pointer[(second_half >> 8) & 0xFF];
*(topTilePointer++) = bits_to_nybbles_pointer[(second_half >> 0x10) & 0xFF];
*(topTilePointer++) = bits_to_nybbles_pointer[(second_half >> 0x18) & 0xFF];
first_half = *(bottomBufferValues++);
second_half = *(bottomBufferValues++);
*(bottomTilePointer++) = bits_to_nybbles_pointer[(first_half >> 0) & 0xFF];
*(bottomTilePointer++) = bits_to_nybbles_pointer[(first_half >> 8) & 0xFF];
*(bottomTilePointer++) = bits_to_nybbles_pointer[(first_half >> 0x10) & 0xFF];
*(bottomTilePointer++) = bits_to_nybbles_pointer[(first_half >> 0x18) & 0xFF];
*(bottomTilePointer++) = bits_to_nybbles_pointer[(second_half >> 0) & 0xFF];
*(bottomTilePointer++) = bits_to_nybbles_pointer[(second_half >> 8) & 0xFF];
*(bottomTilePointer++) = bits_to_nybbles_pointer[(second_half >> 0x10) & 0xFF];
*(bottomTilePointer++) = bits_to_nybbles_pointer[(second_half >> 0x18) & 0xFF];
remainingTiles--;
}
} }
} }
void store_pixels_overworld_buffer_totalTiles(int totalTiles) void store_pixels_overworld_buffer_totalTiles(int totalTiles)
{ {
byte* buffer = (byte*)(OVERWORLD_BUFFER - ((*tile_offset) * TILESET_OFFSET_BUFFER_MULTIPLIER)); int tile = *tile_offset;
for(int i = 0; i < totalTiles; i++) byte* buffer = (byte*)(OVERWORLD_BUFFER - (tile * TILESET_OFFSET_BUFFER_MULTIPLIER));
int* topBufferValues = (int*)(&buffer[tile * 8]);
int* bottomBufferValues = topBufferValues + 0x40;
int* topTilePointer;
int* bottomTilePointer;
int* bits_to_nybbles_pointer = m2_bits_to_nybbles_fast;
int bits_to_nybbles_array[0x100];
//It's convenient to copy the table in IWRAM (about 0x400 cycles) only if we have more than 0x40 total tiles to copy ((total * 0x10 * 2) = total cycles used reading from EWRAM vs. (total * 0x10) + 0x400 = total cycles used writing to and reading from IWRAM)
//From a full copy it saves about 15k cycles
if(totalTiles > 0x40)
{
cpufastset(bits_to_nybbles_pointer, bits_to_nybbles_array, 0x100);
bits_to_nybbles_pointer = bits_to_nybbles_array;
}
int nextValue = 0x20;
int i = 0;
while(i < totalTiles)
{ {
//Not using functions for the tile values saves about 30k cycles on average //Not using functions for the tile values saves about 30k cycles on average
int tile = m2_coord_table[i] + *tile_offset; //Using pointers + a way to keep track of subsequent tiles saves 50k cycles on average
int addedValue = (i >> 5) << 6; //m2_coord_table_fast_progression has the tile number and the number of tiles used without interruption after it in a single short
int tile_buffer = (i & 0x1F) + addedValue + *tile_offset; tile = m2_coord_table_fast_progression[i];
int* bufferValues = (int*)(&buffer[(tile_buffer * 8)]); int remainingTiles = tile >> 0xB;
unsigned int first_half = bufferValues[0]; tile = (tile & 0x7FF) + (*tile_offset);
unsigned int second_half = bufferValues[1]; topTilePointer = &vram[(tile * 8)];
vram[(tile * 8) + 0] = m2_bits_to_nybbles_fast[(first_half >> 0) & 0xFF]; bottomTilePointer = topTilePointer + (0x20 * 8);
vram[(tile * 8) + 1] = m2_bits_to_nybbles_fast[(first_half >> 8) & 0xFF]; if(i == nextValue)
vram[(tile * 8) + 2] = m2_bits_to_nybbles_fast[(first_half >> 0x10) & 0xFF]; {
vram[(tile * 8) + 3] = m2_bits_to_nybbles_fast[(first_half >> 0x18) & 0xFF]; nextValue += 0x20;
vram[(tile * 8) + 4] = m2_bits_to_nybbles_fast[(second_half >> 0) & 0xFF]; topBufferValues += 0x40;
vram[(tile * 8) + 5] = m2_bits_to_nybbles_fast[(second_half >> 8) & 0xFF]; bottomBufferValues += 0x40;
vram[(tile * 8) + 6] = m2_bits_to_nybbles_fast[(second_half >> 0x10) & 0xFF]; }
vram[(tile * 8) + 7] = m2_bits_to_nybbles_fast[(second_half >> 0x18) & 0xFF]; i++;
//Do the tile right below (Saves about 50k cycles on average) unsigned int first_half = *(topBufferValues++);
tile += 0x20; unsigned int second_half = *(topBufferValues++);
bufferValues += 0x40; *(topTilePointer++) = bits_to_nybbles_pointer[(first_half >> 0) & 0xFF];
first_half = bufferValues[0]; *(topTilePointer++) = bits_to_nybbles_pointer[(first_half >> 8) & 0xFF];
second_half = bufferValues[1]; *(topTilePointer++) = bits_to_nybbles_pointer[(first_half >> 0x10) & 0xFF];
vram[(tile * 8) + 0] = m2_bits_to_nybbles_fast[(first_half >> 0) & 0xFF]; *(topTilePointer++) = bits_to_nybbles_pointer[(first_half >> 0x18) & 0xFF];
vram[(tile * 8) + 1] = m2_bits_to_nybbles_fast[(first_half >> 8) & 0xFF]; *(topTilePointer++) = bits_to_nybbles_pointer[(second_half >> 0) & 0xFF];
vram[(tile * 8) + 2] = m2_bits_to_nybbles_fast[(first_half >> 0x10) & 0xFF]; *(topTilePointer++) = bits_to_nybbles_pointer[(second_half >> 8) & 0xFF];
vram[(tile * 8) + 3] = m2_bits_to_nybbles_fast[(first_half >> 0x18) & 0xFF]; *(topTilePointer++) = bits_to_nybbles_pointer[(second_half >> 0x10) & 0xFF];
vram[(tile * 8) + 4] = m2_bits_to_nybbles_fast[(second_half >> 0) & 0xFF]; *(topTilePointer++) = bits_to_nybbles_pointer[(second_half >> 0x18) & 0xFF];
vram[(tile * 8) + 5] = m2_bits_to_nybbles_fast[(second_half >> 8) & 0xFF]; first_half = *(bottomBufferValues++);
vram[(tile * 8) + 6] = m2_bits_to_nybbles_fast[(second_half >> 0x10) & 0xFF]; second_half = *(bottomBufferValues++);
vram[(tile * 8) + 7] = m2_bits_to_nybbles_fast[(second_half >> 0x18) & 0xFF]; *(bottomTilePointer++) = bits_to_nybbles_pointer[(first_half >> 0) & 0xFF];
*(bottomTilePointer++) = bits_to_nybbles_pointer[(first_half >> 8) & 0xFF];
*(bottomTilePointer++) = bits_to_nybbles_pointer[(first_half >> 0x10) & 0xFF];
*(bottomTilePointer++) = bits_to_nybbles_pointer[(first_half >> 0x18) & 0xFF];
*(bottomTilePointer++) = bits_to_nybbles_pointer[(second_half >> 0) & 0xFF];
*(bottomTilePointer++) = bits_to_nybbles_pointer[(second_half >> 8) & 0xFF];
*(bottomTilePointer++) = bits_to_nybbles_pointer[(second_half >> 0x10) & 0xFF];
*(bottomTilePointer++) = bits_to_nybbles_pointer[(second_half >> 0x18) & 0xFF];
while(remainingTiles > 0 && i < totalTiles)
{
if(i == nextValue)
{
nextValue += 0x20;
topBufferValues += 0x40;
bottomBufferValues += 0x40;
}
i++;
first_half = *(topBufferValues++);
second_half = *(topBufferValues++);
*(topTilePointer++) = bits_to_nybbles_pointer[(first_half >> 0) & 0xFF];
*(topTilePointer++) = bits_to_nybbles_pointer[(first_half >> 8) & 0xFF];
*(topTilePointer++) = bits_to_nybbles_pointer[(first_half >> 0x10) & 0xFF];
*(topTilePointer++) = bits_to_nybbles_pointer[(first_half >> 0x18) & 0xFF];
*(topTilePointer++) = bits_to_nybbles_pointer[(second_half >> 0) & 0xFF];
*(topTilePointer++) = bits_to_nybbles_pointer[(second_half >> 8) & 0xFF];
*(topTilePointer++) = bits_to_nybbles_pointer[(second_half >> 0x10) & 0xFF];
*(topTilePointer++) = bits_to_nybbles_pointer[(second_half >> 0x18) & 0xFF];
first_half = *(bottomBufferValues++);
second_half = *(bottomBufferValues++);
*(bottomTilePointer++) = bits_to_nybbles_pointer[(first_half >> 0) & 0xFF];
*(bottomTilePointer++) = bits_to_nybbles_pointer[(first_half >> 8) & 0xFF];
*(bottomTilePointer++) = bits_to_nybbles_pointer[(first_half >> 0x10) & 0xFF];
*(bottomTilePointer++) = bits_to_nybbles_pointer[(first_half >> 0x18) & 0xFF];
*(bottomTilePointer++) = bits_to_nybbles_pointer[(second_half >> 0) & 0xFF];
*(bottomTilePointer++) = bits_to_nybbles_pointer[(second_half >> 8) & 0xFF];
*(bottomTilePointer++) = bits_to_nybbles_pointer[(second_half >> 0x10) & 0xFF];
*(bottomTilePointer++) = bits_to_nybbles_pointer[(second_half >> 0x18) & 0xFF];
remainingTiles--;
}
} }
} }

View File

@ -124,6 +124,7 @@ void load_pixels_overworld_buffer();
void store_pixels_overworld_buffer(int totalYs); void store_pixels_overworld_buffer(int totalYs);
void store_pixels_overworld_buffer_totalTiles(int totalTiles); void store_pixels_overworld_buffer_totalTiles(int totalTiles);
extern unsigned short m2_coord_table_fast_progression[];
extern unsigned short m2_coord_table[]; extern unsigned short m2_coord_table[];
extern byte m2_ness_name[]; extern byte m2_ness_name[];
extern int m2_bits_to_nybbles[]; extern int m2_bits_to_nybbles[];

Binary file not shown.

View File

@ -1693,6 +1693,10 @@ m2_font_relocate:
m2_coord_table: m2_coord_table:
.incbin "data/m2-coord-table.bin" .incbin "data/m2-coord-table.bin"
// Co-ordinate table, version which has 5 bits used for how many consecutive tiles there are after each tile
m2_coord_table_fast_progression:
.incbin "data/m2-coord-table-fast-progression.bin"
// EB fonts // EB fonts
m2_font_table: m2_font_table:
dw m2_font_main dw m2_font_main