Improve 1bpp buffer storing performance by using an edited coord_table and (if useful) loading the m2_bits_to_nybbles table in IWRAM

This commit is contained in:
Lorenzooone 2019-09-24 19:14:01 +02:00
parent 24fbc3098e
commit b372862942
4 changed files with 182 additions and 67 deletions

View File

@ -799,7 +799,7 @@ int print_menu_string(WINDOW* window)
break;
default:
looping = false;
window->menu_text = NULL; //Otherwise it will keep printing indefinetly
window->menu_text = NULL; //Otherwise it will keep printing indefinetly
break;
}
}
@ -1518,14 +1518,14 @@ int highlight_string(WINDOW* window, byte* str, unsigned short x, unsigned short
//Highlights "Talk to"
void highlight_talk_to()
{
char Talk_to[] = "Talk to";
byte str[0xA];
int i;
for(i = 0; i < (sizeof(Talk_to) - 1); i++)
str[i] = encode_ascii(Talk_to[i]);
str[i++] = 0;
str[i] = 0xFF;
highlight_string(getWindow(0), str, 1, 0, true);
char Talk_to[] = "Talk to";
byte str[0xA];
int i;
for(i = 0; i < (sizeof(Talk_to) - 1); i++)
str[i] = encode_ascii(Talk_to[i]);
str[i++] = 0;
str[i] = 0xFF;
highlight_string(getWindow(0), str, 1, 0, true);
}
unsigned short printstr_hlight_buffer(WINDOW* window, byte* str, unsigned short x, unsigned short y, bool highlight)
@ -1926,75 +1926,185 @@ void load_pixels_overworld_buffer()
void store_pixels_overworld_buffer(int totalYs)
{
byte* buffer = (byte*)(OVERWORLD_BUFFER - ((*tile_offset) * TILESET_OFFSET_BUFFER_MULTIPLIER));
int tile = *tile_offset;
byte* buffer = (byte*)(OVERWORLD_BUFFER - (tile * TILESET_OFFSET_BUFFER_MULTIPLIER));
totalYs >>= 1;
int total = totalYs * 0x1C;
for(int i = 0; i < total; i++)
int* topBufferValues = (int*)(&buffer[tile * 8]);
int* bottomBufferValues = topBufferValues + 0x40;
int* topTilePointer;
int* bottomTilePointer;
int* bits_to_nybbles_pointer = m2_bits_to_nybbles_fast;
int bits_to_nybbles_array[0x100];
//It's convenient to copy the table in IWRAM (about 0x400 cycles) only if we have more than 0x40 total tiles to copy ((total * 0x10 * 2) = total cycles used reading from EWRAM vs. (total * 0x10) + 0x400 = total cycles used writing to and reading from IWRAM)
//From a full copy it saves about 15k cycles
if(total > 0x40)
{
cpufastset(bits_to_nybbles_pointer, bits_to_nybbles_array, 0x100);
bits_to_nybbles_pointer = bits_to_nybbles_array;
}
int nextValue = 0x20;
int i = 0;
while(i < total)
{
//Not using functions for the tile values saves about 30k cycles on average
int tile = m2_coord_table[i] + *tile_offset;
int addedValue = (i >> 5) << 6;
int tile_buffer = (i & 0x1F) + addedValue + *tile_offset;
int* bufferValues = (int*)(&buffer[(tile_buffer * 8)]);
unsigned int first_half = bufferValues[0];
unsigned int second_half = bufferValues[1];
vram[(tile * 8) + 0] = m2_bits_to_nybbles_fast[(first_half >> 0) & 0xFF];
vram[(tile * 8) + 1] = m2_bits_to_nybbles_fast[(first_half >> 8) & 0xFF];
vram[(tile * 8) + 2] = m2_bits_to_nybbles_fast[(first_half >> 0x10) & 0xFF];
vram[(tile * 8) + 3] = m2_bits_to_nybbles_fast[(first_half >> 0x18) & 0xFF];
vram[(tile * 8) + 4] = m2_bits_to_nybbles_fast[(second_half >> 0) & 0xFF];
vram[(tile * 8) + 5] = m2_bits_to_nybbles_fast[(second_half >> 8) & 0xFF];
vram[(tile * 8) + 6] = m2_bits_to_nybbles_fast[(second_half >> 0x10) & 0xFF];
vram[(tile * 8) + 7] = m2_bits_to_nybbles_fast[(second_half >> 0x18) & 0xFF];
//Do the tile right below (Saves about 50k cycles on average)
tile += 0x20;
bufferValues += 0x40;
first_half = bufferValues[0];
second_half = bufferValues[1];
vram[(tile * 8) + 0] = m2_bits_to_nybbles_fast[(first_half >> 0) & 0xFF];
vram[(tile * 8) + 1] = m2_bits_to_nybbles_fast[(first_half >> 8) & 0xFF];
vram[(tile * 8) + 2] = m2_bits_to_nybbles_fast[(first_half >> 0x10) & 0xFF];
vram[(tile * 8) + 3] = m2_bits_to_nybbles_fast[(first_half >> 0x18) & 0xFF];
vram[(tile * 8) + 4] = m2_bits_to_nybbles_fast[(second_half >> 0) & 0xFF];
vram[(tile * 8) + 5] = m2_bits_to_nybbles_fast[(second_half >> 8) & 0xFF];
vram[(tile * 8) + 6] = m2_bits_to_nybbles_fast[(second_half >> 0x10) & 0xFF];
vram[(tile * 8) + 7] = m2_bits_to_nybbles_fast[(second_half >> 0x18) & 0xFF];
//Using pointers + a way to keep track of subsequent tiles saves 50k cycles on average from a full copy
//m2_coord_table_fast_progression has the tile number and the number of tiles used without interruction after it in a single short
tile = m2_coord_table_fast_progression[i];
int remainingTiles = tile >> 0xB;
tile = (tile & 0x7FF) + (*tile_offset);
topTilePointer = &vram[(tile * 8)];
bottomTilePointer = topTilePointer + (0x20 * 8);
if(i == nextValue)
{
nextValue += 0x20;
topBufferValues += 0x40;
bottomBufferValues += 0x40;
}
i++;
unsigned int first_half = *(topBufferValues++);
unsigned int second_half = *(topBufferValues++);
*(topTilePointer++) = bits_to_nybbles_pointer[(first_half >> 0) & 0xFF];
*(topTilePointer++) = bits_to_nybbles_pointer[(first_half >> 8) & 0xFF];
*(topTilePointer++) = bits_to_nybbles_pointer[(first_half >> 0x10) & 0xFF];
*(topTilePointer++) = bits_to_nybbles_pointer[(first_half >> 0x18) & 0xFF];
*(topTilePointer++) = bits_to_nybbles_pointer[(second_half >> 0) & 0xFF];
*(topTilePointer++) = bits_to_nybbles_pointer[(second_half >> 8) & 0xFF];
*(topTilePointer++) = bits_to_nybbles_pointer[(second_half >> 0x10) & 0xFF];
*(topTilePointer++) = bits_to_nybbles_pointer[(second_half >> 0x18) & 0xFF];
first_half = *(bottomBufferValues++);
second_half = *(bottomBufferValues++);
*(bottomTilePointer++) = bits_to_nybbles_pointer[(first_half >> 0) & 0xFF];
*(bottomTilePointer++) = bits_to_nybbles_pointer[(first_half >> 8) & 0xFF];
*(bottomTilePointer++) = bits_to_nybbles_pointer[(first_half >> 0x10) & 0xFF];
*(bottomTilePointer++) = bits_to_nybbles_pointer[(first_half >> 0x18) & 0xFF];
*(bottomTilePointer++) = bits_to_nybbles_pointer[(second_half >> 0) & 0xFF];
*(bottomTilePointer++) = bits_to_nybbles_pointer[(second_half >> 8) & 0xFF];
*(bottomTilePointer++) = bits_to_nybbles_pointer[(second_half >> 0x10) & 0xFF];
*(bottomTilePointer++) = bits_to_nybbles_pointer[(second_half >> 0x18) & 0xFF];
while(remainingTiles > 0)
{
if(i == nextValue)
{
nextValue += 0x20;
topBufferValues += 0x40;
bottomBufferValues += 0x40;
}
i++;
first_half = *(topBufferValues++);
second_half = *(topBufferValues++);
*(topTilePointer++) = bits_to_nybbles_pointer[(first_half >> 0) & 0xFF];
*(topTilePointer++) = bits_to_nybbles_pointer[(first_half >> 8) & 0xFF];
*(topTilePointer++) = bits_to_nybbles_pointer[(first_half >> 0x10) & 0xFF];
*(topTilePointer++) = bits_to_nybbles_pointer[(first_half >> 0x18) & 0xFF];
*(topTilePointer++) = bits_to_nybbles_pointer[(second_half >> 0) & 0xFF];
*(topTilePointer++) = bits_to_nybbles_pointer[(second_half >> 8) & 0xFF];
*(topTilePointer++) = bits_to_nybbles_pointer[(second_half >> 0x10) & 0xFF];
*(topTilePointer++) = bits_to_nybbles_pointer[(second_half >> 0x18) & 0xFF];
first_half = *(bottomBufferValues++);
second_half = *(bottomBufferValues++);
*(bottomTilePointer++) = bits_to_nybbles_pointer[(first_half >> 0) & 0xFF];
*(bottomTilePointer++) = bits_to_nybbles_pointer[(first_half >> 8) & 0xFF];
*(bottomTilePointer++) = bits_to_nybbles_pointer[(first_half >> 0x10) & 0xFF];
*(bottomTilePointer++) = bits_to_nybbles_pointer[(first_half >> 0x18) & 0xFF];
*(bottomTilePointer++) = bits_to_nybbles_pointer[(second_half >> 0) & 0xFF];
*(bottomTilePointer++) = bits_to_nybbles_pointer[(second_half >> 8) & 0xFF];
*(bottomTilePointer++) = bits_to_nybbles_pointer[(second_half >> 0x10) & 0xFF];
*(bottomTilePointer++) = bits_to_nybbles_pointer[(second_half >> 0x18) & 0xFF];
remainingTiles--;
}
}
}
void store_pixels_overworld_buffer_totalTiles(int totalTiles)
{
byte* buffer = (byte*)(OVERWORLD_BUFFER - ((*tile_offset) * TILESET_OFFSET_BUFFER_MULTIPLIER));
for(int i = 0; i < totalTiles; i++)
int tile = *tile_offset;
byte* buffer = (byte*)(OVERWORLD_BUFFER - (tile * TILESET_OFFSET_BUFFER_MULTIPLIER));
int* topBufferValues = (int*)(&buffer[tile * 8]);
int* bottomBufferValues = topBufferValues + 0x40;
int* topTilePointer;
int* bottomTilePointer;
int* bits_to_nybbles_pointer = m2_bits_to_nybbles_fast;
int bits_to_nybbles_array[0x100];
//It's convenient to copy the table in IWRAM (about 0x400 cycles) only if we have more than 0x40 total tiles to copy ((total * 0x10 * 2) = total cycles used reading from EWRAM vs. (total * 0x10) + 0x400 = total cycles used writing to and reading from IWRAM)
//From a full copy it saves about 15k cycles
if(totalTiles > 0x40)
{
cpufastset(bits_to_nybbles_pointer, bits_to_nybbles_array, 0x100);
bits_to_nybbles_pointer = bits_to_nybbles_array;
}
int nextValue = 0x20;
int i = 0;
while(i < totalTiles)
{
//Not using functions for the tile values saves about 30k cycles on average
int tile = m2_coord_table[i] + *tile_offset;
int addedValue = (i >> 5) << 6;
int tile_buffer = (i & 0x1F) + addedValue + *tile_offset;
int* bufferValues = (int*)(&buffer[(tile_buffer * 8)]);
unsigned int first_half = bufferValues[0];
unsigned int second_half = bufferValues[1];
vram[(tile * 8) + 0] = m2_bits_to_nybbles_fast[(first_half >> 0) & 0xFF];
vram[(tile * 8) + 1] = m2_bits_to_nybbles_fast[(first_half >> 8) & 0xFF];
vram[(tile * 8) + 2] = m2_bits_to_nybbles_fast[(first_half >> 0x10) & 0xFF];
vram[(tile * 8) + 3] = m2_bits_to_nybbles_fast[(first_half >> 0x18) & 0xFF];
vram[(tile * 8) + 4] = m2_bits_to_nybbles_fast[(second_half >> 0) & 0xFF];
vram[(tile * 8) + 5] = m2_bits_to_nybbles_fast[(second_half >> 8) & 0xFF];
vram[(tile * 8) + 6] = m2_bits_to_nybbles_fast[(second_half >> 0x10) & 0xFF];
vram[(tile * 8) + 7] = m2_bits_to_nybbles_fast[(second_half >> 0x18) & 0xFF];
//Do the tile right below (Saves about 50k cycles on average)
tile += 0x20;
bufferValues += 0x40;
first_half = bufferValues[0];
second_half = bufferValues[1];
vram[(tile * 8) + 0] = m2_bits_to_nybbles_fast[(first_half >> 0) & 0xFF];
vram[(tile * 8) + 1] = m2_bits_to_nybbles_fast[(first_half >> 8) & 0xFF];
vram[(tile * 8) + 2] = m2_bits_to_nybbles_fast[(first_half >> 0x10) & 0xFF];
vram[(tile * 8) + 3] = m2_bits_to_nybbles_fast[(first_half >> 0x18) & 0xFF];
vram[(tile * 8) + 4] = m2_bits_to_nybbles_fast[(second_half >> 0) & 0xFF];
vram[(tile * 8) + 5] = m2_bits_to_nybbles_fast[(second_half >> 8) & 0xFF];
vram[(tile * 8) + 6] = m2_bits_to_nybbles_fast[(second_half >> 0x10) & 0xFF];
vram[(tile * 8) + 7] = m2_bits_to_nybbles_fast[(second_half >> 0x18) & 0xFF];
//Using pointers + a way to keep track of subsequent tiles saves 50k cycles on average
//m2_coord_table_fast_progression has the tile number and the number of tiles used without interruction after it in a single short
tile = m2_coord_table_fast_progression[i];
int remainingTiles = tile >> 0xB;
tile = (tile & 0x7FF) + (*tile_offset);
topTilePointer = &vram[(tile * 8)];
bottomTilePointer = topTilePointer + (0x20 * 8);
if(i == nextValue)
{
nextValue += 0x20;
topBufferValues += 0x40;
bottomBufferValues += 0x40;
}
i++;
unsigned int first_half = *(topBufferValues++);
unsigned int second_half = *(topBufferValues++);
*(topTilePointer++) = bits_to_nybbles_pointer[(first_half >> 0) & 0xFF];
*(topTilePointer++) = bits_to_nybbles_pointer[(first_half >> 8) & 0xFF];
*(topTilePointer++) = bits_to_nybbles_pointer[(first_half >> 0x10) & 0xFF];
*(topTilePointer++) = bits_to_nybbles_pointer[(first_half >> 0x18) & 0xFF];
*(topTilePointer++) = bits_to_nybbles_pointer[(second_half >> 0) & 0xFF];
*(topTilePointer++) = bits_to_nybbles_pointer[(second_half >> 8) & 0xFF];
*(topTilePointer++) = bits_to_nybbles_pointer[(second_half >> 0x10) & 0xFF];
*(topTilePointer++) = bits_to_nybbles_pointer[(second_half >> 0x18) & 0xFF];
first_half = *(bottomBufferValues++);
second_half = *(bottomBufferValues++);
*(bottomTilePointer++) = bits_to_nybbles_pointer[(first_half >> 0) & 0xFF];
*(bottomTilePointer++) = bits_to_nybbles_pointer[(first_half >> 8) & 0xFF];
*(bottomTilePointer++) = bits_to_nybbles_pointer[(first_half >> 0x10) & 0xFF];
*(bottomTilePointer++) = bits_to_nybbles_pointer[(first_half >> 0x18) & 0xFF];
*(bottomTilePointer++) = bits_to_nybbles_pointer[(second_half >> 0) & 0xFF];
*(bottomTilePointer++) = bits_to_nybbles_pointer[(second_half >> 8) & 0xFF];
*(bottomTilePointer++) = bits_to_nybbles_pointer[(second_half >> 0x10) & 0xFF];
*(bottomTilePointer++) = bits_to_nybbles_pointer[(second_half >> 0x18) & 0xFF];
while(remainingTiles > 0 && i < totalTiles)
{
if(i == nextValue)
{
nextValue += 0x20;
topBufferValues += 0x40;
bottomBufferValues += 0x40;
}
i++;
first_half = *(topBufferValues++);
second_half = *(topBufferValues++);
*(topTilePointer++) = bits_to_nybbles_pointer[(first_half >> 0) & 0xFF];
*(topTilePointer++) = bits_to_nybbles_pointer[(first_half >> 8) & 0xFF];
*(topTilePointer++) = bits_to_nybbles_pointer[(first_half >> 0x10) & 0xFF];
*(topTilePointer++) = bits_to_nybbles_pointer[(first_half >> 0x18) & 0xFF];
*(topTilePointer++) = bits_to_nybbles_pointer[(second_half >> 0) & 0xFF];
*(topTilePointer++) = bits_to_nybbles_pointer[(second_half >> 8) & 0xFF];
*(topTilePointer++) = bits_to_nybbles_pointer[(second_half >> 0x10) & 0xFF];
*(topTilePointer++) = bits_to_nybbles_pointer[(second_half >> 0x18) & 0xFF];
first_half = *(bottomBufferValues++);
second_half = *(bottomBufferValues++);
*(bottomTilePointer++) = bits_to_nybbles_pointer[(first_half >> 0) & 0xFF];
*(bottomTilePointer++) = bits_to_nybbles_pointer[(first_half >> 8) & 0xFF];
*(bottomTilePointer++) = bits_to_nybbles_pointer[(first_half >> 0x10) & 0xFF];
*(bottomTilePointer++) = bits_to_nybbles_pointer[(first_half >> 0x18) & 0xFF];
*(bottomTilePointer++) = bits_to_nybbles_pointer[(second_half >> 0) & 0xFF];
*(bottomTilePointer++) = bits_to_nybbles_pointer[(second_half >> 8) & 0xFF];
*(bottomTilePointer++) = bits_to_nybbles_pointer[(second_half >> 0x10) & 0xFF];
*(bottomTilePointer++) = bits_to_nybbles_pointer[(second_half >> 0x18) & 0xFF];
remainingTiles--;
}
}
}

View File

@ -124,6 +124,7 @@ void load_pixels_overworld_buffer();
void store_pixels_overworld_buffer(int totalYs);
void store_pixels_overworld_buffer_totalTiles(int totalTiles);
extern unsigned short m2_coord_table_fast_progression[];
extern unsigned short m2_coord_table[];
extern byte m2_ness_name[];
extern int m2_bits_to_nybbles[];

Binary file not shown.

View File

@ -1693,6 +1693,10 @@ m2_font_relocate:
m2_coord_table:
.incbin "data/m2-coord-table.bin"
// Co-ordinate table, version which has 5 bits used for how many consecutive tiles there are after each tile
m2_coord_table_fast_progression:
.incbin "data/m2-coord-table-fast-progression.bin"
// EB fonts
m2_font_table:
dw m2_font_main