From b372862942a8001e27225b131a5d64f9f683a25a Mon Sep 17 00:00:00 2001 From: Lorenzooone Date: Tue, 24 Sep 2019 19:14:01 +0200 Subject: [PATCH] Improve 1bpp buffer storing performance by using an edited coord_table and (if useful) loading the m2_bits_to_nybbles table in IWRAM --- src/c/vwf.c | 244 ++++++++++++++----- src/c/vwf.h | 1 + src/data/m2-coord-table-fast-progression.bin | Bin 0 -> 448 bytes src/m2-hack.asm | 4 + 4 files changed, 182 insertions(+), 67 deletions(-) create mode 100644 src/data/m2-coord-table-fast-progression.bin diff --git a/src/c/vwf.c b/src/c/vwf.c index 56c8c75..2289bd2 100644 --- a/src/c/vwf.c +++ b/src/c/vwf.c @@ -799,7 +799,7 @@ int print_menu_string(WINDOW* window) break; default: looping = false; - window->menu_text = NULL; //Otherwise it will keep printing indefinetly + window->menu_text = NULL; //Otherwise it will keep printing indefinetly break; } } @@ -1518,14 +1518,14 @@ int highlight_string(WINDOW* window, byte* str, unsigned short x, unsigned short //Highlights "Talk to" void highlight_talk_to() { - char Talk_to[] = "Talk to"; - byte str[0xA]; - int i; - for(i = 0; i < (sizeof(Talk_to) - 1); i++) - str[i] = encode_ascii(Talk_to[i]); - str[i++] = 0; - str[i] = 0xFF; - highlight_string(getWindow(0), str, 1, 0, true); + char Talk_to[] = "Talk to"; + byte str[0xA]; + int i; + for(i = 0; i < (sizeof(Talk_to) - 1); i++) + str[i] = encode_ascii(Talk_to[i]); + str[i++] = 0; + str[i] = 0xFF; + highlight_string(getWindow(0), str, 1, 0, true); } unsigned short printstr_hlight_buffer(WINDOW* window, byte* str, unsigned short x, unsigned short y, bool highlight) @@ -1926,75 +1926,185 @@ void load_pixels_overworld_buffer() void store_pixels_overworld_buffer(int totalYs) { - byte* buffer = (byte*)(OVERWORLD_BUFFER - ((*tile_offset) * TILESET_OFFSET_BUFFER_MULTIPLIER)); + int tile = *tile_offset; + byte* buffer = (byte*)(OVERWORLD_BUFFER - (tile * TILESET_OFFSET_BUFFER_MULTIPLIER)); totalYs >>= 1; int total = totalYs * 0x1C; - for(int i = 0; i < total; i++) + int* topBufferValues = (int*)(&buffer[tile * 8]); + int* bottomBufferValues = topBufferValues + 0x40; + int* topTilePointer; + int* bottomTilePointer; + int* bits_to_nybbles_pointer = m2_bits_to_nybbles_fast; + int bits_to_nybbles_array[0x100]; + //It's convenient to copy the table in IWRAM (about 0x400 cycles) only if we have more than 0x40 total tiles to copy ((total * 0x10 * 2) = total cycles used reading from EWRAM vs. (total * 0x10) + 0x400 = total cycles used writing to and reading from IWRAM) + //From a full copy it saves about 15k cycles + if(total > 0x40) + { + cpufastset(bits_to_nybbles_pointer, bits_to_nybbles_array, 0x100); + bits_to_nybbles_pointer = bits_to_nybbles_array; + } + int nextValue = 0x20; + int i = 0; + while(i < total) { //Not using functions for the tile values saves about 30k cycles on average - int tile = m2_coord_table[i] + *tile_offset; - int addedValue = (i >> 5) << 6; - int tile_buffer = (i & 0x1F) + addedValue + *tile_offset; - int* bufferValues = (int*)(&buffer[(tile_buffer * 8)]); - unsigned int first_half = bufferValues[0]; - unsigned int second_half = bufferValues[1]; - vram[(tile * 8) + 0] = m2_bits_to_nybbles_fast[(first_half >> 0) & 0xFF]; - vram[(tile * 8) + 1] = m2_bits_to_nybbles_fast[(first_half >> 8) & 0xFF]; - vram[(tile * 8) + 2] = m2_bits_to_nybbles_fast[(first_half >> 0x10) & 0xFF]; - vram[(tile * 8) + 3] = m2_bits_to_nybbles_fast[(first_half >> 0x18) & 0xFF]; - vram[(tile * 8) + 4] = m2_bits_to_nybbles_fast[(second_half >> 0) & 0xFF]; - vram[(tile * 8) + 5] = m2_bits_to_nybbles_fast[(second_half >> 8) & 0xFF]; - vram[(tile * 8) + 6] = m2_bits_to_nybbles_fast[(second_half >> 0x10) & 0xFF]; - vram[(tile * 8) + 7] = m2_bits_to_nybbles_fast[(second_half >> 0x18) & 0xFF]; - //Do the tile right below (Saves about 50k cycles on average) - tile += 0x20; - bufferValues += 0x40; - first_half = bufferValues[0]; - second_half = bufferValues[1]; - vram[(tile * 8) + 0] = m2_bits_to_nybbles_fast[(first_half >> 0) & 0xFF]; - vram[(tile * 8) + 1] = m2_bits_to_nybbles_fast[(first_half >> 8) & 0xFF]; - vram[(tile * 8) + 2] = m2_bits_to_nybbles_fast[(first_half >> 0x10) & 0xFF]; - vram[(tile * 8) + 3] = m2_bits_to_nybbles_fast[(first_half >> 0x18) & 0xFF]; - vram[(tile * 8) + 4] = m2_bits_to_nybbles_fast[(second_half >> 0) & 0xFF]; - vram[(tile * 8) + 5] = m2_bits_to_nybbles_fast[(second_half >> 8) & 0xFF]; - vram[(tile * 8) + 6] = m2_bits_to_nybbles_fast[(second_half >> 0x10) & 0xFF]; - vram[(tile * 8) + 7] = m2_bits_to_nybbles_fast[(second_half >> 0x18) & 0xFF]; + //Using pointers + a way to keep track of subsequent tiles saves 50k cycles on average from a full copy + //m2_coord_table_fast_progression has the tile number and the number of tiles used without interruction after it in a single short + tile = m2_coord_table_fast_progression[i]; + int remainingTiles = tile >> 0xB; + tile = (tile & 0x7FF) + (*tile_offset); + topTilePointer = &vram[(tile * 8)]; + bottomTilePointer = topTilePointer + (0x20 * 8); + if(i == nextValue) + { + nextValue += 0x20; + topBufferValues += 0x40; + bottomBufferValues += 0x40; + } + i++; + unsigned int first_half = *(topBufferValues++); + unsigned int second_half = *(topBufferValues++); + *(topTilePointer++) = bits_to_nybbles_pointer[(first_half >> 0) & 0xFF]; + *(topTilePointer++) = bits_to_nybbles_pointer[(first_half >> 8) & 0xFF]; + *(topTilePointer++) = bits_to_nybbles_pointer[(first_half >> 0x10) & 0xFF]; + *(topTilePointer++) = bits_to_nybbles_pointer[(first_half >> 0x18) & 0xFF]; + *(topTilePointer++) = bits_to_nybbles_pointer[(second_half >> 0) & 0xFF]; + *(topTilePointer++) = bits_to_nybbles_pointer[(second_half >> 8) & 0xFF]; + *(topTilePointer++) = bits_to_nybbles_pointer[(second_half >> 0x10) & 0xFF]; + *(topTilePointer++) = bits_to_nybbles_pointer[(second_half >> 0x18) & 0xFF]; + first_half = *(bottomBufferValues++); + second_half = *(bottomBufferValues++); + *(bottomTilePointer++) = bits_to_nybbles_pointer[(first_half >> 0) & 0xFF]; + *(bottomTilePointer++) = bits_to_nybbles_pointer[(first_half >> 8) & 0xFF]; + *(bottomTilePointer++) = bits_to_nybbles_pointer[(first_half >> 0x10) & 0xFF]; + *(bottomTilePointer++) = bits_to_nybbles_pointer[(first_half >> 0x18) & 0xFF]; + *(bottomTilePointer++) = bits_to_nybbles_pointer[(second_half >> 0) & 0xFF]; + *(bottomTilePointer++) = bits_to_nybbles_pointer[(second_half >> 8) & 0xFF]; + *(bottomTilePointer++) = bits_to_nybbles_pointer[(second_half >> 0x10) & 0xFF]; + *(bottomTilePointer++) = bits_to_nybbles_pointer[(second_half >> 0x18) & 0xFF]; + + while(remainingTiles > 0) + { + if(i == nextValue) + { + nextValue += 0x20; + topBufferValues += 0x40; + bottomBufferValues += 0x40; + } + i++; + first_half = *(topBufferValues++); + second_half = *(topBufferValues++); + *(topTilePointer++) = bits_to_nybbles_pointer[(first_half >> 0) & 0xFF]; + *(topTilePointer++) = bits_to_nybbles_pointer[(first_half >> 8) & 0xFF]; + *(topTilePointer++) = bits_to_nybbles_pointer[(first_half >> 0x10) & 0xFF]; + *(topTilePointer++) = bits_to_nybbles_pointer[(first_half >> 0x18) & 0xFF]; + *(topTilePointer++) = bits_to_nybbles_pointer[(second_half >> 0) & 0xFF]; + *(topTilePointer++) = bits_to_nybbles_pointer[(second_half >> 8) & 0xFF]; + *(topTilePointer++) = bits_to_nybbles_pointer[(second_half >> 0x10) & 0xFF]; + *(topTilePointer++) = bits_to_nybbles_pointer[(second_half >> 0x18) & 0xFF]; + first_half = *(bottomBufferValues++); + second_half = *(bottomBufferValues++); + *(bottomTilePointer++) = bits_to_nybbles_pointer[(first_half >> 0) & 0xFF]; + *(bottomTilePointer++) = bits_to_nybbles_pointer[(first_half >> 8) & 0xFF]; + *(bottomTilePointer++) = bits_to_nybbles_pointer[(first_half >> 0x10) & 0xFF]; + *(bottomTilePointer++) = bits_to_nybbles_pointer[(first_half >> 0x18) & 0xFF]; + *(bottomTilePointer++) = bits_to_nybbles_pointer[(second_half >> 0) & 0xFF]; + *(bottomTilePointer++) = bits_to_nybbles_pointer[(second_half >> 8) & 0xFF]; + *(bottomTilePointer++) = bits_to_nybbles_pointer[(second_half >> 0x10) & 0xFF]; + *(bottomTilePointer++) = bits_to_nybbles_pointer[(second_half >> 0x18) & 0xFF]; + remainingTiles--; + } } } void store_pixels_overworld_buffer_totalTiles(int totalTiles) { - byte* buffer = (byte*)(OVERWORLD_BUFFER - ((*tile_offset) * TILESET_OFFSET_BUFFER_MULTIPLIER)); - for(int i = 0; i < totalTiles; i++) + int tile = *tile_offset; + byte* buffer = (byte*)(OVERWORLD_BUFFER - (tile * TILESET_OFFSET_BUFFER_MULTIPLIER)); + int* topBufferValues = (int*)(&buffer[tile * 8]); + int* bottomBufferValues = topBufferValues + 0x40; + int* topTilePointer; + int* bottomTilePointer; + int* bits_to_nybbles_pointer = m2_bits_to_nybbles_fast; + int bits_to_nybbles_array[0x100]; + //It's convenient to copy the table in IWRAM (about 0x400 cycles) only if we have more than 0x40 total tiles to copy ((total * 0x10 * 2) = total cycles used reading from EWRAM vs. (total * 0x10) + 0x400 = total cycles used writing to and reading from IWRAM) + //From a full copy it saves about 15k cycles + if(totalTiles > 0x40) + { + cpufastset(bits_to_nybbles_pointer, bits_to_nybbles_array, 0x100); + bits_to_nybbles_pointer = bits_to_nybbles_array; + } + int nextValue = 0x20; + int i = 0; + while(i < totalTiles) { //Not using functions for the tile values saves about 30k cycles on average - int tile = m2_coord_table[i] + *tile_offset; - int addedValue = (i >> 5) << 6; - int tile_buffer = (i & 0x1F) + addedValue + *tile_offset; - int* bufferValues = (int*)(&buffer[(tile_buffer * 8)]); - unsigned int first_half = bufferValues[0]; - unsigned int second_half = bufferValues[1]; - vram[(tile * 8) + 0] = m2_bits_to_nybbles_fast[(first_half >> 0) & 0xFF]; - vram[(tile * 8) + 1] = m2_bits_to_nybbles_fast[(first_half >> 8) & 0xFF]; - vram[(tile * 8) + 2] = m2_bits_to_nybbles_fast[(first_half >> 0x10) & 0xFF]; - vram[(tile * 8) + 3] = m2_bits_to_nybbles_fast[(first_half >> 0x18) & 0xFF]; - vram[(tile * 8) + 4] = m2_bits_to_nybbles_fast[(second_half >> 0) & 0xFF]; - vram[(tile * 8) + 5] = m2_bits_to_nybbles_fast[(second_half >> 8) & 0xFF]; - vram[(tile * 8) + 6] = m2_bits_to_nybbles_fast[(second_half >> 0x10) & 0xFF]; - vram[(tile * 8) + 7] = m2_bits_to_nybbles_fast[(second_half >> 0x18) & 0xFF]; - //Do the tile right below (Saves about 50k cycles on average) - tile += 0x20; - bufferValues += 0x40; - first_half = bufferValues[0]; - second_half = bufferValues[1]; - vram[(tile * 8) + 0] = m2_bits_to_nybbles_fast[(first_half >> 0) & 0xFF]; - vram[(tile * 8) + 1] = m2_bits_to_nybbles_fast[(first_half >> 8) & 0xFF]; - vram[(tile * 8) + 2] = m2_bits_to_nybbles_fast[(first_half >> 0x10) & 0xFF]; - vram[(tile * 8) + 3] = m2_bits_to_nybbles_fast[(first_half >> 0x18) & 0xFF]; - vram[(tile * 8) + 4] = m2_bits_to_nybbles_fast[(second_half >> 0) & 0xFF]; - vram[(tile * 8) + 5] = m2_bits_to_nybbles_fast[(second_half >> 8) & 0xFF]; - vram[(tile * 8) + 6] = m2_bits_to_nybbles_fast[(second_half >> 0x10) & 0xFF]; - vram[(tile * 8) + 7] = m2_bits_to_nybbles_fast[(second_half >> 0x18) & 0xFF]; + //Using pointers + a way to keep track of subsequent tiles saves 50k cycles on average + //m2_coord_table_fast_progression has the tile number and the number of tiles used without interruction after it in a single short + tile = m2_coord_table_fast_progression[i]; + int remainingTiles = tile >> 0xB; + tile = (tile & 0x7FF) + (*tile_offset); + topTilePointer = &vram[(tile * 8)]; + bottomTilePointer = topTilePointer + (0x20 * 8); + if(i == nextValue) + { + nextValue += 0x20; + topBufferValues += 0x40; + bottomBufferValues += 0x40; + } + i++; + unsigned int first_half = *(topBufferValues++); + unsigned int second_half = *(topBufferValues++); + *(topTilePointer++) = bits_to_nybbles_pointer[(first_half >> 0) & 0xFF]; + *(topTilePointer++) = bits_to_nybbles_pointer[(first_half >> 8) & 0xFF]; + *(topTilePointer++) = bits_to_nybbles_pointer[(first_half >> 0x10) & 0xFF]; + *(topTilePointer++) = bits_to_nybbles_pointer[(first_half >> 0x18) & 0xFF]; + *(topTilePointer++) = bits_to_nybbles_pointer[(second_half >> 0) & 0xFF]; + *(topTilePointer++) = bits_to_nybbles_pointer[(second_half >> 8) & 0xFF]; + *(topTilePointer++) = bits_to_nybbles_pointer[(second_half >> 0x10) & 0xFF]; + *(topTilePointer++) = bits_to_nybbles_pointer[(second_half >> 0x18) & 0xFF]; + first_half = *(bottomBufferValues++); + second_half = *(bottomBufferValues++); + *(bottomTilePointer++) = bits_to_nybbles_pointer[(first_half >> 0) & 0xFF]; + *(bottomTilePointer++) = bits_to_nybbles_pointer[(first_half >> 8) & 0xFF]; + *(bottomTilePointer++) = bits_to_nybbles_pointer[(first_half >> 0x10) & 0xFF]; + *(bottomTilePointer++) = bits_to_nybbles_pointer[(first_half >> 0x18) & 0xFF]; + *(bottomTilePointer++) = bits_to_nybbles_pointer[(second_half >> 0) & 0xFF]; + *(bottomTilePointer++) = bits_to_nybbles_pointer[(second_half >> 8) & 0xFF]; + *(bottomTilePointer++) = bits_to_nybbles_pointer[(second_half >> 0x10) & 0xFF]; + *(bottomTilePointer++) = bits_to_nybbles_pointer[(second_half >> 0x18) & 0xFF]; + + while(remainingTiles > 0 && i < totalTiles) + { + if(i == nextValue) + { + nextValue += 0x20; + topBufferValues += 0x40; + bottomBufferValues += 0x40; + } + i++; + first_half = *(topBufferValues++); + second_half = *(topBufferValues++); + *(topTilePointer++) = bits_to_nybbles_pointer[(first_half >> 0) & 0xFF]; + *(topTilePointer++) = bits_to_nybbles_pointer[(first_half >> 8) & 0xFF]; + *(topTilePointer++) = bits_to_nybbles_pointer[(first_half >> 0x10) & 0xFF]; + *(topTilePointer++) = bits_to_nybbles_pointer[(first_half >> 0x18) & 0xFF]; + *(topTilePointer++) = bits_to_nybbles_pointer[(second_half >> 0) & 0xFF]; + *(topTilePointer++) = bits_to_nybbles_pointer[(second_half >> 8) & 0xFF]; + *(topTilePointer++) = bits_to_nybbles_pointer[(second_half >> 0x10) & 0xFF]; + *(topTilePointer++) = bits_to_nybbles_pointer[(second_half >> 0x18) & 0xFF]; + first_half = *(bottomBufferValues++); + second_half = *(bottomBufferValues++); + *(bottomTilePointer++) = bits_to_nybbles_pointer[(first_half >> 0) & 0xFF]; + *(bottomTilePointer++) = bits_to_nybbles_pointer[(first_half >> 8) & 0xFF]; + *(bottomTilePointer++) = bits_to_nybbles_pointer[(first_half >> 0x10) & 0xFF]; + *(bottomTilePointer++) = bits_to_nybbles_pointer[(first_half >> 0x18) & 0xFF]; + *(bottomTilePointer++) = bits_to_nybbles_pointer[(second_half >> 0) & 0xFF]; + *(bottomTilePointer++) = bits_to_nybbles_pointer[(second_half >> 8) & 0xFF]; + *(bottomTilePointer++) = bits_to_nybbles_pointer[(second_half >> 0x10) & 0xFF]; + *(bottomTilePointer++) = bits_to_nybbles_pointer[(second_half >> 0x18) & 0xFF]; + remainingTiles--; + } } } diff --git a/src/c/vwf.h b/src/c/vwf.h index 4996f4d..ee5a4d6 100644 --- a/src/c/vwf.h +++ b/src/c/vwf.h @@ -124,6 +124,7 @@ void load_pixels_overworld_buffer(); void store_pixels_overworld_buffer(int totalYs); void store_pixels_overworld_buffer_totalTiles(int totalTiles); +extern unsigned short m2_coord_table_fast_progression[]; extern unsigned short m2_coord_table[]; extern byte m2_ness_name[]; extern int m2_bits_to_nybbles[]; diff --git a/src/data/m2-coord-table-fast-progression.bin b/src/data/m2-coord-table-fast-progression.bin new file mode 100644 index 0000000000000000000000000000000000000000..c100d2c480bc18c16ec8cf137dcf68dcfbfb5a9f GIT binary patch literal 448 zcmV;x0YCl#_yO<&=mX#c*agrA$Opg(xCyWds0*MBm<^B)h!21ecoA?CXcJ%*SQStf zNEbjDI2kY+C>tOg7#$EE2p<4I_(AYO=tJN{*hSDr$Vb3PxJj@|s7s(sm`#vQh);k} zcu{atXj5QRSXEF~NLN5uI9V`SC|e+07+nxv2wwnzV1ZbIP=iQ>K!rGlFo!6JAc+`? z5Q_+m0FEG!7?2Q=2$BH6_`&eP=)>T|*u~Js$j89QxXG}}sLPy>e*#*%C$p^s*xe2ifsSBYD znGKN+i4TDgc@c3EX%k@-Srt(hNf$vFIT-zc|~zX zX-8p5SxHe!NlQUYIZZK6DNi9#8Bq~Z2~z=p`GN6*>4V{f*@e-D$%ny+xrwogsf(eE znT?T-iI0Jhd699FX_H};S(QNzp;lIny!JDb*p?8P^fm3E2VG@4lD- literal 0 HcmV?d00001 diff --git a/src/m2-hack.asm b/src/m2-hack.asm index 3811845..5914001 100644 --- a/src/m2-hack.asm +++ b/src/m2-hack.asm @@ -1693,6 +1693,10 @@ m2_font_relocate: m2_coord_table: .incbin "data/m2-coord-table.bin" +// Co-ordinate table, version which has 5 bits used for how many consecutive tiles there are after each tile +m2_coord_table_fast_progression: +.incbin "data/m2-coord-table-fast-progression.bin" + // EB fonts m2_font_table: dw m2_font_main