m2_vwf:

//==============================================================================
// int get_tile_number(int x, int y)
// In:
//    r0: x
//    r1: y
// Out:
//    r0: tile number
//==============================================================================
// Reads a halfword from m2_coord_table at byte offset
//    ((y - 1) / 2) * 56 + (x - 1) * 2
// and adds 32 to it when (y - 1) is odd.
// r1-r5 are saved and restored; only r0 changes.
// NOTE(review): both inputs are decremented before use, so x = 0 or y = 0
// would index before the table -- presumably callers never pass 0; confirm.

.get_tile_number:
    push {r1-r5,lr}
    ldr r4,=#m2_coord_table
    sub r0,r0,#1                // r0 = x - 1
    sub r1,r1,#1                // r1 = y - 1
    lsl r2,r1,#0x1F
    lsr r2,r2,#0x1F             // r2 = lowest bit of (y - 1)
    lsr r1,r1,#1                // r1 = (y - 1) / 2
    lsl r5,r1,#4                // r5 = r1 * 16
    sub r5,r5,r1                //    = r1 * 15
    sub r5,r5,r1                //    = r1 * 14
    lsl r5,r5,#2                //    = r1 * 56 (byte offset of the table row)
    lsl r0,r0,#1                // Halfword entries: column byte offset = (x - 1) * 2
    add r4,r4,r0
    add r4,r4,r5
    ldrh r0,[r4,#0]             // Table entry
    lsl r2,r2,#5                // 0 or 32
    add r0,r0,r2                // Odd rows are offset by 32 from the table entry
    pop {r1-r5,pc}

//==============================================================================
// void weld_entry(WINDOW* window, byte* chr)
// In:
//    r0: address of window data
//    r1: address of char to print
//==============================================================================
// Prints one character with font 0 at the window's current position and then
// advances the position by the virtual width returned from print_character.
// Window fields touched (offsets from r0; meanings taken from the comments
// below):
//    [+0x02] halfword: pixel X within the current tile (read and written)
//    [+0x22], [+0x24] halfwords: added to [+0x2A] / [+0x2C] to form the
//                                tile X / tile Y passed to the renderer
//    [+0x2A] halfword: X relative to [+0x22] (read and written)
//    [+0x2C] halfword: Y relative to [+0x24] (read only)
// All of r0-r7 are saved and restored.
// Local frame (28 bytes): [sp+0] char, [sp+12] virtual width, [sp+20] pixel X.
// [sp+4], [sp+8] and [sp+24] are written but never read back in this routine.

//--------------------------------
.weld_entry:
    push {r0-r7,lr}
    add sp,#-28
    mov r5,r0                   // r5 = window for the rest of the routine

    //--------------------------------
    // Get the char
    ldrb r0,[r1,#0]
    sub r0,#0x50                // Rebase the byte so that 0x50 becomes glyph 0
    bpl +
    mov r0,#0x1F                // Replace char with ? if it's invalid
    b .char_custom

+
    cmp r0,#0x60                // Only rebased values 0x00-0x5F are accepted
    bcc .char_custom
    mov r0,#0x1F                // Same replacement as above

.char_custom:
    str r0,[sp,#0x0]            // [sp+0] = char

    //--------------------------------
    // Get the current X
    ldrh r1,[r5,#0x22]
    ldrh r2,[r5,#0x2A]
    add r1,r1,r2                // Tile X
    str r1,[sp,#4]              // Not read back in this routine
    lsl r0,r1,#3                // Tile X * 8
    ldrh r1,[r5,#2]
    str r1,[sp,#24]             // Not read back in this routine
    add r0,r0,r1                // Current pixel X
    str r0,[sp,#20]

    // Get the current Y
    ldrh r1,[r5,#0x24]
    ldrh r3,[r5,#0x2C]
    add r1,r1,r3                // Tile Y
    str r1,[sp,#8]              // Not read back in this routine
    lsl r1,r1,#3                // Pixel Y = tile Y * 8

    //--------------------------------
    // Print
    ldr r2,[sp,#0x0]            // r2 = char
    mov r3,#0                   // r3 = font 0
    bl .print_character
    str r0,[sp,#12]             // [sp+12] = virtual width returned in r0

    //--------------------------------
    // Figure out new window coords
    ldr r0,[sp,#20]
    ldr r1,[sp,#12]
    add r0,r0,r1                // New pixel X = old pixel X + virtual width

    // Store new window coords
    lsr r1,r0,#3                // New tile X
    ldrh r2,[r5,#0x22]
    sub r1,r1,r2                // Back to a value relative to [+0x22]
    strh r1,[r5,#0x2A]

    // Store new pixel X
    lsl r0,r0,#29
    lsr r0,r0,#29               // Keep the low 3 bits (pixel X mod 8)
    strh r0,[r5,#2]

    //--------------------------------
    add sp,#28
    pop {r0-r7,pc}

//=============================================================================
// void print_character(int x, int y, int chr, int font)
// In:
//    r0: x (pixel)
//    r1: y (pixel)
//    r2: character
//    r3: font
//        0: main
//        1: saturn
//        2: tiny
// Out:
//    r0: virtual width
//=============================================================================
// Renders the glyph into up to three horizontally adjacent tile columns and
// writes the matching tilemap entries.
// r1-r7 and r8-r12 are saved and restored; only r0 changes.
// Register roles after setup:
//    r4:  tile number (with tile offset added) of the column being drawn
//    r5:  font
//    r6:  palette mask, ORed into every map entry
//    r7:  tilemap address of the column being drawn (advances by 2 per column)
//    r8:  tile offset
//    r9:  virtual width (the return value)
//    r10: x, r11: y, r12: character
// Local frame (24 bytes); the first 13 bytes are the struct handed to
// print_left / print_right:
//    [sp+0]:  VRAM address
//    [sp+4]:  glyph address
//    [sp+8]:  x offset (byte)
//    [sp+9]:  render width (byte)
//    [sp+10]: background index (byte), always 4
//    [sp+11]: foreground index (byte), always 0xF
//    [sp+12]: height in tiles (byte)
//    [sp+16]: height in tiles (word copy, reloaded for the later map loops)
//    [sp+20]: x offset within the first tile (x mod 8)
// NOTE(review): the three "+" branches below presumably all resolve to the
// single "+" label at the epilogue. The first one (beq) jumps across almost the
// entire routine; Thumb conditional branches have a limited range, so confirm
// the file still assembles before inserting code in between.

.print_character:
    push {r1-r7,lr}
    mov r4,r8                   // High registers cannot be pushed directly,
    mov r5,r9                   // so they are copied through r4-r7 first
    mov r6,r10
    mov r7,r11
    push {r4-r7}
    mov r4,r12
    push {r4}
    add sp,#-24
    mov r10,r0
    mov r11,r1
    mov r12,r2
    mov r5,r3

    //----------------------------------------
    ldr r3,=#0x30051EC
    ldrh r4,[r3,#0]             // Tile offset
    add r3,#0x3C                // r3 = 0x3005228
    ldrh r6,[r3,#0]             // Palette mask
    add r3,#0x48                // r3 = 0x3005270
    ldr r7,[r3,#0]              // Tilemap address
    lsr r0,r0,#3                // Tile X
    lsr r1,r1,#3                // Tile Y
    lsl r1,r1,#5                // Tile Y * 32
    add r0,r0,r1
    lsl r0,r0,#1                // Two bytes per map entry
    add r7,r7,r0                // Local tilemap address
    mov r8,r4                   // Keep the tile offset; r4 is reused below

    //----------------------------------------
    ldr r0,=#m2_widths_table
    lsl r1,r5,#2                // Font number * 4
    ldr r0,[r0,r1]              // Widths table for this font
    mov r3,r12                  // Character
    lsl r2,r3,#1                // Two bytes per character
    ldrb r1,[r0,r2]             // Virtual width
    mov r9,r1
    add r2,r2,#1
    ldrb r0,[r0,r2]             // Render width
    cmp r0,#0
    beq +                       // Don't bother rendering a zero-width character

    ldr r2,=#m2_height_table
    ldrb r2,[r2,r5]             // Height in tiles for this font
    str r2,[sp,#16]             // No more registers, gotta store this on the stack
    mov r3,sp
    strb r0,[r3,#9]             // Render width
    strb r2,[r3,#12]            // Height in tiles
    mov r1,r10
    lsl r1,r1,#29
    lsr r1,r1,#29               // x mod 8
    strb r1,[r3,#8]             // X offset within the first tile
    mov r1,#4
    strb r1,[r3,#10]            // Background index
    mov r1,#0xF
    strb r1,[r3,#11]            // Foreground index

    //----------------------------------------
    mov r0,r10
    mov r1,r11
    lsr r0,r0,#3
    lsr r1,r1,#3
    bl .get_tile_number         // r0 = tile number for (tile X, tile Y)
    add r4,r0,r4                // Plus tile offset
    lsl r0,r4,#5                // 32 bytes per tile
    mov r1,#6
    lsl r1,r1,#0x18             // 0x06000000
    add r0,r0,r1                // VRAM address
    str r0,[sp,#0]

    //----------------------------------------
    ldr r0,=#m2_font_table
    lsl r1,r5,#2
    ldr r0,[r0,r1]              // Font data for this font
    mov r1,r12
    lsl r1,r1,#5                // 32 bytes per glyph
    add r0,r0,r1                // Glyph address
    str r0,[sp,#4]

    //----------------------------------------
    // Render left portion
    mov r0,sp
    bl .print_left              // Preserves r0-r7, so r0 is still sp afterwards

    //----------------------------------------
    // Update the map
    // r2 still holds the height in tiles loaded above
    orr r4,r6
    mov r1,r7

-
    strh r4,[r1,#0]
    add r4,#0x20                // Tile number used for the next tile row
    add r1,#0x40                // 0x40 bytes = 32 map entries further on
    sub r2,r2,#1
    bne -
    add r7,r7,#2                // Move to the next column of the map

    //----------------------------------------
    // Now we've rendered the left portion;
    // we need to determine whether or not to render the right portion
    ldrb r1,[r0,#8]             // VRAM x offset
    str r1,[sp,#20]             // No more registers, gotta store this on the stack
    ldrb r2,[r0,#9]             // Render width
    add r2,r1,r2
    cmp r2,#8
    bls +

    // We still have more to render; figure out how much we already rendered
    mov r3,#8
    sub r3,r3,r1
    strb r3,[r0,#8]             // [sp+8] now = glyph columns already drawn

    // Allocate a new tile
    mov r0,r10
    mov r1,r11
    lsr r0,r0,#3
    add r0,r0,#1                // One tile to the right
    lsr r1,r1,#3
    bl .get_tile_number
    add r0,r8                   // Plus tile offset
    mov r4,r0
    lsl r0,r0,#5
    mov r1,#6
    lsl r1,r1,#0x18
    add r0,r0,r1                // VRAM address of the new tile
    str r0,[sp,#0]
    mov r0,sp
    bl .print_right

    //----------------------------------------
    // Update the map
    orr r4,r6
    mov r1,r7
    ldr r2,[sp,#16]             // Height in tiles

-
    strh r4,[r1,#0]
    add r4,#0x20
    add r1,#0x40
    sub r2,r2,#1
    bne -
    add r7,r7,#2

    //----------------------------------------
    // Now we've rendered the left and right portions;
    // we need to determine whether or not to do a final
    // right portion for super wide characters
    ldr r1,[sp,#20]             // Original pixel X offset
    ldrb r2,[r0,#9]             // Render width
    add r2,r1,r2                // Right side of glyph
    cmp r2,#16
    bls +

    // We have one more chunk to render; figure out how much we already rendered
    mov r3,#16
    sub r3,r3,r1
    strb r3,[r0,#8]             // [sp+8] now = glyph columns already drawn

    // Allocate a new tile
    mov r0,r10
    mov r1,r11
    lsr r0,r0,#3
    add r0,r0,#2                // Two tiles to the right
    lsr r1,r1,#3
    bl .get_tile_number
    add r0,r8                   // Plus tile offset
    mov r4,r0
    lsl r0,r0,#5
    mov r1,#6
    lsl r1,r1,#0x18
    add r0,r0,r1                // VRAM address of the new tile
    str r0,[sp,#0]
    mov r0,sp
    bl .print_right

    //----------------------------------------
    // Update the map
    orr r4,r6
    mov r1,r7
    ldr r2,[sp,#16]             // Height in tiles

-
    strh r4,[r1,#0]
    add r4,#0x20
    add r1,#0x40
    sub r2,r2,#1
    bne -
    add r7,r7,#2

    //----------------------------------------
+
    mov r0,r9                   // Return the virtual width
    add sp,#24
    pop {r4}
    mov r12,r4
    pop {r4-r7}
    mov r8,r4
    mov r9,r5
    mov r10,r6
    mov r11,r7
    pop {r1-r7,pc}

//=============================================================================
// void print_left(void* structPointer)
//=============================================================================
// In:
//    r0: struct pointer
//        [r0+0]: VRAM address
//        [r0+4]: glyph address
//        [r0+8]: VRAM x offset (byte)
//        [r0+9]: render width (byte)
//        [r0+10]: background index (byte)
//        [r0+11]: foreground index (byte)
//        [r0+12]: height in tiles (byte)
//        [r0+13]: (3 bytes)
// Draws the part of the glyph that lands in the tile containing the start
// position. For every pixel row the existing 4BPP VRAM word is reduced to
// 1BPP, the glyph bits (shifted left by the x offset) are ORed in, and the
// row is expanded back to 4BPP and stored.
// All of r0-r7 are saved and restored.

.print_left:
    push {r0-r7,lr}
    mov r7,r0

    //----------------------------------------
    ldr r6,[r7,#0]              // VRAM address
    ldr r3,[r7,#4]              // Glyph address
    ldrb r4,[r7,#12]            // Height in tiles

.print_left_loop:
    mov r5,#8                   // 8 pixel rows per tile

-
    ldr r0,[r6,#0]              // 4BPP VRAM row
    ldrb r1,[r7,#11]            // Foreground index
    bl .reduce_bit_depth        // Returns r0 = 1BPP VRAM row
    ldrb r1,[r7,#9]             // Glyph render width
    mov r2,#32
    sub r2,r2,r1
    ldrb r1,[r3,#0]             // Glyph row
    lsl r1,r2                   // Cut off the pixels we don't want to render
    lsr r1,r2
    ldrb r2,[r7,#8]             // X offset
    lsl r1,r2                   // Move the glyph bits to the start position
    lsl r1,r1,#0x18
    lsr r1,r1,#0x18             // Keep only the 8 bits that fall in this tile
    orr r0,r1                   // Merge with what was already drawn
    ldrb r1,[r7,#10]            // Background index
    ldrb r2,[r7,#11]            // Foreground index
    bl .expand_bit_depth
    str r0,[r6,#0]
    add r6,r6,#4                // Next VRAM row
    add r3,r3,#1                // Next glyph row
    sub r5,r5,#1
    bne -

    mov r0,#0x1F
    lsl r0,r0,#5                // 0x3E0
    add r6,r0,r6                // 0x20 already added + 0x3E0 = 0x400 (32 tiles on)
    add r3,#8                   // Skip 8 glyph bytes to reach the next tile row
    sub r4,r4,#1
    bne .print_left_loop

    //----------------------------------------
    pop {r0-r7,pc}
//=============================================================================
// void print_right(void* structPointer)
//=============================================================================
// In:
//    r0: struct pointer
//        [r0+0]: VRAM address
//        [r0+4]: glyph address
//        [r0+8]: glyph x offset (byte)
//        [r0+9]: render width (byte)
//        [r0+10]: background index (byte)
//        [r0+11]: foreground index (byte)
//        [r0+12]: height in tiles (byte)
//        [r0+13]: (3 bytes)
// Same row loop as print_left, except that the glyph bits are shifted RIGHT
// by the x offset, so only the glyph columns past that offset are drawn,
// starting at the left edge of the tile.
// All of r0-r7 are saved and restored.

.print_right:
    push {r0-r7,lr}
    mov r7,r0

    //----------------------------------------
    ldr r6,[r7,#0]              // VRAM address
    ldr r3,[r7,#4]              // Glyph address
    ldrb r4,[r7,#12]            // Height in tiles

.print_right_loop:
    mov r5,#8                   // 8 pixel rows per tile

-
    ldr r0,[r6,#0]              // 4BPP VRAM row
    ldrb r1,[r7,#11]            // Foreground index
    bl .reduce_bit_depth        // Returns r0 = 1BPP VRAM row
    ldrb r1,[r7,#9]             // Glyph render width
    mov r2,#32
    sub r2,r2,r1
    ldrb r1,[r3,#0]             // Glyph row
    lsl r1,r2                   // Cut off the pixels we don't want to render
    lsr r1,r2
    ldrb r2,[r7,#8]             // X offset
    lsr r1,r2                   // Drop the glyph columns that were already drawn
    lsl r1,r1,#0x18
    lsr r1,r1,#0x18             // Keep only the low 8 bits
    orr r0,r1                   // Merge with what was already drawn
    ldrb r1,[r7,#10]            // Background index
    ldrb r2,[r7,#11]            // Foreground index
    bl .expand_bit_depth
    str r0,[r6,#0]
    add r6,r6,#4                // Next VRAM row
    add r3,r3,#1                // Next glyph row
    sub r5,r5,#1
    bne -

    mov r0,#0x1F
    lsl r0,r0,#5                // 0x3E0
    add r6,r0,r6                // 0x20 already added + 0x3E0 = 0x400 (32 tiles on)
    add r3,#8                   // Skip 8 glyph bytes to reach the next tile row
    sub r4,r4,#1
    bne .print_right_loop

    //----------------------------------------
    pop {r0-r7,pc}

//==============================================================================
// byte reduce_bit_depth(int pixels, int foregroundIndex)
// In:
//    r0: row of 4BPP pixels
//    r1: foreground index
// Out:
//    r0: row of 1BPP pixels
//==============================================================================
// Some notes:
//  - to go faster, load in constants manually using PC-relative loads
//    instead of the ldr rX,=#Y pseudoinstruction (which implicitly branches)
//  - in order to do this properly the instructions need to be 32-bit aligned,
//    hence there are some alignment hacks below
//  - the goal is to reduce the 4BPP row of pixels in r0 to a 1BPP row according
//    to the foreground index in r1
//  - this is achieved quickly using a lookup
//  - first step is to set all foreground pixels (each pixel is a nybble in r0) to 0,
//    and all background pixels to non-zero
//  - this is done by XOR-ing r0 with a row of foreground pixels, where a row of
//    foreground pixels is just r1*0x11111111
//  - when we index into the lookup table using the resulting XOR-ed value, we'll get
//    a 1BPP value where each corresponding 0-nybble (a foreground pixel) is a 1
//    and any corresponding non-zero-nybble is a 0
//  - to keep the lookup table at a reasonable size we'll go 4 pixels at a time:
//    there are thus 16^4 = 65536 possible index values and the lookup table will be 64KB
//  - this uses 63 cycles while the previous method used 273 cycles
//
// r1-r3 are saved and restored; only r0 changes.
// NOTE(review): the two [pc,#32] offsets are hand-computed. They presumably
// rely on .reduce_bit_depth starting on a word boundary and on the filler
// pseudoinstruction in the literal pool keeping its current size; re-check
// them whenever an instruction is added to or removed from this routine.

// Alignment hack
// (placed after the previous routine's return; only there to pad the address)
    ldr r0,=#0xDEADBEEF

.reduce_bit_depth:
    push {r1-r3,lr}
    ldr r3,[pc,#32]             // 0x11111111
    mul r1,r3                   // r1 = foreground index repeated in all 8 nybbles
    ldr r2,[pc,#32]             // m2_nybbles_to_bits
    eor r0,r1                   // Foreground nybbles become 0
    lsl r1,r0,#16
    lsr r1,r1,#16               // r1 = low 4 pixels (table index)
    lsr r0,r0,#16               // r0 = high 4 pixels (table index)
    ldrb r3,[r2,r0]             // Bits for the high 4 pixels
    ldrb r0,[r2,r1]             // Bits for the low 4 pixels
    lsl r3,r3,#4
    orr r0,r3                   // Combine into one 8-pixel 1BPP row
    pop {r1-r3,pc}

// Literal pool
// (the ldr below is filler in front of the two constants loaded above)
    ldr r0,=#0xDEADBEEF
    dd 0x11111111
    dd m2_nybbles_to_bits

//==============================================================================
// int expand_bit_depth(byte pixels)
// In:
//    r0: row of 1BPP pixels
//    r1: background index
//    r2: foreground index
// Out:
//    r0: row of 4BPP pixels
//==============================================================================
//  - similar to reduce_bit_depth, we go fast using a lookup table
//  - there are really 16 lookup tables, one for each possible value of r1/r2
//  - we simply look up the word at (table + (r0*4) + (r1*1024)) to get the 4BPP
//    expanded version of r0 using colour index r1 (or r2)
//  - do it once for foreground, then invert r0 and do it again for background
//  - OR the two values together to get the final 4BPP row of pixels (the two
//    lookups use complementary bit patterns, so they are expected to fill
//    different nybbles)
//  - this uses 61 cycles while the previous method used 287 cycles
//
// r1-r6 are saved and restored; only r0 changes.
// NOTE(review): r0 is inverted with 0xFF and used directly as a table index,
// so this assumes the caller passes a value with no bits set above bit 7.
// NOTE(review): the [pc,#36] offset is hand-computed, with the same alignment
// and size assumptions as in reduce_bit_depth; re-check it after any edit.

// Alignment hack
// (placed after the previous literal pool; only there to pad the address)
    ldr r0,=#0xDEADBEEF

.expand_bit_depth:
    push {r1-r6,lr}
    ldr r6,[pc,#36]             // m2_bits_to_nybbles

    // Foreground
    lsl r4,r2,#10               // Foreground index * 1024 selects the sub-table
    lsl r3,r0,#2                // Pixel bits * 4 selects the word
    add r5,r4,r6
    ldr r2,[r5,r3]              // r2 = foreground nybbles

    // Background
    lsl r4,r1,#10               // Background index * 1024 selects the sub-table
    add r5,r4,r6
    mov r4,#0xFF
    eor r0,r4                   // Invert the 8 pixel bits
    lsl r3,r0,#2
    ldr r1,[r5,r3]              // r1 = background nybbles
    orr r2,r1                   // Merge foreground and background
    mov r0,r2
    pop {r1-r6,pc}

// Literal pool
// (the ldr below is filler in front of the constant loaded above)
    ldr r0,=#0xDEADBEEF
    dd m2_bits_to_nybbles