m2_vwf:

//==============================================================================
// int get_tile_number(int x, int y)
//    In:
//        r0: x
//        r1: y
//    Out:
//        r0: tile number
//==============================================================================

.get_tile_number:

// Looks up a halfword in m2_coord_table at byte offset
//     (x-1)*2 + ((y-1)>>1)*56
// and adds 0x20 when (y-1) is odd.
// r1-r5 are preserved; only r0 is changed.

push    {r1-r5,lr}
sub     r1,r1,#1             // Both coordinates are 1-based
sub     r0,r0,#1

// Move bit 0 of (y-1) straight to bit 5: r2 = 0x20 for odd rows, else 0
lsl     r2,r1,#31
lsr     r2,r2,#26

// r5 = ((y-1) >> 1) * 56, built as pair*64 - pair*8
lsr     r1,r1,#1
lsl     r5,r1,#6
lsl     r3,r1,#3
sub     r5,r5,r3

// r0 = byte offset of the table entry
lsl     r0,r0,#1
add     r0,r0,r5

ldr     r4,=#m2_coord_table
add     r4,r4,r0
ldrh    r0,[r4,#0]
add     r0,r0,r2
pop     {r1-r5,pc}


//==============================================================================
// void weld_entry(WINDOW* window, byte* chr)
//    In:
//        r0: address of window data
//        r1: address of char to print
//==============================================================================

//--------------------------------
.weld_entry:
push    {r0-r7,lr}
add     sp,#-28              // 28 bytes of locals, released before returning
mov     r5,r0                // r5 = window data for the rest of the routine

//--------------------------------
// Get the char and rebase it by 0x50.
// One unsigned compare is enough to validate it: a value that went negative
// in the subtraction wraps to a huge unsigned number and fails the same test
// as a value that is simply too large.
ldrb    r0,[r1,#0]
sub     r0,#0x50
cmp     r0,#0x60
bcc     .char_custom
mov     r0,#0x1F            // Replace char with ? if it's invalid

.char_custom:
str     r0,[sp,#0x0]         // [sp+0] = char

//--------------------------------
// Current X: (halfword at +0x22) + (halfword at +0x2A) in tiles,
// plus the pixel offset kept in the halfword at +2
ldrh    r2,[r5,#0x2A]
ldrh    r1,[r5,#0x22]
add     r1,r1,r2
str     r1,[sp,#4]           // [sp+4] = X in tiles
ldrh    r3,[r5,#2]
str     r3,[sp,#24]          // [sp+24] = pixel offset within the tile
lsl     r0,r1,#3
add     r0,r0,r3             // Current pixel X
str     r0,[sp,#20]          // [sp+20] = current pixel X

// Current Y: (halfword at +0x24) + (halfword at +0x2C) in tiles
ldrh    r3,[r5,#0x2C]
ldrh    r1,[r5,#0x24]
add     r1,r1,r3
str     r1,[sp,#8]           // [sp+8] = Y in tiles
lsl     r1,r1,#3             // Current pixel Y

//--------------------------------
// Print with font 0; r0 comes back as the virtual width
ldr     r2,[sp,#0x0]
mov     r3,#0
bl      .print_character
str     r0,[sp,#12]          // [sp+12] = virtual width

//--------------------------------
// New pixel X = old pixel X + virtual width
ldr     r1,[sp,#20]
add     r0,r0,r1

// Store the new tile X, relative to the halfword at +0x22
lsr     r1,r0,#3
ldrh    r2,[r5,#0x22]
sub     r1,r1,r2
strh    r1,[r5,#0x2A]

// Store the new pixel offset (low three bits of the pixel X)
lsl     r0,r0,#29
lsr     r0,r0,#29
strh    r0,[r5,#2]

//--------------------------------
add     sp,#28
pop     {r0-r7,pc}


//=============================================================================
// void print_character(int x, int y, int chr, int font)
// In:
//    r0: x (pixel)
//    r1: y (pixel)
//    r2: character
//    r3: font
//        0: main
//        1: saturn
//        2: tiny
// Out:
//    r0: virtual width
//=============================================================================

.print_character:

// Renders one glyph into VRAM and updates the tilemap, one tile column at a
// time: a "left" portion, then up to two "right" portions if the glyph spills
// past the first tile(s).
//
// Register roles after setup:
//    r4:  tile offset (later: tile number of the column being drawn)
//    r5:  font
//    r6:  palette mask
//    r7:  tilemap address of the column being drawn
//    r8:  tile offset
//    r9:  virtual width (the return value)
//    r10: x (pixel), r11: y (pixel), r12: character
//
// Stack frame (24 bytes); the first 13 bytes are the struct handed to
// print_left / print_right:
//    [sp+0]:  VRAM address
//    [sp+4]:  glyph address
//    [sp+8]:  x offset (byte)
//    [sp+9]:  render width (byte)
//    [sp+10]: background index (byte)
//    [sp+11]: foreground index (byte)
//    [sp+12]: height in tiles (byte)
//    [sp+16]: height in tiles (word)
//    [sp+20]: original x offset within the first tile (word)

// Save r1-r7/lr, then r8-r11 and r12 (high registers have to be moved
// through low registers before they can be pushed)
push    {r1-r7,lr}
mov     r4,r8
mov     r5,r9
mov     r6,r10
mov     r7,r11
push    {r4-r7}
mov     r4,r12
push    {r4}
add     sp,#-24

mov     r10,r0
mov     r11,r1
mov     r12,r2
mov     r5,r3

//----------------------------------------
ldr     r3,=#0x30051EC
ldrh    r4,[r3,#0]           // Tile offset
add     r3,#0x3C
ldrh    r6,[r3,#0]           // Palette mask
add     r3,#0x48
ldr     r7,[r3,#0]           // Tilemap address
// Tilemap entry offset = ((x / 8) + (y / 8) * 32) * 2
lsr     r0,r0,#3
lsr     r1,r1,#3
lsl     r1,r1,#5
add     r0,r0,r1
lsl     r0,r0,#1
add     r7,r7,r0             // Local tilemap address
mov     r8,r4

//----------------------------------------
// m2_widths_table holds one word per font; that word is the address of a
// table with two bytes per character: virtual width, then render width
ldr     r0,=#m2_widths_table
lsl     r1,r5,#2             // Font number * 4
ldr     r0,[r0,r1]
mov     r3,r12               // Character
lsl     r2,r3,#1
ldrb    r1,[r0,r2]           // Virtual width
mov     r9,r1
add     r2,r2,#1
ldrb    r0,[r0,r2]           // Render width
cmp     r0,#0
// NOTE(review): this forward branch presumably resolves to the `+` label at
// the epilogue, so the virtual width is still returned -- confirm
beq     +                    // Don't bother rendering a zero-width character
ldr     r2,=#m2_height_table
ldrb    r2,[r2,r5]
str     r2,[sp,#16]          // No more registers, gotta store this on the stack
mov     r3,sp
strb    r0,[r3,#9]           // Render width
strb    r2,[r3,#12]          // Height in tiles
mov     r1,r10
lsl     r1,r1,#29
lsr     r1,r1,#29
strb    r1,[r3,#8]           // x offset within the tile (x & 7)
mov     r1,#4
strb    r1,[r3,#10]          // Background index
mov     r1,#0xF
strb    r1,[r3,#11]          // Foreground index

//----------------------------------------
mov     r0,r10
mov     r1,r11
lsr     r0,r0,#3
lsr     r1,r1,#3
bl      .get_tile_number
add     r4,r0,r4             // Tile number + tile offset
lsl     r0,r4,#5             // 32 bytes per tile
mov     r1,#6
lsl     r1,r1,#0x18          // 0x06000000
add     r0,r0,r1             // VRAM address
str     r0,[sp,#0]

//----------------------------------------
// m2_font_table holds one word per font; glyphs are 32 bytes apart
ldr     r0,=#m2_font_table
lsl     r1,r5,#2
ldr     r0,[r0,r1]
mov     r1,r12
lsl     r1,r1,#5
add     r0,r0,r1             // Glyph address
str     r0,[sp,#4]

//----------------------------------------
// Render left portion
mov     r0,sp
bl      .print_left

//----------------------------------------
// Update the map
// r2 still holds the height in tiles loaded above: neither get_tile_number
// nor print_left changes it
orr     r4,r6
mov     r1,r7
-
strh    r4,[r1,#0]
add     r4,#0x20             // Tile number of the next tile row
add     r1,#0x40             // Next tilemap row (32 entries * 2 bytes)
sub     r2,r2,#1
bne     -
add     r7,r7,#2             // Next tilemap column

//----------------------------------------
// Now we've rendered the left portion;
// we need to determine whether or not to render the right portion
// (r0 still equals sp here; print_left restores r0)
ldrb    r1,[r0,#8]           // VRAM x offset
str     r1,[sp,#20]          // No more registers, gotta store this on the stack
ldrb    r2,[r0,#9]           // Render width
add     r2,r1,r2
cmp     r2,#8
bls     +

// We still have more to render; figure out how much we already rendered
mov     r3,#8
sub     r3,r3,r1
strb    r3,[r0,#8]           // print_right takes this as its glyph x offset

// Allocate a new tile
mov     r0,r10
mov     r1,r11
lsr     r0,r0,#3
add     r0,r0,#1
lsr     r1,r1,#3
bl      .get_tile_number
add     r0,r8
mov     r4,r0
lsl     r0,r0,#5
mov     r1,#6
lsl     r1,r1,#0x18
add     r0,r0,r1
str     r0,[sp,#0]
mov     r0,sp
bl      .print_right

//----------------------------------------
// Update the map
orr     r4,r6
mov     r1,r7
ldr     r2,[sp,#16]
-
strh    r4,[r1,#0]
add     r4,#0x20
add     r1,#0x40
sub     r2,r2,#1
bne     -
add     r7,r7,#2

//----------------------------------------
// Now we've rendered the left and right portions;
// we need to determine whether or not to do a final
// right portion for super wide characters
ldr     r1,[sp,#20]          // Original pixel X offset
ldrb    r2,[r0,#9]           // Render width
add     r2,r1,r2             // Right side of glyph
cmp     r2,#16
bls     +

// We have one more chunk to render; figure out how much we already rendered
mov     r3,#16
sub     r3,r3,r1
strb    r3,[r0,#8]

// Allocate a new tile
mov     r0,r10
mov     r1,r11
lsr     r0,r0,#3
add     r0,r0,#2
lsr     r1,r1,#3
bl      .get_tile_number
add     r0,r8
mov     r4,r0
lsl     r0,r0,#5
mov     r1,#6
lsl     r1,r1,#0x18
add     r0,r0,r1
str     r0,[sp,#0]
mov     r0,sp
bl      .print_right

//----------------------------------------
// Update the map
orr     r4,r6
mov     r1,r7
ldr     r2,[sp,#16]
-
strh    r4,[r1,#0]
add     r4,#0x20
add     r1,#0x40
sub     r2,r2,#1
bne     -
add     r7,r7,#2

//----------------------------------------
// Common exit: return the virtual width and restore everything saved on
// entry, in reverse order
+
mov     r0,r9
add     sp,#24
pop     {r4}
mov     r12,r4
pop     {r4-r7}
mov     r8,r4
mov     r9,r5
mov     r10,r6
mov     r11,r7
pop     {r1-r7,pc}


//=============================================================================
// void print_left(void* structPointer)
//=============================================================================

// In:
// r0: struct pointer
// [r0+0]: VRAM address
// [r0+4]: glyph address
// [r0+8]: VRAM x offset (byte)
// [r0+9]: render width (byte)
// [r0+10]: background index (byte)
// [r0+11]: foreground index (byte)
// [r0+12]: height in tiles (byte)
// [r0+13]: <unused> (3 bytes)

.print_left:

// Blends the glyph into the tile column at [r0+0], shifting each glyph row
// left by the x offset so that it starts part-way into the tile.
// All registers are preserved.

push    {r0-r7,lr}
mov     r7,r0                // r7 = struct pointer

//----------------------------------------
ldrb    r4,[r7,#12]          // Tile rows still to draw
ldr     r3,[r7,#4]           // Glyph row pointer
ldr     r6,[r7,#0]           // VRAM row pointer

.print_left_loop:
mov     r5,#8                // 8 pixel rows per tile
-
// Build this row's glyph bits in r2 first
ldrb    r2,[r7,#9]           // Glyph render width
mov     r1,#32
sub     r1,r1,r2
ldrb    r2,[r3,#0]           // Glyph row
lsl     r2,r1                // Cut off the pixels we don't want to render
lsr     r2,r1
ldrb    r1,[r7,#8]           // X offset
lsl     r2,r1
lsl     r2,r2,#0x18          // Keep only the 8 pixels that land in this tile
lsr     r2,r2,#0x18

// Fetch the existing row as 1BPP; reduce_bit_depth preserves r1-r3, so the
// glyph bits and the foreground index are both still live afterwards
ldr     r0,[r6,#0]           // 4BPP VRAM row
ldrb    r1,[r7,#11]          // Foreground index
bl      .reduce_bit_depth    // Returns r0 = 1BPP VRAM row
orr     r0,r2                // Merge in the glyph pixels

// Back to 4BPP: r1 = background index, r2 = foreground index
mov     r2,r1
ldrb    r1,[r7,#10]
bl      .expand_bit_depth
str     r0,[r6,#0]

add     r3,r3,#1             // Next glyph row
add     r6,r6,#4             // Next VRAM row
sub     r5,r5,#1
bne     -

// Next tile row: the VRAM pointer has to end up 0x400 past the start of this
// tile (0x20 was already added by the row loop); the glyph pointer skips 8
// more bytes
mov     r0,#0x1F
lsl     r0,r0,#5
add     r6,r6,r0
add     r3,#8
sub     r4,r4,#1
bne     .print_left_loop

//----------------------------------------
pop     {r0-r7,pc}


//=============================================================================
// void print_right(void* structPointer)
//=============================================================================

// In:
// r0: struct pointer
// [r0+0]: VRAM address
// [r0+4]: glyph address
// [r0+8]: glyph x offset (byte)
// [r0+9]: render width (byte)
// [r0+10]: background index (byte)
// [r0+11]: foreground index (byte)
// [r0+12]: height in tiles (byte)
// [r0+13]: <unused> (3 bytes)

.print_right:

// Blends the remainder of the glyph into the tile column at [r0+0], shifting
// each glyph row right by the glyph x offset so that only the pixels not yet
// drawn are left.
// All registers are preserved.

push    {r0-r7,lr}
mov     r7,r0                // r7 = struct pointer

//----------------------------------------
ldrb    r4,[r7,#12]          // Tile rows still to draw
ldr     r3,[r7,#4]           // Glyph row pointer
ldr     r6,[r7,#0]           // VRAM row pointer

.print_right_loop:
mov     r5,#8                // 8 pixel rows per tile
-
// Build this row's glyph bits in r2 first
ldrb    r2,[r7,#9]           // Glyph render width
mov     r1,#32
sub     r1,r1,r2
ldrb    r2,[r3,#0]           // Glyph row
lsl     r2,r1                // Cut off the pixels we don't want to render
lsr     r2,r1
ldrb    r1,[r7,#8]           // Glyph x offset
lsr     r2,r1                // Drop the pixels already rendered
lsl     r2,r2,#0x18          // Keep only the 8 pixels that land in this tile
lsr     r2,r2,#0x18

// Fetch the existing row as 1BPP; reduce_bit_depth preserves r1-r3, so the
// glyph bits and the foreground index are both still live afterwards
ldr     r0,[r6,#0]           // 4BPP VRAM row
ldrb    r1,[r7,#11]          // Foreground index
bl      .reduce_bit_depth    // Returns r0 = 1BPP VRAM row
orr     r0,r2                // Merge in the glyph pixels

// Back to 4BPP: r1 = background index, r2 = foreground index
mov     r2,r1
ldrb    r1,[r7,#10]
bl      .expand_bit_depth
str     r0,[r6,#0]

add     r3,r3,#1             // Next glyph row
add     r6,r6,#4             // Next VRAM row
sub     r5,r5,#1
bne     -

// Next tile row: the VRAM pointer has to end up 0x400 past the start of this
// tile (0x20 was already added by the row loop); the glyph pointer skips 8
// more bytes
mov     r0,#0x1F
lsl     r0,r0,#5
add     r6,r6,r0
add     r3,#8
sub     r4,r4,#1
bne     .print_right_loop

//----------------------------------------
pop     {r0-r7,pc}


//==============================================================================
// byte reduce_bit_depth(int pixels, int foreground)
// In:
//    r0: row of 4BPP pixels
//    r1: foreground index
// Out:
//    r0: row of 1BPP pixels
//==============================================================================

// Some notes:
// - to go faster, load in constants manually using PC-relative loads
//   instead of the ldr rX,=#Y pseudoinstruction (which implicitly branches)
// - in order to do this properly the instructions need to be 32-bit aligned,
//   hence there are some alignment hacks below
// - the goal is to reduce the 4BPP row of pixels in r0 to a 1BPP row according
//   to the foreground index in r1
// - this is achieved quickly using a lookup
// - first step is to set all foreground pixels (each pixel is a nybble in r0) to 0,
//   and all background pixels to non-zero
// - this is done by XOR-ing r0 with a row of foreground pixels, where a row of
//   foreground pixels is just r1*0x11111111
// - when we index into the lookup table using the resulting XOR-ed value, we'll get
//   a 1BPP value where each corresponding 0-nybble (a foreground pixel) is a 1
//   and any corresponding non-zero-nybble is a 0
// - to keep the lookup table at a reasonable size we'll go 4 pixels at a time:
//   there are thus 16^4 = 65536 possible index values and the lookup table will be 64KB
// - this uses 63 cycles while the previous method used 273 cycles

// Alignment hack
// This instruction is never meant to run as part of the routine; it is here
// only for the effect its expansion has on the alignment of what follows
ldr     r0,=#0xDEADBEEF

.reduce_bit_depth:
// Preserves r1-r3; only r0 is changed
push    {r1-r3,lr}

// NOTE(review): the two [pc,#32] offsets are hand-computed for exactly this
// instruction sequence and literal pool layout. Adding, removing or resizing
// any instruction between here and the pool means recomputing them.
ldr     r3,[pc,#32] // 0x11111111
mul     r1,r3       // r1 = foreground index repeated in all eight nybbles
ldr     r2,[pc,#32] // m2_nybbles_to_bits
eor     r0,r1       // Foreground pixels become zero nybbles

// Split into two 16-bit halves (4 pixels each); each half indexes the table
lsl     r1,r0,#16
lsr     r1,r1,#16   // r1 = low half
lsr     r0,r0,#16   // r0 = high half
ldrb    r3,[r2,r0]  // Table byte for the high half
ldrb    r0,[r2,r1]  // Table byte for the low half
lsl     r3,r3,#4    // High half goes above the low half's bits
orr     r0,r3

pop     {r1-r3,pc}

// Literal pool
// The first line is the same alignment trick as above; the two words after it
// are the constants fetched by the PC-relative loads
ldr     r0,=#0xDEADBEEF
dd      0x11111111
dd      m2_nybbles_to_bits


//==============================================================================
// int expand_bit_depth(byte pixels, int background, int foreground)
// In:
//    r0: row of 1BPP pixels
//    r1: background index
//    r2: foreground index
// Out:
//    r0: row of 4BPP pixels
//==============================================================================

// - similar to reduce_bit_depth, we go fast using a lookup table
// - there are really 16 lookup tables, one for each possible value of r1/r2
// - we simply look up the word at (table + (r0*4) + (r1*1024)) to get the 4BPP
//   expanded version of r0 using colour index r1 (or r2)
// - do it once for foreground, then invert r0 and do it again for background
// - OR the two values together to get the final 4BPP row of pixels (the two
//   lookups use complementary bit patterns, since r0 is inverted in between)
// - this uses 61 cycles while the previous method used 287 cycles

// Alignment hack
// This instruction is never meant to run as part of the routine; it is here
// only for the effect its expansion has on the alignment of what follows
ldr     r0,=#0xDEADBEEF

.expand_bit_depth:
// Preserves r1-r6; only r0 is changed
push    {r1-r6,lr}
// NOTE(review): the [pc,#36] offset is hand-computed for exactly this
// instruction sequence and literal pool layout. Adding, removing or resizing
// any instruction between here and the pool means recomputing it.
ldr     r6,[pc,#36] // m2_bits_to_nybbles

// Foreground
lsl     r4,r2,#10   // Foreground index * 1024 selects the sub-table
lsl     r3,r0,#2    // Pixels * 4 selects the word within it
add     r5,r4,r6
ldr     r2,[r5,r3]  // r2 = word looked up for the foreground index

// Background
lsl     r4,r1,#10   // Background index * 1024 selects the sub-table
add     r5,r4,r6
mov     r4,#0xFF
eor     r0,r4       // Invert the low 8 bits of the pixel row
lsl     r3,r0,#2
ldr     r1,[r5,r3]  // r1 = word looked up for the background index

// Combine both lookups into the returned row
orr     r2,r1
mov     r0,r2

pop     {r1-r6,pc}

// Literal pool
// The first line is the same alignment trick as above; the word after it is
// the constant fetched by the PC-relative load
ldr     r0,=#0xDEADBEEF
dd      m2_bits_to_nybbles