reduce_bit_depth now uses a lookup table

2017-03-10 00:44:39 -05:00 · 2017-03-10 00:44:39 -05:00 · fa29c67534
parent 674942a6d9
commit fa29c67534
3 changed files with 45 additions and 22 deletions
--- a/m2-hack.asm
+++ b/m2-hack.asm
@@ -99,6 +99,10 @@ m2_widths_saturn:
 m2_bits_to_nybbles:
 incbin m2-bits-to-nybbles.bin

+m2_nybbles_to_bits: // byte lookup table read by .reduce_bit_depth (m2-vwf.asm), indexed by a 16-bit value holding four 4BPP pixels
+incbin m2-nybbles-to-bits.bin
+
+
 //==============================================================================
 // Misc
 //==============================================================================
--- a/m2-nybbles-to-bits.bin
+++ b/m2-nybbles-to-bits.bin
--- a/m2-vwf.asm
+++ b/m2-vwf.asm
@@ -455,31 +455,50 @@ pop     {r0-r7,pc}
 //    r0: row of 1BPP pixels
 //==============================================================================

+// Some notes:
+// - to go faster, load in constants manually using PC-relative loads
+//   instead of the ldr rX,=#Y pseudoinstruction (which implicitly branches)
+// - in order to do this properly the instructions need to be 32-bit aligned,
+//   hence there are some alignment hacks below
+// - the goal is to reduce the 4BPP row of pixels in r0 to a 1BPP row according
+//   to the foreground index in r1
+// - this is achieved quickly using a lookup
+// - first step is to set all foreground pixels (each pixel is a nybble in r0) to 0,
+//   and all background pixels to non-zero
+// - this is done by XOR-ing r0 with a row of foreground pixels, where a row of
+//   foreground pixels is just r1*0x11111111
+// - when we index into the lookup table using the resulting XOR-ed value, we'll get
+//   a 1BPP value where each corresponding 0-nybble (a foreground pixel) is a 1
+//   and any corresponding non-zero-nybble is a 0
+// - to keep the lookup table at a reasonable size we'll go 4 pixels at a time:
+//   there are thus 16^4 = 65536 possible index values and the lookup table will be 64KB
+// - this uses 63 cycles while the previous method used 273 cycles
+
+// Alignment hack: padding only, so that .reduce_bit_depth starts where the
+// hard-coded PC-relative offsets below expect it (the 0xDEADBEEF value is a placeholder)
+ldr     r0,=#0xDEADBEEF

 .reduce_bit_depth:
-push    {r1-r6,lr}
-mov     r3,r0
-mov     r0,#0
-mov     r4,#0xF
-mov     r5,#1
-mov     r6,#28
+push    {r1-r3,lr} // r1-r3 are used as scratch below and restored on exit

-//--------------------------------
-
-mov     r2,r3
-lsr     r2,r6
-and     r2,r4
-cmp     r1,r2
-bne     +
-orr     r0,r5
-+
-sub     r6,r6,#4
-bmi     +
-lsl     r0,r0,#1
-b       -
+ldr     r3,[pc,#32] // r3 = 0x11111111 (first literal pool word); the offset is hard-coded, so the code layout below must not change
+mul     r1,r3 // r1 = foreground index copied into every nybble (assumes r1 is a single nybble, 0-15 -- confirm with callers)
+ldr     r2,[pc,#32] // r2 = address of m2_nybbles_to_bits (second literal pool word)
+eor     r0,r1 // foreground nybbles become 0, every other nybble becomes non-zero

-//--------------------------------
-+
-pop     {r1-r6,pc}
+lsl     r1,r0,#16 // shift the low 16 bits to the top...
+lsr     r1,r1,#16 // ...and back down: r1 = low 16 bits of r0 (four pixels)
+lsr     r0,r0,#16 // r0 = high 16 bits of r0 (the other four pixels)
+ldrb    r3,[r2,r0] // r3 = table byte for the high half
+ldrb    r0,[r2,r1] // r0 = table byte for the low half
+lsl     r3,r3,#4 // make room for the low half's bits (presumably each table entry is a 4-bit value -- table contents not visible here)
+orr     r0,r3 // r0 = combined 1BPP row for all eight pixels
+
+pop     {r1-r3,pc} // restore scratch registers and return
+
+// Literal pool: the two dd words are read by the PC-relative loads above
+ldr     r0,=#0xDEADBEEF // padding; presumably sized so the dd words land at the offsets the loads expect
+dd      0x11111111 // multiplier that replicates one nybble across all eight nybbles
+dd      m2_nybbles_to_bits // address of the lookup table


 //==============================================================================