diff --git a/libpixelflinger/t32cb16blend.S b/libpixelflinger/t32cb16blend.S index d4b257981..caf9eb7cd 100644 --- a/libpixelflinger/t32cb16blend.S +++ b/libpixelflinger/t32cb16blend.S @@ -21,53 +21,80 @@ .global scanline_t32cb16blend_arm -// uses r6, r7, lr -.macro pixel, DREG, SRC, FB, OFFSET +/* + * .macro pixel + * + * \DREG is a 32-bit register containing *two* original destination RGB565 + * pixels, with the even one in the low 16 bits, and the odd one in the + * high 16 bits. + * + * \SRC is a 32-bit 0xAABBGGRR pixel value, with pre-multiplied colors. + * + * \FB is a target register that will contain the blended pixel values. + * + * \ODD is either 0 or 1 and selects whether the lower (0) or the + * upper (1) 16-bit pixel of \DREG is blended into \FB. + * + * + * clobbered: r6, r7, lr, condition flags (cmp is used to saturate each channel) + * + */ - // SRC = AARRGGBB +.macro pixel, DREG, SRC, FB, ODD + + // SRC = 0xAABBGGRR mov r7, \SRC, lsr #24 // sA add r7, r7, r7, lsr #7 // sA + (sA >> 7) rsb r7, r7, #0x100 // sA = 0x100 - (sA+(sA>>7)) 1: -.if \OFFSET +.if \ODD // red - mov lr, \DREG, lsr #(\OFFSET + 6 + 5) + mov lr, \DREG, lsr #(16 + 11) smulbb lr, r7, lr mov r6, \SRC, lsr #3 and r6, r6, #0x1F add lr, r6, lr, lsr #8 - orr \FB, lr, lsl #(\OFFSET + 11) + cmp lr, #0x1F + orrhs \FB, \FB, #(0x1F<<(16 + 11)) + orrlo \FB, \FB, lr, lsl #(16 + 11) // green - and r6, \DREG, #(0x3F<<(\OFFSET + 5)) + and r6, \DREG, #(0x3F<<(16 + 5)) smulbt r6, r7, r6 mov lr, \SRC, lsr #(8+2) and lr, lr, #0x3F add r6, lr, r6, lsr #(5+8) - orr \FB, \FB, r6, lsl #(\OFFSET + 5) + cmp r6, #0x3F + orrhs \FB, \FB, #(0x3F<<(16 + 5)) + orrlo \FB, \FB, r6, lsl #(16 + 5) // blue - and lr, \DREG, #(0x1F << \OFFSET) + and lr, \DREG, #(0x1F << 16) smulbt lr, r7, lr mov r6, \SRC, lsr #(8+8+3) and r6, r6, #0x1F add lr, r6, lr, lsr #8 - orr \FB, \FB, lr, lsl #\OFFSET + cmp lr, #0x1F + orrhs \FB, \FB, #(0x1F << 16) + orrlo \FB, \FB, lr, lsl #16 .else // red - mov lr, \DREG, lsr #(6+5) + mov lr, \DREG, lsr #11 and lr, lr, #0x1F smulbb lr, r7, lr mov r6, \SRC, lsr #3 
and r6, r6, #0x1F add lr, r6, lr, lsr #8 - mov \FB, lr, lsl #11 + cmp lr, #0x1F + movhs \FB, #(0x1F<<11) + movlo \FB, lr, lsl #11 + // green and r6, \DREG, #(0x3F<<5) @@ -75,7 +102,9 @@ mov lr, \SRC, lsr #(8+2) and lr, lr, #0x3F add r6, lr, r6, lsr #(5+8) - orr \FB, \FB, r6, lsl #5 + cmp r6, #0x3F + orrhs \FB, \FB, #(0x3F<<5) + orrlo \FB, \FB, r6, lsl #5 // blue and lr, \DREG, #0x1F @@ -83,7 +112,9 @@ mov r6, \SRC, lsr #(8+8+3) and r6, r6, #0x1F add lr, r6, lr, lsr #8 - orr \FB, \FB, lr + cmp lr, #0x1F + orrhs \FB, \FB, #0x1F + orrlo \FB, \FB, lr .endif @@ -128,7 +159,7 @@ aligned: subs r2, r2, #2 blo 9f - // The main loop is unrolled twice and process 4 pixels + // The main loop is unrolled twice and processes 4 pixels 8: ldmia r1!, {r4, r5} // stream the source pld [r1, #32] @@ -142,7 +173,7 @@ aligned: // stream the destination pld [r0, #32] pixel r3, r4, r12, 0 - pixel r3, r5, r12, 16 + pixel r3, r5, r12, 1 // effectively, we're getting write-combining by virtue of the // cpu's write-back cache. str r12, [r0, #-4]