/* platform_system_core/libpixelflinger/t32cb16blend.S */

/* libs/pixelflinger/t32cb16blend.S
**
** Copyright 2006, The Android Open Source Project
**
** Licensed under the Apache License, Version 2.0 (the "License");
** you may not use this file except in compliance with the License.
** You may obtain a copy of the License at
**
**     http://www.apache.org/licenses/LICENSE-2.0
**
** Unless required by applicable law or agreed to in writing, software
** distributed under the License is distributed on an "AS IS" BASIS,
** WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
** See the License for the specific language governing permissions and
** limitations under the License.
*/


	// Emit the routine into the code (text) section.
	.text
	// NOTE(review): no alignment value is given here; the ARM GNU assembler is
	// expected to treat this as a 4-byte (word) boundary -- confirm.
	.align

	// Make the entry point visible to the linker.
	.global scanline_t32cb16blend_arm

// pixel DREG, SRC, FB, OFFSET
//
// Blends one 32-bit source pixel over one 16-bit 5-6-5 destination pixel:
//
//     out = src + ((dst * f) >> 8)     per channel,
//     f   = 0x100 - (sA + (sA >> 7))   with sA = SRC[31:24]
//
// The source colour is added unscaled, i.e. it is treated as already
// multiplied by its alpha. Each channel is clamped to its field width
// (0x1F / 0x3F / 0x1F) so that an out-of-range source cannot spill into the
// neighbouring channel or into the other pixel packed in the same word.
//
//   DREG    register holding the destination pixel(s); not modified
//   SRC     register holding the source pixel; not modified
//   FB      register receiving the blended 5-6-5 pixel
//   OFFSET  bit position of the pixel inside DREG and FB:
//             0  -> low halfword;  FB is overwritten
//             16 -> high halfword; result is OR-ed into FB, so FB must
//                   already hold the OFFSET=0 result with bits 16..31 clear
//           The non-zero path relies on smulbt (top halfword) and on an
//           unmasked shift for red, so 16 is the only valid non-zero value.
//
// Clobbers: r6, r7, lr and the condition flags.
// DREG, SRC and FB must not be r6, r7 or lr, and FB must differ from DREG
// and SRC.

.macro pixel,   DREG, SRC, FB, OFFSET

    // SRC = AABBGGRR (red in bits 0..7, alpha in bits 24..31)
    mov     r7, \SRC, lsr #24           // sA
    add     r7, r7, r7, lsr #7          // sA + (sA >> 7): maps 255 -> 256
    rsb     r7, r7, #0x100              // f = 0x100 - (sA + (sA >> 7))

.if \OFFSET

    // red: the top 5 bits of DREG, so no mask is needed after the shift
    mov     lr, \DREG, lsr #(\OFFSET + 6 + 5)
    smulbb  lr, r7, lr                  // f * dR
    mov     r6, \SRC, lsr #3
    and     r6, r6, #0x1F               // sR reduced to 5 bits
    add     lr, r6, lr, lsr #8          // sR + ((f * dR) >> 8)
    cmp     lr, #0x1F                   // saturate to 5 bits
    orrhs   \FB, \FB, #(0x1F << (\OFFSET + 11))
    orrlo   \FB, \FB, lr, lsl #(\OFFSET + 11)

    // green: masked in place; the top halfword then holds dG << 5
    and     r6, \DREG, #(0x3F << (\OFFSET + 5))
    smulbt  r6, r7, r6                  // f * (dG << 5)
    mov     lr, \SRC, lsr #(8 + 2)
    and     lr, lr, #0x3F               // sG reduced to 6 bits
    add     r6, lr, r6, lsr #(5 + 8)    // sG + ((f * dG) >> 8)
    cmp     r6, #0x3F                   // saturate to 6 bits
    orrhs   \FB, \FB, #(0x3F << (\OFFSET + 5))
    orrlo   \FB, \FB, r6, lsl #(\OFFSET + 5)

    // blue: masked in place; the top halfword then holds dB
    and     lr, \DREG, #(0x1F << \OFFSET)
    smulbt  lr, r7, lr                  // f * dB
    mov     r6, \SRC, lsr #(8 + 8 + 3)
    and     r6, r6, #0x1F               // sB reduced to 5 bits
    add     lr, r6, lr, lsr #8          // sB + ((f * dB) >> 8)
    cmp     lr, #0x1F                   // saturate to 5 bits
    orrhs   \FB, \FB, #(0x1F << \OFFSET)
    orrlo   \FB, \FB, lr, lsl #\OFFSET

.else

    // red: DREG may carry a second pixel above bit 15, so mask after shifting
    mov     lr, \DREG, lsr #(6 + 5)
    and     lr, lr, #0x1F
    smulbb  lr, r7, lr                  // f * dR
    mov     r6, \SRC, lsr #3
    and     r6, r6, #0x1F               // sR reduced to 5 bits
    add     lr, r6, lr, lsr #8          // sR + ((f * dR) >> 8)
    cmp     lr, #0x1F                   // saturate to 5 bits
    movhs   \FB, #(0x1F << 11)          // first write: FB is initialised here
    movlo   \FB, lr, lsl #11

    // green
    and     r6, \DREG, #(0x3F << 5)
    smulbb  r6, r7, r6                  // f * (dG << 5)
    mov     lr, \SRC, lsr #(8 + 2)
    and     lr, lr, #0x3F               // sG reduced to 6 bits
    add     r6, lr, r6, lsr #(5 + 8)    // sG + ((f * dG) >> 8)
    cmp     r6, #0x3F                   // saturate to 6 bits
    orrhs   \FB, \FB, #(0x3F << 5)
    orrlo   \FB, \FB, r6, lsl #5

    // blue
    and     lr, \DREG, #0x1F
    smulbb  lr, r7, lr                  // f * dB
    mov     r6, \SRC, lsr #(8 + 8 + 3)
    and     r6, r6, #0x1F               // sB reduced to 5 bits
    add     lr, r6, lr, lsr #8          // sB + ((f * dB) >> 8)
    cmp     lr, #0x1F                   // saturate to 5 bits
    orrhs   \FB, \FB, #0x1F
    orrlo   \FB, \FB, lr

.endif

    .endm


// scanline_t32cb16blend_arm
//
// Blends a run of 32-bit source pixels over 16-bit 5-6-5 destination pixels,
// in place, using the pixel macro for the per-pixel arithmetic.
// NOTE(review): presumably called from C as (dst, src, count) -- confirm the
// exact prototype against the caller.
//
// In:
// r0:  dst ptr  (16-bit pixels, read and written, advanced as we go)
// r1:  src ptr  (32-bit pixels, read only, advanced as we go)
// r2:  count    (pixels to blend; used as the running remainder)
//
// Register usage:
// r3:  d        (destination pixel(s) loaded from memory)
// r4:  s0       (first source pixel of a pair, or the single pixel)
// r5:  s1       (second source pixel of a pair)
// r6:  pixel    (macro scratch)
// r7:  pixel    (macro scratch)
// r8:  free
// r9:  free
// r10: free
// r11: free
// r12: scratch  (blended result waiting to be stored)
// r14: pixel    (macro scratch, which is why lr is saved on entry)
//
// r4-r7 and lr are saved on entry and restored on every return path.
// No condition flags are live across a pixel invocation.

scanline_t32cb16blend_arm:
    stmfd   sp!, {r4-r7, lr}

    pld     [r0]
    pld     [r1]

    // align DST to 32 bits: if it is not word aligned, blend one pixel first
    tst     r0, #0x3
    beq     aligned
    subs    r2, r2, #1
    ldmlofd sp!, {r4-r7, lr}        // count was 0: nothing to do, return
    bxlo    lr

    // Blend a single pixel. Reached for the unaligned leading pixel and,
    // via the branch at the very end, for an odd trailing pixel.
last:
    ldr     r4, [r1], #4
    ldrh    r3, [r0]
    pixel   r3, r4, r12, 0
    strh    r12, [r0], #2

aligned:
    // from here on r2 holds (pixels remaining - 2)
    subs    r2, r2, #2
    blo     9f                      // fewer than two pixels left

    // The main loop is unrolled twice and processes up to 4 pixels per pass
8:  ldmia   r1!, {r4, r5}
    // stream the source
    pld     [r1, #32]
    add     r0, r0, #4
    // both source pixels are zero: the destination pair is left untouched,
    // so skip it (this also skips the unrolled second half of the pass)
    orrs    r3, r4, r5
    beq     7f

    // load the destination pair (r0 was already advanced past it)
    ldr     r3, [r0, #-4]
    // stream the destination
    pld     [r0, #32]
    pixel   r3, r4, r12, 0
    pixel   r3, r5, r12, 16
    // effectively, we're getting write-combining by virtue of the
    // cpu's write-back cache.
    str     r12, [r0, #-4]

    // 2nd iteration of the loop, don't stream anything
    subs    r2, r2, #2
    blt     9f                      // fewer than two pixels left
    ldmia   r1!, {r4, r5}
    add     r0, r0, #4
    orrs    r3, r4, r5
    beq     7f
    ldr     r3, [r0, #-4]
    pixel   r3, r4, r12, 0
    pixel   r3, r5, r12, 16
    str     r12, [r0, #-4]

7:  subs    r2, r2, #2
    bhs     8b

    // Tail: r2 is -2 (nothing left) or -1 (one pixel left). Adding 1 leaves
    // the carry clear in the first case and set in the second. The leftover
    // pixel has not been fetched yet; it is loaded at "last".
9:  adds    r2, r2, #1
    ldmlofd sp!, {r4-r7, lr}        // nothing left: return
    bxlo    lr
    b       last