/* libs/pixelflinger/codeflinger/GGLAssembler.cpp
**
** Copyright 2006, The Android Open Source Project
**
** Licensed under the Apache License, Version 2.0 (the "License");
** you may not use this file except in compliance with the License.
** You may obtain a copy of the License at
**
**     http://www.apache.org/licenses/LICENSE-2.0
**
** Unless required by applicable law or agreed to in writing, software
** distributed under the License is distributed on an "AS IS" BASIS,
** WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
** See the License for the specific language governing permissions and
** limitations under the License.
*/

#define LOG_TAG "GGLAssembler"

#include <assert.h>
#include <stdint.h>
#include <stdio.h>
#include <stdlib.h>
#include <sys/types.h>

#include <log/log.h>

#include "GGLAssembler.h"

namespace android {

// ----------------------------------------------------------------------------

GGLAssembler::GGLAssembler(ARMAssemblerInterface* target)
    : ARMAssemblerProxy(target),
      RegisterAllocator(ARMAssemblerProxy::getCodegenArch()), mOptLevel(7)
{
}

GGLAssembler::~GGLAssembler()
{
}

void GGLAssembler::prolog()
{
    ARMAssemblerProxy::prolog();
}

void GGLAssembler::epilog(uint32_t touched)
{
    ARMAssemblerProxy::epilog(touched);
}

void GGLAssembler::reset(int opt_level)
{
    ARMAssemblerProxy::reset();
    RegisterAllocator::reset();
    mOptLevel = opt_level;
}

// ---------------------------------------------------------------------------

int GGLAssembler::scanline(const needs_t& needs, context_t const* c)
{
    int err = 0;
    int opt_level = mOptLevel;
    while (opt_level >= 0) {
        reset(opt_level);
        err = scanline_core(needs, c);
        if (err == 0)
            break;
        opt_level--;
    }
    
    // XXX: in theory, pcForLabel is not valid before generate()
    uint32_t* fragment_start_pc = pcForLabel("fragment_loop");
    uint32_t* fragment_end_pc = pcForLabel("epilog");
    const int per_fragment_ops = int(fragment_end_pc - fragment_start_pc);
    
    // build a name for our pipeline
    char name[64];    
    sprintf(name,
            "scanline__%08X:%08X_%08X_%08X [%3d ipp]",
            needs.p, needs.n, needs.t[0], needs.t[1], per_fragment_ops);

    if (err) {
        ALOGE("Error while generating ""%s""\n", name);
        disassemble(name);
        return -1;
    }

    return generate(name);
}

int GGLAssembler::scanline_core(const needs_t& needs, context_t const* c)
{
    int64_t duration = ggl_system_time();

    mBlendFactorCached = 0;
    mBlending = 0;
    mMasking = 0;
    mAA        = GGL_READ_NEEDS(P_AA, needs.p);
    mDithering = GGL_READ_NEEDS(P_DITHER, needs.p);
    mAlphaTest = GGL_READ_NEEDS(P_ALPHA_TEST, needs.p) + GGL_NEVER;
    mDepthTest = GGL_READ_NEEDS(P_DEPTH_TEST, needs.p) + GGL_NEVER;
    mFog       = GGL_READ_NEEDS(P_FOG, needs.p) != 0;
    mSmooth    = GGL_READ_NEEDS(SHADE, needs.n) != 0;
    mBuilderContext.needs = needs;
    mBuilderContext.c = c;
    mBuilderContext.Rctx = reserveReg(R0); // context always in R0
    mCbFormat = c->formats[ GGL_READ_NEEDS(CB_FORMAT, needs.n) ];

    // ------------------------------------------------------------------------

    decodeLogicOpNeeds(needs);

    decodeTMUNeeds(needs, c);

    mBlendSrc  = ggl_needs_to_blendfactor(GGL_READ_NEEDS(BLEND_SRC, needs.n));
    mBlendDst  = ggl_needs_to_blendfactor(GGL_READ_NEEDS(BLEND_DST, needs.n));
    mBlendSrcA = ggl_needs_to_blendfactor(GGL_READ_NEEDS(BLEND_SRCA, needs.n));
    mBlendDstA = ggl_needs_to_blendfactor(GGL_READ_NEEDS(BLEND_DSTA, needs.n));

    if (!mCbFormat.c[GGLFormat::ALPHA].h) {
        if ((mBlendSrc == GGL_ONE_MINUS_DST_ALPHA) ||
            (mBlendSrc == GGL_DST_ALPHA)) {
            mBlendSrc = GGL_ONE;
        }
        if ((mBlendSrcA == GGL_ONE_MINUS_DST_ALPHA) ||
            (mBlendSrcA == GGL_DST_ALPHA)) {
            mBlendSrcA = GGL_ONE;
        }
        if ((mBlendDst == GGL_ONE_MINUS_DST_ALPHA) ||
            (mBlendDst == GGL_DST_ALPHA)) {
            mBlendDst = GGL_ONE;
        }
        if ((mBlendDstA == GGL_ONE_MINUS_DST_ALPHA) ||
            (mBlendDstA == GGL_DST_ALPHA)) {
            mBlendDstA = GGL_ONE;
        }
    }

    // if we need the framebuffer, read it now
    const int blending =    blending_codes(mBlendSrc, mBlendDst) |
                            blending_codes(mBlendSrcA, mBlendDstA);

    // XXX: handle special cases, destination not modified...
    if ((mBlendSrc==GGL_ZERO) && (mBlendSrcA==GGL_ZERO) &&
        (mBlendDst==GGL_ONE) && (mBlendDstA==GGL_ONE)) {
        // Destination unmodified (beware of logic ops)
    } else if ((mBlendSrc==GGL_ZERO) && (mBlendSrcA==GGL_ZERO) &&
        (mBlendDst==GGL_ZERO) && (mBlendDstA==GGL_ZERO)) {
        // Destination is zero (beware of logic ops)
    }
    
    int fbComponents = 0;
    const int masking = GGL_READ_NEEDS(MASK_ARGB, needs.n);
    for (int i=0 ; i<4 ; i++) {
        const int mask = 1<<i;
        component_info_t& info = mInfo[i];
        int fs = i==GGLFormat::ALPHA ? mBlendSrcA : mBlendSrc;
        int fd = i==GGLFormat::ALPHA ? mBlendDstA : mBlendDst;
        if (fs==GGL_SRC_ALPHA_SATURATE && i==GGLFormat::ALPHA)
            fs = GGL_ONE;
        info.masked =   !!(masking & mask);
        info.inDest =   !info.masked && mCbFormat.c[i].h && 
                        ((mLogicOp & LOGIC_OP_SRC) || (!mLogicOp));
        if (mCbFormat.components >= GGL_LUMINANCE &&
                (i==GGLFormat::GREEN || i==GGLFormat::BLUE)) {
            info.inDest = false;
        }
        info.needed =   (i==GGLFormat::ALPHA) && 
                        (isAlphaSourceNeeded() || mAlphaTest != GGL_ALWAYS);
        info.replaced = !!(mTextureMachine.replaced & mask);
        info.iterated = (!info.replaced && (info.inDest || info.needed)); 
        info.smooth =   mSmooth && info.iterated;
        info.fog =      mFog && info.inDest && (i != GGLFormat::ALPHA);
        info.blend =    (fs != int(GGL_ONE)) || (fd > int(GGL_ZERO));

        mBlending |= (info.blend ? mask : 0);
        mMasking |= (mCbFormat.c[i].h && info.masked) ? mask : 0;
        fbComponents |= mCbFormat.c[i].h ? mask : 0;
    }

    mAllMasked = (mMasking == fbComponents);
    if (mAllMasked) {
        mDithering = 0;
    }
    
    fragment_parts_t parts;

    // ------------------------------------------------------------------------
    prolog();
    // ------------------------------------------------------------------------

    build_scanline_prolog(parts, needs);

    if (registerFile().status())
        return registerFile().status();

    // ------------------------------------------------------------------------
    label("fragment_loop");
    // ------------------------------------------------------------------------
    {
        Scratch regs(registerFile());

        if (mDithering) {
            // update the dither index.
            MOV(AL, 0, parts.count.reg,
                    reg_imm(parts.count.reg, ROR, GGL_DITHER_ORDER_SHIFT));
            ADD(AL, 0, parts.count.reg, parts.count.reg,
                    imm( 1 << (32 - GGL_DITHER_ORDER_SHIFT)));
            MOV(AL, 0, parts.count.reg,
                    reg_imm(parts.count.reg, ROR, 32 - GGL_DITHER_ORDER_SHIFT));
        }

        // XXX: could we do an early alpha-test here in some cases?
        // It would probaly be used only with smooth-alpha and no texture
        // (or no alpha component in the texture).

        // Early z-test
        if (mAlphaTest==GGL_ALWAYS) {
            build_depth_test(parts, Z_TEST|Z_WRITE);
        } else {
            // we cannot do the z-write here, because
            // it might be killed by the alpha-test later
            build_depth_test(parts, Z_TEST);
        }

        { // texture coordinates
            Scratch scratches(registerFile());

            // texel generation
            build_textures(parts, regs);
            if (registerFile().status())
                return registerFile().status();
        }

        if ((blending & (FACTOR_DST|BLEND_DST)) || 
                (mMasking && !mAllMasked) ||
                (mLogicOp & LOGIC_OP_DST)) 
        {
            // blending / logic_op / masking need the framebuffer
            mDstPixel.setTo(regs.obtain(), &mCbFormat);

            // load the framebuffer pixel
            comment("fetch color-buffer");
            load(parts.cbPtr, mDstPixel);
        }

        if (registerFile().status())
            return registerFile().status();

        pixel_t pixel;
        int directTex = mTextureMachine.directTexture;
        if (directTex | parts.packed) {
            // note: we can't have both here
            // iterated color or direct texture
            pixel = directTex ? parts.texel[directTex-1] : parts.iterated;
            pixel.flags &= ~CORRUPTIBLE;
        } else {
            if (mDithering) {
                const int ctxtReg = mBuilderContext.Rctx;
                const int mask = GGL_DITHER_SIZE-1;
                parts.dither = reg_t(regs.obtain());
                AND(AL, 0, parts.dither.reg, parts.count.reg, imm(mask));
                ADDR_ADD(AL, 0, parts.dither.reg, ctxtReg, parts.dither.reg);
                LDRB(AL, parts.dither.reg, parts.dither.reg,
                        immed12_pre(GGL_OFFSETOF(ditherMatrix)));
            }
        
            // allocate a register for the resulting pixel
            pixel.setTo(regs.obtain(), &mCbFormat, FIRST);

            build_component(pixel, parts, GGLFormat::ALPHA,    regs);

            if (mAlphaTest!=GGL_ALWAYS) {
                // only handle the z-write part here. We know z-test
                // was successful, as well as alpha-test.
                build_depth_test(parts, Z_WRITE);
            }

            build_component(pixel, parts, GGLFormat::RED,      regs);
            build_component(pixel, parts, GGLFormat::GREEN,    regs);
            build_component(pixel, parts, GGLFormat::BLUE,     regs);

            pixel.flags |= CORRUPTIBLE;
        }

        if (registerFile().status())
            return registerFile().status();
        
        if (pixel.reg == -1) {
            // be defensive here. if we're here it's probably
            // that this whole fragment is a no-op.
            pixel = mDstPixel;
        }
        
        if (!mAllMasked) {
            // logic operation
            build_logic_op(pixel, regs);
    
            // masking
            build_masking(pixel, regs); 
    
            comment("store");
            store(parts.cbPtr, pixel, WRITE_BACK);
        }
    }

    if (registerFile().status())
        return registerFile().status();

    // update the iterated color...
    if (parts.reload != 3) {
        build_smooth_shade(parts);
    }

    // update iterated z
    build_iterate_z(parts);

    // update iterated fog
    build_iterate_f(parts);

    SUB(AL, S, parts.count.reg, parts.count.reg, imm(1<<16));
    B(PL, "fragment_loop");
    label("epilog");
    epilog(registerFile().touched());

    if ((mAlphaTest!=GGL_ALWAYS) || (mDepthTest!=GGL_ALWAYS)) {
        if (mDepthTest!=GGL_ALWAYS) {
            label("discard_before_textures");
            build_iterate_texture_coordinates(parts);
        }
        label("discard_after_textures");
        build_smooth_shade(parts);
        build_iterate_z(parts);
        build_iterate_f(parts);
        if (!mAllMasked) {
            ADDR_ADD(AL, 0, parts.cbPtr.reg, parts.cbPtr.reg, imm(parts.cbPtr.size>>3));
        }
        SUB(AL, S, parts.count.reg, parts.count.reg, imm(1<<16));
        B(PL, "fragment_loop");
        epilog(registerFile().touched());
    }

    return registerFile().status();
}

// ---------------------------------------------------------------------------

void GGLAssembler::build_scanline_prolog(
    fragment_parts_t& parts, const needs_t& needs)
{
    Scratch scratches(registerFile());
    int Rctx = mBuilderContext.Rctx;

    // compute count
    comment("compute ct (# of pixels to process)");
    parts.count.setTo(obtainReg());
    int Rx = scratches.obtain();    
    int Ry = scratches.obtain();
    CONTEXT_LOAD(Rx, iterators.xl);
    CONTEXT_LOAD(parts.count.reg, iterators.xr);
    CONTEXT_LOAD(Ry, iterators.y);

    // parts.count = iterators.xr - Rx
    SUB(AL, 0, parts.count.reg, parts.count.reg, Rx);
    SUB(AL, 0, parts.count.reg, parts.count.reg, imm(1));

    if (mDithering) {
        // parts.count.reg = 0xNNNNXXDD
        // NNNN = count-1
        // DD   = dither offset
        // XX   = 0xxxxxxx (x = garbage)
        Scratch scratches(registerFile());
        int tx = scratches.obtain();
        int ty = scratches.obtain();
        AND(AL, 0, tx, Rx, imm(GGL_DITHER_MASK));
        AND(AL, 0, ty, Ry, imm(GGL_DITHER_MASK));
        ADD(AL, 0, tx, tx, reg_imm(ty, LSL, GGL_DITHER_ORDER_SHIFT));
        ORR(AL, 0, parts.count.reg, tx, reg_imm(parts.count.reg, LSL, 16));
    } else {
        // parts.count.reg = 0xNNNN0000
        // NNNN = count-1
        MOV(AL, 0, parts.count.reg, reg_imm(parts.count.reg, LSL, 16));
    }

    if (!mAllMasked) {
        // compute dst ptr
        comment("compute color-buffer pointer");
        const int cb_bits = mCbFormat.size*8;
        int Rs = scratches.obtain();
        parts.cbPtr.setTo(obtainReg(), cb_bits);
        CONTEXT_LOAD(Rs, state.buffers.color.stride);
        CONTEXT_ADDR_LOAD(parts.cbPtr.reg, state.buffers.color.data);
        SMLABB(AL, Rs, Ry, Rs, Rx);  // Rs = Rx + Ry*Rs
        base_offset(parts.cbPtr, parts.cbPtr, Rs);
        scratches.recycle(Rs);
    }
    
    // init fog
    const int need_fog = GGL_READ_NEEDS(P_FOG, needs.p);
    if (need_fog) {
        comment("compute initial fog coordinate");
        Scratch scratches(registerFile());
        int dfdx = scratches.obtain();
        int ydfdy = scratches.obtain();
        int f = ydfdy;
        CONTEXT_LOAD(dfdx,  generated_vars.dfdx);
        CONTEXT_LOAD(ydfdy, iterators.ydfdy);
        MLA(AL, 0, f, Rx, dfdx, ydfdy);
        CONTEXT_STORE(f, generated_vars.f);
    }

    // init Z coordinate
    if ((mDepthTest != GGL_ALWAYS) || GGL_READ_NEEDS(P_MASK_Z, needs.p)) {
        parts.z = reg_t(obtainReg());
        comment("compute initial Z coordinate");
        Scratch scratches(registerFile());
        int dzdx = scratches.obtain();
        int ydzdy = parts.z.reg;
        CONTEXT_LOAD(dzdx,  generated_vars.dzdx);   // 1.31 fixed-point
        CONTEXT_LOAD(ydzdy, iterators.ydzdy);       // 1.31 fixed-point
        MLA(AL, 0, parts.z.reg, Rx, dzdx, ydzdy);

        // we're going to index zbase of parts.count
        // zbase = base + (xl-count + stride*y)*2
        int Rs = dzdx;
        int zbase = scratches.obtain();
        CONTEXT_LOAD(Rs, state.buffers.depth.stride);
        CONTEXT_ADDR_LOAD(zbase, state.buffers.depth.data);
        SMLABB(AL, Rs, Ry, Rs, Rx);
        ADD(AL, 0, Rs, Rs, reg_imm(parts.count.reg, LSR, 16));
        ADDR_ADD(AL, 0, zbase, zbase, reg_imm(Rs, LSL, 1));
        CONTEXT_ADDR_STORE(zbase, generated_vars.zbase);
    }

    // init texture coordinates
    init_textures(parts.coords, reg_t(Rx), reg_t(Ry));
    scratches.recycle(Ry);

    // iterated color
    init_iterated_color(parts, reg_t(Rx));

    // init coverage factor application (anti-aliasing)
    if (mAA) {
        parts.covPtr.setTo(obtainReg(), 16);
        CONTEXT_ADDR_LOAD(parts.covPtr.reg, state.buffers.coverage);
        ADDR_ADD(AL, 0, parts.covPtr.reg, parts.covPtr.reg, reg_imm(Rx, LSL, 1));
    }
}

// ---------------------------------------------------------------------------

void GGLAssembler::build_component( pixel_t& pixel,
                                    const fragment_parts_t& parts,
                                    int component,
                                    Scratch& regs)
{
    static char const * comments[] = {"alpha", "red", "green", "blue"};
    comment(comments[component]);

    // local register file
    Scratch scratches(registerFile());
    const int dst_component_size = pixel.component_size(component);

    component_t temp(-1);
    build_incoming_component( temp, dst_component_size,
            parts, component, scratches, regs);

    if (mInfo[component].inDest) {

        // blending...
        build_blending( temp, mDstPixel, component, scratches );

        // downshift component and rebuild pixel...
        downshift(pixel, component, temp, parts.dither);
    }
}

void GGLAssembler::build_incoming_component(
                                    component_t& temp,
                                    int dst_size,
                                    const fragment_parts_t& parts,
                                    int component,
                                    Scratch& scratches,
                                    Scratch& global_regs)
{
    const uint32_t component_mask = 1<<component;

    // Figure out what we need for the blending stage...
    int fs = component==GGLFormat::ALPHA ? mBlendSrcA : mBlendSrc;
    int fd = component==GGLFormat::ALPHA ? mBlendDstA : mBlendDst;
    if (fs==GGL_SRC_ALPHA_SATURATE && component==GGLFormat::ALPHA) {
        fs = GGL_ONE;
    }

    // Figure out what we need to extract and for what reason
    const int blending = blending_codes(fs, fd);

    // Are we actually going to blend?
    const int need_blending = (fs != int(GGL_ONE)) || (fd > int(GGL_ZERO));
    
    // expand the source if the destination has more bits
    int need_expander = false;
    for (int i=0 ; i<GGL_TEXTURE_UNIT_COUNT-1 ; i++) {
        texture_unit_t& tmu = mTextureMachine.tmu[i];
        if ((tmu.format_idx) &&
            (parts.texel[i].component_size(component) < dst_size)) {
            need_expander = true;
        }
    }

    // do we need to extract this component?
    const bool multiTexture = mTextureMachine.activeUnits > 1;
    const int blend_needs_alpha_source = (component==GGLFormat::ALPHA) &&
                                        (isAlphaSourceNeeded());
    int need_extract = mInfo[component].needed;
    if (mInfo[component].inDest)
    {
        need_extract |= ((need_blending ?
                (blending & (BLEND_SRC|FACTOR_SRC)) : need_expander));
        need_extract |= (mTextureMachine.mask != mTextureMachine.replaced);
        need_extract |= mInfo[component].smooth;
        need_extract |= mInfo[component].fog;
        need_extract |= mDithering;
        need_extract |= multiTexture;
    }

    if (need_extract) {
        Scratch& regs = blend_needs_alpha_source ? global_regs : scratches;
        component_t fragment;

        // iterated color
        build_iterated_color(fragment, parts, component, regs);

        // texture environement (decal, modulate, replace)
        build_texture_environment(fragment, parts, component, regs);

        // expand the source if the destination has more bits
        if (need_expander && (fragment.size() < dst_size)) {
            // we're here only if we fetched a texel
            // (so we know for sure fragment is CORRUPTIBLE)
            expand(fragment, fragment, dst_size);
        }

        // We have a few specific things to do for the alpha-channel
        if ((component==GGLFormat::ALPHA) &&
            (mInfo[component].needed || fragment.size()<dst_size))
        {
            // convert to integer_t first and make sure
            // we don't corrupt a needed register
            if (fragment.l) {
                component_t incoming(fragment);
                modify(fragment, regs);
                MOV(AL, 0, fragment.reg, reg_imm(incoming.reg, LSR, incoming.l));
                fragment.h -= fragment.l;
                fragment.l = 0;
            }

            // coverage factor application
            build_coverage_application(fragment, parts, regs);

            // alpha-test
            build_alpha_test(fragment, parts);

            if (blend_needs_alpha_source) {
                // We keep only 8 bits for the blending stage
                const int shift = fragment.h <= 8 ? 0 : fragment.h-8;
                if (fragment.flags & CORRUPTIBLE) {
                    fragment.flags &= ~CORRUPTIBLE;
                    mAlphaSource.setTo(fragment.reg,
                            fragment.size(), fragment.flags);
                    if (shift) {
                        MOV(AL, 0, mAlphaSource.reg,
                            reg_imm(mAlphaSource.reg, LSR, shift));
                    }
                } else {
                    // XXX: it would better to do this in build_blend_factor()
                    // so we can avoid the extra MOV below.
                    mAlphaSource.setTo(regs.obtain(),
                            fragment.size(), CORRUPTIBLE);
                    if (shift) {
                        MOV(AL, 0, mAlphaSource.reg,
                            reg_imm(fragment.reg, LSR, shift));
                    } else {
                        MOV(AL, 0, mAlphaSource.reg, fragment.reg);
                    }
                }
                mAlphaSource.s -= shift;
            }
        }

        // fog...
        build_fog( fragment, component, regs );

        temp = fragment;
    } else {
        if (mInfo[component].inDest) {
            // extraction not needed and replace
            // we just select the right component
            if ((mTextureMachine.replaced & component_mask) == 0) {
                // component wasn't replaced, so use it!
                temp = component_t(parts.iterated, component);
            }
            for (int i=0 ; i<GGL_TEXTURE_UNIT_COUNT ; i++) {
                const texture_unit_t& tmu = mTextureMachine.tmu[i];
                if ((tmu.mask & component_mask) &&
                    ((tmu.replaced & component_mask) == 0)) {
                    temp = component_t(parts.texel[i], component);
                }
            }
        }
    }
}

bool GGLAssembler::isAlphaSourceNeeded() const
{
    // XXX: also needed for alpha-test
    const int bs = mBlendSrc;
    const int bd = mBlendDst;
    return  bs==GGL_SRC_ALPHA_SATURATE ||
            bs==GGL_SRC_ALPHA || bs==GGL_ONE_MINUS_SRC_ALPHA ||
            bd==GGL_SRC_ALPHA || bd==GGL_ONE_MINUS_SRC_ALPHA ; 
}

// ---------------------------------------------------------------------------

void GGLAssembler::build_smooth_shade(const fragment_parts_t& parts)
{
    if (mSmooth && !parts.iterated_packed) {
        // update the iterated color in a pipelined way...
        comment("update iterated color");
        Scratch scratches(registerFile());

        const int reload = parts.reload;
        for (int i=0 ; i<4 ; i++) {
            if (!mInfo[i].iterated) 
                continue;
                
            int c = parts.argb[i].reg;
            int dx = parts.argb_dx[i].reg;
            
            if (reload & 1) {
                c = scratches.obtain();
                CONTEXT_LOAD(c, generated_vars.argb[i].c);
            }
            if (reload & 2) {
                dx = scratches.obtain();
                CONTEXT_LOAD(dx, generated_vars.argb[i].dx);
            }
            
            if (mSmooth) {
                ADD(AL, 0, c, c, dx);
            }
            
            if (reload & 1) {
                CONTEXT_STORE(c, generated_vars.argb[i].c);
                scratches.recycle(c);
            }
            if (reload & 2) {
                scratches.recycle(dx);
            }
        }
    }
}

// ---------------------------------------------------------------------------

void GGLAssembler::build_coverage_application(component_t& fragment,
        const fragment_parts_t& parts, Scratch& regs)
{
    // here fragment.l is guarenteed to be 0
    if (mAA) {
        // coverages are 1.15 fixed-point numbers
        comment("coverage application");

        component_t incoming(fragment);
        modify(fragment, regs);

        Scratch scratches(registerFile());
        int cf = scratches.obtain();
        LDRH(AL, cf, parts.covPtr.reg, immed8_post(2));
        if (fragment.h > 31) {
            fragment.h--;
            SMULWB(AL, fragment.reg, incoming.reg, cf);
        } else {
            MOV(AL, 0, fragment.reg, reg_imm(incoming.reg, LSL, 1));
            SMULWB(AL, fragment.reg, fragment.reg, cf);
        }
    }
}

// ---------------------------------------------------------------------------

void GGLAssembler::build_alpha_test(component_t& fragment,
                                    const fragment_parts_t& /*parts*/)
{
    if (mAlphaTest != GGL_ALWAYS) {
        comment("Alpha Test");
        Scratch scratches(registerFile());
        int ref = scratches.obtain();
        const int shift = GGL_COLOR_BITS-fragment.size();
        CONTEXT_LOAD(ref, state.alpha_test.ref);
        if (shift) CMP(AL, fragment.reg, reg_imm(ref, LSR, shift));
        else       CMP(AL, fragment.reg, ref);
        int cc = NV;
        switch (mAlphaTest) {
        case GGL_NEVER:     cc = NV;    break;
        case GGL_LESS:      cc = LT;    break;
        case GGL_EQUAL:     cc = EQ;    break;
        case GGL_LEQUAL:    cc = LS;    break;
        case GGL_GREATER:   cc = HI;    break;
        case GGL_NOTEQUAL:  cc = NE;    break;
        case GGL_GEQUAL:    cc = HS;    break;
        }
        B(cc^1, "discard_after_textures");
    }
}

// ---------------------------------------------------------------------------
            
void GGLAssembler::build_depth_test(
        const fragment_parts_t& parts, uint32_t mask)
{
    mask &= Z_TEST|Z_WRITE;
    const needs_t& needs = mBuilderContext.needs;
    const int zmask = GGL_READ_NEEDS(P_MASK_Z, needs.p);
    Scratch scratches(registerFile());

    if (mDepthTest != GGL_ALWAYS || zmask) {
        int cc=AL, ic=AL;
        switch (mDepthTest) {
        case GGL_LESS:      ic = HI;    break;
        case GGL_EQUAL:     ic = EQ;    break;
        case GGL_LEQUAL:    ic = HS;    break;
        case GGL_GREATER:   ic = LT;    break;
        case GGL_NOTEQUAL:  ic = NE;    break;
        case GGL_GEQUAL:    ic = LS;    break;
        case GGL_NEVER:
            // this never happens, because it's taken care of when 
            // computing the needs. but we keep it for completness.
            comment("Depth Test (NEVER)");
            B(AL, "discard_before_textures");
            return;
        case GGL_ALWAYS:
            // we're here because zmask is enabled
            mask &= ~Z_TEST;    // test always passes.
            break;
        }
        
        // inverse the condition
        cc = ic^1;
        
        if ((mask & Z_WRITE) && !zmask) {
            mask &= ~Z_WRITE;
        }
        
        if (!mask)
            return;

        comment("Depth Test");

        int zbase = scratches.obtain();
        int depth = scratches.obtain();
        int z = parts.z.reg;
        
        CONTEXT_ADDR_LOAD(zbase, generated_vars.zbase);  // stall
        ADDR_SUB(AL, 0, zbase, zbase, reg_imm(parts.count.reg, LSR, 15));
            // above does zbase = zbase + ((count >> 16) << 1)

        if (mask & Z_TEST) {
            LDRH(AL, depth, zbase);  // stall
            CMP(AL, depth, reg_imm(z, LSR, 16));
            B(cc, "discard_before_textures");
        }
        if (mask & Z_WRITE) {
            if (mask == Z_WRITE) {
                // only z-write asked, cc is meaningless
                ic = AL;
            }
            MOV(AL, 0, depth, reg_imm(z, LSR, 16));
            STRH(ic, depth, zbase);
        }
    }
}

void GGLAssembler::build_iterate_z(const fragment_parts_t& parts)
{
    const needs_t& needs = mBuilderContext.needs;
    if ((mDepthTest != GGL_ALWAYS) || GGL_READ_NEEDS(P_MASK_Z, needs.p)) {
        Scratch scratches(registerFile());
        int dzdx = scratches.obtain();
        CONTEXT_LOAD(dzdx, generated_vars.dzdx);    // stall
        ADD(AL, 0, parts.z.reg, parts.z.reg, dzdx); 
    }
}

void GGLAssembler::build_iterate_f(const fragment_parts_t& /*parts*/)
{
    const needs_t& needs = mBuilderContext.needs;
    if (GGL_READ_NEEDS(P_FOG, needs.p)) {
        Scratch scratches(registerFile());
        int dfdx = scratches.obtain();
        int f = scratches.obtain();
        CONTEXT_LOAD(f,     generated_vars.f);
        CONTEXT_LOAD(dfdx,  generated_vars.dfdx);   // stall
        ADD(AL, 0, f, f, dfdx);
        CONTEXT_STORE(f,    generated_vars.f);
    }
}

// ---------------------------------------------------------------------------

void GGLAssembler::build_logic_op(pixel_t& pixel, Scratch& regs)
{
    const needs_t& needs = mBuilderContext.needs;
    const int opcode = GGL_READ_NEEDS(LOGIC_OP, needs.n) | GGL_CLEAR;
    if (opcode == GGL_COPY)
        return;
    
    comment("logic operation");

    pixel_t s(pixel);
    if (!(pixel.flags & CORRUPTIBLE)) {
        pixel.reg = regs.obtain();
        pixel.flags |= CORRUPTIBLE;
    }
    
    pixel_t d(mDstPixel);
    switch(opcode) {
    case GGL_CLEAR:         MOV(AL, 0, pixel.reg, imm(0));          break;
    case GGL_AND:           AND(AL, 0, pixel.reg, s.reg, d.reg);    break;
    case GGL_AND_REVERSE:   BIC(AL, 0, pixel.reg, s.reg, d.reg);    break;
    case GGL_COPY:                                                  break;
    case GGL_AND_INVERTED:  BIC(AL, 0, pixel.reg, d.reg, s.reg);    break;
    case GGL_NOOP:          MOV(AL, 0, pixel.reg, d.reg);           break;
    case GGL_XOR:           EOR(AL, 0, pixel.reg, s.reg, d.reg);    break;
    case GGL_OR:            ORR(AL, 0, pixel.reg, s.reg, d.reg);    break;
    case GGL_NOR:           ORR(AL, 0, pixel.reg, s.reg, d.reg);
                            MVN(AL, 0, pixel.reg, pixel.reg);       break;
    case GGL_EQUIV:         EOR(AL, 0, pixel.reg, s.reg, d.reg);
                            MVN(AL, 0, pixel.reg, pixel.reg);       break;
    case GGL_INVERT:        MVN(AL, 0, pixel.reg, d.reg);           break;
    case GGL_OR_REVERSE:    // s | ~d == ~(~s & d)
                            BIC(AL, 0, pixel.reg, d.reg, s.reg);
                            MVN(AL, 0, pixel.reg, pixel.reg);       break;
    case GGL_COPY_INVERTED: MVN(AL, 0, pixel.reg, s.reg);           break;
    case GGL_OR_INVERTED:   // ~s | d == ~(s & ~d)
                            BIC(AL, 0, pixel.reg, s.reg, d.reg);
                            MVN(AL, 0, pixel.reg, pixel.reg);       break;
    case GGL_NAND:          AND(AL, 0, pixel.reg, s.reg, d.reg);
                            MVN(AL, 0, pixel.reg, pixel.reg);       break;
    case GGL_SET:           MVN(AL, 0, pixel.reg, imm(0));          break;
    };        
}

// ---------------------------------------------------------------------------

static uint32_t find_bottom(uint32_t val)
{
    uint32_t i = 0;
    while (!(val & (3<<i)))
        i+= 2;
    return i;
}

static void normalize(uint32_t& val, uint32_t& rot)
{
    rot = 0;
    while (!(val&3)  || (val & 0xFC000000)) {
        uint32_t newval;
        newval = val >> 2;
        newval |= (val&3) << 30;
        val = newval;
        rot += 2;
        if (rot == 32) {
            rot = 0;
            break;
        }
    }
}

void GGLAssembler::build_and_immediate(int d, int s, uint32_t mask, int bits)
{
    uint32_t rot;
    uint32_t size = ((bits>=32) ? 0 : (1LU << bits)) - 1;
    mask &= size;

    if (mask == size) {
        if (d != s)
            MOV( AL, 0, d, s);
        return;
    }
    
    if ((getCodegenArch() == CODEGEN_ARCH_MIPS) ||
        (getCodegenArch() == CODEGEN_ARCH_MIPS64)) {
        // MIPS can do 16-bit imm in 1 instr, 32-bit in 3 instr
        // the below ' while (mask)' code is buggy on mips
        // since mips returns true on isValidImmediate()
        // then we get multiple AND instr (positive logic)
        AND( AL, 0, d, s, imm(mask) );
        return;
    }
    else if (getCodegenArch() == CODEGEN_ARCH_ARM64) {
        AND( AL, 0, d, s, imm(mask) );
        return;
    }

    int negative_logic = !isValidImmediate(mask);
    if (negative_logic) {
        mask = ~mask & size;
    }
    normalize(mask, rot);

    if (mask) {
        while (mask) {
            uint32_t bitpos = find_bottom(mask);
            int shift = rot + bitpos;
            uint32_t m = mask & (0xff << bitpos);
            mask &= ~m;
            m >>= bitpos;
            int32_t newMask =  (m<<shift) | (m>>(32-shift));
            if (!negative_logic) {
                AND( AL, 0, d, s, imm(newMask) );
            } else {
                BIC( AL, 0, d, s, imm(newMask) );
            }
            s = d;
        }
    } else {
        MOV( AL, 0, d, imm(0));
    }
}		

void GGLAssembler::build_masking(pixel_t& pixel, Scratch& regs)
{
    if (!mMasking || mAllMasked) {
        return;
    }

    comment("color mask");

    pixel_t fb(mDstPixel);
    pixel_t s(pixel);
    if (!(pixel.flags & CORRUPTIBLE)) {
        pixel.reg = regs.obtain();
        pixel.flags |= CORRUPTIBLE;
    }

    int mask = 0;
    for (int i=0 ; i<4 ; i++) {
        const int component_mask = 1<<i;
        const int h = fb.format.c[i].h;
        const int l = fb.format.c[i].l;
        if (h && (!(mMasking & component_mask))) {
            mask |= ((1<<(h-l))-1) << l;
        }
    }

    // There is no need to clear the masked components of the source
    // (unless we applied a logic op), because they're already zeroed 
    // by construction (masked components are not computed)

    if (mLogicOp) {
        const needs_t& needs = mBuilderContext.needs;
        const int opcode = GGL_READ_NEEDS(LOGIC_OP, needs.n) | GGL_CLEAR;
        if (opcode != GGL_CLEAR) {
            // clear masked component of source
            build_and_immediate(pixel.reg, s.reg, mask, fb.size());
            s = pixel;
        }
    }

    // clear non masked components of destination
    build_and_immediate(fb.reg, fb.reg, ~mask, fb.size()); 

    // or back the channels that were masked
    if (s.reg == fb.reg) {
         // this is in fact a MOV
        if (s.reg == pixel.reg) {
            // ugh. this in in fact a nop
        } else {
            MOV(AL, 0, pixel.reg, fb.reg);
        }
    } else {
        ORR(AL, 0, pixel.reg, s.reg, fb.reg);
    }
}

// ---------------------------------------------------------------------------

void GGLAssembler::base_offset(
        const pointer_t& d, const pointer_t& b, const reg_t& o)
{
    switch (b.size) {
    case 32:
        ADDR_ADD(AL, 0, d.reg, b.reg, reg_imm(o.reg, LSL, 2));
        break;
    case 24:
        if (d.reg == b.reg) {
            ADDR_ADD(AL, 0, d.reg, b.reg, reg_imm(o.reg, LSL, 1));
            ADDR_ADD(AL, 0, d.reg, d.reg, o.reg);
        } else {
            ADDR_ADD(AL, 0, d.reg, o.reg, reg_imm(o.reg, LSL, 1));
            ADDR_ADD(AL, 0, d.reg, d.reg, b.reg);
        }
        break;
    case 16:
        ADDR_ADD(AL, 0, d.reg, b.reg, reg_imm(o.reg, LSL, 1));
        break;
    case 8:
        ADDR_ADD(AL, 0, d.reg, b.reg, o.reg);
        break;
    }
}

// ----------------------------------------------------------------------------
// cheezy register allocator...
// ----------------------------------------------------------------------------

// Modified to support MIPS processors, in a very simple way. We retain the
// (Arm) limit of 16 total registers, but shift the mapping of those registers
// from 0-15, to 2-17. Register 0 on Mips cannot be used as GP registers, and
// register 1 has a traditional use as a temp).

RegisterAllocator::RegisterAllocator(int arch) : mRegs(arch)
{
}

void RegisterAllocator::reset()
{
    mRegs.reset();
}

int RegisterAllocator::reserveReg(int reg)
{
    return mRegs.reserve(reg);
}

int RegisterAllocator::obtainReg()
{
    return mRegs.obtain();
}

void RegisterAllocator::recycleReg(int reg)
{
    mRegs.recycle(reg);
}

RegisterAllocator::RegisterFile& RegisterAllocator::registerFile()
{
    return mRegs;
}

// ----------------------------------------------------------------------------

RegisterAllocator::RegisterFile::RegisterFile(int codegen_arch)
    : mRegs(0), mTouched(0), mStatus(0), mArch(codegen_arch), mRegisterOffset(0)
{
    if ((mArch == ARMAssemblerInterface::CODEGEN_ARCH_MIPS) ||
        (mArch == ARMAssemblerInterface::CODEGEN_ARCH_MIPS64)) {
        mRegisterOffset = 2;    // ARM has regs 0..15, MIPS offset to 2..17
    }
    reserve(ARMAssemblerInterface::SP);
    reserve(ARMAssemblerInterface::PC);
}

RegisterAllocator::RegisterFile::RegisterFile(const RegisterFile& rhs, int codegen_arch)
    : mRegs(rhs.mRegs), mTouched(rhs.mTouched), mArch(codegen_arch), mRegisterOffset(0)
{
    if ((mArch == ARMAssemblerInterface::CODEGEN_ARCH_MIPS) ||
        (mArch == ARMAssemblerInterface::CODEGEN_ARCH_MIPS64)) {
        mRegisterOffset = 2;    // ARM has regs 0..15, MIPS offset to 2..17
    }
}

RegisterAllocator::RegisterFile::~RegisterFile()
{
}

bool RegisterAllocator::RegisterFile::operator == (const RegisterFile& rhs) const
{
    return (mRegs == rhs.mRegs);
}

void RegisterAllocator::RegisterFile::reset()
{
    mRegs = mTouched = mStatus = 0;
    reserve(ARMAssemblerInterface::SP);
    reserve(ARMAssemblerInterface::PC);
}

// RegisterFile::reserve() take a register parameter in the
// range 0-15 (Arm compatible), but on a Mips processor, will
// return the actual allocated register in the range 2-17.
int RegisterAllocator::RegisterFile::reserve(int reg)
{
    reg += mRegisterOffset;
    LOG_ALWAYS_FATAL_IF(isUsed(reg),
                        "reserving register %d, but already in use",
                        reg);
    mRegs |= (1<<reg);
    mTouched |= mRegs;
    return reg;
}

// This interface uses regMask in range 2-17 on MIPS, no translation.
void RegisterAllocator::RegisterFile::reserveSeveral(uint32_t regMask)
{
    mRegs |= regMask;
    mTouched |= regMask;
}

int RegisterAllocator::RegisterFile::isUsed(int reg) const
{
    LOG_ALWAYS_FATAL_IF(reg>=16+(int)mRegisterOffset, "invalid register %d", reg);
    return mRegs & (1<<reg);
}

int RegisterAllocator::RegisterFile::obtain()
{
    const char priorityList[14] = {  0,  1, 2, 3, 
                                    12, 14, 4, 5, 
                                     6,  7, 8, 9,
                                    10, 11 };
    const int nbreg = sizeof(priorityList);
    int i, r, reg;
    for (i=0 ; i<nbreg ; i++) {
        r = priorityList[i];
        if (!isUsed(r + mRegisterOffset)) {
            break;
        }
    }
    // this is not an error anymore because, we'll try again with
    // a lower optimization level.
    //ALOGE_IF(i >= nbreg, "pixelflinger ran out of registers\n");
    if (i >= nbreg) {
        mStatus |= OUT_OF_REGISTERS;
        // we return SP so we can more easily debug things
        // the code will never be run anyway.
        return ARMAssemblerInterface::SP; 
    }
    reg = reserve(r);  // Param in Arm range 0-15, returns range 2-17 on Mips.
    return reg;
}

bool RegisterAllocator::RegisterFile::hasFreeRegs() const
{
    uint32_t regs = mRegs >> mRegisterOffset;   // MIPS fix.
    return ((regs & 0xFFFF) == 0xFFFF) ? false : true;
}

int RegisterAllocator::RegisterFile::countFreeRegs() const
{
    uint32_t regs = mRegs >> mRegisterOffset;   // MIPS fix.
    int f = ~regs & 0xFFFF;
    // now count number of 1
   f = (f & 0x5555) + ((f>>1) & 0x5555);
   f = (f & 0x3333) + ((f>>2) & 0x3333);
   f = (f & 0x0F0F) + ((f>>4) & 0x0F0F);
   f = (f & 0x00FF) + ((f>>8) & 0x00FF);
   return f;
}

void RegisterAllocator::RegisterFile::recycle(int reg)
{
    // commented out, since common failure of running out of regs
    // triggers this assertion. Since the code is not execectued
    // in that case, it does not matter. No reason to FATAL err.
    // LOG_FATAL_IF(!isUsed(reg),
    //         "recycling unallocated register %d",
    //         reg);
    mRegs &= ~(1<<reg);
}

void RegisterAllocator::RegisterFile::recycleSeveral(uint32_t regMask)
{
    // commented out, since common failure of running out of regs
    // triggers this assertion. Since the code is not execectued
    // in that case, it does not matter. No reason to FATAL err.
    // LOG_FATAL_IF((mRegs & regMask)!=regMask,
    //         "recycling unallocated registers "
    //         "(recycle=%08x, allocated=%08x, unallocated=%08x)",
    //         regMask, mRegs, mRegs&regMask);
    mRegs &= ~regMask;
}

uint32_t RegisterAllocator::RegisterFile::touched() const
{
    return mTouched;
}

// ----------------------------------------------------------------------------

}; // namespace android